大数据爬虫项目之京东模拟登陆(附下载)
江左梅郎2016/12/06         
在做网络爬虫项目时,有些网站设置了访问权限,只有登录之后才能爬取其内容。那么如何模拟登录呢?请看下面的代码。
后续将持续更新腾讯微博、新浪微博等网站的模拟登录示例,敬请关注!
package com.dajiangtai.djt_spider.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URI; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.http.HttpEntity; import org.apache.http.NameValuePair; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.nodes.TagNode; import org.htmlparser.util.NodeList; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import 
org.jsoup.nodes.Element; /** * 京东登陆 * @author John * */ public class TestJDLogin { private CloseableHttpClient client = HttpClients.createDefault(); private String body=""; private CloseableHttpResponse httpresponse=null; private HttpEntity entity=null; public void sendPost(String url, Map<String, String> paramValue, String encoding) throws Exception { HttpPost httpPost = new HttpPost(url); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); for (Entry<String, String> entry : paramValue.entrySet()) { nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue())); } // 设置参数到请求对象中 httpPost.setEntity(new UrlEncodedFormEntity(nvps, encoding)); httpPost.setHeader("Content-type", "application/x-www-form-urlencoded"); httpPost.setHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)"); //执行请求操作,并拿到结果(同步阻塞) httpresponse = client.execute(httpPost); //获取结果实体 entity = httpresponse.getEntity(); if (entity != null) { //按指定编码转换结果实体为String类型 body = EntityUtils.toString(entity, encoding); } EntityUtils.consume(entity); //释放链接 if(httpresponse!=null){ httpresponse.close(); } } private HttpEntity sendGetReturnEntity(String url,Map<String, String> param)throws Exception{ HttpGet httpGet=new HttpGet(url); List<NameValuePair> params=new ArrayList<NameValuePair>(); if(param!=null){ for( Map.Entry<String, String> entry:param.entrySet()){ params.add(new BasicNameValuePair(entry.getKey(),entry.getValue())); } } String str = EntityUtils.toString(new UrlEncodedFormEntity(params)); httpGet.setURI(new URI(httpGet.getURI().toString() + "?" 
+ str)); httpresponse = client.execute(httpGet); // 获取返回数据 return httpresponse.getEntity(); } public void sendGet(String url,Map<String, String> param) throws Exception{ this.entity=sendGetReturnEntity(url, param); body = EntityUtils.toString(entity); if (entity != null) { entity.consumeContent(); } if(httpresponse!=null){ httpresponse.close(); } } public InputStream sendGetReturnStream(String url,Map<String, String> param) throws Exception{ this.entity=this.sendGetReturnEntity(url, param); if(httpresponse!=null){ httpresponse.close(); } return entity.getContent(); } public NodeList parseHTMLByAttr(String attrName,String attrValue) throws Exception{ Parser parser=new Parser(body); NodeFilter nodeFilter=new HasAttributeFilter(attrName, attrValue); return parser.extractAllNodesThatMatch(nodeFilter); } public NodeList parseHTMLByTag(String tagname) throws Exception{ Parser parser=new Parser(body); NodeFilter nodeFilter=new TagNameFilter(tagname); return parser.extractAllNodesThatMatch(nodeFilter); } public String getBody() { return body; } public void closeClient() throws Exception{ if(client!=null){ client.close(); } } /** * Post请求 * @param url * @param nvps * @return * @throws IOException * @throws ClientProtocolException */ public String sendPost(String url) throws ClientProtocolException, IOException { //构造需要访问的页面 HttpUriRequest httpUriRequest = new HttpPost( url); // 添加必要的头信息 httpUriRequest .setHeader("Accept", "application/json, text/javascript, */*; q=0.01"); httpUriRequest.setHeader("Accept-Encoding", "gzip, deflate, br"); httpUriRequest.setHeader("Accept-Language", "zh-CN,zh;q=0.8"); httpUriRequest.setHeader("Connection", "keep-alive"); // 模拟浏览器,否则CSDN服务器限制访问 httpUriRequest .setHeader( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"); // 【特别注意】:这个一定需要和登录用同一个“httpClient”,不然会失败。登陆信息全部在“httpClient”中保存 HttpResponse response = client.execute(httpUriRequest); HttpEntity entity = 
response.getEntity(); String result = EntityUtils.toString(entity); return result; } public static void main(String[] args) throws Exception { String uuid=""; String token=""; String verify=""; Map<String, String> map = new HashMap<String, String>(); map.put("chkRememberMe", "on"); map.put("eid", "53f722ce0a8849fcae541ab98cefaf6d1913048173"); map.put("fp", "3719ad31eebf154b80c4f42316233ae2"); map.put("loginname", AcountUtil.JD_USERNAME); map.put("loginpwd", AcountUtil.JD_PASSWORD); map.put("nloginpwd", AcountUtil.JD_NPASSWORD); map.put("machineCpu", ""); map.put("machineDisk", ""); map.put("machineNet", ""); //获取登录界面动态数据 String url="https://passport.jd.com/uc/login?ltype=logout"; TestJDLogin jdLogin=new TestJDLogin(); jdLogin.sendGet(url, null); NodeList nodelist=jdLogin.parseHTMLByAttr("id", "uuid"); if(nodelist!=null){ for(int i=0;i<nodelist.size();i++){ TagNode node=(TagNode) nodelist.elementAt(i); uuid=node.getAttribute("value"); } } map.put("uuid", uuid); nodelist=jdLogin.parseHTMLByAttr("name", "_t"); if(nodelist!=null){ for(int i=0;i<nodelist.size();i++){ TagNode node=(TagNode) nodelist.elementAt(i); token=node.getAttribute("value"); } } map.put("_t", token); nodelist=jdLogin.parseHTMLByTag("input"); if(nodelist!=null){ int cnt=0; for(int i=0;i<nodelist.size();i++){ TagNode node=(TagNode) nodelist.elementAt(i); String name=node.getAttribute("name"); if(name.equals("_t")){ cnt=i; break; } } TagNode extTagNode=(TagNode) nodelist.elementAt(cnt+1); String extname=extTagNode.getAttribute("name"); String extvalue=extTagNode.getAttribute("value"); map.put(extname, extvalue); } //获取验证码 url="https://authcode.jd.com/verify/image?a=1&acid="+uuid+"&uid="+uuid; InputStream instream=jdLogin.sendGetReturnStream(url, null); File storeFile = new File("d:\\jd\\p.jpg"); FileOutputStream output = new FileOutputStream(storeFile); byte b[] = new byte[1024]; int j = 0; while( (j = instream.read(b))!=-1){ output.write(b,0,j); } output.flush(); output.close(); instream.close(); 
//读入验证码(手写) System.out.println("请输入验证码:"); InputStream isInputStream=System.in; BufferedReader br=new BufferedReader(new InputStreamReader(isInputStream)); verify=br.readLine(); isInputStream.close(); br.close(); map.put("authcode", verify); //登录 url = "https://passport.jd.com/uc/loginService"; jdLogin.sendPost(url, map, "utf-8"); System.out.println(jdLogin.getBody()); //异步加载订单名称数据 url = "https://order.jd.com/lazy/getOrderProductInfo.action"; Map<String, String> map1 = new HashMap<String, String>(); map1.put("orderWareIds", "2289679,1861092"); map1.put("orderWareTypes", "0,0"); map1.put("orderIds", "37672247959,37398751141"); map1.put("orderTypes", "0,0"); map1.put("orderSiteIds", "0,0"); jdLogin.sendPost(url, map1, "utf-8"); System.out.println(jdLogin.getBody()); jdLogin.closeClient(); } }
京东模拟登陆.rar 资源大小: 3KB
看完这篇文章的人大多学习了更多课程>>

PM 2016/12/06
回复