大数据爬虫项目之新浪微博模拟登录(附下载)

江左梅郎2016/12/05         
在做网络爬虫项目的时候,有些网站设置了权限,只有在登录了之后才能爬取网站的内容,如何模拟登录呢?请看下面的代码!
后面持续更新腾讯微博、新浪微博模拟登录等,请关注!
// 获取登陆后的 client 就可以访问明星的任何页面数据 @SuppressWarnings("deprecation") public static DefaultHttpClient login(String u, String p) { DefaultHttpClient client = new DefaultHttpClient(); HttpClientParams.setCookiePolicy(client.getParams(), CookiePolicy.BROWSER_COMPATIBILITY); try { /** 获得rsaPubkey,rsakv,servertime等参数值 **/ HashMapparams = preLogin(encodeAccount(u), client); /******** 登录操作 *********/ HttpPost post = new HttpPost( "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)"); post.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:9.0.1) Gecko/20100101 Firefox/9.0.1"); post.setHeader("Accept-Language", "zh-cn,zh;q=0.5"); post.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7"); post.setHeader("Referer", "http://weibo.com/?c=spr_web_sq_firefox_weibo_t001"); post.setHeader("Content-Type", "application/x-www-form-urlencoded"); String nonce = makeNonce(6); Listnvps = new ArrayList(); nvps.add(new BasicNameValuePair("encoding", "UTF-8")); nvps.add(new BasicNameValuePair("entry", "weibo")); nvps.add(new BasicNameValuePair("from", "")); nvps.add(new BasicNameValuePair("gateway", "1")); nvps.add(new BasicNameValuePair("nonce", nonce)); nvps.add(new BasicNameValuePair("pagerefer", "http://i.firefoxchina.cn/old/")); nvps.add(new BasicNameValuePair("prelt", "111")); nvps.add(new BasicNameValuePair("pwencode", "rsa2")); nvps.add(new BasicNameValuePair("returntype", "META")); nvps.add(new BasicNameValuePair("rsakv", params.get("rsakv"))); nvps.add(new BasicNameValuePair("savestate", "0")); nvps.add(new BasicNameValuePair("servertime", params .get("servertime"))); nvps.add(new BasicNameValuePair("service", "miniblog")); /******************** *加密密码 ***************************/ ScriptEngineManager sem = new ScriptEngineManager(); ScriptEngine se = sem.getEngineByName("javascript"); se.eval(getJs()); String pass = ""; if (se instanceof Invocable) { Invocable invoke = 
(Invocable) se; // 调用preprocess方法,并传入两个参数密码和验证码 pass = invoke.invokeFunction("getpass", p, params.get("servertime"), nonce, params.get("pubkey")) .toString(); System.out.println("c = " + pass); } nvps.add(new BasicNameValuePair("sp", pass)); nvps.add(new BasicNameValuePair("su", encodeAccount(u))); nvps.add(new BasicNameValuePair( "url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack")); nvps.add(new BasicNameValuePair("useticket", "1")); nvps.add(new BasicNameValuePair("vsnf", "1")); post.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); HttpResponse response = client.execute(post); String entity = EntityUtils.toString(response.getEntity()); if (entity.replace("\"", "").indexOf("retcode=0") > -1) { // http://passport.weibo.com/wbsso/login? 新变化的 // retcode=0 新变化的 String url = entity.substring(entity .indexOf("http://passport.weibo.com/wbsso/login?"), entity.indexOf("retcode=0") + 9); String nick = "chenkun"; // 昵称 // 获取到实际url进行连接 HttpGet getMethod = new HttpGet(url); response = client.execute(getMethod); entity = EntityUtils.toString(response.getEntity()); // nick = entity.substring(entity.indexOf("displayname") + 14, // entity.lastIndexOf("userdomain") - 3).trim(); url = entity.substring(entity.indexOf("userdomain") + 13, entity.lastIndexOf("\"")); String original = "p/1035051087770692/weibo?profile_ftype=0&is_ori=1&from=page_103505_home&wvr=5.1&mod=originalweibo#place"; // 登陆后可以请求 任何明星微博页面数据 getMethod = new HttpGet("http://weibo.com/" + original); response = client.execute(getMethod); // 返回页面数据内容 entity = EntityUtils.toString(response.getEntity()); // System.out.println(entity); } } catch (Exception e) { e.printStackTrace(); } return client; }
大数据爬虫项目之新浪微博模拟登陆代码.docx 资源大小: 17KB
189****8167 2016/12/30
回复
PM 2016/12/08
回复