爱程序网

使用PHP的CURL模拟POST采集开了viewstate的asp.net网页数据

来源: 阅读:

用.NET做的网站如果做成POST提交方式,且开了viewstate的话,采集起来有点小繁琐,在此跟大家分享一下做法。

採集的難點是必須先取得表單裏面的 __VIEWSTATE 和 __EVENTVALIDATION 兩個隱藏字段的值,並模擬POST回傳給服務器,才能取到後面分頁的數據。由於回傳數據比較大,不能用默認的 application/x-www-form-urlencoded 方式傳,要用上傳文件所用的 multipart/form-data 表單模式(在PHP的CURL裏,把數組直接傳給 CURLOPT_POSTFIELDS 即可)。主要代碼如下:

  /**
   * QQ group: 223494678
   *
   * Fetch one page of the ASP.NET listing at default.aspx via cURL, simulating the
   * pager's postback.
   *
   * When $EVENTARGUMENT is empty the request is a POST with no form fields; getHtml()
   * uses that call to obtain the first page. Otherwise the full search form is posted
   * together with the ASP.NET state fields so the server renders the requested page.
   *
   * The fields are passed to CURLOPT_POSTFIELDS as an array, so cURL sends them as
   * multipart/form-data rather than application/x-www-form-urlencoded.
   *
   * @param string|int $EVENTARGUMENT   Value for __EVENTARGUMENT (getHtml() passes the page number).
   * @param string     $VIEWSTATE       __VIEWSTATE value taken from the previously fetched page.
   * @param string     $EVENTVALIDATION __EVENTVALIDATION value taken from the previously fetched page.
   * @param string     $EVENTTARGET     Value for __EVENTTARGET; defaults to "pager".
   *
   * @return string|false Response headers followed by the body (CURLOPT_HEADER is on),
   *                      or false when curl_exec() fails.
   */
  function getn($EVENTARGUMENT = "", $VIEWSTATE = "", $EVENTVALIDATION = "", $EVENTTARGET = "pager")
  {
      $args = array();
      if ($EVENTARGUMENT) {
          $args = array(
              '__EVENTTARGET' => $EVENTTARGET,
              '__EVENTARGUMENT' => $EVENTARGUMENT,
              '__VIEWSTATE' => $VIEWSTATE,
              '__EVENTVALIDATION' => $EVENTVALIDATION,
              '__VIEWSTATEENCRYPTED' => '',
              // The original key carried a stray trailing "=", unlike every other field name.
              'search$txtFundName' => '',
              'search$txtFundManger' => '',
              'search$ddlFoundationDateOperater' => '1',
              'search$txtFoundationDate' => '',
              'search$dltFundType$ctl01$chkFundType' => 'on',
              'search$dltFundType$ctl01$chklFundChildType$0' => 'on',
              'search$dltFundType$ctl01$chklFundChildType$1' => 'on',
              'search$dltFundType$ctl01$chklFundChildType$2' => 'on',
              'search$dltFundType$ctl01$chklFundChildType$3' => 'on',
              'search$dltFundType$ctl01$chklFundChildType$4' => 'on',
              'search$chklFundStatus$0' => 'on',
              'search$ddlFundOrg' => '0',
              'search$txtFundOrgName' => '',
              'search$ddlStatisticDateOperater' => '1',
              'search$txtStatisticDate' => '',
              'search$radlStatisticMode' => '1',
          );
      }

      $user_agent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11";

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, 'http://???/default.aspx');
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the response instead of printing it
      curl_setopt($ch, CURLOPT_VERBOSE, TRUE);
      curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
      curl_setopt($ch, CURLOPT_FAILONERROR, TRUE);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
      curl_setopt($ch, CURLOPT_HEADER, TRUE);
      curl_setopt($ch, CURLINFO_HEADER_OUT, TRUE);

      curl_setopt($ch, CURLOPT_HTTPHEADER, array(
          'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language:zh-CN,zh;q=0.8',
          'Connection: Keep-Alive',
          'Cache-Control:max-age=0',
          'Referer:http://???/default.aspx',
          'Expect:', // empty value removes the Expect header cURL would otherwise add
      ));

      curl_setopt($ch, CURLOPT_POST, true);        // send the request as a POST
      curl_setopt($ch, CURLOPT_POSTFIELDS, $args); // array => multipart/form-data body
      curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);

      return curl_exec($ch);
  }

  /**
   * Pull the data table out of a fetched page.
   *
   * @param string $page Raw page text as returned by the fetcher.
   *
   * @return string Everything from `<table id="dltData"` up to (not including) the
   *                `<!-- AspNetPager` marker, or '' when the page has no such span.
   */
  function _extractDataTable($page)
  {
      if (!preg_match('/<table id="dltData".+?<!-- AspNetPager/is', $page, $matches)) {
          return '';
      }

      return str_replace('<!-- AspNetPager', '', $matches[0]);
  }

  /**
   * Read the value of an ASP.NET hidden state field (e.g. __VIEWSTATE) from a page.
   *
   * @param string $page Raw page text as returned by the fetcher.
   * @param string $name Field name; the markup is expected to use it for both name and id.
   *
   * @return string The field value, or '' when the field is not present.
   */
  function _extractHiddenField($page, $name)
  {
      $quoted = preg_quote($name, '/');
      $pattern = '/<input type="hidden" name="' . $quoted . '" id="' . $quoted . '" value="(.+?)"/is';

      return preg_match($pattern, $page, $matches) ? $matches[1] : '';
  }

  /**
   * QQ group: 223494678
   *
   * Walk every page of the listing and concatenate the data table of each one.
   *
   * The first page is fetched without arguments; its pager text gives the total page
   * count. Each following page is requested by posting the page number together with
   * the __VIEWSTATE / __EVENTVALIDATION values taken from the page fetched just before.
   *
   * Side effect: the collected markup is appended to the global variable $html, which
   * is also the value returned.
   *
   * @param callable $fetcher      Callable with the same signature as getn(); defaults to getn().
   * @param int      $delaySeconds Pause before each request after the first; defaults to 1 second.
   *
   * @return string The accumulated table markup. If a fetch fails, whatever was
   *                collected up to that point is returned.
   */
  function getHtml($fetcher = 'getn', $delaySeconds = 1)
  {
      global $html;
      $html = (string) $html;

      $first = call_user_func($fetcher);
      if (!is_string($first)) {
          return $html;
      }

      // "#" is used as the delimiter because the pattern itself contains "/".
      $total = 1;
      if (preg_match('#<font color="black"><b>(\d+?)</b></font> 页</span>#is', $first, $matches)) {
          $total = (int) $matches[1];
      }

      $html .= _extractDataTable($first);
      $VIEWSTATE = _extractHiddenField($first, '__VIEWSTATE');
      $EVENTVALIDATION = _extractHiddenField($first, '__EVENTVALIDATION');

      for ($i = 2; $i <= $total; $i++) {
          if ($delaySeconds > 0) {
              sleep((int) $delaySeconds);
          }

          $result = call_user_func($fetcher, $i, $VIEWSTATE, $EVENTVALIDATION);
          if (!is_string($result)) {
              // Without this page there is no fresh state to post for the next one.
              break;
          }

          $html .= _extractDataTable($result);
          $VIEWSTATE = _extractHiddenField($result, '__VIEWSTATE');
          $EVENTVALIDATION = _extractHiddenField($result, '__EVENTVALIDATION');
      }

      return $html;
  }

相关文章列表: