123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- <?php
- /**
- * QueryList
- *
- * 一个基于phpQuery的通用列表采集类
- *
- * @author Jaeger
- * @email 734708094@qq.com
- * @link http://git.oschina.net/jae/QueryList
- * @version 1.6.0
- */
- require('phpQuery/phpQuery.php');
/**
 * Generic list scraper built on top of phpQuery.
 *
 * Given a page (URL or raw HTML) and a map of named selectors, it collects the
 * matched values into the public $jsonArr property as a list of rows, where
 * each row is an array keyed by the selector names.
 */
class QueryList {
    /** @var string URL of the page being scraped (only set when a URL was supplied). */
    private $pageURL;
    /** @var array Selector map: array("name" => array("selector", "type"), ...). */
    private $regArr = array();
    /** @var array Extracted rows; each row is an array keyed by selector name. */
    public $jsonArr = array();
    /** @var string Optional block selector that restricts each row to one matched element. */
    private $regRange;
    /** @var string HTML source that the selectors are applied to. */
    private $html;
    /** @var string|false Target encoding for extracted values; a falsy value disables conversion. */
    private $output_encoding;
    /** @var string|false Encoding detected for $html (false when detection failed). */
    private $html_encoding;

    /**
     * Loads the page and, when selectors are supplied, runs the extraction immediately.
     *
     * Declared as __construct rather than a method named after the class:
     * same-name constructors are deprecated since PHP 7.0 and are no longer
     * treated as constructors in PHP 8.0.
     *
     * @param string       $page            URL to fetch (http or https), or raw HTML source.
     * @param array        $regArr          Selector map: array("name" => array("selector", "type"), ...),
     *                                      where type is "text", "html", or an attribute name.
     * @param string       $regRange        Optional block selector: blocks are matched first and the
     *                                      selectors in $regArr are then applied inside each block.
     * @param string       $getHtmlWay      "curl" to fetch with cURL; any other value uses file_get_contents.
     * @param string|false $output_encoding Encoding for the extracted values (UTF-8, GB2312, ...);
     *                                      a falsy value leaves them unconverted.
     */
    public function __construct($page, $regArr, $regRange = '', $getHtmlWay = "curl", $output_encoding = false) {
        $this->output_encoding = $output_encoding;
        if ($this->isURL($page)) {
            $this->pageURL = $page;
            if ($getHtmlWay == "curl") {
                $fetched = $this->fetchWithCurl($this->pageURL);
            } else {
                $fetched = file_get_contents($this->pageURL);
            }
            // Both fetchers return false on failure; keep $html a string so the
            // encoding detection and the parser below never receive a boolean.
            $this->html = ($fetched === false) ? '' : $fetched;
        } else {
            $this->html = $page;
        }
        // Remember the source encoding so extracted values can be converted later.
        $this->html_encoding = $this->get_encode($this->html);
        if (!empty($regArr)) {
            $this->regArr = $regArr;
            $this->regRange = $regRange;
            $this->getList();
        }
    }

    /**
     * Replaces the selectors and re-runs the extraction against the already loaded HTML.
     *
     * @param array  $regArr   Selector map, same format as in the constructor.
     * @param string $regRange Optional block selector.
     */
    public function setQuery($regArr, $regRange = '') {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->getList();
    }

    /**
     * Downloads a page with cURL and returns its body.
     *
     * @param string $url
     * @return string|false Response body, or false when the transfer failed.
     */
    private function fetchWithCurl($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // NOTE(review): certificate and host verification are switched off so that
        // https pages with broken certificates can still be fetched. This exposes
        // the request to man-in-the-middle tampering; re-enable if callers allow it.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Send a fixed Referer header with every request.
        curl_setopt($ch, CURLOPT_REFERER, 'http://www.meituan.com');
        $body = curl_exec($ch);
        curl_close($ch);
        return $body;
    }

    /**
     * Applies the configured selectors to the HTML and fills $jsonArr.
     *
     * With a block selector, every matched block becomes one row and each named
     * selector is evaluated inside that block. Without one, each named selector is
     * evaluated against the whole document and its n-th match is written to row n.
     */
    private function getList() {
        $hobj = phpQuery::newDocumentHTML($this->html);
        if (!empty($this->regRange)) {
            $i = 0;
            foreach (pq($hobj)->find($this->regRange) as $item) {
                // foreach replaces each(), which was removed in PHP 8.0, and makes
                // resetting the array pointer after every block unnecessary.
                foreach ($this->regArr as $key => $reg_value) {
                    $matched = pq($item)->find($reg_value[0]);
                    $this->jsonArr[$i][$key] = $this->extractValue($matched, $reg_value[1]);
                }
                $i++;
            }
        } else {
            foreach ($this->regArr as $key => $reg_value) {
                $i = 0;
                foreach (pq($hobj)->find($reg_value[0]) as $item) {
                    $this->jsonArr[$i++][$key] = $this->extractValue($item, $reg_value[1]);
                }
            }
        }
        // Convert only when a target encoding was requested and the source encoding
        // is known; a failed detection (false) is not a usable source encoding.
        if ($this->output_encoding && $this->html_encoding) {
            $this->jsonArr = $this->array_convert_encoding($this->jsonArr, $this->output_encoding, $this->html_encoding);
        }
    }

    /**
     * Reads one value from a matched node according to the selector type.
     *
     * @param mixed  $node Node or phpQuery selection to read from.
     * @param string $type "text" for trimmed text, "html" for trimmed inner HTML,
     *                     anything else is used as an attribute name.
     * @return mixed
     */
    private function extractValue($node, $type) {
        switch ($type) {
            case 'text':
                return trim(pq($node)->text());
            case 'html':
                return trim(pq($node)->html());
            default:
                return pq($node)->attr($type);
        }
    }

    /**
     * Returns the extracted rows encoded as JSON.
     *
     * @return string
     */
    public function getJSON() {
        return json_encode($this->jsonArr);
    }

    /**
     * Detects the encoding of a string among ASCII, GB2312, GBK and UTF-8.
     *
     * @param string $string
     * @return string|false Detected encoding name, or false when none matched.
     */
    private function get_encode($string) {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Recursively converts the encoding of every string value in an array.
     *
     * Non-string leaves (for example null for a missing attribute) are returned
     * untouched instead of being passed to mb_convert_encoding.
     *
     * @param array  $arr
     * @param string $to_encoding
     * @param string $from_encoding
     * @return array
     */
    private function array_convert_encoding($arr, $to_encoding, $from_encoding) {
        if (!is_array($arr)) {
            return $arr;
        }
        foreach ($arr as $key => $value) {
            if (is_array($value)) {
                $arr[$key] = $this->array_convert_encoding($value, $to_encoding, $from_encoding);
            } elseif (is_string($value)) {
                $arr[$key] = mb_convert_encoding($value, $to_encoding, $from_encoding);
            }
        }
        return $arr;
    }

    /**
     * Rough check of whether the argument looks like an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function isURL($str) {
        return preg_match('/^http(s)?:\/\/.+/', $str) === 1;
    }
}
|