array("选择器","类型"),.......),【类型】说明:值 "text" ,"html" ,"属性" * @param string $regRange 【块选择器】:指 先按照规则 选出 几个大块 ,然后再分别再在块里面 进行相关的选择 * @param string $getHtmlWay 【源码获取方式】指是通过curl抓取源码,还是通过file_get_contents抓取源码 * @param string $output_encoding【输出编码格式】指要以什么编码输出(UTF-8,GB2312,.....),防止出现乱码,如果设置为 假值 则不改变原字符串编码 */ public function QueryList($page, $regArr, $regRange = '', $getHtmlWay = "curl", $output_encoding = false) { $this->output_encoding = $output_encoding; if ($this->isURL($page)) { $this->pageURL = $page; if ($getHtmlWay == "curl") { //为了能获取https:// $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $this->pageURL); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); //伪造来源referer $referer = 'http://www.meituan.com'; curl_setopt ($ch,CURLOPT_REFERER,$referer); //伪造来源ip //伪造来源ip //curl_setopt($ch, CURLOPT_HTTPHEADER, '36.110.144.110'); $this->html = curl_exec($ch); curl_close($ch); } else { $this->html = file_get_contents($this->pageURL); } } else { $this->html = $page; } //获取编码格式 $this->html_encoding = $this->get_encode($this->html); if (!empty($regArr)) { $this->regArr = $regArr; $this->regRange = $regRange; $this->getList(); } } public function setQuery($regArr, $regRange = '') { $this->jsonArr = array(); $this->regArr = $regArr; $this->regRange = $regRange; $this->getList(); } private function getList() { $hobj = phpQuery::newDocumentHTML($this->html); if (!empty($this->regRange)) { $robj = pq($hobj)->find($this->regRange); $i = 0; foreach ($robj as $item) { while (list($key, $reg_value) = each($this->regArr)) { $iobj = pq($item)->find($reg_value[0]); switch ($reg_value[1]) { case 'text': $this->jsonArr[$i][$key] = trim(pq($iobj)->text()); break; case 'html': $this->jsonArr[$i][$key] = trim(pq($iobj)->html()); break; default: $this->jsonArr[$i][$key] = pq($iobj)->attr($reg_value[1]); break; } } //重置数组指针 reset($this->regArr); $i++; } } else { while (list($key, $reg_value) = each($this->regArr)) { $lobj = pq($hobj)->find($reg_value[0]); $i = 0; foreach ($lobj as $item) { switch ($reg_value[1]) { case 'text': $this->jsonArr[$i++][$key] = trim(pq($item)->text()); break; case 'html': $this->jsonArr[$i++][$key] = trim(pq($item)->html()); break; default: $this->jsonArr[$i++][$key] = pq($item)->attr($reg_value[1]); break; } } } } if ($this->output_encoding) { //编码转换 $this->jsonArr = $this->array_convert_encoding($this->jsonArr, $this->output_encoding, $this->html_encoding); } } public function getJSON() { return json_encode($this->jsonArr); } /** * 获取文件编码 * @param $string * @return string */ private function get_encode($string) { return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8')); } /** * 递归转换数组值得编码格式 * @param array $arr * @param string $to_encoding * @param string $from_encoding * @return array */ private function array_convert_encoding($arr, $to_encoding, $from_encoding) { if (!is_array($arr)) return $arr; foreach ($arr as $key => $value) { if (is_array($value)) { $arr[$key] = $this->array_convert_encoding($value, $to_encoding, $from_encoding); } else { $arr[$key] = mb_convert_encoding($value, $to_encoding, $from_encoding); } } return $arr; } /** * 简单的判断一下参数是否为一个URL链接 * @param string $str * @return boolean */ private function isURL($str) { if (preg_match('/^http(s)?:\/\/.+/', $str)) { return true; } return false; } }