QueryList.class.php 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. <?php
  2. /**
  3. * QueryList
  4. *
  5. * 一个基于phpQuery的通用列表采集类
  6. *
  7. * @author Jaeger
  8. * @email 734708094@qq.com
  9. * @link http://git.oschina.net/jae/QueryList
  10. * @version 1.6.0
  11. */
  12. require('phpQuery/phpQuery.php');
  /**
   * Generic list scraper built on top of phpQuery.
   *
   * Given a page (URL or raw HTML) and a map of named selectors, it collects
   * the matched values into $jsonArr as a list of rows, one associative array
   * per matched element (or per matched block when a range selector is used).
   */
  class QueryList {
      /** Source URL; only set when the constructor received a URL. */
      private $pageURL;
      /** Selector map: array("name" => array("selector", "type"), ...). */
      private $regArr = array();
      /** Collected rows; public so callers can read the result directly. */
      public $jsonArr = array();
      /** Optional block selector; when set, selectors are applied inside each block. */
      private $regRange;
      /** Raw HTML of the document being queried. */
      private $html;
      /** Target encoding for collected values; a falsy value disables conversion. */
      private $output_encoding;
      /** Encoding detected for $html by get_encode(); false if detection failed. */
      private $html_encoding;

      /**
       * Constructor.
       *
       * Declared as __construct because a method named after its class is no
       * longer treated as a constructor in PHP 8; `new QueryList(...)` callers
       * are unaffected by the rename.
       *
       * @param string $page            URL of the page to fetch (http/https), or the HTML source itself
       * @param array  $regArr          selector map: array("name" => array("selector", "type"), ...);
       *                                "type" is "text", "html", or the name of an attribute to read
       * @param string $regRange        block selector: blocks are selected first, then every selector
       *                                in $regArr is applied inside each block
       * @param string $getHtmlWay      "curl" to download with cURL, anything else uses file_get_contents
       * @param string $output_encoding encoding for the collected values (UTF-8, GB2312, ...);
       *                                a falsy value leaves the values unconverted
       */
      public function __construct($page, $regArr, $regRange = '', $getHtmlWay = "curl", $output_encoding = false) {
          $this->output_encoding = $output_encoding;
          if ($this->isURL($page)) {
              $this->pageURL = $page;
              $this->html = $this->fetchHtml($this->pageURL, $getHtmlWay);
          } else {
              $this->html = $page;
          }
          // Remember the source encoding so results can be converted later.
          $this->html_encoding = $this->get_encode($this->html);
          if (!empty($regArr)) {
              $this->regArr = $regArr;
              $this->regRange = $regRange;
              $this->getList();
          }
      }

      /**
       * Runs a new query against the already loaded HTML, replacing any
       * previously collected rows.
       *
       * @param array  $regArr   selector map, same format as in the constructor
       * @param string $regRange optional block selector
       */
      public function setQuery($regArr, $regRange = '') {
          $this->jsonArr = array();
          $this->regArr = $regArr;
          $this->regRange = $regRange;
          $this->getList();
      }

      /**
       * Downloads the page source.
       *
       * @param string $url        address to fetch
       * @param string $getHtmlWay "curl" for cURL, anything else for file_get_contents
       * @return string page source, or an empty string when the download failed
       */
      private function fetchHtml($url, $getHtmlWay) {
          if ($getHtmlWay == "curl") {
              $ch = curl_init();
              curl_setopt($ch, CURLOPT_URL, $url);
              // NOTE(review): certificate and host verification are disabled so that
              // any https:// page can be fetched; this permits man-in-the-middle
              // tampering. Kept for backward compatibility - reconsider.
              curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
              curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
              curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
              // Send a fixed Referer header with every request.
              $referer = 'http://www.meituan.com';
              curl_setopt($ch, CURLOPT_REFERER, $referer);
              $body = curl_exec($ch);
              curl_close($ch);
          } else {
              $body = file_get_contents($url);
          }
          // Both fetchers return false on failure; normalise to an empty document.
          return $body === false ? '' : $body;
      }

      /**
       * Reads one value from a phpQuery selection.
       *
       * @param mixed  $node element or selection accepted by pq()
       * @param string $type "text", "html", or an attribute name
       * @return mixed trimmed text/HTML, or whatever attr() returns for the attribute
       */
      private function extractValue($node, $type) {
          switch ($type) {
              case 'text':
                  return trim((string) pq($node)->text());
              case 'html':
                  return trim((string) pq($node)->html());
              default:
                  return pq($node)->attr($type);
          }
      }

      /**
       * Applies the configured selectors to the HTML and appends the results
       * to $jsonArr, converting their encoding when an output encoding is set.
       */
      private function getList() {
          $hobj = phpQuery::newDocumentHTML($this->html);
          if (!empty($this->regRange)) {
              // Block mode: one row per block, every selector evaluated inside it.
              $robj = pq($hobj)->find($this->regRange);
              $i = 0;
              foreach ($robj as $item) {
                  foreach ($this->regArr as $key => $reg_value) {
                      $iobj = pq($item)->find($reg_value[0]);
                      $this->jsonArr[$i][$key] = $this->extractValue($iobj, $reg_value[1]);
                  }
                  $i++;
              }
          } else {
              // Flat mode: each selector fills its own column; the n-th match of
              // every selector lands in row n.
              foreach ($this->regArr as $key => $reg_value) {
                  $lobj = pq($hobj)->find($reg_value[0]);
                  $i = 0;
                  foreach ($lobj as $item) {
                      $this->jsonArr[$i++][$key] = $this->extractValue($item, $reg_value[1]);
                  }
              }
          }
          if ($this->output_encoding) {
              // Convert every collected value from the detected page encoding.
              $this->jsonArr = $this->array_convert_encoding($this->jsonArr, $this->output_encoding, $this->html_encoding);
          }
      }

      /**
       * Returns the collected rows encoded as JSON.
       *
       * @return string
       */
      public function getJSON() {
          return json_encode($this->jsonArr);
      }

      /**
       * Detects the encoding of a string.
       *
       * @param string $string
       * @return string|false one of ASCII, GB2312, GBK, UTF-8 (tried in that order), or false
       */
      private function get_encode($string) {
          return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
      }

      /**
       * Recursively converts the encoding of every value in an array.
       *
       * @param array        $arr
       * @param string       $to_encoding
       * @param string|false $from_encoding source encoding; when empty (detection failed)
       *                                    the array is returned unchanged
       * @return array
       */
      private function array_convert_encoding($arr, $to_encoding, $from_encoding) {
          if (!is_array($arr)) {
              return $arr;
          }
          if (empty($from_encoding)) {
              // mb_convert_encoding() rejects an empty source encoding.
              return $arr;
          }
          foreach ($arr as $key => $value) {
              if (is_array($value)) {
                  $arr[$key] = $this->array_convert_encoding($value, $to_encoding, $from_encoding);
              } else {
                  // Explicit cast: missing attributes arrive as null and are emitted as ''.
                  $arr[$key] = mb_convert_encoding((string) $value, $to_encoding, $from_encoding);
              }
          }
          return $arr;
      }

      /**
       * Rough check for whether the argument looks like an http(s) URL.
       *
       * @param string $str
       * @return boolean
       */
      private function isURL($str) {
          if (preg_match('/^http(s)?:\/\/.+/', $str)) {
              return true;
          }
          return false;
      }
  }