PHP可以灵活配置使用的采集器
PHP如何实现可以灵活配置使用的采集器?就跟随百分网小编一起去了解下吧。想了解更多相关信息,请持续关注我们应届毕业生考试网!
代码:
<?php
/**
* 可以灵活配置使用的采集器
* 作者:Rain
* 创建时间:2015-02-03 15:17:30
* 版本信息:V1.0
*/
// ---------------------------------------------------------------------------
// Configuration. Everything below is read by the Gather class further down.
// ---------------------------------------------------------------------------

// Database connection settings: adjust to match the target MySQL server.
define('DB_HOST', 'localhost');
define('DB_USER', 'root');
define('DB_PWD', 'test123456');
define('DB_NAME', 'test_dbname');
// NOTE(review): presumably the connection character set — confirm where it is consumed.
define('DB_CHARSET', 'utf8');
// Table that receives one row per scraped content page.
define('TABLE_NAME', 'tb_book');

// Settings describing the site being scraped.
// Character set of the remote pages; downloaded pages are converted from it to UTF-8.
define('WEB_CHARSET', 'gbk');
// List-page URL template; %d is replaced with the page number through sprintf().
// NOTE(review): this value has no scheme/host and looks truncated — confirm the full URL.
define('WEB_LIST_URL', '/pic/book/1_%d.htm');
// Last list page to fetch (inclusive).
define('PAGE_COUNT', 14);
// First list page to fetch.
define('PAGE_START', 1);
// Regex that picks content-page URLs out of a list page, for example: /\/xuefu2008\/article\/details\/(\d)+/i
define('WEB_CONTENT_URL_REG', '/\/book\/(\d)+\.htm/i');
// Scheme and host prepended to relative URLs, without a trailing "/".
// NOTE(review): the original value was lost (the literal was left unterminated);
// this is a placeholder and must be replaced with the real site host.
define('WEB_HOST', 'http://www.example.com');
// Regex that narrows a list page down to the region holding the content links;
// capture group 1 is used when present, otherwise the whole match.
define('WEB_LIST_POSTION', '/book_name\.gif(.*?)<td\swidth="15\%"\snowrap>/i');

// Fine-tuning switches.
// Seconds to pause after each content page.
define('SLEEP_TIME', 1);
// When true, the generated SQL and every extracted field value are printed.
define('IS_DEBUG', false);
// When false, rows are prepared and bound but never executed.
define('INSERT_DB', true);
// NOTE(review): not referenced in the visible code; presumably the output pacing in seconds — confirm.
define('OUTPUT_SPEED', 1);

// Text fragments removed from (or replaced in) every field value; matching is case-insensitive.
$text_filter = array(
    '- 中华电脑书库' => '',
    '_电脑电子书' => '',
    '_电脑书籍' => '',
    '下载' => '',
);

// Column mapping: table column => regex whose first capture group yields the value,
// or a constant stored as-is. A value that starts with "/" is treated as a regex.
$table_mapping = array(
    'size' => '/软件大小.*?000000>(.*?)<\/font>/i',
    // NOTE(review): meant as a constant image URL, but the scheme/host looks stripped;
    // because it starts with "/" it is currently interpreted as a regex — confirm the full URL.
    'logo' => '/pic/index/uploads/images/20150105/0b8461910de101cc51a07684cdab797e.jpg',
    'field1' => '/<title>(.*?)<\/title>/i',
    'field2' => '/软件简介.*?000000>(.*?)<\/font>/i',
    'field3' => '1',
    'field4' => '1',
    'field5' => '1',
    'field6' => '电子书,计算机,图像,图形',
    'platform' => 'window/Linux',
    'ishot' => '1',
    'agreement' => '免费',
    'downurl' => '/(\/down\.asp\?id=.*?)"/i',
    'istop' => '1',
);
// ---------------------------------------------------------------------------
// Entry point: creating the gatherer runs its start-up checks, then run() starts the crawl.
// ---------------------------------------------------------------------------
$ga = new Gather();
$ga->run();
class Gather
{
/**
 * Run the start-up checks as soon as the gatherer is created.
 */
public function __construct()
{
    $this->init_check();
}
/**
 * Crawl every configured list page, extract the mapped fields from each
 * content page it links to, and insert one row per content page.
 *
 * Reads the global configuration arrays $table_mapping (column => regex or
 * constant) and $text_filter (search => replacement).
 *
 * NOTE(review): write() is defined outside this block; its second argument
 * is presumably a "terminate after printing" flag — confirm.
 */
public function run()
{
    global $table_mapping, $text_filter;

    if (!is_array($table_mapping) || empty($table_mapping))
    {
        $this->write('字段映射配置为空,无法采集', true);
        return;
    }

    // The statement text never changes, so one connection and one prepared
    // statement serve the whole crawl.
    $columns = array_keys($table_mapping);
    $sql = "INSERT INTO ".TABLE_NAME."(".implode(', ', $columns).")VALUES(:".implode(',:', $columns).")";
    if (IS_DEBUG)
        $this->write('执行SQL '.$sql);
    try {
        $dbh = new PDO('mysql:dbname='.DB_NAME.';host='.DB_HOST, DB_USER, DB_PWD);
        $dbh->query("set names '".DB_CHARSET."'");
        $sth = $dbh->prepare($sql);
    } catch (PDOException $e) {
        $this->write('Connection failed: ' . $e->getMessage(), true);
        // Without a connection nothing below can work.
        return;
    }
    if (!$sth)
    {
        $this->write('预处理SQL失败: '.$sql, true);
        return;
    }

    for ($page = PAGE_START; $page <= PAGE_COUNT; $page++)
    {
        $this->write('开始采集列表第'.$page.'页的内容...');
        $list_content = $this->get(sprintf(WEB_LIST_URL, $page));
        if (empty($list_content))
        {
            $this->write('抓取的列表页的内容为空,所以过滤掉');
            continue;
        }
        $list_content = str_replace(array("\r", "\n"), '', $list_content);

        // Narrow the page down to the region that holds the content links.
        if (!preg_match(WEB_LIST_POSTION, $list_content, $list_search))
        {
            $this->write('精准匹配列表页的内容失败,所以过滤掉');
            continue;
        }
        $list_content = isset($list_search[1]) ? $list_search[1] : $list_search[0];

        // Collect the content-page URLs found inside that region.
        preg_match_all(WEB_CONTENT_URL_REG, $list_content, $match);
        if (empty($match[0]) || !is_array($match[0]))
        {
            $this->write('列表页面没有抓取到内容,所以过滤掉');
            continue;
        }
        $this->write('当前的列表页面,总共匹配到:'.count($match[0]).'个内容页');

        foreach ($match[0] as $content_url)
        {
            $web_content = $this->get($this->resolve_url($content_url));
            if (empty($web_content))
            {
                $this->write('抓取的内容页为空,所以过滤掉');
                continue;
            }
            // Line feeds become a placeholder so that the mapping regexes, which
            // have no "s" modifier, can match across lines; each field turns the
            // placeholder back into "\r\n".
            $web_content = str_replace("\r", '', $web_content);
            $web_content = str_replace("\n", '【】', $web_content);

            $complete = true;
            foreach ($table_mapping as $field => $reg)
            {
                $value = $this->extract_field($field, $reg, $web_content, $text_filter);
                if ($value === null)
                {
                    $this->write('对不起,匹配字段:'.$field.'失败,过滤此记录');
                    $complete = false;
                    break;
                }
                $sth->bindValue(':'.$field, $value);
            }
            // A record with any unmatched field is dropped, without pausing.
            if (!$complete)
                continue;

            if (INSERT_DB)
            {
                try {
                    if (!$sth->execute())
                    {
                        $error_info = $sth->errorInfo();
                        $this->write('写入数据库失败: '.(isset($error_info[2]) ? $error_info[2] : ''));
                    }
                } catch (PDOException $e) {
                    $this->write('写入数据库失败: '.$e->getMessage());
                }
            }
            $sth->closeCursor();
            $this->write( '休息,暂停'.SLEEP_TIME.'秒后继续抓取...');
            sleep(SLEEP_TIME);
        }
    }
    $this->write('', true);
}

/**
 * Produce the final value for one table column from a content page.
 *
 * @param string $field       Column name (only used for the debug output and the downurl rule).
 * @param mixed  $reg         Regex whose first capture group yields the value, or a constant.
 * @param string $web_content Content page with line feeds replaced by the "【】" placeholder.
 * @param mixed  $text_filter Search => replacement map applied case-insensitively.
 * @return string|null Trimmed value, or null when the regex does not match.
 */
private function extract_field($field, $reg, $web_content, $text_filter)
{
    if (!$this->is_regex_mapping($reg))
    {
        $value = $reg;
    }
    else
    {
        if (!preg_match($reg, $web_content, $found))
            return null;
        $value = isset($found[1]) ? $found[1] : $found[0];
        $value = $this->closetags($value);
        // Drop scripts entirely.
        $value = preg_replace('/<script\b(.*?)>(.*?)<\/script>/i', '', $value);
        // Keep link text but remove the anchor itself.
        $value = preg_replace('/<a\b(.*?)>(.*?)<\/a>/i', '${2}', $value);
        // Make image sources absolute; a callback rewrites each occurrence once,
        // so an image that appears twice is not prefixed twice.
        $value = preg_replace_callback('/(<img\b[^>]*?\bsrc\s*=\s*["\']+)([^"\']*)/i', array($this, 'rewrite_img_src'), $value);
        // Tag code blocks for the syntax highlighter.
        $value = preg_replace('/<pre\b.*?>(.*?)<\/pre>/i', '<pre class="prettyprint">${1}</pre>', $value);
    }

    // Restore the line breaks that were swapped for the placeholder.
    $value = str_replace('【】', "\r\n", $value);

    if (is_array($text_filter) && !empty($text_filter))
        $value = str_ireplace(array_keys($text_filter), array_values($text_filter), $value);

    if (IS_DEBUG)
        $this->write('*'."\t".'字段:'.$field.' 值:'."\n****************************************************\n".$value."\n****************************************************");

    if ('downurl' == $field)
        $value = $this->resolve_url($value);

    return trim($value);
}

/**
 * Tell whether a mapping value is a "/"-delimited regex rather than a constant.
 *
 * Requiring a closing "/" followed only by modifier letters keeps constants
 * that merely start with "/" (such as a path) from being run as patterns.
 */
private function is_regex_mapping($value)
{
    return is_string($value) && preg_match('#^/.+/[imsxuADSUXJ]*$#s', $value) === 1;
}

/**
 * Turn a site-relative URL into an absolute one by prefixing WEB_HOST.
 *
 * URLs that already carry a scheme ("http:", "https:", "data:", ...) or are
 * protocol-relative ("//host/...") are returned unchanged apart from trimming.
 */
private function resolve_url($url)
{
    $url = trim($url);
    if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $url))
        return $url;
    if (substr($url, 0, 1) != '/')
        $url = '/'.$url;
    return WEB_HOST.$url;
}

/**
 * preg_replace_callback handler: $hit[1] is the text up to and including the
 * opening quote of the src attribute, $hit[2] is the src value itself.
 */
private function rewrite_img_src($hit)
{
    if ($hit[2] === '')
        return $hit[0];
    return $hit[1].$this->resolve_url($hit[2]);
}
/**
 * Append closing tags for elements that an HTML fragment opens but never closes.
 *
 * Missing closers are appended at the end of the fragment, innermost element
 * first. Tag names are compared case-insensitively and may contain digits
 * (h1-h6). Void elements and self-closed tags ("<br/>") are never closed.
 *
 * @param string $html HTML fragment, possibly cut off in the middle of an element.
 * @return string The fragment with the missing closing tags appended.
 */
protected function closetags($html)
{
    // Elements that never take a closing tag.
    $arr_single_tags = array(
        'meta', 'img', 'br', 'link', 'area', 'hr', 'input',
        'base', 'col', 'embed', 'param', 'source', 'track', 'wbr',
    );

    // Opening tags; the look-behind skips self-closed ones such as "<br/>".
    preg_match_all('#<([a-z][a-z0-9]*)(?: .*)?(?<![/|/ ])>#iU', $html, $result);
    $openedtags = array_map('strtolower', $result[1]);

    // Closing tags.
    preg_match_all('#</([a-z][a-z0-9]*)>#iU', $html, $result);
    $closedtags = array_map('strtolower', $result[1]);

    // Same number of openers and closers: treat the fragment as balanced.
    if (count($closedtags) == count($openedtags)) {
        return $html;
    }

    // Walk from the innermost opener outwards so closers nest correctly.
    foreach (array_reverse($openedtags) as $tag) {
        if (in_array($tag, $arr_single_tags, true)) {
            continue;
        }
        $pos = array_search($tag, $closedtags, true);
        if ($pos === false) {
            // No closer left for this opener: supply one.
            $html .= '</' . $tag . '>';
        } else {
            // Consume the closer so it is not matched against another opener.
            unset($closedtags[$pos]);
        }
    }
    return $html;
}
/**
 * Start-up checks: cURL availability first, then the MySQL connection check,
 * followed by a message that initialisation passed.
 */
protected function init_check()
{
    $curl_available = $this->check_curl_support();
    if (!$curl_available) {
        $this->write('对不起,请先开启CURL的类库的支持,否则无法执行', true);
    }

    $this->check_mysql_connect();

    $this->write('程序初始化检查通过,执行后续的流程...');
}
/**
 * Download a URL with cURL and return its body converted to UTF-8.
 *
 * @param string $url  Address to fetch.
 * @param array  $data Extra HTTP request header lines, passed to CURLOPT_HTTPHEADER.
 * @return string|false The page body in UTF-8, or false when the transfer
 *                      or the charset conversion fails.
 */
private function get($url, $data = array())
{
    $this->write('开始执行抓取: '.$url);
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $data);
    // Bound the wait so one unresponsive server cannot stall the whole crawl.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $ret = curl_exec($ch);
    $error = curl_error($ch);
    curl_close($ch);
    unset($ch);
    if ($ret === false || !empty($error))
    {
        $this->write('程序抓取URL: '.$url.'发生错误,错误信息: '.$error);
        return false;
    }
    // "//IGNORE" drops byte sequences that are invalid in the source charset
    // instead of failing the conversion of the whole page.
    if (strcasecmp(WEB_CHARSET, 'utf-8') != 0)
        $ret = iconv(WEB_CHARSET, 'utf-8//IGNORE', $ret);
    return $ret;
}
/pic/p>
private function check_mysql_connect()
【PHP可以灵活配置使用的采集器】相关文章:
php学习之php配置 03-11
PHP基础配置 10-29
PHP安装与配置 11-22
PHP socket的配置及实例 11-22
如何配置php环境 11-21
php环境怎么配置 12-25
PHP配置文件详解php.ini 03-17
PHP环境搭建与配置的方法 01-25
如何正确配置 Nginx + PHP 03-01