php网页分析 内容抓取 爬虫 文件分析
(2011-11-19 20:53:10)标签: 杂谈
转载
// Collect content-page URLs from a run of numbered index pages and save them to a file.
//
// $save_file: output file path (recreated on every run).
// $prefix:    filename prefix of the index pages, e.g. "index_" -> index_1.htm, index_2.htm, ...
// $count:     exclusive upper bound on the page number; pages 1 .. $count-1 are fetched.
//             (Was a hard-coded 68 inside the body; now a backward-compatible parameter.)
//
// Relies on the sibling helpers get_url() (fetch a page) and
// get_content_url() (extract content URLs from its HTML).
function get_index($save_file, $prefix = "index_", $count = 68)
{
    $i = 1;
    // Start from an empty file; @ suppresses the warning if unlink fails.
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open " . $save_file . " failed");
    while ($i < $count) {
        $url = $prefix . $i . ".htm";
        echo "Get " . $url . "...";
        $url_str = get_content_url(get_url($url));
        echo " OK\n";
        fwrite($fp, $url_str);
        ++$i;
    }
    fclose($fp);
}
// Fetch every unique URL listed (one per line) in $url_file and append the
// multimedia objects extracted from each page's HTML to $save_file,
// one record per line, fields joined with $split.
//
// Fixes vs. the posted version:
//  - removed a debugging leftover (echo $html_str; echo $url; exit;) that
//    terminated the entire script after the first URL, so nothing was
//    ever extracted or written;
//  - $split is now actually forwarded to get_content_object() (it was
//    accepted but silently ignored);
//  - URLs read via file() keep their trailing newline, which would break
//    both the URL-format check and the fetch — now stripped.
function get_object($url_file, $save_file, $split = "|--:**:--|")
{
    if (!file_exists($url_file)) die($url_file . " not exist");
    $file_arr = file($url_file);
    if (!is_array($file_arr) || empty($file_arr)) die($url_file . " not content");
    $url_arr = array_unique($file_arr);
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file " . $save_file . " failed");
    foreach ($url_arr as $url) {
        $url = rtrim($url, "\r\n"); // file() keeps the line terminator
        if (empty($url)) continue;
        echo "Get " . $url . "...";
        $html_str = get_url($url);
        $obj_str = get_content_object($html_str, $split);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    fclose($fp);
}
// Walk one directory (non-recursive) and append the multimedia objects
// extracted from every entry's file contents to $save_file.
//
// Fixes vs. the posted version:
//  - readdir() is compared with !== false: with the loose != a file
//    literally named "0" would end the loop early;
//  - opendir() failure is now reported instead of cascading into
//    readdir() warnings on a non-resource;
//  - the directory handle is closed (it was leaked).
function get_dir($save_file, $dir)
{
    $dp = opendir($dir) or die("Open dir " . $dir . " failed");
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file " . $save_file . " failed");
    while (($file = readdir($dp)) !== false) {
        if ($file != "." && $file != "..") {
            echo "Read file " . $file . "...";
            // NOTE(review): assumes $dir ends with a separator — the original
            // concatenated $dir . $file directly; behavior kept.
            $file_content = file_get_contents($dir . $file);
            $obj_str = get_content_object($file_content);
            echo " OK\n";
            fwrite($fp, $obj_str);
        }
    }
    closedir($dp);
    fclose($fp);
}
// Fetch the body of an http:// URL through the fopen stream wrappers and
// return it as a string. die()s when the URL is malformed, cannot be
// opened, or yields an empty body.
function get_url($url)
{
    // Accept only absolute http URLs with a non-empty host part.
    $reg = '/^http:\/\/[^\/].+$/';
    if (!preg_match($reg, $url)) die($url . " invalid");
    $fp = fopen($url, "r") or die("Open url: " . $url . " failed.");
    $content = ""; // was used uninitialized in the original (E_NOTICE)
    while ($fc = fread($fp, 8192)) {
        $content .= $fc;
    }
    fclose($fp);
    if (empty($content)) {
        die("Get url: " . $url . " content failed.");
    }
    return $content;
}
// Fetch a page using a raw socket (translated from: 使用socket获取指定网页).
// NOTE(review): this snippet is corrupted by extraction and is NOT runnable
// as-is — the request-header construction is cut off right after
// "Accept: *i" (the remaining headers, the fwrite/read loop and the closing
// of this function were lost), and the code then jumps into link-extraction
// logic that appears to belong to a separate get_content_url() helper:
// preg_match_all() uses $rex although only $reg is defined, and
// $file_contents / $host_url are never assigned in this scope.
// Preserved byte-for-byte; do not fix without recovering the original post.
function get_content_by_socket($url, $host){
$fp = fsockopen($host, 80) or die(\"Open \". $url .\" failed\"); $header = \"GET /\".$url .\" HTTP/1.1\\r\\n\"; $header .= \"Accept: *i\";
$reg = '/^(down.*?\\.html)$/i';
preg_match_all ($rex, $file_contents, $r); $result = \"\"; //array(); foreach($r as $c){
if (is_array($c)){ foreach($c as $d){
// Collect matching "down....html" links, prefixed with $host_url.
if (preg_match($reg, $d)){ $result .= $host_url . $d.\"\\n\"; } } } }
return $result; }
// Extract multimedia-file entries from page content (translated from:
// 获取指定内容中的多媒体文件). Matches href targets plus trailing text up to
// </b>, then joins href and label with $split.
// NOTE(review): the str_replace() line below is truncated by extraction —
// the replacement arguments and the ends of both statements are missing
// (only the search string \"多媒体: \" survives), so this snippet is not
// runnable as-is. Preserved byte-for-byte.
function get_content_object($str, $split=\"|--:**:--|\"){
$regx = \"/href\\s*=\\s*['\\\"]*([^>'\\\"\\s]+)[\\\"'>]*\\s*(.*?<\\/b>)/i\"; preg_match_all($regx, $str, $result);
if (count($result) == 3){
$result[2] = str_replace(\"多媒体: \ $result[2] = str_replace(\"\
$result = $result[1][0] . $split .$result[2][0] . \"\\n\"; }
return $result; } ?>
// PHP: fetch a web page and print it.
// Fixes vs. the posted version:
//  - "$page = '';" had been swallowed into the comment on the same line,
//    leaving $page uninitialized — restored as a real statement;
//  - fopen() is now checked: on failure feof(false) stays false, so the
//    original while loop would spin forever emitting warnings.
$page = '';
$handler = fopen('http://www.baidu.com', 'r') or die('Open http://www.baidu.com failed');
while (!feof($handler)) {
    $page .= fread($handler, 1048576); // read in 1 MiB chunks
}
fclose($handler);
echo $page;
?>
// 2: determine whether the page is an error page.
// Issue GET /$get to $host:80 and return the HTTP status code as a
// three-character string ("200", "404", ...), 0 on a 2-second read
// timeout, or -1 when the TCP connection cannot be established.
function getHttpStatus($host, $get = "")
{
    $fp = fsockopen($host, 80);
    if (!$fp) {
        $res = -1;
    } else {
        // Host header added: name-based virtual hosts answer with the
        // wrong site (or 400) when it is missing, even over HTTP/1.0.
        fwrite($fp, "GET /" . $get . " HTTP/1.0\r\nHost: " . $host . "\r\n\r\n");
        stream_set_timeout($fp, 2);
        $res = fread($fp, 128);
        $info = stream_get_meta_data($fp);
        fclose($fp);
        if ($info['timed_out']) {
            $res = 0;
        } else {
            // Status line is "HTTP/1.x NNN ..."; chars 9..11 hold the code.
            $res = substr($res, 9, 3);
        }
    }
    return $res;
}
// NOTE(review): usage example, truncated by extraction — the $good array
// literal, the second getHttpStatus() argument and the echo statements are
// all cut mid-string. Per the result notes below, the intent is: treat
// status codes listed in $good as normal (print 正常 / "OK"), otherwise
// print the returned code. Preserved byte-for-byte.
$good=array(\"200\
if(in_array(getHttpStatus(\"5y.nuc.edu.cn\echo \"正常\"; } else {
echo getHttpStatus(\"5y.nuc.edu.cn\}
if(in_array(getHttpStatus(\"5y.nuc.edu.cn\echo \"正常\"; } else {
echo getHttpStatus(\"5y.nuc.edu.cn\} ?> 返回
第一个返回\"正常\"
第二个不存在返回\"404\"
// Fetch a page and report the server's HTTP status code.
// Returns the three-character code string ("200", "404", ...), 0 when the
// read times out after 2 seconds, or -1 when the connection cannot be made.
function getHttpStatus($host, $get = "")
{
    $conn = fsockopen($host, 80);
    if (!$conn) {
        return -1;
    }
    fwrite($conn, "GET /" . $get . " HTTP/1.0\r\n\r\n");
    stream_set_timeout($conn, 2);
    $head = fread($conn, 128);
    $meta = stream_get_meta_data($conn);
    fclose($conn);
    if ($meta['timed_out']) {
        return 0;
    }
    // Status line is "HTTP/1.x NNN ..."; characters 9..11 are the code.
    return substr($head, 9, 3);
}
// NOTE(review): truncated example calls — the path argument and closing
// quote of each getHttpStatus() call were lost in extraction; the lines
// after 返回 ("returns") are the blog's result legend:
// -1 cannot connect, 0 timeout, 200 OK, 302 Found, 404 not found.
// Preserved byte-for-byte.
echo getHttpStatus(\"5y.nuc.edu.cn\echo
getHttpStatus(\"community.csdn.net\\"); 返回
1 无法连接服务器 0 超时
200 OK (成功返回) 302 Found (找到) 404 没有找到 ... //遍历所有网页 ($type指定类型)
// Recursively walk $path and push into the global array $p every file whose
// extension is in $type (translated from the trailing comment above:
// 遍历所有网页, $type指定类型 — "walk all pages, $type selects extensions").
// NOTE(review): truncated by extraction — the $type parameter (presumably a
// default array of page extensions) and the opening brace are missing from
// the signature, and both the recursive call and the explode() expression
// computing $ex are cut mid-statement. Preserved byte-for-byte.
function getAllPage($path=\"./\global $p;
if ($handle = opendir($path)) {
while (false !== ($file = readdir($handle))) { if(is_dir($file) && $file!=\".\" && $file!=\"..\") { getAllPage($path.$file.\"/\ } else {
$ex=array_pop(explode(\".\
if(in_array(strtolower($ex),$type)) { array_push($p, $path.$file); } } }
closedir($handle); } }
// Driver: seed the global result array, walk the current directory, then
// dump every collected path.
// NOTE(review): the echo arguments wrapping print_r() were mangled by
// extraction into bare multi-line string literals — presumably HTML markup
// (e.g. <pre> tags) in the original post; confirm before reuse.
$p=array();
getAllPage(\"./\"); echo \"
\"; print_r($p); echo \"
\"; ?>
// Extract all URLs from page content (translated from: 抓取页面内容中所有URL).
// NOTE(review): both live preg_match_all() calls below are truncated by
// extraction — the matches argument (apparently $m, judging by the
// print_r($m[1])) and the closing parenthesis are missing, and the final
// example is cut off mid-call. The commented-out pattern is the blog's
// alternative for capturing URLs of every type, and the last snippet
// (per the surrounding text) matches .php links carrying GET parameters.
// Preserved byte-for-byte.
$str='href=\"http://blog.csdn.net/love01px/archive/2006/05/30/2.php\" Title =\"permalink\">\"permalink\">2006年05月30日 15:13:00 | href=\"http://blog.csdn.net/love01px/archive/2006/05/30/6.php?id=1\" Title = \"comments, pingbacks, trackbacks\">评论 (0)';
preg_match_all(\"/href=\\\"([^\\\"]*\\.*php[^\\\"]*)\\\"/si\//换用下面这个获取所有类型URL
//preg_match_all(\"/href=\\\"([^\\\"]*)\\\"/si\
print_r($m[1]); ?>
包含在链接里带get参数的地址 $str=file_get_contents(\"http://www.php.net\");
preg_match_all(\"/href=\\\"([^\\\"]*\\.*php\\?[^\\\"]*)\\\"/si\pr