php网页分析 内容抓取 爬虫 文件分析
(2011-11-19 20:53:10)标签: 杂谈
转载
// Collect content-page URLs from a run of numbered index pages and save them to a file.
//
// $save_file: output file path (recreated on every run).
// $prefix:    filename prefix of the index pages, e.g. "index_" -> index_1.htm, index_2.htm, ...
// $count:     exclusive upper bound on the page number; pages 1 .. $count-1 are fetched.
//             (Was a hard-coded 68 inside the body; now a backward-compatible parameter.)
//
// Relies on the sibling helpers get_url() (fetch a page) and
// get_content_url() (extract content URLs from its HTML).
function get_index($save_file, $prefix = "index_", $count = 68)
{
    $i = 1;
    // Start from an empty file; @ suppresses the warning if unlink fails.
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open " . $save_file . " failed");
    while ($i < $count) {
        $url = $prefix . $i . ".htm";
        echo "Get " . $url . "...";
        $url_str = get_content_url(get_url($url));
        echo " OK\n";
        fwrite($fp, $url_str);
        ++$i;
    }
    fclose($fp);
}
// Fetch every unique URL listed (one per line) in $url_file and append the
// multimedia objects extracted from each page's HTML to $save_file,
// one record per line, fields joined with $split.
//
// Fixes vs. the posted version:
//  - removed a debugging leftover (echo $html_str; echo $url; exit;) that
//    terminated the entire script after the first URL, so nothing was
//    ever extracted or written;
//  - $split is now actually forwarded to get_content_object() (it was
//    accepted but silently ignored);
//  - URLs read via file() keep their trailing newline, which would break
//    both the URL-format check and the fetch — now stripped.
function get_object($url_file, $save_file, $split = "|--:**:--|")
{
    if (!file_exists($url_file)) die($url_file . " not exist");
    $file_arr = file($url_file);
    if (!is_array($file_arr) || empty($file_arr)) die($url_file . " not content");
    $url_arr = array_unique($file_arr);
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file " . $save_file . " failed");
    foreach ($url_arr as $url) {
        $url = rtrim($url, "\r\n"); // file() keeps the line terminator
        if (empty($url)) continue;
        echo "Get " . $url . "...";
        $html_str = get_url($url);
        $obj_str = get_content_object($html_str, $split);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    fclose($fp);
}
// Walk one directory (non-recursive) and append the multimedia objects
// extracted from every entry's file contents to $save_file.
//
// Fixes vs. the posted version:
//  - readdir() is compared with !== false: with the loose != a file
//    literally named "0" would end the loop early;
//  - opendir() failure is now reported instead of cascading into
//    readdir() warnings on a non-resource;
//  - the directory handle is closed (it was leaked).
function get_dir($save_file, $dir)
{
    $dp = opendir($dir) or die("Open dir " . $dir . " failed");
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file " . $save_file . " failed");
    while (($file = readdir($dp)) !== false) {
        if ($file != "." && $file != "..") {
            echo "Read file " . $file . "...";
            // NOTE(review): assumes $dir ends with a separator — the original
            // concatenated $dir . $file directly; behavior kept.
            $file_content = file_get_contents($dir . $file);
            $obj_str = get_content_object($file_content);
            echo " OK\n";
            fwrite($fp, $obj_str);
        }
    }
    closedir($dp);
    fclose($fp);
}
// Fetch the body of an http:// URL through the fopen stream wrappers and
// return it as a string. die()s when the URL is malformed, cannot be
// opened, or yields an empty body.
function get_url($url)
{
    // Accept only absolute http URLs with a non-empty host part.
    $reg = '/^http:\/\/[^\/].+$/';
    if (!preg_match($reg, $url)) die($url . " invalid");
    $fp = fopen($url, "r") or die("Open url: " . $url . " failed.");
    $content = ""; // was used uninitialized in the original (E_NOTICE)
    while ($fc = fread($fp, 8192)) {
        $content .= $fc;
    }
    fclose($fp);
    if (empty($content)) {
        die("Get url: " . $url . " content failed.");
    }
    return $content;
}
// Fetch a page using a raw socket (translated from: 使用socket获取指定网页).
// NOTE(review): this snippet is corrupted by extraction and is NOT runnable
// as-is — the request-header construction is cut off right after
// "Accept: *i" (the remaining headers, the fwrite/read loop and the closing
// of this function were lost), and the code then jumps into link-extraction
// logic that appears to belong to a separate get_content_url() helper:
// preg_match_all() uses $rex although only $reg is defined, and
// $file_contents / $host_url are never assigned in this scope.
// Preserved byte-for-byte; do not fix without recovering the original post.
function get_content_by_socket($url, $host){
$fp = fsockopen($host, 80) or die(\"Open \". $url .\" failed\"); $header = \"GET /\".$url .\" HTTP/1.1\\r\\n\"; $header .= \"Accept: *i\";
$reg = '/^(down.*?\\.html)$/i';
preg_match_all ($rex, $file_contents, $r); $result = \"\"; //array(); foreach($r as $c){
if (is_array($c)){ foreach($c as $d){
// Collect matching "down....html" links, prefixed with $host_url.
if (preg_match($reg, $d)){ $result .= $host_url . $d.\"\\n\"; } } } }
return $result; }
// Extract multimedia-file entries from page content (translated from:
// 获取指定内容中的多媒体文件). Matches href targets plus trailing text up to
// </b>, then joins href and label with $split.
// NOTE(review): the str_replace() line below is truncated by extraction —
// the replacement arguments and the ends of both statements are missing
// (only the search string \"多媒体: \" survives), so this snippet is not
// runnable as-is. Preserved byte-for-byte.
function get_content_object($str, $split=\"|--:**:--|\"){
$regx = \"/href\\s*=\\s*['\\\"]*([^>'\\\"\\s]+)[\\\"'>]*\\s*(.*?<\\/b>)/i\"; preg_match_all($regx, $str, $result);
if (count($result) == 3){
$result[2] = str_replace(\"多媒体: \ $result[2] = str_replace(\"\
$result = $result[1][0] . $split .$result[2][0] . \"\\n\"; }
return $result; } ?>
// PHP: fetch a web page and print it.
// Fixes vs. the posted version:
//  - "$page = '';" had been swallowed into the comment on the same line,
//    leaving $page uninitialized — restored as a real statement;
//  - fopen() is now checked: on failure feof(false) stays false, so the
//    original while loop would spin forever emitting warnings.
$page = '';
$handler = fopen('http://www.baidu.com', 'r') or die('Open http://www.baidu.com failed');
while (!feof($handler)) {
    $page .= fread($handler, 1048576); // read in 1 MiB chunks
}
fclose($handler);
echo $page;
?>
// 2: determine whether the page is an error page.
// Issue GET /$get to $host:80 and return the HTTP status code as a
// three-character string ("200", "404", ...), 0 on a 2-second read
// timeout, or -1 when the TCP connection cannot be established.
function getHttpStatus($host, $get = "")
{
    $fp = fsockopen($host, 80);
    if (!$fp) {
        $res = -1;
    } else {
        // Host header added: name-based virtual hosts answer with the
        // wrong site (or 400) when it is missing, even over HTTP/1.0.
        fwrite($fp, "GET /" . $get . " HTTP/1.0\r\nHost: " . $host . "\r\n\r\n");
        stream_set_timeout($fp, 2);
        $res = fread($fp, 128);
        $info = stream_get_meta_data($fp);
        fclose($fp);
        if ($info['timed_out']) {
            $res = 0;
        } else {
            // Status line is "HTTP/1.x NNN ..."; chars 9..11 hold the code.
            $res = substr($res, 9, 3);
        }
    }
    return $res;
}
// NOTE(review): usage example, truncated by extraction — the $good array
// literal, the second getHttpStatus() argument and the echo statements are
// all cut mid-string. Per the result notes below, the intent is: treat
// status codes listed in $good as normal (print 正常 / "OK"), otherwise
// print the returned code. Preserved byte-for-byte.
$good=array(\"200\
if(in_array(getHttpStatus(\"5y.nuc.edu.cn\echo \"正常\"; } else {
echo getHttpStatus(\"5y.nuc.edu.cn\}
if(in_array(getHttpStatus(\"5y.nuc.edu.cn\echo \"正常\"; } else {
echo getHttpStatus(\"5y.nuc.edu.cn\} ?> 返回
第一个返回\"正常\"
第二个不存在返回\"404\"
// Fetch a page and report the server's HTTP status code.
// Returns the three-character code string ("200", "404", ...), 0 when the
// read times out after 2 seconds, or -1 when the connection cannot be made.
function getHttpStatus($host, $get = "")
{
    $conn = fsockopen($host, 80);
    if (!$conn) {
        return -1;
    }
    fwrite($conn, "GET /" . $get . " HTTP/1.0\r\n\r\n");
    stream_set_timeout($conn, 2);
    $head = fread($conn, 128);
    $meta = stream_get_meta_data($conn);
    fclose($conn);
    if ($meta['timed_out']) {
        return 0;
    }
    // Status line is "HTTP/1.x NNN ..."; characters 9..11 are the code.
    return substr($head, 9, 3);
}
// NOTE(review): truncated example calls — the path argument and closing
// quote of each getHttpStatus() call were lost in extraction; the lines
// after 返回 ("returns") are the blog's result legend:
// -1 cannot connect, 0 timeout, 200 OK, 302 Found, 404 not found.
// Preserved byte-for-byte.
echo getHttpStatus(\"5y.nuc.edu.cn\echo
getHttpStatus(\"community.csdn.net\\"); 返回
1 无法连接服务器 0 超时
200 OK (成功返回) 302 Found (找到) 404 没有找到 ... //遍历所有网页 ($type指定类型)
// Recursively walk $path and push into the global array $p every file whose
// extension is in $type (translated from the trailing comment above:
// 遍历所有网页, $type指定类型 — "walk all pages, $type selects extensions").
// NOTE(review): truncated by extraction — the $type parameter (presumably a
// default array of page extensions) and the opening brace are missing from
// the signature, and both the recursive call and the explode() expression
// computing $ex are cut mid-statement. Preserved byte-for-byte.
function getAllPage($path=\"./\global $p;
if ($handle = opendir($path)) {
while (false !== ($file = readdir($handle))) { if(is_dir($file) && $file!=\".\" && $file!=\"..\") { getAllPage($path.$file.\"/\ } else {
$ex=array_pop(explode(\".\
if(in_array(strtolower($ex),$type)) { array_push($p, $path.$file); } } }
closedir($handle); } }
// Driver: seed the global result array, walk the current directory, then
// dump every collected path.
// NOTE(review): the echo arguments wrapping print_r() were mangled by
// extraction into bare multi-line string literals — presumably HTML markup
// (e.g. <pre> tags) in the original post; confirm before reuse.
$p=array();
getAllPage(\"./\"); echo \"
\"; print_r($p); echo \"
\"; ?>
// Extract all URLs from page content (translated from: 抓取页面内容中所有URL).
// NOTE(review): both live preg_match_all() calls below are truncated by
// extraction — the matches argument (apparently $m, judging by the
// print_r($m[1])) and the closing parenthesis are missing, and the final
// example is cut off mid-call. The commented-out pattern is the blog's
// alternative for capturing URLs of every type, and the last snippet
// (per the surrounding text) matches .php links carrying GET parameters.
// Preserved byte-for-byte.
$str='href=\"http://blog.csdn.net/love01px/archive/2006/05/30/2.php\" Title =\"permalink\">\"permalink\">2006年05月30日 15:13:00 | href=\"http://blog.csdn.net/love01px/archive/2006/05/30/6.php?id=1\" Title = \"comments, pingbacks, trackbacks\">评论 (0)';
preg_match_all(\"/href=\\\"([^\\\"]*\\.*php[^\\\"]*)\\\"/si\//换用下面这个获取所有类型URL
//preg_match_all(\"/href=\\\"([^\\\"]*)\\\"/si\
print_r($m[1]); ?>
包含在链接里带get参数的地址 $str=file_get_contents(\"http://www.php.net\");
preg_match_all(\"/href=\\\"([^\\\"]*\\.*php\\?[^\\\"]*)\\\"/si\pr