PHP利用simple_html_dom抓取知乎图片。[多图慎入]
17-02-19 22:49
字数 6922
阅读 3844
深夜福利贴,翻电脑时无意看到以前写的一个小玩意,bug多多,慎用。
// Remove the script execution time limit (0 means "no limit") because the
// crawl below keeps requesting pages in a loop and may run for a long time.
set_time_limit(0);
/**
 * Downloads the images found in the answers of a Zhihu question.
 *
 * Workflow (see run()):
 *   1. fetch the question's index page once and cache it in $file_name;
 *   2. page through the "QuestionAnswerListV2" endpoint, parse every returned
 *      HTML fragment with simple_html_dom and save each <img> to ./img/;
 *   3. append everything that was echoed to ./runtime/runtime_<YmdH>.log.
 *
 * Requires simple_html_dom.php to be resolvable by require_once, the cURL
 * extension, and allow_url_fopen for downloading the image bytes.
 */
class get_img_from_url
{
    /** @var object|string simple_html_dom parser instance ('' until constructed). */
    public $simple_html_class = '';

    /** @var string Path of the file used to cache the question's index page. */
    public $file_name = '';

    /** @var string Separator appended to every echoed/logged line. */
    public $new_line_char = "\r\n";

    /** @var string Log lines collected since the last flushRuntimeLog() call. */
    private $runtime_log = '';

    /**
     * @param string $file_name Cache file for the index page; a random
     *                          "<n>_resource.text" name is used when empty.
     */
    public function __construct($file_name = '')
    {
        // require_once: a plain require would redeclare the library's classes
        // (fatal error) as soon as a second instance is created.
        require_once "simple_html_dom.php";
        $this->simple_html_class = new simple_html_dom();
        if (!empty($file_name)) {
            $this->file_name = $file_name;
        } else {
            $this->file_name = rand(0, 99999) . '_resource.text';
        }
        $this->new_line_char = $this->getNewLineChar();
    }

    /**
     * Releases the parser's node tree, if a parser was ever created.
     */
    public function __destruct()
    {
        // The property defaults to '' and only becomes an object in the
        // constructor, so guard the call instead of assuming it succeeded.
        if (is_object($this->simple_html_class) && method_exists($this->simple_html_class, 'clear')) {
            $this->simple_html_class->clear();
        }
    }

    /**
     * Performs an HTTP request with cURL and returns the response body.
     *
     * @param string $url    Target URL.
     * @param array  $params Query parameters (GET) or form fields (POST).
     * @param bool   $encode GET only: when false the final URL is passed
     *                       through urldecode() before the request.
     * @param string $method 'get' for a GET request; anything else sends a POST.
     *
     * @return string|false Response body, or false when curl_exec() fails.
     */
    public function curl_method($url, $params = array(), $encode = true, $method = 'post')
    {
        $ch = curl_init();
        if ($method == 'get') {
            if (!empty($params)) {
                // Join with '?' or '&' depending on what the URL already has;
                // a URL that already ends in a separator needs none.
                $last_char = substr($url, -1);
                if ($last_char === '?' || $last_char === '&') {
                    $link_char = '';
                } else {
                    $link_char = strpos($url, '?') === false ? '?' : '&';
                }
                $url = $url . $link_char . http_build_query($params);
            }
            $url = $encode ? $url : urldecode($url);
            curl_setopt($ch, CURLOPT_URL, $url);
        } else {
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
        }
        // Only treat the URL as HTTPS when that is its scheme, not when the
        // text "https" merely appears somewhere inside it.
        if (stripos($url, 'https://') === 0) {
            // NOTE(review): certificate and host verification are switched
            // off, as in the original script. This allows man-in-the-middle
            // attacks; enable verification if a CA bundle is available.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        }
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $resp = curl_exec($ch);
        curl_close($ch);
        return $resp;
    }

    /**
     * Returns the line separator suited to the current SAPI.
     *
     * @return string "\n" on the command line, "<br>" otherwise.
     */
    public function getNewLineChar()
    {
        return PHP_SAPI == 'cli' ? "\n" : "<br>";
    }

    /**
     * Downloads one image and stores it on disk.
     *
     * @param string $url      Image URL.
     * @param string $filename Destination path. When empty, a unique
     *                         "img/<YmdHis><random><ext>" name is generated and
     *                         only .gif/.jpg/.jpeg/.png URLs are accepted.
     *
     * @return string|false Path of the saved file, or false when the URL is
     *                      empty or has an unsupported extension, the download
     *                      fails, the payload is smaller than 5 KB, or the file
     *                      cannot be written.
     */
    public function grabImage($url, $filename = "")
    {
        if ($url == "") {
            return false;
        }

        if ($filename == "") {
            // Read the extension from the URL *path* so a query string such as
            // "a.jpg?size=b" does not hide it; compare case-insensitively.
            $path = parse_url($url, PHP_URL_PATH);
            $ext = is_string($path) ? strtolower((string) strrchr($path, '.')) : '';
            if (!in_array($ext, ['.gif', '.jpg', '.png', '.jpeg'], true)) {
                return false;
            }
            if (!is_dir('img') && !mkdir('img', 0777, true) && !is_dir('img')) {
                return false;
            }
            // Several images are saved within the same second, so retry until
            // the generated name is unused rather than touching an older file.
            do {
                $filename = 'img/' . date("YmdHis") . mt_rand(1, 99999) . $ext;
            } while (file_exists($filename));
        }

        $img = file_get_contents($url);
        if ($img === false) {
            return false;
        }
        // Anything below 5 KB is treated as an icon/placeholder and skipped.
        if (strlen($img) < 5120) {
            return false;
        }
        // Overwrite instead of append: appending to an existing file would
        // concatenate two images into one corrupt file.
        if (file_put_contents($filename, $img) === false) {
            return false;
        }
        return $filename;
    }

    /**
     * Extracts the question id from a question URL.
     *
     * The digits following "/question/" are preferred; otherwise the last
     * non-empty path segment is used.
     *
     * @param string $url Question URL.
     *
     * @return string|int The id as a string, or 0 when none can be found.
     */
    public function getQuestionId($url)
    {
        $path = parse_url((string) $url, PHP_URL_PATH);
        if (!is_string($path) || $path === '') {
            return 0;
        }
        // ".../question/123/answer/456" must yield 123, not the answer id.
        if (preg_match('#/question/(\d+)#', $path, $matches)) {
            return $matches[1];
        }
        $trimmed = rtrim($path, '/');
        if ($trimmed === '') {
            return 0;
        }
        $segments = explode('/', $trimmed);
        $last = end($segments);
        return $last !== '' ? $last : 0;
    }

    /**
     * Entry point: caches the index page, crawls all following answer pages,
     * writes the runtime log and terminates the script.
     *
     * Images embedded in the index page itself are not downloaded; only the
     * paginated answers are processed (that part was disabled by the author).
     *
     * This method never returns: it calls exit, both when $url is empty and
     * after a completed crawl.
     *
     * @param string $url Question URL.
     */
    public function run($url = '')
    {
        if (empty($url)) {
            echo 'dismiss url!' . $this->new_line_char;
            exit;
        }

        if (!file_exists($this->file_name)) {
            $result = $this->curl_method($url, [], false, 'get');
            if ($result === false || $result === '') {
                // Do not cache a failed download: an empty cache file would be
                // picked up as a valid page on every later run.
                $this->logLine("fetch index page failed");
            } else {
                file_put_contents($this->file_name, $result);
                $this->logLine("append success~");
            }
        } else {
            $this->logLine("find file successfully....");
        }

        if (file_exists($this->file_name)) {
            // NOTE(review): simple_html_dom::load_file() is assumed to return
            // false on failure only - confirm against the bundled version.
            $rs = $this->simple_html_class->load_file($this->file_name);
            if ($rs !== false) {
                $this->logLine("load file successfully....");
            } else {
                $this->logLine("load file failed....");
            }
        }

        $this->logLine("get index page image over,stat to get next page....");
        $this->logLine("================================================================================");
        $this->get_next_page_img($url);
        $this->flushRuntimeLog();
        exit("end");
    }

    /**
     * Pages through the answer list of the question and saves every image.
     *
     * Requests start at offset 10 and advance by the page size until the
     * endpoint returns no more fragments or a request fails. Each fragment is
     * appended to "<question id>_<offset>.log" and parsed on its own.
     *
     * @param string $url Question URL; nothing happens when it is empty.
     */
    public function get_next_page_img($url)
    {
        if (empty($url)) {
            return;
        }
        $question_id = $this->getQuestionId($url);
        if (empty($question_id)) {
            $this->logLine("get question id failed");
            $this->flushRuntimeLog();
            return;
        }

        $more_url = 'https://www.zhihu.com/node/QuestionAnswerListV2';
        $page_size = 10;
        // Initialised once, outside the loop: resetting it on every iteration
        // requested the same page forever.
        $offset = 10;
        // Numeric ids are embedded as-is (a JSON number); anything else is
        // JSON-encoded so the payload stays valid JSON.
        $url_token = ctype_digit((string) $question_id)
            ? (string) $question_id
            : json_encode((string) $question_id);

        while (true) {
            $post_data = [
                'method' => 'next',
                'params' => '{"url_token":' . $url_token . ',"pagesize":' . $page_size . ',"offset":' . $offset . '}',
            ];
            $res = $this->curl_method($more_url, $post_data);
            if ($res === false) {
                $this->logLine("request next page failed");
                break;
            }
            $res = json_decode($res, true);
            $msg = is_array($res) && isset($res['msg']) ? $res['msg'] : [];
            if (empty($msg) || !is_array($msg)) {
                // No further answers (or an unexpected response): stop paging.
                break;
            }

            $file_name = $question_id . '_' . $offset . '.log';
            foreach ($msg as $html_data) {
                if (empty($html_data) || !is_string($html_data)) {
                    continue;
                }
                file_put_contents($file_name, $html_data . PHP_EOL, FILE_APPEND);
                // Parse only this fragment. Re-reading the appended file would
                // re-download the images of every earlier fragment.
                $this->simple_html_class->load($html_data);
                $inner_html = $this->simple_html_class->find('img');
                if (!empty($inner_html)) {
                    $this->saveImagesFromNodes($inner_html);
                }
            }

            // Persist per page so a long crawl does not buffer its whole log.
            $this->flushRuntimeLog();
            $offset += $page_size;
        }

        echo "total end" . PHP_EOL;
        $this->flushRuntimeLog();
    }

    /**
     * Saves the image referenced by each <img> node, trying "src" first and
     * "data-actualsrc" second.
     *
     * @param array $nodes Nodes returned by simple_html_dom's find('img').
     */
    private function saveImagesFromNodes($nodes)
    {
        $this->logLine("find " . count($nodes) . " `img` success,stat analyse...");
        foreach ($nodes as $node) {
            $img_src = isset($node->attr['src']) ? trim($node->attr['src']) : '';
            $img_actualsrc = isset($node->attr['data-actualsrc']) ? trim($node->attr['data-actualsrc']) : '';

            $candidates = array_values(array_unique(array_filter([$img_src, $img_actualsrc], 'strlen')));
            if (empty($candidates)) {
                $this->logLine("get image src failed");
                continue;
            }

            // grabImage() reports failure by returning false and never
            // throws, so the fallback is driven by the return value.
            $img_gets = false;
            foreach ($candidates as $candidate) {
                $this->logLine($candidate);
                $img_gets = $this->grabImage($candidate);
                if ($img_gets) {
                    break;
                }
            }

            if ($img_gets) {
                $this->logLine("save img successfully " . $img_gets);
            } else {
                $this->logLine("error happened ");
            }
        }
    }

    /**
     * Echoes one line and remembers it for the runtime log file.
     *
     * @param string $message Text without a trailing separator.
     */
    private function logLine($message)
    {
        $line = $message . $this->new_line_char;
        echo $line;
        $this->runtime_log .= $line;
    }

    /**
     * Appends the buffered log lines to runtime/runtime_<YmdH>.log and empties
     * the buffer. The buffer is kept when the directory cannot be created.
     */
    private function flushRuntimeLog()
    {
        if ($this->runtime_log === '') {
            return;
        }
        if (!is_dir('runtime') && !mkdir('runtime', 0777, true) && !is_dir('runtime')) {
            return;
        }
        file_put_contents('runtime/runtime_' . date('YmdH') . '.log', $this->runtime_log . PHP_EOL, FILE_APPEND);
        $this->runtime_log = '';
    }
}
还没写完,注释掉了一部分。
下面展示下成果。
0人点赞
请登录后发表评论
相关推荐
文章归档
最新文章
最受欢迎
22-11-16 10:13
21-10-18 12:11
21-10-17 23:27
20-08-18 17:58
20-01-06 12:12
你这是要疯啊。。。
这样的文章再给我来一打![em_48]
没想到你是这样的楼主。[em_41]