php利用simple_html_dom抓取知乎图片。[多图慎入]

17-02-19 22:49 字数 6922 阅读 3740

深夜福利贴,翻电脑时无意看到以前写的一个小玩意,bug多多,慎用。

// Remove PHP's execution time limit (0 = unlimited); the crawl defined below
// pages through a whole question and can run for a long time.
set_time_limit(0);

/**
 * Downloads the images embedded in the answers of a Zhihu question.
 *
 * The question's index page is fetched once and cached on disk; further
 * answers are requested page by page from the QuestionAnswerListV2 endpoint,
 * parsed with simple_html_dom, and every <img> found is saved under img/.
 * Progress messages are echoed and appended to runtime/runtime_YmdH.log.
 */
class get_img_from_url{

    /**
     * simple_html_dom parser instance; holds '' until the constructor has run.
     */
    public  $simple_html_class = '';

    /**
     * Path of the file used to cache the question's index page.
     */
    public $file_name = '';

    /**
     * Terminator appended to every progress message; replaced in the
     * constructor by the value of getNewLineChar().
     */
    public $new_line_char = "\r\n";

    /**
     * @param string $file_name Cache file for the index page. When empty a
     *                          random "<n>_resource.text" name is generated.
     */
    public function __construct($file_name='')
    {
        // require_once: a plain require would redeclare the library's classes
        // (fatal error) as soon as a second instance is created.
        require_once "simple_html_dom.php";
        $this->simple_html_class =  new simple_html_dom();
        if(!empty($file_name)){
            $this->file_name = $file_name;
        }else{
            $this->file_name = rand(0,99999) . '_resource.text';
        }

        $this->new_line_char = $this->getNewLineChar();
    }

    /**
     * Releases the parser's DOM tree.
     */
    public function __destruct()
    {
        // The property still holds '' if construction never completed.
        if(is_object($this->simple_html_class)){
            $this->simple_html_class->clear();
        }
    }

    /**
     * Performs an HTTP request with cURL and returns the response body.
     *
     * @param string $url    Target URL.
     * @param array  $params Query parameters (GET) or form fields (POST).
     * @param bool   $encode GET only: when false the final URL is urldecode()d.
     * @param string $method 'get' for a GET request; anything else sends a POST.
     *
     * @return string|false Response body, or false when the transfer failed.
     */
    public function curl_method($url, $params = array(), $encode = true, $method = 'post'){
        $ch = curl_init();
        if ($method == 'get') {
            if(!empty($params)){
                // Join the query string with the right separator. A URL that
                // already ends in '?' or '&' needs none, which keeps callers
                // that supplied the separator themselves working.
                $last_char = substr($url, -1);
                if($last_char === '?' || $last_char === '&'){
                    $link_char = '';
                }else{
                    $link_char = strpos($url, '?') !== false ? '&' : '?';
                }
                $url = $url . $link_char . http_build_query($params);
            }
            $url = $encode ? $url : urldecode($url);
            curl_setopt($ch, CURLOPT_URL, $url);
        } else {
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
        }
        if(stripos($url, 'https') === 0){
            // NOTE(review): certificate verification is switched off, as in the
            // original script. That exposes the transfer to man-in-the-middle
            // attacks; enable it wherever a CA bundle is available.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        }
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // The script runs without a PHP time limit, so bound each request to
        // keep a dead connection from hanging the crawl forever.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        $resp = curl_exec($ch);
        curl_close($ch);
        return $resp;
    }

    /**
     * Returns the line terminator suited to the current SAPI.
     *
     * @return string "\n" on the command line, "<br>" otherwise.
     */
    public function getNewLineChar(){
        return PHP_SAPI == 'cli' ? "\n" : "<br>";
    }

    /**
     * Downloads one image and stores it on disk.
     *
     * @param string $url      Image URL; an empty URL is rejected.
     * @param string $filename Target path. When empty, the URL must end in
     *                         .gif/.jpg/.jpeg/.png and a timestamp-based name
     *                         under img/ is generated.
     *
     * @return string|false Path of the written file, or false when the URL is
     *                      rejected, the download fails, the payload is smaller
     *                      than 5 KB, or the file cannot be written.
     */
    public function grabImage($url, $filename = "") {
        if ($url == "") {
            return false;
        }

        if ($filename == "") {
            // Inspect the URL path only, so a query string cannot hide the extension.
            $path = parse_url($url, PHP_URL_PATH);
            $ext = strtolower((string) strrchr((string) $path, "."));
            if (!in_array($ext, array(".gif", ".jpg", ".png", ".jpeg"), true)) {
                return false;
            }
            if (!is_dir('img') && !mkdir('img', 0777, true)) {
                return false;
            }
            // Many images are saved within the same second: retry until the
            // name is unused instead of writing into an existing file.
            do {
                $filename = 'img/' . date("YmdHis") . mt_rand(1, 999999) . $ext;
            } while (file_exists($filename));
        }

        // Read the bytes directly; buffering readfile() output would also
        // capture any PHP warning text into the image data.
        $img = file_get_contents($url);
        if ($img === false || strlen($img) < 5120) {
            // Download failed, or the payload is under 5 KB and is skipped.
            return false;
        }

        // Overwrite rather than append: appending to an existing file yields a corrupt image.
        if (file_put_contents($filename, $img) === false) {
            return false;
        }

        return $filename;
    }

    /**
     * Extracts the question id from a Zhihu question URL.
     *
     * @param string $url e.g. https://www.zhihu.com/question/12345
     *
     * @return string|int The digits following "/question/" when present,
     *                    otherwise the last non-empty path segment; 0 when the
     *                    URL has no usable path.
     */
    public function getQuestionId($url){
        $path = parse_url($url, PHP_URL_PATH);
        if(!is_string($path)){
            return 0;
        }

        // Answer URLs (/question/<id>/answer/<n>) still carry the question id.
        if(preg_match('#/question/(\d+)#', $path, $matches)){
            return $matches[1];
        }

        $segments = explode('/', rtrim($path, '/'));
        $question_id = end($segments);

        return $question_id !== '' ? $question_id : 0;
    }

    /**
     * Entry point: caches the question's index page, then downloads the images
     * of all following answer pages. Terminates the script when finished.
     *
     * @param string $url Question URL; the script exits immediately when empty.
     */
    public function run($url=''){

        if(empty($url)){
            echo  'dismiss url!'.$this->new_line_char;
            exit;
        }

        if(!file_exists($this->file_name)){
            $result = $this->curl_method($url,[],false,'get');
            if($result === false || $result === ''){
                // Do not cache a failed fetch: an empty cache file would be
                // reused on every later run.
                $this->logLine("fetch index page failed");
            }else{
                file_put_contents($this->file_name, $result);
                $this->logLine("append success~");
            }
        }else{
            $this->logLine("find file successfully....");
        }

        if(file_exists($this->file_name)){
            // load_file() reports failure with false.
            $rs = $this->simple_html_class->load_file($this->file_name);
            if($rs !== false){
                $this->logLine("load file successfully....");
            }else{
                $this->logLine("load file failed....");
            }
        }

        // Downloading the images of the index page itself is still disabled
        // (unfinished in the original script); only the following pages are
        // processed. Passing $this->simple_html_class->find('img') to
        // saveImages() here would enable it.

        $this->logLine("get index page image over,stat to get next page....");
        $this->logLine("================================================================================");

        $this->get_next_page_img($url);

        exit("end");
    }


    /**
     * Pages through the answers of a question and saves their images.
     *
     * Starts at offset 10 and advances by one page per request until the
     * endpoint returns no more answer fragments.
     *
     * @param string $url Question URL.
     */
    public function get_next_page_img($url){
        if(empty($url)){
            return;
        }

        $question_id = $this->getQuestionId($url);
        if(empty($question_id)){
            $this->logLine("get question id failed");
            return;
        }

        $more_url = 'https://www.zhihu.com/node/QuestionAnswerListV2';
        $page_size = 10;
        // Initialised once, outside the loop, so every request fetches a new page.
        $offset = 10;

        while (true){
            $post_data = [
                'method' => 'next',
                'params' => json_encode([
                    'url_token' => (int) $question_id,
                    'pagesize'  => $page_size,
                    'offset'    => $offset,
                ]),
            ];

            $res = $this->curl_method($more_url, $post_data);
            $res = is_string($res) ? json_decode($res, true) : null;

            $msg = (is_array($res) && isset($res['msg']) && is_array($res['msg'])) ? $res['msg'] : [];
            if(empty($msg)){
                // Request failed or there are no more answers: stop paging.
                break;
            }

            $file_name = $question_id . '_' . $offset . '.log';
            foreach ($msg as $html_data){
                if(empty($html_data)){
                    break;
                }
                // Keep the raw fragment for inspection ...
                file_put_contents($file_name, $html_data.PHP_EOL,FILE_APPEND);

                // ... but parse only this fragment. Re-reading the appended log
                // file would process the images of earlier answers again.
                $this->simple_html_class->load($html_data);
                $this->logLine("load file successfully....");

                $this->saveImages($this->simple_html_class->find('img'));
                echo "total end".PHP_EOL;
            }

            $offset += $page_size;
        }
    }

    /**
     * Saves the image of every <img> node, trying its src attribute first and
     * data-actualsrc second.
     *
     * @param array $nodes Nodes returned by simple_html_dom's find('img').
     */
    private function saveImages($nodes){
        if(empty($nodes)){
            return;
        }

        $this->logLine("find ".count($nodes)." `img` success,stat analyse...");

        foreach ($nodes as $node){
            $candidates = [];
            foreach (['src', 'data-actualsrc'] as $attribute){
                $value = isset($node->attr[$attribute]) ? trim($node->attr[$attribute]) : '';
                if($value !== ''){
                    $candidates[] = $value;
                }
            }

            if(empty($candidates)){
                $this->logLine("get image src failed");
                continue;
            }

            $img_gets = false;
            foreach ($candidates as $img_url){
                $this->logLine($img_url);
                $img_gets = $this->grabImage($img_url);
                if($img_gets){
                    break;
                }
            }

            if($img_gets){
                $this->logLine("save img successfully " . $img_gets);
            }else{
                $this->logLine("error happened ");
            }
        }
    }

    /**
     * Echoes a progress message and appends it to the hourly runtime log.
     *
     * @param string $message Text without the trailing line terminator.
     */
    private function logLine($message){
        $line = $message . $this->new_line_char;
        echo $line;

        $dir = 'runtime';
        if(is_dir($dir) || mkdir($dir, 0777, true)){
            file_put_contents($dir.'/runtime_'.date('YmdH').'.log', $line, FILE_APPEND);
        }
    }
}

还没写完,注释掉了一部分。

下面展示下成果。


0人点赞>
关注 收藏 改进 举报
2 条评论
排序方式 时间 投票
boyfix

你这是要疯啊。。。
这样的文章再给我来一打![em_48]

香草大叔

没想到你是这样的楼主。[em_41]

请登录后发表评论