统计
  • 建站日期:2021-03-10
  • 文章总数:120 篇
  • 评论总数:0 条
  • 分类总数:16 个
  • 最后更新:6月20日
文章 php代码

使用 PHP 的 DOMDocument 解析 HTML,实现豆瓣信息爬虫

叶子
首页 php代码 正文

<?php
/**
 * Douban subject metadata endpoint.
 *
 * Reads the subject id from the `id` query parameter, downloads
 * https://movie.douban.com/subject/{id}/ with cURL, extracts fields from the
 * returned HTML with DOMDocument and prints them as one JSON document of the
 * form {code, msg, data}.
 *
 * Failures (bad id, transport error, page without the expected elements) are
 * reported as a JSON document with a non-200 `code` and `data` set to null.
 *
 * Note: the success document is encoded with JSON_NUMERIC_CHECK, so values that
 * look numeric (for example the Douban id or the score) are emitted as JSON
 * numbers rather than strings.
 */
header('Content-Type: application/json;charset=utf-8');
header('Access-Control-Allow-Origin: *'); // Allow scripts from any origin to read this resource.
header("Access-Control-Allow-Headers: token,Origin, X-Requested-With, Content-Type, Accept");
header('Access-Control-Allow-Methods: POST,GET'); // Accepted request methods.

/**
 * Print a JSON error document, set the HTTP status and stop the script.
 *
 * @param int    $status  HTTP status code, also used as the `code` field.
 * @param string $message Human-readable reason, emitted as the `msg` field.
 */
function doubanFail(int $status, string $message)
{
    http_response_code($status);
    exit(json_encode(
        ['code' => $status, 'msg' => $message, 'data' => null],
        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
    ));
}

/**
 * Return the trimmed text of the first node whose text contains $needle.
 *
 * @return string Empty string when no node matches.
 */
function doubanFirstContaining(DOMNodeList $nodes, string $needle): string
{
    foreach ($nodes as $node) {
        if (strpos($node->nodeValue, $needle) !== false) {
            return trim($node->nodeValue);
        }
    }
    return '';
}

/**
 * Return the trimmed text that follows the first node whose text is exactly $label.
 *
 * The value is read from the label node's next sibling; a missing sibling
 * yields an empty string instead of a fatal error.
 *
 * @return string Empty string when the label is not present.
 */
function doubanLabelValue(DOMNodeList $nodes, string $label): string
{
    foreach ($nodes as $node) {
        if ($node->nodeValue === $label) {
            $next = $node->nextSibling;
            return $next === null ? '' : trim($next->textContent);
        }
    }
    return '';
}

/**
 * Collect the text of every node whose `property` attribute equals $property.
 *
 * @return string[] Values in document order; empty when nothing matches.
 */
function doubanPropertyValues(DOMNodeList $nodes, string $property): array
{
    $values = [];
    foreach ($nodes as $node) {
        // getAttribute() returns '' for a missing attribute, so no hasAttribute() check is needed.
        if ($node->getAttribute('property') === $property) {
            $values[] = $node->nodeValue;
        }
    }
    return $values;
}

/**
 * Turn a "a / b / c" value into "a, b, c".
 */
function doubanSlashList(string $value): string
{
    return implode(', ', array_map('trim', explode('/', $value)));
}

/**
 * Return the part of "label: value" after the first ASCII colon.
 *
 * Only the first colon splits, so a value that itself contains a colon is kept
 * whole. Leading whitespace of the value is left as it is.
 *
 * @return string Empty string when $text has no colon.
 */
function doubanAfterColon(string $text): string
{
    $parts = explode(':', $text, 2);
    return $parts[1] ?? '';
}

// The id is placed into the upstream URL, so accept digits only.
$id = (isset($_GET['id']) && is_string($_GET['id'])) ? trim($_GET['id']) : '';
if (preg_match('/\A[0-9]+\z/', $id) !== 1) {
    doubanFail(400, 'Invalid or missing "id" parameter');
}
$url = "https://movie.douban.com/subject/" . $id . "/";

// Fetch the page; the handle is closed before any exit path.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 20);
$response = curl_exec($ch);
$curlFailed = curl_errno($ch) !== 0;
$curlMessage = curl_error($ch);
curl_close($ch);

if ($curlFailed) {
    doubanFail(502, 'Curl error: ' . $curlMessage);
}
if (!is_string($response) || trim($response) === '') {
    doubanFail(502, 'Empty response from upstream');
}

// Parse the HTML; markup warnings are collected by libxml instead of being suppressed with @.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($response);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

$info = $dom->getElementById('info');
if ($info === null) {
    doubanFail(502, 'Unexpected page layout: element #info not found');
}
$infoSpans = $info->getElementsByTagName('span');

// Director and writer: first span whose text mentions the label, e.g. "导演: ...".
$director = doubanFirstContaining($infoSpans, "导演");
$writer = doubanFirstContaining($infoSpans, "编剧");

// Cast.
// NOTE(review): assumes cast links carry rel="v:starring" — confirm against the live page.
// When no link matches, every link inside #info is used, which is what this script did before.
$casts = [];
$allLinkTexts = [];
foreach ($info->getElementsByTagName('a') as $link) {
    $linkText = trim($link->nodeValue);
    $allLinkTexts[] = $linkText;
    if ($link->getAttribute('rel') === 'v:starring') {
        $casts[] = $linkText;
    }
}
if ($casts === []) {
    $casts = $allLinkTexts;
}
$castsString = implode(', ', $casts);

// Genres: every span with property="v:genre".
$genresString = implode(', ', array_map('trim', doubanPropertyValues($infoSpans, 'v:genre')));

// Labelled values: the text node that follows the label span.
$regionsString = doubanSlashList(doubanLabelValue($infoSpans, '制片国家/地区:'));
$languagesString = doubanSlashList(doubanLabelValue($infoSpans, '语言:'));
$episodes = doubanLabelValue($infoSpans, '集数:');
$jichang = doubanLabelValue($infoSpans, '单集片长:'); // per-episode running time
$imdb = doubanLabelValue($infoSpans, 'IMDb:');

// First release date: first span with property="v:initialReleaseDate".
$releaseDates = doubanPropertyValues($infoSpans, 'v:initialReleaseDate');
$releaseDate = $releaseDates ? trim($releaseDates[0]) : '';

// Poster: src of the last <img rel="v:image"> in the document.
$imgDate = '';
foreach ($dom->getElementsByTagName('img') as $img) {
    if ($img->getAttribute('rel') === "v:image") {
        $imgDate = $img->getAttribute('src');
    }
}

// Title and year from #content; both default to '' when the element is missing.
$itemReviewed = '';
$year = '';
$contentElement = $dom->getElementById('content');
if ($contentElement !== null) {
    foreach ($contentElement->getElementsByTagName('span') as $span) {
        if ($span->getAttribute('property') === 'v:itemreviewed') {
            $itemReviewed = $span->nodeValue;
        }
        if ($span->getAttribute('class') === 'year') {
            // Keep the four-digit year from the span text rather than the span's HTML markup.
            $yearText = trim($span->nodeValue);
            $year = preg_match('/[0-9]{4}/', $yearText, $yearMatch) ? $yearMatch[0] : $yearText;
        }
    }
}

// Douban score: last <strong property="v:average"> in the document.
$scores = doubanPropertyValues($dom->getElementsByTagName('strong'), 'v:average');
$score = $scores ? end($scores) : '';

// Synopsis: first span with property="v:summary" inside #link-report-intra.
$vod_content = '';
$summaryBox = $dom->getElementById('link-report-intra');
if ($summaryBox !== null) {
    $summaries = doubanPropertyValues($summaryBox->getElementsByTagName('span'), 'v:summary');
    if ($summaries) {
        $trimmedSummary = trim($summaries[0]);
        // Collapse runs of whitespace into one space; keep the trimmed text if the regex fails.
        $collapsed = preg_replace('/\s+/u', ' ', $trimmedSummary);
        $vod_content = $collapsed === null ? $trimmedSummary : $collapsed;
    }
}

// Reduce e.g. "2021-02-05(中国大陆)" to "2021-02-05" and take the year from it.
if (preg_match('/(\d{4})-\d{2}-\d{2}/', $releaseDate, $matches)) {
    $releaseDate = $matches[0];
    $year = $matches[1];
}

// REQUEST_SCHEME is not provided by every server; fall back to the HTTPS flag.
$scheme = $_SERVER['REQUEST_SCHEME']
    ?? ((!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] !== 'off') ? 'https' : 'http');
$selfUrl = $scheme . "://" . ($_SERVER['HTTP_HOST'] ?? '') . ($_SERVER['REQUEST_URI'] ?? '');

$jsonok = [
    'code' => 200,
    'msg' => "获取成功",
    'data' => [
        'vod_name' => $itemReviewed . "",
        "vod_sub" => $itemReviewed . "",
        "vod_pic" => htmlspecialchars($imgDate) . "",
        "vod_year" => htmlspecialchars($year) . "",
        "vod_lang" => htmlspecialchars($languagesString) . "",
        "vod_area" => htmlspecialchars($regionsString) . "",
        'vod_total' => htmlspecialchars($episodes) . "",
        'vod_class' => htmlspecialchars($genresString) . "",
        'vod_tag' => htmlspecialchars($genresString),
        'vod_actor' => htmlspecialchars($castsString) . "",
        'vod_director' => htmlspecialchars(doubanAfterColon($director)) . "",
        'vod_pubdate' => $releaseDate . "",
        'vod_writer' => htmlspecialchars(doubanAfterColon($writer)) . "",
        'vod_douban_score' => htmlspecialchars($score) . "",
        'vod_duration' => htmlspecialchars($jichang) . "",
        'vod_content' => trim($vod_content) . "",
        'vod_douban_id' => "" . $id,
        'vod_imdb_id' => $imdb,
        'dburl' => $url,
        'ylurl' => $selfUrl,
        'procedure' => "星际导航API程序",
        'apiDomain' => "https://daohw.fengye.wang/",
    ],
];

$encoded = json_encode(
    $jsonok,
    JSON_NUMERIC_CHECK | JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
);
if ($encoded === false) {
    doubanFail(500, 'Failed to encode response: ' . json_last_error_msg());
}
echo $encoded;

?>
看完后的心情
当前内容共有0人进行评价
您的评价对其他人很有帮助,请认真对待!
×
精品总值得回味
当前内容共有0人收藏
账号未登录

您还未登录,请登录账号之后再进行操作

×
感谢支持与厚爱
  • 微信支付
  • 支付宝支付
微信收款码
支付宝收款码
打赏请备注姓名及联系方式,方便进行感谢!
×
分享给朋友
使用微信手机QQ微博扫码分享
×
×

扫描二维码,在手机上阅读

版权说明
文章采用: 《署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0)》许可协议授权。
版权声明:未标注转载均为本站原创,转载时请以链接形式注明文章出处。如有侵权、不妥之处,请联系站长删除。敬请谅解!

-- 展开阅读全文 --
java基础
« 上一篇
php
下一篇 »

发表评论

HI ! 请登录
注册会员,享受下载全站资源特权。

微语录

  • 2025-06-13:周一到周四都是红的,今天周五,而且大盘指数上了3400。明明我周三、周四都在防3400回调,为何周五却忘记了呢?是不是有点不把钱当回事了呢?

    叶子_ 1 个月前 更多微语

最新文章

热门文章

最新评论

标签