为了账号安全,请及时绑定邮箱和手机立即绑定

如何通过html内容获取href和文本内容

如何通过html内容获取href和文本内容

PHP
一只甜甜圈 2023-08-06 15:55:17
我想要获取内容和网址,包括所有其他 td 数据。我的代码:$context = stream_context_create(    array(        "http" => array(            "header" => "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"        )    ));$htmlContent = file_get_contents("https://www.iana.org/domains/root/db", false, $context);    $DOM = new DOMDocument();$DOM->loadHTML($htmlContent);$FirstdTable = $DOM->getElementsByTagName('table')->item(0);$Header = $FirstdTable->getElementsByTagName('th');$Detail = $FirstdTable->getElementsByTagName('td');//#Get header name of the tableforeach($Header as $NodeHeader) {    $aDataTableHeaderHTML[] = trim($NodeHeader->textContent);}//#Get row data/detail table without header name as key$i = 0;$j = 0;foreach($Detail as $sNodeDetail){       $aDataTableDetailHTML[$j][] = trim($sNodeDetail->textContent);    $i = $i + 1;    $j = $i % count($aDataTableHeaderHTML) == 0 ? $j + 1 : $j;}当前输出:Array(    [0] => Array        (            [0] => .aaa            [1] => generic            [2] => American Automobile Association, Inc.        )    [1] => Array        (            [0] => .aarp            [1] => generic            [2] => AARP        )    [2] => Array        (            [0] => .abarth            [1] => generic            [2] => Fiat Chrysler Automobiles N.V.        )}我在这里想要:Array(    [0] => Array        (            [0] => .aaa            [1] => generic            [2] => American Automobile Association, Inc.            [3] => https://www.iana.org/domains/root/db/aaa.html        )    [1] => Array        (            [0] => .aarp            [1] => generic            [2] => AARP            [3] => https://www.iana.org/domains/root/db/aarp.html        )    [2] => Array        (            [0] => .abarth            [1] => generic            [2] => Fiat Chrysler Automobiles N.V.            [3] => https://www.iana.org/domains/root/db/abarth.html        )}
查看完整描述

1 回答

?
凤凰求蛊

TA贡献1825条经验 获得超4个赞

目前,您只是获取了所有 <td> 中的文本内容,其中并不包含锚标记(<a>)里的链接。要拿到链接,您需要进一步深入到 <td> 的内部。


下面是使用 xpath 完成此操作的一种方法:


// Walk every <td> of the table, grouping cells into rows of
// count($aDataTableHeaderHTML) columns, and append each row's link as the
// LAST element of that row: [col1, col2, ..., colN, url].
//
// Expects these variables from the surrounding script:
//   $DOM                  - DOMDocument the cells belong to
//   $Detail               - DOMNodeList of <td> elements
//   $aDataTableHeaderHTML - list of header names (defines the column count)
//   $i, $j                - running cell counter and current row index
$xpath = new DOMXpath($DOM);
$base = 'https://www.iana.org/';

// Loop-invariant: the number of cells that make up one row.
$columnCount = count($aDataTableHeaderHTML);

// Link found in the row currently being assembled ('' when none seen yet).
$rowLink = '';

foreach ($Detail as $sNodeDetail) {
    $aDataTableDetailHTML[$j][] = trim($sNodeDetail->textContent);

    // string(...) makes evaluate() return '' when the cell has no matching <a>.
    $link = $xpath->evaluate("string(./span[contains(@class, 'domain')]/a/@href)", $sNodeDetail);
    if ($link !== '') {
        // Join with exactly one slash. NOTE(review): assumes the href is
        // site-relative (e.g. "/domains/root/db/aaa.html") - confirm.
        $rowLink = rtrim($base, '/') . '/' . ltrim($link, '/');
    }

    $i = $i + 1;

    if ($i % $columnCount === 0) {
        // The row is complete: add the URL after the text columns so it does
        // not end up between the first and second column.
        if ($rowLink !== '') {
            $aDataTableDetailHTML[$j][] = $rowLink;
            $rowLink = '';
        }
        $j = $j + 1;
    }
}

基本上,该查询只是在当前迭代的 <td> 中查找 <span class="domain tld"><a href="xxxx">xxx</a></span>,并取出其中的 href 值。


另一种方法是迭代每个<tr>而不是每个<td>:


// Build one [domain, type, manager, url] entry per <tr> of the table with
// id "tld-table". Expects $htmlContent (the page's HTML as a string) from the
// surrounding script.
$aDataTableDetailHTML = [];

$DOM = new DOMDocument();

// Collect libxml parse warnings internally instead of emitting a PHP warning
// for every piece of markup the parser does not recognise; restore the
// previous setting afterwards.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$DOM->loadHTML($htmlContent);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

$xpath = new DOMXpath($DOM);
$base = 'https://www.iana.org/';

foreach ($xpath->query('//table[@id="tld-table"]/tbody/tr') as $row) {
    // string(...) yields '' when the addressed node does not exist.
    // All text columns are trimmed so the cells carry no layout whitespace.
    $domain = trim($xpath->evaluate("string(./td[1])", $row));
    $type = trim($xpath->evaluate("string(./td[2])", $row));
    $tld_manager = trim($xpath->evaluate("string(./td[3])", $row));
    $url = $xpath->evaluate("string(./td[1]/span/a/@href)", $row);

    // Join base and href with exactly one slash; a row without a link gets an
    // empty URL rather than the bare base address.
    // NOTE(review): assumes the href is site-relative - confirm.
    $fullUrl = $url !== ''
        ? rtrim($base, '/') . '/' . ltrim($url, '/')
        : '';

    $aDataTableDetailHTML[] = [$domain, $type, $tld_manager, $fullUrl];
}


查看完整回答
反对 回复 2023-08-06
  • 1 回答
  • 0 关注
  • 72 浏览

添加回答

举报

0/150
提交
取消
意见反馈 帮助中心 APP下载
官方微信