Tạo trang lấy dữ liệu ncov tự động bằng ajax trong WordPress
📅 — 👀 1388 — 👦 Thấy cái này đang hot theo trend 2020 nên mình sẽ hướng dẫn anh em làm. Cũng khá là đơn giản, không phức tạp như các bạn nghĩ. Không nói nhiều nữa, chúng ta cùng bắt đầu. Cần có thư viện jQuery nhé.
Mình chỉ ghi code ở đây, làm biếng giải thích code. Nếu các bạn có gì không hiểu thì để lại bình luận bên dưới hoặc inbox cho mình qua fb.com/thichcode.net. Ok, ta phải xem demo thử xem thích không nhé => https://www.vk2.cc/ncov/
P/S 1: Code sẽ tạo 2 file ncov_tg.js và ncov_vn.js trong thư mục report của theme mỗi khi lấy dữ liệu thành công (hãy tạo sẵn thư mục report và cấp quyền ghi cho nó). Nếu ở lần sau không lấy được dữ liệu mới thì code sẽ dùng dữ liệu từ 2 file này. Đồng thời bạn có thể chia sẻ 2 file này giống như một API.
P/S 2: Mình không làm cron job (tự động chạy job theo thời gian quy định) vì một số host không hỗ trợ, nên khi bạn truy cập vào https://www.vk2.cc/ncov/ thì code sẽ kiểm tra thời gian sửa đổi (modify) của file ncov_tg.js. Nếu thời gian này lớn hơn 5 phút thì mới lấy dữ liệu mới cho cả 2 file.
Trang lấy dữ liệu :
- https://vi.wikipedia.org/wiki/Bản_mẫu:Số_ca_nhiễm_COVID-19_theo_tỉnh_thành_Việt_Nam
- https://vi.wikipedia.org/wiki/Bản_mẫu:Dữ_liệu_dịch_virus_corona_2019–20
Tạo file page-ncov.php nằm trong thư mục theme của bạn. File này sẽ tạo Template Page trong wp-admin
<?php
/**
 * Template Name: ncov
 *
 * Create by ThichCode.NET
 *
 * Page template that shows a full-screen loading overlay, then asks
 * admin-ajax.php (action "ncov") for the rendered statistics tables and
 * injects the returned HTML into #main.
 */
get_header();
?>
<style>
.csslder { display: block; text-align: center; height: 20px; position: relative; clear: both; }
.csslder .csswrap { position: absolute; top: 50%; left: 50%; -webkit-transform: translate(-50%, -50%); transform: translate(-50%, -50%); }
.cssdot { width: 10px; height: 10px; border: 1px solid #288ad6; background: #288ad6; border-radius: 50%; float: left; margin: 0 5px; -webkit-transform: scale(0); transform: scale(0); -webkit-animation: fx 1000ms ease infinite 0ms; animation: fx 1000ms ease infinite 0ms; }
.cssdot:nth-child(2) { -webkit-animation: fx 1000ms ease infinite 300ms; animation: fx 1000ms ease infinite 300ms; }
.cssdot:nth-child(3) { -webkit-animation: fx 1000ms ease infinite 600ms; animation: fx 1000ms ease infinite 600ms; }
.loadingcover { position: fixed; top: 0; left: 0; right: 0; bottom: 0; background-color: rgba(255, 255, 255, .75); /*display: none;*/ z-index: 2; }
.loadingcover .csslder { top: 50%; }
/*! CSS Used keyframes */
@-webkit-keyframes fx {
    50% { -webkit-transform: scale(1); transform: scale(1); opacity: 1; }
    100% { opacity: 0; }
}
@keyframes fx {
    50% { -webkit-transform: scale(1); transform: scale(1); opacity: 1; }
    100% { opacity: 0; }
}
@media only screen and (max-width: 480px){
    .cart-table { width: 100%; overflow: unset; display: table; }
    .kk-table td span{ display: inline-block; padding: .25em .4em; font-size: 75%; font-weight: 700; line-height: 1; text-align: center; white-space: nowrap; vertical-align: baseline; border-radius: .25rem; }
    .xac_nhan{ color: #212529; background-color: #ffc107; }
    .tu_vong{ color: #fff; background-color: #dc3545; }
    .hoi_phuc{ color: #fff; background-color: #28a745; }
}
</style>
<div class="loadingcover">
    <p class="csslder">
        <span class="csswrap">
            <span class="cssdot"></span>
            <span class="cssdot"></span>
            <span class="cssdot"></span>
        </span>
    </p>
</div>
<style>
.kk-table{ width: 100%; overflow: hidden; display: table; }
.kk-table tbody tr:first-child { background: red; color: white; }
.kk-table thead tr th { background: black; color: white; padding: 10px;text-align: center; }
.kk-table tbody tr td { padding: 5px; font-size: unset; /*min-width: 100px;*/ text-align: center; }
.kk-table tbody tr:nth-child(2n) { background: #f3f3f3; }
.kk-table td:nth-child(2){ text-align:left; }
</style>
<div class="container">
    <div class="row margin-top-30" id="main"></div>
</div>
<?php get_footer(); ?>
<script>
// The script is printed after get_footer() so that it also works when the
// theme loads jQuery in the footer.
// WordPress ships jQuery in noConflict mode, so "$" is only available as the
// argument of the ready callback.
jQuery(function ($) {
    var MAX_ATTEMPTS = 5;       // total number of requests before giving up
    var RETRY_DELAY_MS = 1000;  // pause between attempts, avoids hammering the server
    // JSON-encoding yields a correctly quoted and escaped JS string literal.
    var ajaxUrl = <?php echo wp_json_encode( admin_url( 'admin-ajax.php' ) ); ?>;
    var failedAttempts = 0;

    // Replaces the spinner with a message telling the visitor to reload.
    function showError() {
        $('.loadingcover').html("<div style='top: 40%;position: absolute;text-align: center;width: 100%;'><h4>Lỗi trong quá trình tải dữ liệu!<br/><br/>Vui lòng nhấn F5 để thử lại...</h4></div>");
    }

    // Counts a failed attempt and either schedules another request or gives up.
    function retryOrFail() {
        failedAttempts++;
        if (failedAttempts < MAX_ATTEMPTS) {
            setTimeout(LoadData, RETRY_DELAY_MS);
        } else {
            showError();
        }
    }

    // Requests the rendered tables and injects them into #main.
    function LoadData() {
        $.ajax({
            url: ajaxUrl,
            data: { action: 'ncov' },
            type: 'POST',
            success: function (html) {
                // An empty body means the handler produced nothing; treat it
                // like a failure instead of leaving the overlay up forever.
                if (!html) {
                    retryOrFail();
                    return;
                }
                failedAttempts = 0;
                $('#main').empty().append(html);
                $('.loadingcover').fadeOut();
            },
            error: retryOrFail
        });
    }

    LoadData();
});
</script>
Sau đây, thêm các hàm này vào file functions.php của theme
<?php
// Load the HTML parser from this file's own directory so the include does not
// depend on the current working directory or the include_path.
require_once __DIR__ . '/simple_html_dom.php';

/**
 * Replaces every span that starts with $needle_start and ends with
 * $needle_end (both included) by $replacement.
 *
 * When an opening needle has no closing needle after it, everything from the
 * opening needle to the end of the string is replaced.
 * The search restarts from the beginning after each replacement, so
 * $replacement must not itself contain both needles.
 *
 * @param string $str          Text to process.
 * @param string $needle_start Opening delimiter.
 * @param string $needle_end   Closing delimiter.
 * @param string $replacement  Text inserted instead of each span.
 * @return string The processed text.
 */
function replace_between($str, $needle_start, $needle_end, $replacement)
{
    // An empty delimiter matches at offset 0 forever on PHP 8; nothing to do.
    if ($needle_start === '' || $needle_end === '') {
        return $str;
    }

    while (($start = strpos($str, $needle_start)) !== false && strpos($str, $needle_end) !== false) {
        $close = strpos($str, $needle_end, $start);
        $end = ($close === false ? strlen($str) : $close) + strlen($needle_end);
        $str = substr_replace($str, $replacement, $start, $end - $start);
    }

    return $str;
}

/**
 * Returns the absolute path of a cache file in the theme's "report" directory.
 *
 * @param string $file_name File name, e.g. "ncov_vn.js".
 * @return string
 */
function kk_ncov_cache_path($file_name)
{
    return get_template_directory() . '/report/' . $file_name;
}

/**
 * Tells whether a cache file is missing or older than $max_age seconds.
 *
 * @param string $path    Cache file path.
 * @param int    $max_age Maximum accepted age in seconds (default: 5 minutes).
 * @return bool
 */
function kk_ncov_cache_is_stale($path, $max_age = 300)
{
    if (!is_file($path)) {
        return true;
    }
    $modified = filemtime($path);

    // Compare the whole age, not just its minute component.
    return $modified === false || (time() - $modified) > $max_age;
}

/**
 * Reads a JSON cache file.
 *
 * @param string $path Cache file path.
 * @return array Decoded rows, or an empty array when the file is missing,
 *               empty or does not contain a JSON array/object.
 */
function kk_ncov_read_cache($path)
{
    if (!is_readable($path)) {
        return array();
    }
    $json = file_get_contents($path);
    if ($json === false || $json === '') {
        return array();
    }
    $rows = json_decode($json, true);

    return is_array($rows) ? $rows : array();
}

/**
 * Stores rows as JSON, creating the target directory when needed.
 *
 * Empty data is never written so that a failed scrape cannot wipe the last
 * good copy that the handler falls back to.
 *
 * @param string $path Cache file path.
 * @param array  $rows Rows to store.
 * @return bool True when the file was written.
 */
function kk_ncov_write_cache($path, $rows)
{
    if (empty($rows)) {
        return false;
    }
    $dir = dirname($path);
    if (!is_dir($dir) && !wp_mkdir_p($dir)) {
        return false;
    }

    return file_put_contents($path, json_encode($rows), LOCK_EX) !== false;
}

/**
 * Downloads a page and extracts the rows of the first table whose markup
 * contains $table_title.
 *
 * Rows containing colspan="4" are skipped. For every other row the text of
 * the first four cells is entity-decoded and stripped of "[...]" footnote
 * markers.
 *
 * @param string   $url             Page to download.
 * @param string   $table_title     Text identifying the wanted table.
 * @param bool     $with_flags      Whether to look for an <img> in each row.
 * @param string[] $no_flag_markers Rows containing any of these strings are
 *                                  treated as header/summary rows and get no flag.
 * @return array[] List of array('cells' => array(name, confirmed, deaths,
 *                 recovered), 'flag' => image URL or ''). Empty when the page
 *                 could not be loaded or the table was not found.
 */
function kk_ncov_scrape_rows($url, $table_title, $with_flags = false, $no_flag_markers = array())
{
    $html = file_get_html($url);
    if (!$html) {
        // Download failed, the page was empty, or it exceeded MAX_FILE_SIZE.
        return array();
    }

    $parsed = array();
    foreach ($html->find('table') as $table) {
        if (strpos((string) $table, $table_title) === false) {
            continue;
        }

        foreach ($table->find('tr') as $row) {
            $row_html = (string) $row;
            if (strpos($row_html, 'colspan="4"') !== false) {
                continue;
            }

            $want_flag = $with_flags;
            foreach ($no_flag_markers as $marker) {
                if (strpos($row_html, $marker) !== false) {
                    $want_flag = false;
                    break;
                }
            }

            // Same defaults as before for rows with fewer than four cells.
            $cells = array('', 0, 0, 0);
            $flag = '';
            foreach (array_values($row->children()) as $index => $cell) {
                if ($index < 4) {
                    $text = html_entity_decode($cell->plaintext, ENT_QUOTES, 'UTF-8');
                    $cells[$index] = trim(replace_between($text, '[', ']', ''));
                }
                if ($want_flag) {
                    // The last image found in the row wins.
                    $img = $cell->find('img', 0);
                    if ($img && $img->src) {
                        $flag = $img->src;
                    }
                }
            }
            $parsed[] = array('cells' => $cells, 'flag' => $flag);
        }
        break; // only the first matching table is used
    }

    return $parsed;
}

/**
 * Scrapes the per-province table for Vietnam and caches it in
 * report/ncov_vn.js.
 *
 * @param string $url Page containing the table.
 * @return array[] Rows with keys tinh_tp, so_ca_nhiem, tu_vong, da_xuat_vien
 *                 (the first row holds the column titles); empty on failure.
 */
function Get_Ncov_Country($url)
{
    $arr_data = array();
    foreach (kk_ncov_scrape_rows($url, 'Số ca nhiễm theo tỉnh thành tại Việt Nam') as $parsed) {
        $arr_data[] = array(
            'tinh_tp'      => $parsed['cells'][0],
            'so_ca_nhiem'  => $parsed['cells'][1],
            'tu_vong'      => $parsed['cells'][2],
            'da_xuat_vien' => $parsed['cells'][3],
        );
    }
    kk_ncov_write_cache(kk_ncov_cache_path('ncov_vn.js'), $arr_data);

    return $arr_data;
}

/**
 * Builds the HTML of one statistics table.
 *
 * The first row is rendered as the table head, the others as numbered body
 * rows. All values come from a scraped third-party page and are therefore
 * escaped before output.
 *
 * @param array[]  $rows     Rows as stored in the cache files.
 * @param string[] $keys     The four row keys: name, confirmed, deaths, recovered.
 * @param string   $flag_key Row key holding a flag image URL, or '' for none.
 * @return string
 */
function kk_ncov_render_table($rows, $keys, $flag_key = '')
{
    $thead = '';
    $tbody = '';
    foreach (array_values($rows) as $i => $row) {
        $cells = array();
        foreach ($keys as $key) {
            $cells[] = esc_html(isset($row[$key]) ? (string) $row[$key] : '');
        }

        if ($i === 0) {
            $thead = '<thead class="hidden-xs"><tr><th class="hidden-xs">#</th><th>'
                . implode('</th><th>', $cells)
                . '</th></tr></thead>';
            continue;
        }

        // Skip the image entirely when there is no flag: an empty src makes
        // browsers request the current page again.
        $flag = '';
        if ($flag_key !== '' && !empty($row[$flag_key])) {
            $flag = '<img src="' . esc_url($row[$flag_key]) . '"/>';
        }

        $tbody .= '<tr><td class="hidden-xs">' . $i . '</td>'
            . '<td>' . $flag . $cells[0] . '</td>'
            . '<td><span class="xac_nhan">' . $cells[1] . '</span></td>'
            . '<td><span class="tu_vong">' . $cells[2] . '</span></td>'
            . '<td><span class="hoi_phuc">' . $cells[3] . '</span></td></tr>';
    }

    return '<table class="cart-table table table-bordered kk-table">'
        . $thead . '<tbody>' . $tbody . '</tbody></table>';
}

add_action('wp_ajax_ncov', 'kk_ncov_ajax_handler');
add_action('wp_ajax_nopriv_ncov', 'kk_ncov_ajax_handler');

/**
 * AJAX handler for the "ncov" action: prints the Vietnam and worldwide tables.
 *
 * Fresh data is scraped when report/ncov_tg.js is missing or older than five
 * minutes; otherwise, or when scraping yields nothing, the cached copies are
 * used.
 *
 * @return void The request is terminated with wp_die().
 */
function kk_ncov_ajax_handler()
{
    $tg_ncov_txt = kk_ncov_cache_path('ncov_tg.js');
    $arr_data = array();
    $arr_data_vn = array();

    if (kk_ncov_cache_is_stale($tg_ncov_txt)) {
        $arr_data_vn = Get_Ncov_Country('https://vi.wikipedia.org/wiki/B%E1%BA%A3n_m%E1%BA%ABu:S%E1%BB%91_ca_nhi%E1%BB%85m_COVID-19_theo_t%E1%BB%89nh_th%C3%A0nh_Vi%E1%BB%87t_Nam');

        $scraped = kk_ncov_scrape_rows(
            'https://vi.wikipedia.org/wiki/B%E1%BA%A3n_m%E1%BA%ABu:D%E1%BB%AF_li%E1%BB%87u_d%E1%BB%8Bch_virus_corona_2019%E2%80%9320',
            'Đại dịch COVID-19 theo quốc gia và vùng lãnh thổ',
            true,
            array('lãnh thổ', 'Tổng khu vực')
        );
        foreach ($scraped as $parsed) {
            $arr_data[] = array(
                'co'       => $parsed['flag'],
                'quoc_gia' => $parsed['cells'][0],
                'xac_nhan' => $parsed['cells'][1],
                'tu_vong'  => $parsed['cells'][2],
                'hoi_phuc' => $parsed['cells'][3],
            );
        }

        if (!kk_ncov_write_cache($tg_ncov_txt, $arr_data) && is_file($tg_ncov_txt)) {
            // Scraping failed: keep the old data but postpone the next
            // attempt by one cache period instead of retrying on every hit.
            touch($tg_ncov_txt);
        }
    }

    if (empty($arr_data_vn)) {
        $arr_data_vn = kk_ncov_read_cache(kk_ncov_cache_path('ncov_vn.js'));
    }
    if (empty($arr_data)) {
        $arr_data = kk_ncov_read_cache($tg_ncov_txt);
    }

    echo '<div class="col-md-6">';
    echo kk_ncov_render_table($arr_data_vn, array('tinh_tp', 'so_ca_nhiem', 'tu_vong', 'da_xuat_vien'));
    echo '</div>';

    echo '<div class="col-md-6">';
    echo kk_ncov_render_table($arr_data, array('quoc_gia', 'xac_nhan', 'tu_vong', 'hoi_phuc'), 'co');
    echo '</div>';

    // Terminate the AJAX request the WordPress way.
    wp_die();
}
?>
Cuối cùng ta tạo file simple_html_dom.php cùng cấp với file functions.php
<?php /** * Website: http://sourceforge.net/projects/simplehtmldom/ * Additional projects: http://sourceforge.net/projects/debugobject/ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) * * Licensed under The MIT License * See the LICENSE file in the project root for more information. * * Authors: * S.C. Chen * John Schlick * Rus Carroll * logmanoriginal * * Contributors: * Yousuke Kumakura * Vadim Voituk * Antcs * * Version Rev. 1.9.1 (291) */ define('HDOM_TYPE_ELEMENT', 1); define('HDOM_TYPE_COMMENT', 2); define('HDOM_TYPE_TEXT', 3); define('HDOM_TYPE_ENDTAG', 4); define('HDOM_TYPE_ROOT', 5); define('HDOM_TYPE_UNKNOWN', 6); define('HDOM_QUOTE_DOUBLE', 0); define('HDOM_QUOTE_SINGLE', 1); define('HDOM_QUOTE_NO', 3); define('HDOM_INFO_BEGIN', 0); define('HDOM_INFO_END', 1); define('HDOM_INFO_QUOTE', 2); define('HDOM_INFO_SPACE', 3); define('HDOM_INFO_TEXT', 4); define('HDOM_INFO_INNER', 5); define('HDOM_INFO_OUTER', 6); define('HDOM_INFO_ENDSPACE', 7); defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); define('HDOM_SMARTY_AS_TEXT', 1); function file_get_html( $url, $use_include_path = false, $context = null, $offset = 0, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) { if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } $dom = new simple_html_dom( null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText ); /** * For sourceforge users: uncomment the next line and comment the * retrieve_url_contents line 2 lines down if it is not already done. 
*/ $contents = file_get_contents( $url, $use_include_path, $context, $offset, $maxLen ); // $contents = retrieve_url_contents($url); if (empty($contents) || strlen($contents) > $maxLen) { $dom->clear(); return false; } return $dom->load($contents, $lowercase, $stripRN); } function str_get_html( $str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) { $dom = new simple_html_dom( null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText ); if (empty($str) || strlen($str) > MAX_FILE_SIZE) { $dom->clear(); return false; } return $dom->load($str, $lowercase, $stripRN); } function dump_html_tree($node, $show_attr = true, $deep = 0) { $node->dump($node); } class simple_html_dom_node { public $nodetype = HDOM_TYPE_TEXT; public $tag = 'text'; public $attr = array(); public $children = array(); public $nodes = array(); public $parent = null; public $_ = array(); public $tag_start = 0; private $dom = null; function __construct($dom) { $this->dom = $dom; $dom->nodes[] = $this; } function __destruct() { $this->clear(); } function __toString() { return $this->outertext(); } function clear() { $this->dom = null; $this->nodes = null; $this->parent = null; $this->children = null; } function dump($show_attr = true, $depth = 0) { echo str_repeat("\t", $depth) . 
$this->tag; if ($show_attr && count($this->attr) > 0) { echo '('; foreach ($this->attr as $k => $v) { echo "[$k]=>\"$v\", "; } echo ')'; } echo "\n"; if ($this->nodes) { foreach ($this->nodes as $node) { $node->dump($show_attr, $depth + 1); } } } function dump_node($echo = true) { $string = $this->tag; if (count($this->attr) > 0) { $string .= '('; foreach ($this->attr as $k => $v) { $string .= "[$k]=>\"$v\", "; } $string .= ')'; } if (count($this->_) > 0) { $string .= ' $_ ('; foreach ($this->_ as $k => $v) { if (is_array($v)) { $string .= "[$k]=>("; foreach ($v as $k2 => $v2) { $string .= "[$k2]=>\"$v2\", "; } $string .= ')'; } else { $string .= "[$k]=>\"$v\", "; } } $string .= ')'; } if (isset($this->text)) { $string .= " text: ({$this->text})"; } $string .= ' HDOM_INNER_INFO: '; if (isset($node->_[HDOM_INFO_INNER])) { $string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; } else { $string .= ' NULL '; } $string .= ' children: ' . count($this->children); $string .= ' nodes: ' . count($this->nodes); $string .= ' tag_start: ' . $this->tag_start; $string .= "\n"; if ($echo) { echo $string; return; } else { return $string; } } function parent($parent = null) { // I am SURE that this doesn't work properly. // It fails to unset the current node from it's current parents nodes or // children list first. 
if ($parent !== null) { $this->parent = $parent; $this->parent->nodes[] = $this; $this->parent->children[] = $this; } return $this->parent; } function has_child() { return !empty($this->children); } function children($idx = -1) { if ($idx === -1) { return $this->children; } if (isset($this->children[$idx])) { return $this->children[$idx]; } return null; } function first_child() { if (count($this->children) > 0) { return $this->children[0]; } return null; } function last_child() { if (count($this->children) > 0) { return end($this->children); } return null; } function next_sibling() { if ($this->parent === null) { return null; } $idx = array_search($this, $this->parent->children, true); if ($idx !== false && isset($this->parent->children[$idx + 1])) { return $this->parent->children[$idx + 1]; } return null; } function prev_sibling() { if ($this->parent === null) { return null; } $idx = array_search($this, $this->parent->children, true); if ($idx !== false && $idx > 0) { return $this->parent->children[$idx - 1]; } return null; } function find_ancestor_tag($tag) { global $debug_object; if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } if ($this->parent === null) { return null; } $ancestor = $this->parent; while (!is_null($ancestor)) { if (is_object($debug_object)) { $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag); } if ($ancestor->tag === $tag) { break; } $ancestor = $ancestor->parent; } return $ancestor; } function innertext() { if (isset($this->_[HDOM_INFO_INNER])) { return $this->_[HDOM_INFO_INNER]; } if (isset($this->_[HDOM_INFO_TEXT])) { return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); } $ret = ''; foreach ($this->nodes as $n) { $ret .= $n->outertext(); } return $ret; } function outertext() { global $debug_object; if (is_object($debug_object)) { $text = ''; if ($this->tag === 'text') { if (!empty($this->text)) { $text = ' with text: ' . $this->text; } } $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . 
$text); } if ($this->tag === 'root') { return $this->innertext(); } // todo: What is the use of this callback? Remove? if ($this->dom && $this->dom->callback !== null) { call_user_func_array($this->dom->callback, array($this)); } if (isset($this->_[HDOM_INFO_OUTER])) { return $this->_[HDOM_INFO_OUTER]; } if (isset($this->_[HDOM_INFO_TEXT])) { return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); } $ret = ''; if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); } if (isset($this->_[HDOM_INFO_INNER])) { // todo: <br> should either never have HDOM_INFO_INNER or always if ($this->tag !== 'br') { $ret .= $this->_[HDOM_INFO_INNER]; } } elseif ($this->nodes) { foreach ($this->nodes as $n) { $ret .= $this->convert_text($n->outertext()); } } if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { $ret .= '</' . $this->tag . '>'; } return $ret; } function text() { if (isset($this->_[HDOM_INFO_INNER])) { return $this->_[HDOM_INFO_INNER]; } switch ($this->nodetype) { case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); case HDOM_TYPE_COMMENT: return ''; case HDOM_TYPE_UNKNOWN: return ''; } if (strcasecmp($this->tag, 'script') === 0) { return ''; } if (strcasecmp($this->tag, 'style') === 0) { return ''; } $ret = ''; // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed // for some span tags, and some p tags) $this->nodes is set to NULL. // NOTE: This indicates that there is a problem where it's set to NULL // without a clear happening. // WHY is this happening? if (!is_null($this->nodes)) { foreach ($this->nodes as $n) { // Start paragraph after a blank line if ($n->tag === 'p') { $ret = trim($ret) . "\n\n"; } $ret .= $this->convert_text($n->text()); // If this node is a span... add a space at the end of it so // multiple spans don't run into each other. This is plaintext // after all. 
if ($n->tag === 'span') { $ret .= $this->dom->default_span_text; } } } return $ret; } function xmltext() { $ret = $this->innertext(); $ret = str_ireplace('<![CDATA[', '', $ret); $ret = str_replace(']]>', '', $ret); return $ret; } function makeup() { // text, comment, unknown if (isset($this->_[HDOM_INFO_TEXT])) { return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); } $ret = '<' . $this->tag; $i = -1; foreach ($this->attr as $key => $val) { ++$i; // skip removed attribute if ($val === null || $val === false) { continue; } $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; //no value attr: nowrap, checked selected... if ($val === true) { $ret .= $key; } else { switch ($this->_[HDOM_INFO_QUOTE][$i]) { case HDOM_QUOTE_DOUBLE: $quote = '"'; break; case HDOM_QUOTE_SINGLE: $quote = '\''; break; default: $quote = ''; } $ret .= $key . $this->_[HDOM_INFO_SPACE][$i][1] . '=' . $this->_[HDOM_INFO_SPACE][$i][2] . $quote . $val . $quote; } } $ret = $this->dom->restore_noise($ret); return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; } function find($selector, $idx = null, $lowercase = false) { $selectors = $this->parse_selector($selector); if (($count = count($selectors)) === 0) { return array(); } $found_keys = array(); // find each selector for ($c = 0; $c < $count; ++$c) { // The change on the below line was documented on the sourceforge // code tracker id 2788009 // used to be: if (($levle=count($selectors[0]))===0) return array(); if (($levle = count($selectors[$c])) === 0) { return array(); } if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); } $head = array($this->_[HDOM_INFO_BEGIN] => 1); $cmd = ' '; // Combinator // handle descendant selectors, no recursive! for ($l = 0; $l < $levle; ++$l) { $ret = array(); foreach ($head as $k => $v) { $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k]; //PaperG - Pass this optional parameter on to the seek function. 
$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase); } $head = $ret; $cmd = $selectors[$c][$l][4]; // Next Combinator } foreach ($head as $k => $v) { if (!isset($found_keys[$k])) { $found_keys[$k] = 1; } } } // sort keys ksort($found_keys); $found = array(); foreach ($found_keys as $k => $v) { $found[] = $this->dom->nodes[$k]; } // return nth-element or array if (is_null($idx)) { return $found; } elseif ($idx < 0) { $idx = count($found) + $idx; } return (isset($found[$idx])) ? $found[$idx] : null; } protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) { global $debug_object; if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } list($tag, $id, $class, $attributes, $cmb) = $selector; $nodes = array(); if ($parent_cmd === ' ') { // Descendant Combinator // Find parent closing tag if the current element doesn't have a closing // tag (i.e. void element) $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; if ($end == 0) { $parent = $this->parent; while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { $end -= 1; $parent = $parent->parent; } $end += $parent->_[HDOM_INFO_END]; } // Get list of target nodes $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; $nodes_count = $end - $nodes_start; $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); } elseif ($parent_cmd === '>') { // Child Combinator $nodes = $this->children; } elseif ($parent_cmd === '+' && $this->parent && in_array($this, $this->parent->children)) { // Next-Sibling Combinator $index = array_search($this, $this->parent->children, true) + 1; if ($index < count($this->parent->children)) $nodes[] = $this->parent->children[$index]; } elseif ($parent_cmd === '~' && $this->parent && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator $index = array_search($this, $this->parent->children, true); $nodes = array_slice($this->parent->children, $index); } // Go throgh each element starting at this element until the 
end tag // Note: If this element is a void tag, any previous void element is // skipped. foreach($nodes as $node) { $pass = true; // Skip root nodes if(!$node->parent) { $pass = false; } // Handle 'text' selector if($pass && $tag === 'text' && $node->tag === 'text') { $ret[array_search($node, $this->dom->nodes, true)] = 1; unset($node); continue; } // Skip if node isn't a child node (i.e. text nodes) if($pass && !in_array($node, $node->parent->children, true)) { $pass = false; } // Skip if tag doesn't match if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { $pass = false; } // Skip if ID doesn't exist if ($pass && $id !== '' && !isset($node->attr['id'])) { $pass = false; } // Check if ID matches if ($pass && $id !== '' && isset($node->attr['id'])) { // Note: Only consider the first ID (as browsers do) $node_id = explode(' ', trim($node->attr['id']))[0]; if($id !== $node_id) { $pass = false; } } // Check if all class(es) exist if ($pass && $class !== '' && is_array($class) && !empty($class)) { if (isset($node->attr['class'])) { $node_classes = explode(' ', $node->attr['class']); if ($lowercase) { $node_classes = array_map('strtolower', $node_classes); } foreach($class as $c) { if(!in_array($c, $node_classes)) { $pass = false; break; } } } else { $pass = false; } } // Check attributes if ($pass && $attributes !== '' && is_array($attributes) && !empty($attributes)) { foreach($attributes as $a) { list ( $att_name, $att_expr, $att_val, $att_inv, $att_case_sensitivity ) = $a; // Handle indexing attributes (i.e. "[2]") /** * Note: This is not supported by the CSS Standard but adds * the ability to select items compatible to XPath (i.e. * the 3rd element within it's parent). * * Note: This doesn't conflict with the CSS Standard which * doesn't work on numeric attributes anyway. 
*/ if (is_numeric($att_name) && $att_expr === '' && $att_val === '') { $count = 0; // Find index of current element in parent foreach ($node->parent->children as $c) { if ($c->tag === $node->tag) ++$count; if ($c === $node) break; } // If this is the correct node, continue with next // attribute if ($count === (int)$att_name) continue; } // Check attribute availability if ($att_inv) { // Attribute should NOT be set if (isset($node->attr[$att_name])) { $pass = false; break; } } else { // Attribute should be set // todo: "plaintext" is not a valid CSS selector! if ($att_name !== 'plaintext' && !isset($node->attr[$att_name])) { $pass = false; break; } } // Continue with next attribute if expression isn't defined if ($att_expr === '') continue; // If they have told us that this is a "plaintext" // search then we want the plaintext of the node - right? // todo "plaintext" is not a valid CSS selector! if ($att_name === 'plaintext') { $nodeKeyValue = $node->text(); } else { $nodeKeyValue = $node->attr[$att_name]; } if (is_object($debug_object)) { $debug_object->debug_log(2, 'testing node: ' . $node->tag . ' for attribute: ' . $att_name . $att_expr . $att_val . ' where nodes value is: ' . $nodeKeyValue ); } // If lowercase is set, do a case insensitive test of // the value of the selector. if ($lowercase) { $check = $this->match( $att_expr, strtolower($att_val), strtolower($nodeKeyValue), $att_case_sensitivity ); } else { $check = $this->match( $att_expr, $att_val, $nodeKeyValue, $att_case_sensitivity ); } if (is_object($debug_object)) { $debug_object->debug_log(2, 'after match: ' . ($check ? 'true' : 'false') ); } if (!$check) { $pass = false; break; } } } // Found a match. Add to list and clear node if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; unset($node); } // It's passed by reference so this is actually what this function returns. 
if (is_object($debug_object)) { $debug_object->debug_log(1, 'EXIT - ret: ', $ret); } } protected function match($exp, $pattern, $value, $case_sensitivity) { global $debug_object; if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} if ($case_sensitivity === 'i') { $pattern = strtolower($pattern); $value = strtolower($value); } switch ($exp) { case '=': return ($value === $pattern); case '!=': return ($value !== $pattern); case '^=': return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); case '$=': return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); case '*=': return preg_match('/' . preg_quote($pattern, '/') . '/', $value); case '|=': /** * [att|=val] * * Represents an element with the att attribute, its value * either being exactly "val" or beginning with "val" * immediately followed by "-" (U+002D). */ return strpos($value, $pattern) === 0; case '~=': /** * [att~=val] * * Represents an element with the att attribute whose value is a * whitespace-separated list of words, one of which is exactly * "val". If "val" contains whitespace, it will never represent * anything (since the words are separated by spaces). Also if * "val" is the empty string, it will never represent anything. */ return in_array($pattern, explode(' ', trim($value)), true); } return false; } protected function parse_selector($selector_string) { global $debug_object; if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } /** * Pattern of CSS selectors, modified from mootools (https://mootools.net/) * * Paperg: Add the colon to the attribute, so that it properly finds * <tag attr:ibute="something" > like google does. * * Note: if you try to look at this attribute, you MUST use getAttribute * since $dom->x:y will fail the php syntax check. * * Notice the \[ starting the attribute? and the @? following? This * implies that an attribute can begin with an @ sign that is not * captured. 
This implies that an html attribute specifier may start * with an @ sign that is NOT captured by the expression. Farther study * is required to determine of this should be documented or removed. * * Matches selectors in this order: * * [0] - full match * * [1] - tag name * ([\w:\*-]*) * Matches the tag name consisting of zero or more words, colons, * asterisks and hyphens. * * [2] - id name * (?:\#([\w-]+)) * Optionally matches a id name, consisting of an "#" followed by * the id name (one or more words and hyphens). * * [3] - class names (including dots) * (?:\.([\w\.-]+))? * Optionally matches a list of classs, consisting of an "." * followed by the class name (one or more words and hyphens) * where multiple classes can be chained (i.e. ".foo.bar.baz") * * [4] - attributes * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? * Optionally matches the attributes list * * [5] - separator * ([\/, >+~]+) * Matches the selector list separator */ // phpcs:ignore Generic.Files.LineLength $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; preg_match_all( $pattern, trim($selector_string) . ' ', // Add final ' ' as pseudo separator $matches, PREG_SET_ORDER ); if (is_object($debug_object)) { $debug_object->debug_log(2, 'Matches Array: ', $matches); } $selectors = array(); $result = array(); foreach ($matches as $m) { $m[0] = trim($m[0]); // Skip NoOps if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } // Convert to lowercase if ($this->dom->lowercase) { $m[1] = strtolower($m[1]); } // Extract classes if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } /* Extract attributes (pattern based on the pattern above!) * [0] - full match * [1] - attribute name * [2] - attribute expression * [3] - attribute value * [4] - case sensitivity * * Note: Attributes can be negated with a "!" 
prefix to their name */ if($m[4] !== '') { preg_match_all( "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", trim($m[4]), $attributes, PREG_SET_ORDER ); // Replace element by array $m[4] = array(); foreach($attributes as $att) { // Skip empty matches if(trim($att[0]) === '') { continue; } $inverted = (isset($att[1][0]) && $att[1][0] === '!'); $m[4][] = array( $inverted ? substr($att[1], 1) : $att[1], // Name (isset($att[2])) ? $att[2] : '', // Expression (isset($att[3])) ? $att[3] : '', // Value $inverted, // Inverted Flag (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity ); } } // Sanitize Separator if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator $m[5] = ' '; } else { // Other Separator $m[5] = trim($m[5]); } // Clear Separator if it's a Selector List if ($is_list = ($m[5] === ',')) { $m[5] = ''; } // Remove full match before adding to results array_shift($m); $result[] = $m; if ($is_list) { // Selector List $selectors[] = $result; $result = array(); } } if (count($result) > 0) { $selectors[] = $result; } return $selectors; } function __get($name) { if (isset($this->attr[$name])) { return $this->convert_text($this->attr[$name]); } switch ($name) { case 'outertext': return $this->outertext(); case 'innertext': return $this->innertext(); case 'plaintext': return $this->text(); case 'xmltext': return $this->xmltext(); default: return array_key_exists($name, $this->attr); } } function __set($name, $value) { global $debug_object; if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } switch ($name) { case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; case 'innertext': if (isset($this->_[HDOM_INFO_TEXT])) { return $this->_[HDOM_INFO_TEXT] = $value; } return $this->_[HDOM_INFO_INNER] = $value; } if (!isset($this->attr[$name])) { $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; } $this->attr[$name] = $value; } function __isset($name) { 
switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }

        // No-value attributes (nowrap, checked, selected, ...): the key can
        // exist even when isset() on the stored value reports false.
        return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
    }

    /**
     * Magic unset: drop attribute $name when it is set.
     *
     * @param string $name Attribute name.
     */
    function __unset($name)
    {
        if (isset($this->attr[$name])) {
            unset($this->attr[$name]);
        }
    }

    /**
     * Convert text from the document charset to the target charset.
     *
     * Both charsets are read from $this->dom (_charset / _target_charset).
     * Nothing is converted when either one is empty, when they are equal, or
     * when the target is UTF-8 and the text already validates as UTF-8. When
     * the target is UTF-8 a BOM at either end of the result is stripped.
     *
     * @param string $text Text to convert.
     * @return string Converted text; the unchanged input when conversion is
     * skipped or iconv() fails.
     */
    function convert_text($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $converted_text = $text;

        $sourceCharset = '';
        $targetCharset = '';

        if ($this->dom) {
            $sourceCharset = strtoupper($this->dom->_charset);
            $targetCharset = strtoupper($this->dom->_target_charset);
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(3,
                'source charset: '
                . $sourceCharset
                . ' target charaset: '
                . $targetCharset
            );
        }

        if (!empty($sourceCharset)
            && !empty($targetCharset)
            && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
            // Check if the reported encoding could have been incorrect and
            // the text is actually already UTF-8
            if ((strcasecmp($targetCharset, 'UTF-8') == 0)
                && ($this->is_utf8($text))) {
                $converted_text = $text;
            } else {
                // The debugging residue that used to sit here ("echo $text;
                // return;") printed the raw text and returned null before the
                // conversion could run. Convert instead, and keep the input
                // when iconv() reports failure by returning false.
                $recoded = iconv($sourceCharset, $targetCharset, $text);

                if ($recoded !== false) {
                    $converted_text = $recoded;
                }
            }
        }

        // Lets make sure that we don't have that silly BOM issue with any of
        // the utf-8 text we output.
        if ($targetCharset === 'UTF-8') {
            if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 3);
            }

            if (substr($converted_text, -3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 0, -3);
            }
        }

        return $converted_text;
    }

    /**
     * Byte-level check whether $str is structurally valid UTF-8.
     *
     * The lead byte selects a sequence length of 2 to 6 bytes (the legacy
     * 5- and 6-byte forms are accepted); every following byte must lie in
     * 0x80-0xBF.
     *
     * @param string $str Bytes to inspect.
     * @return bool True when every multi-byte sequence is well formed.
     */
    static function is_utf8($str)
    {
        $c = 0;
        $b = 0;
        $bits = 0;
        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);

            // Everything from 0x80 up is non-ASCII. The bound used to be
            // "> 128", which let a lone 0x80 byte (e.g. the CP1252 euro sign)
            // pass as valid UTF-8.
            if($c >= 128) {
                if(($c >= 254)) {
                    return false;
                } elseif($c >= 252) {
                    $bits = 6;
                } elseif($c >= 248) {
                    $bits = 5;
                } elseif($c >= 240) {
                    $bits = 4;
                } elseif($c >= 224) {
                    $bits = 3;
                } elseif($c >= 192) {
                    $bits = 2;
                } else {
                    // 0x80-0xBF is a continuation byte without a lead byte
                    return false;
                }

                // Sequence would run past the end of the string
                if(($i + $bits) > $len) {
                    return false;
                }

                while($bits > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    if($b < 128 || $b > 191) {
                        return false;
                    }
                    $bits--;
                }
            }
        }

        return true;
    }

    /**
     * Best-effort display size of an img element.
     *
     * The width/height attributes win; for a dimension still unknown, a
     * pixel value ("NNpx") from the inline style attribute is used.
     *
     * @return array|false array('height' => ..., 'width' => ...) with -1 for
     * an unknown dimension, or false when this node's tag is not 'img'.
     */
    function get_display_size()
    {
        $width = -1;
        $height = -1;

        if ($this->tag !== 'img') {
            return false;
        }

        // See if there is a height or width attribute in the tag itself.
        if (isset($this->attr['width'])) {
            $width = $this->attr['width'];
        }

        if (isset($this->attr['height'])) {
            $height = $this->attr['height'];
        }

        // Now look for an inline style.
        if (isset($this->attr['style'])) {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();

            preg_match_all(
                '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
                $this->attr['style'],
                $matches,
                PREG_SET_ORDER
            );

            foreach ($matches as $match) {
                $attributes[$match[1]] = $match[2];
            }

            // If there is a width in the style attributes:
            if (isset($attributes['width']) && $width == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['width'], -2)) === 'px') {
                    $proposed_width = substr($attributes['width'], 0, -2);
                    // Now make sure that it's an integer and not something
                    // stupid. filter_var() yields int 0 for "0", so compare
                    // against false instead of relying on truthiness.
                    if (filter_var($proposed_width, FILTER_VALIDATE_INT) !== false) {
                        $width = $proposed_width;
                    }
                }
            }

            // If there is a height in the style attributes:
            if (isset($attributes['height']) && $height == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['height'], -2)) == 'px') {
                    $proposed_height = substr($attributes['height'], 0, -2);
                    // Same integer check as for the width above.
                    if (filter_var($proposed_height, FILTER_VALIDATE_INT) !== false) {
                        $height = $proposed_height;
                    }
                }
            }
        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has
        // a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a
        // class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector
        // for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file thats not on
        // the page, go get it and do what we were just doing for the ones on
        // the page.

        $result = array(
            'height' => $height,
            'width' => $width
        );

        return $result;
    }

    /**
     * Render this node's outer text and optionally write it to a file.
     *
     * @param string $filepath Destination path; '' skips the write.
     * @return string The rendered outer text.
     */
    function save($filepath = '')
    {
        $ret = $this->outertext();

        if ($filepath !== '') {
            file_put_contents($filepath, $ret, LOCK_EX);
        }

        return $ret;
    }

    /**
     * Add one or more class names to the class attribute, skipping names
     * already present.
     *
     * @param string|array $class Space separated names or an array of names.
     */
    function addClass($class)
    {
        // Was missing: without it $debug_object is an undefined local in the
        // "invalid type" branch below.
        global $debug_object;

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            foreach($class as $c) {
                if (isset($this->class)) {
                    if ($this->hasClass($c)) {
                        continue;
                    } else {
                        $this->class .= ' ' . $c;
                    }
                } else {
                    $this->class = $c;
                }
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }
    }

    /**
     * Tell whether the class attribute contains exactly the name $class.
     *
     * @param string $class A single class name.
     * @return bool False also when $class is not a string or no class is set.
     */
    function hasClass($class)
    {
        // Was missing, same problem as in addClass().
        global $debug_object;

        if (is_string($class)) {
            if (isset($this->class)) {
                return in_array($class, explode(' ', $this->class), true);
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }

        return false;
    }

    /**
     * Remove class names from the class attribute.
     *
     * @param string|array|null $class Names to remove; null removes the
     * whole class attribute.
     */
    function removeClass($class = null)
    {
        if (!isset($this->class)) {
            return;
        }

        if (is_null($class)) {
            $this->removeAttribute('class');
            return;
        }

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            $class = array_diff(explode(' ', $this->class), $class);
            if (empty($class)) {
                $this->removeAttribute('class');
            } else {
                $this->class = implode(' ', $class);
            }
        }
    }

    /** Return the raw attribute array of this node. */
    function getAllAttributes()
    {
        return $this->attr;
    }

    /** Read an attribute through __get(). */
    function getAttribute($name)
    {
        return $this->__get($name);
    }

    /** Write an attribute through __set(). */
    function setAttribute($name, $value)
    {
        $this->__set($name, $value);
    }

    /** Test for an attribute through __isset(). */
    function hasAttribute($name)
    {
        return $this->__isset($name);
    }

    /** Set the attribute to null through __set(). */
    function removeAttribute($name)
    {
        $this->__set($name, null);
    }

    /** Detach this node from its parent, if it has one. */
    function remove()
    {
        if ($this->parent) {
            $this->parent->removeChild($this);
        }
    }

    /**
     * Remove $node and everything below it from this node and from the
     * owning DOM's node list.
     *
     * Nothing happens unless $node is found in this node's nodes, this
     * node's children and the DOM's nodes.
     *
     * @param object $node Child node to remove.
     */
    function removeChild($node)
    {
        $nidx = array_search($node, $this->nodes, true);
        $cidx = array_search($node, $this->children, true);
        $didx = array_search($node, $this->dom->nodes, true);

        if ($nidx !== false && $cidx !== false && $didx !== false) {

            foreach($node->children as $child) {
                $node->removeChild($child);
            }

            foreach($node->nodes as $entity) {
                $enidx = array_search($entity, $node->nodes, true);
                $edidx = array_search($entity, $node->dom->nodes, true);

                if ($enidx !== false && $edidx !== false) {
                    unset($node->nodes[$enidx]);
                    unset($node->dom->nodes[$edidx]);
                }
            }

            unset($this->nodes[$nidx]);
            unset($this->children[$cidx]);
            unset($this->dom->nodes[$didx]);

            $node->clear();
        }
    }

    /** DOM-style alias: first match of "#$id". */
    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    /** DOM-style alias: matches of "#$id", optionally narrowed by $idx. */
    function getElementsById($id, $idx = null)
    {
        return
$this->find("#$id", $idx);
    }

    /** DOM-style alias: first element matching the selector $name. */
    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    /** DOM-style alias: elements matching $name, optionally narrowed by $idx. */
    function getElementsByTagName($name, $idx = null)
    {
        return $this->find($name, $idx);
    }

    /** DOM-style alias for parent(). */
    function parentNode()
    {
        return $this->parent();
    }

    /** DOM-style alias for children(). */
    function childNodes($idx = -1)
    {
        return $this->children($idx);
    }

    /** DOM-style alias for first_child(). */
    function firstChild()
    {
        return $this->first_child();
    }

    /** DOM-style alias for last_child(). */
    function lastChild()
    {
        return $this->last_child();
    }

    /** DOM-style alias for next_sibling(). */
    function nextSibling()
    {
        return $this->next_sibling();
    }

    /** DOM-style alias for prev_sibling(). */
    function previousSibling()
    {
        return $this->prev_sibling();
    }

    /** DOM-style alias for has_child(). */
    function hasChildNodes()
    {
        return $this->has_child();
    }

    /** Return the tag name of this node. */
    function nodeName()
    {
        return $this->tag;
    }

    /** Make this node the parent of $node and return $node. */
    function appendChild($node)
    {
        $node->parent($this);
        return $node;
    }

}

/**
 * HTML document: tokenizes markup into simple_html_dom_node objects and
 * exposes the tree through $root.
 *
 * Relies on the HDOM_* / DEFAULT_* constants, simple_html_dom_node and
 * str_get_html(), which are defined elsewhere in this file.
 */
class simple_html_dom
{
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    public $original_size;
    public $size;

    // Tokenizer state: byte offset, working copy of the markup, current char
    protected $pos;
    protected $doc;
    protected $char;

    protected $cursor;
    protected $parent;
    // Markup fragments cut out of $doc, keyed by their placeholder
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    public $_charset = '';
    public $_target_charset = '';

    protected $default_br_text = '';

    public $default_span_text = '';

    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );

    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );

    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt' => 1),
        'td' => array('td' => 1, 'th' => 1),
        'th' => array('td' => 1, 'th' => 1),
        'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
    );

    /**
     * Create a document and, when $str is given, load it right away.
     *
     * @param string|null $str Markup, an http(s) URL or a file path.
     * @param bool $lowercase Lowercase tag and attribute names.
     * @param bool $forceTagsClosed False empties the optional-closing-tag table.
     * @param string $target_charset Charset used by convert_text().
     * @param bool $stripRN Replace \r and \n with spaces before parsing.
     * @param string $defaultBRText Inner text given to br elements.
     * @param string $defaultSpanText Stored in $default_span_text.
     * @param int $options Bit flags, e.g. HDOM_SMARTY_AS_TEXT.
     */
    function __construct(
        $str = null,
        $lowercase = true,
        $forceTagsClosed = true,
        $target_charset = DEFAULT_TARGET_CHARSET,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0)
    {
        // Forcing tags to be closed implies that we don't trust the html, but
        // it can lead to parsing errors if we SHOULD trust the html.
        // This used to assign to a non-existent "optional_closing_array"
        // property, so the flag never had any effect. It is applied before
        // loading so that the parse triggered below honours it too.
        if (!$forceTagsClosed) {
            $this->optional_closing_tags = array();
        }

        $this->_target_charset = $target_charset;

        if ($str) {
            // https URLs are fetched as well, not only plain http
            if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
                $this->load_file($str);
            } else {
                $this->load(
                    $str,
                    $lowercase,
                    $stripRN,
                    $defaultBRText,
                    $defaultSpanText,
                    $options
                );
            }
        }
    }

    /** Release node references when the document goes away. */
    function __destruct()
    {
        $this->clear();
    }

    /**
     * Parse markup from a string.
     *
     * Script, CDATA, comment, style, code and server-side blocks are swapped
     * for noise placeholders before tokenizing so their content is not
     * parsed as tags.
     *
     * @param string $str Markup to parse.
     * @param bool $lowercase Lowercase tag and attribute names.
     * @param bool $stripRN Replace \r and \n with spaces before parsing.
     * @param string $defaultBRText Inner text given to br elements.
     * @param string $defaultSpanText Stored in $default_span_text.
     * @param int $options Bit flags, e.g. HDOM_SMARTY_AS_TEXT.
     * @return simple_html_dom $this, so calls can be chained.
     */
    function load(
        $str,
        $lowercase = true,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0)
    {
        // prepare
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

        // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
        // Script tags removal now preceeds style tag removal.
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

        // strip out the \r \n's if we are told to.
        if ($stripRN) {
            $this->doc = str_replace("\r", ' ', $this->doc);
            $this->doc = str_replace("\n", ' ', $this->doc);

            // set the length of content since we have changed it.
            $this->size = strlen($this->doc);
        }

        // strip out cdata
        $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

        if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
            $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
        }

        // parsing
        $this->parse();
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        $this->parse_charset();

        // make load function chainable
        return $this;
    }

    /**
     * Fetch content with file_get_contents() and parse it.
     *
     * All arguments are forwarded to file_get_contents().
     *
     * @return false|null False when the content could not be read.
     */
    function load_file()
    {
        $args = func_get_args();

        if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
            $this->load($doc, true);
        } else {
            return false;
        }
    }

    /** Store a callback name in $callback. */
    function set_callback($function_name)
    {
        $this->callback = $function_name;
    }

    /** Clear the stored callback. */
    function remove_callback()
    {
        $this->callback = null;
    }

    /**
     * Render the document and optionally write it to a file.
     *
     * @param string $filepath Destination path; '' skips the write.
     * @return string The root node's inner text.
     */
    function save($filepath = '')
    {
        $ret = $this->root->innertext();

        if ($filepath !== '') {
            file_put_contents($filepath, $ret, LOCK_EX);
        }

        return $ret;
    }

    /** Run a selector query against the root node. */
    function find($selector, $idx = null, $lowercase = false)
    {
        return $this->root->find($selector, $idx, $lowercase);
    }

    /** Clear all nodes and drop the document text and noise table. */
    function clear()
    {
        if (isset($this->nodes)) {
            foreach ($this->nodes as $n) {
                $n->clear();
                $n = null;
            }
        }

        // This add next line is documented in the sourceforge repository.
        // 2977248 as a fix for ongoing memory leaks that occur even with the
        // use of clear.
        if (isset($this->children)) {
            foreach ($this->children as $n) {
                $n->clear();
                $n = null;
            }
        }

        if (isset($this->parent)) {
            $this->parent->clear();
            unset($this->parent);
        }

        if (isset($this->root)) {
            $this->root->clear();
            unset($this->root);
        }

        unset($this->doc);
        unset($this->noise);
    }

    /** Delegate to the root node's dump(). */
    function dump($show_attr = true)
    {
        $this->root->dump($show_attr);
    }

    /**
     * Reset the tokenizer state for a new document and create the root node.
     *
     * @param string $str Markup; it is trimmed before use.
     * @param bool $lowercase Lowercase tag and attribute names.
     * @param string $defaultBRText Inner text given to br elements.
     * @param string $defaultSpanText Stored in $default_span_text.
     */
    protected function prepare(
        $str, $lowercase = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT)
    {
        $this->clear();

        $this->doc = trim($str);
        $this->size = strlen($this->doc);
        $this->original_size = $this->size; // original size of the html
        $this->pos = 0;
        $this->cursor = 1;
        $this->noise = array();
        $this->nodes = array();
        $this->lowercase = $lowercase;
        $this->default_br_text = $defaultBRText;
        $this->default_span_text = $defaultSpanText;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->_[HDOM_INFO_BEGIN] = -1;
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        if ($this->size > 0) { $this->char = $this->doc[0]; }
    }

    /**
     * Main loop: alternate between text runs and tags until read_tag()
     * reports that no further tag exists.
     *
     * @return bool Always true.
     */
    protected function parse()
    {
        while (true) {
            // Read next tag if there is no text between current position and the
            // next opening tag.
            if (($s = $this->copy_until_char('<')) === '') {
                if($this->read_tag()) {
                    continue;
                } else {
                    return true;
                }
            }

            // Add a text node for text between tags
            $node = new simple_html_dom_node($this);
            ++$this->cursor;
            $node->_[HDOM_INFO_TEXT] = $s;
            $this->link_nodes($node, false);
        }
    }

    /**
     * Determine the document charset and store it in $_charset.
     *
     * Sources are tried in this order: the content-type header helper (when
     * that function exists), a meta http-equiv tag, a meta charset tag,
     * mb_detect_encoding(), and finally the UTF-8 default.
     *
     * @return string The detected charset.
     */
    protected function parse_charset()
    {
        global $debug_object;

        $charset = null;

        if (function_exists('get_last_retrieve_url_contents_content_type')) {
            $contentTypeHeader = get_last_retrieve_url_contents_content_type();
            $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
            if ($success) {
                $charset = $matches[1];
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'header content-type found charset of: '
                        . $charset
                    );
                }
            }
        }

        if (empty($charset)) {
            // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
            $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

            if (!empty($el)) {
                $fullvalue = $el->content;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'meta content-type tag found'
                        . $fullvalue
                    );
                }

                if (!empty($fullvalue)) {
                    $success = preg_match(
                        '/charset=(.+)/i',
                        $fullvalue,
                        $matches
                    );

                    if ($success) {
                        $charset = $matches[1];
                    } else {
                        // If there is a meta tag, and they don't specify the
                        // character set, research says that it's typically
                        // ISO-8859-1
                        if (is_object($debug_object)) {
                            $debug_object->debug_log(2,
                                'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                            );
                        }

                        $charset = 'ISO-8859-1';
                    }
                }
            }
        }

        if (empty($charset)) {
            // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
            if ($meta = $this->root->find('meta[charset]', 0)) {
                $charset = $meta->charset;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'meta charset: ' . $charset);
                }
            }
        }

        if (empty($charset)) {
            // Try to guess the charset based on the content
            // Requires Multibyte String (mbstring) support (optional)
            if (function_exists('mb_detect_encoding')) {
                /**
                 * mb_detect_encoding() is not intended to distinguish between
                 * charsets, especially single-byte charsets. Its primary
                 * purpose is to detect which multibyte encoding is in use,
                 * i.e. UTF-8, UTF-16, shift-JIS, etc.
                 *
                 * -- https://bugs.php.net/bug.php?id=38138
                 *
                 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
                 * always result in CP1251/ISO-8859-5 and vice versa.
                 *
                 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
                 * to stay compatible.
                 */
                $encoding = mb_detect_encoding(
                    $this->doc,
                    array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
                );

                if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                    // Due to a limitation of mb_detect_encoding
                    // 'CP1251'/'ISO-8859-5' will be detected as
                    // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                    // which case we can simply assume it is the other charset.
                    if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                        $encoding = 'CP1251';
                    }
                }

                if ($encoding !== false) {
                    $charset = $encoding;
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                    }
                }
            }
        }

        if (empty($charset)) {
            // Assume it's UTF-8 as it is the most likely charset to be used
            $charset = 'UTF-8';
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'No match found, assume ' . $charset);
            }
        }

        // Since CP1252 is a superset, if we get one of it's subsets, we want
        // it instead.
        if ((strtolower($charset) == 'iso-8859-1')
            || (strtolower($charset) == 'latin1')
            || (strtolower($charset) == 'latin-1')) {
            // Log before overwriting so the message names the charset that
            // is being replaced (it used to print CP1252 on both sides).
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'replacing ' . $charset . ' with CP1252 as its a superset'
                );
            }
            $charset = 'CP1252';
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(1, 'EXIT - ' . $charset);
        }

        return $this->_charset = $charset;
    }

    /**
     * Consume one tag starting at the current '<' and link the resulting
     * node into the tree.
     *
     * @return bool False when the current char is not '<' (nothing left to
     * read; the root end position is recorded), true otherwise.
     */
    protected function read_tag()
    {
        // Set end position if no further tags found
        if ($this->char !== '<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }

        $begin_tag_pos = $this->pos;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // end tag
        if ($this->char === '/') {
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

            // Skip whitespace in end tags (i.e. in "</ html>")
            $this->skip($this->token_blank);
            $tag = $this->copy_until_char('>');

            // Skip attributes in end tags
            if (($pos = strpos($tag, ' ')) !== false) {
                $tag = substr($tag, 0, $pos);
            }

            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            // The end tag is supposed to close the parent tag. Handle situations
            // when it doesn't
            if ($parent_lower !== $tag_lower) {
                // Parent tag does not have to be closed necessarily (optional closing tag)
                // Current tag is a block tag, so it may close an ancestor
                if (isset($this->optional_closing_tags[$parent_lower])
                    && isset($this->block_tags[$tag_lower])) {

                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    // Traverse ancestors to find a matching opening tag
                    // Stop at root node
                    while (($this->parent->parent)
                        && strtolower($this->parent->tag) !== $tag_lower
                    ){
                        $this->parent = $this->parent->parent;
                    }

                    // If we don't have a match add current tag as text node
                    if (strtolower($this->parent->tag) !== $tag_lower) {
                        $this->parent = $org_parent; // restore origonal parent

                        if ($this->parent->parent) {
                            $this->parent = $this->parent->parent;
                        }

                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                } elseif (($this->parent->parent)
                    && isset($this->block_tags[$tag_lower])
                ) {
                    // Grandparent exists and current tag is a block tag, so our
                    // parent doesn't have an end tag
                    $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                    $org_parent = $this->parent;

                    // Traverse ancestors to find a matching opening tag
                    // Stop at root node
                    while (($this->parent->parent)
                        && strtolower($this->parent->tag) !== $tag_lower
                    ) {
                        $this->parent = $this->parent->parent;
                    }

                    // If we don't have a match add current tag as text node
                    if (strtolower($this->parent->tag) !== $tag_lower) {
                        $this->parent = $org_parent; // restore origonal parent
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                } elseif (($this->parent->parent)
                    && strtolower($this->parent->parent->tag) === $tag_lower
                ) { // Grandparent exists and current tag closes it
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $this->parent = $this->parent->parent;
                } else { // Random tag, add as text node
                    return $this->as_text_node($tag);
                }
            }

            // Set end position of parent tag to current cursor position
            $this->parent->_[HDOM_INFO_END] = $this->cursor;

            if ($this->parent->parent) {
                $this->parent = $this->parent->parent;
            }

            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // start tag
        $node = new simple_html_dom_node($this);
        $node->_[HDOM_INFO_BEGIN] = $this->cursor;
        ++$this->cursor;
        $tag = $this->copy_until($this->token_slash); // Get tag name
        $node->tag_start = $begin_tag_pos;

        // doctype, cdata & comments...
        // <!DOCTYPE html>
        // <![CDATA[ ... ]]>
        // <!-- Comment -->
        if (isset($tag[0]) && $tag[0] === '!') {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

            if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else { // Could be doctype or CDATA but we don't care
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

            $this->link_nodes($node, true);
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // The start tag cannot contain another start tag, if so add as text
        // i.e. "<<html>"
        // (Used to read "$pos = strpos(...) !== false", which only stored a
        // boolean in an otherwise unused variable.)
        if (strpos($tag, '<') !== false) {
            $tag = '<' . substr($tag, 0, -1);
            $node->_[HDOM_INFO_TEXT] = $tag;
            $this->link_nodes($node, false);
            $this->char = $this->doc[--$this->pos]; // prev
            return true;
        }

        // Handle invalid tag names (i.e. "<html#doc>")
        if (!preg_match('/^\w[\w:-]*$/', $tag)) {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

            // Next char is the beginning of a new tag, don't touch it.
            if ($this->char === '<') {
                $this->link_nodes($node, false);
                return true;
            }

            // Next char closes current tag, add and be done with it.
            if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
            $this->link_nodes($node, false);
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // begin tag, add new node
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($tag);
        $node->tag = ($this->lowercase) ? $tag_lower : $tag;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower])) {
            // Traverse ancestors to close all optional closing tags
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }

        $guard = 0; // prevent infinity loop

        // [0] Space between tag and first attribute
        $space = array($this->copy_skip($this->token_blank), '', '');

        // attributes
        do {
            // Everything until the first equal sign should be the attribute name
            $name = $this->copy_until($this->token_equal);

            if ($name === '' && $this->char !== null && $space[0] === '') {
                break;
            }

            if ($guard === $this->pos) { // Escape infinite loop
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                continue;
            }

            $guard = $this->pos;

            // handle endless '<'
            // Out of bounds before the tag ended
            if ($this->pos >= $this->size - 1 && $this->char !== '>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
                $node->tag = 'text';
                $this->link_nodes($node, false);
                return true;
            }

            // handle mismatch '<'
            // Attributes cannot start after opening tag
            if ($this->doc[$this->pos - 1] == '<') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->tag = 'text';
                $node->attr = array();
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = substr(
                    $this->doc,
                    $begin_tag_pos,
                    $this->pos - $begin_tag_pos - 1
                );
                $this->pos -= 2;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->link_nodes($node, false);
                return true;
            }

            if ($name !== '/' && $name !== '') { // this is a attribute name
                // [1] Whitespace after attribute name
                $space[1] = $this->copy_skip($this->token_blank);

                $name = $this->restore_noise($name); // might be a noisy name

                if ($this->lowercase) { $name = strtolower($name); }

                if ($this->char === '=') { // attribute with value
                    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                    $this->parse_attr($node, $name, $space); // get attribute value
                } else {
                    //no value attr: nowrap, checked selected...
                    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                    $node->attr[$name] = true;
                    if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
                }

                $node->_[HDOM_INFO_SPACE][] = $space;

                // prepare for next attribute
                $space = array(
                    $this->copy_skip($this->token_blank),
                    '',
                    ''
                );
            } else { // no more attributes
                break;
            }
        } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

        $this->link_nodes($node, true);
        $node->_[HDOM_INFO_ENDSPACE] = $space[0];

        // handle empty tags (i.e. "<div/>")
        if ($this->copy_until_char('>') === '/') {
            $node->_[HDOM_INFO_ENDSPACE] .= '/';
            $node->_[HDOM_INFO_END] = 0;
        } else {
            // reset parent
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
                $this->parent = $node;
            }
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // If it's a BR tag, we need to set it's text to the default text.
        // This way when we see it in plaintext, we can generate formatting that the user wants.
        // since a br tag never has sub nodes, this works well.
        if ($node->tag === 'br') {
            $node->_[HDOM_INFO_INNER] = $this->default_br_text;
        }

        return true;
    }

    /**
     * Read the value that follows "=" and store it on $node unless the
     * attribute already exists (the first occurrence wins).
     *
     * @param object $node Node that receives the attribute.
     * @param string $name Attribute name.
     * @param array $space Whitespace triple of the current attribute; index 2
     * receives the whitespace between "=" and the value.
     */
    protected function parse_attr($node, $name, &$space)
    {
        $is_duplicate = isset($node->attr[$name]);

        // Always consume the whitespace between "=" and the value. It used to
        // be skipped only for first occurrences, so for a duplicate written
        // as name = "v" the value was left unread and then re-parsed as a
        // bogus attribute name. Only a first occurrence records it.
        $skipped = $this->copy_skip($this->token_blank);

        if (!$is_duplicate) {
            $space[2] = $skipped;
        }

        switch ($this->char) {
            case '"':
                $quote_type = HDOM_QUOTE_DOUBLE;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $value = $this->copy_until_char('"');
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                break;
            case '\'':
                $quote_type = HDOM_QUOTE_SINGLE;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $value = $this->copy_until_char('\'');
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                break;
            default:
                $quote_type = HDOM_QUOTE_NO;
                $value = $this->copy_until($this->token_attr);
        }

        $value = $this->restore_noise($value);

        // PaperG: Attributes should not have \r or \n in them, that counts as
        // html whitespace.
        $value = str_replace("\r", '', $value);
        $value = str_replace("\n", '', $value);

        // PaperG: If this is a "class" selector, lets get rid of the preceeding
        // and trailing space since some people leave it in the multi class case.
        if ($name === 'class') {
            $value = trim($value);
        }

        if (!$is_duplicate) {
            $node->_[HDOM_INFO_QUOTE][] = $quote_type;
            $node->attr[$name] = $value;
        }
    }

    /**
     * Attach $node to the current parent.
     *
     * @param object $node Node to attach.
     * @param bool $is_child True to list the node under children as well.
     */
    protected function link_nodes(&$node, $is_child)
    {
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;
        if ($is_child) {
            $this->parent->children[] = $node;
        }
    }

    /**
     * Keep a stray end tag as literal text and step past its '>'.
     *
     * @param string $tag Tag name of the end tag.
     * @return bool Always true.
     */
    protected function as_text_node($tag)
    {
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    /** Advance past any run of the characters in $chars. */
    protected function skip($chars)
    {
        $this->pos += strspn($this->doc, $chars, $this->pos);
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    }

    /** Advance past any run of the characters in $chars and return that run. */
    protected function copy_skip($chars)
    {
        $pos = $this->pos;
        $len = strspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        if ($len === 0) { return ''; }
        return substr($this->doc, $pos, $len);
    }

    /** Advance to the first character contained in $chars and return the text skipped. */
    protected function copy_until($chars)
    {
        $pos = $this->pos;
        $len = strcspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return substr($this->doc, $pos, $len);
    }

    /**
     * Advance to the next occurrence of $char and return the text skipped.
     *
     * When $char does not occur again, the rest of the document is returned
     * and the position moves to the end.
     */
    protected function copy_until_char($char)
    {
        if ($this->char === null) { return ''; }

        if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
            $ret = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos === $this->pos) { return ''; }

        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos - $pos_old);
    }

    /**
     * Replace every match of $pattern in the document by a placeholder key
     * and remember the removed text in $noise.
     *
     * @param string $pattern PCRE pattern with at least one capture group.
     * @param bool $remove_tag True replaces the whole match, false only the
     * first capture group.
     */
    protected function remove_noise($pattern, $remove_tag = false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $count = preg_match_all(
            $pattern,
            $this->doc,
            $matches,
            PREG_SET_ORDER | PREG_OFFSET_CAPTURE
        );

        // Back to front, so earlier offsets stay valid while replacing
        for ($i = $count - 1; $i > -1; --$i) {
            $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'key is: ' . $key);
            }

            $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
            $this->noise[$key] = $matches[$i][$idx][0];
            $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->doc);

        if ($this->size > 0) {
            $this->char = $this->doc[0];
        }
    }

    /**
     * Put the text removed by remove_noise() back in place of its
     * placeholder keys.
     *
     * @param string $text Text that may contain placeholder keys.
     * @return string Text with every placeholder replaced.
     */
    function restore_noise($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        while (($pos = strpos($text, '___noise___')) !== false) {
            // Sometimes there is a broken piece of markup, and we don't GET the
            // pos+11 etc... token which indicates a problem outside of us...

            // todo: "___noise___1000" (or any number with four or more digits)
            // in the DOM causes an infinite loop which could be utilized by
            // malicious software
            if (strlen($text) > $pos + 15) {
                $key = '___noise___'
                . $text[$pos + 11]
                . $text[$pos + 12]
                . $text[$pos + 13]
                . $text[$pos + 14]
                . $text[$pos + 15];

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'located key of: ' . $key);
                }

                if (isset($this->noise[$key])) {
                    $text = substr($text, 0, $pos)
                    . $this->noise[$key]
                    . substr($text, $pos + 16);
                } else {
                    // do this to prevent an infinite loop.
                    $text = substr($text, 0, $pos)
                    . 'UNDEFINED NOISE FOR KEY: '
                    . $key
                    . substr($text, $pos + 16);
                }
            } else {
                // There is no valid key being given back to us... We must get
                // rid of the ___noise___ or we will have a problem.
                $text = substr($text, 0, $pos)
                . 'NO NUMERIC NOISE KEY'
                . substr($text, $pos + 11);
            }
        }
        return $text;
    }

    /**
     * Return the first stored noise fragment that contains $text.
     *
     * @param string $text Substring to look for.
     * @return string|null The fragment, or null when none contains $text.
     */
    function search_noise($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        foreach($this->noise as $noiseElement) {
            if (strpos($noiseElement, $text) !== false) {
                return $noiseElement;
            }
        }
    }

    /** Render the document as the root node's inner text. */
    function __toString()
    {
        return $this->root->innertext();
    }

    /**
     * Virtual read-only properties: outertext, innertext, plaintext,
     * charset and target_charset. Any other name yields null.
     */
    function __get($name)
    {
        switch ($name) {
            case 'outertext':
                return $this->root->innertext();
            case 'innertext':
                return $this->root->innertext();
            case 'plaintext':
                return $this->root->text();
            case 'charset':
                return $this->_charset;
            case 'target_charset':
                return $this->_target_charset;
        }
    }

    /** DOM-style alias: the root node's childNodes(). */
    function childNodes($idx = -1)
    {
        return $this->root->childNodes($idx);
    }

    /** DOM-style alias: the root node's first_child(). */
    function firstChild()
    {
        return $this->root->first_child();
    }

    /** DOM-style alias: the root node's last_child(). */
    function lastChild()
    {
        return $this->root->last_child();
    }

    /** Build "<$name>$value</$name>" with str_get_html() and return its first child. */
    function createElement($name, $value = null)
    {
        return @str_get_html("<$name>$value</$name>")->firstChild();
    }

    /** Parse $value with str_get_html() and return the last node it produced. */
    function createTextNode($value)
    {
        return @end(str_get_html($value)->nodes);
    }

    /** DOM-style alias: first match of "#$id". */
    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    /** DOM-style alias: matches of "#$id", optionally narrowed by $idx. */
    function getElementsById($id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    /** DOM-style alias: first element matching the selector $name. */
    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    /**
     * DOM-style alias: elements matching $name.
     *
     * NOTE(review): the default $idx is -1 here while the node-level
     * counterpart defaults to null; find() is outside this block, so confirm
     * what -1 selects before relying on the default.
     */
    function getElementsByTagName($name, $idx = -1)
    {
        return $this->find($name, $idx);
    }

    /**
     * camelCase alias of load_file().
     *
     * The arguments are forwarded one by one; they used to be handed over as
     * a single array, which file_get_contents() cannot take as a filename.
     *
     * @return false|null Whatever load_file() returns.
     */
    function loadFile()
    {
        $args = func_get_args();
        return call_user_func_array(array($this, 'load_file'), $args);
    }
}

?>
📁 Wordpress
🔖