1
0
mirror of https://git.tt-rss.org/git/tt-rss.git synced 2025-12-13 04:35:56 +00:00

DOMDocument: remove the old meta-charset Unicode hacks and replace them with a shorter XML-preamble UTF-8 hack (applied on loadHTML() calls where it makes sense)

af_readability: better (?) charset hack for non-Unicode pages
This commit is contained in:
Andrew Dolgov
2019-03-21 21:08:02 +03:00
parent 3bd3324e5a
commit 671f4cee65
8 changed files with 15 additions and 46 deletions

View File

@@ -172,14 +172,10 @@ class Af_Readability extends Plugin {
if (!$tmpdoc->loadHTML($tmp))
return false;
// this is the worst hack yet :(
if (strtolower($tmpdoc->encoding) != 'utf-8') {
$tmpxpath = new DOMXPath($tmpdoc);
foreach ($tmpxpath->query("//meta") as $elem) {
$elem->parentNode->removeChild($elem);
}
$tmp = $tmpdoc->saveHTML();
$tmp = preg_replace("/<meta.*?charset.*?\/>/i", "", $tmp);
$tmp = mb_convert_encoding($tmp, 'utf-8', $tmpdoc->encoding);
}
try {
@@ -210,7 +206,6 @@ class Af_Readability extends Plugin {
} catch (Exception $e) {
return false;
}
}
return false;