From e6dda6dc9813471359cb2441a9aeda1fa1abea33 Mon Sep 17 00:00:00 2001 From: Petr Chromec Date: Wed, 25 Oct 2023 17:22:27 +0200 Subject: [PATCH] Encode html entities in HtmlHelper to prevent parse errors --- CHANGELOG.md | 1 + src/Service/HtmlHelper.php | 15 +++++++++++++++ tests/Service/HtmlHelperTest.php | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76c8be8..0126e28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Unreleased +- Encode html entities in `HtmlHelper` to prevent parse errors ## 7.3.0 - 2023-05-23 - Support `figure` html tag in `HtmlHelper::xpathHtmlDocument` method diff --git a/src/Service/HtmlHelper.php b/src/Service/HtmlHelper.php index 50ff40c..83a7c61 100644 --- a/src/Service/HtmlHelper.php +++ b/src/Service/HtmlHelper.php @@ -26,6 +26,7 @@ public function xpathHtmlDocument(string $content, string $xpathQuery): ISeq { return Seq::init(function () use ($xpathQuery, $content) { $htmlContent = $this->transformUnsupportedHtml($content); + $htmlContent = $this->encodeHtml($htmlContent); // @see https://www.php.net/manual/en/domdocument.loadhtml.php#95251 $dom = new \DOMDocument(); @@ -40,6 +41,20 @@ public function xpathHtmlDocument(string $content, string $xpathQuery): ISeq }); } + /** + * @see https://stackoverflow.com/questions/1685277/warning-domdocumentloadhtml-htmlparseentityref-expecting-in-entity + * + * It is meant to encode html entities, which would otherwise break the DOMDocument::loadHTML() method. 
+     */
+    private function encodeHtml(string $content): string
+    {
+        return str_replace( // ENT_SUBSTITUTE: invalid UTF-8 would otherwise make htmlentities() return ''
+            ['&gt;', '&lt;'], // entities htmlentities() emits for the tag delimiters
+            ['>', '<'], // restored so the markup still parses as markup
+            htmlentities($content, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8', false) // false: keep existing entities
+        );
+    }
+
     private function transformUnsupportedHtml(string $originalContent): string
     {
         $unsupportedTags = Map::from([
diff --git a/tests/Service/HtmlHelperTest.php b/tests/Service/HtmlHelperTest.php
index 3a4c711..58bd7a0 100644
--- a/tests/Service/HtmlHelperTest.php
+++ b/tests/Service/HtmlHelperTest.php
@@ -263,6 +263,24 @@ public function linksProvider(): array
                     ]),
                 ],
             ],
+            'with one link containing not encoded entities' => [
+                '
content
', + [ + ' href="https://www.vysokeskoly.cz?param1=1¶m2=2" class="active"' => new Link([ + 'href' => 'https://www.vysokeskoly.cz?param1=1¶m2=2', + 'class' => 'active', + ]), + ], + ], + 'with one link containing both encoded and not encoded entities' => [ + '
content
', + [ + ' href="https://www.vysokeskoly.cz?param1=1¶m2=2&param3=3" class="active"' => new Link([ + 'href' => 'https://www.vysokeskoly.cz?param1=1¶m2=2¶m3=3', + 'class' => 'active', + ]), + ], + ], 'with multi line link' => [ '
content