diff --git a/CHANGELOG.md b/CHANGELOG.md index 76c8be8..0126e28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Unreleased +- Encode html entities in `HtmlHelper` to prevent parse errors ## 7.3.0 - 2023-05-23 - Support `figure` html tag in `HtmlHelper::xpathHtmlDocument` method diff --git a/composer.json b/composer.json index af3df7c..67bb44e 100644 --- a/composer.json +++ b/composer.json @@ -46,8 +46,8 @@ }, "config": { "allow-plugins": { - "phpstan/extension-installer": true, - "ergebnis/composer-normalize": true + "ergebnis/composer-normalize": true, + "phpstan/extension-installer": true }, "secure-http": false, "sort-packages": true diff --git a/src/Service/HtmlHelper.php b/src/Service/HtmlHelper.php index 50ff40c..0e0bcd2 100644 --- a/src/Service/HtmlHelper.php +++ b/src/Service/HtmlHelper.php @@ -26,6 +26,7 @@ public function xpathHtmlDocument(string $content, string $xpathQuery): ISeq { return Seq::init(function () use ($xpathQuery, $content) { $htmlContent = $this->transformUnsupportedHtml($content); + $htmlContent = $this->encodeHtml($htmlContent); // @see https://www.php.net/manual/en/domdocument.loadhtml.php#95251 $dom = new \DOMDocument(); @@ -40,6 +41,20 @@ public function xpathHtmlDocument(string $content, string $xpathQuery): ISeq }); } + /** + * @see https://stackoverflow.com/questions/1685277/warning-domdocumentloadhtml-htmlparseentityref-expecting-in-entity + * + * It is meant to encode html entities, which would otherwise break the DOMDocument::loadHTML() method. 
+ */ + private function encodeHtml(string $content): string + { + return str_replace( + ['&gt;', '&lt;'], + ['>', '<'], + htmlentities($content, ENT_NOQUOTES, 'UTF-8', false), + ); + } + private function transformUnsupportedHtml(string $originalContent): string { $unsupportedTags = Map::from([ diff --git a/tests/Service/HtmlHelperTest.php b/tests/Service/HtmlHelperTest.php index 3a4c711..58bd7a0 100644 --- a/tests/Service/HtmlHelperTest.php +++ b/tests/Service/HtmlHelperTest.php @@ -263,6 +263,24 @@ public function linksProvider(): array ]), ], ], + 'with one link containing not encoded entities' => [ + '
content
', + [ + ' href="https://www.vysokeskoly.cz?param1=1&param2=2" class="active"' => new Link([ + 'href' => 'https://www.vysokeskoly.cz?param1=1&param2=2', + 'class' => 'active', + ]), + ], + ], + 'with one link containing both encoded and not encoded entities' => [ + '
content
', + [ + ' href="https://www.vysokeskoly.cz?param1=1&param2=2&amp;param3=3" class="active"' => new Link([ + 'href' => 'https://www.vysokeskoly.cz?param1=1&param2=2&param3=3', + 'class' => 'active', + ]), + ], + ], 'with multi line link' => [ '
content