From bcd2f7fc0902fc036fc66c54795b70339eb792e3 Mon Sep 17 00:00:00 2001 From: Touhidur Rahman Date: Thu, 13 Jun 2024 00:39:05 +0600 Subject: [PATCH] pkp/pkp-lib#7916 Added core PKPHtmlSanitizer class (#9257) * pkp/pkp-lib#7916 Added core PKPHtmlSanitizer class * pkp/pkp-lib#7916 fixed empty string issue for DOMDocument::loadHTML * pkp/pkp-lib#7916 removed dom node traversing and implemented W3C configurations * pkp/pkp-lib#7916 updated implementation with necessary doc blocks and sanitizer property cache * pkp/pkp-lib#7916 patch update after rebase * pkp/pkp-lib#7916 removed html_entity_decode and removed config passing to core sanitization class * pkp/pkp-lib#7916 removed sanitizer config setter and getter --- classes/core/PKPHtmlSanitizer.php | 173 ++++++++++++++++++++++++++++++ classes/core/PKPString.php | 51 ++------- 2 files changed, 179 insertions(+), 45 deletions(-) create mode 100644 classes/core/PKPHtmlSanitizer.php diff --git a/classes/core/PKPHtmlSanitizer.php b/classes/core/PKPHtmlSanitizer.php new file mode 100644 index 00000000000..7c396d77245 --- /dev/null +++ b/classes/core/PKPHtmlSanitizer.php @@ -0,0 +1,173 @@ +w3cValidElements = collect( + array_merge(W3CReference::HEAD_ELEMENTS, W3CReference::BODY_ELEMENTS) + )->keys(); + + $this->htmlSanitizer = new HtmlSanitizer( + $this->buildSanitizerConfig( + $this->generateAllowedTagToAttributeMap( + $allowable + ) + ) + ); + } + + /** + * Sanitize the given html string + */ + public function sanitize(string $html): string + { + return $this->htmlSanitizer->sanitize( + /** + * Here we are removing any html tags that should not be handled by sanitizer + * as defined in the \Symfony\Component\HtmlSanitizer\Reference\W3CReference::HEAD_ELEMENTS + * and \Symfony\Component\HtmlSanitizer\Reference\W3CReference::BODY_ELEMENTS as combined. 
+ */ + strip_tags($html, $this->getSanitizableTags()->toArray()) + ); + } + + /** + * Build up the \Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig instance + * + * @param Collection $allowedTagToAttributeMap See the @return docblock for + * PKPHtmlSanitizer::generateAllowedTagToAttributeMap() + */ + protected function buildSanitizerConfig(Collection $allowedTagToAttributeMap): HtmlSanitizerConfig + { + $this->htmlSanitizerConfig = (new HtmlSanitizerConfig()) + ->allowLinkSchemes(['https', 'http', 'mailto']) + ->allowMediaSchemes(['https', 'http']); + + if ($allowedTagToAttributeMap->count()) { + $allowedTagToAttributeMap->each( + fn (array $attributes, string $tag) => $this->htmlSanitizerConfig = $this->htmlSanitizerConfig->allowElement($tag, $attributes) + ); + } + + $this->getNonAllowedHtmlTags()->each( + fn (string $tag) => $this->htmlSanitizerConfig = $this->htmlSanitizerConfig->blockElement($tag) + ); + + return $this->htmlSanitizerConfig; + } + + /** + * Get the collection of non allowed tags for the given configuration + */ + protected function getNonAllowedHtmlTags(): Collection + { + return $this->w3cValidElements + ->diff(collect($this->htmlSanitizerConfig->getAllowedElements())->keys()); + } + + /** + * Get the collection of tags that will not be stripped/removed by php's "strip_tags" function + * but rather should be handled by the sanitizer class + */ + protected function getSanitizableTags(): Collection + { + return $this->w3cValidElements + ->merge(collect($this->htmlSanitizerConfig->getAllowedElements())->keys()) + ->merge(collect([ // list of dangerous tags that should be only handled by sanitization library + 'script', + ])) + ->unique(); + } + + /** + * Generate the collection of allowed tags to allowed attributes map as key/value[array] structure + * + * @param string $allowable Allowded tag to attribute map as the + * structure define in config keys such as + * security.[allowed_html/allowed_title_html] + * + * @return Collection 
Collection of allowed tags to allowed attributes map as key/value. + * In collection each tag will be mapped to an array that may + * contain allowed attributes or it can be empty which define that + * no attribute is allowed for that tag, structure such as + * [ + * HTML_TAG_1 => [ALLOWED_ATTRIBUTE_FOR_HTML_TAG_1, ...], + * HTML_TAG_2 => [], + * ... + * ] + */ + protected function generateAllowedTagToAttributeMap(string $allowable): Collection + { + return Str::of($allowable) + ->explode(',') + ->mapWithKeys(function(string $allowedTagWithAttr) { + + // Extract the tag itself (e.g. div, p, a ...) + preg_match('/\[[^][]+]\K|\w+/', $allowedTagWithAttr, $matches); + $allowedTag = collect($matches)->first(); + + // Extract the attributes associated with tag (e.g. class, href ...) + preg_match("/\[([^\]]*)\]/", $allowedTagWithAttr, $matches); + $allowedAttributes = collect($matches)->last(); + + if($allowedTag) { + return [ + $allowedTag => Str::of($allowedAttributes) + ->explode('|') + ->filter() + ->toArray() + ]; + } + + return []; + }); + } +} diff --git a/classes/core/PKPString.php b/classes/core/PKPString.php index d83fb69be7b..5f7063a68fe 100644 --- a/classes/core/PKPString.php +++ b/classes/core/PKPString.php @@ -17,7 +17,6 @@ namespace PKP\core; -use Illuminate\Support\Str; use PKP\config\Config; use Symfony\Component\HtmlSanitizer\HtmlSanitizer; use Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig; @@ -170,52 +169,14 @@ public static function stripUnsafeHtml(?string $input, string $configKey = 'allo } static $caches; - + if (!isset($caches[$configKey])) { - - $config = (new HtmlSanitizerConfig()) - ->allowLinkSchemes(['https', 'http', 'mailto']) - ->allowMediaSchemes(['https', 'http']); - - $allowedTagToAttributeMap = Str::of(Config::getVar('security', $configKey)) - ->explode(',') - ->mapWithKeys(function (string $allowedTagWithAttr) { - - // Extract the tag itself (e.g. div, p, a ...) 
- preg_match('/\[[^][]+]\K|\w+/', $allowedTagWithAttr, $matches); - $allowedTag = collect($matches)->first(); - - // Extract the attributes associated with tag (e.g. class, href ...) - preg_match("/\[([^\]]*)\]/", $allowedTagWithAttr, $matches); - $allowedAttributes = collect($matches)->last(); - - if($allowedTag) { - return [ - $allowedTag => Str::of($allowedAttributes) - ->explode('|') - ->filter() - ->toArray() - ]; - } - - return []; - }) - ->each(function (array $attributes, string $tag) use (&$config) { - $config = $config->allowElement($tag, $attributes); - }); - - $caches[$configKey] = [ - 'allowedTagToAttributeMap' => $allowedTagToAttributeMap, - 'sanitizer' => new HtmlSanitizer($config), - ]; + $caches[$configKey] = new \PKP\core\PKPHtmlSanitizer( + Config::getVar('security', $configKey) + ); } - - // need to apply html_entity_decode as sanitizer apply htmlentities internally for special chars - return html_entity_decode( - $caches[$configKey]['sanitizer']->sanitize( - strip_tags($input, $caches[$configKey]['allowedTagToAttributeMap']->keys()->toArray()) - ) - ); + + return $caches[$configKey]->sanitize($input); } /**