From 9a336ad18f0c235b44a8f69ace46e317288a9abb Mon Sep 17 00:00:00 2001 From: Earlopain Date: Tue, 16 May 2023 10:04:28 +0200 Subject: [PATCH] feat: add an option to preserve whitespace to FullSanitizer --- CHANGELOG.md | 3 ++ README.md | 15 +++------ lib/rails/html/sanitizer.rb | 31 +++++++++++++++++ test/sanitizer_test.rb | 66 ++++++++++++++++++++++++------------- 4 files changed, 82 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7ff74a..9eb5cb3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ *Mike Dalessio* +* `FullSanitizer` now supports the optional argument `preserve_whitespace` to keep whitespace around block elements and line break elements. + + *Earlopain* ## 1.5.0 / 2023-01-20 diff --git a/README.md b/README.md index 059368e..a043095 100644 --- a/README.md +++ b/README.md @@ -62,20 +62,15 @@ All sanitizers respond to `sanitize`, and are available in variants that use eit full_sanitizer = Rails::HTML5::FullSanitizer.new full_sanitizer.sanitize("Bold no more! See more here...") # => Bold no more! See more here... -``` -or, if you insist on parsing the content as HTML4: +# Whitespace is swallowed by default. If whitespace is significant you must pass an option to preserve it. +# This option is slower, but is clever about whitespace around block elements and line break elements. -```ruby -full_sanitizer = Rails::HTML4::FullSanitizer.new -full_sanitizer.sanitize("Bold no more! See more here...") -# => Bold no more! See more here... +full_sanitizer = Rails::HTML5::FullSanitizer.new +full_sanitizer.sanitize("

Paragraphs

and
newlines", preserve_whitespace: true) +# => \nParagraphs\n and \n newlines ``` -HTML5 version: - - - #### LinkSanitizer ```ruby diff --git a/lib/rails/html/sanitizer.rb b/lib/rails/html/sanitizer.rb index dfbdb1d..632bd35 100644 --- a/lib/rails/html/sanitizer.rb +++ b/lib/rails/html/sanitizer.rb @@ -66,6 +66,19 @@ def parse_fragment(html) end if Rails::HTML::Sanitizer.html5_support? end + module Sanitizer + module PreserveWhitespace + def sanitize(html, options = {}) + return unless html + if options[:preserve_whitespace] + parse_fragment(html).to_text + else + super + end + end + end + end + module Scrubber module Full def scrub(fragment, options = {}) @@ -217,11 +230,20 @@ module HTML4 # full_sanitizer.sanitize("Bold no more! See more here...") # # => "Bold no more! See more here..." # + # === Options + # + # If whitespace is significant you can pass preserve_whitespace: true. + # This option is slower, but is clever about whitespace around block elements and line break elements. + # + # full_sanitizer = Rails::HTML4::FullSanitizer.new + # full_sanitizer.sanitize("

Paragraphs

and
newlines", preserve_whitespace: true) + # # => \nParagraphs\n and \n newlines class FullSanitizer < Rails::HTML::Sanitizer include HTML::Concern::ComposedSanitize include HTML::Concern::Parser::HTML4 include HTML::Concern::Scrubber::Full include HTML::Concern::Serializer::UTF8Encode + include HTML::Concern::Sanitizer::PreserveWhitespace end # == Rails::HTML4::LinkSanitizer @@ -307,11 +329,20 @@ module HTML5 # full_sanitizer.sanitize("Bold no more! See more here...") # # => "Bold no more! See more here..." # + # === Options + # + # If whitespace is significant you can pass preserve_whitespace: true. + # This option is slower, but is clever about whitespace around block elements and line break elements. + # + # full_sanitizer = Rails::HTML5::FullSanitizer.new + # full_sanitizer.sanitize("

Paragraphs

and
newlines", preserve_whitespace: true) + # # => \nParagraphs\n and \n newlines class FullSanitizer < Rails::HTML::Sanitizer include HTML::Concern::ComposedSanitize include HTML::Concern::Parser::HTML5 include HTML::Concern::Scrubber::Full include HTML::Concern::Serializer::UTF8Encode + include HTML::Concern::Sanitizer::PreserveWhitespace end # == Rails::HTML5::LinkSanitizer diff --git a/test/sanitizer_test.rb b/test/sanitizer_test.rb index 3cde41a..331a588 100644 --- a/test/sanitizer_test.rb +++ b/test/sanitizer_test.rb @@ -80,37 +80,41 @@ module FullSanitizerTest def test_strip_tags_with_quote input = '<" hi' - result = full_sanitize(input) acceptable_results = [ # libxml2 >= 2.9.14 and xerces+neko %{<" hi}, # other libxml2 %{ hi}, + # preserve_whitespace: true + "<" hi", ] - assert_includes(acceptable_results, result) + assert_full_sanitized(acceptable_results, input) end def test_strip_invalid_html - assert_equal "<<", full_sanitize("<<This is a test.\n\n\n\n

It no longer contains any HTML.

\n} + acceptable_results = [ + %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}, + # preserve_whitespace: true + %{\nThis is a test.\n\nIt no longer contains any HTML.\n\n} + ] - assert_equal expected, full_sanitize(input) + assert_full_sanitized acceptable_results, input end def test_remove_unclosed_tags input = "This is <-- not\n a comment here." - result = full_sanitize(input) acceptable_results = [ # libxml2 >= 2.9.14 and xerces+neko %{This is <-- not\n a comment here.}, @@ -118,12 +122,11 @@ def test_remove_unclosed_tags %{This is }, ] - assert_includes(acceptable_results, result) + assert_full_sanitized(acceptable_results, input) end def test_strip_cdata input = "This has a ]]> here." - result = full_sanitize(input) acceptable_results = [ # libxml2 = 2.9.14 %{This has a <![CDATA[]]> here.}, @@ -133,51 +136,68 @@ def test_strip_cdata %{This has a here.}, ] - assert_includes(acceptable_results, result) + assert_full_sanitized(acceptable_results, input) end def test_strip_blank_string assert_nil full_sanitize(nil) - assert_equal "", full_sanitize("") - assert_equal " ", full_sanitize(" ") + assert_nil full_sanitize(nil, preserve_whitespace: true) + assert_full_sanitized "", "" + assert_full_sanitized " ", " " end def test_strip_tags_with_plaintext - assert_equal "Don't touch me", full_sanitize("Don't touch me") + assert_full_sanitized "Don't touch me", "Don't touch me" end def test_strip_tags_with_tags - assert_equal "This is a test.", full_sanitize("

This is a test.

") + assert_full_sanitized "This is a test.", "This is a test." end def test_escape_tags_with_many_open_quotes - assert_equal "<<", full_sanitize("<<") + assert_full_sanitized "<<", "<<" end def test_strip_tags_with_sentence - assert_equal "This is a test.", full_sanitize("This is a test.") + assert_full_sanitized "This is a test.", "This is a test." end def test_strip_tags_with_comment - assert_equal "This has a here.", full_sanitize("This has a here.") + assert_full_sanitized "This has a here.", "This has a here." end def test_strip_tags_with_frozen_string - assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags") + assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags" end def test_full_sanitize_respect_html_escaping_of_the_given_string - assert_equal 'test\r\nstring', full_sanitize('test\r\nstring') - assert_equal "&", full_sanitize("&") - assert_equal "&", full_sanitize("&") - assert_equal "&amp;", full_sanitize("&amp;") - assert_equal "omg <script>BOM</script>", full_sanitize("omg <script>BOM</script>") + assert_full_sanitized 'test\r\nstring', 'test\r\nstring' + assert_full_sanitized "&", "&" + assert_full_sanitized "&", "&" + assert_full_sanitized "&amp;", "&amp;" + assert_full_sanitized "omg <script>BOM</script>", "omg <script>BOM</script>" + end + + def test_full_sanitize_preserve_whitespace + assert_equal "\nParagraphs\n and \n newlines", full_sanitize("

Paragraphs

and
newlines", preserve_whitespace: true) + end + + def test_full_sanitize_preserve_whitespace_ascii_8bit_string + full_sanitize("hello".encode("ASCII-8BIT")).tap do |sanitized| + assert_equal "hello", sanitized + assert_equal Encoding::UTF_8, sanitized.encoding + end end protected def full_sanitize(input, options = {}) module_under_test::FullSanitizer.new.sanitize(input, options) end + + def assert_full_sanitized(acceptable_results, input) + assert_includes(Array(acceptable_results), full_sanitize(input)) + assert_includes(Array(acceptable_results), full_sanitize(input, preserve_whitespace: true)) + end end class HTML4FullSanitizerTest < Minitest::Test