Skip to content

Commit

Permalink
Switch HTML Parser to HTML5Lib
Browse files Browse the repository at this point in the history
Make parser more robust (rubys/feedvalidator#38)
while still doing strict validity checking (which the new HTMLParser doesn't allow; see rubys/feedvalidator#28)
  • Loading branch information
dontcallmedom committed Mar 2, 2020
1 parent d135a60 commit f249ddd
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 478 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
Some tests, and some functionality, will not be enabled unless a full set
of 32-bit character encodings is available through Python.

The feedvalidator relies on html5lib for parsing HTML.

The package 'iconvcodec' provides the necessary codecs, if your underlying
operating system supports them. Its web page is at
<http://cjkpython.i18n.org/#iconvcodec>, and a range of packages are
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
html5lib
40 changes: 13 additions & 27 deletions src/feedvalidator/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def startElementNS(self, name, qname, attrs):
# eat children
self.push(self.__class__(), name, attrs)

from feedvalidator.vendor.HTMLParser import HTMLParser, HTMLParseError
class HTMLValidator(HTMLParser):
from html5lib.html5parser import HTMLParser, ParseError
class HTMLValidator:
htmltags = [
"a", "abbr", "acronym", "address", "applet", "area", "article", "aside",
"audio", "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body",
Expand Down Expand Up @@ -241,40 +241,41 @@ class HTMLValidator(HTMLParser):
'zoomAndPan']

def log(self,msg):
offset = [self.element.line + self.getpos()[0] - 1 -
offset = [self.element.line - 1 -
self.element.dispatcher.locator.getLineNumber(),
-self.element.dispatcher.locator.getColumnNumber()]
self.element.log(msg, offset)

def __init__(self,value,element):
self.element=element
self.stack = []
self.valid = True
HTMLParser.__init__(self)
self.parser = HTMLParser(strict=True)
if value.lower().find('<?import ') >= 0:
self.log(SecurityRisk({"parent":self.element.parent.name, "element":self.element.name, "tag":"?import"}))
try:
self.feed(value)
self.close()
etree = self.parser.parseFragment(value)
if self.valid:
self.log(ValidHtml({"parent":self.element.parent.name, "element":self.element.name}))
except HTMLParseError as msg:
from pprint import pprint
for tag in etree.iter():
if tag.tag != "DOCUMENT_FRAGMENT":
self.handle_tag(tag.tag.split('}')[-1], tag.attrib, tag.text)
except ParseError as msg:
element = self.element
offset = [element.line - element.dispatcher.locator.getLineNumber(),
- element.dispatcher.locator.getColumnNumber()]
match = re.search(', at line (\d+), column (\d+)',str(msg))
if match: offset[0] += int(match.group(1))-1
element.log(NotHtml({"parent":element.parent.name, "element":element.name, "message":"Invalid HTML", "value": str(msg)}),offset)

def handle_starttag(self, tag, attributes):
def handle_tag(self, tag, attributes, text):
if tag.lower() not in self.htmltags:
self.log(NotHtml({"parent":self.element.parent.name, "element":self.element.name,"value":tag, "message": "Non-html tag"}))
self.valid = False
elif tag.lower() not in HTMLValidator.acceptable_elements:
if not 'embed' in self.stack and not 'object' in self.stack:
self.log(SecurityRisk({"parent":self.element.parent.name, "element":self.element.name, "tag":tag}))
self.log(SecurityRisk({"parent":self.element.parent.name, "element":self.element.name, "tag":tag}))
else:
for (name,value) in attributes:
for (name,value) in attributes.iteritems():
if name.lower() == 'style':
for evil in checkStyle(value):
self.log(DangerousStyleAttr({"parent":self.element.parent.name, "element":self.element.name, "attr":"style", "value":evil}))
Expand All @@ -283,21 +284,6 @@ def handle_starttag(self, tag, attributes):
if name.lower()[:5] != "data-":
self.log(SecurityRiskAttr({"parent":self.element.parent.name, "element":self.element.name, "attr":name}))

self.stack.append(tag)

def handle_endtag(self, tag):
if tag in self.stack:
while self.stack[-1] != tag: self.stack.pop()
self.stack.pop()

def handle_charref(self, name):
if name.startswith('x'):
value = int(name[1:],16)
else:
value = int(name)
if 0x80 <= value <= 0x9F or value == 0xfffd:
self.log(BadCharacters({"parent":self.element.parent.name,
"element":self.element.name, "value":"&#" + name + ";"}))

#
# Scrub CSS properties for potentially evil intent
Expand Down
Loading

0 comments on commit f249ddd

Please sign in to comment.