Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 075cb7c

Browse files
committed Feb 27, 2024
Make parsing of text be non-quadratic.
In Python, appending to strings is not guaranteed to be constant-time, since strings are documented to be immutable. In some corner cases, CPython is able to make these operations constant-time, but reaching into ETree objects is not such a case.

This leads to parse times being quadratic in the size of the text in the input in pathological cases where parsing outputs a large number of adjacent text nodes which must be combined (e.g. HTML-escaped values). Specifically, we expect doubling the size of the input to result in approximately doubling the time to parse; instead, we observe quadratic behavior:

```
In [1]: import html5lib

In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000)
2.99 s ± 269 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000)
6.7 s ± 242 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000)
19.5 s ± 1.48 s per loop (mean ± std. dev. of 5 runs, 1 loop each)
```

Switch from appending to the internal `str` to appending text to a list of text chunks, as list appends can be done in (amortized) constant time. Using `bytearray` is a similar solution, but benchmarks slightly worse because the strings must be encoded before being appended.

This improves parsing of text documents noticeably:

```
In [1]: import html5lib

In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000)
2.3 s ± 373 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000)
3.85 s ± 29.7 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000)
8.04 s ± 317 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
```
1 parent fd4f032 commit 075cb7c

File tree

2 files changed

+60
-33
lines changed

2 files changed

+60
-33
lines changed
 

Diff for: ‎html5lib/treebuilders/etree.py

+52-29
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,23 @@
1616
tag_regexp = re.compile("{([^}]*)}(.*)")
1717

1818

19+
class TextBuffer:
20+
def __init__(self, initial=""):
21+
self.chunks = [initial]
22+
23+
def __str__(self):
24+
return "".join(self.chunks)
25+
26+
def getvalue(self):
27+
return "".join(self.chunks)
28+
29+
def append(self, other):
30+
self.chunks.append(other)
31+
32+
def __eq__(self, other):
33+
return self.getvalue() == other
34+
35+
1936
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
2037
ElementTree = ElementTreeImplementation
2138
ElementTreeCommentType = ElementTree.Comment("asd").tag
@@ -110,25 +127,25 @@ def removeChild(self, node):
110127
def insertText(self, data, insertBefore=None):
111128
if not len(self._element):
112129
if not self._element.text:
113-
self._element.text = ""
114-
self._element.text += data
130+
self._element.text = TextBuffer("")
131+
self._element.text.append(data)
115132
elif insertBefore is None:
116133
# Insert the text as the tail of the last child element
117134
if not self._element[-1].tail:
118-
self._element[-1].tail = ""
119-
self._element[-1].tail += data
135+
self._element[-1].tail = TextBuffer("")
136+
self._element[-1].tail.append(data)
120137
else:
121138
# Insert the text before the specified node
122139
children = list(self._element)
123140
index = children.index(insertBefore._element)
124141
if index > 0:
125142
if not self._element[index - 1].tail:
126-
self._element[index - 1].tail = ""
127-
self._element[index - 1].tail += data
143+
self._element[index - 1].tail = TextBuffer("")
144+
self._element[index - 1].tail.append(data)
128145
else:
129146
if not self._element.text:
130-
self._element.text = ""
131-
self._element.text += data
147+
self._element.text = TextBuffer("")
148+
self._element.text.append(data)
132149

133150
def cloneNode(self):
134151
element = type(self)(self.name, self.namespace)
@@ -138,36 +155,39 @@ def cloneNode(self):
138155

139156
def reparentChildren(self, newParent):
140157
if newParent.childNodes:
141-
newParent.childNodes[-1]._element.tail += self._element.text
158+
newParent.childNodes[-1]._element.tail.append(
159+
self._element.text.getvalue()
160+
)
142161
else:
143162
if not newParent._element.text:
144-
newParent._element.text = ""
163+
newParent._element.text = TextBuffer("")
145164
if self._element.text is not None:
146-
newParent._element.text += self._element.text
147-
self._element.text = ""
165+
newParent._element.text.append(self._element.text.getvalue())
166+
self._element.text = TextBuffer("")
148167
base.Node.reparentChildren(self, newParent)
149168

150169
class Comment(Element):
151170
def __init__(self, data):
152171
# Use the superclass constructor to set all properties on the
153172
# wrapper element
154173
self._element = ElementTree.Comment(data)
174+
self._element.text = TextBuffer(data)
155175
self.parent = None
156176
self._childNodes = []
157177
self._flags = []
158178

159179
def _getData(self):
160-
return self._element.text
180+
return self._element.text.getvalue()
161181

162182
def _setData(self, value):
163-
self._element.text = value
183+
self._element.text = TextBuffer(value)
164184

165185
data = property(_getData, _setData)
166186

167187
class DocumentType(Element):
168188
def __init__(self, name, publicId, systemId):
169189
Element.__init__(self, "<!DOCTYPE>")
170-
self._element.text = name
190+
self._element.text = TextBuffer(name)
171191
self.publicId = publicId
172192
self.systemId = systemId
173193

@@ -208,19 +228,19 @@ def serializeElement(element, indent=0):
208228
publicId = element.get("publicId") or ""
209229
systemId = element.get("systemId") or ""
210230
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
211-
(element.text, publicId, systemId))
231+
(element.text.getvalue(), publicId, systemId))
212232
else:
213-
rv.append("<!DOCTYPE %s>" % (element.text,))
233+
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
214234
elif element.tag == "DOCUMENT_ROOT":
215235
rv.append("#document")
216236
if element.text is not None:
217-
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
237+
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
218238
if element.tail is not None:
219239
raise TypeError("Document node cannot have tail")
220240
if hasattr(element, "attrib") and len(element.attrib):
221241
raise TypeError("Document node cannot have attributes")
222242
elif element.tag == ElementTreeCommentType:
223-
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
243+
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text.getvalue()))
224244
else:
225245
assert isinstance(element.tag, text_type), \
226246
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
@@ -248,13 +268,14 @@ def serializeElement(element, indent=0):
248268

249269
for name, value in sorted(attributes):
250270
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
251-
if element.text:
252-
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
271+
if element.text and element.text.getvalue():
272+
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
253273
indent += 2
254274
for child in element:
255275
serializeElement(child, indent)
256276
if element.tail:
257-
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
277+
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail.getvalue()))
278+
258279
serializeElement(element, 0)
259280

260281
return "\n".join(rv)
@@ -272,13 +293,15 @@ def serializeElement(element):
272293
if element.get("publicId") or element.get("systemId"):
273294
publicId = element.get("publicId") or ""
274295
systemId = element.get("systemId") or ""
275-
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
276-
(element.text, publicId, systemId))
296+
rv.append(
297+
"""<!DOCTYPE %s PUBLIC "%s" "%s">"""
298+
% (element.text.getvalue(), publicId, systemId)
299+
)
277300
else:
278-
rv.append("<!DOCTYPE %s>" % (element.text,))
301+
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
279302
elif element.tag == "DOCUMENT_ROOT":
280303
if element.text is not None:
281-
rv.append(element.text)
304+
rv.append(element.text.getvalue())
282305
if element.tail is not None:
283306
raise TypeError("Document node cannot have tail")
284307
if hasattr(element, "attrib") and len(element.attrib):
@@ -288,7 +311,7 @@ def serializeElement(element):
288311
serializeElement(child)
289312

290313
elif element.tag == ElementTreeCommentType:
291-
rv.append("<!--%s-->" % (element.text,))
314+
rv.append("<!--%s-->" % (element.text.getvalue(),))
292315
else:
293316
# This is assumed to be an ordinary element
294317
if not element.attrib:
@@ -299,15 +322,15 @@ def serializeElement(element):
299322
for name, value in element.attrib.items()])
300323
rv.append("<%s %s>" % (element.tag, attr))
301324
if element.text:
302-
rv.append(element.text)
325+
rv.append(element.text.getvalue())
303326

304327
for child in element:
305328
serializeElement(child)
306329

307330
rv.append("</%s>" % (element.tag,))
308331

309332
if element.tail:
310-
rv.append(element.tail)
333+
rv.append(element.tail.getvalue())
311334

312335
serializeElement(element)
313336

Diff for: ‎html5lib/treewalkers/etree.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def getNodeDetails(self, node):
3333
if isinstance(node, tuple): # It might be the root Element
3434
elt, _, _, flag = node
3535
if flag in ("text", "tail"):
36-
return base.TEXT, getattr(elt, flag)
36+
return base.TEXT, getattr(elt, flag).getvalue()
3737
else:
3838
node = elt
3939

@@ -44,11 +44,15 @@ def getNodeDetails(self, node):
4444
return (base.DOCUMENT,)
4545

4646
elif node.tag == "<!DOCTYPE>":
47-
return (base.DOCTYPE, node.text,
48-
node.get("publicId"), node.get("systemId"))
47+
return (
48+
base.DOCTYPE,
49+
node.text.getvalue(),
50+
node.get("publicId"),
51+
node.get("systemId"),
52+
)
4953

5054
elif node.tag == ElementTreeCommentType:
51-
return base.COMMENT, node.text
55+
return base.COMMENT, node.text.getvalue()
5256

5357
else:
5458
assert isinstance(node.tag, string_types), type(node.tag)

0 commit comments

Comments
 (0)
Please sign in to comment.