u"""
:Copyright:

 Copyright 2006 - 2013
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

=====================
 Markup Parser Logic
=====================

Soup Parser
~~~~~~~~~~~

This module provides a very lenient HTML/XML lexer. The `SoupLexer` class is
initialized with a listener object, which receives all low-level events
(like starttag, endtag, text, etc.). Listeners must implement the
`ListenerInterface`.

On top of the lexer sits the `SoupParser` class, which implements the
`ListenerInterface` itself (the parser listens to the lexer). The parser
adds HTML semantics to the lexed data and passes the events on to a
building listener (`BuildingListenerInterface`). In addition to the events
sent by the lexer, the `SoupParser` class generates endtag events (with
empty data arguments) for implicitly closed elements. Furthermore it knows
about CDATA elements like ``<script>`` or ``<style>`` and modifies the
lexer state accordingly.

The actual semantics are provided by a DTD query class (implementing
`DTDInterface`).
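
A minimal wiring sketch (``listener`` is assumed to be an object
implementing the `ListenerInterface`)::

    lexer = SoupLexer(listener)
    lexer.feed('<ul><li>text</li></ul>')
    lexer.finalize()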
44 """
45 __author__ = u"Andr\xe9 Malo"
46 __docformat__ = "restructuredtext en"
47
48 import re as _re
49
50 from tdi._exceptions import LexerEOFError, LexerFinalizedError
51 from tdi.markup.soup import dtd as _dtd
52 from tdi import interfaces as _interfaces
56 """
57 (X)HTML Tagsoup Lexer
58
59 The lexer works hard to preserve the original data. In order to achieve
60 this goal, it does not validate the input and recognizes its input in a
61 quite lenient way.
62
63 :Groups:
64 - `Lexer states` :
65 `TEXT`,
66 `CDATA`,
67 `MARKUP`,
68 `STARTTAG`,
69 `ENDTAG`,
70 `COMMENT`,
71 `MSECTION`,
72 `DECL`,
73 `PI`,
74 `EMPTY`,
75 `FINAL`
76 - `Regex Matchers` :
77 `_START_MATCH`,
78 `_ATT_ITER`,
79 `_COMMENT_SEARCH`,
80 `_MSECTION_MATCH`,
81 `_MSECTIONINVALID_MATCH`,
82 `_MEND_SEARCH`,
83 `_MSEND_SEARCH`,
84 `_DECL_MATCH`
85
86 :CVariables:
87 `TEXT` : ``int``
88 Lexer state ``TEXT`` (between tags)
89
90 `CDATA` : ``int``
91 Lexer state ``CDATA`` (between (P)CDATA tags)
92
93 `MARKUP` : ``int``
94 Lexer state ``MARKUP`` (``<``)
95
96 `STARTTAG` : ``int``
97 Lexer state ``STARTTAG`` (``<[letter]``)
98
99 `ENDTAG` : ``int``
100 Lexer state ``ENDTAG`` (``</``)
101
102 `COMMENT` : ``int``
103 Lexer state ``COMMENT`` (``<!--``)
104
105 `MSECTION` : ``int``
106 Lexer state ``MSECTION`` (``<![``)
107
108 `DECL` : ``int``
109 Lexer state ``DECL`` (``<!``)
110
111 `PI` : ``int``
112 Lexer state ``PI`` (``<?``)
113
114 `EMPTY` : ``int``
115 Lexer state ``EMPTY`` (``<>``)
116
117 `FINAL` : ``int``
118 Lexer state ``FINAL``
119
120 `_LEXERS` : ``tuple``
121 The state lexer method names (``('method', ...)``)
122
123 `_STATES` : ``tuple``
124 The state names (``('name', ...)``)
125
126 :IVariables:
127 `_state` : ``int``
128 The current lexer state
129
130 `_lexers` : ``list``
131 The state lexer methods (``[method, ...]``)
132
133 `_listener` : `ListenerInterface`
134 The listener the events shall be sent to
135
136 `_buffer` : ``str``
137 Current unprocessed buffer
138
139 `_conditional_ie_comments` : ``bool``
140 Handle conditional IE comments as text?
141 """
142
143
    def __init__(self, listener, conditional_ie_comments=True):
        r"""
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener

          `conditional_ie_comments` : ``bool``
            Handle conditional IE comments as text?

            Conditional comments are described in full detail
            at `MSDN`_\.
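
            For example, with this option enabled, a construct like the
            following is passed through as plain text events instead of
            comment and marked section events::

                <!--[if IE]> IE only <![endif]-->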

            .. _MSDN: http://msdn.microsoft.com/en-us/library/
               ms537512%28v=vs.85%29.aspx
        """
        self._listener = listener
        self._normalize = None
        self._cdata_name = None

        self._state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''
        self._conditional_ie_comments = bool(conditional_ie_comments)

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self._state]
            )

        self._state = self.FINAL

    def cdata(self, normalize, name):
        """ Set CDATA state """
        if self._state != self.FINAL:
            self._state = self.CDATA
            self._normalize = normalize
            self._cdata_name = normalize(name)

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self._state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``<``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('<')
        if pos == 0:
            self._state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self._state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_cdata(self):
        """
        (PR)CDATA lexer

        State: We are inside a text element and looking for the end tag only

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        incomplete = False
        data, pos = self._buffer, 0
        while True:
            pos = data.find('<', pos)
            if pos == -1:
                pos = len(data)
                self._buffer = ''
                break
            else:
                char = data[pos + 1:pos + 2]
                if char == '/':
                    self._state = self.ENDTAG
                    break
                elif char == '':
                    incomplete = True
                    break
                else:
                    pos += 1

        if pos > 0:
            self._buffer, data = data[pos:], data[:pos]
            self._listener.handle_text(data)

        return incomplete

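    # Matcher for the first character of a tag name; `_lex_markup` uses
    # it to decide whether ``<x`` starts a tag or is just text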
    _TAGNAME_MATCH = _re.compile(r'[a-zA-Z0-9]').match

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``<`` character and now find out what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
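        # Map the character after ``<`` to the follow-up state;
        # str.find returns -1 for "none of these", which indexes the
        # last tuple element (-1 == no special markup)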
        state = (self.ENDTAG, self.DECL, self.PI, self.EMPTY, -1)[
            "/!?>".find(char)
        ]
        if state == -1:
            if self._TAGNAME_MATCH(char):
                state = self.STARTTAG
            else:
                state = self.TEXT
                self._buffer = data[1:]
                self._listener.handle_text(data[0])

        self._state = state
        return False

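    # Matcher for a complete start tag; quoted attribute values may
    # contain ``>``, so everything up to the final ``>`` outside quotes
    # is consumed (e.g. ``<a href="x>y" title='z'>``)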
    _START_MATCH = _re.compile(r'''
        <
        (?P<name>[^ \t\r\n\f/>]+)
        (?P<attr>
            [^"'>]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'>]*
            )*
        )
        [ \t\r\n\f]*
        >
    ''', _re.X).match

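    # Iterator over the attributes of a start tag; yields name/value
    # pairs, the value possibly quoted or None (boolean attribute).
    # A lone ``/`` turns up as a name and marks a self-closed tag.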
    _ATT_ITER = _re.compile(r'''
        [ \t\r\n\f]*
        (?P<name>(?:/|[^ \t\r\n\f/=>]*))    # attribute name
        [ \t\r\n\f]*
        (?:
            =
            (?P<value>                      # optional value
                [ \t\r\n\f]*"[^"]*"
              | [ \t\r\n\f]*'[^']*'
              | [^ \t\r\n\f/>]*
            )
        )?
    ''', _re.X).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``<x`` and now look for the ``>``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        name, attrstring = match.group('name', 'attr')
        attr, closed = [], False
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key == '/' and value is None:
                    closed = True
                    continue
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    break

        self._state = self.TEXT
        self._listener.handle_starttag(name, attr, closed, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``</``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('>') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        if self._cdata_name is not None and \
                self._normalize(name) != self._cdata_name:
            self._state = self.CDATA
            self._listener.handle_text(data)
        else:
            self._cdata_name = self._normalize = None
            self._state = self.TEXT
            self._listener.handle_endtag(name, data)
        return False

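    # Searcher for the end of a comment: ``--`` plus optional
    # whitespace before the closing ``>``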
    _COMMENT_SEARCH = _re.compile(r'--[ \t\r\n\f]*>').search

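    # Matcher for the opening marker of a conditional IE comment
    # following ``<!--``, e.g. the ``[if IE]>`` part of ``<!--[if IE]>``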
    _IE_COMMENT_MATCH = _re.compile(r'''
        \[[ \t\r\n\f]* (?:
            [iI][fF] | [eE][lL][sS][eE] | [eE][nN][dD][iI][fF]
        ) [^\]]+]>
    ''', _re.X).match

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit a ``<!--`` and now seek the end marker

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 7:  # shortest complete comment: <!---->
            return True

        if self._conditional_ie_comments:
            match = iec = self._IE_COMMENT_MATCH(data, 4)
        else:
            match, iec = None, None
        if match is None:
            match = self._COMMENT_SEARCH(data, 4)
            if match is None:
                return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_comment(data)
        return False

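    # Marked section names which actually start a conditional IE
    # comment (matched case-insensitively)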
    _MSSECTIONS = ('if', 'else', 'endif')

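    # Matcher for the beginning of a marked section:
    # ``<![`` plus the section name (e.g. ``CDATA`` or ``if``)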
    _MSECTION_MATCH = _re.compile(r'''
        <!\[[ \t\r\n\f]*(?P<name>[^\][ \t\r\n\f>]+)(?=[\][ \t\r\n\f>])
    ''', _re.X).match

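    # Matcher for a degenerate marked section: ``<![`` directly
    # followed by ``[``, ``]`` or ``>`` (passed through as text)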
    _MSECTIONINVALID_MATCH = _re.compile(r'<!\[[ \t\r\n\f]*[\][>]').match

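    # Searcher for the end of a marked section: ``]]>`` with optional
    # whitespace between the brackets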
    _MEND_SEARCH = _re.compile(r'][ \t\r\n\f]*][ \t\r\n\f]*>').search

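    # Searcher for the end of an IE conditional marked section:
    # ``]>`` or ``]-->``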
    _MSEND_SEARCH = _re.compile(r'][ \t\r\n\f]*(?:--)?[ \t\r\n\f]*>').search

    def _lex_msection(self):
        """
        Marked section lexer

        State: We've hit a ``<![`` and now seek the end

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._MSECTION_MATCH(data)
        if match is None:
            match = self._MSECTIONINVALID_MATCH(data)
            if match is not None:
                pos = match.end()
                self._buffer = data[pos:]
                data = data[:pos]
                self._state = self.TEXT
                self._listener.handle_text(data)
                return False
            return True

        name = match.group('name')
        start = match.end()
        if self._conditional_ie_comments and name.lower() in self._MSSECTIONS:
            match = iec = self._MSEND_SEARCH(data, start)
        else:
            pos = data.find('[', start)
            if pos >= 0:
                start = pos + 1
            match = self._MEND_SEARCH(data, start)
            iec = None
        if match is None:
            return True
        pos, end = match.end(), match.start()
        value = data[start:end]
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_msection(name, value, data)
        return False

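    # Matcher for a declaration (``<!...>``); it skips quoted strings,
    # ``--`` comments, nested declarations and marked sections, so a
    # doctype with an internal subset is consumed as a whole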
    _DECL_MATCH = _re.compile(r'''
        <!
        (?P<name>[^\][ \t\r\n\f>]*)
        (?P<value>
            [^"'<>-]*                   # any nonspecial
            (?:
                (?:
                    "[^"]*"             # double quoted string
                  | '[^']*'             # single quoted string (valid?)
                  | <!\[                # marked section
                    [^\]]*
                    (?:
                        ](?![ \t\r\n\f]*][ \t\r\n\f]*>)
                        [^\]]*
                    )*
                    ][ \t\r\n\f]*][ \t\r\n\f]*>
                  | <(?!!\[)            # declaration
                                        # hopefully not a doctype
                                        # (but unlikely, because we are
                                        # probably already in a DT subset)
                    [^"'>-]*
                    (?:
                        (?:
                            "[^"]*"
                          | '[^']*'
                          | --          # comment
                            [^-]*
                            (?:-[^-]+)*
                            --
                          | -(?!-)      # just a hyphen
                        )
                        [^"'>-]*
                    )*
                    >
                  | --                  # comment
                    [^-]*
                    (?:-[^-]+)*
                    --
                  | -(?!-)              # just a hyphen
                )
                [^"'<>-]*               # more non-specials
            )*
        )
        >
    ''', _re.X).match

    def _lex_decl(self):
        """
        Declaration lexer

        State: We've hit a ``<!`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 3:
            return True

        if data.startswith('<!--'):
            self._state = self.COMMENT
            return False
        elif data.startswith('<!['):
            self._state = self.MSECTION
            return False
        elif data == '<!-':
            return True

        match = self._DECL_MATCH(data)
        if match is None:
            return True

        name, value = match.group('name', 'value')
        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_decl(name, value.strip(), data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``<?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?>', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_empty(self):
        """
        Empty tag lexer

        State: We've hit a ``<>``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        self._buffer, data = self._buffer[2:], self._buffer[:2]

        self._state = self.TEXT
        self._listener.handle_starttag('', [], False, data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


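# Build the state dispatch tables: expose each state as a class
# attribute on SoupLexer (``FINAL`` == 0) and record the lexer method
# and state names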
_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('CDATA', '_lex_cdata'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('COMMENT', '_lex_comment'),
        ('MSECTION', '_lex_msection'),
        ('DECL', '_lex_decl'),
        ('PI', '_lex_pi'),
        ('EMPTY', '_lex_empty'),
        ]):
    setattr(SoupLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

SoupLexer._LEXERS = tuple(_LEXERS)
SoupLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname, _LEXERS, _STATES


from tdi import c
c = c.load('impl')
if c is not None:
    DEFAULT_LEXER = c.SoupLexer
else:
    DEFAULT_LEXER = SoupLexer
del c


class SoupParser(object):
    """
    =========================
     (X)HTML Tag Soup Parser
    =========================

    Overview
    ~~~~~~~~

    The parser is actually a tagsoup parser by design, in order to process
    most of the "HTML" that can be found out there. Of course, if the HTML
    is well-formed and valid, so much the better. There is only as much
    HTML syntax applied as necessary to parse it. You can influence these
    syntax definitions by picking another lexer. You can change the
    semantics by picking another DTD query class.

    This parser guarantees that for each non-self-closing starttag event an
    endtag event is generated as well (if the endtag is not actually there,
    the data parameter is an empty string). This also happens for empty
    tags (like ``br``). On the other hand, there may be more endtag events
    than starttag events, because of unbalanced or wrongly nested tags.

    Special constructs (comments, PIs, marked sections and declarations)
    may occur anywhere, i.e. they do not close elements implicitly.

    The default lexer does not deal with NET tags (``<h1/Heading/``).
    Neither does it handle unfinished starttags by SGML rules like
    ``<map<area>``. It *does* know about empty tags (``<>`` and ``</>``).

    CDATA elements and comments are handled in a simplified way. Once
    the particular state is entered, it is only left when the accompanying
    end marker is found (``<script>...</script>``, ``<!-- ... -->``).
    Anything in between is text.

    How is it used?
    ~~~~~~~~~~~~~~~

    The parser API is "streamy" on the input side and event-based on the
    output side. So, what you need first is a building listener, which will
    receive all generated parser events and process them. Such a listener
    object is expected to implement the `BuildingListenerInterface`.

    Now you create a `SoupParser` instance and pass the listener object to
    the constructor, and the parser is ready to be fed. You can feed as
    many chunks of input data as you like into the parser by using the
    `feed` method. Every feed call may generate multiple events on the
    output side. When you're done feeding, call the parser's `finalize`
    method in order to clean up. This also flushes pending events to the
    listener.
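
    A minimal sketch (``listener`` is assumed to implement the
    `BuildingListenerInterface`)::

        parser = SoupParser.html(listener)
        for chunk in chunks:
            parser.feed(chunk)
        parser.finalize()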

    :IVariables:
      `listener` : `BuildingListenerInterface`
        The building listener to send the events to

      `lexer` : `SoupLexer`
        The lexer instance

      `_tagstack` : ``list``
        The current tag stack

      `_inempty` : ``bool``
        Indicates whether the last tag on the stack is an empty one

      `_lastopen` : ``str``
        Stores the last seen open tag name
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, dtd, lexer=None):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

          `dtd` : `DTDInterface`
            DTD query object

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance. If omitted or
            ``None``, the default lexer will be used (`DEFAULT_LEXER`).
        """
        self._tagstack, self._inempty, self._lastopen = [], False, ''
        self.listener = listener
        self._is_nestable = dtd.nestable
        self._is_cdata = dtd.cdata
        self._is_empty = dtd.empty
        if lexer is None:
            lexer = DEFAULT_LEXER
        self._lexer = lexer(self)
        self._normalize = listener.decoder.normalize

    @classmethod
    def html(cls, listener):
        """
        Construct a parser using the `HTMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.HTMLDTD())

    @classmethod
    def xml(cls, listener):
        """
        Construct a parser using the `XMLDTD`

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.XMLDTD())

    def _close_empty(self):
        """ Ensure we close the last empty tag """
        if self._inempty:
            self._inempty = False
            self.listener.handle_endtag(self._tagstack.pop()[1], '')

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_text(data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self._close_empty()

        if name == '' and not attrs:
            name = self._lastopen
        else:
            self._lastopen = name

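        # Close open elements which cannot contain the new tag; this
        # generates the endtag events for implicitly closed elements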
        tagstack = self._tagstack
        nestable = self._is_nestable
        starttag = self._normalize(name)
        while tagstack and not nestable(tagstack[-1][0], starttag):
            self.listener.handle_endtag(tagstack.pop()[1], '')

        if closed:
            self.listener.handle_starttag(name, attrs, closed, data)
        else:
            if self._is_cdata(starttag):
                self._lexer.cdata(self._normalize, starttag)
            self.listener.handle_starttag(name, attrs, closed, data)
            tagstack.append((starttag, name))
            if self._is_empty(starttag):
                self._inempty = True

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
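            # Close only if the tag is actually open; pop and emit
            # endtag events for everything nested inside it first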
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                self._inempty = False
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()

        self._close_empty()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_pi(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        # The soup lexer never generates escape events
        raise AssertionError()

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')


from tdi import c
c = c.load('impl')
if c is not None:
    DEFAULT_PARSER = c.SoupParser
else:
    DEFAULT_PARSER = SoupParser
del c