
Source Code for Module tdi.markup.soup.parser

# -*- coding: ascii -*-
u"""
:Copyright:

 Copyright 2006 - 2013
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

=====================
 Markup Parser Logic
=====================

Soup Parser
~~~~~~~~~~~

This module provides a very lenient HTML/XML lexer. The `SoupLexer` class is
initialized with a listener object, which receives all low level events
(like starttag, endtag, text etc.). Listeners must implement the
`ListenerInterface`.

On top of the lexer there's the `SoupParser` class, which actually implements
the `ListenerInterface` itself (the parser listens to the lexer). The parser
adds HTML semantics to the lexed data and passes the events to a building
listener (`BuildingListenerInterface`). In addition to the events sent by the
lexer, the `SoupParser` class generates endtag events (with empty data
arguments) for implicitly closed elements. Furthermore it knows about CDATA
elements like ``<script>`` or ``<style>`` and modifies the lexer state
accordingly.

The actual semantics are provided by a DTD query class (implementing
`DTDInterface`).
"""
__author__ = u"Andr\xe9 Malo"
__docformat__ = "restructuredtext en"

import re as _re

from tdi._exceptions import LexerEOFError, LexerFinalizedError
from tdi.markup.soup import dtd as _dtd
from tdi import interfaces as _interfaces
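

# The listener below is NOT part of the original module; it is a minimal
# sketch of the `ListenerInterface` contract described in the module
# docstring, used by the usage examples further down. It simply records
# every low level event it receives as a (event name, arguments) tuple.
class _EventRecorder(object):
    """ Example listener: records all events (illustration only) """

    def __init__(self):
        self.events = []

    def handle_text(self, data):
        self.events.append(('text', data))

    def handle_starttag(self, name, attrs, closed, data):
        self.events.append(('starttag', name, attrs, closed, data))

    def handle_endtag(self, name, data):
        self.events.append(('endtag', name, data))

    def handle_comment(self, data):
        self.events.append(('comment', data))

    def handle_msection(self, name, value, data):
        self.events.append(('msection', name, value, data))

    def handle_decl(self, name, value, data):
        self.events.append(('decl', name, value, data))

    def handle_pi(self, data):
        self.events.append(('pi', data))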


class SoupLexer(object):
    """
    (X)HTML Tagsoup Lexer

    The lexer works hard to preserve the original data. In order to achieve
    this goal, it does not validate the input and recognizes its input in a
    quite lenient way.

    :Groups:
      - `Lexer states` :
        `TEXT`,
        `CDATA`,
        `MARKUP`,
        `STARTTAG`,
        `ENDTAG`,
        `COMMENT`,
        `MSECTION`,
        `DECL`,
        `PI`,
        `EMPTY`,
        `FINAL`
      - `Regex Matchers` :
        `_START_MATCH`,
        `_ATT_ITER`,
        `_COMMENT_SEARCH`,
        `_MSECTION_MATCH`,
        `_MSECTIONINVALID_MATCH`,
        `_MEND_SEARCH`,
        `_MSEND_SEARCH`,
        `_DECL_MATCH`

    :CVariables:
      `TEXT` : ``int``
        Lexer state ``TEXT`` (between tags)

      `CDATA` : ``int``
        Lexer state ``CDATA`` (between (P)CDATA tags)

      `MARKUP` : ``int``
        Lexer state ``MARKUP`` (``<``)

      `STARTTAG` : ``int``
        Lexer state ``STARTTAG`` (``<[letter]``)

      `ENDTAG` : ``int``
        Lexer state ``ENDTAG`` (``</``)

      `COMMENT` : ``int``
        Lexer state ``COMMENT`` (``<!--``)

      `MSECTION` : ``int``
        Lexer state ``MSECTION`` (``<![``)

      `DECL` : ``int``
        Lexer state ``DECL`` (``<!``)

      `PI` : ``int``
        Lexer state ``PI`` (``<?``)

      `EMPTY` : ``int``
        Lexer state ``EMPTY`` (``<>``)

      `FINAL` : ``int``
        Lexer state ``FINAL``

      `_LEXERS` : ``tuple``
        The state lexer method names (``('method', ...)``)

      `_STATES` : ``tuple``
        The state names (``('name', ...)``)

    :IVariables:
      `_state` : ``int``
        The current lexer state

      `_lexers` : ``list``
        The state lexer methods (``[method, ...]``)

      `_listener` : `ListenerInterface`
        The listener the events shall be sent to

      `_buffer` : ``str``
        Current unprocessed buffer

      `_conditional_ie_comments` : ``bool``
        Handle conditional IE comments as text?
    """
    # pylint: disable = E1101

    def __init__(self, listener, conditional_ie_comments=True):
        r"""
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener

          `conditional_ie_comments` : ``bool``
            Handle conditional IE comments as text?

            Conditional comments are described in full detail
            at `MSDN`_\.

        .. _MSDN: http://msdn.microsoft.com/en-us/library/
                  ms537512%28v=vs.85%29.aspx
        """
        self._listener = listener
        self._normalize = None
        self._cdata_name = None

        self._state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''
        self._conditional_ie_comments = bool(conditional_ie_comments)

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self._state]
            )

        self._state = self.FINAL

    def cdata(self, normalize, name):
        """ Set CDATA state """
        if self._state != self.FINAL:
            self._state = self.CDATA
            self._normalize = normalize
            self._cdata_name = normalize(name)

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self._state]():
                break
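
    # Each of the _lex_* methods below implements one lexer state. The
    # protocol (visible in _lex above): a state method inspects
    # self._buffer, emits events to self._listener for whatever it can
    # consume, updates self._state and returns False on progress -- or
    # returns True if the buffer does not yet contain enough data to
    # finish the state ("unfinished state"), so more input is needed.
    # _lex_final is the exception: it always raises.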

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``<``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('<')
        if pos == 0:
            self._state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self._state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_cdata(self):
        """
        (PR)CDATA lexer

        State: We are inside a text element and looking for the end tag only

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        incomplete = False
        data, pos = self._buffer, 0
        while True:
            pos = data.find('<', pos)
            if pos == -1:
                pos = len(data)
                self._buffer = ''
                break
            else:
                char = data[pos + 1:pos + 2]
                if char == '/':
                    self._state = self.ENDTAG
                    break
                elif char == '':
                    incomplete = True
                    break
                else:
                    pos += 1

        if pos > 0:
            self._buffer, data = data[pos:], data[:pos]
            self._listener.handle_text(data)

        return incomplete

    #: Regex matcher for a tagname character
    #:
    #: :Type: ``callable``
    _TAGNAME_MATCH = _re.compile(r'[a-zA-Z0-9]').match

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``<`` character and now find out, what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
        state = (self.ENDTAG, self.DECL, self.PI, self.EMPTY, -1)[
            "/!?>".find(char)
        ]
        if state == -1:
            if self._TAGNAME_MATCH(char):
                state = self.STARTTAG
            else:
                state = self.TEXT
                self._buffer = data[1:]
                self._listener.handle_text(data[0])

        self._state = state
        return False

    #: Regex matcher for a start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        <
        (?P<name>[^ \t\r\n\f/>]+)
        (?P<attr>
            [^"'>]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'>]*
            )*
        )
        [ \t\r\n\f]*
        >
    ''', _re.X).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        [ \t\r\n\f]*
        (?P<name>(?:/|[^ \t\r\n\f/=>]*))    # attribute name
        [ \t\r\n\f]*
        (?:
            =
            (?P<value>                      # optional value
                [ \t\r\n\f]*"[^"]*"
              | [ \t\r\n\f]*'[^']*'
              | [^ \t\r\n\f/>]*
            )
        )?
    ''', _re.X).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``<x`` and now look for the ``>``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        name, attrstring = match.group('name', 'attr')
        attr, closed = [], False
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key == '/' and value is None:
                    closed = True
                    continue
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:  # bug protection for Python < 2.3.5 (fixed in rev 37262)
                    break

        self._state = self.TEXT
        self._listener.handle_starttag(name, attr, closed, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``</``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('>') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        if self._cdata_name is not None and \
                self._normalize(name) != self._cdata_name:
            self._state = self.CDATA
            self._listener.handle_text(data)
        else:
            self._cdata_name = self._normalize = None
            self._state = self.TEXT
            self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'--[ \t\r\n\f]*>').search

    #: Regex matcher for an IE conditional comment
    #:
    #: :Type: ``callable``
    _IE_COMMENT_MATCH = _re.compile(r'''
        \[[ \t\r\n\f]* (?:
            [iI][fF] | [eE][lL][sS][eE] | [eE][nN][dD][iI][fF]
        ) [^\]]+]>
    ''', _re.X).match

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``<!--``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 7:  # shortest complete comment is "<!---->"
            return True

        if self._conditional_ie_comments:
            match = iec = self._IE_COMMENT_MATCH(data, 4)
        else:
            match = iec = None
        if match is None:
            match = self._COMMENT_SEARCH(data, 4)
            if match is None:
                return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_comment(data)

        return False

    #: List of MS-specific marked section names (lowercased)
    #:
    #: :Type: ``tuple``
    _MSSECTIONS = ('if', 'else', 'endif')

    #: Regex matcher for the start of a marked section
    #:
    #: :Type: ``callable``
    _MSECTION_MATCH = _re.compile(r'''
        <!\[[ \t\r\n\f]*(?P<name>[^\][ \t\r\n\f>]+)(?=[\][ \t\r\n\f>])
    ''', _re.X).match

    #: Regex matcher for the start of an invalid marked section
    #:
    #: :Type: ``callable``
    _MSECTIONINVALID_MATCH = _re.compile(r'<!\[[ \t\r\n\f]*[\][>]').match

    #: Regex searcher for the end of a marked section
    #:
    #: :Type: ``callable``
    _MEND_SEARCH = _re.compile(r'][ \t\r\n\f]*][ \t\r\n\f]*>').search

    #: Regex searcher for the end of an MS-specific marked section
    #:
    #: :Type: ``callable``
    _MSEND_SEARCH = _re.compile(r'][ \t\r\n\f]*(?:--)?[ \t\r\n\f]*>').search

    def _lex_msection(self):
        """
        Marked section lexer

        State: We've hit a ``<![`` and now seek the end

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._MSECTION_MATCH(data)
        if match is None:
            match = self._MSECTIONINVALID_MATCH(data)
            if match is not None:  # pass invalid msection as text
                pos = match.end()
                self._buffer = data[pos:]
                data = data[:pos]
                self._state = self.TEXT
                self._listener.handle_text(data)
                return False
            return True

        name = match.group('name')
        start = match.end()
        if self._conditional_ie_comments and name.lower() in self._MSSECTIONS:
            match = iec = self._MSEND_SEARCH(data, start)
        else:
            pos = data.find('[', start)
            if pos >= 0:
                start = pos + 1
            match = self._MEND_SEARCH(data, start)
            iec = None
        if match is None:
            return True
        pos, end = match.end(), match.start()
        value = data[start:end]
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_msection(name, value, data)
        return False

    #: Regex matcher for a complete declaration
    #:
    #: This regex seems a bit nasty, but it should catch all stuff allowed
    #: in declarations (including doctype). Some day, it probably needs to
    #: be replaced by real lexer states...
    #:
    #: :Type: ``callable``
    _DECL_MATCH = _re.compile(r'''
        <!
        (?P<name>[^\][ \t\r\n\f>]*)
        (?P<value>
            [^"'<>-]*                   # any nonspecial
            (?:
                (?:
                    "[^"]*"             # double quoted string
                  | '[^']*'             # single quoted string (valid?)
                  | <!\[                # marked section
                    [^\]]*
                    (?:
                        ](?![ \t\r\n\f]*][ \t\r\n\f]*>)
                        [^\]]*
                    )*
                    ][ \t\r\n\f]*][ \t\r\n\f]*>
                  | <(?!!\[)            # declaration
                                        # hopefully not a doctype
                                        # (but unlikely, because we are
                                        # probably already in a DT subset)
                    [^"'>-]*
                    (?:
                        (?:
                            "[^"]*"
                          | '[^']*'
                          | --          # comment
                            [^-]*
                            (?:-[^-]+)*
                            --
                          | -(?!-)      # just a hyphen
                        )
                        [^"'>-]*
                    )*
                    >
                  | --                  # comment
                    [^-]*
                    (?:-[^-]+)*
                    --
                  | -(?!-)              # just a hyphen
                )
                [^"'<>-]*               # more non-specials
            )*
        )
        >
    ''', _re.X).match

    def _lex_decl(self):
        """
        Declaration lexer

        State: We've hit a ``<!`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 3:
            return True

        if data.startswith('<!--'):
            self._state = self.COMMENT
            return False
        elif data.startswith('<!['):
            self._state = self.MSECTION
            return False
        elif data == '<!-':
            return True

        match = self._DECL_MATCH(data)
        if match is None:
            return True

        name, value = match.group('name', 'value')
        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_decl(name, value.strip(), data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``<?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?>', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_empty(self):
        """
        Empty tag lexer

        State: We've hit a ``<>``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        self._buffer, data = self._buffer[2:], self._buffer[:2]

        self._state = self.TEXT
        self._listener.handle_starttag('', [], False, data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")

_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('CDATA', '_lex_cdata'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('COMMENT', '_lex_comment'),
        ('MSECTION', '_lex_msection'),
        ('DECL', '_lex_decl'),
        ('PI', '_lex_pi'),
        ('EMPTY', '_lex_empty'),
        ]):
    setattr(SoupLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

SoupLexer._LEXERS = tuple(_LEXERS)  # pylint: disable = W0212
SoupLexer._STATES = tuple(_STATES)  # pylint: disable = W0212
del _idx, _statename, _funcname, _LEXERS, _STATES  # pylint: disable = W0631


from tdi import c
c = c.load('impl')
if c is not None:
    DEFAULT_LEXER = c.SoupLexer
else:
    DEFAULT_LEXER = SoupLexer
del c
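

# Usage sketch (not part of the original module): drive the pure Python
# lexer with the illustrative _EventRecorder defined above. Input may
# arrive in arbitrary chunks; the lexer buffers unfinished constructs
# until enough data has been fed. SoupLexer is used directly here so the
# example does not depend on the optional C implementation.
def _example_lexing():
    """ Lex a small document in two chunks and return the raw events """
    recorder = _EventRecorder()
    lexer = SoupLexer(recorder)
    lexer.feed('<p class="intro">Hel')   # starttag is complete, text is not
    lexer.feed('lo</p><!-- done -->')
    lexer.finalize()
    # recorder.events now holds the low level event stream, e.g. the
    # starttag event (note that quoted attribute values keep their quotes):
    #   ('starttag', 'p', [('class', '"intro"')], False, '<p class="intro">')
    # followed by two text events ('Hel', 'lo'), the endtag and the comment.
    return recorder.events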


class SoupParser(object):
    """
    =========================
     (X)HTML Tag Soup Parser
    =========================

    Overview
    ~~~~~~~~

    The parser is actually a tagsoup parser by design in order to process
    most of the "HTML" that can be found out there. Of course, if the HTML
    is well-formed and valid, this would be the best. There is only as
    much HTML syntax applied as necessary to parse it. You can influence
    these syntax definitions by picking another lexer. You can change
    the semantics by picking another DTD query class.

    This parser guarantees that for each non-self-closing starttag event an
    endtag event is generated as well (if the endtag is not actually there,
    the data parameter is an empty string). This also happens for empty
    tags (like ``br``). On the other hand, there may be more endtag events
    than starttag events, because of unbalanced or wrongly nested tags.

    Special constructs (comments, PIs, marked sections and declarations)
    may occur anywhere, i.e. they do not close elements implicitly.

    The default lexer does not deal with NET tags (``<h1/Heading/``).
    Neither does it handle unfinished starttags by SGML rules like
    ``<map<area>``. It *does* know about empty tags (``<>`` and ``</>``).

    CDATA elements and comments are handled in a simplified way. Once
    the particular state is entered, it is only left when the accompanying
    end marker is found (``<script>...</script>``, ``<!-- ... -->``).
    Anything in between is text.

    How is it used?
    ~~~~~~~~~~~~~~~

    The parser API is "streamy" on the input side and event based on the
    output side. So, what you need first is a building listener, which will
    receive all generated parser events and process them. Such a listener
    object is expected to implement the `BuildingListenerInterface`.

    Now you create a `SoupParser` instance, pass the listener object to the
    constructor, and the parser is ready to be fed. You can feed as many
    chunks of input data as you like into the parser by using the `feed`
    method. Every feed call may generate multiple events on the output
    side. When you're done feeding, call the parser's `finalize` method in
    order to clean up. This also flushes pending events to the listener.

    :IVariables:
      `listener` : `BuildingListenerInterface`
        The building listener to send the events to

      `lexer` : `SoupLexer`
        The lexer instance

      `_tagstack` : ``list``
        The current tag stack

      `_inempty` : ``bool``
        Indicates if the last tag on the stack is an empty one

      `_lastopen` : ``str``
        Stores the last seen open tag name
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, dtd, lexer=None):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

          `dtd` : `DTDInterface`
            DTD query object

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance. If omitted or
            ``None``, the default lexer will be used (`DEFAULT_LEXER`).
        """
        self._tagstack, self._inempty, self._lastopen = [], False, ''
        self.listener = listener
        self._is_nestable = dtd.nestable
        self._is_cdata = dtd.cdata
        self._is_empty = dtd.empty
        if lexer is None:
            lexer = DEFAULT_LEXER
        self._lexer = lexer(self)
        self._normalize = listener.decoder.normalize

    @classmethod
    def html(cls, listener):
        """
        Construct a parser using the `HTMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.HTMLDTD())

    @classmethod
    def xml(cls, listener):
        """
        Construct a parser using the `XMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.XMLDTD())

    def _close_empty(self):
        """ Ensure we close the last empty tag """
        if self._inempty:
            self._inempty = False
            self.listener.handle_endtag(self._tagstack.pop()[1], '')

    #########################################################################
    ### ListenerInterface ###################################################
    #########################################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_text(data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self._close_empty()

        if name == '' and not attrs:
            name = self._lastopen
        else:
            self._lastopen = name

        tagstack = self._tagstack
        nestable = self._is_nestable
        starttag = self._normalize(name)
        while tagstack and not nestable(tagstack[-1][0], starttag):
            self.listener.handle_endtag(tagstack.pop()[1], '')

        if closed:
            self.listener.handle_starttag(name, attrs, closed, data)
        else:
            if self._is_cdata(starttag):
                self._lexer.cdata(self._normalize, starttag)
            self.listener.handle_starttag(name, attrs, closed, data)
            tagstack.append((starttag, name))
            if self._is_empty(starttag):
                self._inempty = True

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                self._inempty = False
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()

        self._close_empty()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_pi(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = W0613
        raise AssertionError()

    #########################################################################
    ### ParserInterface #####################################################
    #########################################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')


from tdi import c  # pylint: disable = W0404
c = c.load('impl')
if c is not None:
    DEFAULT_PARSER = c.SoupParser
else:
    DEFAULT_PARSER = SoupParser
del c
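

# End-to-end usage sketch (illustration only, not part of the original
# module). A real building listener implements the full
# `BuildingListenerInterface`; the hypothetical stubs below provide just
# enough -- the ``decoder.normalize`` hook used by SoupParser.__init__
# plus the event handlers from the _EventRecorder sketch above -- to show
# the event flow.
def _example_parsing():
    """ Parse a snippet and return the semantically fixed-up events """
    class _StubDecoder(object):
        normalize = staticmethod(lambda name: name.lower())

    class _StubBuildingListener(_EventRecorder):
        decoder = _StubDecoder()

    listener = _StubBuildingListener()
    parser = SoupParser.html(listener)          # HTML semantics via HTMLDTD
    parser.feed('<ul><li>one<li>two</ul><br>')  # note: unclosed <li> tags
    parser.finalize()
    # The HTML DTD makes the parser emit the missing </li> endtag events
    # (with empty data arguments): one before the second <li>, one while
    # handling </ul>, and finalize() emits an empty endtag for <br>.
    return listener.events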