Source Code for Module tdi.markup.text.parser

# -*- coding: ascii -*-
u"""
:Copyright:

 Copyright 2012 - 2013
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

===================
 Text Parser Logic
===================

Text Parser.
"""
__author__ = u"Andr\xe9 Malo"
__docformat__ = "restructuredtext en"

import re as _re

from tdi._exceptions import LexerEOFError, LexerFinalizedError
from tdi import interfaces as _interfaces

class TextLexer(object):
    """ Text Lexer """
    # pylint: disable = E1101

    def __init__(self, listener):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener
        """
        self._listener = listener

        self.state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the remaining buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The remaining buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self.state]
            )

        self.state = self.FINAL

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self.state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``[``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('[')
        if pos == 0:
            self.state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self.state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``[`` character and now find out what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
        if char == '/':
            state = self.ENDTAG
        elif char == '#':
            state = self.COMMENT
        elif char == '?':
            state = self.PI
        elif char == ']':
            state = self.TEXT
            self._listener.handle_escape(data[0], data[:2])
            self._buffer = data[2:]
        else:
            state = self.STARTTAG

        self.state = state
        return False

    #: Regex matcher for a start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        \[
        (
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
        )
        \]
    ''', _re.X | _re.S).match

    #: Regex matcher for an empty start tag
    #:
    #: :Type: ``callable``
    _EMPTY_START_MATCH = _re.compile(r'''
        \[
        (
            \[
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
            \]
        )
        \]
    ''', _re.X | _re.S).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=\]]*)         # attribute name
        \s*
        (?:
            =
            (?P<value>              # optional value
                \s* "[^\\"]*(?:\\.[^\\"]*)*"
              | \s* '[^\\']*(?:\\.[^\\']*)*'
              | [^\\\s\]]*
            )
        )?
    ''', _re.X | _re.S).finditer
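
    # Illustrative matches (not part of the original source):
    #   _START_MATCH("[name attr='x'] rest") matches "[name attr='x']"
    #       with group(1) == "name attr='x'"
    #   _EMPTY_START_MATCH("[[name]] rest") matches "[[name]]"
    #       with group(1) == "[name]" -- the "quoted" (empty) tag form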

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``[tag`` and now look for the ``]``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._EMPTY_START_MATCH(data) or self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        attrstring = match.group(1)
        quoted = attrstring.startswith('[')
        if quoted:
            attrstring = attrstring[1:-1]

        splitted = attrstring.split(None, 1)
        if not splitted:
            self._listener.handle_text(data)
            self.state = self.TEXT
            return False
        name = splitted[0]
        if '=' in name:
            name = ''
        elif len(splitted) == 1:
            attrstring = None
        else:
            attrstring = splitted[1]

        attr = []
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:  # bug protection for Python < 2.3.5 (fixed in rev 37262)
                    break

        self.state = self.TEXT
        self._listener.handle_starttag(name, attr, quoted, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``[/``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find(']') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        self.state = self.TEXT
        self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'#\]').search

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``[#``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 4:
            return True

        match = self._COMMENT_SEARCH(data, 2)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_comment(data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``[?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?]', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after finalization

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
    ('FINAL', '_lex_final'),
    ('TEXT', '_lex_text'),
    ('MARKUP', '_lex_markup'),
    ('STARTTAG', '_lex_start'),
    ('ENDTAG', '_lex_end'),
    ('PI', '_lex_pi'),
    ('COMMENT', '_lex_comment'),
]):
    setattr(TextLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

TextLexer._LEXERS = tuple(_LEXERS)  # pylint: disable = W0212
TextLexer._STATES = tuple(_STATES)  # pylint: disable = W0212
del _idx, _statename, _funcname, _LEXERS, _STATES  # pylint: disable = W0631
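
# Resulting state constants (illustrative note, not part of the original
# source): TextLexer.FINAL == 0, TextLexer.TEXT == 1, TextLexer.MARKUP == 2,
# TextLexer.STARTTAG == 3, TextLexer.ENDTAG == 4, TextLexer.PI == 5 and
# TextLexer.COMMENT == 6; each state index selects its lexer method via
# self._lexers[self.state] in TextLexer._lex().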


class TextParser(object):
    """ Text Parser """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, lexer=TextLexer):
        """
        Initialization

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance
        """
        self._tagstack = []
        self.listener = listener
        self._lexer = lexer(self)
        self._normalize = self.listener.decoder.normalize

    #########################################################################
    ### ListenerInterface ###################################################
    #########################################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_text(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_escape(escaped, data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_starttag(name, attrs, closed, data)
        if not closed:
            self._tagstack.append((self._normalize(name), name))

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_comment(data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_pi(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = W0613
        raise AssertionError()

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = W0613
        raise AssertionError()

    #########################################################################
    ### ParserInterface #####################################################
    #########################################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')
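
A minimal usage sketch (not part of the module): drive the lexer directly
with a listener that prints each event it receives. PrintListener is
hypothetical and only implements the callbacks the lexer actually invokes.

from tdi.markup.text.parser import TextLexer

class PrintListener(object):
    """ Hypothetical listener: print each event the lexer emits """
    def handle_text(self, data):
        print('text     %r' % (data,))
    def handle_escape(self, escaped, data):
        print('escape   %r -> %r' % (data, escaped))
    def handle_starttag(self, name, attr, closed, data):
        print('starttag name=%r attr=%r closed=%r' % (name, attr, closed))
    def handle_endtag(self, name, data):
        print('endtag   %r' % (name,))
    def handle_comment(self, data):
        print('comment  %r' % (data,))
    def handle_pi(self, data):
        print('pi       %r' % (data,))

lexer = TextLexer(PrintListener())
lexer.feed("Hello [name attr='x']world[/name][# note #][?pi?]")
lexer.finalize()

This should report the two text chunks, one start tag whose attribute list
is [('attr', "'x'")] (values keep their quote characters), the matching end
tag, the comment and the processing instruction. The input may also be
split across several feed() calls: an unfinished state simply leaves the
data in the buffer until more food arrives.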