u"""
:Copyright:

 Copyright 2012 - 2013
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

===================
 Text Parser Logic
===================

Text Parser.
"""
__author__ = u"Andr\xe9 Malo"
__docformat__ = "restructuredtext en"
30
import re as _re

from tdi._exceptions import LexerEOFError, LexerFinalizedError
from tdi import interfaces as _interfaces
35
36
class TextLexer(object):
    """
    Text Lexer

    Tokenizes ``[...]`` based text markup and emits events to the
    listener passed on construction. The lexer is a simple state
    machine; ``state`` indexes the ``_LEXERS``/``_STATES`` tables that
    are attached to the class right after its definition below.

    :IVariables:
      `state` : ``int``
        The current lexer state
    """

    def __init__(self, listener):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener
        """
        self._listener = listener

        self.state = self.TEXT
        # Bind one lexer method per state, indexed by state number
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self.state]
            )

        self.state = self.FINAL

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            # A state method returns True when it needs more input
            if self._lexers[self.state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``[``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('[')
        if pos == 0:
            self.state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self.state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``[`` character and now find out, what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
        if char == '/':
            state = self.ENDTAG
        elif char == '#':
            state = self.COMMENT
        elif char == '?':
            state = self.PI
        elif char == ']':
            # ``[]`` is an escaped literal ``[``
            state = self.TEXT
            self._listener.handle_escape(data[0], data[:2])
            self._buffer = data[2:]
        else:
            state = self.STARTTAG

        self.state = state
        return False

    #: Matches a regular starttag; quoted attribute values may contain
    #: brackets without terminating the tag
    _START_MATCH = _re.compile(r'''
        \[
        (
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
        )
        \]
    ''', _re.X | _re.S).match

    #: Matches a quoted starttag: ``[[...]]``
    _EMPTY_START_MATCH = _re.compile(r'''
        \[
        (
            \[
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
            \]
        )
        \]
    ''', _re.X | _re.S).match

    #: Iterates over the attributes of a starttag's attribute string
    _ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=\]]*)         # attribute name
        \s*
        (?:
            =
            (?P<value>              # optional value
                \s* "[^\\"]*(?:\\.[^\\"]*)*"
              | \s* '[^\\']*(?:\\.[^\\']*)*'
              | [^\\\s\]]*
            )
        )?
    ''', _re.X | _re.S).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``[tag`` and now look for the ``]``

        :Return: Unfinished State?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._EMPTY_START_MATCH(data) or self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        attrstring = match.group(1)
        quoted = attrstring.startswith('[')
        if quoted:
            # strip the inner brackets of a quoted ([[...]]) tag
            attrstring = attrstring[1:-1]

        splitted = attrstring.split(None, 1)
        if not splitted:
            # no tag name at all -> emit the whole thing as text
            self._listener.handle_text(data)
            self.state = self.TEXT
            return False
        name = splitted[0]
        if '=' in name:
            # first token contains ``=`` -> unnamed tag, the whole
            # string is the attribute string
            name = ''
        elif len(splitted) == 1:
            attrstring = None
        else:
            attrstring = splitted[1]

        attr = []
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    # completely empty match -> end of attribute string
                    break

        self.state = self.TEXT
        self._listener.handle_starttag(name, attr, quoted, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``[/``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find(']') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        self.state = self.TEXT
        self._listener.handle_endtag(name, data)
        return False

    #: Finds the end of a comment (``#]``)
    _COMMENT_SEARCH = _re.compile(r'#\]').search

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``[#``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 4:
            # the minimum complete comment is ``[##]``
            return True

        match = self._COMMENT_SEARCH(data, 2)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_comment(data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``[?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?]', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


# Attach the state constants (FINAL, TEXT, MARKUP, ...) and the
# state -> lexer-method tables to the class. FINAL deliberately maps
# to index 0.
_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('PI', '_lex_pi'),
        ('COMMENT', '_lex_comment'),
        ]):
    setattr(TextLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

TextLexer._LEXERS = tuple(_LEXERS)
TextLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname, _LEXERS, _STATES
353
354
class TextParser(object):
    """
    Text Parser

    Sits between a `TextLexer` and a building listener. It forwards all
    lexer events and additionally maintains a stack of open tags so
    that out-of-order or omitted endtags are closed properly.
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, lexer=TextLexer):
        """
        Initialization

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance
        """
        self._tagstack = []
        self.listener = listener
        self._lexer = lexer(self)
        self._normalize = self.listener.decoder.normalize

    #########################################################
    # ListenerInterface
    #########################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_text(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_escape(escaped, data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_starttag(name, attrs, closed, data)
        if not closed:
            # remember open tags: normalized name for comparison,
            # original name for emitting implied endtags
            self._tagstack.append((self._normalize(name), name))

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                # empty endtag closes the innermost open tag
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                # implicitly close all tags opened after the one
                # being closed; compare normalized names (comparing
                # against the raw `name` would loop past the match)
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_comment(data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_pi(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        # marked sections do not exist in text markup
        raise AssertionError()

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        # declarations do not exist in text markup
        raise AssertionError()

    #########################################################
    # ParserInterface
    #########################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            # drop the lexer reference; further feeds are errors
            self._lexer, _ = None, self._lexer.finalize()

        # close all tags still open at EOF
        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')
449