1
2 u"""
3 :Copyright:
4
5 Copyright 2006 - 2013
6 Andr\xe9 Malo or his licensors, as applicable
7
8 :License:
9
10 Licensed under the Apache License, Version 2.0 (the "License");
11 you may not use this file except in compliance with the License.
12 You may obtain a copy of the License at
13
14 http://www.apache.org/licenses/LICENSE-2.0
15
16 Unless required by applicable law or agreed to in writing, software
17 distributed under the License is distributed on an "AS IS" BASIS,
18 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 See the License for the specific language governing permissions and
20 limitations under the License.
21
22 ============
23 HTML Tools
24 ============
25
26 HTML Tools.
27 """
28 __author__ = u"Andr\xe9 Malo"
29 __docformat__ = "restructuredtext en"
30 __all__ = [
31 'decode', 'entities', 'class_add', 'class_del', 'multiline',
32 'CommentStripFilter', 'MinifyFilter', 'minify'
33 ]
34
35 import codecs as _codecs
36 import re as _re
37 try:
38 import cStringIO as _string_io
39 except ImportError:
40 import StringIO as _string_io
41
42 from tdi import LexerError
43 from tdi import factory as _factory
44 from tdi import filters as _filters
45 from tdi import interfaces as _interfaces
46 from tdi.markup.soup import dtd as _dtd
47 from tdi.markup.soup import encoder as _encoder
48 from tdi.markup.soup import decoder as _decoder
49 from tdi.markup.soup import parser as _parser
50 from tdi.tools import css as _css
51 from tdi.tools import javascript as _javascript
52 from tdi._htmldecode import decode
53 from tdi._htmlentities import htmlentities as entities
54
55
56
57
58
59
60
61
62
63 entities = dict(entities)
64
65
67 """
68 Add class(es) to a node's class attribute
69
70 :Parameters:
71 `node` : TDI node
72 The node to modify
73
74 `class_` : ``tuple``
75 Class name(s) to add
76 """
77 try:
78 old = decode(node[u'class'], node.raw.encoder.encoding).split()
79 except KeyError:
80 class_ = u' '.join(class_)
81 else:
82 class_ = u' '.join(old + list(class_))
83 if class_:
84 node[u'class'] = class_
85 else:
86 del node[u'class']
87
88
90 """
91 Remove class(es) from node's class attribute
92
93 :Parameters:
94 `node` : TDI node
95 The node to modify
96
97 `class_` : ``tuple``
98 Class name(s) to remove. It is *not* an error if a class is not
99 defined before.
100 """
101 try:
102 old = decode(node[u'class'], node.raw.encoder.encoding).split()
103 except KeyError:
104 pass
105 else:
106 class_ = u' '.join([item for item in old if item not in class_])
107 if class_:
108 node[u'class'] = class_
109 else:
110 del node[u'class']
111
112
def _make_multiline():
    """
    Make multiline encoder

    The regular expression helpers are compiled once and captured by the
    returned function as closure variables.

    :Return: The multiline encoder function
    :Rtype: ``callable``
    """
    divmod_, len_ = divmod, len

    def space_func(match):
        """
        Space filler

        A single whitespace character becomes one plain space. A longer
        run is rebuilt from "non-breaking space + space" pairs (plus one
        leading plain space if the run length is odd), so the run keeps
        its width instead of being collapsed when rendered.

        :Parameters:
          `match` : regex match
            Match object of a whitespace run

        :Return: The replacement for the run
        :Rtype: ``unicode``
        """
        length, rest = divmod_(len_(match.group(0)), 2)
        if length == 0:
            return u' '
        # rest + 2 * length == original run length
        return u' ' * rest + u'&nbsp; ' * length

    # Spelled with escaped backslashes instead of ur'' literals: the value
    # is the same, but the ur prefix does not parse on Python 3.
    ws_sub = _re.compile(u'\\s+').sub
    ws1_sub = _re.compile(u'^\\s(\\S)').sub

    def multiline(content, encoding='ascii', tabwidth=8, xhtml=True):
        """
        Encode multiline content to HTML, assignable to ``node.raw.content``

        Markup characters are escaped, line breaks become ``<br>``
        elements, empty lines are kept visible and runs of whitespace are
        preserved using non-breaking spaces.

        :Parameters:
          `content` : ``unicode``
            Content to encode

          `encoding` : ``str``
            Target encoding

          `tabwidth` : ``int``
            Tab width? Used to expand tabs. If ``None``, tabs are not
            expanded.

          `xhtml` : ``bool``
            XHTML? Only used to determine if <br> or <br /> is emitted.

        :Return: The multilined content
        :Rtype: ``str``
        """
        # The ampersand has to be escaped first, otherwise the entities
        # produced by the following replacements would be escaped again.
        content = (content
            .replace(u'&', u'&amp;')
            .replace(u'<', u'&lt;')
            .replace(u'>', u'&gt;')
        )
        lines = []
        for line in content.splitlines():
            line = line.rstrip()
            if not line:
                # An empty line needs some content to stay visible
                line = u'&nbsp;'
            else:
                if tabwidth is not None:
                    line = line.expandtabs(tabwidth)
                # A single leading whitespace character would vanish when
                # rendered; longer runs are handled by space_func.
                line = ws1_sub(u'&nbsp;\\1', line)
                line = ws_sub(space_func, line)
            lines.append(line)
        if xhtml:
            res = u'<br />'.join(lines)
        else:
            res = u'<br>'.join(lines)
        # Characters outside the target encoding become numeric references
        return res.encode(encoding, 'xmlcharrefreplace')

    return multiline

multiline = _make_multiline()
173
174
181
182
184 """
185 Strip unneeded whitespace and comments
186
187 :IVariables:
188 `_buffer` : ``list``
189 Current text buffer
190
191 `_stack` : ``list``
192 Current tag stack
193
194 `_last` : ``str``
195 Last seen endtag name (normalized) or ``None``
196
197 `_blocks` : ``dict``
198 List of block elements (in a dict for better lookup)
199 """
200
def __init__(self, builder, comment_filter=None):
    """
    Initialization

    Sets up the empty text buffer and tag stack, the DTD helper, the
    name normalizer taken from the builder's decoder, the comment filter
    and the lookup table of block level element names.

    :Parameters:
      `builder` : `BuildingListenerInterface`
        Next level builder.

      `comment_filter` : callable
        Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all comments are stripped.
    """
    super(MinifyFilter, self).__init__(builder)
    self._buffer = []
    self._stack = []
    self._last = None
    self._dtd = _dtd.HTMLDTD()
    self._normalize = self.builder.decoder.normalize
    if comment_filter is None:
        # Default filter: every comment is reported as strippable
        comment_filter = lambda x: None
    self._comment_filter = comment_filter
    # The values are unused placeholders. This stays a dict (rather than
    # a set) because ``.keys()`` is called on it elsewhere in this class.
    self._blocks = dict.fromkeys((
        'address', 'article', 'aside', 'blockquote', 'body', 'caption',
        'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt', 'fieldset',
        'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup',
        'hr', 'html', 'isindex', 'layer', 'li', 'listing', 'map',
        'marquee', 'menu', 'multicol', 'nav', 'noframes', 'ol', 'option',
        'p', 'script', 'style', 'section', 'table', 'tbody', 'td',
        'title', 'tfoot', 'th', 'thead', 'tr', 'ul', 'xmp',
    ))
290
291
292
293
294 _WS_SUB = _re.compile(r'\s+').sub
295
296 - def _flush(self, endtag=False, starttag=None):
297 """
298 Flush the current text buffer to the builder
299
300 :Parameters:
301 `endtag` : ``bool``
302 Endtag flush?
303
304 `starttag` : ``str``
305 Next starttag (normalized) if starttag flush
306 """
307 if self._buffer:
308 self._buffer, buf, stack = [], ''.join(self._buffer), self._stack
309 if stack and \
310 (self._dtd.cdata(stack[-1]) or stack[-1] == 'pre'):
311 if stack[-1] == 'pre':
312 buf = [line.rstrip()
313 for line in buf.rstrip().splitlines(False)
314 ]
315 elif stack[-1] in ('script', 'style'):
316 buf = buf.strip().splitlines(False)
317 else:
318 buf = buf.splitlines(False)
319 buf = '\n'.join(buf)
320 else:
321 buf = self._WS_SUB(' ', buf)
322 if self._last in self._blocks:
323 buf = buf.lstrip()
324 if (endtag and stack and stack[-1] in self._blocks) \
325 or starttag in self._blocks:
326 buf = buf.rstrip()
327 self.builder.handle_text(buf)
328
330 """
331 Flush the last chunk
332
333 :See: `tdi.interfaces.BuilderInterface`
334 """
335 self._flush(starttag=self._blocks.keys()[0])
336 return self.builder.finalize()
337
def handle_text(self, data):
    """
    Collect a text chunk

    The chunk is only appended to the pending text buffer; this method
    passes nothing to the builder.

    :See: `tdi.interfaces.ListenerInterface`
    """
    pending = self._buffer
    pending.append(data)
345
347 """ :See: `tdi.interfaces.ListenerInterface` """
348 norm = self._normalize
349 norm_name = norm(name)
350 self._flush(False, norm_name)
351 if not closed:
352 self._stack.append(norm_name)
353 newattr = [(norm(key), value) for key, value in attr]
354 newattr.sort()
355 data = self.encoder.starttag(
356 norm_name, newattr, closed
357 )
358 self.builder.handle_starttag(norm_name, attr, closed, data)
359
361 """ :See: `tdi.interfaces.ListenerInterface` """
362 self._flush(True)
363 norm_name, stack = self._normalize(name), self._stack
364 if stack and norm_name == stack[-1]:
365 self._last = stack.pop()
366 if data:
367 data = self.encoder.endtag(norm_name)
368 self.builder.handle_endtag(norm_name, data)
369
375
380
382 """ :See: `tdi.interfaces.ListenerInterface` """
383 self._flush()
384 self.builder.handle_decl(name, value, data)
385
387 """ :See: `tdi.interfaces.ListenerInterface` """
388 self._flush()
389 self.builder.handle_pi(data)
390
391
def minify(html, encoding='ascii', fail_silently=False, comment_filter=None,
           cdata_containers=False):
    """
    Minify HTML

    Enclosed <script> and <style> blocks are minified as well.

    :Parameters:
      `html` : ``basestring``
        HTML to minify

      `encoding` : ``str``
        Initially assumed encoding. Only marginally interesting.

      `fail_silently` : ``bool``
        Swallow parse errors? If true, a `LexerError` raised while
        processing is swallowed and the input html is returned
        unchanged. Otherwise (the default) the error is passed to the
        caller.

      `comment_filter` : callable
        HTML Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all HTML comments are stripped.

      `cdata_containers` : ``bool``
        Add CDATA containers to enclosed <script> or <style> content? If
        true, these containers are added after minimization of the
        content. Default is false.

    :Return: the minified HTML - typed as input
    :Rtype: ``basestring``

    :Exceptions:
      - `LexerError` : The input could not be parsed and `fail_silently`
        is false
    """
    def js_minify(builder):
        """ Javascript minifier filter factory """
        return _javascript.MinifyFilter(builder, standalone=True)

    def js_cdata(builder):
        """ Javascript cdata container filter factory """
        return _javascript.CDATAFilter(builder, standalone=True)

    def css_minify(builder):
        """ CSS minifier filter factory """
        return _css.MinifyFilter(builder, standalone=True)

    def css_cdata(builder):
        """ CSS cdata container filter factory """
        return _css.CDATAFilter(builder, standalone=True)

    def html_minify(builder):
        """ HTML minifier filter factory """
        return MinifyFilter(builder, comment_filter=comment_filter)

    # The optional CDATA container filters are listed before the minifiers
    filters = []
    if cdata_containers:
        filters.extend([js_cdata, css_cdata])
    filters.extend([js_minify, css_minify, html_minify])

    # Unicode input is processed as UTF-8 bytes and decoded again at the
    # end, so the result has the same type as the input.
    isuni = isinstance(html, unicode)
    if isuni:
        html = html.encode('utf-8')
    try:
        result = _factory.Loader(
            builder=_StringBuilder,
            parser=_parser.SoupParser.html,
            encoder=_encoder.SoupEncoder,
            decoder=_decoder.HTMLDecoder,
            eventfilters=filters
        )(_string_io.StringIO(html), '<string>', encoding)
    except LexerError:
        if not fail_silently:
            raise
        # Best effort: hand back the input as it came in
        result = html
    if isuni:
        return result.decode('utf-8')
    return result
474
475
477 """ String builder """
478 __implements__ = [_interfaces.BuilderInterface,
479 _interfaces.BuildingListenerInterface]
480
481 encoding = 'ascii'
482
484 """
485 Initialization
486
487 :Parameters:
488 `encoder` : ``callable``
489 Encoder factory
490
491 `decoder` : ``callable``
492 Decoder factory
493 """
494 self._result = []
495 self.encoder = encoder(self.encoding)
496 self.decoder = decoder(self.encoding)
497
def handle_text(self, data):
    """
    Append the text chunk to the collected result

    :see: `ListenerInterface`
    """
    collected = self._result
    collected.append(data)
501
503 """ :see: `ListenerInterface` """
504
505 self._result.append(data)
506
508 """ :see: `ListenerInterface` """
509
510 self._result.append(data)
511
513 """ :see: `ListenerInterface` """
514
515 self._result.append(data)
516
520
522 """ :see: `ListenerInterface` """
523
524 self._result.append(data)
525
527 """ :see: `ListenerInterface` """
528
529 self._result.append(data)
530
532 """ :see: `ListenerInterface` """
533 self._result.append(data)
534
546
548 """ :See: `tdi.interfaces.BuilderInterface` """
549 return ''.join(self._result)
550