Package tdi :: Package tools :: Module html
[frames | no frames]

Source Code for Module tdi.tools.html

  1  # -*- coding: ascii -*- 
  2  u""" 
  3  :Copyright: 
  4   
  5   Copyright 2006 - 2013 
  6   Andr\xe9 Malo or his licensors, as applicable 
  7   
  8  :License: 
  9   
 10   Licensed under the Apache License, Version 2.0 (the "License"); 
 11   you may not use this file except in compliance with the License. 
 12   You may obtain a copy of the License at 
 13   
 14       http://www.apache.org/licenses/LICENSE-2.0 
 15   
 16   Unless required by applicable law or agreed to in writing, software 
 17   distributed under the License is distributed on an "AS IS" BASIS, 
 18   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 19   See the License for the specific language governing permissions and 
 20   limitations under the License. 
 21   
 22  ============ 
 23   HTML Tools 
 24  ============ 
 25   
 26  HTML Tools. 
 27  """ 
 28  __author__ = u"Andr\xe9 Malo" 
 29  __docformat__ = "restructuredtext en" 
 30  __all__ = [ 
 31      'decode', 'entities', 'class_add', 'class_del', 'multiline', 
 32      'CommentStripFilter', 'MinifyFilter', 'minify' 
 33  ] 
 34   
 35  import codecs as _codecs 
 36  import re as _re 
 37  try: 
 38      import cStringIO as _string_io 
 39  except ImportError: 
 40      import StringIO as _string_io 
 41   
 42  from tdi import LexerError 
 43  from tdi import factory as _factory 
 44  from tdi import filters as _filters 
 45  from tdi import interfaces as _interfaces 
 46  from tdi.markup.soup import dtd as _dtd 
 47  from tdi.markup.soup import encoder as _encoder 
 48  from tdi.markup.soup import decoder as _decoder 
 49  from tdi.markup.soup import parser as _parser 
 50  from tdi.tools import css as _css 
 51  from tdi.tools import javascript as _javascript 
 52  from tdi._htmldecode import decode 
 53  from tdi._htmlentities import htmlentities as entities 
 54   
 55   
 56  #: HTML named character references, generated from 
 57  #: `the HTML5 spec`_\. Rebound below as a new ``dict`` built from the imported ``htmlentities`` object. 
 58  #: 
 59  #: .. _the HTML5 spec: http://www.w3.org/TR/html5/ 
 60  #:    syntax.html#named-character-references 
 61  #: 
 62  #: :Type: ``dict`` 
 63  entities = dict(entities) 
 64   
 65   
def class_add(node, *class_):
    """
    Append class name(s) to the ``class`` attribute of a node

    The existing attribute value (if any) is decoded using the node's
    encoder encoding and split on whitespace. The new names are appended
    and the space-joined result is written back. If the result is empty,
    the attribute is deleted instead.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to add
    """
    try:
        current = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # no class attribute yet - start from scratch
        current = []

    value = u' '.join(current + list(class_))
    if not value:
        del node[u'class']
        return
    node[u'class'] = value
87 88
def class_del(node, *class_):
    """
    Drop class name(s) from the ``class`` attribute of a node

    The attribute is rewritten with the names that remain. If no name is
    left, the attribute is deleted. A node without a ``class`` attribute is
    left alone.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to remove. Names which are not present are ignored.
    """
    try:
        present = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # no class attribute at all - nothing to remove
        return

    remaining = u' '.join([name for name in present if name not in class_])
    if not remaining:
        del node[u'class']
    else:
        node[u'class'] = remaining
111 112
def _make_multiline():
    """
    Make multiline encoder

    The regex substitutors and the builtins used by the filler function are
    bound once here and captured by the returned function.

    :Return: The ``multiline`` function
    :Rtype: ``callable``
    """
    # pylint: disable = W0621

    divmod_, len_ = divmod, len

    def space_func(match):
        """
        Space filler

        A single whitespace character becomes one plain space. A longer run
        of ``n`` characters becomes ``n // 2`` ``&nbsp;``/space pairs,
        preceded by one plain space if ``n`` is odd. This way the run keeps
        its width when rendered, while line breaking stays possible.

        :Parameters:
          `match` : match object
            Whitespace match

        :Return: The replacement
        :Rtype: ``unicode``
        """
        length, rest = divmod_(len_(match.group(0)), 2)
        if length == 0:
            return u' '
        # The entity (not literal whitespace) is required here: plain
        # spaces would collapse again when the HTML is rendered.
        return u' ' * rest + u'&nbsp; ' * length

    # Escaped (non-raw) unicode literals are used on purpose: they are
    # equivalent to the ``ur'...'`` spelling, which only Python 2 parses.
    ws_sub = _re.compile(u'\\s+').sub
    ws1_sub = _re.compile(u'^\\s(\\S)').sub

    def multiline(content, encoding='ascii', tabwidth=8, xhtml=True):
        """
        Encode multiline content to HTML, assignable to ``node.raw.content``

        ``&``, ``<`` and ``>`` are escaped. Each line is stripped of trailing
        whitespace; empty lines become ``&nbsp;``. Whitespace inside a line
        is replaced so that it is not collapsed by the browser. Lines are
        joined with a line break element.

        :Parameters:
          `content` : ``unicode``
            Content to encode

          `encoding` : ``str``
            Target encoding

          `tabwidth` : ``int``
            Tab width? Used to expand tabs. If ``None``, tabs are not
            expanded.

          `xhtml` : ``bool``
            XHTML? Only used to determine if <br> or <br /> is emitted.

        :Return: The multilined content. Characters which cannot be
                 represented in `encoding` are emitted as numeric character
                 references.
        :Rtype: ``str``
        """
        # ``&`` has to be escaped first, otherwise the entities produced
        # for ``<`` and ``>`` would be escaped again.
        content = (content
            .replace(u'&', u'&amp;')
            .replace(u'<', u'&lt;')
            .replace(u'>', u'&gt;')
        )
        lines = []
        for line in content.splitlines():
            line = line.rstrip()
            if not line:
                # keep the empty line visible
                line = u'&nbsp;'
            else:
                if tabwidth is not None:
                    line = line.expandtabs(tabwidth)
                # A single leading whitespace is handled separately, because
                # space_func would turn it into a (collapsible) plain space.
                line = ws1_sub(u'&nbsp;\\1', line)
                line = ws_sub(space_func, line)
            lines.append(line)
        if xhtml:
            res = u'<br />'.join(lines)
        else:
            res = u'<br>'.join(lines)
        return res.encode(encoding, 'xmlcharrefreplace')

    return multiline

multiline = _make_multiline()
class CommentStripFilter(_filters.BaseEventFilter):
    """ Event filter which drops every comment event """

    def handle_comment(self, data):
        """
        Swallow the comment (nothing is passed to the builder)

        :See: `tdi.interfaces.ListenerInterface`
        """
        # pylint: disable = W0613
        return None
181 182
class MinifyFilter(_filters.BaseEventFilter):
    """
    Strip unneeded whitespace and comments

    Text events are collected in a buffer. The buffer is whitespace-reduced
    and passed to the builder when a tag, a kept comment, a marked section,
    a declaration or a processing instruction arrives, or when the stream
    is finalized.

    :IVariables:
      `_buffer` : ``list``
        Current text buffer

      `_stack` : ``list``
        Current tag stack

      `_last` : ``str``
        Last seen endtag name (normalized) or ``None``

      `_blocks` : ``dict``
        List of block elements (in a dict for better lookup)
    """

    def __init__(self, builder, comment_filter=None):
        """
        Initialization

        :Parameters:
          `builder` : `BuildingListenerInterface`
            Next level builder.

          `comment_filter` : callable
            Comment filter. A function which takes the comment data and
            returns a filtered comment (which is passed through to the
            builder) or ``None`` (meaning the comment can be stripped
            completely). For example::

                def keep_ad_comments(data):
                    if 'google_ad_section' in data:
                        return data
                    return None

            If omitted or ``None``, all comments are stripped.
        """
        super(MinifyFilter, self).__init__(builder)
        self._buffer = []
        self._stack = []
        self._last = None
        self._dtd = _dtd.HTMLDTD()
        self._normalize = self.builder.decoder.normalize
        if comment_filter is None:
            comment_filter = lambda x: None
        self._comment_filter = comment_filter
        # Only the keys matter, the dict is used for membership tests.
        self._blocks = dict.fromkeys((
            'address',
            'article',
            'aside',
            'blockquote',
            'body',
            'caption',
            'col',
            'colgroup',
            'dd',
            'dir',
            'div',
            'dl',
            'dt',
            'fieldset',
            'figcaption',
            'figure',
            'footer',
            'form',
            'frame',
            'frameset',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'head',
            'header',
            'hgroup',
            'hr',
            'html',
            'isindex',
            'layer',
            'li',
            'listing',
            'map',
            'marquee',
            'menu',
            'multicol',
            'nav',
            'noframes',
            'ol',
            'option',
            'p',
            'script',
            'style',
            'section',
            'table',
            'tbody',
            'td',
            'title',
            'tfoot',
            'th',
            'thead',
            'tr',
            'ul',
            'xmp',
        ))

    #: Whitespace substitutor
    #:
    #: :Type: ``callable``
    _WS_SUB = _re.compile(r'\s+').sub

    def _flush(self, endtag=False, starttag=None):
        """
        Flush the current text buffer to the builder

        Nothing happens if the buffer is empty. Otherwise the buffered text
        is joined and reduced as follows:

        - If the innermost open element is a CDATA element (according to
          the DTD) or ``pre``, the line structure is kept. For ``pre``
          trailing whitespace is removed from the text and from each line;
          for ``script`` and ``style`` the text is stripped as a whole.
        - Otherwise each whitespace run is collapsed to a single space.
          Leading whitespace is removed if the last closed element was a
          block element. Trailing whitespace is removed if a block element
          is about to be closed or opened.

        :Parameters:
          `endtag` : ``bool``
            Endtag flush?

          `starttag` : ``str``
            Next starttag (normalized) if starttag flush
        """
        if self._buffer:
            self._buffer, buf, stack = [], ''.join(self._buffer), self._stack
            if stack and \
                    (self._dtd.cdata(stack[-1]) or stack[-1] == 'pre'):
                if stack[-1] == 'pre':
                    buf = [line.rstrip()
                        for line in buf.rstrip().splitlines(False)
                    ]
                elif stack[-1] in ('script', 'style'):
                    buf = buf.strip().splitlines(False)
                else:
                    buf = buf.splitlines(False)
                # normalize all line endings to a single newline
                buf = '\n'.join(buf)
            else:
                buf = self._WS_SUB(' ', buf)
                if self._last in self._blocks:
                    buf = buf.lstrip()
                if (endtag and stack and stack[-1] in self._blocks) \
                        or starttag in self._blocks:
                    buf = buf.rstrip()
            self.builder.handle_text(buf)

    def finalize(self):
        """
        Flush the last chunk

        :See: `tdi.interfaces.BuilderInterface`
        """
        # The end of the document is treated like the start of a block
        # element, so trailing whitespace is stripped. `_flush` only tests
        # the name for membership in `_blocks`, so any block name will do.
        self._flush(starttag='html')
        return self.builder.finalize()

    def handle_text(self, data):
        """
        Buffer the text

        :See: `tdi.interfaces.ListenerInterface`
        """
        self._buffer.append(data)

    def handle_starttag(self, name, attr, closed, data):
        """
        Flush pending text and emit the starttag in normalized form

        The tag data is rebuilt by the encoder from the normalized tag name
        and the attributes (names normalized, sorted).

        :See: `tdi.interfaces.ListenerInterface`
        """
        norm = self._normalize
        norm_name = norm(name)
        self._flush(False, norm_name)
        if not closed:
            self._stack.append(norm_name)
        newattr = [(norm(key), value) for key, value in attr]
        newattr.sort()
        data = self.encoder.starttag(
            norm_name, newattr, closed
        )
        # NOTE(review): the original `attr` (not `newattr`) is passed on,
        # while `data` reflects the normalized attributes - confirm that
        # this is intended.
        self.builder.handle_starttag(norm_name, attr, closed, data)

    def handle_endtag(self, name, data):
        """
        Flush pending text and emit the endtag in normalized form

        :See: `tdi.interfaces.ListenerInterface`
        """
        self._flush(True)
        norm_name, stack = self._normalize(name), self._stack
        # Only an endtag matching the innermost open element closes it.
        if stack and norm_name == stack[-1]:
            self._last = stack.pop()
        # Empty data is passed through untouched.
        if data:
            data = self.encoder.endtag(norm_name)
        self.builder.handle_endtag(norm_name, data)

    def handle_comment(self, data):
        """
        Pass the comment through the comment filter

        A stripped comment leaves the text buffer untouched, so the text
        around it is minified as one chunk.

        :See: `tdi.interfaces.ListenerInterface`
        """
        data = self._comment_filter(data)
        if data is not None:
            # Emit the text seen so far first. Otherwise it would show up
            # *after* the comment in the output.
            self._flush()
            self.builder.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_pi(data)
390 391
def minify(html, encoding='ascii', fail_silently=False, comment_filter=None,
           cdata_containers=False):
    """
    Minify HTML

    Enclosed <script> and <style> blocks are minified as well.

    :Parameters:
      `html` : ``basestring``
        HTML to minify

      `encoding` : ``str``
        Initially assumed encoding. Only marginally interesting.

      `fail_silently` : ``bool``
        Swallow parse errors? If true, a `LexerError` raised while
        processing the input is swallowed and the input html is returned
        as is. Otherwise (the default) the error is passed to the caller.

      `comment_filter` : callable
        HTML Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all HTML comments are stripped.

      `cdata_containers` : ``bool``
        Add CDATA containers to enclosed <script> or <style> content? If true,
        these containers are added after minimization of the content. Default
        is false.

    :Return: the minified HTML - typed as input
    :Rtype: ``basestring``

    :Exceptions:
      - `LexerError` : Parse error (only if `fail_silently` is false)
    """
    def js_minify(builder):
        """ Javascript minifier filter factory """
        return _javascript.MinifyFilter(builder, standalone=True)

    def js_cdata(builder):
        """ Javascript cdata container filter factory """
        return _javascript.CDATAFilter(builder, standalone=True)

    def css_minify(builder):
        """ CSS minifier filter factory """
        return _css.MinifyFilter(builder, standalone=True)

    def css_cdata(builder):
        """ CSS cdata container filter factory """
        return _css.CDATAFilter(builder, standalone=True)

    def html_minify(builder):
        """ HTML minifier filter factory """
        return MinifyFilter(builder, comment_filter=comment_filter)

    if cdata_containers:
        filters = [js_cdata, css_cdata]
    else:
        filters = []

    # Unicode input is processed as UTF-8 bytes and decoded again on return.
    isuni = isinstance(html, unicode)
    if isuni:
        html = html.encode('utf-8')
    try:
        result = _factory.Loader(
            builder=_StringBuilder,
            parser=_parser.SoupParser.html,
            encoder=_encoder.SoupEncoder,
            decoder=_decoder.HTMLDecoder,
            eventfilters=filters + [
                js_minify,
                css_minify,
                html_minify,
            ]
        )(_string_io.StringIO(html), '<string>', encoding)
    except LexerError:
        if not fail_silently:
            raise
        # best effort: hand back the input
        result = html
    if isuni:
        return result.decode('utf-8')
    return result
class _StringBuilder(object):
    """
    String builder

    Collects the raw data of every event it receives and joins the pieces
    on `finalize`.
    """
    __implements__ = [_interfaces.BuilderInterface,
                      _interfaces.BuildingListenerInterface]

    encoding = 'ascii'

    def __init__(self, encoder, decoder):
        """
        Initialization

        :Parameters:
          `encoder` : ``callable``
            Encoder factory

          `decoder` : ``callable``
            Decoder factory
        """
        self._result = []
        self.encoder = encoder(self.encoding)
        self.decoder = decoder(self.encoding)

    def _collect(self, data):
        """ Append a piece of raw data to the result list """
        self._result.append(data)

    def handle_text(self, data):
        """ :see: `ListenerInterface` """
        self._collect(data)

    def handle_escape(self, escaped, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._collect(data)

    def handle_starttag(self, name, attr, closed, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._collect(data)

    def handle_endtag(self, name, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._collect(data)

    def handle_comment(self, data):
        """ :see: `ListenerInterface` """
        self._collect(data)

    def handle_msection(self, name, value, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._collect(data)

    def handle_decl(self, name, value, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._collect(data)

    def handle_pi(self, data):
        """ :see: `ListenerInterface` """
        self._collect(data)

    def handle_encoding(self, encoding):
        """
        Take over a new encoding, if it is known to the codecs registry

        :See: `tdi.interfaces.BuildingListenerInterface`
        """
        try:
            _codecs.lookup(encoding)
        except LookupError:
            # unknown encoding - stick with the current one
            return
        if self.encoding == encoding:
            return
        self.encoding = encoding
        self.encoder.encoding = encoding
        self.decoder.encoding = encoding

    def finalize(self):
        """ :See: `tdi.interfaces.BuilderInterface` """
        return ''.join(self._result)
550