Package tdi :: Module _htmldecode
[frames | no frames]

Source Code for Module tdi._htmldecode

  1  # -*- coding: ascii -*- 
  2  u""" 
  3   
  4   Copyright 2006 - 2013 
  5   Andr\xe9 Malo or his licensors, as applicable 
  6   
  7  :License: 
  8   
  9   Licensed under the Apache License, Version 2.0 (the "License"); 
 10   you may not use this file except in compliance with the License. 
 11   You may obtain a copy of the License at 
 12   
 13       http://www.apache.org/licenses/LICENSE-2.0 
 14   
 15   Unless required by applicable law or agreed to in writing, software 
 16   distributed under the License is distributed on an "AS IS" BASIS, 
 17   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 18   See the License for the specific language governing permissions and 
 19   limitations under the License. 
 20   
 21  ============== 
 22   HTML Decoder 
 23  ============== 
 24   
 25  HTML Decoder. 
 26  """ 
 27  __author__ = u"Andr\xe9 Malo" 
 28  __docformat__ = "restructuredtext en" 
 29   
 30  import re as _re 
 31   
 32  from tdi import _htmlentities 
 33   
 34   
35 -def _make_decode():
36 """ Make decoder """ 37 # pylint: disable = R0912 38 39 from tdi import c 40 c = c.load('impl') 41 if c is not None: 42 return c.htmldecode 43 44 sub = _re.compile(ur'&([^& \t\n\r\f;]*);').sub 45 unicode_, unichr_, str_, int_ = unicode, unichr, str, int 46 isinstance_ = isinstance 47 default_entities = dict(_htmlentities.htmlentities) 48 49 # pylint: disable = W0621 50 def decode(value, encoding='latin-1', errors='strict', entities=None): 51 """ 52 Decode HTML encoded text 53 54 :Parameters: 55 `value` : ``basestring`` 56 HTML content to decode 57 58 `encoding` : ``str`` 59 Unicode encoding to be applied before value is being processed 60 further. If value is already a unicode instance, the encoding is 61 ignored. If omitted, 'latin-1' is applied (because it can't fail 62 and maps bytes 1:1 to unicode codepoints). 63 64 `errors` : ``str`` 65 Error handling, passed to .decode() and evaluated for entities. 66 If the entity name or character codepoint could not be found or 67 not be parsed then the error handler has the following semantics: 68 69 ``strict`` (or anything different from the other tokens below) 70 A ``ValueError`` is raised. 71 72 ``ignore`` 73 The original entity is passed through 74 75 ``replace`` 76 The character is replaced by the replacement character 77 (U+FFFD) 78 79 `entities` : ``dict`` 80 Entity name mapping (unicode(name) -> unicode(value)). If 81 omitted or ``None``, the `HTML5 entity list`_ is applied. 82 83 .. 
_HTML5 entity list: http://www.w3.org/TR/html5/ 84 syntax.html#named-character-references 85 86 :Return: The decoded content 87 :Rtype: ``unicode`` 88 """ 89 # pylint: disable = E1101 90 # pylint: disable = R0912 91 if not isinstance_(value, unicode_): 92 value = str_(value).decode(encoding, errors) 93 if entities is None: 94 entities = default_entities 95 def subber(match): 96 """ Substituter """ 97 name = match.group(1) 98 if not name.startswith(u'#'): 99 try: 100 return entities[name] 101 except KeyError: 102 pass 103 else: 104 if name.startswith(u'#x') or name.startswith(u'#X'): 105 base = 16 106 name = name[2:] 107 else: 108 base = 10 109 name = name[1:] 110 try: 111 return unichr_(int_(name, base)) 112 except (ValueError, TypeError, OverflowError): 113 pass 114 115 if errors == 'ignore': 116 return match.group(0) 117 elif errors == 'replace': 118 return u'\ufffd' 119 else: 120 raise ValueError( 121 "Unresolved entity %r" % (match.group(0),) 122 )
123 124 return sub(subber, value) 125 return decode 126 127 decode = _make_decode() 128