1
2 u"""
3
4 Copyright 2006 - 2013
5 Andr\xe9 Malo or his licensors, as applicable
6
7 :License:
8
9 Licensed under the Apache License, Version 2.0 (the "License");
10 you may not use this file except in compliance with the License.
11 You may obtain a copy of the License at
12
13 http://www.apache.org/licenses/LICENSE-2.0
14
15 Unless required by applicable law or agreed to in writing, software
16 distributed under the License is distributed on an "AS IS" BASIS,
17 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 See the License for the specific language governing permissions and
19 limitations under the License.
20
21 ==============
22 HTML Decoder
23 ==============
24
25 HTML Decoder.
26 """
27 __author__ = u"Andr\xe9 Malo"
28 __docformat__ = "restructuredtext en"
29
30 import re as _re
31
32 from tdi import _htmlentities
33
34
36 """ Make decoder """
37
38
39 from tdi import c
40 c = c.load('impl')
41 if c is not None:
42 return c.htmldecode
43
44 sub = _re.compile(ur'&([^& \t\n\r\f;]*);').sub
45 unicode_, unichr_, str_, int_ = unicode, unichr, str, int
46 isinstance_ = isinstance
47 default_entities = dict(_htmlentities.htmlentities)
48
49
50 def decode(value, encoding='latin-1', errors='strict', entities=None):
51 """
52 Decode HTML encoded text
53
54 :Parameters:
55 `value` : ``basestring``
56 HTML content to decode
57
58 `encoding` : ``str``
59 Unicode encoding to be applied before value is being processed
60 further. If value is already a unicode instance, the encoding is
61 ignored. If omitted, 'latin-1' is applied (because it can't fail
62 and maps bytes 1:1 to unicode codepoints).
63
64 `errors` : ``str``
65 Error handling, passed to .decode() and evaluated for entities.
66 If the entity name or character codepoint could not be found or
67 not be parsed then the error handler has the following semantics:
68
69 ``strict`` (or anything different from the other tokens below)
70 A ``ValueError`` is raised.
71
72 ``ignore``
73 The original entity is passed through
74
75 ``replace``
76 The character is replaced by the replacement character
77 (U+FFFD)
78
79 `entities` : ``dict``
80 Entity name mapping (unicode(name) -> unicode(value)). If
81 omitted or ``None``, the `HTML5 entity list`_ is applied.
82
83 .. _HTML5 entity list: http://www.w3.org/TR/html5/
84 syntax.html#named-character-references
85
86 :Return: The decoded content
87 :Rtype: ``unicode``
88 """
89
90
91 if not isinstance_(value, unicode_):
92 value = str_(value).decode(encoding, errors)
93 if entities is None:
94 entities = default_entities
95 def subber(match):
96 """ Substituter """
97 name = match.group(1)
98 if not name.startswith(u'#'):
99 try:
100 return entities[name]
101 except KeyError:
102 pass
103 else:
104 if name.startswith(u'#x') or name.startswith(u'#X'):
105 base = 16
106 name = name[2:]
107 else:
108 base = 10
109 name = name[1:]
110 try:
111 return unichr_(int_(name, base))
112 except (ValueError, TypeError, OverflowError):
113 pass
114
115 if errors == 'ignore':
116 return match.group(0)
117 elif errors == 'replace':
118 return u'\ufffd'
119 else:
120 raise ValueError(
121 "Unresolved entity %r" % (match.group(0),)
122 )
123
124 return sub(subber, value)
125 return decode
126
127 decode = _make_decode()
128