1
u"""
:Copyright:

Copyright 2006 - 2013
Andr\xe9 Malo or his licensors, as applicable

:License:

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=====================
Soup Filter Classes
=====================

Filters for soup templates.
"""
# Module metadata; the docstring above is written in reStructuredText.
__author__ = u"Andr\xe9 Malo"
__docformat__ = "restructuredtext en"
30
31 import re as _re
32
33 from tdi import util as _util
34 from tdi import filters as _filters
35
36
# NOTE(review): the two statements below are class-body statements (the class
# docstring and its slot declaration). The ``class`` line that owns them is
# not present in this listing -- restore it before relying on this module.
""" Extract template encoding and pass it properly to the builder """
__slots__ = ('_normalize', '_meta')
40
46
# NOTE(review): the ``def`` line was missing from the listing; the signature
# is reconstructed from the names the body uses and from the forwarding call
# at the end -- confirm against the listener interface.
def handle_starttag(self, name, attr, closed, data):
    """
    Extract encoding from HTML meta element

    Here are samples for the expected formats::

      <meta charset="utf-8"> <!-- HTML5 -->

      <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

    A non-empty ``charset`` attribute wins; only if it is missing or empty
    the ``http-equiv``/``content`` pair is inspected. Attribute values may
    still carry their surrounding quotes, which are removed here.

    The event is passed to the builder nevertheless.

    :Parameters:
      `name`
        Tag name; compared after normalization against the normalized
        name ``meta``

      `attr`
        Iterable of ``(key, value)`` attribute pairs; a value may be
        empty or ``None``

      `closed`
        Passed through to the builder unchanged

      `data`
        Passed through to the builder unchanged

    :See: `BuildingListenerInterface`
    """
    normalize = self._normalize

    if normalize(name) == self._meta:
        # Normalized attribute names make the lookups below independent of
        # the spelling used in the template.
        adict = dict((normalize(key), val) for key, val in attr)

        charset = str(adict.get(normalize('charset')) or '')
        if charset.startswith(('"', "'")):
            charset = charset[1:-1].strip()

        if charset:
            self.builder.handle_encoding(charset)
        else:
            equiv = (adict.get(normalize('http-equiv')) or '').lower()
            if equiv.startswith(('"', "'")):
                equiv = equiv[1:-1].strip()
            if equiv == 'content-type':
                ctype = adict.get(normalize('content'))
                if ctype:
                    if ctype.startswith(('"', "'")):
                        ctype = ctype[1:-1].strip()

                    parsed = _util.parse_content_type(ctype)
                    if parsed is not None:
                        # parsed[1] maps parameter names to value lists;
                        # the first charset value is the one reported.
                        encoding = parsed[1].get('charset')
                        if encoding:
                            self.builder.handle_encoding(
                                encoding[0].strip()
                            )

    # The start tag itself is always forwarded, encoding found or not.
    self.builder.handle_starttag(name, attr, closed, data)
90
91
92
93
#: Matcher for an xml declaration (``<?xml ... ?>``) anchored at the start of
#: the input; the named group ``attr`` captures the attribute part. Quoted
#: values may contain ``?``, unquoted text may not.
_PI_MATCH = _re.compile(r'''
    <\? \s* [xX][mM][lL] \s+ (?P<attr>
        [^"'?]*
        (?:
            (?:
                "[^"]*"
              | '[^']*'
            )
            [^"'?]*
        )*
    )
    \s* \?>$
''', _re.X).match
107
108
109
110
#: Iterator factory over the ``name=value`` pairs of an attribute string;
#: each match exposes the groups ``name`` and ``value``. The value keeps its
#: quotes (and any whitespace between ``=`` and the opening quote).
_PI_ATT_ITER = _re.compile(r'''
    \s*
    (?P<name>[^\s=]*)   # attribute name
    \s*
    =
    (?P<value>          # value
        \s*"[^"]*"
      | \s*'[^']*'
    )
''', _re.X).finditer
121
# NOTE(review): the ``def`` line was missing from the listing; the signature
# is reconstructed from the body and from the forwarding call at the end --
# confirm against the listener interface.
def handle_pi(self, data):
    """
    Extract encoding from xml declaration

    Here's a sample for the expected format::

      <?xml version="1.0" encoding="ascii" ?>

    If `data` is an xml declaration, an encoding is always reported to the
    builder: the value of its ``encoding`` attribute, or ``utf-8`` if that
    attribute is missing or empty. Other processing instructions do not
    report an encoding.

    The event is passed to the builder nevertheless.

    :Parameters:
      `data`
        The processing instruction as found in the template; it is
        converted with ``str()`` before matching and forwarded unchanged

    :See: `BuildingListenerInterface`
    """
    decl = self._PI_MATCH(str(data))
    if decl:
        encoding = 'utf-8'
        for item in self._PI_ATT_ITER(decl.group('attr')):
            key, value = item.group('name', 'value')
            if not (key or value):
                break
            if key == 'encoding':
                value = value.strip()
                if value.startswith(('"', "'")):
                    value = value[1:-1].strip()
                if value:
                    encoding = value
                # NOTE(review): the listing lost its indentation; this
                # assumes the scan stops at the first ``encoding``
                # attribute even when its value is empty -- confirm.
                break
        self.builder.handle_encoding(encoding)

    # The processing instruction itself is always forwarded.
    self.builder.handle_pi(data)
151
# Optional accelerator: if the ``impl`` C module can be loaded, its
# ``SoupEncodingDetectFilter`` replaces the Python ``EncodingDetectFilter``.
# ``load`` returning ``None`` leaves the Python implementation in place.
from tdi import c
c = c.load('impl')
if c is not None:
    EncodingDetectFilter = c.SoupEncodingDetectFilter
# Keep the loader handle out of the module namespace.
del c
157