1 #!/usr/bin/python |
|
2 # -*- coding: utf-8 -*- |
|
3 import codecs |
|
4 import _multibytecodec as mbc |
|
5 |
|
# Name the codec answers to in lookups, e.g. some_unicode.encode("gsm0338").
ENCODING_NAME = "gsm0338"
|
7 |
|
# GSM 03.38 value -> Unicode character.  Keys and values are both
# one-character unicode strings.
#
# NOTE(review): the extension-table entries are keyed by a single code point
# such as U+1B0A, not by the two-character sequence ESC (0x1B) + 0x0A --
# confirm this is the representation callers expect.
decoding_map = {
    u"\x00": u"\u0040",  # COMMERCIAL AT
    u"\x01": u"\u00a3",  # POUND SIGN
    u"\x02": u"\u0024",  # DOLLAR SIGN
    u"\x03": u"\u00a5",  # YEN SIGN
    u"\x04": u"\u00e8",  # LATIN SMALL LETTER E WITH GRAVE
    u"\x05": u"\u00e9",  # LATIN SMALL LETTER E WITH ACUTE
    u"\x06": u"\u00f9",  # LATIN SMALL LETTER U WITH GRAVE
    u"\x07": u"\u00ec",  # LATIN SMALL LETTER I WITH GRAVE
    u"\x08": u"\u00f2",  # LATIN SMALL LETTER O WITH GRAVE
    u"\x09": u"\u00e7",  # LATIN SMALL LETTER C WITH CEDILLA
    u"\x0a": u"\u000a",  # LINE FEED
    u"\x0b": u"\u00d8",  # LATIN CAPITAL LETTER O WITH STROKE
    u"\x0c": u"\u00f8",  # LATIN SMALL LETTER O WITH STROKE
    u"\x0d": u"\u000d",  # CARRIAGE RETURN
    u"\x0e": u"\u00c5",  # LATIN CAPITAL LETTER A WITH RING ABOVE
    u"\x0f": u"\u00e5",  # LATIN SMALL LETTER A WITH RING ABOVE
    u"\x10": u"\u0394",  # GREEK CAPITAL LETTER DELTA
    u"\x11": u"\u005f",  # LOW LINE
    u"\x12": u"\u03a6",  # GREEK CAPITAL LETTER PHI
    u"\x13": u"\u0393",  # GREEK CAPITAL LETTER GAMMA
    u"\x14": u"\u039b",  # GREEK CAPITAL LETTER LAMDA
    u"\x15": u"\u03a9",  # GREEK CAPITAL LETTER OMEGA
    u"\x16": u"\u03a0",  # GREEK CAPITAL LETTER PI
    u"\x17": u"\u03a8",  # GREEK CAPITAL LETTER PSI
    u"\x18": u"\u03a3",  # GREEK CAPITAL LETTER SIGMA
    u"\x19": u"\u0398",  # GREEK CAPITAL LETTER THETA
    u"\x1a": u"\u039e",  # GREEK CAPITAL LETTER XI
    u"\x1b": u"\u00a0",  # ESCAPE TO EXTENSION TABLE (or displayed as NBSP)
    u"\u1b0a": u"\u000c",  # FORM FEED
    u"\u1b14": u"\u005e",  # CIRCUMFLEX ACCENT
    u"\u1b28": u"\u007b",  # LEFT CURLY BRACKET
    u"\u1b29": u"\u007d",  # RIGHT CURLY BRACKET
    u"\u1b2f": u"\u005c",  # REVERSE SOLIDUS
    u"\u1b3c": u"\u005b",  # LEFT SQUARE BRACKET
    u"\u1b3d": u"\u007e",  # TILDE
    u"\u1b3e": u"\u005d",  # RIGHT SQUARE BRACKET
    u"\u1b40": u"\u007c",  # VERTICAL LINE
    u"\u1b65": u"\u20ac",  # EURO SIGN
    u"\x1c": u"\u00c6",  # LATIN CAPITAL LETTER AE
    u"\x1d": u"\u00e6",  # LATIN SMALL LETTER AE
    u"\x1e": u"\u00df",  # LATIN SMALL LETTER SHARP S (German)
    u"\x1f": u"\u00c9",  # LATIN CAPITAL LETTER E WITH ACUTE
    u"\x20": u"\u0020",  # SPACE
    u"\x21": u"\u0021",  # EXCLAMATION MARK
    u"\x22": u"\u0022",  # QUOTATION MARK
    u"\x23": u"\u0023",  # NUMBER SIGN
    u"\x24": u"\u00a4",  # CURRENCY SIGN
    u"\x25": u"\u0025",  # PERCENT SIGN
    u"\x26": u"\u0026",  # AMPERSAND
    u"\x27": u"\u0027",  # APOSTROPHE
    u"\x28": u"\u0028",  # LEFT PARENTHESIS
    u"\x29": u"\u0029",  # RIGHT PARENTHESIS
    u"\x2a": u"\u002a",  # ASTERISK
    u"\x2b": u"\u002b",  # PLUS SIGN
    u"\x2c": u"\u002c",  # COMMA
    u"\x2d": u"\u002d",  # HYPHEN-MINUS
    u"\x2e": u"\u002e",  # FULL STOP
    u"\x2f": u"\u002f",  # SOLIDUS
    u"\x30": u"\u0030",  # DIGIT ZERO
    u"\x31": u"\u0031",  # DIGIT ONE
    u"\x32": u"\u0032",  # DIGIT TWO
    u"\x33": u"\u0033",  # DIGIT THREE
    u"\x34": u"\u0034",  # DIGIT FOUR
    u"\x35": u"\u0035",  # DIGIT FIVE
    u"\x36": u"\u0036",  # DIGIT SIX
    u"\x37": u"\u0037",  # DIGIT SEVEN
    u"\x38": u"\u0038",  # DIGIT EIGHT
    u"\x39": u"\u0039",  # DIGIT NINE
    u"\x3a": u"\u003a",  # COLON
    u"\x3b": u"\u003b",  # SEMICOLON
    u"\x3c": u"\u003c",  # LESS-THAN SIGN
    u"\x3d": u"\u003d",  # EQUALS SIGN
    u"\x3e": u"\u003e",  # GREATER-THAN SIGN
    u"\x3f": u"\u003f",  # QUESTION MARK
    u"\x40": u"\u00a1",  # INVERTED EXCLAMATION MARK
    u"\x41": u"\u0041",  # LATIN CAPITAL LETTER A
    u"\x42": u"\u0042",  # LATIN CAPITAL LETTER B
    u"\x43": u"\u0043",  # LATIN CAPITAL LETTER C
    u"\x44": u"\u0044",  # LATIN CAPITAL LETTER D
    u"\x45": u"\u0045",  # LATIN CAPITAL LETTER E
    u"\x46": u"\u0046",  # LATIN CAPITAL LETTER F
    u"\x47": u"\u0047",  # LATIN CAPITAL LETTER G
    u"\x48": u"\u0048",  # LATIN CAPITAL LETTER H
    u"\x49": u"\u0049",  # LATIN CAPITAL LETTER I
    u"\x4a": u"\u004a",  # LATIN CAPITAL LETTER J
    u"\x4b": u"\u004b",  # LATIN CAPITAL LETTER K
    u"\x4c": u"\u004c",  # LATIN CAPITAL LETTER L
    u"\x4d": u"\u004d",  # LATIN CAPITAL LETTER M
    u"\x4e": u"\u004e",  # LATIN CAPITAL LETTER N
    u"\x4f": u"\u004f",  # LATIN CAPITAL LETTER O
    u"\x50": u"\u0050",  # LATIN CAPITAL LETTER P
    u"\x51": u"\u0051",  # LATIN CAPITAL LETTER Q
    u"\x52": u"\u0052",  # LATIN CAPITAL LETTER R
    u"\x53": u"\u0053",  # LATIN CAPITAL LETTER S
    u"\x54": u"\u0054",  # LATIN CAPITAL LETTER T
    u"\x55": u"\u0055",  # LATIN CAPITAL LETTER U
    u"\x56": u"\u0056",  # LATIN CAPITAL LETTER V
    u"\x57": u"\u0057",  # LATIN CAPITAL LETTER W
    u"\x58": u"\u0058",  # LATIN CAPITAL LETTER X
    u"\x59": u"\u0059",  # LATIN CAPITAL LETTER Y
    u"\x5a": u"\u005a",  # LATIN CAPITAL LETTER Z
    u"\x5b": u"\u00c4",  # LATIN CAPITAL LETTER A WITH DIAERESIS
    u"\x5c": u"\u00d6",  # LATIN CAPITAL LETTER O WITH DIAERESIS
    u"\x5d": u"\u00d1",  # LATIN CAPITAL LETTER N WITH TILDE
    u"\x5e": u"\u00dc",  # LATIN CAPITAL LETTER U WITH DIAERESIS
    u"\x5f": u"\u00a7",  # SECTION SIGN
    u"\x60": u"\u00bf",  # INVERTED QUESTION MARK
    u"\x61": u"\u0061",  # LATIN SMALL LETTER A
    u"\x62": u"\u0062",  # LATIN SMALL LETTER B
    u"\x63": u"\u0063",  # LATIN SMALL LETTER C
    u"\x64": u"\u0064",  # LATIN SMALL LETTER D
    u"\x65": u"\u0065",  # LATIN SMALL LETTER E
    u"\x66": u"\u0066",  # LATIN SMALL LETTER F
    u"\x67": u"\u0067",  # LATIN SMALL LETTER G
    u"\x68": u"\u0068",  # LATIN SMALL LETTER H
    u"\x69": u"\u0069",  # LATIN SMALL LETTER I
    u"\x6a": u"\u006a",  # LATIN SMALL LETTER J
    u"\x6b": u"\u006b",  # LATIN SMALL LETTER K
    u"\x6c": u"\u006c",  # LATIN SMALL LETTER L
    u"\x6d": u"\u006d",  # LATIN SMALL LETTER M
    u"\x6e": u"\u006e",  # LATIN SMALL LETTER N
    u"\x6f": u"\u006f",  # LATIN SMALL LETTER O
    u"\x70": u"\u0070",  # LATIN SMALL LETTER P
    u"\x71": u"\u0071",  # LATIN SMALL LETTER Q
    u"\x72": u"\u0072",  # LATIN SMALL LETTER R
    u"\x73": u"\u0073",  # LATIN SMALL LETTER S
    u"\x74": u"\u0074",  # LATIN SMALL LETTER T
    u"\x75": u"\u0075",  # LATIN SMALL LETTER U
    u"\x76": u"\u0076",  # LATIN SMALL LETTER V
    u"\x77": u"\u0077",  # LATIN SMALL LETTER W
    u"\x78": u"\u0078",  # LATIN SMALL LETTER X
    u"\x79": u"\u0079",  # LATIN SMALL LETTER Y
    u"\x7a": u"\u007a",  # LATIN SMALL LETTER Z
    u"\x7b": u"\u00e4",  # LATIN SMALL LETTER A WITH DIAERESIS
    u"\x7c": u"\u00f6",  # LATIN SMALL LETTER O WITH DIAERESIS
    u"\x7d": u"\u00f1",  # LATIN SMALL LETTER N WITH TILDE
    u"\x7e": u"\u00fc",  # LATIN SMALL LETTER U WITH DIAERESIS
    u"\x7f": u"\u00e0",  # LATIN SMALL LETTER A WITH GRAVE
}

# Reverse table (Unicode character -> GSM 03.38 value), derived by swapping
# every key/value pair of decoding_map.
encoding_map = dict(
    (uni_char, gsm_char) for gsm_char, uni_char in decoding_map.items()
)
|
150 |
|
class Codec(codecs.Codec):
    """Stateless codec that translates text through the module lookup tables.

    Both directions look up each item of the input individually and
    concatenate the results.

    NOTE(review): ``errors`` is accepted so the signatures match
    ``codecs.Codec`` but it is not consulted; an item missing from the
    relevant table raises ``KeyError`` whichever handler is requested.
    """

    def encode(self, input, errors='strict'):
        """Translate *input* item by item through ``encoding_map``.

        Returns a ``(output, length)`` tuple, where *length* is the length
        of the output.

        Raises ``KeyError`` if an item has no entry in ``encoding_map``.
        """
        # One join instead of ``+=`` per item, which re-copies the partial
        # result on every step.
        ret = "".join([encoding_map[i] for i in input])
        return (ret, len(ret))

    def decode(self, input, errors='strict'):
        """Translate *input* item by item through ``decoding_map``.

        Returns a ``(output, length)`` tuple, where *length* is the length
        of the output.

        Raises ``KeyError`` if an item has no entry in ``decoding_map``.
        """
        ret = "".join([decoding_map[i] for i in input])
        return (ret, len(ret))
|
162 |
|
163 |
|
164 |
|
class StreamWriter(Codec, mbc.MultibyteStreamWriter, codecs.StreamWriter):
    """Stream writer for this codec; adds nothing to its base classes.

    NOTE(review): ``mbc.MultibyteStreamWriter`` normally expects a ``codec``
    class attribute and none is defined here -- confirm that this class can
    actually be instantiated.
    """
|
167 |
|
class StreamReader(Codec, mbc.MultibyteStreamReader, codecs.StreamReader):
    """Stream reader for this codec; adds nothing to its base classes.

    NOTE(review): ``mbc.MultibyteStreamReader`` normally expects a ``codec``
    class attribute and none is defined here -- confirm that this class can
    actually be instantiated.
    """
|
170 |
|
171 ### encodings module API |
|
172 |
|
def getregentry():
    """Return the codec entry ``(encoder, decoder, StreamReader, StreamWriter)``.

    The encoder and decoder are bound methods taken from two separate,
    freshly created ``Codec`` instances.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)
|
175 |
|
176 |
|
def gsm_search(encoding):
    """Codec search function: resolve *encoding* to this module's codec.

    Returns the tuple built by ``getregentry()`` when *encoding* equals
    ``ENCODING_NAME``, and ``None`` for every other name.
    """
    if encoding == ENCODING_NAME:
        return getregentry()
    return None
|
181 |
|
# Register the search function as a side effect of importing this module, so
# an import is all that is needed before the codec can be requested by name.
codecs.register(gsm_search)
|
184 |
|
if __name__ == "__main__":
    # Self-test: decode each UTF-8 byte sample, encode it with the registered
    # codec, and compare against the expected result.
    samples = (
        ("€öäüß", u"\u1B65\x7C\x7B\x7E\x1E"),
        ("", ""),
    )
    for raw, expected in samples:
        encoded = unicode(raw, "utf-8").encode("gsm0338")
        assert(encoded == expected)
|
192 |
|