_encoded_words.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. """ Routines for manipulating RFC2047 encoded words.
  2. This is currently a package-private API, but will be considered for promotion
  3. to a public API if there is demand.
  4. """
  5. from __future__ import unicode_literals
  6. from __future__ import division
  7. from __future__ import absolute_import
  8. from future.builtins import bytes
  9. from future.builtins import chr
  10. from future.builtins import int
  11. from future.builtins import str
  12. # An encoded word looks like this:
  13. #
  14. # =?charset[*lang]?cte?encoded_string?=
  15. #
  16. # for more information about charset see the charset module. Here it is one
  17. # of the preferred MIME charset names (hopefully; you never know when parsing).
  18. # cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In
  19. # theory other letters could be used for other encodings, but in practice this
  20. # (almost?) never happens. There could be a public API for adding entries
  21. # to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is
  22. # Base64. The meaning of encoded_string should be obvious. 'lang' is optional
  23. # as indicated by the brackets (they are not part of the syntax) but is almost
  24. # never encountered in practice.
  25. #
  26. # The general interface for a CTE decoder is that it takes the encoded_string
  27. # as its argument, and returns a tuple (cte_decoded_string, defects). The
  28. # cte_decoded_string is the original binary that was encoded using the
  29. # specified cte. 'defects' is a list of MessageDefect instances indicating any
  30. # problems encountered during conversion. 'charset' and 'lang' are the
  31. # corresponding strings extracted from the EW, case preserved.
  32. #
  33. # The general interface for a CTE encoder is that it takes a binary sequence
  34. # as input and returns the cte_encoded_string, which is an ascii-only string.
  35. #
  36. # Each encoder must also supply a length function that takes the binary
  37. # sequence as its argument and returns the length of the resulting encoded
  38. # string.
  39. #
  40. # The main API functions for the module are decode, which calls the decoder
  41. # referenced by the cte specifier, and encode, which adds the appropriate
  42. # RFC 2047 "chrome" to the encoded string, and can optionally automatically
  43. # select the shortest possible encoding. See their docstrings below for
  44. # details.
  45. import re
  46. import base64
  47. import binascii
  48. import functools
  49. from string import ascii_letters, digits
  50. from future.backports.email import errors
  51. __all__ = ['decode_q',
  52. 'encode_q',
  53. 'decode_b',
  54. 'encode_b',
  55. 'len_q',
  56. 'len_b',
  57. 'decode',
  58. 'encode',
  59. ]
  60. #
  61. # Quoted Printable
  62. #
  63. # regex based decoder.
  64. _q_byte_subber = functools.partial(re.compile(br'=([a-fA-F0-9]{2})').sub,
  65. lambda m: bytes([int(m.group(1), 16)]))
  66. def decode_q(encoded):
  67. encoded = bytes(encoded.replace(b'_', b' '))
  68. return _q_byte_subber(encoded), []
  69. # dict mapping bytes to their encoded form
  70. class _QByteMap(dict):
  71. safe = bytes(b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'))
  72. def __missing__(self, key):
  73. if key in self.safe:
  74. self[key] = chr(key)
  75. else:
  76. self[key] = "={:02X}".format(key)
  77. return self[key]
  78. _q_byte_map = _QByteMap()
  79. # In headers spaces are mapped to '_'.
  80. _q_byte_map[ord(' ')] = '_'
  81. def encode_q(bstring):
  82. return str(''.join(_q_byte_map[x] for x in bytes(bstring)))
  83. def len_q(bstring):
  84. return sum(len(_q_byte_map[x]) for x in bytes(bstring))
  85. #
  86. # Base64
  87. #
  88. def decode_b(encoded):
  89. defects = []
  90. pad_err = len(encoded) % 4
  91. if pad_err:
  92. defects.append(errors.InvalidBase64PaddingDefect())
  93. padded_encoded = encoded + b'==='[:4-pad_err]
  94. else:
  95. padded_encoded = encoded
  96. try:
  97. # The validate kwarg to b64decode is not supported in Py2.x
  98. if not re.match(b'^[A-Za-z0-9+/]*={0,2}$', padded_encoded):
  99. raise binascii.Error('Non-base64 digit found')
  100. return base64.b64decode(padded_encoded), defects
  101. except binascii.Error:
  102. # Since we had correct padding, this must an invalid char error.
  103. defects = [errors.InvalidBase64CharactersDefect()]
  104. # The non-alphabet characters are ignored as far as padding
  105. # goes, but we don't know how many there are. So we'll just
  106. # try various padding lengths until something works.
  107. for i in 0, 1, 2, 3:
  108. try:
  109. return base64.b64decode(encoded+b'='*i), defects
  110. except (binascii.Error, TypeError): # Py2 raises a TypeError
  111. if i==0:
  112. defects.append(errors.InvalidBase64PaddingDefect())
  113. else:
  114. # This should never happen.
  115. raise AssertionError("unexpected binascii.Error")
  116. def encode_b(bstring):
  117. return base64.b64encode(bstring).decode('ascii')
  118. def len_b(bstring):
  119. groups_of_3, leftover = divmod(len(bstring), 3)
  120. # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in.
  121. return groups_of_3 * 4 + (4 if leftover else 0)
# Dispatch table used by decode(): maps the lower-cased cte specifier of an
# encoded word to the function that undoes that content transfer encoding.
# Each entry takes the encoded bytes and returns (decoded_bytes, defects).
_cte_decoders = {
    'q': decode_q,
    'b': decode_b,
}
  126. def decode(ew):
  127. """Decode encoded word and return (string, charset, lang, defects) tuple.
  128. An RFC 2047/2243 encoded word has the form:
  129. =?charset*lang?cte?encoded_string?=
  130. where '*lang' may be omitted but the other parts may not be.
  131. This function expects exactly such a string (that is, it does not check the
  132. syntax and may raise errors if the string is not well formed), and returns
  133. the encoded_string decoded first from its Content Transfer Encoding and
  134. then from the resulting bytes into unicode using the specified charset. If
  135. the cte-decoded string does not successfully decode using the specified
  136. character set, a defect is added to the defects list and the unknown octets
  137. are replaced by the unicode 'unknown' character \uFDFF.
  138. The specified charset and language are returned. The default for language,
  139. which is rarely if ever encountered, is the empty string.
  140. """
  141. _, charset, cte, cte_string, _ = str(ew).split('?')
  142. charset, _, lang = charset.partition('*')
  143. cte = cte.lower()
  144. # Recover the original bytes and do CTE decoding.
  145. bstring = cte_string.encode('ascii', 'surrogateescape')
  146. bstring, defects = _cte_decoders[cte](bstring)
  147. # Turn the CTE decoded bytes into unicode.
  148. try:
  149. string = bstring.decode(charset)
  150. except UnicodeError:
  151. defects.append(errors.UndecodableBytesDefect("Encoded word "
  152. "contains bytes not decodable using {} charset".format(charset)))
  153. string = bstring.decode(charset, 'surrogateescape')
  154. except LookupError:
  155. string = bstring.decode('ascii', 'surrogateescape')
  156. if charset.lower() != 'unknown-8bit':
  157. defects.append(errors.CharsetError("Unknown charset {} "
  158. "in encoded word; decoded as unknown bytes".format(charset)))
  159. return string, charset, lang, defects
# Dispatch table used by encode(): maps a cte specifier to the function that
# turns a byte sequence into its encoded text for that transfer encoding.
_cte_encoders = {
    'q': encode_q,
    'b': encode_b,
}
# Companion table: maps a cte specifier to the function that reports how
# long the encoded text would be; encode() uses it to pick an encoding when
# none is requested.
_cte_encode_length = {
    'q': len_q,
    'b': len_b,
}
  168. def encode(string, charset='utf-8', encoding=None, lang=''):
  169. """Encode string using the CTE encoding that produces the shorter result.
  170. Produces an RFC 2047/2243 encoded word of the form:
  171. =?charset*lang?cte?encoded_string?=
  172. where '*lang' is omitted unless the 'lang' parameter is given a value.
  173. Optional argument charset (defaults to utf-8) specifies the charset to use
  174. to encode the string to binary before CTE encoding it. Optional argument
  175. 'encoding' is the cte specifier for the encoding that should be used ('q'
  176. or 'b'); if it is None (the default) the encoding which produces the
  177. shortest encoded sequence is used, except that 'q' is preferred if it is up
  178. to five characters longer. Optional argument 'lang' (default '') gives the
  179. RFC 2243 language string to specify in the encoded word.
  180. """
  181. string = str(string)
  182. if charset == 'unknown-8bit':
  183. bstring = string.encode('ascii', 'surrogateescape')
  184. else:
  185. bstring = string.encode(charset)
  186. if encoding is None:
  187. qlen = _cte_encode_length['q'](bstring)
  188. blen = _cte_encode_length['b'](bstring)
  189. # Bias toward q. 5 is arbitrary.
  190. encoding = 'q' if qlen - blen < 5 else 'b'
  191. encoded = _cte_encoders[encoding](bstring)
  192. if lang:
  193. lang = '*' + lang
  194. return "=?{0}{1}?{2}?{3}?=".format(charset, lang, encoding, encoded)