_parseaddr.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. # Copyright (C) 2002-2007 Python Software Foundation
  2. # Contact: email-sig@python.org
  3. """Email address parsing code.
  4. Lifted directly from rfc822.py. This should eventually be rewritten.
  5. """
  6. from __future__ import unicode_literals
  7. from __future__ import print_function
  8. from __future__ import division
  9. from __future__ import absolute_import
  10. from future.builtins import int
  11. __all__ = [
  12. 'mktime_tz',
  13. 'parsedate',
  14. 'parsedate_tz',
  15. 'quote',
  16. ]
  17. import time, calendar
  18. SPACE = ' '
  19. EMPTYSTRING = ''
  20. COMMASPACE = ', '
  21. # Parse a date field
  22. _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
  23. 'aug', 'sep', 'oct', 'nov', 'dec',
  24. 'january', 'february', 'march', 'april', 'may', 'june', 'july',
  25. 'august', 'september', 'october', 'november', 'december']
  26. _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
  27. # The timezone table does not include the military time zones defined
  28. # in RFC822, other than Z. According to RFC1123, the description in
  29. # RFC822 gets the signs wrong, so we can't rely on any such time
  30. # zones. RFC1123 recommends that numeric timezone indicators be used
  31. # instead of timezone names.
  32. _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
  33. 'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
  34. 'EST': -500, 'EDT': -400, # Eastern
  35. 'CST': -600, 'CDT': -500, # Central
  36. 'MST': -700, 'MDT': -600, # Mountain
  37. 'PST': -800, 'PDT': -700 # Pacific
  38. }
  39. def parsedate_tz(data):
  40. """Convert a date string to a time tuple.
  41. Accounts for military timezones.
  42. """
  43. res = _parsedate_tz(data)
  44. if not res:
  45. return
  46. if res[9] is None:
  47. res[9] = 0
  48. return tuple(res)
  49. def _parsedate_tz(data):
  50. """Convert date to extended time tuple.
  51. The last (additional) element is the time zone offset in seconds, except if
  52. the timezone was specified as -0000. In that case the last element is
  53. None. This indicates a UTC timestamp that explicitly declaims knowledge of
  54. the source timezone, as opposed to a +0000 timestamp that indicates the
  55. source timezone really was UTC.
  56. """
  57. if not data:
  58. return
  59. data = data.split()
  60. # The FWS after the comma after the day-of-week is optional, so search and
  61. # adjust for this.
  62. if data[0].endswith(',') or data[0].lower() in _daynames:
  63. # There's a dayname here. Skip it
  64. del data[0]
  65. else:
  66. i = data[0].rfind(',')
  67. if i >= 0:
  68. data[0] = data[0][i+1:]
  69. if len(data) == 3: # RFC 850 date, deprecated
  70. stuff = data[0].split('-')
  71. if len(stuff) == 3:
  72. data = stuff + data[1:]
  73. if len(data) == 4:
  74. s = data[3]
  75. i = s.find('+')
  76. if i == -1:
  77. i = s.find('-')
  78. if i > 0:
  79. data[3:] = [s[:i], s[i:]]
  80. else:
  81. data.append('') # Dummy tz
  82. if len(data) < 5:
  83. return None
  84. data = data[:5]
  85. [dd, mm, yy, tm, tz] = data
  86. mm = mm.lower()
  87. if mm not in _monthnames:
  88. dd, mm = mm, dd.lower()
  89. if mm not in _monthnames:
  90. return None
  91. mm = _monthnames.index(mm) + 1
  92. if mm > 12:
  93. mm -= 12
  94. if dd[-1] == ',':
  95. dd = dd[:-1]
  96. i = yy.find(':')
  97. if i > 0:
  98. yy, tm = tm, yy
  99. if yy[-1] == ',':
  100. yy = yy[:-1]
  101. if not yy[0].isdigit():
  102. yy, tz = tz, yy
  103. if tm[-1] == ',':
  104. tm = tm[:-1]
  105. tm = tm.split(':')
  106. if len(tm) == 2:
  107. [thh, tmm] = tm
  108. tss = '0'
  109. elif len(tm) == 3:
  110. [thh, tmm, tss] = tm
  111. elif len(tm) == 1 and '.' in tm[0]:
  112. # Some non-compliant MUAs use '.' to separate time elements.
  113. tm = tm[0].split('.')
  114. if len(tm) == 2:
  115. [thh, tmm] = tm
  116. tss = 0
  117. elif len(tm) == 3:
  118. [thh, tmm, tss] = tm
  119. else:
  120. return None
  121. try:
  122. yy = int(yy)
  123. dd = int(dd)
  124. thh = int(thh)
  125. tmm = int(tmm)
  126. tss = int(tss)
  127. except ValueError:
  128. return None
  129. # Check for a yy specified in two-digit format, then convert it to the
  130. # appropriate four-digit format, according to the POSIX standard. RFC 822
  131. # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
  132. # mandates a 4-digit yy. For more information, see the documentation for
  133. # the time module.
  134. if yy < 100:
  135. # The year is between 1969 and 1999 (inclusive).
  136. if yy > 68:
  137. yy += 1900
  138. # The year is between 2000 and 2068 (inclusive).
  139. else:
  140. yy += 2000
  141. tzoffset = None
  142. tz = tz.upper()
  143. if tz in _timezones:
  144. tzoffset = _timezones[tz]
  145. else:
  146. try:
  147. tzoffset = int(tz)
  148. except ValueError:
  149. pass
  150. if tzoffset==0 and tz.startswith('-'):
  151. tzoffset = None
  152. # Convert a timezone offset into seconds ; -0500 -> -18000
  153. if tzoffset:
  154. if tzoffset < 0:
  155. tzsign = -1
  156. tzoffset = -tzoffset
  157. else:
  158. tzsign = 1
  159. tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
  160. # Daylight Saving Time flag is set to -1, since DST is unknown.
  161. return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
  162. def parsedate(data):
  163. """Convert a time string to a time tuple."""
  164. t = parsedate_tz(data)
  165. if isinstance(t, tuple):
  166. return t[:9]
  167. else:
  168. return t
  169. def mktime_tz(data):
  170. """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
  171. if data[9] is None:
  172. # No zone info, so localtime is better assumption than GMT
  173. return time.mktime(data[:8] + (-1,))
  174. else:
  175. t = calendar.timegm(data)
  176. return t - data[9]
  177. def quote(str):
  178. """Prepare string to be used in a quoted string.
  179. Turns backslash and double quote characters into quoted pairs. These
  180. are the only characters that need to be quoted inside a quoted string.
  181. Does not add the surrounding double quotes.
  182. """
  183. return str.replace('\\', '\\\\').replace('"', '\\"')
  184. class AddrlistClass(object):
  185. """Address parser class by Ben Escoto.
  186. To understand what this class does, it helps to have a copy of RFC 2822 in
  187. front of you.
  188. Note: this class interface is deprecated and may be removed in the future.
  189. Use email.utils.AddressList instead.
  190. """
  191. def __init__(self, field):
  192. """Initialize a new instance.
  193. `field' is an unparsed address header field, containing
  194. one or more addresses.
  195. """
  196. self.specials = '()<>@,:;.\"[]'
  197. self.pos = 0
  198. self.LWS = ' \t'
  199. self.CR = '\r\n'
  200. self.FWS = self.LWS + self.CR
  201. self.atomends = self.specials + self.LWS + self.CR
  202. # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
  203. # is obsolete syntax. RFC 2822 requires that we recognize obsolete
  204. # syntax, so allow dots in phrases.
  205. self.phraseends = self.atomends.replace('.', '')
  206. self.field = field
  207. self.commentlist = []
  208. def gotonext(self):
  209. """Skip white space and extract comments."""
  210. wslist = []
  211. while self.pos < len(self.field):
  212. if self.field[self.pos] in self.LWS + '\n\r':
  213. if self.field[self.pos] not in '\n\r':
  214. wslist.append(self.field[self.pos])
  215. self.pos += 1
  216. elif self.field[self.pos] == '(':
  217. self.commentlist.append(self.getcomment())
  218. else:
  219. break
  220. return EMPTYSTRING.join(wslist)
  221. def getaddrlist(self):
  222. """Parse all addresses.
  223. Returns a list containing all of the addresses.
  224. """
  225. result = []
  226. while self.pos < len(self.field):
  227. ad = self.getaddress()
  228. if ad:
  229. result += ad
  230. else:
  231. result.append(('', ''))
  232. return result
  233. def getaddress(self):
  234. """Parse the next address."""
  235. self.commentlist = []
  236. self.gotonext()
  237. oldpos = self.pos
  238. oldcl = self.commentlist
  239. plist = self.getphraselist()
  240. self.gotonext()
  241. returnlist = []
  242. if self.pos >= len(self.field):
  243. # Bad email address technically, no domain.
  244. if plist:
  245. returnlist = [(SPACE.join(self.commentlist), plist[0])]
  246. elif self.field[self.pos] in '.@':
  247. # email address is just an addrspec
  248. # this isn't very efficient since we start over
  249. self.pos = oldpos
  250. self.commentlist = oldcl
  251. addrspec = self.getaddrspec()
  252. returnlist = [(SPACE.join(self.commentlist), addrspec)]
  253. elif self.field[self.pos] == ':':
  254. # address is a group
  255. returnlist = []
  256. fieldlen = len(self.field)
  257. self.pos += 1
  258. while self.pos < len(self.field):
  259. self.gotonext()
  260. if self.pos < fieldlen and self.field[self.pos] == ';':
  261. self.pos += 1
  262. break
  263. returnlist = returnlist + self.getaddress()
  264. elif self.field[self.pos] == '<':
  265. # Address is a phrase then a route addr
  266. routeaddr = self.getrouteaddr()
  267. if self.commentlist:
  268. returnlist = [(SPACE.join(plist) + ' (' +
  269. ' '.join(self.commentlist) + ')', routeaddr)]
  270. else:
  271. returnlist = [(SPACE.join(plist), routeaddr)]
  272. else:
  273. if plist:
  274. returnlist = [(SPACE.join(self.commentlist), plist[0])]
  275. elif self.field[self.pos] in self.specials:
  276. self.pos += 1
  277. self.gotonext()
  278. if self.pos < len(self.field) and self.field[self.pos] == ',':
  279. self.pos += 1
  280. return returnlist
  281. def getrouteaddr(self):
  282. """Parse a route address (Return-path value).
  283. This method just skips all the route stuff and returns the addrspec.
  284. """
  285. if self.field[self.pos] != '<':
  286. return
  287. expectroute = False
  288. self.pos += 1
  289. self.gotonext()
  290. adlist = ''
  291. while self.pos < len(self.field):
  292. if expectroute:
  293. self.getdomain()
  294. expectroute = False
  295. elif self.field[self.pos] == '>':
  296. self.pos += 1
  297. break
  298. elif self.field[self.pos] == '@':
  299. self.pos += 1
  300. expectroute = True
  301. elif self.field[self.pos] == ':':
  302. self.pos += 1
  303. else:
  304. adlist = self.getaddrspec()
  305. self.pos += 1
  306. break
  307. self.gotonext()
  308. return adlist
  309. def getaddrspec(self):
  310. """Parse an RFC 2822 addr-spec."""
  311. aslist = []
  312. self.gotonext()
  313. while self.pos < len(self.field):
  314. preserve_ws = True
  315. if self.field[self.pos] == '.':
  316. if aslist and not aslist[-1].strip():
  317. aslist.pop()
  318. aslist.append('.')
  319. self.pos += 1
  320. preserve_ws = False
  321. elif self.field[self.pos] == '"':
  322. aslist.append('"%s"' % quote(self.getquote()))
  323. elif self.field[self.pos] in self.atomends:
  324. if aslist and not aslist[-1].strip():
  325. aslist.pop()
  326. break
  327. else:
  328. aslist.append(self.getatom())
  329. ws = self.gotonext()
  330. if preserve_ws and ws:
  331. aslist.append(ws)
  332. if self.pos >= len(self.field) or self.field[self.pos] != '@':
  333. return EMPTYSTRING.join(aslist)
  334. aslist.append('@')
  335. self.pos += 1
  336. self.gotonext()
  337. return EMPTYSTRING.join(aslist) + self.getdomain()
  338. def getdomain(self):
  339. """Get the complete domain name from an address."""
  340. sdlist = []
  341. while self.pos < len(self.field):
  342. if self.field[self.pos] in self.LWS:
  343. self.pos += 1
  344. elif self.field[self.pos] == '(':
  345. self.commentlist.append(self.getcomment())
  346. elif self.field[self.pos] == '[':
  347. sdlist.append(self.getdomainliteral())
  348. elif self.field[self.pos] == '.':
  349. self.pos += 1
  350. sdlist.append('.')
  351. elif self.field[self.pos] in self.atomends:
  352. break
  353. else:
  354. sdlist.append(self.getatom())
  355. return EMPTYSTRING.join(sdlist)
  356. def getdelimited(self, beginchar, endchars, allowcomments=True):
  357. """Parse a header fragment delimited by special characters.
  358. `beginchar' is the start character for the fragment.
  359. If self is not looking at an instance of `beginchar' then
  360. getdelimited returns the empty string.
  361. `endchars' is a sequence of allowable end-delimiting characters.
  362. Parsing stops when one of these is encountered.
  363. If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
  364. within the parsed fragment.
  365. """
  366. if self.field[self.pos] != beginchar:
  367. return ''
  368. slist = ['']
  369. quote = False
  370. self.pos += 1
  371. while self.pos < len(self.field):
  372. if quote:
  373. slist.append(self.field[self.pos])
  374. quote = False
  375. elif self.field[self.pos] in endchars:
  376. self.pos += 1
  377. break
  378. elif allowcomments and self.field[self.pos] == '(':
  379. slist.append(self.getcomment())
  380. continue # have already advanced pos from getcomment
  381. elif self.field[self.pos] == '\\':
  382. quote = True
  383. else:
  384. slist.append(self.field[self.pos])
  385. self.pos += 1
  386. return EMPTYSTRING.join(slist)
  387. def getquote(self):
  388. """Get a quote-delimited fragment from self's field."""
  389. return self.getdelimited('"', '"\r', False)
  390. def getcomment(self):
  391. """Get a parenthesis-delimited fragment from self's field."""
  392. return self.getdelimited('(', ')\r', True)
  393. def getdomainliteral(self):
  394. """Parse an RFC 2822 domain-literal."""
  395. return '[%s]' % self.getdelimited('[', ']\r', False)
  396. def getatom(self, atomends=None):
  397. """Parse an RFC 2822 atom.
  398. Optional atomends specifies a different set of end token delimiters
  399. (the default is to use self.atomends). This is used e.g. in
  400. getphraselist() since phrase endings must not include the `.' (which
  401. is legal in phrases)."""
  402. atomlist = ['']
  403. if atomends is None:
  404. atomends = self.atomends
  405. while self.pos < len(self.field):
  406. if self.field[self.pos] in atomends:
  407. break
  408. else:
  409. atomlist.append(self.field[self.pos])
  410. self.pos += 1
  411. return EMPTYSTRING.join(atomlist)
  412. def getphraselist(self):
  413. """Parse a sequence of RFC 2822 phrases.
  414. A phrase is a sequence of words, which are in turn either RFC 2822
  415. atoms or quoted-strings. Phrases are canonicalized by squeezing all
  416. runs of continuous whitespace into one space.
  417. """
  418. plist = []
  419. while self.pos < len(self.field):
  420. if self.field[self.pos] in self.FWS:
  421. self.pos += 1
  422. elif self.field[self.pos] == '"':
  423. plist.append(self.getquote())
  424. elif self.field[self.pos] == '(':
  425. self.commentlist.append(self.getcomment())
  426. elif self.field[self.pos] in self.phraseends:
  427. break
  428. else:
  429. plist.append(self.getatom(self.phraseends))
  430. return plist
  431. class AddressList(AddrlistClass):
  432. """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
  433. def __init__(self, field):
  434. AddrlistClass.__init__(self, field)
  435. if field:
  436. self.addresslist = self.getaddrlist()
  437. else:
  438. self.addresslist = []
  439. def __len__(self):
  440. return len(self.addresslist)
  441. def __add__(self, other):
  442. # Set union
  443. newaddr = AddressList(None)
  444. newaddr.addresslist = self.addresslist[:]
  445. for x in other.addresslist:
  446. if not x in self.addresslist:
  447. newaddr.addresslist.append(x)
  448. return newaddr
  449. def __iadd__(self, other):
  450. # Set union, in-place
  451. for x in other.addresslist:
  452. if not x in self.addresslist:
  453. self.addresslist.append(x)
  454. return self
  455. def __sub__(self, other):
  456. # Set difference
  457. newaddr = AddressList(None)
  458. for x in self.addresslist:
  459. if not x in other.addresslist:
  460. newaddr.addresslist.append(x)
  461. return newaddr
  462. def __isub__(self, other):
  463. # Set difference, in-place
  464. for x in other.addresslist:
  465. if x in self.addresslist:
  466. self.addresslist.remove(x)
  467. return self
  468. def __getitem__(self, index):
  469. # Make indexing, slices, and 'in' work
  470. return self.addresslist[index]