# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
  9. r"""
  10. Penn Treebank Tokenizer
  11. The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
  12. This implementation is a port of the tokenizer sed script written by Robert McIntyre
  13. and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
  14. """

import re

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
from nltk.tokenize.destructive import MacIntyreContractions


class TreebankWordTokenizer(TokenizerI):
    """
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This tokenizer is the one invoked by ``word_tokenize()``. It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
        >>> s = "They'll save and invest more."
        >>> TreebankWordTokenizer().tokenize(s)
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
        >>> s = "hi, my name can't hello,"
        >>> TreebankWordTokenizer().tokenize(s)
        ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    """

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (re.compile(r"\.\.\."), r" ... "),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),  # Handles the final period.
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
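    # Illustrative example (added note): applying PARENS_BRACKETS to "(York)."
    # pads each bracket, yielding " ( York ) .".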

    # Optionally: Convert parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r'"'), " '' "),
        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
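    # For reference (added note): CONTRACTIONS2 covers two-part forms such as
    # "cannot" -> "can not", and CONTRACTIONS3 covers forms like "'tis" -> "'t is";
    # see MacIntyreContractions for the exact patterns.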

    def tokenize(self, text, convert_parentheses=False, return_str=False):
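        """
        Return a tokenized copy of ``text``.

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses and brackets
            with PTB symbols, e.g. ``(`` -> ``-LRB-``; defaults to False.
        :type convert_parentheses: bool
        :param return_str: if True, return the space-padded string instead of
            splitting it into a list of tokens; defaults to False.
        :type return_str: bool
        :return: a list of tokens, or a single string if ``return_str`` is True.
        :rtype: list(str) or str
        """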
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)

        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self._contractions.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text if return_str else text.split()

    def span_tokenize(self, text):
        """
        Returns the offset spans of the tokens, computed post hoc with
        ``nltk.tokenize.util.align_tokens``.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        Additional example:

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\\n each in New (York)."'''
            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
            ... (82, 83), (83, 84)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        for tok in align_tokens(tokens, text):
            yield tok


class TreebankWordDetokenizer(TokenizerI):
    """
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - There are additional assumptions made when undoing the padding of the
      [;@#$%&] punctuation symbols that are not presupposed in the
      TreebankTokenizer.
    - There are additional regexes added in reversing the parentheses tokenization,
      such as the r'([\]\)\}\>])\s([:;,.])' rule, which removes the additional right
      padding added to closing parentheses preceding [:;,.].
    - It is not possible to return the original whitespaces as they were, because
      there is no explicit record of where '\n', '\t' or '\s' characters were
      removed by the text.split() operation.

        >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
        >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'''
        >>> d = TreebankWordDetokenizer()
        >>> t = TreebankWordTokenizer()
        >>> toks = t.tokenize(s)
        >>> d.detokenize(toks)
        'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the `convert_parentheses`
    parameter:

        >>> s = '''Good muffins cost $3.88\\nin New (York). Please (buy) me\\ntwo of them.\\n(Thanks).'''
        >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
        ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
        >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
        True
        >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
        >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
        True

    During tokenization it is safe to add more spaces, but during detokenization
    simply undoing the padding is not enough:

    - During tokenization, a left and right pad is added to [!?]. When
      detokenizing, only the left pad of [!?] needs to be removed.
      Thus (re.compile(r'\s([?!])'), r'\g<1>').
    - During tokenization, [:,] are left and right padded, but when detokenizing
      only the left pad is removed, and the right pad after a comma/colon is kept
      if the following string is a non-digit.
      Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2').

        >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
        >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
        >>> twd = TreebankWordDetokenizer()
        >>> twd.detokenize(toks)
        "hello, i can't feel my feet! Help!!"

        >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
        ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
        >>> twd.detokenize(toks)
        "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    """

    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS2
    ]
    CONTRACTIONS3 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS3
    ]

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
        (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
        (re.compile(r"(\S)(\'\')"), r"\1\2 "),
        (re.compile(r" '' "), '"'),
    ]
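    # Illustrative example (added note): the first rule above rejoins split
    # contractions during detokenization, e.g. "i ca n't feel" -> "i can't feel".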

    # Handles double dashes
    DOUBLE_DASHES = (re.compile(r" -- "), r"--")

    # Optionally: Convert parentheses and brackets back from PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile("-LRB-"), "("),
        (re.compile("-RRB-"), ")"),
        (re.compile("-LSB-"), "["),
        (re.compile("-RSB-"), "]"),
        (re.compile("-LCB-"), "{"),
        (re.compile("-RCB-"), "}"),
    ]

    # Undo padding on parentheses.
    PARENS_BRACKETS = [
        (re.compile(r"\s([\[\(\{\<])\s"), r" \g<1>"),
        (re.compile(r"\s([\]\)\}\>])\s"), r"\g<1> "),
        (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
    ]
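    # Illustrative trace (added note, assuming tokenizer-style padding): applying
    # the three substitutions above in order turns " ( York ) ." into " (York).".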

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([^'])\s'\s"), r"\1' "),
        (re.compile(r"\s([?!])"), r"\g<1>"),  # Strip left pad for [?!]
        # (re.compile(r'\s([?!])\s'), r'\g<1>'),
        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
        # When tokenizing, [;@#$%&] are padded with whitespace regardless of
        # whether there are spaces before or after them.
        # But during detokenization, we need to distinguish between left/right
        # pad, so we split this up.
        (re.compile(r"\s([#$])\s"), r" \g<1>"),  # Left pad.
        (re.compile(r"\s([;%])\s"), r"\g<1> "),  # Right pad.
        (re.compile(r"\s([&*])\s"), r" \g<1> "),  # Unknown pad.
        (re.compile(r"\s\.\.\.\s"), r"..."),
        (re.compile(r"\s([:,])\s$"), r"\1"),
        (
            re.compile(r"\s([:,])\s([^\d])"),
            r"\1 \2",
        ),  # Keep right pad after comma/colon before non-digits.
        # (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
    ]

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"([ (\[{<])\s``"), r'\1"'),
        (re.compile(r"\s(``)\s"), r"\1"),
        (re.compile(r"^``"), r"\""),
    ]

    def tokenize(self, tokens, convert_parentheses=False):
        """
        Treebank detokenizer, created by undoing the regexes from
        TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :param convert_parentheses: if True, convert PTB symbols back to
            parentheses and brackets, e.g. ``-LRB-`` -> ``(``; defaults to False.
        :type convert_parentheses: bool
        :return: The detokenized string.
        :rtype: str
        """
        text = " ".join(tokens)

        # Reverse the contractions regexes.
        # Note: CONTRACTIONS4 are not used in tokenization.
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r"\1\2", text)
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r"\1\2", text)

        # Reverse the regexes applied for ending quotes.
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # Undo the space padding.
        text = text.strip()

        # Reverse the padding on double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Reverse the padding regexes applied for parenthesis/brackets.
        for regexp, substitution in self.PARENS_BRACKETS:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for punctuations.
        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for starting quotes.
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.strip()

    def detokenize(self, tokens, convert_parentheses=False):
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, convert_parentheses)
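

# Minimal round-trip sketch (added for illustration; not part of the original
# module): tokenize a sentence and put it back together with the detokenizer,
# mirroring the doctests above.
if __name__ == "__main__":
    _tokenizer = TreebankWordTokenizer()
    _detokenizer = TreebankWordDetokenizer()
    _tokens = _tokenizer.tokenize("They'll save and invest more.")
    print(_tokens)  # ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    print(_detokenizer.detokenize(_tokens))  # They'll save and invest more.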