# -*- coding: utf-8 -*-
# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
# Contributors: Ozan Caglayan, Wiktor Stribizew
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""

import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased.
    True

    The international_tokenize() is the preferred function when tokenizing
    non-European text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo\u2604sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    """

    # Strip "skipped" tags
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile("([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile("([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile("([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]
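
    # Illustration (editorial addition, not part of the original port): run
    # through tokenize(), the regexes above turn, e.g.,
    #   "Tickets cost $3.88, right?"
    # into
    #   ['Tickets', 'cost', '$', '3.88', ',', 'right', '?']
    # '3.88' stays intact because a period or comma is only split off when it
    # is preceded or followed by a non-digit.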

    # Perluniprops characters used in NIST tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # Python regexes need to escape some special symbols, see
    # https://stackoverflow.com/q/45670950/610569
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
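
    # Illustration (editorial addition): the substitutions above
    # backslash-escape ']', '^', '\' and '-', the characters that are special
    # inside a regex character class, so the sets can be embedded safely in
    # the character classes built below, e.g.
    #   re.sub(r"[]^\\-]", r"\\\g<0>", "a-z") == r"a\-z"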

    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
    # (i) strip leading and trailing spaces and
    # (ii) de-duplicate spaces.
    # In Python, this would do: ' '.join(str.strip().split())
    # Thus, the next two lines were commented out.
    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
    # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}

    # Pads runs of ascii characters with space; this separates ascii text
    # from adjacent non-ascii characters.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = (
        re.compile("([^{n}])([{p}])".format(n=number_regex, p=punct_regex)),
        "\\1 \\2 ",
    )
    PUNCT_2 = (
        re.compile("([{p}])([^{n}])".format(n=number_regex, p=punct_regex)),
        " \\1 \\2",
    )
    # Tokenize symbols
    SYMBOLS = re.compile("([{s}])".format(s=symbol_regex)), " \\1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
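
    # Illustration (editorial addition): for the docstring example
    # u'this is a foo\u2604sentence.', NONASCII pads the ascii runs
    # ('this is a foo' and 'sentence.') with spaces, PUNCT_1 then splits the
    # final period off 'sentence.' (it is preceded by a letter, not a digit),
    # and SYMBOLS pads the comet symbol, giving
    #   ['this', 'is', 'a', 'foo', '\u2604', 'sentence', '.']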

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It's a strange order of regexes.
        # It would be better to unescape after STRIP_EOL_HYPHEN
        # but let's keep it close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text
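
    # Illustration (editorial addition): lang_independent_sub() first drops
    # literal "<skipped>" tags, then unescapes XML entities such as "&amp;"
    # and "&quot;", then replaces the U+2028 line separator with a space,
    # e.g. '&quot;U.S.<skipped> policy&quot;' -> '"U.S. policy"'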

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regex.
        text = self.lang_independent_sub(text)
        # Language dependent regex.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = " ".join(text.split())
        # Finally, strips leading and trailing spaces
        # and converts output string into unicode.
        text = str(text.strip())
        return text if return_str else text.split()
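
    # Note (editorial addition): when western_lang=False, only the language
    # independent substitutions and the whitespace normalisation run; the
    # `lowercase` flag is also ignored in that case because the lowercasing
    # sits inside the `if western_lang:` branch above.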

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        text = str(text)
        # Unlike the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space between words.
        # Strip leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()
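

# Minimal usage sketch (editorial addition, not part of the original module).
# It assumes NLTK is installed together with the `perluniprops` corpus,
# e.g. via nltk.download('perluniprops').
if __name__ == "__main__":
    nist = NISTTokenizer()
    # return_str=True returns the tokenized string instead of a token list.
    print(nist.tokenize("Good muffins cost $3.88 in New York.", return_str=True))
    # -> Good muffins cost $ 3.88 in New York .
    print(nist.international_tokenize("Rakuten, Inc. (楽天株式会社)")[:6])
    # -> ['Rakuten', ',', 'Inc', '.', '(', '楽天株式会社']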