#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2020 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
#         Alex Constantin <alex@keyworder.ch>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import tempfile
import os
import json
import warnings
from subprocess import PIPE

from nltk.internals import (
    find_jar,
    find_file,
    find_dir,
    config_java,
    java,
    _java_options,
)
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software"


class StanfordSegmenter(TokenizerI):
    """Interface to the Stanford Segmenter

    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provided, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> seg = StanfordSegmenter()
    >>> seg.default_config('zh')
    >>> sent = u'这是斯坦福中文分词器测试'
    >>> print(seg.segment(sent))
    \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
    <BLANKLINE>
    >>> seg.default_config('ar')
    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
    >>> print(seg.segment(sent.split()))
    \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
    <BLANKLINE>
    """

    _JAR = "stanford-segmenter.jar"

    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing="false",
        keep_whitespaces="false",
        encoding="UTF-8",
        options=None,
        verbose=False,
        java_options="-mx2g",
    ):
        # Raise deprecation warning.
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            str(
                "\nThe StanfordSegmenter will "
                "be deprecated in version 3.2.5.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
            ),
            DeprecationWarning,
            stacklevel=2,
        )
        warnings.simplefilter("ignore", DeprecationWarning)

        stanford_segmenter = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=("STANFORD_SEGMENTER",),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )
        if path_to_slf4j is not None:
            slf4j = find_jar(
                "slf4j-api.jar",
                path_to_slf4j,
                env_vars=("SLF4J", "STANFORD_SEGMENTER"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )
        else:
            slf4j = None

        # This is passed to java as the -cp option. Versions of the segmenter
        # older than stanford-segmenter-2016-10-31 need slf4j on the classpath;
        # newer versions do not.
        self._stanford_jar = os.pathsep.join(
            _ for _ in [stanford_segmenter, slf4j] if _ is not None
        )

        self._java_class = java_class
        self._model = path_to_model
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces
        self._dict = path_to_dict

        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ",".join(
            "{0}={1}".format(key, json.dumps(val)) for key, val in options.items()
        )

    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified
        language, using the STANFORD_SEGMENTER and STANFORD_MODELS environment
        variables.
        """
        search_path = ()
        if os.environ.get("STANFORD_SEGMENTER"):
            search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = "false"

        if lang == "ar":
            self._java_class = (
                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
            )
            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"

        elif lang == "zh":
            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
            model = "pku.gz"
            self._sihan_post_processing = "true"

            path_to_dict = "dict-chris6.ser.gz"
            try:
                self._dict = find_file(
                    path_to_dict,
                    searchpath=search_path,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_MODELS",),
                )
            except LookupError:
                raise LookupError(
                    "Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                    % path_to_dict
                )

            sihan_dir = "./data/"
            try:
                path_to_sihan_dir = find_dir(
                    sihan_dir,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_SEGMENTER",),
                )
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError:
                raise LookupError(
                    "Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
                )
        else:
            raise LookupError("Unsupported language {}".format(lang))

        try:
            self._model = find_file(
                model,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
            )
        except LookupError:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
            )

    def tokenize(self, s):
        super().tokenize(s)

    def segment_file(self, input_file_path):
        """
        Segment the contents of the text file at ``input_file_path`` and
        return the segmented output as a single string.
        """
        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        return stdout

    def segment(self, tokens):
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """
        Segment a list of sentences (each a sequence of tokens) and return
        the segmented output as a single string.
        """
        encoding = self._encoding
        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        # Write the actual sentences to the temporary input file
        _input_fh = os.fdopen(_input_fh, "wb")
        _input = "\n".join((" ".join(x) for x in sentences))
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            self._input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        return stdout

    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        cmd.extend(["-inputEncoding", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        stdout, _stderr = java(
            cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
        )
        stdout = stdout.decode(encoding)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout


def setup_module(module):
    from nose import SkipTest

    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        raise SkipTest(
            "Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e)
        )
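

# A minimal usage sketch, mirroring the doctest in the class docstring. It
# assumes the Stanford Segmenter distribution has been downloaded and unpacked,
# and that the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
# point at the unpacked directory and its model files.
if __name__ == "__main__":
    seg = StanfordSegmenter()
    # default_config() picks the Java class, model, and dictionaries for the
    # requested language ('zh' or 'ar') using the environment variables above.
    seg.default_config("zh")
    # segment() takes a single sentence; segment_sents() takes a list of
    # sentences. Both return the segmented text as one string.
    print(seg.segment(u"这是斯坦福中文分词器测试"))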