sinica_treebank.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. # Natural Language Toolkit: Sinica Treebank Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Sinica Treebank Corpus Sample
  9. http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
  10. 10,000 parsed sentences, drawn from the Academia Sinica Balanced
  11. Corpus of Modern Chinese. Parse tree notation is based on
  12. Information-based Case Grammar. Tagset documentation is available
  13. at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
  14. Language and Knowledge Processing Group, Institute of Information
  15. Science, Academia Sinica
  16. The data is distributed with the Natural Language Toolkit under the terms of
  17. the Creative Commons Attribution-NonCommercial-ShareAlike License
  18. [http://creativecommons.org/licenses/by-nc-sa/2.5/].
  19. References:
  20. Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
  21. The Construction of Sinica Treebank. Computational Linguistics and
  22. Chinese Language Processing, 4, pp 87-104.
  23. Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
  24. Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
  25. Annotation Guidelines, and On-line Interface. Proceedings of 2nd
  26. Chinese Language Processing Workshop, Association for Computational
  27. Linguistics.
  28. Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
  29. Extraction, Proceedings of IJCNLP-04, pp560-565.
  30. """
  31. from nltk.tree import sinica_parse
  32. from nltk.tag import map_tag
  33. from nltk.corpus.reader.util import *
  34. from nltk.corpus.reader.api import *
  35. IDENTIFIER = re.compile(r"^#\S+\s")
  36. APPENDIX = re.compile(r"(?<=\))#.*$")
  37. TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
  38. WORD = re.compile(r":[^:()|]+:([^:()|]+)")
  39. class SinicaTreebankCorpusReader(SyntaxCorpusReader):
  40. """
  41. Reader for the sinica treebank.
  42. """
  43. def _read_block(self, stream):
  44. sent = stream.readline()
  45. sent = IDENTIFIER.sub("", sent)
  46. sent = APPENDIX.sub("", sent)
  47. return [sent]
  48. def _parse(self, sent):
  49. return sinica_parse(sent)
  50. def _tag(self, sent, tagset=None):
  51. tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
  52. if tagset and tagset != self._tagset:
  53. tagged_sent = [
  54. (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent
  55. ]
  56. return tagged_sent
  57. def _word(self, sent):
  58. return WORD.findall(sent)