indian.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Indian Language POS-Tagged Corpus
Collected by A Kumaran, Microsoft Research, India
Distributed with permission

Contents:
  - Bangla: IIT Kharagpur
  - Hindi: Microsoft Research India
  - Marathi: IIT Bombay
  - Telugu: IIIT Hyderabad
"""
  18. from nltk.tag import str2tuple, map_tag
  19. from nltk.corpus.reader.util import *
  20. from nltk.corpus.reader.api import *
  21. class IndianCorpusReader(CorpusReader):
  22. """
  23. List of words, one per line. Blank lines are ignored.
  24. """
  25. def words(self, fileids=None):
  26. return concat(
  27. [
  28. IndianCorpusView(fileid, enc, False, False)
  29. for (fileid, enc) in self.abspaths(fileids, True)
  30. ]
  31. )
  32. def tagged_words(self, fileids=None, tagset=None):
  33. if tagset and tagset != self._tagset:
  34. tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
  35. else:
  36. tag_mapping_function = None
  37. return concat(
  38. [
  39. IndianCorpusView(fileid, enc, True, False, tag_mapping_function)
  40. for (fileid, enc) in self.abspaths(fileids, True)
  41. ]
  42. )
  43. def sents(self, fileids=None):
  44. return concat(
  45. [
  46. IndianCorpusView(fileid, enc, False, True)
  47. for (fileid, enc) in self.abspaths(fileids, True)
  48. ]
  49. )
  50. def tagged_sents(self, fileids=None, tagset=None):
  51. if tagset and tagset != self._tagset:
  52. tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
  53. else:
  54. tag_mapping_function = None
  55. return concat(
  56. [
  57. IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
  58. for (fileid, enc) in self.abspaths(fileids, True)
  59. ]
  60. )
  61. def raw(self, fileids=None):
  62. if fileids is None:
  63. fileids = self._fileids
  64. elif isinstance(fileids, str):
  65. fileids = [fileids]
  66. return concat([self.open(f).read() for f in fileids])
  67. class IndianCorpusView(StreamBackedCorpusView):
  68. def __init__(
  69. self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
  70. ):
  71. self._tagged = tagged
  72. self._group_by_sent = group_by_sent
  73. self._tag_mapping_function = tag_mapping_function
  74. StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
  75. def read_block(self, stream):
  76. line = stream.readline()
  77. if line.startswith("<"):
  78. return []
  79. sent = [str2tuple(word, sep="_") for word in line.split()]
  80. if self._tag_mapping_function:
  81. sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
  82. if not self._tagged:
  83. sent = [w for (w, t) in sent]
  84. if self._group_by_sent:
  85. return [sent]
  86. else:
  87. return sent