pl196x.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. # Natural Language Toolkit:
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from nltk.corpus.reader.api import *
  8. from nltk.corpus.reader.xmldocs import XMLCorpusReader
# Regular expressions used to pull TEI elements out of a block of corpus text.
# Each element pattern accepts an optional attribute list after the tag name
# and matches the element content non-greedily.

# Captures the content of a <p ...>...</p> (paragraph) element.
PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
# Captures the content of an <s ...>...</s> (sentence) element.
SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
# Two groups for a <w> or <c> element: (1) the opening tag text after "<",
# i.e. tag name, attributes and the closing ">"; (2) the element content.
TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
# Captures only the content of a <w> or <c> element.
WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
# Captures the value of a type="..." attribute.
TYPE = re.compile(r'type="(.*?)"')
# Captures the value of an ana="..." attribute.
ANA = re.compile(r'ana="(.*?)"')
# Captures the identifier in a `text id="..."` attribute.
TEXTID = re.compile(r'text id="(.*?)"')
  16. class TEICorpusView(StreamBackedCorpusView):
  17. def __init__(
  18. self,
  19. corpus_file,
  20. tagged,
  21. group_by_sent,
  22. group_by_para,
  23. tagset=None,
  24. head_len=0,
  25. textids=None,
  26. ):
  27. self._tagged = tagged
  28. self._textids = textids
  29. self._group_by_sent = group_by_sent
  30. self._group_by_para = group_by_para
  31. # WARNING -- skip header
  32. StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)
  33. _pagesize = 4096
  34. def read_block(self, stream):
  35. block = stream.readlines(self._pagesize)
  36. block = concat(block)
  37. while (block.count("<text id") > block.count("</text>")) or block.count(
  38. "<text id"
  39. ) == 0:
  40. tmp = stream.readline()
  41. if len(tmp) <= 0:
  42. break
  43. block += tmp
  44. block = block.replace("\n", "")
  45. textids = TEXTID.findall(block)
  46. if self._textids:
  47. for tid in textids:
  48. if tid not in self._textids:
  49. beg = block.find(tid) - 1
  50. end = block[beg:].find("</text>") + len("</text>")
  51. block = block[:beg] + block[beg + end :]
  52. output = []
  53. for para_str in PARA.findall(block):
  54. para = []
  55. for sent_str in SENT.findall(para_str):
  56. if not self._tagged:
  57. sent = WORD.findall(sent_str)
  58. else:
  59. sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
  60. if self._group_by_sent:
  61. para.append(sent)
  62. else:
  63. para.extend(sent)
  64. if self._group_by_para:
  65. output.append(para)
  66. else:
  67. output.extend(para)
  68. return output
  69. def _parse_tag(self, tag_word_tuple):
  70. (tag, word) = tag_word_tuple
  71. if tag.startswith("w"):
  72. tag = ANA.search(tag).group(1)
  73. else: # tag.startswith('c')
  74. tag = TYPE.search(tag).group(1)
  75. return word, tag
  76. class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
  77. head_len = 2770
  78. def __init__(self, *args, **kwargs):
  79. if "textid_file" in kwargs:
  80. self._textids = kwargs["textid_file"]
  81. else:
  82. self._textids = None
  83. XMLCorpusReader.__init__(self, *args)
  84. CategorizedCorpusReader.__init__(self, kwargs)
  85. self._init_textids()
  86. def _init_textids(self):
  87. self._f2t = defaultdict(list)
  88. self._t2f = defaultdict(list)
  89. if self._textids is not None:
  90. with open(self._textids) as fp:
  91. for line in fp:
  92. line = line.strip()
  93. file_id, text_ids = line.split(" ", 1)
  94. if file_id not in self.fileids():
  95. raise ValueError(
  96. "In text_id mapping file %s: %s not found"
  97. % (self._textids, file_id)
  98. )
  99. for text_id in text_ids.split(self._delimiter):
  100. self._add_textids(file_id, text_id)
  101. def _add_textids(self, file_id, text_id):
  102. self._f2t[file_id].append(text_id)
  103. self._t2f[text_id].append(file_id)
  104. def _resolve(self, fileids, categories, textids=None):
  105. tmp = None
  106. if (
  107. len(list(
  108. filter(
  109. lambda accessor: accessor is None, (fileids, categories, textids)
  110. )
  111. ))
  112. != 1
  113. ):
  114. raise ValueError(
  115. "Specify exactly one of: fileids, " "categories or textids"
  116. )
  117. if fileids is not None:
  118. return fileids, None
  119. if categories is not None:
  120. return self.fileids(categories), None
  121. if textids is not None:
  122. if isinstance(textids, str):
  123. textids = [textids]
  124. files = sum((self._t2f[t] for t in textids), [])
  125. tdict = dict()
  126. for f in files:
  127. tdict[f] = set(self._f2t[f]) & set(textids)
  128. return files, tdict
  129. def decode_tag(self, tag):
  130. # to be implemented
  131. return tag
  132. def textids(self, fileids=None, categories=None):
  133. """
  134. In the pl196x corpus each category is stored in single
  135. file and thus both methods provide identical functionality. In order
  136. to accommodate finer granularity, a non-standard textids() method was
  137. implemented. All the main functions can be supplied with a list
  138. of required chunks---giving much more control to the user.
  139. """
  140. fileids, _ = self._resolve(fileids, categories)
  141. if fileids is None:
  142. return sorted(self._t2f)
  143. if isinstance(fileids, str):
  144. fileids = [fileids]
  145. return sorted(sum((self._f2t[d] for d in fileids), []))
  146. def words(self, fileids=None, categories=None, textids=None):
  147. fileids, textids = self._resolve(fileids, categories, textids)
  148. if fileids is None:
  149. fileids = self._fileids
  150. elif isinstance(fileids, str):
  151. fileids = [fileids]
  152. if textids:
  153. return concat(
  154. [
  155. TEICorpusView(
  156. self.abspath(fileid),
  157. False,
  158. False,
  159. False,
  160. head_len=self.head_len,
  161. textids=textids[fileid],
  162. )
  163. for fileid in fileids
  164. ]
  165. )
  166. else:
  167. return concat(
  168. [
  169. TEICorpusView(
  170. self.abspath(fileid),
  171. False,
  172. False,
  173. False,
  174. head_len=self.head_len,
  175. )
  176. for fileid in fileids
  177. ]
  178. )
  179. def sents(self, fileids=None, categories=None, textids=None):
  180. fileids, textids = self._resolve(fileids, categories, textids)
  181. if fileids is None:
  182. fileids = self._fileids
  183. elif isinstance(fileids, str):
  184. fileids = [fileids]
  185. if textids:
  186. return concat(
  187. [
  188. TEICorpusView(
  189. self.abspath(fileid),
  190. False,
  191. True,
  192. False,
  193. head_len=self.head_len,
  194. textids=textids[fileid],
  195. )
  196. for fileid in fileids
  197. ]
  198. )
  199. else:
  200. return concat(
  201. [
  202. TEICorpusView(
  203. self.abspath(fileid), False, True, False, head_len=self.head_len
  204. )
  205. for fileid in fileids
  206. ]
  207. )
  208. def paras(self, fileids=None, categories=None, textids=None):
  209. fileids, textids = self._resolve(fileids, categories, textids)
  210. if fileids is None:
  211. fileids = self._fileids
  212. elif isinstance(fileids, str):
  213. fileids = [fileids]
  214. if textids:
  215. return concat(
  216. [
  217. TEICorpusView(
  218. self.abspath(fileid),
  219. False,
  220. True,
  221. True,
  222. head_len=self.head_len,
  223. textids=textids[fileid],
  224. )
  225. for fileid in fileids
  226. ]
  227. )
  228. else:
  229. return concat(
  230. [
  231. TEICorpusView(
  232. self.abspath(fileid), False, True, True, head_len=self.head_len
  233. )
  234. for fileid in fileids
  235. ]
  236. )
  237. def tagged_words(self, fileids=None, categories=None, textids=None):
  238. fileids, textids = self._resolve(fileids, categories, textids)
  239. if fileids is None:
  240. fileids = self._fileids
  241. elif isinstance(fileids, str):
  242. fileids = [fileids]
  243. if textids:
  244. return concat(
  245. [
  246. TEICorpusView(
  247. self.abspath(fileid),
  248. True,
  249. False,
  250. False,
  251. head_len=self.head_len,
  252. textids=textids[fileid],
  253. )
  254. for fileid in fileids
  255. ]
  256. )
  257. else:
  258. return concat(
  259. [
  260. TEICorpusView(
  261. self.abspath(fileid), True, False, False, head_len=self.head_len
  262. )
  263. for fileid in fileids
  264. ]
  265. )
  266. def tagged_sents(self, fileids=None, categories=None, textids=None):
  267. fileids, textids = self._resolve(fileids, categories, textids)
  268. if fileids is None:
  269. fileids = self._fileids
  270. elif isinstance(fileids, str):
  271. fileids = [fileids]
  272. if textids:
  273. return concat(
  274. [
  275. TEICorpusView(
  276. self.abspath(fileid),
  277. True,
  278. True,
  279. False,
  280. head_len=self.head_len,
  281. textids=textids[fileid],
  282. )
  283. for fileid in fileids
  284. ]
  285. )
  286. else:
  287. return concat(
  288. [
  289. TEICorpusView(
  290. self.abspath(fileid), True, True, False, head_len=self.head_len
  291. )
  292. for fileid in fileids
  293. ]
  294. )
  295. def tagged_paras(self, fileids=None, categories=None, textids=None):
  296. fileids, textids = self._resolve(fileids, categories, textids)
  297. if fileids is None:
  298. fileids = self._fileids
  299. elif isinstance(fileids, str):
  300. fileids = [fileids]
  301. if textids:
  302. return concat(
  303. [
  304. TEICorpusView(
  305. self.abspath(fileid),
  306. True,
  307. True,
  308. True,
  309. head_len=self.head_len,
  310. textids=textids[fileid],
  311. )
  312. for fileid in fileids
  313. ]
  314. )
  315. else:
  316. return concat(
  317. [
  318. TEICorpusView(
  319. self.abspath(fileid), True, True, True, head_len=self.head_len
  320. )
  321. for fileid in fileids
  322. ]
  323. )
  324. def xml(self, fileids=None, categories=None):
  325. fileids, _ = self._resolve(fileids, categories)
  326. if len(fileids) == 1:
  327. return XMLCorpusReader.xml(self, fileids[0])
  328. else:
  329. raise TypeError("Expected a single file")
  330. def raw(self, fileids=None, categories=None):
  331. fileids, _ = self._resolve(fileids, categories)
  332. if fileids is None:
  333. fileids = self._fileids
  334. elif isinstance(fileids, str):
  335. fileids = [fileids]
  336. return concat([self.open(f).read() for f in fileids])