# File: nkjp.py (15 KB)
#
# Natural Language Toolkit: NKJP Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Gabriela Kaczka
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  7. import functools
  8. import os
  9. import re
  10. import tempfile
  11. from nltk.corpus.reader.util import concat
  12. from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
  13. def _parse_args(fun):
  14. """
  15. Wraps function arguments:
  16. if fileids not specified then function set NKJPCorpusReader paths.
  17. """
  18. @functools.wraps(fun)
  19. def decorator(self, fileids=None, **kwargs):
  20. if not fileids:
  21. fileids = self._paths
  22. return fun(self, fileids, **kwargs)
  23. return decorator
  24. class NKJPCorpusReader(XMLCorpusReader):
  25. WORDS_MODE = 0
  26. SENTS_MODE = 1
  27. HEADER_MODE = 2
  28. RAW_MODE = 3
  29. def __init__(self, root, fileids=".*"):
  30. """
  31. Corpus reader designed to work with National Corpus of Polish.
  32. See http://nkjp.pl/ for more details about NKJP.
  33. use example:
  34. import nltk
  35. import nkjp
  36. from nkjp import NKJPCorpusReader
  37. x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
  38. x.header()
  39. x.raw()
  40. x.words()
  41. x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
  42. x.sents()
  43. x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
  44. x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
  45. x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
  46. """
  47. if isinstance(fileids, str):
  48. XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
  49. else:
  50. XMLCorpusReader.__init__(
  51. self, root, [fileid + "/header.xml" for fileid in fileids]
  52. )
  53. self._paths = self.get_paths()
  54. def get_paths(self):
  55. return [
  56. os.path.join(str(self._root), f.split("header.xml")[0])
  57. for f in self._fileids
  58. ]
  59. def fileids(self):
  60. """
  61. Returns a list of file identifiers for the fileids that make up
  62. this corpus.
  63. """
  64. return [f.split("header.xml")[0] for f in self._fileids]
  65. def _view(self, filename, tags=None, **kwargs):
  66. """
  67. Returns a view specialised for use with particular corpus file.
  68. """
  69. mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
  70. if mode is NKJPCorpusReader.WORDS_MODE:
  71. return NKJPCorpus_Morph_View(filename, tags=tags)
  72. elif mode is NKJPCorpusReader.SENTS_MODE:
  73. return NKJPCorpus_Segmentation_View(filename, tags=tags)
  74. elif mode is NKJPCorpusReader.HEADER_MODE:
  75. return NKJPCorpus_Header_View(filename, tags=tags)
  76. elif mode is NKJPCorpusReader.RAW_MODE:
  77. return NKJPCorpus_Text_View(
  78. filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
  79. )
  80. else:
  81. raise NameError("No such mode!")
  82. def add_root(self, fileid):
  83. """
  84. Add root if necessary to specified fileid.
  85. """
  86. if self.root in fileid:
  87. return fileid
  88. return self.root + fileid
  89. @_parse_args
  90. def header(self, fileids=None, **kwargs):
  91. """
  92. Returns header(s) of specified fileids.
  93. """
  94. return concat(
  95. [
  96. self._view(
  97. self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs
  98. ).handle_query()
  99. for fileid in fileids
  100. ]
  101. )
  102. @_parse_args
  103. def sents(self, fileids=None, **kwargs):
  104. """
  105. Returns sentences in specified fileids.
  106. """
  107. return concat(
  108. [
  109. self._view(
  110. self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs
  111. ).handle_query()
  112. for fileid in fileids
  113. ]
  114. )
  115. @_parse_args
  116. def words(self, fileids=None, **kwargs):
  117. """
  118. Returns words in specified fileids.
  119. """
  120. return concat(
  121. [
  122. self._view(
  123. self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs
  124. ).handle_query()
  125. for fileid in fileids
  126. ]
  127. )
  128. @_parse_args
  129. def tagged_words(self, fileids=None, **kwargs):
  130. """
  131. Call with specified tags as a list, e.g. tags=['subst', 'comp'].
  132. Returns tagged words in specified fileids.
  133. """
  134. tags = kwargs.pop("tags", [])
  135. return concat(
  136. [
  137. self._view(
  138. self.add_root(fileid),
  139. mode=NKJPCorpusReader.WORDS_MODE,
  140. tags=tags,
  141. **kwargs
  142. ).handle_query()
  143. for fileid in fileids
  144. ]
  145. )
  146. @_parse_args
  147. def raw(self, fileids=None, **kwargs):
  148. """
  149. Returns words in specified fileids.
  150. """
  151. return concat(
  152. [
  153. self._view(
  154. self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs
  155. ).handle_query()
  156. for fileid in fileids
  157. ]
  158. )
  159. class NKJPCorpus_Header_View(XMLCorpusView):
  160. def __init__(self, filename, **kwargs):
  161. """
  162. HEADER_MODE
  163. A stream backed corpus view specialized for use with
  164. header.xml files in NKJP corpus.
  165. """
  166. self.tagspec = ".*/sourceDesc$"
  167. XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)
  168. def handle_query(self):
  169. self._open()
  170. header = []
  171. while True:
  172. segm = XMLCorpusView.read_block(self, self._stream)
  173. if len(segm) == 0:
  174. break
  175. header.extend(segm)
  176. self.close()
  177. return header
  178. def handle_elt(self, elt, context):
  179. titles = elt.findall("bibl/title")
  180. title = []
  181. if titles:
  182. title = "\n".join(title.text.strip() for title in titles)
  183. authors = elt.findall("bibl/author")
  184. author = []
  185. if authors:
  186. author = "\n".join(author.text.strip() for author in authors)
  187. dates = elt.findall("bibl/date")
  188. date = []
  189. if dates:
  190. date = "\n".join(date.text.strip() for date in dates)
  191. publishers = elt.findall("bibl/publisher")
  192. publisher = []
  193. if publishers:
  194. publisher = "\n".join(publisher.text.strip() for publisher in publishers)
  195. idnos = elt.findall("bibl/idno")
  196. idno = []
  197. if idnos:
  198. idno = "\n".join(idno.text.strip() for idno in idnos)
  199. notes = elt.findall("bibl/note")
  200. note = []
  201. if notes:
  202. note = "\n".join(note.text.strip() for note in notes)
  203. return {
  204. "title": title,
  205. "author": author,
  206. "date": date,
  207. "publisher": publisher,
  208. "idno": idno,
  209. "note": note,
  210. }
  211. class XML_Tool:
  212. """
  213. Helper class creating xml file to one without references to nkjp: namespace.
  214. That's needed because the XMLCorpusView assumes that one can find short substrings
  215. of XML that are valid XML, which is not true if a namespace is declared at top level
  216. """
  217. def __init__(self, root, filename):
  218. self.read_file = os.path.join(root, filename)
  219. self.write_file = tempfile.NamedTemporaryFile(delete=False)
  220. def build_preprocessed_file(self):
  221. try:
  222. fr = open(self.read_file, "r")
  223. fw = self.write_file
  224. line = " "
  225. while len(line):
  226. line = fr.readline()
  227. x = re.split(r"nkjp:[^ ]* ", line) # in all files
  228. ret = " ".join(x)
  229. x = re.split("<nkjp:paren>", ret) # in ann_segmentation.xml
  230. ret = " ".join(x)
  231. x = re.split("</nkjp:paren>", ret) # in ann_segmentation.xml
  232. ret = " ".join(x)
  233. x = re.split("<choice>", ret) # in ann_segmentation.xml
  234. ret = " ".join(x)
  235. x = re.split("</choice>", ret) # in ann_segmentation.xml
  236. ret = " ".join(x)
  237. fw.write(ret)
  238. fr.close()
  239. fw.close()
  240. return self.write_file.name
  241. except Exception:
  242. self.remove_preprocessed_file()
  243. raise Exception
  244. def remove_preprocessed_file(self):
  245. os.remove(self.write_file.name)
  246. class NKJPCorpus_Segmentation_View(XMLCorpusView):
  247. """
  248. A stream backed corpus view specialized for use with
  249. ann_segmentation.xml files in NKJP corpus.
  250. """
  251. def __init__(self, filename, **kwargs):
  252. self.tagspec = ".*p/.*s"
  253. # intersperse NKJPCorpus_Text_View
  254. self.text_view = NKJPCorpus_Text_View(
  255. filename, mode=NKJPCorpus_Text_View.SENTS_MODE
  256. )
  257. self.text_view.handle_query()
  258. # xml preprocessing
  259. self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
  260. # base class init
  261. XMLCorpusView.__init__(
  262. self, self.xml_tool.build_preprocessed_file(), self.tagspec
  263. )
  264. def get_segm_id(self, example_word):
  265. return example_word.split("(")[1].split(",")[0]
  266. def get_sent_beg(self, beg_word):
  267. # returns index of beginning letter in sentence
  268. return int(beg_word.split(",")[1])
  269. def get_sent_end(self, end_word):
  270. # returns index of end letter in sentence
  271. splitted = end_word.split(")")[0].split(",")
  272. return int(splitted[1]) + int(splitted[2])
  273. def get_sentences(self, sent_segm):
  274. # returns one sentence
  275. id = self.get_segm_id(sent_segm[0])
  276. segm = self.text_view.segm_dict[id] # text segment
  277. beg = self.get_sent_beg(sent_segm[0])
  278. end = self.get_sent_end(sent_segm[len(sent_segm) - 1])
  279. return segm[beg:end]
  280. def remove_choice(self, segm):
  281. ret = []
  282. prev_txt_end = -1
  283. prev_txt_nr = -1
  284. for word in segm:
  285. txt_nr = self.get_segm_id(word)
  286. # get increasing sequence of ids: in case of choice get first possibility
  287. if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
  288. ret.append(word)
  289. prev_txt_end = self.get_sent_end(word)
  290. prev_txt_nr = txt_nr
  291. return ret
  292. def handle_query(self):
  293. try:
  294. self._open()
  295. sentences = []
  296. while True:
  297. sent_segm = XMLCorpusView.read_block(self, self._stream)
  298. if len(sent_segm) == 0:
  299. break
  300. for segm in sent_segm:
  301. segm = self.remove_choice(segm)
  302. sentences.append(self.get_sentences(segm))
  303. self.close()
  304. self.xml_tool.remove_preprocessed_file()
  305. return sentences
  306. except Exception:
  307. self.xml_tool.remove_preprocessed_file()
  308. raise Exception
  309. def handle_elt(self, elt, context):
  310. ret = []
  311. for seg in elt:
  312. ret.append(seg.get("corresp"))
  313. return ret
  314. class NKJPCorpus_Text_View(XMLCorpusView):
  315. """
  316. A stream backed corpus view specialized for use with
  317. text.xml files in NKJP corpus.
  318. """
  319. SENTS_MODE = 0
  320. RAW_MODE = 1
  321. def __init__(self, filename, **kwargs):
  322. self.mode = kwargs.pop("mode", 0)
  323. self.tagspec = ".*/div/ab"
  324. self.segm_dict = dict()
  325. # xml preprocessing
  326. self.xml_tool = XML_Tool(filename, "text.xml")
  327. # base class init
  328. XMLCorpusView.__init__(
  329. self, self.xml_tool.build_preprocessed_file(), self.tagspec
  330. )
  331. def handle_query(self):
  332. try:
  333. self._open()
  334. x = self.read_block(self._stream)
  335. self.close()
  336. self.xml_tool.remove_preprocessed_file()
  337. return x
  338. except Exception:
  339. self.xml_tool.remove_preprocessed_file()
  340. raise Exception
  341. def read_block(self, stream, tagspec=None, elt_handler=None):
  342. """
  343. Returns text as a list of sentences.
  344. """
  345. txt = []
  346. while True:
  347. segm = XMLCorpusView.read_block(self, stream)
  348. if len(segm) == 0:
  349. break
  350. for part in segm:
  351. txt.append(part)
  352. return [" ".join([segm for segm in txt])]
  353. def get_segm_id(self, elt):
  354. for attr in elt.attrib:
  355. if attr.endswith("id"):
  356. return elt.get(attr)
  357. def handle_elt(self, elt, context):
  358. # fill dictionary to use later in sents mode
  359. if self.mode is NKJPCorpus_Text_View.SENTS_MODE:
  360. self.segm_dict[self.get_segm_id(elt)] = elt.text
  361. return elt.text
  362. class NKJPCorpus_Morph_View(XMLCorpusView):
  363. """
  364. A stream backed corpus view specialized for use with
  365. ann_morphosyntax.xml files in NKJP corpus.
  366. """
  367. def __init__(self, filename, **kwargs):
  368. self.tags = kwargs.pop("tags", None)
  369. self.tagspec = ".*/seg/fs"
  370. self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
  371. XMLCorpusView.__init__(
  372. self, self.xml_tool.build_preprocessed_file(), self.tagspec
  373. )
  374. def handle_query(self):
  375. try:
  376. self._open()
  377. words = []
  378. while True:
  379. segm = XMLCorpusView.read_block(self, self._stream)
  380. if len(segm) == 0:
  381. break
  382. for part in segm:
  383. if part is not None:
  384. words.append(part)
  385. self.close()
  386. self.xml_tool.remove_preprocessed_file()
  387. return words
  388. except Exception:
  389. self.xml_tool.remove_preprocessed_file()
  390. raise Exception
  391. def handle_elt(self, elt, context):
  392. word = ""
  393. flag = False
  394. is_not_interp = True
  395. # if tags not specified, then always return word
  396. if self.tags is None:
  397. flag = True
  398. for child in elt:
  399. # get word
  400. if "name" in child.keys() and child.attrib["name"] == "orth":
  401. for symbol in child:
  402. if symbol.tag == "string":
  403. word = symbol.text
  404. elif "name" in child.keys() and child.attrib["name"] == "interps":
  405. for symbol in child:
  406. if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
  407. for symbol2 in symbol:
  408. if (
  409. "name" in symbol2.keys()
  410. and symbol2.attrib["name"] == "ctag"
  411. ):
  412. for symbol3 in symbol2:
  413. if (
  414. "value" in symbol3.keys()
  415. and self.tags is not None
  416. and symbol3.attrib["value"] in self.tags
  417. ):
  418. flag = True
  419. elif (
  420. "value" in symbol3.keys()
  421. and symbol3.attrib["value"] == "interp"
  422. ):
  423. is_not_interp = False
  424. if flag and is_not_interp:
  425. return word