toolbox.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. # coding: utf-8
  2. # Natural Language Toolkit: Toolbox Reader
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Greg Aumann <greg_aumann@sil.org>
  6. # URL: <http://nltk.org>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Module for reading, writing and manipulating
  10. Toolbox databases and settings files.
  11. """
  12. import re, codecs
  13. from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement
  14. from io import StringIO
  15. from nltk.data import PathPointer, find
  16. class StandardFormat(object):
  17. """
  18. Class for reading and processing standard format marker files and strings.
  19. """
  20. def __init__(self, filename=None, encoding=None):
  21. self._encoding = encoding
  22. if filename is not None:
  23. self.open(filename)
  24. def open(self, sfm_file):
  25. """
  26. Open a standard format marker file for sequential reading.
  27. :param sfm_file: name of the standard format marker input file
  28. :type sfm_file: str
  29. """
  30. if isinstance(sfm_file, PathPointer):
  31. # [xx] We don't use 'rU' mode here -- do we need to?
  32. # (PathPointer.open doesn't take a mode option)
  33. self._file = sfm_file.open(self._encoding)
  34. else:
  35. self._file = codecs.open(sfm_file, "rU", self._encoding)
  36. def open_string(self, s):
  37. """
  38. Open a standard format marker string for sequential reading.
  39. :param s: string to parse as a standard format marker input file
  40. :type s: str
  41. """
  42. self._file = StringIO(s)
  43. def raw_fields(self):
  44. """
  45. Return an iterator that returns the next field in a (marker, value)
  46. tuple. Linebreaks and trailing white space are preserved except
  47. for the final newline in each field.
  48. :rtype: iter(tuple(str, str))
  49. """
  50. join_string = "\n"
  51. line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$"
  52. # discard a BOM in the first line
  53. first_line_pat = re.compile(line_regexp % "(?:\xef\xbb\xbf)?")
  54. line_pat = re.compile(line_regexp % "")
  55. # need to get first line outside the loop for correct handling
  56. # of the first marker if it spans multiple lines
  57. file_iter = iter(self._file)
  58. # PEP 479, prevent RuntimeError when StopIteration is raised inside generator
  59. try:
  60. line = next(file_iter)
  61. except StopIteration:
  62. # no more data is available, terminate the generator
  63. return
  64. mobj = re.match(first_line_pat, line)
  65. mkr, line_value = mobj.groups()
  66. value_lines = [line_value]
  67. self.line_num = 0
  68. for line in file_iter:
  69. self.line_num += 1
  70. mobj = re.match(line_pat, line)
  71. line_mkr, line_value = mobj.groups()
  72. if line_mkr:
  73. yield (mkr, join_string.join(value_lines))
  74. mkr = line_mkr
  75. value_lines = [line_value]
  76. else:
  77. value_lines.append(line_value)
  78. self.line_num += 1
  79. yield (mkr, join_string.join(value_lines))
  80. def fields(
  81. self,
  82. strip=True,
  83. unwrap=True,
  84. encoding=None,
  85. errors="strict",
  86. unicode_fields=None,
  87. ):
  88. """
  89. Return an iterator that returns the next field in a ``(marker, value)``
  90. tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
  91. was specified in the ``fields()`` method. Otherwise they are non-unicode strings.
  92. :param strip: strip trailing whitespace from the last line of each field
  93. :type strip: bool
  94. :param unwrap: Convert newlines in a field to spaces.
  95. :type unwrap: bool
  96. :param encoding: Name of an encoding to use. If it is specified then
  97. the ``fields()`` method returns unicode strings rather than non
  98. unicode strings.
  99. :type encoding: str or None
  100. :param errors: Error handling scheme for codec. Same as the ``decode()``
  101. builtin string method.
  102. :type errors: str
  103. :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
  104. Ignored if encoding is None. If the whole file is UTF-8 encoded set
  105. ``encoding='utf8'`` and leave ``unicode_fields`` with its default
  106. value of None.
  107. :type unicode_fields: sequence
  108. :rtype: iter(tuple(str, str))
  109. """
  110. if encoding is None and unicode_fields is not None:
  111. raise ValueError("unicode_fields is set but not encoding.")
  112. unwrap_pat = re.compile(r"\n+")
  113. for mkr, val in self.raw_fields():
  114. if unwrap:
  115. val = unwrap_pat.sub(" ", val)
  116. if strip:
  117. val = val.rstrip()
  118. yield (mkr, val)
  119. def close(self):
  120. """Close a previously opened standard format marker file or string."""
  121. self._file.close()
  122. try:
  123. del self.line_num
  124. except AttributeError:
  125. pass
  126. class ToolboxData(StandardFormat):
  127. def parse(self, grammar=None, **kwargs):
  128. if grammar:
  129. return self._chunk_parse(grammar=grammar, **kwargs)
  130. else:
  131. return self._record_parse(**kwargs)
  132. def _record_parse(self, key=None, **kwargs):
  133. """
  134. Returns an element tree structure corresponding to a toolbox data file with
  135. all markers at the same level.
  136. Thus the following Toolbox database::
  137. \_sh v3.0 400 Rotokas Dictionary
  138. \_DateStampHasFourDigitYear
  139. \lx kaa
  140. \ps V.A
  141. \ge gag
  142. \gp nek i pas
  143. \lx kaa
  144. \ps V.B
  145. \ge strangle
  146. \gp pasim nek
  147. after parsing will end up with the same structure (ignoring the extra
  148. whitespace) as the following XML fragment after being parsed by
  149. ElementTree::
  150. <toolbox_data>
  151. <header>
  152. <_sh>v3.0 400 Rotokas Dictionary</_sh>
  153. <_DateStampHasFourDigitYear/>
  154. </header>
  155. <record>
  156. <lx>kaa</lx>
  157. <ps>V.A</ps>
  158. <ge>gag</ge>
  159. <gp>nek i pas</gp>
  160. </record>
  161. <record>
  162. <lx>kaa</lx>
  163. <ps>V.B</ps>
  164. <ge>strangle</ge>
  165. <gp>pasim nek</gp>
  166. </record>
  167. </toolbox_data>
  168. :param key: Name of key marker at the start of each record. If set to
  169. None (the default value) the first marker that doesn't begin with
  170. an underscore is assumed to be the key.
  171. :type key: str
  172. :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
  173. :type kwargs: dict
  174. :rtype: ElementTree._ElementInterface
  175. :return: contents of toolbox data divided into header and records
  176. """
  177. builder = TreeBuilder()
  178. builder.start("toolbox_data", {})
  179. builder.start("header", {})
  180. in_records = False
  181. for mkr, value in self.fields(**kwargs):
  182. if key is None and not in_records and mkr[0] != "_":
  183. key = mkr
  184. if mkr == key:
  185. if in_records:
  186. builder.end("record")
  187. else:
  188. builder.end("header")
  189. in_records = True
  190. builder.start("record", {})
  191. builder.start(mkr, {})
  192. builder.data(value)
  193. builder.end(mkr)
  194. if in_records:
  195. builder.end("record")
  196. else:
  197. builder.end("header")
  198. builder.end("toolbox_data")
  199. return builder.close()
  200. def _tree2etree(self, parent):
  201. from nltk.tree import Tree
  202. root = Element(parent.label())
  203. for child in parent:
  204. if isinstance(child, Tree):
  205. root.append(self._tree2etree(child))
  206. else:
  207. text, tag = child
  208. e = SubElement(root, tag)
  209. e.text = text
  210. return root
  211. def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs):
  212. """
  213. Returns an element tree structure corresponding to a toolbox data file
  214. parsed according to the chunk grammar.
  215. :type grammar: str
  216. :param grammar: Contains the chunking rules used to parse the
  217. database. See ``chunk.RegExp`` for documentation.
  218. :type root_label: str
  219. :param root_label: The node value that should be used for the
  220. top node of the chunk structure.
  221. :type trace: int
  222. :param trace: The level of tracing that should be used when
  223. parsing a text. ``0`` will generate no tracing output;
  224. ``1`` will generate normal tracing output; and ``2`` or
  225. higher will generate verbose tracing output.
  226. :type kwargs: dict
  227. :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()``
  228. :rtype: ElementTree._ElementInterface
  229. """
  230. from nltk import chunk
  231. from nltk.tree import Tree
  232. cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
  233. db = self.parse(**kwargs)
  234. tb_etree = Element("toolbox_data")
  235. header = db.find("header")
  236. tb_etree.append(header)
  237. for record in db.findall("record"):
  238. parsed = cp.parse([(elem.text, elem.tag) for elem in record])
  239. tb_etree.append(self._tree2etree(parsed))
  240. return tb_etree
  241. _is_value = re.compile(r"\S")
  242. def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
  243. """
  244. Return a string with a standard format representation of the toolbox
  245. data in tree (tree can be a toolbox database or a single record).
  246. :param tree: flat representation of toolbox data (whole database or single record)
  247. :type tree: ElementTree._ElementInterface
  248. :param encoding: Name of an encoding to use.
  249. :type encoding: str
  250. :param errors: Error handling scheme for codec. Same as the ``encode()``
  251. builtin string method.
  252. :type errors: str
  253. :param unicode_fields:
  254. :type unicode_fields: dict(str) or set(str)
  255. :rtype: str
  256. """
  257. if tree.tag == "record":
  258. root = Element("toolbox_data")
  259. root.append(tree)
  260. tree = root
  261. if tree.tag != "toolbox_data":
  262. raise ValueError("not a toolbox_data element structure")
  263. if encoding is None and unicode_fields is not None:
  264. raise ValueError(
  265. "if encoding is not specified then neither should unicode_fields"
  266. )
  267. l = []
  268. for rec in tree:
  269. l.append("\n")
  270. for field in rec:
  271. mkr = field.tag
  272. value = field.text
  273. if encoding is not None:
  274. if unicode_fields is not None and mkr in unicode_fields:
  275. cur_encoding = "utf8"
  276. else:
  277. cur_encoding = encoding
  278. if re.search(_is_value, value):
  279. l.append(
  280. ("\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors)
  281. )
  282. else:
  283. l.append(
  284. ("\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors)
  285. )
  286. else:
  287. if re.search(_is_value, value):
  288. l.append("\\%s %s\n" % (mkr, value))
  289. else:
  290. l.append("\\%s%s\n" % (mkr, value))
  291. return "".join(l[1:])
  292. class ToolboxSettings(StandardFormat):
  293. """This class is the base class for settings files."""
  294. def __init__(self):
  295. super(ToolboxSettings, self).__init__()
  296. def parse(self, encoding=None, errors="strict", **kwargs):
  297. """
  298. Return the contents of toolbox settings file with a nested structure.
  299. :param encoding: encoding used by settings file
  300. :type encoding: str
  301. :param errors: Error handling scheme for codec. Same as ``decode()`` builtin method.
  302. :type errors: str
  303. :param kwargs: Keyword arguments passed to ``StandardFormat.fields()``
  304. :type kwargs: dict
  305. :rtype: ElementTree._ElementInterface
  306. """
  307. builder = TreeBuilder()
  308. for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs):
  309. # Check whether the first char of the field marker
  310. # indicates a block start (+) or end (-)
  311. block = mkr[0]
  312. if block in ("+", "-"):
  313. mkr = mkr[1:]
  314. else:
  315. block = None
  316. # Build tree on the basis of block char
  317. if block == "+":
  318. builder.start(mkr, {})
  319. builder.data(value)
  320. elif block == "-":
  321. builder.end(mkr)
  322. else:
  323. builder.start(mkr, {})
  324. builder.data(value)
  325. builder.end(mkr)
  326. return builder.close()
  327. def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None):
  328. # write XML to file
  329. l = list()
  330. _to_settings_string(
  331. tree.getroot(),
  332. l,
  333. encoding=encoding,
  334. errors=errors,
  335. unicode_fields=unicode_fields,
  336. )
  337. return "".join(l)
  338. def _to_settings_string(node, l, **kwargs):
  339. # write XML to file
  340. tag = node.tag
  341. text = node.text
  342. if len(node) == 0:
  343. if text:
  344. l.append("\\%s %s\n" % (tag, text))
  345. else:
  346. l.append("\\%s\n" % tag)
  347. else:
  348. if text:
  349. l.append("\\+%s %s\n" % (tag, text))
  350. else:
  351. l.append("\\+%s\n" % tag)
  352. for n in node:
  353. _to_settings_string(n, l, **kwargs)
  354. l.append("\\-%s\n" % tag)
  355. return
  356. def remove_blanks(elem):
  357. """
  358. Remove all elements and subelements with no text and no child elements.
  359. :param elem: toolbox data in an elementtree structure
  360. :type elem: ElementTree._ElementInterface
  361. """
  362. out = list()
  363. for child in elem:
  364. remove_blanks(child)
  365. if child.text or len(child) > 0:
  366. out.append(child)
  367. elem[:] = out
  368. def add_default_fields(elem, default_fields):
  369. """
  370. Add blank elements and subelements specified in default_fields.
  371. :param elem: toolbox data in an elementtree structure
  372. :type elem: ElementTree._ElementInterface
  373. :param default_fields: fields to add to each type of element and subelement
  374. :type default_fields: dict(tuple)
  375. """
  376. for field in default_fields.get(elem.tag, []):
  377. if elem.find(field) is None:
  378. SubElement(elem, field)
  379. for child in elem:
  380. add_default_fields(child, default_fields)
  381. def sort_fields(elem, field_orders):
  382. """
  383. Sort the elements and subelements in order specified in field_orders.
  384. :param elem: toolbox data in an elementtree structure
  385. :type elem: ElementTree._ElementInterface
  386. :param field_orders: order of fields for each type of element and subelement
  387. :type field_orders: dict(tuple)
  388. """
  389. order_dicts = dict()
  390. for field, order in field_orders.items():
  391. order_dicts[field] = order_key = dict()
  392. for i, subfield in enumerate(order):
  393. order_key[subfield] = i
  394. _sort_fields(elem, order_dicts)
  395. def _sort_fields(elem, orders_dicts):
  396. """sort the children of elem"""
  397. try:
  398. order = orders_dicts[elem.tag]
  399. except KeyError:
  400. pass
  401. else:
  402. tmp = sorted(
  403. [((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem)]
  404. )
  405. elem[:] = [child for key, child in tmp]
  406. for child in elem:
  407. if len(child):
  408. _sort_fields(child, orders_dicts)
  409. def add_blank_lines(tree, blanks_before, blanks_between):
  410. """
  411. Add blank lines before all elements and subelements specified in blank_before.
  412. :param elem: toolbox data in an elementtree structure
  413. :type elem: ElementTree._ElementInterface
  414. :param blank_before: elements and subelements to add blank lines before
  415. :type blank_before: dict(tuple)
  416. """
  417. try:
  418. before = blanks_before[tree.tag]
  419. between = blanks_between[tree.tag]
  420. except KeyError:
  421. for elem in tree:
  422. if len(elem):
  423. add_blank_lines(elem, blanks_before, blanks_between)
  424. else:
  425. last_elem = None
  426. for elem in tree:
  427. tag = elem.tag
  428. if last_elem is not None and last_elem.tag != tag:
  429. if tag in before and last_elem is not None:
  430. e = last_elem.getiterator()[-1]
  431. e.text = (e.text or "") + "\n"
  432. else:
  433. if tag in between:
  434. e = last_elem.getiterator()[-1]
  435. e.text = (e.text or "") + "\n"
  436. if len(elem):
  437. add_blank_lines(elem, blanks_before, blanks_between)
  438. last_elem = elem
  439. def demo():
  440. from itertools import islice
  441. # zip_path = find('corpora/toolbox.zip')
  442. # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
  443. file_path = find("corpora/toolbox/rotokas.dic")
  444. lexicon = ToolboxData(file_path).parse()
  445. print("first field in fourth record:")
  446. print(lexicon[3][0].tag)
  447. print(lexicon[3][0].text)
  448. print("\nfields in sequential order:")
  449. for field in islice(lexicon.find("record"), 10):
  450. print(field.tag, field.text)
  451. print("\nlx fields:")
  452. for field in islice(lexicon.findall("record/lx"), 10):
  453. print(field.text)
  454. settings = ToolboxSettings()
  455. file_path = find("corpora/toolbox/MDF/MDF_AltH.typ")
  456. settings.open(file_path)
  457. # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
  458. tree = settings.parse(unwrap=False, encoding="cp1252")
  459. print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text)
  460. settings_tree = ElementTree(tree)
  461. print(to_settings_string(settings_tree).encode("utf8"))
  462. if __name__ == "__main__":
  463. demo()