xmldocs.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. # Natural Language Toolkit: XML Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Corpus reader for corpora whose documents are xml files.
  9. (note -- not named 'xml' to avoid conflicting w/ standard xml package)
  10. """
  11. import codecs
  12. from xml.etree import ElementTree
  13. from nltk.data import SeekableUnicodeStreamReader
  14. from nltk.tokenize import WordPunctTokenizer
  15. from nltk.internals import ElementWrapper
  16. from nltk.corpus.reader.api import CorpusReader
  17. from nltk.corpus.reader.util import *
  18. class XMLCorpusReader(CorpusReader):
  19. """
  20. Corpus reader for corpora whose documents are xml files.
  21. Note that the ``XMLCorpusReader`` constructor does not take an
  22. ``encoding`` argument, because the unicode encoding is specified by
  23. the XML files themselves. See the XML specs for more info.
  24. """
  25. def __init__(self, root, fileids, wrap_etree=False):
  26. self._wrap_etree = wrap_etree
  27. CorpusReader.__init__(self, root, fileids)
  28. def xml(self, fileid=None):
  29. # Make sure we have exactly one file -- no concatenating XML.
  30. if fileid is None and len(self._fileids) == 1:
  31. fileid = self._fileids[0]
  32. if not isinstance(fileid, str):
  33. raise TypeError("Expected a single file identifier string")
  34. # Read the XML in using ElementTree.
  35. elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
  36. # If requested, wrap it.
  37. if self._wrap_etree:
  38. elt = ElementWrapper(elt)
  39. # Return the ElementTree element.
  40. return elt
  41. def words(self, fileid=None):
  42. """
  43. Returns all of the words and punctuation symbols in the specified file
  44. that were in text nodes -- ie, tags are ignored. Like the xml() method,
  45. fileid can only specify one file.
  46. :return: the given file's text nodes as a list of words and punctuation symbols
  47. :rtype: list(str)
  48. """
  49. elt = self.xml(fileid)
  50. encoding = self.encoding(fileid)
  51. word_tokenizer = WordPunctTokenizer()
  52. iterator = elt.getiterator()
  53. out = []
  54. for node in iterator:
  55. text = node.text
  56. if text is not None:
  57. if isinstance(text, bytes):
  58. text = text.decode(encoding)
  59. toks = word_tokenizer.tokenize(text)
  60. out.extend(toks)
  61. return out
  62. def raw(self, fileids=None):
  63. if fileids is None:
  64. fileids = self._fileids
  65. elif isinstance(fileids, str):
  66. fileids = [fileids]
  67. return concat([self.open(f).read() for f in fileids])
  68. class XMLCorpusView(StreamBackedCorpusView):
  69. """
  70. A corpus view that selects out specified elements from an XML
  71. file, and provides a flat list-like interface for accessing them.
  72. (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
  73. but may be used by subclasses of ``XMLCorpusReader``.)
  74. Every XML corpus view has a "tag specification", indicating what
  75. XML elements should be included in the view; and each (non-nested)
  76. element that matches this specification corresponds to one item in
  77. the view. Tag specifications are regular expressions over tag
  78. paths, where a tag path is a list of element tag names, separated
  79. by '/', indicating the ancestry of the element. Some examples:
  80. - ``'foo'``: A top-level element whose tag is ``foo``.
  81. - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
  82. is a top-level element whose tag is ``foo``.
  83. - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
  84. in the xml tree.
  85. - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
  86. appearing anywhere in the xml tree.
  87. The view items are generated from the selected XML elements via
  88. the method ``handle_elt()``. By default, this method returns the
  89. element as-is (i.e., as an ElementTree object); but it can be
  90. overridden, either via subclassing or via the ``elt_handler``
  91. constructor parameter.
  92. """
  93. #: If true, then display debugging output to stdout when reading
  94. #: blocks.
  95. _DEBUG = False
  96. #: The number of characters read at a time by this corpus reader.
  97. _BLOCK_SIZE = 1024
  98. def __init__(self, fileid, tagspec, elt_handler=None):
  99. """
  100. Create a new corpus view based on a specified XML file.
  101. Note that the ``XMLCorpusView`` constructor does not take an
  102. ``encoding`` argument, because the unicode encoding is
  103. specified by the XML files themselves.
  104. :type tagspec: str
  105. :param tagspec: A tag specification, indicating what XML
  106. elements should be included in the view. Each non-nested
  107. element that matches this specification corresponds to one
  108. item in the view.
  109. :param elt_handler: A function used to transform each element
  110. to a value for the view. If no handler is specified, then
  111. ``self.handle_elt()`` is called, which returns the element
  112. as an ElementTree object. The signature of elt_handler is::
  113. elt_handler(elt, tagspec) -> value
  114. """
  115. if elt_handler:
  116. self.handle_elt = elt_handler
  117. self._tagspec = re.compile(tagspec + r"\Z")
  118. """The tag specification for this corpus view."""
  119. self._tag_context = {0: ()}
  120. """A dictionary mapping from file positions (as returned by
  121. ``stream.seek()`` to XML contexts. An XML context is a
  122. tuple of XML tag names, indicating which tags have not yet
  123. been closed."""
  124. encoding = self._detect_encoding(fileid)
  125. StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
  126. def _detect_encoding(self, fileid):
  127. if isinstance(fileid, PathPointer):
  128. try:
  129. infile = fileid.open()
  130. s = infile.readline()
  131. finally:
  132. infile.close()
  133. else:
  134. with open(fileid, "rb") as infile:
  135. s = infile.readline()
  136. if s.startswith(codecs.BOM_UTF16_BE):
  137. return "utf-16-be"
  138. if s.startswith(codecs.BOM_UTF16_LE):
  139. return "utf-16-le"
  140. if s.startswith(codecs.BOM_UTF32_BE):
  141. return "utf-32-be"
  142. if s.startswith(codecs.BOM_UTF32_LE):
  143. return "utf-32-le"
  144. if s.startswith(codecs.BOM_UTF8):
  145. return "utf-8"
  146. m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
  147. if m:
  148. return m.group(1).decode()
  149. m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s)
  150. if m:
  151. return m.group(1).decode()
  152. # No encoding found -- what should the default be?
  153. return "utf-8"
  154. def handle_elt(self, elt, context):
  155. """
  156. Convert an element into an appropriate value for inclusion in
  157. the view. Unless overridden by a subclass or by the
  158. ``elt_handler`` constructor argument, this method simply
  159. returns ``elt``.
  160. :return: The view value corresponding to ``elt``.
  161. :type elt: ElementTree
  162. :param elt: The element that should be converted.
  163. :type context: str
  164. :param context: A string composed of element tags separated by
  165. forward slashes, indicating the XML context of the given
  166. element. For example, the string ``'foo/bar/baz'``
  167. indicates that the element is a ``baz`` element whose
  168. parent is a ``bar`` element and whose grandparent is a
  169. top-level ``foo`` element.
  170. """
  171. return elt
  172. #: A regular expression that matches XML fragments that do not
  173. #: contain any un-closed tags.
  174. _VALID_XML_RE = re.compile(
  175. r"""
  176. [^<]*
  177. (
  178. ((<!--.*?-->) | # comment
  179. (<![CDATA[.*?]]) | # raw character data
  180. (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl
  181. (<[^!>][^>]*>)) # tag or PI
  182. [^<]*)*
  183. \Z""",
  184. re.DOTALL | re.VERBOSE,
  185. )
  186. #: A regular expression used to extract the tag name from a start tag,
  187. #: end tag, or empty-elt tag string.
  188. _XML_TAG_NAME = re.compile("<\s*/?\s*([^\s>]+)")
  189. #: A regular expression used to find all start-tags, end-tags, and
  190. #: emtpy-elt tags in an XML file. This regexp is more lenient than
  191. #: the XML spec -- e.g., it allows spaces in some places where the
  192. #: spec does not.
  193. _XML_PIECE = re.compile(
  194. r"""
  195. # Include these so we can skip them:
  196. (?P<COMMENT> <!--.*?--> )|
  197. (?P<CDATA> <![CDATA[.*?]]> )|
  198. (?P<PI> <\?.*?\?> )|
  199. (?P<DOCTYPE> <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
  200. # These are the ones we actually care about:
  201. (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )|
  202. (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )|
  203. (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""",
  204. re.DOTALL | re.VERBOSE,
  205. )
  206. def _read_xml_fragment(self, stream):
  207. """
  208. Read a string from the given stream that does not contain any
  209. un-closed tags. In particular, this function first reads a
  210. block from the stream of size ``self._BLOCK_SIZE``. It then
  211. checks if that block contains an un-closed tag. If it does,
  212. then this function either backtracks to the last '<', or reads
  213. another block.
  214. """
  215. fragment = ""
  216. if isinstance(stream, SeekableUnicodeStreamReader):
  217. startpos = stream.tell()
  218. while True:
  219. # Read a block and add it to the fragment.
  220. xml_block = stream.read(self._BLOCK_SIZE)
  221. fragment += xml_block
  222. # Do we have a well-formed xml fragment?
  223. if self._VALID_XML_RE.match(fragment):
  224. return fragment
  225. # Do we have a fragment that will never be well-formed?
  226. if re.search("[<>]", fragment).group(0) == ">":
  227. pos = stream.tell() - (
  228. len(fragment) - re.search("[<>]", fragment).end()
  229. )
  230. raise ValueError('Unexpected ">" near char %s' % pos)
  231. # End of file?
  232. if not xml_block:
  233. raise ValueError("Unexpected end of file: tag not closed")
  234. # If not, then we must be in the middle of a <..tag..>.
  235. # If appropriate, backtrack to the most recent '<'
  236. # character.
  237. last_open_bracket = fragment.rfind("<")
  238. if last_open_bracket > 0:
  239. if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
  240. if isinstance(stream, SeekableUnicodeStreamReader):
  241. stream.seek(startpos)
  242. stream.char_seek_forward(last_open_bracket)
  243. else:
  244. stream.seek(-(len(fragment) - last_open_bracket), 1)
  245. return fragment[:last_open_bracket]
  246. # Otherwise, read another block. (i.e., return to the
  247. # top of the loop.)
  248. def read_block(self, stream, tagspec=None, elt_handler=None):
  249. """
  250. Read from ``stream`` until we find at least one element that
  251. matches ``tagspec``, and return the result of applying
  252. ``elt_handler`` to each element found.
  253. """
  254. if tagspec is None:
  255. tagspec = self._tagspec
  256. if elt_handler is None:
  257. elt_handler = self.handle_elt
  258. # Use a stack of strings to keep track of our context:
  259. context = list(self._tag_context.get(stream.tell()))
  260. assert context is not None # check this -- could it ever happen?
  261. elts = []
  262. elt_start = None # where does the elt start
  263. elt_depth = None # what context depth
  264. elt_text = ""
  265. while elts == [] or elt_start is not None:
  266. if isinstance(stream, SeekableUnicodeStreamReader):
  267. startpos = stream.tell()
  268. xml_fragment = self._read_xml_fragment(stream)
  269. # End of file.
  270. if not xml_fragment:
  271. if elt_start is None:
  272. break
  273. else:
  274. raise ValueError("Unexpected end of file")
  275. # Process each <tag> in the xml fragment.
  276. for piece in self._XML_PIECE.finditer(xml_fragment):
  277. if self._DEBUG:
  278. print("%25s %s" % ("/".join(context)[-20:], piece.group()))
  279. if piece.group("START_TAG"):
  280. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  281. # Keep context up-to-date.
  282. context.append(name)
  283. # Is this one of the elts we're looking for?
  284. if elt_start is None:
  285. if re.match(tagspec, "/".join(context)):
  286. elt_start = piece.start()
  287. elt_depth = len(context)
  288. elif piece.group("END_TAG"):
  289. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  290. # sanity checks:
  291. if not context:
  292. raise ValueError("Unmatched tag </%s>" % name)
  293. if name != context[-1]:
  294. raise ValueError(
  295. "Unmatched tag <%s>...</%s>" % (context[-1], name)
  296. )
  297. # Is this the end of an element?
  298. if elt_start is not None and elt_depth == len(context):
  299. elt_text += xml_fragment[elt_start : piece.end()]
  300. elts.append((elt_text, "/".join(context)))
  301. elt_start = elt_depth = None
  302. elt_text = ""
  303. # Keep context up-to-date
  304. context.pop()
  305. elif piece.group("EMPTY_ELT_TAG"):
  306. name = self._XML_TAG_NAME.match(piece.group()).group(1)
  307. if elt_start is None:
  308. if re.match(tagspec, "/".join(context) + "/" + name):
  309. elts.append((piece.group(), "/".join(context) + "/" + name))
  310. if elt_start is not None:
  311. # If we haven't found any elements yet, then keep
  312. # looping until we do.
  313. if elts == []:
  314. elt_text += xml_fragment[elt_start:]
  315. elt_start = 0
  316. # If we've found at least one element, then try
  317. # backtracking to the start of the element that we're
  318. # inside of.
  319. else:
  320. # take back the last start-tag, and return what
  321. # we've gotten so far (elts is non-empty).
  322. if self._DEBUG:
  323. print(" " * 36 + "(backtrack)")
  324. if isinstance(stream, SeekableUnicodeStreamReader):
  325. stream.seek(startpos)
  326. stream.char_seek_forward(elt_start)
  327. else:
  328. stream.seek(-(len(xml_fragment) - elt_start), 1)
  329. context = context[: elt_depth - 1]
  330. elt_start = elt_depth = None
  331. elt_text = ""
  332. # Update the _tag_context dict.
  333. pos = stream.tell()
  334. if pos in self._tag_context:
  335. assert tuple(context) == self._tag_context[pos]
  336. else:
  337. self._tag_context[pos] = tuple(context)
  338. return [
  339. elt_handler(
  340. ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
  341. context,
  342. )
  343. for (elt, context) in elts
  344. ]