| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412 |
- """
- A reader for corpora whose documents are in MTE format.
- """
import os
import re
from functools import partial, reduce

from nltk.corpus.reader import concat, TaggedCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusView
def xpath(root, path, ns):
    """
    Return the elements below ``root`` that match ``path``.

    This is a thin wrapper around ``root.findall``.

    :param root: The element whose ``findall`` method is queried.
    :param path: The path expression handed to ``findall``.
    :param ns: Prefix-to-namespace mapping forwarded as the second
        argument of ``findall``.
    :return: whatever ``root.findall(path, ns)`` returns.
    """
    matches = root.findall(path, ns)
    return matches
class MTECorpusView(XMLCorpusView):
    """
    Lazy view over an MTE corpus file.

    Behaves like ``XMLCorpusView`` except that ``None`` results are
    removed from every block that is read.
    """

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Forward all arguments unchanged to ``XMLCorpusView``.

        :param fileid: The file to view.
        :param tagspec: The tag specification selecting the elements.
        :param elt_handler: Optional callable applied to each element.
        """
        super().__init__(fileid, tagspec, elt_handler)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read one block via ``XMLCorpusView.read_block`` and drop ``None``.

        :return: the block's items, minus every item that is ``None``.
        :rtype: list
        """
        block = super().read_block(stream, tagspec, elt_handler)
        return [item for item in block if item is not None]
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.

    Every public method returns an ``MTECorpusView`` over the file given
    to the constructor. The tagset and tag filter requested through the
    ``tagged_*`` methods are bound to the individual view's element
    handler, so several views with different settings can coexist.
    """

    # Namespace prefixes used when querying child elements.
    ns = {
        "tei": "http://www.tei-c.org/ns/1.0",
        "xml": "http://www.w3.org/XML/1998/namespace",
    }
    tag_ns = "{http://www.tei-c.org/ns/1.0}"
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
    # Tag specifications handed to MTECorpusView for words, sentences
    # and paragraphs respectively.
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    def __init__(self, file_path):
        """
        :param file_path: Path of the XML file every view will read.
        """
        self.__file_path = file_path

    @staticmethod
    def _compile_tag_filter(tags):
        """
        Build the regular expression used to filter ``ana`` values.

        Each ``-`` in ``tags`` acts as a single-character wildcard and
        anything may follow the given prefix.

        :param tags: The MSD tag prefix, or an empty string for "no filter".
        :return: a compiled pattern, or None when ``tags`` is empty.
        """
        if not tags:
            return None
        return re.compile("^" + re.sub("-", ".", tags) + ".*$")

    @classmethod
    def _word_elt(cls, elt, context):
        """Return the text of a word/punctuation element."""
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        """Return the texts of all children of a sentence element."""
        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        """Return one list of word texts per child of a paragraph element."""
        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context, tagset="msd", tag_filter=None):
        """
        Convert a word element into a ``(text, tag)`` tuple.

        :param elt: The element to convert.
        :param context: Unused; present because handlers are called with it.
        :param tagset: ``"msd"`` returns the ``ana`` attribute verbatim,
            any other value converts it with
            ``MTETagConverter.msd_to_universal``.
        :param tag_filter: Compiled pattern from ``_compile_tag_filter``
            or None for no filtering.
        :return: ``(text, "")`` when the element has no ``ana`` attribute,
            None when the tag is rejected by ``tag_filter``, otherwise
            ``(text, tag)``.
        """
        ana = elt.attrib.get("ana")
        if ana is None:
            # Elements without annotation are never filtered out.
            return (elt.text, "")
        if tag_filter is not None and not tag_filter.match(ana):
            return None
        if tagset == "msd":
            return (elt.text, ana)
        return (elt.text, MTETagConverter.msd_to_universal(ana))

    @classmethod
    def _tagged_sent_elt(cls, elt, context, tagset="msd", tag_filter=None):
        """Return the non-filtered ``(text, tag)`` tuples of a sentence."""
        tagged = (
            cls._tagged_word_elt(w, None, tagset, tag_filter)
            for w in xpath(elt, "*", cls.ns)
        )
        return [pair for pair in tagged if pair is not None]

    @classmethod
    def _tagged_para_elt(cls, elt, context, tagset="msd", tag_filter=None):
        """Return one list of ``(text, tag)`` tuples per sentence child."""
        # _tagged_sent_elt always returns a list, so there is no None
        # entry to strip at this level.
        return [
            cls._tagged_sent_elt(s, None, tagset, tag_filter)
            for s in xpath(elt, "*", cls.ns)
        ]

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        """Return ``(text, lemma)``; the lemma is ``""`` when missing."""
        return (elt.text, elt.attrib.get("lemma", ""))

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        """Return the ``(text, lemma)`` tuples of a sentence element."""
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        """Return one list of ``(text, lemma)`` tuples per sentence child."""
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]

    def _tagged_view(self, path, handler, tagset, tags):
        """
        Create a view whose handler carries its own tagset and filter.

        Binding the settings with ``partial`` (instead of storing them on
        the class) keeps lazily evaluated views independent of later
        ``tagged_*`` calls.
        """
        bound_handler = partial(
            handler,
            tagset=tagset,
            tag_filter=MTEFileReader._compile_tag_filter(tags),
        )
        return MTECorpusView(self.__file_path, path, bound_handler)

    def words(self):
        """Return a view of the word texts in the file."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Return a view of the sentences, each a list of word texts."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Return a view of the paragraphs, each a list of sentences."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Return a view of ``(text, lemma)`` tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    def tagged_words(self, tagset, tags):
        """
        Return a view of ``(text, tag)`` tuples.

        :param tagset: ``"msd"`` or ``"universal"``.
        :param tags: MSD tag prefix used as filter; ``""`` disables it.
        """
        return self._tagged_view(
            MTEFileReader.word_path, MTEFileReader._tagged_word_elt, tagset, tags
        )

    def lemma_sents(self):
        """Return a view of sentences made of ``(text, lemma)`` tuples."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        """
        Return a view of sentences made of ``(text, tag)`` tuples.

        :param tagset: ``"msd"`` or ``"universal"``.
        :param tags: MSD tag prefix used as filter; ``""`` disables it.
        """
        return self._tagged_view(
            MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt, tagset, tags
        )

    def lemma_paras(self):
        """Return a view of paragraphs made of lemma-annotated sentences."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        """
        Return a view of paragraphs made of tagged sentences.

        :param tagset: ``"msd"`` or ``"universal"``.
        :param tags: MSD tag prefix used as filter; ``""`` disables it.
        """
        return self._tagged_view(
            MTEFileReader.para_path, MTEFileReader._tagged_para_elt, tagset, tags
        )
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # First (category) letter of an MSD tag -> universal tag.
    # "-" doubles as the fallback entry for unknown categories.
    mapping_msd_universal = {
        "A": "ADJ",
        "S": "ADP",
        "R": "ADV",
        "C": "CONJ",
        "D": "DET",
        "N": "NOUN",
        "M": "NUM",
        "Q": "PRT",
        "P": "PRON",
        "V": "VERB",
        ".": ".",
        "-": "X",
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        This function converts the annotation from the Multex-East to the
        universal tagset as described in Chapter 5 of the NLTK-Book.

        Only the category letter is inspected: the first character of
        ``tag``, or the second one when the tag starts with ``#``.
        Unknown categories, as well as empty tags and a bare ``#``, are
        mapped to ``X``.

        :param tag: The MSD tag, optionally prefixed with ``#``.
        :return: the corresponding universal tag.
        :rtype: str
        """
        mapping = MTETagConverter.mapping_msd_universal
        if not tag:
            return mapping["-"]
        # Slicing instead of indexing so a bare "#" yields "" (unknown)
        # rather than raising IndexError.
        indicator = tag[1:2] if tag[0] == "#" else tag[0]
        return mapping.get(indicator, mapping["-"])
class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    # multext-east source files that are not compatible with the TEI-p5
    # specification and are therefore never read.
    _INCOMPATIBLE_FILEIDS = ("oana-bg.xml", "oana-mk.xml")
    # Values accepted for the ``tagset`` parameter of the tagged_* methods.
    _KNOWN_TAGSETS = ("universal", "msd")

    def __init__(self, root=None, fileids=None, encoding="utf8"):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param encoding: The encoding of the given files (default is utf8)
        """
        # encoding must be passed by keyword: the third positional
        # parameter of TaggedCorpusReader.__init__ is ``sep``.
        TaggedCorpusReader.__init__(self, root, fileids, encoding=encoding)

    def __fileids(self, fileids):
        """
        Normalise a user supplied file selection.

        :param fileids: None (all files), a single fileid, or an iterable
            of fileids.
        :return: the requested fileids that belong to this corpus and are
            not in the incompatible list; a message is printed when
            nothing remains.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        # Materialise the result: a lazy ``filter`` object is always
        # truthy, which would make the emptiness check below useless.
        fileids = [
            f
            for f in fileids
            if f in self._fileids and f not in self._INCOMPATIBLE_FILEIDS
        ]
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def _concat_views(self, fileids, make_view):
        """
        Build one view per selected file and concatenate them.

        :param fileids: File selection as accepted by the public methods.
        :param make_view: Callable receiving an ``MTEFileReader`` and
            returning the view to include for that file.
        :return: the result of ``concat`` over all per-file views.
        """
        return concat(
            [
                make_view(MTEFileReader(os.path.join(self._root, f)))
                for f in self.__fileids(fileids)
            ]
        )

    def _concat_tagged(self, fileids, tagset, make_view):
        """
        Like ``_concat_views`` but validates ``tagset`` first.

        :return: the concatenated views, or None (after printing a
            message) when ``tagset`` is not a known tagset.
        """
        if tagset not in self._KNOWN_TAGSETS:
            print("Unknown tagset specified.")
            return None
        return self._concat_views(fileids, make_view)

    def readme(self):
        """
        Returns some information about this corpus.

        :return: the content of the attached README file
        :rtype: str
        """
        with self.open("00README.txt") as stream:
            return stream.read()

    def raw(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        contents = []
        for f in self.__fileids(fileids):
            # Close every stream as soon as its content has been read.
            with self.open(f) as stream:
                contents.append(stream.read())
        return "".join(contents)

    def words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._concat_views(fileids, lambda reader: reader.words())

    def sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return self._concat_views(fileids, lambda reader: reader.sents())

    def paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return self._concat_views(fileids, lambda reader: reader.paras())

    def lemma_words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return self._concat_views(fileids, lambda reader: reader.lemma_words())

    def tagged_words(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag); None if ``tagset`` is unknown
        :rtype: list(tuple(str, str))
        """
        return self._concat_tagged(
            fileids, tagset, lambda reader: reader.tagged_words(tagset, tags)
        )

    def lemma_sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return self._concat_views(fileids, lambda reader: reader.lemma_sents())

    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of (word,tag) tuples; None if
                 ``tagset`` is unknown
        :rtype: list(list(tuple(str, str)))
        """
        return self._concat_tagged(
            fileids, tagset, lambda reader: reader.tagged_sents(tagset, tags)
        )

    def lemma_paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(list(list(tuple(str, str))))
        """
        return self._concat_views(fileids, lambda reader: reader.lemma_paras())

    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples; None if ``tagset`` is unknown
        :rtype: list(list(list(tuple(str, str))))
        """
        return self._concat_tagged(
            fileids, tagset, lambda reader: reader.tagged_paras(tagset, tags)
        )
|