| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- # -*- coding: iso-8859-1 -*-
- # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
- #
- # Copyright (C) 2001-2015 NLTK Project
- # Author: Selina Dennis <selina@tranzfusion.net>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
- English Prose (YCOE), a 1.5 million word syntactically-annotated
- corpus of Old English prose texts. The corpus is distributed by the
- Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
- with NLTK.
- The YCOE corpus is divided into 100 files, each representing
- an Old English prose text. The tags used within each text comply
- with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
- """
- import os
- import re
- from nltk.tokenize import RegexpTokenizer
- from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
- from nltk.corpus.reader.tagged import TaggedCorpusReader
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers: one over the ``psd`` subdirectory
    (bracketed parses) and one over the ``pos`` subdirectory (tagged
    text).  A *document identifier* is a file identifier with its
    four-character ``.psd`` / ``.pos`` extension removed; the data access
    methods (``words``, ``tagged_sents``, ``parsed_sents``, ...) take
    document identifiers, not file identifiers.
    """

    def __init__(self, root, encoding="utf8"):
        """
        :param root: The corpus root; it must contain ``psd`` and ``pos``
            subdirectories holding the same set of documents.
        :param encoding: The encoding used for both sub-readers.
        :raise ValueError: If the documents found under ``psd`` and
            ``pos`` differ.
        """
        # Initialise with an empty file list first: the sub-readers below
        # are rooted relative to ``self.root``.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): ".psd" is passed positionally; which parameter of
        # BracketParseCorpusReader receives it is not visible here --
        # confirm that this is intended.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join("psd"), ".*", ".psd", encoding=encoding
        )
        # The third parameter of YCOETaggedCorpusReader is ``encoding``, so
        # it is passed by keyword (a positional ".pos" would be taken as
        # the encoding name).
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join("pos"), ".*", encoding=encoding
        )

        # Make sure we have a consistent set of items:
        documents = {f[:-4] for f in self._psd_reader.fileids()}
        if {f[:-4] for f in self._pos_reader.fileids()} != documents:
            raise ValueError('Items in "psd" and "pos" subdirectories do not match.')

        fileids = sorted(
            ["%s.psd" % doc for doc in documents]
            + ["%s.pos" % doc for doc in documents]
        )
        # Re-initialise now that the full list of file identifiers is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, an iterable of file
            identifiers, or None for the whole corpus.
        :raise KeyError: If any given file identifier is not part of
            this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, str):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Report the offending identifier itself, not the whole
                # collection that was passed in.
                raise KeyError("File id %s not found" % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted({f[:-4] for f in fileids})

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for the whole corpus.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, str):
            documents = [documents]
        return sorted(
            set(
                ["%s.pos" % doc for doc in documents]
                + ["%s.psd" % doc for doc in documents]
            )
        )

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every document.
        :param subcorpus: The extension to append, ``"pos"`` or ``"psd"``.
        :raise ValueError: If an identifier is unknown; a dedicated
            message is used when it looks like a file identifier.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, str):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in (".pos", ".psd"):
                        raise ValueError(
                            "Expected a document identifier, not a file "
                            "identifier. (Use corpus.documents() to get "
                            "a list of document identifiers.)"
                        )
                    else:
                        raise ValueError("Document identifier %s not found" % document)
        return ["%s.%s" % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Return the words of the given document(s), read from the ``pos`` files."""
        return self._pos_reader.words(self._getfileids(documents, "pos"))

    def sents(self, documents=None):
        """Return the sentences of the given document(s), read from the ``pos`` files."""
        return self._pos_reader.sents(self._getfileids(documents, "pos"))

    def paras(self, documents=None):
        """Return the paragraphs of the given document(s), read from the ``pos`` files."""
        return self._pos_reader.paras(self._getfileids(documents, "pos"))

    def tagged_words(self, documents=None):
        """Return the tagged words of the given document(s), read from the ``pos`` files."""
        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))

    def tagged_sents(self, documents=None):
        """Return the tagged sentences of the given document(s), read from the ``pos`` files."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))

    def tagged_paras(self, documents=None):
        """Return the tagged paragraphs of the given document(s), read from the ``pos`` files."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))

    def parsed_sents(self, documents=None):
        """Return the parsed sentences of the given document(s), read from the ``psd`` files."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))


class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        """
        Parse the bracketed string ``t`` after removing its ``(CODE ...)``
        and ``(ID ...)`` nodes.

        :return: None if only an empty pair of brackets remains after the
            removal; otherwise whatever the base class ``_parse`` returns.
        """
        t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
        if re.match(r"\s*\(\s*\)\s*$", t):
            return None
        return BracketParseCorpusReader._parse(self, t)


class YCOETaggedCorpusReader(TaggedCorpusReader):
    """
    Tagged corpus reader for the YCOE ``pos`` files: tokens are
    ``word_tag`` pairs (separator ``_``) and sentences are split with a
    YCOE-specific gap pattern.
    """

    def __init__(self, root, items, encoding="utf8"):
        """
        :param root: The directory containing the ``pos`` files.
        :param items: The file identifiers (or a regexp matching them).
        :param encoding: The encoding used to read the files.
        """
        # Sentence gaps are: whitespace preceded by "/.", or any token
        # ending in "_CODE" or "_ID" together with surrounding whitespace.
        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(
            self,
            root,
            items,
            sep="_",
            sent_tokenizer=sent_tokenizer,
            # Forward the requested encoding instead of silently dropping it.
            encoding=encoding,
        )
#: A mapping from each YCOE document identifier to the title of the
#: text it contains.
documents = {
    "coadrian.o34": "Adrian and Ritheus",
    "coaelhom.o3": "Ælfric, Supplemental Homilies",
    "coaelive.o3": "Ælfric's Lives of Saints",
    "coalcuin": "Alcuin De virtutibus et vitiis",
    "coalex.o23": "Alexander's Letter to Aristotle",
    "coapollo.o3": "Apollonius of Tyre",
    "coaugust": "Augustine",
    "cobede.o2": "Bede's History of the English Church",
    "cobenrul.o3": "Benedictine Rule",
    "coblick.o23": "Blickling Homilies",
    "coboeth.o2": "Boethius' Consolation of Philosophy",
    "cobyrhtf.o3": "Byrhtferth's Manual",
    "cocanedgD": "Canons of Edgar (D)",
    "cocanedgX": "Canons of Edgar (X)",
    "cocathom1.o3": "Ælfric's Catholic Homilies I",
    "cocathom2.o3": "Ælfric's Catholic Homilies II",
    "cochad.o24": "Saint Chad",
    "cochdrul": "Chrodegang of Metz, Rule",
    "cochristoph": "Saint Christopher",
    "cochronA.o23": "Anglo-Saxon Chronicle A",
    "cochronC": "Anglo-Saxon Chronicle C",
    "cochronD": "Anglo-Saxon Chronicle D",
    "cochronE.o34": "Anglo-Saxon Chronicle E",
    "cocura.o2": "Cura Pastoralis",
    "cocuraC": "Cura Pastoralis (Cotton)",
    "codicts.o34": "Dicts of Cato",
    "codocu1.o1": "Documents 1 (O1)",
    "codocu2.o12": "Documents 2 (O1/O2)",
    "codocu2.o2": "Documents 2 (O2)",
    "codocu3.o23": "Documents 3 (O2/O3)",
    "codocu3.o3": "Documents 3 (O3)",
    "codocu4.o24": "Documents 4 (O2/O4)",
    "coeluc1": "Honorius of Autun, Elucidarium 1",
    # Second part of the Elucidarium (was mislabelled as part 1).
    "coeluc2": "Honorius of Autun, Elucidarium 2",
    "coepigen.o3": "Ælfric's Epilogue to Genesis",
    "coeuphr": "Saint Euphrosyne",
    "coeust": "Saint Eustace and his companions",
    "coexodusP": "Exodus (P)",
    "cogenesiC": "Genesis (C)",
    "cogregdC.o24": "Gregory's Dialogues (C)",
    "cogregdH.o23": "Gregory's Dialogues (H)",
    "coherbar": "Pseudo-Apuleius, Herbarium",
    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
    "coinspolX": "Wulfstan's Institute of Polity (X)",
    "cojames": "Saint James",
    "colacnu.o23": "Lacnunga",
    "colaece.o2": "Leechdoms",
    "colaw1cn.o3": "Laws, Cnut I",
    "colaw2cn.o3": "Laws, Cnut II",
    "colaw5atr.o3": "Laws, Æthelred V",
    "colaw6atr.o3": "Laws, Æthelred VI",
    "colawaf.o2": "Laws, Alfred",
    "colawafint.o2": "Alfred's Introduction to Laws",
    "colawger.o34": "Laws, Gerefa",
    "colawine.ox2": "Laws, Ine",
    "colawnorthu.o3": "Northumbra Preosta Lagu",
    "colawwllad.o4": "Laws, William I, Lad",
    "coleofri.o4": "Leofric",
    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
    "colwgeat": "Ælfric's Letter to Wulfgeat",
    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
    "comargaC.o34": "Saint Margaret (C)",
    "comargaT": "Saint Margaret (T)",
    "comart1": "Martyrology, I",
    "comart2": "Martyrology, II",
    "comart3.o23": "Martyrology, III",
    "comarvel.o23": "Marvels of the East",
    "comary": "Mary of Egypt",
    "coneot": "Saint Neot",
    "conicodA": "Gospel of Nicodemus (A)",
    "conicodC": "Gospel of Nicodemus (C)",
    "conicodD": "Gospel of Nicodemus (D)",
    "conicodE": "Gospel of Nicodemus (E)",
    "coorosiu.o2": "Orosius",
    "cootest.o3": "Heptateuch",
    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
    "coprefcura.o2": "Preface to the Cura Pastoralis",
    "coprefgen.o3": "Ælfric's Preface to Genesis",
    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
    "coprefsolilo": "Preface to Augustine's Soliloquies",
    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
    "corood": "History of the Holy Rood-Tree",
    "cosevensl": "Seven Sleepers",
    "cosolilo": "St. Augustine's Soliloquies",
    "cosolsat1.o4": "Solomon and Saturn I",
    "cosolsat2": "Solomon and Saturn II",
    "cotempo.o3": "Ælfric's De Temporibus Anni",
    "coverhom": "Vercelli Homilies",
    "coverhomE": "Vercelli Homilies (E)",
    "coverhomL": "Vercelli Homilies (L)",
    "covinceB": "Saint Vincent (Bodley 343)",
    "covinsal": "Vindicta Salvatoris",
    "cowsgosp.o3": "West-Saxon Gospels",
    "cowulf.o34": "Wulfstan's Homilies",
}
|