ycoe.py 10 KB


  1. # -*- coding: iso-8859-1 -*-
  2. # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
  3. #
  4. # Copyright (C) 2001-2015 NLTK Project
  5. # Author: Selina Dennis <selina@tranzfusion.net>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
  10. English Prose (YCOE), a 1.5 million word syntactically-annotated
  11. corpus of Old English prose texts. The corpus is distributed by the
  12. Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
  13. with NLTK.
  14. The YCOE corpus is divided into 100 files, each representing
  15. an Old English prose text. The tags used within each text comply
  16. with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
  17. """
  18. import os
  19. import re
  20. from nltk.tokenize import RegexpTokenizer
  21. from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
  22. from nltk.corpus.reader.tagged import TaggedCorpusReader
  23. from nltk.corpus.reader.util import *
  24. from nltk.corpus.reader.api import *
  25. class YCOECorpusReader(CorpusReader):
  26. """
  27. Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
  28. English Prose (YCOE), a 1.5 million word syntactically-annotated
  29. corpus of Old English prose texts.
  30. """
  31. def __init__(self, root, encoding="utf8"):
  32. CorpusReader.__init__(self, root, [], encoding)
  33. self._psd_reader = YCOEParseCorpusReader(
  34. self.root.join("psd"), ".*", ".psd", encoding=encoding
  35. )
  36. self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")
  37. # Make sure we have a consistent set of items:
  38. documents = set(f[:-4] for f in self._psd_reader.fileids())
  39. if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
  40. raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")
  41. fileids = sorted(
  42. ["%s.psd" % doc for doc in documents]
  43. + ["%s.pos" % doc for doc in documents]
  44. )
  45. CorpusReader.__init__(self, root, fileids, encoding)
  46. self._documents = sorted(documents)
  47. def documents(self, fileids=None):
  48. """
  49. Return a list of document identifiers for all documents in
  50. this corpus, or for the documents with the given file(s) if
  51. specified.
  52. """
  53. if fileids is None:
  54. return self._documents
  55. if isinstance(fileids, str):
  56. fileids = [fileids]
  57. for f in fileids:
  58. if f not in self._fileids:
  59. raise KeyError("File id %s not found" % fileids)
  60. # Strip off the '.pos' and '.psd' extensions.
  61. return sorted(set(f[:-4] for f in fileids))
  62. def fileids(self, documents=None):
  63. """
  64. Return a list of file identifiers for the files that make up
  65. this corpus, or that store the given document(s) if specified.
  66. """
  67. if documents is None:
  68. return self._fileids
  69. elif isinstance(documents, str):
  70. documents = [documents]
  71. return sorted(
  72. set(
  73. ["%s.pos" % doc for doc in documents]
  74. + ["%s.psd" % doc for doc in documents]
  75. )
  76. )
  77. def _getfileids(self, documents, subcorpus):
  78. """
  79. Helper that selects the appropriate fileids for a given set of
  80. documents from a given subcorpus (pos or psd).
  81. """
  82. if documents is None:
  83. documents = self._documents
  84. else:
  85. if isinstance(documents, str):
  86. documents = [documents]
  87. for document in documents:
  88. if document not in self._documents:
  89. if document[-4:] in (".pos", ".psd"):
  90. raise ValueError(
  91. "Expected a document identifier, not a file "
  92. "identifier. (Use corpus.documents() to get "
  93. "a list of document identifiers."
  94. )
  95. else:
  96. raise ValueError("Document identifier %s not found" % document)
  97. return ["%s.%s" % (d, subcorpus) for d in documents]
  98. # Delegate to one of our two sub-readers:
  99. def words(self, documents=None):
  100. return self._pos_reader.words(self._getfileids(documents, "pos"))
  101. def sents(self, documents=None):
  102. return self._pos_reader.sents(self._getfileids(documents, "pos"))
  103. def paras(self, documents=None):
  104. return self._pos_reader.paras(self._getfileids(documents, "pos"))
  105. def tagged_words(self, documents=None):
  106. return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))
  107. def tagged_sents(self, documents=None):
  108. return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))
  109. def tagged_paras(self, documents=None):
  110. return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))
  111. def parsed_sents(self, documents=None):
  112. return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
  113. class YCOEParseCorpusReader(BracketParseCorpusReader):
  114. """Specialized version of the standard bracket parse corpus reader
  115. that strips out (CODE ...) and (ID ...) nodes."""
  116. def _parse(self, t):
  117. t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
  118. if re.match(r"\s*\(\s*\)\s*$", t):
  119. return None
  120. return BracketParseCorpusReader._parse(self, t)
  121. class YCOETaggedCorpusReader(TaggedCorpusReader):
  122. def __init__(self, root, items, encoding="utf8"):
  123. gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
  124. sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
  125. TaggedCorpusReader.__init__(
  126. self, root, items, sep="_", sent_tokenizer=sent_tokenizer
  127. )
  128. #: A list of all documents and their titles in ycoe.
  129. documents = {
  130. "coadrian.o34": "Adrian and Ritheus",
  131. "coaelhom.o3": "Ælfric, Supplemental Homilies",
  132. "coaelive.o3": "Ælfric's Lives of Saints",
  133. "coalcuin": "Alcuin De virtutibus et vitiis",
  134. "coalex.o23": "Alexander's Letter to Aristotle",
  135. "coapollo.o3": "Apollonius of Tyre",
  136. "coaugust": "Augustine",
  137. "cobede.o2": "Bede's History of the English Church",
  138. "cobenrul.o3": "Benedictine Rule",
  139. "coblick.o23": "Blickling Homilies",
  140. "coboeth.o2": "Boethius' Consolation of Philosophy",
  141. "cobyrhtf.o3": "Byrhtferth's Manual",
  142. "cocanedgD": "Canons of Edgar (D)",
  143. "cocanedgX": "Canons of Edgar (X)",
  144. "cocathom1.o3": "Ælfric's Catholic Homilies I",
  145. "cocathom2.o3": "Ælfric's Catholic Homilies II",
  146. "cochad.o24": "Saint Chad",
  147. "cochdrul": "Chrodegang of Metz, Rule",
  148. "cochristoph": "Saint Christopher",
  149. "cochronA.o23": "Anglo-Saxon Chronicle A",
  150. "cochronC": "Anglo-Saxon Chronicle C",
  151. "cochronD": "Anglo-Saxon Chronicle D",
  152. "cochronE.o34": "Anglo-Saxon Chronicle E",
  153. "cocura.o2": "Cura Pastoralis",
  154. "cocuraC": "Cura Pastoralis (Cotton)",
  155. "codicts.o34": "Dicts of Cato",
  156. "codocu1.o1": "Documents 1 (O1)",
  157. "codocu2.o12": "Documents 2 (O1/O2)",
  158. "codocu2.o2": "Documents 2 (O2)",
  159. "codocu3.o23": "Documents 3 (O2/O3)",
  160. "codocu3.o3": "Documents 3 (O3)",
  161. "codocu4.o24": "Documents 4 (O2/O4)",
  162. "coeluc1": "Honorius of Autun, Elucidarium 1",
  163. "coeluc2": "Honorius of Autun, Elucidarium 1",
  164. "coepigen.o3": "Ælfric's Epilogue to Genesis",
  165. "coeuphr": "Saint Euphrosyne",
  166. "coeust": "Saint Eustace and his companions",
  167. "coexodusP": "Exodus (P)",
  168. "cogenesiC": "Genesis (C)",
  169. "cogregdC.o24": "Gregory's Dialogues (C)",
  170. "cogregdH.o23": "Gregory's Dialogues (H)",
  171. "coherbar": "Pseudo-Apuleius, Herbarium",
  172. "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
  173. "coinspolX": "Wulfstan's Institute of Polity (X)",
  174. "cojames": "Saint James",
  175. "colacnu.o23": "Lacnunga",
  176. "colaece.o2": "Leechdoms",
  177. "colaw1cn.o3": "Laws, Cnut I",
  178. "colaw2cn.o3": "Laws, Cnut II",
  179. "colaw5atr.o3": "Laws, Æthelred V",
  180. "colaw6atr.o3": "Laws, Æthelred VI",
  181. "colawaf.o2": "Laws, Alfred",
  182. "colawafint.o2": "Alfred's Introduction to Laws",
  183. "colawger.o34": "Laws, Gerefa",
  184. "colawine.ox2": "Laws, Ine",
  185. "colawnorthu.o3": "Northumbra Preosta Lagu",
  186. "colawwllad.o4": "Laws, William I, Lad",
  187. "coleofri.o4": "Leofric",
  188. "colsigef.o3": "Ælfric's Letter to Sigefyrth",
  189. "colsigewB": "Ælfric's Letter to Sigeweard (B)",
  190. "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
  191. "colwgeat": "Ælfric's Letter to Wulfgeat",
  192. "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
  193. "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
  194. "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
  195. "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
  196. "comargaC.o34": "Saint Margaret (C)",
  197. "comargaT": "Saint Margaret (T)",
  198. "comart1": "Martyrology, I",
  199. "comart2": "Martyrology, II",
  200. "comart3.o23": "Martyrology, III",
  201. "comarvel.o23": "Marvels of the East",
  202. "comary": "Mary of Egypt",
  203. "coneot": "Saint Neot",
  204. "conicodA": "Gospel of Nicodemus (A)",
  205. "conicodC": "Gospel of Nicodemus (C)",
  206. "conicodD": "Gospel of Nicodemus (D)",
  207. "conicodE": "Gospel of Nicodemus (E)",
  208. "coorosiu.o2": "Orosius",
  209. "cootest.o3": "Heptateuch",
  210. "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
  211. "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
  212. "coprefcura.o2": "Preface to the Cura Pastoralis",
  213. "coprefgen.o3": "Ælfric's Preface to Genesis",
  214. "copreflives.o3": "Ælfric's Preface to Lives of Saints",
  215. "coprefsolilo": "Preface to Augustine's Soliloquies",
  216. "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
  217. "corood": "History of the Holy Rood-Tree",
  218. "cosevensl": "Seven Sleepers",
  219. "cosolilo": "St. Augustine's Soliloquies",
  220. "cosolsat1.o4": "Solomon and Saturn I",
  221. "cosolsat2": "Solomon and Saturn II",
  222. "cotempo.o3": "Ælfric's De Temporibus Anni",
  223. "coverhom": "Vercelli Homilies",
  224. "coverhomE": "Vercelli Homilies (E)",
  225. "coverhomL": "Vercelli Homilies (L)",
  226. "covinceB": "Saint Vincent (Bodley 343)",
  227. "covinsal": "Vindicta Salvatoris",
  228. "cowsgosp.o3": "West-Saxon Gospels",
  229. "cowulf.o34": "Wulfstan's Homilies",
  230. }