  1. .. Copyright (C) 2001-2020 NLTK Project
  2. .. For license information, see LICENSE.TXT
  3. ================
  4. Corpus Readers
  5. ================
  6. The `nltk.corpus` package defines a collection of *corpus reader*
  7. classes, which can be used to access the contents of a diverse set of
  8. corpora. The list of available corpora is given at:
  9. http://www.nltk.org/nltk_data/
  10. Each corpus reader class is specialized to handle a specific
  11. corpus format. In addition, the `nltk.corpus` package automatically
  12. creates a set of corpus reader instances that can be used to access
  13. the corpora in the NLTK data package.
  14. Section `Corpus Reader Objects`_ ("Corpus Reader Objects") describes
  15. the corpus reader instances that can be used to read the corpora in
  16. the NLTK data package. Section `Corpus Reader Classes`_ ("Corpus
  17. Reader Classes") describes the corpus reader classes themselves, and
  18. discusses the issues involved in creating new corpus reader objects
  19. and new corpus reader classes. Section `Regression Tests`_
  20. ("Regression Tests") contains regression tests for the corpus readers
  21. and associated functions and classes.
.. contents:: **Table of Contents**
    :depth: 2
    :backlinks: none
  25. ---------------------
  26. Corpus Reader Objects
  27. ---------------------
  28. Overview
  29. ========
  30. NLTK includes a diverse set of corpora which can be
  31. read using the ``nltk.corpus`` package. Each corpus is accessed by
  32. means of a "corpus reader" object from ``nltk.corpus``:
  33. >>> import nltk.corpus
  34. >>> # The Brown corpus:
  35. >>> print(str(nltk.corpus.brown).replace('\\\\','/'))
  36. <CategorizedTaggedCorpusReader in '.../corpora/brown'...>
  37. >>> # The Penn Treebank Corpus:
  38. >>> print(str(nltk.corpus.treebank).replace('\\\\','/'))
  39. <BracketParseCorpusReader in '.../corpora/treebank/combined'...>
  40. >>> # The Name Genders Corpus:
  41. >>> print(str(nltk.corpus.names).replace('\\\\','/'))
  42. <WordListCorpusReader in '.../corpora/names'...>
  43. >>> # The Inaugural Address Corpus:
  44. >>> print(str(nltk.corpus.inaugural).replace('\\\\','/'))
  45. <PlaintextCorpusReader in '.../corpora/inaugural'...>
  46. Most corpora consist of a set of files, each containing a document (or
  47. other pieces of text). A list of identifiers for these files is
  48. accessed via the ``fileids()`` method of the corpus reader:
  49. >>> nltk.corpus.treebank.fileids() # doctest: +ELLIPSIS
  50. ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...]
  51. >>> nltk.corpus.inaugural.fileids() # doctest: +ELLIPSIS
  52. ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', ...]
  53. Each corpus reader provides a variety of methods to read data from the
  54. corpus, depending on the format of the corpus. For example, plaintext
  55. corpora support methods to read the corpus as raw text, a list of
  56. words, a list of sentences, or a list of paragraphs.
  57. >>> from nltk.corpus import inaugural
  58. >>> inaugural.raw('1789-Washington.txt') # doctest: +ELLIPSIS
  59. 'Fellow-Citizens of the Senate ...'
  60. >>> inaugural.words('1789-Washington.txt')
  61. ['Fellow', '-', 'Citizens', 'of', 'the', ...]
  62. >>> inaugural.sents('1789-Washington.txt') # doctest: +ELLIPSIS
  63. [['Fellow', '-', 'Citizens'...], ['Among', 'the', 'vicissitudes'...]...]
  64. >>> inaugural.paras('1789-Washington.txt') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  65. [[['Fellow', '-', 'Citizens'...]],
  66. [['Among', 'the', 'vicissitudes'...],
  67. ['On', 'the', 'one', 'hand', ',', 'I'...]...]...]
  68. Each of these reader methods may be given a single document's item
  69. name or a list of document item names. When given a list of document
  70. item names, the reader methods will concatenate together the contents
  71. of the individual documents.
  72. >>> l1 = len(inaugural.words('1789-Washington.txt'))
  73. >>> l2 = len(inaugural.words('1793-Washington.txt'))
  74. >>> l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
  75. >>> print('%s+%s == %s' % (l1, l2, l3))
  76. 1538+147 == 1685
  77. If the reader methods are called without any arguments, they will
  78. typically load all documents in the corpus.
  79. >>> len(inaugural.words())
  80. 149797
  81. If a corpus contains a README file, it can be accessed with a ``readme()`` method:
  82. >>> inaugural.readme()[:32]
  83. 'C-Span Inaugural Address Corpus\n'
  84. Plaintext Corpora
  85. =================
  86. Here are the first few words from each of NLTK's plaintext corpora:
  87. >>> nltk.corpus.abc.words()
  88. ['PM', 'denies', 'knowledge', 'of', 'AWB', ...]
  89. >>> nltk.corpus.genesis.words()
  90. ['In', 'the', 'beginning', 'God', 'created', ...]
  91. >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt')
  92. ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...]
  93. >>> nltk.corpus.inaugural.words()
  94. ['Fellow', '-', 'Citizens', 'of', 'the', ...]
  95. >>> nltk.corpus.state_union.words()
  96. ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...]
  97. >>> nltk.corpus.webtext.words()
  98. ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...]
  99. Tagged Corpora
  100. ==============
  101. In addition to the plaintext corpora, NLTK's data package also
  102. contains a wide variety of annotated corpora. For example, the Brown
  103. Corpus is annotated with part-of-speech tags, and defines additional
methods ``tagged_*()`` which return words as `(word,tag)` tuples, rather
than just bare word strings.
  106. >>> from nltk.corpus import brown
  107. >>> print(brown.words())
  108. ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
  109. >>> print(brown.tagged_words())
  110. [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
  111. >>> print(brown.sents()) # doctest: +ELLIPSIS
  112. [['The', 'Fulton', 'County'...], ['The', 'jury', 'further'...], ...]
  113. >>> print(brown.tagged_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  114. [[('The', 'AT'), ('Fulton', 'NP-TL')...],
  115. [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR')...]...]
  116. >>> print(brown.paras(categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  117. [[['It', 'is', 'not', 'news', 'that', 'Nathan', 'Milstein'...],
  118. ['Certainly', 'not', 'in', 'Orchestra', 'Hall', 'where'...]],
  119. [['There', 'was', 'about', 'that', 'song', 'something', ...],
  120. ['Not', 'the', 'noblest', 'performance', 'we', 'have', ...], ...], ...]
  121. >>> print(brown.tagged_paras(categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  122. [[[('It', 'PPS'), ('is', 'BEZ'), ('not', '*'), ...],
  123. [('Certainly', 'RB'), ('not', '*'), ('in', 'IN'), ...]],
  124. [[('There', 'EX'), ('was', 'BEDZ'), ('about', 'IN'), ...],
  125. [('Not', '*'), ('the', 'AT'), ('noblest', 'JJT'), ...], ...], ...]
  126. Similarly, the Indian Language POS-Tagged Corpus includes samples of
  127. Indian text annotated with part-of-speech tags:
  128. >>> from nltk.corpus import indian
  129. >>> print(indian.words()) # doctest: +SKIP
  130. ['\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf\...',
  131. '\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', ...]
  132. >>> print(indian.tagged_words()) # doctest: +SKIP
  133. [('\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf...', 'NN'),
  134. ('\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', 'NN'), ...]
Several tagged corpora support access to a simplified, universal tagset, e.g. one where all noun
tags are collapsed to a single category ``NOUN``:
  137. >>> print(brown.tagged_sents(tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  138. [[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ...],
  139. [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ...]...]
  140. >>> from nltk.corpus import conll2000, switchboard
  141. >>> print(conll2000.tagged_words(tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  142. [('Confidence', 'NOUN'), ('in', 'ADP'), ...]
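The ``switchboard`` reader imported above accepts the same ``tagset`` keyword (a hedged sketch; the output is skipped here since it depends on the corpus sample):
>>> switchboard.tagged_words(tagset='universal') # doctest: +SKIP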
  143. Use ``nltk.app.pos_concordance()`` to access a GUI for searching tagged corpora.
  144. Chunked Corpora
  145. ===============
  146. The CoNLL corpora also provide chunk structures, which are encoded as
  147. flat trees. The CoNLL 2000 Corpus includes phrasal chunks; and the
  148. CoNLL 2002 Corpus includes named entity chunks.
  149. >>> from nltk.corpus import conll2000, conll2002
  150. >>> print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  151. [['Confidence', 'in', 'the', 'pound', 'is', 'widely', ...],
  152. ['Chancellor', 'of', 'the', 'Exchequer', ...], ...]
>>> for tree in conll2000.chunked_sents()[:2]:
...     print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  ...)
(S
  Chancellor/NNP
  (PP of/IN)
  (NP the/DT Exchequer/NNP)
  ...)
  168. >>> print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  169. [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...]
>>> for tree in conll2002.chunked_sents()[:2]:
...     print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
(S
  (LOC Sao/NC Paulo/VMI)
  (/Fpa
  (LOC Brasil/NC)
  )/Fpt
  ...)
(S -/Fg)
.. note:: Since the CoNLL corpora do not contain paragraph break
   information, these readers do not support the ``paras()`` method.
.. warning:: If you call the CoNLL corpora reader methods without any
   arguments, they will return the contents of the entire corpus,
   *including* the 'test' portions of the corpus.
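To restrict processing to the training portion, name the training file explicitly (a minimal sketch, assuming the standard ``train.txt``/``test.txt`` fileids used by the CoNLL 2000 reader):
>>> conll2000.words('train.txt') # doctest: +ELLIPSIS
['Confidence', 'in', 'the', 'pound', ...]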
  184. SemCor is a subset of the Brown corpus tagged with WordNet senses and
  185. named entities. Both kinds of lexical items include multiword units,
  186. which are encoded as chunks (senses and part-of-speech tags pertain
  187. to the entire chunk).
  188. >>> from nltk.corpus import semcor
  189. >>> semcor.words()
  190. ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
  191. >>> semcor.chunks()
  192. [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ...]
  193. >>> semcor.sents() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  194. [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...],
  195. ['The', 'jury', 'further', 'said', ...], ...]
  196. >>> semcor.chunk_sents() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  197. [[['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ...
  198. ['.']], [['The'], ['jury'], ['further'], ['said'], ... ['.']], ...]
  199. >>> list(map(str, semcor.tagged_chunks(tag='both')[:3]))
  200. ['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", "(Lemma('state.v.01.say') (VB said))"]
  201. >>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]
  202. [['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ...
  203. '(None .)'], ['(DT The)', ... '(None .)']]
  204. The IEER corpus is another chunked corpus. This corpus is unusual in
  205. that each corpus item contains multiple documents. (This reflects the
  206. fact that each corpus file contains multiple documents.) The IEER
  207. corpus defines the `parsed_docs` method, which returns the documents
  208. in a given item as `IEERDocument` objects:
  209. >>> from nltk.corpus import ieer
  210. >>> ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
  211. ['APW_19980314', 'APW_19980424', 'APW_19980429',
  212. 'NYT_19980315', 'NYT_19980403', 'NYT_19980407']
  213. >>> docs = ieer.parsed_docs('APW_19980314')
  214. >>> print(docs[0])
  215. <IEERDocument APW19980314.0391: 'Kenyans protest tax hikes'>
  216. >>> print(docs[0].docno)
  217. APW19980314.0391
  218. >>> print(docs[0].doctype)
  219. NEWS STORY
  220. >>> print(docs[0].date_time)
  221. 03/14/1998 10:36:00
  222. >>> print(docs[0].headline)
  223. (DOCUMENT Kenyans protest tax hikes)
>>> print(docs[0].text) # doctest: +ELLIPSIS
(DOCUMENT
  (LOCATION NAIROBI)
  ,
  (LOCATION Kenya)
  (
  (ORGANIZATION AP)
  )
  _
  (CARDINAL Thousands)
  of
  laborers,
  ...
  on
  (DATE Saturday)
  ...)
  240. Parsed Corpora
  241. ==============
  242. The Treebank corpora provide a syntactic parse for each sentence. The
  243. NLTK data package includes a 10% sample of the Penn Treebank (in
  244. ``treebank``), as well as the Sinica Treebank (in ``sinica_treebank``).
  245. Reading the Penn Treebank (Wall Street Journal sample):
  246. >>> from nltk.corpus import treebank
  247. >>> print(treebank.fileids()) # doctest: +ELLIPSIS
  248. ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...]
  249. >>> print(treebank.words('wsj_0003.mrg'))
  250. ['A', 'form', 'of', 'asbestos', 'once', 'used', ...]
  251. >>> print(treebank.tagged_words('wsj_0003.mrg'))
  252. [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...]
>>> print(treebank.parsed_sents('wsj_0003.mrg')[0]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
(S
  (S-TPC-1
    (NP-SBJ
      (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos))))
      (RRC ...)...)...)
  ...
  (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1))))
  (. .))
  262. If you have access to a full installation of the Penn Treebank, NLTK
  263. can be configured to load it as well. Download the ``ptb`` package,
  264. and in the directory ``nltk_data/corpora/ptb`` place the ``BROWN``
  265. and ``WSJ`` directories of the Treebank installation (symlinks work
  266. as well). Then use the ``ptb`` module instead of ``treebank``:
  267. >>> from nltk.corpus import ptb
  268. >>> print(ptb.fileids()) # doctest: +SKIP
  269. ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG', ...]
  270. >>> print(ptb.words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP
  271. ['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...]
  272. >>> print(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP
  273. [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...]
  274. ...and so forth, like ``treebank`` but with extended fileids. Categories
  275. specified in ``allcats.txt`` can be used to filter by genre; they consist
  276. of ``news`` (for WSJ articles) and names of the Brown subcategories
  277. (``fiction``, ``humor``, ``romance``, etc.):
  278. >>> ptb.categories() # doctest: +SKIP
  279. ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction']
  280. >>> print(ptb.fileids('news')) # doctest: +SKIP
  281. ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG', ...]
  282. >>> print(ptb.words(categories=['humor','fiction'])) # doctest: +SKIP
  283. ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]
  284. As PropBank and NomBank depend on the (WSJ portion of the) Penn Treebank,
  285. the modules ``propbank_ptb`` and ``nombank_ptb`` are provided for access
  286. to a full PTB installation.
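These readers expose the same interface as the regular ``propbank`` and ``nombank`` modules; a minimal sketch (skipped here, since it requires the full Treebank to be installed as described above):
>>> from nltk.corpus import propbank_ptb, nombank_ptb
>>> propbank_ptb.instances() # doctest: +SKIP
>>> nombank_ptb.instances() # doctest: +SKIP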
  287. Reading the Sinica Treebank:
  288. >>> from nltk.corpus import sinica_treebank
  289. >>> print(sinica_treebank.sents()) # doctest: +SKIP
  290. [['\xe4\xb8\x80'], ['\xe5\x8f\x8b\xe6\x83\x85'], ...]
  291. >>> sinica_treebank.parsed_sents()[25] # doctest: +SKIP
  292. Tree('S',
  293. [Tree('NP',
  294. [Tree('Nba', ['\xe5\x98\x89\xe7\x8f\x8d'])]),
  295. Tree('V\xe2\x80\xa7\xe5\x9c\xb0',
  296. [Tree('VA11', ['\xe4\xb8\x8d\xe5\x81\x9c']),
  297. Tree('DE', ['\xe7\x9a\x84'])]),
  298. Tree('VA4', ['\xe5\x93\xad\xe6\xb3\xa3'])])
  299. Reading the CoNLL 2007 Dependency Treebanks:
  300. >>> from nltk.corpus import conll2007
  301. >>> conll2007.sents('esp.train')[0] # doctest: +SKIP
  302. ['El', 'aumento', 'del', 'índice', 'de', 'desempleo', ...]
  303. >>> conll2007.parsed_sents('esp.train')[0] # doctest: +SKIP
  304. <DependencyGraph with 38 nodes>
>>> print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP
(fortaleció
  (aumento El (del (índice (de (desempleo estadounidense)))))
  hoy
  considerablemente
  (al
    (euro
      (cotizaba
        ,
        que
        (a (15.35 las GMT))
        se
        (en (mercado el (de divisas) (de Fráncfort)))
        (a 0,9452_dólares)
        (frente_a , (0,9349_dólares los (de (mañana esta)))))))
  .)
  321. Word Lists and Lexicons
  322. =======================
  323. The NLTK data package also includes a number of lexicons and word
  324. lists. These are accessed just like text corpora. The following
  325. examples illustrate the use of the wordlist corpora:
  326. >>> from nltk.corpus import names, stopwords, words
  327. >>> words.fileids()
  328. ['en', 'en-basic']
  329. >>> words.words('en') # doctest: +ELLIPSIS
  330. ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', ...]
  331. >>> stopwords.fileids() # doctest: +ELLIPSIS
  332. ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', ...]
  333. >>> sorted(stopwords.words('portuguese')) # doctest: +ELLIPSIS
  334. ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...]
  335. >>> names.fileids()
  336. ['female.txt', 'male.txt']
  337. >>> names.words('male.txt') # doctest: +ELLIPSIS
  338. ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', ...]
  339. >>> names.words('female.txt') # doctest: +ELLIPSIS
  340. ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', ...]
The CMU Pronunciation Dictionary corpus contains pronunciation
transcriptions for over 100,000 words. It can be accessed as a list
of entries (where each entry consists of a word and its
transcription) or as a dictionary from words to lists of
transcriptions. Transcriptions are encoded as lists of phoneme
strings.
  347. >>> from nltk.corpus import cmudict
  348. >>> print(cmudict.entries()[653:659]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  349. [('acetate', ['AE1', 'S', 'AH0', 'T', 'EY2', 'T']),
  350. ('acetic', ['AH0', 'S', 'EH1', 'T', 'IH0', 'K']),
  351. ('acetic', ['AH0', 'S', 'IY1', 'T', 'IH0', 'K']),
  352. ('aceto', ['AA0', 'S', 'EH1', 'T', 'OW0']),
  353. ('acetochlor', ['AA0', 'S', 'EH1', 'T', 'OW0', 'K', 'L', 'AO2', 'R']),
  354. ('acetone', ['AE1', 'S', 'AH0', 'T', 'OW2', 'N'])]
  355. >>> # Load the entire cmudict corpus into a Python dictionary:
  356. >>> transcr = cmudict.dict()
  357. >>> print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()]) # doctest: +NORMALIZE_WHITESPACE
  358. [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'],
  359. ['L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH'],
  360. ['T', 'UW1', 'L'],
  361. ['K', 'IH1', 'T']]
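Since the dictionary maps each word to a *list* of transcriptions, a word with more than one pronunciation (such as 'acetic' in the entries above) yields multiple transcriptions:
>>> len(transcr['acetic'])
2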
  362. WordNet
  363. =======
  364. Please see the separate WordNet howto.
  365. FrameNet
  366. ========
  367. Please see the separate FrameNet howto.
  368. PropBank
  369. ========
  370. Please see the separate PropBank howto.
  371. SentiWordNet
  372. ============
  373. Please see the separate SentiWordNet howto.
  374. Categorized Corpora
  375. ===================
  376. Several corpora included with NLTK contain documents that have been categorized for
  377. topic, genre, polarity, etc. In addition to the standard corpus interface, these
  378. corpora provide access to the list of categories and the mapping between the documents
  379. and their categories (in both directions). Access the categories using the ``categories()``
  380. method, e.g.:
  381. >>> from nltk.corpus import brown, movie_reviews, reuters
  382. >>> brown.categories() # doctest: +NORMALIZE_WHITESPACE
  383. ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor',
  384. 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
  385. >>> movie_reviews.categories()
  386. ['neg', 'pos']
  387. >>> reuters.categories() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
  388. ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa',
  389. 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn',
  390. 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...]
  391. This method has an optional argument that specifies a document or a list
  392. of documents, allowing us to map from (one or more) documents to (one or more) categories:
  393. >>> brown.categories('ca01')
  394. ['news']
  395. >>> brown.categories(['ca01','cb01'])
  396. ['editorial', 'news']
  397. >>> reuters.categories('training/9865')
  398. ['barley', 'corn', 'grain', 'wheat']
  399. >>> reuters.categories(['training/9865', 'training/9880'])
  400. ['barley', 'corn', 'grain', 'money-fx', 'wheat']
  401. We can go back the other way using the optional argument of the ``fileids()`` method:
  402. >>> reuters.fileids('barley') # doctest: +ELLIPSIS
  403. ['test/15618', 'test/15649', 'test/15676', 'test/15728', 'test/15871', ...]
  404. Both the ``categories()`` and ``fileids()`` methods return a sorted list containing
  405. no duplicates.
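We can verify this directly for the example above:
>>> cats = reuters.categories(['training/9865', 'training/9880'])
>>> cats == sorted(set(cats))
True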
  406. In addition to mapping between categories and documents, these corpora permit
  407. direct access to their contents via the categories. Instead of accessing a subset
  408. of a corpus by specifying one or more fileids, we can identify one or more categories, e.g.:
  409. >>> brown.tagged_words(categories='news')
  410. [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
  411. >>> brown.sents(categories=['editorial','reviews']) # doctest: +NORMALIZE_WHITESPACE
  412. [['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General',
  413. 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed',
  414. 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from',
  415. 'the', 'day', 'it', 'convened', '.'], ...]
  416. Note that it is an error to specify both documents and categories.
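For example (a hedged sketch; the exact error message may vary between NLTK versions, so it is elided here):
>>> brown.words(fileids=['ca01'], categories=['news']) # doctest: +ELLIPSIS
Traceback (most recent call last):
    ...
ValueError: ...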
  417. In the context of a text categorization system, we can easily test if the
  418. category assigned to a document is correct as follows:
  419. >>> def classify(doc): return 'news' # Trivial classifier
  420. >>> doc = 'ca01'
  421. >>> classify(doc) in brown.categories(doc)
  422. True
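Building on this toy classifier, we can score it over every document in the 'news' category (by construction it classifies all of them correctly):
>>> news_docs = brown.fileids(categories='news')
>>> sum(classify(d) in brown.categories(d) for d in news_docs) / len(news_docs)
1.0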
  423. Other Corpora
  424. =============
  425. comparative_sentences
  426. ---------------------
  427. A list of sentences from various sources, especially reviews and articles. Each
  428. line contains one sentence; sentences were separated by using a sentence tokenizer.
  429. Comparative sentences have been annotated with their type, entities, features and
  430. keywords.
  431. >>> from nltk.corpus import comparative_sentences
  432. >>> comparison = comparative_sentences.comparisons()[0]
  433. >>> comparison.text
  434. ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
  435. 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
  436. 'had', '.']
  437. >>> comparison.entity_2
  438. 'models'
  439. >>> (comparison.feature, comparison.keyword)
  440. ('rewind', 'more')
  441. >>> len(comparative_sentences.comparisons())
  442. 853
  443. opinion_lexicon
  444. ---------------
  445. A list of positive and negative opinion words or sentiment words for English.
  446. >>> from nltk.corpus import opinion_lexicon
  447. >>> opinion_lexicon.words()[:4]
  448. ['2-faced', '2-faces', 'abnormal', 'abolish']
  449. The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative
  450. words:
  451. >>> opinion_lexicon.negative()[:4]
  452. ['2-faced', '2-faces', 'abnormal', 'abolish']
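Similarly, `positive()` returns the positive entries (the leading entries shown here assume the standard ordering of the underlying lexicon file):
>>> opinion_lexicon.positive()[:4]
['a+', 'abound', 'abounds', 'abundance']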
Note that the words returned by the `words()` method in opinion_lexicon are sorted by file id,
not alphabetically:
  455. >>> opinion_lexicon.words()[0:10]
  456. ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
  457. 'abominate', 'abomination', 'abort', 'aborted']
  458. >>> sorted(opinion_lexicon.words())[0:10]
  459. ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
  460. 'abominate', 'abomination', 'abort']
  461. ppattach
  462. --------
  463. The Prepositional Phrase Attachment corpus is a corpus of
  464. prepositional phrase attachment decisions. Each instance in the
  465. corpus is encoded as a ``PPAttachment`` object:
  466. >>> from nltk.corpus import ppattach
  467. >>> ppattach.attachments('training') # doctest: +NORMALIZE_WHITESPACE
  468. [PPAttachment(sent='0', verb='join', noun1='board',
  469. prep='as', noun2='director', attachment='V'),
  470. PPAttachment(sent='1', verb='is', noun1='chairman',
  471. prep='of', noun2='N.V.', attachment='N'),
  472. ...]
  473. >>> inst = ppattach.attachments('training')[0]
  474. >>> (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2)
  475. ('0', 'join', 'board', 'as', 'director')
  476. >>> inst.attachment
  477. 'V'
  478. product_reviews_1 and product_reviews_2
  479. ---------------------------------------
  480. These two datasets respectively contain annotated customer reviews of 5 and 9
  481. products from amazon.com.
  482. >>> from nltk.corpus import product_reviews_1
  483. >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
  484. >>> review = camera_reviews[0]
  485. >>> review.sents()[0]
  486. ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
  487. 'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
  488. >>> review.features()
  489. [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
  490. ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
  491. ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
  492. ('option', '+1')]
  493. It is also possible to reach the same information directly from the stream:
  494. >>> product_reviews_1.features('Canon_G3.txt')
  495. [('canon powershot g3', '+3'), ('use', '+2'), ...]
  496. We can compute stats for specific product features:
  497. >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
  498. >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
  499. >>> mean = tot / n_reviews
  500. >>> print(n_reviews, tot, mean)
  501. 15 24 1.6
  502. pros_cons
  503. ---------
  504. A list of pros/cons sentences for determining context (aspect) dependent
  505. sentiment words, which are then applied to sentiment analysis of comparative
  506. sentences.
  507. >>> from nltk.corpus import pros_cons
  508. >>> pros_cons.sents(categories='Cons')
  509. [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
  510. 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
  511. ...]
  512. >>> pros_cons.words('IntegratedPros.txt')
  513. ['Easy', 'to', 'use', ',', 'economical', '!', ...]
  514. semcor
  515. ------
  516. The Brown Corpus, annotated with WordNet senses.
  517. >>> from nltk.corpus import semcor
  518. >>> semcor.words('brown2/tagfiles/br-n12.xml') # doctest: +ELLIPSIS
  519. ['When', 'several', 'minutes', 'had', 'passed', ...]
  520. >>> sent = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')[0]
>>> for wordform in sent:
...     print(wordform.text, end=' ')
...     for key in sorted(wordform.keys()):
...         print(key + '=' + wordform.get(key), end=' ')
...     print()
...
  527. When cmd=ignore pos=WRB
  528. several cmd=done lemma=several lexsn=5:00:00:some(a):00 pos=JJ wnsn=1
  529. minutes cmd=done lemma=minute lexsn=1:28:00:: pos=NN wnsn=1
  530. had cmd=done ot=notag pos=VBD
  531. passed cmd=done lemma=pass lexsn=2:38:03:: pos=VB wnsn=4
  532. and cmd=ignore pos=CC
  533. Curt cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1
  534. had cmd=done ot=notag pos=VBD
  535. n't cmd=done lemma=n't lexsn=4:02:00:: pos=RB wnsn=0
  536. emerged cmd=done lemma=emerge lexsn=2:30:00:: pos=VB wnsn=1
  537. from cmd=ignore pos=IN
  538. the cmd=ignore pos=DT
  539. livery_stable cmd=done lemma=livery_stable lexsn=1:06:00:: pos=NN wnsn=1
  540. ,
  541. Brenner cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1
  542. re-entered cmd=done lemma=re-enter lexsn=2:38:00:: pos=VB wnsn=1
  543. the cmd=ignore pos=DT
  544. hotel cmd=done lemma=hotel lexsn=1:06:00:: pos=NN wnsn=1
  545. and cmd=ignore pos=CC
  546. faced cmd=done lemma=face lexsn=2:42:02:: pos=VB wnsn=4
  547. Summers cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1
  548. across cmd=ignore pos=IN
  549. the cmd=ignore pos=DT
  550. counter cmd=done lemma=counter lexsn=1:06:00:: pos=NN wnsn=1
  551. .
  552. senseval
  553. --------
  554. The Senseval 2 corpus is a word sense disambiguation corpus. Each
  555. item in the corpus corresponds to a single ambiguous word. For each
  556. of these words, the corpus contains a list of instances, corresponding
  557. to occurrences of that word. Each instance provides the word; a list
  558. of word senses that apply to the word occurrence; and the word's
  559. context.
  560. >>> from nltk.corpus import senseval
  561. >>> senseval.fileids()
  562. ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos']
  563. >>> senseval.instances('hard.pos')
  564. ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  565. [SensevalInstance(word='hard-a',
  566. position=20,
  567. context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...],
  568. senses=('HARD1',)),
  569. SensevalInstance(word='hard-a',
  570. position=10,
  571. context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...],
  572. senses=('HARD1',)), ...]
  573. The following code looks at instances of the word 'interest', and
  574. displays their local context (2 words on each side) and word sense(s):
>>> for inst in senseval.instances('interest.pos')[:10]:
...     p = inst.position
...     left = ' '.join(w for (w,t) in inst.context[p-2:p])
...     word = ' '.join(w for (w,t) in inst.context[p:p+1])
...     right = ' '.join(w for (w,t) in inst.context[p+1:p+3])
...     senses = ' '.join(inst.senses)
...     print('%20s |%10s | %-15s -> %s' % (left, word, right, senses))
         declines in |  interest | rates .         -> interest_6
  indicate declining |  interest | rates because   -> interest_6
       in short-term |  interest | rates .         -> interest_6
                 4 % |  interest | in this         -> interest_5
        company with | interests | in the          -> interest_5
              , plus |  interest | .               -> interest_6
             set the |  interest | rate on         -> interest_6
              's own |  interest | , prompted      -> interest_4
       principal and |  interest | is the          -> interest_6
        increase its |  interest | to 70           -> interest_5
  592. sentence_polarity
  593. -----------------
  594. The Sentence Polarity dataset contains 5331 positive and 5331 negative processed
  595. sentences.
  596. >>> from nltk.corpus import sentence_polarity
  597. >>> sentence_polarity.sents()
  598. [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
  599. 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
  600. 'it', 'funny', '.'], ...]
  601. >>> sentence_polarity.categories()
  602. ['neg', 'pos']
  603. >>> sentence_polarity.sents()[1]
  604. ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys',
  605. 'could', 'possibly', 'find', 'it', 'funny', '.']
  606. shakespeare
  607. -----------
  608. The Shakespeare corpus contains a set of Shakespeare plays, formatted
  609. as XML files. These corpora are returned as ElementTree objects:
  610. >>> from nltk.corpus import shakespeare
  611. >>> from xml.etree import ElementTree
  612. >>> shakespeare.fileids() # doctest: +ELLIPSIS
  613. ['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', ...]
  614. >>> play = shakespeare.xml('dream.xml')
  615. >>> print(play) # doctest: +ELLIPSIS
  616. <Element 'PLAY' at ...>
  617. >>> print('%s: %s' % (play[0].tag, play[0].text))
  618. TITLE: A Midsummer Night's Dream
  619. >>> personae = [persona.text for persona in
  620. ... play.findall('PERSONAE/PERSONA')]
  621. >>> print(personae) # doctest: +ELLIPSIS
  622. ['THESEUS, Duke of Athens.', 'EGEUS, father to Hermia.', ...]
  623. >>> # Find and print speakers not listed as personae
  624. >>> names = [persona.split(',')[0] for persona in personae]
  625. >>> speakers = set(speaker.text for speaker in
  626. ... play.findall('*/*/*/SPEAKER'))
  627. >>> print(sorted(speakers.difference(names))) # doctest: +NORMALIZE_WHITESPACE
  628. ['ALL', 'COBWEB', 'DEMETRIUS', 'Fairy', 'HERNIA', 'LYSANDER',
  629. 'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM',
  630. 'Prologue', 'Pyramus', 'Thisbe', 'Wall']
subjectivity
------------
  633. The Subjectivity Dataset contains 5000 subjective and 5000 objective processed
  634. sentences.
  635. >>> from nltk.corpus import subjectivity
  636. >>> subjectivity.categories()
  637. ['obj', 'subj']
  638. >>> subjectivity.sents()[23]
  639. ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
  640. 'happened', 'off', 'screen', '.']
  641. >>> subjectivity.words(categories='subj')
  642. ['smart', 'and', 'alert', ',', 'thirteen', ...]
  643. toolbox
  644. -------
  645. The Toolbox corpus distributed with NLTK contains a sample lexicon and
  646. several sample texts from the Rotokas language. The Toolbox corpus
  647. reader returns Toolbox files as XML ElementTree objects. The
  648. following example loads the Rotokas dictionary, and figures out the
  649. distribution of part-of-speech tags for reduplicated words.
  650. .. doctest: +SKIP
  651. >>> from nltk.corpus import toolbox
  652. >>> from nltk.probability import FreqDist
  653. >>> from xml.etree import ElementTree
  654. >>> import re
  655. >>> rotokas = toolbox.xml('rotokas.dic')
  656. >>> redup_pos_freqdist = FreqDist()
  657. >>> # Note: we skip over the first record, which is actually
  658. >>> # the header.
>>> for record in rotokas[1:]:
...     lexeme = record.find('lx').text
...     if re.match(r'(.*)\1$', lexeme):
...         redup_pos_freqdist[record.find('ps').text] += 1
>>> for item, count in redup_pos_freqdist.most_common():
...     print(item, count)
  665. V 41
  666. N 14
  667. ??? 4
  668. This example displays some records from a Rotokas text:
  669. .. doctest: +SKIP
  670. >>> river = toolbox.xml('rotokas/river.txt', key='ref')
>>> for record in river.findall('record')[:3]:
...     for piece in record:
...         if len(piece.text) > 60:
...             print('%-6s %s...' % (piece.tag, piece.text[:57]))
...         else:
...             print('%-6s %s' % (piece.tag, piece.text))
  677. ref Paragraph 1
  678. t ``Viapau oisio ra ovaupasi ...
  679. m viapau oisio ra ovau -pa -si ...
  680. g NEG this way/like this and forget -PROG -2/3.DL...
  681. p NEG ??? CONJ V.I -SUFF.V.3 -SUFF.V...
  682. f ``No ken lus tingting wanema samting papa i bin tok,'' Na...
  683. fe ``Don't forget what Dad said,'' yelled Naomi.
  684. ref 2
  685. t Osa Ira ora Reviti viapau uvupasiva.
  686. m osa Ira ora Reviti viapau uvu -pa -si ...
  687. g as/like name and name NEG hear/smell -PROG -2/3...
  688. p CONJ N.PN CONJ N.PN NEG V.T -SUFF.V.3 -SUF...
  689. f Tasol Ila na David no bin harim toktok.
  690. fe But Ila and David took no notice.
  691. ref 3
  692. t Ikaupaoro rokosiva ...
  693. m ikau -pa -oro roko -si -va ...
  694. g run/hurry -PROG -SIM go down -2/3.DL.M -RP ...
  695. p V.T -SUFF.V.3 -SUFF.V.4 ADV -SUFF.V.4 -SUFF.VT....
  696. f Tupela i bin hariap i go long wara .
  697. fe They raced to the river.
  698. timit
  699. -----
  700. The NLTK data package includes a fragment of the TIMIT
  701. Acoustic-Phonetic Continuous Speech Corpus. This corpus is broken
  702. down into small speech samples, each of which is available as a wave
  703. file, a phonetic transcription, and a tokenized word list.
  704. >>> from nltk.corpus import timit
  705. >>> print(timit.utteranceids()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  706. ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466',
  707. 'dr1-fvmh0/si2096', 'dr1-fvmh0/si836', 'dr1-fvmh0/sx116',
  708. 'dr1-fvmh0/sx206', 'dr1-fvmh0/sx26', 'dr1-fvmh0/sx296', ...]
  709. >>> item = timit.utteranceids()[5]
  710. >>> print(timit.phones(item)) # doctest: +NORMALIZE_WHITESPACE
  711. ['h#', 'k', 'l', 'ae', 's', 'pcl', 'p', 'dh', 'ax',
  712. 's', 'kcl', 'k', 'r', 'ux', 'ix', 'nx', 'y', 'ax',
  713. 'l', 'eh', 'f', 'tcl', 't', 'hh', 'ae', 'n', 'dcl',
  714. 'd', 'h#']
  715. >>> print(timit.words(item))
  716. ['clasp', 'the', 'screw', 'in', 'your', 'left', 'hand']
  717. >>> timit.play(item) # doctest: +SKIP
  718. The corpus reader can combine the word segmentation information with
  719. the phonemes to produce a single tree structure:
>>> for tree in timit.phone_trees(item):
...     print(tree)
(S
  h#
  (clasp k l ae s pcl p)
  (the dh ax)
  (screw s kcl k r ux)
  (in ix nx)
  (your y ax)
  (left l eh f tcl t)
  (hand hh ae n dcl d)
  h#)
  732. The start time and stop time of each phoneme, word, and sentence are
  733. also available:
  734. >>> print(timit.phone_times(item)) # doctest: +ELLIPSIS
  735. [('h#', 0, 2190), ('k', 2190, 3430), ('l', 3430, 4326), ...]
  736. >>> print(timit.word_times(item)) # doctest: +ELLIPSIS
  737. [('clasp', 2190, 8804), ('the', 8804, 9734), ...]
  738. >>> print(timit.sent_times(item))
  739. [('Clasp the screw in your left hand.', 0, 32154)]
  740. We can use these times to play selected pieces of a speech sample:
  741. >>> timit.play(item, 2190, 8804) # 'clasp' # doctest: +SKIP
  742. The corpus reader can also be queried for information about the
  743. speaker and sentence identifier for a given speech sample:
  744. >>> print(timit.spkrid(item))
  745. dr1-fvmh0
  746. >>> print(timit.sentid(item))
  747. sx116
  748. >>> print(timit.spkrinfo(timit.spkrid(item))) # doctest: +NORMALIZE_WHITESPACE
  749. SpeakerInfo(id='VMH0',
  750. sex='F',
  751. dr='1',
  752. use='TRN',
  753. recdate='03/11/86',
  754. birthdate='01/08/60',
  755. ht='5\'05"',
  756. race='WHT',
  757. edu='BS',
  758. comments='BEST NEW ENGLAND ACCENT SO FAR')
  759. >>> # List the speech samples from the same speaker:
  760. >>> timit.utteranceids(spkrid=timit.spkrid(item)) # doctest: +ELLIPSIS
  761. ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...]
  762. twitter_samples
  763. ---------------
Twitter is a well-known microblogging service that allows public data to be
collected via APIs. NLTK's twitter corpus currently contains a sample of 20k Tweets
retrieved from the Twitter Streaming API.
  767. >>> from nltk.corpus import twitter_samples
  768. >>> twitter_samples.fileids()
  769. ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
  770. We follow standard practice in storing full Tweets as line-separated
  771. JSON. These data structures can be accessed via `tweets.docs()`. However, in general it
  772. is more practical to focus just on the text field of the Tweets, which
  773. are accessed via the `strings()` method.
  774. >>> twitter_samples.strings('tweets.20150430-223406.json')
  775. ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...]
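The full JSON structures mentioned above can be inspected via the `docs()` method; a minimal sketch (assuming the standard Tweet fields delivered by the Streaming API):
>>> tweet = twitter_samples.docs('tweets.20150430-223406.json')[0]
>>> 'text' in tweet and 'created_at' in tweet
True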
  776. The default tokenizer for Tweets is specialised for 'casual' text, and
  777. the `tokenized()` method returns a list of lists of tokens.
  778. >>> twitter_samples.tokenized('tweets.20150430-223406.json')
  779. [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...],
  780. ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...]
  781. rte
  782. ---
  783. The RTE (Recognizing Textual Entailment) corpus was derived from the
  784. RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a
  785. list of XML-formatted 'text'/'hypothesis' pairs.
  786. >>> from nltk.corpus import rte
  787. >>> print(rte.fileids()) # doctest: +ELLIPSIS
  788. ['rte1_dev.xml', 'rte1_test.xml', 'rte2_dev.xml', ..., 'rte3_test.xml']
  789. >>> rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
  790. >>> print(rtepairs) # doctest: +ELLIPSIS
  791. [<RTEPair: gid=2-8>, <RTEPair: gid=2-9>, <RTEPair: gid=2-15>, ...]
  792. In the gold standard test sets, each pair is labeled according to
  793. whether or not the text 'entails' the hypothesis; the
  794. entailment value is mapped to an integer 1 (True) or 0 (False).
  795. >>> rtepairs[5]
  796. <RTEPair: gid=2-23>
  797. >>> rtepairs[5].text # doctest: +NORMALIZE_WHITESPACE
  798. 'His wife Strida won a seat in parliament after forging an alliance
  799. with the main anti-Syrian coalition in the recent election.'
  800. >>> rtepairs[5].hyp
  801. 'Strida elected to parliament.'
  802. >>> rtepairs[5].value
  803. 1
  804. The RTE corpus also supports an ``xml()`` method which produces ElementTrees.
  805. >>> xmltree = rte.xml('rte3_dev.xml')
  806. >>> xmltree # doctest: +SKIP
  807. <Element entailment-corpus at ...>
  808. >>> xmltree[7].findtext('t') # doctest: +NORMALIZE_WHITESPACE
  809. "Mrs. Bush's approval ratings have remained very high, above 80%,
  810. even as her husband's have recently dropped below 50%."
  811. verbnet
  812. -------
  813. The VerbNet corpus is a lexicon that divides verbs into classes, based
  814. on their syntax-semantics linking behavior. The basic elements in the
  815. lexicon are verb lemmas, such as 'abandon' and 'accept', and verb
  816. classes, which have identifiers such as 'remove-10.1' and
  817. 'admire-31.2-1'. These class identifiers consist of a representative
  818. verb selected from the class, followed by a numerical identifier. The
  819. list of verb lemmas, and the list of class identifiers, can be
  820. retrieved with the following methods:
  821. >>> from nltk.corpus import verbnet
  822. >>> verbnet.lemmas()[20:25]
  823. ['accelerate', 'accept', 'acclaim', 'accompany', 'accrue']
  824. >>> verbnet.classids()[:5]
  825. ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93']
  826. The `classids()` method may also be used to retrieve the classes that
  827. a given lemma belongs to:
  828. >>> verbnet.classids('accept')
  829. ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2']
  830. The `classids()` method may additionally be used to retrieve all classes
  831. within verbnet if nothing is passed:
  832. >>> verbnet.classids()
  833. ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93', 'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2', 'amalgamate-22.2-1', 'amalgamate-22.2-1-1', 'amalgamate-22.2-2', 'amalgamate-22.2-2-1', 'amalgamate-22.2-3', 'amalgamate-22.2-3-1', 'amalgamate-22.2-3-1-1', 'amalgamate-22.2-3-2', 'amuse-31.1', 'animal_sounds-38', 'appeal-31.4', 'appeal-31.4-1', 'appeal-31.4-2', 'appeal-31.4-3', 'appear-48.1.1', 'appoint-29.1', 'approve-77', 'assessment-34', 'assuming_position-50', 'avoid-52', 'banish-10.2', 'battle-36.4', 'battle-36.4-1', 'begin-55.1', 'begin-55.1-1', 'being_dressed-41.3.3', 'bend-45.2', 'berry-13.7', 'bill-54.5', 'body_internal_motion-49', 'body_internal_states-40.6', 'braid-41.2.2', 'break-45.1', 'breathe-40.1.2', 'breathe-40.1.2-1', 'bring-11.3', 'bring-11.3-1', 'build-26.1', 'build-26.1-1', 'bulge-47.5.3', 'bump-18.4', 'bump-18.4-1', 'butter-9.9', 'calibratable_cos-45.6', 'calibratable_cos-45.6-1', 'calve-28', 'captain-29.8', 'captain-29.8-1', 'captain-29.8-1-1', 'care-88', 'care-88-1', 'carry-11.4', 'carry-11.4-1', 'carry-11.4-1-1', 'carve-21.2', 'carve-21.2-1', 'carve-21.2-2', 'change_bodily_state-40.8.4', 'characterize-29.2', 'characterize-29.2-1', 'characterize-29.2-1-1', 'characterize-29.2-1-2', 'chase-51.6', 'cheat-10.6', 'cheat-10.6-1', 'cheat-10.6-1-1', 'chew-39.2', 'chew-39.2-1', 'chew-39.2-2', 'chit_chat-37.6', 'clear-10.3', 'clear-10.3-1', 'cling-22.5', 'coil-9.6', 'coil-9.6-1', 'coloring-24', 'complain-37.8', 'complete-55.2', 'concealment-16', 'concealment-16-1', 'confess-37.10', 'confine-92', 'confine-92-1', 'conjecture-29.5', 'conjecture-29.5-1', 'conjecture-29.5-2', 'consider-29.9', 'consider-29.9-1', 'consider-29.9-1-1', 'consider-29.9-1-1-1', 'consider-29.9-2', 'conspire-71', 'consume-66', 'consume-66-1', 'contiguous_location-47.8', 'contiguous_location-47.8-1', 'contiguous_location-47.8-2', 'continue-55.3', 'contribute-13.2', 'contribute-13.2-1', 'contribute-13.2-1-1', 'contribute-13.2-1-1-1', 'contribute-13.2-2', 'contribute-13.2-2-1', 'convert-26.6.2', 'convert-26.6.2-1', 'cooking-45.3', 'cooperate-73', 'cooperate-73-1', 'cooperate-73-2', 'cooperate-73-3', 'cope-83', 'cope-83-1', 'cope-83-1-1', 'correlate-86', 'correspond-36.1', 'correspond-36.1-1', 'correspond-36.1-1-1', 'cost-54.2', 'crane-40.3.2', 'create-26.4', 'create-26.4-1', 'curtsey-40.3.3', 'cut-21.1', 'cut-21.1-1', 'debone-10.8', 'declare-29.4', 'declare-29.4-1', 'declare-29.4-1-1', 'declare-29.4-1-1-1', 'declare-29.4-1-1-2', 'declare-29.4-1-1-3', 'declare-29.4-2', 'dedicate-79', 'defend-85', 'destroy-44', 'devour-39.4', 'devour-39.4-1', 'devour-39.4-2', 'differ-23.4', 'dine-39.5', 'disappearance-48.2', 'disassemble-23.3', 'discover-84', 'discover-84-1', 'discover-84-1-1', 'dress-41.1.1', 'dressing_well-41.3.2', 'drive-11.5', 'drive-11.5-1', 'dub-29.3', 'dub-29.3-1', 'eat-39.1', 'eat-39.1-1', 'eat-39.1-2', 'enforce-63', 'engender-27', 'entity_specific_cos-45.5', 'entity_specific_modes_being-47.2', 'equip-13.4.2', 'equip-13.4.2-1', 'equip-13.4.2-1-1', 'escape-51.1', 'escape-51.1-1', 'escape-51.1-2', 'escape-51.1-2-1', 'exceed-90', 'exchange-13.6', 'exchange-13.6-1', 'exchange-13.6-1-1', 'exhale-40.1.3', 'exhale-40.1.3-1', 'exhale-40.1.3-2', 'exist-47.1', 'exist-47.1-1', 'exist-47.1-1-1', 'feeding-39.7', 'ferret-35.6', 'fill-9.8', 'fill-9.8-1', 'fit-54.3', 'flinch-40.5', 'floss-41.2.1', 'focus-87', 'forbid-67', 'force-59', 'force-59-1', 'free-80', 'free-80-1', 'fulfilling-13.4.1', 'fulfilling-13.4.1-1', 'fulfilling-13.4.1-2', 'funnel-9.3', 'funnel-9.3-1', 
'funnel-9.3-2', 'funnel-9.3-2-1', 'future_having-13.3', 'get-13.5.1', 'get-13.5.1-1', 'give-13.1', 'give-13.1-1', 'gobble-39.3', 'gobble-39.3-1', 'gobble-39.3-2', 'gorge-39.6', 'groom-41.1.2', 'grow-26.2', 'help-72', 'help-72-1', 'herd-47.5.2', 'hiccup-40.1.1', 'hit-18.1', 'hit-18.1-1', 'hold-15.1', 'hold-15.1-1', 'hunt-35.1', 'hurt-40.8.3', 'hurt-40.8.3-1', 'hurt-40.8.3-1-1', 'hurt-40.8.3-2', 'illustrate-25.3', 'image_impression-25.1', 'indicate-78', 'indicate-78-1', 'indicate-78-1-1', 'inquire-37.1.2', 'instr_communication-37.4', 'investigate-35.4', 'judgement-33', 'keep-15.2', 'knead-26.5', 'learn-14', 'learn-14-1', 'learn-14-2', 'learn-14-2-1', 'leave-51.2', 'leave-51.2-1', 'lecture-37.11', 'lecture-37.11-1', 'lecture-37.11-1-1', 'lecture-37.11-2', 'light_emission-43.1', 'limit-76', 'linger-53.1', 'linger-53.1-1', 'lodge-46', 'long-32.2', 'long-32.2-1', 'long-32.2-2', 'manner_speaking-37.3', 'marry-36.2', 'marvel-31.3', 'marvel-31.3-1', 'marvel-31.3-2', 'marvel-31.3-3', 'marvel-31.3-4', 'marvel-31.3-5', 'marvel-31.3-6', 'marvel-31.3-7', 'marvel-31.3-8', 'marvel-31.3-9', 'masquerade-29.6', 'masquerade-29.6-1', 'masquerade-29.6-2', 'matter-91', 'meander-47.7', 'meet-36.3', 'meet-36.3-1', 'meet-36.3-2', 'mine-10.9', 'mix-22.1', 'mix-22.1-1', 'mix-22.1-1-1', 'mix-22.1-2', 'mix-22.1-2-1', 'modes_of_being_with_motion-47.3', 'murder-42.1', 'murder-42.1-1', 'neglect-75', 'neglect-75-1', 'neglect-75-1-1', 'neglect-75-2', 'nonvehicle-51.4.2', 'nonverbal_expression-40.2', 'obtain-13.5.2', 'obtain-13.5.2-1', 'occurrence-48.3', 'order-60', 'order-60-1', 'orphan-29.7', 'other_cos-45.4', 'pain-40.8.1', 'pay-68', 'peer-30.3', 'pelt-17.2', 'performance-26.7', 'performance-26.7-1', 'performance-26.7-1-1', 'performance-26.7-2', 'performance-26.7-2-1', 'pit-10.7', 'pocket-9.10', 'pocket-9.10-1', 'poison-42.2', 'poke-19', 'pour-9.5', 'preparing-26.3', 'preparing-26.3-1', 'preparing-26.3-2', 'price-54.4', 'push-12', 'push-12-1', 'push-12-1-1', 'put-9.1', 'put-9.1-1', 'put-9.1-2', 'put_direction-9.4', 'put_spatial-9.2', 'put_spatial-9.2-1', 'reach-51.8', 'reflexive_appearance-48.1.2', 'refrain-69', 'register-54.1', 'rely-70', 'remove-10.1', 'risk-94', 'risk-94-1', 'roll-51.3.1', 'rummage-35.5', 'run-51.3.2', 'rush-53.2', 'say-37.7', 'say-37.7-1', 'say-37.7-1-1', 'say-37.7-2', 'scribble-25.2', 'search-35.2', 'see-30.1', 'see-30.1-1', 'see-30.1-1-1', 'send-11.1', 'send-11.1-1', 'separate-23.1', 'separate-23.1-1', 'separate-23.1-2', 'settle-89', 'shake-22.3', 'shake-22.3-1', 'shake-22.3-1-1', 'shake-22.3-2', 'shake-22.3-2-1', 'sight-30.2', 'simple_dressing-41.3.1', 'slide-11.2', 'slide-11.2-1-1', 'smell_emission-43.3', 'snooze-40.4', 'sound_emission-43.2', 'sound_existence-47.4', 'spank-18.3', 'spatial_configuration-47.6', 'split-23.2', 'spray-9.7', 'spray-9.7-1', 'spray-9.7-1-1', 'spray-9.7-2', 'stalk-35.3', 'steal-10.5', 'stimulus_subject-30.4', 'stop-55.4', 'stop-55.4-1', 'substance_emission-43.4', 'succeed-74', 'succeed-74-1', 'succeed-74-1-1', 'succeed-74-2', 'suffocate-40.7', 'suspect-81', 'swarm-47.5.1', 'swarm-47.5.1-1', 'swarm-47.5.1-2', 'swarm-47.5.1-2-1', 'swat-18.2', 'talk-37.5', 'tape-22.4', 'tape-22.4-1', 'tell-37.2', 'throw-17.1', 'throw-17.1-1', 'throw-17.1-1-1', 'tingle-40.8.2', 'touch-20', 'touch-20-1', 'transcribe-25.4', 'transfer_mesg-37.1.1', 'transfer_mesg-37.1.1-1', 'transfer_mesg-37.1.1-1-1', 'try-61', 'turn-26.6.1', 'turn-26.6.1-1', 'urge-58', 'vehicle-51.4.1', 'vehicle-51.4.1-1', 'waltz-51.5', 'want-32.1', 'want-32.1-1', 'want-32.1-1-1', 'weather-57', 'weekend-56', 'wink-40.3.1', 
'wink-40.3.1-1', 'wipe_instr-10.4.2', 'wipe_instr-10.4.2-1', 'wipe_manner-10.4.1', 'wipe_manner-10.4.1-1', 'wish-62', 'withdraw-82', 'withdraw-82-1', 'withdraw-82-2', 'withdraw-82-3']
  834. The primary object in the lexicon is a class record, which is stored
  835. as an ElementTree xml object. The class record for a given class
  836. identifier is returned by the `vnclass()` method:
  837. >>> verbnet.vnclass('remove-10.1') # doctest: +ELLIPSIS
  838. <Element 'VNCLASS' at ...>
  839. The `vnclass()` method also accepts "short" identifiers, such as '10.1':
  840. >>> verbnet.vnclass('10.1') # doctest: +ELLIPSIS
  841. <Element 'VNCLASS' at ...>
  842. See the Verbnet documentation, or the Verbnet files, for information
  843. about the structure of this xml. As an example, we can retrieve a
  844. list of thematic roles for a given Verbnet class:
  845. >>> vn_31_2 = verbnet.vnclass('admire-31.2')
  846. >>> for themrole in vn_31_2.findall('THEMROLES/THEMROLE'):
847. ...     print(themrole.attrib['type'], end=' ')
848. ...     for selrestr in themrole.findall('SELRESTRS/SELRESTR'):
849. ...         print('[%(Value)s%(type)s]' % selrestr.attrib, end=' ')
850. ...     print()
  851. Theme
  852. Experiencer [+animate]
  853. Predicate
  854. The Verbnet corpus also provides a variety of pretty printing
  855. functions that can be used to display the xml contents in a more
  856. concise form. The simplest such method is `pprint()`:
  857. >>> print(verbnet.pprint('57'))
  858. weather-57
  859. Subclasses: (none)
  860. Members: blow clear drizzle fog freeze gust hail howl lightning mist
  861. mizzle pelt pour precipitate rain roar shower sleet snow spit spot
  862. sprinkle storm swelter teem thaw thunder
  863. Thematic roles:
  864. * Theme[+concrete +force]
  865. Frames:
  866. Intransitive (Expletive Subject)
  867. Example: It's raining.
  868. Syntax: LEX[it] LEX[[+be]] VERB
  869. Semantics:
  870. * weather(during(E), Weather_type, ?Theme)
  871. NP (Expletive Subject, Theme Object)
  872. Example: It's raining cats and dogs.
  873. Syntax: LEX[it] LEX[[+be]] VERB NP[Theme]
  874. Semantics:
  875. * weather(during(E), Weather_type, Theme)
  876. PP (Expletive Subject, Theme-PP)
  877. Example: It was pelting with rain.
  878. Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme]
  879. Semantics:
  880. * weather(during(E), Weather_type, Theme)
881. Verbnet provides frames that pair each syntactic pattern with its semantics,
882. illustrated by an example sentence. These frames are part of the corpus, and
883. we can use `frames()` to retrieve the frames for a given Verbnet class.
  884. >>> frame = verbnet.frames('57')
  885. >>> frame == [{'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': '?Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'Intransitive', 'secondary': 'Expletive Subject'}}, {'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': 'Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining cats and dogs.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'NP', 'secondary': 'Expletive Subject, Theme Object'}}, {'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': 'Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': 'It was pelting with rain.', 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'PREP', 'modifiers': {'value': 'with', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'synrestrs': [], 'selrestrs': []}}], 'description': {'primary': 'PP', 'secondary': 'Expletive Subject, Theme-PP'}}]
  886. True
887. The Verbnet corpus also lets us access thematic roles individually, using `themroles()`.
  888. >>> themroles = verbnet.themroles('57')
  889. >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}]
  890. True
891. Verbnet classes may also have subclasses, which share the syntactic and semantic
892. properties of their superclass while differing from it in certain respects. The
893. Verbnet corpus allows us to access these subclasses using `subclasses()`.
894. >>> print(verbnet.subclasses('9.1'))  # testing with 9.1, since '57' has no subclasses
  895. ['put-9.1-1', 'put-9.1-2']
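Subclass identifiers are themselves class identifiers, so they can generally be
passed back to the other methods to walk the hierarchy. A small sketch (not
executed here, since the role inventory of each subclass can vary between
Verbnet releases):

>>> for subclass in verbnet.subclasses('put-9.1'): # doctest: +SKIP
...     print(subclass, [role['type'] for role in verbnet.themroles(subclass)])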
  896. nps_chat
  897. --------
  898. The NPS Chat Corpus, Release 1.0 consists of over 10,000 posts in age-specific
  899. chat rooms, which have been anonymized, POS-tagged and dialogue-act tagged.
  900. >>> print(nltk.corpus.nps_chat.words())
  901. ['now', 'im', 'left', 'with', 'this', 'gay', ...]
  902. >>> print(nltk.corpus.nps_chat.tagged_words())
  903. [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]
  904. >>> print(nltk.corpus.nps_chat.tagged_posts()) # doctest: +NORMALIZE_WHITESPACE
  905. [[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'),
  906. ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN')], [(':P', 'UH')], ...]
  907. We can access the XML elements corresponding to individual posts. These elements
  908. have ``class`` and ``user`` attributes that we can access using ``p.attrib['class']``
  909. and ``p.attrib['user']``. They also have text content, accessed using ``p.text``.
  910. >>> print(nltk.corpus.nps_chat.xml_posts()) # doctest: +ELLIPSIS
  911. [<Element 'Post' at 0...>, <Element 'Post' at 0...>, ...]
  912. >>> posts = nltk.corpus.nps_chat.xml_posts()
  913. >>> sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys())
  914. ['Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion', 'Emphasis',
  915. 'Greet', 'Other', 'Reject', 'Statement', 'System', 'nAnswer',
  916. 'whQuestion', 'yAnswer', 'ynQuestion']
  917. >>> posts[0].text
  918. 'now im left with this gay name'
  919. In addition to the above methods for accessing tagged text, we can navigate
  920. the XML structure directly, as follows:
  921. >>> tokens = posts[0].findall('terminals/t')
  922. >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens]
  923. ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name']
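The ``user`` attribute can be queried in the same way as ``class``; for
example, we could build a frequency distribution over the (anonymized)
usernames. The counts depend on the corpus data, so this sketch is not
executed here:

>>> user_freqs = nltk.FreqDist(p.attrib['user'] for p in posts) # doctest: +SKIP
>>> user_freqs.most_common(3) # doctest: +SKIP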
  924. multext_east
  925. ------------
  926. The Multext-East Corpus consists of POS-tagged versions of George Orwell's book
  927. 1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian,
  928. Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish.
  929. The corpus can be accessed using the usual methods for tagged corpora. The tagset
  930. can be transformed from the Multext-East specific MSD tags to the Universal tagset
  931. using the "tagset" parameter of all functions returning tagged parts of the corpus.
  932. >>> print(nltk.corpus.multext_east.words("oana-en.xml"))
  933. ['It', 'was', 'a', 'bright', ...]
  934. >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml"))
  935. [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...]
  936. >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal"))
  937. [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...]
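The universal tags make it easier to compare annotations across languages.
For instance, assuming ``tagged_words()`` accepts the same "tagset" argument,
a tag frequency distribution for the English text could be built as follows
(skipped here, since the exact counts depend on the corpus version):

>>> tagged = nltk.corpus.multext_east.tagged_words("oana-en.xml", "universal") # doctest: +SKIP
>>> nltk.FreqDist(tag for word, tag in tagged).most_common(5) # doctest: +SKIP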
  938. ---------------------
  939. Corpus Reader Classes
  940. ---------------------
  941. NLTK's *corpus reader* classes are used to access the contents of a
  942. diverse set of corpora. Each corpus reader class is specialized to
  943. handle a specific corpus format. Examples include the
  944. `PlaintextCorpusReader`, which handles corpora that consist of a set
  945. of unannotated text files, and the `BracketParseCorpusReader`, which
  946. handles corpora that consist of files containing
  947. parenthesis-delineated parse trees.
  948. Automatically Created Corpus Reader Instances
  949. =============================================
  950. When the `nltk.corpus` module is imported, it automatically creates a
  951. set of corpus reader instances that can be used to access the corpora
  952. in the NLTK data distribution. Here is a small sample of those
  953. corpus reader instances:
  954. >>> import nltk
  955. >>> nltk.corpus.brown # doctest: +ELLIPSIS
  956. <CategorizedTaggedCorpusReader ...>
  957. >>> nltk.corpus.treebank # doctest: +ELLIPSIS
  958. <BracketParseCorpusReader ...>
  959. >>> nltk.corpus.names # doctest: +ELLIPSIS
  960. <WordListCorpusReader ...>
  961. >>> nltk.corpus.genesis # doctest: +ELLIPSIS
  962. <PlaintextCorpusReader ...>
  963. >>> nltk.corpus.inaugural # doctest: +ELLIPSIS
  964. <PlaintextCorpusReader ...>
  965. This sample illustrates that different corpus reader classes are used
  966. to read different corpora; but that the same corpus reader class may
  967. be used for more than one corpus (e.g., ``genesis`` and ``inaugural``).
  968. Creating New Corpus Reader Instances
  969. ====================================
  970. Although the `nltk.corpus` module automatically creates corpus reader
  971. instances for the corpora in the NLTK data distribution, you may
  972. sometimes need to create your own corpus reader. In particular, you
  973. would need to create your own corpus reader if you want...
  974. - To access a corpus that is not included in the NLTK data
  975. distribution.
  976. - To access a full copy of a corpus for which the NLTK data
  977. distribution only provides a sample.
  978. - To access a corpus using a customized corpus reader (e.g., with
  979. a customized tokenizer).
  980. To create a new corpus reader, you will first need to look up the
  981. signature for that corpus reader's constructor. Different corpus
  982. readers have different constructor signatures, but most of the
  983. constructor signatures have the basic form::
984.     SomeCorpusReader(root, files, ...options...)
  985. Where ``root`` is an absolute path to the directory containing the
  986. corpus data files; ``files`` is either a list of file names (relative
  987. to ``root``) or a regexp specifying which files should be included;
  988. and ``options`` are additional reader-specific options. For example,
  989. we can create a customized corpus reader for the genesis corpus that
  990. uses a different sentence tokenizer as follows:
  991. >>> # Find the directory where the corpus lives.
  992. >>> genesis_dir = nltk.data.find('corpora/genesis')
  993. >>> # Create our custom sentence tokenizer.
  994. >>> my_sent_tokenizer = nltk.RegexpTokenizer('[^.!?]+')
  995. >>> # Create the new corpus reader object.
  996. >>> my_genesis = nltk.corpus.PlaintextCorpusReader(
997. ...     genesis_dir, r'.*\.txt', sent_tokenizer=my_sent_tokenizer)
  998. >>> # Use the new corpus reader object.
  999. >>> print(my_genesis.sents('english-kjv.txt')[0]) # doctest: +NORMALIZE_WHITESPACE
  1000. ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
  1001. 'and', 'the', 'earth']
  1002. If you wish to read your own plaintext corpus, which is stored in the
  1003. directory '/usr/share/some-corpus', then you can create a corpus
  1004. reader for it with::
  1005. >>> my_corpus = nltk.corpus.PlaintextCorpusReader(
1006. ...     '/usr/share/some-corpus', r'.*\.txt') # doctest: +SKIP
  1007. For a complete list of corpus reader subclasses, see the API
  1008. documentation for `nltk.corpus.reader`.
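If you are unsure which corpus reader class to use, one quick way to see what
is available in your installation is to inspect the `nltk.corpus.reader`
namespace; the output is omitted here because it varies with the NLTK version:

>>> import nltk.corpus.reader # doctest: +SKIP
>>> sorted(name for name in dir(nltk.corpus.reader) if name.endswith('CorpusReader')) # doctest: +SKIP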
  1009. Corpus Types
  1010. ============
  1011. Corpora vary widely in the types of content they include. This is
  1012. reflected in the fact that the base class `CorpusReader` only defines
  1013. a few general-purpose methods for listing and accessing the files that
  1014. make up a corpus. It is up to the subclasses to define *data access
  1015. methods* that provide access to the information in the corpus.
  1016. However, corpus reader subclasses should be consistent in their
  1017. definitions of these data access methods wherever possible.
  1018. At a high level, corpora can be divided into three basic types:
  1019. - A *token corpus* contains information about specific occurrences of
  1020. language use (or linguistic tokens), such as dialogues or written
  1021. texts. Examples of token corpora are collections of written text
  1022. and collections of speech.
  1023. - A *type corpus*, or *lexicon*, contains information about a coherent
  1024. set of lexical items (or linguistic types). Examples of lexicons
  1025. are dictionaries and word lists.
  1026. - A *language description corpus* contains information about a set of
  1027. non-lexical linguistic constructs, such as grammar rules.
  1028. However, many individual corpora blur the distinctions between these
  1029. types. For example, corpora that are primarily lexicons may include
  1030. token data in the form of example sentences; and corpora that are
  1031. primarily token corpora may be accompanied by one or more word lists
  1032. or other lexical data sets.
  1033. Because corpora vary so widely in their information content, we have
  1034. decided that it would not be wise to use separate corpus reader base
  1035. classes for different corpus types. Instead, we simply try to make
  1036. the corpus readers consistent wherever possible, but let them differ
  1037. where the underlying data itself differs.
  1038. Common Corpus Reader Methods
  1039. ============================
  1040. As mentioned above, there are only a handful of methods that all
  1041. corpus readers are guaranteed to implement. These methods provide
  1042. access to the files that contain the corpus data. Every corpus is
  1043. assumed to consist of one or more files, all located in a common root
  1044. directory (or in subdirectories of that root directory). The absolute
  1045. path to the root directory is stored in the ``root`` property:
  1046. >>> import os
  1047. >>> str(nltk.corpus.genesis.root).replace(os.path.sep,'/') # doctest: +ELLIPSIS
  1048. '.../nltk_data/corpora/genesis'
  1049. Each file within the corpus is identified by a platform-independent
  1050. identifier, which is basically a path string that uses ``/`` as the
  1051. path separator. I.e., this identifier can be converted to a relative
  1052. path as follows:
  1053. >>> some_corpus_file_id = nltk.corpus.reuters.fileids()[0]
  1054. >>> import os.path
  1055. >>> os.path.normpath(some_corpus_file_id).replace(os.path.sep,'/')
  1056. 'test/14826'
  1057. To get a list of all data files that make up a corpus, use the
  1058. ``fileids()`` method. In some corpora, these files will not all contain
  1059. the same type of data; for example, for the ``nltk.corpus.timit``
  1060. corpus, ``fileids()`` will return a list including text files, word
  1061. segmentation files, phonetic transcription files, sound files, and
  1062. metadata files. For corpora with diverse file types, the ``fileids()``
  1063. method will often take one or more optional arguments, which can be
  1064. used to get a list of the files with a specific file type:
  1065. >>> nltk.corpus.timit.fileids() # doctest: +ELLIPSIS
  1066. ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...]
  1067. >>> nltk.corpus.timit.fileids('phn') # doctest: +ELLIPSIS
  1068. ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa2.phn', 'dr1-fvmh0/si1466.phn', ...]
  1069. In some corpora, the files are divided into distinct categories. For
  1070. these corpora, the ``fileids()`` method takes an optional argument,
  1071. which can be used to get a list of the files within a specific category:
  1072. >>> nltk.corpus.brown.fileids('hobbies') # doctest: +ELLIPSIS
  1073. ['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', ...]
  1074. The ``abspath()`` method can be used to find the absolute path to a
  1075. corpus file, given its file identifier:
  1076. >>> str(nltk.corpus.brown.abspath('ce06')).replace(os.path.sep,'/') # doctest: +ELLIPSIS
  1077. '.../corpora/brown/ce06'
  1078. The ``abspaths()`` method can be used to find the absolute paths for
1079. one corpus file, a list of corpus files, or (if no fileids are specified)
1080. all corpus files.
  1081. This method is mainly useful as a helper method when defining corpus
  1082. data access methods, since data access methods can usually be called
  1083. with a string argument (to get a view for a specific file), with a
  1084. list argument (to get a view for a specific list of files), or with no
  1085. argument (to get a view for the whole corpus).
  1086. Data Access Methods
  1087. ===================
  1088. Individual corpus reader subclasses typically extend this basic set of
  1089. file-access methods with one or more *data access methods*, which provide
  1090. easy access to the data contained in the corpus. The signatures for
  1091. data access methods often have the basic form::
1092.     corpus_reader.some_data_access(fileids=None, ...options...)
  1093. Where ``fileids`` can be a single file identifier string (to get a view
  1094. for a specific file); a list of file identifier strings (to get a view
  1095. for a specific list of files); or None (to get a view for the entire
  1096. corpus). Some of the common data access methods, and their return
  1097. types, are:
1098. - ``corpus``.words(): list of str
1099. - ``corpus``.sents(): list of (list of str)
1100. - ``corpus``.paras(): list of (list of (list of str))
1101. - ``corpus``.tagged_words(): list of (str,str) tuple
1102. - ``corpus``.tagged_sents(): list of (list of (str,str))
1103. - ``corpus``.tagged_paras(): list of (list of (list of (str,str)))
1104. - ``corpus``.chunked_sents(): list of (Tree w/ (str,str) leaves)
1105. - ``corpus``.parsed_sents(): list of (Tree with str leaves)
1106. - ``corpus``.parsed_paras(): list of (list of (Tree with str leaves))
1107. - ``corpus``.xml(): A single xml ElementTree
1108. - ``corpus``.raw(): str (unprocessed corpus contents)
  1109. For example, the `words()` method is supported by many different
  1110. corpora, and returns a flat list of word strings:
  1111. >>> nltk.corpus.brown.words()
  1112. ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
  1113. >>> nltk.corpus.treebank.words()
  1114. ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...]
  1115. >>> nltk.corpus.conll2002.words()
  1116. ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...]
  1117. >>> nltk.corpus.genesis.words()
  1118. ['In', 'the', 'beginning', 'God', 'created', ...]
  1119. On the other hand, the `tagged_words()` method is only supported by
  1120. corpora that include part-of-speech annotations:
  1121. >>> nltk.corpus.brown.tagged_words()
  1122. [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
  1123. >>> nltk.corpus.treebank.tagged_words()
  1124. [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...]
  1125. >>> nltk.corpus.conll2002.tagged_words()
  1126. [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]
  1127. >>> nltk.corpus.genesis.tagged_words()
  1128. Traceback (most recent call last):
  1129. ...
  1130. AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words'
  1131. Although most corpus readers use file identifiers to index their
  1132. content, some corpora use different identifiers instead. For example,
1133. the data access methods for the ``timit`` corpus use *utterance
  1134. identifiers* to select which corpus items should be returned:
  1135. >>> nltk.corpus.timit.utteranceids() # doctest: +ELLIPSIS
  1136. ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...]
  1137. >>> nltk.corpus.timit.words('dr1-fvmh0/sa2')
  1138. ["don't", 'ask', 'me', 'to', 'carry', 'an', 'oily', 'rag', 'like', 'that']
  1139. Attempting to call ``timit``\ 's data access methods with a file
  1140. identifier will result in an exception:
  1141. >>> nltk.corpus.timit.fileids() # doctest: +ELLIPSIS
  1142. ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...]
  1143. >>> nltk.corpus.timit.words('dr1-fvmh0/sa1.txt') # doctest: +SKIP
  1144. Traceback (most recent call last):
  1145. ...
  1146. IOError: No such file or directory: '.../dr1-fvmh0/sa1.txt.wrd'
  1147. As another example, the ``propbank`` corpus defines the ``roleset()``
  1148. method, which expects a roleset identifier, not a file identifier:
  1149. >>> roleset = nltk.corpus.propbank.roleset('eat.01')
  1150. >>> from xml.etree import ElementTree as ET
  1151. >>> print(ET.tostring(roleset).decode('utf8')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  1152. <roleset id="eat.01" name="consume" vncls="39.1">
  1153. <roles>
  1154. <role descr="consumer, eater" n="0">...</role>...
  1155. </roles>...
  1156. </roleset>...
  1157. Stream Backed Corpus Views
  1158. ==========================
  1159. An important feature of NLTK's corpus readers is that many of them
  1160. access the underlying data files using "corpus views." A *corpus
  1161. view* is an object that acts like a simple data structure (such as a
  1162. list), but does not store the data elements in memory; instead, data
  1163. elements are read from the underlying data files on an as-needed
  1164. basis.
  1165. By only loading items from the file on an as-needed basis, corpus
  1166. views maintain both memory efficiency and responsiveness. The memory
  1167. efficiency of corpus readers is important because some corpora contain
  1168. very large amounts of data, and storing the entire data set in memory
  1169. could overwhelm many machines. The responsiveness is important when
  1170. experimenting with corpora in interactive sessions and in in-class
  1171. demonstrations.
  1172. The most common corpus view is the `StreamBackedCorpusView`, which
  1173. acts as a read-only list of tokens. Two additional corpus view
  1174. classes, `ConcatenatedCorpusView` and `LazySubsequence`, make it
  1175. possible to create concatenations and take slices of
  1176. `StreamBackedCorpusView` objects without actually storing the
  1177. resulting list-like object's elements in memory.
  1178. In the future, we may add additional corpus views that act like other
  1179. basic data structures, such as dictionaries.
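As an illustration, a corpus view can be constructed directly over one of the
genesis files and then sliced without reading the whole file into memory.
This is only a sketch (the tokens returned depend on the data, so it is
skipped):

>>> from nltk.corpus.reader.util import StreamBackedCorpusView, read_whitespace_block # doctest: +SKIP
>>> fileid = nltk.corpus.genesis.abspath('english-kjv.txt') # doctest: +SKIP
>>> view = StreamBackedCorpusView(fileid, read_whitespace_block) # doctest: +SKIP
>>> view[:5] # only the blocks needed for this slice are read # doctest: +SKIP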
  1180. Writing New Corpus Readers
  1181. ==========================
  1182. In order to add support for new corpus formats, it is necessary to
  1183. define new corpus reader classes. For many corpus formats, writing
  1184. new corpus readers is relatively straight-forward. In this section,
  1185. we'll describe what's involved in creating a new corpus reader. If
  1186. you do create a new corpus reader, we encourage you to contribute it
  1187. back to the NLTK project.
  1188. Don't Reinvent the Wheel
  1189. ------------------------
  1190. Before you start writing a new corpus reader, you should check to be
  1191. sure that the desired format can't be read using an existing corpus
  1192. reader with appropriate constructor arguments. For example, although
  1193. the `TaggedCorpusReader` assumes that words and tags are separated by
  1194. ``/`` characters by default, an alternative tag-separation character
  1195. can be specified via the ``sep`` constructor argument. You should
  1196. also check whether the new corpus format can be handled by subclassing
  1197. an existing corpus reader, and tweaking a few methods or variables.
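For instance, returning to the ``sep`` example: a corpus whose files contain
``word|tag`` pairs could most likely be handled by the existing
`TaggedCorpusReader` simply by overriding the separator. The directory and
file pattern below are hypothetical, so the example is skipped:

>>> from nltk.corpus.reader.tagged import TaggedCorpusReader # doctest: +SKIP
>>> pipe_corpus = TaggedCorpusReader('/path/to/corpus', r'.*\.pos', sep='|') # doctest: +SKIP
>>> pipe_corpus.tagged_words() # doctest: +SKIP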
  1198. Design
  1199. ------
  1200. If you decide to write a new corpus reader from scratch, then you
  1201. should first decide which data access methods you want the reader to
  1202. provide, and what their signatures should be. You should look at
  1203. existing corpus readers that process corpora with similar data
  1204. contents, and try to be consistent with those corpus readers whenever
  1205. possible.
  1206. You should also consider what sets of identifiers are appropriate for
  1207. the corpus format. Where it's practical, file identifiers should be
  1208. used. However, for some corpora, it may make sense to use additional
  1209. sets of identifiers. Each set of identifiers should have a distinct
  1210. name (e.g., fileids, utteranceids, rolesets); and you should be consistent
  1211. in using that name to refer to that identifier. Do not use parameter
  1212. names like ``id``, which leave it unclear what type of identifier is
  1213. required.
  1214. Once you've decided what data access methods and identifiers are
  1215. appropriate for your corpus, you should decide if there are any
  1216. customizable parameters that you'd like the corpus reader to handle.
  1217. These parameters make it possible to use a single corpus reader to
  1218. handle a wider variety of corpora. The ``sep`` argument for
  1219. `TaggedCorpusReader`, mentioned above, is an example of a customizable
  1220. corpus reader parameter.
  1221. Implementation
  1222. --------------
  1223. Constructor
  1224. ~~~~~~~~~~~
  1225. If your corpus reader implements any customizable parameters, then
  1226. you'll need to override the constructor. Typically, the new
  1227. constructor will first call its base class's constructor, and then
  1228. store the customizable parameters. For example, the
1229. `ConllChunkCorpusReader`\ 's constructor is defined as follows::
1230.     def __init__(self, root, fileids, chunk_types, encoding='utf8',
1231.                  tagset=None, separator=None):
1232.         ConllCorpusReader.__init__(
1233.             self, root, fileids, ('words', 'pos', 'chunk'),
1234.             chunk_types=chunk_types, encoding=encoding,
1235.             tagset=tagset, separator=separator)
  1236. If your corpus reader does not implement any customization parameters,
  1237. then you can often just inherit the base class's constructor.
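As a minimal sketch, a reader that adds a single customizable parameter might
simply store that parameter and delegate everything else to its base class
(the class name and parameter below are invented for illustration)::

    from nltk.corpus.reader.tagged import TaggedCorpusReader

    class MyTaggedCorpusReader(TaggedCorpusReader):
        def __init__(self, root, fileids, lowercase=False, **kwargs):
            # Store the customizable parameter, then let the base class
            # handle root, fileids, and any remaining reader options.
            self._lowercase = lowercase
            TaggedCorpusReader.__init__(self, root, fileids, **kwargs)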
  1238. Data Access Methods
  1239. ~~~~~~~~~~~~~~~~~~~
  1240. The most common type of data access method takes an argument
  1241. identifying which files to access, and returns a view covering those
  1242. files. This argument may be a single file identifier string (to get a
  1243. view for a specific file); a list of file identifier strings (to get a
  1244. view for a specific list of files); or None (to get a view for the
  1245. entire corpus). The method's implementation converts this argument to
  1246. a list of path names using the `abspaths()` method, which handles all
  1247. three value types (string, list, and None):
  1248. >>> print(str(nltk.corpus.brown.abspaths()).replace('\\\\','/')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  1249. [FileSystemPathPointer('.../corpora/brown/ca01'),
  1250. FileSystemPathPointer('.../corpora/brown/ca02'), ...]
  1251. >>> print(str(nltk.corpus.brown.abspaths('ce06')).replace('\\\\','/')) # doctest: +ELLIPSIS
  1252. [FileSystemPathPointer('.../corpora/brown/ce06')]
  1253. >>> print(str(nltk.corpus.brown.abspaths(['ce06', 'ce07'])).replace('\\\\','/')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  1254. [FileSystemPathPointer('.../corpora/brown/ce06'),
  1255. FileSystemPathPointer('.../corpora/brown/ce07')]
  1256. An example of this type of method is the `words()` method, defined by
  1257. the `PlaintextCorpusReader` as follows:
  1258. >>> def words(self, fileids=None):
1259. ...     return concat([self.CorpusView(fileid, self._read_word_block)
1260. ...                    for fileid in self.abspaths(fileids)])
  1261. This method first uses `abspaths()` to convert ``fileids`` to a list of
  1262. absolute paths. It then creates a corpus view for each file, using
  1263. the `PlaintextCorpusReader._read_word_block()` method to read elements
  1264. from the data file (see the discussion of corpus views below).
  1265. Finally, it combines these corpus views using the
  1266. `nltk.corpus.reader.util.concat()` function.
  1267. When writing a corpus reader for a corpus that is never expected to be
  1268. very large, it can sometimes be appropriate to read the files
  1269. directly, rather than using a corpus view. For example, the
1270. `WordListCorpusReader` class defines its `words()` method as follows:
  1271. >>> def words(self, fileids=None):
1272. ...     return concat([[w for w in open(fileid).read().split('\n') if w]
1273. ...                    for fileid in self.abspaths(fileids)])
  1274. (This is usually more appropriate for lexicons than for token corpora.)
  1275. If the type of data returned by a data access method is one for which
  1276. NLTK has a conventional representation (e.g., words, tagged words, and
  1277. parse trees), then you should use that representation. Otherwise, you
  1278. may find it necessary to define your own representation. For data
  1279. structures that are relatively corpus-specific, it's usually best to
  1280. define new classes for these elements. For example, the ``propbank``
  1281. corpus defines the `PropbankInstance` class to store the semantic role
  1282. labeling instances described by the corpus; and the ``ppattach``
  1283. corpus defines the `PPAttachment` class to store the prepositional
  1284. attachment instances described by the corpus.
  1285. Corpus Views
  1286. ~~~~~~~~~~~~
  1287. .. (Much of the content for this section is taken from the
  1288. StreamBackedCorpusView docstring.)
  1289. The heart of a `StreamBackedCorpusView` is its *block reader*
  1290. function, which reads zero or more tokens from a stream, and returns
  1291. them as a list. A very simple example of a block reader is:
  1292. >>> def simple_block_reader(stream):
1293. ...     return stream.readline().split()
  1294. This simple block reader reads a single line at a time, and returns a
  1295. single token (consisting of a string) for each whitespace-separated
  1296. substring on the line. A `StreamBackedCorpusView` built from this
  1297. block reader will act like a read-only list of all the
  1298. whitespace-separated tokens in an underlying file.
  1299. When deciding how to define the block reader for a given corpus,
  1300. careful consideration should be given to the size of blocks handled by
  1301. the block reader. Smaller block sizes will increase the memory
  1302. requirements of the corpus view's internal data structures (by 2
  1303. integers per block). On the other hand, larger block sizes may
  1304. decrease performance for random access to the corpus. (But note that
  1305. larger block sizes will *not* decrease performance for iteration.)
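For example, a coarser-grained block reader might gather several lines per
block, trading finer-grained random access for a smaller index. This is only
a sketch of such a reader (intended to be passed to `StreamBackedCorpusView`,
and skipped here):

>>> def multi_line_block_reader(stream): # doctest: +SKIP
...     # Read roughly ten lines per block; keep reading past blank lines
...     # so that an empty list is only returned at the end of the file.
...     words = []
...     lines_read = 0
...     while lines_read < 10 or not words:
...         line = stream.readline()
...         if not line:
...             break
...         words.extend(line.split())
...         lines_read += 1
...     return words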
  1306. Internally, the `StreamBackedCorpusView` class maintains a partial
  1307. mapping from token index to file position, with one entry per block.
  1308. When a token with a given index *i* is requested, the corpus view
  1309. constructs it as follows:
  1310. 1. First, it searches the toknum/filepos mapping for the token index
  1311. closest to (but less than or equal to) *i*.
  1312. 2. Then, starting at the file position corresponding to that index, it
  1313. reads one block at a time using the block reader until it reaches
  1314. the requested token.
  1315. The toknum/filepos mapping is created lazily: it is initially empty,
  1316. but every time a new block is read, the block's initial token is added
  1317. to the mapping. (Thus, the toknum/filepos map has one entry per
  1318. block.)
  1319. You can create your own corpus view in one of two ways:
  1320. 1. Call the `StreamBackedCorpusView` constructor, and provide your
  1321. block reader function via the ``block_reader`` argument.
  1322. 2. Subclass `StreamBackedCorpusView`, and override the
  1323. `read_block()` method.
  1324. The first option is usually easier, but the second option can allow
  1325. you to write a single `read_block` method whose behavior can be
  1326. customized by different parameters to the subclass's constructor. For
  1327. an example of this design pattern, see the `TaggedCorpusView` class,
1328. which is used by `TaggedCorpusReader`.
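A minimal sketch of both options, reusing the `simple_block_reader` defined
above (the file name is hypothetical, so the example is skipped):

>>> from nltk.corpus.reader.util import StreamBackedCorpusView # doctest: +SKIP
>>> view = StreamBackedCorpusView('my_corpus.txt', block_reader=simple_block_reader) # doctest: +SKIP
>>> class SimpleCorpusView(StreamBackedCorpusView): # doctest: +SKIP
...     def read_block(self, stream):
...         # Same behavior as simple_block_reader, packaged as a subclass.
...         return stream.readline().split()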
  1329. ----------------
  1330. Regression Tests
  1331. ----------------
  1332. The following helper functions are used to create and then delete
  1333. testing corpora that are stored in temporary directories. These
  1334. testing corpora are used to make sure the readers work correctly.
  1335. >>> import tempfile, os.path, textwrap
  1336. >>> def make_testcorpus(ext='', **fileids):
1337. ...     root = tempfile.mkdtemp()
1338. ...     for fileid, contents in fileids.items():
1339. ...         fileid += ext
1340. ...         f = open(os.path.join(root, fileid), 'w')
1341. ...         f.write(textwrap.dedent(contents))
1342. ...         f.close()
1343. ...     return root
  1344. >>> def del_testcorpus(root):
1345. ...     for fileid in os.listdir(root):
1346. ...         os.remove(os.path.join(root, fileid))
1347. ...     os.rmdir(root)
  1348. Plaintext Corpus Reader
  1349. =======================
  1350. The plaintext corpus reader is used to access corpora that consist of
  1351. unprocessed plaintext data. It assumes that paragraph breaks are
  1352. indicated by blank lines. Sentences and words can be tokenized using
  1353. the default tokenizers, or by custom tokenizers specified as
  1354. parameters to the constructor.
  1355. >>> root = make_testcorpus(ext='.txt',
  1356. ... a="""\
  1357. ... This is the first sentence. Here is another
  1358. ... sentence! And here's a third sentence.
  1359. ...
  1360. ... This is the second paragraph. Tokenization is currently
  1361. ... fairly simple, so the period in Mr. gets tokenized.
  1362. ... """,
  1363. ... b="""This is the second file.""")
  1364. >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader
  1365. The list of documents can be specified explicitly, or implicitly (using a
  1366. regexp). The ``ext`` argument specifies a file extension.
  1367. >>> corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt'])
  1368. >>> corpus.fileids()
  1369. ['a.txt', 'b.txt']
1370. >>> corpus = PlaintextCorpusReader(root, r'.*\.txt')
  1371. >>> corpus.fileids()
  1372. ['a.txt', 'b.txt']
  1373. The directory containing the corpus is corpus.root:
  1374. >>> str(corpus.root) == str(root)
  1375. True
  1376. We can get a list of words, or the raw string:
  1377. >>> corpus.words()
  1378. ['This', 'is', 'the', 'first', 'sentence', '.', ...]
  1379. >>> corpus.raw()[:40]
  1380. 'This is the first sentence. Here is ano'
  1381. Check that reading individual documents works, and reading all documents at
  1382. once works:
  1383. >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]
  1384. (46, [40, 6])
  1385. >>> corpus.words('a.txt')
  1386. ['This', 'is', 'the', 'first', 'sentence', '.', ...]
  1387. >>> corpus.words('b.txt')
  1388. ['This', 'is', 'the', 'second', 'file', '.']
  1389. >>> corpus.words()[:4], corpus.words()[-4:]
  1390. (['This', 'is', 'the', 'first'], ['the', 'second', 'file', '.'])
  1391. We're done with the test corpus:
  1392. >>> del_testcorpus(root)
  1393. Test the plaintext corpora that come with nltk:
  1394. >>> from nltk.corpus import abc, genesis, inaugural
  1395. >>> from nltk.corpus import state_union, webtext
  1396. >>> for corpus in (abc, genesis, inaugural, state_union,
1397. ...                webtext):
1398. ...     print(str(corpus).replace('\\\\','/'))
1399. ...     print(' ', repr(corpus.fileids())[:60])
1400. ...     print(' ', repr(corpus.words()[:10])[:60])
  1401. <PlaintextCorpusReader in '.../nltk_data/corpora/ab...'>
  1402. ['rural.txt', 'science.txt']
  1403. ['PM', 'denies', 'knowledge', 'of', 'AWB', ...
  1404. <PlaintextCorpusReader in '.../nltk_data/corpora/genesi...'>
  1405. ['english-kjv.txt', 'english-web.txt', 'finnish.txt', ...
  1406. ['In', 'the', 'beginning', 'God', 'created', 'the', ...
  1407. <PlaintextCorpusReader in '.../nltk_data/corpora/inaugura...'>
  1408. ['1789-Washington.txt', '1793-Washington.txt', ...
  1409. ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ...
  1410. <PlaintextCorpusReader in '.../nltk_data/corpora/state_unio...'>
  1411. ['1945-Truman.txt', '1946-Truman.txt', ...
  1412. ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...
  1413. <PlaintextCorpusReader in '.../nltk_data/corpora/webtex...'>
  1414. ['firefox.txt', 'grail.txt', 'overheard.txt', ...
  1415. ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...
  1416. Tagged Corpus Reader
  1417. ====================
  1418. The Tagged Corpus reader can give us words, sentences, and paragraphs,
  1419. each tagged or untagged. All of the read methods can take one item
  1420. (in which case they return the contents of that file) or a list of
  1421. documents (in which case they concatenate the contents of those files).
  1422. By default, they apply to all documents in the corpus.
  1423. >>> root = make_testcorpus(
  1424. ... a="""\
  1425. ... This/det is/verb the/det first/adj sentence/noun ./punc
  1426. ... Here/det is/verb another/adj sentence/noun ./punc
  1427. ... Note/verb that/comp you/pron can/verb use/verb \
  1428. ... any/noun tag/noun set/noun
  1429. ...
  1430. ... This/det is/verb the/det second/adj paragraph/noun ./punc
  1431. ... word/n without/adj a/det tag/noun :/: hello ./punc
  1432. ... """,
  1433. ... b="""\
  1434. ... This/det is/verb the/det second/adj file/noun ./punc
  1435. ... """)
  1436. >>> from nltk.corpus.reader.tagged import TaggedCorpusReader
  1437. >>> corpus = TaggedCorpusReader(root, list('ab'))
  1438. >>> corpus.fileids()
  1439. ['a', 'b']
  1440. >>> str(corpus.root) == str(root)
  1441. True
  1442. >>> corpus.words()
  1443. ['This', 'is', 'the', 'first', 'sentence', '.', ...]
  1444. >>> corpus.sents() # doctest: +ELLIPSIS
  1445. [['This', 'is', 'the', 'first', ...], ['Here', 'is', 'another'...], ...]
  1446. >>> corpus.paras() # doctest: +ELLIPSIS
  1447. [[['This', ...], ['Here', ...], ...], [['This', ...], ...], ...]
  1448. >>> corpus.tagged_words() # doctest: +ELLIPSIS
  1449. [('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ...]
  1450. >>> corpus.tagged_sents() # doctest: +ELLIPSIS
  1451. [[('This', 'DET'), ('is', 'VERB'), ...], [('Here', 'DET'), ...], ...]
  1452. >>> corpus.tagged_paras() # doctest: +ELLIPSIS
  1453. [[[('This', 'DET'), ...], ...], [[('This', 'DET'), ...], ...], ...]
  1454. >>> corpus.raw()[:40]
  1455. 'This/det is/verb the/det first/adj sente'
  1456. >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]
  1457. (38, [32, 6])
  1458. >>> len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()]
  1459. (6, [5, 1])
  1460. >>> len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()]
  1461. (3, [2, 1])
  1462. >>> print(corpus.words('a'))
  1463. ['This', 'is', 'the', 'first', 'sentence', '.', ...]
  1464. >>> print(corpus.words('b'))
  1465. ['This', 'is', 'the', 'second', 'file', '.']
  1466. >>> del_testcorpus(root)
  1467. The Brown Corpus uses the tagged corpus reader:
  1468. >>> from nltk.corpus import brown
  1469. >>> brown.fileids() # doctest: +ELLIPSIS
  1470. ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', ...]
  1471. >>> brown.categories() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  1472. ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor',
  1473. 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
  1474. >>> print(repr(brown.root).replace('\\\\','/')) # doctest: +ELLIPSIS
  1475. FileSystemPathPointer('.../corpora/brown')
  1476. >>> brown.words()
  1477. ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
  1478. >>> brown.sents() # doctest: +ELLIPSIS
  1479. [['The', 'Fulton', 'County', 'Grand', ...], ...]
  1480. >>> brown.paras() # doctest: +ELLIPSIS
  1481. [[['The', 'Fulton', 'County', ...]], [['The', 'jury', ...]], ...]
  1482. >>> brown.tagged_words() # doctest: +ELLIPSIS
  1483. [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
  1484. >>> brown.tagged_sents() # doctest: +ELLIPSIS
  1485. [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...], ...]
  1486. >>> brown.tagged_paras() # doctest: +ELLIPSIS
  1487. [[[('The', 'AT'), ...]], [[('The', 'AT'), ...]], ...]
  1488. Verbnet Corpus Reader
  1489. =====================
  1490. Make sure we're picking up the right number of elements:
  1491. >>> from nltk.corpus import verbnet
  1492. >>> len(verbnet.lemmas())
  1493. 3621
  1494. >>> len(verbnet.wordnetids())
  1495. 4953
  1496. >>> len(verbnet.classids())
  1497. 429
  1498. Selecting classids based on various selectors:
  1499. >>> verbnet.classids(lemma='take') # doctest: +NORMALIZE_WHITESPACE
  1500. ['bring-11.3', 'characterize-29.2', 'convert-26.6.2', 'cost-54.2',
  1501. 'fit-54.3', 'performance-26.7-2', 'steal-10.5']
  1502. >>> verbnet.classids(wordnetid='lead%2:38:01')
  1503. ['accompany-51.7']
  1504. >>> verbnet.classids(fileid='approve-77.xml')
  1505. ['approve-77']
  1506. >>> verbnet.classids(classid='admire-31.2') # subclasses
  1507. ['admire-31.2-1']
  1508. vnclass() accepts filenames, long ids, and short ids:
  1509. >>> a = ElementTree.tostring(verbnet.vnclass('admire-31.2.xml'))
  1510. >>> b = ElementTree.tostring(verbnet.vnclass('admire-31.2'))
  1511. >>> c = ElementTree.tostring(verbnet.vnclass('31.2'))
  1512. >>> a == b == c
  1513. True
  1514. fileids() can be used to get files based on verbnet class ids:
  1515. >>> verbnet.fileids('admire-31.2')
  1516. ['admire-31.2.xml']
  1517. >>> verbnet.fileids(['admire-31.2', 'obtain-13.5.2'])
  1518. ['admire-31.2.xml', 'obtain-13.5.2.xml']
  1519. >>> verbnet.fileids('badidentifier')
  1520. Traceback (most recent call last):
  1521. . . .
  1522. ValueError: vnclass identifier 'badidentifier' not found
  1523. longid() and shortid() can be used to convert identifiers:
  1524. >>> verbnet.longid('31.2')
  1525. 'admire-31.2'
  1526. >>> verbnet.longid('admire-31.2')
  1527. 'admire-31.2'
  1528. >>> verbnet.shortid('31.2')
  1529. '31.2'
  1530. >>> verbnet.shortid('admire-31.2')
  1531. '31.2'
  1532. >>> verbnet.longid('badidentifier')
  1533. Traceback (most recent call last):
  1534. . . .
  1535. ValueError: vnclass identifier 'badidentifier' not found
  1536. >>> verbnet.shortid('badidentifier')
  1537. Traceback (most recent call last):
  1538. . . .
  1539. ValueError: vnclass identifier 'badidentifier' not found
  1540. Corpus View Regression Tests
  1541. ============================
  1542. Select some corpus files to play with:
  1543. >>> import nltk.data
  1544. >>> # A very short file (160 chars):
  1545. >>> f1 = nltk.data.find('corpora/inaugural/README')
  1546. >>> # A relatively short file (791 chars):
  1547. >>> f2 = nltk.data.find('corpora/inaugural/1793-Washington.txt')
  1548. >>> # A longer file (32k chars):
  1549. >>> f3 = nltk.data.find('corpora/inaugural/1909-Taft.txt')
  1550. >>> fileids = [f1, f2, f3]
  1551. Concatenation
  1552. -------------
  1553. Check that concatenation works as intended.
  1554. >>> from nltk.corpus.reader.util import *
  1555. >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8')
  1556. >>> c2 = StreamBackedCorpusView(f2, read_whitespace_block, encoding='utf-8')
  1557. >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8')
  1558. >>> c123 = c1+c2+c3
  1559. >>> print(c123)
  1560. ['C-Span', 'Inaugural', 'Address', 'Corpus', 'US', ...]
  1561. >>> l1 = f1.open(encoding='utf-8').read().split()
  1562. >>> l2 = f2.open(encoding='utf-8').read().split()
  1563. >>> l3 = f3.open(encoding='utf-8').read().split()
  1564. >>> l123 = l1+l2+l3
  1565. >>> list(c123) == l123
  1566. True
  1567. >>> (c1+c2+c3)[100] == l123[100]
  1568. True
  1569. Slicing
  1570. -------
1571. First, do some tests with fairly small slices. These will all
1572. generate ordinary list values.
  1573. >>> from nltk.util import LazySubsequence
  1574. >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8')
  1575. >>> l1 = f1.open(encoding='utf-8').read().split()
  1576. >>> print(len(c1))
  1577. 21
  1578. >>> len(c1) < LazySubsequence.MIN_SIZE
  1579. True
  1580. Choose a list of indices, based on the length, that covers the
  1581. important corner cases:
  1582. >>> indices = [-60, -30, -22, -21, -20, -1,
  1583. ... 0, 1, 10, 20, 21, 22, 30, 60]
  1584. Test slicing with explicit start & stop value:
  1585. >>> for s in indices:
1586. ...     for e in indices:
1587. ...         assert list(c1[s:e]) == l1[s:e]
  1588. Test slicing with stop=None:
  1589. >>> for s in indices:
1590. ...     assert list(c1[s:]) == l1[s:]
  1591. Test slicing with start=None:
  1592. >>> for e in indices:
1593. ...     assert list(c1[:e]) == l1[:e]
  1594. Test slicing with start=stop=None:
  1595. >>> list(c1[:]) == list(l1[:])
  1596. True
  1597. Next, we'll do some tests with much longer slices. These will
  1598. generate LazySubsequence objects.
  1599. >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8')
  1600. >>> l3 = f3.open(encoding='utf-8').read().split()
  1601. >>> print(len(c3))
  1602. 5430
  1603. >>> len(c3) > LazySubsequence.MIN_SIZE*2
  1604. True
  1605. Choose a list of indices, based on the length, that covers the
  1606. important corner cases:
  1607. >>> indices = [-12000, -6000, -5431, -5430, -5429, -3000, -200, -1,
  1608. ... 0, 1, 200, 3000, 5000, 5429, 5430, 5431, 6000, 12000]
  1609. Test slicing with explicit start & stop value:
  1610. >>> for s in indices:
1611. ...     for e in indices:
1612. ...         assert list(c3[s:e]) == l3[s:e]
  1613. Test slicing with stop=None:
  1614. >>> for s in indices:
1615. ...     assert list(c3[s:]) == l3[s:]
  1616. Test slicing with start=None:
  1617. >>> for e in indices:
1618. ...     assert list(c3[:e]) == l3[:e]
  1619. Test slicing with start=stop=None:
  1620. >>> list(c3[:]) == list(l3[:])
  1621. True
  1622. Multiple Iterators
  1623. ------------------
  1624. If multiple iterators are created for the same corpus view, their
  1625. iteration can be interleaved:
  1626. >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block)
  1627. >>> iterators = [c3.iterate_from(n) for n in [0,15,30,45]]
  1628. >>> for i in range(15):
1629. ...     for iterator in iterators:
1630. ...         print('%-15s' % next(iterator), end=' ')
1631. ...     print()
  1632. My a duties in
  1633. fellow heavy of a
  1634. citizens: weight the proper
  1635. Anyone of office sense
  1636. who responsibility. upon of
  1637. has If which the
  1638. taken not, he obligation
  1639. the he is which
  1640. oath has about the
  1641. I no to oath
  1642. have conception enter, imposes.
  1643. just of or The
  1644. taken the he office
  1645. must powers is of
  1646. feel and lacking an
  1647. SeekableUnicodeStreamReader
  1648. ===========================
  1649. The file-like objects provided by the ``codecs`` module unfortunately
  1650. suffer from a bug that prevents them from working correctly with
1651. corpus view objects. In particular, although they expose ``seek()``
  1652. and ``tell()`` methods, those methods do not exhibit the expected
  1653. behavior, because they are not synchronized with the internal buffers
  1654. that are kept by the file-like objects. For example, the ``tell()``
  1655. method will return the file position at the end of the buffers (whose
  1656. contents have not yet been returned by the stream); and therefore this
1657. file position cannot be used to return to the 'current' location in
  1658. the stream (since ``seek()`` has no way to reconstruct the buffers).
  1659. To get around these problems, we define a new class,
  1660. `SeekableUnicodeStreamReader`, to act as a file-like interface to
  1661. files containing encoded unicode data. This class is loosely based on
  1662. the ``codecs.StreamReader`` class. To construct a new reader, we call
  1663. the constructor with an underlying stream and an encoding name:
  1664. >>> from io import StringIO, BytesIO
  1665. >>> from nltk.data import SeekableUnicodeStreamReader
  1666. >>> stream = BytesIO(b"""\
  1667. ... This is a test file.
  1668. ... It is encoded in ascii.
  1669. ... """.decode('ascii').encode('ascii'))
  1670. >>> reader = SeekableUnicodeStreamReader(stream, 'ascii')
  1671. `SeekableUnicodeStreamReader`\ s support all of the normal operations
  1672. supplied by a read-only stream. Note that all of the read operations
  1673. return ``unicode`` objects (not ``str`` objects).
  1674. >>> reader.read() # read the entire file.
  1675. 'This is a test file.\nIt is encoded in ascii.\n'
  1676. >>> reader.seek(0) # rewind to the start.
  1677. >>> reader.read(5) # read at most 5 bytes.
  1678. 'This '
  1679. >>> reader.readline() # read to the end of the line.
  1680. 'is a test file.\n'
  1681. >>> reader.seek(0) # rewind to the start.
  1682. >>> for line in reader:
1683. ...     print(repr(line)) # iterate over lines
  1684. 'This is a test file.\n'
  1685. 'It is encoded in ascii.\n'
  1686. >>> reader.seek(0) # rewind to the start.
  1687. >>> reader.readlines() # read a list of line strings
  1688. ['This is a test file.\n', 'It is encoded in ascii.\n']
  1689. >>> reader.close()
  1690. Size argument to ``read()``
  1691. ---------------------------
  1692. The ``size`` argument to ``read()`` specifies the maximum number of
  1693. *bytes* to read, not the maximum number of *characters*. Thus, for
  1694. encodings that use multiple bytes per character, it may return fewer
  1695. characters than the ``size`` argument:
  1696. >>> stream = BytesIO(b"""\
  1697. ... This is a test file.
  1698. ... It is encoded in utf-16.
  1699. ... """.decode('ascii').encode('utf-16'))
  1700. >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
  1701. >>> reader.read(10)
  1702. 'This '
  1703. If a read block ends in the middle of the byte string encoding a
  1704. single character, then that byte string is stored in an internal
  1705. buffer, and re-used on the next call to ``read()``. However, if the
  1706. size argument is too small to read even a single character, even
  1707. though at least one character is available, then the ``read()`` method
  1708. will read additional bytes until it can return a single character.
  1709. This ensures that the ``read()`` method does not return an empty
  1710. string, which could be mistaken for indicating the end of the file.
  1711. >>> reader.seek(0) # rewind to the start.
  1712. >>> reader.read(1) # we actually need to read 4 bytes
  1713. 'T'
  1714. >>> int(reader.tell())
  1715. 4
  1716. The ``readline()`` method may read more than a single line of text, in
  1717. which case it stores the text that it does not return in a buffer. If
  1718. this buffer is not empty, then its contents will be included in the
  1719. value returned by the next call to ``read()``, regardless of the
  1720. ``size`` argument, since they are available without reading any new
  1721. bytes from the stream:
  1722. >>> reader.seek(0) # rewind to the start.
  1723. >>> reader.readline() # stores extra text in a buffer
  1724. 'This is a test file.\n'
  1725. >>> print(reader.linebuffer) # examine the buffer contents
  1726. ['It is encoded i']
  1727. >>> reader.read(0) # returns the contents of the buffer
  1728. 'It is encoded i'
  1729. >>> print(reader.linebuffer) # examine the buffer contents
  1730. None
  1731. Seek and Tell
  1732. -------------
  1733. In addition to these basic read operations,
  1734. `SeekableUnicodeStreamReader` also supports the ``seek()`` and
  1735. ``tell()`` operations. However, some care must still be taken when
  1736. using these operations. In particular, the only file offsets that
  1737. should be passed to ``seek()`` are ``0`` and any offset that has been
1738. returned by ``tell()``.
  1739. >>> stream = BytesIO(b"""\
  1740. ... This is a test file.
  1741. ... It is encoded in utf-16.
  1742. ... """.decode('ascii').encode('utf-16'))
  1743. >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
  1744. >>> reader.read(20)
  1745. 'This is a '
  1746. >>> pos = reader.tell(); print(pos)
  1747. 22
  1748. >>> reader.read(20)
  1749. 'test file.'
  1750. >>> reader.seek(pos) # rewind to the position from tell.
  1751. >>> reader.read(20)
  1752. 'test file.'
1753. The ``seek()`` and ``tell()`` methods work properly even when
  1754. ``readline()`` is used.
  1755. >>> stream = BytesIO(b"""\
  1756. ... This is a test file.
  1757. ... It is encoded in utf-16.
  1758. ... """.decode('ascii').encode('utf-16'))
  1759. >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
  1760. >>> reader.readline()
  1761. 'This is a test file.\n'
  1762. >>> pos = reader.tell(); print(pos)
  1763. 44
  1764. >>> reader.readline()
  1765. 'It is encoded in utf-16.\n'
  1766. >>> reader.seek(pos) # rewind to the position from tell.
  1767. >>> reader.readline()
  1768. 'It is encoded in utf-16.\n'
  1769. Squashed Bugs
  1770. =============
  1771. svn 5276 fixed a bug in the comment-stripping behavior of
  1772. parse_sexpr_block.
  1773. >>> from io import StringIO
  1774. >>> from nltk.corpus.reader.util import read_sexpr_block
  1775. >>> f = StringIO(b"""
  1776. ... (a b c)
  1777. ... # This line is a comment.
  1778. ... (d e f\ng h)""".decode('ascii'))
  1779. >>> print(read_sexpr_block(f, block_size=38, comment_char='#'))
  1780. ['(a b c)']
  1781. >>> print(read_sexpr_block(f, block_size=38, comment_char='#'))
  1782. ['(d e f\ng h)']
  1783. svn 5277 fixed a bug in parse_sexpr_block, which would cause it to
  1784. enter an infinite loop if a file ended mid-sexpr, or ended with a
  1785. token that was not followed by whitespace. A related bug caused
  1786. an infinite loop if the corpus ended in an unmatched close paren --
1787. this was fixed in svn 5279.
  1788. >>> f = StringIO(b"""
  1789. ... This file ends mid-sexpr
  1790. ... (hello (world""".decode('ascii'))
  1791. >>> for i in range(3): print(read_sexpr_block(f))
  1792. ['This', 'file', 'ends', 'mid-sexpr']
  1793. ['(hello (world']
  1794. []
  1795. >>> f = StringIO(b"This file has no trailing whitespace.".decode('ascii'))
  1796. >>> for i in range(3): print(read_sexpr_block(f))
  1797. ['This', 'file', 'has', 'no', 'trailing']
  1798. ['whitespace.']
  1799. []
  1800. >>> # Bug fixed in 5279:
  1801. >>> f = StringIO(b"a b c)".decode('ascii'))
  1802. >>> for i in range(3): print(read_sexpr_block(f))
  1803. ['a', 'b']
  1804. ['c)']
  1805. []
  1806. svn 5624 & 5265 fixed a bug in ConcatenatedCorpusView, which caused it
  1807. to return the wrong items when indexed starting at any index beyond
  1808. the first file.
  1809. >>> import nltk
  1810. >>> sents = nltk.corpus.brown.sents()
  1811. >>> print(sents[6000])
  1812. ['Cholesterol', 'and', 'thyroid']
  1813. >>> print(sents[6000])
  1814. ['Cholesterol', 'and', 'thyroid']
  1815. svn 5728 fixed a bug in Categorized*CorpusReader, which caused them
  1816. to return words from *all* files when just one file was specified.
  1817. >>> from nltk.corpus import reuters
  1818. >>> reuters.words('training/13085')
  1819. ['SNYDER', '&', 'lt', ';', 'SOI', '>', 'MAKES', ...]
  1820. >>> reuters.words('training/5082')
  1821. ['SHEPPARD', 'RESOURCES', 'TO', 'MERGE', 'WITH', ...]
  1822. svn 7227 fixed a bug in the qc corpus reader, which prevented
1823. access to its tuples() method.
  1824. >>> from nltk.corpus import qc
  1825. >>> qc.tuples('test.txt')
  1826. [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...]