# nltk/corpus/__init__.py
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# TODO this docstring isn't up-to-date!
"""
NLTK corpus readers.  The modules in this package provide functions
that can be used to read corpus files in a variety of formats.  These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.

Available Corpora
=================

Please see http://www.nltk.org/nltk_data/ for a complete list.
Install corpora using nltk.download().

Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus.  These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:

- If ``item`` is one of the unique identifiers listed in the corpus
  module's ``items`` variable, then the corresponding document will
  be loaded from the NLTK corpus package.
- If ``item`` is a filename, then that file will be read.

Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.

Corpus reader functions are named based on the type of information
they return.  Some common examples, and their return types, are:

- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents

For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:

    >>> from nltk.corpus import brown
    >>> print(", ".join(brown.words()))
    The, Fulton, County, Grand, Jury, said, ...

"""
import re

from nltk.tokenize import RegexpTokenizer
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
  54. abc = LazyCorpusLoader(
  55. "abc",
  56. PlaintextCorpusReader,
  57. r"(?!\.).*\.txt",
  58. encoding=[("science", "latin_1"), ("rural", "utf8")],
  59. )
  60. alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino")
  61. brown = LazyCorpusLoader(
  62. "brown",
  63. CategorizedTaggedCorpusReader,
  64. r"c[a-z]\d\d",
  65. cat_file="cats.txt",
  66. tagset="brown",
  67. encoding="ascii",
  68. )
  69. cess_cat = LazyCorpusLoader(
  70. "cess_cat",
  71. BracketParseCorpusReader,
  72. r"(?!\.).*\.tbf",
  73. tagset="unknown",
  74. encoding="ISO-8859-15",
  75. )
  76. cess_esp = LazyCorpusLoader(
  77. "cess_esp",
  78. BracketParseCorpusReader,
  79. r"(?!\.).*\.tbf",
  80. tagset="unknown",
  81. encoding="ISO-8859-15",
  82. )
  83. cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"])
  84. comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt")
  85. comparative_sentences = LazyCorpusLoader(
  86. "comparative_sentences",
  87. ComparativeSentencesCorpusReader,
  88. r"labeledSentences\.txt",
  89. encoding="latin-1",
  90. )
  91. conll2000 = LazyCorpusLoader(
  92. "conll2000",
  93. ConllChunkCorpusReader,
  94. ["train.txt", "test.txt"],
  95. ("NP", "VP", "PP"),
  96. tagset="wsj",
  97. encoding="ascii",
  98. )
  99. conll2002 = LazyCorpusLoader(
  100. "conll2002",
  101. ConllChunkCorpusReader,
  102. ".*\.(test|train).*",
  103. ("LOC", "PER", "ORG", "MISC"),
  104. encoding="utf-8",
  105. )
  106. conll2007 = LazyCorpusLoader(
  107. "conll2007",
  108. DependencyCorpusReader,
  109. ".*\.(test|train).*",
  110. encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
  111. )
  112. crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, ".*\.txt")
  113. dependency_treebank = LazyCorpusLoader(
  114. "dependency_treebank", DependencyCorpusReader, ".*\.dp", encoding="ascii"
  115. )
  116. floresta = LazyCorpusLoader(
  117. "floresta",
  118. BracketParseCorpusReader,
  119. r"(?!\.).*\.ptb",
  120. "#",
  121. tagset="unknown",
  122. encoding="ISO-8859-15",
  123. )
  124. framenet15 = LazyCorpusLoader(
  125. "framenet_v15",
  126. FramenetCorpusReader,
  127. [
  128. "frRelation.xml",
  129. "frameIndex.xml",
  130. "fulltextIndex.xml",
  131. "luIndex.xml",
  132. "semTypes.xml",
  133. ],
  134. )
  135. framenet = LazyCorpusLoader(
  136. "framenet_v17",
  137. FramenetCorpusReader,
  138. [
  139. "frRelation.xml",
  140. "frameIndex.xml",
  141. "fulltextIndex.xml",
  142. "luIndex.xml",
  143. "semTypes.xml",
  144. ],
  145. )
  146. gazetteers = LazyCorpusLoader(
  147. "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
  148. )
  149. genesis = LazyCorpusLoader(
  150. "genesis",
  151. PlaintextCorpusReader,
  152. r"(?!\.).*\.txt",
  153. encoding=[
  154. ("finnish|french|german", "latin_1"),
  155. ("swedish", "cp865"),
  156. (".*", "utf_8"),
  157. ],
  158. )
  159. gutenberg = LazyCorpusLoader(
  160. "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
  161. )
  162. ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
  163. inaugural = LazyCorpusLoader(
  164. "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
  165. )
  166. # [XX] This should probably just use TaggedCorpusReader:
  167. indian = LazyCorpusLoader(
  168. "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
  169. )
  170. jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8")
  171. knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
  172. lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp")
  173. mac_morpho = LazyCorpusLoader(
  174. "mac_morpho",
  175. MacMorphoCorpusReader,
  176. r"(?!\.).*\.txt",
  177. tagset="unknown",
  178. encoding="latin-1",
  179. )
  180. machado = LazyCorpusLoader(
  181. "machado",
  182. PortugueseCategorizedPlaintextCorpusReader,
  183. r"(?!\.).*\.txt",
  184. cat_pattern=r"([a-z]*)/.*",
  185. encoding="latin-1",
  186. )
  187. masc_tagged = LazyCorpusLoader(
  188. "masc_tagged",
  189. CategorizedTaggedCorpusReader,
  190. r"(spoken|written)/.*\.txt",
  191. cat_file="categories.txt",
  192. tagset="wsj",
  193. encoding="utf-8",
  194. sep="_",
  195. )
  196. movie_reviews = LazyCorpusLoader(
  197. "movie_reviews",
  198. CategorizedPlaintextCorpusReader,
  199. r"(?!\.).*\.txt",
  200. cat_pattern=r"(neg|pos)/.*",
  201. encoding="ascii",
  202. )
  203. multext_east = LazyCorpusLoader(
  204. "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
  205. )
  206. names = LazyCorpusLoader(
  207. "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
  208. )
  209. nps_chat = LazyCorpusLoader(
  210. "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
  211. )
  212. opinion_lexicon = LazyCorpusLoader(
  213. "opinion_lexicon",
  214. OpinionLexiconCorpusReader,
  215. r"(\w+)\-words\.txt",
  216. encoding="ISO-8859-2",
  217. )
  218. ppattach = LazyCorpusLoader(
  219. "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
  220. )
  221. product_reviews_1 = LazyCorpusLoader(
  222. "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
  223. )
  224. product_reviews_2 = LazyCorpusLoader(
  225. "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
  226. )
  227. pros_cons = LazyCorpusLoader(
  228. "pros_cons",
  229. ProsConsCorpusReader,
  230. r"Integrated(Cons|Pros)\.txt",
  231. cat_pattern=r"Integrated(Cons|Pros)\.txt",
  232. encoding="ISO-8859-2",
  233. )
  234. ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
  235. "ptb",
  236. CategorizedBracketParseCorpusReader,
  237. r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
  238. cat_file="allcats.txt",
  239. tagset="wsj",
  240. )
  241. qc = LazyCorpusLoader(
  242. "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
  243. )
  244. reuters = LazyCorpusLoader(
  245. "reuters",
  246. CategorizedPlaintextCorpusReader,
  247. "(training|test).*",
  248. cat_file="cats.txt",
  249. encoding="ISO-8859-2",
  250. )
  251. rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
  252. senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos")
  253. sentence_polarity = LazyCorpusLoader(
  254. "sentence_polarity",
  255. CategorizedSentencesCorpusReader,
  256. r"rt-polarity\.(neg|pos)",
  257. cat_pattern=r"rt-polarity\.(neg|pos)",
  258. encoding="utf-8",
  259. )
  260. sentiwordnet = LazyCorpusLoader(
  261. "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
  262. )
  263. shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml")
  264. sinica_treebank = LazyCorpusLoader(
  265. "sinica_treebank",
  266. SinicaTreebankCorpusReader,
  267. ["parsed"],
  268. tagset="unknown",
  269. encoding="utf-8",
  270. )
  271. state_union = LazyCorpusLoader(
  272. "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
  273. )
  274. stopwords = LazyCorpusLoader(
  275. "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
  276. )
  277. subjectivity = LazyCorpusLoader(
  278. "subjectivity",
  279. CategorizedSentencesCorpusReader,
  280. r"(quote.tok.gt9|plot.tok.gt9)\.5000",
  281. cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
  282. encoding="latin-1",
  283. )
  284. swadesh = LazyCorpusLoader(
  285. "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
  286. )
  287. swadesh110 = LazyCorpusLoader(
  288. 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
  289. )
  290. swadesh207 = LazyCorpusLoader(
  291. 'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
  292. )
  293. switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
  294. timit = LazyCorpusLoader("timit", TimitCorpusReader)
  295. timit_tagged = LazyCorpusLoader(
  296. "timit", TimitTaggedCorpusReader, ".+\.tags", tagset="wsj", encoding="ascii"
  297. )
  298. toolbox = LazyCorpusLoader(
  299. "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
  300. )
  301. treebank = LazyCorpusLoader(
  302. "treebank/combined",
  303. BracketParseCorpusReader,
  304. r"wsj_.*\.mrg",
  305. tagset="wsj",
  306. encoding="ascii",
  307. )
  308. treebank_chunk = LazyCorpusLoader(
  309. "treebank/tagged",
  310. ChunkedCorpusReader,
  311. r"wsj_.*\.pos",
  312. sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
  313. para_block_reader=tagged_treebank_para_block_reader,
  314. tagset="wsj",
  315. encoding="ascii",
  316. )
  317. treebank_raw = LazyCorpusLoader(
  318. "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
  319. )
  320. twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, ".*\.json")
  321. udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
  322. udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
  323. universal_treebanks = LazyCorpusLoader(
  324. "universal_treebanks_v20",
  325. ConllCorpusReader,
  326. r".*\.conll",
  327. columntypes=(
  328. "ignore",
  329. "words",
  330. "ignore",
  331. "ignore",
  332. "pos",
  333. "ignore",
  334. "ignore",
  335. "ignore",
  336. "ignore",
  337. "ignore",
  338. ),
  339. )
  340. verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml")
  341. webtext = LazyCorpusLoader(
  342. "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
  343. )
  344. wordnet = LazyCorpusLoader(
  345. "wordnet",
  346. WordNetCorpusReader,
  347. LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
  348. )
  349. wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, ".*\.dat")
  350. words = LazyCorpusLoader(
  351. "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
  352. )
  353. # defined after treebank
  354. propbank = LazyCorpusLoader(
  355. "propbank",
  356. PropbankCorpusReader,
  357. "prop.txt",
  358. "frames/.*\.xml",
  359. "verbs.txt",
  360. lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
  361. treebank,
  362. ) # Must be defined *after* treebank corpus.
  363. nombank = LazyCorpusLoader(
  364. "nombank.1.0",
  365. NombankCorpusReader,
  366. "nombank.1.0",
  367. "frames/.*\.xml",
  368. "nombank.1.0.words",
  369. lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
  370. treebank,
  371. ) # Must be defined *after* treebank corpus.
  372. propbank_ptb = LazyCorpusLoader(
  373. "propbank",
  374. PropbankCorpusReader,
  375. "prop.txt",
  376. "frames/.*\.xml",
  377. "verbs.txt",
  378. lambda filename: filename.upper(),
  379. ptb,
  380. ) # Must be defined *after* ptb corpus.
  381. nombank_ptb = LazyCorpusLoader(
  382. "nombank.1.0",
  383. NombankCorpusReader,
  384. "nombank.1.0",
  385. "frames/.*\.xml",
  386. "nombank.1.0.words",
  387. lambda filename: filename.upper(),
  388. ptb,
  389. ) # Must be defined *after* ptb corpus.
  390. semcor = LazyCorpusLoader(
  391. "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
  392. ) # Must be defined *after* wordnet corpus.
  393. nonbreaking_prefixes = LazyCorpusLoader(
  394. "nonbreaking_prefixes",
  395. NonbreakingPrefixesCorpusReader,
  396. r"(?!README|\.).*",
  397. encoding="utf8",
  398. )
  399. perluniprops = LazyCorpusLoader(
  400. "perluniprops",
  401. UnicharsCorpusReader,
  402. r"(?!README|\.).*",
  403. nltk_data_subdir="misc",
  404. encoding="utf8",
  405. )
# mwa_ppdb = LazyCorpusLoader(
#     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')

# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716
#
# pl196x = LazyCorpusLoader(
#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
#     'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
#     'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
#     'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')

# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
  431. def demo():
  432. # This is out-of-date:
  433. abc.demo()
  434. brown.demo()
  435. # chat80.demo()
  436. cmudict.demo()
  437. conll2000.demo()
  438. conll2002.demo()
  439. genesis.demo()
  440. gutenberg.demo()
  441. ieer.demo()
  442. inaugural.demo()
  443. indian.demo()
  444. names.demo()
  445. ppattach.demo()
  446. senseval.demo()
  447. shakespeare.demo()
  448. sinica_treebank.demo()
  449. state_union.demo()
  450. stopwords.demo()
  451. timit.demo()
  452. toolbox.demo()
  453. treebank.demo()
  454. udhr.demo()
  455. webtext.demo()
  456. words.demo()
  457. # ycoe.demo()
  458. if __name__ == "__main__":
  459. # demo()
  460. pass
  461. # ** this is for nose **
  462. # unload all corpus after tests
  463. def teardown_module(module=None):
  464. import nltk.corpus
  465. for name in dir(nltk.corpus):
  466. obj = getattr(nltk.corpus, name, None)
  467. if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"):
  468. obj._unload()