book.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. # Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. #
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from nltk.corpus import (
  9. gutenberg,
  10. genesis,
  11. inaugural,
  12. nps_chat,
  13. webtext,
  14. treebank,
  15. wordnet,
  16. )
  17. from nltk.text import Text
  18. from nltk.probability import FreqDist
  19. from nltk.util import bigrams
  20. print("*** Introductory Examples for the NLTK Book ***")
  21. print("Loading text1, ..., text9 and sent1, ..., sent9")
  22. print("Type the name of the text or sentence to view it.")
  23. print("Type: 'texts()' or 'sents()' to list the materials.")
  24. text1 = Text(gutenberg.words("melville-moby_dick.txt"))
  25. print("text1:", text1.name)
  26. text2 = Text(gutenberg.words("austen-sense.txt"))
  27. print("text2:", text2.name)
  28. text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
  29. print("text3:", text3.name)
  30. text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
  31. print("text4:", text4.name)
  32. text5 = Text(nps_chat.words(), name="Chat Corpus")
  33. print("text5:", text5.name)
  34. text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
  35. print("text6:", text6.name)
  36. text7 = Text(treebank.words(), name="Wall Street Journal")
  37. print("text7:", text7.name)
  38. text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
  39. print("text8:", text8.name)
  40. text9 = Text(gutenberg.words("chesterton-thursday.txt"))
  41. print("text9:", text9.name)
  42. def texts():
  43. print("text1:", text1.name)
  44. print("text2:", text2.name)
  45. print("text3:", text3.name)
  46. print("text4:", text4.name)
  47. print("text5:", text5.name)
  48. print("text6:", text6.name)
  49. print("text7:", text7.name)
  50. print("text8:", text8.name)
  51. print("text9:", text9.name)
  52. sent1 = ["Call", "me", "Ishmael", "."]
  53. sent2 = [
  54. "The",
  55. "family",
  56. "of",
  57. "Dashwood",
  58. "had",
  59. "long",
  60. "been",
  61. "settled",
  62. "in",
  63. "Sussex",
  64. ".",
  65. ]
  66. sent3 = [
  67. "In",
  68. "the",
  69. "beginning",
  70. "God",
  71. "created",
  72. "the",
  73. "heaven",
  74. "and",
  75. "the",
  76. "earth",
  77. ".",
  78. ]
  79. sent4 = [
  80. "Fellow",
  81. "-",
  82. "Citizens",
  83. "of",
  84. "the",
  85. "Senate",
  86. "and",
  87. "of",
  88. "the",
  89. "House",
  90. "of",
  91. "Representatives",
  92. ":",
  93. ]
  94. sent5 = [
  95. "I",
  96. "have",
  97. "a",
  98. "problem",
  99. "with",
  100. "people",
  101. "PMing",
  102. "me",
  103. "to",
  104. "lol",
  105. "JOIN",
  106. ]
  107. sent6 = [
  108. "SCENE",
  109. "1",
  110. ":",
  111. "[",
  112. "wind",
  113. "]",
  114. "[",
  115. "clop",
  116. "clop",
  117. "clop",
  118. "]",
  119. "KING",
  120. "ARTHUR",
  121. ":",
  122. "Whoa",
  123. "there",
  124. "!",
  125. ]
  126. sent7 = [
  127. "Pierre",
  128. "Vinken",
  129. ",",
  130. "61",
  131. "years",
  132. "old",
  133. ",",
  134. "will",
  135. "join",
  136. "the",
  137. "board",
  138. "as",
  139. "a",
  140. "nonexecutive",
  141. "director",
  142. "Nov.",
  143. "29",
  144. ".",
  145. ]
  146. sent8 = [
  147. "25",
  148. "SEXY",
  149. "MALE",
  150. ",",
  151. "seeks",
  152. "attrac",
  153. "older",
  154. "single",
  155. "lady",
  156. ",",
  157. "for",
  158. "discreet",
  159. "encounters",
  160. ".",
  161. ]
  162. sent9 = [
  163. "THE",
  164. "suburb",
  165. "of",
  166. "Saffron",
  167. "Park",
  168. "lay",
  169. "on",
  170. "the",
  171. "sunset",
  172. "side",
  173. "of",
  174. "London",
  175. ",",
  176. "as",
  177. "red",
  178. "and",
  179. "ragged",
  180. "as",
  181. "a",
  182. "cloud",
  183. "of",
  184. "sunset",
  185. ".",
  186. ]
  187. def sents():
  188. print("sent1:", " ".join(sent1))
  189. print("sent2:", " ".join(sent2))
  190. print("sent3:", " ".join(sent3))
  191. print("sent4:", " ".join(sent4))
  192. print("sent5:", " ".join(sent5))
  193. print("sent6:", " ".join(sent6))
  194. print("sent7:", " ".join(sent7))
  195. print("sent8:", " ".join(sent8))
  196. print("sent9:", " ".join(sent9))