collocations_app.py 14 KB


  1. # Natural Language Toolkit: Collocations Application
  2. # Much of the GUI code is imported from concordance.py; we intend to merge these tools together.
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. #
  8. import threading
  9. import queue as q
  10. from tkinter.font import Font
  11. from tkinter import (
  12. Button,
  13. END,
  14. Frame,
  15. IntVar,
  16. LEFT,
  17. Label,
  18. Menu,
  19. OptionMenu,
  20. SUNKEN,
  21. Scrollbar,
  22. StringVar,
  23. Text,
  24. Tk,
  25. )
  26. from nltk.corpus import (
  27. cess_cat,
  28. brown,
  29. nps_chat,
  30. treebank,
  31. sinica_treebank,
  32. alpino,
  33. indian,
  34. floresta,
  35. mac_morpho,
  36. machado,
  37. cess_esp,
  38. )
  39. from nltk.util import in_idle
  40. from nltk.probability import FreqDist
  41. CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
  42. ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
  43. POLL_INTERVAL = 100
  44. _DEFAULT = "English: Brown Corpus (Humor)"
  45. _CORPORA = {
  46. "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
  47. "English: Brown Corpus": lambda: brown.words(),
  48. "English: Brown Corpus (Press)": lambda: brown.words(
  49. categories=["news", "editorial", "reviews"]
  50. ),
  51. "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
  52. "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
  53. "English: Brown Corpus (Science Fiction)": lambda: brown.words(
  54. categories="science_fiction"
  55. ),
  56. "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
  57. "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
  58. "English: NPS Chat Corpus": lambda: nps_chat.words(),
  59. "English: Wall Street Journal Corpus": lambda: treebank.words(),
  60. "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
  61. "Dutch: Alpino Corpus": lambda: alpino.words(),
  62. "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
  63. "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
  64. "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
  65. "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
  66. "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
  67. }
  68. class CollocationsView:
  69. _BACKGROUND_COLOUR = "#FFF" # white
  70. def __init__(self):
  71. self.queue = q.Queue()
  72. self.model = CollocationsModel(self.queue)
  73. self.top = Tk()
  74. self._init_top(self.top)
  75. self._init_menubar()
  76. self._init_widgets(self.top)
  77. self.load_corpus(self.model.DEFAULT_CORPUS)
  78. self.after = self.top.after(POLL_INTERVAL, self._poll)
  79. def _init_top(self, top):
  80. top.geometry("550x650+50+50")
  81. top.title("NLTK Collocations List")
  82. top.bind("<Control-q>", self.destroy)
  83. top.protocol("WM_DELETE_WINDOW", self.destroy)
  84. top.minsize(550, 650)
  85. def _init_widgets(self, parent):
  86. self.main_frame = Frame(
  87. parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
  88. )
  89. self._init_corpus_select(self.main_frame)
  90. self._init_results_box(self.main_frame)
  91. self._init_paging(self.main_frame)
  92. self._init_status(self.main_frame)
  93. self.main_frame.pack(fill="both", expand=True)
  94. def _init_corpus_select(self, parent):
  95. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  96. self.var = StringVar(innerframe)
  97. self.var.set(self.model.DEFAULT_CORPUS)
  98. Label(
  99. innerframe,
  100. justify=LEFT,
  101. text=" Corpus: ",
  102. background=self._BACKGROUND_COLOUR,
  103. padx=2,
  104. pady=1,
  105. border=0,
  106. ).pack(side="left")
  107. other_corpora = list(self.model.CORPORA.keys()).remove(
  108. self.model.DEFAULT_CORPUS
  109. )
  110. om = OptionMenu(
  111. innerframe,
  112. self.var,
  113. self.model.DEFAULT_CORPUS,
  114. command=self.corpus_selected,
  115. *self.model.non_default_corpora()
  116. )
  117. om["borderwidth"] = 0
  118. om["highlightthickness"] = 1
  119. om.pack(side="left")
  120. innerframe.pack(side="top", fill="x", anchor="n")
  121. def _init_status(self, parent):
  122. self.status = Label(
  123. parent,
  124. justify=LEFT,
  125. relief=SUNKEN,
  126. background=self._BACKGROUND_COLOUR,
  127. border=0,
  128. padx=1,
  129. pady=0,
  130. )
  131. self.status.pack(side="top", anchor="sw")
  132. def _init_menubar(self):
  133. self._result_size = IntVar(self.top)
  134. menubar = Menu(self.top)
  135. filemenu = Menu(menubar, tearoff=0, borderwidth=0)
  136. filemenu.add_command(
  137. label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
  138. )
  139. menubar.add_cascade(label="File", underline=0, menu=filemenu)
  140. editmenu = Menu(menubar, tearoff=0)
  141. rescntmenu = Menu(editmenu, tearoff=0)
  142. rescntmenu.add_radiobutton(
  143. label="20",
  144. variable=self._result_size,
  145. underline=0,
  146. value=20,
  147. command=self.set_result_size,
  148. )
  149. rescntmenu.add_radiobutton(
  150. label="50",
  151. variable=self._result_size,
  152. underline=0,
  153. value=50,
  154. command=self.set_result_size,
  155. )
  156. rescntmenu.add_radiobutton(
  157. label="100",
  158. variable=self._result_size,
  159. underline=0,
  160. value=100,
  161. command=self.set_result_size,
  162. )
  163. rescntmenu.invoke(1)
  164. editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
  165. menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
  166. self.top.config(menu=menubar)
  167. def set_result_size(self, **kwargs):
  168. self.model.result_count = self._result_size.get()
  169. def _init_results_box(self, parent):
  170. innerframe = Frame(parent)
  171. i1 = Frame(innerframe)
  172. i2 = Frame(innerframe)
  173. vscrollbar = Scrollbar(i1, borderwidth=1)
  174. hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
  175. self.results_box = Text(
  176. i1,
  177. font=Font(family="courier", size="16"),
  178. state="disabled",
  179. borderwidth=1,
  180. yscrollcommand=vscrollbar.set,
  181. xscrollcommand=hscrollbar.set,
  182. wrap="none",
  183. width="40",
  184. height="20",
  185. exportselection=1,
  186. )
  187. self.results_box.pack(side="left", fill="both", expand=True)
  188. vscrollbar.pack(side="left", fill="y", anchor="e")
  189. vscrollbar.config(command=self.results_box.yview)
  190. hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
  191. hscrollbar.config(command=self.results_box.xview)
  192. # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
  193. Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
  194. side="left", anchor="e"
  195. )
  196. i1.pack(side="top", fill="both", expand=True, anchor="n")
  197. i2.pack(side="bottom", fill="x", anchor="s")
  198. innerframe.pack(side="top", fill="both", expand=True)
  199. def _init_paging(self, parent):
  200. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  201. self.prev = prev = Button(
  202. innerframe,
  203. text="Previous",
  204. command=self.previous,
  205. width="10",
  206. borderwidth=1,
  207. highlightthickness=1,
  208. state="disabled",
  209. )
  210. prev.pack(side="left", anchor="center")
  211. self.next = next = Button(
  212. innerframe,
  213. text="Next",
  214. command=self.__next__,
  215. width="10",
  216. borderwidth=1,
  217. highlightthickness=1,
  218. state="disabled",
  219. )
  220. next.pack(side="right", anchor="center")
  221. innerframe.pack(side="top", fill="y")
  222. self.reset_current_page()
  223. def reset_current_page(self):
  224. self.current_page = -1
  225. def _poll(self):
  226. try:
  227. event = self.queue.get(block=False)
  228. except q.Empty:
  229. pass
  230. else:
  231. if event == CORPUS_LOADED_EVENT:
  232. self.handle_corpus_loaded(event)
  233. elif event == ERROR_LOADING_CORPUS_EVENT:
  234. self.handle_error_loading_corpus(event)
  235. self.after = self.top.after(POLL_INTERVAL, self._poll)
  236. def handle_error_loading_corpus(self, event):
  237. self.status["text"] = "Error in loading " + self.var.get()
  238. self.unfreeze_editable()
  239. self.clear_results_box()
  240. self.freeze_editable()
  241. self.reset_current_page()
  242. def handle_corpus_loaded(self, event):
  243. self.status["text"] = self.var.get() + " is loaded"
  244. self.unfreeze_editable()
  245. self.clear_results_box()
  246. self.reset_current_page()
  247. # self.next()
  248. collocations = self.model.next(self.current_page + 1)
  249. self.write_results(collocations)
  250. self.current_page += 1
  251. def corpus_selected(self, *args):
  252. new_selection = self.var.get()
  253. self.load_corpus(new_selection)
  254. def previous(self):
  255. self.freeze_editable()
  256. collocations = self.model.prev(self.current_page - 1)
  257. self.current_page = self.current_page - 1
  258. self.clear_results_box()
  259. self.write_results(collocations)
  260. self.unfreeze_editable()
  261. def __next__(self):
  262. self.freeze_editable()
  263. collocations = self.model.next(self.current_page + 1)
  264. self.clear_results_box()
  265. self.write_results(collocations)
  266. self.current_page += 1
  267. self.unfreeze_editable()
  268. def load_corpus(self, selection):
  269. if self.model.selected_corpus != selection:
  270. self.status["text"] = "Loading " + selection + "..."
  271. self.freeze_editable()
  272. self.model.load_corpus(selection)
  273. def freeze_editable(self):
  274. self.prev["state"] = "disabled"
  275. self.next["state"] = "disabled"
  276. def clear_results_box(self):
  277. self.results_box["state"] = "normal"
  278. self.results_box.delete("1.0", END)
  279. self.results_box["state"] = "disabled"
  280. def fire_event(self, event):
  281. # Firing an event so that rendering of widgets happen in the mainloop thread
  282. self.top.event_generate(event, when="tail")
  283. def destroy(self, *e):
  284. if self.top is None:
  285. return
  286. self.top.after_cancel(self.after)
  287. self.top.destroy()
  288. self.top = None
  289. def mainloop(self, *args, **kwargs):
  290. if in_idle():
  291. return
  292. self.top.mainloop(*args, **kwargs)
  293. def unfreeze_editable(self):
  294. self.set_paging_button_states()
  295. def set_paging_button_states(self):
  296. if self.current_page == -1 or self.current_page == 0:
  297. self.prev["state"] = "disabled"
  298. else:
  299. self.prev["state"] = "normal"
  300. if self.model.is_last_page(self.current_page):
  301. self.next["state"] = "disabled"
  302. else:
  303. self.next["state"] = "normal"
  304. def write_results(self, results):
  305. self.results_box["state"] = "normal"
  306. row = 1
  307. for each in results:
  308. self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
  309. row += 1
  310. self.results_box["state"] = "disabled"
  311. class CollocationsModel:
  312. def __init__(self, queue):
  313. self.result_count = None
  314. self.selected_corpus = None
  315. self.collocations = None
  316. self.CORPORA = _CORPORA
  317. self.DEFAULT_CORPUS = _DEFAULT
  318. self.queue = queue
  319. self.reset_results()
  320. def reset_results(self):
  321. self.result_pages = []
  322. self.results_returned = 0
  323. def load_corpus(self, name):
  324. self.selected_corpus = name
  325. self.collocations = None
  326. runner_thread = self.LoadCorpus(name, self)
  327. runner_thread.start()
  328. self.reset_results()
  329. def non_default_corpora(self):
  330. copy = []
  331. copy.extend(list(self.CORPORA.keys()))
  332. copy.remove(self.DEFAULT_CORPUS)
  333. copy.sort()
  334. return copy
  335. def is_last_page(self, number):
  336. if number < len(self.result_pages):
  337. return False
  338. return self.results_returned + (
  339. number - len(self.result_pages)
  340. ) * self.result_count >= len(self.collocations)
  341. def next(self, page):
  342. if (len(self.result_pages) - 1) < page:
  343. for i in range(page - (len(self.result_pages) - 1)):
  344. self.result_pages.append(
  345. self.collocations[
  346. self.results_returned : self.results_returned
  347. + self.result_count
  348. ]
  349. )
  350. self.results_returned += self.result_count
  351. return self.result_pages[page]
  352. def prev(self, page):
  353. if page == -1:
  354. return []
  355. return self.result_pages[page]
  356. class LoadCorpus(threading.Thread):
  357. def __init__(self, name, model):
  358. threading.Thread.__init__(self)
  359. self.model, self.name = model, name
  360. def run(self):
  361. try:
  362. words = self.model.CORPORA[self.name]()
  363. from operator import itemgetter
  364. text = [w for w in words if len(w) > 2]
  365. fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
  366. vocab = FreqDist(text)
  367. scored = [
  368. ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
  369. for w1, w2 in fd
  370. ]
  371. scored.sort(key=itemgetter(1), reverse=True)
  372. self.model.collocations = list(map(itemgetter(0), scored))
  373. self.model.queue.put(CORPUS_LOADED_EVENT)
  374. except Exception as e:
  375. print(e)
  376. self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
  377. # def collocations():
  378. # colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
  379. def app():
  380. c = CollocationsView()
  381. c.mainloop()
  382. if __name__ == "__main__":
  383. app()
  384. __all__ = ["app"]