# concordance_app.py (24 KB)
  1. # Natural Language Toolkit: Concordance Application
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import re
  8. import threading
  9. import queue as q
  10. from tkinter.font import Font
  11. from tkinter import (
  12. Tk,
  13. Button,
  14. END,
  15. Entry,
  16. Frame,
  17. IntVar,
  18. LEFT,
  19. Label,
  20. Menu,
  21. OptionMenu,
  22. SUNKEN,
  23. Scrollbar,
  24. StringVar,
  25. Text,
  26. )
  27. from nltk.corpus import (
  28. cess_cat,
  29. brown,
  30. nps_chat,
  31. treebank,
  32. sinica_treebank,
  33. alpino,
  34. indian,
  35. floresta,
  36. mac_morpho,
  37. cess_esp,
  38. )
  39. from nltk.util import in_idle
  40. from nltk.draw.util import ShowText
  41. WORD_OR_TAG = "[^/ ]+"
  42. BOUNDARY = r"\b"
  43. CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
  44. SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
  45. SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
  46. ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
  47. POLL_INTERVAL = 50
  48. # NB All corpora must be specified in a lambda expression so as not to be
  49. # loaded when the module is imported.
  50. _DEFAULT = "English: Brown Corpus (Humor, simplified)"
  51. _CORPORA = {
  52. "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
  53. tagset="universal"
  54. ),
  55. "English: Brown Corpus": lambda: brown.tagged_sents(),
  56. "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
  57. tagset="universal"
  58. ),
  59. "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
  60. categories=["news", "editorial", "reviews"], tagset="universal"
  61. ),
  62. "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
  63. categories="religion", tagset="universal"
  64. ),
  65. "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
  66. categories="learned", tagset="universal"
  67. ),
  68. "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
  69. categories="science_fiction", tagset="universal"
  70. ),
  71. "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
  72. categories="romance", tagset="universal"
  73. ),
  74. "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
  75. categories="humor", tagset="universal"
  76. ),
  77. "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
  78. "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
  79. tagset="universal"
  80. ),
  81. "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
  82. "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
  83. tagset="universal"
  84. ),
  85. "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
  86. "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
  87. tagset="universal"
  88. ),
  89. "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
  90. "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
  91. tagset="universal"
  92. ),
  93. "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
  94. "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
  95. files="hindi.pos", tagset="universal"
  96. ),
  97. "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
  98. "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
  99. tagset="universal"
  100. ),
  101. "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
  102. "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
  103. tagset="universal"
  104. ),
  105. "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
  106. tagset="universal"
  107. ),
  108. }
  109. class ConcordanceSearchView(object):
  110. _BACKGROUND_COLOUR = "#FFF" # white
  111. # Colour of highlighted results
  112. _HIGHLIGHT_WORD_COLOUR = "#F00" # red
  113. _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
  114. _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
  115. _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
  116. # Percentage of text left of the scrollbar position
  117. _FRACTION_LEFT_TEXT = 0.30
  118. def __init__(self):
  119. self.queue = q.Queue()
  120. self.model = ConcordanceSearchModel(self.queue)
  121. self.top = Tk()
  122. self._init_top(self.top)
  123. self._init_menubar()
  124. self._init_widgets(self.top)
  125. self.load_corpus(self.model.DEFAULT_CORPUS)
  126. self.after = self.top.after(POLL_INTERVAL, self._poll)
  127. def _init_top(self, top):
  128. top.geometry("950x680+50+50")
  129. top.title("NLTK Concordance Search")
  130. top.bind("<Control-q>", self.destroy)
  131. top.protocol("WM_DELETE_WINDOW", self.destroy)
  132. top.minsize(950, 680)
  133. def _init_widgets(self, parent):
  134. self.main_frame = Frame(
  135. parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
  136. )
  137. self._init_corpus_select(self.main_frame)
  138. self._init_query_box(self.main_frame)
  139. self._init_results_box(self.main_frame)
  140. self._init_paging(self.main_frame)
  141. self._init_status(self.main_frame)
  142. self.main_frame.pack(fill="both", expand=True)
  143. def _init_menubar(self):
  144. self._result_size = IntVar(self.top)
  145. self._cntx_bf_len = IntVar(self.top)
  146. self._cntx_af_len = IntVar(self.top)
  147. menubar = Menu(self.top)
  148. filemenu = Menu(menubar, tearoff=0, borderwidth=0)
  149. filemenu.add_command(
  150. label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
  151. )
  152. menubar.add_cascade(label="File", underline=0, menu=filemenu)
  153. editmenu = Menu(menubar, tearoff=0)
  154. rescntmenu = Menu(editmenu, tearoff=0)
  155. rescntmenu.add_radiobutton(
  156. label="20",
  157. variable=self._result_size,
  158. underline=0,
  159. value=20,
  160. command=self.set_result_size,
  161. )
  162. rescntmenu.add_radiobutton(
  163. label="50",
  164. variable=self._result_size,
  165. underline=0,
  166. value=50,
  167. command=self.set_result_size,
  168. )
  169. rescntmenu.add_radiobutton(
  170. label="100",
  171. variable=self._result_size,
  172. underline=0,
  173. value=100,
  174. command=self.set_result_size,
  175. )
  176. rescntmenu.invoke(1)
  177. editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
  178. cntxmenu = Menu(editmenu, tearoff=0)
  179. cntxbfmenu = Menu(cntxmenu, tearoff=0)
  180. cntxbfmenu.add_radiobutton(
  181. label="60 characters",
  182. variable=self._cntx_bf_len,
  183. underline=0,
  184. value=60,
  185. command=self.set_cntx_bf_len,
  186. )
  187. cntxbfmenu.add_radiobutton(
  188. label="80 characters",
  189. variable=self._cntx_bf_len,
  190. underline=0,
  191. value=80,
  192. command=self.set_cntx_bf_len,
  193. )
  194. cntxbfmenu.add_radiobutton(
  195. label="100 characters",
  196. variable=self._cntx_bf_len,
  197. underline=0,
  198. value=100,
  199. command=self.set_cntx_bf_len,
  200. )
  201. cntxbfmenu.invoke(1)
  202. cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
  203. cntxafmenu = Menu(cntxmenu, tearoff=0)
  204. cntxafmenu.add_radiobutton(
  205. label="70 characters",
  206. variable=self._cntx_af_len,
  207. underline=0,
  208. value=70,
  209. command=self.set_cntx_af_len,
  210. )
  211. cntxafmenu.add_radiobutton(
  212. label="90 characters",
  213. variable=self._cntx_af_len,
  214. underline=0,
  215. value=90,
  216. command=self.set_cntx_af_len,
  217. )
  218. cntxafmenu.add_radiobutton(
  219. label="110 characters",
  220. variable=self._cntx_af_len,
  221. underline=0,
  222. value=110,
  223. command=self.set_cntx_af_len,
  224. )
  225. cntxafmenu.invoke(1)
  226. cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
  227. editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
  228. menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
  229. self.top.config(menu=menubar)
  230. def set_result_size(self, **kwargs):
  231. self.model.result_count = self._result_size.get()
  232. def set_cntx_af_len(self, **kwargs):
  233. self._char_after = self._cntx_af_len.get()
  234. def set_cntx_bf_len(self, **kwargs):
  235. self._char_before = self._cntx_bf_len.get()
  236. def _init_corpus_select(self, parent):
  237. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  238. self.var = StringVar(innerframe)
  239. self.var.set(self.model.DEFAULT_CORPUS)
  240. Label(
  241. innerframe,
  242. justify=LEFT,
  243. text=" Corpus: ",
  244. background=self._BACKGROUND_COLOUR,
  245. padx=2,
  246. pady=1,
  247. border=0,
  248. ).pack(side="left")
  249. other_corpora = list(self.model.CORPORA.keys()).remove(
  250. self.model.DEFAULT_CORPUS
  251. )
  252. om = OptionMenu(
  253. innerframe,
  254. self.var,
  255. self.model.DEFAULT_CORPUS,
  256. command=self.corpus_selected,
  257. *self.model.non_default_corpora()
  258. )
  259. om["borderwidth"] = 0
  260. om["highlightthickness"] = 1
  261. om.pack(side="left")
  262. innerframe.pack(side="top", fill="x", anchor="n")
  263. def _init_status(self, parent):
  264. self.status = Label(
  265. parent,
  266. justify=LEFT,
  267. relief=SUNKEN,
  268. background=self._BACKGROUND_COLOUR,
  269. border=0,
  270. padx=1,
  271. pady=0,
  272. )
  273. self.status.pack(side="top", anchor="sw")
  274. def _init_query_box(self, parent):
  275. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  276. another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
  277. self.query_box = Entry(another, width=60)
  278. self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
  279. self.search_button = Button(
  280. another,
  281. text="Search",
  282. command=self.search,
  283. borderwidth=1,
  284. highlightthickness=1,
  285. )
  286. self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
  287. self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
  288. another.pack()
  289. innerframe.pack(side="top", fill="x", anchor="n")
  290. def search_enter_keypress_handler(self, *event):
  291. self.search()
  292. def _init_results_box(self, parent):
  293. innerframe = Frame(parent)
  294. i1 = Frame(innerframe)
  295. i2 = Frame(innerframe)
  296. vscrollbar = Scrollbar(i1, borderwidth=1)
  297. hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
  298. self.results_box = Text(
  299. i1,
  300. font=Font(family="courier", size="16"),
  301. state="disabled",
  302. borderwidth=1,
  303. yscrollcommand=vscrollbar.set,
  304. xscrollcommand=hscrollbar.set,
  305. wrap="none",
  306. width="40",
  307. height="20",
  308. exportselection=1,
  309. )
  310. self.results_box.pack(side="left", fill="both", expand=True)
  311. self.results_box.tag_config(
  312. self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
  313. )
  314. self.results_box.tag_config(
  315. self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
  316. )
  317. vscrollbar.pack(side="left", fill="y", anchor="e")
  318. vscrollbar.config(command=self.results_box.yview)
  319. hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
  320. hscrollbar.config(command=self.results_box.xview)
  321. # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
  322. Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
  323. side="left", anchor="e"
  324. )
  325. i1.pack(side="top", fill="both", expand=True, anchor="n")
  326. i2.pack(side="bottom", fill="x", anchor="s")
  327. innerframe.pack(side="top", fill="both", expand=True)
  328. def _init_paging(self, parent):
  329. innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
  330. self.prev = prev = Button(
  331. innerframe,
  332. text="Previous",
  333. command=self.previous,
  334. width="10",
  335. borderwidth=1,
  336. highlightthickness=1,
  337. state="disabled",
  338. )
  339. prev.pack(side="left", anchor="center")
  340. self.next = next = Button(
  341. innerframe,
  342. text="Next",
  343. command=self.__next__,
  344. width="10",
  345. borderwidth=1,
  346. highlightthickness=1,
  347. state="disabled",
  348. )
  349. next.pack(side="right", anchor="center")
  350. innerframe.pack(side="top", fill="y")
  351. self.current_page = 0
  352. def previous(self):
  353. self.clear_results_box()
  354. self.freeze_editable()
  355. self.model.prev(self.current_page - 1)
  356. def __next__(self):
  357. self.clear_results_box()
  358. self.freeze_editable()
  359. self.model.next(self.current_page + 1)
  360. def about(self, *e):
  361. ABOUT = "NLTK Concordance Search Demo\n"
  362. TITLE = "About: NLTK Concordance Search Demo"
  363. try:
  364. from tkinter.messagebox import Message
  365. Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
  366. except:
  367. ShowText(self.top, TITLE, ABOUT)
  368. def _bind_event_handlers(self):
  369. self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
  370. self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
  371. self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
  372. self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)
  373. def _poll(self):
  374. try:
  375. event = self.queue.get(block=False)
  376. except q.Empty:
  377. pass
  378. else:
  379. if event == CORPUS_LOADED_EVENT:
  380. self.handle_corpus_loaded(event)
  381. elif event == SEARCH_TERMINATED_EVENT:
  382. self.handle_search_terminated(event)
  383. elif event == SEARCH_ERROR_EVENT:
  384. self.handle_search_error(event)
  385. elif event == ERROR_LOADING_CORPUS_EVENT:
  386. self.handle_error_loading_corpus(event)
  387. self.after = self.top.after(POLL_INTERVAL, self._poll)
  388. def handle_error_loading_corpus(self, event):
  389. self.status["text"] = "Error in loading " + self.var.get()
  390. self.unfreeze_editable()
  391. self.clear_all()
  392. self.freeze_editable()
  393. def handle_corpus_loaded(self, event):
  394. self.status["text"] = self.var.get() + " is loaded"
  395. self.unfreeze_editable()
  396. self.clear_all()
  397. self.query_box.focus_set()
  398. def handle_search_terminated(self, event):
  399. # todo: refactor the model such that it is less state sensitive
  400. results = self.model.get_results()
  401. self.write_results(results)
  402. self.status["text"] = ""
  403. if len(results) == 0:
  404. self.status["text"] = "No results found for " + self.model.query
  405. else:
  406. self.current_page = self.model.last_requested_page
  407. self.unfreeze_editable()
  408. self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
  409. def handle_search_error(self, event):
  410. self.status["text"] = "Error in query " + self.model.query
  411. self.unfreeze_editable()
  412. def corpus_selected(self, *args):
  413. new_selection = self.var.get()
  414. self.load_corpus(new_selection)
  415. def load_corpus(self, selection):
  416. if self.model.selected_corpus != selection:
  417. self.status["text"] = "Loading " + selection + "..."
  418. self.freeze_editable()
  419. self.model.load_corpus(selection)
  420. def search(self):
  421. self.current_page = 0
  422. self.clear_results_box()
  423. self.model.reset_results()
  424. query = self.query_box.get()
  425. if len(query.strip()) == 0:
  426. return
  427. self.status["text"] = "Searching for " + query
  428. self.freeze_editable()
  429. self.model.search(query, self.current_page + 1)
  430. def write_results(self, results):
  431. self.results_box["state"] = "normal"
  432. row = 1
  433. for each in results:
  434. sent, pos1, pos2 = each[0].strip(), each[1], each[2]
  435. if len(sent) != 0:
  436. if pos1 < self._char_before:
  437. sent, pos1, pos2 = self.pad(sent, pos1, pos2)
  438. sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
  439. if not row == len(results):
  440. sentence += "\n"
  441. self.results_box.insert(str(row) + ".0", sentence)
  442. word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
  443. for marker in word_markers:
  444. self.results_box.tag_add(
  445. self._HIGHLIGHT_WORD_TAG,
  446. str(row) + "." + str(marker[0]),
  447. str(row) + "." + str(marker[1]),
  448. )
  449. for marker in label_markers:
  450. self.results_box.tag_add(
  451. self._HIGHLIGHT_LABEL_TAG,
  452. str(row) + "." + str(marker[0]),
  453. str(row) + "." + str(marker[1]),
  454. )
  455. row += 1
  456. self.results_box["state"] = "disabled"
  457. def words_and_labels(self, sentence, pos1, pos2):
  458. search_exp = sentence[pos1:pos2]
  459. words, labels = [], []
  460. labeled_words = search_exp.split(" ")
  461. index = 0
  462. for each in labeled_words:
  463. if each == "":
  464. index += 1
  465. else:
  466. word, label = each.split("/")
  467. words.append(
  468. (self._char_before + index, self._char_before + index + len(word))
  469. )
  470. index += len(word) + 1
  471. labels.append(
  472. (self._char_before + index, self._char_before + index + len(label))
  473. )
  474. index += len(label)
  475. index += 1
  476. return words, labels
  477. def pad(self, sent, hstart, hend):
  478. if hstart >= self._char_before:
  479. return sent, hstart, hend
  480. d = self._char_before - hstart
  481. sent = "".join([" "] * d) + sent
  482. return sent, hstart + d, hend + d
  483. def destroy(self, *e):
  484. if self.top is None:
  485. return
  486. self.top.after_cancel(self.after)
  487. self.top.destroy()
  488. self.top = None
  489. def clear_all(self):
  490. self.query_box.delete(0, END)
  491. self.model.reset_query()
  492. self.clear_results_box()
  493. def clear_results_box(self):
  494. self.results_box["state"] = "normal"
  495. self.results_box.delete("1.0", END)
  496. self.results_box["state"] = "disabled"
  497. def freeze_editable(self):
  498. self.query_box["state"] = "disabled"
  499. self.search_button["state"] = "disabled"
  500. self.prev["state"] = "disabled"
  501. self.next["state"] = "disabled"
  502. def unfreeze_editable(self):
  503. self.query_box["state"] = "normal"
  504. self.search_button["state"] = "normal"
  505. self.set_paging_button_states()
  506. def set_paging_button_states(self):
  507. if self.current_page == 0 or self.current_page == 1:
  508. self.prev["state"] = "disabled"
  509. else:
  510. self.prev["state"] = "normal"
  511. if self.model.has_more_pages(self.current_page):
  512. self.next["state"] = "normal"
  513. else:
  514. self.next["state"] = "disabled"
  515. def fire_event(self, event):
  516. # Firing an event so that rendering of widgets happen in the mainloop thread
  517. self.top.event_generate(event, when="tail")
  518. def mainloop(self, *args, **kwargs):
  519. if in_idle():
  520. return
  521. self.top.mainloop(*args, **kwargs)
  522. class ConcordanceSearchModel(object):
  523. def __init__(self, queue):
  524. self.queue = queue
  525. self.CORPORA = _CORPORA
  526. self.DEFAULT_CORPUS = _DEFAULT
  527. self.selected_corpus = None
  528. self.reset_query()
  529. self.reset_results()
  530. self.result_count = None
  531. self.last_sent_searched = 0
  532. def non_default_corpora(self):
  533. copy = []
  534. copy.extend(list(self.CORPORA.keys()))
  535. copy.remove(self.DEFAULT_CORPUS)
  536. copy.sort()
  537. return copy
  538. def load_corpus(self, name):
  539. self.selected_corpus = name
  540. self.tagged_sents = []
  541. runner_thread = self.LoadCorpus(name, self)
  542. runner_thread.start()
  543. def search(self, query, page):
  544. self.query = query
  545. self.last_requested_page = page
  546. self.SearchCorpus(self, page, self.result_count).start()
  547. def next(self, page):
  548. self.last_requested_page = page
  549. if len(self.results) < page:
  550. self.search(self.query, page)
  551. else:
  552. self.queue.put(SEARCH_TERMINATED_EVENT)
  553. def prev(self, page):
  554. self.last_requested_page = page
  555. self.queue.put(SEARCH_TERMINATED_EVENT)
  556. def reset_results(self):
  557. self.last_sent_searched = 0
  558. self.results = []
  559. self.last_page = None
  560. def reset_query(self):
  561. self.query = None
  562. def set_results(self, page, resultset):
  563. self.results.insert(page - 1, resultset)
  564. def get_results(self):
  565. return self.results[self.last_requested_page - 1]
  566. def has_more_pages(self, page):
  567. if self.results == [] or self.results[0] == []:
  568. return False
  569. if self.last_page is None:
  570. return True
  571. return page < self.last_page
  572. class LoadCorpus(threading.Thread):
  573. def __init__(self, name, model):
  574. threading.Thread.__init__(self)
  575. self.model, self.name = model, name
  576. def run(self):
  577. try:
  578. ts = self.model.CORPORA[self.name]()
  579. self.model.tagged_sents = [
  580. " ".join(w + "/" + t for (w, t) in sent) for sent in ts
  581. ]
  582. self.model.queue.put(CORPUS_LOADED_EVENT)
  583. except Exception as e:
  584. print(e)
  585. self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
  586. class SearchCorpus(threading.Thread):
  587. def __init__(self, model, page, count):
  588. self.model, self.count, self.page = model, count, page
  589. threading.Thread.__init__(self)
  590. def run(self):
  591. q = self.processed_query()
  592. sent_pos, i, sent_count = [], 0, 0
  593. for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
  594. try:
  595. m = re.search(q, sent)
  596. except re.error:
  597. self.model.reset_results()
  598. self.model.queue.put(SEARCH_ERROR_EVENT)
  599. return
  600. if m:
  601. sent_pos.append((sent, m.start(), m.end()))
  602. i += 1
  603. if i > self.count:
  604. self.model.last_sent_searched += sent_count - 1
  605. break
  606. sent_count += 1
  607. if self.count >= len(sent_pos):
  608. self.model.last_sent_searched += sent_count - 1
  609. self.model.last_page = self.page
  610. self.model.set_results(self.page, sent_pos)
  611. else:
  612. self.model.set_results(self.page, sent_pos[:-1])
  613. self.model.queue.put(SEARCH_TERMINATED_EVENT)
  614. def processed_query(self):
  615. new = []
  616. for term in self.model.query.split():
  617. term = re.sub(r"\.", r"[^/ ]", term)
  618. if re.match("[A-Z]+$", term):
  619. new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
  620. elif "/" in term:
  621. new.append(BOUNDARY + term + BOUNDARY)
  622. else:
  623. new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
  624. return " ".join(new)
  625. def app():
  626. d = ConcordanceSearchView()
  627. d.mainloop()
# Start the GUI when this file is executed as a script.
if __name__ == "__main__":
    app()

# ``app`` is the only name exported by a star-import of this module.
__all__ = ["app"]