| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503 |
- # Natural Language Toolkit: Regexp Chunk Parser Application
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- A graphical tool for exploring the regular expression based chunk
- parser ``nltk.chunk.RegexpChunkParser``.
- """
- # Todo: Add a way to select the development set from the menubar. This
- # might just need to be a selection box (conll vs treebank etc) plus
- # configuration parameters to select what's being chunked (eg VP vs NP)
- # and what part of the data is being used as the development set.
- import time
- import textwrap
- import re
- import random
- from tkinter import (
- Button,
- Canvas,
- Checkbutton,
- Frame,
- IntVar,
- Label,
- Menu,
- Scrollbar,
- Text,
- Tk,
- )
- from tkinter.filedialog import askopenfilename, asksaveasfilename
- from tkinter.font import Font
- from nltk.tree import Tree
- from nltk.util import in_idle
- from nltk.draw.util import ShowText
- from nltk.corpus import conll2000, treebank_chunk
- from nltk.chunk import ChunkScore, RegexpChunkParser
- from nltk.chunk.regexp import RegexpChunkRule
- class RegexpChunkApp(object):
- """
- A graphical tool for exploring the regular expression based chunk
- parser ``nltk.chunk.RegexpChunkParser``.
- See ``HELP`` for instructional text.
- """
- ##/////////////////////////////////////////////////////////////////
- ## Help Text
- ##/////////////////////////////////////////////////////////////////
#: A dictionary mapping from part of speech tags to descriptions,
#: which is used in the help text.  (This should probably live with
#: the conll and/or treebank corpus instead.)
#: The entry order is kept as-is; only the description strings were
#: corrected ("mass" was misspelled, and the VBZ entry lacked a space).
TAGSET = {
    "CC": "Coordinating conjunction",
    "PRP$": "Possessive pronoun",
    "CD": "Cardinal number",
    "RB": "Adverb",
    "DT": "Determiner",
    "RBR": "Adverb, comparative",
    "EX": "Existential there",
    "RBS": "Adverb, superlative",
    "FW": "Foreign word",
    "RP": "Particle",
    "JJ": "Adjective",
    "TO": "to",
    "JJR": "Adjective, comparative",
    "UH": "Interjection",
    "JJS": "Adjective, superlative",
    "VB": "Verb, base form",
    "LS": "List item marker",
    "VBD": "Verb, past tense",
    "MD": "Modal",
    "NNS": "Noun, plural",
    "NN": "Noun, singular or mass",
    "VBN": "Verb, past participle",
    "VBZ": "Verb, 3rd ps. sing. present",
    "NNP": "Proper noun, singular",
    "NNPS": "Proper noun plural",
    "WDT": "wh-determiner",
    "PDT": "Predeterminer",
    "WP": "wh-pronoun",
    "POS": "Possessive ending",
    "WP$": "Possessive wh-pronoun",
    "PRP": "Personal pronoun",
    "WRB": "wh-adverb",
    "(": "open parenthesis",
    ")": "close parenthesis",
    "``": "open quote",
    ",": "comma",
    "''": "close quote",
    ".": "period",
    "#": "pound sign (currency marker)",
    "$": "dollar sign (currency marker)",
    "IN": "Preposition/subord. conjunction",
    "SYM": "Symbol (mathematical or scientific)",
    "VBG": "Verb, gerund/present participle",
    "VBP": "Verb, non-3rd ps. sing. present",
    ":": "colon",
}
#: Contents for the help box.  This is a list of tuples, one for
#: each help page, where each tuple has three elements:
#:   - A title (displayed as a tab)
#:   - A string description of tabstops (see Tkinter.Text for details)
#:   - The text contents for the help page.  You can use expressions
#:     like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
#:     for a list of tags you can use for colorizing.
HELP = [
    (
        "Help",
        "20",
        "Welcome to the regular expression chunk-parser grammar editor.  "
        "You can use this editor to develop and test chunk parser grammars "
        "based on NLTK's RegexpChunkParser class.\n\n"
        # Help box.
        "Use this box ('Help') to learn more about the editor; click on the "
        "tabs for help on specific topics:"
        "<indent>\n"
        "Rules: grammar rule types\n"
        "Regexps: regular expression syntax\n"
        "Tags: part of speech tags\n</indent>\n"
        # Grammar.
        "Use the upper-left box ('Grammar') to edit your grammar.  "
        "Each line of your grammar specifies a single 'rule', "
        "which performs an action such as creating a chunk or merging "
        "two chunks.\n\n"
        # Dev set.
        "The lower-left box ('Development Set') runs your grammar on the "
        "development set, and displays the results.  "
        "Your grammar's chunks are <highlight>highlighted</highlight>, and "
        "the correct (gold standard) chunks are "
        "<underline>underlined</underline>.  If they "
        "match, they are displayed in <green>green</green>; otherwise, "
        "they are displayed in <red>red</red>.  The box displays a single "
        "sentence from the development set at a time; use the scrollbar or "
        "the next/previous buttons to view additional sentences.\n\n"
        # Performance
        "The lower-right box ('Evaluation') tracks the performance of "
        "your grammar on the development set.  The 'precision' axis "
        "indicates how many of your grammar's chunks are correct; and "
        "the 'recall' axis indicates how many of the gold standard "
        "chunks your system generated.  Typically, you should try to "
        "design a grammar that scores high on both metrics.  The "
        "exact precision and recall of the current grammar, as well "
        "as their harmonic mean (the 'f-score'), are displayed in "
        "the status bar at the bottom of the window.",
    ),
    (
        "Rules",
        "10",
        "<h1>{...regexp...}</h1>"
        "<indent>\nChunk rule: creates new chunks from words matching "
        "regexp.</indent>\n\n"
        "<h1>}...regexp...{</h1>"
        "<indent>\nChink rule: removes words matching regexp from existing "
        "chunks.</indent>\n\n"
        "<h1>...regexp1...}{...regexp2...</h1>"
        "<indent>\nSplit rule: splits chunks that match regexp1 followed by "
        "regexp2 in two.</indent>\n\n"
        # The header names both patterns so it agrees with the description.
        "<h1>...regexp1...{}...regexp2...</h1>"
        "<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
        "and regexp2.</indent>\n",
    ),
    (
        "Regexps",
        "10 60",
        # "Regular Expression Syntax Summary:\n\n"
        "<h1>Pattern\t\tMatches...</h1>\n"
        "<hangindent>"
        "\t<<var>T</var>>\ta word with tag <var>T</var> "
        "(where <var>T</var> may be a regexp).\n"
        "\t<var>x</var>?\tan optional <var>x</var>\n"
        "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
        "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
        "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
        "\t.\tmatches any character\n"
        "\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
        "\t# <var>x...</var>\tTreats <var>x...</var> "
        "(to the end of the line) as a comment\n"
        "\t\\<var>C</var>\tmatches character <var>C</var> "
        "(useful when <var>C</var> is a special character "
        "like + or #)\n"
        "</hangindent>"
        "\n<h1>Examples:</h1>\n"
        "<hangindent>"
        "\t<regexp><NN></regexp>\n"
        '\t\tMatches <match>"cow/NN"</match>\n'
        '\t\tMatches <match>"green/NN"</match>\n'
        "\t<regexp><VB.*></regexp>\n"
        '\t\tMatches <match>"eating/VBG"</match>\n'
        '\t\tMatches <match>"ate/VBD"</match>\n'
        "\t<regexp><IN><DT><NN></regexp>\n"
        '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
        "\t<regexp><RB>?<VBD></regexp>\n"
        '\t\tMatches <match>"ran/VBD"</match>\n'
        '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
        # The backslash is written as "\\" so the literal holds the same
        # text (backslash + '#') without an invalid "\#" escape sequence.
        "\t<regexp><\\#><CD>  # This is a comment...</regexp>\n"
        '\t\tMatches <match>"#/# 100/CD"</match>\n'
        "</hangindent>",
    ),
    (
        "Tags",
        "10 60",
        "<h1>Part of Speech Tags:</h1>\n"
        "<hangindent>"
        "<<TAGSET>>"  # this gets auto-substituted w/ self.TAGSET
        "</hangindent>\n",
    ),
]
#: (tag name, Text-tag options) pairs.  Each name ``X`` can be written as
#: <X>...</X> inside the ``HELP`` page texts to style the enclosed span.
HELP_AUTOTAG = [
    ("red", {"foreground": "#a00"}),
    ("green", {"foreground": "#080"}),
    ("highlight", {"background": "#ddd"}),
    ("underline", {"underline": True}),
    ("h1", {"underline": True}),
    ("indent", {"lmargin1": 20, "lmargin2": 20}),
    ("hangindent", {"lmargin1": 0, "lmargin2": 60}),
    ("var", {"foreground": "#88f"}),
    ("regexp", {"foreground": "#ba7"}),
    ("match", {"foreground": "#6a6"}),
]
##/////////////////////////////////////////////////////////////////
##  Config Parameters
##/////////////////////////////////////////////////////////////////

#: If the user has not pressed any key for this amount of time (in
#: seconds), and the current grammar has not been evaluated, then
#: the eval demon will evaluate it.
_EVAL_DELAY = 1

#: The number of sentences that should be evaluated by the eval
#: demon each time it runs.
_EVAL_CHUNK = 15

#: The frequency (in seconds) at which the eval demon is run.
_EVAL_FREQ = 0.2

#: The minimum amount of time that the eval demon should take each time
#: it runs -- if it takes less than this time, _EVAL_CHUNK will be
#: modified upwards.
_EVAL_DEMON_MIN = 0.02

#: The maximum amount of time that the eval demon should take each time
#: it runs -- if it takes more than this time, _EVAL_CHUNK will be
#: modified downwards.
_EVAL_DEMON_MAX = 0.04

#: Keyword options for the grammar editing Text widget.
_GRAMMARBOX_PARAMS = {
    "width": 40,
    "height": 12,
    "background": "#efe",
    "highlightbackground": "#efe",
    "highlightthickness": 1,
    "relief": "groove",
    "border": 2,
    "wrap": "word",
}

#: Keyword options for the help Text widget.
_HELPBOX_PARAMS = {
    "width": 15,
    "height": 15,
    "background": "#efe",
    "highlightbackground": "#efe",
    "foreground": "#555",
    "highlightthickness": 1,
    "relief": "groove",
    "border": 2,
    "wrap": "word",
}

#: Keyword options for the development-set Text widget.
_DEVSETBOX_PARAMS = {
    "width": 70,
    "height": 10,
    "background": "#eef",
    "highlightbackground": "#eef",
    "highlightthickness": 1,
    "relief": "groove",
    "border": 2,
    "wrap": "word",
    "tabs": (30,),
}

#: Keyword options for the status Label.
_STATUS_PARAMS = {"background": "#9bb", "relief": "groove", "border": 2}

_FONT_PARAMS = {"family": "helvetica", "size": -20}

#: Keyword options for the outer Frame that holds all the widgets.
_FRAME_PARAMS = {"background": "#777", "padx": 2, "pady": 2, "border": 3}

#: Keyword options for the evaluation Canvas.
_EVALBOX_PARAMS = {
    "background": "#eef",
    "highlightbackground": "#eef",
    "highlightthickness": 1,
    "relief": "groove",
    "border": 2,
    "width": 300,
    "height": 280,
}

#: Colour options shared by the buttons and checkbuttons.
_BUTTON_PARAMS = {
    "background": "#777",
    "activebackground": "#777",
    "highlightbackground": "#777",
}

# Help-tab appearance.
_HELPTAB_BG_COLOR = "#aba"
_HELPTAB_FG_COLOR = "#efe"

_HELPTAB_FG_PARAMS = {"background": "#efe"}
_HELPTAB_BG_PARAMS = {"background": "#aba"}
_HELPTAB_SPACER = 6
def normalize_grammar(self, grammar):
    """
    Return a canonical form of a grammar string, so that two grammars
    that differ only in comments or whitespace compare equal.

    The following steps are applied, in order:
      - comments (an unescaped ``#`` up to the end of its line) are
        removed;
      - runs of spaces are collapsed to a single space;
      - whitespace following a newline is removed, which drops blank
        lines and leading indentation;
      - leading and trailing whitespace is stripped;
      - every ``$`` that directly follows a non-backslash character is
        backslash-escaped.

    :param grammar: The grammar text to normalize.
    :return: The normalized grammar text.
    """
    # Strip comments.  ``\\.`` is tried first so that an escaped "\#"
    # is kept as part of the rule instead of starting a comment.
    grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
    # Normalize whitespace.  Raw strings keep ``\s`` from being read as
    # an (invalid) string escape sequence.
    grammar = re.sub(r" +", " ", grammar)
    grammar = re.sub(r"\n\s+", "\n", grammar)
    grammar = grammar.strip()
    # [xx] Hack: automatically backslash $!
    # The pattern needs one preceding character, so a "$" at the very
    # start of the string is left unchanged.
    grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
    return grammar
def __init__(
    self,
    devset_name="conll2000",
    devset=None,
    grammar="",
    chunk_label="NP",
    tagset=None,
):
    """
    Build the application window and show the first development
    sentence.

    :param devset_name: The name of the development set; used for
        display & for save files.  If either the name 'treebank'
        or the name 'conll2000' is used, and devset is None, then
        devset will be set automatically.
    :param devset: A list of chunked sentences
    :param grammar: The initial grammar to display.
    :param chunk_label: The chunk label handed to ``ChunkScore`` when
        scoring the grammar (stored as ``self._chunk_label``).
    :param tagset: Dictionary from tags to string descriptions, used
        for the help page.  Defaults to ``self.TAGSET``.
    :raise ValueError: If ``devset`` is None and ``devset_name`` is
        not one of the known development sets.
    """
    self._chunk_label = chunk_label

    if tagset is None:
        tagset = self.TAGSET
    self.tagset = tagset

    # Named development sets:
    if devset is None:
        if devset_name == "conll2000":
            devset = conll2000.chunked_sents("train.txt")  # [:100]
        # The name must be tested here, not ``devset``: ``devset`` is
        # always None inside this branch, so comparing it against
        # "treebank" could never succeed.
        elif devset_name == "treebank":
            devset = treebank_chunk.chunked_sents()  # [:100]
        else:
            raise ValueError("Unknown development set %s" % devset_name)

    self.chunker = None
    """The chunker built from the grammar string"""

    self.grammar = grammar
    """The unparsed grammar string"""

    self.normalized_grammar = None
    """A normalized version of ``self.grammar``."""

    self.grammar_changed = 0
    """The last time() that the grammar was changed."""

    self.devset = devset
    """The development set -- a list of chunked sentences."""

    self.devset_name = devset_name
    """The name of the development set (for save files)."""

    self.devset_index = -1
    """The index into the development set of the first instance
       that's currently being viewed."""

    self._last_keypress = 0
    """The time() when a key was most recently pressed"""

    self._history = []
    """A list of (grammar, precision, recall, fscore) tuples for
       grammars that the user has already tried."""

    self._history_index = 0
    """When the user is scrolling through previous grammars, this
       is used to keep track of which grammar they're looking at."""

    self._eval_grammar = None
    """The grammar that is being currently evaluated by the eval
       demon."""

    self._eval_normalized_grammar = None
    """A normalized copy of ``_eval_grammar``."""

    self._eval_index = 0
    """The index of the next sentence in the development set that
       should be looked at by the eval demon."""

    self._eval_score = ChunkScore(chunk_label=chunk_label)
    """The ``ChunkScore`` object that's used to keep track of the score
    of the current grammar on the development set."""

    # Set up the main window.
    top = self.top = Tk()
    top.geometry("+50+50")
    top.title("Regexp Chunk Parser App")
    top.bind("<Control-q>", self.destroy)

    # Variable that restricts how much of the devset we look at.
    self._devset_size = IntVar(top)
    self._devset_size.set(100)

    # Set up all the tkinter widgets
    self._init_fonts(top)
    self._init_widgets(top)
    self._init_bindings(top)
    self._init_menubar(top)
    self.grammarbox.focus()

    # If a grammar was given, then display it.
    if grammar:
        self.grammarbox.insert("end", grammar + "\n")
        self.grammarbox.mark_set("insert", "1.0")

    # Display the first item in the development set
    self.show_devset(0)
    self.update()
def _init_bindings(self, top):
    """
    Register the keyboard shortcuts on the main window and the grammar
    box, and hook the evaluation canvas up to resize events.

    :param top: The top-level window that receives the global bindings.
    """
    window_bindings = (
        ("<Control-n>", self._devset_next),
        ("<Control-p>", self._devset_prev),
        ("<Control-t>", self.toggle_show_trace),
        ("<KeyPress>", self.update),
        ("<Control-s>", lambda e: self.save_grammar()),
        ("<Control-o>", lambda e: self.load_grammar()),
    )
    for sequence, handler in window_bindings:
        top.bind(sequence, handler)

    # The same navigation/trace shortcuts are also bound directly on the
    # grammar text box.
    grammarbox_bindings = (
        ("<Control-t>", self.toggle_show_trace),
        ("<Control-n>", self._devset_next),
        ("<Control-p>", self._devset_prev),
    )
    for sequence, handler in grammarbox_bindings:
        self.grammarbox.bind(sequence, handler)

    # Redraw the eval graph when the window size changes
    self.evalbox.bind("<Configure>", self._eval_plot)
def _init_fonts(self, top):
    """
    Create the font-size variable and the two helvetica fonts (regular
    and small) used by the widgets.

    :param top: The window that owns the size variable.
    """
    # What's our font size?  Starts at 20.
    size_var = IntVar(top)
    size_var.set(20)
    self._size = size_var

    # The size is passed to Font negated.
    self._font = Font(family="helvetica", size=-size_var.get())

    # The small font is 14/20 of the regular size, rounded down.
    small_size = int(size_var.get() * 14 // 20)
    self._smallfont = Font(family="helvetica", size=-small_size)
def _init_menubar(self, parent):
    """
    Build the File / View / Development-Set / Help menus and install
    them as the menu bar of ``parent``.

    :param parent: The window that receives the menu bar.
    """
    menubar = Menu(parent)

    # --- File menu -------------------------------------------------
    file_entries = (
        dict(label="Reset Application", underline=0, command=self.reset),
        dict(
            label="Save Current Grammar",
            underline=0,
            accelerator="Ctrl-s",
            command=self.save_grammar,
        ),
        dict(
            label="Load Grammar",
            underline=0,
            accelerator="Ctrl-o",
            command=self.load_grammar,
        ),
        dict(label="Save Grammar History", underline=13, command=self.save_history),
        dict(label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"),
    )
    filemenu = Menu(menubar, tearoff=0)
    for options in file_entries:
        filemenu.add_command(**options)
    menubar.add_cascade(label="File", underline=0, menu=filemenu)

    # --- View menu: one radio button per font size -----------------
    font_sizes = (
        ("Tiny", 10),
        ("Small", 16),
        ("Medium", 20),
        ("Large", 24),
        ("Huge", 34),
    )
    viewmenu = Menu(menubar, tearoff=0)
    for label, size in font_sizes:
        viewmenu.add_radiobutton(
            label=label,
            variable=self._size,
            underline=0,
            value=size,
            command=self.resize,
        )
    menubar.add_cascade(label="View", underline=0, menu=viewmenu)

    # --- Development-Set menu: how many sentences to evaluate ------
    devsetmenu = Menu(menubar, tearoff=0)
    for count in (50, 100, 200, 500):
        devsetmenu.add_radiobutton(
            label="%d sentences" % count,
            variable=self._devset_size,
            value=count,
            command=self.set_devset_size,
        )
    menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)

    # --- Help menu ---------------------------------------------------
    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label="About", underline=0, command=self.about)
    menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

    parent.config(menu=menubar)
def toggle_show_trace(self, *e):
    """
    Switch the development-set box between the plain example view and
    the trace view.

    Any positional arguments (such as the Tk event when this is used
    as a key binding) are ignored.

    :return: The string ``"break"``.
    """
    # When the trace is showing, go back to the example; otherwise
    # show the trace.
    switch_view = self.show_devset if self._showing_trace else self.show_trace
    switch_view()
    return "break"
# Number of most recent history entries that `_eval_plot` uses to pick
# the axis ranges when the "Zoom" (autoscale) option is on.
_SCALE_N = 5  # center on the last 5 examples.
# NOTE(review): not referenced anywhere in the visible part of the class;
# `_eval_plot` reads the `_eval_lines` variable instead -- confirm whether
# this flag is still used.
_DRAW_LINES = False
def _eval_plot(self, *e, **config):
    """
    Redraw the precision/recall plot in the evaluation canvas.

    Every entry of ``self._history`` is drawn as a dot (recall on the
    x axis, precision on the y axis).  The entry at
    ``self._history_index`` is highlighted and its scores are written
    to the status label.  When the "Zoom" option is on and there is
    more than one history entry, the axes are scaled to the last
    ``_SCALE_N`` entries (with a 0.01 margin, clamped to [0, 1]);
    otherwise both axes run from 0 to 1.

    :param e: Ignored; lets the method be used as an event handler.
    :param config: May supply ``width`` and/or ``height`` to use
        instead of the canvas's current size.
    """
    canvas = self.evalbox
    width = config.get("width", canvas.winfo_width())
    height = config.get("height", canvas.winfo_height())

    # Clear the canvas
    canvas.delete("all")

    # Draw the precision & recall labels.  Their bounding boxes decide
    # where the plot area starts and ends.
    tag = canvas.create_text(
        10, height // 2 - 10, justify="left", anchor="w", text="Precision"
    )
    left, right = canvas.bbox(tag)[2] + 5, width - 10
    tag = canvas.create_text(
        left + (width - left) // 2,
        height - 10,
        anchor="s",
        text="Recall",
        justify="center",
    )
    top, bot = 10, canvas.bbox(tag)[1] - 10

    # Draw masks for clipping the plot.
    bg = self._EVALBOX_PARAMS["background"]
    canvas.lower(canvas.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg))
    canvas.lower(
        canvas.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
    )

    # Calculate the plot's scale.
    if self._autoscale.get() and len(self._history) > 1:
        # Slicing takes the last _SCALE_N entries, or all of them when
        # the history is shorter.  (The previous index loop skipped the
        # oldest entry whenever the history had _SCALE_N entries or fewer.)
        recent = self._history[-self._SCALE_N :]
        precisions = [precision for (_, precision, _, _) in recent]
        recalls = [recall for (_, _, recall, _) in recent]
        min_precision = max(min(precisions) - 0.01, 0)
        min_recall = max(min(recalls) - 0.01, 0)
        max_precision = min(max(precisions) + 0.01, 1)
        max_recall = min(max(recalls) + 0.01, 1)
    else:
        min_precision = min_recall = 0
        max_precision = max_recall = 1

    def to_x(recall):
        # Map a recall value onto the canvas's x axis.
        return left + (right - left) * (
            (recall - min_recall) / (max_recall - min_recall)
        )

    def to_y(precision):
        # Map a precision value onto the canvas's y axis (y grows downwards).
        return bot - (bot - top) * (
            (precision - min_precision) / (max_precision - min_precision)
        )

    # Draw the axis lines & grid lines (one grid line per 10%, drawn
    # only where it falls strictly inside the plot area).
    for i in range(11):
        x = to_x(i / 10.0)
        y = to_y(i / 10.0)
        if left < x < right:
            canvas.create_line(x, top, x, bot, fill="#888")
        if top < y < bot:
            canvas.create_line(left, y, right, y, fill="#888")
    canvas.create_line(left, top, left, bot)
    canvas.create_line(left, bot, right, bot)

    # Display the plot's scale
    canvas.create_text(
        left - 3,
        bot,
        justify="right",
        anchor="se",
        text="%d%%" % (100 * min_precision),
    )
    canvas.create_text(
        left - 3,
        top,
        justify="right",
        anchor="ne",
        text="%d%%" % (100 * max_precision),
    )
    canvas.create_text(
        left,
        bot + 3,
        justify="center",
        anchor="nw",
        text="%d%%" % (100 * min_recall),
    )
    canvas.create_text(
        right,
        bot + 3,
        justify="center",
        anchor="ne",
        text="%d%%" % (100 * max_recall),
    )

    # Display the scores.
    prev_x = prev_y = None
    for i, (_, precision, recall, fscore) in enumerate(self._history):
        x = to_x(recall)
        y = to_y(precision)
        if i == self._history_index:
            # The currently selected grammar: bright dot on top, and its
            # scores go to the status bar.
            canvas.create_oval(
                x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
            )
            self.status["text"] = (
                "Precision: %.2f%%\t" % (precision * 100)
                + "Recall: %.2f%%\t" % (recall * 100)
                + "F-score: %.2f%%" % (fscore * 100)
            )
        else:
            canvas.lower(
                canvas.create_oval(
                    x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
                )
            )
        # Optionally connect consecutive history entries.
        if prev_x is not None and self._eval_lines.get():
            canvas.lower(canvas.create_line(prev_x, prev_y, x, y, fill="#8c8"))
        prev_x, prev_y = x, y
# Class-level default for the flag that `_eval_demon` sets to True when it
# reschedules itself and to False when it stops.
_eval_demon_running = False
def _eval_demon(self):
    """
    Score one batch of development-set sentences with the current
    grammar, then either reschedule itself or record the final score.

    Each run evaluates at most ``_EVAL_CHUNK`` sentences.  When the
    whole development set (limited by ``_devset_size``) has been
    scored, a ``(grammar, precision, recall, f-measure)`` entry is
    appended to ``self._history`` and the plot is redrawn.  If the
    grammar was already scored earlier, its old entry is re-appended
    instead of evaluating it again.
    """
    if self.top is None:
        return
    if self.chunker is None:
        self._eval_demon_running = False
        return

    # Note our starting time.
    started = time.time()
    delay_ms = int(self._EVAL_FREQ * 1000)
    grammar_changed = self.normalized_grammar != self._eval_normalized_grammar

    # If they are still typing, then wait for them to finish.
    if grammar_changed and time.time() - self._last_keypress < self._EVAL_DELAY:
        self._eval_demon_running = True
        return self.top.after(delay_ms, self._eval_demon)

    # If the grammar changed, restart the evaluation.
    if grammar_changed:
        # Check if we've seen this grammar already.  If so, then
        # just use the old evaluation values.
        seen = next(
            (
                entry
                for entry in self._history
                if self.normalized_grammar == self.normalize_grammar(entry[0])
            ),
            None,
        )
        if seen is not None:
            g, p, r, f = seen
            self._history.append((g, p, r, f))
            self._history_index = len(self._history) - 1
            self._eval_plot()
            self._eval_demon_running = False
            self._eval_normalized_grammar = None
            return
        self._eval_index = 0
        self._eval_score = ChunkScore(chunk_label=self._chunk_label)
        self._eval_grammar = self.grammar
        self._eval_normalized_grammar = self.normalized_grammar

    # If the grammar is empty, then don't bother evaluating it, or
    # recording it in history -- the score will just be 0.
    if not self.normalized_grammar.strip():
        self._eval_demon_running = False
        return

    # Score the next set of examples
    batch_end = min(self._eval_index + self._EVAL_CHUNK, self._devset_size.get())
    for gold in self.devset[self._eval_index : batch_end]:
        guess = self._chunkparse(gold.leaves())
        self._eval_score.score(gold, guess)

    # Update our index in the devset.
    self._eval_index += self._EVAL_CHUNK

    devset_size = self._devset_size.get()
    if self._eval_index < devset_size:
        # Not done yet: report progress, tune the batch size, and
        # schedule the next run.
        progress = 100 * self._eval_index / devset_size
        self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
        self._eval_demon_running = True
        self._adaptively_modify_eval_chunk(time.time() - started)
        self.top.after(delay_ms, self._eval_demon)
        return

    # Done: record the final scores and redraw the plot.
    score = self._eval_score
    self._history.append(
        (
            self._eval_grammar,
            score.precision(),
            score.recall(),
            score.f_measure(),
        )
    )
    self._history_index = len(self._history) - 1
    self._eval_plot()
    self._eval_demon_running = False
    self._eval_normalized_grammar = None
- def _adaptively_modify_eval_chunk(self, t):
- """
- Modify _EVAL_CHUNK to try to keep the amount of time that the
- eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.
- :param t: The amount of time that the eval demon took.
- """
- if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
- self._EVAL_CHUNK = min(
- self._EVAL_CHUNK - 1,
- max(
- int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)),
- self._EVAL_CHUNK - 10,
- ),
- )
- elif t < self._EVAL_DEMON_MIN:
- self._EVAL_CHUNK = max(
- self._EVAL_CHUNK + 1,
- min(
- int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)),
- self._EVAL_CHUNK + 10,
- ),
- )
- def _init_widgets(self, top):
- frame0 = Frame(top, **self._FRAME_PARAMS)
- frame0.grid_columnconfigure(0, weight=4)
- frame0.grid_columnconfigure(3, weight=2)
- frame0.grid_rowconfigure(1, weight=1)
- frame0.grid_rowconfigure(5, weight=1)
- # The grammar
- self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS)
- self.grammarlabel = Label(
- frame0,
- font=self._font,
- text="Grammar:",
- highlightcolor="black",
- background=self._GRAMMARBOX_PARAMS["background"],
- )
- self.grammarlabel.grid(column=0, row=0, sticky="SW")
- self.grammarbox.grid(column=0, row=1, sticky="NEWS")
- # Scroll bar for grammar
- grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
- grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
- self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
- # grammar buttons
- bg = self._FRAME_PARAMS["background"]
- frame3 = Frame(frame0, background=bg)
- frame3.grid(column=0, row=2, sticky="EW")
- Button(
- frame3,
- text="Prev Grammar",
- command=self._history_prev,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- Button(
- frame3,
- text="Next Grammar",
- command=self._history_next,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- # Help box
- self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
- self.helpbox.grid(column=3, row=1, sticky="NEWS")
- self.helptabs = {}
- bg = self._FRAME_PARAMS["background"]
- helptab_frame = Frame(frame0, background=bg)
- helptab_frame.grid(column=3, row=0, sticky="SW")
- for i, (tab, tabstops, text) in enumerate(self.HELP):
- label = Label(helptab_frame, text=tab, font=self._smallfont)
- label.grid(column=i * 2, row=0, sticky="S")
- # help_frame.grid_columnconfigure(i, weight=1)
- # label.pack(side='left')
- label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
- self.helptabs[tab] = label
- Frame(
- helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
- ).grid(column=i * 2 + 1, row=0)
- self.helptabs[self.HELP[0][0]].configure(font=self._font)
- self.helpbox.tag_config("elide", elide=True)
- for (tag, params) in self.HELP_AUTOTAG:
- self.helpbox.tag_config("tag-%s" % tag, **params)
- self.show_help(self.HELP[0][0])
- # Scroll bar for helpbox
- help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
- self.helpbox.config(yscrollcommand=help_scrollbar.set)
- help_scrollbar.grid(column=4, row=1, sticky="NWS")
- # The dev set
- frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
- self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
- self.devsetbox.pack(expand=True, fill="both")
- self.devsetlabel = Label(
- frame0,
- font=self._font,
- text="Development Set:",
- justify="right",
- background=self._DEVSETBOX_PARAMS["background"],
- )
- self.devsetlabel.grid(column=0, row=4, sticky="SW")
- frame4.grid(column=0, row=5, sticky="NEWS")
- # dev set scrollbars
- self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
- self.devset_scroll.grid(column=1, row=5, sticky="NWS")
- self.devset_xscroll = Scrollbar(
- frame4, command=self.devsetbox.xview, orient="horiz"
- )
- self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
- self.devset_xscroll.pack(side="bottom", fill="x")
- # dev set buttons
- bg = self._FRAME_PARAMS["background"]
- frame1 = Frame(frame0, background=bg)
- frame1.grid(column=0, row=7, sticky="EW")
- Button(
- frame1,
- text="Prev Example (Ctrl-p)",
- command=self._devset_prev,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- Button(
- frame1,
- text="Next Example (Ctrl-n)",
- command=self._devset_next,
- **self._BUTTON_PARAMS
- ).pack(side="left")
- self.devset_button = Button(
- frame1,
- text="Show example",
- command=self.show_devset,
- state="disabled",
- **self._BUTTON_PARAMS
- )
- self.devset_button.pack(side="right")
- self.trace_button = Button(
- frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
- )
- self.trace_button.pack(side="right")
- # evaluation box
- self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
- label = Label(
- frame0,
- font=self._font,
- text="Evaluation:",
- justify="right",
- background=self._EVALBOX_PARAMS["background"],
- )
- label.grid(column=3, row=4, sticky="SW")
- self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)
- # evaluation box buttons
- bg = self._FRAME_PARAMS["background"]
- frame2 = Frame(frame0, background=bg)
- frame2.grid(column=3, row=7, sticky="EW")
- self._autoscale = IntVar(self.top)
- self._autoscale.set(False)
- Checkbutton(
- frame2,
- variable=self._autoscale,
- command=self._eval_plot,
- text="Zoom",
- **self._BUTTON_PARAMS
- ).pack(side="left")
- self._eval_lines = IntVar(self.top)
- self._eval_lines.set(False)
- Checkbutton(
- frame2,
- variable=self._eval_lines,
- command=self._eval_plot,
- text="Lines",
- **self._BUTTON_PARAMS
- ).pack(side="left")
- Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")
- # The status label
- self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
- self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)
- # Help box & devset box can't be edited.
- self.helpbox["state"] = "disabled"
- self.devsetbox["state"] = "disabled"
- # Spacers
- bg = self._FRAME_PARAMS["background"]
- Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
- Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
- Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
- # pack the frame.
- frame0.pack(fill="both", expand=True)
- # Set up colors for the devset box
- self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
- self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
- self.devsetbox.tag_config("false-pos", background="#faa")
- self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
- self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
- self.devsetbox.tag_config("error", foreground="#800")
- # And for the grammarbox
- self.grammarbox.tag_config("error", background="#fec")
- self.grammarbox.tag_config("comment", foreground="#840")
- self.grammarbox.tag_config("angle", foreground="#00f")
- self.grammarbox.tag_config("brace", foreground="#0a0")
- self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
# Class-level default for the flag that records whether the devset box is
# currently showing a rule-by-rule trace (set by show_trace, cleared by
# show_devset).
_showing_trace = False

def show_trace(self, *e):
    """
    Replace the contents of the devset box with a step-by-step trace
    of the current grammar on the current development sentence.

    The trace has one section for the initial state ("Start:") and one
    for each rule of ``self.chunker``.  Section ``i`` shows the tag
    sequence of the sentence, coloured by comparing the gold chunks
    with the chunks found by a parser built from only the first ``i``
    rules.

    If there is no valid chunker, a placeholder message is shown
    instead.

    :param e: Ignored; lets the method be used as an event callback.
    """
    self._showing_trace = True
    self.trace_button["state"] = "disabled"
    self.devset_button["state"] = "normal"

    self.devsetbox["state"] = "normal"
    self.devsetbox.delete("1.0", "end")
    self.devsetlabel["text"] = "Development Set (%d/%d)" % (
        self.devset_index + 1,
        self._devset_size.get(),
    )

    if self.chunker is None:
        self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
        self.devsetbox.tag_add("error", "1.0", "end")
        # The devset box is meant to be read-only; lock it again.
        self.devsetbox["state"] = "disabled"
        return  # can't do anything more

    gold_tree = self.devset[self.devset_index]
    rules = self.chunker.rules()
    # The gold chunks do not depend on the rules, so compute them once.
    gold_chunks = self._chunks(gold_tree)

    # Calculate the tag sequence, remembering the column at which each
    # tag starts (and the column just past the last tag).
    tagseq = "\t"
    charnum = [1]
    for _word, pos in gold_tree.leaves():
        tagseq += "%s " % pos
        charnum.append(len(tagseq))

    # Every trace section shows the same tag sequence, so the column
    # table is identical for each section; only the line differs.
    num_sections = len(rules) + 1
    self.charnum = {
        (i, j): column
        for i in range(num_sections)
        for j, column in enumerate(charnum)
    }
    self.linenum = {i: i * 2 + 2 for i in range(num_sections)}

    for i in range(num_sections):
        if i == 0:
            heading = "Start:\n"
        else:
            heading = "Apply %s:\n" % rules[i - 1]
        self.devsetbox.insert("end", heading)
        self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")

        # Display the tag sequence.
        self.devsetbox.insert("end", tagseq + "\n")
        self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")

        # Run a parser made of only the first i rules.  (Parsing with
        # self.chunker here would show the final result in every
        # section, which defeats the purpose of the trace.)
        partial_chunker = RegexpChunkParser(rules[:i])
        leaves = gold_tree.leaves()
        try:
            test_tree = partial_chunker.parse(leaves)
        except (ValueError, IndexError):
            # Same recovery as _chunkparse: flag the whole grammar and
            # treat the sentence as containing no chunks.
            self.grammarbox.tag_add("error", "1.0", "end")
            test_tree = leaves
        test_chunks = self._chunks(test_tree)

        # Compare them.
        for chunk in gold_chunks.intersection(test_chunks):
            self._color_chunk(i, chunk, "true-pos")
        for chunk in gold_chunks - test_chunks:
            self._color_chunk(i, chunk, "false-neg")
        for chunk in test_chunks - gold_chunks:
            self._color_chunk(i, chunk, "false-pos")

    self.devsetbox.insert("end", "Finished.\n")
    self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")

    # The devset box is meant to be read-only; lock it again now that
    # the trace text has been written.
    self.devsetbox["state"] = "disabled"

    # This is a hack, because the x-scrollbar isn't updating its
    # position right -- I'm not sure what the underlying cause is
    # though.  (This is on OS X w/ python 2.5)
    self.top.after(100, self.devset_xscroll.set, 0, 0.3)
def show_help(self, tab):
    """
    Show the help page named ``tab`` in the help box.

    The entries of ``self.HELP`` are ``(name, tabstops, text)`` triples.
    The tab label of the matching entry is configured with
    ``self._HELPTAB_FG_PARAMS``; every other label is configured with
    ``self._HELPTAB_BG_PARAMS``.  In the matching text, the placeholder
    ``<<TAGSET>>`` is replaced by a tab-separated listing of
    ``self.tagset``, and each ``<tag>...</tag>`` pair named in
    ``self.HELP_AUTOTAG`` is hidden while its contents receive the text
    tag ``tag-<tag>``.

    :param tab: Name of the help page to display.
    """

    def tagset_order(item):
        # Tags that begin with a word character sort ahead of the
        # punctuation/symbol tags; ties are broken alphabetically.
        tag = item[0]
        return (0, tag) if re.match(r"\w+", tag) else (1, tag)

    self.helpbox["state"] = "normal"
    self.helpbox.delete("1.0", "end")
    for name, tabstops, text in self.HELP:
        if name != tab:
            self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
            continue

        tagset_listing = "\n".join(
            "\t%s\t%s" % item
            for item in sorted(self.tagset.items(), key=tagset_order)
        )
        text = text.replace("<<TAGSET>>", tagset_listing)

        self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
        self.helpbox.config(tabs=tabstops)
        # The trailing newlines pad the page with blank lines.
        self.helpbox.insert("1.0", text + "\n" * 20)

        # Offsets found in the string are turned into Text indices
        # counted in characters from the start of the widget.
        index_fmt = "1.0 + %d chars"
        for tag, _params in self.HELP_AUTOTAG:
            pattern = "(?s)(<%s>)(.*?)(</%s>)" % (tag, tag)
            for m in re.finditer(pattern, text):
                self.helpbox.tag_add(
                    "elide", index_fmt % m.start(1), index_fmt % m.end(1)
                )
                self.helpbox.tag_add(
                    "tag-%s" % tag, index_fmt % m.start(2), index_fmt % m.end(2)
                )
                self.helpbox.tag_add(
                    "elide", index_fmt % m.start(3), index_fmt % m.end(3)
                )
    self.helpbox["state"] = "disabled"
- def _history_prev(self, *e):
- self._view_history(self._history_index - 1)
- return "break"
- def _history_next(self, *e):
- self._view_history(self._history_index + 1)
- return "break"
def _view_history(self, index):
    """
    Load the history entry at ``index`` into the grammar box and
    refresh everything that depends on the grammar.

    ``index`` is clamped to the valid range of ``self._history``.
    Nothing happens when the history is empty or when the requested
    entry is already the one being viewed.

    :param index: Position in ``self._history`` to display.
    """
    # Nothing to view without a history.
    if not self._history:
        return

    # Clamp the request into [0, len(history) - 1].
    index = min(max(index, 0), len(self._history) - 1)

    # Already viewing the requested history item?
    if index == self._history_index:
        return

    # Show the requested grammar.  It will get added to _history
    # only if they edit it (causing self.update() to get run.)
    grammar_text = self._history[index][0]
    box = self.grammarbox
    box["state"] = "normal"
    box.delete("1.0", "end")
    box.insert("end", grammar_text)
    box.mark_set("insert", "1.0")
    self._history_index = index
    self._syntax_highlight_grammar(grammar_text)

    # Record the normalized grammar & regenerate the chunker.
    self.normalized_grammar = self.normalize_grammar(grammar_text)
    rules = []
    if self.normalized_grammar:
        for line in self.normalized_grammar.split("\n"):
            rules.append(RegexpChunkRule.fromstring(line))
    self.chunker = RegexpChunkParser(rules)

    # Show the score.
    self._eval_plot()

    # Update the devset box
    self._highlight_devset()
    if self._showing_trace:
        self.show_trace()

    # Update the grammar label: number it unless it is the newest entry.
    total = len(self._history)
    if self._history_index < total - 1:
        label = "Grammar %s/%s:" % (self._history_index + 1, total)
    else:
        label = "Grammar:"
    self.grammarlabel["text"] = label
- def _devset_next(self, *e):
- self._devset_scroll("scroll", 1, "page")
- return "break"
- def _devset_prev(self, *e):
- self._devset_scroll("scroll", -1, "page")
- return "break"
def destroy(self, *e):
    """
    Destroy the top-level window, if there still is one.

    After the window has been destroyed ``self.top`` is set to None,
    so calling this method again does nothing.

    :param e: Ignored; lets the method be used as an event callback.
    """
    window = self.top
    if window is not None:
        window.destroy()
        self.top = None
- def _devset_scroll(self, command, *args):
- N = 1 # size of a page -- one sentence.
- showing_trace = self._showing_trace
- if command == "scroll" and args[1].startswith("unit"):
- self.show_devset(self.devset_index + int(args[0]))
- elif command == "scroll" and args[1].startswith("page"):
- self.show_devset(self.devset_index + N * int(args[0]))
- elif command == "moveto":
- self.show_devset(int(float(args[0]) * self._devset_size.get()))
- else:
- assert 0, "bad scroll command %s %s" % (command, args)
- if showing_trace:
- self.show_trace()
def show_devset(self, index=None):
    """
    Display one development-set sentence in the devset box.

    The sentence is written as space-separated ``word/tag`` tokens.
    ``self.charnum`` and ``self.linenum`` are rebuilt so that chunks
    can later be located in the text, the chunks are highlighted when
    a chunker is available, and the vertical scrollbar is updated.

    Nothing happens when the requested sentence is already displayed
    and no trace is being shown.

    :param index: Index of the sentence to show; it is clamped to the
        current development-set size.  None means the current index.
    """
    if index is None:
        index = self.devset_index

    # Bounds checking
    size = self._devset_size.get()
    index = min(max(0, index), size - 1)

    already_visible = index == self.devset_index and not self._showing_trace
    if already_visible:
        return

    # Switch to the plain example view.
    self.devset_index = index
    self._showing_trace = False
    self.trace_button["state"] = "normal"
    self.devset_button["state"] = "disabled"

    # Clear the text box.
    box = self.devsetbox
    box["state"] = "normal"
    box["wrap"] = "word"
    box.delete("1.0", "end")
    self.devsetlabel["text"] = "Development Set (%d/%d)" % (index + 1, size)

    # Add the sentence, recording the column at which each word starts
    # (plus the column just past the final word).
    self.charnum = {}
    self.linenum = {0: 1}
    sample = self.devset[index : index + 1]
    for sentnum, sent in enumerate(sample):
        tokens = []
        column = 0
        for wordnum, (word, pos) in enumerate(sent.leaves()):
            token = "%s/%s " % (word, pos)
            self.charnum[sentnum, wordnum] = column
            column += len(token)
            self.charnum[sentnum, wordnum + 1] = column
            tokens.append(token)
        # Drop the trailing space of the last token.
        box.insert("end", "".join(tokens)[:-1] + "\n\n")

    # Highlight chunks in the dev set
    if self.chunker is not None:
        self._highlight_devset()
    box["state"] = "disabled"

    # Update the scrollbar
    self.devset_scroll.set(index / size, (index + 2) / size)
def _chunks(self, tree):
    """
    Return the set of chunk spans found directly under ``tree``.

    Each span is a ``(start, end)`` pair of word offsets.  A child
    that is a ``Tree`` advances the offset by its length and is
    recorded when its label equals ``self._chunk_label``; any other
    child advances the offset by one.

    :param tree: The structure whose children are scanned.
    :return: A set of ``(start, end)`` tuples.
    """
    spans = set()
    position = 0
    for child in tree:
        if not isinstance(child, Tree):
            position += 1
            continue
        width = len(child)
        if child.label() == self._chunk_label:
            spans.add((position, position + width))
        position += width
    return spans
- def _syntax_highlight_grammar(self, grammar):
- if self.top is None:
- return
- self.grammarbox.tag_remove("comment", "1.0", "end")
- self.grammarbox.tag_remove("angle", "1.0", "end")
- self.grammarbox.tag_remove("brace", "1.0", "end")
- self.grammarbox.tag_add("hangindent", "1.0", "end")
- for lineno, line in enumerate(grammar.split("\n")):
- if not line.strip():
- continue
- m = re.match(r"(\\.|[^#])*(#.*)?", line)
- comment_start = None
- if m.group(2):
- comment_start = m.start(2)
- s = "%d.%d" % (lineno + 1, m.start(2))
- e = "%d.%d" % (lineno + 1, m.end(2))
- self.grammarbox.tag_add("comment", s, e)
- for m in re.finditer("[<>{}]", line):
- if comment_start is not None and m.start() >= comment_start:
- break
- s = "%d.%d" % (lineno + 1, m.start())
- e = "%d.%d" % (lineno + 1, m.end())
- if m.group() in "<>":
- self.grammarbox.tag_add("angle", s, e)
- else:
- self.grammarbox.tag_add("brace", s, e)
def _grammarcheck(self, grammar):
    """
    Tag every grammar line that fails to parse as a rule.

    Existing ``error`` tags are removed first.  Each line is stripped
    of its comment and surrounding whitespace; if what remains is
    rejected by ``RegexpChunkRule.fromstring`` with a ValueError, the
    whole line is tagged ``error`` in the grammar box.  The status
    label is cleared at the end.

    Does nothing once the window has been destroyed.

    :param grammar: The un-normalized grammar text, so that line
        numbers agree with the grammar box.
    """
    if self.top is None:
        return

    self.grammarbox.tag_remove("error", "1.0", "end")
    self._grammarcheck_errs = []
    # Text widget lines are numbered from 1.
    for row, raw_line in enumerate(grammar.split("\n"), start=1):
        rule_text = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", raw_line).strip()
        if not rule_text:
            continue
        try:
            RegexpChunkRule.fromstring(rule_text)
        except ValueError:
            self.grammarbox.tag_add(
                "error", "%s.0" % row, "%s.0 lineend" % row
            )
    self.status["text"] = ""
def update(self, *event):
    """
    React to a (possible) change of the grammar text.

    The grammar is read from the grammar box and normalized.  If the
    normalized form is unchanged, nothing else happens.  Otherwise the
    grammar is syntax-highlighted and parsed into rules:

    - on a ValueError, the offending lines are flagged through
      ``_grammarcheck`` and ``self.chunker`` is set to None;
    - on success, a new chunker is built, the devset display (trace or
      highlighting) is refreshed and the evaluation demon is started
      if it is not already running.

    :param event: When any argument is given, the time of the call is
        recorded in ``self._last_keypress``.
    """
    # Record when update was called (for grammarcheck)
    if event:
        self._last_keypress = time.time()

    # Read the grammar from the Text box.
    grammar = self.grammarbox.get("1.0", "end")
    self.grammar = grammar

    # If the grammar hasn't changed, do nothing:
    normalized = self.normalize_grammar(grammar)
    if normalized == self.normalized_grammar:
        return
    self.normalized_grammar = normalized

    # If the grammar has changed, and we're looking at history,
    # then stop looking at history.
    looking_at_history = self._history_index < len(self._history) - 1
    if looking_at_history:
        self.grammarlabel["text"] = "Grammar:"

    self._syntax_highlight_grammar(grammar)

    # The grammar has changed; try parsing it.
    # Note: the normalized grammar has no blank lines.
    rules = []
    try:
        if normalized:
            for line in normalized.split("\n"):
                rules.append(RegexpChunkRule.fromstring(line))
    except ValueError:
        # Use the un-normalized grammar for error highlighting.
        self._grammarcheck(grammar)
        self.chunker = None
        return

    self.chunker = RegexpChunkParser(rules)
    self.grammarbox.tag_remove("error", "1.0", "end")
    self.grammar_changed = time.time()

    # Display the results
    if self._showing_trace:
        self.show_trace()
    else:
        self._highlight_devset()

    # Start the eval demon
    if not self._eval_demon_running:
        self._eval_demon()
- def _highlight_devset(self, sample=None):
- if sample is None:
- sample = self.devset[self.devset_index : self.devset_index + 1]
- self.devsetbox.tag_remove("true-pos", "1.0", "end")
- self.devsetbox.tag_remove("false-neg", "1.0", "end")
- self.devsetbox.tag_remove("false-pos", "1.0", "end")
- # Run the grammar on the test cases.
- for sentnum, gold_tree in enumerate(sample):
- # Run the chunk parser
- test_tree = self._chunkparse(gold_tree.leaves())
- # Extract gold & test chunks
- gold_chunks = self._chunks(gold_tree)
- test_chunks = self._chunks(test_tree)
- # Compare them.
- for chunk in gold_chunks.intersection(test_chunks):
- self._color_chunk(sentnum, chunk, "true-pos")
- for chunk in gold_chunks - test_chunks:
- self._color_chunk(sentnum, chunk, "false-neg")
- for chunk in test_chunks - gold_chunks:
- self._color_chunk(sentnum, chunk, "false-pos")
- def _chunkparse(self, words):
- try:
- return self.chunker.parse(words)
- except (ValueError, IndexError) as e:
- # There's an error somewhere in the grammar, but we're not sure
- # exactly where, so just mark the whole grammar as bad.
- # E.g., this is caused by: "({<NN>})"
- self.grammarbox.tag_add("error", "1.0", "end")
- # Treat it as tagging nothing:
- return words
- def _color_chunk(self, sentnum, chunk, tag):
- start, end = chunk
- self.devsetbox.tag_add(
- tag,
- "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, start]),
- "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
- )
def reset(self):
    """
    Discard the current grammar and its history, then refresh the
    display: the grammar box is emptied, the first development
    sentence is shown and ``update`` is run.
    """
    # Clear various variables
    self.chunker = self.grammar = self.normalized_grammar = None
    self.grammar_changed = 0
    self._history, self._history_index = [], 0

    # Update the on-screen display.
    self.grammarbox.delete("1.0", "end")
    self.show_devset(0)
    self.update()
# %-format template for a saved grammar file: a commented header (save date,
# development set name and evaluation scores), a blank line, then the grammar.
# Expects the keys: date, devset, precision, recall, fscore, grammar.
SAVE_GRAMMAR_TEMPLATE = "".join(
    [
        "# Regexp Chunk Parsing Grammar\n",
        "# Saved %(date)s\n",
        "#\n",
        "# Development set: %(devset)s\n",
        "# Precision: %(precision)s\n",
        "# Recall: %(recall)s\n",
        "# F-score: %(fscore)s\n\n",
        "%(grammar)s\n",
    ]
)
def save_grammar(self, filename=None):
    """
    Write the current grammar to a file, preceded by a commented
    header (see ``SAVE_GRAMMAR_TEMPLATE``).

    The scores in the header are taken from the newest history entry
    when it matches the current grammar.  Otherwise they are replaced
    by a note saying that the grammar is not well formed (no chunker)
    or that its evaluation has not finished.

    :param filename: Destination path.  When empty or None, the user
        is asked with a save dialog; cancelling the dialog aborts.
    """
    if not filename:
        ftypes = [("Chunk Grammar", ".chunk"), ("All files", "*")]
        filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
        if not filename:
            return

    if self._history and self.normalized_grammar == self.normalize_grammar(
        self._history[-1][0]
    ):
        # History entries are (grammar, precision, recall, fscore).
        precision, recall, fscore = [
            "%.2f%%" % (100 * v) for v in self._history[-1][1:]
        ]
    elif self.chunker is None:
        precision = recall = fscore = "Grammar not well formed"
    else:
        precision = recall = fscore = "Not finished evaluation yet"

    # Build the text before opening the file, so that a formatting
    # problem cannot leave a truncated, empty file behind.
    contents = self.SAVE_GRAMMAR_TEMPLATE % dict(
        date=time.ctime(),
        devset=self.devset_name,
        precision=precision,
        recall=recall,
        fscore=fscore,
        grammar=self.grammar.strip(),
    )
    with open(filename, "w") as outfile:
        outfile.write(contents)
def load_grammar(self, filename=None):
    """
    Replace the contents of the grammar box with a grammar read from
    a file.

    A header of the kind written by ``save_grammar`` (everything from
    the leading "# Regexp Chunk Parsing Grammar" line through the
    first "F-score:" line) is removed, as is leading whitespace.

    :param filename: Path of the file to read.  When empty or None,
        the user is asked with an open dialog; cancelling the dialog
        aborts.
    :raises OSError: If the file cannot be opened or read.  The
        grammar box is left untouched in that case.
    """
    if not filename:
        ftypes = [("Chunk Grammar", ".chunk"), ("All files", "*")]
        filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
        if not filename:
            return

    # Read the file first, so that an unreadable file does not wipe
    # out the grammar that is currently being edited.
    with open(filename, "r") as infile:
        grammar = infile.read()

    # Strip the saved-grammar header.  The match is non-greedy so it
    # stops at the header's own F-score line even if "F-score:" also
    # appears further down, e.g. in a rule comment.
    grammar = re.sub(
        r"^# Regexp Chunk Parsing Grammar[\s\S]*?F-score:.*\n", "", grammar
    ).lstrip()

    self.grammarbox.delete("1.0", "end")
    self.update()
    self.grammarbox.insert("1.0", grammar)
    self.update()
def save_history(self, filename=None):
    """
    Write the grammar history to a text file.

    The file starts with a commented header, followed by one section
    per history entry giving its position, its scores as percentages
    and its grammar, one indented grammar line per output line.  If
    the current grammar is not the newest history entry, it is
    appended as a final section marked "not well-formed" (no chunker)
    or "not evaluated".

    :param filename: Destination path.  When empty or None, the user
        is asked with a save dialog; cancelling the dialog aborts.
    """
    if not filename:
        ftypes = [("Chunk Grammar History", ".txt"), ("All files", "*")]
        filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
        if not filename:
            return

    def indented(grammar):
        # One output line per grammar line.  (Splitting on arbitrary
        # whitespace would break every rule and comment into separate
        # tokens, each on its own line.)
        return "".join(" %s\n" % line for line in grammar.strip().splitlines())

    with open(filename, "w") as outfile:
        outfile.write("# Regexp Chunk Parsing Grammar History\n")
        outfile.write("# Saved %s\n" % time.ctime())
        outfile.write("# Development set: %s\n" % self.devset_name)

        total = len(self._history)
        for i, (g, p, r, f) in enumerate(self._history):
            hdr = (
                "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
                "fscore=%.2f%%)" % (i + 1, total, p * 100, r * 100, f * 100)
            )
            outfile.write("\n%s\n" % hdr)
            outfile.write(indented(g))

        current_is_newest = bool(
            self._history
            and self.normalized_grammar
            == self.normalize_grammar(self._history[-1][0])
        )
        if not current_is_newest:
            if self.chunker is None:
                outfile.write("\nCurrent Grammar (not well-formed)\n")
            else:
                outfile.write("\nCurrent Grammar (not evaluated)\n")
            outfile.write(indented(self.grammar))
def about(self, *e):
    """
    Show the "About" message for the application.

    A ``tkinter.messagebox`` dialog is tried first; if importing or
    showing it fails, the text is displayed with ``ShowText`` instead.

    :param e: Ignored; lets the method be used as an event callback.
    """
    ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
    TITLE = "About: Regular Expression Chunk Parser Application"
    try:
        from tkinter.messagebox import Message

        Message(message=ABOUT, title=TITLE).show()
    except Exception:
        # Fall back on any ordinary failure, but let KeyboardInterrupt
        # and SystemExit propagate instead of being swallowed.
        ShowText(self.top, TITLE, ABOUT)
def set_devset_size(self, size=None):
    """
    Set the development-set size and redisplay the first sentence.

    The stored size is capped at ``len(self.devset)``.

    :param size: The requested size.  None keeps the stored value
        (which is still capped).
    """
    if size is not None:
        self._devset_size.set(size)
    capped = min(len(self.devset), self._devset_size.get())
    self._devset_size.set(capped)

    # Show sentence 1 and then sentence 0, in that order.
    for index in (1, 0):
        self.show_devset(index)
    # what about history?  Evaluated at diff dev set sizes!
def resize(self, size=None):
    """
    Apply a font size to the main font and the small font.

    Both fonts are configured with negative sizes.  The small font
    gets 14/20 of the main size (floor division on the negative
    value), and its size is never greater than -10.

    :param size: The new size, stored in ``self._size``.  None re-applies
        the stored size.
    """
    if size is not None:
        self._size.set(size)
    magnitude = abs(self._size.get())
    self._font.configure(size=-magnitude)
    scaled = (-magnitude * 14) // 20
    self._smallfont.configure(size=min(-10, scaled))
def mainloop(self, *args, **kwargs):
    """
    Enter the Tkinter mainloop.  This function must be called if
    this demo is created from a non-interactive program (e.g.
    from a script); otherwise, the demo will close as soon as
    the script completes.

    Nothing is done when ``in_idle()`` reports true.  All arguments
    are passed on to the top-level window's ``mainloop``.
    """
    if not in_idle():
        self.top.mainloop(*args, **kwargs)
def app():
    """Create the regexp chunk parser application and run its main loop."""
    application = RegexpChunkApp()
    application.mainloop()


# Launch the application when this file is run as a script.
if __name__ == "__main__":
    app()

# Only the launcher function is exported.
__all__ = ["app"]
|