| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
27732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450 |
- # Natural Language Toolkit: Framenet Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Authors: Chuck Wooters <wooters@icsi.berkeley.edu>,
- # Nathan Schneider <nathan.schneider@georgetown.edu>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Corpus reader for the FrameNet 1.7 lexicon and corpus.
- """
- import os
- import re
- import textwrap
- import itertools
- import sys
- import types
- from collections import defaultdict, OrderedDict
- from operator import itemgetter
- from itertools import zip_longest
- from pprint import pprint
- from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView
- from nltk.util import LazyConcatenation, LazyMap, LazyIteratorList
- __docformat__ = "epytext en"
def mimic_wrap(lines, wrap_at=65, **kwargs):
    """
    Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
    positions as the first.
    """
    # Wrap the first line normally; its segment lengths become the template
    # every following line is cut against.
    template = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n")
    yield template

    def split_like_template(text):
        # Cut 'text' at the same offsets as every template segment but the last.
        seg = 0
        while text and seg < len(template) - 1:
            width = len(template[seg])
            yield text[:width]
            text = text[width:]
            seg += 1
        if text:
            # Whatever extends past the mimicked line is wrapped on its own.
            for piece in textwrap.fill(text, wrap_at, drop_whitespace=False).split("\n"):
                yield piece

    for extra in lines[1:]:
        yield list(split_like_template(extra))
- def _pretty_longstring(defstr, prefix="", wrap_at=65):
- """
- Helper function for pretty-printing a long string.
- :param defstr: The string to be printed.
- :type defstr: str
- :return: A nicely formated string representation of the long string.
- :rtype: str
- """
- outstr = ""
- for line in textwrap.fill(defstr, wrap_at).split("\n"):
- outstr += prefix + line + "\n"
- return outstr
- def _pretty_any(obj):
- """
- Helper function for pretty-printing any AttrDict object.
- :param obj: The obj to be printed.
- :type obj: AttrDict
- :return: A nicely formated string representation of the AttrDict object.
- :rtype: str
- """
- outstr = ""
- for k in obj:
- if isinstance(obj[k], str) and len(obj[k]) > 65:
- outstr += "[{0}]\n".format(k)
- outstr += "{0}".format(_pretty_longstring(obj[k], prefix=" "))
- outstr += "\n"
- else:
- outstr += "[{0}] {1}\n".format(k, obj[k])
- return outstr
- def _pretty_semtype(st):
- """
- Helper function for pretty-printing a semantic type.
- :param st: The semantic type to be printed.
- :type st: AttrDict
- :return: A nicely formated string representation of the semantic type.
- :rtype: str
- """
- semkeys = st.keys()
- if len(semkeys) == 1:
- return "<None>"
- outstr = ""
- outstr += "semantic type ({0.ID}): {0.name}\n".format(st)
- if "abbrev" in semkeys:
- outstr += "[abbrev] {0}\n".format(st.abbrev)
- if "definition" in semkeys:
- outstr += "[definition]\n"
- outstr += _pretty_longstring(st.definition, " ")
- outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID)
- if st.superType is None:
- outstr += "[superType] <None>\n"
- else:
- outstr += "[superType] {0}({1})\n".format(st.superType.name, st.superType.ID)
- outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes))
- outstr += (
- " "
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in st.subTypes)
- + "\n" * (len(st.subTypes) > 0)
- )
- return outstr
- def _pretty_frame_relation_type(freltyp):
- """
- Helper function for pretty-printing a frame relation type.
- :param freltyp: The frame relation type to be printed.
- :type freltyp: AttrDict
- :return: A nicely formated string representation of the frame relation type.
- :rtype: str
- """
- outstr = "<frame relation type ({0.ID}): {0.superFrameName} -- {0.name} -> {0.subFrameName}>".format(
- freltyp
- )
- return outstr
- def _pretty_frame_relation(frel):
- """
- Helper function for pretty-printing a frame relation.
- :param frel: The frame relation to be printed.
- :type frel: AttrDict
- :return: A nicely formated string representation of the frame relation.
- :rtype: str
- """
- outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format(
- frel
- )
- return outstr
- def _pretty_fe_relation(ferel):
- """
- Helper function for pretty-printing an FE relation.
- :param ferel: The FE relation to be printed.
- :type ferel: AttrDict
- :return: A nicely formated string representation of the FE relation.
- :rtype: str
- """
- outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format(
- ferel
- )
- return outstr
- def _pretty_lu(lu):
- """
- Helper function for pretty-printing a lexical unit.
- :param lu: The lu to be printed.
- :type lu: AttrDict
- :return: A nicely formated string representation of the lexical unit.
- :rtype: str
- """
- lukeys = lu.keys()
- outstr = ""
- outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu)
- if "definition" in lukeys:
- outstr += "[definition]\n"
- outstr += _pretty_longstring(lu.definition, " ")
- if "frame" in lukeys:
- outstr += "\n[frame] {0}({1})\n".format(lu.frame.name, lu.frame.ID)
- if "incorporatedFE" in lukeys:
- outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE)
- if "POS" in lukeys:
- outstr += "\n[POS] {0}\n".format(lu.POS)
- if "status" in lukeys:
- outstr += "\n[status] {0}\n".format(lu.status)
- if "totalAnnotated" in lukeys:
- outstr += "\n[totalAnnotated] {0} annotated examples\n".format(
- lu.totalAnnotated
- )
- if "lexemes" in lukeys:
- outstr += "\n[lexemes] {0}\n".format(
- " ".join("{0}/{1}".format(lex.name, lex.POS) for lex in lu.lexemes)
- )
- if "semTypes" in lukeys:
- outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes))
- outstr += (
- " " * (len(lu.semTypes) > 0)
- + ", ".join("{0}({1})".format(x.name, x.ID) for x in lu.semTypes)
- + "\n" * (len(lu.semTypes) > 0)
- )
- if "URL" in lukeys:
- outstr += "\n[URL] {0}\n".format(lu.URL)
- if "subCorpus" in lukeys:
- subc = [x.name for x in lu.subCorpus]
- outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus))
- for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"):
- outstr += " {0}\n".format(line)
- if "exemplars" in lukeys:
- outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(
- len(lu.exemplars)
- )
- return outstr
- def _pretty_exemplars(exemplars, lu):
- """
- Helper function for pretty-printing a list of exemplar sentences for a lexical unit.
- :param sent: The list of exemplar sentences to be printed.
- :type sent: list(AttrDict)
- :return: An index of the text of the exemplar sentences.
- :rtype: str
- """
- outstr = ""
- outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu)
- for i, sent in enumerate(exemplars):
- outstr += "[{0}] {1}\n".format(i, sent.text)
- outstr += "\n"
- return outstr
- def _pretty_fulltext_sentences(sents):
- """
- Helper function for pretty-printing a list of annotated sentences for a full-text document.
- :param sent: The list of sentences to be printed.
- :type sent: list(AttrDict)
- :return: An index of the text of the sentences.
- :rtype: str
- """
- outstr = ""
- outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents)
- outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format(
- sents
- )
- outstr += "[sentence]\n".format(sents)
- for i, sent in enumerate(sents.sentence):
- outstr += "[{0}] {1}\n".format(i, sent.text)
- outstr += "\n"
- return outstr
- def _pretty_fulltext_sentence(sent):
- """
- Helper function for pretty-printing an annotated sentence from a full-text document.
- :param sent: The sentence to be printed.
- :type sent: list(AttrDict)
- :return: The text of the sentence with annotation set indices on frame targets.
- :rtype: str
- """
- outstr = ""
- outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(
- sent, sent.doc.get("name", sent.doc.description)
- )
- outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
- outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset)
- outstr += "[text] + [annotationSet]\n\n"
- outstr += sent._ascii() # -> _annotation_ascii()
- outstr += "\n"
- return outstr
def _pretty_pos(aset):
    """
    Helper function for pretty-printing a sentence with its POS tags.

    :param aset: The POS annotation set of the sentence to be printed.
    :type aset: list(AttrDict)
    :return: The text of the sentence and its POS tags.
    :rtype: str
    """
    header = "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format(
        aset
    )

    # Build three aligned rows: the sentence text, a dashed underline per
    # tagged span, and the tag labels themselves.
    spans = sorted(aset.POS)
    text_row = aset.sent.text
    dash_row = ""
    label_row = ""
    cursor = 0
    widened = 0  # columns inserted into text_row so far to make labels fit
    for start, end, tag in spans:
        assert start >= cursor, ("Overlapping targets?", (start, end, tag))
        dash_row += " " * (start - cursor) + "-" * (end - start)
        if len(tag) > (end - start):
            # Pad the sentence so the tag label fits under its span;
            # '~' (swapped for ' ' at the end) keeps textwrap from
            # breaking the line inside the padding.
            pad = len(tag) - (end - start)
            at = end + widened
            text_row = text_row[:at] + "~" * pad + text_row[at:]
            dash_row = dash_row[:at] + " " * pad + dash_row[at:]
            widened += pad
        label_row += " " * (start - cursor) + tag.ljust(end - start)
        cursor = end

    rows = [text_row, dash_row, label_row]
    body = "\n\n".join(
        map("\n".join, zip_longest(*mimic_wrap(rows), fillvalue=" "))
    ).replace("~", " ")
    return header + body + "\n"
def _pretty_annotation(sent, aset_level=False):
    """
    Helper function for pretty-printing an exemplar sentence for a lexical unit.

    :param sent: An annotation set or exemplar sentence to be printed.
    :param aset_level: If True, 'sent' is actually an annotation set within a sentence.
    :type sent: AttrDict
    :return: A nicely formated string representation of the exemplar sentence
    with its target, frame, and FE annotations.
    :rtype: str
    """
    sentkeys = sent.keys()
    # Header wording depends on whether this is a whole exemplar sentence or
    # a single annotation set within one.
    outstr = "annotation set" if aset_level else "exemplar sentence"
    outstr += " ({0.ID}):\n".format(sent)
    if aset_level:  # TODO: any UNANN exemplars?
        outstr += "\n[status] {0}\n".format(sent.status)
    # Provenance attributes, rendered only if present on this object.
    for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"):
        if k in sentkeys:
            outstr += "[{0}] {1}\n".format(k, sent[k])
    outstr += (
        "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU)
        if sent.LU
        else "\n[LU] Not found!"
    )
    outstr += "\n[frame] ({0.ID}) {0.name}\n".format(
        sent.frame
    )  # redundant with above, but .frame is convenient
    if not aset_level:
        # Sentence-level counts; meaningless for a single annotation set.
        outstr += "\n[annotationSet] {0} annotation sets\n".format(
            len(sent.annotationSet)
        )
        outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
        outstr += "\n[POS_tagset] {0}\n".format(sent.POS_tagset)
    outstr += "\n[GF] {0} relation{1}\n".format(
        len(sent.GF), "s" if len(sent.GF) != 1 else ""
    )
    outstr += "\n[PT] {0} phrase{1}\n".format(
        len(sent.PT), "s" if len(sent.PT) != 1 else ""
    )
    # NOTE: the triple-quoted string below is a deliberate no-op expression,
    # kept as in-source documentation of FrameNet's special annotation layers.
    """
    Special Layers
    --------------

    The 'NER' layer contains, for some of the data, named entity labels.

    The 'WSL' (word status layer) contains, for some of the data,
    spans which should not in principle be considered targets (NT).

    The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent),
    pleonastic 'it' (Null), and existential 'there' (Exist).
    On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml).

    The 'Sent' layer appears to contain labels that the annotator has flagged the
    sentence with for their convenience: values include
    'sense1', 'sense2', 'sense3', etc.;
    'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent',
    'keepS', 'deleteS', 'reexamine'
    (sometimes they are duplicated for no apparent reason).

    The POS-specific layers may contain the following kinds of spans:
    Asp (aspectual particle), Non-Asp (non-aspectual particle),
    Cop (copula), Supp (support), Ctrlr (controller),
    Gov (governor), X. Gov and X always cooccur.

    >>> from nltk.corpus import framenet as fn
    >>> def f(luRE, lyr, ignore=set()):
    ...   for i,ex in enumerate(fn.exemplars(luRE)):
    ...     if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore:
    ...       print(i,ex[lyr])

    - Verb: Asp, Non-Asp
    - Noun: Cop, Supp, Ctrlr, Gov, X
    - Adj: Cop, Supp, Ctrlr, Gov, X
    - Prep: Cop, Supp, Ctrlr
    - Adv: Ctrlr
    - Scon: (none)
    - Art: (none)
    """
    # Counts for the special layers described above, when present.
    for lyr in ("NER", "WSL", "Other", "Sent"):
        if lyr in sent and sent[lyr]:
            outstr += "\n[{0}] {1} entr{2}\n".format(
                lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y"
            )
    outstr += "\n[text] + [Target] + [FE]"
    # POS-specific layers: syntactically important words that are neither the target
    # nor the FEs. Include these along with the first FE layer but with '^' underlining.
    for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
        if lyr in sent and sent[lyr]:
            outstr += " + [{0}]".format(lyr)
    if "FE2" in sentkeys:
        outstr += " + [FE2]"
        if "FE3" in sentkeys:
            outstr += " + [FE3]"
    outstr += "\n\n"
    outstr += sent._ascii()  # -> _annotation_ascii()
    outstr += "\n"

    return outstr
def _annotation_ascii(sent):
    """
    Given a sentence or FE annotation set, construct the width-limited string showing
    an ASCII visualization of the sentence's annotations, calling either
    _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
    This will be attached as a method to appropriate AttrDict instances
    and called in the full pretty-printing of the instance.
    """
    if sent._type == "fulltext_sentence" or (
        "annotationSet" in sent and len(sent.annotationSet) > 2
    ):
        # Full-text sentence OR an LU sentence with several targets
        # (>2 annotation sets, since the first annotation set is POS).
        return _annotation_ascii_frames(sent)
    # Otherwise: an FE annotation set, or an LU sentence with one target.
    return _annotation_ascii_FEs(sent)
def _annotation_ascii_frames(sent):
    """
    ASCII string rendering of the sentence along with its targets and frame names.
    Called for all full-text sentences, as well as the few LU sentences with multiple
    targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
    Line-wrapped to limit the display width.
    """
    # list the target spans and their associated aset index
    # (annotationSet[0] is the POS layer, so display indices start at 1)
    overt = []
    for a, aset in enumerate(sent.annotationSet[1:]):
        for j, k in aset.Target:
            indexS = "[{0}]".format(a + 1)
            if aset.status == "UNANN" or aset.LU.status == "Problem":
                indexS += " "
                if aset.status == "UNANN":
                    indexS += (
                        "!"
                    )  # warning indicator that there is a frame annotation but no FE annotation
                if aset.LU.status == "Problem":
                    indexS += (
                        "?"
                    )  # warning indicator that there is a missing LU definition (because the LU has Problem status)
            overt.append((j, k, aset.LU.frame.name, indexS))
    overt = sorted(overt)

    # Merge entries that annotate the same target span with the same frame;
    # bail out entirely if different frames overlap on a span.
    duplicates = set()
    for o, (j, k, fname, asetIndex) in enumerate(overt):
        if o > 0 and j <= overt[o - 1][1]:
            # multiple annotation sets on the same target
            # (e.g. due to a coordination construction or multiple annotators)
            if (
                overt[o - 1][:2] == (j, k) and overt[o - 1][2] == fname
            ):  # same target, same frame
                # splice indices together
                combinedIndex = (
                    overt[o - 1][3] + asetIndex
                )  # e.g., '[1][2]', '[1]! [2]'
                combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ")
                overt[o - 1] = overt[o - 1][:3] + (combinedIndex,)
                duplicates.add(o)
            else:  # different frames, same or overlapping targets
                s = sent.text
                for j, k, fname, asetIndex in overt:
                    s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname
                s += "\n(Unable to display sentence with targets marked inline due to overlap)"
                return s
    # Delete merged entries from the end so earlier indices stay valid.
    for o in reversed(sorted(duplicates)):
        del overt[o]

    # Four aligned rows: sentence text, '*' underlines, (abbreviated) frame
    # names, and annotation-set indices.
    s0 = sent.text
    s1 = ""
    s11 = ""
    s2 = ""
    i = 0
    adjust = 0  # columns inserted into s0 so far to fit wide indices
    fAbbrevs = OrderedDict()
    for j, k, fname, asetIndex in overt:
        # NOTE(review): the 'if not j >= i' guard means the assert below only
        # runs when it is about to fail — it exists to attach a diagnostic.
        if not j >= i:
            assert j >= i, (
                "Overlapping targets?"
                + (
                    " UNANN"
                    if any(aset.status == "UNANN" for aset in sent.annotationSet[1:])
                    else ""
                ),
                (j, k, asetIndex),
            )
        s1 += " " * (j - i) + "*" * (k - j)
        short = fname[: k - j]
        if (k - j) < len(fname):
            # Frame name is wider than its span: abbreviate, disambiguating
            # clashing abbreviations with a numeric suffix.
            r = 0
            while short in fAbbrevs:
                if fAbbrevs[short] == fname:
                    break
                r += 1
                short = fname[: k - j - 1] + str(r)
            else:  # short not in fAbbrevs
                fAbbrevs[short] = fname
        s11 += " " * (j - i) + short.ljust(k - j)
        if len(asetIndex) > (k - j):
            # add space in the sentence to make room for the annotation index
            amt = len(asetIndex) - (k - j)
            s0 = (
                s0[: k + adjust] + "~" * amt + s0[k + adjust :]
            )  # '~' to prevent line wrapping
            s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
            s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :]
            adjust += amt
        s2 += " " * (j - i) + asetIndex.ljust(k - j)
        i = k
    long_lines = [s0, s1, s11, s2]

    outstr = "\n\n".join(
        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
    ).replace("~", " ")
    outstr += "\n"
    if fAbbrevs:
        # Legend mapping abbreviations back to full frame names.
        outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")"
        assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash"

    return outstr
- def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
- """Helper for _annotation_ascii_FEs()."""
- s1 = ""
- s2 = ""
- i = 0
- for j, k, fename in overt:
- s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j)
- short = fename[: k - j]
- if len(fename) > len(short):
- r = 0
- while short in feAbbrevs:
- if feAbbrevs[short] == fename:
- break
- r += 1
- short = fename[: k - j - 1] + str(r)
- else: # short not in feAbbrevs
- feAbbrevs[short] = fename
- s2 += " " * (j - i) + short.ljust(k - j)
- i = k
- sNI = ""
- if ni:
- sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]"
- return [s1, s2, sNI]
def _annotation_ascii_FEs(sent):
    """
    ASCII string rendering of the sentence along with a single target and its FEs.
    Secondary and tertiary FE layers are included if present.
    'sent' can be an FE annotation set or an LU sentence with a single target.
    Line-wrapped to limit the display width.
    """
    feAbbrevs = OrderedDict()
    posspec = []  # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
    posspec_separate = False
    for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
        if lyr in sent and sent[lyr]:
            for a, b, lbl in sent[lyr]:
                if (
                    lbl == "X"
                ):  # skip this, which covers an entire phrase typically containing the target and all its FEs
                    # (but do display the Gov)
                    continue
                if any(1 for x, y, felbl in sent.FE[0] if x <= a < y or a <= x < b):
                    # overlap between one of the POS-specific layers and first FE layer
                    posspec_separate = (
                        True
                    )  # show POS-specific layers on a separate line
                posspec.append(
                    (a, b, lbl.lower().replace("-", ""))
                )  # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
    if posspec_separate:
        POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
    # First FE layer; POS-specific spans are folded in here when they don't
    # overlap any FE span (otherwise they were rendered separately above).
    FE1 = _annotation_ascii_FE_layer(
        sorted(sent.FE[0] + (posspec if not posspec_separate else [])),
        sent.FE[1],
        feAbbrevs,
    )
    FE2 = FE3 = None
    if "FE2" in sent:
        FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
        if "FE3" in sent:
            FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)

    # Mark the target span in the first FE layer's underline row:
    # blanks become '*' and FE dashes become '='.
    for i, j in sent.Target:
        FE1span, FE1name, FE1exp = FE1
        if len(FE1span) < j:
            FE1span += " " * (j - len(FE1span))
        if len(FE1name) < j:
            FE1name += " " * (j - len(FE1name))
            FE1[1] = FE1name
        FE1[0] = (
            FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:]
        )
    long_lines = [sent.text]
    if posspec_separate:
        long_lines.extend(POSSPEC[:2])
    long_lines.extend([FE1[0], FE1[1] + FE1[2]])  # lines with no length limit
    if FE2:
        long_lines.extend([FE2[0], FE2[1] + FE2[2]])
        if FE3:
            long_lines.extend([FE3[0], FE3[1] + FE3[2]])
    long_lines.append("")
    outstr = "\n".join(
        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
    )
    if feAbbrevs:
        # Legend mapping abbreviations back to full FE names.
        outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")"
        assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash"
    outstr += "\n"
    return outstr
- def _pretty_fe(fe):
- """
- Helper function for pretty-printing a frame element.
- :param fe: The frame element to be printed.
- :type fe: AttrDict
- :return: A nicely formated string representation of the frame element.
- :rtype: str
- """
- fekeys = fe.keys()
- outstr = ""
- outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format(
- fe, fe.frame
- )
- if "definition" in fekeys:
- outstr += "[definition]\n"
- outstr += _pretty_longstring(fe.definition, " ")
- if "abbrev" in fekeys:
- outstr += "[abbrev] {0}\n".format(fe.abbrev)
- if "coreType" in fekeys:
- outstr += "[coreType] {0}\n".format(fe.coreType)
- if "requiresFE" in fekeys:
- outstr += "[requiresFE] "
- if fe.requiresFE is None:
- outstr += "<None>\n"
- else:
- outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID)
- if "excludesFE" in fekeys:
- outstr += "[excludesFE] "
- if fe.excludesFE is None:
- outstr += "<None>\n"
- else:
- outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID)
- if "semType" in fekeys:
- outstr += "[semType] "
- if fe.semType is None:
- outstr += "<None>\n"
- else:
- outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + "\n"
- return outstr
def _pretty_frame(frame):
    """
    Helper function for pretty-printing a frame.

    :param frame: The frame to be printed.
    :type frame: AttrDict
    :return: A nicely formated string representation of the frame.
    :rtype: str
    """
    outstr = ""
    outstr += "frame ({0.ID}): {0.name}\n\n".format(frame)
    outstr += "[URL] {0}\n\n".format(frame.URL)
    outstr += "[definition]\n"
    outstr += _pretty_longstring(frame.definition, " ") + "\n"

    outstr += "[semTypes] {0} semantic types\n".format(len(frame.semTypes))
    # The leading space and trailing newline are emitted only when at least one
    # semType exists (bool multiplies the string by 0 or 1).
    outstr += (
        " " * (len(frame.semTypes) > 0)
        + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes)
        + "\n" * (len(frame.semTypes) > 0)
    )

    outstr += "\n[frameRelations] {0} frame relations\n".format(
        len(frame.frameRelations)
    )
    outstr += " " + "\n ".join(repr(frel) for frel in frame.frameRelations) + "\n"

    outstr += "\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit))
    lustrs = [
        "{0} ({1})".format(luName, lu.ID) for luName, lu in sorted(frame.lexUnit.items())
    ]
    outstr += "{0}\n".format(_pretty_longstring(", ".join(lustrs), prefix=" "))

    outstr += "\n[FE] {0} frame elements\n".format(len(frame.FE))
    # Group FE descriptions by coreness category (was a try/except-KeyError
    # append pattern; defaultdict is the idiomatic equivalent).
    fes = defaultdict(list)
    for feName, fe in sorted(frame.FE.items()):
        fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID))
    # Known categories display in coreness order; an unrecognized coreType now
    # sorts last instead of raising ValueError from a bare list.index().
    coretype_order = ["Core", "Core-Unexpressed", "Peripheral", "Extra-Thematic"]

    def _coreness(ct2):
        try:
            return coretype_order.index(ct2)
        except ValueError:
            return len(coretype_order)

    for ct in sorted(fes.keys(), key=_coreness):
        outstr += "{0:>16}: {1}\n".format(ct, ", ".join(sorted(fes[ct])))

    outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(
        len(frame.FEcoreSets)
    )
    outstr += (
        " "
        + "\n ".join(
            ", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets
        )
        + "\n"
    )

    return outstr
class FramenetError(Exception):
    """An exception class for framenet-related errors.

    Raised, e.g., for unknown frame names/IDs, unknown LU IDs, and unknown
    full-text document IDs.
    """
class AttrDict(dict):
    """A class that wraps a dict and allows accessing the keys of the
    dict as if they were attributes. Taken from here:
    http://stackoverflow.com/a/14620633/8879

    >>> foo = {'a':1, 'b':2, 'c':3}
    >>> bar = AttrDict(foo)
    >>> pprint(dict(bar))
    {'a': 1, 'b': 2, 'c': 3}
    >>> bar.b
    2
    >>> bar.d = 4
    >>> pprint(dict(bar))
    {'a': 1, 'b': 2, 'c': 3, 'd': 4}
    """

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)

    def __setattr__(self, name, value):
        # Attribute assignment writes through to the underlying dict.
        self[name] = value

    def __getattr__(self, name):
        # `_short_repr` is a real method, not a stored key.
        if name == "_short_repr":
            return self._short_repr
        return self[name]

    def __getitem__(self, name):
        # Transparently force lazily-loaded (Future) values on lookup.
        value = super(AttrDict, self).__getitem__(name)
        return value._data() if isinstance(value, Future) else value

    def _short_repr(self):
        """One-line summary like ``<frame ID=26 name=Expectation>`` where possible."""
        if "_type" not in self:
            return self.__repr__()
        typ = self["_type"]
        if typ.endswith("relation"):
            return self.__repr__()
        try:
            return "<{0} ID={1} name={2}>".format(typ, self["ID"], self["name"])
        except KeyError:
            try:  # no ID--e.g., for _type=lusubcorpus
                return "<{0} name={1}>".format(typ, self["name"])
            except KeyError:  # no name--e.g., for _type=lusentence
                return "<{0} ID={1}>".format(typ, self["ID"])

    def _str(self):
        """Dispatch to the pretty-printer matching this entry's '_type' tag."""
        if "_type" not in self:
            return _pretty_any(self)
        typ = self["_type"]
        if typ == "frame":
            return _pretty_frame(self)
        if typ == "fe":
            return _pretty_fe(self)
        if typ == "lu":
            return _pretty_lu(self)
        if typ == "luexemplars":  # list of ALL exemplars for LU
            return _pretty_exemplars(self, self[0].LU)
        if typ == "fulltext_annotation":  # list of all sentences for full-text doc
            return _pretty_fulltext_sentences(self)
        if typ == "lusentence":
            return _pretty_annotation(self)
        if typ == "fulltext_sentence":
            return _pretty_fulltext_sentence(self)
        if typ in ("luannotationset", "fulltext_annotationset"):
            return _pretty_annotation(self, aset_level=True)
        if typ == "posannotationset":
            return _pretty_pos(self)
        if typ == "semtype":
            return _pretty_semtype(self)
        if typ == "framerelationtype":
            return _pretty_frame_relation_type(self)
        if typ == "framerelation":
            return _pretty_frame_relation(self)
        if typ == "ferelation":
            return _pretty_fe_relation(self)
        # Fallback for unrecognized '_type' values.
        # (Result is a str; non-ASCII data is fine in Python 3.)
        return _pretty_any(self)

    def __str__(self):
        return self._str()

    def __repr__(self):
        return self.__str__()
class SpecialList(list):
    """
    A list subclass which adds a '_type' attribute for special printing
    (similar to an AttrDict, though this is NOT an AttrDict subclass).
    """

    def __init__(self, typ, *args, **kwargs):
        super(SpecialList, self).__init__(*args, **kwargs)
        # Tag deciding which pretty-printer _str() dispatches to.
        self._type = typ

    def _str(self):
        assert self._type
        if not self:
            return "[]"
        if self._type == "luexemplars":  # list of ALL exemplars for LU
            return _pretty_exemplars(self, self[0].LU)
        assert False, self._type  # unknown tag: nothing knows how to print it

    def __str__(self):
        return self._str()

    def __repr__(self):
        return self.__str__()
class Future(object):
    """
    Wraps and acts as a proxy for a value to be loaded lazily (on demand).
    Adapted from https://gist.github.com/sergey-miryanov/2935416
    """

    def __init__(self, loader, *args, **kwargs):
        """
        :param loader: when called with no arguments, returns the value to be stored
        :type loader: callable
        """
        super(Future, self).__init__(*args, **kwargs)
        self._loader = loader
        self._d = None

    def _data(self):
        # Run the loader on first access only; afterwards _loader is None and
        # the cached value in _d is returned.
        if callable(self._loader):
            self._d = self._loader()
            self._loader = None  # the data is now cached
        return self._d

    def __nonzero__(self):  # Python 2 name; kept for backward compatibility
        return bool(self._data())

    __bool__ = __nonzero__  # Python 3 truth testing goes through the same path

    def __len__(self):
        return len(self._data())

    def __setitem__(self, key, value):
        return self._data().__setitem__(key, value)

    def __getitem__(self, key):
        return self._data().__getitem__(key)

    def __getattr__(self, key):
        # BUG FIX: was `self._data().__getattr__(key)`, which raises
        # AttributeError for wrapped objects that don't define __getattr__
        # (e.g. plain dicts). getattr() resolves any attribute of the value.
        return getattr(self._data(), key)

    def __str__(self):
        return self._data().__str__()

    def __repr__(self):
        return self._data().__repr__()
class PrettyDict(AttrDict):
    """
    Displays an abbreviated repr of values where possible.
    Inherits from AttrDict, so a callable value will
    be lazily converted to an actual value.
    """

    def __init__(self, *args, **kwargs):
        break_lines = kwargs.pop("breakLines", False)
        super(PrettyDict, self).__init__(*args, **kwargs)
        # Go through dict.__setattr__ so the flag becomes a real attribute
        # instead of being stored as a dictionary entry by AttrDict.
        dict.__setattr__(self, "_BREAK_LINES", break_lines)

    def __repr__(self):
        sep = ",\n " if self._BREAK_LINES else ", "
        parts = []
        for key, value in sorted(self.items()):
            try:
                shown = value._short_repr()
            except AttributeError:
                shown = repr(value)
            parts.append("{0}: {1}".format(repr(key), shown))
        return "{" + sep.join(parts) + "}"
class PrettyList(list):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    def __init__(self, *args, **kwargs):
        # maxReprSize: character budget for __repr__ (0/None disables truncation).
        self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60)
        # breakLines: if truthy, separate elements with newlines in __repr__.
        self._BREAK_LINES = kwargs.pop("breakLines", False)
        super(PrettyList, self).__init__(*args, **kwargs)

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        pieces = []
        length = 5
        # The redundant str(...) wrapper around the separator literal (a
        # Python 2 leftover) was removed; the separator is also hoisted.
        sep = ",\n " if self._BREAK_LINES else ", "
        for elt in self:
            # key difference from inherited version: call to _short_repr()
            pieces.append(elt._short_repr())
            length += len(pieces[-1]) + 2
            if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2:
                return "[%s, ...]" % sep.join(pieces[:-1])
        return "[%s]" % sep.join(pieces)
class PrettyLazyMap(LazyMap):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        pieces = []
        length = 5
        for elt in self:
            # key difference from inherited version: call to _short_repr()
            pieces.append(elt._short_repr())
            length += len(pieces[-1]) + 2
            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                # `str(", ")` in the original was a Python 2 leftover; join directly.
                return "[%s, ...]" % ", ".join(pieces[:-1])
        return "[%s]" % ", ".join(pieces)
class PrettyLazyIteratorList(LazyIteratorList):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        pieces = []
        length = 5
        for elt in self:
            # key difference from inherited version: call to _short_repr()
            pieces.append(elt._short_repr())
            length += len(pieces[-1]) + 2
            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                # `str(", ")` in the original was a Python 2 leftover; join directly.
                return "[%s, ...]" % ", ".join(pieces[:-1])
        return "[%s]" % ", ".join(pieces)
class PrettyLazyConcatenation(LazyConcatenation):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        pieces = []
        length = 5
        for elt in self:
            # key difference from inherited version: call to _short_repr()
            pieces.append(elt._short_repr())
            length += len(pieces[-1]) + 2
            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                # `str(", ")` in the original was a Python 2 leftover; join directly.
                return "[%s, ...]" % ", ".join(pieces[:-1])
        return "[%s]" % ", ".join(pieces)

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return PrettyLazyIteratorList(itertools.chain(self, other))

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return PrettyLazyIteratorList(itertools.chain(other, self))
class FramenetCorpusReader(XMLCorpusReader):
    """A corpus reader for the Framenet Corpus.

    >>> from nltk.corpus import framenet as fn
    >>> fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238)
    True
    >>> fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame
    True
    >>> fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality')
    True
    """

    _bad_statuses = ["Problem"]
    """
    When loading LUs for a frame, those whose status is in this list will be ignored.
    Due to caching, if user code modifies this, it should do so before loading any data.
    'Problem' should always be listed for FrameNet 1.5, as these LUs are not included
    in the XML index.
    """

    _warnings = False  # class-level default; warnings() enables it per instance
    def warnings(self, v):
        """Enable or disable warnings of data integrity issues as they are encountered.
        If v is truthy, warnings will be enabled.

        (This is a function rather than just an attribute/property to ensure that if
        enabling warnings is the first action taken, the corpus reader is instantiated first.)

        :param v: truthy to enable warnings, falsy to disable them
        """
        self._warnings = v
    def __init__(self, root, fileids):
        """
        :param root: path to the root of the FrameNet data directory
        :param fileids: fileids passed through to the underlying XMLCorpusReader
        """
        XMLCorpusReader.__init__(self, root, fileids)

        # framenet corpus sub dirs
        # sub dir containing the xml files for frames
        self._frame_dir = "frame"
        # sub dir containing the xml files for lexical units
        self._lu_dir = "lu"
        # sub dir containing the xml files for fulltext annotation files
        self._fulltext_dir = "fulltext"

        # location of latest development version of FrameNet
        self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data"

        # Indexes used for faster look-ups; all lazily built (None until needed)
        self._frame_idx = None
        self._cached_frames = {}  # name -> ID
        self._lu_idx = None
        self._fulltext_idx = None
        self._semtypes = None
        self._freltyp_idx = None  # frame relation types (Inheritance, Using, etc.)
        self._frel_idx = None  # frame-to-frame relation instances
        self._ferel_idx = None  # FE-to-FE relation instances
        self._frel_f_idx = None  # frame-to-frame relations associated with each frame
    def help(self, attrname=None):
        """Display help information summarizing the main methods.

        :param attrname: if given, delegate to the built-in ``help()`` for that
            attribute of the reader instead of printing the summary. (optional)
        """
        if attrname is not None:
            return help(self.__getattribute__(attrname))

        # No need to mention frame_by_name() or frame_by_id(),
        # as it's easier to just call frame().
        # Also not mentioning lu_basic().

        msg = """
Citation: Nathan Schneider and Chuck Wooters (2017),
"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource".
Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438

Use the following methods to access data in FrameNet.
Provide a method name to `help()` for more information.

FRAMES
======

frame() to look up a frame by its exact name or ID
frames() to get frames matching a name pattern
frames_by_lemma() to get frames containing an LU matching a name pattern
frame_ids_and_names() to get a mapping from frame IDs to names

FRAME ELEMENTS
==============

fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained
  by a frame name pattern

LEXICAL UNITS
=============

lu() to look up an LU by its ID
lus() to get lexical units matching a name pattern, optionally constrained by frame
lu_ids_and_names() to get a mapping from LU IDs to names

RELATIONS
=========

frame_relation_types() to get the different kinds of frame-to-frame relations
  (Inheritance, Subframe, Using, etc.).
frame_relations() to get the relation instances, optionally constrained by
  frame(s) or relation type
fe_relations() to get the frame element pairs belonging to a frame-to-frame relation

SEMANTIC TYPES
==============

semtypes() to get the different kinds of semantic types that can be applied to
  FEs, LUs, and entire frames
semtype() to look up a particular semtype by name, ID, or abbreviation
semtype_inherits() to check whether two semantic types have a subtype-supertype
  relationship in the semtype hierarchy
propagate_semtypes() to apply inference rules that distribute semtypes over relations
  between FEs

ANNOTATIONS
===========

annotations() to get annotation sets, in which a token in a sentence is annotated
  with a lexical unit in a frame, along with its frame elements and their syntactic properties;
  can be constrained by LU name pattern and limited to lexicographic exemplars or full-text.
  Sentences of full-text annotation can have multiple annotation sets.
sents() to get annotated sentences illustrating one or more lexical units
exemplars() to get sentences of lexicographic annotation, most of which have
  just 1 annotation set; can be constrained by LU name pattern, frame, and overt FE(s)
doc() to look up a document of full-text annotation by its ID
docs() to get documents of full-text annotation that match a name pattern
docs_metadata() to get metadata about all full-text documents without loading them
ft_sents() to iterate over sentences of full-text annotation

UTILITIES
=========

buildindexes() loads metadata about all frames, LUs, etc. into memory to avoid
  delay when one is accessed for the first time. It does not load annotations.
readme() gives the text of the FrameNet README file
warnings(True) to display corpus consistency warnings when loading data
        """
        print(msg)
- def _buildframeindex(self):
- # The total number of Frames in Framenet is fairly small (~1200) so
- # this index should not be very large
- if not self._frel_idx:
- self._buildrelationindex() # always load frame relations before frames,
- # otherwise weird ordering effects might result in incomplete information
- self._frame_idx = {}
- for f in XMLCorpusView(
- self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt
- ):
- self._frame_idx[f["ID"]] = f
- def _buildcorpusindex(self):
- # The total number of fulltext annotated documents in Framenet
- # is fairly small (~90) so this index should not be very large
- self._fulltext_idx = {}
- for doclist in XMLCorpusView(
- self.abspath("fulltextIndex.xml"),
- "fulltextIndex/corpus",
- self._handle_fulltextindex_elt,
- ):
- for doc in doclist:
- self._fulltext_idx[doc.ID] = doc
- def _buildluindex(self):
- # The number of LUs in Framenet is about 13,000 so this index
- # should not be very large
- self._lu_idx = {}
- for lu in XMLCorpusView(
- self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt
- ):
- self._lu_idx[
- lu["ID"]
- ] = lu # populate with LU index entries. if any of these
- # are looked up they will be replaced by full LU objects.
    def _buildrelationindex(self):
        """Build indexes over frame-relation types, frame-to-frame relation
        instances, and FE-to-FE relation instances. Frame and FE endpoints are
        stored as Futures so frames are only loaded on demand."""
        # print('building relation index...', file=sys.stderr)
        freltypes = PrettyList(
            x
            for x in XMLCorpusView(
                self.abspath("frRelation.xml"),
                "frameRelations/frameRelationType",
                self._handle_framerelationtype_elt,
            )
        )
        self._freltyp_idx = {}
        self._frel_idx = {}
        self._frel_f_idx = defaultdict(set)
        self._ferel_idx = {}

        for freltyp in freltypes:
            self._freltyp_idx[freltyp.ID] = freltyp
            for frel in freltyp.frameRelations:
                # The immediately-invoked outer lambda binds the current ID by
                # value, avoiding Python's late-binding closure pitfall.
                supF = frel.superFrame = frel[freltyp.superFrameName] = Future(
                    (lambda fID: lambda: self.frame_by_id(fID))(frel.supID)
                )
                subF = frel.subFrame = frel[freltyp.subFrameName] = Future(
                    (lambda fID: lambda: self.frame_by_id(fID))(frel.subID)
                )
                self._frel_idx[frel.ID] = frel
                # Index the relation under both of its endpoint frames.
                self._frel_f_idx[frel.supID].add(frel.ID)
                self._frel_f_idx[frel.subID].add(frel.ID)
                for ferel in frel.feRelations:
                    ferel.superFrame = supF
                    ferel.subFrame = subF
                    ferel.superFE = Future(
                        (lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(ferel)
                    )
                    ferel.subFE = Future(
                        (lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel)
                    )
                    self._ferel_idx[ferel.ID] = ferel
        # print('...done building relation index', file=sys.stderr)
- def _warn(self, *message, **kwargs):
- if self._warnings:
- kwargs.setdefault("file", sys.stderr)
- print(*message, **kwargs)
    def readme(self):
        """
        Return the contents of the corpus README.txt (or README) file.

        :return: the README text
        :rtype: str
        """
        try:
            return self.open("README.txt").read()
        except IOError:
            # fall back to the extensionless filename used by some releases
            return self.open("README").read()
- def buildindexes(self):
- """
- Build the internal indexes to make look-ups faster.
- """
- # Frames
- self._buildframeindex()
- # LUs
- self._buildluindex()
- # Fulltext annotation corpora index
- self._buildcorpusindex()
- # frame and FE relations
- self._buildrelationindex()
    def doc(self, fn_docid):
        """
        Returns the annotated document whose id number is
        ``fn_docid``. This id number can be obtained by calling the
        Documents() function.

        The dict that is returned from this function will contain the
        following keys:

        - '_type'      : 'fulltextannotation'
        - 'sentence'   : a list of sentences in the document
           - Each item in the list is a dict containing the following keys:
             - 'ID'    : the ID number of the sentence
             - '_type' : 'sentence'
             - 'text'  : the text of the sentence
             - 'paragNo' : the paragraph number
             - 'sentNo'  : the sentence number
             - 'docID'   : the document ID number
             - 'corpID'  : the corpus ID number
             - 'aPos'    : the annotation position
             - 'annotationSet' : a list of annotation layers for the sentence
               - Each item in the list is a dict containing the following keys:
                 - 'ID'       : the ID number of the annotation set
                 - '_type'    : 'annotationset'
                 - 'status'   : either 'MANUAL' or 'UNANN'
                 - 'luName'   : (only if status is 'MANUAL')
                 - 'luID'     : (only if status is 'MANUAL')
                 - 'frameID'  : (only if status is 'MANUAL')
                 - 'frameName': (only if status is 'MANUAL')
                 - 'layer' : a list of labels for the layer
                   - Each item in the layer is a dict containing the
                     following keys:
                     - '_type': 'layer'
                     - 'rank'
                     - 'name'
                     - 'label' : a list of labels in the layer
                       - Each item is a dict containing the following keys:
                         - 'start'
                         - 'end'
                         - 'name'
                         - 'feID' (optional)

        :param fn_docid: The Framenet id number of the document
        :type fn_docid: int
        :return: Information about the annotated document
        :rtype: dict
        """
        try:
            xmlfname = self._fulltext_idx[fn_docid].filename
        except TypeError:  # happens when self._fulltext_idx == None
            # build the index
            self._buildcorpusindex()
            xmlfname = self._fulltext_idx[fn_docid].filename
        except KeyError:  # probably means that fn_docid was not in the index
            raise FramenetError("Unknown document id: {0}".format(fn_docid))

        # construct the path name for the xml file containing the document info
        locpath = os.path.join("{0}".format(self._root), self._fulltext_dir, xmlfname)

        # Grab the top-level xml element containing the fulltext annotation
        elt = XMLCorpusView(locpath, "fullTextAnnotation")[0]
        info = self._handle_fulltextannotation_elt(elt)
        # add metadata from the index entry (filename, name, etc.)
        for k, v in self._fulltext_idx[fn_docid].items():
            info[k] = v
        return info
- def frame_by_id(self, fn_fid, ignorekeys=[]):
- """
- Get the details for the specified Frame using the frame's id
- number.
- Usage examples:
- >>> from nltk.corpus import framenet as fn
- >>> f = fn.frame_by_id(256)
- >>> f.ID
- 256
- >>> f.name
- 'Medical_specialties'
- >>> f.definition
- "This frame includes words that name ..."
- :param fn_fid: The Framenet id number of the frame
- :type fn_fid: int
- :param ignorekeys: The keys to ignore. These keys will not be
- included in the output. (optional)
- :type ignorekeys: list(str)
- :return: Information about a frame
- :rtype: dict
- Also see the ``frame()`` function for details about what is
- contained in the dict that is returned.
- """
- # get the name of the frame with this id number
- try:
- fentry = self._frame_idx[fn_fid]
- if "_type" in fentry:
- return fentry # full frame object is cached
- name = fentry["name"]
- except TypeError:
- self._buildframeindex()
- name = self._frame_idx[fn_fid]["name"]
- except KeyError:
- raise FramenetError("Unknown frame id: {0}".format(fn_fid))
- return self.frame_by_name(name, ignorekeys, check_cache=False)
- def frame_by_name(self, fn_fname, ignorekeys=[], check_cache=True):
- """
- Get the details for the specified Frame using the frame's name.
- Usage examples:
- >>> from nltk.corpus import framenet as fn
- >>> f = fn.frame_by_name('Medical_specialties')
- >>> f.ID
- 256
- >>> f.name
- 'Medical_specialties'
- >>> f.definition
- "This frame includes words that name ..."
- :param fn_fname: The name of the frame
- :type fn_fname: str
- :param ignorekeys: The keys to ignore. These keys will not be
- included in the output. (optional)
- :type ignorekeys: list(str)
- :return: Information about a frame
- :rtype: dict
- Also see the ``frame()`` function for details about what is
- contained in the dict that is returned.
- """
- if check_cache and fn_fname in self._cached_frames:
- return self._frame_idx[self._cached_frames[fn_fname]]
- elif not self._frame_idx:
- self._buildframeindex()
- # construct the path name for the xml file containing the Frame info
- locpath = os.path.join(
- "{0}".format(self._root), self._frame_dir, fn_fname + ".xml"
- )
- # print(locpath, file=sys.stderr)
- # Grab the xml for the frame
- try:
- elt = XMLCorpusView(locpath, "frame")[0]
- except IOError:
- raise FramenetError("Unknown frame: {0}".format(fn_fname))
- fentry = self._handle_frame_elt(elt, ignorekeys)
- assert fentry
- fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml"
- # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
- for st in fentry.semTypes:
- if st.rootType.name == "Lexical_type":
- for lu in fentry.lexUnit.values():
- if not any(
- x is st for x in lu.semTypes
- ): # identity containment check
- lu.semTypes.append(st)
- self._frame_idx[fentry.ID] = fentry
- self._cached_frames[fentry.name] = fentry.ID
- """
- # now set up callables to resolve the LU pointers lazily.
- # (could also do this here--caching avoids infinite recursion.)
- for luName,luinfo in fentry.lexUnit.items():
- fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID)
- """
- return fentry
- def frame(self, fn_fid_or_fname, ignorekeys=[]):
- """
- Get the details for the specified Frame using the frame's name
- or id number.
- Usage examples:
- >>> from nltk.corpus import framenet as fn
- >>> f = fn.frame(256)
- >>> f.name
- 'Medical_specialties'
- >>> f = fn.frame('Medical_specialties')
- >>> f.ID
- 256
- >>> # ensure non-ASCII character in definition doesn't trigger an encoding error:
- >>> fn.frame('Imposing_obligation')
- frame (1494): Imposing_obligation...
- The dict that is returned from this function will contain the
- following information about the Frame:
- - 'name' : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.)
- - 'definition' : textual definition of the Frame
- - 'ID' : the internal ID number of the Frame
- - 'semTypes' : a list of semantic types for this frame
- - Each item in the list is a dict containing the following keys:
- - 'name' : can be used with the semtype() function
- - 'ID' : can be used with the semtype() function
- - 'lexUnit' : a dict containing all of the LUs for this frame.
- The keys in this dict are the names of the LUs and
- the value for each key is itself a dict containing
- info about the LU (see the lu() function for more info.)
- - 'FE' : a dict containing the Frame Elements that are part of this frame
- The keys in this dict are the names of the FEs (e.g. 'Body_system')
- and the values are dicts containing the following keys
- - 'definition' : The definition of the FE
- - 'name' : The name of the FE e.g. 'Body_system'
- - 'ID' : The id number
- - '_type' : 'fe'
- - 'abbrev' : Abbreviation e.g. 'bod'
- - 'coreType' : one of "Core", "Peripheral", or "Extra-Thematic"
- - 'semType' : if not None, a dict with the following two keys:
- - 'name' : name of the semantic type. can be used with
- the semtype() function
- - 'ID' : id number of the semantic type. can be used with
- the semtype() function
- - 'requiresFE' : if not None, a dict with the following two keys:
- - 'name' : the name of another FE in this frame
- - 'ID' : the id of the other FE in this frame
- - 'excludesFE' : if not None, a dict with the following two keys:
- - 'name' : the name of another FE in this frame
- - 'ID' : the id of the other FE in this frame
- - 'frameRelation' : a list of objects describing frame relations
- - 'FEcoreSets' : a list of Frame Element core sets for this frame
- - Each item in the list is a list of FE objects
- :param fn_fid_or_fname: The Framenet name or id number of the frame
- :type fn_fid_or_fname: int or str
- :param ignorekeys: The keys to ignore. These keys will not be
- included in the output. (optional)
- :type ignorekeys: list(str)
- :return: Information about a frame
- :rtype: dict
- """
- # get the frame info by name or id number
- if isinstance(fn_fid_or_fname, str):
- f = self.frame_by_name(fn_fid_or_fname, ignorekeys)
- else:
- f = self.frame_by_id(fn_fid_or_fname, ignorekeys)
- return f
- def frames_by_lemma(self, pat):
- """
- Returns a list of all frames that contain LUs in which the
- ``name`` attribute of the LU matchs the given regular expression
- ``pat``. Note that LU names are composed of "lemma.POS", where
- the "lemma" part can be made up of either a single lexeme
- (e.g. 'run') or multiple lexemes (e.g. 'a little').
- Note: if you are going to be doing a lot of this type of
- searching, you'd want to build an index that maps from lemmas to
- frames because each time frames_by_lemma() is called, it has to
- search through ALL of the frame XML files in the db.
- >>> from nltk.corpus import framenet as fn
- >>> from nltk.corpus.reader.framenet import PrettyList
- >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS
- [<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
- :return: A list of frame objects.
- :rtype: list(AttrDict)
- """
- return PrettyList(
- f
- for f in self.frames()
- if any(re.search(pat, luName) for luName in f.lexUnit)
- )
    def lu_basic(self, fn_luid):
        """
        Returns basic information about the LU whose id is
        ``fn_luid``. This is basically just a wrapper around the
        ``lu()`` function with "subCorpus" info excluded.

        >>> from nltk.corpus import framenet as fn
        >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True)
        >>> # ellipses account for differences between FN 1.5 and 1.7
        >>> lu # doctest: +ELLIPSIS
        {'ID': 256,
         'POS': 'V',
         'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
         '_type': 'lu',
         'cBy': ...,
         'cDate': '02/08/2001 01:27:50 PST Thu',
         'definition': 'COD: be aware of beforehand; predict.',
         'definitionMarkup': 'COD: be aware of beforehand; predict.',
         'frame': <frame ID=26 name=Expectation>,
         'lemmaID': 15082,
         'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}],
         'name': 'foresee.v',
         'semTypes': [],
         'sentenceCount': {'annotated': ..., 'total': ...},
         'status': 'FN1_Sent'}

        :param fn_luid: The id number of the desired LU
        :type fn_luid: int
        :return: Basic information about the lexical unit
        :rtype: dict
        """
        # Exclude the heavyweight annotation payloads; everything else is kept.
        return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"])
def lu(self, fn_luid, ignorekeys=None, luName=None, frameID=None, frameName=None):
    """
    Access a lexical unit by its ID. luName, frameID, and frameName are used
    only in the event that the LU does not have a file in the database
    (which is the case for LUs with "Problem" status); in this case,
    a placeholder LU is created which just contains its name, ID, and frame.

    Usage examples:

    >>> from nltk.corpus import framenet as fn
    >>> fn.lu(256).name
    'foresee.v'
    >>> fn.lu(256).definition
    'COD: be aware of beforehand; predict.'
    >>> fn.lu(256).frame.name
    'Expectation'
    >>> pprint(list(map(PrettyDict, fn.lu(256).lexemes)))
    [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}]

    >>> fn.lu(227).exemplars[23]
    exemplar sentence (352962):
    [sentNo] 0
    [aPos] 59699508
    <BLANKLINE>
    [LU] (227) guess.v in Coming_to_believe
    <BLANKLINE>
    [frame] (23) Coming_to_believe
    <BLANKLINE>
    [annotationSet] 2 annotation sets
    <BLANKLINE>
    [POS] 18 tags
    <BLANKLINE>
    [POS_tagset] BNC
    <BLANKLINE>
    [GF] 3 relations
    <BLANKLINE>
    [PT] 3 phrases
    <BLANKLINE>
    [Other] 1 entry
    <BLANKLINE>
    [text] + [Target] + [FE]
    <BLANKLINE>
    When he was inside the house , Culley noticed the characteristic
                                                      ------------------
                                                      Content
    <BLANKLINE>
    he would n't have guessed at .
    --                ******* --
    Co                         C1 [Evidence:INI]
     (Co=Cognizer, C1=Content)
    <BLANKLINE>
    <BLANKLINE>

    The dict that is returned from this function will contain most of the
    following information about the LU. Note that some LUs do not contain
    all of these pieces of information - particularly 'totalAnnotated' and
    'incorporatedFE' may be missing in some LUs:

    - 'name'       : the name of the LU (e.g. 'merger.n')
    - 'definition' : textual definition of the LU
    - 'ID'         : the internal ID number of the LU
    - '_type'      : 'lu'
    - 'status'     : e.g. 'Created'
    - 'frame'      : Frame that this LU belongs to
    - 'POS'        : the part of speech of this LU (e.g. 'N')
    - 'totalAnnotated' : total number of examples annotated with this LU
    - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment')
    - 'sentenceCount'  : a dict with the following two keys:
        - 'annotated': number of sentences annotated with this LU
        - 'total'    : total number of sentences with this LU
    - 'lexemes'  : a list of dicts describing the lemma of this LU.
       Each dict in the list contains these keys:

       - 'POS'     : part of speech e.g. 'N'
       - 'name'    : either single-lexeme e.g. 'merger' or
         multi-lexeme e.g. 'a little'
       - 'order': the order of the lexeme in the lemma (starting from 1)
       - 'headword': a boolean ('true' or 'false')
       - 'breakBefore': Can this lexeme be separated from the previous lexeme?
         Consider: "take over.v" as in:

             Germany took over the Netherlands in 2 days.
             Germany took the Netherlands over in 2 days.

         In this case, 'breakBefore' would be "true" for the lexeme
         "over". Contrast this with "take after.v" as in:

             Mary takes after her grandmother.
             *Mary takes her grandmother after.

         In this case, 'breakBefore' would be "false" for the lexeme "after"

    - 'lemmaID'    : Can be used to connect lemmas in different LUs
    - 'semTypes'   : a list of semantic type objects for this LU
    - 'subCorpus'  : a list of subcorpora
        - Each item in the list is a dict containing the following keys:
            - 'name' :
            - 'sentence' : a list of sentences in the subcorpus
                - each item in the list is a dict with the following keys:
                    - 'ID':
                    - 'sentNo':
                    - 'text': the text of the sentence
                    - 'aPos':
                    - 'annotationSet': a list of annotation sets
                        - each item in the list is a dict with the following keys:
                            - 'ID':
                            - 'status':
                            - 'layer': a list of layers
                                - each layer is a dict containing the following keys:
                                    - 'name': layer name (e.g. 'BNC')
                                    - 'rank':
                                    - 'label': a list of labels for the layer
                                        - each label is a dict containing the following keys:
                                            - 'start': start pos of label in sentence 'text' (0-based)
                                            - 'end': end pos of label in sentence 'text' (0-based)
                                            - 'name': name of label (e.g. 'NN1')

    Under the hood, this implementation looks up the lexical unit information
    in the *frame* definition file. That file does not contain
    corpus annotations, so the LU files will be accessed on demand if those are
    needed. In principle, valence patterns could be loaded here too,
    though these are not currently supported.

    :param fn_luid: The id number of the lexical unit
    :type fn_luid: int
    :param ignorekeys: The keys to ignore. These keys will not be
        included in the output. (optional)
    :type ignorekeys: list(str)
    :return: All information about the lexical unit
    :rtype: dict
    """
    # Avoid a mutable default argument; None means "ignore nothing".
    if ignorekeys is None:
        ignorekeys = []
    # look for this LU in cache
    if not self._lu_idx:
        self._buildluindex()
    OOV = object()  # sentinel: distinguishes "absent" from any real entry
    luinfo = self._lu_idx.get(fn_luid, OOV)
    if luinfo is OOV:
        # LU not in the index. We create a placeholder by falling back to
        # luName, frameID, and frameName. However, this will not be listed
        # among the LUs for its frame.
        self._warn(
            "LU ID not found: {0} ({1}) in {2} ({3})".format(
                luName, fn_luid, frameName, frameID
            )
        )
        luinfo = AttrDict(
            {
                "_type": "lu",
                "ID": fn_luid,
                "name": luName,
                "frameID": frameID,
                "status": "Problem",
            }
        )
        f = self.frame_by_id(luinfo.frameID)
        assert f.name == frameName, (f.name, frameName)
        luinfo["frame"] = f
        self._lu_idx[fn_luid] = luinfo
    elif "_type" not in luinfo:
        # we only have an index entry for the LU. loading the frame will replace this.
        f = self.frame_by_id(luinfo.frameID)
        luinfo = self._lu_idx[fn_luid]
    if ignorekeys:
        return AttrDict({k: v for k, v in luinfo.items() if k not in ignorekeys})
    return luinfo
def _lu_file(self, lu, ignorekeys=None):
    """
    Augment the LU information that was loaded from the frame file
    with additional information from the LU file.

    :param lu: a partially-populated LU record (as built from the frame file)
    :param ignorekeys: XML keys to skip when parsing the LU file (optional)
    :return: the same ``lu`` object, augmented in place with subCorpus,
        exemplars, and URL information
    :raises FramenetError: if no LU file exists for this LU's ID
    """
    # Avoid a mutable default argument; None means "ignore nothing".
    if ignorekeys is None:
        ignorekeys = []
    fn_luid = lu.ID
    fname = "lu{0}.xml".format(fn_luid)
    locpath = os.path.join("{0}".format(self._root), self._lu_dir, fname)
    # print(locpath, file=sys.stderr)
    if not self._lu_idx:
        self._buildluindex()
    try:
        elt = XMLCorpusView(locpath, "lexUnit")[0]
    except IOError as e:
        # chain the original error so the missing-file cause stays visible
        raise FramenetError("Unknown LU id: {0}".format(fn_luid)) from e
    lu2 = self._handle_lexunit_elt(elt, ignorekeys)
    lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + fname
    lu.subCorpus = lu2.subCorpus
    lu.exemplars = SpecialList(
        "luexemplars", [sent for subc in lu.subCorpus for sent in subc.sentence]
    )
    # give each exemplar sentence and each of its annotation sets
    # back-pointers to the LU and its frame, for convenient navigation
    for sent in lu.exemplars:
        sent["LU"] = lu
        sent["frame"] = lu.frame
        for aset in sent.annotationSet:
            aset["LU"] = lu
            aset["frame"] = lu.frame
    return lu
def _loadsemtypes(self):
    """Create the semantic types index.

    Populates ``self._semtypes`` so that a semtype's name and its
    abbreviation both map to its ID, and the ID maps to the semtype
    record itself.  Then links the records into a forest: each record's
    ``superType`` is replaced by the actual parent record, ``subTypes``
    lists are filled in, and every record gets a ``rootType`` pointer.
    """
    self._semtypes = AttrDict()
    # list(...) instead of a redundant [x for x in ...] comprehension
    semtypeXML = list(
        XMLCorpusView(
            self.abspath("semTypes.xml"),
            "semTypes/semType",
            self._handle_semtype_elt,
        )
    )
    for st in semtypeXML:
        n = st["name"]
        a = st["abbrev"]
        i = st["ID"]
        # Both name and abbrev should be able to retrieve the
        # ID. The ID will retrieve the semantic type dict itself.
        self._semtypes[n] = i
        self._semtypes[a] = i
        self._semtypes[i] = st
    # now that all individual semtype XML is loaded, we can link them together
    roots = []
    for st in self.semtypes():
        if st.superType:
            st.superType = self.semtype(st.superType.supID)
            st.superType.subTypes.append(st)
        else:
            if st not in roots:
                roots.append(st)
            st.rootType = st
    # breadth-first traversal from the roots propagates rootType downward
    queue = list(roots)
    assert queue
    while queue:
        st = queue.pop(0)
        for child in st.subTypes:
            child.rootType = st.rootType
            queue.append(child)
    # self.propagate_semtypes() # apply inferencing over FE relations
def propagate_semtypes(self):
    """
    Apply inference rules to distribute semtypes over relations between FEs.
    For FrameNet 1.5, this results in 1011 semtypes being propagated.
    (Not done by default because it requires loading all frame files,
    which takes several seconds. If this needed to be fast, it could be rewritten
    to traverse the neighboring relations on demand for each FE semtype.)

    >>> from nltk.corpus import framenet as fn
    >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
    >>> fn.propagate_semtypes()
    >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
    >>> y-x > 1000
    True
    """
    # Both the semtype index and the FE-relation index are built lazily.
    if not self._semtypes:
        self._loadsemtypes()
    if not self._ferel_idx:
        self._buildrelationindex()
    # Fixed-point iteration: keep making passes over all FE relations
    # until a full pass propagates nothing new.
    changed = True
    i = 0  # pass counter (only used by the commented-out debug print)
    nPropagations = 0  # total number of semtypes copied across relations
    while changed:
        # make a pass and see if anything needs to be propagated
        i += 1
        changed = False
        for ferel in self.fe_relations():
            superST = ferel.superFE.semType
            subST = ferel.subFE.semType
            try:
                if superST and superST is not subST:
                    # propagate downward
                    # (the assert checks the data is consistent: an existing
                    # sub-FE semtype must already inherit from the super's)
                    assert subST is None or self.semtype_inherits(subST, superST), (
                        superST.name,
                        ferel,
                        subST.name,
                    )
                    if subST is None:
                        ferel.subFE.semType = subST = superST
                        changed = True
                        nPropagations += 1
                if (
                    ferel.type.name in ["Perspective_on", "Subframe", "Precedes"]
                    and subST
                    and subST is not superST
                ):
                    # propagate upward
                    # (only for these relation types; the assert guards
                    # against overwriting an existing super-FE semtype)
                    assert superST is None, (superST.name, ferel, subST.name)
                    ferel.superFE.semType = superST = subST
                    changed = True
                    nPropagations += 1
            except AssertionError as ex:
                # bug in the data! ignore
                # print(ex, file=sys.stderr)
                continue
        # print(i, nPropagations, file=sys.stderr)
def semtype(self, key):
    """
    Look up a semantic type by name, abbreviation, or ID number.

    >>> from nltk.corpus import framenet as fn
    >>> fn.semtype(233).name
    'Temperature'
    >>> fn.semtype(233).abbrev
    'Temp'
    >>> fn.semtype('Temperature').ID
    233

    :param key: The name, abbreviation, or id number of the semantic type
    :type key: string or int
    :return: Information about a semantic type
    :rtype: dict
    """

    def _index_lookup(k):
        # Before _loadsemtypes() runs, self._semtypes is not subscriptable,
        # so indexing it raises TypeError; build the index and retry.
        try:
            return self._semtypes[k]
        except TypeError:
            self._loadsemtypes()
            return self._semtypes[k]

    # Names and abbreviations map to the numeric ID; the ID maps to the record.
    stid = key if isinstance(key, int) else _index_lookup(key)
    return _index_lookup(stid)
def semtype_inherits(self, st, superST):
    """
    Return True iff semantic type ``st`` is a (strict) descendant of
    ``superST`` in the semtype hierarchy.  Either argument may be given
    as a record (dict) or as a name/abbreviation/ID to be resolved.
    """
    # Resolve non-dict arguments (names, abbreviations, IDs) to records.
    if not isinstance(st, dict):
        st = self.semtype(st)
    if not isinstance(superST, dict):
        superST = self.semtype(superST)
    # Walk up the superType chain looking for the candidate ancestor.
    ancestor = st.superType
    while ancestor:
        if ancestor is superST:
            return True
        ancestor = ancestor.superType
    return False
def frames(self, name=None):
    """
    Obtain details for a specific frame.

    >>> from nltk.corpus import framenet as fn
    >>> len(fn.frames()) in (1019, 1221) # FN 1.5 and 1.7, resp.
    True
    >>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True)
    >>> x.sort(key=itemgetter('ID'))
    >>> x
    [<frame ID=200 name=Criminal_process>,
     <frame ID=500 name=Criminal_investigation>,
     <frame ID=692 name=Crime_scenario>,
     <frame ID=700 name=Committing_crime>]

    A brief intro to Frames (excerpted from "FrameNet II: Extended
    Theory and Practice" by Ruppenhofer et. al., 2010):

    A Frame is a script-like conceptual structure that describes a
    particular type of situation, object, or event along with the
    participants and props that are needed for that Frame. For
    example, the "Apply_heat" frame describes a common situation
    involving a Cook, some Food, and a Heating_Instrument, and is
    evoked by words such as bake, blanch, boil, broil, brown,
    simmer, steam, etc.

    We call the roles of a Frame "frame elements" (FEs) and the
    frame-evoking words are called "lexical units" (LUs).

    FrameNet includes relations between Frames. Several types of
    relations are defined, of which the most important are:

    - Inheritance: An IS-A relation. The child frame is a subtype
      of the parent frame, and each FE in the parent is bound to
      a corresponding FE in the child. An example is the
      "Revenge" frame which inherits from the
      "Rewards_and_punishments" frame.

    - Using: The child frame presupposes the parent frame as
      background, e.g the "Speed" frame "uses" (or presupposes)
      the "Motion" frame; however, not all parent FEs need to be
      bound to child FEs.

    - Subframe: The child frame is a subevent of a complex event
      represented by the parent, e.g. the "Criminal_process" frame
      has subframes of "Arrest", "Arraignment", "Trial", and
      "Sentencing".

    - Perspective_on: The child frame provides a particular
      perspective on an un-perspectivized parent frame. A pair of
      examples consists of the "Hiring" and "Get_a_job" frames,
      which perspectivize the "Employment_start" frame from the
      Employer's and the Employee's point of view, respectively.

    :param name: A regular expression pattern used to match against
        Frame names. If 'name' is None, then a list of all
        Framenet Frames will be returned.
    :type name: str
    :return: A list of matching Frames (or all Frames).
    :rtype: list(AttrDict)
    """
    # The frame index attribute may not exist yet; build it on demand.
    try:
        frame_ids = list(self._frame_idx.keys())
    except AttributeError:
        self._buildframeindex()
        frame_ids = list(self._frame_idx.keys())
    if name is None:
        # No filter: lazily map every known frame ID to its frame object.
        return PrettyLazyMap(self.frame, frame_ids)
    # Filter on the lightweight index first, then load each matching frame.
    return PrettyList(self.frame(fID) for fID in self.frame_ids_and_names(name))
def frame_ids_and_names(self, name=None):
    """
    Uses the frame index, which is much faster than looking up each frame definition
    if only the names and IDs are needed.

    :param name: optional regular expression to filter frame names
    :return: mapping from frame ID to frame name for matching frames
    :rtype: dict(int, str)
    """
    if not self._frame_idx:
        self._buildframeindex()
    return {
        fID: finfo.name
        for fID, finfo in self._frame_idx.items()
        if name is None or re.search(name, finfo.name) is not None
    }
def fes(self, name=None, frame=None):
    """
    Lists frame element objects. If 'name' is provided, this is treated as
    a case-insensitive regular expression to filter by frame name.
    (Case-insensitivity is because casing of frame element names is not always
    consistent across frames.) Specify 'frame' to filter by a frame name pattern,
    ID, or object.

    >>> from nltk.corpus import framenet as fn
    >>> fn.fes('Noise_maker')
    [<fe ID=6043 name=Noise_maker>]
    >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')])
    [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'),
     ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'),
     ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'),
     ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'),
     ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'),
     ('Vocalizations', 'Sound_source')]
    >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound',r'(?i)make_noise')])
    [('Cause_to_make_noise', 'Sound_maker'),
     ('Make_noise', 'Sound'),
     ('Make_noise', 'Sound_source')]
    >>> sorted(set(fe.name for fe in fn.fes('^sound')))
    ['Sound', 'Sound_maker', 'Sound_source']
    >>> len(fn.fes('^sound$'))
    2

    :param name: A regular expression pattern used to match against
        frame element names. If 'name' is None, then a list of all
        frame elements will be returned.
    :type name: str
    :return: A list of matching frame elements
    :rtype: list(AttrDict)
    """
    # Determine which frames to search in.
    if frame is None:
        candidate_frames = self.frames()
    elif isinstance(frame, int):
        candidate_frames = [self.frame(frame)]
    elif isinstance(frame, str):
        candidate_frames = self.frames(frame)
    else:  # assume it is already a frame object
        candidate_frames = [frame]
    # Collect FEs whose names match (case-insensitively) across those frames.
    matches = (
        fe
        for f in candidate_frames
        for fename, fe in f.FE.items()
        if name is None or re.search(name, fename, re.I)
    )
    return PrettyList(matches)
def lus(self, name=None, frame=None):
    """
    Obtain details for lexical units.
    Optionally restrict by lexical unit name pattern, and/or to a certain frame
    or frames whose name matches a pattern.

    >>> from nltk.corpus import framenet as fn
    >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp.
    True
    >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')), maxReprSize=0, breakLines=True)
    [<lu ID=14733 name=a little.n>,
     <lu ID=14743 name=a little.adv>,
     <lu ID=14744 name=a little bit.adv>]
    >>> PrettyList(sorted(fn.lus(r'interest', r'(?i)stimulus'), key=itemgetter('ID')))
    [<lu ID=14894 name=interested.a>, <lu ID=14920 name=interesting.a>]

    A brief intro to Lexical Units (excerpted from "FrameNet II:
    Extended Theory and Practice" by Ruppenhofer et. al., 2010):

    A lexical unit (LU) is a pairing of a word with a meaning. For
    example, the "Apply_heat" Frame describes a common situation
    involving a Cook, some Food, and a Heating Instrument, and is
    _evoked_ by words such as bake, blanch, boil, broil, brown,
    simmer, steam, etc. These frame-evoking words are the LUs in the
    Apply_heat frame. Each sense of a polysemous word is a different
    LU.

    We have used the word "word" in talking about LUs. The reality
    is actually rather complex. When we say that the word "bake" is
    polysemous, we mean that the lemma "bake.v" (which has the
    word-forms "bake", "bakes", "baked", and "baking") is linked to
    three different frames:

    - Apply_heat: "Michelle baked the potatoes for 45 minutes."

    - Cooking_creation: "Michelle baked her mother a cake for her birthday."

    - Absorb_heat: "The potatoes have to bake for more than 30 minutes."

    These constitute three different LUs, with different
    definitions.

    Multiword expressions such as "given name" and hyphenated words
    like "shut-eye" can also be LUs. Idiomatic phrases such as
    "middle of nowhere" and "give the slip (to)" are also defined as
    LUs in the appropriate frames ("Isolated_places" and "Evading",
    respectively), and their internal structure is not analyzed.

    Framenet provides multiple annotated examples of each sense of a
    word (i.e. each LU). Moreover, the set of examples
    (approximately 20 per LU) illustrates all of the combinatorial
    possibilities of the lexical unit.

    Each LU is linked to a Frame, and hence to the other words which
    evoke that Frame. This makes the FrameNet database similar to a
    thesaurus, grouping together semantically similar words.

    In the simplest case, frame-evoking words are verbs such as
    "fried" in:

       "Matilde fried the catfish in a heavy iron skillet."

    Sometimes event nouns may evoke a Frame. For example,
    "reduction" evokes "Cause_change_of_scalar_position" in:

       "...the reduction of debt levels to $665 million from $2.6 billion."

    Adjectives may also evoke a Frame. For example, "asleep" may
    evoke the "Sleep" frame as in:

       "They were asleep for hours."

    Many common nouns, such as artifacts like "hat" or "tower",
    typically serve as dependents rather than clearly evoking their
    own frames.

    :param name: A regular expression pattern used to search the LU
        names. Note that LU names take the form of a dotted
        string (e.g. "run.v" or "a little.adv") in which a
        lemma preceeds the "." and a POS follows the
        dot. The lemma may be composed of a single lexeme
        (e.g. "run") or of multiple lexemes (e.g. "a
        little"). If 'name' is not given, then all LUs will
        be returned.

        The valid POSes are:

        v    - verb
        n    - noun
        a    - adjective
        adv  - adverb
        prep - preposition
        num  - numbers
        intj - interjection
        art  - article
        c    - conjunction
        scon - subordinating conjunction

    :type name: str
    :type frame: str or int or frame
    :return: A list of selected (or all) lexical units
    :rtype: list of LU objects (dicts). See the lu() function for info
      about the specifics of LU objects.
    """
    if not self._lu_idx:
        self._buildluindex()

    if name is not None:
        # Match LU names against the index first, then (optionally)
        # restrict the matches to the requested frame(s).
        matched = PrettyList(self.lu(luID) for luID in self.lu_ids_and_names(name))
        if frame is None:
            return matched
        if isinstance(frame, int):
            frame_ids = {frame}
        elif isinstance(frame, str):
            frame_ids = {f.ID for f in self.frames(frame)}
        else:  # a frame object
            frame_ids = {frame.ID}
        return PrettyList(lu for lu in matched if lu.frame.ID in frame_ids)

    if frame is not None:
        # No name filter: take every LU of each matching frame.
        if isinstance(frame, int):
            target_frames = [self.frame(frame)]
        elif isinstance(frame, str):
            target_frames = self.frames(frame)
        else:  # a frame object
            target_frames = [frame]
        return PrettyLazyIteratorList(
            iter(LazyConcatenation(list(f.lexUnit.values()) for f in target_frames))
        )

    # Neither filter given: all LUs whose status is usable.
    good_ids = [
        luID
        for luID, luinfo in self._lu_idx.items()
        if luinfo.status not in self._bad_statuses
    ]
    return PrettyLazyMap(self.lu, good_ids)
def lu_ids_and_names(self, name=None):
    """
    Uses the LU index, which is much faster than looking up each LU definition
    if only the names and IDs are needed.

    :param name: optional regular expression to filter LU names
    :return: mapping from LU ID to LU name for usable, matching LUs
    :rtype: dict(int, str)
    """
    if not self._lu_idx:
        self._buildluindex()

    def _keep(info):
        # Exclude LUs with unusable statuses, then apply the name filter.
        if info.status in self._bad_statuses:
            return False
        return name is None or re.search(name, info.name) is not None

    return dict(
        (luID, luinfo.name)
        for luID, luinfo in self._lu_idx.items()
        if _keep(luinfo)
    )
def docs_metadata(self, name=None):
    """
    Return an index of the annotated documents in Framenet.

    Details for a specific annotated document can be obtained using this
    class's doc() function and pass it the value of the 'ID' field.

    >>> from nltk.corpus import framenet as fn
    >>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp.
    True
    >>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \
                'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank'])
    True

    :param name: A regular expression pattern used to search the
        file name of each annotated document. The document's
        file name contains the name of the corpus that the
        document is from, followed by two underscores "__"
        followed by the document name. So, for example, the
        file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is
        from the corpus named "LUCorpus-v0.3" and the
        document name is "20000410_nyt-NEW.xml".
    :type name: str
    :return: A list of selected (or all) annotated documents
    :rtype: list of dicts, where each dict object contains the following
            keys:

            - 'name'
            - 'ID'
            - 'corpid'
            - 'corpname'
            - 'description'
            - 'filename'
    """
    # The full-text index attribute may not exist yet; build it on demand.
    try:
        all_docs = PrettyList(self._fulltext_idx.values())
    except AttributeError:
        self._buildcorpusindex()
        all_docs = PrettyList(self._fulltext_idx.values())
    if name is None:
        return all_docs
    return PrettyList(
        d for d in all_docs if re.search(name, d["filename"]) is not None
    )
def docs(self, name=None):
    """
    Return a list of the annotated full-text documents in FrameNet,
    optionally filtered by a regex to be matched against the document name.
    """

    def _load_doc(metadata):
        # Load the fully-annotated document on demand via its ID.
        return self.doc(metadata.ID)

    return PrettyLazyMap(_load_doc, self.docs_metadata(name))
def sents(self, exemplars=True, full_text=True):
    """
    Annotated sentences matching the specified criteria.

    :param exemplars: include lexicographic exemplar sentences
    :param full_text: include full-text annotation sentences
    :return: the selected sentences (None if both flags are False)
    """
    # Guard-style dispatch; the concatenation order (exemplars first)
    # matches the flag order.
    if exemplars and full_text:
        return self.exemplars() + self.ft_sents()
    if exemplars:
        return self.exemplars()
    if full_text:
        return self.ft_sents()
def annotations(self, luNamePattern=None, exemplars=True, full_text=True):
    """
    Frame annotation sets matching the specified criteria.

    :param luNamePattern: regular expression to filter by LU name (optional)
    :param exemplars: include annotations from lexicographic exemplar sentences
    :param full_text: include annotations from full-text documents
    :return: a lazy list of annotation sets (None if both flags are False)
    """
    if exemplars:
        # One frame annotation per matching exemplar sentence.
        epart = PrettyLazyIteratorList(
            sent.frameAnnotation for sent in self.exemplars(luNamePattern)
        )
    else:
        epart = []
    if full_text:
        # matchedLUIDs is only bound when a pattern is given; the generator
        # below only references it in that case (luNamePattern is not None).
        if luNamePattern is not None:
            matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys())
        # annotationSet[0] of each sentence is skipped: only the annotation
        # sets from index 1 onward are returned here. Sets lacking a 'luID'
        # (the "CXN_ASET" default) never match the LU filter.
        ftpart = PrettyLazyIteratorList(
            aset
            for sent in self.ft_sents()
            for aset in sent.annotationSet[1:]
            if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs
        )
    else:
        ftpart = []
    if exemplars:
        if full_text:
            return epart + ftpart
        else:
            return epart
    elif full_text:
        return ftpart
def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None):
    """
    Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that
    are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance.
    'fe' may be a name pattern or FE instance; if specified, 'fe2' may also
    be specified to retrieve sentences with both overt FEs (in either order).
    """
    # Validate the fe/fe2 combination before doing any work.
    if fe is None and fe2 is not None:
        raise FramenetError("exemplars(..., fe=None, fe2=<value>) is not allowed")
    elif fe is not None and fe2 is not None:
        if not isinstance(fe2, str):
            if isinstance(fe, str):
                # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
                fe, fe2 = fe2, fe
            elif fe.frame is not fe2.frame:  # ensure frames match
                raise FramenetError(
                    "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)"
                )
    # An FE instance implies its own frame.
    if frame is None and fe is not None and not isinstance(fe, str):
        frame = fe.frame

    # narrow down to frames matching criteria
    lusByFrame = defaultdict(
        list
    )  # frame name -> matching LUs, if luNamePattern is specified
    if frame is not None or luNamePattern is not None:
        if frame is None or isinstance(frame, str):
            if luNamePattern is not None:
                # Collect the frames of all LUs matching the pattern,
                # remembering which LUs belong to each frame.
                frames = set()
                for lu in self.lus(luNamePattern, frame=frame):
                    frames.add(lu.frame.ID)
                    lusByFrame[lu.frame.name].append(lu)
                frames = LazyMap(self.frame, list(frames))
            else:
                frames = self.frames(frame)
        else:
            # 'frame' is an ID or a frame object.
            if isinstance(frame, int):
                frames = [self.frame(frame)]
            else:  # frame object
                frames = [frame]

            if luNamePattern is not None:
                lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}

        if fe is not None:  # narrow to frames that define this FE
            if isinstance(fe, str):
                frames = PrettyLazyIteratorList(
                    f
                    for f in frames
                    if fe in f.FE
                    or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys())
                )
            else:
                # An FE instance pins down a single frame.
                if fe.frame not in frames:
                    raise FramenetError(
                        "exemplars() call with inconsistent `frame` and `fe` specification"
                    )
                frames = [fe.frame]

            if fe2 is not None:  # narrow to frames that ALSO define this FE
                if isinstance(fe2, str):
                    frames = PrettyLazyIteratorList(
                        f
                        for f in frames
                        if fe2 in f.FE
                        or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys())
                    )
                # else we already narrowed it to a single frame
    else:  # frame, luNamePattern are None. fe, fe2 are None or strings
        if fe is not None:
            # Intersect the frames defining fe (and fe2, if given).
            frames = {ffe.frame.ID for ffe in self.fes(fe)}
            if fe2 is not None:
                frames2 = {ffe.frame.ID for ffe in self.fes(fe2)}
                frames = frames & frames2
            frames = LazyMap(self.frame, list(frames))
        else:
            frames = self.frames()

    # we've narrowed down 'frames'
    # now get exemplars for relevant LUs in those frames

    def _matching_exs():
        # Lazily yield each exemplar that overtly realizes the FE(s) of
        # interest (if any were requested).
        for f in frames:
            fes = fes2 = None  # FEs of interest
            if fe is not None:
                fes = (
                    {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)}
                    if isinstance(fe, str)
                    else {fe.name}
                )
            if fe2 is not None:
                fes2 = (
                    {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)}
                    if isinstance(fe2, str)
                    else {fe2.name}
                )
            for lu in (
                lusByFrame[f.name]
                if luNamePattern is not None
                else f.lexUnit.values()
            ):
                for ex in lu.exemplars:
                    if (fes is None or self._exemplar_of_fes(ex, fes)) and (
                        fes2 is None or self._exemplar_of_fes(ex, fes2)
                    ):
                        yield ex

    return PrettyLazyIteratorList(_matching_exs())
- def _exemplar_of_fes(self, ex, fes=None):
- """
- Given an exemplar sentence and a set of FE names, return the subset of FE names
- that are realized overtly in the sentence on the FE, FE2, or FE3 layer.
- If 'fes' is None, returns all overt FE names.
- """
- overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set()
- if "FE2" in ex:
- overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set()
- if "FE3" in ex:
- overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set()
- return overtNames & fes if fes is not None else overtNames
def ft_sents(self, docNamePattern=None):
    """
    Full-text annotation sentences, optionally filtered by document name.
    """
    # Flatten the sentences of every matching document, lazily.
    all_sents = (
        sentence
        for document in self.docs(docNamePattern)
        for sentence in document.sentence
    )
    return PrettyLazyIteratorList(all_sents)
def frame_relation_types(self):
    """
    Obtain a list of frame relation types.

    >>> from nltk.corpus import framenet as fn
    >>> frts = sorted(fn.frame_relation_types(), key=itemgetter('ID'))
    >>> isinstance(frts, list)
    True
    >>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp.
    True
    >>> PrettyDict(frts[0], breakLines=True)
    {'ID': 1,
     '_type': 'framerelationtype',
     'frameRelations': [<Parent=Event -- Inheritance -> Child=Change_of_consistency>, <Parent=Event -- Inheritance -> Child=Rotting>, ...],
     'name': 'Inheritance',
     'subFrameName': 'Child',
     'superFrameName': 'Parent'}

    :return: A list of all of the frame relation types in framenet
    :rtype: list(dict)
    """
    # The relation indexes are built lazily on first access.
    if not self._freltyp_idx:
        self._buildrelationindex()
    return self._freltyp_idx.values()
def frame_relations(self, frame=None, frame2=None, type=None):
    """
    :param frame: (optional) frame object, name, or ID; only relations involving
        this frame will be returned
    :param frame2: (optional; 'frame' must be a different frame) only show relations
        between the two specified frames, in either direction
    :param type: (optional) frame relation type (name or object); show only relations
        of this type
    :type frame: int or str or AttrDict
    :return: A list of all of the frame relations in framenet
    :rtype: list(dict)

    >>> from nltk.corpus import framenet as fn
    >>> frels = fn.frame_relations()
    >>> isinstance(frels, list)
    True
    >>> len(frels) in (1676, 2070) # FN 1.5 and 1.7, resp.
    True
    >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True)
    [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
     <Parent=Apply_heat -- Using -> Child=Cooking_creation>,
     <MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
    >>> PrettyList(fn.frame_relations(274), breakLines=True)
    [<Parent=Avoiding -- Inheritance -> Child=Dodging>,
     <Parent=Avoiding -- Inheritance -> Child=Evading>, ...]
    >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True)
    [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
     <Parent=Apply_heat -- Using -> Child=Cooking_creation>, ...]
    >>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance'))
    [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>]
    >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True)
    [<Parent=Apply_heat -- Using -> Child=Cooking_creation>,
     <MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
    """
    # NOTE: the parameter is named `type` (shadowing the builtin) for API
    # compatibility; `relation_type` preserves the caller's original value.
    relation_type = type
    if not self._frel_idx:
        self._buildrelationindex()
    rels = None

    if relation_type is not None:
        if not isinstance(relation_type, dict):
            # Resolve a relation-type *name* to its record; raises
            # IndexError if the name is unknown.
            type = [rt for rt in self.frame_relation_types() if rt.name == type][0]
        assert isinstance(type, dict)

    # lookup by 'frame'
    if frame is not None:
        if isinstance(frame, dict) and "frameRelations" in frame:
            # A loaded frame object already carries its relations.
            rels = PrettyList(frame.frameRelations)
        else:
            # Resolve a name / frame dict to its numeric ID, then use the
            # per-frame relation index.
            if not isinstance(frame, int):
                if isinstance(frame, dict):
                    frame = frame.ID
                else:
                    frame = self.frame_by_name(frame).ID
            rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]]

        # filter by 'type'
        if type is not None:
            # Relation-type records are shared, so identity comparison works.
            rels = [rel for rel in rels if rel.type is type]
    elif type is not None:
        # lookup by 'type'
        rels = type.frameRelations
    else:
        rels = self._frel_idx.values()

    # filter by 'frame2'
    if frame2 is not None:
        if frame is None:
            raise FramenetError(
                "frame_relations(frame=None, frame2=<value>) is not allowed"
            )
        if not isinstance(frame2, int):
            if isinstance(frame2, dict):
                frame2 = frame2.ID
            else:
                frame2 = self.frame_by_name(frame2).ID
        if frame == frame2:
            raise FramenetError(
                "The two frame arguments to frame_relations() must be different frames"
            )
        # Keep only relations whose other endpoint is frame2 (either direction).
        rels = [
            rel
            for rel in rels
            if rel.superFrame.ID == frame2 or rel.subFrame.ID == frame2
        ]

    return PrettyList(
        sorted(
            rels,
            key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName),
        )
    )
def fe_relations(self):
    """
    Obtain a list of frame element relations.

    >>> from nltk.corpus import framenet as fn
    >>> ferels = fn.fe_relations()
    >>> isinstance(ferels, list)
    True
    >>> len(ferels) in (10020, 12393) # FN 1.5 and 1.7, resp.
    True
    >>> PrettyDict(ferels[0], breakLines=True)
    {'ID': 14642,
     '_type': 'ferelation',
     'frameRelation': <Parent=Abounding_with -- Inheritance -> Child=Lively_place>,
     'subFE': <fe ID=11370 name=Degree>,
     'subFEName': 'Degree',
     'subFrame': <frame ID=1904 name=Lively_place>,
     'subID': 11370,
     'supID': 2271,
     'superFE': <fe ID=2271 name=Degree>,
     'superFEName': 'Degree',
     'superFrame': <frame ID=262 name=Abounding_with>,
     'type': <framerelationtype ID=1 name=Inheritance>}

    :return: A list of all of the frame element relations in framenet
    :rtype: list(dict)
    """
    if not self._ferel_idx:
        self._buildrelationindex()

    def _sort_key(ferel):
        # Order by relation type, then super-frame/FE, then sub-frame/FE.
        return (
            ferel.type.ID,
            ferel.frameRelation.superFrameName,
            ferel.superFEName,
            ferel.frameRelation.subFrameName,
            ferel.subFEName,
        )

    return PrettyList(sorted(self._ferel_idx.values(), key=_sort_key))
- def semtypes(self):
- """
- Obtain a list of semantic types.
- >>> from nltk.corpus import framenet as fn
- >>> stypes = fn.semtypes()
- >>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp.
- True
- >>> sorted(stypes[0].keys())
- ['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType']
- :return: A list of all of the semantic types in framenet
- :rtype: list(dict)
- """
- if not self._semtypes:
- self._loadsemtypes()
- return PrettyList(
- self._semtypes[i] for i in self._semtypes if isinstance(i, int)
- )
- def _load_xml_attributes(self, d, elt):
- """
- Extracts a subset of the attributes from the given element and
- returns them in a dictionary.
- :param d: A dictionary in which to store the attributes.
- :type d: dict
- :param elt: An ElementTree Element
- :type elt: Element
- :return: Returns the input dict ``d`` possibly including attributes from ``elt``
- :rtype: dict
- """
- d = type(d)(d)
- try:
- attr_dict = elt.attrib
- except AttributeError:
- return d
- if attr_dict is None:
- return d
- # Ignore these attributes when loading attributes from an xml node
- ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
- "xsi",
- "schemaLocation",
- "xmlns",
- "bgColor",
- "fgColor",
- ]
- for attr in attr_dict:
- if any(attr.endswith(x) for x in ignore_attrs):
- continue
- val = attr_dict[attr]
- if val.isdigit():
- d[attr] = int(val)
- else:
- d[attr] = val
- return d
- def _strip_tags(self, data):
- """
- Gets rid of all tags and newline characters from the given input
- :return: A cleaned-up version of the input string
- :rtype: str
- """
- try:
- """
- # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
- m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
- if m:
- print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr)
- """
- data = data.replace("<t>", "")
- data = data.replace("</t>", "")
- data = re.sub('<fex name="[^"]+">', "", data)
- data = data.replace("</fex>", "")
- data = data.replace("<fen>", "")
- data = data.replace("</fen>", "")
- data = data.replace("<m>", "")
- data = data.replace("</m>", "")
- data = data.replace("<ment>", "")
- data = data.replace("</ment>", "")
- data = data.replace("<ex>", "'")
- data = data.replace("</ex>", "'")
- data = data.replace("<gov>", "")
- data = data.replace("</gov>", "")
- data = data.replace("<x>", "")
- data = data.replace("</x>", "")
- # Get rid of <def-root> and </def-root> tags
- data = data.replace("<def-root>", "")
- data = data.replace("</def-root>", "")
- data = data.replace("\n", " ")
- except AttributeError:
- pass
- return data
    def _handle_elt(self, elt, tagspec=None):
        """Extracts and returns the attributes of the given element.

        Generic handler: lifts the element's XML attributes into a fresh
        AttrDict.  ``tagspec`` is accepted for interface compatibility with
        the other ``_handle_*`` callbacks but is not used here.
        """
        return self._load_xml_attributes(AttrDict(), elt)
    def _handle_fulltextindex_elt(self, elt, tagspec=None):
        """
        Extracts corpus/document info from the fulltextIndex.xml file.

        Note that this function "flattens" the information contained
        in each of the "corpus" elements, so that each "document"
        element will contain attributes for the corpus and
        corpusid. Also, each of the "document" items will contain a
        new attribute called "filename" that is the base file name of
        the xml file for the document in the "fulltext" subdir of the
        Framenet corpus.
        """
        ftinfo = self._load_xml_attributes(AttrDict(), elt)
        corpname = ftinfo.name
        corpid = ftinfo.ID
        retlist = []
        for sub in elt:
            if sub.tag.endswith("document"):
                doc = self._load_xml_attributes(AttrDict(), sub)
                # Some documents lack a 'name' attribute; fall back to 'description'.
                if "name" in doc:
                    docname = doc.name
                else:
                    docname = doc.description
                doc.filename = "{0}__{1}.xml".format(corpname, docname)
                doc.URL = (
                    self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename
                )
                # Flatten: copy the enclosing corpus's name/ID onto each document.
                doc.corpname = corpname
                doc.corpid = corpid
                retlist.append(doc)
        return retlist
    def _handle_frame_elt(self, elt, ignorekeys=[]):
        """Load the info for a Frame from a frame xml file.

        :param elt: the root element of the frame's XML file
        :param ignorekeys: attribute/subelement names to omit from the result
        :return: an AttrDict describing the frame, its FEs, FE core sets,
            lexical units, semantic types, and frame relations
        :rtype: AttrDict
        """
        frinfo = self._load_xml_attributes(AttrDict(), elt)
        frinfo["_type"] = "frame"
        frinfo["definition"] = ""
        frinfo["definitionMarkup"] = ""
        frinfo["FE"] = PrettyDict()
        frinfo["FEcoreSets"] = []
        frinfo["lexUnit"] = PrettyDict()
        frinfo["semTypes"] = []
        for k in ignorekeys:
            if k in frinfo:
                del frinfo[k]

        for sub in elt:
            if sub.tag.endswith("definition") and "definition" not in ignorekeys:
                frinfo["definitionMarkup"] = sub.text
                frinfo["definition"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("FE") and "FE" not in ignorekeys:
                feinfo = self._handle_fe_elt(sub)
                frinfo["FE"][feinfo.name] = feinfo
                feinfo["frame"] = frinfo  # backpointer
            elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys:
                coreset = self._handle_fecoreset_elt(sub)
                # assumes all FEs have been loaded before coresets
                frinfo["FEcoreSets"].append(
                    PrettyList(frinfo["FE"][fe.name] for fe in coreset)
                )
            elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys:
                luentry = self._handle_framelexunit_elt(sub)
                if luentry["status"] in self._bad_statuses:
                    # problematic LU entry; ignore it
                    continue
                luentry["frame"] = frinfo
                luentry["URL"] = (
                    self._fnweb_url
                    + "/"
                    + self._lu_dir
                    + "/"
                    + "lu{0}.xml".format(luentry["ID"])
                )
                # subCorpus/exemplars require reading the LU's own XML file,
                # so they are wrapped in Future for lazy loading.  The
                # lambda-of-lambda binds the *current* luentry immediately,
                # avoiding the late-binding-closure pitfall.
                luentry["subCorpus"] = Future(
                    (lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry)
                )
                luentry["exemplars"] = Future(
                    (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry)
                )
                frinfo["lexUnit"][luentry.name] = luentry
                # Register the LU in the corpus-wide LU index as a side effect.
                if not self._lu_idx:
                    self._buildluindex()
                self._lu_idx[luentry.ID] = luentry
            elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys:
                semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
                frinfo["semTypes"].append(self.semtype(semtypeinfo.ID))

        frinfo["frameRelations"] = self.frame_relations(frame=frinfo)

        # resolve 'requires' and 'excludes' links between FEs of this frame
        for fe in frinfo.FE.values():
            if fe.requiresFE:
                name, ID = fe.requiresFE.name, fe.requiresFE.ID
                fe.requiresFE = frinfo.FE[name]
                assert fe.requiresFE.ID == ID
            if fe.excludesFE:
                name, ID = fe.excludesFE.name, fe.excludesFE.ID
                fe.excludesFE = frinfo.FE[name]
                assert fe.excludesFE.ID == ID

        return frinfo
- def _handle_fecoreset_elt(self, elt):
- """Load fe coreset info from xml."""
- info = self._load_xml_attributes(AttrDict(), elt)
- tmp = []
- for sub in elt:
- tmp.append(self._load_xml_attributes(AttrDict(), sub))
- return tmp
    def _handle_framerelationtype_elt(self, elt, *args):
        """Load frame-relation element and its child fe-relation elements from frRelation.xml."""
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "framerelationtype"
        info["frameRelations"] = PrettyList()

        for sub in elt:
            if sub.tag.endswith("frameRelation"):
                frel = self._handle_framerelation_elt(sub)
                frel["type"] = info  # backpointer
                # Propagate the relation type down to each FE-level relation too.
                for ferel in frel.feRelations:
                    ferel["type"] = info
                info["frameRelations"].append(frel)

        return info
    def _handle_framerelation_elt(self, elt):
        """Load frame-relation element and its child fe-relation elements from frRelation.xml."""
        info = self._load_xml_attributes(AttrDict(), elt)
        # A frame relation must connect two distinct frames.
        assert info["superFrameName"] != info["subFrameName"], (elt, info)
        info["_type"] = "framerelation"
        info["feRelations"] = PrettyList()

        for sub in elt:
            if sub.tag.endswith("FERelation"):
                ferel = self._handle_elt(sub)
                ferel["_type"] = "ferelation"
                ferel["frameRelation"] = info  # backpointer
                info["feRelations"].append(ferel)

        return info
- def _handle_fulltextannotation_elt(self, elt):
- """Load full annotation info for a document from its xml
- file. The main element (fullTextAnnotation) contains a 'header'
- element (which we ignore here) and a bunch of 'sentence'
- elements."""
- info = AttrDict()
- info["_type"] = "fulltext_annotation"
- info["sentence"] = []
- for sub in elt:
- if sub.tag.endswith("header"):
- continue # not used
- elif sub.tag.endswith("sentence"):
- s = self._handle_fulltext_sentence_elt(sub)
- s.doc = info
- info["sentence"].append(s)
- return info
    def _handle_fulltext_sentence_elt(self, elt):
        """Load information from the given 'sentence' element. Each
        'sentence' element contains a "text" and "annotationSet" sub
        elements."""
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "fulltext_sentence"
        info["annotationSet"] = []
        info["targets"] = []
        target_spans = set()  # for detecting duplicate target spans
        info["_ascii"] = types.MethodType(
            _annotation_ascii, info
        )  # attach a method for this instance
        info["text"] = ""

        for sub in elt:
            if sub.tag.endswith("text"):
                info["text"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("annotationSet"):
                # The first annotationSet of a fulltext sentence is treated
                # as the POS annotation layer (is_pos=True).
                a = self._handle_fulltextannotationset_elt(
                    sub, is_pos=(len(info["annotationSet"]) == 0)
                )
                if "cxnID" in a:  # ignoring construction annotations for now
                    continue
                a.sent = info  # backpointer
                a.text = info.text
                info["annotationSet"].append(a)
                if "Target" in a:
                    for tspan in a.Target:
                        if tspan in target_spans:
                            self._warn(
                                'Duplicate target span "{0}"'.format(
                                    info.text[slice(*tspan)]
                                ),
                                tspan,
                                "in sentence",
                                info["ID"],
                                info.text,
                            )
                            # this can happen in cases like "chemical and biological weapons"
                            # being annotated as "chemical weapons" and "biological weapons"
                        else:
                            target_spans.add(tspan)
                    info["targets"].append((a.Target, a.luName, a.frameName))

        # The first annotation set must be the unannotated POS layer.
        assert info["annotationSet"][0].status == "UNANN"
        info["POS"] = info["annotationSet"][0].POS
        info["POS_tagset"] = info["annotationSet"][0].POS_tagset
        return info
    def _handle_fulltextannotationset_elt(self, elt, is_pos=False):
        """Load information from the given 'annotationSet' element. Each
        'annotationSet' contains several "layer" elements.

        :param is_pos: whether this is the sentence-initial POS annotation set
        """
        # Delegate the layer parsing to the LU annotation-set handler.
        info = self._handle_luannotationset_elt(elt, is_pos=is_pos)
        if not is_pos:
            info["_type"] = "fulltext_annotationset"
            if "cxnID" not in info:  # ignoring construction annotations for now
                # Resolve the LU and frame objects this annotation set refers to.
                info["LU"] = self.lu(
                    info.luID,
                    luName=info.luName,
                    frameID=info.frameID,
                    frameName=info.frameName,
                )
                info["frame"] = info.LU.frame
        return info
- def _handle_fulltextlayer_elt(self, elt):
- """Load information from the given 'layer' element. Each
- 'layer' contains several "label" elements."""
- info = self._load_xml_attributes(AttrDict(), elt)
- info["_type"] = "layer"
- info["label"] = []
- for sub in elt:
- if sub.tag.endswith("label"):
- l = self._load_xml_attributes(AttrDict(), sub)
- info["label"].append(l)
- return info
    def _handle_framelexunit_elt(self, elt):
        """Load the lexical unit info from an xml element in a frame's xml file.

        :return: an AttrDict with the LU's attributes plus its definition,
            sentence counts, lexemes, and semantic types
        :rtype: AttrDict
        """
        luinfo = AttrDict()
        luinfo["_type"] = "lu"
        luinfo = self._load_xml_attributes(luinfo, elt)
        luinfo["definition"] = ""
        luinfo["definitionMarkup"] = ""
        luinfo["sentenceCount"] = PrettyDict()
        luinfo["lexemes"] = PrettyList()  # multiword LUs have multiple lexemes
        luinfo["semTypes"] = PrettyList()  # an LU can have multiple semtypes

        for sub in elt:
            if sub.tag.endswith("definition"):
                luinfo["definitionMarkup"] = sub.text
                luinfo["definition"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("sentenceCount"):
                luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub)
            elif sub.tag.endswith("lexeme"):
                lexemeinfo = self._load_xml_attributes(PrettyDict(), sub)
                if not isinstance(lexemeinfo.name, str):
                    # some lexeme names are ints by default: e.g.,
                    # thousand.num has lexeme with name="1000"
                    lexemeinfo.name = str(lexemeinfo.name)
                luinfo["lexemes"].append(lexemeinfo)
            elif sub.tag.endswith("semType"):
                semtypeinfo = self._load_xml_attributes(PrettyDict(), sub)
                luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))

        # sort lexemes by 'order' attribute
        # otherwise, e.g., 'write down.v' may have lexemes in wrong order
        luinfo["lexemes"].sort(key=lambda x: x.order)

        return luinfo
    def _handle_lexunit_elt(self, elt, ignorekeys):
        """
        Load full info for a lexical unit from its xml file.
        This should only be called when accessing corpus annotations
        (which are not included in frame files).

        :param elt: the root element of the LU's XML file
        :param ignorekeys: attribute/subelement names to omit from the result
        :rtype: AttrDict
        """
        luinfo = self._load_xml_attributes(AttrDict(), elt)
        luinfo["_type"] = "lu"
        luinfo["definition"] = ""
        luinfo["definitionMarkup"] = ""
        luinfo["subCorpus"] = PrettyList()
        luinfo["lexemes"] = PrettyList()  # multiword LUs have multiple lexemes
        luinfo["semTypes"] = PrettyList()  # an LU can have multiple semtypes
        for k in ignorekeys:
            if k in luinfo:
                del luinfo[k]

        for sub in elt:
            if sub.tag.endswith("header"):
                continue  # not used
            elif sub.tag.endswith("valences"):
                continue  # not used
            elif sub.tag.endswith("definition") and "definition" not in ignorekeys:
                luinfo["definitionMarkup"] = sub.text
                luinfo["definition"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys:
                sc = self._handle_lusubcorpus_elt(sub)
                if sc is not None:
                    luinfo["subCorpus"].append(sc)
            elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys:
                luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub))
            elif sub.tag.endswith("semType") and "semType" not in ignorekeys:
                semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
                luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))

        return luinfo
- def _handle_lusubcorpus_elt(self, elt):
- """Load a subcorpus of a lexical unit from the given xml."""
- sc = AttrDict()
- try:
- sc["name"] = elt.get("name")
- except AttributeError:
- return None
- sc["_type"] = "lusubcorpus"
- sc["sentence"] = []
- for sub in elt:
- if sub.tag.endswith("sentence"):
- s = self._handle_lusentence_elt(sub)
- if s is not None:
- sc["sentence"].append(s)
- return sc
    def _handle_lusentence_elt(self, elt):
        """Load a sentence from a subcorpus of an LU from xml."""
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "lusentence"
        info["annotationSet"] = []
        info["_ascii"] = types.MethodType(
            _annotation_ascii, info
        )  # attach a method for this instance

        for sub in elt:
            if sub.tag.endswith("text"):
                info["text"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("annotationSet"):
                # The first annotationSet of the sentence is the POS layer.
                annset = self._handle_luannotationset_elt(
                    sub, is_pos=(len(info["annotationSet"]) == 0)
                )
                if annset is not None:
                    # Non-POS annotation sets must carry FE info.
                    assert annset.status == "UNANN" or "FE" in annset, annset
                    if annset.status != "UNANN":
                        info["frameAnnotation"] = annset
                        # copy layer info up to current level
                        for k in (
                            "Target",
                            "FE",
                            "FE2",
                            "FE3",
                            "GF",
                            "PT",
                            "POS",
                            "POS_tagset",
                            "Other",
                            "Sent",
                            "Verb",
                            "Noun",
                            "Adj",
                            "Adv",
                            "Prep",
                            "Scon",
                            "Art",
                        ):
                            if k in annset:
                                info[k] = annset[k]
                    info["annotationSet"].append(annset)
                    annset["sent"] = info  # backpointer
                    annset["text"] = info.text

        return info
    def _handle_luannotationset_elt(self, elt, is_pos=False):
        """Load an annotation set from a sentence in an subcorpus of an LU.

        :param is_pos: whether this is the sentence-initial POS annotation set
        """
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "posannotationset" if is_pos else "luannotationset"
        info["layer"] = []
        info["_ascii"] = types.MethodType(
            _annotation_ascii, info
        )  # attach a method for this instance

        if "cxnID" in info:  # ignoring construction annotations for now.
            return info

        for sub in elt:
            if sub.tag.endswith("layer"):
                l = self._handle_lulayer_elt(sub)
                if l is not None:
                    overt = []  # labels with explicit (start, end) spans
                    ni = {}  # null instantiations

                    info["layer"].append(l)
                    for lbl in l.label:
                        if "start" in lbl:
                            # end is inclusive in the XML; store half-open span
                            thespan = (lbl.start, lbl.end + 1, lbl.name)
                            if l.name not in (
                                "Sent",
                                "Other",
                            ):  # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
                                assert thespan not in overt, (info.ID, l.name, thespan)
                            overt.append(thespan)
                        else:  # null instantiation
                            if lbl.name in ni:
                                self._warn(
                                    "FE with multiple NI entries:",
                                    lbl.name,
                                    ni[lbl.name],
                                    lbl.itype,
                                )
                            else:
                                ni[lbl.name] = lbl.itype
                    overt = sorted(overt)

                    if l.name == "Target":
                        if not overt:
                            self._warn(
                                "Skipping empty Target layer in annotation set ID={0}".format(
                                    info.ID
                                )
                            )
                            continue
                        assert all(lblname == "Target" for i, j, lblname in overt)
                        if "Target" in info:
                            self._warn(
                                "Annotation set {0} has multiple Target layers".format(
                                    info.ID
                                )
                            )
                        else:
                            info["Target"] = [(i, j) for (i, j, _) in overt]
                    elif l.name == "FE":
                        if l.rank == 1:
                            assert "FE" not in info
                            info["FE"] = (overt, ni)
                            # assert False,info
                        else:
                            # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
                            assert 2 <= l.rank <= 3, l.rank
                            k = "FE" + str(l.rank)
                            assert k not in info
                            info[k] = (overt, ni)
                    elif l.name in ("GF", "PT"):
                        assert l.rank == 1
                        info[l.name] = overt
                    elif l.name in ("BNC", "PENN"):
                        assert l.rank == 1
                        # overt labels are POS tags from the named tagset
                        info["POS"] = overt
                        info["POS_tagset"] = l.name
                    else:
                        # Remaining layers are stored under their own name,
                        # after warning about any unexpected layer names.
                        if is_pos:
                            if l.name not in ("NER", "WSL"):
                                self._warn(
                                    "Unexpected layer in sentence annotationset:",
                                    l.name,
                                )
                        else:
                            if l.name not in (
                                "Sent",
                                "Verb",
                                "Noun",
                                "Adj",
                                "Adv",
                                "Prep",
                                "Scon",
                                "Art",
                                "Other",
                            ):
                                self._warn(
                                    "Unexpected layer in frame annotationset:", l.name
                                )
                        info[l.name] = overt

        # Sanity checks for frame (non-POS, non-construction) annotation sets.
        if not is_pos and "cxnID" not in info:
            if "Target" not in info:
                self._warn("Missing target in annotation set ID={0}".format(info.ID))
            assert "FE" in info
            if "FE3" in info:
                assert "FE2" in info

        return info
- def _handle_lulayer_elt(self, elt):
- """Load a layer from an annotation set"""
- layer = self._load_xml_attributes(AttrDict(), elt)
- layer["_type"] = "lulayer"
- layer["label"] = []
- for sub in elt:
- if sub.tag.endswith("label"):
- l = self._load_xml_attributes(AttrDict(), sub)
- if l is not None:
- layer["label"].append(l)
- return layer
    def _handle_fe_elt(self, elt):
        """Load a frame element (<FE>) from a frame's XML file.

        ``requiresFE``/``excludesFE`` are stored here as raw attribute dicts;
        they are resolved to the actual FE objects by ``_handle_frame_elt``.
        """
        feinfo = self._load_xml_attributes(AttrDict(), elt)
        feinfo["_type"] = "fe"
        feinfo["definition"] = ""
        feinfo["definitionMarkup"] = ""
        feinfo["semType"] = None
        feinfo["requiresFE"] = None
        feinfo["excludesFE"] = None
        for sub in elt:
            if sub.tag.endswith("definition"):
                feinfo["definitionMarkup"] = sub.text
                feinfo["definition"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("semType"):
                stinfo = self._load_xml_attributes(AttrDict(), sub)
                feinfo["semType"] = self.semtype(stinfo.ID)
            elif sub.tag.endswith("requiresFE"):
                feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub)
            elif sub.tag.endswith("excludesFE"):
                feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub)
        return feinfo
    def _handle_semtype_elt(self, elt, tagspec=None):
        """Load a semantic type element (its definition and supertype link)."""
        semt = self._load_xml_attributes(AttrDict(), elt)
        semt["_type"] = "semtype"
        semt["superType"] = None
        semt["subTypes"] = PrettyList()
        for sub in elt:
            if sub.text is not None:
                # A child with text content holds the definition markup.
                semt["definitionMarkup"] = sub.text
                semt["definition"] = self._strip_tags(sub.text)
            else:
                # Otherwise the child identifies the supertype.
                supertypeinfo = self._load_xml_attributes(AttrDict(), sub)
                semt["superType"] = supertypeinfo
                # the supertype may not have been loaded yet
        return semt
- #
- # Demo
- #
def demo():
    """Demonstrate the FrameNet API: index building, corpus statistics,
    frame/FE/LU lookup, fulltext corpora, and lemma-based frame search."""
    from nltk.corpus import framenet as fn

    #
    # It is not necessary to explicitly build the indexes by calling
    # buildindexes(). We do this here just for demo purposes. If the
    # indexes are not built explicitely, they will be built as needed.
    #
    print("Building the indexes...")
    fn.buildindexes()

    #
    # Get some statistics about the corpus
    #
    print("Number of Frames:", len(fn.frames()))
    print("Number of Lexical Units:", len(fn.lus()))
    print("Number of annotated documents:", len(fn.docs()))
    print()

    #
    # Frames
    #
    print(
        'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
    )
    medframes = fn.frames(r"(?i)medical")
    print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
    print([(f.name, f.ID) for f in medframes])

    #
    # store the first frame in the list of frames
    #
    tmp_id = medframes[0].ID
    m_frame = fn.frame(tmp_id)  # reads all info for the frame

    #
    # get the frame relations
    #
    print(
        '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
            m_frame.name, m_frame.ID
        ),
        len(m_frame.frameRelations),
    )
    for fr in m_frame.frameRelations:
        print("   ", fr)

    #
    # get the names of the Frame Elements
    #
    print(
        '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
        len(m_frame.FE),
    )
    print("   ", [x for x in m_frame.FE])

    #
    # get the names of the "Core" Frame Elements
    #
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
    print("   ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"])

    #
    # get all of the Lexical Units that are incorporated in the
    # 'Ailment' FE of the 'Medical_conditions' frame (id=239)
    #
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    m_frame = fn.frame(239)
    ailment_lus = [
        x
        for x in m_frame.lexUnit.values()
        if "incorporatedFE" in x and x.incorporatedFE == "Ailment"
    ]
    print("   ", [x.name for x in ailment_lus])

    #
    # get all of the Lexical Units for the frame
    #
    print(
        '\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
        len(m_frame.lexUnit),
    )
    print("  ", [x.name for x in m_frame.lexUnit.values()][:5], "...")

    #
    # get basic info on the second LU in the frame
    #
    tmp_id = m_frame.lexUnit["ailment.n"].ID  # grab the id of the specified LU
    luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
    print("\nInformation on the LU: {0}".format(luinfo.name))
    pprint(luinfo)

    #
    # Get a list of all of the corpora used for fulltext annotation
    #
    print("\nNames of all of the corpora used for fulltext annotation:")
    allcorpora = set(x.corpname for x in fn.docs_metadata())
    pprint(list(allcorpora))

    #
    # Get the names of the annotated documents in the first corpus
    #
    firstcorp = list(allcorpora)[0]
    firstcorp_docs = fn.docs(firstcorp)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
    pprint([x.filename for x in firstcorp_docs])

    #
    # Search for frames containing LUs whose name attribute matches a
    # regexp pattern.
    #
    # Note: if you were going to be doing a lot of this type of
    #       searching, you'd want to build an index that maps from
    #       lemmas to frames because each time frames_by_lemma() is
    #       called, it has to search through ALL of the frame XML files
    #       in the db.
    print(
        '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
    )
    pprint(fn.frames_by_lemma(r"^run.v$"))
# Standard script entry point: run the demo when executed directly.
if __name__ == "__main__":
    demo()
|