# -*- coding: utf-8 -*-
# Natural Language Toolkit: WordNet
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bethard <Steven.Bethard@colorado.edu>
#         Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
#         Nitin Madnani <nmadnani@ets.org>
#         Nasruddin A’aidil Shari
#         Sim Wei Ying Geraldine
#         Soe Lynn
#         Francis Bond <bond@ieee.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  15. """
  16. An NLTK interface for WordNet
  17. WordNet is a lexical database of English.
  18. Using synsets, helps find conceptual relationships between words
  19. such as hypernyms, hyponyms, synonyms, antonyms etc.
  20. For details about WordNet see:
  21. http://wordnet.princeton.edu/
  22. This module also allows you to find lemmas in languages
  23. other than English from the Open Multilingual Wordnet
  24. http://compling.hss.ntu.edu.sg/omw/
  25. """
import math
import re
from itertools import islice, chain
from functools import total_ordering
from operator import itemgetter
from collections import defaultdict, deque

from nltk.corpus.reader import CorpusReader
from nltk.util import binary_search_file as _binary_search_file
from nltk.probability import FreqDist
from nltk.internals import deprecated
######################################################################
# Table of Contents
######################################################################
# - Constants
# - Data Classes
#   - WordNetError
#   - Lemma
#   - Synset
# - WordNet Corpus Reader
# - WordNet Information Content Corpus Reader
# - Similarity Metrics
# - Demo

######################################################################
# Constants
######################################################################

#: Positive infinity (for similarity functions)
_INF = 1e300

# { Part-of-speech constants
ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
# }

POS_LIST = [NOUN, VERB, ADJ, ADV]

# A table of strings that are used to express verb frames.
VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE",
)
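# Illustration (not part of the original module): each verb frame template is
# later filled in with a lemma name via the "%" operator, as done in
# _synset_from_pos_and_line() below.  A minimal sketch:
#
#     >>> VERB_FRAME_STRINGS[9] % "gives"
#     'Somebody gives somebody'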
SENSENUM_RE = re.compile(r"\.[\d]+\.")


######################################################################
# Data Classes
######################################################################


class WordNetError(Exception):
    """An exception class for wordnet-related errors."""


@total_ordering
class _WordNetObject(object):
    """A common base class for lemmas and synsets."""

    def hypernyms(self):
        return self._related("@")

    def _hypernyms(self):
        return self._related("@")

    def instance_hypernyms(self):
        return self._related("@i")

    def _instance_hypernyms(self):
        return self._related("@i")

    def hyponyms(self):
        return self._related("~")

    def instance_hyponyms(self):
        return self._related("~i")

    def member_holonyms(self):
        return self._related("#m")

    def substance_holonyms(self):
        return self._related("#s")

    def part_holonyms(self):
        return self._related("#p")

    def member_meronyms(self):
        return self._related("%m")

    def substance_meronyms(self):
        return self._related("%s")

    def part_meronyms(self):
        return self._related("%p")

    def topic_domains(self):
        return self._related(";c")

    def in_topic_domains(self):
        return self._related("-c")

    def region_domains(self):
        return self._related(";r")

    def in_region_domains(self):
        return self._related("-r")

    def usage_domains(self):
        return self._related(";u")

    def in_usage_domains(self):
        return self._related("-u")

    def attributes(self):
        return self._related("=")

    def entailments(self):
        return self._related("*")

    def causes(self):
        return self._related(">")

    def also_sees(self):
        return self._related("^")

    def verb_groups(self):
        return self._related("$")

    def similar_tos(self):
        return self._related("&")

    def __hash__(self):
        return hash(self._name)

    def __eq__(self, other):
        return self._name == other._name

    def __ne__(self, other):
        return self._name != other._name

    def __lt__(self, other):
        return self._name < other._name
class Lemma(_WordNetObject):
    """
    The lexical entry for a single morphological form of a
    sense-disambiguated word.

    Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
    <word> is the morphological stem identifying the synset
    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
    <number> is the sense number, counting from 0.
    <lemma> is the morphological form of interest

    Note that <word> and <lemma> can be different, e.g. the Synset
    'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
    'salt.n.03.salinity'.

    Lemma attributes, accessible via methods with the same name:

    - name: The canonical name of this lemma.
    - synset: The synset that this lemma belongs to.
    - syntactic_marker: For adjectives, the WordNet string identifying the
      syntactic position relative to the modified noun. See:
      https://wordnet.princeton.edu/documentation/wninput5wn
      For all other parts of speech, this attribute is None.
    - count: The frequency of this lemma in wordnet.

    Lemma methods:

    Lemmas have the following methods for retrieving related Lemmas. They
    correspond to the names for the pointer symbols defined here:
    https://wordnet.princeton.edu/documentation/wninput5wn
    These methods all return lists of Lemmas:

    - antonyms
    - hypernyms, instance_hypernyms
    - hyponyms, instance_hyponyms
    - member_holonyms, substance_holonyms, part_holonyms
    - member_meronyms, substance_meronyms, part_meronyms
    - topic_domains, region_domains, usage_domains
    - attributes
    - derivationally_related_forms
    - entailments
    - causes
    - also_sees
    - verb_groups
    - similar_tos
    - pertainyms
    """

    __slots__ = [
        "_wordnet_corpus_reader",
        "_name",
        "_syntactic_marker",
        "_synset",
        "_frame_strings",
        "_frame_ids",
        "_lexname_index",
        "_lex_id",
        "_lang",
        "_key",
    ]

    def __init__(
        self,
        wordnet_corpus_reader,
        synset,
        name,
        lexname_index,
        lex_id,
        syntactic_marker,
    ):
        self._wordnet_corpus_reader = wordnet_corpus_reader
        self._name = name
        self._syntactic_marker = syntactic_marker
        self._synset = synset
        self._frame_strings = []
        self._frame_ids = []
        self._lexname_index = lexname_index
        self._lex_id = lex_id
        self._lang = "eng"
        self._key = None  # gets set later.

    def name(self):
        return self._name

    def syntactic_marker(self):
        return self._syntactic_marker

    def synset(self):
        return self._synset

    def frame_strings(self):
        return self._frame_strings

    def frame_ids(self):
        return self._frame_ids

    def lang(self):
        return self._lang

    def key(self):
        return self._key

    def __repr__(self):
        tup = type(self).__name__, self._synset._name, self._name
        return "%s('%s.%s')" % tup

    def _related(self, relation_symbol):
        get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
        if (self._name, relation_symbol) not in self._synset._lemma_pointers:
            return []
        return [
            get_synset(pos, offset)._lemmas[lemma_index]
            for pos, offset, lemma_index in self._synset._lemma_pointers[
                self._name, relation_symbol
            ]
        ]

    def count(self):
        """Return the frequency count for this Lemma"""
        return self._wordnet_corpus_reader.lemma_count(self)

    def antonyms(self):
        return self._related("!")

    def derivationally_related_forms(self):
        return self._related("+")

    def pertainyms(self):
        return self._related("\\")
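# A minimal usage sketch (not part of the original module; assumes the NLTK
# WordNet corpus is installed and imported as nltk.corpus.wordnet):
#
#     >>> from nltk.corpus import wordnet as wn
#     >>> salinity = wn.lemma('salt.n.03.salinity')
#     >>> salinity.synset()
#     Synset('salt.n.03')
#     >>> wn.lemma('good.a.01.good').antonyms()
#     [Lemma('bad.a.01.bad')]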
class Synset(_WordNetObject):
    """Create a Synset from a "<lemma>.<pos>.<number>" string where:
    <lemma> is the word's morphological stem
    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
    <number> is the sense number, counting from 0.

    Synset attributes, accessible via methods with the same name:

    - name: The canonical name of this synset, formed using the first lemma
      of this synset. Note that this may be different from the name
      passed to the constructor if that string used a different lemma to
      identify the synset.
    - pos: The synset's part of speech, matching one of the module level
      attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
    - lemmas: A list of the Lemma objects for this synset.
    - definition: The definition for this synset.
    - examples: A list of example strings for this synset.
    - offset: The offset in the WordNet dict file of this synset.
    - lexname: The name of the lexicographer file containing this synset.

    Synset methods:

    Synsets have the following methods for retrieving related Synsets.
    They correspond to the names for the pointer symbols defined here:
    https://wordnet.princeton.edu/documentation/wninput5wn
    These methods all return lists of Synsets.

    - hypernyms, instance_hypernyms
    - hyponyms, instance_hyponyms
    - member_holonyms, substance_holonyms, part_holonyms
    - member_meronyms, substance_meronyms, part_meronyms
    - attributes
    - entailments
    - causes
    - also_sees
    - verb_groups
    - similar_tos

    Additionally, Synsets support the following methods specific to the
    hypernym relation:

    - root_hypernyms
    - common_hypernyms
    - lowest_common_hypernyms

    Note that Synsets do not support the following relations because
    these are defined by WordNet as lexical relations:

    - antonyms
    - derivationally_related_forms
    - pertainyms
    """

    __slots__ = [
        "_pos",
        "_offset",
        "_name",
        "_frame_ids",
        "_lemmas",
        "_lemma_names",
        "_definition",
        "_examples",
        "_lexname",
        "_pointers",
        "_lemma_pointers",
        "_max_depth",
        "_min_depth",
    ]

    def __init__(self, wordnet_corpus_reader):
        self._wordnet_corpus_reader = wordnet_corpus_reader
        # All of these attributes get initialized by
        # WordNetCorpusReader._synset_from_pos_and_line()

        self._pos = None
        self._offset = None
        self._name = None
        self._frame_ids = []
        self._lemmas = []
        self._lemma_names = []
        self._definition = None
        self._examples = []
        self._lexname = None  # lexicographer name
        self._all_hypernyms = None

        self._pointers = defaultdict(set)
        self._lemma_pointers = defaultdict(list)

    def pos(self):
        return self._pos

    def offset(self):
        return self._offset

    def name(self):
        return self._name

    def frame_ids(self):
        return self._frame_ids

    def definition(self):
        return self._definition

    def examples(self):
        return self._examples

    def lexname(self):
        return self._lexname

    def _needs_root(self):
        if self._pos == NOUN:
            if self._wordnet_corpus_reader.get_version() == "1.6":
                return True
            else:
                return False
        elif self._pos == VERB:
            return True

    def lemma_names(self, lang="eng"):
        """Return all the lemma_names associated with the synset"""
        if lang == "eng":
            return self._lemma_names
        else:
            self._wordnet_corpus_reader._load_lang_data(lang)

            i = self._wordnet_corpus_reader.ss2of(self, lang)
            if i in self._wordnet_corpus_reader._lang_data[lang][0]:
                return self._wordnet_corpus_reader._lang_data[lang][0][i]
            else:
                return []

    def lemmas(self, lang="eng"):
        """Return all the lemma objects associated with the synset"""
        if lang == "eng":
            return self._lemmas
        else:
            self._wordnet_corpus_reader._load_lang_data(lang)
            lemmark = []
            lemmy = self.lemma_names(lang)
            for lem in lemmy:
                temp = Lemma(
                    self._wordnet_corpus_reader,
                    self,
                    lem,
                    self._wordnet_corpus_reader._lexnames.index(self.lexname()),
                    0,
                    None,
                )
                temp._lang = lang
                lemmark.append(temp)
            return lemmark

    def root_hypernyms(self):
        """Get the topmost hypernyms of this synset in WordNet."""
        result = []
        seen = set()
        todo = [self]
        while todo:
            next_synset = todo.pop()
            if next_synset not in seen:
                seen.add(next_synset)
                next_hypernyms = (
                    next_synset.hypernyms() + next_synset.instance_hypernyms()
                )
                if not next_hypernyms:
                    result.append(next_synset)
                else:
                    todo.extend(next_hypernyms)
        return result

    # Simpler implementation which makes incorrect assumption that
    # hypernym hierarchy is acyclic:
    #
    #        if not self.hypernyms():
    #            return [self]
    #        else:
    #            return list(set(root for h in self.hypernyms()
    #                            for root in h.root_hypernyms()))

    def max_depth(self):
        """
        :return: The length of the longest hypernym path from this
            synset to the root.
        """
        if "_max_depth" not in self.__dict__:
            hypernyms = self.hypernyms() + self.instance_hypernyms()
            if not hypernyms:
                self._max_depth = 0
            else:
                self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
        return self._max_depth

    def min_depth(self):
        """
        :return: The length of the shortest hypernym path from this
            synset to the root.
        """
        if "_min_depth" not in self.__dict__:
            hypernyms = self.hypernyms() + self.instance_hypernyms()
            if not hypernyms:
                self._min_depth = 0
            else:
                self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
        return self._min_depth

    def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel
        relationship, breadth-first

            >>> from nltk.corpus import wordnet as wn
            >>> dog = wn.synset('dog.n.01')
            >>> hyp = lambda s:s.hypernyms()
            >>> list(dog.closure(hyp))
            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
            Synset('carnivore.n.01'), Synset('animal.n.01'),
            Synset('placental.n.01'), Synset('organism.n.01'),
            Synset('mammal.n.01'), Synset('living_thing.n.01'),
            Synset('vertebrate.n.01'), Synset('whole.n.02'),
            Synset('chordate.n.01'), Synset('object.n.01'),
            Synset('physical_entity.n.01'), Synset('entity.n.01')]
        """
        from nltk.util import breadth_first

        synset_offsets = []
        for synset in breadth_first(self, rel, depth):
            if synset._offset != self._offset:
                if synset._offset not in synset_offsets:
                    synset_offsets.append(synset._offset)
                    yield synset

    def hypernym_paths(self):
        """
        Get the path(s) from this synset to the root, where each path is a
        list of the synset nodes traversed on the way to the root.

        :return: A list of lists, where each list gives the node sequence
            connecting the initial ``Synset`` node and a root node.
        """
        paths = []
        hypernyms = self.hypernyms() + self.instance_hypernyms()
        if len(hypernyms) == 0:
            paths = [[self]]
        for hypernym in hypernyms:
            for ancestor_list in hypernym.hypernym_paths():
                ancestor_list.append(self)
                paths.append(ancestor_list)
        return paths
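    # Usage sketch (not part of the original module; output abridged and
    # dependent on the installed WordNet version).  Each returned path runs
    # from a root synset down to this synset:
    #
    #     >>> paths = wn.synset('dog.n.01').hypernym_paths()
    #     >>> [s.name() for s in paths[0]][:3]          # doctest: +SKIP
    #     ['entity.n.01', 'physical_entity.n.01', 'object.n.01']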
    def common_hypernyms(self, other):
        """
        Find all synsets that are hypernyms of this synset and the
        other synset.

        :type other: Synset
        :param other: other input synset.
        :return: The synsets that are hypernyms of both synsets.
        """
        if not self._all_hypernyms:
            self._all_hypernyms = set(
                self_synset
                for self_synsets in self._iter_hypernym_lists()
                for self_synset in self_synsets
            )
        if not other._all_hypernyms:
            other._all_hypernyms = set(
                other_synset
                for other_synsets in other._iter_hypernym_lists()
                for other_synset in other_synsets
            )
        return list(self._all_hypernyms.intersection(other._all_hypernyms))

    def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
        """
        Get a list of lowest synset(s) that both synsets have as a hypernym.
        When `use_min_depth == False` this means that the synset which appears
        as a hypernym of both `self` and `other` with the lowest maximum depth
        is returned, or if there are multiple such synsets at the same depth
        they are all returned.

        However, if `use_min_depth == True` then the synset(s) which has/have
        the lowest minimum depth and appear(s) in both paths is/are returned.

        By setting the use_min_depth flag to True, the behavior of NLTK2 can be
        preserved. This was changed in NLTK3 to give more accurate results in a
        small set of cases, generally with synsets concerning people. (eg:
        'chef.n.01', 'fireman.n.01', etc.)

        This method is an implementation of Ted Pedersen's "Lowest Common
        Subsumer" method from the Perl Wordnet module. It can return either
        "self" or "other" if they are a hypernym of the other.

        :type other: Synset
        :param other: other input synset
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (False by default)
            creates a fake root that connects all the taxonomies. Set it
            to True to enable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will need to be added
            for nouns as well.
        :type use_min_depth: bool
        :param use_min_depth: This setting mimics older (v2) behavior of NLTK
            wordnet. If True, will use the min_depth function to calculate the
            lowest common hypernyms. This is known to give strange results for
            some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained
            for backwards compatibility.
        :return: The synsets that are the lowest common hypernyms of both
            synsets
        """
        synsets = self.common_hypernyms(other)
        if simulate_root:
            fake_synset = Synset(None)
            fake_synset._name = "*ROOT*"
            fake_synset.hypernyms = lambda: []
            fake_synset.instance_hypernyms = lambda: []
            synsets.append(fake_synset)

        try:
            if use_min_depth:
                max_depth = max(s.min_depth() for s in synsets)
                unsorted_lch = [s for s in synsets if s.min_depth() == max_depth]
            else:
                max_depth = max(s.max_depth() for s in synsets)
                unsorted_lch = [s for s in synsets if s.max_depth() == max_depth]
            return sorted(unsorted_lch)
        except ValueError:
            return []
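    # Usage sketch (not part of the original module; result may vary with the
    # WordNet version):
    #
    #     >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01'))  # doctest: +SKIP
    #     [Synset('carnivore.n.01')]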
    def hypernym_distances(self, distance=0, simulate_root=False):
        """
        Get the path(s) from this synset to the root, counting the distance
        of each node from the initial node on the way. A set of
        (synset, distance) tuples is returned.

        :type distance: int
        :param distance: the distance (number of edges) from this hypernym to
            the original hypernym ``Synset`` on which this method was called.
        :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
            a hypernym of the first ``Synset``.
        """
        distances = set([(self, distance)])
        for hypernym in self._hypernyms() + self._instance_hypernyms():
            distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False)
        if simulate_root:
            fake_synset = Synset(None)
            fake_synset._name = "*ROOT*"
            fake_synset_distance = max(distances, key=itemgetter(1))[1]
            distances.add((fake_synset, fake_synset_distance + 1))
        return distances

    def _shortest_hypernym_paths(self, simulate_root):
        if self._name == "*ROOT*":
            return {self: 0}

        queue = deque([(self, 0)])
        path = {}

        while queue:
            s, depth = queue.popleft()
            if s in path:
                continue
            path[s] = depth

            depth += 1
            queue.extend((hyp, depth) for hyp in s._hypernyms())
            queue.extend((hyp, depth) for hyp in s._instance_hypernyms())

        if simulate_root:
            fake_synset = Synset(None)
            fake_synset._name = "*ROOT*"
            path[fake_synset] = max(path.values()) + 1

        return path

    def shortest_path_distance(self, other, simulate_root=False):
        """
        Returns the distance of the shortest path linking the two synsets (if
        one exists). For each synset, all the ancestor nodes and their
        distances are recorded and compared. The ancestor node common to both
        synsets that can be reached with the minimum number of traversals is
        used. If no ancestor nodes are common, None is returned. If a node is
        compared with itself 0 is returned.

        :type other: Synset
        :param other: The Synset to which the shortest path will be found.
        :return: The number of edges in the shortest path connecting the two
            nodes, or None if no path exists.
        """
        if self == other:
            return 0

        dist_dict1 = self._shortest_hypernym_paths(simulate_root)
        dist_dict2 = other._shortest_hypernym_paths(simulate_root)

        # For each ancestor synset common to both subject synsets, find the
        # connecting path length. Return the shortest of these.
        inf = float("inf")
        path_distance = inf
        for synset, d1 in dist_dict1.items():
            d2 = dist_dict2.get(synset, inf)
            path_distance = min(path_distance, d1 + d2)

        return None if math.isinf(path_distance) else path_distance
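    # Usage sketch (not part of the original module; the distance depends on
    # the installed WordNet version):
    #
    #     >>> dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
    #     >>> dog.shortest_path_distance(cat)           # doctest: +SKIP
    #     4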
    def tree(self, rel, depth=-1, cut_mark=None):
        """
        >>> from nltk.corpus import wordnet as wn
        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s:s.hypernyms()
        >>> from pprint import pprint
        >>> pprint(dog.tree(hyp))
        [Synset('dog.n.01'),
         [Synset('canine.n.02'),
          [Synset('carnivore.n.01'),
           [Synset('placental.n.01'),
            [Synset('mammal.n.01'),
             [Synset('vertebrate.n.01'),
              [Synset('chordate.n.01'),
               [Synset('animal.n.01'),
                [Synset('organism.n.01'),
                 [Synset('living_thing.n.01'),
                  [Synset('whole.n.02'),
                   [Synset('object.n.01'),
                    [Synset('physical_entity.n.01'),
                     [Synset('entity.n.01')]]]]]]]]]]]]],
         [Synset('domestic_animal.n.01'),
          [Synset('animal.n.01'),
           [Synset('organism.n.01'),
            [Synset('living_thing.n.01'),
             [Synset('whole.n.02'),
              [Synset('object.n.01'),
               [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]
        """
        tree = [self]
        if depth != 0:
            tree += [x.tree(rel, depth - 1, cut_mark) for x in rel(self)]
        elif cut_mark:
            tree += [cut_mark]
        return tree

    # interface to similarity methods
    def path_similarity(self, other, verbose=False, simulate_root=True):
        """
        Path Distance Similarity:
        Return a score denoting how similar two word senses are, based on the
        shortest path that connects the senses in the is-a (hypernym/hyponym)
        taxonomy. The score is in the range 0 to 1, except in those cases where
        a path cannot be found (will only be true for verbs as there are many
        distinct verb taxonomies), in which case None is returned. A score of
        1 represents identity i.e. comparing a sense with itself will return 1.

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (True by default)
            creates a fake root that connects all the taxonomies. Set it
            to false to disable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will be added for nouns
            as well.
        :return: A score denoting the similarity of the two ``Synset`` objects,
            normally between 0 and 1. None is returned if no connecting path
            could be found. 1 is returned if a ``Synset`` is compared with
            itself.
        """
        distance = self.shortest_path_distance(
            other, simulate_root=simulate_root and self._needs_root()
        )
        if distance is None or distance < 0:
            return None
        return 1.0 / (distance + 1)
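    # Usage sketch (not part of the original module; values depend on the
    # WordNet version):
    #
    #     >>> wn.synset('dog.n.01').path_similarity(wn.synset('cat.n.01'))  # doctest: +SKIP
    #     0.2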
    def lch_similarity(self, other, verbose=False, simulate_root=True):
        """
        Leacock Chodorow Similarity:
        Return a score denoting how similar two word senses are, based on the
        shortest path that connects the senses (as above) and the maximum depth
        of the taxonomy in which the senses occur. The relationship is given as
        -log(p/2d) where p is the shortest path length and d is the taxonomy
        depth.

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (True by default)
            creates a fake root that connects all the taxonomies. Set it
            to false to disable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will be added for nouns
            as well.
        :return: A score denoting the similarity of the two ``Synset`` objects,
            normally greater than 0. None is returned if no connecting path
            could be found. If a ``Synset`` is compared with itself, the
            maximum score is returned, which varies depending on the taxonomy
            depth.
        """
        if self._pos != other._pos:
            raise WordNetError(
                "Computing the lch similarity requires "
                "%s and %s to have the same part of speech." % (self, other)
            )

        need_root = self._needs_root()

        if self._pos not in self._wordnet_corpus_reader._max_depth:
            self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root)

        depth = self._wordnet_corpus_reader._max_depth[self._pos]

        distance = self.shortest_path_distance(
            other, simulate_root=simulate_root and need_root
        )

        if distance is None or distance < 0 or depth == 0:
            return None
        return -math.log((distance + 1) / (2.0 * depth))
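    # Usage sketch (not part of the original module; the value depends on the
    # taxonomy depth of the installed WordNet):
    #
    #     >>> wn.synset('dog.n.01').lch_similarity(wn.synset('cat.n.01'))  # doctest: +SKIP
    #     2.028...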
    def wup_similarity(self, other, verbose=False, simulate_root=True):
        """
        Wu-Palmer Similarity:
        Return a score denoting how similar two word senses are, based on the
        depth of the two senses in the taxonomy and that of their Least Common
        Subsumer (most specific ancestor node). Previously, the scores computed
        by this implementation did _not_ always agree with those given by
        Pedersen's Perl implementation of WordNet Similarity. However, with
        the addition of the simulate_root flag (see below), the score for
        verbs now almost always agrees, but not always for nouns.

        The LCS does not necessarily feature in the shortest path connecting
        the two senses, as it is by definition the common ancestor deepest in
        the taxonomy, not closest to the two senses. Typically, however, it
        will so feature. Where multiple candidates for the LCS exist, that
        whose shortest path to the root node is the longest will be selected.
        Where the LCS has multiple paths to the root, the longer path is used
        for the purposes of the calculation.

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type simulate_root: bool
        :param simulate_root: The various verb taxonomies do not
            share a single root which disallows this metric from working for
            synsets that are not connected. This flag (True by default)
            creates a fake root that connects all the taxonomies. Set it
            to false to disable this behavior. For the noun taxonomy,
            there is usually a default root except for WordNet version 1.6.
            If you are using wordnet 1.6, a fake root will be added for nouns
            as well.
        :return: A float score denoting the similarity of the two ``Synset``
            objects, normally greater than zero. If no connecting path between
            the two senses can be found, None is returned.
        """
        need_root = self._needs_root()

        # Note that to preserve behavior from NLTK2 we set use_min_depth=True
        # It is possible that more accurate results could be obtained by
        # removing this setting and it should be tested later on
        subsumers = self.lowest_common_hypernyms(
            other, simulate_root=simulate_root and need_root, use_min_depth=True
        )

        # If no LCS was found return None
        if len(subsumers) == 0:
            return None

        subsumer = self if self in subsumers else subsumers[0]

        # Get the longest path from the LCS to the root,
        # including a correction:
        # - add one because the calculations include both the start and end
        #   nodes
        depth = subsumer.max_depth() + 1

        # Note: No need for an additional add-one correction for non-nouns
        # to account for an imaginary root node because that is now
        # automatically handled by simulate_root
        # if subsumer._pos != NOUN:
        #     depth += 1

        # Get the shortest path from the LCS to each of the synsets it is
        # subsuming.  Add this to the LCS path length to get the path
        # length from each synset to the root.
        len1 = self.shortest_path_distance(
            subsumer, simulate_root=simulate_root and need_root
        )
        len2 = other.shortest_path_distance(
            subsumer, simulate_root=simulate_root and need_root
        )
        if len1 is None or len2 is None:
            return None
        len1 += depth
        len2 += depth
        return (2.0 * depth) / (len1 + len2)
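    # Usage sketch (not part of the original module; the value depends on the
    # WordNet version):
    #
    #     >>> wn.synset('dog.n.01').wup_similarity(wn.synset('cat.n.01'))  # doctest: +SKIP
    #     0.857...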
    def res_similarity(self, other, ic, verbose=False):
        """
        Resnik Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by
            ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset``
            objects. Synsets whose LCS is the root node of the taxonomy will
            have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
        """
        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return lcs_ic

    def jcn_similarity(self, other, ic, verbose=False):
        """
        Jiang-Conrath Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by
            ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset``
            objects.
        """
        if self == other:
            return _INF

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)

        # If either of the input synsets is the root synset, or has a
        # frequency of 0 (sparse data problem), return 0.
        if ic1 == 0 or ic2 == 0:
            return 0

        ic_difference = ic1 + ic2 - 2 * lcs_ic

        if ic_difference == 0:
            return _INF

        return 1 / ic_difference

    def lin_similarity(self, other, ic, verbose=False):
        """
        Lin Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by
            ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset``
            objects, in the range 0 to 1.
        """
        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return (2.0 * lcs_ic) / (ic1 + ic2)
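    # Usage sketch for the IC-based metrics (not part of the original module;
    # requires the wordnet_ic corpus, and the value depends on the IC file
    # used):
    #
    #     >>> from nltk.corpus import wordnet_ic
    #     >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
    #     >>> wn.synset('dog.n.01').lin_similarity(wn.synset('cat.n.01'), brown_ic)  # doctest: +SKIP
    #     0.876...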
    def _iter_hypernym_lists(self):
        """
        :return: An iterator over ``Synset`` objects that are either proper
            hypernyms or instance of hypernyms of the synset.
        """
        todo = [self]
        seen = set()
        while todo:
            for synset in todo:
                seen.add(synset)
            yield todo
            todo = [
                hypernym
                for synset in todo
                for hypernym in (synset.hypernyms() + synset.instance_hypernyms())
                if hypernym not in seen
            ]

    def __repr__(self):
        return "%s('%s')" % (type(self).__name__, self._name)

    def _related(self, relation_symbol, sort=True):
        get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
        if relation_symbol not in self._pointers:
            return []
        pointer_tuples = self._pointers[relation_symbol]
        r = [get_synset(pos, offset) for pos, offset in pointer_tuples]
        if sort:
            r.sort()
        return r
######################################################################
# WordNet Corpus Reader
######################################################################


class WordNetCorpusReader(CorpusReader):
    """
    A corpus reader used to access wordnet or its variants.
    """

    _ENCODING = "utf8"

    # { Part-of-speech constants
    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
    # }

    # { Filename constants
    _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
    # }

    # { Part of speech constants
    _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
    _pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
    # }

    #: A list of file identifiers for all the fileids used by this
    #: corpus reader.
    _FILES = (
        "cntlist.rev",
        "lexnames",
        "index.sense",
        "index.adj",
        "index.adv",
        "index.noun",
        "index.verb",
        "data.adj",
        "data.adv",
        "data.noun",
        "data.verb",
        "adj.exc",
        "adv.exc",
        "noun.exc",
        "verb.exc",
    )

    def __init__(self, root, omw_reader):
        """
        Construct a new wordnet corpus reader, with the given root
        directory.
        """
        super(WordNetCorpusReader, self).__init__(
            root, self._FILES, encoding=self._ENCODING
        )

        # An index that provides the file offset
        # Map from lemma -> pos -> synset_index -> offset
        self._lemma_pos_offset_map = defaultdict(dict)

        # A cache so we don't have to reconstruct synsets
        # Map from pos -> offset -> synset
        self._synset_offset_cache = defaultdict(dict)

        # A lookup for the maximum depth of each part of speech.  Useful for
        # the lch similarity metric.
        self._max_depth = defaultdict(dict)

        # Corpus reader containing omw data.
        self._omw_reader = omw_reader

        # A cache to store the wordnet data of multiple languages
        self._lang_data = defaultdict(list)

        self._data_file_map = {}
        self._exception_map = {}
        self._lexnames = []
        self._key_count_file = None
        self._key_synset_file = None

        # Load the lexnames
        for i, line in enumerate(self.open("lexnames")):
            index, lexname, _ = line.split()
            assert int(index) == i
            self._lexnames.append(lexname)

        # Load the indices for lemmas and synset offsets
        self._load_lemma_pos_offset_map()

        # load the exception file data into memory
        self._load_exception_map()

    # Open Multilingual WordNet functions, contributed by
    # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn

    def of2ss(self, of):
        """ take an id and return the synsets """
        return self.synset_from_pos_and_offset(of[-1], int(of[:8]))

    def ss2of(self, ss, lang=None):
        """ return the ID of the synset """
        pos = ss.pos()
        # Only these 3 WordNets retain the satellite pos tag
        if lang not in ["nld", "lit", "slk"] and pos == "s":
            pos = "a"
        return "{:08d}-{}".format(ss.offset(), pos)
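    # Usage sketch (not part of the original module; the offset shown is the
    # WordNet 3.0 offset for dog.n.01 and may differ in other versions):
    #
    #     >>> wn.ss2of(wn.synset('dog.n.01'))           # doctest: +SKIP
    #     '02084071-n'
    #     >>> wn.of2ss('02084071-n')                    # doctest: +SKIP
    #     Synset('dog.n.01')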
    def _load_lang_data(self, lang):
        """ load the wordnet data of the requested language from the file to
        the cache, _lang_data """
        if lang in self._lang_data.keys():
            return

        if lang not in self.langs():
            raise WordNetError("Language is not supported.")

        f = self._omw_reader.open("{0:}/wn-data-{0:}.tab".format(lang))
        self.custom_lemmas(f, lang)
        f.close()

    def langs(self):
        """ return a list of languages supported by Multilingual Wordnet """
        import os

        langs = ["eng"]
        fileids = self._omw_reader.fileids()
        for fileid in fileids:
            file_name, file_extension = os.path.splitext(fileid)
            if file_extension == ".tab":
                langs.append(file_name.split("-")[-1])

        return langs

    def _load_lemma_pos_offset_map(self):
        for suffix in self._FILEMAP.values():

            # parse each line of the file (ignoring comment lines)
            for i, line in enumerate(self.open("index.%s" % suffix)):
                if line.startswith(" "):
                    continue

                _iter = iter(line.split())

                def _next_token():
                    return next(_iter)

                try:

                    # get the lemma and part-of-speech
                    lemma = _next_token()
                    pos = _next_token()

                    # get the number of synsets for this lemma
                    n_synsets = int(_next_token())
                    assert n_synsets > 0

                    # get and ignore the pointer symbols for all synsets of
                    # this lemma
                    n_pointers = int(_next_token())
                    [_next_token() for _ in range(n_pointers)]

                    # same as number of synsets
                    n_senses = int(_next_token())
                    assert n_synsets == n_senses

                    # get and ignore number of senses ranked according to
                    # frequency
                    _next_token()

                    # get synset offsets
                    synset_offsets = [int(_next_token()) for _ in range(n_synsets)]

                # raise more informative error with file name and line number
                except (AssertionError, ValueError) as e:
                    tup = ("index.%s" % suffix), (i + 1), e
                    raise WordNetError("file %s, line %i: %s" % tup)

                # map lemmas and parts of speech to synsets
                self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                if pos == ADJ:
                    self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets

    def _load_exception_map(self):
        # load the exception file data into memory
        for pos, suffix in self._FILEMAP.items():
            self._exception_map[pos] = {}
            for line in self.open("%s.exc" % suffix):
                terms = line.split()
                self._exception_map[pos][terms[0]] = terms[1:]
        self._exception_map[ADJ_SAT] = self._exception_map[ADJ]

    def _compute_max_depth(self, pos, simulate_root):
        """
        Compute the max depth for the given part of speech.  This is
        used by the lch similarity metric.
        """
        depth = 0
        for ii in self.all_synsets(pos):
            try:
                depth = max(depth, ii.max_depth())
            except RuntimeError:
                print(ii)
        if simulate_root:
            depth += 1
        self._max_depth[pos] = depth

    def get_version(self):
        fh = self._data_file(ADJ)
        for line in fh:
            match = re.search(r"WordNet (\d+\.\d+) Copyright", line)
            if match is not None:
                version = match.group(1)
                fh.seek(0)
                return version

    #############################################################
    # Loading Lemmas
    #############################################################

    def lemma(self, name, lang="eng"):
        """Return lemma object that matches the name"""
        # cannot simply split on first '.',
        # e.g.: '.45_caliber.a.01..45_caliber'
        separator = SENSENUM_RE.search(name).end()

        synset_name, lemma_name = name[: separator - 1], name[separator:]

        synset = self.synset(synset_name)
        for lemma in synset.lemmas(lang):
            if lemma._name == lemma_name:
                return lemma
        raise WordNetError("no lemma %r in %r" % (lemma_name, synset_name))

    def lemma_from_key(self, key):
        # Keys are case sensitive and always lower-case
        key = key.lower()

        lemma_name, lex_sense = key.split("%")
        pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
        pos = self._pos_names[int(pos_number)]

        # open the key -> synset file if necessary
        if self._key_synset_file is None:
            self._key_synset_file = self.open("index.sense")

        # Find the synset for the lemma.
        synset_line = _binary_search_file(self._key_synset_file, key)
        if not synset_line:
            raise WordNetError("No synset found for key %r" % key)
        offset = int(synset_line.split()[1])
        synset = self.synset_from_pos_and_offset(pos, offset)

        # return the corresponding lemma
        for lemma in synset._lemmas:
            if lemma._key == key:
                return lemma
        raise WordNetError("No lemma found for key %r" % key)
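    # Usage sketch (not part of the original module): sense keys obtained from
    # Lemma.key() round-trip through lemma_from_key(); the exact key shown is
    # illustrative.
    #
    #     >>> key = wn.lemma('dog.n.01.dog').key()
    #     >>> key                                       # doctest: +SKIP
    #     'dog%1:05:00::'
    #     >>> wn.lemma_from_key(key)
    #     Lemma('dog.n.01.dog')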
    #############################################################
    # Loading Synsets
    #############################################################

    def synset(self, name):
        # split name into lemma, part of speech and synset number
        lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
        synset_index = int(synset_index_str) - 1

        # get the offset for this synset
        try:
            offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
        except KeyError:
            message = "no lemma %r with part of speech %r"
            raise WordNetError(message % (lemma, pos))
        except IndexError:
            n_senses = len(self._lemma_pos_offset_map[lemma][pos])
            message = "lemma %r with part of speech %r has only %i %s"
            if n_senses == 1:
                tup = lemma, pos, n_senses, "sense"
            else:
                tup = lemma, pos, n_senses, "senses"
            raise WordNetError(message % tup)

        # load synset information from the appropriate file
        synset = self.synset_from_pos_and_offset(pos, offset)

        # some basic sanity checks on loaded attributes
        if pos == "s" and synset._pos == "a":
            message = (
                "adjective satellite requested but only plain "
                "adjective found for lemma %r"
            )
            raise WordNetError(message % lemma)
        assert synset._pos == pos or (pos == "a" and synset._pos == "s")

        # Return the synset object.
        return synset
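    # Usage sketch (not part of the original module): look a synset up by its
    # "<lemma>.<pos>.<number>" name.
    #
    #     >>> wn.synset('dog.n.01')
    #     Synset('dog.n.01')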
  1118. def _data_file(self, pos):
  1119. """
  1120. Return an open file pointer for the data file for the given
  1121. part of speech.
  1122. """
  1123. if pos == ADJ_SAT:
  1124. pos = ADJ
  1125. if self._data_file_map.get(pos) is None:
  1126. fileid = "data.%s" % self._FILEMAP[pos]
  1127. self._data_file_map[pos] = self.open(fileid)
  1128. return self._data_file_map[pos]
  1129. def synset_from_pos_and_offset(self, pos, offset):
  1130. # Check to see if the synset is in the cache
  1131. if offset in self._synset_offset_cache[pos]:
  1132. return self._synset_offset_cache[pos][offset]
  1133. data_file = self._data_file(pos)
  1134. data_file.seek(offset)
  1135. data_file_line = data_file.readline()
  1136. synset = self._synset_from_pos_and_line(pos, data_file_line)
  1137. assert synset._offset == offset
  1138. self._synset_offset_cache[pos][offset] = synset
  1139. return synset
  1140. @deprecated("Use public method synset_from_pos_and_offset() instead")
  1141. def _synset_from_pos_and_offset(self, *args, **kwargs):
  1142. """
  1143. Hack to help people like the readers of
  1144. http://stackoverflow.com/a/27145655/1709587
  1145. who were using this function before it was officially a public method
  1146. """
  1147. return self.synset_from_pos_and_offset(*args, **kwargs)

    def _synset_from_pos_and_line(self, pos, data_file_line):
        # Construct a new (empty) synset.
        synset = Synset(self)

        # parse the entry for this synset
        try:
            # parse out the definitions and examples from the gloss
            columns_str, gloss = data_file_line.strip().split("|")
            definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
            examples = re.findall(r'"([^"]*)"', gloss)
            for example in examples:
                synset._examples.append(example)

            synset._definition = definition.strip("; ")

            # split the other info into fields
            _iter = iter(columns_str.split())

            def _next_token():
                return next(_iter)

            # get the offset
            synset._offset = int(_next_token())

            # determine the lexicographer file name
            lexname_index = int(_next_token())
            synset._lexname = self._lexnames[lexname_index]

            # get the part of speech
            synset._pos = _next_token()

            # create Lemma objects for each lemma
            n_lemmas = int(_next_token(), 16)
            for _ in range(n_lemmas):
                # get the lemma name
                lemma_name = _next_token()
                # get the lex_id (used for sense_keys)
                lex_id = int(_next_token(), 16)
                # If the lemma has a syntactic marker, extract it.
                m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
                lemma_name, syn_mark = m.groups()
                # create the lemma object
                lemma = Lemma(
                    self, synset, lemma_name, lexname_index, lex_id, syn_mark
                )
                synset._lemmas.append(lemma)
                synset._lemma_names.append(lemma._name)

            # collect the pointer tuples
            n_pointers = int(_next_token())
            for _ in range(n_pointers):
                symbol = _next_token()
                offset = int(_next_token())
                pos = _next_token()
                lemma_ids_str = _next_token()
                if lemma_ids_str == "0000":
                    synset._pointers[symbol].add((pos, offset))
                else:
                    source_index = int(lemma_ids_str[:2], 16) - 1
                    target_index = int(lemma_ids_str[2:], 16) - 1
                    source_lemma_name = synset._lemmas[source_index]._name
                    lemma_pointers = synset._lemma_pointers
                    tups = lemma_pointers[source_lemma_name, symbol]
                    tups.append((pos, offset, target_index))

            # read the verb frames
            try:
                frame_count = int(_next_token())
            except StopIteration:
                pass
            else:
                for _ in range(frame_count):
                    # read the plus sign
                    plus = _next_token()
                    assert plus == "+"
                    # read the frame and lemma number
                    frame_number = int(_next_token())
                    frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
                    lemma_number = int(_next_token(), 16)
                    # lemma number of 00 means all words in the synset
                    if lemma_number == 0:
                        synset._frame_ids.append(frame_number)
                        for lemma in synset._lemmas:
                            lemma._frame_ids.append(frame_number)
                            lemma._frame_strings.append(frame_string_fmt % lemma._name)
                    # only a specific word in the synset
                    else:
                        lemma = synset._lemmas[lemma_number - 1]
                        lemma._frame_ids.append(frame_number)
                        lemma._frame_strings.append(frame_string_fmt % lemma._name)

        # raise a more informative error with line text
        except ValueError as e:
            raise WordNetError("line %r: %s" % (data_file_line, e))

        # set sense keys for Lemma objects - note that this has to be
        # done afterwards so that the relations are available
        for lemma in synset._lemmas:
            if synset._pos == ADJ_SAT:
                head_lemma = synset.similar_tos()[0]._lemmas[0]
                head_name = head_lemma._name
                head_id = "%02d" % head_lemma._lex_id
            else:
                head_name = head_id = ""
            tup = (
                lemma._name,
                WordNetCorpusReader._pos_numbers[synset._pos],
                lemma._lexname_index,
                lemma._lex_id,
                head_name,
                head_id,
            )
            lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()

        # the canonical name is based on the first lemma
        lemma_name = synset._lemmas[0]._name.lower()
        offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
        sense_index = offsets.index(synset._offset)
        tup = lemma_name, synset._pos, sense_index + 1
        synset._name = "%s.%s.%02i" % tup

        return synset
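
    # For orientation only: a line in a WordNet data.* file has roughly the
    # layout parsed above. This is an illustrative sketch of the field order
    # (see the WordNet wndb documentation for the normative description),
    # not something the parser itself enforces:
    #
    #     offset lex_filenum ss_type w_cnt word lex_id [word lex_id ...]
    #         p_cnt [ptr_symbol offset pos source/target ...] [frames ...] | gloss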

    def synset_from_sense_key(self, sense_key):
        """
        Retrieves synset based on a given sense_key. Sense keys can be
        obtained from lemma.key()

        From https://wordnet.princeton.edu/documentation/senseidx5wn:
        A sense_key is represented as:
            lemma % lex_sense (e.g. 'dog%1:18:01::')
        where lex_sense is encoded as:
            ss_type:lex_filenum:lex_id:head_word:head_id

        lemma:       ASCII text of word/collocation, in lower case
        ss_type:     synset type for the sense (1 digit int)
                     The synset type is encoded as follows:
                         1    NOUN
                         2    VERB
                         3    ADJECTIVE
                         4    ADVERB
                         5    ADJECTIVE SATELLITE
        lex_filenum: name of lexicographer file containing the synset for the
                     sense (2 digit int)
        lex_id:      when paired with lemma, uniquely identifies a sense in the
                     lexicographer file (2 digit int)
        head_word:   lemma of the first word in satellite's head synset
                     Only used if sense is in an adjective satellite synset
        head_id:     uniquely identifies sense in a lexicographer file when
                     paired with head_word
                     Only used if head_word is present (2 digit int)
        """
        sense_key_regex = re.compile(r"(.*)\%(.*):(.*):(.*):(.*):(.*)")
        synset_types = {1: NOUN, 2: VERB, 3: ADJ, 4: ADV, 5: ADJ_SAT}
        lemma, ss_type, _, lex_id, _, _ = sense_key_regex.match(sense_key).groups()

        # check that information extracted from sense_key is valid
        error = None
        if not lemma:
            error = "lemma"
        elif int(ss_type) not in synset_types:
            error = "ss_type"
        elif int(lex_id) < 0 or int(lex_id) > 99:
            error = "lex_id"
        if error:
            raise WordNetError(
                "valid {} could not be extracted from the sense key".format(error)
            )

        synset_id = ".".join([lemma, synset_types[int(ss_type)], lex_id])
        return self.synset(synset_id)
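
    # Illustrative usage sketch (comments only, not executed). The key below
    # is the docstring's own example; the exact synset returned depends on
    # the installed WordNet data:
    #
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> wn.synset_from_sense_key('dog%1:18:01::')
    #     Synset(...)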

    #############################################################
    # Retrieve synsets and lemmas.
    #############################################################

    def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
        """Load all synsets with a given lemma and part of speech tag.
        If no pos is specified, all synsets for all parts of speech
        will be loaded.
        If lang is specified, all the synsets associated with the lemma name
        of that language will be returned.
        """
        lemma = lemma.lower()

        if lang == "eng":
            get_synset = self.synset_from_pos_and_offset
            index = self._lemma_pos_offset_map
            if pos is None:
                pos = POS_LIST
            return [
                get_synset(p, offset)
                for p in pos
                for form in self._morphy(lemma, p, check_exceptions)
                for offset in index[form].get(p, [])
            ]
        else:
            self._load_lang_data(lang)
            synset_list = []
            if lemma in self._lang_data[lang][1]:
                for l in self._lang_data[lang][1][lemma]:
                    if pos is not None and l[-1] != pos:
                        continue
                    synset_list.append(self.of2ss(l))
            return synset_list
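
    # Illustrative usage sketch (comments only, not executed); results depend
    # on the installed WordNet data:
    #
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> wn.synsets('dogs')              # morphy maps 'dogs' -> 'dog'
    #     [Synset('dog.n.01'), ...]
    #     >>> wn.synsets('dog', pos=wn.VERB)
    #     [Synset('chase.v.01')]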

    def lemmas(self, lemma, pos=None, lang="eng"):
        """Return all Lemma objects with a name matching the specified lemma
        name and part of speech tag. Matches any part of speech tag if none is
        specified."""
        lemma = lemma.lower()

        if lang == "eng":
            return [
                lemma_obj
                for synset in self.synsets(lemma, pos)
                for lemma_obj in synset.lemmas()
                if lemma_obj.name().lower() == lemma
            ]
        else:
            self._load_lang_data(lang)
            lemmas = []
            syn = self.synsets(lemma, lang=lang)
            for s in syn:
                if pos is not None and s.pos() != pos:
                    continue
                for lemma_obj in s.lemmas(lang=lang):
                    if lemma_obj.name().lower() == lemma:
                        lemmas.append(lemma_obj)
            return lemmas
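
    # Illustrative usage sketch (comments only, not executed); the result
    # shown assumes the standard English WordNet, where the only verb synset
    # containing the lemma 'dog' is chase.v.01:
    #
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> wn.lemmas('dog', pos=wn.VERB)
    #     [Lemma('chase.v.01.dog')]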

    def all_lemma_names(self, pos=None, lang="eng"):
        """Return all lemma names for all synsets for the given
        part of speech tag and language or languages. If pos is
        not specified, all synsets for all parts of speech will
        be used."""
        if lang == "eng":
            if pos is None:
                return iter(self._lemma_pos_offset_map)
            else:
                return (
                    lemma
                    for lemma in self._lemma_pos_offset_map
                    if pos in self._lemma_pos_offset_map[lemma]
                )
        else:
            self._load_lang_data(lang)
            lemma = []
            for i in self._lang_data[lang][0]:
                if pos is not None and i[-1] != pos:
                    continue
                lemma.extend(self._lang_data[lang][0][i])

            lemma = iter(set(lemma))
            return lemma

    def all_synsets(self, pos=None):
        """Iterate over all synsets with a given part of speech tag.
        If no pos is specified, all synsets for all parts of speech
        will be loaded.
        """
        if pos is None:
            pos_tags = self._FILEMAP.keys()
        else:
            pos_tags = [pos]

        cache = self._synset_offset_cache
        from_pos_and_line = self._synset_from_pos_and_line

        # generate all synsets for each part of speech
        for pos_tag in pos_tags:
            # Open the file for reading. Note that we cannot re-use
            # the file pointers from self._data_file_map here, because
            # we're defining an iterator, and those file pointers might
            # be moved while we're not looking.
            if pos_tag == ADJ_SAT:
                pos_tag = ADJ
            fileid = "data.%s" % self._FILEMAP[pos_tag]
            data_file = self.open(fileid)

            try:
                # generate synsets for each line in the POS file
                offset = data_file.tell()
                line = data_file.readline()
                while line:
                    if not line[0].isspace():
                        if offset in cache[pos_tag]:
                            # See if the synset is cached
                            synset = cache[pos_tag][offset]
                        else:
                            # Otherwise, parse the line
                            synset = from_pos_and_line(pos_tag, line)
                            cache[pos_tag][offset] = synset

                        # adjective satellites are in the same file as
                        # adjectives so only yield the synset if it's actually
                        # a satellite
                        if synset._pos == ADJ_SAT:
                            yield synset
                        # for all other POS tags, yield all synsets (this means
                        # that adjectives also include adjective satellites)
                        else:
                            yield synset

                    offset = data_file.tell()
                    line = data_file.readline()

            # close the extra file handle we opened
            except:
                data_file.close()
                raise
            else:
                data_file.close()
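
    # Illustrative usage sketch (comments only, not executed); the count
    # shown is the noun synset total for Princeton WordNet 3.0 and will
    # differ for other data versions:
    #
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> sum(1 for _ in wn.all_synsets(wn.NOUN))
    #     82115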

    def words(self, lang="eng"):
        """Return the lemmas of the given language as a list of words."""
        return self.all_lemma_names(lang=lang)

    def license(self, lang="eng"):
        """Return the contents of LICENSE (for omw).
        Use lang=lang to get the license for an individual language."""
        if lang == "eng":
            return self.open("LICENSE").read()
        elif lang in self.langs():
            return self._omw_reader.open("{}/LICENSE".format(lang)).read()
        elif lang == "omw":
            # under the assumption you don't mean Omwunra-Toqura
            return self._omw_reader.open("LICENSE").read()
        elif lang in self._lang_data:
            raise WordNetError("Cannot determine license for user-provided tab file")
        else:
            raise WordNetError("Language is not supported.")

    def readme(self, lang="omw"):
        """Return the contents of README (for omw).
        Use lang=lang to get the README for an individual language."""
        if lang == "eng":
            return self.open("README").read()
        elif lang in self.langs():
            return self._omw_reader.open("{}/README".format(lang)).read()
        elif lang == "omw":
            # under the assumption you don't mean Omwunra-Toqura
            return self._omw_reader.open("README").read()
        elif lang in self._lang_data:
            raise WordNetError("No README for user-provided tab file")
        else:
            raise WordNetError("Language is not supported.")

    def citation(self, lang="omw"):
        """Return the contents of the citation.bib file (for omw).
        Use lang=lang to get the citation for an individual language."""
        if lang == "eng":
            return self.open("citation.bib").read()
        elif lang in self.langs():
            return self._omw_reader.open("{}/citation.bib".format(lang)).read()
        elif lang == "omw":
            # under the assumption you don't mean Omwunra-Toqura
            return self._omw_reader.open("citation.bib").read()
        elif lang in self._lang_data:
            raise WordNetError("citation not known for user-provided tab file")
        else:
            raise WordNetError("Language is not supported.")

    #############################################################
    # Misc
    #############################################################

    def lemma_count(self, lemma):
        """Return the frequency count for this Lemma"""
        # Currently, counts only work for English
        if lemma._lang != "eng":
            return 0
        # open the count file if we haven't already
        if self._key_count_file is None:
            self._key_count_file = self.open("cntlist.rev")
        # find the key in the counts file and return the count
        line = _binary_search_file(self._key_count_file, lemma._key)
        if line:
            return int(line.rsplit(" ", 1)[-1])
        else:
            return 0
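
    # Illustrative usage sketch (comments only, not executed). The count is
    # read from cntlist.rev via the lemma's sense key, so the returned value
    # depends on the installed WordNet data (0 if the key is not listed):
    #
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> wn.lemma_count(wn.lemma('dog.n.01.dog'))  # -> an integer count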

    def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
        return synset1.path_similarity(synset2, verbose, simulate_root)

    path_similarity.__doc__ = Synset.path_similarity.__doc__

    def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
        return synset1.lch_similarity(synset2, verbose, simulate_root)

    lch_similarity.__doc__ = Synset.lch_similarity.__doc__

    def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
        return synset1.wup_similarity(synset2, verbose, simulate_root)

    wup_similarity.__doc__ = Synset.wup_similarity.__doc__

    def res_similarity(self, synset1, synset2, ic, verbose=False):
        return synset1.res_similarity(synset2, ic, verbose)

    res_similarity.__doc__ = Synset.res_similarity.__doc__

    def jcn_similarity(self, synset1, synset2, ic, verbose=False):
        return synset1.jcn_similarity(synset2, ic, verbose)

    jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__

    def lin_similarity(self, synset1, synset2, ic, verbose=False):
        return synset1.lin_similarity(synset2, ic, verbose)

    lin_similarity.__doc__ = Synset.lin_similarity.__doc__

    #############################################################
    # Morphy
    #############################################################
    # Morphy, adapted from Oliver Steele's pywordnet

    def morphy(self, form, pos=None, check_exceptions=True):
        """
        Find a possible base form for the given form, with the given
        part of speech, by checking WordNet's list of exceptional
        forms, and by recursively stripping affixes for this part of
        speech until a form in WordNet is found.

        >>> from nltk.corpus import wordnet as wn
        >>> print(wn.morphy('dogs'))
        dog
        >>> print(wn.morphy('churches'))
        church
        >>> print(wn.morphy('aardwolves'))
        aardwolf
        >>> print(wn.morphy('abaci'))
        abacus
        >>> wn.morphy('hardrock', wn.ADV)
        >>> print(wn.morphy('book', wn.NOUN))
        book
        >>> wn.morphy('book', wn.ADJ)
        """
        if pos is None:
            morphy = self._morphy
            analyses = chain(a for p in POS_LIST for a in morphy(form, p))
        else:
            analyses = self._morphy(form, pos, check_exceptions)

        # get the first one we find
        first = list(islice(analyses, 1))
        if len(first) == 1:
            return first[0]
        else:
            return None

    MORPHOLOGICAL_SUBSTITUTIONS = {
        NOUN: [
            ("s", ""),
            ("ses", "s"),
            ("ves", "f"),
            ("xes", "x"),
            ("zes", "z"),
            ("ches", "ch"),
            ("shes", "sh"),
            ("men", "man"),
            ("ies", "y"),
        ],
        VERB: [
            ("s", ""),
            ("ies", "y"),
            ("es", "e"),
            ("es", ""),
            ("ed", "e"),
            ("ed", ""),
            ("ing", "e"),
            ("ing", ""),
        ],
        ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
        ADV: [],
    }

    MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]

    def _morphy(self, form, pos, check_exceptions=True):
        # from jordanbg:
        # Given an original string x
        # 1. Apply rules once to the input to get y1, y2, y3, etc.
        # 2. Return all that are in the database
        # 3. If there are no matches, keep applying rules until you either
        #    find a match or you can't go any further

        exceptions = self._exception_map[pos]
        substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]

        def apply_rules(forms):
            return [
                form[: -len(old)] + new
                for form in forms
                for old, new in substitutions
                if form.endswith(old)
            ]

        def filter_forms(forms):
            result = []
            seen = set()
            for form in forms:
                if form in self._lemma_pos_offset_map:
                    if pos in self._lemma_pos_offset_map[form]:
                        if form not in seen:
                            result.append(form)
                            seen.add(form)
            return result

        # 0. Check the exception lists
        if check_exceptions:
            if form in exceptions:
                return filter_forms([form] + exceptions[form])

        # 1. Apply rules once to the input to get y1, y2, y3, etc.
        forms = apply_rules([form])

        # 2. Return all that are in the database (and check the original too)
        results = filter_forms([form] + forms)
        if results:
            return results

        # 3. If there are no matches, keep applying rules until we find a match
        while forms:
            forms = apply_rules(forms)
            results = filter_forms(forms)
            if results:
                return results

        # Return an empty list if we can't find anything
        return []
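
    # A worked trace of the rule application above (assuming the standard
    # noun substitutions, and that 'church' is in the noun index while
    # 'churches' and 'churche' are not):
    #
    #     _morphy('churches', NOUN)
    #     -> apply_rules(['churches']) == ['churche', 'church']
    #        ('s' -> '' and 'ches' -> 'ch' both match the ending)
    #     -> filter_forms(['churches', 'churche', 'church']) == ['church']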

    #############################################################
    # Create information content from corpus
    #############################################################

    def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
        """
        Creates an information content lookup dictionary from a corpus.

        :type corpus: CorpusReader
        :param corpus: The corpus from which we create an information
            content dictionary.
        :type weight_senses_equally: bool
        :param weight_senses_equally: If this is True, gives all
            possible senses equal weight rather than dividing by the
            number of possible senses. (If a word has 3 synsets, each
            sense gets 0.3333 per appearance when this is False, 1.0 when
            it is True.)
        :param smoothing: How much do we smooth synset counts (default is 1.0)
        :type smoothing: float
        :return: An information content dictionary
        """
        counts = FreqDist()
        for ww in corpus.words():
            counts[ww] += 1

        ic = {}
        for pp in POS_LIST:
            ic[pp] = defaultdict(float)

        # Initialize the counts with the smoothing value
        if smoothing > 0.0:
            for ss in self.all_synsets():
                pos = ss._pos
                if pos == ADJ_SAT:
                    pos = ADJ
                ic[pos][ss._offset] = smoothing

        for ww in counts:
            possible_synsets = self.synsets(ww)
            if len(possible_synsets) == 0:
                continue

            # Distribute weight among possible synsets
            weight = float(counts[ww])
            if not weight_senses_equally:
                weight /= float(len(possible_synsets))

            for ss in possible_synsets:
                pos = ss._pos
                if pos == ADJ_SAT:
                    pos = ADJ
                for level in ss._iter_hypernym_lists():
                    for hh in level:
                        ic[pos][hh._offset] += weight
                # Add the weight to the root
                ic[pos][0] += weight

        return ic
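
    # Illustrative usage sketch (comments only, not executed); builds an IC
    # dictionary from the Genesis corpus with smoothing disabled and uses it
    # with a Resnik similarity call:
    #
    #     >>> from nltk.corpus import genesis, wordnet as wn
    #     >>> genesis_ic = wn.ic(genesis, False, 0.0)
    #     >>> wn.synset('dog.n.01').res_similarity(wn.synset('cat.n.01'), genesis_ic)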

    def custom_lemmas(self, tab_file, lang):
        """
        Reads a custom tab file containing mappings of lemmas in the given
        language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
        WordNet functions to then be used with that language.

        See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
        documentation on the Multilingual WordNet tab file format.

        :param tab_file: Tab file as a file or file-like object
        :type lang: str
        :param lang: ISO 639-3 code of the language of the tab file
        """
        if len(lang) != 3:
            raise ValueError("lang should be a (3 character) ISO 639-3 code")
        self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
        for line in tab_file.readlines():
            if isinstance(line, bytes):
                # Support byte-stream files (e.g. as returned by Python 2's
                # open() function) as well as text-stream ones
                line = line.decode("utf-8")
            if not line.startswith("#"):
                offset_pos, lemma_type, lemma = line.strip().split("\t")
                lemma = lemma.strip().replace(" ", "_")
                self._lang_data[lang][0][offset_pos].append(lemma)
                self._lang_data[lang][1][lemma.lower()].append(offset_pos)
        # Make sure no more entries are accidentally added subsequently
        self._lang_data[lang][0].default_factory = None
        self._lang_data[lang][1].default_factory = None
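
    # Illustrative usage sketch (comments only, not executed). 'qqq.tab' and
    # the language code 'qqq' are hypothetical placeholders for a
    # user-provided tab file and its ISO 639-3 code:
    #
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> with open('qqq.tab', encoding='utf-8') as fp:
    #     ...     wn.custom_lemmas(fp, lang='qqq')
    #     >>> wn.synsets('somelemma', lang='qqq')  # look up lemmas from the tab file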


######################################################################
# WordNet Information Content Corpus Reader
######################################################################


class WordNetICCorpusReader(CorpusReader):
    """
    A corpus reader for the WordNet information content corpus.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, encoding="utf8")

    # this load function would be more efficient if the data was pickled
    # Note that we can't use NLTK's frequency distributions because
    # synsets are overlapping (each instance of a synset also counts
    # as an instance of its hypernyms)

    def ic(self, icfile):
        """
        Load an information content file from the wordnet_ic corpus
        and return a dictionary. This dictionary has just two keys,
        NOUN and VERB, whose values are dictionaries that map from
        synsets to information content values.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
        :return: An information content dictionary
        """
        ic = {}
        ic[NOUN] = defaultdict(float)
        ic[VERB] = defaultdict(float)
        for num, line in enumerate(self.open(icfile)):
            if num == 0:  # skip the header
                continue
            fields = line.split()
            offset = int(fields[0][:-1])
            value = float(fields[1])
            pos = _get_pos(fields[0])
            if len(fields) == 3 and fields[2] == "ROOT":
                # Store root count.
                ic[pos][0] += value
            if value != 0:
                ic[pos][offset] = value
        return ic
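
    # Illustrative usage sketch (comments only, not executed); 'ic-brown.dat'
    # is one of the files shipped with the wordnet_ic corpus:
    #
    #     >>> from nltk.corpus import wordnet_ic
    #     >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
    #     >>> sorted(brown_ic.keys())
    #     ['n', 'v']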


######################################################################
# Similarity metrics
######################################################################

# TODO: Add in the option to manually add a new root node; this will be
# useful for verb similarity as there exist multiple verb taxonomies.

# More information about the metrics is available at
# http://marimba.d.umn.edu/similarity/measures.html


def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(synset2, verbose, simulate_root)


def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(synset2, verbose, simulate_root)


def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(synset2, verbose, simulate_root)


def res_similarity(synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose)


def jcn_similarity(synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose)


def lin_similarity(synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose)


path_similarity.__doc__ = Synset.path_similarity.__doc__
lch_similarity.__doc__ = Synset.lch_similarity.__doc__
wup_similarity.__doc__ = Synset.wup_similarity.__doc__
res_similarity.__doc__ = Synset.res_similarity.__doc__
jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
lin_similarity.__doc__ = Synset.lin_similarity.__doc__


def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value. If two nodes have no
    explicit common subsumer, assume that they share an artificial
    root node that is the hypernym of all explicit roots.

    :type synset1: Synset
    :param synset1: First input synset.
    :type synset2: Synset
    :param synset2: Second input synset. Must be the same part of
        speech as the first synset.
    :type ic: dict
    :param ic: an information content object (as returned by ``load_ic()``).
    :return: The information content of the two synsets and their most
        informative subsumer
    """
    if synset1._pos != synset2._pos:
        raise WordNetError(
            "Computing the least common subsumer requires "
            "%s and %s to have the same part of speech." % (synset1, synset2)
        )

    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)
    subsumers = synset1.common_hypernyms(synset2)
    if len(subsumers) == 0:
        subsumer_ic = 0
    else:
        subsumer_ic = max(information_content(s, ic) for s in subsumers)

    if verbose:
        print("> LCS Subsumer by content:", subsumer_ic)

    return ic1, ic2, subsumer_ic


# Utility functions


def information_content(synset, ic):
    try:
        icpos = ic[synset._pos]
    except KeyError:
        msg = "Information content file has no entries for part-of-speech: %s"
        raise WordNetError(msg % synset._pos)

    counts = icpos[synset._offset]
    if counts == 0:
        return _INF
    else:
        return -math.log(counts / icpos[0])


# get the part of speech (NOUN or VERB) from the information content record
# (each identifier has an 'n' or 'v' suffix)


def _get_pos(field):
    if field[-1] == "n":
        return NOUN
    elif field[-1] == "v":
        return VERB
    else:
        msg = (
            "Unidentified part of speech in WordNet Information Content file "
            "for field %s" % field
        )
        raise ValueError(msg)


# unload corpus after tests
def teardown_module(module=None):
    from nltk.corpus import wordnet

    wordnet._unload()