lin.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.txt
  7. import re
  8. from collections import defaultdict
  9. from functools import reduce
  10. from nltk.corpus.reader import CorpusReader
  11. class LinThesaurusCorpusReader(CorpusReader):
  12. """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """
  13. # Compiled regular expression for extracting the key from the first line of each
  14. # thesaurus entry
  15. _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')
  16. @staticmethod
  17. def __defaultdict_factory():
  18. """ Factory for creating defaultdict of defaultdict(dict)s """
  19. return defaultdict(dict)
  20. def __init__(self, root, badscore=0.0):
  21. """
  22. Initialize the thesaurus.
  23. :param root: root directory containing thesaurus LISP files
  24. :type root: C{string}
  25. :param badscore: the score to give to words which do not appear in each other's sets of synonyms
  26. :type badscore: C{float}
  27. """
  28. super(LinThesaurusCorpusReader, self).__init__(root, r"sim[A-Z]\.lsp")
  29. self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
  30. self._badscore = badscore
  31. for path, encoding, fileid in self.abspaths(
  32. include_encoding=True, include_fileid=True
  33. ):
  34. with open(path) as lin_file:
  35. first = True
  36. for line in lin_file:
  37. line = line.strip()
  38. # Start of entry
  39. if first:
  40. key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
  41. first = False
  42. # End of entry
  43. elif line == "))":
  44. first = True
  45. # Lines with pairs of ngrams and scores
  46. else:
  47. split_line = line.split("\t")
  48. if len(split_line) == 2:
  49. ngram, score = split_line
  50. self._thesaurus[fileid][key][ngram.strip('"')] = float(
  51. score
  52. )
  53. def similarity(self, ngram1, ngram2, fileid=None):
  54. """
  55. Returns the similarity score for two ngrams.
  56. :param ngram1: first ngram to compare
  57. :type ngram1: C{string}
  58. :param ngram2: second ngram to compare
  59. :type ngram2: C{string}
  60. :param fileid: thesaurus fileid to search in. If None, search all fileids.
  61. :type fileid: C{string}
  62. :return: If fileid is specified, just the score for the two ngrams; otherwise,
  63. list of tuples of fileids and scores.
  64. """
  65. # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
  66. if ngram1 == ngram2:
  67. if fileid:
  68. return 1.0
  69. else:
  70. return [(fid, 1.0) for fid in self._fileids]
  71. else:
  72. if fileid:
  73. return (
  74. self._thesaurus[fileid][ngram1][ngram2]
  75. if ngram2 in self._thesaurus[fileid][ngram1]
  76. else self._badscore
  77. )
  78. else:
  79. return [
  80. (
  81. fid,
  82. (
  83. self._thesaurus[fid][ngram1][ngram2]
  84. if ngram2 in self._thesaurus[fid][ngram1]
  85. else self._badscore
  86. ),
  87. )
  88. for fid in self._fileids
  89. ]
  90. def scored_synonyms(self, ngram, fileid=None):
  91. """
  92. Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram
  93. :param ngram: ngram to lookup
  94. :type ngram: C{string}
  95. :param fileid: thesaurus fileid to search in. If None, search all fileids.
  96. :type fileid: C{string}
  97. :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
  98. list of tuples of fileids and lists, where inner lists consist of tuples of
  99. scores and synonyms.
  100. """
  101. if fileid:
  102. return self._thesaurus[fileid][ngram].items()
  103. else:
  104. return [
  105. (fileid, self._thesaurus[fileid][ngram].items())
  106. for fileid in self._fileids
  107. ]
  108. def synonyms(self, ngram, fileid=None):
  109. """
  110. Returns a list of synonyms for the current ngram.
  111. :param ngram: ngram to lookup
  112. :type ngram: C{string}
  113. :param fileid: thesaurus fileid to search in. If None, search all fileids.
  114. :type fileid: C{string}
  115. :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
  116. lists, where inner lists contain synonyms.
  117. """
  118. if fileid:
  119. return self._thesaurus[fileid][ngram].keys()
  120. else:
  121. return [
  122. (fileid, self._thesaurus[fileid][ngram].keys())
  123. for fileid in self._fileids
  124. ]
  125. def __contains__(self, ngram):
  126. """
  127. Determines whether or not the given ngram is in the thesaurus.
  128. :param ngram: ngram to lookup
  129. :type ngram: C{string}
  130. :return: whether the given ngram is in the thesaurus.
  131. """
  132. return reduce(
  133. lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
  134. self._fileids,
  135. False,
  136. )
######################################################################
# Demo
######################################################################
  140. def demo():
  141. from nltk.corpus import lin_thesaurus as thes
  142. word1 = "business"
  143. word2 = "enterprise"
  144. print("Getting synonyms for " + word1)
  145. print(thes.synonyms(word1))
  146. print("Getting scored synonyms for " + word1)
  147. print(thes.scored_synonyms(word1))
  148. print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
  149. print(thes.synonyms(word1, fileid="simN.lsp"))
  150. print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
  151. print(thes.synonyms(word1, fileid="simN.lsp"))
  152. print("Similarity score for %s and %s:" % (word1, word2))
  153. print(thes.similarity(word1, word2))
  154. if __name__ == "__main__":
  155. demo()