| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- # Natural Language Toolkit: Lin's Thesaurus
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Dan Blanchard <dblanchard@ets.org>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.txt
- import re
- from collections import defaultdict
- from functools import reduce
- from nltk.corpus.reader import CorpusReader
class LinThesaurusCorpusReader(CorpusReader):
    """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin."""

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        """Factory for creating defaultdict of defaultdict(dict)s"""
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        """
        Initialize the thesaurus by parsing every ``sim[A-Z].lsp`` file under root.

        Each file is read line by line: the first line of an entry yields the
        entry key (extracted with ``_key_re``), a line equal to ``))`` closes the
        entry, and any other line that splits on a tab into exactly two fields
        is stored as a (synonym, score) pair for the current key.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        """
        super(LinThesaurusCorpusReader, self).__init__(root, r"sim[A-Z]\.lsp")
        # fileid -> entry key -> synonym -> score
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            # Decode with the encoding the reader reports for this file rather
            # than whatever the platform locale happens to default to.
            with open(path, encoding=encoding) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
                        first = False
                    # End of entry
                    elif line == "))":
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split("\t")
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def _entry(self, fileid, ngram):
        """
        Return the synonym-to-score mapping stored for ngram in fileid.

        Uses ``.get`` instead of indexing so that looking up an unknown fileid
        or ngram does not insert an empty entry into the nested defaultdicts
        (which would make ``__contains__`` report queried-but-absent ngrams as
        present). An empty dict is returned when nothing is stored.
        """
        return self._thesaurus.get(fileid, {}).get(ngram, {})

    def similarity(self, ngram1, ngram2, fileid=None):
        """
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores. The score is 1.0 when both
                 ngrams are equal, and ``badscore`` when ngram2 is not listed
                 among the synonyms of ngram1.
        """
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            return [(fid, 1.0) for fid in self._fileids]

        if fileid:
            return self._entry(fileid, ngram1).get(ngram2, self._badscore)
        return [
            (fid, self._entry(fid, ngram1).get(ngram2, self._badscore))
            for fid in self._fileids
        ]

    def scored_synonyms(self, ngram, fileid=None):
        """
        Returns the scored synonyms (pairs of synonym and score) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, a dict items view of (synonym, score)
                 tuples; otherwise, list of tuples of fileids and such views.
                 The view is empty when the ngram has no entry.
        """
        if fileid:
            return self._entry(fileid, ngram).items()
        return [(fid, self._entry(fid, ngram).items()) for fid in self._fileids]

    def synonyms(self, ngram, fileid=None):
        """
        Returns the synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, a dict keys view of synonyms; otherwise,
                 list of tuples of fileids and such views. The view is empty
                 when the ngram has no entry.
        """
        if fileid:
            return self._entry(fileid, ngram).keys()
        return [(fid, self._entry(fid, ngram).keys()) for fid in self._fileids]

    def __contains__(self, ngram):
        """
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram has an entry in any of the fileids.
        """
        return any(
            ngram in self._thesaurus.get(fileid, {}) for fileid in self._fileids
        )
- ######################################################################
- # Demo
- ######################################################################
def demo():
    """
    Print example lookups against the ``lin_thesaurus`` corpus.

    Shows synonyms and scored synonyms across all fileids, the same two
    lookups restricted to ``simN.lsp``, and a similarity score for a word pair.
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # The original repeated the previous lookup verbatim; show the scored
    # variant for the single fileid instead, mirroring the all-files pair above.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    demo()
|