| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Word List Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Steven Bird <stevenbird1@gmail.com>
- # Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from collections import namedtuple, defaultdict
- import re
- from nltk.tokenize import line_tokenize
- from nltk.corpus.reader.wordlist import WordListCorpusReader
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
- PanlexLanguage = namedtuple('PanlexLanguage',
- ['panlex_uid', # (1) PanLex UID
- 'iso639', # (2) ISO 639 language code
- 'iso639_type', # (3) ISO 639 language type, see README
- 'script', # (4) normal scripts of expressions
- 'name', # (5) PanLex default name
- 'langvar_uid' # (6) UID of the language variety in which the default name is an expression
- ])
- class PanlexSwadeshCorpusReader(WordListCorpusReader):
- """
- This is a class to read the PanLex Swadesh list from
- David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
- PanLex: Building a Resource for Panlingual Lexical Translation.
- In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
- License: CC0 1.0 Universal
- https://creativecommons.org/publicdomain/zero/1.0/legalcode
- """
- def __init__(self, *args, **kwargs):
- super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
- # Find the swadesh size using the fileids' path.
- self.swadesh_size = re.match(r'swadesh([0-9].*)\/', self.fileids()[0]).group(1)
- self._languages = {lang.panlex_uid:lang for lang in self.get_languages()}
- self._macro_langauges = self.get_macrolanguages()
- def license(self):
- print('CC0 1.0 Universal')
- def readme(self):
- print(self.raw('README'))
- def language_codes(self):
- return self._languages.keys()
- def get_languages(self):
- for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
- if not line.strip(): # Skip empty lines.
- continue
- yield PanlexLanguage(*line.strip().split('\t'))
- def get_macrolanguages(self):
- macro_langauges = defaultdict(list)
- for lang in self._languages.values():
- macro_langauges[lang.iso639].append(lang.panlex_uid)
- return macro_langauges
- def words_by_lang(self, lang_code):
- """
- :return: a list of list(str)
- """
- fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
- return [concept.split('\t') for concept in self.words(fileid)]
- def words_by_iso639(self, iso63_code):
- """
- :return: a list of list(str)
- """
- fileids = ['swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
- for lang_code in self._macro_langauges[iso63_code]]
- return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]
- def entries(self, fileids=None):
- """
- :return: a tuple of words for the specified fileids.
- """
- if not fileids:
- fileids = self.fileids()
- wordlists = [self.words(f) for f in fileids]
- return list(zip(*wordlists))
|