panlex_swadesh.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Word List Corpus Reader
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Steven Bird <stevenbird1@gmail.com>
  6. # Edward Loper <edloper@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. from collections import namedtuple, defaultdict
  10. import re
  11. from nltk.tokenize import line_tokenize
  12. from nltk.corpus.reader.wordlist import WordListCorpusReader
  13. from nltk.corpus.reader.util import *
  14. from nltk.corpus.reader.api import *
  15. PanlexLanguage = namedtuple('PanlexLanguage',
  16. ['panlex_uid', # (1) PanLex UID
  17. 'iso639', # (2) ISO 639 language code
  18. 'iso639_type', # (3) ISO 639 language type, see README
  19. 'script', # (4) normal scripts of expressions
  20. 'name', # (5) PanLex default name
  21. 'langvar_uid' # (6) UID of the language variety in which the default name is an expression
  22. ])
  23. class PanlexSwadeshCorpusReader(WordListCorpusReader):
  24. """
  25. This is a class to read the PanLex Swadesh list from
  26. David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
  27. PanLex: Building a Resource for Panlingual Lexical Translation.
  28. In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
  29. License: CC0 1.0 Universal
  30. https://creativecommons.org/publicdomain/zero/1.0/legalcode
  31. """
  32. def __init__(self, *args, **kwargs):
  33. super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
  34. # Find the swadesh size using the fileids' path.
  35. self.swadesh_size = re.match(r'swadesh([0-9].*)\/', self.fileids()[0]).group(1)
  36. self._languages = {lang.panlex_uid:lang for lang in self.get_languages()}
  37. self._macro_langauges = self.get_macrolanguages()
  38. def license(self):
  39. print('CC0 1.0 Universal')
  40. def readme(self):
  41. print(self.raw('README'))
  42. def language_codes(self):
  43. return self._languages.keys()
  44. def get_languages(self):
  45. for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
  46. if not line.strip(): # Skip empty lines.
  47. continue
  48. yield PanlexLanguage(*line.strip().split('\t'))
  49. def get_macrolanguages(self):
  50. macro_langauges = defaultdict(list)
  51. for lang in self._languages.values():
  52. macro_langauges[lang.iso639].append(lang.panlex_uid)
  53. return macro_langauges
  54. def words_by_lang(self, lang_code):
  55. """
  56. :return: a list of list(str)
  57. """
  58. fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
  59. return [concept.split('\t') for concept in self.words(fileid)]
  60. def words_by_iso639(self, iso63_code):
  61. """
  62. :return: a list of list(str)
  63. """
  64. fileids = ['swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
  65. for lang_code in self._macro_langauges[iso63_code]]
  66. return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]
  67. def entries(self, fileids=None):
  68. """
  69. :return: a tuple of words for the specified fileids.
  70. """
  71. if not fileids:
  72. fileids = self.fileids()
  73. wordlists = [self.words(f) for f in fileids]
  74. return list(zip(*wordlists))