cmudict.py 3.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. # Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription. Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:

    NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER
  41. """
  42. from nltk.util import Index
  43. from nltk.corpus.reader.util import *
  44. from nltk.corpus.reader.api import *
  45. class CMUDictCorpusReader(CorpusReader):
  46. def entries(self):
  47. """
  48. :return: the cmudict lexicon as a list of entries
  49. containing (word, transcriptions) tuples.
  50. """
  51. return concat(
  52. [
  53. StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
  54. for fileid, enc in self.abspaths(None, True)
  55. ]
  56. )
  57. def raw(self):
  58. """
  59. :return: the cmudict lexicon as a raw string.
  60. """
  61. fileids = self._fileids
  62. if isinstance(fileids, str):
  63. fileids = [fileids]
  64. return concat([self.open(f).read() for f in fileids])
  65. def words(self):
  66. """
  67. :return: a list of all words defined in the cmudict lexicon.
  68. """
  69. return [word.lower() for (word, _) in self.entries()]
  70. def dict(self):
  71. """
  72. :return: the cmudict lexicon as a dictionary, whose keys are
  73. lowercase words and whose values are lists of pronunciations.
  74. """
  75. return dict(Index(self.entries()))
  76. def read_cmudict_block(stream):
  77. entries = []
  78. while len(entries) < 100: # Read 100 at a time.
  79. line = stream.readline()
  80. if line == "":
  81. return entries # end of file.
  82. pieces = line.split()
  83. entries.append((pieces[0].lower(), pieces[2:]))
  84. return entries