sonority_sequencing.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. # Natural Language Toolkit: Tokenizers
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Christopher Hench <chris.l.hench@gmail.com>
  5. # Alex Estes
  6. # URL: <http://nltk.sourceforge.net>
  7. # For license information, see LICENSE.TXT
  8. """
  9. The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
  10. by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
  11. openness of the lips. Syllable breaks occur before troughs in sonority. For more
  12. on the SSP see Selkirk (1984).
  13. The default implementation uses the English alphabet, but the `sonority_hierarchy`
  14. can be modified to IPA or any other alphabet for the use-case. The SSP is a
  15. universal syllabification algorithm, but that does not mean it performs equally
  16. across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
  17. if utilizing IPA (pg. 311).
  18. Importantly, if a custom hierarchy is supplied and vowels span across more than
  19. one level, they should be given separately to the `vowels` class attribute.
  20. References:
  21. - Otto Jespersen. 1904. Lehrbuch der Phonetik.
  22. Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
  23. - Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  24. In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  25. Cambridge, MIT Press. pp. 107-136.
  26. - Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  27. In HLT-NAACL. pp. 308-316.
  28. """
  29. import warnings
  30. import re
  31. from string import punctuation
  32. from nltk.tokenize.api import TokenizerI
  33. from nltk.util import ngrams
  34. class SyllableTokenizer(TokenizerI):
  35. """
  36. Syllabifies words based on the Sonority Sequencing Principle (SSP).
  37. >>> from nltk.tokenize import SyllableTokenizer
  38. >>> from nltk import word_tokenize
  39. >>> SSP = SyllableTokenizer()
  40. >>> SSP.tokenize('justification')
  41. ['jus', 'ti', 'fi', 'ca', 'tion']
  42. >>> text = "This is a foobar-like sentence."
  43. >>> [SSP.tokenize(token) for token in word_tokenize(text)]
  44. [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
  45. """
  46. def __init__(self, lang="en", sonority_hierarchy=False):
  47. """
  48. :param lang: Language parameter, default is English, 'en'
  49. :type lang: str
  50. :param sonority_hierarchy: Sonority hierarchy according to the
  51. Sonority Sequencing Principle.
  52. :type sonority_hierarchy: list(str)
  53. """
  54. # Sonority hierarchy should be provided in descending order.
  55. # If vowels are spread across multiple levels, they should be
  56. # passed assigned self.vowels var together, otherwise should be
  57. # placed in first index of hierarchy.
  58. if not sonority_hierarchy and lang == "en":
  59. sonority_hierarchy = [
  60. "aeiouy", # vowels.
  61. "lmnrw", # nasals.
  62. "zvsf", # fricatives.
  63. "bcdgtkpqxhj", # stops.
  64. ]
  65. self.vowels = sonority_hierarchy[0]
  66. self.phoneme_map = {}
  67. for i, level in enumerate(sonority_hierarchy):
  68. for c in level:
  69. sonority_level = len(sonority_hierarchy) - i
  70. self.phoneme_map[c] = sonority_level
  71. self.phoneme_map[c.upper()] = sonority_level
  72. def assign_values(self, token):
  73. """
  74. Assigns each phoneme its value from the sonority hierarchy.
  75. Note: Sentence/text has to be tokenized first.
  76. :param token: Single word or token
  77. :type token: str
  78. :return: List of tuples, first element is character/phoneme and
  79. second is the soronity value.
  80. :rtype: list(tuple(str, int))
  81. """
  82. syllables_values = []
  83. for c in token:
  84. try:
  85. syllables_values.append((c, self.phoneme_map[c]))
  86. except KeyError:
  87. if c not in punctuation:
  88. warnings.warn(
  89. "Character not defined in sonority_hierarchy,"
  90. " assigning as vowel: '{}'".format(c)
  91. )
  92. syllables_values.append((c, max(self.phoneme_map.values())))
  93. self.vowels += c
  94. else: # If it's a punctuation, assing -1.
  95. syllables_values.append((c, -1))
  96. return syllables_values
  97. def validate_syllables(self, syllable_list):
  98. """
  99. Ensures each syllable has at least one vowel.
  100. If the following syllable doesn't have vowel, add it to the current one.
  101. :param syllable_list: Single word or token broken up into syllables.
  102. :type syllable_list: list(str)
  103. :return: Single word or token broken up into syllables
  104. (with added syllables if necessary)
  105. :rtype: list(str)
  106. """
  107. valid_syllables = []
  108. front = ""
  109. for i, syllable in enumerate(syllable_list):
  110. if syllable in punctuation:
  111. valid_syllables.append(syllable)
  112. continue
  113. if not re.search("|".join(self.vowels), syllable):
  114. if len(valid_syllables) == 0:
  115. front += syllable
  116. else:
  117. valid_syllables = valid_syllables[:-1] + [
  118. valid_syllables[-1] + syllable
  119. ]
  120. else:
  121. if len(valid_syllables) == 0:
  122. valid_syllables.append(front + syllable)
  123. else:
  124. valid_syllables.append(syllable)
  125. return valid_syllables
  126. def tokenize(self, token):
  127. """
  128. Apply the SSP to return a list of syllables.
  129. Note: Sentence/text has to be tokenized first.
  130. :param token: Single word or token
  131. :type token: str
  132. :return syllable_list: Single word or token broken up into syllables.
  133. :rtype: list(str)
  134. """
  135. # assign values from hierarchy
  136. syllables_values = self.assign_values(token)
  137. # if only one vowel return word
  138. if sum(token.count(x) for x in self.vowels) <= 1:
  139. return [token]
  140. syllable_list = []
  141. syllable = syllables_values[0][0] # start syllable with first phoneme
  142. for trigram in ngrams(syllables_values, n=3):
  143. phonemes, values = zip(*trigram)
  144. # Sonority of previous, focal and following phoneme
  145. prev_value, focal_value, next_value = values
  146. # Focal phoneme.
  147. focal_phoneme = phonemes[1]
  148. # These cases trigger syllable break.
  149. if focal_value == -1: # If it's a punctuation, just break.
  150. syllable_list.append(syllable)
  151. syllable_list.append(focal_phoneme)
  152. syllable = ""
  153. elif prev_value >= focal_value == next_value:
  154. syllable += focal_phoneme
  155. syllable_list.append(syllable)
  156. syllable = ""
  157. elif prev_value > focal_value < next_value:
  158. syllable_list.append(syllable)
  159. syllable = ""
  160. syllable += focal_phoneme
  161. # no syllable break
  162. else:
  163. syllable += focal_phoneme
  164. syllable += syllables_values[-1][0] # append last phoneme
  165. syllable_list.append(syllable)
  166. return self.validate_syllables(syllable_list)