# aline.py
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: ALINE
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Greg Kondrak <gkondrak@ualberta.ca>
  6. # Geoff Bacon <bacon@berkeley.edu> (Python port)
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. ALINE
  11. http://webdocs.cs.ualberta.ca/~kondrak/
  12. Copyright 2002 by Grzegorz Kondrak.
  13. ALINE is an algorithm for aligning phonetic sequences, described in [1].
  14. This module is a port of Kondrak's (2002) ALINE. It provides functions for
  15. phonetic sequence alignment and similarity analysis. These are useful in
  16. historical linguistics, sociolinguistics and synchronic phonology.
  17. ALINE has parameters that can be tuned for desired output. These parameters are:
  18. - C_skip, C_sub, C_exp, C_vwl
  19. - Salience weights
  20. - Segmental features
  21. In this implementation, some parameters have been changed from their default
  22. values as described in [1], in order to replicate published results. All changes
  23. are noted in comments.
  24. Example usage
  25. -------------
  26. # Get optimal alignment of two phonetic sequences
  27. >>> align('θin', 'tenwis') # doctest: +SKIP
  28. [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
  29. [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
  30. University of Toronto.
  31. """
  32. try:
  33. import numpy as np
  34. except ImportError:
  35. np = None
  36. # === Constants ===
  37. inf = float("inf")
  38. # Default values for maximum similarity scores (Kondrak 2002: 54)
  39. C_skip = 10 # Indels
  40. C_sub = 35 # Substitutions
  41. C_exp = 45 # Expansions/compressions
  42. C_vwl = 5 # Vowel/consonant relative weight (decreased from 10)
  43. consonants = [
  44. "B",
  45. "N",
  46. "R",
  47. "b",
  48. "c",
  49. "d",
  50. "f",
  51. "g",
  52. "h",
  53. "j",
  54. "k",
  55. "l",
  56. "m",
  57. "n",
  58. "p",
  59. "q",
  60. "r",
  61. "s",
  62. "t",
  63. "v",
  64. "x",
  65. "z",
  66. "ç",
  67. "ð",
  68. "ħ",
  69. "ŋ",
  70. "ɖ",
  71. "ɟ",
  72. "ɢ",
  73. "ɣ",
  74. "ɦ",
  75. "ɬ",
  76. "ɮ",
  77. "ɰ",
  78. "ɱ",
  79. "ɲ",
  80. "ɳ",
  81. "ɴ",
  82. "ɸ",
  83. "ɹ",
  84. "ɻ",
  85. "ɽ",
  86. "ɾ",
  87. "ʀ",
  88. "ʁ",
  89. "ʂ",
  90. "ʃ",
  91. "ʈ",
  92. "ʋ",
  93. "ʐ ",
  94. "ʒ",
  95. "ʔ",
  96. "ʕ",
  97. "ʙ",
  98. "ʝ",
  99. "β",
  100. "θ",
  101. "χ",
  102. "ʐ",
  103. "w",
  104. ]
  105. # Relevant features for comparing consonants and vowels
  106. R_c = [
  107. "aspirated",
  108. "lateral",
  109. "manner",
  110. "nasal",
  111. "place",
  112. "retroflex",
  113. "syllabic",
  114. "voice",
  115. ]
  116. # 'high' taken out of R_v because same as manner
  117. R_v = [
  118. "back",
  119. "lateral",
  120. "long",
  121. "manner",
  122. "nasal",
  123. "place",
  124. "retroflex",
  125. "round",
  126. "syllabic",
  127. "voice",
  128. ]
  129. # Flattened feature matrix (Kondrak 2002: 56)
  130. similarity_matrix = {
  131. # place
  132. "bilabial": 1.0,
  133. "labiodental": 0.95,
  134. "dental": 0.9,
  135. "alveolar": 0.85,
  136. "retroflex": 0.8,
  137. "palato-alveolar": 0.75,
  138. "palatal": 0.7,
  139. "velar": 0.6,
  140. "uvular": 0.5,
  141. "pharyngeal": 0.3,
  142. "glottal": 0.1,
  143. "labiovelar": 1.0,
  144. "vowel": -1.0, # added 'vowel'
  145. # manner
  146. "stop": 1.0,
  147. "affricate": 0.9,
  148. "fricative": 0.85, # increased fricative from 0.8
  149. "trill": 0.7,
  150. "tap": 0.65,
  151. "approximant": 0.6,
  152. "high vowel": 0.4,
  153. "mid vowel": 0.2,
  154. "low vowel": 0.0,
  155. "vowel2": 0.5, # added vowel
  156. # high
  157. "high": 1.0,
  158. "mid": 0.5,
  159. "low": 0.0,
  160. # back
  161. "front": 1.0,
  162. "central": 0.5,
  163. "back": 0.0,
  164. # binary features
  165. "plus": 1.0,
  166. "minus": 0.0,
  167. }
  168. # Relative weights of phonetic features (Kondrak 2002: 55)
  169. salience = {
  170. "syllabic": 5,
  171. "place": 40,
  172. "manner": 50,
  173. "voice": 5, # decreased from 10
  174. "nasal": 20, # increased from 10
  175. "retroflex": 10,
  176. "lateral": 10,
  177. "aspirated": 5,
  178. "long": 0, # decreased from 1
  179. "high": 3, # decreased from 5
  180. "back": 2, # decreased from 5
  181. "round": 2, # decreased from 5
  182. }
  183. # (Kondrak 2002: 59-60)
  184. feature_matrix = {
  185. # Consonants
  186. "p": {
  187. "place": "bilabial",
  188. "manner": "stop",
  189. "syllabic": "minus",
  190. "voice": "minus",
  191. "nasal": "minus",
  192. "retroflex": "minus",
  193. "lateral": "minus",
  194. "aspirated": "minus",
  195. },
  196. "b": {
  197. "place": "bilabial",
  198. "manner": "stop",
  199. "syllabic": "minus",
  200. "voice": "plus",
  201. "nasal": "minus",
  202. "retroflex": "minus",
  203. "lateral": "minus",
  204. "aspirated": "minus",
  205. },
  206. "t": {
  207. "place": "alveolar",
  208. "manner": "stop",
  209. "syllabic": "minus",
  210. "voice": "minus",
  211. "nasal": "minus",
  212. "retroflex": "minus",
  213. "lateral": "minus",
  214. "aspirated": "minus",
  215. },
  216. "d": {
  217. "place": "alveolar",
  218. "manner": "stop",
  219. "syllabic": "minus",
  220. "voice": "plus",
  221. "nasal": "minus",
  222. "retroflex": "minus",
  223. "lateral": "minus",
  224. "aspirated": "minus",
  225. },
  226. "ʈ": {
  227. "place": "retroflex",
  228. "manner": "stop",
  229. "syllabic": "minus",
  230. "voice": "minus",
  231. "nasal": "minus",
  232. "retroflex": "plus",
  233. "lateral": "minus",
  234. "aspirated": "minus",
  235. },
  236. "ɖ": {
  237. "place": "retroflex",
  238. "manner": "stop",
  239. "syllabic": "minus",
  240. "voice": "plus",
  241. "nasal": "minus",
  242. "retroflex": "plus",
  243. "lateral": "minus",
  244. "aspirated": "minus",
  245. },
  246. "c": {
  247. "place": "palatal",
  248. "manner": "stop",
  249. "syllabic": "minus",
  250. "voice": "minus",
  251. "nasal": "minus",
  252. "retroflex": "minus",
  253. "lateral": "minus",
  254. "aspirated": "minus",
  255. },
  256. "ɟ": {
  257. "place": "palatal",
  258. "manner": "stop",
  259. "syllabic": "minus",
  260. "voice": "plus",
  261. "nasal": "minus",
  262. "retroflex": "minus",
  263. "lateral": "minus",
  264. "aspirated": "minus",
  265. },
  266. "k": {
  267. "place": "velar",
  268. "manner": "stop",
  269. "syllabic": "minus",
  270. "voice": "minus",
  271. "nasal": "minus",
  272. "retroflex": "minus",
  273. "lateral": "minus",
  274. "aspirated": "minus",
  275. },
  276. "g": {
  277. "place": "velar",
  278. "manner": "stop",
  279. "syllabic": "minus",
  280. "voice": "plus",
  281. "nasal": "minus",
  282. "retroflex": "minus",
  283. "lateral": "minus",
  284. "aspirated": "minus",
  285. },
  286. "q": {
  287. "place": "uvular",
  288. "manner": "stop",
  289. "syllabic": "minus",
  290. "voice": "minus",
  291. "nasal": "minus",
  292. "retroflex": "minus",
  293. "lateral": "minus",
  294. "aspirated": "minus",
  295. },
  296. "ɢ": {
  297. "place": "uvular",
  298. "manner": "stop",
  299. "syllabic": "minus",
  300. "voice": "plus",
  301. "nasal": "minus",
  302. "retroflex": "minus",
  303. "lateral": "minus",
  304. "aspirated": "minus",
  305. },
  306. "ʔ": {
  307. "place": "glottal",
  308. "manner": "stop",
  309. "syllabic": "minus",
  310. "voice": "minus",
  311. "nasal": "minus",
  312. "retroflex": "minus",
  313. "lateral": "minus",
  314. "aspirated": "minus",
  315. },
  316. "m": {
  317. "place": "bilabial",
  318. "manner": "stop",
  319. "syllabic": "minus",
  320. "voice": "plus",
  321. "nasal": "plus",
  322. "retroflex": "minus",
  323. "lateral": "minus",
  324. "aspirated": "minus",
  325. },
  326. "ɱ": {
  327. "place": "labiodental",
  328. "manner": "stop",
  329. "syllabic": "minus",
  330. "voice": "plus",
  331. "nasal": "plus",
  332. "retroflex": "minus",
  333. "lateral": "minus",
  334. "aspirated": "minus",
  335. },
  336. "n": {
  337. "place": "alveolar",
  338. "manner": "stop",
  339. "syllabic": "minus",
  340. "voice": "plus",
  341. "nasal": "plus",
  342. "retroflex": "minus",
  343. "lateral": "minus",
  344. "aspirated": "minus",
  345. },
  346. "ɳ": {
  347. "place": "retroflex",
  348. "manner": "stop",
  349. "syllabic": "minus",
  350. "voice": "plus",
  351. "nasal": "plus",
  352. "retroflex": "plus",
  353. "lateral": "minus",
  354. "aspirated": "minus",
  355. },
  356. "ɲ": {
  357. "place": "palatal",
  358. "manner": "stop",
  359. "syllabic": "minus",
  360. "voice": "plus",
  361. "nasal": "plus",
  362. "retroflex": "minus",
  363. "lateral": "minus",
  364. "aspirated": "minus",
  365. },
  366. "ŋ": {
  367. "place": "velar",
  368. "manner": "stop",
  369. "syllabic": "minus",
  370. "voice": "plus",
  371. "nasal": "plus",
  372. "retroflex": "minus",
  373. "lateral": "minus",
  374. "aspirated": "minus",
  375. },
  376. "ɴ": {
  377. "place": "uvular",
  378. "manner": "stop",
  379. "syllabic": "minus",
  380. "voice": "plus",
  381. "nasal": "plus",
  382. "retroflex": "minus",
  383. "lateral": "minus",
  384. "aspirated": "minus",
  385. },
  386. "N": {
  387. "place": "uvular",
  388. "manner": "stop",
  389. "syllabic": "minus",
  390. "voice": "plus",
  391. "nasal": "plus",
  392. "retroflex": "minus",
  393. "lateral": "minus",
  394. "aspirated": "minus",
  395. },
  396. "ʙ": {
  397. "place": "bilabial",
  398. "manner": "trill",
  399. "syllabic": "minus",
  400. "voice": "plus",
  401. "nasal": "minus",
  402. "retroflex": "minus",
  403. "lateral": "minus",
  404. "aspirated": "minus",
  405. },
  406. "B": {
  407. "place": "bilabial",
  408. "manner": "trill",
  409. "syllabic": "minus",
  410. "voice": "plus",
  411. "nasal": "minus",
  412. "retroflex": "minus",
  413. "lateral": "minus",
  414. "aspirated": "minus",
  415. },
  416. "r": {
  417. "place": "alveolar",
  418. "manner": "trill",
  419. "syllabic": "minus",
  420. "voice": "plus",
  421. "nasal": "minus",
  422. "retroflex": "plus",
  423. "lateral": "minus",
  424. "aspirated": "minus",
  425. },
  426. "ʀ": {
  427. "place": "uvular",
  428. "manner": "trill",
  429. "syllabic": "minus",
  430. "voice": "plus",
  431. "nasal": "minus",
  432. "retroflex": "minus",
  433. "lateral": "minus",
  434. "aspirated": "minus",
  435. },
  436. "R": {
  437. "place": "uvular",
  438. "manner": "trill",
  439. "syllabic": "minus",
  440. "voice": "plus",
  441. "nasal": "minus",
  442. "retroflex": "minus",
  443. "lateral": "minus",
  444. "aspirated": "minus",
  445. },
  446. "ɾ": {
  447. "place": "alveolar",
  448. "manner": "tap",
  449. "syllabic": "minus",
  450. "voice": "plus",
  451. "nasal": "minus",
  452. "retroflex": "minus",
  453. "lateral": "minus",
  454. "aspirated": "minus",
  455. },
  456. "ɽ": {
  457. "place": "retroflex",
  458. "manner": "tap",
  459. "syllabic": "minus",
  460. "voice": "plus",
  461. "nasal": "minus",
  462. "retroflex": "plus",
  463. "lateral": "minus",
  464. "aspirated": "minus",
  465. },
  466. "ɸ": {
  467. "place": "bilabial",
  468. "manner": "fricative",
  469. "syllabic": "minus",
  470. "voice": "minus",
  471. "nasal": "minus",
  472. "retroflex": "minus",
  473. "lateral": "minus",
  474. "aspirated": "minus",
  475. },
  476. "β": {
  477. "place": "bilabial",
  478. "manner": "fricative",
  479. "syllabic": "minus",
  480. "voice": "plus",
  481. "nasal": "minus",
  482. "retroflex": "minus",
  483. "lateral": "minus",
  484. "aspirated": "minus",
  485. },
  486. "f": {
  487. "place": "labiodental",
  488. "manner": "fricative",
  489. "syllabic": "minus",
  490. "voice": "minus",
  491. "nasal": "minus",
  492. "retroflex": "minus",
  493. "lateral": "minus",
  494. "aspirated": "minus",
  495. },
  496. "v": {
  497. "place": "labiodental",
  498. "manner": "fricative",
  499. "syllabic": "minus",
  500. "voice": "plus",
  501. "nasal": "minus",
  502. "retroflex": "minus",
  503. "lateral": "minus",
  504. "aspirated": "minus",
  505. },
  506. "θ": {
  507. "place": "dental",
  508. "manner": "fricative",
  509. "syllabic": "minus",
  510. "voice": "minus",
  511. "nasal": "minus",
  512. "retroflex": "minus",
  513. "lateral": "minus",
  514. "aspirated": "minus",
  515. },
  516. "ð": {
  517. "place": "dental",
  518. "manner": "fricative",
  519. "syllabic": "minus",
  520. "voice": "plus",
  521. "nasal": "minus",
  522. "retroflex": "minus",
  523. "lateral": "minus",
  524. "aspirated": "minus",
  525. },
  526. "s": {
  527. "place": "alveolar",
  528. "manner": "fricative",
  529. "syllabic": "minus",
  530. "voice": "minus",
  531. "nasal": "minus",
  532. "retroflex": "minus",
  533. "lateral": "minus",
  534. "aspirated": "minus",
  535. },
  536. "z": {
  537. "place": "alveolar",
  538. "manner": "fricative",
  539. "syllabic": "minus",
  540. "voice": "plus",
  541. "nasal": "minus",
  542. "retroflex": "minus",
  543. "lateral": "minus",
  544. "aspirated": "minus",
  545. },
  546. "ʃ": {
  547. "place": "palato-alveolar",
  548. "manner": "fricative",
  549. "syllabic": "minus",
  550. "voice": "minus",
  551. "nasal": "minus",
  552. "retroflex": "minus",
  553. "lateral": "minus",
  554. "aspirated": "minus",
  555. },
  556. "ʒ": {
  557. "place": "palato-alveolar",
  558. "manner": "fricative",
  559. "syllabic": "minus",
  560. "voice": "plus",
  561. "nasal": "minus",
  562. "retroflex": "minus",
  563. "lateral": "minus",
  564. "aspirated": "minus",
  565. },
  566. "ʂ": {
  567. "place": "retroflex",
  568. "manner": "fricative",
  569. "syllabic": "minus",
  570. "voice": "minus",
  571. "nasal": "minus",
  572. "retroflex": "plus",
  573. "lateral": "minus",
  574. "aspirated": "minus",
  575. },
  576. "ʐ": {
  577. "place": "retroflex",
  578. "manner": "fricative",
  579. "syllabic": "minus",
  580. "voice": "plus",
  581. "nasal": "minus",
  582. "retroflex": "plus",
  583. "lateral": "minus",
  584. "aspirated": "minus",
  585. },
  586. "ç": {
  587. "place": "palatal",
  588. "manner": "fricative",
  589. "syllabic": "minus",
  590. "voice": "minus",
  591. "nasal": "minus",
  592. "retroflex": "minus",
  593. "lateral": "minus",
  594. "aspirated": "minus",
  595. },
  596. "ʝ": {
  597. "place": "palatal",
  598. "manner": "fricative",
  599. "syllabic": "minus",
  600. "voice": "plus",
  601. "nasal": "minus",
  602. "retroflex": "minus",
  603. "lateral": "minus",
  604. "aspirated": "minus",
  605. },
  606. "x": {
  607. "place": "velar",
  608. "manner": "fricative",
  609. "syllabic": "minus",
  610. "voice": "minus",
  611. "nasal": "minus",
  612. "retroflex": "minus",
  613. "lateral": "minus",
  614. "aspirated": "minus",
  615. },
  616. "ɣ": {
  617. "place": "velar",
  618. "manner": "fricative",
  619. "syllabic": "minus",
  620. "voice": "plus",
  621. "nasal": "minus",
  622. "retroflex": "minus",
  623. "lateral": "minus",
  624. "aspirated": "minus",
  625. },
  626. "χ": {
  627. "place": "uvular",
  628. "manner": "fricative",
  629. "syllabic": "minus",
  630. "voice": "minus",
  631. "nasal": "minus",
  632. "retroflex": "minus",
  633. "lateral": "minus",
  634. "aspirated": "minus",
  635. },
  636. "ʁ": {
  637. "place": "uvular",
  638. "manner": "fricative",
  639. "syllabic": "minus",
  640. "voice": "plus",
  641. "nasal": "minus",
  642. "retroflex": "minus",
  643. "lateral": "minus",
  644. "aspirated": "minus",
  645. },
  646. "ħ": {
  647. "place": "pharyngeal",
  648. "manner": "fricative",
  649. "syllabic": "minus",
  650. "voice": "minus",
  651. "nasal": "minus",
  652. "retroflex": "minus",
  653. "lateral": "minus",
  654. "aspirated": "minus",
  655. },
  656. "ʕ": {
  657. "place": "pharyngeal",
  658. "manner": "fricative",
  659. "syllabic": "minus",
  660. "voice": "plus",
  661. "nasal": "minus",
  662. "retroflex": "minus",
  663. "lateral": "minus",
  664. "aspirated": "minus",
  665. },
  666. "h": {
  667. "place": "glottal",
  668. "manner": "fricative",
  669. "syllabic": "minus",
  670. "voice": "minus",
  671. "nasal": "minus",
  672. "retroflex": "minus",
  673. "lateral": "minus",
  674. "aspirated": "minus",
  675. },
  676. "ɦ": {
  677. "place": "glottal",
  678. "manner": "fricative",
  679. "syllabic": "minus",
  680. "voice": "plus",
  681. "nasal": "minus",
  682. "retroflex": "minus",
  683. "lateral": "minus",
  684. "aspirated": "minus",
  685. },
  686. "ɬ": {
  687. "place": "alveolar",
  688. "manner": "fricative",
  689. "syllabic": "minus",
  690. "voice": "minus",
  691. "nasal": "minus",
  692. "retroflex": "minus",
  693. "lateral": "plus",
  694. "aspirated": "minus",
  695. },
  696. "ɮ": {
  697. "place": "alveolar",
  698. "manner": "fricative",
  699. "syllabic": "minus",
  700. "voice": "plus",
  701. "nasal": "minus",
  702. "retroflex": "minus",
  703. "lateral": "plus",
  704. "aspirated": "minus",
  705. },
  706. "ʋ": {
  707. "place": "labiodental",
  708. "manner": "approximant",
  709. "syllabic": "minus",
  710. "voice": "plus",
  711. "nasal": "minus",
  712. "retroflex": "minus",
  713. "lateral": "minus",
  714. "aspirated": "minus",
  715. },
  716. "ɹ": {
  717. "place": "alveolar",
  718. "manner": "approximant",
  719. "syllabic": "minus",
  720. "voice": "plus",
  721. "nasal": "minus",
  722. "retroflex": "minus",
  723. "lateral": "minus",
  724. "aspirated": "minus",
  725. },
  726. "ɻ": {
  727. "place": "retroflex",
  728. "manner": "approximant",
  729. "syllabic": "minus",
  730. "voice": "plus",
  731. "nasal": "minus",
  732. "retroflex": "plus",
  733. "lateral": "minus",
  734. "aspirated": "minus",
  735. },
  736. "j": {
  737. "place": "palatal",
  738. "manner": "approximant",
  739. "syllabic": "minus",
  740. "voice": "plus",
  741. "nasal": "minus",
  742. "retroflex": "minus",
  743. "lateral": "minus",
  744. "aspirated": "minus",
  745. },
  746. "ɰ": {
  747. "place": "velar",
  748. "manner": "approximant",
  749. "syllabic": "minus",
  750. "voice": "plus",
  751. "nasal": "minus",
  752. "retroflex": "minus",
  753. "lateral": "minus",
  754. "aspirated": "minus",
  755. },
  756. "l": {
  757. "place": "alveolar",
  758. "manner": "approximant",
  759. "syllabic": "minus",
  760. "voice": "plus",
  761. "nasal": "minus",
  762. "retroflex": "minus",
  763. "lateral": "plus",
  764. "aspirated": "minus",
  765. },
  766. "w": {
  767. "place": "labiovelar",
  768. "manner": "approximant",
  769. "syllabic": "minus",
  770. "voice": "plus",
  771. "nasal": "minus",
  772. "retroflex": "minus",
  773. "lateral": "minus",
  774. "aspirated": "minus",
  775. },
  776. # Vowels
  777. "i": {
  778. "place": "vowel",
  779. "manner": "vowel2",
  780. "syllabic": "plus",
  781. "voice": "plus",
  782. "nasal": "minus",
  783. "retroflex": "minus",
  784. "lateral": "minus",
  785. "high": "high",
  786. "back": "front",
  787. "round": "minus",
  788. "long": "minus",
  789. "aspirated": "minus",
  790. },
  791. "y": {
  792. "place": "vowel",
  793. "manner": "vowel2",
  794. "syllabic": "plus",
  795. "voice": "plus",
  796. "nasal": "minus",
  797. "retroflex": "minus",
  798. "lateral": "minus",
  799. "high": "high",
  800. "back": "front",
  801. "round": "plus",
  802. "long": "minus",
  803. "aspirated": "minus",
  804. },
  805. "e": {
  806. "place": "vowel",
  807. "manner": "vowel2",
  808. "syllabic": "plus",
  809. "voice": "plus",
  810. "nasal": "minus",
  811. "retroflex": "minus",
  812. "lateral": "minus",
  813. "high": "mid",
  814. "back": "front",
  815. "round": "minus",
  816. "long": "minus",
  817. "aspirated": "minus",
  818. },
  819. "E": {
  820. "place": "vowel",
  821. "manner": "vowel2",
  822. "syllabic": "plus",
  823. "voice": "plus",
  824. "nasal": "minus",
  825. "retroflex": "minus",
  826. "lateral": "minus",
  827. "high": "mid",
  828. "back": "front",
  829. "round": "minus",
  830. "long": "plus",
  831. "aspirated": "minus",
  832. },
  833. "ø": {
  834. "place": "vowel",
  835. "manner": "vowel2",
  836. "syllabic": "plus",
  837. "voice": "plus",
  838. "nasal": "minus",
  839. "retroflex": "minus",
  840. "lateral": "minus",
  841. "high": "mid",
  842. "back": "front",
  843. "round": "plus",
  844. "long": "minus",
  845. "aspirated": "minus",
  846. },
  847. "ɛ": {
  848. "place": "vowel",
  849. "manner": "vowel2",
  850. "syllabic": "plus",
  851. "voice": "plus",
  852. "nasal": "minus",
  853. "retroflex": "minus",
  854. "lateral": "minus",
  855. "high": "mid",
  856. "back": "front",
  857. "round": "minus",
  858. "long": "minus",
  859. "aspirated": "minus",
  860. },
  861. "œ": {
  862. "place": "vowel",
  863. "manner": "vowel2",
  864. "syllabic": "plus",
  865. "voice": "plus",
  866. "nasal": "minus",
  867. "retroflex": "minus",
  868. "lateral": "minus",
  869. "high": "mid",
  870. "back": "front",
  871. "round": "plus",
  872. "long": "minus",
  873. "aspirated": "minus",
  874. },
  875. "æ": {
  876. "place": "vowel",
  877. "manner": "vowel2",
  878. "syllabic": "plus",
  879. "voice": "plus",
  880. "nasal": "minus",
  881. "retroflex": "minus",
  882. "lateral": "minus",
  883. "high": "low",
  884. "back": "front",
  885. "round": "minus",
  886. "long": "minus",
  887. "aspirated": "minus",
  888. },
  889. "a": {
  890. "place": "vowel",
  891. "manner": "vowel2",
  892. "syllabic": "plus",
  893. "voice": "plus",
  894. "nasal": "minus",
  895. "retroflex": "minus",
  896. "lateral": "minus",
  897. "high": "low",
  898. "back": "front",
  899. "round": "minus",
  900. "long": "minus",
  901. "aspirated": "minus",
  902. },
  903. "A": {
  904. "place": "vowel",
  905. "manner": "vowel2",
  906. "syllabic": "plus",
  907. "voice": "plus",
  908. "nasal": "minus",
  909. "retroflex": "minus",
  910. "lateral": "minus",
  911. "high": "low",
  912. "back": "front",
  913. "round": "minus",
  914. "long": "plus",
  915. "aspirated": "minus",
  916. },
  917. "ɨ": {
  918. "place": "vowel",
  919. "manner": "vowel2",
  920. "syllabic": "plus",
  921. "voice": "plus",
  922. "nasal": "minus",
  923. "retroflex": "minus",
  924. "lateral": "minus",
  925. "high": "high",
  926. "back": "central",
  927. "round": "minus",
  928. "long": "minus",
  929. "aspirated": "minus",
  930. },
  931. "ʉ": {
  932. "place": "vowel",
  933. "manner": "vowel2",
  934. "syllabic": "plus",
  935. "voice": "plus",
  936. "nasal": "minus",
  937. "retroflex": "minus",
  938. "lateral": "minus",
  939. "high": "high",
  940. "back": "central",
  941. "round": "plus",
  942. "long": "minus",
  943. "aspirated": "minus",
  944. },
  945. "ə": {
  946. "place": "vowel",
  947. "manner": "vowel2",
  948. "syllabic": "plus",
  949. "voice": "plus",
  950. "nasal": "minus",
  951. "retroflex": "minus",
  952. "lateral": "minus",
  953. "high": "mid",
  954. "back": "central",
  955. "round": "minus",
  956. "long": "minus",
  957. "aspirated": "minus",
  958. },
  959. "u": {
  960. "place": "vowel",
  961. "manner": "vowel2",
  962. "syllabic": "plus",
  963. "voice": "plus",
  964. "nasal": "minus",
  965. "retroflex": "minus",
  966. "lateral": "minus",
  967. "high": "high",
  968. "back": "back",
  969. "round": "plus",
  970. "long": "minus",
  971. "aspirated": "minus",
  972. },
  973. "U": {
  974. "place": "vowel",
  975. "manner": "vowel2",
  976. "syllabic": "plus",
  977. "voice": "plus",
  978. "nasal": "minus",
  979. "retroflex": "minus",
  980. "lateral": "minus",
  981. "high": "high",
  982. "back": "back",
  983. "round": "plus",
  984. "long": "plus",
  985. "aspirated": "minus",
  986. },
  987. "o": {
  988. "place": "vowel",
  989. "manner": "vowel2",
  990. "syllabic": "plus",
  991. "voice": "plus",
  992. "nasal": "minus",
  993. "retroflex": "minus",
  994. "lateral": "minus",
  995. "high": "mid",
  996. "back": "back",
  997. "round": "plus",
  998. "long": "minus",
  999. "aspirated": "minus",
  1000. },
  1001. "O": {
  1002. "place": "vowel",
  1003. "manner": "vowel2",
  1004. "syllabic": "plus",
  1005. "voice": "plus",
  1006. "nasal": "minus",
  1007. "retroflex": "minus",
  1008. "lateral": "minus",
  1009. "high": "mid",
  1010. "back": "back",
  1011. "round": "plus",
  1012. "long": "plus",
  1013. "aspirated": "minus",
  1014. },
  1015. "ɔ": {
  1016. "place": "vowel",
  1017. "manner": "vowel2",
  1018. "syllabic": "plus",
  1019. "voice": "plus",
  1020. "nasal": "minus",
  1021. "retroflex": "minus",
  1022. "lateral": "minus",
  1023. "high": "mid",
  1024. "back": "back",
  1025. "round": "plus",
  1026. "long": "minus",
  1027. "aspirated": "minus",
  1028. },
  1029. "ɒ": {
  1030. "place": "vowel",
  1031. "manner": "vowel2",
  1032. "syllabic": "plus",
  1033. "voice": "plus",
  1034. "nasal": "minus",
  1035. "retroflex": "minus",
  1036. "lateral": "minus",
  1037. "high": "low",
  1038. "back": "back",
  1039. "round": "minus",
  1040. "long": "minus",
  1041. "aspirated": "minus",
  1042. },
  1043. "I": {
  1044. "place": "vowel",
  1045. "manner": "vowel2",
  1046. "syllabic": "plus",
  1047. "voice": "plus",
  1048. "nasal": "minus",
  1049. "retroflex": "minus",
  1050. "lateral": "minus",
  1051. "high": "high",
  1052. "back": "front",
  1053. "round": "minus",
  1054. "long": "plus",
  1055. "aspirated": "minus",
  1056. },
  1057. }
  1058. # === Algorithm ===
  1059. def align(str1, str2, epsilon=0):
  1060. """
  1061. Compute the alignment of two phonetic strings.
  1062. :type str1, str2: str
  1063. :param str1, str2: Two strings to be aligned
  1064. :type epsilon: float (0.0 to 1.0)
  1065. :param epsilon: Adjusts threshold similarity score for near-optimal alignments
  1066. :rtpye: list(list(tuple(str, str)))
  1067. :return: Alignment(s) of str1 and str2
  1068. (Kondrak 2002: 51)
  1069. """
  1070. if np is None:
  1071. raise ImportError("You need numpy in order to use the align function")
  1072. assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
  1073. m = len(str1)
  1074. n = len(str2)
  1075. # This includes Kondrak's initialization of row 0 and column 0 to all 0s.
  1076. S = np.zeros((m + 1, n + 1), dtype=float)
  1077. # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense,
  1078. # and breaks array and string indices. Make sure they never get chosen
  1079. # by setting them to -inf.
  1080. for i in range(1, m + 1):
  1081. for j in range(1, n + 1):
  1082. edit1 = S[i - 1, j] + sigma_skip(str1[i - 1])
  1083. edit2 = S[i, j - 1] + sigma_skip(str2[j - 1])
  1084. edit3 = S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1])
  1085. if i > 1:
  1086. edit4 = S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i])
  1087. else:
  1088. edit4 = -inf
  1089. if j > 1:
  1090. edit5 = S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j])
  1091. else:
  1092. edit5 = -inf
  1093. S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0)
  1094. T = (1 - epsilon) * np.amax(S) # Threshold score for near-optimal alignments
  1095. alignments = []
  1096. for i in range(1, m + 1):
  1097. for j in range(1, n + 1):
  1098. if S[i, j] >= T:
  1099. alignments.append(_retrieve(i, j, 0, S, T, str1, str2, []))
  1100. return alignments
  1101. def _retrieve(i, j, s, S, T, str1, str2, out):
  1102. """
  1103. Retrieve the path through the similarity matrix S starting at (i, j).
  1104. :rtype: list(tuple(str, str))
  1105. :return: Alignment of str1 and str2
  1106. """
  1107. if S[i, j] == 0:
  1108. return out
  1109. else:
  1110. if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T:
  1111. out.insert(0, (str1[i - 1], str2[j - 2 : j]))
  1112. _retrieve(
  1113. i - 1,
  1114. j - 2,
  1115. s + sigma_exp(str1[i - 1], str2[j - 2 : j]),
  1116. S,
  1117. T,
  1118. str1,
  1119. str2,
  1120. out,
  1121. )
  1122. elif (
  1123. i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
  1124. ):
  1125. out.insert(0, (str1[i - 2 : i], str2[j - 1]))
  1126. _retrieve(
  1127. i - 2,
  1128. j - 1,
  1129. s + sigma_exp(str2[j - 1], str1[i - 2 : i]),
  1130. S,
  1131. T,
  1132. str1,
  1133. str2,
  1134. out,
  1135. )
  1136. elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T:
  1137. out.insert(0, ("-", str2[j - 1]))
  1138. _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out)
  1139. elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T:
  1140. out.insert(0, (str1[i - 1], "-"))
  1141. _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out)
  1142. elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T:
  1143. out.insert(0, (str1[i - 1], str2[j - 1]))
  1144. _retrieve(
  1145. i - 1,
  1146. j - 1,
  1147. s + sigma_sub(str1[i - 1], str2[j - 1]),
  1148. S,
  1149. T,
  1150. str1,
  1151. str2,
  1152. out,
  1153. )
  1154. return out
  1155. def sigma_skip(p):
  1156. """
  1157. Returns score of an indel of P.
  1158. (Kondrak 2002: 54)
  1159. """
  1160. return C_skip
  1161. def sigma_sub(p, q):
  1162. """
  1163. Returns score of a substitution of P with Q.
  1164. (Kondrak 2002: 54)
  1165. """
  1166. return C_sub - delta(p, q) - V(p) - V(q)
  1167. def sigma_exp(p, q):
  1168. """
  1169. Returns score of an expansion/compression.
  1170. (Kondrak 2002: 54)
  1171. """
  1172. q1 = q[0]
  1173. q2 = q[1]
  1174. return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2))
  1175. def delta(p, q):
  1176. """
  1177. Return weighted sum of difference between P and Q.
  1178. (Kondrak 2002: 54)
  1179. """
  1180. features = R(p, q)
  1181. total = 0
  1182. for f in features:
  1183. total += diff(p, q, f) * salience[f]
  1184. return total
  1185. def diff(p, q, f):
  1186. """
  1187. Returns difference between phonetic segments P and Q for feature F.
  1188. (Kondrak 2002: 52, 54)
  1189. """
  1190. p_features, q_features = feature_matrix[p], feature_matrix[q]
  1191. return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]])
  1192. def R(p, q):
  1193. """
  1194. Return relevant features for segment comparsion.
  1195. (Kondrak 2002: 54)
  1196. """
  1197. if p in consonants or q in consonants:
  1198. return R_c
  1199. return R_v
  1200. def V(p):
  1201. """
  1202. Return vowel weight if P is vowel.
  1203. (Kondrak 2002: 54)
  1204. """
  1205. if p in consonants:
  1206. return 0
  1207. return C_vwl
  1208. # === Test ===
  1209. def demo():
  1210. """
  1211. A demonstration of the result of aligning phonetic sequences
  1212. used in Kondrak's (2002) dissertation.
  1213. """
  1214. data = [pair.split(",") for pair in cognate_data.split("\n")]
  1215. for pair in data:
  1216. alignment = align(pair[0], pair[1])[0]
  1217. alignment = ["({}, {})".format(a[0], a[1]) for a in alignment]
  1218. alignment = " ".join(alignment)
  1219. print("{} ~ {} : {}".format(pair[0], pair[1], alignment))
# Input for demo(): one pair of phonetic transcriptions per line, the two
# members separated by a comma.  demo() splits this text on newlines and then
# on commas, so the literal must not start or end with a blank line.
# Per demo()'s docstring these are the sequences used in Kondrak's (2002)
# dissertation.
cognate_data = """jo,ʒə
tu,ty
nosotros,nu
kjen,ki
ke,kwa
todos,tu
una,ən
dos,dø
tres,trwa
ombre,om
arbol,arbrə
pluma,plym
kabeθa,kap
boka,buʃ
pje,pje
koraθon,kœr
ber,vwar
benir,vənir
deθir,dir
pobre,povrə
ðis,dIzes
ðæt,das
wat,vas
nat,nixt
loŋ,laŋ
mæn,man
fleʃ,flajʃ
bləd,blyt
feðər,fEdər
hær,hAr
ir,Or
aj,awgə
nowz,nAzə
mawθ,munt
təŋ,tsuŋə
fut,fys
nij,knI
hænd,hant
hart,herts
livər,lEbər
ænd,ante
æt,ad
blow,flAre
ir,awris
ijt,edere
fiʃ,piʃkis
flow,fluere
staɾ,stella
ful,plenus
græs,gramen
hart,kordis
horn,korny
aj,ego
nij,genU
məðər,mAter
mawntən,mons
nejm,nomen
njuw,nowus
wən,unus
rawnd,rotundus
sow,suere
sit,sedere
θrij,tres
tuwθ,dentis
θin,tenwis
kinwawa,kenuaʔ
nina,nenah
napewa,napɛw
wapimini,wapemen
namesa,namɛʔs
okimawa,okemaw
ʃiʃipa,seʔsep
ahkohkwa,ahkɛh
pematesiweni,pematesewen
asenja,aʔsɛn"""

# Run the demonstration when this file is executed as a script.
if __name__ == "__main__":
    demo()