|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
# Natural Language Toolkit: ALINE
|
|
|
|
|
#
|
|
|
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
|
|
|
# Author: Greg Kondrak <gkondrak@ualberta.ca>
|
|
|
|
|
# Geoff Bacon <bacon@berkeley.edu> (Python port)
|
|
|
|
|
# URL: <http://nltk.org/>
|
|
|
|
|
# For license information, see LICENSE.TXT
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
ALINE
|
|
|
|
|
http://webdocs.cs.ualberta.ca/~kondrak/
|
|
|
|
|
Copyright 2002 by Grzegorz Kondrak.
|
|
|
|
|
|
|
|
|
|
ALINE is an algorithm for aligning phonetic sequences, described in [1].
|
|
|
|
|
This module is a port of Kondrak's (2002) ALINE. It provides functions for
|
|
|
|
|
phonetic sequence alignment and similarity analysis. These are useful in
|
|
|
|
|
historical linguistics, sociolinguistics and synchronic phonology.
|
|
|
|
|
|
|
|
|
|
ALINE has parameters that can be tuned for desired output. These parameters are:
|
|
|
|
|
- C_skip, C_sub, C_exp, C_vwl
|
|
|
|
|
- Salience weights
|
|
|
|
|
- Segmental features
|
|
|
|
|
|
|
|
|
|
In this implementation, some parameters have been changed from their default
|
|
|
|
|
values as described in [1], in order to replicate published results. All changes
|
|
|
|
|
are noted in comments.
|
|
|
|
|
|
|
|
|
|
Example usage
|
|
|
|
|
-------------
|
|
|
|
|
|
|
|
|
|
# Get optimal alignment of two phonetic sequences
|
|
|
|
|
|
|
|
|
|
>>> align('θin', 'tenwis') # doctest: +SKIP
|
|
|
|
|
[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
|
|
|
|
|
|
|
|
|
|
[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
University of Toronto, 2002.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import numpy as np
|
|
|
|
|
except ImportError:
|
|
|
|
|
np = None
|
|
|
|
|
|
|
|
|
|
# === Constants ===
|
|
|
|
|
|
|
|
|
|
inf = float("inf")

# Default values for maximum similarity scores (Kondrak 2002: 54).
# C_skip is a *penalty*: it is added to the running score for every indel in
# ``align`` and ``_retrieve``, so it has to be negative.  A positive value
# rewards skipping segments and drags unrelated leading/trailing material into
# the reported alignment.
C_skip = -10  # Indels
C_sub = 35  # Substitutions
C_exp = 45  # Expansions/compressions
C_vwl = 5  # Vowel/consonant relative weight (decreased from 10)
|
|
|
|
|
|
|
|
|
|
# Segments treated as consonants by ``R`` and ``V``.  Every entry must be a
# single character, because membership is tested against individual segments
# of the input strings (an entry with stray whitespace can never match).
consonants = [
    "B",
    "N",
    "R",
    "b",
    "c",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "t",
    "v",
    "x",
    "z",
    "ç",
    "ð",
    "ħ",
    "ŋ",
    "ɖ",
    "ɟ",
    "ɢ",
    "ɣ",
    "ɦ",
    "ɬ",
    "ɮ",
    "ɰ",
    "ɱ",
    "ɲ",
    "ɳ",
    "ɴ",
    "ɸ",
    "ɹ",
    "ɻ",
    "ɽ",
    "ɾ",
    "ʀ",
    "ʁ",
    "ʂ",
    "ʃ",
    "ʈ",
    "ʋ",
    "ʐ",  # was "ʐ " (trailing space), which could never match a segment
    "ʒ",
    "ʔ",
    "ʕ",
    "ʙ",
    "ʝ",
    "β",
    "θ",
    "χ",
    "w",
]
|
|
|
|
|
|
|
|
|
|
# Relevant features for comparing consonants and vowels.
# R_c is the feature set returned by ``R`` as soon as either segment is a
# consonant.
R_c = [
    "aspirated", "lateral", "manner", "nasal",
    "place", "retroflex", "syllabic", "voice",
]
|
|
|
|
|
# R_v is the feature set returned by ``R`` when neither segment is a consonant.
# 'high' taken out of R_v because same as manner
R_v = [
    "back", "lateral", "long", "manner", "nasal",
    "place", "retroflex", "round", "syllabic", "voice",
]
|
|
|
|
|
|
|
|
|
|
# Flattened feature matrix (Kondrak 2002: 56).
# Maps every feature *value name* used in ``feature_matrix`` to its numeric
# value; ``diff`` subtracts two of these numbers.  The table is filled one
# feature group at a time, in the same key order as a single literal would give.
similarity_matrix = {}

# place
similarity_matrix.update(
    {
        "bilabial": 1.0,
        "labiodental": 0.95,
        "dental": 0.9,
        "alveolar": 0.85,
        "retroflex": 0.8,
        "palato-alveolar": 0.75,
        "palatal": 0.7,
        "velar": 0.6,
        "uvular": 0.5,
        "pharyngeal": 0.3,
        "glottal": 0.1,
        "labiovelar": 1.0,
        "vowel": -1.0,  # added 'vowel'
    }
)

# manner
similarity_matrix.update(
    {
        "stop": 1.0,
        "affricate": 0.9,
        "fricative": 0.85,  # increased fricative from 0.8
        "trill": 0.7,
        "tap": 0.65,
        "approximant": 0.6,
        "high vowel": 0.4,
        "mid vowel": 0.2,
        "low vowel": 0.0,
        "vowel2": 0.5,  # added vowel
    }
)

# high
similarity_matrix.update({"high": 1.0, "mid": 0.5, "low": 0.0})

# back
similarity_matrix.update({"front": 1.0, "central": 0.5, "back": 0.0})

# binary features
similarity_matrix.update({"plus": 1.0, "minus": 0.0})
|
|
|
|
|
|
|
|
|
|
# Relative weights of phonetic features (Kondrak 2002: 55).
# ``delta`` multiplies each per-feature difference by the weight stored here.
salience = dict(
    syllabic=5,
    place=40,
    manner=50,
    voice=5,  # decreased from 10
    nasal=20,  # increased from 10
    retroflex=10,
    lateral=10,
    aspirated=5,
    long=0,  # decreased from 1
    high=3,  # decreased from 5
    back=2,  # decreased from 5
    round=2,  # decreased from 5
)
|
|
|
|
|
|
|
|
|
|
def _consonant(
    place, manner, voice, nasal="minus", retroflex="minus", lateral="minus"
):
    """Build the feature dict of a consonant (always non-syllabic, unaspirated)."""
    return {
        "place": place,
        "manner": manner,
        "syllabic": "minus",
        "voice": voice,
        "nasal": nasal,
        "retroflex": retroflex,
        "lateral": lateral,
        "aspirated": "minus",
    }


def _vowel(high, back, rounded, length):
    """Build the feature dict of a vowel (always syllabic, voiced, oral)."""
    return {
        "place": "vowel",
        "manner": "vowel2",
        "syllabic": "plus",
        "voice": "plus",
        "nasal": "minus",
        "retroflex": "minus",
        "lateral": "minus",
        "high": high,
        "back": back,
        "round": rounded,
        "long": length,
        "aspirated": "minus",
    }


# (Kondrak 2002: 59-60)
# Segment -> {feature name -> feature value name}.  The value names are the
# keys of ``similarity_matrix``.
feature_matrix = {
    # Consonants
    # -- stops
    "p": _consonant("bilabial", "stop", "minus"),
    "b": _consonant("bilabial", "stop", "plus"),
    "t": _consonant("alveolar", "stop", "minus"),
    "d": _consonant("alveolar", "stop", "plus"),
    "ʈ": _consonant("retroflex", "stop", "minus", retroflex="plus"),
    "ɖ": _consonant("retroflex", "stop", "plus", retroflex="plus"),
    "c": _consonant("palatal", "stop", "minus"),
    "ɟ": _consonant("palatal", "stop", "plus"),
    "k": _consonant("velar", "stop", "minus"),
    "g": _consonant("velar", "stop", "plus"),
    "q": _consonant("uvular", "stop", "minus"),
    "ɢ": _consonant("uvular", "stop", "plus"),
    "ʔ": _consonant("glottal", "stop", "minus"),
    # -- nasals (encoded as voiced stops with nasal=plus)
    "m": _consonant("bilabial", "stop", "plus", nasal="plus"),
    "ɱ": _consonant("labiodental", "stop", "plus", nasal="plus"),
    "n": _consonant("alveolar", "stop", "plus", nasal="plus"),
    "ɳ": _consonant("retroflex", "stop", "plus", nasal="plus", retroflex="plus"),
    "ɲ": _consonant("palatal", "stop", "plus", nasal="plus"),
    "ŋ": _consonant("velar", "stop", "plus", nasal="plus"),
    "ɴ": _consonant("uvular", "stop", "plus", nasal="plus"),
    "N": _consonant("uvular", "stop", "plus", nasal="plus"),
    # -- trills
    "ʙ": _consonant("bilabial", "trill", "plus"),
    "B": _consonant("bilabial", "trill", "plus"),
    "r": _consonant("alveolar", "trill", "plus", retroflex="plus"),
    "ʀ": _consonant("uvular", "trill", "plus"),
    "R": _consonant("uvular", "trill", "plus"),
    # -- taps
    "ɾ": _consonant("alveolar", "tap", "plus"),
    "ɽ": _consonant("retroflex", "tap", "plus", retroflex="plus"),
    # -- fricatives
    "ɸ": _consonant("bilabial", "fricative", "minus"),
    "β": _consonant("bilabial", "fricative", "plus"),
    "f": _consonant("labiodental", "fricative", "minus"),
    "v": _consonant("labiodental", "fricative", "plus"),
    "θ": _consonant("dental", "fricative", "minus"),
    "ð": _consonant("dental", "fricative", "plus"),
    "s": _consonant("alveolar", "fricative", "minus"),
    "z": _consonant("alveolar", "fricative", "plus"),
    "ʃ": _consonant("palato-alveolar", "fricative", "minus"),
    "ʒ": _consonant("palato-alveolar", "fricative", "plus"),
    "ʂ": _consonant("retroflex", "fricative", "minus", retroflex="plus"),
    "ʐ": _consonant("retroflex", "fricative", "plus", retroflex="plus"),
    "ç": _consonant("palatal", "fricative", "minus"),
    "ʝ": _consonant("palatal", "fricative", "plus"),
    "x": _consonant("velar", "fricative", "minus"),
    "ɣ": _consonant("velar", "fricative", "plus"),
    "χ": _consonant("uvular", "fricative", "minus"),
    "ʁ": _consonant("uvular", "fricative", "plus"),
    "ħ": _consonant("pharyngeal", "fricative", "minus"),
    "ʕ": _consonant("pharyngeal", "fricative", "plus"),
    "h": _consonant("glottal", "fricative", "minus"),
    "ɦ": _consonant("glottal", "fricative", "plus"),
    "ɬ": _consonant("alveolar", "fricative", "minus", lateral="plus"),
    "ɮ": _consonant("alveolar", "fricative", "plus", lateral="plus"),
    # -- approximants
    "ʋ": _consonant("labiodental", "approximant", "plus"),
    "ɹ": _consonant("alveolar", "approximant", "plus"),
    "ɻ": _consonant("retroflex", "approximant", "plus", retroflex="plus"),
    "j": _consonant("palatal", "approximant", "plus"),
    "ɰ": _consonant("velar", "approximant", "plus"),
    "l": _consonant("alveolar", "approximant", "plus", lateral="plus"),
    "w": _consonant("labiovelar", "approximant", "plus"),
    # Vowels: (high, back, round, long)
    "i": _vowel("high", "front", "minus", "minus"),
    "y": _vowel("high", "front", "plus", "minus"),
    "e": _vowel("mid", "front", "minus", "minus"),
    "E": _vowel("mid", "front", "minus", "plus"),
    "ø": _vowel("mid", "front", "plus", "minus"),
    "ɛ": _vowel("mid", "front", "minus", "minus"),
    "œ": _vowel("mid", "front", "plus", "minus"),
    "æ": _vowel("low", "front", "minus", "minus"),
    "a": _vowel("low", "front", "minus", "minus"),
    "A": _vowel("low", "front", "minus", "plus"),
    "ɨ": _vowel("high", "central", "minus", "minus"),
    "ʉ": _vowel("high", "central", "plus", "minus"),
    "ə": _vowel("mid", "central", "minus", "minus"),
    "u": _vowel("high", "back", "plus", "minus"),
    "U": _vowel("high", "back", "plus", "plus"),
    "o": _vowel("mid", "back", "plus", "minus"),
    "O": _vowel("mid", "back", "plus", "plus"),
    "ɔ": _vowel("mid", "back", "plus", "minus"),
    "ɒ": _vowel("low", "back", "minus", "minus"),
    "I": _vowel("high", "front", "minus", "plus"),
}
|
|
|
|
|
|
|
|
|
|
# === Algorithm ===
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def align(str1, str2, epsilon=0):
    """
    Align two phonetic strings and return every (near-)optimal alignment.

    :type str1, str2: str
    :param str1, str2: Two strings to be aligned
    :type epsilon: float (0.0 to 1.0)
    :param epsilon: Adjusts threshold similarity score for near-optimal alignments

    :rtype: list(list(tuple(str, str)))
    :return: Alignment(s) of str1 and str2

    (Kondrak 2002: 51)
    """
    if np is None:
        raise ImportError("You need numpy in order to use the align function")

    assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
    rows = len(str1) + 1
    cols = len(str2) + 1
    # Row 0 and column 0 stay at 0, which is Kondrak's initialization.
    S = np.zeros((rows, cols), dtype=float)

    for i in range(1, rows):
        for j in range(1, cols):
            candidates = [
                S[i - 1, j] + sigma_skip(str1[i - 1]),
                S[i, j - 1] + sigma_skip(str2[j - 1]),
                S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]),
            ]
            # Expansions/compressions consume two segments of one string, so
            # they only exist from the second row/column onwards; leaving them
            # out also keeps array and string indices in range.
            if i > 1:
                candidates.append(
                    S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i])
                )
            if j > 1:
                candidates.append(
                    S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j])
                )
            # The trailing 0 stops a cell's score from dropping below zero.
            candidates.append(0)
            S[i, j] = max(candidates)

    T = (1 - epsilon) * S.max()  # Threshold score for near-optimal alignments

    # One traceback per cell whose score reaches the threshold, in row-major
    # order.
    return [
        _retrieve(i, j, 0, S, T, str1, str2, [])
        for i in range(1, rows)
        for j in range(1, cols)
        if S[i, j] >= T
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _retrieve(i, j, s, S, T, str1, str2, out):
    """
    Retrieve the path through the similarity matrix S starting at (i, j).

    The path is traced backwards from cell (i, j) until a cell with score 0
    is reached or no edit operation keeps the path at or above the threshold.
    Each step is prepended to ``out``, so the finished list reads left to
    right.  Operations are tried in a fixed order and the first one that
    qualifies is taken: expansion of a ``str1`` segment into two ``str2``
    segments, compression of two ``str1`` segments into one ``str2`` segment,
    skip in ``str2``, skip in ``str1``, substitution.

    The traceback is a loop rather than a recursion so that its depth does not
    grow with the length of the alignment.

    :param i, j: Cell of ``S`` at which the traceback starts
    :param s: Score already accumulated along the path
    :param S: Similarity matrix filled in by ``align``
    :param T: Threshold score that a path has to reach
    :param str1, str2: The two strings being aligned
    :param out: List that receives the alignment; it is modified in place

    :rtype: list(tuple(str, str))
    :return: Alignment of str1 and str2 (the same object as ``out``)
    """
    while S[i, j] != 0:
        if j > 1:
            gain = sigma_exp(str1[i - 1], str2[j - 2 : j])
            if S[i - 1, j - 2] + gain + s >= T:
                out.insert(0, (str1[i - 1], str2[j - 2 : j]))
                i, j, s = i - 1, j - 2, s + gain
                continue

        if i > 1:
            gain = sigma_exp(str2[j - 1], str1[i - 2 : i])
            if S[i - 2, j - 1] + gain + s >= T:
                out.insert(0, (str1[i - 2 : i], str2[j - 1]))
                i, j, s = i - 2, j - 1, s + gain
                continue

        gain = sigma_skip(str2[j - 1])
        if S[i, j - 1] + gain + s >= T:
            out.insert(0, ("-", str2[j - 1]))
            j, s = j - 1, s + gain
            continue

        gain = sigma_skip(str1[i - 1])
        if S[i - 1, j] + gain + s >= T:
            out.insert(0, (str1[i - 1], "-"))
            i, s = i - 1, s + gain
            continue

        gain = sigma_sub(str1[i - 1], str2[j - 1])
        if S[i - 1, j - 1] + gain + s >= T:
            out.insert(0, (str1[i - 1], str2[j - 1]))
            i, j, s = i - 1, j - 1, s + gain
            continue

        # No operation reaches the threshold from here: stop the traceback.
        break

    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sigma_skip(p):
    """
    Score an insertion or deletion (indel) of the segment P.

    The score is the module constant ``C_skip``; the segment itself is not
    inspected.

    (Kondrak 2002: 54)
    """
    indel_score = C_skip
    return indel_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sigma_sub(p, q):
    """
    Score the substitution of segment P with segment Q.

    Starts from ``C_sub`` and subtracts the feature distance ``delta(p, q)``
    and the vowel weights ``V(p)`` and ``V(q)``.

    (Kondrak 2002: 54)
    """
    score = C_sub - delta(p, q)
    score -= V(p)
    score -= V(q)
    return score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sigma_exp(p, q):
    """
    Score an expansion/compression: the single segment P aligned with the
    two segments ``q[0]`` and ``q[1]``.

    Starts from ``C_exp`` and subtracts the feature distance between P and
    each of the two segments, the vowel weight of P, and the larger of the
    two segments' vowel weights.

    (Kondrak 2002: 54)
    """
    first, second = q[0], q[1]
    score = C_exp - delta(p, first)
    score -= delta(p, second)
    score -= V(p)
    score -= max(V(first), V(second))
    return score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def delta(p, q):
    """
    Return the salience-weighted sum of the feature differences between
    segments P and Q.

    Only the features selected by ``R(p, q)`` contribute; each feature's
    difference ``diff(p, q, f)`` is multiplied by ``salience[f]``.

    (Kondrak 2002: 54)
    """
    return sum(diff(p, q, f) * salience[f] for f in R(p, q))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def diff(p, q, f):
    """
    Return the difference between phonetic segments P and Q for feature F.

    Each segment's value for F is read from ``feature_matrix``, mapped to a
    number through ``similarity_matrix``, and the absolute difference of the
    two numbers is returned.

    (Kondrak 2002: 52, 54)
    """
    row_p = feature_matrix[p]
    row_q = feature_matrix[q]
    value_p = similarity_matrix[row_p[f]]
    value_q = similarity_matrix[row_q[f]]
    return abs(value_p - value_q)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def R(p, q):
    """
    Return the features relevant for comparing segments P and Q.

    ``R_c`` is returned when at least one of the two segments is in
    ``consonants``; ``R_v`` is returned otherwise.

    (Kondrak 2002: 54)
    """
    neither_is_consonant = p not in consonants and q not in consonants
    return R_v if neither_is_consonant else R_c
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def V(p):
    """
    Return the vowel weight of segment P.

    The weight is 0 when P is in ``consonants`` and ``C_vwl`` otherwise.

    (Kondrak 2002: 54)
    """
    return 0 if p in consonants else C_vwl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# === Test ===
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def demo():
    """
    Align every word pair listed in ``cognate_data`` and print the result.

    Each line of ``cognate_data`` holds two comma-separated sequences, the
    data used in Kondrak's (2002) dissertation. For each pair the first
    alignment returned by ``align`` is printed as a space-separated run of
    ``(segment, segment)`` pairs.
    """
    for line in cognate_data.split("\n"):
        fields = line.split(",")
        first, second = fields[0], fields[1]
        best = align(first, second)[0]
        rendered = " ".join("({}, {})".format(seg[0], seg[1]) for seg in best)
        print("{} ~ {} : {}".format(first, second, rendered))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Word pairs consumed by demo(): one pair per line, the two sequences
# separated by a single comma. demo() splits this text on "\n" and then on
# ",", so every line must contain exactly one comma and no line may be blank.
cognate_data = """jo,ʒə
tu,ty
nosotros,nu
kjen,ki
ke,kwa
todos,tu
una,ən
dos,dø
tres,trwa
ombre,om
arbol,arbrə
pluma,plym
kabeθa,kap
boka,buʃ
pje,pje
koraθon,kœr
ber,vwar
benir,vənir
deθir,dir
pobre,povrə
ðis,dIzes
ðæt,das
wat,vas
nat,nixt
loŋ,laŋ
mæn,man
fleʃ,flajʃ
bləd,blyt
feðər,fEdər
hær,hAr
ir,Or
aj,awgə
nowz,nAzə
mawθ,munt
təŋ,tsuŋə
fut,fys
nij,knI
hænd,hant
hart,herts
livər,lEbər
ænd,ante
æt,ad
blow,flAre
ir,awris
ijt,edere
fiʃ,piʃkis
flow,fluere
staɾ,stella
ful,plenus
græs,gramen
hart,kordis
horn,korny
aj,ego
nij,genU
məðər,mAter
mawntən,mons
nejm,nomen
njuw,nowus
wən,unus
rawnd,rotundus
sow,suere
sit,sedere
θrij,tres
tuwθ,dentis
θin,tenwis
kinwawa,kenuaʔ
nina,nenah
napewa,napɛw
wapimini,wapemen
namesa,namɛʔs
okimawa,okemaw
ʃiʃipa,seʔsep
ahkohkwa,ahkɛh
pematesiweni,pematesewen
asenja,aʔsɛn"""
|
|
|
|
|
|
|
|
|
|
# Run the demonstration only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo()
|