#!/usr/bin/env python -O
#
# Copyright (c) 2011-2016 Kyle Gorman, Max Bane, Morgan Sonderegger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# textgrid.py: classes for Praat TextGrid and HTK mlf files
#
# Max Bane <bane@uchicago.edu>
# Kyle Gorman <gormanky@ohsu.edu>
# Morgan Sonderegger <morgan.sonderegger@mcgill.ca>
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import re
|
||
|
import codecs
|
||
|
import os.path
|
||
|
|
||
|
from sys import stderr
|
||
|
from bisect import bisect_left
|
||
|
from decimal import Decimal
|
||
|
|
||
|
from .exceptions import TextGridError
|
||
|
|
||
|
# Default number of decimal places that parsed times are rounded to.
# The first value is the default `round_digits` of the Praat tier and
# TextGrid readers, the second that of the HTK MLF reader.
DEFAULT_TEXTGRID_PRECISION = 5
DEFAULT_MLF_PRECISION = 5
|
||
|
|
||
|
|
||
|
def _getMark(text, short):
    """
    Return the mark or text entry read from the open file `text`.

    Praat escapes double-quotes by doubling them, so doubled double-quotes
    are read back as single double-quotes. Newlines within an entry are
    allowed, so lines are consumed until the double-quotes balance.

    text  -- file-like object positioned at the start of the entry
    short -- True when the entry is a bare quoted string (short format),
             False when it is written as `text = "..."` or `mark = "..."`

    Raises ValueError if the entry is malformed and EOFError if the file
    ends in the middle of a quoted entry.
    """
    line = text.readline()

    # check that the line begins with a valid entry type
    if not short and not re.match(r'^\s*(text|mark) = "', line):
        raise ValueError('Bad entry: ' + line)

    # read until the number of double-quotes is even
    while line.count('"') % 2:
        next_line = text.readline()
        if not next_line:
            raise EOFError('Bad entry: ' + line[:20] + '...')
        line += next_line

    if short:
        pattern = r'^"(.*?)"\s*$'
    else:
        pattern = r'^\s*(text|mark) = "(.*?)"\s*$'
    entry = re.match(pattern, line, re.DOTALL)
    if entry is None:
        # e.g. a short entry that does not start with a quote, or junk
        # after the closing quote; report it instead of failing on None
        raise ValueError('Bad entry: ' + line)

    # the quoted content is always the last group of either pattern
    return entry.groups()[-1].replace('""', '"')
|
||
|
|
||
|
|
||
|
def _formatMark(text):
    """
    Escape a mark for writing by doubling every double-quote in it.
    """
    pieces = text.split('"')
    return '""'.join(pieces)
|
||
|
|
||
|
|
||
|
def detectEncoding(f):
    """
    Return the name of the codec that decodes the file at path f.

    UTF-16 is tried first, then UTF-8 (which, being an ASCII extension,
    also covers plain ASCII files). Only one line is read per attempt.
    If both fail, the file is read as ASCII, and any decoding error from
    that last attempt propagates to the caller.
    """
    for candidate in ('utf-16', 'utf-8'):
        try:
            with codecs.open(f, 'r', encoding=candidate) as source:
                source.readline()  # Read one line to ensure correct encoding
        except UnicodeError:
            continue
        return candidate

    with codecs.open(f, 'r', encoding='ascii') as source:
        source.readline()  # Read one line to ensure correct encoding
    return 'ascii'
|
||
|
|
||
|
|
||
|
class Point(object):
    """
    Represents a point in time with an associated textual mark, as stored
    in a PointTier.

    Comparisons look only at `time`. The other operand may be another
    point (anything with a `time` attribute), an interval-like object
    (with `minTime`/`maxTime` attributes) or a plain number.
    """

    def __init__(self, time, mark):
        self.time = time
        self.mark = mark

    def __repr__(self):
        # an empty mark is displayed as None
        return 'Point({0}, {1})'.format(self.time,
                                        self.mark if self.mark else None)

    @staticmethod
    def _cmp(a, b):
        """
        Three-way comparison returning -1, 0 or 1. Stands in for the
        `cmp` builtin, which does not exist in Python 3.
        """
        return (a > b) - (a < b)

    def __lt__(self, other):
        if hasattr(other, 'time'):
            return self.time < other.time
        elif hasattr(other, 'minTime'):
            # a point precedes an interval if it precedes its start
            return self.time < other.minTime
        else:
            return self.time < other

    def __gt__(self, other):
        if hasattr(other, 'time'):
            return self.time > other.time
        elif hasattr(other, 'maxTime'):
            # a point follows an interval if it follows its end
            return self.time > other.maxTime
        else:
            return self.time > other

    def __eq__(self, other):
        """
        Points are equal when their times are equal. A point equals an
        interval when it lies inside it (non-inclusively).
        """
        if hasattr(other, 'time'):
            return self.time == other.time
        elif hasattr(other, 'minTime') and hasattr(other, 'maxTime'):
            return other.minTime < self.time < other.maxTime
        else:
            return self.time == other

    def __ge__(self, other):
        return self > other or self == other

    def __le__(self, other):
        return self < other or self == other

    # The original spellings are not special method names, so `>=` and
    # `<=` never reached them; kept as aliases for explicit callers.
    __gte__ = __ge__
    __lte__ = __le__

    def __cmp__(self, other):
        """
        In addition to the obvious semantics, Point/Interval comparison is
        0 iff the point is inside the interval (non-inclusively), if you
        need inclusive membership, use Interval.__contains__
        """
        if hasattr(other, 'time'):
            return self._cmp(self.time, other.time)
        elif hasattr(other, 'minTime') and hasattr(other, 'maxTime'):
            return self._cmp(self.time, other.minTime) + \
                self._cmp(self.time, other.maxTime)
        else:  # hopefully numerical
            return self._cmp(self.time, other)

    def __iadd__(self, other):
        self.time += other
        # in-place operators must return the result, otherwise
        # `point += x` rebinds the name to None
        return self

    def __isub__(self, other):
        self.time -= other
        return self
|
||
|
|
||
|
|
||
|
def decode(string):
    """
    Return `string` unchanged.

    This is the hook for decoding HTK's mangling of UTF-8 strings. The
    `string_escape` decoding that used to follow was unreachable (it came
    after an unconditional return) and has been removed.
    """
    return string
|
||
|
|
||
|
|
||
|
class Interval(object):
    """
    Represents an interval of time, with an associated textual mark, as
    stored in an IntervalTier.

    When `strict` is true (the default), ordering two overlapping
    intervals with `<` or `>` raises ValueError.
    """

    def __init__(self, minTime, maxTime, mark):
        if minTime >= maxTime:
            # Praat does not support intervals with duration <= 0
            raise ValueError(minTime, maxTime)
        self.minTime = minTime
        self.maxTime = maxTime
        self.mark = mark
        self.strict = True

    def __repr__(self):
        # an empty mark is displayed as None
        return 'Interval({0}, {1}, {2})'.format(self.minTime, self.maxTime,
                                                self.mark if self.mark else None)

    @staticmethod
    def _cmp(a, b):
        """
        Three-way comparison returning -1, 0 or 1. Stands in for the
        `cmp` builtin, which does not exist in Python 3.
        """
        return (a > b) - (a < b)

    def duration(self):
        """
        Returns the duration of the interval in seconds.
        """
        return self.maxTime - self.minTime

    def __lt__(self, other):
        if hasattr(other, 'minTime'):
            if self.strict and self.overlaps(other):
                raise ValueError(self, other)
            return self.minTime < other.minTime
        elif hasattr(other, 'time'):
            return self.maxTime < other.time
        else:
            return self.maxTime < other

    def __gt__(self, other):
        if hasattr(other, 'maxTime'):
            if self.strict and self.overlaps(other):
                raise ValueError(self, other)
            return self.maxTime > other.maxTime
        elif hasattr(other, 'time'):
            return self.minTime > other.time
        else:
            return self.minTime > other

    def __ge__(self, other):
        return self > other or self == other

    def __le__(self, other):
        return self < other or self == other

    # The original spellings are not special method names, so `>=` and
    # `<=` never reached them; kept as aliases for explicit callers.
    __gte__ = __ge__
    __lte__ = __le__

    def __cmp__(self, other):
        """
        Three-way comparison against an interval, a point or a number.
        Raises ValueError when compared with an overlapping interval.
        """
        if hasattr(other, 'minTime') and hasattr(other, 'maxTime'):
            if self.overlaps(other):
                raise ValueError(self, other)
                # this returns the two intervals, so user can patch things
                # up if s/he so chooses
            return self._cmp(self.minTime, other.minTime)
        elif hasattr(other, 'time'):  # comparing Intervals and Points
            return self._cmp(self.minTime, other.time) + \
                self._cmp(self.maxTime, other.time)
        else:
            return self._cmp(self.minTime, other) + \
                self._cmp(self.maxTime, other)

    def __eq__(self, other):
        """
        Two intervals are equal when both bounds are equal (marks are not
        compared). An interval equals a point lying inside it
        (non-inclusively). Anything else is unequal. Unlike ordering,
        this never raises for overlapping intervals.
        """
        if hasattr(other, 'minTime') and hasattr(other, 'maxTime'):
            # always a bool; the original fell through and returned None
            # when the bounds differed
            return (self.minTime == other.minTime and
                    self.maxTime == other.maxTime)
        elif hasattr(other, 'time'):
            return self.minTime < other.time < self.maxTime
        else:
            return False

    def __iadd__(self, other):
        self.minTime += other
        self.maxTime += other
        # in-place operators must return the result, otherwise
        # `interval += x` rebinds the name to None
        return self

    def __isub__(self, other):
        self.minTime -= other
        self.maxTime -= other
        return self

    def overlaps(self, other):
        """
        Tests whether self overlaps with the given interval. Symmetric.
        See: http://www.rgrjr.com/emacs/overlap.html
        """
        return other.minTime < self.maxTime and \
            self.minTime < other.maxTime

    def __contains__(self, other):
        """
        Tests whether the given interval, Point object or numeric time is
        contained in this interval, bounds included.
        """
        if hasattr(other, 'minTime') and hasattr(other, 'maxTime'):
            return self.minTime <= other.minTime and \
                other.maxTime <= self.maxTime
        elif hasattr(other, 'time'):
            return self.minTime <= other.time <= self.maxTime
        else:
            return self.minTime <= other <= self.maxTime

    def bounds(self):
        """
        Returns the (minTime, maxTime) pair.
        """
        return (self.minTime, self.maxTime)
|
||
|
|
||
|
|
||
|
class PointTier(object):
    """
    Represents Praat PointTiers (also called TextTiers) as list of Points
    (e.g., for point in pointtier). A PointTier is used much like a Python
    set in that it has add/remove methods, not append/extend methods.
    """

    def __init__(self, name=None, minTime=0., maxTime=None):
        self.name = name
        self.minTime = minTime
        self.maxTime = maxTime  # None means "no upper bound"
        self.points = []  # kept sorted by time by add/addPoint

    def __str__(self):
        return '<PointTier {0}, {1} points>'.format(self.name, len(self))

    def __repr__(self):
        return 'PointTier({0}, {1})'.format(self.name, self.points)

    def __iter__(self):
        return iter(self.points)

    def __len__(self):
        return len(self.points)

    def __getitem__(self, i):
        return self.points[i]

    def add(self, time, mark):
        """
        constructs a Point and adds it to the PointTier, maintaining order
        """
        self.addPoint(Point(time, mark))

    def addPoint(self, point):
        """
        Insert `point` at its sorted position. Raises ValueError if it
        lies outside the tier's bounds or if a point with the same time
        is already present.
        """
        if point < self.minTime:
            raise ValueError(self.minTime)  # too early
        if self.maxTime is not None and point > self.maxTime:
            raise ValueError(self.maxTime)  # too late
        i = bisect_left(self.points, point)
        if i < len(self.points) and self.points[i].time == point.time:
            raise ValueError(point)  # we already got one right there
        self.points.insert(i, point)

    def remove(self, time, mark):
        """
        removes a constructed Point i from the PointTier
        """
        self.removePoint(Point(time, mark))

    def removePoint(self, point):
        """
        Remove the first stored point that compares equal to `point`.
        Raises ValueError (from list.remove) if there is none.
        """
        self.points.remove(point)

    def read(self, f, round_digits=DEFAULT_TEXTGRID_PRECISION):
        """
        Read the Points contained in the Praat-formated PointTier/TextTier
        file indicated by string f. Times are rounded to `round_digits`
        decimal places.

        The points are appended to any already present, in file order and
        without the checks performed by addPoint. Raises TextGridError if
        the file does not have a TextTier header.
        """
        to_round = Decimal('.{}1'.format('0' * (round_digits - 1)))
        encoding = detectEncoding(f)
        with codecs.open(f, 'r', encoding=encoding) as source:
            file_type, short = parse_header(source)
            if file_type != 'TextTier':
                raise TextGridError('The file could not be parsed as a PointTier as it is lacking a proper header.')

            self.minTime = parse_line(source.readline(), short, to_round)
            self.maxTime = parse_line(source.readline(), short, to_round)
            n = int(parse_line(source.readline(), short, to_round))
            for i in range(n):
                if not short:
                    # the "points [i]:" header line only exists in the
                    # long format; skipping a line in the short format
                    # would swallow the point's time
                    source.readline()
                itim = parse_line(source.readline(), short, to_round)
                imrk = _getMark(source, short)
                self.points.append(Point(itim, imrk))

    def write(self, f):
        """
        Write the current state into a Praat-format PointTier/TextTier
        file. f may be a file object to write to, or a string naming a
        path for writing. In both cases the file is closed afterwards,
        also when writing fails.
        """
        sink = f if hasattr(f, 'write') else codecs.open(f, 'w', 'UTF-8')
        try:
            print('File type = "ooTextFile"', file=sink)
            print('Object class = "TextTier"\n', file=sink)

            print('xmin = {0}'.format(self.minTime), file=sink)
            # falls back to the time of the last point when maxTime is
            # not set
            print('xmax = {0}'.format(self.bounds()[1]), file=sink)
            print('points: size = {0}'.format(len(self)), file=sink)
            for (i, point) in enumerate(self.points, 1):
                print('points [{0}]:'.format(i), file=sink)
                print('\ttime = {0}'.format(point.time), file=sink)
                mark = _formatMark(point.mark)
                print('\tmark = "{0}"'.format(mark), file=sink)
        finally:
            sink.close()

    def bounds(self):
        """
        Returns (minTime, maxTime), using the time of the last point when
        maxTime is not set.
        """
        return (self.minTime, self.maxTime or self.points[-1].time)

    # alternative constructor

    @classmethod
    def fromFile(cls, f, name=None):
        """
        Build a tier called `name` from the PointTier/TextTier file f.
        """
        pt = cls(name=name)
        pt.read(f)
        return pt
|
||
|
|
||
|
|
||
|
class IntervalTier(object):
    """
    Represents Praat IntervalTiers as list of sequence types of Intervals
    (e.g., for interval in intervaltier). An IntervalTier is used much like a
    Python set in that it has add/remove methods, not append/extend methods.
    """

    def __init__(self, name=None, minTime=0., maxTime=None):
        self.name = name
        self.minTime = minTime
        self.maxTime = maxTime  # None means "no upper bound"
        self.intervals = []  # kept sorted by add/addInterval
        self.strict = True  # copied onto every interval that is added

    def __str__(self):
        return '<IntervalTier {0}, {1} intervals>'.format(self.name,
                                                          len(self))

    def __repr__(self):
        return 'IntervalTier({0}, {1})'.format(self.name, self.intervals)

    def __iter__(self):
        return iter(self.intervals)

    def __len__(self):
        return len(self.intervals)

    def __getitem__(self, i):
        return self.intervals[i]

    def add(self, minTime, maxTime, mark):
        """
        Constructs an Interval and adds it to the tier, maintaining order.
        """
        interval = Interval(minTime, maxTime, mark)
        interval.strict = self.strict
        self.addInterval(interval)

    def addInterval(self, interval):
        """
        Insert `interval` at its sorted position. Raises ValueError if it
        lies outside the tier's bounds or if the interval found at its
        position compares equal to it.
        """
        if interval.minTime < self.minTime:  # too early
            raise ValueError(self.minTime)
        if self.maxTime is not None and interval.maxTime > self.maxTime:
            raise ValueError(self.maxTime)  # too late
        i = bisect_left(self.intervals, interval)
        if i != len(self.intervals) and self.intervals[i] == interval:
            raise ValueError(self.intervals[i])
        interval.strict = self.strict
        self.intervals.insert(i, interval)

    def remove(self, minTime, maxTime, mark):
        """
        Removes the interval equal to Interval(minTime, maxTime, mark).
        """
        self.removeInterval(Interval(minTime, maxTime, mark))

    def removeInterval(self, interval):
        """
        Remove the first stored interval that compares equal to
        `interval`. Raises ValueError (from list.remove) if there is none.
        """
        self.intervals.remove(interval)

    def indexContaining(self, time):
        """
        Returns the index of the interval containing the given time point,
        or None if the time point is outside the bounds of this tier. The
        argument can be a numeric type, or a Point object.
        """
        # compare bounds against the bare number so a Point argument does
        # not depend on reflected comparison operators
        moment = getattr(time, 'time', time)
        i = bisect_left(self.intervals, time)
        if i != len(self.intervals):
            candidate = self.intervals[i]
            if candidate.minTime <= moment <= candidate.maxTime:
                return i
        return None

    def intervalContaining(self, time):
        """
        Returns the interval containing the given time point, or None if
        the time point is outside the bounds of this tier. The argument
        can be a numeric type, or a Point object.
        """
        i = self.indexContaining(time)
        # index 0 is a valid hit, so test against None rather than truth
        if i is not None:
            return self.intervals[i]
        return None

    def read(self, f, round_digits=DEFAULT_TEXTGRID_PRECISION):
        """
        Read the Intervals contained in the Praat-formated IntervalTier
        file indicated by string f. Times are rounded to `round_digits`
        decimal places.

        The intervals are appended to any already present, in file order
        and without the checks performed by addInterval. Raises
        TextGridError if the file does not have an IntervalTier header.
        """
        to_round = Decimal('.{}1'.format('0' * (round_digits - 1)))
        encoding = detectEncoding(f)
        with codecs.open(f, 'r', encoding=encoding) as source:
            file_type, short = parse_header(source)
            if file_type != 'IntervalTier':
                raise TextGridError('The file could not be parsed as a IntervalTier as it is lacking a proper header.')

            self.minTime = parse_line(source.readline(), short, to_round)
            self.maxTime = parse_line(source.readline(), short, to_round)
            n = int(parse_line(source.readline(), short, to_round))
            for i in range(n):
                if not short:
                    # the "intervals [i]:" header line only exists in the
                    # long format; skipping a line in the short format
                    # would swallow the interval's start time
                    source.readline()
                imin = parse_line(source.readline(), short, to_round)
                imax = parse_line(source.readline(), short, to_round)
                imrk = _getMark(source, short)
                self.intervals.append(Interval(imin, imax, imrk))

    def _fillInTheGaps(self, null):
        """
        Returns a pseudo-IntervalTier with the temporal gaps filled in:
        a list of the tier's intervals plus new intervals marked `null`
        wherever consecutive intervals (or the tier bounds) leave a gap.
        """
        prev_t = self.minTime
        output = []
        for interval in self.intervals:
            if prev_t < interval.minTime:
                output.append(Interval(prev_t, interval.minTime, null))
            output.append(interval)
            prev_t = interval.maxTime
        # last interval; skipped when maxTime isn't defined
        if self.maxTime is not None and prev_t < self.maxTime:
            output.append(Interval(prev_t, self.maxTime, null))
        return output

    def write(self, f, null=''):
        """
        Write the current state into a Praat-format IntervalTier file. f
        may be a file object to write to, or a string naming a path for
        writing (opened as UTF-8). In both cases the file is closed
        afterwards, also when writing fails. Gaps between intervals are
        written as intervals marked `null`.
        """
        sink = f if hasattr(f, 'write') else codecs.open(f, 'w', 'UTF-8')
        try:
            print('File type = "ooTextFile"', file=sink)
            print('Object class = "IntervalTier"\n', file=sink)
            print('xmin = {0}'.format(self.minTime), file=sink)
            # falls back to the end of the last interval when maxTime is
            # not set
            print('xmax = {0}'.format(self.bounds()[1]), file=sink)
            # compute the number of intervals and make the empty ones
            output = self._fillInTheGaps(null)
            # write it all out
            print('intervals: size = {0}'.format(len(output)), file=sink)
            for (i, interval) in enumerate(output, 1):
                # trailing colon matches the headers TextGrid.write emits
                print('intervals [{0}]:'.format(i), file=sink)
                print('\txmin = {0}'.format(interval.minTime), file=sink)
                print('\txmax = {0}'.format(interval.maxTime), file=sink)
                mark = _formatMark(interval.mark)
                print('\ttext = "{0}"'.format(mark), file=sink)
        finally:
            sink.close()

    def bounds(self):
        """
        Returns (minTime, maxTime), using the end of the last interval
        when maxTime is not set.
        """
        return self.minTime, self.maxTime or self.intervals[-1].maxTime

    # alternative constructor

    @classmethod
    def fromFile(cls, f, name=None):
        """
        Build a tier called `name` from the IntervalTier file f.
        """
        it = cls(name=name)
        it.read(f)
        return it
|
||
|
|
||
|
|
||
|
def parse_line(line, short, to_round):
    """
    Parse the value held by one line of a Praat text file.

    A line containing a double-quote yields the quoted string; any other
    line yields a Decimal quantized to the exponent of `to_round`.

    line     -- the raw line
    short    -- True when the line holds a bare value (short format),
                False when it is written as `label = value`
    to_round -- Decimal passed to quantize(); not used for quoted lines

    Raises ValueError if a `label = value` line cannot be split.
    """
    line = line.strip()
    quoted = '"' in line
    if short:
        if quoted:
            # drop the surrounding quotes
            return line[1:-1]
        return Decimal(line).quantize(to_round)

    pattern = r'.+? = "(.*)"' if quoted else r'.+? = (.*)'
    m = re.match(pattern, line)
    if m is None:
        # report the offending line instead of failing on None
        raise ValueError('Bad line: ' + line)
    value = m.group(1)
    if quoted:
        return value
    return Decimal(value).quantize(to_round)
|
||
|
|
||
|
|
||
|
def parse_header(source):
    """
    Consume the three header lines of a Praat text file.

    source -- open file positioned at the start of the file

    Returns (file_type, short): the quoted object class found on the
    second line and whether the first line declares the short format.
    Raises TextGridError if the first line is not an ooTextFile header.
    """
    header = source.readline()  # header junk
    # raw string: "\w" in a plain literal is an invalid escape sequence
    m = re.match(r'File type = "([\w ]+)"', header)
    if m is None or not m.groups()[0].startswith('ooTextFile'):
        raise TextGridError('The file could not be parsed as a Praat text file as it is lacking a proper header.')

    short = 'short' in m.groups()[0]
    # the object class is a quoted string, so the rounding argument is
    # never used here
    file_type = parse_line(source.readline(), short, '')
    source.readline()  # skip the line after the object class
    return file_type, short
|
||
|
|
||
|
|
||
|
class TextGrid(object):
    """
    Represents Praat TextGrids as list of sequence types of tiers (e.g.,
    for tier in textgrid). Tiers are looked up by position with
    textgrid[i] and by name with getFirst/getList. Whereas the *Tier
    classes that make up a TextGrid impose a strict ordering on
    Points/Intervals, a TextGrid instance is given order by the user.
    Like a true Python list, there are append/extend methods for a
    TextGrid.
    """

    def __init__(self, name=None, minTime=0., maxTime=None, strict = True):
        """
        Construct a TextGrid instance with the given (optional) name
        (which is only relevant for MLF stuff). `strict` is copied onto
        every tier, and every item of that tier, added to the grid. Use
        read() or fromFile() to populate the instance from a file.
        """
        self.name = name
        self.minTime = minTime
        self.maxTime = maxTime  # None means "no upper bound"
        self.tiers = []
        self.strict = strict

    def __str__(self):
        return '<TextGrid {0}, {1} Tiers>'.format(self.name, len(self))

    def __repr__(self):
        return 'TextGrid({0}, {1})'.format(self.name, self.tiers)

    def __iter__(self):
        return iter(self.tiers)

    def __len__(self):
        return len(self.tiers)

    def __getitem__(self, i):
        """
        Return the ith tier
        """
        return self.tiers[i]

    @staticmethod
    def _tierEnd(tier):
        """
        Latest time of `tier`: its maxTime when set, otherwise the end of
        its last interval or the time of its last point.
        """
        if tier.maxTime:
            return tier.maxTime
        last = tier[-1]
        return last.maxTime if hasattr(last, 'maxTime') else last.time

    def _adopt(self, tier):
        """
        Copy this grid's `strict` flag onto `tier` and all of its items.
        """
        tier.strict = self.strict
        for item in tier:
            item.strict = self.strict

    def getFirst(self, tierName):
        """
        Return the first tier with the given name, or None if there is
        no such tier.
        """
        for t in self.tiers:
            if t.name == tierName:
                return t
        return None

    def getList(self, tierName):
        """
        Return a list of all tiers with the given name.
        """
        return [t for t in self.tiers if t.name == tierName]

    def getNames(self):
        """
        return a list of the names of the tiers contained in this
        TextGrid
        """
        return [tier.name for tier in self.tiers]

    def append(self, tier):
        """
        Add `tier` as the last tier. Raises ValueError if the tier ends
        after the grid's maxTime (when both are set).
        """
        if self.maxTime is not None and tier.maxTime is not None and tier.maxTime > self.maxTime:
            raise ValueError(self.maxTime)  # too late
        self._adopt(tier)
        self.tiers.append(tier)

    def extend(self, tiers):
        """
        Add every tier in `tiers`, in order. Raises ValueError, before
        anything is added, if a tier starts before the grid's minTime or
        ends after its maxTime (when both are set).
        """
        tiers = list(tiers)
        for tier in tiers:
            if tier.minTime < self.minTime:
                raise ValueError(self.minTime)  # too early
            # the upper bound is checked against each tier's end; tiers
            # without a maxTime cannot be checked
            if self.maxTime is not None and tier.maxTime is not None \
                    and tier.maxTime > self.maxTime:
                raise ValueError(self.maxTime)  # too late
        for tier in tiers:
            self._adopt(tier)  # same bookkeeping as append
        self.tiers.extend(tiers)

    def pop(self, i=None):
        """
        Remove and return tier at index i (default last). Will raise
        IndexError if TextGrid is empty or index is out of range.
        """
        # test against None so that index 0 removes the first tier
        return self.tiers.pop() if i is None else self.tiers.pop(i)

    def read(self, f, round_digits=DEFAULT_TEXTGRID_PRECISION):
        """
        Read the tiers contained in the Praat-formatted TextGrid file
        indicated by string f. Times are rounded to the specified precision.

        The tiers are appended to any already present. Intervals that do
        not have a positive duration are skipped. Raises TextGridError if
        the file does not have a TextGrid header.
        """
        to_round = Decimal('.{}1'.format('0' * (round_digits - 1)))
        encoding = detectEncoding(f)
        with codecs.open(f, 'r', encoding=encoding) as source:
            file_type, short = parse_header(source)
            if file_type != 'TextGrid':
                raise TextGridError('The file could not be parsed as a TextGrid as it is lacking a proper header.')

            def next_value():
                # parse the next line as a quoted string or rounded number
                return parse_line(source.readline(), short, to_round)

            self.minTime = next_value()
            self.maxTime = next_value()
            source.readline()  # more header junk
            if short:
                m = int(source.readline().strip())  # number of tiers
            else:
                # "size = N"
                m = int(source.readline().strip().split()[2])
                source.readline()  # "item []:"
            for i in range(m):  # loop over grids
                if not short:
                    source.readline()  # "item [i]:"
                if next_value() == 'IntervalTier':
                    inam = next_value()
                    imin = next_value()
                    imax = next_value()
                    itie = IntervalTier(inam, imin, imax)
                    itie.strict = self.strict
                    n = int(next_value())
                    for j in range(n):
                        if not short:
                            source.readline()  # "intervals [j]:"
                        jmin = next_value()
                        jmax = next_value()
                        jmrk = _getMark(source, short)
                        if jmin < jmax:  # non-null
                            itie.addInterval(Interval(jmin, jmax, jmrk))
                    self.append(itie)
                else:  # pointTier
                    inam = next_value()
                    # NOTE(review): the tier's own xmin/xmax are read to
                    # advance the file but are not passed to PointTier
                    imin = next_value()
                    imax = next_value()
                    itie = PointTier(inam)
                    n = int(next_value())
                    for j in range(n):
                        if not short:
                            # the "points [j]:" header only exists in the
                            # long format
                            source.readline()
                        jtim = next_value()
                        jmrk = _getMark(source, short)
                        itie.addPoint(Point(jtim, jmrk))
                    self.append(itie)

    def write(self, f, null=''):
        """
        Write the current state into a Praat-format TextGrid file. f may
        be a file object to write to, or a string naming a path to open
        for writing. In both cases the file is closed afterwards, also
        when writing fails. Gaps in interval tiers are written as
        intervals marked `null`.
        """
        sink = f if hasattr(f, 'write') else codecs.open(f, 'w', 'UTF-8')
        try:
            print('File type = "ooTextFile"', file=sink)
            print('Object class = "TextGrid"\n', file=sink)
            print('xmin = {0}'.format(self.minTime), file=sink)
            # compute max time
            maxT = self.maxTime
            if not maxT:
                maxT = max([self._tierEnd(t) for t in self.tiers])
            print('xmax = {0}'.format(maxT), file=sink)
            print('tiers? <exists>', file=sink)
            print('size = {0}'.format(len(self)), file=sink)
            print('item []:', file=sink)
            for (i, tier) in enumerate(self.tiers, 1):
                print('\titem [{0}]:'.format(i), file=sink)
                if isinstance(tier, IntervalTier):
                    print('\t\tclass = "IntervalTier"', file=sink)
                    print('\t\tname = "{0}"'.format(tier.name), file=sink)
                    print('\t\txmin = {0}'.format(tier.minTime), file=sink)
                    # every tier is written with the grid's end time
                    print('\t\txmax = {0}'.format(maxT), file=sink)
                    # compute the number of intervals and make the empty ones
                    output = tier._fillInTheGaps(null)
                    print('\t\tintervals: size = {0}'.format(
                        len(output)), file=sink)
                    for (j, interval) in enumerate(output, 1):
                        print('\t\t\tintervals [{0}]:'.format(j), file=sink)
                        print('\t\t\t\txmin = {0}'.format(
                            interval.minTime), file=sink)
                        print('\t\t\t\txmax = {0}'.format(
                            interval.maxTime), file=sink)
                        mark = _formatMark(interval.mark)
                        print('\t\t\t\ttext = "{0}"'.format(mark), file=sink)
                elif isinstance(tier, PointTier):  # PointTier
                    print('\t\tclass = "TextTier"', file=sink)
                    print('\t\tname = "{0}"'.format(tier.name), file=sink)
                    print('\t\txmin = {0}'.format(tier.minTime), file=sink)
                    print('\t\txmax = {0}'.format(maxT), file=sink)
                    print('\t\tpoints: size = {0}'.format(len(tier)), file=sink)
                    for (k, point) in enumerate(tier, 1):
                        print('\t\t\tpoints [{0}]:'.format(k), file=sink)
                        print('\t\t\t\ttime = {0}'.format(point.time), file=sink)
                        mark = _formatMark(point.mark)
                        print('\t\t\t\tmark = "{0}"'.format(mark), file=sink)
        finally:
            sink.close()

    # alternative constructor

    @classmethod
    def fromFile(cls, f, name=None):
        """
        Build a TextGrid called `name` from the TextGrid file f.
        """
        tg = cls(name=name)
        tg.read(f)
        return tg
|
||
|
|
||
|
|
||
|
class MLF(object):
    """
    Read in a HTK .mlf file generated with HVite -o SM and turn it into a
    list of TextGrids. The resulting class can be iterated over to give
    one TextGrid at a time, or the write(prefix='') method can be
    used to write all the resulting TextGrids into separate files.

    Unlike other classes, this is always initialized from a text file.
    """

    def __init__(self, f, samplerate=10e6):
        self.grids = []
        self.read(f, samplerate)

    def __iter__(self):
        return iter(self.grids)

    def __str__(self):
        return '<MLF, {0} TextGrids>'.format(len(self))

    def __repr__(self):
        return 'MLF({0})'.format(self.grids)

    def __len__(self):
        return len(self.grids)

    def __getitem__(self, i):
        """
        Return the ith TextGrid
        """
        return self.grids[i]

    def read(self, f, samplerate, round_digits=DEFAULT_MLF_PRECISION):
        """
        Parse the MLF file at path f and append one TextGrid per entry to
        self.grids. Each grid gets a 'phones' and a 'words' IntervalTier.

        f            -- path of the file to read
        samplerate   -- number of time units per second; the times in the
                        file are divided by it
        round_digits -- decimal places the resulting times are rounded to

        Raises ValueError if a line that starts a word has equal start
        and end times.
        """
        samplerate = Decimal(samplerate)
        to_round = Decimal('.{}1'.format('0' * (round_digits - 1)))

        def seconds(field):
            # divide first and round afterwards; quantizing the raw value
            # before the division leaves the result unrounded
            return (Decimal(field) / samplerate).quantize(to_round)

        # the context manager also closes the file when parsing fails
        with open(f, 'r') as source:  # HTK returns ostensible ASCII
            source.readline()  # header
            while True:  # loop over text
                name = re.match(r'"(.*)"', source.readline().rstrip())
                if not name:
                    # no further quoted name: end of the file
                    break
                name = name.groups()[0]
                grid = TextGrid(name)
                phon = IntervalTier(name='phones')
                word = IntervalTier(name='words')
                wmrk = ''
                wsrt = 0.
                wend = 0.
                while True:  # loop over the lines in each grid
                    line = source.readline().rstrip().split()
                    if len(line) == 4:  # word on this baby
                        pmin = seconds(line[0])
                        pmax = seconds(line[1])
                        if pmin == pmax:
                            raise ValueError('null duration interval')
                        phon.add(pmin, pmax, line[2])
                        if wmrk:
                            # a new word starts: flush the previous one
                            word.add(wsrt, wend, wmrk)
                        wmrk = decode(line[3])
                        wsrt = pmin
                        wend = pmax
                    elif len(line) == 3:  # just phone
                        pmin = seconds(line[0])
                        pmax = seconds(line[1])
                        if line[2] == 'sp' and pmin != pmax:
                            # a non-empty 'sp' is stored as a word of its
                            # own and not added to the phone tier
                            if wmrk:
                                word.add(wsrt, wend, wmrk)
                            wmrk = decode(line[2])
                            wsrt = pmin
                            wend = pmax
                        elif pmin != pmax:
                            phon.add(pmin, pmax, line[2])
                        # the current word extends to this phone's end
                        wend = pmax
                    else:  # it's a period
                        word.add(wsrt, wend, wmrk)
                        self.grids.append(grid)
                        break
                grid.append(phon)
                grid.append(word)

    def write(self, prefix=''):
        """
        Write the current state into Praat-formatted TextGrids. The
        filenames that the output is stored in are taken from the HTK
        label files. If a string argument is given, then the any prefix in
        the name of the label file (e.g., "mfc/myLabFile.lab"), it is
        truncated and files are written to the directory given by the
        prefix. An IOError will result if the folder does not exist.

        The number of TextGrids is returned.
        """
        for grid in self.grids:
            (junk, tail) = os.path.split(grid.name)
            (root, junk) = os.path.splitext(tail)
            my_path = os.path.join(prefix, root + '.TextGrid')
            # given a path, TextGrid.write opens it as UTF-8 and closes it
            grid.write(my_path)
        return len(self.grids)
|