You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
3.5 KiB
Python
100 lines
3.5 KiB
Python
4 years ago
|
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
|
||
|
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
||
|
Copyright 1998 Carnegie Mellon University
|
||
|
|
||
|
File Format: Each line consists of an uppercased word, a counter
|
||
|
(for alternative pronunciations), and a transcription. Vowels are
|
||
|
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
|
||
|
NATURAL 1 N AE1 CH ER0 AH0 L
|
||
|
|
||
|
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
||
|
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
||
|
three or more pronunciations. Many of these are fast-speech variants.
|
||
|
|
||
|
Phonemes: There are 39 phonemes, as shown below:
|
||
|
|
||
|
Phoneme Example Translation Phoneme Example Translation
|
||
|
------- ------- ----------- ------- ------- -----------
|
||
|
AA odd AA D AE at AE T
|
||
|
AH hut HH AH T AO ought AO T
|
||
|
AW cow K AW AY hide HH AY D
|
||
|
B be B IY CH cheese CH IY Z
|
||
|
D dee D IY DH thee DH IY
|
||
|
EH Ed EH D ER hurt HH ER T
|
||
|
EY ate EY T F fee F IY
|
||
|
G green G R IY N HH he HH IY
|
||
|
IH it IH T IY eat IY T
|
||
|
JH gee JH IY K key K IY
|
||
|
L lee L IY M me M IY
|
||
|
N knee N IY NG ping P IH NG
|
||
|
OW oat OW T OY toy T OY
|
||
|
P pee P IY R read R IY D
|
||
|
S sea S IY SH she SH IY
|
||
|
T tea T IY TH theta TH EY T AH
|
||
|
UH hood HH UH D UW two T UW
|
||
|
V vee V IY W we W IY
|
||
|
Y yield Y IY L D Z zee Z IY
|
||
|
ZH seizure S IY ZH ER
|
||
|
"""
|
||
|
|
||
|
from nltk import compat
|
||
|
from nltk.util import Index
|
||
|
|
||
|
from nltk.corpus.reader.util import *
|
||
|
from nltk.corpus.reader.api import *
|
||
|
|
||
|
|
||
|
class CMUDictCorpusReader(CorpusReader):
    """
    Corpus reader for the Carnegie Mellon Pronouncing Dictionary.

    Each line of the underlying file(s) holds an uppercased word, a
    counter for alternative pronunciations, and a transcription.  The
    reader exposes the lexicon as a list of entries, as a raw string,
    as a word list, and as a dictionary.
    """

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        # One lazily-read view per corpus file; ``read_cmudict_block``
        # parses the stream a block of lines at a time.
        return concat(
            [
                StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
                for fileid, enc in self.abspaths(None, True)
            ]
        )

    def raw(self):
        """
        :return: the cmudict lexicon as a raw string.
        """
        fileids = self._fileids
        if isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly, even if ``read()`` fails,
            # rather than leaving the handle to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [word.lower() for (word, _) in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        # ``dict`` below is the builtin: method names live in the class
        # namespace, which is not searched from inside a method body.
        return dict(Index(self.entries()))
|
||
|
|
||
|
|
||
|
def read_cmudict_block(stream):
    """
    Read a block of up to 100 entries from a cmudict-formatted stream.

    Each non-blank line is split on whitespace.  The first field is the
    word (returned lowercased), the second field (the pronunciation
    counter) is discarded, and the remaining fields form the
    transcription.  Blank or whitespace-only lines are skipped.

    :param stream: an object with a ``readline()`` method that returns
        ``''`` at end of file.
    :return: a list of ``(word, transcription)`` tuples, where
        ``transcription`` is a list of strings.  The list holds fewer
        than 100 entries (possibly none) only when end of file is reached.
    """
    entries = []
    while len(entries) < 100:  # Read 100 at a time.
        line = stream.readline()
        if line == '':
            return entries  # end of file.
        pieces = line.split()
        if not pieces:
            # Blank line: nothing to parse, and pieces[0] would raise
            # IndexError.  Keep reading so an empty block means EOF only.
            continue
        entries.append((pieces[0].lower(), pieces[2:]))
    return entries
|