You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
266 lines
8.6 KiB
Python
266 lines
8.6 KiB
Python
8 years ago
|
# coding=utf-8
"""
Python library for ISO 639 standard

Copyright (c) 2014-2016 Mikael Karlsson (CSC - IT Center for Science Ltd.).
Licensed under AGPLv3.
"""
|
||
|
|
||
|
# Fix for Python 3.0 - 3.2: when __package__ is empty, fall back to the first
# component of the module name. __package__ is what this module passes to
# resource_filename() to locate its bundled data files.
if not __package__:
    __package__ = __name__.split('.')[0]
|
||
|
|
||
|
|
||
|
def _fabtabular():
    """
    Read the bundled ISO 639 datasets (tab separated files) and return them as lists.

    Returns a 6-tuple of row lists, in this order: 'iso-639-3.tab',
    'iso-639-3_Name_Index.tab', 'iso-639-3-macrolanguages.tab', 'iso639-5.tsv',
    'iso639-2.tsv' and 'iso639-1.tsv'. Every row is a list of column strings; the
    first row of each file (presumably the column header) is skipped.

    Each file is opened, read completely and closed before the next one is
    touched, so no handle stays open if a later file cannot be read.

    Side effect on Python 3: the module-global name ``open`` is rebound once to
    a UTF-8 defaulting variant (see the comment below).
    """
    import csv
    import io
    import sys
    from pkg_resources import resource_filename

    global open

    # A disabled earlier variant downloaded the first two files from
    # http://www-01.sil.org/iso639-3/ instead of reading the bundled copies.

    py3 = sys.version_info[0] >= 3
    if py3:
        from functools import partial

        # Kept for backward compatibility: other code in this module calls the
        # bare ``open`` without an encoding and only gets UTF-8 decoding through
        # this rebinding. Guarded so repeated calls do not wrap it again.
        if not isinstance(open, partial):
            open = partial(open, encoding='utf-8')

    def read_rows(filename):
        """Return every row but the first of one bundled file."""
        path = resource_filename(__package__, filename)
        if py3:
            # Explicit encoding; newline='' is what the csv module asks for.
            fo = io.open(path, encoding='utf-8', newline='')
        else:
            # Python 2: plain open(), as before.
            fo = open(path)
        with fo:
            return list(csv.reader(fo, delimiter='\t'))[1:]

    return tuple(read_rows(filename) for filename in (
        'iso-639-3.tab',
        'iso-639-3_Name_Index.tab',
        'iso-639-3-macrolanguages.tab',
        'iso639-5.tsv',
        'iso639-2.tsv',
        'iso639-1.tsv',
    ))
|
||
|
|
||
|
|
||
|
class _Language(object):
|
||
|
"""
|
||
|
This class represents a language. It provides pycountry language class compatibility.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, part3, part2b, part2t, part1, name, inverted, macro, names, part5):
|
||
|
self.part3 = part3
|
||
|
self.part2b = part2b
|
||
|
self.part2t = part2t
|
||
|
self.part1 = part1
|
||
|
self.name = name
|
||
|
self.inverted = inverted
|
||
|
self.macro = macro
|
||
|
self.names = names
|
||
|
self.part5 = part5
|
||
|
|
||
|
def __getattr__(self, item):
|
||
|
compat = {
|
||
|
'alpha2': self.part1,
|
||
|
'bibliographic': self.part2b,
|
||
|
'terminology': self.part2t,
|
||
|
}
|
||
|
if item not in compat:
|
||
|
raise AttributeError("'{o}' object has no attribute '{a}'".format(o=type(self).__name__, a=item))
|
||
|
return compat[item]
|
||
|
|
||
|
|
||
|
class lazy_property(object):
    """
    Implements a lazy property decorator, that overwrites itself/property with value

    The decorated method runs on the first attribute access of an instance; its
    result is then stored on the instance under the same name, so later
    accesses find the stored value and never reach this descriptor again.
    Accessing the attribute on the class returns the descriptor itself.
    """

    def __init__(self, f):
        self.f = f
        self.name = f.__name__
        # Keep the wrapped method's docstring visible on the descriptor.
        self.__doc__ = f.__doc__

    def __get__(self, instance, owner=None):
        if instance is None:
            return self
        val = self.f(instance)
        # No __set__ is defined, so the instance attribute written here takes
        # precedence over this descriptor from now on.
        setattr(instance, self.name, val)
        return val
|
||
|
|
||
|
|
||
|
class Iso639(object):
    """
    This class is a close to drop-in replacement for pycountry.languages.
    But unlike pycountry.languages it also supports ISO 639-3.

    It implements the Singleton design pattern for performance reasons.
    It uses lazy properties for faster import time.

    Iterating yields every language object and len() gives their number. The
    lookup tables part3, part2b, part2t, part1, part5, name, inverted, macro
    and retired are lazy properties. The pycountry names alpha2, bibliographic
    and terminology are aliases of part1, part2b and part2t.
    """

    # pycountry index name -> name of the lazy property that provides it
    _compat = {
        'alpha2': 'part1',
        'bibliographic': 'part2b',
        'terminology': 'part2t',
    }

    def __new__(cls):
        # One instance per class. The class __dict__ is checked (not hasattr)
        # so a subclass gets its own instance, and the base is named explicitly
        # because super(cls, cls) recurses without end for a subclass.
        if '__instance' not in cls.__dict__:
            setattr(cls, '__instance', super(Iso639, cls).__new__(cls))
        return cls.__dict__['__instance']

    def __len__(self):
        return len(self.languages)

    def __iter__(self):
        return iter(self.languages)

    def __getattr__(self, item):
        """
        Resolve the pycountry compatibility aliases.

        Only called when normal attribute lookup fails; raises AttributeError
        for anything that is not an alias.
        """
        if item not in self._compat:
            raise AttributeError("'{o}' object has no attribute '{a}'".format(o=type(self).__name__, a=item))
        # Build only the requested index. Building all three up front made any
        # unknown attribute lookup (e.g. a hasattr() probe) load every dataset.
        return getattr(self, self._compat[item])

    @lazy_property
    def languages(self):
        """
        List of all language objects, built from the datasets of _fabtabular().

        Order: every row of the ISO 639-3 table first, then every row of the
        ISO 639-5 table, then the ISO 639-2 rows not matched by either.
        """
        import collections

        def generate():
            # All of part3 and matching part2
            for part3, part2b, part2t, part1, _, _, ref_name, _ in code_rows:
                # The name index entry of the reference name supplies the
                # inverted form; what is left in alt[part3] are the other names.
                inv = alt[part3].pop(ref_name)
                yield _Language(part3, part2b, part2t,
                                part1 if part1 in p1c else '',  # Fixes 'sh'
                                ref_name, inv,
                                macro.get(part3, [''])[0],
                                list(alt[part3].items()),
                                '')
                p2.pop(part2b, None)
                p2.pop(part2t, None)

            # All of part5 and matching part2
            for _, code, label, _ in p5:
                yield _Language('',
                                code if code in p2 else '',
                                code if code in p2 else '',
                                p1n.get(label, ['', ''])[1],
                                label, '', '', '', code)
                p2.pop(code, None)

            # Rest of part2
            p2.pop('qaa-qtz', None)  # Is not a real code, but a range
            for _, code, label, _ in p2.values():
                n = [x.strip() for x in label.split('|')]
                # NOTE(review): the part 1 lookup uses the unsplit label while
                # the p1n keys are first names only - confirm this is intended.
                yield _Language('', code, code,
                                p1n.get(label, ['', ''])[1],
                                n[0], '', '',
                                # A real list: on Python 3 a bare zip() is a
                                # one-shot iterator that is empty after the
                                # first pass over it.
                                list(zip(n[1:], n[1:])),
                                '')

        code_rows, index_rows, macro_rows, p5, p2_rows, p1_rows = _fabtabular()

        # part3 code -> {name: inverted name}
        alt = collections.defaultdict(dict)
        for x in index_rows:
            alt[x[0]][x[1]] = x[2]
        # Rows keyed by their second column.
        macro = dict((x[1], x) for x in macro_rows)
        p2 = dict((x[1], x) for x in p2_rows)
        p1c = dict((x[1], x) for x in p1_rows)
        # Part 1 rows keyed by the first of their '|' separated names.
        p1n = dict((x[2].split('|')[0].strip(), x) for x in p1_rows)
        return list(generate())

    @lazy_property
    def part3(self):
        """Dict of part3 code -> language, for languages that have one."""
        return dict((x.part3, x) for x in self.languages if x.part3)

    @lazy_property
    def part2b(self):
        """Dict of part2b code -> language, for languages that have one."""
        return dict((x.part2b, x) for x in self.languages if x.part2b)

    @lazy_property
    def part2t(self):
        """Dict of part2t code -> language, for languages that have one."""
        return dict((x.part2t, x) for x in self.languages if x.part2t)

    @lazy_property
    def part1(self):
        """Dict of part1 code -> language, for languages that have one."""
        return dict((x.part1, x) for x in self.languages if x.part1)

    @lazy_property
    def part5(self):
        """Dict of part5 code -> language, for languages that have one."""
        return dict((x.part5, x) for x in self.languages if x.part5)

    @lazy_property
    def name(self):
        """
        Dict of name -> language, covering the main name and the first item of
        every entry in ``names``. A name seen again later replaces the earlier entry.
        """
        def gen():
            for x in self.languages:
                if x.name:
                    yield x.name, x
                for n in x.names:
                    yield n[0], x

        return dict(gen())

    @lazy_property
    def inverted(self):
        """Dict of inverted name -> language, for languages that have one."""
        return dict((x.inverted, x) for x in self.languages if x.inverted)

    @lazy_property
    def macro(self):
        """Dict of macro code -> list of the languages whose ``macro`` is that code."""
        import collections

        m = collections.defaultdict(list)
        for x in self.languages:
            if x.macro:
                m[x.macro].append(x)
        return dict(m)

    @lazy_property
    def retired(self):
        """
        Function for generating retired languages. Returns a dict('code', (datetime, [language, ...], 'description')).

        The language list holds the current languages referenced by the entry
        (codes that are themselves retired are left out) and may be empty.
        Exception to the value shape: the extra key 'sh' maps directly to the
        language object for part3 'hbs', not to a tuple.
        """

        def read_rows():
            """Return every row but the first of the bundled retirements file."""
            import csv
            import io
            import sys
            from pkg_resources import resource_filename

            path = resource_filename(__package__, 'iso-639-3_Retirements.tab')
            if sys.version_info[0] >= 3:
                # Explicit encoding, so the result does not depend on whether
                # the module-global ``open`` has been rebound yet.
                rf = io.open(path, encoding='utf-8', newline='')
            else:
                rf = open(path)
            with rf:
                return list(csv.reader(rf, delimiter='\t'))[1:]

        def gen():
            import re
            from datetime import datetime

            rows = read_rows()
            retired_codes = set(r[0] for r in rows)
            bracketed = re.compile(r'\[([a-z]{3})\]')
            for code, _, _, target, description, effective in rows:
                effective = datetime.strptime(effective, '%Y-%m-%d')
                if not target:
                    # No code in the dedicated column: take every bracketed
                    # three-letter code from the description text instead.
                    target = bracketed.findall(description)
                if target:
                    target = [target] if isinstance(target, str) else target
                    yield code, (effective,
                                 [self.get(part3=x) for x in target if x not in retired_codes],
                                 description)
                else:
                    yield code, (effective, [], description)

            yield 'sh', self.get(part3='hbs')  # Add 'sh' as deprecated

        return dict(gen())

    def get(self, **kwargs):
        """
        Simple getter function for languages. Takes 1 keyword/value and returns 1 language object.

        The keyword names the lookup table (e.g. part3, part1, name, alpha2) and
        its value is the key to look up in it. Raises AttributeError unless
        exactly one keyword is given or when the keyword names no attribute,
        and KeyError when the table has no such key.
        """
        if len(kwargs) != 1:
            raise AttributeError('Only one keyword expected')
        key, value = kwargs.popitem()
        return getattr(self, key)[value]
|