You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
parallel-library/vendor/iso639/iso639.py

266 lines
8.6 KiB
Python

# coding=utf-8
"""
Python library for ISO 639 standard
Copyright (c) 2014-2016 Mikael Karlsson (CSC - IT Center for Science Ltd.).
Licensed under AGPLv3.
"""
# Workaround for Python 3.0 - 3.2: when __package__ is empty, derive it
# from the first dotted component of this module's __name__.
__package__ = __package__ or __name__.partition('.')[0]
def _fabtabular():
    """
    Load the packaged ISO 639 tables from their tab-separated files.

    Returns a 6-tuple of row lists (each row is a list of strings, and the
    first row of every file is dropped, presumably the column header), in
    this order:

        iso-639-3.tab, iso-639-3_Name_Index.tab,
        iso-639-3-macrolanguages.tab, iso639-5.tsv, iso639-2.tsv,
        iso639-1.tsv

    Each file is opened, parsed and closed before the next one is touched,
    so a missing or unreadable file never leaves earlier handles open.
    Errors from locating or opening a file propagate to the caller.
    """
    import csv
    import sys
    from pkg_resources import resource_filename

    if sys.version_info[0] == 3:
        from functools import partial
        # Kept on purpose: rebinding the module-level name ``open`` makes
        # every later open() call in this module default to UTF-8 on
        # Python 3.
        global open
        open = partial(open, encoding='utf-8')

    def read_rows(filename):
        # Parse one packaged TSV file and drop its first row.
        with open(resource_filename(__package__, filename)) as fo:
            return list(csv.reader(fo, delimiter='\t'))[1:]

    return tuple(read_rows(filename) for filename in (
        'iso-639-3.tab',
        'iso-639-3_Name_Index.tab',
        'iso-639-3-macrolanguages.tab',
        'iso639-5.tsv',
        'iso639-2.tsv',
        'iso639-1.tsv',
    ))
class _Language(object):
"""
This class represents a language. It provides pycountry language class compatibility.
"""
def __init__(self, part3, part2b, part2t, part1, name, inverted, macro, names, part5):
self.part3 = part3
self.part2b = part2b
self.part2t = part2t
self.part1 = part1
self.name = name
self.inverted = inverted
self.macro = macro
self.names = names
self.part5 = part5
def __getattr__(self, item):
compat = {
'alpha2': self.part1,
'bibliographic': self.part2b,
'terminology': self.part2t,
}
if item not in compat:
raise AttributeError("'{o}' object has no attribute '{a}'".format(o=type(self).__name__, a=item))
return compat[item]
class lazy_property(object):
    """
    Implements a lazy property decorator, that overwrites itself/property with value

    The decorated function is called on first access through an instance and
    its result is stored on that instance under the function's name. Because
    this class defines only ``__get__``, the stored instance attribute is
    found first on later lookups and the function is not called again.
    """

    def __init__(self, f):
        self.f = f
        self.name = f.__name__
        # Surface the wrapped function's docstring on the descriptor so
        # introspection of the class attribute shows the property's docs.
        self.__doc__ = f.__doc__

    def __get__(self, instance, owner=None):
        # Accessed on the class itself: hand back the descriptor.
        if instance is None:
            return self
        val = self.f(instance)
        # Store the computed value under the property's own name.
        setattr(instance, self.name, val)
        return val
class Iso639(object):
    """
    This class is a close to drop-in replacement for pycountry.languages.
    But unlike pycountry.languages it also supports ISO 639-3.
    It implements the Singleton design pattern for performance reasons.
    It uses lazy properties for faster import time.

    The lookup tables (``part3``, ``part2b``, ``part2t``, ``part1``,
    ``part5``, ``name``, ``inverted``, ``macro``, ``retired``) are built on
    first access. ``alpha2``, ``bibliographic`` and ``terminology`` are
    pycountry-style aliases for ``part1``, ``part2b`` and ``part2t``.
    """

    # pycountry attribute name -> name of the lazy table that serves it.
    # Names (not values) are stored so a failed attribute lookup never
    # triggers loading of the data files.
    _COMPAT = {
        'alpha2': 'part1',
        'bibliographic': 'part2b',
        'terminology': 'part2t',
    }

    def __new__(cls):
        # One instance per class. The check uses the class's own __dict__ so
        # a subclass does not pick up its parent's instance, and super() is
        # anchored on Iso639 so subclasses do not re-enter this method
        # forever.
        if '__instance' not in cls.__dict__:
            setattr(cls, '__instance', super(Iso639, cls).__new__(cls))
        return getattr(cls, '__instance')

    def __len__(self):
        """Return the number of language objects."""
        return len(self.languages)

    def __iter__(self):
        """Iterate over all language objects."""
        return iter(self.languages)

    def __getattr__(self, item):
        """
        Resolve the pycountry compatibility aliases.

        Only called when normal attribute lookup fails. Raises
        AttributeError for every name that is not an alias, without touching
        the data tables.
        """
        target = self._COMPAT.get(item)
        if target is None:
            raise AttributeError("'{o}' object has no attribute '{a}'".format(o=type(self).__name__, a=item))
        return getattr(self, target)

    @lazy_property
    def languages(self):
        """
        Build the list of all language objects from the tables returned by
        _fabtabular(): first one per ISO 639-3 row, then one per ISO 639-5
        row, then one per ISO 639-2 row not consumed by the first two passes.
        """
        import collections

        rows3, name_index, macro_rows, rows5, rows2, rows1 = _fabtabular()

        # part3 code -> {name: inverted name}
        alt = collections.defaultdict(dict)
        for row in name_index:
            alt[row[0]][row[1]] = row[2]
        # Each table below is keyed by column 1 of its rows, except
        # p1_by_name, which is keyed by the first '|'-separated entry of
        # column 2.
        macro_by_code = dict((row[1], row) for row in macro_rows)
        p2 = dict((row[1], row) for row in rows2)
        p1_by_code = dict((row[1], row) for row in rows1)
        p1_by_name = dict((row[2].split('|')[0].strip(), row) for row in rows1)

        def generate():
            # All of part3 and matching part2
            for code3, code2b, code2t, code1, _, _, ref_name, _ in rows3:
                inv = alt[code3].pop(ref_name)
                yield _Language(code3, code2b, code2t,
                                code1 if code1 in p1_by_code else '',  # Fixes 'sh'
                                ref_name, inv,
                                macro_by_code.get(code3, [''])[0],
                                list(alt[code3].items()),
                                '')
                # These part2 rows are now covered; keep them out of the
                # later passes.
                p2.pop(code2b, None)
                p2.pop(code2t, None)

            # All of part5 and matching part2
            for _, code5, label, _ in rows5:
                yield _Language('',
                                code5 if code5 in p2 else '',
                                code5 if code5 in p2 else '',
                                p1_by_name.get(label, ['', ''])[1],
                                label, '', '', '', code5)
                p2.pop(code5, None)

            # Rest of part2
            p2.pop('qaa-qtz', None)  # Is not a real code, but a range
            for _, code2, label, _ in p2.values():
                parts = [x.strip() for x in label.split('|')]
                # Materialise the pairs: a bare zip() is a one-shot iterator
                # on Python 3 and would be empty after its first traversal.
                yield _Language('', code2, code2,
                                p1_by_name.get(label, ['', ''])[1],
                                parts[0], '', '',
                                list(zip(parts[1:], parts[1:])), '')

        return list(generate())

    @lazy_property
    def part3(self):
        """Dict mapping each non-empty part3 code to its language object."""
        return dict((x.part3, x) for x in self.languages if x.part3)

    @lazy_property
    def part2b(self):
        """Dict mapping each non-empty part2b code to its language object."""
        return dict((x.part2b, x) for x in self.languages if x.part2b)

    @lazy_property
    def part2t(self):
        """Dict mapping each non-empty part2t code to its language object."""
        return dict((x.part2t, x) for x in self.languages if x.part2t)

    @lazy_property
    def part1(self):
        """Dict mapping each non-empty part1 code to its language object."""
        return dict((x.part1, x) for x in self.languages if x.part1)

    @lazy_property
    def part5(self):
        """Dict mapping each non-empty part5 code to its language object."""
        return dict((x.part5, x) for x in self.languages if x.part5)

    @lazy_property
    def name(self):
        """
        Dict mapping language names to language objects. Both the main name
        and the first element of every entry in ``names`` are used as keys.
        """
        def gen():
            for x in self.languages:
                if x.name:
                    yield x.name, x
                for n in x.names:
                    yield n[0], x

        return dict(gen())

    @lazy_property
    def inverted(self):
        """Dict mapping each non-empty inverted name to its language object."""
        return dict((x.inverted, x) for x in self.languages if x.inverted)

    @lazy_property
    def macro(self):
        """Dict mapping each macro code to the list of languages that carry it."""
        import collections
        m = collections.defaultdict(list)
        for x in self.languages:
            if x.macro:
                m[x.macro].append(x)
        return dict(m)

    @lazy_property
    def retired(self):
        """
        Function for generating retired languages. Returns a dict('code', (datetime, [language, ...], 'description')).

        NOTE: the extra 'sh' entry maps directly to a single language object
        (the one for part3 'hbs'), not to a tuple like the other entries.
        """
        def gen():
            import csv
            import re
            import sys
            from datetime import datetime
            from pkg_resources import resource_filename

            # Ask for UTF-8 explicitly on Python 3 so the result does not
            # depend on the locale or on whether the module-level ``open``
            # has already been rebound elsewhere in this module.
            open_kwargs = {'encoding': 'utf-8'} if sys.version_info[0] >= 3 else {}
            with open(resource_filename(__package__, 'iso-639-3_Retirements.tab'), **open_kwargs) as rf:
                rtd = list(csv.reader(rf, delimiter='\t'))[1:]
            # Retired codes, as a set for constant-time membership tests.
            rc = set(r[0] for r in rtd)
            for i, _, _, m, s, d in rtd:
                d = datetime.strptime(d, '%Y-%m-%d')
                if not m:
                    # No explicit replacement column: collect bracketed
                    # three-letter codes from the description instead.
                    m = re.findall(r'\[([a-z]{3})\]', s)
                if m:
                    m = [m] if isinstance(m, str) else m
                    # Replacements that are themselves retired are skipped.
                    yield i, (d, [self.get(part3=x) for x in m if x not in rc], s)
                else:
                    yield i, (d, [], s)
            yield 'sh', self.get(part3='hbs')  # Add 'sh' as deprecated

        return dict(gen())

    def get(self, **kwargs):
        """
        Simple getter function for languages. Takes 1 keyword/value and returns 1 language object.

        The keyword names the table (e.g. ``part3``, ``name``, ``alpha2``)
        and the value is the key looked up in it. Raises AttributeError when
        the number of keywords is not exactly one or the table name is
        unknown, and KeyError when the value is not in the table.
        """
        if len(kwargs) != 1:
            raise AttributeError('Only one keyword expected')
        key, value = kwargs.popitem()
        return getattr(self, key)[value]