#### PATTERN | VECTOR | WORDLIST ###################################################################
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern
####################################################################################################
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range
import os
from io import open
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

class Wordlist(object):

    def __init__(self, name, data=[]):
        """ Lazy read-only list of words, loaded from <name>.txt on first access.
        """
        self._name = name
        self._data = data

    def _load(self):
        # The wordlist file is a single line of comma-separated words.
        if not self._data:
            self._data = open(os.path.join(MODULE, self._name + ".txt")).read().split(", ")

    def __repr__(self):
        self._load()
        return repr(self._data)

    def __iter__(self):
        self._load()
        return iter(self._data)

    def __len__(self):
        self._load()
        return len(self._data)

    def __contains__(self, w):
        self._load()
        return w in self._data

    def __add__(self, iterable):
        # Merging with another iterable returns a new, sorted, in-memory Wordlist.
        self._load()
        return Wordlist(None, data=sorted(self._data + list(iterable)))

    def __getitem__(self, i):
        self._load()
        return self._data[i]

    def __setitem__(self, i, v):
        self._load()
        self._data[i] = v

    def insert(self, i, v):
        self._load()
        self._data.insert(i, v)

    def append(self, v):
        self._load()
        self._data.append(v)

    def extend(self, v):
        self._load()
        self._data.extend(v)

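# Usage sketch (illustrative; the words queried below are assumptions, not
# guaranteed to appear in the bundled .txt files):
#     w = Wordlist("stopwords")   # Nothing is read from disk yet.
#     "the" in w                  # First access triggers _load() and reads stopwords.txt.
#     len(w + ["zzz"])            # __add__ returns a new, sorted, in-memory Wordlist.
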
ACADEMIC = Wordlist("academic") # English academic words.
BASIC = Wordlist("basic") # English basic words (850) that express 90% of concepts.
PROFANITY = Wordlist("profanity") # English swear words.
TIME = Wordlist("time") # English time and date words.
STOPWORDS = Wordlist("stopwords") # English stop words ("a", "the", ...).
# Note: if used for lookups, performance can be increased by using a dict:
# blacklist = dict.fromkeys(PROFANITY+TIME, True)
# for i in range(1000):
# corpus.append(Document(src[i], exclude=blacklist))
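#
# A minimal, self-contained sketch of the same idea (illustrative; the tokens
# below are assumptions and may or may not occur in the bundled lists):
#     blacklist = dict.fromkeys(PROFANITY + TIME, True)
#     tokens = ["see", "you", "at", "noon", "tomorrow"]
#     clean = [w for w in tokens if w not in blacklist]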