You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

74 lines
1.9 KiB
Python

from __future__ import print_function
from __future__ import unicode_literals
from builtins import str, bytes, dict, int
import os
import sys
import codecs
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import pattern.text as text_module
from pattern.text.en.wordlist import STOPWORDS
# Collect the path of every entry in ./texts (relative to the current working
# directory). os.listdir() returns names in arbitrary order, so the listing is
# sorted: texts[0] is later used as the document to extract n-grams from, and
# it has to be the same file on every run and platform.
paths = [os.path.join('./texts', name) for name in sorted(os.listdir('./texts'))]

# Read each file fully into memory as text. codecs.open() decodes the content
# as latin-1, so read() already returns a text string on both Python 2 and 3;
# the former per-version branch (plain read() on Python 2, str(read()) on
# Python 3) produced the same value in both arms and is collapsed here.
texts = []
for p in paths:
    with codecs.open(p, "rb", encoding='latin-1') as f:
        texts.append(f.read())
# First run: train the ngrammer on all loaded texts without any stopword
# filtering, then extract the n-grams of the first text only.
# NOTE: `threshhold` (sic) is the keyword name these calls use.
ng = text_module.train_topmine_ngrammer(
    texts,
    threshhold=1,
    regexp="[^a-zA-Z0-9]",
)
ngrams = text_module.topmine_ngramms(texts[0], ng, threshhold=1)

print("\n")

# Bucket the extracted keys by their number of "_"-separated parts; only keys
# with exactly two parts (bigrams) or three parts (trigrams) are kept.
by_size = {2: [], 3: []}
for key in ngrams.keys():
    bucket = by_size.get(len(key.split("_")))
    if bucket is not None:
        bucket.append(key)
bigrams = by_size[2]
trigrams = by_size[3]

# Report each group: a count header, the list itself, then a blank separator.
for header, found in (
    ("Extracted {} bigrams:\n", bigrams),
    ("Extracted {} trigrams:\n", trigrams),
):
    print(header.format(len(found)))
    print(found)
    print("\n")
# The run above shows that the extracted n-grams contain many stopwords, so it
# is important to remove all stopwords before applying the algorithm.
# Second run: same training and extraction calls, but STOPWORDS is passed to
# the training step.
ng = text_module.train_topmine_ngrammer(
    texts,
    threshhold=1,
    regexp="[^a-zA-Z0-9]",
    stopwords=STOPWORDS,
)
ngrams = text_module.topmine_ngramms(texts[0], ng, threshhold=1)

print("\n")

# Split the keys into bigrams and trigrams by counting their "_"-separated
# parts; keys of any other length are ignored.
bigrams = []
trigrams = []
targets = {2: bigrams, 3: trigrams}
for key in ngrams.keys():
    parts = key.split("_")
    if len(parts) in targets:
        targets[len(parts)].append(key)

# Print the bigram report, then the trigram report, each followed by a blank
# separator.
bigram_header = "Extracted {} bigrams (removed stopwords):\n"
print(bigram_header.format(len(bigrams)))
print(bigrams)
print("\n")

trigram_header = "Extracted {} trigrams (removed stopwords):\n"
print(trigram_header.format(len(trigrams)))
print(trigrams)
print("\n")