You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
74 lines
1.9 KiB
Python
74 lines
1.9 KiB
Python
5 years ago
|
from __future__ import print_function
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
from builtins import str, bytes, dict, int
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
import codecs
|
||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||
|
|
||
|
import pattern.text as text_module
|
||
|
from pattern.text.en.wordlist import STOPWORDS
|
||
|
|
||
|
# Collect the path of every entry in ./texts (relative to the current working
# directory). os.listdir() returns entries in arbitrary order and the script
# later analyses texts[0], so the listing is sorted to make the analysed
# document the same on every run and every filesystem.
paths = ['./texts/' + name for name in sorted(os.listdir('./texts'))]
|
||
|
|
||
|
# Read every collected file, decoded as latin-1, into the `texts` list.
IS_PY2 = sys.version_info[0] < 3

texts = []
for path in paths:
    with codecs.open(path, "rb", encoding='latin-1') as handle:
        content = handle.read()
    # Python 2 keeps what read() returned; Python 3 passes it through str().
    texts.append(content if IS_PY2 else str(content))
|
||
|
|
||
|
# First run: build the ngrammer from all texts (no stopword list is passed)
# and apply it to the first text.
ng = text_module.train_topmine_ngrammer(texts, threshhold=1, regexp="[^a-zA-Z0-9]")
ngrams = text_module.topmine_ngramms(texts[0], ng, threshhold=1)

print("\n")

# Bucket the extracted keys by their number of "_"-separated parts:
# two parts are reported as bigrams, three as trigrams, others are ignored.
buckets = {2: [], 3: []}
for key in ngrams.keys():
    bucket = buckets.get(len(key.split("_")))
    if bucket is not None:
        bucket.append(key)
bigrams = buckets[2]
trigrams = buckets[3]

print("Extracted {} bigrams:\n".format(len(bigrams)))
print(bigrams)
print("\n")

print("Extracted {} trigrams:\n".format(len(trigrams)))
print(trigrams)
print("\n")
|
||
|
|
||
|
|
||
|
# As the first run shows, the extracted ngrams contain many stopwords, so it
# is important to remove all stopwords before applying the algorithm.
# Second run: same calls, but the ngrammer is built with stopwords=STOPWORDS.
ng = text_module.train_topmine_ngrammer(texts, threshhold=1, regexp="[^a-zA-Z0-9]", stopwords=STOPWORDS)
ngrams = text_module.topmine_ngramms(texts[0], ng, threshhold=1)

print("\n")

# Bucket the extracted keys by their number of "_"-separated parts:
# two parts are reported as bigrams, three as trigrams, others are ignored.
buckets = {2: [], 3: []}
for key in ngrams.keys():
    bucket = buckets.get(len(key.split("_")))
    if bucket is not None:
        bucket.append(key)
bigrams = buckets[2]
trigrams = buckets[3]

print("Extracted {} bigrams (removed stopwords):\n".format(len(bigrams)))
print(bigrams)
print("\n")

print("Extracted {} trigrams (removed stopwords):\n".format(len(trigrams)))
print(trigrams)
print("\n")
|