You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
30 lines
1.1 KiB
Python
30 lines
1.1 KiB
Python
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

import pprint

# Glossary used throughout this script:
# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

# A single example document (plain string).
document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

# The example corpus: nine short documents, one string per document.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus

# Create a set of frequent words
# (a set gives O(1) membership tests when filtering tokens).
stoplist = set('for a of the and to in'.split())

# Lowercase each document, split it by white space and filter out stopwords