diff --git a/Bag_of_words.md b/Bag_of_words.md new file mode 100644 index 0000000..cbe247d --- /dev/null +++ b/Bag_of_words.md @@ -0,0 +1,38 @@ +bag of words + +irreversibility / stripping away the writer's process... + +book of words + +arbitrariness of the digital +link of language to economy of representation +separation from the body (pronounceable) + +illusion of a universal language + +"unadulterated data" + +what if the separation is not so easy to make + +Un/Structured + +Brin and Page's RESOURCES. + +Web RESOURCES + +ECONOMIES / Trade offs + + +BIG FINISH... + +In announcing Google's impending data center in Mons, Belgian prime minister Di Rupo invoked the link between the history of the mining industry in the region and the present and future interest in "data mining" as practiced by Google. + +Whether bales of cotton, barrels of oil, or bags of words, what links these processes is the way in which the notion of "raw material" obscures the labor and power structures employed to secure them. "Raw" is always relative: "purity" depends on processes of "refinement" that typically carry social/ecological impact. + +Stripping language of order is an act of "disembodiment", detaching it from the acts of writing and reading. The shift from (human) reading to machine reading involves a shift of responsibility from the individual human body to the obscured responsibilities and seemingly inevitable forces of the "machine", be it the machine of a market or the machine of an algorithm. + +The (computer scientist's) view of textual content as "unstructured", be it in a webpage or the OCR scanned pages of a book, reflects a neglect of the processes and labor of writing, editing, design, layout, typesetting, and eventually publishing, collecting and cataloging [11]. + +"Unstructured", to the computer scientist, then means non-conformant to particular forms of machine reading. 
"Structuring" then is a social process by which particular (additional) conventions are agreed upon and employed. The computer scientist often views a text through the eyes of their particular reading algorithm, and in the process (voluntarily) blinds themselves to the work practices which have produced and maintain these "resources". + +Berners-Lee, in exhorting his audience of web publishers not only to publish online but to release "unadulterated" data, betrays a lack of imagination in considering how language is itself structured, and a blindness to the need for more than additional technical standards to connect to existing publishing practices. diff --git a/README.md b/README.md index e8bcc9e..f3b2fc9 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,9 @@ # Graph ![](8b.gif) -![](fig_12_3.png) \ No newline at end of file +![](fig_12_3.png) + +# Mathematical analysis with text +## Bag of Words + +## Term Frequency Inverse Document Frequency (TF-IDF) +Heuristically, it reflects how important a given word is to a document, relative to a given corpus. Used in information retrieval. 
# Patch header retained verbatim from the surrounding diff:
# diff --git a/tfidf.py b/tfidf.py new file mode 100644 index 0000000..965f124 --- /dev/null +++ b/tfidf.py @@ -0,0 +1,191 @@
"""TF-IDF index over a directory of plain-text manifestos.

``create_index`` tokenizes every file in ``txt/`` and writes per-document
TF-IDF scores, raw term counts and sentences to ``index.json``.  The
``request_*`` functions read that file back and shape it for display.
"""
import json
import os
import pprint
import re
from collections import Counter
from math import log, exp

# NOTE(review): ``flask.Markup`` is deprecated since Flask 2.3 and removed in
# Flask 3.0 -- import it from ``markupsafe`` when the Flask version is bumped.
from flask import Markup
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # word tokens only, punctuation dropped
pp = pprint.PrettyPrinter(indent=4)

INDEX_PATH = 'index.json'
TEXT_DIR = 'txt'
# Multiplier applied to a TF-IDF score before it is added to the 100 % base
# font size of a highlighted word.
FONT_SCALE = 50000


def _tfidf_value(tf_count, total_words, idf_count, total_documents):
    """Return ``(tf_count / total_words) * log(total_documents / idf_count)``."""
    tf = tf_count / total_words
    idf = log(total_documents / idf_count)
    return tf * idf


def tfidf(query, words, corpus):
    """Score ``query`` for one document against a corpus.

    Args:
        query: the term to score.
        words: the tokens of the document being scored.
        corpus: a sequence of token collections, one per document.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)``: the number of times
        ``query`` occurs in ``words``, the number of documents in ``corpus``
        that contain it, and the product of term frequency and inverse
        document frequency.  The value is ``0.0`` when ``words`` is empty or
        no document contains ``query`` (both used to raise
        ``ZeroDivisionError``).
    """
    tf_count = sum(1 for word in words if word == query)
    idf_count = sum(1 for document in corpus if query in document)
    if not words or idf_count == 0:
        return tf_count, idf_count, 0.0
    value = _tfidf_value(tf_count, len(words), idf_count, len(corpus))
    return tf_count, idf_count, value


def load_text_files(directory=TEXT_DIR):
    """Read and tokenize every file in ``directory``.

    Args:
        directory: folder holding the text files; defaults to ``txt``.

    Returns:
        A tuple ``(files, corpus, sentences)``: the file names in sorted
        order, a parallel list with each file's word tokens in reading
        order, and a dict mapping each file name to its list of sentences.
    """
    files = []
    corpus = []
    sentences = {}

    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), 'r') as handle:
            text = handle.read()
        corpus.append(list(tokenizer.tokenize(text)))
        sentences[filename] = sent_tokenize(text)
        files.append(filename)

    print('*txt files loaded*')
    return files, corpus, sentences


def create_index():
    """Build the TF-IDF index for all text files and write ``index.json``.

    The written structure is::

        {
            '<file name>': {
                'sentences': ['First sentence.', ...],
                'words': {'<word>': <tf-idf value>, ...},
                'tf': {'<word>': <occurrences in this file>, ...}
            }
        }

    ``words`` and ``tf`` are always present, also for a file without any
    tokens, so readers of the index can rely on both keys.
    """
    files, corpus, sentences = load_text_files()

    # Count once, per word, how many documents contain it instead of
    # rescanning the whole corpus for every single word occurrence.
    document_frequency = Counter()
    for words in corpus:
        document_frequency.update(set(words))
    total_documents = len(corpus)

    index = {}
    for manifesto, words in zip(files, corpus):
        entry = {'sentences': sentences[manifesto], 'words': {}, 'tf': {}}
        total_words = len(words)
        for word, tf_count in Counter(words).items():
            entry['words'][word] = _tfidf_value(
                tf_count, total_words, document_frequency[word], total_documents)
            entry['tf'][word] = tf_count
        index[manifesto] = entry

    with open(INDEX_PATH, 'w') as out:
        json.dump(index, out, indent=4, sort_keys=True)
    print('*index created*')


def load_index(path=INDEX_PATH):
    """Return the index stored as JSON at ``path`` (default ``index.json``)."""
    with open(path) as handle:
        return json.load(handle)


def request_results(query):
    """Find the manifestos that use ``query`` and the sentences it occurs in.

    Args:
        query: the search word; surrounding whitespace is stripped.

    Returns:
        A tuple ``(results, files)``.  ``files`` lists every manifesto in
        the index.  ``results`` maps a rank (0 = highest TF-IDF value) to::

            {
                'tfidf': <tf-idf value of the query in this manifesto>,
                'name': '<file name>',
                'tf': <occurrences of the query>,
                'total': <number of distinct words in the manifesto>,
                'sentences': [<Markup of each sentence containing the query>]
            }
    """
    query = query.strip()
    index = load_index()
    files = list(index)

    # Collect the manifestos that contain the query word, best score first.
    matches = []
    for manifesto, entry in index.items():
        words = entry.get('words', {})
        if query in words:
            matches.append([words[query], manifesto, entry['tf'][query],
                            len(words), entry['sentences']])
    matches.sort(key=lambda match: (match[0], match[1]), reverse=True)

    results = {}
    for rank, (value, manifesto, tf, total, sentences) in enumerate(matches):
        results[rank] = {
            'tfidf': value,
            'name': manifesto,
            'tf': tf,
            'total': total,
            'sentences': sentences,
        }

    pp.pprint(results)

    # ``\b`` boundaries line up with the ``\w+`` tokens the index was built
    # from, so the word is also highlighted at the start or end of a sentence
    # and the punctuation next to it is left in place.
    pattern = re.compile(r'\b' + re.escape(query) + r'\b')

    for result in results.values():
        font_size = 100 + result['tfidf'] * FONT_SCALE
        # NOTE(review): the reviewed source formatted ' {} ' with two
        # arguments, which replaced the word by the font size.  The markup
        # that presumably wrapped the placeholders was not recoverable; this
        # tag is a reconstruction -- confirm against the templates/CSS.
        replacement = '<strong style="font-size:{}%;">{}</strong>'.format(
            font_size, query)
        # Sentences come from the local text files and are passed to Markup
        # unescaped, i.e. they are trusted as HTML.
        result['sentences'] = [
            Markup(pattern.sub(lambda _match: replacement, sentence))
            for sentence in result['sentences']
            if query in tokenizer.tokenize(sentence)
        ]

    print('*results returned*')
    return results, files


def request_ordered():
    """Return every manifesto's words ordered by descending TF-IDF value.

    Returns:
        A tuple ``(results, files)`` where ``results`` maps each file name
        to a list of ``[value, word]`` pairs, highest value first, and
        ``files`` lists every manifesto in the index.
    """
    index = load_index()
    files = list(index)
    results = {
        manifesto: sorted(
            ([value, word] for word, value in entry.get('words', {}).items()),
            reverse=True)
        for manifesto, entry in index.items()
    }
    return results, files


def request_ordered_all():
    """Return the words of all manifestos in one list, lowest TF-IDF first.

    Returns:
        A tuple ``(results, files)`` where ``results`` is an ascending list
        of ``[value, word, i]`` and ``i`` is the position of the word's
        manifesto in ``files``.
    """
    index = load_index()
    files = list(index)
    results = sorted(
        [value, word, i]
        for i, entry in enumerate(index.values())
        for word, value in entry.get('words', {}).items()
    )
    return results, files


# Remainder of the surrounding patch (todo.md hunk), retained verbatim:
# diff --git a/todo.md b/todo.md index 42545e9..36d92aa 100644 --- a/todo.md +++ b/todo.md @@ -1,3 +1,6 @@ work on disruptive system text. cat >> to append new line! convert key:value value to integers +todo 1: []tf-idf []markov []concord []conway +todo 2: txt2img +todo 3: disruption sort