Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make

Branch: master · ange committed 7 years ago · commit b66c5f7d54

.DS_Store (BIN, vendored) · Binary file not shown.

.gitignore (vendored) · 2 lines changed

@@ -1,3 +1,3 @@
images/**
output/**
src/index.json

@@ -1,4 +1,5 @@
images=$(sort $(wildcard images/*.jpg))
# TODO @andre: make the wildcard match any image file while excluding the listimg.txt file
output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp)
space:= $(empty) $(empty)
@@ -31,12 +32,13 @@ clean: ## removes output (target) files
dirs: ## create the dirs in working dir
	@-mkdir -p images/
	@-mkdir -p images-tiff/
	@-mkdir -p output/
	@-mkdir -p ocr/
	@-mkdir -p hocr/
	@echo $(color_r)'Directories made': images/ output/
testif:
ifeq ($(OS),Darwin)
	@echo $(OS)
@@ -49,13 +51,31 @@ ocr/output.txt: ## ocr with tesseract
	echo $(listimgs) > $(@D)/list.txt
	@echo $(basename $@)
	tesseract $(@D)/list.txt $(basename $@)
	python3 src/build_database.py $(@)
tiffs: ## convert images/ to images-tiff/ Depends on ImageMagick
	echo $(images)
	for i in $(images); \
	do tiff=`basename $$i .jpg`.tiff; \
	convert -density 300 $$i -alpha on images-tiff/$$tiff; \
	echo $$tiff; \
	done;
hocrs: ## hocr with tesseract and then change extension to .html
	for i in images-tiff/*.tiff; \
	do echo $$i; hocrfile=`basename $$i .tiff`; \
	tesseract $$i hocr/$$hocrfile hocr; \
	mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
	done;
# OUTPUT GENERATION RECIPES
output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, nltk's averaged_perceptron_tagger
	cat $< | python3 src/wordtagger.py > $(@)
output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
	mkdir -p output/wordtagger
	cp src/wordtagger/jquery.min.js output/wordtagger
	cp src/wordtagger/script.js output/wordtagger
	cp src/wordtagger/style.css output/wordtagger
	cat $< | python3 src/wordtagger/wordtagger.py
# install nltk's 'averaged_perceptron_tagger':
# $ python3
# >>> import nltk
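# (The rest of this comment falls outside the diff hunk. A hedged sketch of the
# usual next step with NLTK's downloader, which may differ from the author's
# original lines:)
# >>> nltk.download('averaged_perceptron_tagger')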
@@ -65,9 +85,16 @@ output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dep
	cat $< | python3 src/textbotconversation.py $(@)
output/n7.txt: ocr/output.txt ## DESCRIBE WHAT IT DOES. Dependencies: python3's chatterbot
output/n7.txt: ocr/output.txt ## Replaces nouns with the 7th noun that follows. Dependencies: 91k_nouns
	cat $< | python3 src/n_7.py > $(@)
output/carlandre.txt: ocr/output.txt ## Alice: Creates visual poetry out of a text. Dependencies: pytest
	cat $< | python3 src/carlandre.py > $(@)
# cat $(@) > /dev/usb/lp0
output/overunder: ocr/output.txt ## Alice: An interpreted language that translates simple weaving instructions and creates a weaving pattern on the text.
	python3 src/overunder.py
visualization: $(images) $(tmpfile) ## Creates a data visualization from images/*.jpg. Dependencies: mplayer
	@echo $(tmpfile)
@@ -83,3 +110,6 @@ endif
ttssr-human-only: ocr/output.txt ## Loop: text-to-speech / speech recognition. Dependencies: espeak, pocketsphinx
	bash src/ttssr-loop-human-only.sh ocr/output.txt
chatbook: ocr/output.txt ## Chatbot based on the knowledge of the scans. Dependencies: rake_nltk, irc, nltk
	python3 src/chatbook.py
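# A hedged end-to-end sketch using the targets above (illustrative only; it
# assumes scans have already been copied into images/ and that prerequisites
# defined outside this hunk are in place):
# $ make dirs
# $ make tiffs
# $ make ocr/output.txt
# $ make output/carlandre.txt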

@@ -0,0 +1,3 @@
images/0029.jpg

@@ -0,0 +1,37 @@
ZEROS + ONES: DIGITAL WOMEN + THE NEW TECHNOCULTURE
moments of unknown, disconnected lives, "invisible voices
conducted through the tips of her fingers."
Poised as an interface between man and the world, she is
also wired to a network of digital machines: typists connected to
QWERTY alphabets, bodies shaped by the motion of the keys,
one hundred words a minute, viral speed. Thousands of opera-
tors, relays, calls, exchanges humming in virtual conjunction,
learning the same phrases, flipping the same switches,
repeating the same responses, pushing plugs into the
answering jacks, maybe two hundred, three hundred times an
hour. She has "a fingertip mastery of the ringing, listening, dial,
and other keys on her key shelf; of the row or rows of cords for
making connections; of the location and meaning of all parts of
the honeycombed formation of jacks and trunks for recording,
for switching, for toll circuits, for tandem, for information." It
becomes second nature; it grows on her. "Having done this stuff
a few hundred thousand times, you become quite good at it. In
fact you're plugging, and connecting, and disconnecting ten,
twenty, forty cords at a time." After a while these processes
become "quite satisfying in a way, rather like weaving on an
upright loom."
102

@@ -0,0 +1,38 @@
import json
import argparse
import sys

from nltk.tokenize import sent_tokenize, word_tokenize
from rake_nltk import Rake

r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()

with open('src/index.json') as f:
    try:
        index = json.load(f)
    except json.JSONDecodeError:
        index = {}

# build the index of sentences organized by keywords
alltext = ""
for n in args.text:
    text = open(n).read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in index:
                index[key] = []
            index[key].append({'filename': n, 'sentence': sentence, 'key': key})
    alltext += text

#print(index)
with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)
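A hedged sketch of the structure this script writes back to src/index.json: each
RAKE keyphrase maps to a list of the sentences it was extracted from (the
keyphrase and sentence below are invented for illustration).

# Illustrative only:
# {
#   "upright loom": [
#     {"filename": "ocr/output.txt",
#      "sentence": "... rather like weaving on an upright loom.",
#      "key": "upright loom"}
#   ]
# }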

@@ -0,0 +1,129 @@
import pytest
from math import ceil
import sys
from sys import stdout
import time
import os.path


def pop_items(words, num_items):
    ''' Removes num_items from words.'''
    if not words:
        return [], []
    if num_items > len(words):
        raise ValueError('Not enough items!')
    popped = []
    for number in range(num_items):
        removed = words.pop(0)
        popped.append(removed)
    return popped, words


def all_words_less_than(words, maxlength):
    ''' Checks if the words have the correct length given in maxlength'''
    for word in words:
        if len(word) > maxlength:
            return False
    return True


def filterwords(words, maxlength):
    ''' Puts the words which have the correct length in a new list '''
    goodwords = []
    for word in words:
        if len(word) <= maxlength and len(word) >= 2:
            goodwords.append(word)
    return goodwords


def pattern(words, maxlength):
    goodwords = filterwords(words, maxlength)
    items_pattern = maxlength + (maxlength - 4)
    if len(goodwords) % items_pattern != 0:
        rest = len(goodwords) % items_pattern
        difference = len(goodwords) - rest
        goodwords = goodwords[:difference]
    times = int(len(words) / items_pattern)
    final_pattern = []
    for each_time in range(times):
        popped, whatisleft = pop_items(goodwords, items_pattern)
        if not popped:
            continue
        goodwords = whatisleft
        middle = ceil(len(popped)/2)
        ascending = sorted(popped[:middle], key=len)
        descending = sorted(popped[middle:], key=len, reverse=True)
        sorted_pattern = ascending + descending
        final_pattern.append(sorted_pattern)
    return final_pattern


def test_pattern_returns_list():
    list_items = ['a', 'b', 'c', 'd', 'e']
    assert type(pattern(list_items, 3)) == type([])


def test_pattern_removes_over_max_len():
    list_words_right_length = [['a', 'aa', 'aaa', 'aa', 'a']]
    words_wrong_length = list_words_right_length[0] + ['aaaaa']
    assert pattern(words_wrong_length, 3) == list_words_right_length


def test_pop_items():
    assert pop_items(['a', 'aaa'], 1) == (['a'], ['aaa'])


def test_pop_items_empty_list():
    assert pop_items([], 70) == ([], [])


def test_pop_items_num_too_big():
    with pytest.raises(ValueError):
        pop_items(['a', 'b'], 3)


def test_cuts_for_pattern():
    list_with_nine = ['a'] * 9
    result = pattern(list_with_nine, 3)
    assert len(result[0]) == 5


def test_empty_list_for_pattern():
    result = pattern([], 3)
    assert result == []


def test_list_too_short_for_pattern():
    list_too_short = ['a', 'aa']
    result = pattern(list_too_short, 3)
    assert result == []


if __name__ == '__main__':
    with open('ocr/output.txt', 'r') as handle:
        contents = handle.read()
    splitted = contents.split()
    ll = pattern(splitted, 8)
    my_list = []
    for l in ll:
        for x in l:
            my_list.append(x)
    joined_list = '\n'.join(my_list)

    my_path = '/dev/usb/lp0'
    if os.path.exists(my_path):
        sys.stdout = open(my_path, 'w')

    escpos = {
        "init_printer": "\x1B\x40",
        'papercut': '\x1D\x56\x00',
    }

    for i in range(10):
        print(escpos['init_printer'])
        print(joined_list)
        print(escpos['papercut'])
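A hedged usage sketch of pattern() (the word list below is invented for
illustration): with maxlength=8 a group holds 8 + (8-4) = 12 length-filtered
words, the first six sorted short-to-long and the last six long-to-short, so
each printed stanza swells and then tapers.

# Illustrative only, not part of the committed file:
words = ['in', 'at', 'the', 'and', 'cords', 'fact',
         'plugging', 'humming', 'virtual', 'same', 'you', 'are']
stanza = pattern(words, 8)[0]
print('\n'.join(stanza))  # grows from 'in' to 'plugging', then tapers to 'are'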

@@ -0,0 +1,79 @@
import irc.bot
from rake_nltk import Rake
import random
from nltk.tokenize import sent_tokenize, word_tokenize
import json
#from thread import start_new_thread
import os

r = Rake()


def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]


class HelloBot(irc.bot.SingleServerIRCBot):
    def __init__(self, channel, nickname, server, port=6667, index=None):
        print("connecting to chatroom...")
        irc.bot.SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
        self.channel = channel
        self.index = index

    def on_welcome(self, c, e):
        c.join(self.channel)
        print("joined chatroom")

    def on_privmsg(self, c, e):
        pass

    def on_pubmsg(self, c, e):
        print(e.arguments, e.source)
        msg = e.arguments[0]
        print(e.source.split("!")[0][:1])
        incoming_msg = e.arguments[0]
        r.extract_keywords_from_text(msg)
        listOfKeys = r.get_ranked_phrases()
        msg_where = ""
        # respond unless the sender's nickname both starts with "A" and ends in "bot" (its own naming scheme)
        if e.source.split("!")[0][-3:] != "bot" or e.source.split("!")[0][:1] != "A":
            print("true")
            for keyWord in listOfKeys:
                if keyWord in self.index:
                    msg = self.index.get(keyWord)[0].get('sentence')
                    msg_where = "I found this in {}".format(self.index.get(keyWord)[0].get('filename'))
                else:
                    msg = "I don't know anything about that"
                    msg_where = ""
            for chunk in chunks(msg, 400):
                print(chunk)
                c.privmsg(self.channel, chunk)
        else:
            print("bot")


if __name__ == "__main__":
    import argparse
    import sys

    ap = argparse.ArgumentParser("IRC Bot")
    ap.add_argument("--server", default="irc.freenode.net")
    ap.add_argument("--port", type=int, default=6667)
    ap.add_argument("--channel", default="#pzi")
    ap.add_argument("--nickname", default="scanbot")
    ap.add_argument("--text", nargs="+", help="database to use", default="index.json")
    args = ap.parse_args()

    # build the index of sentences organized by keywords
    with open("src/index.json") as f:
        try:
            index = json.load(f)
        except json.JSONDecodeError:
            index = {}
    #print(index)

    myhost = os.uname()[1]
    bot = HelloBot(args.channel, "A-2{}-bot".format(len(index)), args.server, args.port, index)
    bot.start()
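A hedged invocation sketch (the flags correspond to the argparse options above;
run it from the repository root so that src/index.json resolves; the bot names
itself "A-2<N>-bot", where N is the number of keyphrases in the index):

# Illustrative only:
# $ python3 src/chatbook.py --server irc.freenode.net --channel "#pzi"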

File diff suppressed because one or more lines are too long

@@ -0,0 +1,90 @@
import linecache
import textwrap
import sys
from sys import exit


class LeavingProgram(Exception):
    pass


def parse(program):
    cmds = program.split(',')
    splitted_cmds = []
    for cmd in cmds:
        splitted = cmd.split()
        splitted_cmds.append(splitted)
    return splitted_cmds
    #return tokenize(program)


def tokenize(s):
    return s.split()


def repl():
    while True:
        try:
            val = eval(parse(input('> ')))
            if val is not None:
                print(val)
        except LeavingProgram:
            break


text = None
line_number = 0
last_index = 0


def eval(cmds):
    global text
    global line_number
    global last_index
    for cmd in cmds:
        if cmd == []:
            line_number += 1
            last_index = 0
        elif cmd[0] == 'load':
            contents = open('ocr/output.txt').read()
            text = textwrap.wrap(contents, 40, break_long_words=True)
            print('\n'.join(text))
            line_number = 0
            last_index = 0
        elif cmd[0] == 'show':
            print(text[line_number])
        elif cmd[0] == 'under':
            current_line = text[line_number]
            char_number = int(cmd[1]) - 1
            char_list = list(current_line)
            x = range(last_index, char_number + last_index + 1)
            for time in x:
                if time < len(char_list):
                    char_list[time] = u'\u21e2'
            last_index += char_number + 1
            joined = ''.join(char_list)
            text[line_number] = joined
        elif cmd[0] == 'over':
            last_index += int(cmd[1])
        elif cmd[0] == 'pattern':
            pattern = text[0:line_number + 1]
            print('\n'.join(pattern))
        elif cmd[0] == 'quit':
            print('Come back soon!')
            raise LeavingProgram()
        else:
            joined = ' '.join(cmd)
            print('Did not understand command {}'.format(joined))


if __name__ == '__main__':
    repl()
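A hedged session sketch for this repl (presumably src/overunder.py, invoked by
the Makefile's output/overunder target). Commands are separated by commas, and
an empty command moves to the next line; exact output depends on ocr/output.txt.

# Illustrative only:
# > load                wrap ocr/output.txt to 40 columns and print it
# > under 3, over 2     weave: cover the next 3 characters with U+21E2 arrows, then skip 2
# > , show              step to the next line and print it
# > pattern             print every line woven so far
# > quit                leave the repl ("Come back soon!")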

@@ -1,82 +0,0 @@
import nltk
from sys import stdin, stdout

# Define input
input = stdin.read()

# FILTER FUNCTIONS
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
def postagger(string):
    words = nltk.word_tokenize(string)
    taggedwordlist = nltk.pos_tag(words)
    for word, pos in nltk.pos_tag(words):
        taggedwordlist = nltk.pos_tag(words)
        #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
    taglist = [ pos for word, pos in taggedwordlist ]
    #print(taglist)
    return taglist

# This function changes the tags to readable equivalents (NNP to noun for example)
def postagger_readable(list):
    readabletaglist = []
    for tag in list:
        if tag in {"NNP","NNS","NN","NNPS"}:
            readabletag = 'noun'
        elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
            readabletag = 'verb'
        elif tag in {'RB','RBR','RBS','WRB'}:
            readabletag = 'adverb'
        elif tag in {'PRP','PRP$'}:
            readabletag = 'pronoun'
        elif tag in {'JJ','JJR','JJS'}:
            readabletag = 'adjective'
        elif tag == 'IN':
            readabletag = 'preposition'
        elif tag == 'WDT':
            readabletag = 'determiner'
        elif tag in {'WP','WP$'}:
            readabletag = 'pronoun'
        elif tag == 'UH':
            readabletag = 'interjection'
        elif tag == 'POS':
            readabletag = 'possesive ending'
        elif tag == 'SYM':
            readabletag = 'symbol'
        elif tag == 'EX':
            readabletag = 'existential there'
        elif tag == 'DT':
            readabletag = 'determiner'
        elif tag == 'MD':
            readabletag = 'modal'
        elif tag == 'LS':
            readabletag = 'list item marker'
        elif tag == 'FW':
            readabletag = 'foreign word'
        elif tag == 'CC':
            readabletag = 'coordinating conjunction'
        elif tag == 'CD':
            readabletag = 'cardinal number'
        elif tag == 'TO':
            readabletag = 'to'
        elif tag == '.':
            readabletag = 'line ending'
        elif tag == ',':
            readabletag = 'comma'
        else:
            readabletag = tag
        readabletaglist.append(readabletag)
    return readabletaglist

# This function creates the output
def main():
    taglist = postagger(input)
    readabletaglist = postagger_readable(taglist)
    stdout.write(' '.join(readabletaglist))
    stdout.write('\n')

main()

File diff suppressed because one or more lines are too long

@@ -0,0 +1,64 @@
$(document).ready(function(){
    var state = 0;
    $('.noun').addClass('fade-out');
    $('.preposition').addClass('red');
    $('.verb').addClass('blue');
    $('.determiner').addClass('cyan');

    $(document).bind('contextmenu', function(e) { return false; });

    $( ".word" ).contextmenu(function() {
        console.log($(this).hasClass('underline'));
        $(this).hasClass('underline') == false
            ? $(this).addClass('underline')
            : $(this).removeClass('underline');
    });

    $('.word').click( function() {
        var el = $('.word');
        console.log(state);
        if (state == 0) {
            $('.word').removeClass('fade-out red blue cyan');
            $('.stopword').addClass('fade-out');
        }
        else if (state == 1) {
            $('.stopword').removeClass('fade-out');
            $('.neutral').addClass('fade-out');
        }
        else if (state == 2) {
            $('.neutral').removeClass('fade-out');
            $('.noun').addClass('fade-out');
            $('.preposition').addClass('red');
            $('.verb').addClass('blue');
            state = -1;
        }
        $('.word').each(function() {
            var el = $(this);
            if (state == 0) {
                el.empty();
                el.html(el.data("stopword") + "&nbsp;");
            }
            else if (state == 1) {
                el.empty();
                el.html(el.data("sentiment") + "&nbsp;");
            }
            else {
                el.empty();
                el.html(el.data("pos") + "&nbsp;");
            }
        });
        state = state + 1;
    });
});

@@ -0,0 +1,86 @@
* {
    min-height: 0;
    min-width: 0;
}

body {
    background: #639ab2;
    font-size: 15px;
    font-family: 'Ubuntu Mono', monospace;
}

.prelative {
    flex-shrink: 0;
}

div.container {
    width: 100%;
    display: -webkit-box;  /* OLD - iOS 6-, Safari 3.1-6 */
    display: -moz-box;     /* OLD - Firefox 19- (buggy but mostly works) */
    display: -ms-flexbox;  /* TWEENER - IE 10 */
    display: -webkit-flex; /* NEW - Chrome */
    display: flex;
    flex-wrap: wrap;
}

.word {
    font-size: 3rem;
    float: left;
    position: relative;
    text-align: center;
    display: -webkit-box;  /* OLD - iOS 6-, Safari 3.1-6 */
    display: -moz-box;     /* OLD - Firefox 19- (buggy but mostly works) */
    display: -ms-flexbox;  /* TWEENER - IE 10 */
    display: -webkit-flex; /* NEW - Chrome */
    display: flex;
    justify-content: center;
}

.word:before,
.word:after {
    content: '';
    color: #fff;
    position: absolute;
    font-family: 'PT Serif', serif;
    font-weight: bold;
    font-size: 1.5rem;
    font-style: italic;
    opacity: 0;
    width: 100%;
}

.word:before {
    content: attr(data-txt);
    flex-shrink: 1;
}

.word:hover:before,
.word:active:after {
    opacity: 1;
}

.fade-out {
    color: #275152;
}

p {
    margin: 1rem;
}

.red {
    color: red;
}

.blue {
    color: blue;
}

.cyan {
    color: cyan;
}

.underline {
    text-decoration: underline;
}

@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html>
<head>
  <title>Wordtagger</title>
  <meta charset="utf-8" />
  <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
  <script type="text/javascript" src="jquery.min.js"></script>
  <script type="text/javascript" src="script.js"></script>
  <!--meta name="viewport" content="width=device-width"-->
</head>
<body>
  <div class="container"><p>
    {% for item, value in words_and_tags.items() %}
    <span id="{{item}}"
          class="word {{words_and_tags[item]['sentiment']}} {{words_and_tags[item]['wordtype']}} {{words_and_tags[item]['POS']}}"
          data-txt="{{ words_and_tags[item]['word'] }}"
          data-pos="{{words_and_tags[item]['POS']}}"
          {% if words_and_tags[item]['word'] in [',','.','(',')'] %} data-sentiment="{{ words_and_tags[item]['word'] }}" {% else %} data-sentiment='{{ words_and_tags[item]['sentiment'] }}' {% endif %}
          {% if words_and_tags[item]['wordtype'] == 'stopword' %} data-stopword="stopword" {% else %} data-stopword='{{ words_and_tags[item]['word'] }}' {% endif %}
          >{{words_and_tags[item]['POS']}}&nbsp;</span>
    {% endfor %}
  </p>
  </div>
</body>
</html>

@@ -0,0 +1,156 @@
# LIBS
import nltk
import json
import os
from sys import stdin, stdout
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from jinja2 import Template

# == INPUT AND TOKENIZE ==
# Define input, tokenize and save tokens to dictionary. Use index as ID for each word.
input = stdin.read()
words = nltk.word_tokenize(input)
words_and_tags = {'item ' + str(index): {'word': word} for index, word in enumerate(words)}
print(words_and_tags)

# == FILTER FUNCTIONS ==

# === 1. POS_tagger & Named Entity Recognizer ===
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
def POS_tagger(list):
    taggedwordlist = nltk.pos_tag(list)
    for word, pos in nltk.pos_tag(list):
        taggedwordlist = nltk.pos_tag(list)
        #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
    print(taggedwordlist)
    taglist = [ pos for word, pos in taggedwordlist ]
    POS_tags = []
    for tag in taglist:
        if tag in {"NNP","NNS","NN","NNPS"}:
            POS_tag = 'noun'
        elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
            POS_tag = 'verb'
        elif tag in {'RB','RBR','RBS','WRB'}:
            POS_tag = 'adverb'
        elif tag in {'PRP','PRP$'}:
            POS_tag = 'pronoun'
        elif tag in {'JJ','JJR','JJS'}:
            POS_tag = 'adjective'
        elif tag == 'IN':
            POS_tag = 'preposition'
        elif tag == 'WDT':
            POS_tag = 'determiner'
        elif tag in {'WP','WP$'}:
            POS_tag = 'pronoun'
        elif tag == 'UH':
            POS_tag = 'interjection'
        elif tag == 'POS':
            POS_tag = 'possessive ending'
        elif tag == 'SYM':
            POS_tag = 'symbol'
        elif tag == 'EX':
            POS_tag = 'existential there'
        elif tag == 'DT':
            POS_tag = 'determiner'
        elif tag == 'MD':
            POS_tag = 'modal'
        elif tag == 'LS':
            POS_tag = 'list item marker'
        elif tag == 'FW':
            POS_tag = 'foreign word'
        elif tag == 'CC':
            POS_tag = 'coordinating conjunction'
        elif tag == 'CD':
            POS_tag = 'cardinal number'
        elif tag == 'TO':
            POS_tag = 'to'
        elif tag == '.':
            POS_tag = 'line ending'
        elif tag == ',':
            POS_tag = 'comma'
        else:
            POS_tag = tag
        POS_tags.append(POS_tag)
        #print(POS_tag)
    return POS_tags

# === 2. Sentiment tagger ===
# Sentiment analyzer based on the NLTK VADER tagger.
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
def sentiment_tagger(word):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(word).get("compound")
    if score < 0:
        sentiment_tag = 'negative'
    elif score > 0:
        sentiment_tag = 'positive'
    else:
        sentiment_tag = 'neutral'
    return sentiment_tag

# === 3. Stopword tagger ===
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
def stopword_tagger(word):
    stopWords = set(stopwords.words('english'))
    if word in stopWords:
        stopword_tag = 'stopword'
    else:
        stopword_tag = 'keyword'
    return stopword_tag


# Run POS tagger
# This tagger outputs a list for all items in the dict at once
# To avoid double work, it is better to keep this outside the for loop
POS_tags = POS_tagger(words)
i = 0

# Adding tags to words in dictionary, which will be exported as a json file
# {'item 0' : {'word' : word, 'tagger 1': value 1}}
for item, value in words_and_tags.items():
    word = words_and_tags[item]['word']

    # POS
    pos_tag = POS_tags[i]
    words_and_tags[item]['POS'] = pos_tag
    i = i + 1

    # Add sentiment tag
    sentiment_tag = sentiment_tagger(word)
    words_and_tags[item]['sentiment'] = sentiment_tag

    # Add stopword tag
    stopword_tag = stopword_tagger(word)
    words_and_tags[item]['wordtype'] = stopword_tag

    # Add entity tag
    # Not functional yet

# Save data into a json file
print(words_and_tags)
#with open("data.json", 'w') as f:
# os.path.join keeps the path correct whether __file__ is relative or absolute
with open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "output/wordtagger/data.json"), 'w') as f:
    json.dump(words_and_tags, f, ensure_ascii=False)

# let's bind it to a jinja2 template
# Jinja moves up one level by default, so I do not need to do it myself as in line 141
template_open = open("src/wordtagger/template.html", "r")
template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags)
#print(text_render)

# And render an html file!
print(index_render)
index_open = open("output/wordtagger/index.html", "w")
index_open.write(index_render)
index_open.close()
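A hedged sketch of the record this script builds per token and of how the rest
of the pipeline consumes it: the values come straight from the three taggers
above, and template.html / script.js read the same keys back as data-*
attributes and CSS classes.

# Illustrative shape only:
# {'item 0': {'word': <token>,
#             'POS': <readable tag, e.g. 'noun' or 'determiner'>,
#             'sentiment': 'negative' | 'neutral' | 'positive',
#             'wordtype': 'stopword' | 'keyword'},
#  'item 1': {...}, ...}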