Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make
commit
b66c5f7d54
@ -1,3 +1,3 @@
|
|||||||
images/**
|
images/**
|
||||||
output/**
|
output/**
|
||||||
|
src/index.json
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
images/0029.jpg
|
||||||
|
|
@ -0,0 +1,37 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ZEROS + ONES DIGITAL WOMEN + THE NEW TECHNOCULTURE
|
||||||
|
|
||||||
|
moments of unknown, disconnected lives, "invisible voices
|
||||||
|
conducted through the tips of her fingers."
|
||||||
|
|
||||||
|
Poised as an interface between man and the world, she is
|
||||||
|
also wired to a network of digital machines: typists connected to
|
||||||
|
QWERTY alphabets, bodies shaped by the motion of the keys,
|
||||||
|
one hundred words a minute, viral speed. Thousands of opera-
|
||||||
|
tors, relays, calls, exchanges humming in virtual conjunction,
|
||||||
|
learning the same phrases, flipping the same switches,
|
||||||
|
repeating the same responses, pushing plugs into the
|
||||||
|
answering jacks, maybe two hundred, three hundred times an
|
||||||
|
hour. She has "a fingertip mastery of the ringing, listening, dial,
|
||||||
|
and other keys on her key shelf; of the row or rows of cords for
|
||||||
|
making connections; of the location and meaning of all parts of
|
||||||
|
the honey combed formation of jacks and trunks for recording,
|
||||||
|
for switching, for toll circuits, for tandem, for information." It
|
||||||
|
becomes second nature. It grows on her. "Having done this stuff
|
||||||
|
a few hundred thousand times, you become quite good at it. In
|
||||||
|
fact you're plugging, and connecting, and disconnecting ten,
|
||||||
|
twenty, forty cords at a time." After a while these processes
|
||||||
|
become "quite satisfying in a way, rather like weaving on an
|
||||||
|
upright loom,"
|
||||||
|
|
||||||
|
102
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,38 @@
|
|||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||||
|
from rake_nltk import Rake
|
||||||
|
|
||||||
|
# Keyword extractor shared by every sentence of every input file.
r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()

INDEX_PATH = 'src/index.json'

# Start from the index written by a previous run, if there is a usable one.
# A missing or unparsable file simply means "start empty".
try:
    with open(INDEX_PATH) as f:
        index = json.load(f)
except (FileNotFoundError, ValueError):
    index = {}
if not isinstance(index, dict):
    index = {}

# Build the index of sentences organized by keywords:
#   {keyword: [{'filename': ..., 'sentence': ..., 'key': keyword}, ...]}
for n in args.text:
    with open(n) as source:
        text = source.read().replace("\n", " ")

    for sentence in sent_tokenize(text):
        r.extract_keywords_from_text(sentence)
        for key in r.get_ranked_phrases():
            entry = {'filename': n, 'sentence': sentence, 'key': key}
            bucket = index.setdefault(key, [])
            # The index is reloaded on every run, so skip entries that are
            # already stored instead of piling up duplicates.
            if entry not in bucket:
                bucket.append(entry)

with open(INDEX_PATH, 'w') as outfile:
    json.dump(index, outfile)
|
@ -0,0 +1,129 @@
|
|||||||
|
import pytest
|
||||||
|
from math import ceil
|
||||||
|
import sys
|
||||||
|
from sys import stdout
|
||||||
|
import time
|
||||||
|
import os.path
|
||||||
|
|
||||||
|
|
||||||
|
def pop_items(words, num_items):
    """Remove the first ``num_items`` entries from ``words``.

    ``words`` is modified in place.

    Returns:
        A tuple ``(popped, words)``: the removed entries in their original
        order, and the same (now shorter) list object that was passed in.
        An empty ``words`` yields two fresh empty lists whatever
        ``num_items`` is.

    Raises:
        ValueError: if ``num_items`` is larger than ``len(words)``.
    """
    if not words:
        return [], []

    if num_items > len(words):
        raise ValueError('Not enough items!')

    # A negative count removes nothing, as the original range() loop did.
    count = max(num_items, 0)
    # Slice once instead of calling pop(0) repeatedly, which shifts the
    # whole list on every call.
    popped = words[:count]
    del words[:count]
    return popped, words
|
||||||
|
|
||||||
|
def all_words_less_than(words, maxlength):
    """Return True when no word in ``words`` is longer than ``maxlength``.

    Despite the name, a word of exactly ``maxlength`` characters is
    accepted. An empty ``words`` returns True.
    """
    return all(len(word) <= maxlength for word in words)
|
||||||
|
|
||||||
|
def filterwords(words, maxlength):
    """Return a new list with the words that are 2 to ``maxlength`` characters long.

    The order of ``words`` is kept and ``words`` itself is not modified.
    """
    return [word for word in words if 2 <= len(word) <= maxlength]
|
||||||
|
|
||||||
|
|
||||||
|
def pattern(words, maxlength):
    """Arrange words into groups whose word lengths rise and then fall.

    Words are first filtered with ``filterwords`` and then cut into
    consecutive groups of ``maxlength + (maxlength - 4)`` words; leftover
    words that do not fill a whole group are dropped. Within each group the
    first half is sorted by ascending length and the second half by
    descending length.

    Returns:
        A list of groups, each a list of words. The list is empty when there
        are not enough usable words, or when ``maxlength`` is too small for
        a group to hold any word.
    """
    goodwords = filterwords(words, maxlength)
    items_pattern = maxlength + (maxlength - 4)

    # maxlength <= 2 gives a group size of zero or less: no pattern is
    # possible (a size of zero previously raised ZeroDivisionError).
    if items_pattern <= 0:
        return []

    # Count whole groups from the filtered words, not the raw input, so every
    # iteration is guaranteed a full group.
    group_count = len(goodwords) // items_pattern

    final_pattern = []
    for _ in range(group_count):
        popped, goodwords = pop_items(goodwords, items_pattern)

        middle = ceil(len(popped) / 2)
        ascending = sorted(popped[:middle], key=len)
        descending = sorted(popped[middle:], key=len, reverse=True)

        final_pattern.append(ascending + descending)

    return final_pattern
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): with maxlength=3, pattern() keeps only words of 2-3
# characters and builds groups of 3 + (3 - 4) = 2 words. The expectations
# below follow those rules — confirm they match the intended design.

def test_pattern_returns_list():
    list_items = ['a', 'b', 'c', 'd', 'e']
    assert isinstance(pattern(list_items, 3), list)


def test_pattern_removes_over_max_len():
    list_words_right_length = [['aa', 'aaa']]
    words_wrong_length = list_words_right_length[0] + ['aaaaa']
    assert pattern(words_wrong_length, 3) == list_words_right_length


def test_pop_items():
    assert pop_items(['a', 'aaa'], 1) == (['a'], ['aaa'])


def test_pop_items_empty_list():
    assert pop_items([], 70) == ([], [])


def test_pop_items_num_too_big():
    with pytest.raises(ValueError):
        pop_items(['a', 'b'], 3)


def test_cuts_for_pattern():
    # Nine usable words make four full groups of two; the ninth is cut off.
    list_with_nine = ['aa'] * 9
    result = pattern(list_with_nine, 3)
    assert len(result) == 4
    assert len(result[0]) == 2


def test_empty_list_for_pattern():
    result = pattern([], 3)
    assert result == []


def test_list_too_short_for_pattern():
    list_too_short = ['a', 'aa']
    result = pattern(list_too_short, 3)
    assert result == []
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    with open('ocr/output.txt', 'r') as handle:
        contents = handle.read()

    # Flatten the groups returned by pattern() into one word per line.
    groups = pattern(contents.split(), 8)
    joined_list = '\n'.join(word for group in groups for word in group)

    # ESC/POS control sequences sent to the receipt printer.
    escpos = {
        "init_printer": "\x1B\x40",
        'papercut': '\x1D\x56\x00',
    }

    # Print on the USB printer when it is plugged in, otherwise on the
    # terminal. The device is opened explicitly (instead of rebinding
    # sys.stdout) so that it is closed again when printing is done.
    my_path = '/dev/usb/lp0'
    printer = open(my_path, 'w') if os.path.exists(my_path) else sys.stdout
    try:
        # NOTE(review): assumes all three writes belong to the loop, i.e. ten
        # copies are printed — confirm against the original indentation.
        for _ in range(10):
            print(escpos['init_printer'], file=printer)
            print(joined_list, file=printer)
            print(escpos['papercut'], file=printer)
    finally:
        if printer is not sys.stdout:
            printer.close()
|
@ -0,0 +1,79 @@
|
|||||||
|
import irc.bot
|
||||||
|
from rake_nltk import Rake
|
||||||
|
import random
|
||||||
|
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||||
|
import json
|
||||||
|
#from thread import start_new_thread
|
||||||
|
import os
|
||||||
|
|
||||||
|
r = Rake()
|
||||||
|
|
||||||
|
def chunks(l, n):
    """Yield consecutive slices of ``l``, each at most ``n`` items long."""
    total = len(l)
    starts = range(0, total, n)
    yield from (l[start:start + n] for start in starts)
|
||||||
|
|
||||||
|
class HelloBot(irc.bot.SingleServerIRCBot):
    """IRC bot that answers channel messages with sentences from a keyword index.

    ``index`` maps a keyword phrase to a list of entries; each entry is a
    dict with at least a ``'sentence'`` key.
    """

    def __init__(self, channel, nickname, server, port=6667, index=None):
        print("connecting to chatroom...")
        irc.bot.SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
        self.channel = channel
        # Fall back to an empty index so keyword lookups never hit None.
        self.index = index if index is not None else {}

    def on_welcome(self, c, e):
        """Join the configured channel once the server has welcomed the bot."""
        c.join(self.channel)
        print("joined chatroom")

    def on_privmsg(self, c, e):
        """Ignore private messages."""
        pass

    def on_pubmsg(self, c, e):
        """Reply to a channel message with an indexed sentence per keyword."""
        print(e.arguments, e.source)
        msg = e.arguments[0]
        nick = e.source.split("!")[0]
        print(nick[:1])

        r.extract_keywords_from_text(msg)
        listOfKeys = r.get_ranked_phrases()

        # Only a nickname that both starts with "A" and ends in "bot" is
        # treated as a fellow bot and left unanswered.
        if nick[-3:] != "bot" or nick[:1] != "A":
            print("true")
            # NOTE(review): assumes one reply per extracted keyword —
            # confirm against the original indentation.
            for keyWord in listOfKeys:
                # Look up the bot's own index rather than a module global.
                entries = self.index.get(keyWord)
                if entries:
                    # entries[0] also carries 'filename' should a
                    # "found in ..." follow-up be wanted.
                    reply = entries[0].get('sentence')
                else:
                    reply = "I don't know anything about that"

                # Send long sentences in pieces of at most 400 characters.
                for chunk in chunks(reply, 400):
                    print(chunk)
                    c.privmsg(self.channel, chunk)
        else:
            print("bot")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser("IRC Bot")
    ap.add_argument("--server", default="irc.freenode.net")
    ap.add_argument("--port", type=int, default=6667)
    ap.add_argument("--channel", default="#pzi")
    # NOTE(review): --nickname and --text are parsed but not used below; the
    # nickname is derived from the index size and the index path is fixed.
    ap.add_argument("--nickname", default="scanbot")
    ap.add_argument("--text", nargs="+", help="database to use", default="index.json")
    args = ap.parse_args()

    # Load the index of sentences organized by keywords; a missing or
    # unparsable file means the bot starts with an empty index.
    try:
        with open("src/index.json") as f:
            index = json.load(f)
    except (FileNotFoundError, ValueError):
        index = {}

    bot = HelloBot(args.channel, "A-2{}-bot".format(len(index)), args.server, args.port, index)
    bot.start()
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,90 @@
|
|||||||
|
import linecache
|
||||||
|
import textwrap
|
||||||
|
import sys
|
||||||
|
from sys import exit
|
||||||
|
|
||||||
|
class LeavingProgram(Exception):
    """Raised by the ``quit`` command to make the REPL loop stop."""
|
||||||
|
|
||||||
|
def parse(program):
    """Split a comma-separated program into one token list per command.

    An empty command (nothing between two commas) becomes an empty list.
    """
    return [chunk.split() for chunk in program.split(',')]
|
||||||
|
|
||||||
|
#return tokenize(program)
|
||||||
|
def tokenize(s):
    """Break ``s`` into its whitespace-separated tokens."""
    tokens = s.split()
    return tokens
|
||||||
|
|
||||||
|
def repl():
    """Read command lines from the prompt and run them until told to stop.

    Each line is parsed with ``parse`` and run by this module's own ``eval``
    (not the builtin). The loop ends when a command raises
    ``LeavingProgram`` or when the input reaches end-of-file.
    """
    while True:
        try:
            line = input('> ')
        except EOFError:
            # Ctrl-D or exhausted piped input: stop instead of crashing.
            break

        try:
            val = eval(parse(line))
        except LeavingProgram:
            break

        if val is not None:
            print(val)
|
||||||
|
|
||||||
|
# Interpreter state shared by all commands.
text = None        # list of wrapped lines once "load" has run
line_number = 0    # index of the line the commands currently act on
last_index = 0     # column where the next "under" / "over" starts


def _line_available():
    """Report whether the current line exists, printing a hint when it does not."""
    if text is None:
        print('No text loaded yet, use "load" first')
        return False
    if line_number >= len(text):
        print('No more lines in the text')
        return False
    return True


def _count_argument(cmd):
    """Return the integer argument of ``cmd``, or None (after a hint) when it is missing or not a number."""
    try:
        return int(cmd[1])
    except (IndexError, ValueError):
        print('Command {} needs a number'.format(cmd[0]))
        return None


def eval(cmds):
    """Run a list of parsed commands against the loaded text.

    The name deliberately shadows the builtin because ``repl`` calls it.

    Commands (each a list of tokens):
        []          move to the next line and reset the column
        load        read ocr/output.txt, wrap it at 40 columns and print it
        show        print the current line
        under N     replace the next N characters of the current line with
                    arrows and advance the column by N
        over N      advance the column by N without changing the line
        pattern     print every line up to and including the current one
        quit        say goodbye and raise ``LeavingProgram``

    Commands that cannot run (no text loaded, past the last line, missing or
    non-numeric argument) print a hint instead of raising.

    Raises:
        LeavingProgram: on the ``quit`` command.
    """
    global text
    global line_number
    global last_index

    for cmd in cmds:
        if cmd == []:
            line_number += 1
            last_index = 0

        elif cmd[0] == 'load':
            with open('ocr/output.txt') as handle:
                contents = handle.read()
            text = textwrap.wrap(contents, 40, break_long_words=True)
            print('\n'.join(text))
            line_number = 0
            last_index = 0

        elif cmd[0] == 'show':
            if _line_available():
                print(text[line_number])

        elif cmd[0] == 'under':
            count = _count_argument(cmd)
            if count is not None and _line_available():
                char_list = list(text[line_number])

                # Positions past the end of the line are simply skipped.
                for position in range(last_index, last_index + count):
                    if position < len(char_list):
                        char_list[position] = u'\u21e2'

                last_index += count
                text[line_number] = ''.join(char_list)

        elif cmd[0] == 'over':
            count = _count_argument(cmd)
            if count is not None:
                last_index += count

        elif cmd[0] == 'pattern':
            if text is None:
                print('No text loaded yet, use "load" first')
            else:
                print('\n'.join(text[0:line_number + 1]))

        elif cmd[0] == 'quit':
            print('Come back soon!')
            raise LeavingProgram()

        else:
            joined = ' '.join(cmd)
            print('Did not understand command {}'.format(joined))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
repl()
|
@ -1,82 +0,0 @@
|
|||||||
import nltk
|
|
||||||
from sys import stdin, stdout
|
|
||||||
|
|
||||||
# Define input
|
|
||||||
input = stdin.read()
|
|
||||||
|
|
||||||
# FILTER FUNCTIONS
|
|
||||||
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
|
|
||||||
def postagger(string):
    """Tokenize ``string`` and return the POS tag of every token, in order."""
    words = nltk.word_tokenize(string)
    # Tag the whole token list once; re-running the tagger for every single
    # token gives the same result at many times the cost.
    return [pos for _word, pos in nltk.pos_tag(words)]
|
|
||||||
|
|
||||||
# This function changes the tags to readable equivalents (NNP to noun for example)
|
|
||||||
# Readable label for each POS tag that has one.
_READABLE_TAGS = {
    'NNP': 'noun', 'NNS': 'noun', 'NN': 'noun', 'NNPS': 'noun',
    'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb',
    'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
    'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb', 'WRB': 'adverb',
    'PRP': 'pronoun', 'PRP$': 'pronoun',
    'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
    'IN': 'preposition',
    'WDT': 'determiner',
    'WP': 'pronoun', 'WP$': 'pronoun',
    'UH': 'interjection',
    'POS': 'possesive ending',
    'SYM': 'symbol',
    'EX': 'existential there',
    'DT': 'determiner',
    'MD': 'modal',
    'LS': 'list item marker',
    'FW': 'foreign word',
    'CC': 'coordinating conjunction ',
    'CD': 'cardinal number',
    'TO': 'to',
    '.': 'line ending',
    ',': 'comma',
}


def postagger_readable(list):
    """Translate POS tags into readable equivalents (NNP to noun, for example).

    ``list`` is the sequence of tags to translate; the parameter keeps its
    original name for existing callers. Tags without a readable equivalent
    are passed through unchanged.

    Returns:
        A new list with one label per input tag, in the same order.
    """
    return [_READABLE_TAGS.get(tag, tag) for tag in list]
|
|
||||||
|
|
||||||
|
|
||||||
# This function creates the output
|
|
||||||
def main():
    """Tag the module-level ``input`` text and write the readable tags to stdout as one line."""
    readable = postagger_readable(postagger(input))
    stdout.write(' '.join(readable) + '\n')
|
|
||||||
|
|
||||||
main()
|
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,64 @@
|
|||||||
|
$(document).ready(function () {
    // View the next click switches to:
    // 0 = stopwords, 1 = sentiment, 2 = part of speech.
    var state = 0;

    // Colour scheme of the part-of-speech view, used on load and every time
    // the cycle returns to that view.
    function highlightPartsOfSpeech() {
        $('.noun').addClass('fade-out');
        $('.preposition').addClass('red');
        $('.verb').addClass('blue');
        $('.determiner').addClass('cyan');
    }

    highlightPartsOfSpeech();

    // Suppress the browser's context menu; a right click on a word toggles
    // its underline instead.
    $(document).on('contextmenu', function () { return false; });

    $('.word').on('contextmenu', function () {
        $(this).toggleClass('underline');
    });

    // A left click on any word moves every word to the next view.
    $('.word').on('click', function () {
        var attribute;

        if (state === 0) {
            $('.word').removeClass('fade-out red blue cyan');
            $('.stopword').addClass('fade-out');
            attribute = 'stopword';
        } else if (state === 1) {
            $('.stopword').removeClass('fade-out');
            $('.neutral').addClass('fade-out');
            attribute = 'sentiment';
        } else {
            $('.neutral').removeClass('fade-out');
            highlightPartsOfSpeech();
            attribute = 'pos';
        }

        // Swap each word's label for the matching data-* value. text() is
        // used so tokens such as "<" or "&" are shown literally instead of
        // being parsed as markup.
        $('.word').each(function () {
            var el = $(this);
            el.text(el.data(attribute) + ' ');
        });

        state = (state + 1) % 3;
    });
});
|
@ -0,0 +1,86 @@
|
|||||||
|
* {
|
||||||
|
min-height: 0;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
background: #639ab2;
|
||||||
|
font-size: 15px;
|
||||||
|
font-family: 'Ubuntu Mono', monospace;
|
||||||
|
}
|
||||||
|
|
||||||
|
.prelative {
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
div.container {
|
||||||
|
width: 100%;
|
||||||
|
display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */
|
||||||
|
display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */
|
||||||
|
display: -ms-flexbox; /* TWEENER - IE 10 */
|
||||||
|
display: -webkit-flex; /* NEW - Chrome */
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word {
|
||||||
|
font-size: 3rem;
|
||||||
|
float: left;
|
||||||
|
position: relative;
|
||||||
|
text-align: center;
|
||||||
|
display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */
|
||||||
|
display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */
|
||||||
|
display: -ms-flexbox; /* TWEENER - IE 10 */
|
||||||
|
display: -webkit-flex; /* NEW - Chrome */
|
||||||
|
display:flex;
|
||||||
|
justify-content: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word:before,
|
||||||
|
.word:after {
|
||||||
|
content: '';
|
||||||
|
color: #fff;
|
||||||
|
position: absolute;
|
||||||
|
font-family: 'PT Serif', serif;
|
||||||
|
font-weight: bold;
|
||||||
|
font-size: 1.5rem;
|
||||||
|
font-style: italic;
|
||||||
|
opacity: 0;
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word:before {
|
||||||
|
content: attr(data-txt);
|
||||||
|
flex-shrink: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.word:hover:before,
|
||||||
|
.word:active:after {
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.fade-out {
|
||||||
|
color: #275152;
|
||||||
|
}
|
||||||
|
|
||||||
|
p {
|
||||||
|
margin: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.red {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
|
||||||
|
.blue {
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
|
||||||
|
.cyan {
|
||||||
|
color: cyan;
|
||||||
|
}
|
||||||
|
|
||||||
|
.underline {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
@ -0,0 +1,20 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Wordtagger</title>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<link rel="stylesheet" href="style.css" type="text/css" media="screen" />
|
||||||
|
<script type="text/javascript" src="jquery.min.js"></script>
|
||||||
|
<script type="text/javascript" src="script.js"></script>
|
||||||
|
<!--meta name="viewport" content="width=device-width"-->
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<div class="container"><p>
|
||||||
|
{# One <span> per token. Every value is escaped and every attribute is double-quoted so tokens containing quotes or angle brackets cannot break the markup. #}
{% for item, value in words_and_tags.items() %}
<span id="{{ item|e }}" class="word {{ value['sentiment']|e }} {{ value['wordtype']|e }} {{ value['POS']|e }}" data-txt="{{ value['word']|e }}" data-pos="{{ value['POS']|e }}" data-sentiment="{% if value['word'] in [',','.','(',')'] %}{{ value['word']|e }}{% else %}{{ value['sentiment']|e }}{% endif %}" data-stopword="{% if value['wordtype'] == 'stopword' %}stopword{% else %}{{ value['word']|e }}{% endif %}">{{ value['POS']|e }} </span>
{% endfor %}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -0,0 +1,156 @@
|
|||||||
|
# LIBS
|
||||||
|
import nltk
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from sys import stdin, stdout
|
||||||
|
from nltk import ne_chunk, pos_tag, word_tokenize
|
||||||
|
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from jinja2 import Template
|
||||||
|
|
||||||
|
# == INPUT AND TOKENIZE ==
|
||||||
|
# Define input, tokenize and safe tokens to dictionary. Use index as ID for each word.
|
||||||
|
# Read the whole text from stdin (named so the builtin input() stays usable).
source_text = stdin.read()
words = nltk.word_tokenize(source_text)
# One entry per token, keyed "item <position>" so each token has a unique ID.
words_and_tags = {'item ' + str(index): {'word': word} for index, word in enumerate(words)}
print(words_and_tags)
|
||||||
|
|
||||||
|
# == FILTER FUNCTIONS ==
|
||||||
|
|
||||||
|
# === 1. POS_tagger & Named Entity Recognizer ===
|
||||||
|
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
|
||||||
|
# Readable label for each POS tag that has one.
_POS_LABELS = {
    'NNP': 'noun', 'NNS': 'noun', 'NN': 'noun', 'NNPS': 'noun',
    'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb',
    'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
    'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb', 'WRB': 'adverb',
    'PRP': 'pronoun', 'PRP$': 'pronoun',
    'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
    'IN': 'preposition',
    'WDT': 'determiner',
    'WP': 'pronoun', 'WP$': 'pronoun',
    'UH': 'interjection',
    'POS': 'possesive ending',
    'SYM': 'symbol',
    'EX': 'existential there',
    'DT': 'determiner',
    'MD': 'modal',
    'LS': 'list item marker',
    'FW': 'foreign word',
    'CC': 'coordinating conjunction ',
    'CD': 'cardinal number',
    'TO': 'to',
    '.': 'line ending',
    ',': 'comma',
}


def POS_tagger(list):
    """Tag a list of tokens and return one readable POS label per token.

    ``list`` is the token list; the parameter keeps its original name for
    existing callers. Tags without a readable label are returned unchanged.
    The raw (token, tag) pairs are printed for inspection.
    """
    # Tag the whole list once; re-running the tagger for every token gives
    # the same result at many times the cost.
    taggedwordlist = nltk.pos_tag(list)
    # NOTE(review): assumes the debug print ran once, after tagging — confirm.
    print(taggedwordlist)

    return [_POS_LABELS.get(pos, pos) for _word, pos in taggedwordlist]
|
||||||
|
|
||||||
|
# === 2. Sentiment tagger ===
|
||||||
|
# Sentiment analyzer based on the NLTK VADER tagger.
|
||||||
|
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
|
||||||
|
def sentiment_tagger(word):
    """Return 'negative', 'positive' or 'neutral' for ``word``.

    The label follows the sign of the VADER compound polarity score; a
    score of exactly zero is 'neutral'.
    """
    # Construct the analyzer once and keep it on the function, instead of
    # building a new one for every word.
    analyzer = getattr(sentiment_tagger, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_tagger._analyzer = analyzer

    score = analyzer.polarity_scores(word).get("compound")

    if score < 0:
        return 'negative'
    if score > 0:
        return 'positive'
    return 'neutral'
|
||||||
|
|
||||||
|
# === 3. Stopword tagger ===
|
||||||
|
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
|
||||||
|
def stopword_tagger(word):
    """Return 'stopword' when ``word`` is an English stopword, else 'keyword'.

    The check is case-insensitive, so a capitalised word at the start of a
    sentence is still recognised.
    """
    # Build the stopword set once and keep it on the function, instead of
    # reloading the corpus list for every word.
    stopWords = getattr(stopword_tagger, '_stopwords', None)
    if stopWords is None:
        stopWords = set(stopwords.words('english'))
        stopword_tagger._stopwords = stopWords

    # NOTE(review): lowercasing assumes the corpus list is all lower case —
    # confirm.
    if word.lower() in stopWords:
        return 'stopword'
    return 'keyword'
|
||||||
|
|
||||||
|
|
||||||
|
# Run POS tagger
|
||||||
|
# This tagger outputs a list for all items in the dict at once
|
||||||
|
# To avoid double work, it is better to keep this outside the for loop
|
||||||
|
# Run POS tagger
# This tagger outputs a list for all items in the dict at once.
# To avoid double work, it is better to keep this outside the for loop.
POS_tags = POS_tagger(words)

# Adding tags to words in dictionary, which will be exported as a json file
# {'item 0' : {'word' : word, 'tagger 1': value 1}}
# words_and_tags was built from `words` in order, so its entries line up
# one-to-one with POS_tags.
for entry, pos_label in zip(words_and_tags.values(), POS_tags):
    word = entry['word']
    entry['POS'] = pos_label
    entry['sentiment'] = sentiment_tagger(word)
    entry['wordtype'] = stopword_tagger(word)
    # Entity tag: not functional yet

# Save data into a json file.
print(words_and_tags)
# The output directory sits three levels above this file. os.path.join
# supplies the separator that plain string concatenation left out.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
data_path = os.path.join(project_root, "output", "wordtagger", "data.json")
with open(data_path, 'w', encoding="utf-8") as f:
    json.dump(words_and_tags, f, ensure_ascii=False)

# Bind the data to the jinja2 template (path relative to the working directory).
with open("src/wordtagger/template.html", "r", encoding="utf-8") as template_open:
    template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags)

# And render an html file!
print(index_render)
with open("output/wordtagger/index.html", "w", encoding="utf-8") as index_open:
    index_open.write(index_render)
|
Loading…
Reference in New Issue