From 3deaaeac0bf66fb217630ced5d0b1574a603ea2e Mon Sep 17 00:00:00 2001 From: jvdhorst Date: Thu, 22 Mar 2018 18:21:13 +0100 Subject: [PATCH] Changed name of Wordtagger to reading_structure. Updated script to newest version. Updated makefile. WARNING: Weasyprint is an extra dependency. --- Makefile | 18 +-- .../jquery.min.js | 0 src/reading_structure/print.css | 99 +++++++++++++ .../reading_structure.py} | 41 +++-- src/reading_structure/script.js | 112 ++++++++++++++ src/reading_structure/style.css | 140 ++++++++++++++++++ src/reading_structure/template.html | 33 +++++ src/wordtagger/script.js | 64 -------- src/wordtagger/style.css | 86 ----------- src/wordtagger/template.html | 20 --- 10 files changed, 417 insertions(+), 196 deletions(-) rename src/{wordtagger => reading_structure}/jquery.min.js (100%) create mode 100644 src/reading_structure/print.css rename src/{wordtagger/wordtagger.py => reading_structure/reading_structure.py} (83%) create mode 100644 src/reading_structure/script.js create mode 100644 src/reading_structure/style.css create mode 100644 src/reading_structure/template.html delete mode 100644 src/wordtagger/script.js delete mode 100644 src/wordtagger/style.css delete mode 100644 src/wordtagger/template.html diff --git a/Makefile b/Makefile index f26894e..4d18fb7 100644 --- a/Makefile +++ b/Makefile @@ -70,16 +70,14 @@ hocrs: ## hocr with tesseract and then change extension to .html #OUTPUT GENERATION RECIPES -output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2 - mkdir -p output/wordtagger - cp src/wordtagger/jquery.min.js output/wordtagger - cp src/wordtagger/script.js output/wordtagger - cp src/wordtagger/style.css output/wordtagger - cat $< | python3 src/wordtagger/wordtagger.py -# install nltk's 'averaged_perceptron_tagger': -# $ python 3 -# >>> import nltk -# >>> nltk.download('averaged_perceptron_tagger') +output/reading_structure/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2, weasyprint + mkdir -p output/reading_structure + cp src/reading_structure/jquery.min.js output/reading_structure + cp src/reading_structure/script.js output/reading_structure + cp src/reading_structure/style.css output/reading_structure + cp src/reading_structure/print.css output/reading_structure + cat $< | python3 src/reading_structure/reading_structure.py + weasyprint -s output/reading_structure/print.css output/reading_structure/index.html output/reading_structure/poster.pdf output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot cat $< | python3 src/textbotconversation.py $(@) diff --git a/src/wordtagger/jquery.min.js b/src/reading_structure/jquery.min.js similarity index 100% rename from src/wordtagger/jquery.min.js rename to src/reading_structure/jquery.min.js diff --git a/src/reading_structure/print.css b/src/reading_structure/print.css new file mode 100644 index 0000000..189288f --- /dev/null +++ b/src/reading_structure/print.css @@ -0,0 +1,99 @@ +@page { + /* dimensions for the whole page */ + size: A3 portrait; + margin: 2rem 2rem 3em 2rem; + background-color: #003cb3; + + @bottom-center { + content: 'make output/reading_structure'; + font-family: 'Ubuntu Mono', monospace; + white-space: pre; + color: #fff; + padding-bottom: 2rem; + } + +} + +/* --- + + BASIC ELEMENTS + + --- */ + +body { + background-color: none !important; + font-size: 20px; +} + +.container { +margin: 0 auto; +padding: 2rem 2.5rem; +max-width: 100% !important; +min-height: 90% !important; +background-color: #003cb3 !important; +border: none !important; +} + +span.wrapper { + display: inline-block; + font-family: 'PT Serif', serif; + font-size: 1.5rem; + text-align:center; + position: relative; + margin-bottom: 1rem; + color: #fff; +} + +.action-container { + display: none; +} + +/* --- + + ELEMENTS IN .WRAPPER + + --- */ + +.word, .tag { + display: block; + position: relative; + margin: 0 auto; + clear:both; +} + +.word { + margin-top: 0.2rem; +} + +span.invisible { + +} + +span.tag { + color: #fff; + opacity: 0.5; + font-family: 'Ubuntu Mono', monospace; + font-size: 0.8rem; +} + +/* Show original word when hovering label */ +span.word_label, .stopword .word { + display: block; + opacity: 0; + width: 100%; + font-size: 1rem; +} + +.stopword > .wordtype:hover, { + opacity: 1; + /*color: red;*/ +} + +.stopword > .wordtype { + opacity: 1; +} + +/* Spacing fix for punctuation */ +.punctuation { + margin-left: -4px; +} diff --git a/src/wordtagger/wordtagger.py b/src/reading_structure/reading_structure.py similarity index 83% rename from src/wordtagger/wordtagger.py rename to src/reading_structure/reading_structure.py index d2ae1f3..3a6e8a2 100644 --- a/src/wordtagger/wordtagger.py +++ b/src/reading_structure/reading_structure.py @@ -12,7 +12,7 @@ from jinja2 import Template # Define input, tokenize and safe tokens to dictionary. Use index as ID for each word. input = stdin.read() words = nltk.word_tokenize(input) -words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)} +words_and_tags = {index : {'word':word} for index , word in enumerate(words)} print(words_and_tags) # == FILTER FUNCTIONS == @@ -68,7 +68,7 @@ def POS_tagger(list): elif tag == 'CD': POS_tag = 'cardinal number' elif tag == 'TO': - POS_tag = 'to' + POS_tag = 'infinitival to' elif tag == '.': POS_tag = 'line ending' elif tag == ',': @@ -82,18 +82,23 @@ def POS_tagger(list): # === 2. Sentiment tagger === # Sentiment analyzer based on the NLTK VADER tagger. # This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive -def sentiment_tagger(word): +def sentiment_tagger(list): analyzer = SentimentIntensityAnalyzer() - score = analyzer.polarity_scores(word).get("compound") + sentiment_tags = [] - if score < 0: - sentiment_tag = 'negative' - elif score > 0: - sentiment_tag = 'positive' - else: - sentiment_tag = 'neutral' + for word in list: + score = analyzer.polarity_scores(word).get("compound") + + if score < 0: + sentiment_tag = 'negative' + elif score > 0: + sentiment_tag = 'positive' + else: + sentiment_tag = 'neutral' - return sentiment_tag + sentiment_tags.append(sentiment_tag) + + return sentiment_tags # === 3. Stopword tagger === # Labels words on being a keyword or a stopword, based on the list in the NLTK corpus @@ -113,6 +118,7 @@ def stopword_tagger(word): # This tagger outputs a list for all items in the dict at once # To avoid double work, it is better to keep this outside the for loop POS_tags = POS_tagger(words) +sentiment_tags = sentiment_tagger(words) i = 0 # Adding tags to words in dictionary, which will be exported as a json file @@ -123,11 +129,14 @@ for item, value in words_and_tags.items(): # POS pos_tag = POS_tags[i] words_and_tags[item]['POS'] = pos_tag - i = i+1 + #i = i+1 # Add sentiment tag - sentiment_tag = sentiment_tagger(word) + #sentiment_tag = sentiment_tagger(word) + #words_and_tags[item]['sentiment'] = sentiment_tag + sentiment_tag = sentiment_tags[i] words_and_tags[item]['sentiment'] = sentiment_tag + i = i+1 # Add stopword tag stopword_tag = stopword_tagger(word) @@ -139,18 +148,18 @@ for item, value in words_and_tags.items(): # Save data into a json file print(words_and_tags) #with open("data.json", 'w') as f: -with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/wordtagger/data.json", 'w') as f: +with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/reading_structure/data.json", 'w') as f: json.dump(words_and_tags, f, ensure_ascii=False) #let's bind it to a jinja2 template # Jinja moves up one level by default, so I do not need to do it myself as in line 141 -template_open = open("src/wordtagger/template.html", "r") +template_open = open("src/reading_structure/template.html", "r") template = Template(template_open.read()) index_render = template.render(words_and_tags=words_and_tags) #print(text_render) # And render an html file! print(index_render) -index_open = open("output/wordtagger/index.html", "w") +index_open = open("output/reading_structure/index.html", "w") index_open.write(index_render) index_open.close() diff --git a/src/reading_structure/script.js b/src/reading_structure/script.js new file mode 100644 index 0000000..aa97bf7 --- /dev/null +++ b/src/reading_structure/script.js @@ -0,0 +1,112 @@ +/* +The index.html features all words and their labels. The function of this script is to show and hide +specific words based on their label. In this case the word is hidden, but its respective label is shown in the text. + +Each collection of words and labels is inside a .wrapper with an id. +The classes of these wrapper feature the word, and the values of the labels (e.g. class="wrapper software noun keyword neutral") +by default all words inside the wrapper have the class .word. This class is visible. +by default all labels (noun, neutral etc.) have the class .tag and another class with the type of label (POS, sentiment, etc). .tag is by default display:none + +If the user clicks on one of the .wrapper elements, the page changes the hidden words. The state changes. + +What happens inside the wrapper if the state changes to hide all nouns? + +- Previous filter is disabled. All tags invisible. All words visible. +- the words in the wrapper with class noun are selected. They get the class word_label. Which means: only visible on :hovering the wrapper. +- the span with the text 'noun' and class 'pos' will lose the class invisible. The tag is now visible in the text. + +Aside: What about Weasyprint? +Weasyprint can't see the changes made by this script. It will only show the normal text, without any labels. +To print the text with the desired styling, there is a specific stylesheet called print.css. +*/ + + +$(document).ready(function(){ + + // State 1 Selectors for stopword tagger: selecting the word, and the label 'stopword' + var stopword_word = $('.stopword > .word'); + var stopword_label = $('.stopword > .wordtype'); + + // State 2 Selectors for the sentiment tagger, showing neutral words and their labels + var posneg_word = $('.negative > .word, .positive > .word'); + var posneg_label = $('.negative > .sentiment, .positive > .sentiment'); + + // State 3 Selectors for the noun taggger + var noun_word = $('.noun > .word'); + var noun_label = $('.noun > .pos'); + + // State 4 Selectors for the adjective and adverb taggger + var ad_word = $('.adjective > .word, .adverb > .word'); + var ad_label = $('.adjective > .pos, .adverb > .pos'); + + // State 5 Selectors for the determiner, pronoun, preposition and infinitival to taggger + var dppt_word = $('.determiner > .word, .pronoun > .word, .to > .word, .preposition > .word'); + var dppt_label = $('.determiner > .pos, .pronoun > .pos, .to > .pos, .preposition > .pos'); + + // State 6 Selectors for the sentiment tagger, showing only positive and negative words and their labels + var neutral_word = $('.neutral > .word'); + var neutral_label = $('.neutral > .sentiment'); + + // On page load, prepare the right view for state one. Hiding all stopwords, showing the stopword label + var state = 1; + stopword_word.addClass('word_label'); + stopword_label.removeClass('invisible'); + + // Here we run through the states + $('.container').click( function() { + console.log(state); + + if (state == 1) { + stopword_word.removeClass('word_label'); + stopword_label.addClass('invisible'); + + posneg_word.addClass('word_label'); + posneg_label.removeClass('invisible'); + } + + if (state == 2) { + posneg_word.removeClass('word_label'); + posneg_label.addClass('invisible'); + + noun_word.addClass('word_label'); + noun_label.removeClass('invisible'); + } + + if (state == 3) { + noun_word.removeClass('word_label'); + noun_label.addClass('invisible'); + + ad_word.addClass('word_label'); + ad_label.removeClass('invisible'); + } + + if (state == 4) { + ad_word.removeClass('word_label'); + ad_label.addClass('invisible'); + + dppt_word.addClass('word_label'); + dppt_label.removeClass('invisible'); + } + + if (state == 5) { + dppt_word.removeClass('word_label'); + dppt_label.addClass('invisible'); + + neutral_word.addClass('word_label'); + neutral_label.removeClass('invisible'); + } + + if (state == 6) { + neutral_word.removeClass('word_label'); + neutral_label.addClass('invisible'); + + stopword_word.addClass('word_label'); + stopword_label.removeClass('invisible'); + + state = 0; + } + + state = state+1; +}); + +}); diff --git a/src/reading_structure/style.css b/src/reading_structure/style.css new file mode 100644 index 0000000..05ebdc9 --- /dev/null +++ b/src/reading_structure/style.css @@ -0,0 +1,140 @@ +/* --- + + BASIC ELEMENTS + + --- */ + +body { + background-color: #dfdfdf; +} + +.container { + margin: 2rem auto; + padding: 4rem 5rem; + min-width: 40rem; + max-width: 45%; + min-height: 90vh; + background-color: #fff; + cursor: pointer; + border: 0px solid #aeaeae; + box-shadow: 0 1px 15px rgba(0,0,0,0.12), 0 1px 3px rgba(0,0,0,0.24); +} + +.action-container { + margin: 2rem; + padding: 2rem 2.5rem; + max-width: 45%; + float: right; + position: fixed; + bottom: 2rem; + right: 0; +} + +span.wrapper { + display: inline-block; + font-family: 'PT Serif', serif; + font-size: 1.2rem; + text-align:center; + position: relative; + margin-bottom: 0.75rem; +} + +/* --- + + ELEMENTS IN ACTION CONTAINER + + --- */ + + .action-container a { + font-family: 'PT Serif', serif; + padding: 0.4rem; + background: black; + color: white; + text-decoration: none; + box-shadow: none; +transition: all 0.3s cubic-bezier(.25,.8,.25,1); + } + + .action-container a:hover { + box-shadow: 0 1px 10px rgba(0,0,0,0.12), 0 1px 10px rgba(0,0,0,0.24); + } + + .action-container p { + margin: 1.25rem 2rem 0 0; + } + + +/* --- + + ELEMENTS IN .WRAPPER + + --- */ + +.word, .tag { + display: block; + position: relative; + margin: auto; + clear:both; +} + +span.invisible { + display: none; +} + +span.word { + margin-top: 1.25rem; +} + +span.tag { + color: #fff; + font-family: 'Ubuntu Mono', monospace; + margin-bottom: -1.25rem; +} + + + +/* Show original word when hovering label */ +span.word_label { + display: block; + opacity: 0; + width: 100%; + font-family: 'Ubuntu Mono', monospace; +} + +.wrapper:hover > span.word_label { + opacity: 0.5; +} + +/* Spacing fix for punctuation */ +.punctuation { + margin-left: -4px; +} + +/* Colors for tags */ +.noun .pos { + color: #003cb3; +} + +.stopword .wordtype { + color: #b83e54; +} + +.neutral .sentiment { + color: #8491a5; +} + +.negative .sentiment { + color: #c9805b; +} + +.positive .sentiment { + color: #44a889; +} + +.adjective .pos, .adverb .pos { + color: #8e445e; +} + +.pronoun .pos, .determiner .pos, .to .pos, .preposition .pos { + color: #2f7f40; +} diff --git a/src/reading_structure/template.html b/src/reading_structure/template.html new file mode 100644 index 0000000..0aa6b51 --- /dev/null +++ b/src/reading_structure/template.html @@ -0,0 +1,33 @@ + + + + Reading the Structure + + + + + + + + +

+ {% for item, value in words_and_tags.items() %} + + + + + + {{ words_and_tags[item]['word'] }} + + + {% endfor %} + +

+ +
+

Export data as json

+

Print this as a poster

+
+ + + diff --git a/src/wordtagger/script.js b/src/wordtagger/script.js deleted file mode 100644 index 551c9ba..0000000 --- a/src/wordtagger/script.js +++ /dev/null @@ -1,64 +0,0 @@ -$(document).ready(function(){ - var state = 0; - - $('.noun').addClass('fade-out'); - $('.preposition').addClass('red'); - $('.verb').addClass('blue'); - $('.determiner').addClass('cyan'); - - $(document).bind('contextmenu', function(e) { return false; }); - - $( ".word" ).contextmenu(function() { - console.log($(this).hasClass('underline')); - $(this).hasClass('underline') == false - ? $(this).addClass('underline') - : $(this).removeClass('underline'); - }); - - $('.word').click( function() { - var el = $('.word'); - console.log(state); - - if (state == 0) { - $('.word').removeClass('fade-out red blue cyan'); - - $('.stopword').addClass('fade-out'); - } - - else if (state == 1) { - $('.stopword').removeClass('fade-out'); - $('.neutral').addClass('fade-out'); - } - - else if (state == 2) { - $('.neutral').removeClass('fade-out'); - $('.noun').addClass('fade-out'); - $('.preposition').addClass('red'); - $('.verb').addClass('blue'); - state = -1; - } - - $('.word').each(function() { - var el = $(this); - - if (state == 0) { - el.empty(); - el.html(el.data("stopword") + " "); - } - - else if (state == 1) { - el.empty(); - el.html(el.data("sentiment") + " "); - } - - else { - el.empty(); - el.html(el.data("pos") + " "); - } - - }); - - state = state+1; - }); - -}); diff --git a/src/wordtagger/style.css b/src/wordtagger/style.css deleted file mode 100644 index 85165ea..0000000 --- a/src/wordtagger/style.css +++ /dev/null @@ -1,86 +0,0 @@ -* { -min-height: 0; -min-width: 0; -} - -body { - background: #639ab2; - font-size: 15px; - font-family: 'Ubuntu Mono', monospace; -} - -.prelative { - flex-shrink: 0; -} - -div.container { - width: 100%; - display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */ - display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */ - display: -ms-flexbox; /* TWEENER - IE 10 */ - display: -webkit-flex; /* NEW - Chrome */ - display: flex; - flex-wrap: wrap; -} - -.word { - font-size: 3rem; - float: left; - position: relative; - text-align: center; - display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */ - display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */ - display: -ms-flexbox; /* TWEENER - IE 10 */ - display: -webkit-flex; /* NEW - Chrome */ - display:flex; - justify-content: center; -} - -.word:before, -.word:after { - content: ''; - color: #fff; - position: absolute; - font-family: 'PT Serif', serif; - font-weight: bold; - font-size: 1.5rem; - font-style: italic; - opacity: 0; - width: 100%; - } - -.word:before { - content: attr(data-txt); - flex-shrink: 1; -} - -.word:hover:before, -.word:active:after { - opacity: 1; -} - - - -.fade-out { - color: #275152; -} - -p { - margin: 1rem; -} - -.red { - color: red; -} - -.blue { - color: blue; -} - -.cyan { - color: cyan; -} - -.underline { - text-decoration: underline; -} diff --git a/src/wordtagger/template.html b/src/wordtagger/template.html deleted file mode 100644 index 9333db8..0000000 --- a/src/wordtagger/template.html +++ /dev/null @@ -1,20 +0,0 @@ - - - - Wordtagger - - - - - - - - -

- {% for item, value in words_and_tags.items() %} - {{words_and_tags[item]['POS']}}  - {% endfor %} -

-
- -