|
|
|
# October 2021, copyleft || Kamome and Funix || Speech-to-Derive * The Myth of Natural Language || Roodkapje, Rotterdam
|
|
|
|
|
|
|
|
|
|
|
|
# NLTK (Natural Language ToolKit) is a library for Natural Language Processing.
|
|
|
|
# We will use it to get the Part Of Speech (POS) of the speech-to-text results.
|
|
|
|
#
|
|
|
|
# What does it mean?
|
|
|
|
#
|
|
|
|
# It works as grammar tagging: for instance, the sentence "Around the clouds"
|
|
|
|
# would have this output:
|
|
|
|
#
|
|
|
|
# [('Around', 'IN'), ('the', 'DT'), ('clouds', 'NN')]
|
|
|
|
#
|
|
|
|
# 'IN' means 'preposition' - 'DT' means 'determiner' - 'NN' means 'noun, common, singular or mass'
|
|
|
|
|
|
|
|
|
|
|
|
import time # to create delays :: for having a few seconds to check the console
from html import escape # to escape &, < and > before putting words inside the html

import nltk # to use NLTK
|
|
|
|
|
|
|
|
# Open the speech-to-text result :: downloaded from the web interface >>
|
|
|
|
|
|
|
|
# Read the transcript always as UTF-8, so accents and symbols are decoded
# the same way on every computer (not with the system's default encoding).
with open('../speech.txt', 'r', encoding='utf-8') as speech: # let's import the text
    text = speech.read() # and make python read it :)

print(text) # print it!

time.sleep(2) # check it in the console!


# delete the empty "interim" <span> from the results
# and turn every line break into ". " so that each line ends like a sentence
text = text.replace('<span class="interim"></span>', '').replace('\n', '. ')

# NOTE: presumably the NLTK tokenizer and tagger data must already be
# installed (see nltk.download) for the two calls below to work.
tokens = nltk.word_tokenize(text) # Tokenize the words :: split each word
pos = nltk.pos_tag(tokens) # Elaborate the Part of Speech! It will create a list of (word, tag) pairs
print(pos) # print the list!

time.sleep(2) # check it in the console!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# To see all the POS tags, open the terminal and copy:
|
|
|
|
#
|
|
|
|
# python3
|
|
|
|
# import nltk
|
|
|
|
# nltk.help.upenn_tagset()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# start the layouting :: html + css + paged.js >>
|
|
|
|
#
|
|
|
|
# declare html :: we will fill it in the process with loops
|
|
|
|
# declare the first part of the text for two html files with different CSS
|
|
|
|
|
|
|
|
html = '' # the <body> content :: empty for now, the loop below fills it

# Both pages start with exactly the same head: the only difference is the
# number of the stylesheet in ./styles/, so write the head once and fill
# the number in for each page.
page_head = '''
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="./pagedjs_files/interface.css">
<script src="./pagedjs_files/paged.polyfill.js"></script>
<link rel="stylesheet" href="./styles/{css}.css">
<meta charset="utf-8"/>
<title>📡 💻📘</title>
</head>
<body>
'''

html1 = page_head.format(css='1') # first layout :: uses ./styles/1.css
html2 = page_head.format(css='2') # second layout :: uses ./styles/2.css
|
|
|
|
|
|
|
|
|
|
|
|
# Process each element of the list
|
|
|
|
|
|
|
|
spans = [] # collect every piece here and glue them together once, after the loop

for word, tag in pos: # each element of pos is a pair: the word and its POS tag

    if word == '.': # if the word is a dot, its class will be 'dot' and the line breaks after it
        spans.append(" <span class='dot'>.</span><br> \n")

    else: # fill the html with each word and assign it as class its POS
        # Escape the special html characters first, otherwise a word like "<" or "&"
        # (or the POS tag '' used for closing quotes) would break the markup.
        # Quotes inside the word are left as they are, so the tidy step below still works.
        spans.append(" <span class='" + escape(tag) + "'> " + escape(word, quote=False) + " </span>\n")

html += ''.join(spans)


# Close the html text
html += '''</body>
</html>'''

html = html.replace(' .','.').replace(" '", "'") # to tidy wrong " . " and " ' " position
|
|
|
|
|
|
|
|
|
|
|
|
# Save the <html> files!
|
|
|
|
|
|
|
|
# Both pages get the same body (html); only the head, and so the stylesheet, changes.
# The files are written as UTF-8 on purpose: the heads declare <meta charset="utf-8"/>
# and contain emoji in the <title>, which the default encoding of some systems
# cannot write at all.
for page_path, page_start in (('../2_layout/1.html', html1), ('../2_layout/2.html', html2)):
    with open(page_path, 'w', encoding='utf-8') as index:
        index.write(page_start)
        index.write(html)
|
|
|
|
|
|
|
|
|