# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Fetch the NLTK 'stopwords' resource that stopwords.words() reads below.
nltk.download('stopwords')

# Read the whole Terms-of-Service text and split it into paragraphs on blank
# lines.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used. The rendering code later in this script looks for the typographic
# apostrophe '’', so confirm the file decodes correctly on every machine
# this runs on.
with open('tos_file/tiktok.txt', 'r') as faceapp_file:
    faceapp_text = faceapp_file.read()
faceapp_text_list = faceapp_text.split("\n\n")

#tos stopwords
# NLTK's English list plus the project's own list (one entry per line).
tos_default_stopwords = set(stopwords.words('english'))
# A context manager closes the handle deterministically; the previous
# one-liner left the file open until garbage collection.
with open('stopwords.txt', 'r') as custom_stopwords_file:
    tos_custom_stopwords = set(custom_stopwords_file.read().splitlines())
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords
# multi-line string HTML
# Document prologue: doctype, <head> with scripts/stylesheets, and the
# opening <body> tag. Tag delimiters must not be padded with spaces
# ("< html >" is rendered as literal text instead of being parsed as a tag).
# NOTE(review): "<!DOCTYPE>" has no "html" name, which puts browsers in
# quirks mode; left unchanged because the stylesheets may rely on it.
# NOTE(review): two jQuery builds (3.5.0 and 3.5.1) are loaded; presumably
# only one is needed - confirm before removing either.
print('''<!DOCTYPE>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<link rel="stylesheet" href="faceapp.css">
<link rel="stylesheet" href="legend.css">
<script src="highlight.js"></script>
<meta charset="utf-8">
<title></title>
</head>
<body>''')
#wrapper
# Opens the page wrapper and the intro section; both the logo and the info
# box below are emitted inside "intro".
print('<div class = "tos_wrapper"><div class="intro">')

#insert an image
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# The logo is inlined as a base64 data URI. The file is read inside a
# context manager so the handle is closed as soon as the bytes are loaded.
with open('img/tiktok_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
# The data URI media type must be a MIME type ("image/png"), not the path of
# the source file.
FaceApp_image = (
    '<div class="img" style="position: fixed; background-color: gainsboro;">TikTok</div>'
    '<br><img class="image" src="data:image/png;base64,{}">'
).format(FaceApp_img_url)
print(FaceApp_image)

#info box
print('<div class = "info">')
# (title, content) rows of the info box; content may contain raw HTML.
infotext = [
    ('Name of Service', 'Tiktok'),
    ('Country of Origin', 'China'),
    ('Initial release', 'September, 2016'),
    ('Type', 'Video sharing'),
    ('Word Counts', '7,515'),
    ('Original Source', '<a href="https://www.tiktok.com/legal/terms-of-use?lang=en" target="_blank">link</a>'),
    ('Description', 'TikTok is a Chinese video-sharing social networking service owned by ByteDance, '
                    'a Beijing-based company founded in 2012 by Zhang Yiming. '
                    'It is used to create short dance, lip-sync, comedy, and talent videos. '
                    'ByteDance first launched Douyin for the China market in September 2016.'),
]
# NOTE(review): titles containing spaces yield several class names
# (e.g. class="info_Name of Service"); kept as-is since the stylesheet may
# target them.
for title, info in infotext:
    print('<div class="info_{0}"><div class="info_title"><b>{0}</b></div><div class="info_content">{1}</div></div><br>'.format(title, info))
# Close "info" and "intro"; "tos_wrapper" stays open.
print('</div></div>')
# Colour legend: one entry per highlight category. Tag delimiters are written
# without padding so the browser parses them as elements rather than text.
print('''
<div class="legend">
<li class="legendhide eachlegend">stopwords</li>
<li class="legendadjective eachlegend">adjective</li>
<li class="legendverb eachlegend">verb</li>
<li class="legendnoun eachlegend">noun</li>
<li class="legendpropernoun eachlegend">proper noun</li>
<li class="legendadverb eachlegend">adverb</li>
<li class="legendpossesivepronoun eachlegend">possesive pronoun</li>
<li class="legendpresentparticiple eachlegend">present participle</li>
<li class="legendadjectivesuperlative eachlegend">adjective superlative</li>
<li class="legendadverb-comparative-superative eachlegend">adverb comparative + superative</li>
</div>
''')
#ToS text
# Punctuation characters that are spelled out when a tag or token is used as
# a CSS class name.
_PUNCTUATION_CLASS_NAMES = {
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
}
_POS_CLASS_TABLE = str.maketrans(_PUNCTUATION_CLASS_NAMES)
# Tokens additionally get the typographic apostrophe spelled out.
_WORD_CLASS_TABLE = str.maketrans(
    dict(_PUNCTUATION_CLASS_NAMES, **{'’': 'apostrophe'})
)


def _pos_css_class(pos):
    """Return the CSS class name used for the part-of-speech tag *pos*."""
    # 'PRP$' is renamed first; the remaining punctuation is mapped in a
    # single pass.
    return pos.replace('PRP$', 'PRPS').translate(_POS_CLASS_TABLE)


def _word_css_class(word):
    """Return the lower-cased CSS class name used for the token *word*."""
    return word.translate(_WORD_CLASS_TABLE).lower()


print('<div class = "paragraph">')
# Every token of every paragraph, collected for the frequency list that is
# built after this section.
tokenized_all = []
for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tokenized_all += tokenized  # add to the tokenized_all
    tagged = pos_tag(tokenized)
    print('<p>')
    for word, pos in tagged:
        # Escape everything that comes from the input text so characters such
        # as '<', '&' or quotes cannot break the generated markup.
        print('<span class="{0} {1} eachwords">{2}</span>'.format(
            html.escape(_pos_css_class(pos)),
            html.escape(_word_css_class(word)),
            html.escape(word),
        ))
    print('</p>')
print('</div>')
#tos top words list
print('<div class="top_words"><div class="top_words_title"><b>Frequent words</b></div>')
# Count lower-cased tokens that are not in the custom stop-word list.
# NOTE(review): only tos_custom_stopwords is used for filtering here; the
# combined tos_all_stopwords set is built earlier but not used - confirm
# which one is intended.
lowercased_tokens = (token.lower() for token in tokenized_all)
tokens_without_stopwords = FreqDist(
    token for token in lowercased_tokens if token not in tos_custom_stopwords
)
# Copy of the distribution kept under its original name; nothing in this
# section reads it.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(30)
for chosen_words, frequency in top_words:
    # Tokens come from the input text, so escape them before embedding.
    print('<div class="chosen_words">{} ({})</div>'.format(html.escape(chosen_words), frequency))
# Close "top_words". Only this element and "tos_wrapper" are still open at
# this point, so exactly two closing tags are emitted (the previous five left
# three stray </div> end tags in the document).
print('</div>')
# at the end of wrapper
print('</div>')
print('''</body></html>''')