Count frequent words across all tokenized paragraphs, not only the last one

master
Castro0o 5 years ago
parent aa12139983
commit 3fb9249ca6

@ -5793,36 +5793,36 @@
</p> </p>
</div> </div>
<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div> <div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>
<div class="chosen_words" >&nbsp;agreement&nbsp;(5) </div> <div class="chosen_words" >&nbsp;services&nbsp;(69) </div>
<div class="chosen_words" >&nbsp;california&nbsp;(2) </div> <div class="chosen_words" >&nbsp;agreement&nbsp;(60) </div>
<div class="chosen_words" >&nbsp;section&nbsp;(2) </div> <div class="chosen_words" >&nbsp;arbitration&nbsp;(42) </div>
<div class="chosen_words" >&nbsp;consumer&nbsp;(2) </div> <div class="chosen_words" >&nbsp;content&nbsp;(37) </div>
<div class="chosen_words" >&nbsp;services&nbsp;(2) </div> <div class="chosen_words" >&nbsp;use&nbsp;(35) </div>
<div class="chosen_words" >&nbsp;right&nbsp;(2) </div> <div class="chosen_words" >&nbsp;user&nbsp;(28) </div>
<div class="chosen_words" >&nbsp;provision&nbsp;(2) </div> <div class="chosen_words" >&nbsp;claims&nbsp;(28) </div>
<div class="chosen_words" >&nbsp;intended&nbsp;(2) </div> <div class="chosen_words" >&nbsp;may&nbsp;(26) </div>
<div class="chosen_words" >&nbsp;accordance&nbsp;(1) </div> <div class="chosen_words" >&nbsp;rights&nbsp;(21) </div>
<div class="chosen_words" >&nbsp;civil&nbsp;(1) </div> <div class="chosen_words" >&nbsp;app&nbsp;(21) </div>
<div class="chosen_words" >&nbsp;code&nbsp;(1) </div> <div class="chosen_words" >&nbsp;parties&nbsp;(20) </div>
<div class="chosen_words" >&nbsp;may&nbsp;(1) </div> <div class="chosen_words" >&nbsp;apple&nbsp;(18) </div>
<div class="chosen_words" >&nbsp;report&nbsp;(1) </div> <div class="chosen_words" >&nbsp;law&nbsp;(17) </div>
<div class="chosen_words" >&nbsp;complaints&nbsp;(1) </div> <div class="chosen_words" >&nbsp;applicable&nbsp;(16) </div>
<div class="chosen_words" >&nbsp;complaint&nbsp;(1) </div> <div class="chosen_words" >&nbsp;access&nbsp;(15) </div>
<div class="chosen_words" >&nbsp;assistance&nbsp;(1) </div> <div class="chosen_words" >&nbsp;copyright&nbsp;(15) </div>
<div class="chosen_words" >&nbsp;unit&nbsp;(1) </div> <div class="chosen_words" >&nbsp;relief&nbsp;(15) </div>
<div class="chosen_words" >&nbsp;division&nbsp;(1) </div> <div class="chosen_words" >&nbsp;shall&nbsp;(15) </div>
<div class="chosen_words" >&nbsp;department&nbsp;(1) </div> <div class="chosen_words" >&nbsp;agree&nbsp;(14) </div>
<div class="chosen_words" >&nbsp;affairs&nbsp;(1) </div> <div class="chosen_words" >&nbsp;right&nbsp;(14) </div>
<div class="chosen_words" >&nbsp;contacting&nbsp;(1) </div> <div class="chosen_words" >&nbsp;acknowledge&nbsp;(13) </div>
<div class="chosen_words" >&nbsp;writing&nbsp;(1) </div> <div class="chosen_words" >&nbsp;subject&nbsp;(12) </div>
<div class="chosen_words" >&nbsp;400&nbsp;(1) </div> <div class="chosen_words" >&nbsp;limited&nbsp;(12) </div>
<div class="chosen_words" >&nbsp;r&nbsp;(1) </div> <div class="chosen_words" >&nbsp;u.s.&nbsp;(12) </div>
<div class="chosen_words" >&nbsp;street&nbsp;(1) </div> <div class="chosen_words" >&nbsp;arbitrator&nbsp;(12) </div>
<div class="chosen_words" >&nbsp;sacramento&nbsp;(1) </div> <div class="chosen_words" >&nbsp;claim&nbsp;(11) </div>
<div class="chosen_words" >&nbsp;ca&nbsp;(1) </div> <div class="chosen_words" >&nbsp;disputes&nbsp;(11) </div>
<div class="chosen_words" >&nbsp;95814&nbsp;(1) </div> <div class="chosen_words" >&nbsp;rules&nbsp;(11) </div>
<div class="chosen_words" >&nbsp;telephone&nbsp;(1) </div> <div class="chosen_words" >&nbsp;legal&nbsp;(10) </div>
<div class="chosen_words" >&nbsp;800&nbsp;(1) </div> <div class="chosen_words" >&nbsp;person&nbsp;(10) </div>
</div></div></div> </div></div></div>
</div> </div>
</div> </div>

@ -36,8 +36,8 @@ print('''<!DOCTYPE>
<html> <html>
<head> <head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script> <script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<link rel="stylesheet" href="css/faceapp.css"> <link rel="stylesheet" href="faceapp.css">
<script src="js/highlight.js"></script> <script src="highlight.js"></script>
<meta charset="utf-8"> <meta charset="utf-8">
<title></title> <title></title>
@ -79,8 +79,10 @@ print('</div></div>')
#ToS text #ToS text
print('<div class ="paragraph">') print('<div class ="paragraph">')
tokenized_all = []
for paragraph in faceapp_text_list: for paragraph in faceapp_text_list:
tokenized = word_tokenize(paragraph) tokenized = word_tokenize(paragraph)
tokenized_all += tokenized # add to the tokenized_all
tagged = pos_tag(tokenized) tagged = pos_tag(tokenized)
print('<p>') print('<p>')
for word, pos in tagged: for word, pos in tagged:
@ -93,7 +95,7 @@ print('</div>')
#tos top words list #tos top words list
print('<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>') print('<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>')
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in tos_all_stopwords) tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized_all if words.lower() not in tos_all_stopwords)
frequency_word = FreqDist(tokens_without_stopwords) frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(30) top_words = tokens_without_stopwords.most_common(30)

Loading…
Cancel
Save