You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
6 years ago
|
# import libraries
import datetime
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
|
||
|
|
||
|
# Vertical scroll positions (in pixels) visited in order so the page lazily
# loads the comment section and then further batches of comments.
SCROLL_OFFSETS = (300, 2000, 4000)

# Seconds to pause after the initial page load and after every scroll step.
PAUSE_SECONDS = 3

# Output file; results are appended so several runs can share one file.
RESULTS_FILE = "results.txt"


def format_entries(entries):
    """Return the report lines for a sequence of (author, comment) pairs.

    Each pair yields two items: a "Comment by: ..." heading and the comment
    text followed by a newline. Pairs are kept in order and are never
    merged, so an author who commented several times appears several times.
    """
    lines = []
    for author, comment in entries:
        lines.append("Comment by: " + str(author) + " -" + "-" * 10)
        lines.append(str(comment) + "\n")
    return lines


def build_report(video_title, today, entries):
    """Return the full text block appended to the results file for one run.

    The block is a "Video:" line, a "Date:" line, a blank line, and then the
    formatted entries joined by newlines.
    """
    header = "Video: " + video_title + "\n" + "Date:" + str(today) + "\n" + "\n"
    return header + "\n".join(format_entries(entries)) + "\n"


def collect_comments(driver):
    """Scroll the loaded page and return a list of (author, comment) text pairs.

    Authors and comments are paired positionally with ``zip``; if the page
    yields different numbers of each, the surplus is dropped.
    """
    for offset in SCROLL_OFFSETS:
        driver.execute_script("window.scrollTo(1, %d);" % offset)
        time.sleep(PAUSE_SECONDS)

    # Find the element on the page where the comments are stored.
    comment_div = driver.find_element(By.XPATH, '//*[@id="contents"]')
    # NOTE(review): an XPath that starts with '//' is evaluated against the
    # whole document even when called on an element; './/' would restrict it
    # to comment_div. Kept as-is to preserve which elements are matched.
    comments = comment_div.find_elements(By.XPATH, '//*[@id="content-text"]')
    authors = comment_div.find_elements(By.XPATH, '//*[@id="author-text"]')

    # Keep a list of pairs rather than a dict keyed by author: a dict would
    # silently keep only the last comment of anyone who commented twice.
    return [(author.text, comment.text) for author, comment in zip(authors, comments)]


def main():
    """Prompt for a URL, scrape its comments, print them and append to a file."""
    today = datetime.date.today()

    # get the url from the terminal
    url = input("Enter a url to scrape (include https:// etc.): ")

    # Tell Selenium to open a new Firefox session and specify the path to the
    # driver, which is expected to sit next to this script.
    # NOTE(review): `executable_path` was removed in Selenium 4.10; newer
    # installs need a `Service` object instead. Left unchanged so the script
    # keeps working with the Selenium version it was written for.
    driver_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'geckodriver')
    driver = webdriver.Firefox(executable_path=driver_path)

    try:
        # Implicit wait tells Selenium how long it should wait for an element
        # to appear before it throws an exception.
        driver.implicitly_wait(10)
        driver.get(url)
        time.sleep(PAUSE_SECONDS)

        # Find the title element on the page; read its text once while the
        # element is known to be attached to the page.
        video_title = driver.find_element(By.XPATH, '//h1').text
        print('Scraping comments from:')
        print(video_title)

        entries = collect_comments(driver)
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # window), and `finally` guarantees that even when scraping fails.
        driver.quit()

    # Print every author/comment pair to the terminal.
    for author, comment in entries:
        print("Comment by:", author, "-" * 10)
        print(comment + "\n")

    # Append to the results file so the script can be run for multiple sites
    # and collect everything in one file. UTF-8 is set explicitly because
    # comments routinely contain emoji and other non-ASCII text.
    with open(RESULTS_FILE, "a", encoding="utf-8") as text_file:
        text_file.write(build_report(video_title, today, entries))


if __name__ == "__main__":
    main()