You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

81 lines
2.5 KiB
Python

# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime
# Date stamp that is written next to each scrape in the results file
today = datetime.date.today()
# get the url from the terminal
url = input("Enter a url to scrape (include https:// etc.): ")
# Tell Selenium to open a new Firefox session
# and specify the path to the driver (geckodriver is looked up next to this script)
# NOTE(review): executable_path / find_element(s)_by_xpath are Selenium 3-style
# calls -- confirm the installed Selenium version still provides them
driver = webdriver.Firefox(
    executable_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'geckodriver')
)
# Everything that uses the browser runs inside try/finally so the Firefox
# session is shut down even when an element is missing or the file write fails
try:
    # Implicit wait tells Selenium how long it should wait before it throws an exception
    driver.implicitly_wait(10)
    driver.get(url)
    time.sleep(3)
    # Find the title element on the page and read its text once, so the
    # terminal output and the results file are guaranteed to show the same title
    title = driver.find_element_by_xpath('//h1')
    title_text = title.text
    print('Scraping comments from:')
    print(title_text)
    # Scroll to just under the video to load the comments, then scroll
    # further down twice in order to load more comments; pause after each
    # scroll to give the page time to load them
    for scroll_y in (300, 2000, 4000):
        driver.execute_script("window.scrollTo(1, %d);" % scroll_y)
        time.sleep(3)
    # Find the element on the page where the comments are stored
    comment_div = driver.find_element_by_xpath('//*[@id="contents"]')
    # The XPaths below start with '//' and therefore match anywhere in the
    # document, not only below comment_div. This is kept on purpose: switching
    # to './/' could change which elements are found.
    comments = comment_div.find_elements_by_xpath('//*[@id="content-text"]')
    authors = comment_div.find_elements_by_xpath('//*[@id="author-text"]')
    # Extract the text of every author and comment element
    authors_list = [author.text for author in authors]
    comments_list = [comment.text for comment in comments]
    # Print every author/comment pair to the terminal
    # then add them to a print_list which we'll use to write everything to a text file later.
    # The pairs are walked directly instead of going through a dict keyed on the
    # author name: a dict keeps only one comment per author, so repeat commenters
    # would silently lose all but their last comment.
    # assumes authors and comments come back in the same order, one author per
    # comment -- TODO confirm; zip() stops at the shorter of the two lists
    print_list = []
    for author_name, comment_text in zip(authors_list, comments_list):
        print("Comment by:", author_name, "-" * 10)
        print(comment_text + "\n")
        print_list.append("Comment by: " + author_name + " -" + "-" * 10)
        print_list.append(comment_text + "\n")
    # Done? Great!
    # Change the list into a collection of strings
    # Open a txt file and put them there
    # In case the file already exists, then just paste it at the bottom
    # Super handy if you want to run the script for multiple sites and collect text
    # in just one file
    print_list_strings = "\n".join(print_list)
    # The with-block closes the file even if a write fails; UTF-8 is requested
    # explicitly so non-ASCII comment text does not depend on the platform's
    # default encoding
    with open("results.txt", "a", encoding="utf-8") as text_file:
        text_file.write("Video: " + title_text + "\n")
        text_file.write("Date:" + str(today) + "\n" + "\n")
        text_file.write(print_list_strings + "\n")
finally:
    # close the browser; quit() ends the whole WebDriver session rather than
    # only closing the current window
    driver.quit()