# grad_prototypes/workshop_pyratechnic_session_1/scrapescripts/Selenium/selenium1.py

# import libraries
import datetime
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Remember the run date; it is written next to the results at the end
today = datetime.date.today()

# Ask on the terminal which page should be scraped
url = input("Enter a url to scrape (include https:// etc.): ")

# Start a new Firefox session. The geckodriver binary is looked up in the
# same folder as this script, so its path is built from __file__.
# NOTE: executable_path is the older way of pointing Selenium at the driver;
# recent Selenium releases expect a Service object instead.
script_dir = os.path.dirname(os.path.realpath(__file__))
geckodriver_path = script_dir + '/geckodriver'
driver = webdriver.Firefox(executable_path=geckodriver_path)

# Implicit wait: element lookups keep retrying for up to 10 seconds
# before Selenium gives up and throws an exception
driver.implicitly_wait(10)

# Open the page, then pause for 3 seconds before touching it
driver.get(url)
time.sleep(3)

# Find the title element on the page (the first <h1>).
# find_element(By.XPATH, ...) is used because the find_element_by_* helpers
# were removed in Selenium 4.3; this form works on older releases as well.
title = driver.find_element(By.XPATH, '//h1')
print('Scraping comments from:')
print(title.text)

# Scroll down in three steps: first to just under the video so the comments
# start loading, then further down twice to load more of them.
# Each step is followed by a 3 second pause before the next scroll.
for scroll_y in (300, 2000, 4000):
    driver.execute_script('window.scrollTo(1, %d);' % scroll_y)
    time.sleep(3)

# Find the element on the page where the comments are stored.
# This lookup raises if no element with id "contents" is present.
comment_div = driver.find_element(By.XPATH, '//*[@id="contents"]')

# NOTE: an XPath that starts with "//" is evaluated against the whole
# document even when it is called on an element, so these two lookups are
# not restricted to comment_div.
comments = comment_div.find_elements(By.XPATH, '//*[@id="content-text"]')
authors = comment_div.find_elements(By.XPATH, '//*[@id="author-text"]')

# Extract the visible text of every author and comment element
authors_list = [author.text for author in authors]
comments_list = [comment.text for comment in comments]

# Pair authors and comments by position. A list of pairs is used instead of
# a dictionary keyed by author: a dictionary keeps only one comment per
# author, so people who commented more than once would lose comments.
# zip() stops at the shorter of the two lists.
comment_pairs = list(zip(authors_list, comments_list))

# Print every author/comment pair to the terminal and collect the same
# lines in print_list, which is written to a text file later
print_list = []

for author_name, comment_body in comment_pairs:
    print("Comment by:", author_name, "-"*10)
    print(comment_body + "\n")
    print_list.append("Comment by: " + author_name + " -" + "-"*10)
    print_list.append(comment_body + "\n")

# Done? Great!
# Join the collected lines into one string and append it to results.txt.
# The file is opened in append mode, so if it already exists the new text is
# added at the bottom - handy if you want to run the script for multiple
# sites and collect everything in just one file.
print_list_strings = "\n".join(print_list)

try:
    # UTF-8 is set explicitly so comments containing emoji or other
    # non-ASCII characters can be written whatever the system default
    # encoding is. The with-block closes the file even if a write fails.
    with open("results.txt", "a", encoding="utf-8") as text_file:
        # title.text is read from the live page, so the browser must still
        # be open at this point
        text_file.write("Video: "+title.text+"\n")
        text_file.write("Date:"+str(today)+"\n"+"\n")
        text_file.write(print_list_strings+"\n")
finally:
    # End the browser session. quit() closes every window and stops the
    # driver process, whereas close() only closes the current window.
    # Running it in finally shuts the browser down even if writing failed.
    driver.quit()