grad_prototypes/workshop_pyratechnic_session_1/scrapescripts/Selenium/selenium1.py

# import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime

# Record the run date; it is written into the results file later on
today = datetime.date.today()

# Ask on the terminal which page should be scraped
url = input("Enter a url to scrape (include https:// etc.): ")

# Start a new Firefox session, pointing Selenium at the geckodriver
# binary that lives in the same directory as this script
script_dir = os.path.dirname(os.path.realpath(__file__))
driver = webdriver.Firefox(executable_path=script_dir + '/geckodriver')

# Implicit wait: how long Selenium keeps trying before it throws an exception
driver.implicitly_wait(10)

# Load the page, then pause a few seconds before touching it
driver.get(url)
time.sleep(3)

# Locate the first <h1> on the page and report its text as the title
title = driver.find_element_by_xpath('//h1')
print('Scraping comments from:')
print(title.text)

# Scroll down in three stages (first to just under the video, then further
# twice) so that more comments get loaded, pausing after every step
for scroll_y in (300, 2000, 4000):
    driver.execute_script("window.scrollTo(1, {});".format(scroll_y))
    time.sleep(3)

# Find the element on the page where the comments are stored
# NOTE(review): these XPaths start with '//', which searches the whole
# document rather than only inside comment_div; './/' would scope the search
# to comment_div. Left unchanged here -- confirm against the page structure.
comment_div = driver.find_element_by_xpath('//*[@id="contents"]')
comments = comment_div.find_elements_by_xpath('//*[@id="content-text"]')
authors = comment_div.find_elements_by_xpath('//*[@id="author-text"]')

# Extract the visible text of every author and comment element
authors_list = [author.text for author in authors]
comments_list = [comment.text for comment in comments]

# Pair authors with comments by position. A list of pairs is used instead of
# a dictionary keyed by author: a dictionary would keep only the last comment
# of anyone who commented more than once.
# zip() stops at the shorter list, so unmatched trailing items are dropped.
comment_pairs = list(zip(authors_list, comments_list))

# Print every author/comment pair to the terminal
# and collect the same lines in print_list for the text file below
print_list = []

for a, b in comment_pairs:
    print("Comment by:", str(a), "-" * 10)
    print(str(b) + "\n")
    print_list.append("Comment by: " + str(a) + " -" + "-" * 10)
    print_list.append(str(b) + "\n")

# Done? Great!
# Join the collected lines into one string and append it to results.txt.
# Append mode creates the file if needed and otherwise adds to the bottom,
# which is handy for running the script on several pages and collecting
# everything in just one file.
# The file is opened as UTF-8 so comments containing emoji or non-Latin
# characters do not fail on platforms whose default encoding cannot represent
# them, and the with-block closes the file even if a write raises.
print_list_strings = "\n".join(print_list)
with open("results.txt", "a", encoding="utf-8") as text_file:
    text_file.write("Video: " + title.text + "\n")
    text_file.write("Date:" + str(today) + "\n" + "\n")
    text_file.write(print_list_strings + "\n")

# Shut the browser down. quit() ends the whole WebDriver session (all windows
# plus the driver process), whereas close() only closes the current window.
# NOTE(review): this line is not reached if an earlier step raises; wrapping
# the script body in try/finally would guarantee the cleanup.
driver.quit()
Added sketches from t4 6 years ago			`# import libraries`
			`from selenium import webdriver`
			`from selenium.webdriver.common.keys import Keys`
			`import os`
			`import time`
			`import datetime`

			`today = datetime.date.today()`

			`# get the url from the terminal`
			`url = input("Enter a url to scrape (include https:// etc.): ")`

			`# Tell Selenium to open a new Firefox session`
			`# and specify the path to the driver`
			`driver = webdriver.Firefox(executable_path=os.path.dirname(os.path.realpath(__file__)) + '/geckodriver')`

			`# Implicit wait tells Selenium how long it should wait before it throws an exception`
			`driver.implicitly_wait(10)`
			`driver.get(url)`
			`time.sleep(3)`

			`# Find the title element on the page`
			`title = driver.find_element_by_xpath('//h1')`
			`print ('Scraping comments from:')`
			`print(title.text)`

			`# scroll to just under the video in order to load the comments`
			`driver.execute_script("window.scrollTo(1, 300);")`
			`time.sleep(3)`

			`# scroll again in order to load more comments`
			`driver.execute_script('window.scrollTo(1, 2000);')`
			`time.sleep(3)`

			`# scroll again in order to load more comments`
			`driver.execute_script('window.scrollTo(1, 4000);')`
			`time.sleep(3)`

			`# Find the element on the page where the comments are stored`
			`comment_div=driver.find_element_by_xpath('//*[@id="contents"]')`
			`comments=comment_div.find_elements_by_xpath('//*[@id="content-text"]')`
			`authors=comment_div.find_elements_by_xpath('//*[@id="author-text"]')`

			`# Extract the contents and add them to the lists`
			`# This will let you create a dictionary later, of authors and comments`
			`authors_list = []`
			`comments_list = []`

			`for author in authors:`
			`authors_list.append(author.text)`

			`for comment in comments:`
			`comments_list.append(comment.text)`

			`dictionary = dict(zip(authors_list, comments_list))`

			`# Print the keys and values of our dictionary to the terminal`
			`# then add them to a print_list which we'll use to write everything to a text file later`
			`print_list = []`

			`for a, b in dictionary.items():`
			`print ("Comment by:", str(a), "-"*10)`
			`print (str(b)+"\n")`
			`print_list.append("Comment by: "+str(a)+" -"+"-"*10)`
			`print_list.append(str(b)+"\n")`

			`# Done? Great!`
			`# Change the list into a collection of strings`
			`# Open a txt file and put them there`
			`# In case the file already exists, then just paste it at the bottom`
			`# Super handy if you want to run the script for multiple sites and collect text`
			`# in just one file`
			`print_list_strings = "\n".join(print_list)`
			`text_file = open("results.txt", "a+")`
			`text_file.write("Video: "+title.text+"\n")`
			`text_file.write("Date:"+str(today)+"\n"+"\n")`
			`text_file.write(print_list_strings+"\n")`
			`text_file.close()`

			`# close the browser`
			`driver.close()`