"""Scrape the comments under a video page with Selenium/Firefox.

The script asks for a URL on the terminal, opens it in Firefox, scrolls
down a few times so that comments get loaded, prints every
(author, comment) pair it finds and appends the result to ``results.txt``
in the current working directory.
"""

# import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import os
import time
import datetime

today = datetime.date.today()

# get the url from the terminal
url = input("Enter a url to scrape (include https:// etc.): ")

# Tell Selenium to open a new Firefox session
# and specify the path to the driver (geckodriver is expected to sit
# next to this script).
# NOTE(review): `executable_path` is kept for compatibility with the
# Selenium version this script was written for; newer Selenium releases
# expect a Service object instead - confirm against the installed version.
driver = webdriver.Firefox(
    executable_path=os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'geckodriver'
    )
)

# Everything that talks to the browser lives inside try/finally so the
# browser session is always shut down, even when a lookup fails.
try:
    # Implicit wait tells Selenium how long it should wait
    # before it throws an exception
    driver.implicitly_wait(10)
    driver.get(url)
    time.sleep(3)

    # Find the title element on the page; keep its text so it is still
    # available after the browser has been closed.
    title_text = driver.find_element(By.XPATH, '//h1').text
    print('Scraping comments from:')
    print(title_text)

    # Scroll to just under the video in order to load the comments,
    # then scroll further twice in order to load more comments.
    for scroll_y in (300, 2000, 4000):
        driver.execute_script("window.scrollTo(1, " + str(scroll_y) + ");")
        time.sleep(3)

    # Find the element on the page where the comments are stored.
    # NOTE: the two XPaths below start with '//' and therefore match
    # anywhere in the document, not only inside comment_div (unchanged
    # from the original behaviour).
    comment_div = driver.find_element(By.XPATH, '//*[@id="contents"]')
    comments = comment_div.find_elements(By.XPATH, '//*[@id="content-text"]')
    authors = comment_div.find_elements(By.XPATH, '//*[@id="author-text"]')

    # Extract the text while the page is still open.
    authors_list = [author.text for author in authors]
    comments_list = [comment.text for comment in comments]
finally:
    # quit() ends the whole WebDriver session (browser and driver
    # process); close() would only close the current window.
    driver.quit()

# Pair every author with their comment. A list of pairs is used instead
# of a dict because a dict keyed by author name silently drops all but
# one comment of anybody who commented more than once.
author_comment_pairs = list(zip(authors_list, comments_list))

# Print the authors and comments to the terminal
# then add them to a print_list which we'll use to write everything
# to a text file later
print_list = []
for a, b in author_comment_pairs:
    print("Comment by:", str(a), "-" * 10)
    print(str(b) + "\n")
    print_list.append("Comment by: " + str(a) + " -" + "-" * 10)
    print_list.append(str(b) + "\n")

# Done? Great!
# Change the list into a collection of strings
# Open a txt file and put them there
# In case the file already exists, then just paste it at the bottom
# Super handy if you want to run the script for multiple sites and collect text
# in just one file
print_list_strings = "\n".join(print_list)

# Explicit UTF-8 so emoji / non-ASCII comment text cannot fail on
# platforms whose default encoding is narrower; the context manager
# closes the file even if a write raises.
with open("results.txt", "a", encoding="utf-8") as text_file:
    text_file.write("Video: " + title_text + "\n")
    text_file.write("Date:" + str(today) + "\n" + "\n")
    text_file.write(print_list_strings + "\n")