You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3.8 KiB
3.8 KiB
Dynamically loaded content downloader
Selenium + BeautifulSoup
In [32]:
# The standard library modules import os import sys # The wget module import wget # The BeautifulSoup module from bs4 import BeautifulSoup # The selenium module from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By
In [ ]:
# Start a Selenium-controlled Firefox session.
# NOTE(review): the positional path argument looks like it was carried over
# from the Chrome variant below; confirm how the installed Selenium version's
# webdriver.Firefox interprets it (driver location vs. profile/options).
driver = webdriver.Firefox("/Users/pnofrc/")

# Chrome alternative:
# driver = webdriver.Chrome("/Users/pnofrc/")

# Load the web page.
driver.get("https://mubi.com/it/films/music-and-apocalypse/watch")
In [ ]:
# For websites that require a login before the content is accessible.
# `find_element_by_id` was removed in Selenium 4.3; `find_element(By.ID, ...)`
# is available in both Selenium 3 and Selenium 4.
elem = driver.find_element(By.ID, "email")  # email input field of the login form
elem.send_keys("user@example.com")  # type the user's email

elem = driver.find_element(By.ID, "pwd")  # password field of the login form
elem.send_keys("userpwd")  # type the user's password
elem.send_keys(Keys.RETURN)  # press the Enter key in the password field
In [ ]:
# Navigate to the page that contains the video.
driver.get("http://www.example.com/path/of/video/page.html")
In [ ]:
# Wait (with a timeout of 50) until the element with the given id is visible
# on the page, then take the HTML source of the page.
WebDriverWait(driver, 50).until(
    EC.visibility_of_element_located((By.ID, "the-element-id"))
)
src = driver.page_source
In [ ]:
# Parse the captured HTML source "src" with the lxml backend.
parser = BeautifulSoup(src, "lxml")

# Attributes (name -> required value) that a <video> tag must have to match.
list_of_attributes = {"class": "some-class", "name": "some-name"}

# Collect every matching <video> tag. `find_all` is the current Beautiful Soup
# method name; `findAll` is only a deprecated legacy alias for it.
tag = parser.find_all('video', attrs=list_of_attributes)
In [ ]:
# Index of the wanted video element among the matched <video> tags.
n = 0

# Read the video's address from the src attribute of the selected tag.
video_tag = tag[n]
url = video_tag['src']

# Download the video to the given output path.
wget.download(url, out="path/to/output/file")
In [ ]:
# Shut the driver down. `close()` only closes the current browser window and
# can leave the session and the driver process running; `quit()` closes every
# window and ends the WebDriver session.
driver.quit()