You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SI13_federico_patches/DynamicallyLoadedContentDow...

3.8 KiB

Dynamically loaded content downloader

Selenium + BeautifulSoup

In [32]:
# The standard library modules
import os
import sys

# The wget module
import wget

# The BeautifulSoup module
from bs4 import BeautifulSoup

# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
In [ ]:
# Folder that holds the WebDriver executables.
# NOTE(review): assumes geckodriver / chromedriver were placed in this folder
# under their default names -- confirm before running.
DRIVER_DIR = "/Users/pnofrc/"

# The first positional argument of webdriver.Firefox is not the driver
# location (Selenium 3 treats it as a Firefox profile), so the geckodriver
# binary is passed explicitly through the executable_path keyword.
# NOTE(review): newer Selenium releases configure the driver path through a
# Service object instead -- adapt if the installed version rejects this keyword.
driver = webdriver.Firefox(executable_path=os.path.join(DRIVER_DIR, "geckodriver"))
# Chrome alternative:
# driver = webdriver.Chrome(executable_path=os.path.join(DRIVER_DIR, "chromedriver"))
driver.get("https://mubi.com/it/films/music-and-apocalypse/watch")  # load the web page
In [ ]:
# For websites that require a login before the content can be accessed.
# Elements are looked up with find_element(By.ID, ...): the older
# find_element_by_id helper was removed in Selenium 4.3, while this form is
# accepted by both Selenium 3 and Selenium 4.

# Wait (up to 50 s) until the e-mail input can be interacted with, because the
# form may not be in the DOM yet right after the page load; until() returns
# the located element.
elem = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.ID, "email")))
elem.send_keys("user@example.com")  # type the user's e-mail
elem = driver.find_element(By.ID, "pwd")  # password field of the login form
elem.send_keys("userpwd")  # type the user's password
elem.send_keys(Keys.RETURN)  # press Enter to submit the form
In [ ]:
# Navigate the browser to the page that embeds the video.
video_page_url = "http://www.example.com/path/of/video/page.html"
driver.get(video_page_url)
In [ ]:
# Block for at most 50 seconds until the element with the given id is visible,
# i.e. until the dynamically loaded content has been rendered.
target_locator = (By.ID, "the-element-id")
page_wait = WebDriverWait(driver, 50)
page_wait.until(EC.visibility_of_element_located(target_locator))

# Snapshot the HTML of the page as the browser currently holds it.
src = driver.page_source
In [ ]:
# Parse the page source "src" using the "lxml" parser.
parser = BeautifulSoup(src, "lxml")

# Attributes (attribute name -> expected value) a <video> tag must carry to be
# selected. This is a dict, passed through attrs= so that the HTML "name"
# attribute is not confused with find_all's own tag-name argument.
list_of_attributes = {"class": "some-class", "name": "some-name"}

# find_all is the current method name; findAll is only a legacy alias of it.
# The result is a list-like collection of every matching <video> tag.
tag = parser.find_all("video", attrs=list_of_attributes)
In [ ]:
from urllib.parse import urljoin

n = 0  # index of the wanted video among the matched <video> tags

# Fail with an explanatory message when the page yielded fewer tags than
# expected (for example because the attribute filter matched nothing).
try:
    video = tag[n]
except IndexError:
    raise IndexError(
        "no <video> tag at index {}: {} matching tag(s) found".format(n, len(tag))
    ) from None

# The media address may sit on the <video> tag itself or on a nested
# <source> tag, so fall back to the first <source> that has a src attribute.
url = video.get("src")
if not url:
    source = video.find("source", src=True)
    if source is None:
        raise KeyError("the selected <video> tag has no 'src' and no <source src=...> child")
    url = source["src"]

# Resolve a relative address against the page the browser is currently on;
# an address that is already absolute is returned unchanged by urljoin.
url = urljoin(driver.current_url, url)

wget.download(url, out="path/to/output/file")  # download the video
In [ ]:
# End the whole WebDriver session: quit() closes every browser window and
# shuts the driver down, whereas close() only closes the current window.
driver.quit()