{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dynamically loaded content downloader"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Selenium + BeautifulSoup\n",
    "\n",
    "Load a page in a real browser with Selenium, parse the rendered HTML with BeautifulSoup, and download the video it references with `wget`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reference: https://dvenkatsagar.github.io/tutorials/python/2015/10/26/ddlv/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The standard library modules\n",
    "import os\n",
    "import sys\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "# The wget module\n",
    "import wget\n",
    "\n",
    "# The BeautifulSoup module\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# The selenium module\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.keys import Keys\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Start the browser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start Firefox via geckodriver, which must be discoverable on PATH.\n",
    "# No path is passed positionally: the first positional argument of\n",
    "# webdriver.Firefox is not the driver location (older Selenium releases read\n",
    "# it as a Firefox profile directory, current ones as an Options object).\n",
    "# Swap in webdriver.Chrome() to drive Chrome via chromedriver instead.\n",
    "driver = webdriver.Firefox()\n",
    "driver.get(\"https://mubi.com/it/films/music-and-apocalypse/watch\")  # load the web page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Log in"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For websites that need you to log in to access the information.\n",
    "# Credentials are read from the environment so real ones never have to be\n",
    "# saved in the notebook; the fallbacks are placeholder values.\n",
    "login_email = os.environ.get(\"LOGIN_EMAIL\", \"user@example.com\")\n",
    "login_password = os.environ.get(\"LOGIN_PASSWORD\", \"userpwd\")\n",
    "\n",
    "# find_element(By.ID, ...) replaces find_element_by_id, which newer\n",
    "# Selenium releases no longer provide.\n",
    "email_field = driver.find_element(By.ID, \"email\")  # email input of the login form\n",
    "email_field.send_keys(login_email)  # type the user's email\n",
    "password_field = driver.find_element(By.ID, \"pwd\")  # password input of the login form\n",
    "password_field.send_keys(login_password)  # type the user's password\n",
    "password_field.send_keys(Keys.RETURN)  # press the enter key"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Open the video page and wait for it to render"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"http://www.example.com/path/of/video/page.html\")  # load the page that has the video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wait (up to 50 seconds) until the element with the given id is visible,\n",
    "# then capture the HTML of the page as the browser currently holds it.\n",
    "WebDriverWait(driver, 50).until(\n",
    "    EC.visibility_of_element_located((By.ID, \"the-element-id\"))\n",
    ")\n",
    "src = driver.page_source  # html source of the page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Find the video tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "parser = BeautifulSoup(src, \"lxml\")  # parse the captured source with the lxml parser\n",
    "list_of_attributes = {\"class\": \"some-class\", \"name\": \"some-name\"}  # attributes the video tag must have\n",
    "# find_all is the current name of the deprecated findAll alias.\n",
    "tag = parser.find_all(\"video\", attrs=list_of_attributes)  # all matching video tags"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Download the video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 0  # index of the wanted video element among the matches\n",
    "if n >= len(tag):\n",
    "    # Fail with a message that says what was (not) found instead of a bare IndexError.\n",
    "    raise IndexError(f\"no <video> match at index {n}: found {len(tag)} matching tag(s)\")\n",
    "# Resolve against the current page so that a relative src still yields a full URL;\n",
    "# an absolute src passes through unchanged.\n",
    "url = urljoin(driver.current_url, tag[n][\"src\"])\n",
    "wget.download(url, out=\"path/to/output/file\")  # download the video"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Shut down the browser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quit() ends the whole session (all windows and the driver process);\n",
    "# close() would only close the current window.\n",
    "driver.quit()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}