|
|
@ -2,7 +2,7 @@
|
|
|
|
"cells": [
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 7,
|
|
|
|
"execution_count": 1,
|
|
|
|
"id": "bbf97edb-4082-49fd-9946-56aa3f3d0eb4",
|
|
|
|
"id": "bbf97edb-4082-49fd-9946-56aa3f3d0eb4",
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
@ -26,7 +26,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 8,
|
|
|
|
"execution_count": 16,
|
|
|
|
"id": "d53ab4b1-da92-4ae2-ae11-02fa2d8ffe7c",
|
|
|
|
"id": "d53ab4b1-da92-4ae2-ae11-02fa2d8ffe7c",
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
@ -35,7 +35,7 @@
|
|
|
|
" dictionary = open('axios-example/pics.json',)\n",
|
|
|
|
" dictionary = open('axios-example/pics.json',)\n",
|
|
|
|
" l = json.load(dictionary)\n",
|
|
|
|
" l = json.load(dictionary)\n",
|
|
|
|
" \n",
|
|
|
|
" \n",
|
|
|
|
" cycles = 3\n",
|
|
|
|
" cycles = 10\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
" def scroll_to_end(wd):\n",
|
|
|
|
" def scroll_to_end(wd):\n",
|
|
|
|
" wd.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
|
|
|
|
" wd.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
|
|
|
@ -82,7 +82,10 @@
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
" # extract image url\n",
|
|
|
|
" # extract image url\n",
|
|
|
|
" actual_images = wd.find_elements(By.CLASS_NAME,'n3VNCb')\n",
|
|
|
|
" actual_images = wd.find_elements(By.CLASS_NAME,'n3VNCb')\n",
|
|
|
|
" actual_image = actual_images[-1]\n",
|
|
|
|
" try:\n",
|
|
|
|
|
|
|
|
" actual_image = actual_images[-1]\n",
|
|
|
|
|
|
|
|
" except Exception:\n",
|
|
|
|
|
|
|
|
" continue\n",
|
|
|
|
" if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):\n",
|
|
|
|
" if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):\n",
|
|
|
|
" image_urls.add(actual_image.get_attribute('src'))\n",
|
|
|
|
" image_urls.add(actual_image.get_attribute('src'))\n",
|
|
|
|
" linkPic = actual_image.get_attribute('src')\n",
|
|
|
|
" linkPic = actual_image.get_attribute('src')\n",
|
|
|
@ -124,7 +127,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 9,
|
|
|
|
"execution_count": 17,
|
|
|
|
"id": "6de53389-ded8-47e2-b0aa-c132405a308f",
|
|
|
|
"id": "6de53389-ded8-47e2-b0aa-c132405a308f",
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
"outputs": [
|
|
|
@ -132,8 +135,8 @@
|
|
|
|
"name": "stdout",
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"text": [
|
|
|
|
"Found: 267 search results. Extracting links from 0:267\n",
|
|
|
|
"Found: 281 search results. Extracting links from 0:281\n",
|
|
|
|
"Found: 200 image links, done!\n"
|
|
|
|
"Found: 260 image links, looking for more ...\n"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
],
|
|
|
|
],
|
|
|
@ -150,7 +153,7 @@
|
|
|
|
" search_box = wd.find_element(By.CLASS_NAME,'gLFyf')\n",
|
|
|
|
" search_box = wd.find_element(By.CLASS_NAME,'gLFyf')\n",
|
|
|
|
" search_box.send_keys(query)\n",
|
|
|
|
" search_box.send_keys(query)\n",
|
|
|
|
" \n",
|
|
|
|
" \n",
|
|
|
|
" links = harvesting(query,1000,wd)\n",
|
|
|
|
" links = harvesting(query,300,wd)\n",
|
|
|
|
" \n",
|
|
|
|
" \n",
|
|
|
|
" wd.quit()"
|
|
|
|
" wd.quit()"
|
|
|
|
]
|
|
|
|
]
|
|
|
|