The problem of using save_screenshot
is that we cannot save an image in its original quality and cannot restore the alpha channel in an image. Therefore, I propose another solution. Here is a complete example using the selenium-wire
library suggested by @codam_hsmits. It is possible to download images via ChromeDriver
.
I have defined the following function to parse each request and save the request body to a file when necessary.
from seleniumwire import webdriver # Import from seleniumwire
from urllib.parse import urlparse
import os
from mimetypes import guess_extension
import time
import datetime
def download_assets(requests,
asset_dir="temp",
default_fname="unnamed",
skip_domains=["facebook", "google", "yahoo", "agkn", "2mdn"],
exts=[".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".bmp", ".webp", ".ico"],
append_ext=False):
asset_list = {}
for req_idx, request in enumerate(requests):
# request.headers
# request.response.body is the raw response body in bytes
if request is None or request.response is None or request.response.headers is None or 'Content-Type' not in request.response.headers:
continue
ext = guess_extension(request.response.headers['Content-Type'].split(';')[0].strip())
if ext is None or ext == "" or ext not in exts:
#Don't know the file extention, or not in the whitelist
continue
parsed_url = urlparse(request.url)
skip = False
for d in skip_domains:
if d in parsed_url.netloc:
skip = True
break
if skip:
continue
frelpath = parsed_url.path.strip()
if frelpath == "":
timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
frelpath = f"{default_fname}_{req_idx}_{timestamp}{ext}"
elif frelpath.endswith("\\") or frelpath.endswith("/"):
timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
frelpath = frelpath + f"{default_fname}_{req_idx}_{timestamp}{ext}"
elif append_ext and not frelpath.endswith(ext):
frelpath = frelpath + f"_{default_fname}{ext}" #Missing file extension but may not be a problem
if frelpath.startswith("\\") or frelpath.startswith("/"):
frelpath = frelpath[1:]
fpath = os.path.join(asset_dir, parsed_url.netloc, frelpath)
if os.path.isfile(fpath):
continue
os.makedirs(os.path.dirname(fpath), exist_ok=True)
print(f"Downloading {request.url} to {fpath}")
asset_list[fpath] = request.url
try:
with open(fpath, "wb") as file:
file.write(request.response.body)
except:
print(f"Cannot download {request.url} to {fpath}")
return asset_list
Let's download some images from Google homepage to temp
folder.
# Create a new instance of the Chrome/Firefox driver
driver = webdriver.Chrome()
# Go to the Google home page
driver.get('https://www.google.com')
# Download content to temp folder
asset_dir = "temp"
while True:
# Please browser the internet, it will collect the images for every second
time.sleep(1)
download_assets(driver.requests, asset_dir=asset_dir)
driver.close()
Note that it cannot decide which images can be seen on the page rather than being hidden in the background, so the users should actively click the buttons or links to trigger new download requests.