From be403bbff43d6b62f095048c85f79618b00a6492 Mon Sep 17 00:00:00 2001
From: rlaphoenix
Date: Fri, 12 May 2023 07:02:39 +0100
Subject: [PATCH] Implement a Python-requests-based downloader

---
 CONFIG.md                             |  5 ++
 devine/core/downloaders/__init__.py   |  3 +-
 devine/core/downloaders/downloader.py |  3 +-
 devine/core/downloaders/requests.py   | 91 +++++++++++++++++++++++++++
 4 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 devine/core/downloaders/requests.py

diff --git a/CONFIG.md b/CONFIG.md
index f97b9cc..018e161 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -135,8 +135,13 @@ Choose what software to use to download data throughout Devine where needed.
 Options:
 
 - `aria2c` (default) - https://github.com/aria2/aria2
+- `requests` - https://github.com/psf/requests
 - `saldl` - https://github.com/saldl/saldl
 
+Note that aria2c can reach the highest speeds as it utilizes threading and more connections than the other
+downloaders. However, aria2c can also be one of the more unstable downloaders; it may work one day and
+fail the next. It also does not support HTTP(S) proxies while the other downloaders do.
+
 ## headers (dict)
 
 Case-Insensitive dictionary of headers that all Services begin their Request Session state with.
diff --git a/devine/core/downloaders/__init__.py b/devine/core/downloaders/__init__.py
index a39bb73..60de9c2 100644
--- a/devine/core/downloaders/__init__.py
+++ b/devine/core/downloaders/__init__.py
@@ -1,5 +1,6 @@
 from .aria2c import aria2c
+from .requests import requests
 from .saldl import saldl
 from .downloader import downloader
 
-__ALL__ = (downloader, aria2c, saldl)
+__ALL__ = (downloader, aria2c, requests, saldl)
diff --git a/devine/core/downloaders/downloader.py b/devine/core/downloaders/downloader.py
index 64b8174..9011d52 100644
--- a/devine/core/downloaders/downloader.py
+++ b/devine/core/downloaders/downloader.py
@@ -2,10 +2,11 @@ import asyncio
 from functools import partial
 
 from devine.core.config import config
-from devine.core.downloaders import aria2c, saldl
+from devine.core.downloaders import aria2c, requests, saldl
 
 downloader = {
     "aria2c": partial(asyncio.run, aria2c),
+    "requests": requests,
     "saldl": partial(asyncio.run, saldl)
 }[config.downloader]
 
diff --git a/devine/core/downloaders/requests.py b/devine/core/downloaders/requests.py
new file mode 100644
index 0000000..b961de8
--- /dev/null
+++ b/devine/core/downloaders/requests.py
@@ -0,0 +1,115 @@
+import time
+from functools import partial
+from pathlib import Path
+from typing import Optional, Union, Any
+
+from requests import Session
+from rich import filesize
+from rich.filesize import decimal
+
+
+def requests(
+    uri: Union[str, list[str]],
+    out: Path,
+    headers: Optional[dict] = None,
+    proxy: Optional[str] = None,
+    progress: Optional[partial] = None,
+    *_: Any,
+    **__: Any
+) -> int:
+    """
+    Download files using Python Requests.
+    https://requests.readthedocs.io
+
+    If multiple URLs are provided they will be downloaded in the provided order
+    to the output directory. They will not be merged together.
+
+    Parameters:
+        uri: A single URL, or a list of URLs to download in the given order.
+        out: File path to save to for a single URL, or the directory to save
+            into for multiple URLs (files are named `{index:08}.mp4`).
+        headers: Extra request headers. Any Accept-Encoding header is dropped.
+        proxy: Proxy URI to use for all requests.
+        progress: Callable that is given `total=`, `advance=`, and `downloaded=`
+            keyword arguments to report progress. No reporting is done if None.
+
+    Returns 0 once every URL has been downloaded.
+
+    Raises:
+        ValueError: If `out` is a file while multiple URLs were provided, or if
+            a download ended before its advertised Content-Length was received.
+        requests.exceptions.HTTPError: If a download responds with an error status.
+    """
+    if isinstance(uri, list) and len(uri) == 1:
+        uri = uri[0]
+
+    if isinstance(uri, list):
+        if out.is_file():
+            raise ValueError("Expecting out to be a Directory path not a File as multiple URLs were provided")
+        uri = [
+            (url, out / f"{i:08}.mp4")
+            for i, url in enumerate(uri)
+        ]
+    else:
+        uri = [(uri, out)]
+
+    with Session() as session:
+        if headers:
+            headers = {
+                k: v
+                for k, v in headers.items()
+                if k.lower() != "accept-encoding"
+            }
+            session.headers.update(headers)
+        if proxy:
+            session.proxies.update({"all": proxy})
+
+        if progress:
+            # HEAD does not follow redirects by default (GET does), so request it to
+            # size the resource that will actually be downloaded. A response without
+            # a Content-Length counts as 0 instead of aborting the whole download.
+            total_size = sum(
+                int(session.head(url, allow_redirects=True).headers.get("Content-Length") or 0)
+                for url, _ in uri
+            )
+            progress(total=total_size)
+
+        download_sizes = []
+        last_speed_refresh = time.time()
+
+        for url, out_path in uri:
+            out_path.parent.mkdir(parents=True, exist_ok=True)
+            with session.get(url, stream=True) as stream:
+                # never save an HTTP error page as if it were the requested file
+                stream.raise_for_status()
+                # 0 means no size was advertised, so truncation cannot be detected
+                file_size = int(stream.headers.get("Content-Length") or 0)
+                written = 0
+                with open(out_path, "wb") as f:
+                    for chunk in stream.iter_content(chunk_size=1024):
+                        download_size = len(chunk)
+                        f.write(chunk)
+                        written += download_size
+                        if not progress:
+                            continue
+                        progress(advance=download_size)
+
+                        now = time.time()
+                        time_since = now - last_speed_refresh
+
+                        download_sizes.append(download_size)
+                        # refresh the speed every 5s, or on a short (usually last) chunk
+                        if time_since > 5 or download_size < 1024:
+                            data_size = sum(download_sizes)
+                            download_speed = data_size / (time_since or 1)
+                            progress(downloaded=f"{filesize.decimal(download_speed)}/s")
+                            last_speed_refresh = now
+                            download_sizes.clear()
+            if written < file_size:
+                raise ValueError(
+                    f"{url} finished downloading unexpectedly, got {decimal(written)}/{decimal(file_size)}")
+
+    return 0
+
+
+__ALL__ = (requests,)