Implement a Python-requests-based downloader

2023-05-12 07:02:39 +01:00 · 2023-05-12 07:02:39 +01:00 · be403bbff4
parent cb82febb7c
commit be403bbff4
4 changed files with 100 additions and 2 deletions
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -135,8 +135,13 @@ Choose what software to use to download data throughout Devine where needed.
 Options:

 - `aria2c` (default) - https://github.com/aria2/aria2
+- `requests` - https://github.com/psf/requests
 - `saldl` - https://github.com/saldl/saldl

+Note that aria2c can reach the highest speeds as it utilizes threading and more connections than the other
+downloaders. However, aria2c can also be one of the more unstable downloaders: it may work one day and fail
+the next. It also does not support HTTP(S) proxies, while the other downloaders do.
+
 ## headers (dict)

 Case-Insensitive dictionary of headers that all Services begin their Request Session state with.  
--- a/devine/core/downloaders/__init__.py
+++ b/devine/core/downloaders/__init__.py
@@ -1,5 +1,6 @@
 from .aria2c import aria2c
+from .requests import requests
 from .saldl import saldl
 from .downloader import downloader

-__ALL__ = (downloader, aria2c, saldl)
+__ALL__ = (downloader, aria2c, requests, saldl)
--- a/devine/core/downloaders/downloader.py
+++ b/devine/core/downloaders/downloader.py
@@ -2,10 +2,11 @@ import asyncio
 from functools import partial

 from devine.core.config import config
-from devine.core.downloaders import aria2c, saldl
+from devine.core.downloaders import aria2c, requests, saldl


 downloader = {
    "aria2c": partial(asyncio.run, aria2c),
+    "requests": requests,
    "saldl": partial(asyncio.run, saldl)
 }[config.downloader]
--- a/devine/core/downloaders/requests.py
+++ b/devine/core/downloaders/requests.py
@@ -0,0 +1,91 @@
+import time
+from functools import partial
+from pathlib import Path
+from typing import Optional, Union, Any
+
+from requests import Session
+from rich import filesize
+from rich.filesize import decimal
+
+
+def requests(
+    uri: Union[str, list[str]],
+    out: Path,
+    headers: Optional[dict] = None,
+    proxy: Optional[str] = None,
+    progress: Optional[partial] = None,
+    *_: Any,
+    **__: Any
+) -> int:
+    """
+    Download files using Python Requests.
+    https://requests.readthedocs.io
+
+    If multiple URLs are provided they will be downloaded in the provided order
+    to the output directory. They will not be merged together.
+    """
+    if isinstance(uri, list) and len(uri) == 1:
+        uri = uri[0]
+
+    if isinstance(uri, list):
+        if out.is_file():
+            raise ValueError("Expecting out to be a Directory path not a File as multiple URLs were provided")
+        uri = [
+            (url, out / f"{i:08}.mp4")
+            for i, url in enumerate(uri)
+        ]
+    else:
+        uri = [(uri, out.parent / out.name)]
+
+    session = Session()
+    if headers:
+        headers = {
+            k: v
+            for k, v in headers.items()
+            if k.lower() != "accept-encoding"
+        }
+        session.headers.update(headers)
+    if proxy:
+        session.proxies.update({"all": proxy})
+
+    total_size = sum(
+        int(session.head(url).headers["Content-Length"])
+        for url, _ in uri
+    )
+
+    if progress:
+        progress(total=total_size)
+
+    download_sizes = []
+    last_speed_refresh = time.time()
+
+    for url, out_path in uri:
+        stream = session.get(url, stream=True)
+        file_size = int(stream.headers["Content-Length"])
+        with open(out_path, "wb") as f:
+            written = 0
+            for chunk in stream.iter_content(chunk_size=1024):
+                download_size = len(chunk)
+                f.write(chunk)
+                written += download_size
+                if progress:
+                    progress(advance=download_size)
+
+                    now = time.time()
+                    time_since = now - last_speed_refresh
+
+                    download_sizes.append(download_size)
+                    if time_since > 5 or download_size < 1024:
+                        data_size = sum(download_sizes)
+                        download_speed = data_size / (time_since or 1)
+                        progress(downloaded=f"{filesize.decimal(download_speed)}/s")
+                        last_speed_refresh = now
+                        download_sizes.clear()
+        if written < file_size:
+            raise ValueError(
+                f"{url} finished downloading unexpectedly, got {decimal(written)}/{decimal(file_size)}")
+
+    return 0
+
+
+__ALL__ = (requests,)