Implement a Python-requests-based downloader

rlaphoenix 2023-05-12 07:02:39 +01:00
parent cb82febb7c
commit be403bbff4
4 changed files with 100 additions and 2 deletions


@@ -135,8 +135,13 @@ Choose what software to use to download data throughout Devine where needed.
Options:
- `aria2c` (default) - https://github.com/aria2/aria2
- `requests` - https://github.com/psf/requests
- `saldl` - https://github.com/saldl/saldl

Note that aria2c can reach the highest speeds as it uses threading and more connections than the other
downloaders. However, aria2c can also be one of the more unstable downloaders; it may work fine one day
and fail the next. It also does not support HTTP(S) proxies, while the other downloaders do.
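
For reference, switching to the new backend is then a one-line change in the config. A minimal sketch, assuming the YAML config file Devine reads (the filename shown is illustrative):

```yaml
# devine.yaml (name/location assumed)
downloader: requests  # instead of the default "aria2c", or "saldl"
```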

## headers (dict)
Case-Insensitive dictionary of headers that all Services begin their Request Session state with.


@@ -1,5 +1,6 @@
from .aria2c import aria2c
from .requests import requests
from .saldl import saldl
from .downloader import downloader

__ALL__ = (downloader, aria2c, requests, saldl)


@@ -2,10 +2,11 @@ import asyncio
from functools import partial

from devine.core.config import config
from devine.core.downloaders import aria2c, requests, saldl

downloader = {
    "aria2c": partial(asyncio.run, aria2c),
    "requests": requests,
    "saldl": partial(asyncio.run, saldl)
}[config.downloader]
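
Because the new backend is a plain synchronous function, it goes into the mapping as-is, while the async aria2c and saldl coroutines are wrapped with asyncio.run. With `downloader: requests` configured, callers can use the resolved callable directly. A minimal sketch (the URL and output path are placeholders, not part of this commit):

```python
from pathlib import Path

from devine.core.downloaders import downloader  # resolved from config.downloader at import time

# hypothetical one-off download; headers, proxy and progress are all optional
downloader(
    "https://example.com/file.mp4",  # placeholder URL
    out=Path("file.mp4")
)
```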


@@ -0,0 +1,91 @@
import time
from functools import partial
from pathlib import Path
from typing import Optional, Union, Any

from requests import Session
from rich import filesize
from rich.filesize import decimal


def requests(
    uri: Union[str, list[str]],
    out: Path,
    headers: Optional[dict] = None,
    proxy: Optional[str] = None,
    progress: Optional[partial] = None,
    *_: Any,
    **__: Any
) -> int:
    """
    Download files using Python Requests.
    https://requests.readthedocs.io

    If multiple URLs are provided they will be downloaded in the provided order
    to the output directory. They will not be merged together.
    """
    if isinstance(uri, list) and len(uri) == 1:
        uri = uri[0]

    if isinstance(uri, list):
        if out.is_file():
            raise ValueError("Expecting out to be a Directory path not a File as multiple URLs were provided")
        uri = [
            (url, out / f"{i:08}.mp4")
            for i, url in enumerate(uri)
        ]
    else:
        uri = [(uri, out.parent / out.name)]

    session = Session()
    if headers:
        # drop any Accept-Encoding header from the Service; requests manages encoding itself
        headers = {
            k: v
            for k, v in headers.items()
            if k.lower() != "accept-encoding"
        }
        session.headers.update(headers)
    if proxy:
        session.proxies.update({"all": proxy})

    # total size across all URLs, via HEAD requests, so the progress bar is determinate
    total_size = sum(
        int(session.head(url).headers["Content-Length"])
        for url, _ in uri
    )
    if progress:
        progress(total=total_size)

    download_sizes = []
    last_speed_refresh = time.time()

    for url, out_path in uri:
        stream = session.get(url, stream=True)
        file_size = int(stream.headers["Content-Length"])
        with open(out_path, "wb") as f:
            written = 0
            for chunk in stream.iter_content(chunk_size=1024):
                download_size = len(chunk)
                f.write(chunk)
                written += download_size
                if progress:
                    progress(advance=download_size)

                    now = time.time()
                    time_since = now - last_speed_refresh
                    download_sizes.append(download_size)
                    # refresh the speed readout every 5 seconds, or on a short (likely final) chunk
                    if time_since > 5 or download_size < 1024:
                        data_size = sum(download_sizes)
                        download_speed = data_size / (time_since or 1)
                        progress(downloaded=f"{filesize.decimal(download_speed)}/s")
                        last_speed_refresh = now
                        download_sizes.clear()
        if written < file_size:
            raise ValueError(
                f"{url} finished downloading unexpectedly, got {decimal(written)}/{decimal(file_size)}")

    return 0


__ALL__ = (requests,)
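
The `progress` parameter is typed as an `Optional[partial]`, and the calls above (`total=`, `advance=`, and the custom `downloaded=` field) line up with a rich progress update bound to a task. A minimal standalone sketch of wiring that up (the URL, headers, and column layout are illustrative assumptions, not part of this commit):

```python
from functools import partial
from pathlib import Path

from rich.progress import BarColumn, DownloadColumn, Progress, TextColumn

from devine.core.downloaders import requests as requests_downloader  # avoid shadowing the requests library

with Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    DownloadColumn(),
    TextColumn("{task.fields[downloaded]}")  # custom speed field this downloader refreshes
) as p:
    task = p.add_task("Downloading", total=None, downloaded="-")
    requests_downloader(
        "https://example.com/video.mp4",   # placeholder URL
        out=Path("video.mp4"),
        headers={"User-Agent": "Devine"},  # illustrative; Services normally supply these
        progress=partial(p.update, task)   # becomes p.update(task, total=...), p.update(task, advance=...), ...
    )
```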