Implement a Python-requests-based downloader

rlaphoenix 2023-05-12 07:02:39 +01:00
parent cb82febb7c
commit be403bbff4
4 changed files with 100 additions and 2 deletions


@@ -135,8 +135,13 @@ Choose what software to use to download data throughout Devine where needed.
Options:
- `aria2c` (default) - https://github.com/aria2/aria2
- `requests` - https://github.com/psf/requests
- `saldl` - https://github.com/saldl/saldl

Note that aria2c can reach the highest speeds as it uses threading and more connections than the other
downloaders. However, aria2c can also be one of the more unstable downloaders; it may work fine one day
and fail the next. It also does not support HTTP(S) proxies, while the other downloaders do.
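
For reference, switching to the new backend is then a one-line change in the config. A minimal sketch, assuming the YAML config file Devine reads (the filename shown is illustrative):

```yaml
# devine.yaml (name/location assumed)
downloader: requests  # instead of the default "aria2c", or "saldl"
```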

## headers (dict)
Case-Insensitive dictionary of headers that all Services begin their Request Session state with.


@@ -1,5 +1,6 @@
from .aria2c import aria2c
from .requests import requests
from .saldl import saldl
from .downloader import downloader

__ALL__ = (downloader, aria2c, requests, saldl)


@@ -2,10 +2,11 @@ import asyncio
from functools import partial

from devine.core.config import config
from devine.core.downloaders import aria2c, requests, saldl

downloader = {
    "aria2c": partial(asyncio.run, aria2c),
    "requests": requests,
    "saldl": partial(asyncio.run, saldl)
}[config.downloader]
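
Because the new backend is a plain synchronous function, it goes into the mapping as-is, while the async aria2c and saldl coroutines are wrapped with asyncio.run. With `downloader: requests` configured, callers can use the resolved callable directly. A minimal sketch (the URL and output path are placeholders, not part of this commit):

```python
from pathlib import Path

from devine.core.downloaders import downloader  # resolved from config.downloader at import time

# hypothetical one-off download; headers, proxy and progress are all optional
downloader(
    "https://example.com/file.mp4",  # placeholder URL
    out=Path("file.mp4")
)
```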


@@ -0,0 +1,91 @@
import time
from functools import partial
from pathlib import Path
from typing import Optional, Union, Any

from requests import Session
from rich import filesize
from rich.filesize import decimal


def requests(
    uri: Union[str, list[str]],
    out: Path,
    headers: Optional[dict] = None,
    proxy: Optional[str] = None,
    progress: Optional[partial] = None,
    *_: Any,
    **__: Any
) -> int:
    """
    Download files using Python Requests.
    https://requests.readthedocs.io

    If multiple URLs are provided they will be downloaded in the provided order
    to the output directory. They will not be merged together.
    """
    if isinstance(uri, list) and len(uri) == 1:
        uri = uri[0]

    if isinstance(uri, list):
        if out.is_file():
            raise ValueError("Expecting out to be a Directory path not a File as multiple URLs were provided")
        uri = [
            (url, out / f"{i:08}.mp4")
            for i, url in enumerate(uri)
        ]
    else:
        uri = [(uri, out.parent / out.name)]

    session = Session()
    if headers:
        # drop any Accept-Encoding header from the Service; requests manages encoding itself
        headers = {
            k: v
            for k, v in headers.items()
            if k.lower() != "accept-encoding"
        }
        session.headers.update(headers)
    if proxy:
        session.proxies.update({"all": proxy})

    # total size across all URLs, via HEAD requests, so the progress bar is determinate
    total_size = sum(
        int(session.head(url).headers["Content-Length"])
        for url, _ in uri
    )
    if progress:
        progress(total=total_size)

    download_sizes = []
    last_speed_refresh = time.time()

    for url, out_path in uri:
        stream = session.get(url, stream=True)
        file_size = int(stream.headers["Content-Length"])
        with open(out_path, "wb") as f:
            written = 0
            for chunk in stream.iter_content(chunk_size=1024):
                download_size = len(chunk)
                f.write(chunk)
                written += download_size
                if progress:
                    progress(advance=download_size)

                    now = time.time()
                    time_since = now - last_speed_refresh
                    download_sizes.append(download_size)
                    # refresh the speed readout every 5 seconds, or on a short (likely final) chunk
                    if time_since > 5 or download_size < 1024:
                        data_size = sum(download_sizes)
                        download_speed = data_size / (time_since or 1)
                        progress(downloaded=f"{filesize.decimal(download_speed)}/s")
                        last_speed_refresh = now
                        download_sizes.clear()
        if written < file_size:
            raise ValueError(
                f"{url} finished downloading unexpectedly, got {decimal(written)}/{decimal(file_size)}")

    return 0


__ALL__ = (requests,)
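
The `progress` parameter is typed as an `Optional[partial]`, and the calls above (`total=`, `advance=`, and the custom `downloaded=` field) line up with a rich progress update bound to a task. A minimal standalone sketch of wiring that up (the URL, headers, and column layout are illustrative assumptions, not part of this commit):

```python
from functools import partial
from pathlib import Path

from rich.progress import BarColumn, DownloadColumn, Progress, TextColumn

from devine.core.downloaders import requests as requests_downloader  # avoid shadowing the requests library

with Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    DownloadColumn(),
    TextColumn("{task.fields[downloaded]}")  # custom speed field this downloader refreshes
) as p:
    task = p.add_task("Downloading", total=None, downloaded="-")
    requests_downloader(
        "https://example.com/video.mp4",   # placeholder URL
        out=Path("video.mp4"),
        headers={"User-Agent": "Devine"},  # illustrative; Services normally supply these
        progress=partial(p.update, task)   # becomes p.update(task, total=...), p.update(task, advance=...), ...
    )
```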