Implement an urllib downloader

Note: Doesn't seem to work if a proxy is provided.
rlaphoenix 2024-01-09 02:21:02 +00:00
parent 172ab64017
commit 3c01237392
2 changed files with 133 additions and 2 deletions
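
The proxy failure mentioned in the note is plausibly a ProxyHandler keying issue: urllib's ProxyHandler dict maps the scheme of the URL being *requested* to a proxy URL, so keying it by the proxy's own scheme means any https:// download silently bypasses the proxy. A minimal sketch (the proxy URL is a hypothetical example):

from urllib.request import ProxyHandler, build_opener

proxy = "http://user:pass@host:8080"  # hypothetical proxy URL

# As committed: keyed by the proxy's scheme ("http"), so only http://
# target URLs are proxied and https:// downloads go direct.
broken = build_opener(ProxyHandler({proxy.split("://")[0]: proxy}))

# Keyed by the target schemes instead: http and https are both proxied.
working = build_opener(ProxyHandler({"http": proxy, "https": proxy}))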

devine/core/downloaders/__init__.py

@@ -4,12 +4,14 @@ from ..config import config
 from .aria2c import aria2c
 from .curl_impersonate import curl_impersonate
 from .requests import requests
+from .urllib import urllib

 downloader = {
     "aria2c": lambda *args, **kwargs: asyncio.run(aria2c(*args, **kwargs)),
     "curl_impersonate": curl_impersonate,
-    "requests": requests
+    "requests": requests,
+    "urllib": urllib
 }[config.downloader]

-__all__ = ("downloader", "aria2c", "curl_impersonate", "requests")
+__all__ = ("downloader", "aria2c", "curl_impersonate", "requests", "urllib")

devine/core/downloaders/urllib.py

@@ -0,0 +1,129 @@
import time
from copy import deepcopy
from functools import partial
from http.cookiejar import CookieJar
from pathlib import Path
from typing import Any, MutableMapping, Optional, Union
from urllib.request import ProxyHandler, Request, build_opener

import requests
from requests.cookies import RequestsCookieJar, cookiejar_from_dict, get_cookie_header
from rich import filesize

from devine.core.constants import DOWNLOAD_CANCELLED

MAX_ATTEMPTS = 5
RETRY_WAIT = 2

def urllib(
    uri: Union[str, list[str]],
    out: Path,
    headers: Optional[dict] = None,
    cookies: Optional[Union[MutableMapping[str, str], RequestsCookieJar]] = None,
    proxy: Optional[str] = None,
    progress: Optional[partial] = None,
    *_: Any,
    **__: Any
) -> int:
    """
    Download files using Python urllib.
    https://docs.python.org/3/library/urllib.html

    If multiple URLs are provided they will be downloaded in the provided order
    to the output directory. They will not be merged together.
    """
    if isinstance(uri, list) and len(uri) == 1:
        uri = uri[0]

    if isinstance(uri, list):
        if out.is_file():
            raise ValueError("Expecting out to be a Directory path not a File as multiple URLs were provided")
        uri = [
            (url, out / f"{i:08}.mp4")
            for i, url in enumerate(uri)
        ]
    else:
        uri = [(uri, out.parent / out.name)]

    # strip Accept-Encoding so the response bytes can be written to disk as-is
    headers = headers or {}
    headers = {
        k: v
        for k, v in headers.items()
        if k.lower() != "accept-encoding"
    }
    if cookies and not isinstance(cookies, CookieJar):
        cookies = cookiejar_from_dict(cookies)

    if proxy:
        # ProxyHandler's dict is keyed by the *target* URL scheme, so the
        # proxy must be registered for both http and https downloads; keying
        # by the proxy's own scheme (proxy.split("://")[0]) left https
        # requests unproxied, which likely explains the note above
        proxy_handler = ProxyHandler({
            "http": proxy,
            "https": proxy
        })
        opener = build_opener(proxy_handler)
    else:
        # a plain opener so both branches expose the same .open() interface
        opener = build_opener()

    if progress:
        progress(total=len(uri))
    download_sizes = []
    last_speed_refresh = time.time()

    for url, out_path in uri:
        out_path.parent.mkdir(parents=True, exist_ok=True)

        headers_ = deepcopy(headers)
        if cookies:
            # urllib has no session object, so serialize the matching cookies
            # into a Cookie header via requests' cookiejar helpers
            mock_request = requests.Request(url=url)
            cookie_header = get_cookie_header(cookies, mock_request)
            if cookie_header:
                headers_["Cookie"] = cookie_header
        attempts = 1
        while True:
            try:
                req = Request(
                    url=url,
                    headers=headers_
                )
                res = opener.open(req)
                if cookies:  # guard: cookies may be None
                    cookies.extract_cookies(res, req)
                with open(out_path, "wb") as f:
                    written = 0
                    while True:
                        chunk = res.read(1024)
                        if not chunk:
                            break
                        download_size = len(chunk)
                        f.write(chunk)
                        written += download_size
                        if progress:
                            progress(advance=1)

                            now = time.time()
                            time_since = now - last_speed_refresh

                            download_sizes.append(download_size)
                            if time_since > 5 or download_size < 1024:
                                data_size = sum(download_sizes)
                                download_speed = data_size / (time_since or 1)
                                progress(downloaded=f"{filesize.decimal(download_speed)}/s")
                                last_speed_refresh = now
                                download_sizes.clear()
                break
            except Exception as e:
                # retry transient failures unless cancelled or out of attempts
                if DOWNLOAD_CANCELLED.is_set() or attempts == MAX_ATTEMPTS:
                    raise e
                time.sleep(RETRY_WAIT)
                attempts += 1
    return 0


__all__ = ("urllib",)