Merge pull request #354 from normalizedwater546/master

asyncio: fix downloader being run sequentially + httpx: fix proxy and missing headers
fix: add headers, proxy to async_request
2025-07-01 07:59:29 +02:00 · 2024-11-24 13:50:22 +08:00 · 2024-11-23 13:11:25 +00:00 · 2024-11-23 12:19:36 +00:00 · 2024-11-23 11:20:58 +00:00 · 2024-11-23 11:17:09 +00:00
9 changed files with 107 additions and 85 deletions
--- a/nhentai/__init__.py
+++ b/nhentai/__init__.py
@@ -1,3 +1,3 @@
-__version__ = '0.5.10'
+__version__ = '0.5.12'
 __author__ = 'RicterZ'
 __email__ = 'ricterzheng@gmail.com'
--- a/nhentai/command.py
+++ b/nhentai/command.py
@@ -77,7 +77,7 @@ def main():
        doujinshi_ids = list(set(map(int, doujinshi_ids)) - set(data))

    if not options.is_show:
-        downloader = Downloader(path=options.output_dir, size=options.threads,
+        downloader = Downloader(path=options.output_dir, threads=options.threads,
                                timeout=options.timeout, delay=options.delay)

        for doujinshi_id in doujinshi_ids:
--- a/nhentai/doujinshi.py
+++ b/nhentai/doujinshi.py
@@ -12,6 +12,7 @@ EXT_MAP = {
    'j': 'jpg',
    'p': 'png',
    'g': 'gif',
+    'w': 'webp',
 }


--- a/nhentai/downloader.py
+++ b/nhentai/downloader.py
@@ -1,24 +1,17 @@
 # coding: utf-

-import multiprocessing
-import signal
-
-import sys
 import os
-import requests
-import time
+import asyncio
+import httpx
 import urllib3.exceptions

 from urllib.parse import urlparse
 from nhentai import constant
 from nhentai.logger import logger
-from nhentai.parser import request
-from nhentai.utils import Singleton
+from nhentai.utils import Singleton, async_request


 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-semaphore = multiprocessing.Semaphore(1)
-

 class NHentaiImageNotExistException(Exception):
    pass
@@ -39,64 +32,66 @@ def download_callback(result):
        logger.log(16, f'{data} downloaded successfully')


-class Downloader(Singleton):
+async def fiber(tasks):
+    for completed_task in asyncio.as_completed(tasks):
+        try:
+            result = await completed_task
+            logger.info(f'{result[1]} download completed')
+        except Exception as e:
+            logger.error(f'An error occurred: {e}')

-    def __init__(self, path='', size=5, timeout=30, delay=0):
-        self.size = size
+
+class Downloader(Singleton):
+    def __init__(self, path='', threads=5, timeout=30, delay=0):
+        self.threads = threads
        self.path = str(path)
        self.timeout = timeout
        self.delay = delay

-    def download(self, url, folder='', filename='', retried=0, proxy=None):
-        if self.delay:
-            time.sleep(self.delay)
-        logger.info(f'Starting to download {url} ...')
-        filename = filename if filename else os.path.basename(urlparse(url).path)
-        base_filename, extension = os.path.splitext(filename)
+    async def _semaphore_download(self, semaphore, *args, **kwargs):
+        async with semaphore:
+            return await self.download(*args, **kwargs)
+
+    async def download(self, url, folder='', filename='', retried=0, proxy=None):
+        logger.info(f'Starting to download {url} ...')
+
+        if self.delay:
+            await asyncio.sleep(self.delay)
+
+        filename = filename if filename else os.path.basename(urlparse(url).path)
+
+        save_file_path = os.path.join(self.folder, filename)

-        save_file_path = os.path.join(folder, base_filename.zfill(3) + extension)
        try:
            if os.path.exists(save_file_path):
                logger.warning(f'Skipped download: {save_file_path} already exists')
                return 1, url

-            response = None
-            with open(save_file_path, "wb") as f:
-                i = 0
-                while i < 10:
-                    try:
-                        response = request('get', url, stream=True, timeout=self.timeout, proxies=proxy)
-                        if response.status_code != 200:
-                            path = urlparse(url).path
-                            for mirror in constant.IMAGE_URL_MIRRORS:
-                                print(f'{mirror}{path}')
-                                mirror_url = f'{mirror}{path}'
-                                response = request('get', mirror_url, stream=True,
-                                                   timeout=self.timeout, proxies=proxy)
-                                if response.status_code == 200:
-                                    break
+            response = await async_request('GET', url, timeout=self.timeout, proxies=proxy)

-                    except Exception as e:
-                        i += 1
-                        if not i < 10:
-                            logger.critical(str(e))
-                            return 0, None
-                        continue
+            if response.status_code != 200:
+                path = urlparse(url).path
+                for mirror in constant.IMAGE_URL_MIRRORS:
+                    logger.info(f"Try mirror: {mirror}{path}")
+                    mirror_url = f'{mirror}{path}'
+                    response = await async_request('GET', mirror_url, timeout=self.timeout, proxies=proxy)
+                    if response.status_code == 200:
+                        break

-                    break
+            if not await self.save(filename, response):
+                logger.error(f'Can not download image {url}')
+                return 1, None

-                length = response.headers.get('content-length')
-                if length is None:
-                    f.write(response.content)
-                else:
-                    for chunk in response.iter_content(2048):
-                        f.write(chunk)
-
-        except (requests.HTTPError, requests.Timeout) as e:
+        except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
            if retried < 3:
-                logger.warning(f'Warning: {e}, retrying({retried}) ...')
-                return 0, self.download(url=url, folder=folder, filename=filename,
-                                        retried=retried+1, proxy=proxy)
+                logger.info(f'Download {filename} failed, retrying({retried + 1}) times...')
+                return await self.download(
+                    url=url,
+                    folder=folder,
+                    filename=filename,
+                    retried=retried + 1,
+                    proxy=proxy,
+                )
            else:
                return 0, None

@@ -106,6 +101,8 @@ class Downloader(Singleton):

        except Exception as e:
            import traceback
+
+            logger.error(f"Exception type: {type(e)}")
            traceback.print_stack()
            logger.critical(str(e))
            return 0, None
@@ -115,8 +112,24 @@ class Downloader(Singleton):

        return 1, url

+    async def save(self, save_file_path, response) -> bool:
+        if response is None:
+            logger.error('Error: Response is None')
+            return False
+        save_file_path = os.path.join(self.folder, save_file_path)
+        with open(save_file_path, 'wb') as f:
+            if response is not None:
+                length = response.headers.get('content-length')
+                if length is None:
+                    f.write(response.content)
+                else:
+                    async for chunk in response.aiter_bytes(2048):
+                        f.write(chunk)
+        return True
+
+
    def start_download(self, queue, folder='') -> bool:
-        if not isinstance(folder, (str, )):
+        if not isinstance(folder, (str,)):
            folder = str(folder)

        if self.path:
@@ -128,34 +141,20 @@ class Downloader(Singleton):
                os.makedirs(folder)
            except EnvironmentError as e:
                logger.critical(str(e))
+        self.folder = folder

        if os.getenv('DEBUG', None) == 'NODOWNLOAD':
            # Assuming we want to continue with rest of process.
            return True
-        queue = [(self, url, folder, constant.CONFIG['proxy']) for url in queue]

-        pool = multiprocessing.Pool(self.size, init_worker)
-        [pool.apply_async(download_wrapper, args=item) for item in queue]
+        semaphore = asyncio.Semaphore(self.threads)

-        pool.close()
-        pool.join()
+        coroutines = [
+            self._semaphore_download(semaphore, url, filename=os.path.basename(urlparse(url).path))
+            for url in queue
+        ]
+
+        # Prevent coroutines infection
+        asyncio.run(fiber(coroutines))

        return True
-
-
-def download_wrapper(obj, url, folder='', proxy=None):
-    if sys.platform == 'darwin' or semaphore.get_value():
-        return Downloader.download(obj, url=url, folder=folder, proxy=proxy)
-    else:
-        return -3, None
-
-
-def init_worker():
-    signal.signal(signal.SIGINT, subprocess_signal)
-
-
-def subprocess_signal(sig, frame):
-    if semaphore.acquire(timeout=1):
-        logger.warning('Ctrl-C pressed, exiting sub processes ...')
-
-    raise KeyboardInterrupt
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -148,7 +148,7 @@ def doujinshi_parser(id_, counter=0):
    doujinshi['subtitle'] = subtitle.text if subtitle else ''

    doujinshi_cover = html.find('div', attrs={'id': 'cover'})
-    img_id = re.search('/galleries/([0-9]+)/cover.(jpg|png|gif)$',
+    img_id = re.search('/galleries/([0-9]+)/cover.(jpg|png|gif|webp)$',
                       doujinshi_cover.a.img.attrs['data-src'])

    ext = []
--- a/nhentai/utils.py
+++ b/nhentai/utils.py
@@ -6,6 +6,7 @@ import os
 import zipfile
 import shutil

+import httpx
 import requests
 import sqlite3
 import urllib.parse
@@ -32,8 +33,28 @@ def request(method, url, **kwargs):
    return getattr(session, method)(url, verify=False, **kwargs)


+async def async_request(method, url, proxies = None, **kwargs):
+    headers = {
+        'Referer': constant.LOGIN_URL,
+        'User-Agent': constant.CONFIG['useragent'],
+        'Cookie': constant.CONFIG['cookie'],
+    }
+
+    if proxies is None:
+        proxies = constant.CONFIG['proxy']
+
+    if proxies.get('http') == '' and proxies.get('https') == '':
+        proxies = None
+
+    async with httpx.AsyncClient(headers=headers, verify=False, proxies=proxies, **kwargs) as client:
+        response = await client.request(method, url, **kwargs)
+
+    return response
+
+
 def check_cookie():
    response = request('get', constant.BASE_URL)
+
    if response.status_code == 403 and 'Just a moment...' in response.text:
        logger.error('Blocked by Cloudflare captcha, please set your cookie and useragent')
        sys.exit(1)
@@ -104,7 +125,7 @@ def generate_html(output_dir='.', doujinshi_obj=None, template='default'):
    file_list.sort()

    for image in file_list:
-        if not os.path.splitext(image)[1] in ('.jpg', '.png'):
+        if not os.path.splitext(image)[1] in ('.jpg', '.png', '.webp'):
            continue
        image_html += f'<img src="{image}" class="image-item"/>\n'

@@ -230,7 +251,7 @@ def generate_doc(file_type='', output_dir='.', doujinshi_obj=None, regenerate=Fa
            import img2pdf

            """Write images to a PDF file using img2pdf."""
-            file_list = [f for f in os.listdir(doujinshi_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif'))]
+            file_list = [f for f in os.listdir(doujinshi_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp'))]
            file_list.sort()

            logger.info(f'Writing PDF file to path: {filename}')
@@ -309,7 +330,7 @@ def generate_metadata_file(output_dir, doujinshi_obj):
              'TRANSLATOR', 'PUBLISHER', 'DESCRIPTION', 'STATUS', 'CHAPTERS', 'PAGES',
              'TAGS', 'TYPE', 'LANGUAGE', 'RELEASED', 'READING DIRECTION', 'CHARACTERS',
              'SERIES', 'PARODY', 'URL']
-    special_fields = ['PARODY', 'TITLE', 'ORIGINAL TITLE', 'CHARACTERS', 'AUTHOR', 'GROUPS',
+    special_fields = ['PARODY', 'TITLE', 'ORIGINAL TITLE', 'DATE', 'CHARACTERS', 'AUTHOR', 'GROUPS',
                      'LANGUAGE', 'TAGS', 'URL', 'PAGES']

    for i in range(len(fields)):
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "nhentai"
-version = "0.5.10"
+version = "0.5.12"
 description = "nhentai doujinshi downloader"
 authors = ["Ricter Z <ricterzheng@gmail.com>"]
 license = "MIT"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+httpx
 requests
 soupsieve
 setuptools
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -20,7 +20,7 @@ class TestDownload(unittest.TestCase):
    def test_download(self):
        did = 440546
        info = Doujinshi(**doujinshi_parser(did), name_format='%i')
-        info.downloader = Downloader(path='/tmp', size=5)
+        info.downloader = Downloader(path='/tmp', threads=5)
        info.download()

        self.assertTrue(os.path.exists(f'/tmp/{did}/001.jpg'))
Author	SHA1	Message	Date
Ricter Zheng	3d6263cf11	Merge pull request #354 from normalizedwater546/master asyncio: fix downloader being run sequentially + httpx: fix proxy and missing headers	2024-11-24 13:50:22 +08:00
normalizedwater546	e3410f5a9a	fix: add headers, proxy to async_request	2024-11-23 13:11:25 +00:00
normalizedwater546	feb7f45533	fix: semaphore bound to different event loop	2024-11-23 12:19:36 +00:00
normalizedwater546	0754caaeb7	fix: update threads argument	2024-11-23 11:20:58 +00:00
normalizedwater546	49e5a3094a	fix: recent asyncio change resulting in sequential downloads This was due to AsyncIO completely ignoring the thread (size) argument, and not updating sleep to be non-blocking.	2024-11-23 11:17:09 +00:00
Ricter Zheng	c044b64beb	Merge pull request #353 from hzxjy1/master Fix issue #7	2024-11-19 02:10:34 +08:00
Hellagur4225	f8334c09b5	Add dependence httpx	2024-11-19 01:16:51 +08:00
Hellagur4225	c90c486fb4	Add a fix fatch for downloader	2024-11-19 01:13:16 +08:00
Ricter Zheng	90b17832cc	Merge pull request #351 from hzxjy1/master Use coroutine in url download	2024-11-17 10:10:54 +08:00
Hellagur4225	14c6db9cc3	Use coroutine in url download and improve the extensibility of class Downloader	2024-11-16 15:57:59 +08:00
Ricter Zheng	f30ff59b2b	Merge pull request #348 from JustAHumanBean/webp add webp support	2024-11-08 16:33:21 +08:00
JustAHumanBean	1504ee779f	Update utils.py	2024-11-08 07:49:20 +00:00
JustAHumanBean	98d9eecf6d	Update parser.py	2024-11-08 07:47:50 +00:00
JustAHumanBean	e16e623b9d	Update doujinshi.py	2024-11-08 07:46:53 +00:00
ricterzheng	c3f3182df3	0.5.12	2024-10-01 22:55:01 +09:00
ricterzheng	12aad842f8	fix #347	2024-10-01 22:42:26 +09:00
ricterzheng	f9f76ab0f5	0.5.11	2024-10-01 12:48:28 +09:00