Merge pull request #354 from normalizedwater546/master

asyncio: fix downloader being run sequentially + httpx: fix proxy and missing headers
This commit is contained in:
Ricter Zheng 2024-11-24 13:50:22 +08:00 committed by GitHub
commit 3d6263cf11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 53 additions and 31 deletions

View File

@ -77,7 +77,7 @@ def main():
doujinshi_ids = list(set(map(int, doujinshi_ids)) - set(data)) doujinshi_ids = list(set(map(int, doujinshi_ids)) - set(data))
if not options.is_show: if not options.is_show:
downloader = Downloader(path=options.output_dir, size=options.threads, downloader = Downloader(path=options.output_dir, threads=options.threads,
timeout=options.timeout, delay=options.delay) timeout=options.timeout, delay=options.delay)
for doujinshi_id in doujinshi_ids: for doujinshi_id in doujinshi_ids:

View File

@ -1,22 +1,17 @@
# coding: utf- # coding: utf-
import multiprocessing
import os import os
import time import asyncio
import httpx
import urllib3.exceptions import urllib3.exceptions
from urllib.parse import urlparse from urllib.parse import urlparse
from nhentai import constant from nhentai import constant
from nhentai.logger import logger from nhentai.logger import logger
from nhentai.utils import Singleton from nhentai.utils import Singleton, async_request
import asyncio
import httpx
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
semaphore = multiprocessing.Semaphore(1)
class NHentaiImageNotExistException(Exception): class NHentaiImageNotExistException(Exception):
pass pass
@ -37,17 +32,32 @@ def download_callback(result):
logger.log(16, f'{data} downloaded successfully') logger.log(16, f'{data} downloaded successfully')
async def fiber(tasks):
    """Await every awaitable in *tasks* and log each outcome as it finishes.

    Results are consumed in completion order via ``asyncio.as_completed``.
    Each result is indexed at position 1 for the success message, so it is
    expected to be a sequence whose second item identifies the download.
    Any exception raised while awaiting or reporting a task is logged and
    does not stop the remaining tasks from being processed.
    """
    for finished in asyncio.as_completed(tasks):
        try:
            outcome = await finished
            # Kept inside the ``try`` on purpose: a malformed result is
            # reported through the same error path as a failed download.
            logger.info(f'{outcome[1]} download completed')
        except Exception as e:
            logger.error(f'An error occurred: {e}')
class Downloader(Singleton): class Downloader(Singleton):
def __init__(self, path='', size=5, timeout=30, delay=0): def __init__(self, path='', threads=5, timeout=30, delay=0):
self.size = size self.threads = threads
self.path = str(path) self.path = str(path)
self.timeout = timeout self.timeout = timeout
self.delay = delay self.delay = delay
async def _semaphore_download(self, semaphore, *args, **kwargs):
    """Run ``self.download`` while holding one slot of *semaphore*.

    All positional and keyword arguments after *semaphore* are forwarded
    unchanged to ``self.download`` and its result is returned as-is. The
    slot is released whether the download returns or raises.
    """
    await semaphore.acquire()
    try:
        return await self.download(*args, **kwargs)
    finally:
        semaphore.release()
async def download(self, url, folder='', filename='', retried=0, proxy=None): async def download(self, url, folder='', filename='', retried=0, proxy=None):
if self.delay:
time.sleep(self.delay)
logger.info(f'Starting to download {url} ...') logger.info(f'Starting to download {url} ...')
if self.delay:
await asyncio.sleep(self.delay)
filename = filename if filename else os.path.basename(urlparse(url).path) filename = filename if filename else os.path.basename(urlparse(url).path)
save_file_path = os.path.join(self.folder, filename) save_file_path = os.path.join(self.folder, filename)
@ -57,14 +67,14 @@ class Downloader(Singleton):
logger.warning(f'Skipped download: {save_file_path} already exists') logger.warning(f'Skipped download: {save_file_path} already exists')
return 1, url return 1, url
response = await self.async_request(url, self.timeout) # TODO: Add proxy response = await async_request('GET', url, timeout=self.timeout, proxies=proxy)
if response.status_code != 200: if response.status_code != 200:
path = urlparse(url).path path = urlparse(url).path
for mirror in constant.IMAGE_URL_MIRRORS: for mirror in constant.IMAGE_URL_MIRRORS:
logger.info(f"Try mirror: {mirror}{path}") logger.info(f"Try mirror: {mirror}{path}")
mirror_url = f'{mirror}{path}' mirror_url = f'{mirror}{path}'
response = await self.async_request(mirror_url, self.timeout) response = await async_request('GET', mirror_url, timeout=self.timeout, proxies=proxy)
if response.status_code == 200: if response.status_code == 200:
break break
@ -117,13 +127,9 @@ class Downloader(Singleton):
f.write(chunk) f.write(chunk)
return True return True
async def async_request(self, url, timeout):
async with httpx.AsyncClient() as client:
return await client.get(url, timeout=timeout)
def start_download(self, queue, folder='') -> bool: def start_download(self, queue, folder='') -> bool:
logger.warning("Proxy temporarily unavailable, it will be fixed later. ") if not isinstance(folder, (str,)):
if not isinstance(folder, (str, )):
folder = str(folder) folder = str(folder)
if self.path: if self.path:
@ -141,19 +147,14 @@ class Downloader(Singleton):
# Assuming we want to continue with rest of process. # Assuming we want to continue with rest of process.
return True return True
async def fiber(tasks): semaphore = asyncio.Semaphore(self.threads)
for completed_task in asyncio.as_completed(tasks):
try:
result = await completed_task
logger.info(f'{result[1]} download completed')
except Exception as e:
logger.error(f'An error occurred: {e}')
tasks = [ coroutines = [
self.download(url, filename=os.path.basename(urlparse(url).path)) self._semaphore_download(semaphore, url, filename=os.path.basename(urlparse(url).path))
for url in queue for url in queue
] ]
# Prevent coroutines infection # Prevent coroutines infection
asyncio.run(fiber(tasks)) asyncio.run(fiber(coroutines))
return True return True

View File

@ -6,6 +6,7 @@ import os
import zipfile import zipfile
import shutil import shutil
import httpx
import requests import requests
import sqlite3 import sqlite3
import urllib.parse import urllib.parse
@ -32,8 +33,28 @@ def request(method, url, **kwargs):
return getattr(session, method)(url, verify=False, **kwargs) return getattr(session, method)(url, verify=False, **kwargs)
async def async_request(method, url, proxies=None, **kwargs):
    """Send a single HTTP request through a short-lived ``httpx.AsyncClient``.

    The client is created with the ``Referer``, ``User-Agent`` and ``Cookie``
    headers taken from ``constant`` and with certificate verification
    disabled.

    :param method: HTTP method name passed to ``client.request``.
    :param url: target URL.
    :param proxies: proxy configuration; when ``None`` the value of
        ``constant.CONFIG['proxy']`` is used. A mapping has its empty
        entries removed, and a configuration with nothing left disables
        proxying altogether.
    :param kwargs: per-request options (for example ``timeout``) forwarded
        to ``client.request``.
    :return: the ``httpx`` response object.
    """
    headers = {
        'Referer': constant.LOGIN_URL,
        'User-Agent': constant.CONFIG['useragent'],
        'Cookie': constant.CONFIG['cookie'],
    }

    if proxies is None:
        proxies = constant.CONFIG['proxy']

    if isinstance(proxies, dict):
        # Drop entries with no proxy URL: previously proxying was only
        # disabled when *both* 'http' and 'https' were empty, so a single
        # empty entry was handed to httpx as an (invalid) empty proxy URL.
        # httpx documents mapping keys as URL patterns such as 'http://',
        # so bare scheme names are normalised to that form.
        # assumes keys are strings -- TODO confirm against CONFIG['proxy']
        proxies = {
            (pattern if ':' in pattern else f'{pattern}://'): proxy_url
            for pattern, proxy_url in proxies.items()
            if proxy_url
        } or None
    elif not proxies:
        # Empty string (or any other falsy non-dict value) means "no proxy".
        proxies = None

    # kwargs go to the request only. They used to be passed to the client
    # constructor as well, which raised TypeError for request-only options
    # such as ``data`` or ``json``; anything that worked before had to be
    # accepted by ``client.request`` anyway, so callers are unaffected.
    async with httpx.AsyncClient(headers=headers, verify=False, proxies=proxies) as client:
        response = await client.request(method, url, **kwargs)

    return response
def check_cookie(): def check_cookie():
response = request('get', constant.BASE_URL) response = request('get', constant.BASE_URL)
if response.status_code == 403 and 'Just a moment...' in response.text: if response.status_code == 403 and 'Just a moment...' in response.text:
logger.error('Blocked by Cloudflare captcha, please set your cookie and useragent') logger.error('Blocked by Cloudflare captcha, please set your cookie and useragent')
sys.exit(1) sys.exit(1)

View File

@ -20,7 +20,7 @@ class TestDownload(unittest.TestCase):
def test_download(self): def test_download(self):
did = 440546 did = 440546
info = Doujinshi(**doujinshi_parser(did), name_format='%i') info = Doujinshi(**doujinshi_parser(did), name_format='%i')
info.downloader = Downloader(path='/tmp', size=5) info.downloader = Downloader(path='/tmp', threads=5)
info.download() info.download()
self.assertTrue(os.path.exists(f'/tmp/{did}/001.jpg')) self.assertTrue(os.path.exists(f'/tmp/{did}/001.jpg'))