Compare commits


14 Commits

SHA1 Message Date
3d6263cf11 Merge pull request #354 from normalizedwater546/master
asyncio: fix downloader being run sequentially + httpx: fix proxy and missing headers
2024-11-24 13:50:22 +08:00
e3410f5a9a fix: add headers, proxy to async_request 2024-11-23 13:11:25 +00:00
feb7f45533 fix: semaphore bound to different event loop 2024-11-23 12:19:36 +00:00
0754caaeb7 fix: update threads argument 2024-11-23 11:20:58 +00:00
49e5a3094a fix: recent asyncio change resulting in sequential downloads
This was due to AsyncIO completely ignoring the thread (size) argument, and not updating sleep to be non-blocking.
2024-11-23 11:17:09 +00:00
c044b64beb Merge pull request #353 from hzxjy1/master
Fix issue #7
2024-11-19 02:10:34 +08:00
f8334c09b5 Add dependence httpx 2024-11-19 01:16:51 +08:00
c90c486fb4 Add a fix fatch for downloader 2024-11-19 01:13:16 +08:00
90b17832cc Merge pull request #351 from hzxjy1/master
Use coroutine in url download
2024-11-17 10:10:54 +08:00
14c6db9cc3 Use coroutine in url download and improve the extensibility of class Downloader 2024-11-16 15:57:59 +08:00
f30ff59b2b Merge pull request #348 from JustAHumanBean/webp
add webp support
2024-11-08 16:33:21 +08:00
1504ee779f Update utils.py 2024-11-08 07:49:20 +00:00
98d9eecf6d Update parser.py 2024-11-08 07:47:50 +00:00
e16e623b9d Update doujinshi.py 2024-11-08 07:46:53 +00:00
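Note: commits 49e5a3094a and 0754caaeb7 describe why the first asyncio rewrite still downloaded sequentially: the threads (size) argument was never used to bound concurrency, and a blocking time.sleep() stalled the event loop. A minimal sketch of that bug class and its fix (hypothetical fetch functions, not code from this repository):

import asyncio
import time

async def fetch_blocking(i):
    # BUG: time.sleep() blocks the whole event loop, so even tasks
    # scheduled together complete one after another.
    time.sleep(1)
    return i

async def fetch_nonblocking(semaphore, i):
    # FIX: yield to the loop with the non-blocking asyncio.sleep()
    # and bound the number of in-flight downloads with a semaphore.
    async with semaphore:
        await asyncio.sleep(1)
        return i

async def main():
    start = time.monotonic()
    await asyncio.gather(*(fetch_blocking(i) for i in range(5)))
    print(f'blocking:     {time.monotonic() - start:.1f}s')  # ~5s, sequential

    semaphore = asyncio.Semaphore(5)  # created inside the running loop
    start = time.monotonic()
    await asyncio.gather(*(fetch_nonblocking(semaphore, i) for i in range(5)))
    print(f'non-blocking: {time.monotonic() - start:.1f}s')  # ~1s, concurrent

asyncio.run(main())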
7 changed files with 104 additions and 82 deletions

nhentai/command.py

@@ -77,7 +77,7 @@ def main():
     doujinshi_ids = list(set(map(int, doujinshi_ids)) - set(data))

     if not options.is_show:
-        downloader = Downloader(path=options.output_dir, size=options.threads,
+        downloader = Downloader(path=options.output_dir, threads=options.threads,
                                 timeout=options.timeout, delay=options.delay)

         for doujinshi_id in doujinshi_ids:

nhentai/doujinshi.py

@@ -12,6 +12,7 @@ EXT_MAP = {
     'j': 'jpg',
     'p': 'png',
     'g': 'gif',
+    'w': 'webp',
 }

nhentai/downloader.py

@@ -1,24 +1,17 @@
 # coding: utf-
-import multiprocessing
-import signal
-import sys
 import os
-import requests
-import time
+import asyncio
+import httpx
 import urllib3.exceptions

 from urllib.parse import urlparse
 from nhentai import constant
 from nhentai.logger import logger
-from nhentai.parser import request
-from nhentai.utils import Singleton
+from nhentai.utils import Singleton, async_request

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-semaphore = multiprocessing.Semaphore(1)


 class NHentaiImageNotExistException(Exception):
     pass

@@ -39,64 +32,66 @@ def download_callback(result):
     logger.log(16, f'{data} downloaded successfully')


-class Downloader(Singleton):
+async def fiber(tasks):
+    for completed_task in asyncio.as_completed(tasks):
+        try:
+            result = await completed_task
+            logger.info(f'{result[1]} download completed')
+        except Exception as e:
+            logger.error(f'An error occurred: {e}')

-    def __init__(self, path='', size=5, timeout=30, delay=0):
-        self.size = size
+
+class Downloader(Singleton):
+    def __init__(self, path='', threads=5, timeout=30, delay=0):
+        self.threads = threads
         self.path = str(path)
         self.timeout = timeout
         self.delay = delay

-    def download(self, url, folder='', filename='', retried=0, proxy=None):
-        if self.delay:
-            time.sleep(self.delay)
-        logger.info(f'Starting to download {url} ...')
-        filename = filename if filename else os.path.basename(urlparse(url).path)
-        base_filename, extension = os.path.splitext(filename)
+    async def _semaphore_download(self, semaphore, *args, **kwargs):
+        async with semaphore:
+            return await self.download(*args, **kwargs)

-        save_file_path = os.path.join(folder, base_filename.zfill(3) + extension)
+    async def download(self, url, folder='', filename='', retried=0, proxy=None):
+        logger.info(f'Starting to download {url} ...')
+
+        if self.delay:
+            await asyncio.sleep(self.delay)
+
+        filename = filename if filename else os.path.basename(urlparse(url).path)
+
+        save_file_path = os.path.join(self.folder, filename)
         try:
             if os.path.exists(save_file_path):
                 logger.warning(f'Skipped download: {save_file_path} already exists')
                 return 1, url

-            response = None
-            with open(save_file_path, "wb") as f:
-                i = 0
-                while i < 10:
-                    try:
-                        response = request('get', url, stream=True, timeout=self.timeout, proxies=proxy)
-                        if response.status_code != 200:
-                            path = urlparse(url).path
-                            for mirror in constant.IMAGE_URL_MIRRORS:
-                                print(f'{mirror}{path}')
-                                mirror_url = f'{mirror}{path}'
-                                response = request('get', mirror_url, stream=True,
-                                                   timeout=self.timeout, proxies=proxy)
-                                if response.status_code == 200:
-                                    break
-
-                    except Exception as e:
-                        i += 1
-                        if not i < 10:
-                            logger.critical(str(e))
-                            return 0, None
-                        continue
-
-                    break
-
-                length = response.headers.get('content-length')
-                if length is None:
-                    f.write(response.content)
-                else:
-                    for chunk in response.iter_content(2048):
-                        f.write(chunk)
-
-        except (requests.HTTPError, requests.Timeout) as e:
+            response = await async_request('GET', url, timeout=self.timeout, proxies=proxy)
+
+            if response.status_code != 200:
+                path = urlparse(url).path
+                for mirror in constant.IMAGE_URL_MIRRORS:
+                    logger.info(f"Try mirror: {mirror}{path}")
+                    mirror_url = f'{mirror}{path}'
+                    response = await async_request('GET', mirror_url, timeout=self.timeout, proxies=proxy)
+                    if response.status_code == 200:
+                        break
+
+            if not await self.save(filename, response):
+                logger.error(f'Can not download image {url}')
+                return 1, None
+
+        except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
             if retried < 3:
-                logger.warning(f'Warning: {e}, retrying({retried}) ...')
-                return 0, self.download(url=url, folder=folder, filename=filename,
-                                        retried=retried+1, proxy=proxy)
+                logger.info(f'Download {filename} failed, retrying({retried + 1}) times...')
+                return await self.download(
+                    url=url,
+                    folder=folder,
+                    filename=filename,
+                    retried=retried + 1,
+                    proxy=proxy,
+                )
             else:
                 return 0, None

@@ -106,6 +101,8 @@ class Downloader(Singleton):
         except Exception as e:
             import traceback

+            logger.error(f"Exception type: {type(e)}")
+
             traceback.print_stack()
             logger.critical(str(e))
             return 0, None

@@ -115,8 +112,24 @@ class Downloader(Singleton):
             return 1, url

+    async def save(self, save_file_path, response) -> bool:
+        if response is None:
+            logger.error('Error: Response is None')
+            return False
+        save_file_path = os.path.join(self.folder, save_file_path)
+        with open(save_file_path, 'wb') as f:
+            if response is not None:
+                length = response.headers.get('content-length')
+                if length is None:
+                    f.write(response.content)
+                else:
+                    async for chunk in response.aiter_bytes(2048):
+                        f.write(chunk)
+
+        return True
+
     def start_download(self, queue, folder='') -> bool:
-        if not isinstance(folder, (str, )):
+        if not isinstance(folder, (str,)):
             folder = str(folder)

         if self.path:

@@ -128,34 +141,20 @@ class Downloader(Singleton):
                 os.makedirs(folder)
             except EnvironmentError as e:
                 logger.critical(str(e))
+        self.folder = folder

         if os.getenv('DEBUG', None) == 'NODOWNLOAD':
             # Assuming we want to continue with rest of process.
             return True
-        queue = [(self, url, folder, constant.CONFIG['proxy']) for url in queue]

-        pool = multiprocessing.Pool(self.size, init_worker)
-        [pool.apply_async(download_wrapper, args=item) for item in queue]
+        semaphore = asyncio.Semaphore(self.threads)

-        pool.close()
-        pool.join()
+        coroutines = [
+            self._semaphore_download(semaphore, url, filename=os.path.basename(urlparse(url).path))
+            for url in queue
+        ]
+
+        # Prevent coroutines infection
+        asyncio.run(fiber(coroutines))

         return True
-
-
-def download_wrapper(obj, url, folder='', proxy=None):
-    if sys.platform == 'darwin' or semaphore.get_value():
-        return Downloader.download(obj, url=url, folder=folder, proxy=proxy)
-    else:
-        return -3, None
-
-
-def init_worker():
-    signal.signal(signal.SIGINT, subprocess_signal)
-
-
-def subprocess_signal(sig, frame):
-    if semaphore.acquire(timeout=1):
-        logger.warning('Ctrl-C pressed, exiting sub processes ...')
-        raise KeyboardInterrupt
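The reworked start_download() above creates the asyncio.Semaphore immediately before asyncio.run() and passes it into each _semaphore_download() coroutine. That ordering is what commit feb7f45533 ("semaphore bound to different event loop") addresses: on Python versions before 3.10, an asyncio primitive created while no loop is running can bind to a different loop than the one asyncio.run() later creates. A contrived sketch of the pitfall (assuming pre-3.10 semantics; not code from this repository):

import asyncio

# On Python < 3.10 this may bind to an implicitly created loop at import
# time; awaiting it under asyncio.run() can then raise
# "RuntimeError: ... is bound to a different event loop".
GLOBAL_SEM = asyncio.Semaphore(2)

async def worker(sem):
    async with sem:
        await asyncio.sleep(0.1)

async def main():
    # Safe on every version: the semaphore is created inside the running
    # loop, exactly as the new start_download() does.
    sem = asyncio.Semaphore(2)
    await asyncio.gather(*(worker(sem) for _ in range(5)))

asyncio.run(main())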

nhentai/parser.py

@@ -148,7 +148,7 @@ def doujinshi_parser(id_, counter=0):
     doujinshi['subtitle'] = subtitle.text if subtitle else ''
     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
-    img_id = re.search('/galleries/([0-9]+)/cover.(jpg|png|gif)$',
+    img_id = re.search('/galleries/([0-9]+)/cover.(jpg|png|gif|webp)$',
                        doujinshi_cover.a.img.attrs['data-src'])

     ext = []

nhentai/utils.py

@@ -6,6 +6,7 @@ import os
 import zipfile
 import shutil
+import httpx
 import requests
 import sqlite3
 import urllib.parse

@@ -32,8 +33,28 @@ def request(method, url, **kwargs):
     return getattr(session, method)(url, verify=False, **kwargs)


+async def async_request(method, url, proxies = None, **kwargs):
+    headers = {
+        'Referer': constant.LOGIN_URL,
+        'User-Agent': constant.CONFIG['useragent'],
+        'Cookie': constant.CONFIG['cookie'],
+    }
+
+    if proxies is None:
+        proxies = constant.CONFIG['proxy']
+
+    if proxies.get('http') == '' and proxies.get('https') == '':
+        proxies = None
+
+    async with httpx.AsyncClient(headers=headers, verify=False, proxies=proxies, **kwargs) as client:
+        response = await client.request(method, url, **kwargs)
+
+    return response
+
+
 def check_cookie():
     response = request('get', constant.BASE_URL)
     if response.status_code == 403 and 'Just a moment...' in response.text:
         logger.error('Blocked by Cloudflare captcha, please set your cookie and useragent')
         sys.exit(1)

@@ -104,7 +125,7 @@ def generate_html(output_dir='.', doujinshi_obj=None, template='default'):
     file_list.sort()

     for image in file_list:
-        if not os.path.splitext(image)[1] in ('.jpg', '.png'):
+        if not os.path.splitext(image)[1] in ('.jpg', '.png', '.webp'):
             continue
         image_html += f'<img src="{image}" class="image-item"/>\n'

@@ -230,7 +251,7 @@ def generate_doc(file_type='', output_dir='.', doujinshi_obj=None, regenerate=Fa
     import img2pdf

     """Write images to a PDF file using img2pdf."""
-    file_list = [f for f in os.listdir(doujinshi_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif'))]
+    file_list = [f for f in os.listdir(doujinshi_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp'))]
     file_list.sort()
     logger.info(f'Writing PDF file to path: {filename}')
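A note on the new async_request() helper: it restores the Referer, User-Agent and Cookie headers that the synchronous request() path already sent (the "missing headers" part of #354), and it normalizes an all-empty proxy dict from the config to None so httpx is not handed empty proxy URLs. Since **kwargs is forwarded to both the httpx.AsyncClient constructor and client.request(), callers should stick to arguments valid in both places, such as the timeout that download() passes. A minimal usage sketch (assumes an installed and configured nhentai):

import asyncio

from nhentai.utils import async_request

async def main():
    # Headers and proxy come from constant.CONFIG unless overridden,
    # so a bare call behaves like the synchronous request() helper.
    response = await async_request('GET', 'https://nhentai.net/')
    print(response.status_code)

asyncio.run(main())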

requirements.txt

@@ -1,3 +1,4 @@
+httpx
 requests
 soupsieve
 setuptools

tests/test_download.py

@@ -20,7 +20,7 @@ class TestDownload(unittest.TestCase):
     def test_download(self):
         did = 440546
         info = Doujinshi(**doujinshi_parser(did), name_format='%i')
-        info.downloader = Downloader(path='/tmp', size=5)
+        info.downloader = Downloader(path='/tmp', threads=5)
         info.download()
         self.assertTrue(os.path.exists(f'/tmp/{did}/001.jpg'))