Compare commits

..

1 Commits

Author SHA1 Message Date
cac07a517e remove debug print 2024-11-09 11:51:03 +08:00
5 changed files with 77 additions and 98 deletions

View File

@ -77,7 +77,7 @@ def main():
doujinshi_ids = list(set(map(int, doujinshi_ids)) - set(data))
if not options.is_show:
downloader = Downloader(path=options.output_dir, threads=options.threads,
downloader = Downloader(path=options.output_dir, size=options.threads,
timeout=options.timeout, delay=options.delay)
for doujinshi_id in doujinshi_ids:

View File

@ -1,17 +1,24 @@
# coding: utf-
import multiprocessing
import signal
import sys
import os
import asyncio
import httpx
import requests
import time
import urllib3.exceptions
from urllib.parse import urlparse
from nhentai import constant
from nhentai.logger import logger
from nhentai.utils import Singleton, async_request
from nhentai.parser import request
from nhentai.utils import Singleton
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
semaphore = multiprocessing.Semaphore(1)
class NHentaiImageNotExistException(Exception):
pass
@ -32,66 +39,64 @@ def download_callback(result):
logger.log(16, f'{data} downloaded successfully')
async def fiber(tasks):
for completed_task in asyncio.as_completed(tasks):
try:
result = await completed_task
logger.info(f'{result[1]} download completed')
except Exception as e:
logger.error(f'An error occurred: {e}')
class Downloader(Singleton):
def __init__(self, path='', threads=5, timeout=30, delay=0):
self.threads = threads
def __init__(self, path='', size=5, timeout=30, delay=0):
self.size = size
self.path = str(path)
self.timeout = timeout
self.delay = delay
async def _semaphore_download(self, semaphore, *args, **kwargs):
async with semaphore:
return await self.download(*args, **kwargs)
async def download(self, url, folder='', filename='', retried=0, proxy=None):
logger.info(f'Starting to download {url} ...')
def download(self, url, folder='', filename='', retried=0, proxy=None):
if self.delay:
await asyncio.sleep(self.delay)
time.sleep(self.delay)
logger.info(f'Starting to download {url} ...')
filename = filename if filename else os.path.basename(urlparse(url).path)
base_filename, extension = os.path.splitext(filename)
save_file_path = os.path.join(self.folder, filename)
save_file_path = os.path.join(folder, base_filename.zfill(3) + extension)
try:
if os.path.exists(save_file_path):
logger.warning(f'Skipped download: {save_file_path} already exists')
return 1, url
response = await async_request('GET', url, timeout=self.timeout, proxies=proxy)
response = None
with open(save_file_path, "wb") as f:
i = 0
while i < 10:
try:
response = request('get', url, stream=True, timeout=self.timeout, proxies=proxy)
if response.status_code != 200:
path = urlparse(url).path
for mirror in constant.IMAGE_URL_MIRRORS:
logger.info(f"Try mirror: {mirror}{path}")
# print(f'{mirror}{path}')
mirror_url = f'{mirror}{path}'
response = await async_request('GET', mirror_url, timeout=self.timeout, proxies=proxy)
response = request('get', mirror_url, stream=True,
timeout=self.timeout, proxies=proxy)
if response.status_code == 200:
break
if not await self.save(filename, response):
logger.error(f'Can not download image {url}')
return 1, None
except Exception as e:
i += 1
if not i < 10:
logger.critical(str(e))
return 0, None
continue
except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
break
length = response.headers.get('content-length')
if length is None:
f.write(response.content)
else:
for chunk in response.iter_content(2048):
f.write(chunk)
except (requests.HTTPError, requests.Timeout) as e:
if retried < 3:
logger.info(f'Download {filename} failed, retrying({retried + 1}) times...')
return await self.download(
url=url,
folder=folder,
filename=filename,
retried=retried + 1,
proxy=proxy,
)
logger.warning(f'Warning: {e}, retrying({retried}) ...')
return 0, self.download(url=url, folder=folder, filename=filename,
retried=retried+1, proxy=proxy)
else:
return 0, None
@ -101,8 +106,6 @@ class Downloader(Singleton):
except Exception as e:
import traceback
logger.error(f"Exception type: {type(e)}")
traceback.print_stack()
logger.critical(str(e))
return 0, None
@ -112,22 +115,6 @@ class Downloader(Singleton):
return 1, url
async def save(self, save_file_path, response) -> bool:
if response is None:
logger.error('Error: Response is None')
return False
save_file_path = os.path.join(self.folder, save_file_path)
with open(save_file_path, 'wb') as f:
if response is not None:
length = response.headers.get('content-length')
if length is None:
f.write(response.content)
else:
async for chunk in response.aiter_bytes(2048):
f.write(chunk)
return True
def start_download(self, queue, folder='') -> bool:
if not isinstance(folder, (str, )):
folder = str(folder)
@ -141,20 +128,34 @@ class Downloader(Singleton):
os.makedirs(folder)
except EnvironmentError as e:
logger.critical(str(e))
self.folder = folder
if os.getenv('DEBUG', None) == 'NODOWNLOAD':
# Assuming we want to continue with rest of process.
return True
queue = [(self, url, folder, constant.CONFIG['proxy']) for url in queue]
semaphore = asyncio.Semaphore(self.threads)
pool = multiprocessing.Pool(self.size, init_worker)
[pool.apply_async(download_wrapper, args=item) for item in queue]
coroutines = [
self._semaphore_download(semaphore, url, filename=os.path.basename(urlparse(url).path))
for url in queue
]
# Prevent coroutines infection
asyncio.run(fiber(coroutines))
pool.close()
pool.join()
return True
def download_wrapper(obj, url, folder='', proxy=None):
if sys.platform == 'darwin' or semaphore.get_value():
return Downloader.download(obj, url=url, folder=folder, proxy=proxy)
else:
return -3, None
def init_worker():
signal.signal(signal.SIGINT, subprocess_signal)
def subprocess_signal(sig, frame):
if semaphore.acquire(timeout=1):
logger.warning('Ctrl-C pressed, exiting sub processes ...')
raise KeyboardInterrupt

View File

@ -6,7 +6,6 @@ import os
import zipfile
import shutil
import httpx
import requests
import sqlite3
import urllib.parse
@ -33,28 +32,8 @@ def request(method, url, **kwargs):
return getattr(session, method)(url, verify=False, **kwargs)
async def async_request(method, url, proxies = None, **kwargs):
headers = {
'Referer': constant.LOGIN_URL,
'User-Agent': constant.CONFIG['useragent'],
'Cookie': constant.CONFIG['cookie'],
}
if proxies is None:
proxies = constant.CONFIG['proxy']
if proxies.get('http') == '' and proxies.get('https') == '':
proxies = None
async with httpx.AsyncClient(headers=headers, verify=False, proxies=proxies, **kwargs) as client:
response = await client.request(method, url, **kwargs)
return response
def check_cookie():
response = request('get', constant.BASE_URL)
if response.status_code == 403 and 'Just a moment...' in response.text:
logger.error('Blocked by Cloudflare captcha, please set your cookie and useragent')
sys.exit(1)

View File

@ -1,4 +1,3 @@
httpx
requests
soupsieve
setuptools

View File

@ -20,7 +20,7 @@ class TestDownload(unittest.TestCase):
def test_download(self):
did = 440546
info = Doujinshi(**doujinshi_parser(did), name_format='%i')
info.downloader = Downloader(path='/tmp', threads=5)
info.downloader = Downloader(path='/tmp', size=5)
info.download()
self.assertTrue(os.path.exists(f'/tmp/{did}/001.jpg'))