Compare commits

..

1 Commit

Author SHA1 Message Date
cac07a517e remove debug print 2024-11-09 11:51:03 +08:00
12 changed files with 82 additions and 236 deletions

View File

@@ -140,7 +140,6 @@ Format output doujinshi folder name:
 Supported doujinshi folder formatter:
 - %i: Doujinshi id
-- %f: Doujinshi favorite count
 - %t: Doujinshi name
 - %s: Doujinshi subtitle (translated name)
 - %a: Doujinshi authors' name

View File

@@ -1,3 +1,3 @@
-__version__ = '0.5.15'
+__version__ = '0.5.12'
 __author__ = 'RicterZ'
 __email__ = 'ricterzheng@gmail.com'

View File

@@ -77,7 +77,7 @@ def main():
         doujinshi_ids = list(set(map(int, doujinshi_ids)) - set(data))

     if not options.is_show:
-        downloader = Downloader(path=options.output_dir, threads=options.threads,
+        downloader = Downloader(path=options.output_dir, size=options.threads,
                                 timeout=options.timeout, delay=options.delay)

         for doujinshi_id in doujinshi_ids:
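
For reference, a minimal sketch of what this call site constructs with the size= keyword used in the hunk above. The literal values are illustrative stand-ins for the parsed command-line options, not taken from the diff:

    # Minimal sketch; values stand in for options.output_dir, options.threads, etc.
    from nhentai.downloader import Downloader

    downloader = Downloader(path='./downloads',  # options.output_dir
                            size=5,              # options.threads
                            timeout=30,          # options.timeout
                            delay=0)             # options.delay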

View File

@@ -29,12 +29,11 @@ class DoujinshiInfo(dict):
 class Doujinshi(object):
-    def __init__(self, name=None, pretty_name=None, id=None, favorite_counts=0, img_id=None,
+    def __init__(self, name=None, pretty_name=None, id=None, img_id=None,
                  ext='', pages=0, name_format='[%i][%a][%t]', **kwargs):
         self.name = name
         self.pretty_name = pretty_name
         self.id = id
-        self.favorite_counts = favorite_counts
         self.img_id = img_id
         self.ext = ext
         self.pages = pages
@@ -46,7 +45,6 @@ class Doujinshi(object):
         name_format = name_format.replace('%ag', format_filename(ag_value))

         name_format = name_format.replace('%i', format_filename(str(self.id)))
-        name_format = name_format.replace('%f', format_filename(str(self.favorite_counts)))
         name_format = name_format.replace('%a', format_filename(self.info.artists))

         name_format = name_format.replace('%g', format_filename(self.info.groups))
@@ -65,7 +63,6 @@ class Doujinshi(object):
             ['Groups', self.info.groups],
             ['Languages', self.info.languages],
             ['Tags', self.info.tags],
-            ['Favorite Counts', self.info.favorite_counts],
             ['URL', self.url],
             ['Pages', self.pages],
         ]
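
For context, the %-placeholder handling above is plain string substitution. A simplified, self-contained sketch of the same idea; in the real code every value is first passed through format_filename(), which also sanitizes filesystem-unsafe characters:

    # Simplified stand-in for the name_format substitution shown above.
    def format_name(name_format: str, id_: int, artist: str, title: str) -> str:
        name_format = name_format.replace('%i', str(id_))
        name_format = name_format.replace('%a', artist)
        name_format = name_format.replace('%t', title)
        return name_format

    print(format_name('[%i][%a][%t]', 440546, 'artist', 'title'))
    # [440546][artist][title]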

View File

@@ -1,17 +1,24 @@
 # coding: utf-8
+import multiprocessing
+import signal
+import sys
 import os
-import asyncio
-import httpx
+import requests
+import time
 import urllib3.exceptions

 from urllib.parse import urlparse
 from nhentai import constant
 from nhentai.logger import logger
-from nhentai.utils import Singleton, async_request
+from nhentai.parser import request
+from nhentai.utils import Singleton

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+semaphore = multiprocessing.Semaphore(1)
+

 class NHentaiImageNotExistException(Exception):
     pass
@@ -32,68 +39,64 @@ def download_callback(result):
     logger.log(16, f'{data} downloaded successfully')


 class Downloader(Singleton):
-    def __init__(self, path='', threads=5, timeout=30, delay=0):
-        self.threads = threads
+
+    def __init__(self, path='', size=5, timeout=30, delay=0):
+        self.size = size
         self.path = str(path)
         self.timeout = timeout
         self.delay = delay

-    async def fiber(self, tasks):
-        self.semaphore = asyncio.Semaphore(self.threads)
-        for completed_task in asyncio.as_completed(tasks):
-            try:
-                result = await completed_task
-                logger.info(f'{result[1]} download completed')
-            except Exception as e:
-                logger.error(f'An error occurred: {e}')
-
-    async def _semaphore_download(self, *args, **kwargs):
-        async with self.semaphore:
-            return await self.download(*args, **kwargs)
-
-    async def download(self, url, folder='', filename='', retried=0, proxy=None):
-        logger.info(f'Starting to download {url} ...')
+    def download(self, url, folder='', filename='', retried=0, proxy=None):
         if self.delay:
-            await asyncio.sleep(self.delay)
+            time.sleep(self.delay)
+        logger.info(f'Starting to download {url} ...')
         filename = filename if filename else os.path.basename(urlparse(url).path)
+        base_filename, extension = os.path.splitext(filename)

-        save_file_path = os.path.join(self.folder, filename)
+        save_file_path = os.path.join(folder, base_filename.zfill(3) + extension)
         try:
             if os.path.exists(save_file_path):
                 logger.warning(f'Skipped download: {save_file_path} already exists')
                 return 1, url

-            response = await async_request('GET', url, timeout=self.timeout, proxies=proxy)
-
-            if response.status_code != 200:
-                path = urlparse(url).path
-                for mirror in constant.IMAGE_URL_MIRRORS:
-                    logger.info(f"Try mirror: {mirror}{path}")
-                    mirror_url = f'{mirror}{path}'
-                    response = await async_request('GET', mirror_url, timeout=self.timeout, proxies=proxy)
-                    if response.status_code == 200:
-                        break
-
-            if not await self.save(filename, response):
-                logger.error(f'Can not download image {url}')
-                return 1, None
-
-        except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
+            response = None
+            with open(save_file_path, "wb") as f:
+                i = 0
+                while i < 10:
+                    try:
+                        response = request('get', url, stream=True, timeout=self.timeout, proxies=proxy)
+                        if response.status_code != 200:
+                            path = urlparse(url).path
+                            for mirror in constant.IMAGE_URL_MIRRORS:
+                                # print(f'{mirror}{path}')
+                                mirror_url = f'{mirror}{path}'
+                                response = request('get', mirror_url, stream=True,
+                                                   timeout=self.timeout, proxies=proxy)
+                                if response.status_code == 200:
+                                    break
+                    except Exception as e:
+                        i += 1
+                        if not i < 10:
+                            logger.critical(str(e))
+                            return 0, None
+                        continue
+
+                    break
+
+                length = response.headers.get('content-length')
+                if length is None:
+                    f.write(response.content)
+                else:
+                    for chunk in response.iter_content(2048):
+                        f.write(chunk)
+
+        except (requests.HTTPError, requests.Timeout) as e:
             if retried < 3:
-                logger.info(f'Download {filename} failed, retrying({retried + 1}) times...')
-                return await self.download(
-                    url=url,
-                    folder=folder,
-                    filename=filename,
-                    retried=retried + 1,
-                    proxy=proxy,
-                )
+                logger.warning(f'Warning: {e}, retrying({retried}) ...')
+                return 0, self.download(url=url, folder=folder, filename=filename,
+                                        retried=retried+1, proxy=proxy)
             else:
                 return 0, None
@@ -103,8 +106,6 @@ class Downloader(Singleton):
         except Exception as e:
             import traceback

-            logger.error(f"Exception type: {type(e)}")
             traceback.print_stack()
             logger.critical(str(e))
             return 0, None
@@ -114,24 +115,8 @@ class Downloader(Singleton):
         return 1, url

-    async def save(self, save_file_path, response) -> bool:
-        if response is None:
-            logger.error('Error: Response is None')
-            return False
-        save_file_path = os.path.join(self.folder, save_file_path)
-        with open(save_file_path, 'wb') as f:
-            if response is not None:
-                length = response.headers.get('content-length')
-                if length is None:
-                    f.write(response.content)
-                else:
-                    async for chunk in response.aiter_bytes(2048):
-                        f.write(chunk)
-        return True
-
     def start_download(self, queue, folder='') -> bool:
-        if not isinstance(folder, (str,)):
+        if not isinstance(folder, (str, )):
             folder = str(folder)

         if self.path:
@@ -143,19 +128,34 @@ class Downloader(Singleton):
                 os.makedirs(folder)
             except EnvironmentError as e:
                 logger.critical(str(e))

-        self.folder = folder
-
         if os.getenv('DEBUG', None) == 'NODOWNLOAD':
             # Assuming we want to continue with rest of process.
             return True

-        coroutines = [
-            self._semaphore_download(url, filename=os.path.basename(urlparse(url).path))
-            for url in queue
-        ]
-
-        # Prevent coroutines infection
-        asyncio.run(self.fiber(coroutines))
+        queue = [(self, url, folder, constant.CONFIG['proxy']) for url in queue]
+
+        pool = multiprocessing.Pool(self.size, init_worker)
+        [pool.apply_async(download_wrapper, args=item) for item in queue]
+
+        pool.close()
+        pool.join()

         return True
+
+
+def download_wrapper(obj, url, folder='', proxy=None):
+    if sys.platform == 'darwin' or semaphore.get_value():
+        return Downloader.download(obj, url=url, folder=folder, proxy=proxy)
+    else:
+        return -3, None
+
+
+def init_worker():
+    signal.signal(signal.SIGINT, subprocess_signal)
+
+
+def subprocess_signal(sig, frame):
+    if semaphore.acquire(timeout=1):
+        logger.warning('Ctrl-C pressed, exiting sub processes ...')
+
+    raise KeyboardInterrupt
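
The right-hand side of this file follows a standard multiprocessing recipe: a Pool whose workers install a SIGINT handler through an initializer, so Ctrl-C can terminate the children cleanly. A self-contained sketch of that recipe, with illustrative names that are not taken from the diff:

    import signal
    import multiprocessing

    def init_worker():
        # Each pool worker restores default Ctrl-C behaviour so that
        # KeyboardInterrupt propagates out of the child processes.
        signal.signal(signal.SIGINT, signal.default_int_handler)

    def work(n):
        return n * 2  # placeholder for the real download task

    if __name__ == '__main__':
        pool = multiprocessing.Pool(5, init_worker)
        results = [pool.apply_async(work, args=(n,)) for n in range(10)]
        pool.close()
        pool.join()
        print([r.get() for r in results])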

View File

@@ -142,14 +142,10 @@ def doujinshi_parser(id_, counter=0):
     title = doujinshi_info.find('h1').text
     pretty_name = doujinshi_info.find('h1').find('span', attrs={'class': 'pretty'}).text
     subtitle = doujinshi_info.find('h2')
-    favorite_counts = doujinshi_info.find('span', class_='nobold').find('span', class_='count')
-    if favorite_counts is None:
-        favorite_counts = '0'

     doujinshi['name'] = title
     doujinshi['pretty_name'] = pretty_name
     doujinshi['subtitle'] = subtitle.text if subtitle else ''
-    doujinshi['favorite_counts'] = favorite_counts.strip()

     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
     img_id = re.search('/galleries/([0-9]+)/cover.(jpg|png|gif|webp)$',

View File

@@ -8,8 +8,6 @@ from nhentai.constant import LANGUAGE_ISO
 def serialize_json(doujinshi, output_dir):
     metadata = {'title': doujinshi.name,
                 'subtitle': doujinshi.info.subtitle}
-    if doujinshi.info.favorite_counts:
-        metadata['favorite_counts'] = doujinshi.favorite_counts
     if doujinshi.info.date:
         metadata['upload_date'] = doujinshi.info.date
     if doujinshi.info.parodies:
@@ -46,7 +44,6 @@ def serialize_comic_xml(doujinshi, output_dir):
     xml_write_simple_tag(f, 'PageCount', doujinshi.pages)
     xml_write_simple_tag(f, 'URL', doujinshi.url)
     xml_write_simple_tag(f, 'NhentaiId', doujinshi.id)
-    xml_write_simple_tag(f, 'Favorites', doujinshi.favorite_counts)
     xml_write_simple_tag(f, 'Genre', doujinshi.info.categories)
     xml_write_simple_tag(f, 'BlackAndWhite', 'No' if doujinshi.info.tags and

View File

@@ -5,9 +5,7 @@ import re
 import os
 import zipfile
 import shutil
-import copy
-import httpx
 import requests
 import sqlite3
 import urllib.parse
@@ -34,32 +32,8 @@ def request(method, url, **kwargs):
     return getattr(session, method)(url, verify=False, **kwargs)


-async def async_request(method, url, proxies = None, **kwargs):
-    headers = {
-        'Referer': constant.LOGIN_URL,
-        'User-Agent': constant.CONFIG['useragent'],
-        'Cookie': constant.CONFIG['cookie'],
-    }
-
-    if proxies is None:
-        proxies = constant.CONFIG['proxy']
-
-    if proxies.get('http') == '' and proxies.get('https') == '':
-        proxies = None
-
-    if proxies:
-        _proxies = {f'{k}://': v for k, v in proxies.items() if v}
-        proxies = _proxies
-
-    async with httpx.AsyncClient(headers=headers, verify=False, proxies=proxies, **kwargs) as client:
-        response = await client.request(method, url, **kwargs)
-
-    return response
-
-
 def check_cookie():
     response = request('get', constant.BASE_URL)
     if response.status_code == 403 and 'Just a moment...' in response.text:
         logger.error('Blocked by Cloudflare captcha, please set your cookie and useragent')
         sys.exit(1)
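
The surviving synchronous helper shown above dispatches to a requests session method by name via getattr. A minimal standalone sketch of that dispatch pattern, using a plain requests.Session rather than the project's preconfigured session, with an illustrative URL:

    import requests

    session = requests.Session()

    def request(method, url, **kwargs):
        # getattr(session, 'get') resolves to session.get, and so on
        return getattr(session, method)(url, verify=False, **kwargs)

    resp = request('get', 'https://example.com', timeout=30)
    print(resp.status_code)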

poetry.lock (generated, 119 changed lines)
View File

@@ -1,26 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
-
-[[package]]
-name = "anyio"
-version = "4.5.2"
-description = "High level compatibility layer for multiple asynchronous event loop implementations"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "anyio-4.5.2-py3-none-any.whl", hash = "sha256:c011ee36bc1e8ba40e5a81cb9df91925c218fe9b778554e0b56a21e1b5d4716f"},
-    {file = "anyio-4.5.2.tar.gz", hash = "sha256:23009af4ed04ce05991845451e11ef02fc7c5ed29179ac9a420e5ad0ac7ddc5b"},
-]
-
-[package.dependencies]
-exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
-idna = ">=2.8"
-sniffio = ">=1.1"
-typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
-
-[package.extras]
-doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
-test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21.0b1)"]
-trio = ["trio (>=0.26.1)"]

 [[package]]
 name = "beautifulsoup4"
@@ -148,77 +126,6 @@ files = [
     {file = "charset_normalizer-3.0.1-py3-none-any.whl", hash = "sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24"},
 ]

-[[package]]
-name = "exceptiongroup"
-version = "1.2.2"
-description = "Backport of PEP 654 (exception groups)"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
-    {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
-]
-
-[package.extras]
-test = ["pytest (>=6)"]
-
-[[package]]
-name = "h11"
-version = "0.14.0"
-description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
-    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
-]
-
-[[package]]
-name = "httpcore"
-version = "1.0.7"
-description = "A minimal low-level HTTP client."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"},
-    {file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"},
-]
-
-[package.dependencies]
-certifi = "*"
-h11 = ">=0.13,<0.15"
-
-[package.extras]
-asyncio = ["anyio (>=4.0,<5.0)"]
-http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (==1.*)"]
-trio = ["trio (>=0.22.0,<1.0)"]
-
-[[package]]
-name = "httpx"
-version = "0.27.2"
-description = "The next generation HTTP client."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
-    {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
-]
-
-[package.dependencies]
-anyio = "*"
-certifi = "*"
-httpcore = "==1.*"
-idna = "*"
-sniffio = "*"
-
-[package.extras]
-brotli = ["brotli", "brotlicffi"]
-cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
-http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (==1.*)"]
-zstd = ["zstandard (>=0.18.0)"]
-
 [[package]]
 name = "idna"
 version = "3.7"
@@ -262,17 +169,6 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

-[[package]]
-name = "sniffio"
-version = "1.3.1"
-description = "Sniff out which async library your code is running under"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
-    {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
-]
-
 [[package]]
 name = "soupsieve"
 version = "2.4"
@@ -298,17 +194,6 @@ files = [
 [package.extras]
 widechars = ["wcwidth"]

-[[package]]
-name = "typing-extensions"
-version = "4.12.2"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
-    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
-]
-
 [[package]]
 name = "urllib3"
 version = "1.26.19"
@@ -328,4 +213,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8"
-content-hash = "a69dbf5dcfd6dcc5afc0fd2de4ab153841f7d210d4be60c426e332e36a79d679"
+content-hash = "0a1d5abd47a669c7a1f2dc7b43824a449e29ba94908a4338d2ea0f2dfb4f805e"

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "nhentai"
-version = "0.5.15"
+version = "0.5.12"
 description = "nhentai doujinshi downloader"
 authors = ["Ricter Z <ricterzheng@gmail.com>"]
 license = "MIT"
@@ -14,7 +14,6 @@ beautifulsoup4 = "^4.11.2"
 tabulate = "^0.9.0"
 iso8601 = "^1.1.0"
 urllib3 = "^1.26.14"
-httpx = "0.27.2"

 [build-system]

View File

@@ -1,4 +1,3 @@
-httpx==0.27.2
 requests
 soupsieve
 setuptools

View File

@@ -20,7 +20,7 @@ class TestDownload(unittest.TestCase):
     def test_download(self):
         did = 440546
         info = Doujinshi(**doujinshi_parser(did), name_format='%i')
-        info.downloader = Downloader(path='/tmp', threads=5)
+        info.downloader = Downloader(path='/tmp', size=5)
         info.download()
         self.assertTrue(os.path.exists(f'/tmp/{did}/001.jpg'))