From f4afcd549e63114d16ceab3a1026cd617da65b9e Mon Sep 17 00:00:00 2001
From: Waifu
Date: Mon, 29 Jul 2019 09:11:45 +0200
Subject: [PATCH 1/7] Added sorting option

---
 nhentai/__init__.py |  2 +-
 nhentai/cmdline.py  |  2 ++
 nhentai/command.py  |  4 ++--
 nhentai/parser.py   | 21 ++++++++++++---------
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/nhentai/__init__.py b/nhentai/__init__.py
index 2bc558a..f0d7796 100644
--- a/nhentai/__init__.py
+++ b/nhentai/__init__.py
@@ -1,3 +1,3 @@
-__version__ = '0.3.6'
+__version__ = '0.3.7'
 __author__ = 'RicterZ'
 __email__ = 'ricterzheng@gmail.com'

diff --git a/nhentai/cmdline.py b/nhentai/cmdline.py
index 2a58736..475c662 100644
--- a/nhentai/cmdline.py
+++ b/nhentai/cmdline.py
@@ -58,6 +58,8 @@ def cmd_parser():
                       help='page number of search results')
     parser.add_option('--max-page', type='int', dest='max_page', action='store', default=1,
                       help='The max page when recursive download tagged doujinshi')
+    parser.add_option('--sorting', type='string', dest='sorting', action='store', default='date',
+                      help='sorting of doujinshi, e.g. date/popular')
 
     # download options
     parser.add_option('--output', '-o', type='string', dest='output_dir', action='store', default='',

diff --git a/nhentai/command.py b/nhentai/command.py
index 10176ea..9d3cb68 100644
--- a/nhentai/command.py
+++ b/nhentai/command.py
@@ -40,13 +40,13 @@ def main():
             doujinshi_ids = map(lambda d: d['id'], doujinshis)
 
     elif options.tag:
-        doujinshis = tag_parser(options.tag, max_page=options.max_page)
+        doujinshis = tag_parser(options.tag, options.sorting, max_page=options.max_page)
         print_doujinshi(doujinshis)
         if options.is_download and doujinshis:
             doujinshi_ids = map(lambda d: d['id'], doujinshis)
 
     elif options.keyword:
-        doujinshis = search_parser(options.keyword, options.page)
+        doujinshis = search_parser(options.keyword, options.sorting, options.page)
         print_doujinshi(doujinshis)
         if options.is_download:
             doujinshi_ids = map(lambda d: d['id'], doujinshis)

diff --git a/nhentai/parser.py b/nhentai/parser.py
index cb53cc8..b6a23e2 100644
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -169,10 +169,10 @@ def doujinshi_parser(id_):
     return doujinshi
 
 
-def search_parser(keyword, page):
+def search_parser(keyword, sorting, page):
     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
     try:
-        response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page}).content
+        response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
     except requests.ConnectionError as e:
         logger.critical(e)
         logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
         raise SystemExit
@@ -194,14 +194,17 @@ def print_doujinshi(doujinshi_list):
           tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 
 
-def tag_parser(tag_name, max_page=1):
+def tag_parser(tag_name, sorting, max_page=1):
     result = []
     tag_name = tag_name.lower()
     tag_name = tag_name.replace(' ', '-')
 
+    if sorting == 'date':
+        sorting = ''
+
     for p in range(1, max_page + 1):
         logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, tag_name))
-        response = request('get', url='%s/%s/?page=%d' % (constant.TAG_URL, tag_name, p)).content
+        response = request('get', url='%s/%s/%s?page=%d' % (constant.TAG_URL, tag_name, sorting, p)).content
         result += _get_title_and_id(response)
 
     if not result:
@@ -214,13 +217,13 @@ def tag_parser(tag_name, max_page=1):
     return result
 
 
-def __api_suspended_search_parser(keyword, page):
+def __api_suspended_search_parser(keyword, sorting, page):
    logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
     result = []
     i = 0
     while i < 5:
         try:
-            response = request('get', url=constant.SEARCH_URL, params={'query': keyword, 'page': page}).json()
+            response = request('get', url=constant.SEARCH_URL, params={'query': keyword, 'page': page, 'sort': sorting}).json()
         except Exception as e:
             i += 1
             if not i < 5:
@@ -244,10 +247,10 @@ def __api_suspended_search_parser(keyword, page):
     return result
 
 
-def __api_suspended_tag_parser(tag_id, max_page=1):
+def __api_suspended_tag_parser(tag_id, sorting, max_page=1):
     logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
     result = []
-    response = request('get', url=constant.TAG_API_URL, params={'sort': 'popular', 'tag_id': tag_id}).json()
+    response = request('get', url=constant.TAG_API_URL, params={'sort': sorting, 'tag_id': tag_id}).json()
     page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])
 
     for i in range(1, page + 1):
@@ -255,7 +258,7 @@ def __api_suspended_tag_parser(tag_id, max_page=1):
 
         if page != 1:
             response = request('get', url=constant.TAG_API_URL,
-                               params={'sort': 'popular', 'tag_id': tag_id}).json()
+                               params={'sort': sorting, 'tag_id': tag_id}).json()
 
         for row in response['result']:
             title = row['title']['english']
             title = title[:85] + '..' if len(title) > 85 else title

From 1b499111667ea8f36c2b02ae3ad582c99494f62e Mon Sep 17 00:00:00 2001
From: RicterZ
Date: Tue, 30 Jul 2019 23:03:29 +0800
Subject: [PATCH 2/7] code style

---
 nhentai/cmdline.py | 7 ++++---
 nhentai/command.py | 4 ++--
 nhentai/parser.py  | 4 ++--
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/nhentai/cmdline.py b/nhentai/cmdline.py
index 475c662..f34e54e 100644
--- a/nhentai/cmdline.py
+++ b/nhentai/cmdline.py
@@ -48,7 +48,8 @@ def cmd_parser():
     # doujinshi options
     parser.add_option('--id', type='string', dest='id', action='store',
                       help='doujinshi ids set, e.g. 1,2,3')
-    parser.add_option('--search', '-s', type='string', dest='keyword', action='store', help='search doujinshi by keyword')
+    parser.add_option('--search', '-s', type='string', dest='keyword', action='store',
+                      help='search doujinshi by keyword')
     parser.add_option('--tag', type='string', dest='tag', action='store', help='download doujinshi by tag')
     parser.add_option('--favorites', '-F', action='store_true', dest='favorites',
                       help='list or download your favorites.')
@@ -58,8 +59,8 @@ def cmd_parser():
                       help='page number of search results')
     parser.add_option('--max-page', type='int', dest='max_page', action='store', default=1,
                       help='The max page when recursive download tagged doujinshi')
-    parser.add_option('--sorting', type='string', dest='sorting', action='store', default='date',
-                      help='sorting of doujinshi, e.g. date/popular')
+    parser.add_option('--sorting', dest='sorting', action='store', default='date',
+                      help='sorting of doujinshi (date / popular)', choices=['date', 'popular'])
 
     # download options
     parser.add_option('--output', '-o', type='string', dest='output_dir', action='store', default='',

diff --git a/nhentai/command.py b/nhentai/command.py
index 9d3cb68..7d7fbf0 100644
--- a/nhentai/command.py
+++ b/nhentai/command.py
@@ -40,13 +40,13 @@ def main():
             doujinshi_ids = map(lambda d: d['id'], doujinshis)
 
     elif options.tag:
-        doujinshis = tag_parser(options.tag, options.sorting, max_page=options.max_page)
+        doujinshis = tag_parser(options.tag, sorting=options.sorting, max_page=options.max_page)
         print_doujinshi(doujinshis)
         if options.is_download and doujinshis:
             doujinshi_ids = map(lambda d: d['id'], doujinshis)
 
     elif options.keyword:
-        doujinshis = search_parser(options.keyword, options.sorting, options.page)
+        doujinshis = search_parser(options.keyword, sorting=options.sorting, page=options.page)
         print_doujinshi(doujinshis)
         if options.is_download:
             doujinshi_ids = map(lambda d: d['id'], doujinshis)

diff --git a/nhentai/parser.py b/nhentai/parser.py
index b6a23e2..33130ec 100644
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -169,7 +169,7 @@ def doujinshi_parser(id_):
     return doujinshi
 
 
-def search_parser(keyword, sorting, page):
+def search_parser(keyword, sorting='date', page=1):
     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
     try:
         response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
@@ -194,7 +194,7 @@ def print_doujinshi(doujinshi_list):
           tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 
 
-def tag_parser(tag_name, sorting, max_page=1):
+def tag_parser(tag_name, sorting='date', max_page=1):
     result = []
     tag_name = tag_name.lower()
     tag_name = tag_name.replace(' ', '-')
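
With patch 2 applied, --sorting is restricted to the two values the site understands, and both parsers take it as a keyword argument defaulting to 'date'. For reference, a minimal sketch of driving the reworked parsers directly from Python (the keyword and tag below are only examples; it assumes the patched nhentai package is importable and the network is reachable):

    # Both parsers accept sorting='date' (the default) or sorting='popular'.
    from nhentai.parser import search_parser, tag_parser, print_doujinshi

    # Keyword search ordered by popularity instead of upload date.
    doujinshis = search_parser('umaru', sorting='popular', page=1)
    print_doujinshi(doujinshis)

    # Tag search: tag_parser maps sorting='date' to the bare tag URL,
    # so only 'popular' actually adds a path segment (see the diff above).
    doujinshis = tag_parser('full color', sorting='popular', max_page=2)
    print_doujinshi(doujinshis)

On the command line the equivalent is `nhentai --search umaru --sorting popular` or `nhentai --tag "full color" --sorting popular`.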
From bc70a2071bb8f4eea405124b960c511ee81c0438 Mon Sep 17 00:00:00 2001
From: RicterZ
Date: Tue, 30 Jul 2019 23:04:23 +0800
Subject: [PATCH 3/7] add test for sorting

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index cdb9d4f..63a2663 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,7 +16,7 @@ script:
   - nhentai --cookie "csrftoken=3c4Mzn4f6NAI1awFqfIh495G3pv5Wade9n63Kx03mkSac8c2QR5vRR4jCwVzb3OR; sessionid=m034e2dyyxgbl9s07hbzgfhvadbap2tk"
   - nhentai --search umaru
   - nhentai --id=152503,146134 -t 10 --output=/tmp/ --cbz
-  - nhentai --tag lolicon
+  - nhentai --tag lolicon --sorting popular
   - nhentai -F
   - nhentai --file /tmp/test.txt
   - nhentai --id=152503,146134 --gen-main --output=/tmp/
From 7e826c52556dc61df382375f0d44bbe4beef773c Mon Sep 17 00:00:00 2001
From: RicterZ
Date: Wed, 31 Jul 2019 01:22:54 +0800
Subject: [PATCH 4/7] use multiprocess instead of threadpool #78

---
 nhentai/command.py    |  8 ++----
 nhentai/downloader.py | 65 +++++++++++++++++++++++++++++++++----------
 nhentai/utils.py      |  7 +++++
 3 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/nhentai/command.py b/nhentai/command.py
index 7d7fbf0..e8b894f 100644
--- a/nhentai/command.py
+++ b/nhentai/command.py
@@ -11,7 +11,7 @@ from nhentai.doujinshi import Doujinshi
 from nhentai.downloader import Downloader
 from nhentai.logger import logger
 from nhentai.constant import BASE_URL
-from nhentai.utils import generate_html, generate_cbz, generate_main_html, check_cookie
+from nhentai.utils import generate_html, generate_cbz, generate_main_html, check_cookie, signal_handler
@@ -83,12 +83,8 @@ def main():
         [doujinshi.show() for doujinshi in doujinshi_list]
 
 
-def signal_handler(signal, frame):
-    logger.error('Ctrl-C signal received. Stopping...')
-    exit(1)
-
-
 signal.signal(signal.SIGINT, signal_handler)
 
+
 if __name__ == '__main__':
     main()

diff --git a/nhentai/downloader.py b/nhentai/downloader.py
index 81037f8..6d40016 100644
--- a/nhentai/downloader.py
+++ b/nhentai/downloader.py
@@ -1,10 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function
+
+import signal
+
 from future.builtins import str as text
 import os
 import requests
 import threadpool
 import time
+import multiprocessing as mp
 
 try:
     from urllib.parse import urlparse
@@ -13,13 +17,13 @@ except ImportError:
 
 from nhentai.logger import logger
 from nhentai.parser import request
-from nhentai.utils import Singleton
-
+from nhentai.utils import Singleton, signal_handler
 
 requests.packages.urllib3.disable_warnings()
+semaphore = mp.Semaphore()
 
 
-class NhentaiImageNotExistException(Exception):
+class NHentaiImageNotExistException(Exception):
     pass
@@ -28,14 +32,14 @@ class Downloader(Singleton):
     def __init__(self, path='', thread=1, timeout=30, delay=0):
         if not isinstance(thread, (int, )) or thread < 1 or thread > 15:
             raise ValueError('Invalid threads count')
+
         self.path = str(path)
         self.thread_count = thread
         self.threads = []
-        self.thread_pool = None
        self.timeout = timeout
         self.delay = delay
 
-    def _download(self, url, folder='', filename='', retried=0):
+    def download_(self, url, folder='', filename='', retried=0):
         if self.delay:
             time.sleep(self.delay)
         logger.info('Starting to download {0} ...'.format(url))
@@ -54,9 +58,9 @@ def _download(self, url, folder='', filename='', retried=0):
         try:
             response = request('get', url, stream=True, timeout=self.timeout)
             if response.status_code != 200:
-                raise NhentaiImageNotExistException
+                raise NHentaiImageNotExistException
 
-        except NhentaiImageNotExistException as e:
+        except NHentaiImageNotExistException as e:
             raise e
 
         except Exception as e:
@@ -78,27 +82,37 @@ def _download(self, url, folder='', filename='', retried=0):
         except (requests.HTTPError, requests.Timeout) as e:
             if retried < 3:
                 logger.warning('Warning: {0}, retrying({1}) ...'.format(str(e), retried))
-                return 0, self._download(url=url, folder=folder, filename=filename, retried=retried+1)
+                return 0, self.download_(url=url, folder=folder, filename=filename, retried=retried+1)
             else:
                 return 0, None
 
-        except NhentaiImageNotExistException as e:
+        except NHentaiImageNotExistException as e:
             os.remove(os.path.join(folder, base_filename.zfill(3) + extension))
             return -1, url
 
         except Exception as e:
+            import traceback
+            traceback.print_stack()
             logger.critical(str(e))
             return 0, None
 
+        except KeyboardInterrupt:
+            return -3, None
+
         return 1, url
 
-    def _download_callback(self, request, result):
+    def _download_callback(self, result):
         result, data = result
         if result == 0:
             logger.warning('fatal errors occurred, ignored')
             # exit(1)
         elif result == -1:
             logger.warning('url {} return status code 404'.format(data))
+        elif result == -2:
+            logger.warning('Ctrl-C pressed, exiting sub processes ...')
+        elif result == -3:
+            # workers wont be run, just pass
+            pass
         else:
             logger.log(15, '{0} downloaded successfully'.format(data))
@@ -119,10 +133,31 @@ def download(self, queue, folder=''):
         else:
             logger.warn('Path \'{0}\' already exist.'.format(folder))
 
-        queue = [([url], {'folder': folder}) for url in queue]
+        queue = [(self, url, folder) for url in queue]
 
-        self.thread_pool = threadpool.ThreadPool(self.thread_count)
-        requests_ = threadpool.makeRequests(self._download, queue, self._download_callback)
-        [self.thread_pool.putRequest(req) for req in requests_]
+        pool = mp.Pool(self.thread_count, init_worker)
+        for item in queue:
+            pool.apply_async(download_wrapper, args=item, callback=self._download_callback)
 
-        self.thread_pool.wait()
+        pool.close()
+        print(1)
+        pool.join()
+        print(2)
+
+
+def download_wrapper(obj, url, folder=''):
+    if semaphore.get_value():
+        return Downloader.download_(obj, url=url, folder=folder)
+    else:
+        return -3, None
+
+
+def init_worker():
+    signal.signal(signal.SIGINT, subprocess_signal)
+
+
+def subprocess_signal(signal, frame):
+    if semaphore.acquire(timeout=1):
+        logger.warning('Ctrl-C pressed, exiting sub processes ...')
+
+    raise KeyboardInterrupt

diff --git a/nhentai/utils.py b/nhentai/utils.py
index 6a0d00b..815c690 100644
--- a/nhentai/utils.py
+++ b/nhentai/utils.py
@@ -207,3 +207,10 @@ an invalid filename.
     # Remove [] from filename
     filename = filename.replace('[]', '')
     return filename
+
+
+def signal_handler(signal, frame):
+    logger.error('Ctrl-C signal received. Stopping...')
+    exit(1)
+
+
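
The subtle part of this patch is the interplay between the module-level semaphore, the pool initializer, and the sentinel return codes: each worker installs its own SIGINT handler, the first worker to catch Ctrl-C drains the semaphore, and every task that starts afterwards sees a zero semaphore value and returns -3 without doing any work. A standalone sketch of the same pattern follows (names and the sleep stand-in are illustrative; like the patch itself, it relies on the fork start method on Linux so children inherit the module-level semaphore, and `mp.Semaphore()` in the patch defaults to the same value 1 made explicit here):

    import multiprocessing as mp
    import signal
    import time

    semaphore = mp.Semaphore(1)   # 1 means "keep working", 0 means "shutting down"


    def subprocess_signal(sig, frame):
        # Only the first worker to catch Ctrl-C acquires the semaphore and logs;
        # the rest simply abort their current task via KeyboardInterrupt.
        if semaphore.acquire(timeout=1):
            print('Ctrl-C pressed, exiting sub processes ...')
        raise KeyboardInterrupt


    def init_worker():
        signal.signal(signal.SIGINT, subprocess_signal)


    def work(item):
        if not semaphore.get_value():
            return -3, None       # shutdown already flagged: skip the task
        try:
            time.sleep(0.5)       # stands in for the real HTTP download
            return 1, item
        except KeyboardInterrupt:
            return -3, None


    if __name__ == '__main__':
        pool = mp.Pool(4, init_worker)
        for item in range(16):
            pool.apply_async(work, args=(item,))
        pool.close()
        pool.join()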
From 349e21193bbd35b1a9e5e3be2bbdd4e40bd1f4f1 Mon Sep 17 00:00:00 2001
From: RicterZ
Date: Wed, 31 Jul 2019 19:04:25 +0800
Subject: [PATCH 5/7] remove print

---
 nhentai/downloader.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/nhentai/downloader.py b/nhentai/downloader.py
index 6d40016..eee7d78 100644
--- a/nhentai/downloader.py
+++ b/nhentai/downloader.py
@@ -140,9 +140,7 @@ def download(self, queue, folder=''):
             pool.apply_async(download_wrapper, args=item, callback=self._download_callback)
 
         pool.close()
-        print(1)
         pool.join()
-        print(2)
From 8dc7a1f40bc577efa5fb657788e776b4050d0385 Mon Sep 17 00:00:00 2001
From: RicterZ
Date: Thu, 1 Aug 2019 18:52:30 +0800
Subject: [PATCH 6/7] singleton pool

---
 nhentai/downloader.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/nhentai/downloader.py b/nhentai/downloader.py
index eee7d78..3d2db81 100644
--- a/nhentai/downloader.py
+++ b/nhentai/downloader.py
@@ -27,6 +27,17 @@ class NHentaiImageNotExistException(Exception):
     pass
 
 
+class Pool(Singleton):
+    pool = None
+
+    def __init__(self, size, init):
+        if self.pool is None:
+            if os.getenv('DEBUG'):
+                logger.info('Process pool created')
+
+            self.pool = mp.Pool(size, initializer=init)
+
+
 class Downloader(Singleton):
 
     def __init__(self, path='', thread=1, timeout=30, delay=0):
@@ -146,7 +157,7 @@ def download(self, queue, folder=''):
         queue = [(self, url, folder) for url in queue]
 
-        pool = mp.Pool(self.thread_count, init_worker)
+        pool = Pool(self.thread_count, init_worker).pool
         for item in queue:
             pool.apply_async(download_wrapper, args=item, callback=self._download_callback)
From 23774d9526b808d2b534fdbd2360b3bfe269f15a Mon Sep 17 00:00:00 2001
From: RicterZ
Date: Thu, 1 Aug 2019 21:06:40 +0800
Subject: [PATCH 7/7] fix bugs

---
 nhentai/command.py    | 21 ++++++++++++++-----
 nhentai/downloader.py | 25 ++++++-------------------
 nhentai/parser.py     | 11 +++--------
 3 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/nhentai/command.py b/nhentai/command.py
index e8b894f..083980d 100644
--- a/nhentai/command.py
+++ b/nhentai/command.py
@@ -4,11 +4,12 @@ from __future__ import unicode_literals, print_function
 import signal
 import platform
 import time
+import multiprocessing
 
 from nhentai.cmdline import cmd_parser, banner
 from nhentai.parser import doujinshi_parser, search_parser, print_doujinshi, favorites_parser, tag_parser, login
 from nhentai.doujinshi import Doujinshi
-from nhentai.downloader import Downloader
+from nhentai.downloader import Downloader, init_worker
 from nhentai.logger import logger
 from nhentai.constant import BASE_URL
 from nhentai.utils import generate_html, generate_cbz, generate_main_html, check_cookie, signal_handler
@@ -55,25 +56,35 @@ def main():
     doujinshi_ids = options.id
 
     if doujinshi_ids:
-        for id_ in doujinshi_ids:
+        for i, id_ in enumerate(doujinshi_ids):
             if options.delay:
                 time.sleep(options.delay)
+
             doujinshi_info = doujinshi_parser(id_)
-            doujinshi_list.append(Doujinshi(name_format=options.name_format, **doujinshi_info))
+
+            if doujinshi_info:
+                doujinshi_list.append(Doujinshi(name_format=options.name_format, **doujinshi_info))
+
+            if (i + 1) % 10 == 0:
+                logger.info('Progress: %d / %d' % (i + 1, len(doujinshi_ids)))
 
     if not options.is_show:
-        downloader = Downloader(path=options.output_dir,
-                                thread=options.threads, timeout=options.timeout, delay=options.delay)
+        downloader = Downloader(path=options.output_dir, size=options.threads,
+                                timeout=options.timeout, delay=options.delay)
 
         for doujinshi in doujinshi_list:
+
             doujinshi.downloader = downloader
             doujinshi.download()
+
             if not options.is_nohtml and not options.is_cbz:
                 generate_html(options.output_dir, doujinshi)
             elif options.is_cbz:
                 generate_cbz(options.output_dir, doujinshi, options.rm_origin_dir)
+
         if options.main_viewer:
             generate_main_html(options.output_dir)
+
         if not platform.system() == 'Windows':
             logger.log(15, '🍻 All done.')
         else:

diff --git a/nhentai/downloader.py b/nhentai/downloader.py
index 3d2db81..2d9a0f4 100644
--- a/nhentai/downloader.py
+++ b/nhentai/downloader.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function
 
+import multiprocessing
 import signal
 
 from future.builtins import str as text
@@ -27,26 +28,11 @@ class NHentaiImageNotExistException(Exception):
     pass
 
 
-class Pool(Singleton):
-    pool = None
-
-    def __init__(self, size, init):
-        if self.pool is None:
-            if os.getenv('DEBUG'):
-                logger.info('Process pool created')
-
-            self.pool = mp.Pool(size, initializer=init)
-
-
 class Downloader(Singleton):
 
-    def __init__(self, path='', thread=1, timeout=30, delay=0):
-        if not isinstance(thread, (int, )) or thread < 1 or thread > 15:
-            raise ValueError('Invalid threads count')
-
+    def __init__(self, path='', size=5, timeout=30, delay=0):
+        self.size = size
         self.path = str(path)
-        self.thread_count = thread
-        self.threads = []
         self.timeout = timeout
         self.delay = delay
@@ -141,13 +126,14 @@ def download(self, queue, folder=''):
                 os.makedirs(folder)
             except EnvironmentError as e:
                 logger.critical('{0}'.format(str(e)))
-                exit(1)
+
         else:
             logger.warn('Path \'{0}\' already exist.'.format(folder))
 
         queue = [(self, url, folder) for url in queue]
 
-        pool = Pool(self.thread_count, init_worker).pool
+        pool = multiprocessing.Pool(self.size, init_worker)
+
         for item in queue:
             pool.apply_async(download_wrapper, args=item, callback=self._download_callback)

diff --git a/nhentai/parser.py b/nhentai/parser.py
index 33130ec..9b9232b 100644
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -121,8 +121,8 @@ def doujinshi_parser(id_):
             return doujinshi_parser(str(id_))
 
         except Exception as e:
-            logger.critical(str(e))
-            raise SystemExit
+            logger.warn('Error: {}, ignored'.format(str(e)))
+            return None
 
     html = BeautifulSoup(response, 'html.parser')
     doujinshi_info = html.find('div', attrs={'id': 'info'})
@@ -171,12 +171,7 @@ def doujinshi_parser(id_):
 
 
 def search_parser(keyword, sorting='date', page=1):
     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
-    try:
-        response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
-    except requests.ConnectionError as e:
-        logger.critical(e)
-        logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
-        raise SystemExit
+    response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
 
     result = _get_title_and_id(response)
     if not result:
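
Patch 7 also changes the failure contract of doujinshi_parser from raising SystemExit to returning None, which is why main() now guards the append with `if doujinshi_info:` and reports progress every ten IDs. A sketch of the resulting calling convention (doujinshi_parser performs network requests; the IDs are the sample IDs already used in the test suite above):

    from nhentai.parser import doujinshi_parser

    doujinshi_ids = ['152503', '146134']
    doujinshi_list = []

    for i, id_ in enumerate(doujinshi_ids):
        info = doujinshi_parser(id_)
        if info:    # None now means "fetch or parse failed", so just skip it
            doujinshi_list.append(info)
        if (i + 1) % 10 == 0:
            print('Progress: %d / %d' % (i + 1, len(doujinshi_ids)))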