From 47c88050e4d6bbcf80cd63476a4f11bb795fea29 Mon Sep 17 00:00:00 2001
From: ricterz
Date: Sun, 19 Apr 2015 13:13:55 +0800
Subject: [PATCH] multithreading download

---
Review fixups folded into this patch (line structure restored as well).
Post-image blob ids on the "index" lines of the four touched files predate
these fixups; pre-image ids are unchanged.
 - dojinshi.py: __repr__ format string had no placeholder (TypeError).
 - downloader.py: task_done() now pairs with each successful get(); the
   worker pool is rebuilt on every download() call.
 - parser.py: digit strings pass id validation; a missing page count keeps
   pages at 0 instead of reaching int(None).
 - nhentai.py: --search exits cleanly instead of calling show() on None.

 hentai/cmdline.py    | 12 +++++++++++-
 hentai/constant.py   |  5 ++++-
 hentai/dojinshi.py   | 41 ++++++++++++++++++++++++++++++++++++++++-
 hentai/downloader.py | 41 ++++++++++++++++++++++++++++++++++++++++-
 hentai/parser.py     | 42 +++++++++++++++++++++++++++++++++++++++++-
 nhentai.py           | 27 +++++++++++++++++++++++++++
 requirements.txt     |  3 +++
 7 files changed, 166 insertions(+), 5 deletions(-)

diff --git a/hentai/cmdline.py b/hentai/cmdline.py
index 22e5d9d..14ba288 100644
--- a/hentai/cmdline.py
+++ b/hentai/cmdline.py
@@ -1 +1,11 @@
-__author__ = 'ricter'
+from optparse import OptionParser
+
+
+def cmd_parser():
+    parser = OptionParser()
+    parser.add_option('--search', type='string', dest='keyword', action='store')
+    parser.add_option('--download', dest='is_download', action='store_true')
+    parser.add_option('--id', type='int', dest='id', action='store')
+
+    args, _ = parser.parse_args()
+    return args
diff --git a/hentai/constant.py b/hentai/constant.py
index 22e5d9d..8113a37 100644
--- a/hentai/constant.py
+++ b/hentai/constant.py
@@ -1 +1,4 @@
-__author__ = 'ricter'
+SCHEMA = 'http://'
+URL = '%snhentai.net' % SCHEMA
+DETAIL_URL = '%s/g' % URL
+IMAGE_URL = '%si.nhentai.net/galleries' % SCHEMA
\ No newline at end of file
diff --git a/hentai/dojinshi.py b/hentai/dojinshi.py
index 22e5d9d..0cc214c 100644
--- a/hentai/dojinshi.py
+++ b/hentai/dojinshi.py
@@ -1 +1,40 @@
-__author__ = 'ricter'
+import Queue
+from constant import DETAIL_URL, IMAGE_URL
+
+
+class Dojinshi(object):
+    def __init__(self, name=None, subtitle=None, id=None, pages=0):
+        self.name = name
+        self.subtitle = subtitle
+        self.id = id
+        self.pages = pages
+        self.downloader = None
+        self.url = '%s/%d' % (DETAIL_URL, self.id)
+
+    def __repr__(self):
+        return 'Dojinshi(name=%r)' % (self.name, )
+
+    def show(self):
+        print 'Dojinshi: %s' % self.name
+        print 'Subtitle: %s' % self.subtitle
+        print 'URL: %s' % self.url
+        print 'Pages: %d' % self.pages
+
+    def download(self):
+        if self.downloader:
+            download_queue = Queue.Queue()
+            for i in xrange(1, self.pages + 1):
+                download_queue.put('%s/%d/%d.jpg' % (IMAGE_URL, self.id, i))
+            self.downloader.download(download_queue)
+        else:
+            raise Exception('Downloader has not be loaded')
+
+
+if __name__ == '__main__':
+    test = Dojinshi(name='test hentai dojinshi', id=1)
+    print test
+    test.show()
+    try:
+        test.download()
+    except Exception, e:
+        print 'Exception: %s' % str(e)
\ No newline at end of file
diff --git a/hentai/downloader.py b/hentai/downloader.py
index 22e5d9d..35767e8 100644
--- a/hentai/downloader.py
+++ b/hentai/downloader.py
@@ -1 +1,40 @@
-__author__ = 'ricter'
+import threading
+import Queue
+
+
+class Downloader(object):
+    def __init__(self):
+        self.threads = []
+
+    def _download(self, queue):
+        while True:
+            # get(False) is the only emptiness test: checking qsize() first
+            # races with the other workers.
+            try:
+                url = queue.get(False)
+            except Queue.Empty:
+                break
+            print 'Downloading: %s' % url
+            # Exactly one task_done() per item actually taken from the queue.
+            queue.task_done()
+
+    def download(self, queue):
+        # Fresh pool per call: a Thread object cannot be started twice.
+        self.threads = []
+        for i in range(10):
+            _ = threading.Thread(target=self._download, args=(queue, ))
+            self.threads.append(_)
+
+        for i in self.threads:
+            i.start()
+
+        for i in self.threads:
+            i.join()
+
+
+if __name__ == '__main__':
+    d = Downloader()
+    q = Queue.Queue()
+    for i in range(0, 50):
+        q.put(i)
+    d.download(q)
diff --git a/hentai/parser.py b/hentai/parser.py
index 22e5d9d..ee029a6 100644
--- a/hentai/parser.py
+++ b/hentai/parser.py
@@ -1 +1,41 @@
-__author__ = 'ricter'
+import re
+import requests
+from bs4 import BeautifulSoup
+from constant import DETAIL_URL
+
+
+dojinshi_fields = ['Artists:']
+
+
+def dojinshi_parser(id):
+    # Accept an int, or a string made only of digits.
+    if not isinstance(id, (int, )) and not (isinstance(id, (str, )) and id.isdigit()):
+        raise Exception('Dojinshi id(%s) is not valid' % str(id))
+    id = int(id)
+    dojinshi = dict()
+    dojinshi['id'] = id
+    url = '%s/%d/' % (DETAIL_URL, id)
+
+    response = requests.get(url).content
+    html = BeautifulSoup(response)
+    dojinshi_info = html.find('div', attrs={'id': 'info'})
+
+    title = dojinshi_info.find('h1').text
+    subtitle = dojinshi_info.find('h2')
+
+    dojinshi['name'] = title
+    dojinshi['subtitle'] = subtitle.text if subtitle else ''
+
+    pages = 0
+    for _ in dojinshi_info.find_all('div', class_=''):
+        # Separate name for the match: a miss must not overwrite the 0 default.
+        match = re.search('([\d]+) pages', _.text)
+        if match:
+            pages = match.group(1)
+            break
+    dojinshi['pages'] = int(pages)
+    return dojinshi
+
+
+if __name__ == '__main__':
+    print dojinshi_parser(32271)
\ No newline at end of file
diff --git a/nhentai.py b/nhentai.py
index e69de29..9f65464 100644
--- a/nhentai.py
+++ b/nhentai.py
@@ -0,0 +1,27 @@
+from hentai.cmdline import cmd_parser
+from hentai.parser import dojinshi_parser
+from hentai.dojinshi import Dojinshi
+from hentai.downloader import Downloader
+
+
+def main():
+    options = cmd_parser()
+    dojinshi = None
+
+    if options.id:
+        dojinshi_info = dojinshi_parser(options.id)
+        dojinshi = Dojinshi(**dojinshi_info)
+    elif options.keyword:
+        # Search is not implemented; exit here instead of calling show() on None.
+        raise SystemExit('--search is not implemented yet')
+    else:
+        raise SystemExit
+
+    dojinshi.show()
+    if options.is_download:
+        dojinshi.downloader = Downloader()
+        dojinshi.download()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e69de29..713b1e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+requests>=2.5.0
+wget>=2.2
+BeautifulSoup4>=4.0.0
\ No newline at end of file