multithreading download

This commit is contained in:
ricterz 2015-04-19 13:13:55 +08:00
parent 0a6285a868
commit 47c88050e4
7 changed files with 160 additions and 5 deletions

View File

@ -1 +1,11 @@
__author__ = 'ricter'
from optparse import OptionParser
def cmd_parser(args=None):
    """Parse the command-line options of the tool.

    Recognised options:

    * ``--search KEYWORD`` -- stored as ``keyword`` (string).
    * ``--download`` -- stored as ``is_download`` (True when the flag is given).
    * ``--id ID`` -- stored as ``id`` (int).

    :param args: optional list of argument strings to parse. When omitted,
        ``optparse`` falls back to ``sys.argv[1:]``, which is what the
        zero-argument call has always done.
    :return: the ``optparse`` values object; options that were not supplied
        are left as ``None``.
    """
    parser = OptionParser()
    parser.add_option('--search', type='string', dest='keyword',
                      action='store', help='keyword to search for')
    parser.add_option('--download', dest='is_download',
                      action='store_true', help='download the dojinshi')
    parser.add_option('--id', type='int', dest='id',
                      action='store', help='numeric id of the dojinshi')
    # Positional leftovers are not used, so they are discarded.
    options, _ = parser.parse_args(args)
    return options

View File

@ -1 +1,4 @@
__author__ = 'ricter'
# URL building blocks imported by the other modules.
SCHEMA = 'http://'
# Site root.
URL = SCHEMA + 'nhentai.net'
# Prefix of a detail page; the numeric id is appended after a slash.
DETAIL_URL = URL + '/g'
# Prefix of the image host; '<id>/<page>.jpg' is appended after a slash.
IMAGE_URL = SCHEMA + 'i.nhentai.net/galleries'

View File

@ -1 +1,40 @@
__author__ = 'ricter'
import Queue
from constant import DETAIL_URL, IMAGE_URL
class Dojinshi(object):
    """Metadata of one dojinshi plus the entry point for downloading it.

    :param name: title of the dojinshi.
    :param subtitle: secondary title.
    :param id: numeric id used to build the detail and image URLs.
    :param pages: number of pages; one image URL is queued per page.
    """

    def __init__(self, name=None, subtitle=None, id=None, pages=0):
        self.name = name
        self.subtitle = subtitle
        self.id = id
        self.pages = pages
        # Must be assigned an object exposing ``download(queue)`` before
        # ``download()`` is called.
        self.downloader = None
        # ``id`` defaults to None and '%d' % None raises TypeError, so the
        # detail URL is only built once an id is actually known.
        self.url = None if id is None else '%s/%d' % (DETAIL_URL, id)

    def __repr__(self):
        return '<Dojinshi: %s>' % self.name

    def show(self):
        """Print name, subtitle, URL and page count to stdout."""
        print('Dojinshi: %s' % self.name)
        print('Subtitle: %s' % self.subtitle)
        print('URL: %s' % self.url)
        print('Pages: %d' % self.pages)

    def download(self):
        """Queue one image URL per page and hand the queue to the downloader.

        :raises Exception: if no downloader has been assigned.
        """
        if not self.downloader:
            raise Exception('Downloader has not be loaded')

        download_queue = Queue.Queue()
        # Pages are numbered from 1 up to and including ``self.pages``.
        for i in xrange(1, self.pages + 1):
            download_queue.put('%s/%d/%d.jpg' % (IMAGE_URL, self.id, i))
        self.downloader.download(download_queue)
if __name__ == '__main__':
    # Manual smoke test: build a sample object, print it, then attempt a
    # download and report whatever exception comes back.
    sample = Dojinshi(name='test hentai dojinshi', id=1)
    print(sample)
    sample.show()
    try:
        sample.download()
    except Exception as error:
        message = str(error)
        print('Exception: %s' % message)

View File

@ -1 +1,37 @@
__author__ = 'ricter'
import threading
import Queue
class Downloader(object):
    """Drain a queue of URLs with a pool of worker threads.

    Each worker currently only prints the URL it picked up.

    :param thread_count: number of worker threads started by each
        ``download`` call (10 by default).
    """

    def __init__(self, thread_count=10):
        self.thread_count = thread_count
        # Workers of the most recent ``download`` call.
        self.threads = []

    def _download(self, queue):
        """Worker loop: take items until the queue is empty."""
        while True:
            try:
                # Non-blocking get; ``Empty`` is the only reliable signal
                # that nothing is left, since a size check can race with
                # the other workers.
                url = queue.get(False)
            except Queue.Empty:
                break
            try:
                print('Downloading: %s' % url)
            finally:
                # Exactly one task_done() per successful get(), so the
                # queue's unfinished-task count stays consistent.
                queue.task_done()

    def download(self, queue):
        """Process every item of ``queue`` and block until all workers exit.

        :param queue: a ``Queue.Queue`` holding the items to process.
        """
        # A Thread object can only be started once, so a fresh set of
        # workers is created for every call instead of reusing old ones.
        self.threads = [
            threading.Thread(target=self._download, args=(queue, ))
            for _ in range(self.thread_count)
        ]
        for worker in self.threads:
            worker.start()
        for worker in self.threads:
            worker.join()
if __name__ == '__main__':
    # Manual smoke test: push the integers 0..49 through the worker pool.
    jobs = Queue.Queue()
    for number in range(50):
        jobs.put(number)
    Downloader().download(jobs)

View File

@ -1 +1,39 @@
__author__ = 'ricter'
import re
import requests
from bs4 import BeautifulSoup
from constant import DETAIL_URL
dojinshi_fields = ['Artists:']
def dojinshi_parser(id):
    """Fetch the detail page of a dojinshi and extract its metadata.

    :param id: the dojinshi id, either an int or a string of digits.
    :return: dict with the keys ``id`` (int), ``name``, ``subtitle``
        (empty string when the page has none) and ``pages`` (int, 0 when
        no page count is found).
    :raises Exception: if ``id`` is neither an int nor a digit string, or
        if the fetched page has no ``<div id="info">`` element.
    """
    # Accept ints and digit-only strings; reject everything else.
    is_int = isinstance(id, int)
    is_digit_string = isinstance(id, str) and id.isdigit()
    if not (is_int or is_digit_string):
        raise Exception('Dojinshi id(%s) is not valid' % str(id))
    id = int(id)

    dojinshi = dict()
    dojinshi['id'] = id
    url = '%s/%d/' % (DETAIL_URL, id)

    response = requests.get(url).content
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    html = BeautifulSoup(response, 'html.parser')

    dojinshi_info = html.find('div', attrs={'id': 'info'})
    if dojinshi_info is None:
        # Without the info block there is nothing to extract.
        raise Exception('Dojinshi id(%d) info block not found' % id)

    title = dojinshi_info.find('h1')
    subtitle = dojinshi_info.find('h2')
    dojinshi['name'] = title.text if title else ''
    dojinshi['subtitle'] = subtitle.text if subtitle else ''

    # Keep the match in its own variable so ``pages`` stays an int even
    # when no div contains a page count.
    pages = 0
    for block in dojinshi_info.find_all('div', class_=''):
        match = re.search(r'(\d+) pages', block.text)
        if match:
            pages = int(match.group(1))
            break
    dojinshi['pages'] = pages
    return dojinshi
if __name__ == '__main__':
    # Manual check: parse one sample id and print the resulting dict.
    result = dojinshi_parser(32271)
    print(result)

View File

@ -0,0 +1,26 @@
from hentai.cmdline import cmd_parser
from hentai.parser import dojinshi_parser
from hentai.dojinshi import Dojinshi
from hentai.downloader import Downloader
def main():
    """Command-line entry point.

    With ``--id`` the dojinshi is looked up and shown, and additionally
    downloaded when ``--download`` is given. Exits via ``SystemExit`` when
    neither ``--id`` nor ``--search`` is supplied, or when ``--search`` is
    used (searching is not implemented).
    """
    options = cmd_parser()

    if options.id:
        dojinshi_info = dojinshi_parser(options.id)
        dojinshi = Dojinshi(**dojinshi_info)
    elif options.keyword:
        # No search backend exists, so there is no dojinshi to show; exit
        # with a message instead of continuing with nothing to work on.
        raise SystemExit('Searching by keyword is not implemented yet')
    else:
        raise SystemExit

    dojinshi.show()
    if options.is_download:
        dojinshi.downloader = Downloader()
        dojinshi.download()
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,3 @@
requests>=2.5.0
wget>=2.2
BeautifulSoup4>=4.0.0