nhentai/hentai/parser.py
2015-04-19 13:13:55 +08:00

39 lines
1019 B
Python

import re
import requests
from bs4 import BeautifulSoup
from constant import DETAIL_URL
# Labels of detail-page fields. Not referenced by any code visible in this
# file; presumably the info-section headings a parser is meant to extract
# (TODO confirm against other users of this module).
dojinshi_fields = ['Artists:']
def dojinshi_parser(id):
    """Fetch a dojinshi detail page and extract its basic metadata.

    Requests ``DETAIL_URL/<id>/``, parses the returned HTML with
    BeautifulSoup and reads the title, subtitle and page count from the
    ``<div id="info">`` element.

    Args:
        id: Dojinshi identifier, either an ``int`` or a string consisting
            only of digits (for example ``"32271"``). The name shadows the
            ``id`` builtin but is kept for backward compatibility with
            keyword callers.

    Returns:
        A dict with the keys ``'id'`` (int), ``'name'`` (text of the
        ``<h1>``), ``'subtitle'`` (text of the ``<h2>``, or ``''`` when the
        element is absent) and ``'pages'`` (int; ``0`` when no
        "<n> pages" text is found).

    Raises:
        Exception: If ``id`` is neither an int nor a digit-only string, or
            if the fetched page contains no ``<div id="info">`` element.
    """
    # Accept ints and digit-only strings. The previous check rejected every
    # string because it required the value to be an int first.
    is_valid = isinstance(id, int) or (isinstance(id, str) and id.isdigit())
    if not is_valid:
        raise Exception('Dojinshi id(%s) is not valid' % str(id))
    try:
        dojinshi_id = int(id)
    except ValueError:
        # str.isdigit() accepts some characters (e.g. superscript digits)
        # that int() cannot convert.
        raise Exception('Dojinshi id(%s) is not valid' % str(id))

    dojinshi = dict()
    dojinshi['id'] = dojinshi_id

    url = '%s/%d/' % (DETAIL_URL, dojinshi_id)
    response = requests.get(url).content
    html = BeautifulSoup(response)

    dojinshi_info = html.find('div', attrs={'id': 'info'})
    if dojinshi_info is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise Exception('Dojinshi id(%s) has no info section' % str(id))

    title = dojinshi_info.find('h1').text
    subtitle = dojinshi_info.find('h2')
    dojinshi['name'] = title
    dojinshi['subtitle'] = subtitle.text if subtitle else ''

    # Keep the match object separate from the result so that a div without
    # a page count cannot overwrite the default with None.
    pages = 0
    for info_div in dojinshi_info.find_all('div', class_=''):
        match = re.search(r'(\d+) pages', info_div.text)
        if match:
            pages = int(match.group(1))
            break
    dojinshi['pages'] = pages

    return dojinshi
if __name__ == '__main__':
    # Manual smoke test: performs a live HTTP request for a sample id and
    # prints the parsed dict. The parenthesised single-argument print works
    # on both Python 2 and Python 3.
    print(dojinshi_parser(32271))