
A Kimetsu no Yaiba (Demon Slayer) manga crawler for mangabz

Posted: 2021-03-01 12:58:11


mangabz manga crawler

Problems encountered (unresolved):

1. Running pages in parallel launches one Chrome instance per page, which overflows memory or crashes Chrome (see the first sketch after the code).

2. Downloads come back incomplete, so the script has to be re-run several times to fill in the missing pages (see the second sketch after the code).

3. Log output is jumbled, since print() calls from the worker processes interleave (see the third sketch after the code).

import os

import fake_useragent
import requests
from lxml import etree
from multiprocessing import Pool
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support import wait

header = {
    'User-Agent': fake_useragent.UserAgent().random,
}

# Shared headless-Chrome options for every worker.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')


def get_pic(src, ipg, dir_name):
    # Save one page image under ./pic/<chapter>/, skipping files that already exist.
    if not os.path.exists('./pic/%s' % dir_name):
        os.mkdir('./pic/%s' % dir_name)
    pic_name = src[0].split('?')[0].split('/')[-1]
    if not os.path.exists('./pic/%s/%s' % (dir_name, pic_name)):
        print('Starting download', src)
        response = requests.get(url=src[0])
        with open('./pic/%s/%s' % (dir_name, pic_name), 'wb') as ft:
            ft.write(response.content)
        print('Page %02d downloaded successfully!' % ipg)
    else:
        print('Already downloaded.')


def get_pic_url(href, dir_name, ipg):
    # Open the chapter at the #ipgN page anchor in headless Chrome and wait
    # for the lazy-loaded page image (id="cp_image") to appear.
    browser = webdriver.Chrome(options=options,
                               executable_path='E:\\py\\chromedriver.exe')
    href = href + '#ipg%s' % ipg
    try:
        browser.get(href)
        wait.WebDriverWait(browser, 15, 1).until(
            ec.presence_of_element_located((By.ID, 'cp_image')))
    except TimeoutException:
        # On timeout, retry once with a longer wait and download right away.
        print('Request timed out, retrying...')
        browser.get(href)
        wait.WebDriverWait(browser, 30, 1).until(
            ec.presence_of_element_located((By.ID, 'cp_image')))
        et = etree.HTML(browser.page_source)
        src = et.xpath('//img[@id="cp_image"]/@src')
        print('Fetching page %02d' % ipg, src)
        get_pic(src, ipg, dir_name)
        print('Downloading again...')
    et = etree.HTML(browser.page_source)
    src = et.xpath('//img[@id="cp_image"]/@src')
    print('Fetching page %02d' % ipg, src)
    browser.quit()
    get_pic(src, ipg, dir_name)


def get_page():
    # The chapter-list URL was stripped to a relative path when the post was
    # scraped; it is kept as-is here.
    url = '/73bz/'
    response = requests.get(url=url, headers=header)
    et = etree.HTML(response.text)
    a_list = et.xpath('//div[@id="chapterlistload"]/a')
    for a in a_list:
        href = '/' + a.xpath('./@href')[0]
        dir_name = a.xpath('./text()')[0].strip()
        ipg = a.xpath('./span/text()')[0]
        print(href, dir_name, ipg)
        # Keep only the digits of the page-count label, e.g. "(23P)" -> 23.
        ipg = int(''.join(filter(str.isdigit, ipg)))
        print('Starting download of %s' % dir_name)
        path = r'E:\py\manhua/pic/%s/' % dir_name
        if os.path.exists(path):
            dir_num = len([f for f in os.listdir(path)
                           if os.path.isfile(os.path.join(path, f))])
            if dir_num == ipg - 1:
                print('%s already downloaded' % dir_name)
                continue
            pool = Pool(16)
            # Note: range(1, ipg) stops at page ipg-1; if the label counts all
            # pages, the last page is never fetched, which may be one cause of
            # problem 2 above.
            for page in range(1, ipg):
                p = pool.apply_async(get_pic_url, (href, dir_name, page))
                p.wait()  # waits on each task, so pages are fetched one by one
            pool.close()
        else:
            pool = Pool(32)
            for page in range(1, ipg):
                p = pool.apply_async(get_pic_url, (href, dir_name, page))
                p.wait()
            pool.close()
    # The original post also left a commented-out cleanup step:
    # os.system("taskkill /f /im chromedriver.exe /t")


if __name__ == '__main__':
    get_page()
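The first problem (Chrome memory blow-ups) comes from launching a fresh browser for every page. A minimal sketch of one fix, assuming chromedriver is on PATH: start a single headless Chrome per pool worker through a Pool initializer and reuse it for every page, so at most pool-size browsers are ever alive. The chapter path and page count below are hypothetical placeholders, not values taken from the site.

from multiprocessing import Pool

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

_driver = None  # one browser per worker process, created once


def _init_worker():
    global _driver
    opts = webdriver.ChromeOptions()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    _driver = webdriver.Chrome(options=opts)  # assumes chromedriver on PATH


def fetch_src(args):
    # Reuse this worker's browser; return the page image URL for get_pic().
    href, ipg = args
    _driver.get('%s#ipg%s' % (href, ipg))
    img = WebDriverWait(_driver, 15, 1).until(
        ec.presence_of_element_located((By.ID, 'cp_image')))
    return ipg, img.get_attribute('src')


if __name__ == '__main__':
    chapter = '/73bz/m100000/'  # hypothetical chapter path
    with Pool(4, initializer=_init_worker) as pool:  # at most 4 Chromes alive
        jobs = [(chapter, i) for i in range(1, 24)]  # hypothetical page count
        for ipg, src in pool.imap_unordered(fetch_src, jobs):
            print('page %02d ->' % ipg, src)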
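For the second problem, a small verification pass after a run can report which chapter folders are short of their expected page count, so only the gaps need to be re-fetched. A minimal sketch, assuming the ./pic/<chapter> layout used above; the expected-count mapping is a hypothetical input that would come from the chapter list.

import os


def find_missing(base_dir, expected_pages):
    # Return {chapter: shortfall} for chapters with fewer files than expected.
    missing = {}
    for chapter, expected in expected_pages.items():
        path = os.path.join(base_dir, chapter)
        have = 0
        if os.path.isdir(path):
            have = len([f for f in os.listdir(path)
                        if os.path.isfile(os.path.join(path, f))])
        if have < expected:
            missing[chapter] = expected - have
    return missing


if __name__ == '__main__':
    expected = {'chapter-01': 24, 'chapter-02': 25}  # hypothetical counts
    for chapter, short in find_missing('./pic', expected).items():
        print('%s is missing %d page(s); re-run the crawler for it' % (chapter, short))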
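For the third problem, the logging module is a near drop-in replacement for the bare print() calls. Tagging each record with the process name does not serialize the workers, but it keeps every line intact and attributable. A minimal sketch:

import logging
from multiprocessing import Pool

# Configured at import time, so spawned worker processes pick it up too.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(processName)s %(levelname)s %(message)s')
log = logging.getLogger('mangabz')


def work(ipg):
    log.info('fetching page %02d', ipg)
    return ipg


if __name__ == '__main__':
    with Pool(4) as pool:
        pool.map(work, range(1, 6))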
