0x00 前言
参考《56 行代码,带你爬取豆瓣影评》(影评的短评版),修改其代码以获取影评的长评,以及书评的短评和长评;长评部分的代码待后续更新。
0x01 影评
1.短评
代码:
import csv
import os
import re
import time
from urllib.parse import urlencode

# Browser-like User-Agent sent with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'
}

# Douban subject id of the movie to scrape; change it to target another movie.
SUBJECT_ID = '20444530'

# NOTE(review): the original string was just '/subject/<id>/comments?' with no
# scheme or host, which `requests` cannot fetch. The Douban movie host is
# assumed here -- confirm before relying on it.
BASE_URL = 'https://movie.douban.com/subject/{subject_id}/comments?'

# Output CSV; change the name to match the movie being scraped.
OUTPUT_FILE = '《我想和你好好的》影评.csv'
FIELDNAMES = ['User', 'Time', 'Comment']

# Number of comments requested per page (the `limit` query parameter).
PAGE_SIZE = 20

# Captures (user name, comment time, comment text) for each comment <div>.
COMMENT_PATTERN = re.compile(
    r'<div class="comment">.*?<a href=.*?class="">(.*?)</a>.*?<span class="comment-time " title="(.*?)">.*?</span>.*?<p class="">(.*?)</p>.*?</div>',
    re.S,
)


def get_one(num, subject_id=SUBJECT_ID):
    """Fetch one page of short comments and return its HTML.

    Args:
        num: Offset of the first comment to request (the `start` parameter).
        subject_id: Douban subject id of the movie.

    Returns:
        The response body as text when the server answers 200; otherwise
        None (non-200 status or a request-level failure, which is printed).
    """
    # Imported here so the parsing/CSV helpers stay importable when the
    # third-party `requests` package is not installed.
    import requests

    params = {
        'start': str(num),
        'limit': str(PAGE_SIZE),
        'sort': 'new_score',
        'status': 'P',
        'percent_type': '',
    }
    url = BASE_URL.format(subject_id=subject_id) + urlencode(params)
    print("正在采集:" + url)
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as e:
        # The original caught EOFError, which `requests` never raises for
        # network problems, so timeouts/connection errors killed the run.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page(html):
    """Extract comment records from one page of HTML.

    Args:
        html: Page source, or None/empty when the fetch failed.

    Returns:
        A list of dicts with keys 'User', 'Time' and 'Comment'. Empty when
        `html` is falsy or nothing matches.
    """
    if not html:
        # get_one() returns None on failure; treat that as "no comments".
        return []
    info = []
    for user, when, text in COMMENT_PATTERN.findall(html):
        info.append({
            'User': user.strip(),
            'Time': when.strip(),
            # Collapse runs of whitespace/newlines into single spaces. The
            # original stored the list from str.split(), which the csv
            # module writes out as a Python list repr.
            'Comment': ' '.join(text.split()),
        })
    return info


def write_to_file(info, filename=OUTPUT_FILE):
    """Append comment records to a CSV file.

    The header row is written only when the file is new or empty, so
    repeated calls (one per page) do not scatter headers through the data.

    Args:
        info: Iterable of dicts keyed by FIELDNAMES.
        filename: Path of the CSV file to append to.
    """
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # Explicit UTF-8 (with replacement for unencodable characters) avoids the
    # platform-default-encoding errors that the original hid behind a bare
    # `except: pass`, which dropped the whole page.
    with open(filename, 'a', newline='', encoding='utf-8', errors='replace') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        if needs_header:
            writer.writeheader()
        writer.writerows(info)


def main(pages=10, delay=1):
    """Scrape `pages` pages of comments, pausing `delay` seconds after each."""
    for i in range(pages):
        html = get_one(i * PAGE_SIZE)
        datas = parse_page(html)
        write_to_file(datas)
        print('本页采集完毕。')  # marker printed after each page is collected
        time.sleep(delay)  # pause between pages


if __name__ == '__main__':
    main()
2.长评
代码:
待更……
0x02 书评
1.短评
代码:
import csv
import os
import re
import time
from urllib.parse import urlencode

# Browser-like User-Agent sent with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'
}

# Douban subject id of the book to scrape; look the book up on Douban and
# change this value to target another one.
SUBJECT_ID = '24838578'

# NOTE(review): the original string was just '/subject/<id>/comments/hot?'
# with no scheme or host, which `requests` cannot fetch. The Douban book host
# is assumed here -- confirm before relying on it.
BASE_URL = 'https://book.douban.com/subject/{subject_id}/comments/hot?'

# Output CSV; change the name to match the book being scraped.
OUTPUT_FILE = '《西南联大行思录》书评.csv'
FIELDNAMES = ['User', 'Comment']

# Captures (user name, comment text) for each comment <div>.
COMMENT_PATTERN = re.compile(
    r'<div class="comment">.*?<span class="comment-info">.*?<a href=.*?>(.*?)</a>.*?</span>.*?<p class="comment-content">(.*?)</p>.*?</div>',
    re.S,
)


def get_one(num, subject_id=SUBJECT_ID):
    """Fetch one page of short book comments and return its HTML.

    Args:
        num: Page index sent as the `p` query parameter.
        subject_id: Douban subject id of the book.

    Returns:
        The response body as text when the server answers 200; otherwise
        None (non-200 status or a request-level failure, which is printed).
    """
    # Imported here so the parsing/CSV helpers stay importable when the
    # third-party `requests` package is not installed.
    import requests

    params = {'p': str(num)}
    url = BASE_URL.format(subject_id=subject_id) + urlencode(params)
    print("正在采集:" + url)
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as e:
        # The original caught EOFError, which `requests` never raises for
        # network problems, so timeouts/connection errors killed the run.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page(html):
    """Extract comment records from one page of HTML.

    Args:
        html: Page source, or None/empty when the fetch failed.

    Returns:
        A list of dicts with keys 'User' and 'Comment'. Empty when `html`
        is falsy or nothing matches.
    """
    if not html:
        # get_one() returns None on failure; treat that as "no comments".
        return []
    info = []
    for user, text in COMMENT_PATTERN.findall(html):
        info.append({
            'User': user.strip(),
            # Collapse runs of whitespace/newlines into single spaces. The
            # original stored the list from str.split(), which the csv
            # module writes out as a Python list repr.
            'Comment': ' '.join(text.split()),
        })
    return info


def write_to_file(info, filename=OUTPUT_FILE):
    """Append comment records to a CSV file.

    The header row is written only when the file is new or empty, so
    repeated calls (one per page) do not scatter headers through the data.

    Args:
        info: Iterable of dicts keyed by FIELDNAMES.
        filename: Path of the CSV file to append to.
    """
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # Explicit UTF-8 (with replacement for unencodable characters) avoids the
    # platform-default-encoding errors that the original hid behind a bare
    # `except: pass`, which dropped the whole page.
    with open(filename, 'a', newline='', encoding='utf-8', errors='replace') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        if needs_header:
            writer.writeheader()
        writer.writerows(info)


def main(pages=100, delay=0):
    """Scrape `pages` pages of comments, pausing `delay` seconds after each.

    The default delay of 0 matches the original script, where the
    per-page sleep was disabled.
    """
    # NOTE(review): the page index starts at 0, as in the original; confirm
    # whether the site expects a 1-based `p`.
    for i in range(pages):
        html = get_one(i)
        datas = parse_page(html)
        write_to_file(datas)
        print('本页采集完毕。')  # marker printed after each page is collected
        if delay:
            time.sleep(delay)  # optional pause between pages


if __name__ == '__main__':
    main()
2.长评
代码:
待更……