先将提取出的网页文本内容保存为本地 HTML 文件,防止解析时反复访问网站导致 IP 被封。
# Fetch the first 10 listing pages and print their HTML so the output can
# be saved locally (avoids re-hitting the live site while developing the
# parser and getting the IP banned).
import time

import requests as r


def get_html(url):
    """Fetch *url* and return the response body as text.

    Sends a desktop-browser User-Agent header so the site does not
    reject the request as an obvious bot.
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
    response = r.get(url, headers=head)
    return response.text


if __name__ == "__main__":
    for i in range(1, 11):
        # BUG FIX: the original used a plain string '/ershoufang/p{i}/',
        # so {i} was never substituted — every iteration requested the
        # same literal path. An f-string interpolates the page number.
        # NOTE(review): the path is relative; prepend the site's base URL
        # (omitted in the original) before actually running this script.
        url = f'/ershoufang/p{i}/'
        print(get_html(url))
        time.sleep(10)  # throttle: pause between pages to avoid an IP ban
在 PyCharm 中创建一个 .html 格式的文件,将上文输出的内容存入其中;再新建一个项目,用于解析该 HTML 文件的内容。
from lxml import etree

# XPath expression selecting one element per listing row. The original
# left the expression blank (doc.xpath() with no argument raises a
# TypeError) — replace this placeholder with the real row selector for
# the saved a58.html page before running.
ROW_XPATH = "//li"  # TODO: replace with the actual row selector


def paser(local_html, row_xpath=ROW_XPATH):
    """Parse a locally saved HTML page into a list of per-row field lists.

    Parameters:
        local_html: full HTML text read from the saved file.
        row_xpath: XPath expression selecting one element per listing
            row (new keyword parameter with a default, so existing
            one-argument calls keep working).

    Returns:
        list of lists — one inner list per matched row. Field
        extraction is still a TODO in this skeleton (the original used
        an invalid '.........' placeholder, which is a syntax error).
    """
    doc = etree.HTML(local_html)
    out_list = []
    for row in doc.xpath(row_xpath):
        row_list = []
        # TODO: extract the individual fields from *row* into row_list,
        # e.g. row_list.append(row.xpath("...")[0].strip())
        print(row_list)
        out_list.append(row_list)
    return out_list


if __name__ == "__main__":
    # Read the locally saved page so we do not hit the live site again.
    with open('a58.html', 'r', encoding='utf-8') as f:
        local_html = f.read()
    paser(local_html)
解析完成后再与其它步骤整合