600字范文 > html+link+点击次数使用正则表达式取得点击次数函数抽离(示例代码)

html+link+点击次数使用正则表达式取得点击次数函数抽离(示例代码)

时间：2022-04-18 14:07:26

import datetime
import re
import string
import time

import requests
from bs4 import BeautifulSoup

# Fetch article details

def getNewDetail(newsrrl):
    """Print the details of every news item found on a news-list page.

    Every ``<li>`` of the list page that contains a ``.news-list-title``
    element is treated as one news item. For each item the function prints
    a summary (title, the first two children of ``.news-list-info``, link,
    click count), downloads the linked article, and prints the fields found
    in its ``.show-info`` line, the parsed publication time and the text of
    the ``#content`` element.

    Args:
        newsrrl: Response-like object for the list page; only its ``text``
            attribute is read.

    Returns:
        None. All results are written to standard output.

    Note:
        Request headers come from the module-level ``head`` dict, which must
        be defined before this function is called.
    """

    def strip_label(text, label):
        """Return *text* without a leading *label* (exact prefix match)."""
        # str.lstrip(label) would treat the label as a *set of characters*
        # and could also eat the first characters of the value itself.
        return text[len(label):] if text.startswith(label) else text

    # Click count
    def getClickCount(newUrl):
        """Return the click count string the counter API reports for *newUrl*."""
        # News id: whatever sits between "_<digits>/" and the following dot.
        news_id = "".join(re.findall(r'\_\d+\/(.*?)\.', newUrl, re.S))
        # Build the click-count URL.
        # NOTE(review): this URL has no scheme/host; it looks like the host
        # was lost when the snippet was published - confirm the real endpoint.
        count_url = '/api.php?op=count&id=' + news_id + '&modelid=80'
        reply = requests.get(count_url, headers=head)
        # Keep what follows the last ".html" and peel off the surrounding
        # "('" ... "');" characters (presumably a ...html('N'); reply).
        return reply.text.split('.html')[-1].lstrip("(')").rstrip("');")

    list_soup = BeautifulSoup(newsrrl.text, 'html.parser')
    for item in list_soup.select('li'):
        titles = item.select(".news-list-title")
        if not titles:
            # Not a news entry (e.g. a navigation <li>).
            continue

        info = item.select(".news-list-info")[0]
        a = titles[0].text
        b = info.contents[0].text
        c = info.contents[1].text
        d = item.select("a")[0].attrs['href']
        hist = getClickCount(d)
        print("标题：" + a + '\n' + "时间：" + b + '\n' + "来源：" + c + '\n'
              + "链接：" + d + '\n' + "点击：" + hist + '\n\n')
        print()

        # Download the article page itself.
        rlink = requests.get(d, headers=head)
        rlink.encoding = 'utf-8'
        page = BeautifulSoup(rlink.text, 'html.parser')
        e = page.select(".show-info")[0].text

        # Echo every whitespace-separated field except the last one, then
        # append the click count fetched from the counter API.
        for field in e.split()[:-1]:
            print(field, end=' ')
        print("点击：" + hist + "次")
        print()
        print()

        # Convert the publication time to a datetime object.
        dt = strip_label(e, '发布时间:')[:19]
        dt = datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
        print("datetime类型时间：", end=' ')
        print(dt)
        print()

        # Author, reviewer, source and photographer share one layout:
        # "<label><value>" terminated by whitespace.
        for label in ('作者：', '审核：', '来源：', '摄影：'):
            pos = e.find(label)
            if pos != -1:  # str.find returns -1 when the label is absent
                s = strip_label(e[pos:].split()[0], label)
                print(label, end=' ')
                print(s)
                print()

        # Click count
        if e.find('点击：') != -1:
            print("点击：", end=' ')
            print(hist)
        for _ in range(5):
            print()
        print()

        # Print the article body.
        print(page.select("#content")[0].text)
        print()
        print()
        print()

# Crawler disguise: send browser-like request headers

# Request headers shared by every HTTP call in this script; getNewDetail
# reads this module-level name as well.
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}

# NOTE(review): the list-page URL has no scheme/host; it looks like the host
# was lost when the snippet was published - confirm the real address.
r = requests.get("/html/xiaoyuanxinwen/", headers=head)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')

getNewDetail(r)

# Telephone numbers: 3-4 digit area code, a dash, then 6-8 digits.
telephone = re.findall(r'(\d{3,4})\-(\d{6,8})', soup.text, re.S)
print(telephone)
print()

# E-mail address validation.
# NOTE(review): the sample address ends right after "@"; the domain was
# presumably removed by the publishing site - confirm the intended value.
email = '308800902@'
eroll = r'^([0-9a-zA-Z_]{0,19}@[0-9a-zA-Z_]{0,19}(?:\.\w{2,3}){0,2})$'
efinadll = re.findall(eroll, email)
print(efinadll)
print()

# English word splitting on whitespace and basic punctuation.
estr = '''Personal information such as names, birthdays, nicknames, pet's names, social security numbers, and the like
should never, ever, ever be used because these are way too obvious and too easy to crack. The more you avoid using
things like this as your passwords, the more secure your login areas will be.'''

print(re.split(r"[\s,.?!]+", estr))

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。