
Scraping Sina Weibo User Info and Posts, with Visual Analysis

Posted: 2018-11-05 06:39:15



Reference post: /asher117/article/details/82793091

The main code is as follows.

from selenium import webdriver
from pyecharts.charts import PictorialBar
from pyecharts.charts import Line
from opdata.opexcel import Operatingexcel
from bs4 import BeautifulSoup
import time
import Draw as draw
import re
from pyecharts import options as opts
from collections import Counter
import jieba.posseg as psg
from snownlp import SnowNLP

# browser = webdriver.Chrome()
# # Login page URL
# url = '/signin/login'
# browser.get(url)
# time.sleep(3)


def login():
    print(u'Logging in to mobile Sina Weibo...')
    # Locate the username field, clear it, then type in your account
    username = browser.find_element_by_css_selector('#loginName')
    time.sleep(2)
    username.clear()
    username.send_keys('输入自己的账号')  # enter your own account here
    # Locate the password field and type in your password
    password = browser.find_element_by_css_selector('#loginPassword')
    time.sleep(2)
    password.send_keys('输入自己的密码')  # enter your own password here
    # Click the login button
    browser.find_element_by_css_selector('#loginAction').click()
    # The 15-second pause here matters: after clicking login, Weibo may show a
    # nine-grid CAPTCHA (pictured below), which is awkward to handle in code
    # (Cui Qingcai's Python book covers one approach), so solve it by hand.
    time.sleep(15)
    print('Login finished!')


def get_info():
    dic_info = {}
    id = 'dengchao'
    niCheng = id
    # A user's URL has the form url = '/' + id
    url = '/' + id
    browser.get(url)
    time.sleep(3)
    # Parse the page HTML with BeautifulSoup
    soup = BeautifulSoup(browser.page_source, 'lxml')
    # Extract the user's uid
    uid = soup.find('td', attrs={'valign': 'top'})
    uid = uid.a['href']
    uid = uid.split('/')[1]
    # Extract the maximum page number
    pageSize = soup.find('div', attrs={'id': 'pagelist'})
    pageSize = pageSize.find('div').getText()
    pageSize = (pageSize.split('/')[1]).split('页')[0]
    # Extract the number of posts
    divMessage = soup.find('div', attrs={'class': 'tip2'})
    weiBoCount = divMessage.find('span').getText()
    weiBoCount = (weiBoCount.split('[')[1]).replace(']', '')
    # Extract the following count and follower count
    a = divMessage.find_all('a')[:2]
    guanZhuCount = (a[0].getText().split('[')[1]).replace(']', '')
    fenSiCount = (a[1].getText().split('[')[1]).replace(']', '')
    dic_info.setdefault("微博总数", weiBoCount)
    dic_info.setdefault("微博关注", guanZhuCount)
    dic_info.setdefault("微博粉丝", fenSiCount)
    contents = []
    dianZans = []
    zhuanFas = []
    pinLuns = []
    faBuTimes = []
    yuanChuangs = []
    # Loop over the pages and scrape each one
    # int(pageSize) + 1
    for i in range(1, 10):  # pageSize + 1
        # Each page's URL has the form url = '/' + id + '?page=' + i
        url = '/dengchao?page=' + str(i)
        browser.get(url)
        time.sleep(1)
        # Parse the page HTML with BeautifulSoup
        soup = BeautifulSoup(browser.page_source, 'lxml')
        body = soup.find('body')
        divss = body.find_all('div', attrs={'class': 'c'})[1:-2]
        for divs in divss:
            # yuanChuang: 0 means repost, 1 means original
            yuanChuang = '1'  # assume original; changed below for reposts
            dianZan = zhuanFa = pinLun = '0'  # defaults in case a counter link is missing
            div = divs.find_all('div')
            # Three cases: two kinds of original post, and one kind of repost
            if (len(div) == 2):  # original, with image
                # Extract the post text
                content = div[0].find('span', attrs={'class': 'ctt'}).getText()
                aa = div[1].find_all('a')
                for a in aa:
                    text = a.getText()
                    if (('赞' in text) or ('转发' in text) or ('评论' in text)):
                        # Extract the like count
                        if ('赞' in text):
                            dianZan = (text.split('[')[1]).replace(']', '')
                        # Extract the repost count
                        elif ('转发' in text):
                            zhuanFa = (text.split('[')[1]).replace(']', '')
                        # Extract the comment count
                        elif ('评论' in text):
                            pinLun = (text.split('[')[1]).replace(']', '')
                # Extract the post source and publication time
                span = divs.find('span', attrs={'class': 'ct'}).getText()
                faBuTime = str(span.split('来自')[0])
                contents.append(content)
                dianZans.append(dianZan)
                zhuanFas.append(zhuanFa)
                pinLuns.append(pinLun)
                faBuTimes.append(faBuTime)
                yuanChuangs.append(yuanChuang)
            # Same as above
            elif (len(div) == 1):  # original, no image
                content = div[0].find('span', attrs={'class': 'ctt'}).getText()
                aa = div[0].find_all('a')
                for a in aa:
                    text = a.getText()
                    if (('赞' in text) or ('转发' in text) or ('评论' in text)):
                        if ('赞' in text):
                            dianZan = (text.split('[')[1]).replace(']', '')
                        elif ('转发' in text):
                            zhuanFa = (text.split('[')[1]).replace(']', '')
                        elif ('评论' in text):
                            pinLun = (text.split('[')[1]).replace(']', '')
                span = divs.find('span', attrs={'class': 'ct'}).getText()
                faBuTime = str(span.split('来自')[0])
                contents.append(content)
                dianZans.append(dianZan)
                zhuanFas.append(zhuanFa)
                pinLuns.append(pinLun)
                faBuTimes.append(faBuTime)
                yuanChuangs.append(yuanChuang)
            # This branch is a repost; otherwise the same as above
            elif (len(div) == 3):  # reposted weibo
                yuanChuang = '0'
                content = div[0].find('span', attrs={'class': 'ctt'}).getText()
                aa = div[2].find_all('a')
                for a in aa:
                    text = a.getText()
                    if (('赞' in text) or ('转发' in text) or ('评论' in text)):
                        if ('赞' in text):
                            dianZan = (text.split('[')[1]).replace(']', '')
                        elif ('转发' in text):
                            zhuanFa = (text.split('[')[1]).replace(']', '')
                        elif ('评论' in text):
                            pinLun = (text.split('[')[1]).replace(']', '')
                span = divs.find('span', attrs={'class': 'ct'}).getText()
                faBuTime = str(span.split('来自')[0])
                contents.append(content)
                dianZans.append(dianZan)
                zhuanFas.append(zhuanFa)
                pinLuns.append(pinLun)
                faBuTimes.append(faBuTime)
                yuanChuangs.append(yuanChuang)
        time.sleep(2)
        # print(i)
    dic_info.setdefault("内容", contents)
    dic_info.setdefault("点赞", dianZans)
    dic_info.setdefault("转发", zhuanFas)
    dic_info.setdefault("评论", pinLuns)
    dic_info.setdefault("时间", faBuTimes)
    dic_info.setdefault("原创", yuanChuangs)
    return dic_info


# Write the results to a txt file
def writetxt(jjrw, result):
    with open(jjrw, 'w+', encoding="utf-8") as r:
        for i in range(len(result)):
            if result[i] != "":
                s = str(result[i]).strip().replace("emoji", "").replace("span", "").replace("class", "").replace("#", "").replace("http", "")
                rec = re.compile(r"1f\d+\w*|[<>/=]|\r|\n")
                s = rec.sub("", s)
                r.write(s + "\n")


def count(seg_list1):
    # Tally the words
    count = Counter(seg_list1)
    # Sort by frequency, descending
    result = sorted(count.items(), key=lambda x: x[1], reverse=True)
    return result


# Read the text, segment it with jieba, and sort the words by frequency
def readjieba(text, excludes, list_replace):
    dic_result = {}
    seg_list1 = []
    nr = []
    ns = []
    # Segment with part-of-speech tagging
    seg_list = psg.cut(text)
    for w, t in seg_list:
        # Skip single characters, numerals and stop words
        if len(w) != 1 and t != 'm' and w not in excludes:
            # Apply the replacement list (e.g. mapping an alias to a real name)
            real_word = w
            for j in list_replace:
                if w == j[0]:
                    real_word = j[1]
            if t == 'nr':
                nr.append("{0}".format(real_word))
            if t == 'ns':
                ns.append("{0}".format(real_word))
            seg_list1.append("{0}".format(real_word))
    dic_result.setdefault("全部", count(seg_list1))
    dic_result.setdefault("人名", count(nr))
    dic_result.setdefault("地名", count(ns))
    return dic_result


# Trend chart
def drawline(arrt, value, value1, value2, name):
    # Chart initialization options
    init_opts = opts.InitOpts(page_title=name)
    line = Line(init_opts=init_opts)
    # Title options
    title = opts.TitleOpts(title=name, pos_left="10%")
    # Legend options
    legend_opts = opts.LegendOpts(orient="horizontal", pos_top="5%", pos_right="15%")
    # Toolbox options
    # feature = opts.ToolBoxFeatureOpts(save_as_image=True, restore=True, data_view=True, data_zoom=True)
    toolbox_opts = opts.ToolboxOpts(orient="vertical", pos_bottom="15%", pos_left="90%")
    line.set_global_opts(
        title_opts=title,
        legend_opts=legend_opts,
        toolbox_opts=toolbox_opts,
        datazoom_opts=opts.DataZoomOpts(orient="vertical"),
    )
    line.add_xaxis(arrt)
    line.add_yaxis("点赞", value, is_smooth=True, linestyle_opts=opts.LineStyleOpts(color="#E83132", width="4"))
    line.add_yaxis("评论", value1, is_smooth=True, linestyle_opts=opts.LineStyleOpts(color="#00FFFF", width="4"))
    line.add_yaxis("转发", value2, is_smooth=True, linestyle_opts=opts.LineStyleOpts(color="#7CFC00", width="4"))
    line.render('{0}.html'.format(name))


def drawPictorialBar(location, values, name):
    c = (
        PictorialBar()
        .add_xaxis(location)
        .add_yaxis(
            "",
            values,
            label_opts=opts.LabelOpts(is_show=False),
            symbol_size=22,
            symbol_repeat="10000",
            symbol_offset=[0, -5],
            is_symbol_clip=True,
            # symbol='image:///images/spinners/octocat-spinner-32.gif'
            symbol='image://http://weizhendong.top/images/1.png',
        )
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts(title=name),
            xaxis_opts=opts.AxisOpts(is_show=False),
            yaxis_opts=opts.AxisOpts(
                axistick_opts=opts.AxisTickOpts(is_show=False),
                axisline_opts=opts.AxisLineOpts(linestyle_opts=opts.LineStyleOpts(opacity=0)),
            ),
        )
        .render("{0}.html".format(name))
    )


# Classify sentiment polarity line by line with SnowNLP
def read_snowNLP(filename):
    snow_list = []
    a = 0
    b = 0
    c = 0
    with open(filename, "r", encoding='utf-8') as f:
        for line in f.readlines():
            if line != "":
                s = SnowNLP(line)
                if s.sentiments > 0.5:
                    snow_list.append("{0}——褒义".format(line))
                    a += 1
                elif s.sentiments < 0.5:
                    snow_list.append("{0}——贬义".format(line))
                    b += 1
                else:
                    snow_list.append("{0}——中性".format(line))
                    c += 1
    return snow_list, a, b, c


if __name__ == '__main__':
    # Log in
    # login()
    # Scrape the data
    # dic_info = get_info()
    # print(dic_info)
    ol = Operatingexcel()
    # Save to Excel
    # ol.set_excel_dic(dic_info, "data\csdn_data.xlsx", 0, 0)
    dics = ol.get_excel_dic("data\csdn_data.xlsx", "大学排名")
    # print(dics)

    """Pie chart: original vs. repost"""
    # yuanchuang = dict()
    # for f in dics["原创"]:
    #     if f == '1':
    #         yuanchuang["原创"] = yuanchuang.get("原创", 0) + 1
    #     elif f == '0':
    #         yuanchuang["非原创"] = yuanchuang.get("非原创", 0) + 1
    # attr = ['原创', '非原创']
    # value = [yuanchuang["原创"], yuanchuang["非原创"]]
    # draw.drawpie(attr, value, "data/原创和非原创饼图")

    """Word clouds"""
    # Stop words to exclude
    excludes = {'将军', '却说', '令人', '赶来', '徐州', '不见', '下马', '喊声', '因此', '未知', '大败', '百姓', '大事', '一军', '之后', '接应', '起兵',
                '成都', '原来', '江东', '正是', '忽然', '原来', '大叫', '上马', '天子', '一面', '太守', '不如', '忽报', '后人', '背后', '先主', '此人',
                '城中', '然后', '大军', '何不', '先生', '何故', '夫人', '不如', '先锋', '二人', '不可', '如何', '荆州', '不能', '如此', '主公', '军士',
                '商议', '引兵', '次日', '大喜', '魏兵', '军马', '于是', '东吴', '今日', '左右', '天下', '不敢', '陛下', '人马', '不知', '都督', '汉中',
                '一人', '众将', '后主', '只见', '蜀兵', '马军', '黄巾', '立功', '白发', '大吉', '红旗', '士卒', '钱粮', '于汉', '郎舅', '龙凤', '古之',
                '白虎', '古人云', '尔乃', '马飞报', '轩昂', '史官', '侍臣', '列阵', '玉玺', '车驾', '老夫', '伏兵', '都尉', '侍中', '西凉', '安民', '张曰',
                '文武', '白旗', '祖宗', '寻思', '英雄', '赞美', '乳牙'}
    key = "玄德曰"
    value = "刘备"
    list_replace = []
    list_replace.append(tuple((key, value)))
    writetxt("内容.txt", dics["内容"])
    with open('内容.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    dic_result = readjieba(text, excludes, list_replace)
    draw.drawWordCloud(dic_result["全部"], "data/微博内容词云")
    draw.drawWordCloud(dic_result["地名"], "data/微博地点词云")
    draw.drawWordCloud(dic_result["人名"], "data/微博人名词云")

    """Line chart"""
    # arrt = [x for x in range(len(dics["评论"]))]
    # drawline(arrt, dics["点赞"], dics["评论"], dics["转发"], "data/折线图")

    """Pictorial bar of likes"""
    # drawPictorialBar(arrt, dics["评论"], "data/点赞象形图")

    """Ripple scatter of comments"""
    # draw.drawEffectScatter(arrt, dics["评论"], "data/评论涟漪图")

    """Bar chart of reposts"""
    # draw.drawbar(arrt, dics["转发"], "data/转发柱状图")

    """Weibo sentiment polarity"""
    snow_list, a, b, c = read_snowNLP("内容.txt")
    # Save to a txt file
    writetxt("data/微博情感极性.txt", snow_list)

    """Sentiment pie chart"""
    attr = ['积极', '消极', "中性"]
    value = [a, b, c]
    draw.drawpie(attr, value, "data/情感积极性饼图")
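Two details are easy to miss inside the long script: the counter links on the mobile page carry their numbers in square brackets (for example 赞[152]), and SnowNLP scores each line between 0 and 1 with 0.5 as the neutral point. A minimal standalone sketch of both, with invented sample strings:

from snownlp import SnowNLP

# Counter links as they appear on the mobile page; the numbers are made up.
link_texts = ['赞[152]', '转发[37]', '评论[28]']
counts = {t.split('[')[0]: t.split('[')[1].replace(']', '') for t in link_texts}
print(counts)  # {'赞': '152', '转发': '37', '评论': '28'}

# SnowNLP's sentiments score lies in [0, 1]; the script treats > 0.5 as
# positive, < 0.5 as negative, and exactly 0.5 as neutral.
print(SnowNLP('这部电影真好看').sentiments)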

Draw.py: /wei_zhen_dong/article/details/106300719
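Draw.py is external to this post; a plausible minimal version of the two helpers the script calls, drawpie and drawWordCloud, could be built on pyecharts as sketched below. Only the names and call signatures come from the script above; the option values are assumptions, not the actual Draw.py.

from pyecharts import options as opts
from pyecharts.charts import Pie, WordCloud

def drawpie(attr, value, name):
    # attr: labels, value: counts; renders to <name>.html like the other charts
    pie = Pie()
    pie.add("", [list(z) for z in zip(attr, value)])
    pie.set_global_opts(title_opts=opts.TitleOpts(title=name))
    pie.render("{0}.html".format(name))

def drawWordCloud(words, name):
    # words: a list of (word, count) pairs, exactly what readjieba returns
    wc = WordCloud()
    wc.add("", words, word_size_range=[12, 60])
    wc.set_global_opts(title_opts=opts.TitleOpts(title=name))
    wc.render("{0}.html".format(name))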

opdata.py: /wei_zhen_dong/article/details/105318970
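opdata.py is also external; the script only relies on Operatingexcel.get_excel_dic(path, sheet_name) returning a dict mapping each column header to a list of that column's values. A minimal openpyxl-based sketch honoring that contract (class and method names from the script; the internals are assumed):

from openpyxl import load_workbook

class Operatingexcel:
    def get_excel_dic(self, path, sheet_name):
        # Read one sheet into {header: [column values]}; headers sit in row 1
        ws = load_workbook(path)[sheet_name]
        rows = list(ws.values)
        headers, data = rows[0], rows[1:]
        return {h: [row[i] for row in data] for i, h in enumerate(headers)}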
