鉴于之前有人问过微博爬虫的问题,这里做一个简易教程。编写匆忙,如有问题可私信 up 主,有兴趣的可以参考看看:
1、首先在 cmd 命令行下执行 pip install weibocrawl,安装必要的包:
Collecting weibocrawl
Downloading weibocrawl-0.0.6-py3-none-any.whl (19 kB)
2、开始编写 Python 代码,导包并做一些初始化:
import pandas as pd
from WeiboCrawl import crawl
# Paste the cookie of a logged-in Weibo web session here (Chrome recommended:
# log in, press F12 and copy it). It has to be copied again once it expires.
cookie='your cookie'
# Crawler instance shared by all the functions in this tutorial.
wc = crawl.WeiboCrawl()
# NOTE(review): update_info() is a WeiboCrawl library call; what it refreshes
# is not visible from this file.
wc.update_info()
# Store the login cookie in the 'home' entry of the crawler's headers.
wc.headers['home']['cookie'] = cookie
3、获取用户信息:
# User profile
def UserInfo(user_id):
    """Look up one Weibo account and print what the crawler returns.

    Args:
        user_id: Account identifier, handed unchanged to
            ``wc.get_user_info``.
    """
    print(wc.get_user_info(user_id))
调用:
# Example call: prints the profile returned for this user id.
UserInfo('1666052741')
4、获取用户指定时间段的全部微博:
# Download a user's posts
def WeiboDownload(user_id, start_time='2020-03-01', end_time='2020-9-30'):
    """Download one user's posts for a date range and save them to Excel.

    The date range is written into the shared crawler's ``wc.param['weibo']``
    settings, so it stays in effect for later calls until it is overwritten.
    The result is printed and written to ``<user_id>_weibo.xlsx`` in the
    current working directory.

    Args:
        user_id: Account identifier, passed unchanged to ``wc.search_weibo``.
        start_time: First day of the range as a date string.
        end_time: Last day of the range as a date string.

    NOTE(review): the default end date is not zero-padded, unlike the start
    date. It is kept as-is because how the crawler parses these strings is
    not visible from this file.
    """
    weibo_params = wc.param['weibo']
    weibo_params['start_time'] = start_time
    weibo_params['end_time'] = end_time

    # All posts of the user inside the configured range.
    df_weibo = wc.search_weibo(user_id)
    print(df_weibo)
    # str() so that an integer user id does not break the file name.
    df_weibo.to_excel(str(user_id) + '_weibo.xlsx')
输出:略,见附件
5、单条微博的评论和转发测试(结果包含转发时间、点赞量等;这里 pn=1 表示只抓取一页内容):
# Comments and retweets of a single post
def CommentsRetweets(mid):
    """Fetch one page of comments and one page of retweets for a post.

    Each result is printed and then written to its own workbook,
    ``<mid>_comments.xlsx`` and ``<mid>_retweets.xlsx``.

    Args:
        mid: Identifier of the post as a string; it is also used as the
            prefix of both output file names.
    """
    # Comments: pn=1 limits the crawl to a single page.
    comments = wc.search_comments(mid, pn=1)
    print(comments)
    comments_path = mid + '_comments.xlsx'
    comments.to_excel(comments_path)

    # Retweets: same one-page limit.
    retweets = wc.search_retweets(mid, pn=1)
    print(retweets)
    retweets_path = mid + '_retweets.xlsx'
    retweets.to_excel(retweets_path)
6、编写函数,从微博正文 Excel 中循环读取 mid,批量下载评论和转发。全部代码如下,可以直接复制到你自己的 py 文件中:
# __author:ly_peppa
# date:2020/12/2
# 仅供参考,禁止用于非法用途
# -*- coding:utf-8 -*-
import pandas as pd
from WeiboCrawl import crawl
# Paste the cookie of a logged-in Weibo web session here (copied from the
# browser's F12 tools). It has to be copied again once it expires.
cookie='your cookie'
# Crawler instance shared by all the functions below.
wc = crawl.WeiboCrawl()
# NOTE(review): update_info() is a WeiboCrawl library call; what it refreshes
# is not visible from this file.
wc.update_info()
# Store the login cookie in the 'home' entry of the crawler's headers.
wc.headers['home']['cookie'] = cookie
# User profile
def UserInfo(user_id):
    """Look up one Weibo account and print what the crawler returns.

    Args:
        user_id: Account identifier, handed unchanged to
            ``wc.get_user_info``.
    """
    print(wc.get_user_info(user_id))
# Download a user's posts
def WeiboDownload(user_id, start_time='2020-03-01', end_time='2020-9-30'):
    """Download one user's posts for a date range and save them to Excel.

    The date range is written into the shared crawler's ``wc.param['weibo']``
    settings, so it stays in effect for later calls until it is overwritten.
    The result is printed and written to ``<user_id>_weibo.xlsx`` in the
    current working directory.

    Args:
        user_id: Account identifier, passed unchanged to ``wc.search_weibo``.
        start_time: First day of the range as a date string.
        end_time: Last day of the range as a date string.

    NOTE(review): the default end date is not zero-padded, unlike the start
    date. It is kept as-is because how the crawler parses these strings is
    not visible from this file.
    """
    weibo_params = wc.param['weibo']
    weibo_params['start_time'] = start_time
    weibo_params['end_time'] = end_time

    # All posts of the user inside the configured range.
    df_weibo = wc.search_weibo(user_id)
    print(df_weibo)
    # str() so that an integer user id does not break the file name.
    df_weibo.to_excel(str(user_id) + '_weibo.xlsx')
# Comments and retweets of a single post
def CommentsRetweets(mid):
    """Fetch one page of comments and one page of retweets for a post.

    Each result is printed and then written to its own workbook,
    ``<mid>_comments.xlsx`` and ``<mid>_retweets.xlsx``.

    Args:
        mid: Identifier of the post as a string; it is also used as the
            prefix of both output file names.
    """
    # Comments: pn=1 limits the crawl to a single page.
    comments = wc.search_comments(mid, pn=1)
    print(comments)
    comments_path = mid + '_comments.xlsx'
    comments.to_excel(comments_path)

    # Retweets: same one-page limit.
    retweets = wc.search_retweets(mid, pn=1)
    print(retweets)
    retweets_path = mid + '_retweets.xlsx'
    retweets.to_excel(retweets_path)
# Batch download of comments
def _comment_page_count(raw_count):
    """Return the ``pn`` value to request for one post's comments.

    Args:
        raw_count: Value of the ``comment_count`` cell. The sheet holds the
            literal label '评论' where a post has no count, and pandas reads
            an empty cell back as NaN; both are treated as zero comments.

    Returns:
        ``count // 10 + 1``, so at least one page is always requested.
        Presumably the crawler serves ten comments per page - TODO confirm.
    """
    if pd.isna(raw_count) or raw_count == '评论':
        total = 0
    else:
        total = int(raw_count)
    return total // 10 + 1


def CommentsDownload(filepath):
    """Download the comments of every post listed in an Excel sheet.

    Reads the workbook at ``filepath``, fetches the comments of each row's
    post, merges everything into one table and writes it to
    ``<user_name>_comments.xlsx``, where ``user_name`` comes from the last
    row that was read successfully. Rows that fail are reported and skipped
    so that one bad post does not abort the whole batch. Nothing is written
    when no comments could be downloaded.

    Args:
        filepath: Path of a workbook with ``comment_count``, ``mid`` and
            ``user_name`` columns.

    NOTE(review): the post identifier is read from the ``mid`` column here,
    while RetweetsDownload reads it from ``id`` - confirm which column the
    exported sheet really contains.
    """
    df_excel = pd.read_excel(filepath)
    user_name = None
    frames = []
    for index, row in df_excel.iterrows():
        try:
            pages = _comment_page_count(row['comment_count'])
            mid = str(row['mid'])
            user_name = row['user_name']
            # Fetch this post's comments.
            df_comment = wc.search_comments(mid, pn=pages)
            print(df_comment)
            # Guard in case the crawler hands back nothing for a post.
            if df_comment is not None:
                frames.append(df_comment)
        except Exception as e:
            # Deliberate best effort: report the failing row and carry on.
            print('row {} skipped: {}'.format(index, e))

    if not frames:
        print('no comments downloaded from {}'.format(filepath))
        return

    # Concatenate once at the end; doing it per row re-copies the whole
    # accumulated table every time.
    df_result = pd.concat(frames, axis=0, ignore_index=True)
    df_result.to_excel('{}_comments.xlsx'.format(user_name))
# Batch download of retweets
def _retweet_page_count(raw_count):
    """Return the ``pn`` value to request for one post's retweets.

    Args:
        raw_count: Value of the ``retweet_count`` cell. The sheet holds the
            literal label '转发' where a post has no count, and pandas reads
            an empty cell back as NaN; both are treated as zero retweets.

    Returns:
        ``count // 10 + 1``, so at least one page is always requested.
        Presumably the crawler serves ten retweets per page - TODO confirm.
    """
    if pd.isna(raw_count) or raw_count == '转发':
        total = 0
    else:
        total = int(raw_count)
    return total // 10 + 1


def RetweetsDownload(filepath):
    """Download the retweets of every post listed in an Excel sheet.

    Reads the workbook written by WeiboDownload(), fetches the retweets of
    each row's post, merges everything into one table and writes it to
    ``<user_name>_retweets.xlsx``, where ``user_name`` comes from the last
    row that was read successfully. Rows that fail are reported and skipped
    so that one bad post does not abort the whole batch. Nothing is written
    when no retweets could be downloaded.

    Args:
        filepath: Path of a workbook with ``retweet_count``, ``id`` and
            ``user_name`` columns.

    NOTE(review): the post identifier is read from the ``id`` column here,
    while CommentsDownload reads it from ``mid`` - confirm which column the
    exported sheet really contains.
    """
    df_excel = pd.read_excel(filepath)
    user_name = None
    frames = []
    for index, row in df_excel.iterrows():
        try:
            pages = _retweet_page_count(row['retweet_count'])
            mid = str(row['id'])
            user_name = row['user_name']
            # Fetch this post's retweets.
            df_retweet = wc.search_retweets(mid, pn=pages)
            print(df_retweet)
            # Guard in case the crawler hands back nothing for a post.
            if df_retweet is not None:
                frames.append(df_retweet)
        except Exception as e:
            # Deliberate best effort: report the failing row and carry on.
            print('row {} skipped: {}'.format(index, e))

    if not frames:
        print('no retweets downloaded from {}'.format(filepath))
        return

    # Concatenate once at the end; doing it per row re-copies the whole
    # accumulated table every time.
    df_result = pd.concat(frames, axis=0, ignore_index=True)
    df_result.to_excel('{}_retweets.xlsx'.format(user_name))
if __name__ == '__main__':
    # Workbook that WeiboDownload() writes for the account below; it is the
    # input of the two batch downloaders.
    filepath = '1666052741_weibo.xlsx'

    # Run one step at a time by (un)commenting its call.
    # UserInfo('1666052741')
    WeiboDownload('1666052741')
    # CommentsRetweets('4554207380898268')
    # CommentsDownload(filepath)
    # RetweetsDownload(filepath)
7、完。
附件列表