微博爬虫简易教程附python代码

1392

收藏 2020-12-02

鉴于之前有人问过微博爬虫的问题，这里做一个简易教程。编写匆忙，如有问题可私信up，有兴趣的可以参考看看：

1、首先在cmd命令行下执行 pip install weibocrawl，安装必要的包：
Collecting weibocrawl
  Downloading weibocrawl-0.0.6-py3-none-any.whl (19 kB)

2、开始编写python代码，导包并做一些初始化：
import pandas as pd
from WeiboCrawl import crawl

cookie='your cookie'    # Log in to Weibo in a browser (Chrome recommended), press F12 and copy the cookie; fetch a new one once it expires

# Shared crawler instance used by every function below.
wc = crawl.WeiboCrawl()
# NOTE(review): update_info() is opaque here; presumably it refreshes the crawler's state before use; confirm in the weibocrawl docs.
wc.update_info()
# Attach the login cookie to the crawler's 'home' header set.
wc.headers['home']['cookie'] = cookie

3、获取用户信息：
# User profile
def UserInfo(user_id):
    """Fetch and print the profile of one Weibo user.

    Args:
        user_id: Weibo user id, passed unchanged to ``wc.get_user_info``.

    Returns:
        Whatever ``wc.get_user_info`` returned, so callers can reuse it
        instead of parsing the printed output.
    """
    info = wc.get_user_info(user_id)
    print(info)
    return info
调用：
UserInfo('1666052741')  # example call with a sample user id

4、获取用户指定时间段的全部微博：
# Download a user's posts
def WeiboDownload(user_id, start_time='2020-03-01', end_time='2020-9-30'):
    """Download one user's posts for a time range and save them to Excel.

    The result is printed and written to ``<user_id>_weibo.xlsx`` in the
    current working directory.

    Args:
        user_id: Weibo user id; used for the search and as the file-name prefix.
        start_time: Start of the range, stored in
            ``wc.param['weibo']['start_time']``.
        end_time: End of the range, stored in
            ``wc.param['weibo']['end_time']``.

    Returns:
        The object returned by ``wc.search_weibo`` (presumably a pandas
        DataFrame, since it is saved with ``to_excel``).
    """
    # NOTE(review): the default end date is not zero-padded ('2020-9-30')
    # while the start date is; confirm the format weibocrawl expects.
    wc.param['weibo']['start_time'] = start_time
    wc.param['weibo']['end_time'] = end_time

    # Historical posts of the user
    df_weibo = wc.search_weibo(user_id)
    print(df_weibo)
    # str() keeps the file name valid when the id is passed as an int.
    df_weibo.to_excel(str(user_id) + '_weibo.xlsx')
    return df_weibo

输出：略，见附件

5、微博评论以及转发测试：（包含转发时间、点赞量等，这里pn=1表示只抓取一页内容）
# Comments and retweets of a single post
def CommentsRetweets(mid, pn=1):
    """Download the comments and retweets of one post and save both to Excel.

    Writes ``<mid>_comments.xlsx`` and ``<mid>_retweets.xlsx`` to the current
    working directory and prints both results.

    Args:
        mid: Id of the post, passed to ``wc.search_comments`` and
            ``wc.search_retweets``.
        pn: Page-count argument forwarded to both searches; the default of 1
            fetches a single page.
    """
    # Comments on the post
    df_comment = wc.search_comments(mid, pn=pn)
    print(df_comment)
    # str() keeps the file name valid when the id is passed as an int.
    df_comment.to_excel(str(mid) + '_comments.xlsx')

    # Retweets of the post
    df_retweet = wc.search_retweets(mid, pn=pn)
    print(df_retweet)
    df_retweet.to_excel(str(mid) + '_retweets.xlsx')

6、编写函数，从微博正文Excel中循环读取mid，批量下载评论和转发。全部代码如下，可以直接复制到你自己的py文件中：
# __author:ly_peppa
# date:2020/12/2
# 仅供参考，禁止用于非法用途

# -*- coding:utf-8 -*-

import pandas as pd
from WeiboCrawl import crawl

# Cookie string of a logged-in Weibo browser session; replace the placeholder before running.
cookie='your cookie'

# Shared crawler instance used by every function below.
wc = crawl.WeiboCrawl()
# NOTE(review): update_info() is opaque here; presumably it refreshes the crawler's state before use; confirm in the weibocrawl docs.
wc.update_info()
# Attach the login cookie to the crawler's 'home' header set.
wc.headers['home']['cookie'] = cookie

# User profile
def UserInfo(user_id):
    """Fetch and print the profile of one Weibo user.

    Args:
        user_id: Weibo user id, passed unchanged to ``wc.get_user_info``.

    Returns:
        Whatever ``wc.get_user_info`` returned, so callers can reuse it
        instead of parsing the printed output.
    """
    info = wc.get_user_info(user_id)
    print(info)
    return info

# Download a user's posts
def WeiboDownload(user_id, start_time='2020-03-01', end_time='2020-9-30'):
    """Download one user's posts for a time range and save them to Excel.

    The result is printed and written to ``<user_id>_weibo.xlsx`` in the
    current working directory.

    Args:
        user_id: Weibo user id; used for the search and as the file-name prefix.
        start_time: Start of the range, stored in
            ``wc.param['weibo']['start_time']``.
        end_time: End of the range, stored in
            ``wc.param['weibo']['end_time']``.

    Returns:
        The object returned by ``wc.search_weibo`` (presumably a pandas
        DataFrame, since it is saved with ``to_excel``).
    """
    # NOTE(review): the default end date is not zero-padded ('2020-9-30')
    # while the start date is; confirm the format weibocrawl expects.
    wc.param['weibo']['start_time'] = start_time
    wc.param['weibo']['end_time'] = end_time

    # Historical posts of the user
    df_weibo = wc.search_weibo(user_id)
    print(df_weibo)
    # str() keeps the file name valid when the id is passed as an int.
    df_weibo.to_excel(str(user_id) + '_weibo.xlsx')
    return df_weibo

# Comments and retweets of a single post
def CommentsRetweets(mid, pn=1):
    """Download the comments and retweets of one post and save both to Excel.

    Writes ``<mid>_comments.xlsx`` and ``<mid>_retweets.xlsx`` to the current
    working directory and prints both results.

    Args:
        mid: Id of the post, passed to ``wc.search_comments`` and
            ``wc.search_retweets``.
        pn: Page-count argument forwarded to both searches; the default of 1
            fetches a single page.
    """
    # Comments on the post
    df_comment = wc.search_comments(mid, pn=pn)
    print(df_comment)
    # str() keeps the file name valid when the id is passed as an int.
    df_comment.to_excel(str(mid) + '_comments.xlsx')

    # Retweets of the post
    df_retweet = wc.search_retweets(mid, pn=pn)
    print(df_retweet)
    df_retweet.to_excel(str(mid) + '_retweets.xlsx')

# Bulk comment download
def CommentsDownload(filepath):
    """Download the comments of every post listed in an Excel sheet.

    Reads the sheet at ``filepath``, fetches the comments of each row's post
    and writes all of them to ``<user_name>_comments.xlsx``, where
    ``user_name`` comes from the last row that was read. A row that raises is
    reported with ``print`` and skipped, so one bad post does not abort the
    whole run. If nothing was downloaded, no file is written.

    Args:
        filepath: Path of an Excel file with the columns ``comment_count``,
            ``mid`` and ``user_name``.
    """
    df_excel = pd.read_excel(filepath)
    user_name = None
    frames = []
    for _, row in df_excel.iterrows():
        try:
            comment_count = row['comment_count']
            # The cell holds the literal label '评论' instead of a number for
            # some rows; that is treated as zero comments.
            comment_count = int(comment_count) if comment_count != '评论' else 0
            # NOTE(review): RetweetsDownload reads the post id from column
            # 'id', this function from 'mid'; confirm which column the
            # sheet written by WeiboDownload() actually has.
            mid = str(row['mid'])
            user_name = row['user_name']
            # Presumably 10 comments per page, hence count // 10 + 1 pages.
            df_comment = wc.search_comments(mid, pn=(comment_count // 10) + 1)
            print(df_comment)
            if df_comment is not None:
                frames.append(df_comment)
        except Exception as e:
            # Best effort: report the failing row and continue with the next.
            print(e)

    if not frames:
        # Empty sheet or every row failed: nothing to write.
        print('No comments were downloaded; no file written.')
        return

    # Concatenate once at the end instead of re-copying the growing table on
    # every iteration; a single frame is saved as-is.
    if len(frames) == 1:
        df_result = frames[0]
    else:
        df_result = pd.concat(frames, axis=0, ignore_index=True)
    df_result.to_excel(str(user_name) + '_comments.xlsx')

# Bulk retweet download
def RetweetsDownload(filepath):
    """Download the retweets of every post listed in an Excel sheet.

    Reads the sheet at ``filepath`` (the file written by WeiboDownload()),
    fetches the retweets of each row's post and writes all of them to
    ``<user_name>_retweets.xlsx``, where ``user_name`` comes from the last
    row that was read. A row that raises is reported with ``print`` and
    skipped, so one bad post does not abort the whole run. If nothing was
    downloaded, no file is written.

    Args:
        filepath: Path of an Excel file with the columns ``retweet_count``,
            ``id`` and ``user_name``.
    """
    df_excel = pd.read_excel(filepath)
    user_name = None
    frames = []
    for _, row in df_excel.iterrows():
        try:
            retweet_count = row['retweet_count']
            # The cell holds the literal label '转发' instead of a number for
            # some rows; that is treated as zero retweets.
            retweet_count = int(retweet_count) if retweet_count != '转发' else 0
            # NOTE(review): CommentsDownload reads the post id from column
            # 'mid', this function from 'id'; confirm which column the
            # sheet written by WeiboDownload() actually has.
            mid = str(row['id'])
            user_name = row['user_name']
            # Presumably 10 retweets per page, hence count // 10 + 1 pages.
            df_retweet = wc.search_retweets(mid, pn=(retweet_count // 10) + 1)
            print(df_retweet)
            if df_retweet is not None:
                frames.append(df_retweet)
        except Exception as e:
            # Best effort: report the failing row and continue with the next.
            print(e)

    if not frames:
        # Empty sheet or every row failed: nothing to write.
        print('No retweets were downloaded; no file written.')
        return

    # Merge all retweets into one table with a single concat instead of
    # re-copying the growing table on every iteration; a single frame is
    # saved as-is.
    if len(frames) == 1:
        df_result = frames[0]
    else:
        df_result = pd.concat(frames, axis=0, ignore_index=True)
    df_result.to_excel(str(user_name) + '_retweets.xlsx')

if __name__ == '__main__':
    # Sheet written by WeiboDownload() for the sample user below; it is the
    # input of the two bulk downloads at the bottom.
    filepath = '1666052741_weibo.xlsx'

    # Uncomment the step you want to run.

    # UserInfo('1666052741')

    WeiboDownload('1666052741')

    # CommentsRetweets('4554207380898268')

    # CommentsDownload(filepath)
    #
    # RetweetsDownload(filepath)

7、完。

附件列表

weibosearch.txt