python网络爬虫求助_python爬虫

2231

收藏 2016-09-16

求助大神们：下面是一个 Python 小爬虫，本想爬取网站上的新闻标题、日期和点击数，但运行结果中并没有出现这些内容，只输出了“日期：点击数：”。这是什么问题呢？新手刚入道，望多多指教！

#! /usr/bin/env python
#coding=gbk

import urllib2
import sys
import re
import os

def extract_url(info):
    """Return the absolute article URLs linked from a news list page.

    Args:
        info: HTML text of the list page.

    Returns:
        A list with one URL per ``<li><span class="title"><a href="...">``
        entry found in ``info``, each prefixed with the site root. The list
        is empty when nothing matches.
    """
    rege = "<li><span class=\"title\"><a href=\"(.*?)\">"
    base = "http://news.swjtu.edu.cn/"
    # Prefix every captured href individually; adding the prefix to the
    # whole list (str + list) raises TypeError.
    return [base + href for href in re.findall(rege, info)]

def extract_title(sub_web):
    """Return the text of every ``<h4>`` heading in an article page.

    Args:
        sub_web: HTML text of the article page.

    Returns:
        A list of captured heading texts, or ``[""]`` when no heading
        matches, so callers can always index element 0.
    """
    # Accept any whitespace (CRLF, LF, indentation, or none) around the
    # title instead of one exact "\r\n " layout, which silently failed to
    # match pages formatted differently.
    re_key = r"<h4>\s*(.*?)\s*</h4>"
    return re.findall(re_key, sub_web) or [""]

def extract_date(sub_web):
    """Return the text that follows each "日期：" label in an article page.

    Args:
        sub_web: HTML text of the article page.

    Returns:
        A list of captured strings (the shortest run of characters between
        the label and the next space), or ``[""]`` when the label is not
        found, so callers can always index element 0.
    """
    # NOTE(review): under Python 2 this literal is a byte string in the
    # encoding declared at the top of the file (coding=gbk); it presumably
    # only matches if the fetched page uses the same encoding -- confirm.
    re_key = "日期：(.*?) "
    return re.findall(re_key, sub_web) or [""]

def extract_counts(sub_web):
    """Return the text that follows each "点击数：" label in an article page.

    Args:
        sub_web: HTML text of the article page.

    Returns:
        A list of captured strings (the shortest run of characters between
        the label and the next two consecutive spaces), or ``[""]`` when
        nothing matches, so callers can always index element 0.
    """
    # NOTE(review): under Python 2 this literal is a byte string in the
    # encoding declared at the top of the file (coding=gbk); it presumably
    # only matches if the fetched page uses the same encoding -- confirm.
    # The terminator is two spaces; a single trailing space does not match.
    re_key = "点击数：(.*?)  "
    return re.findall(re_key, sub_web) or [""]

# Fetch the news list page, visit every linked article, and build one line
# per article of the form "<title>日期：<date>点击数：<count>".
content = urllib2.urlopen('http://news.swjtu.edu.cn/ShowList-82-0-1.shtml').read()
url = extract_url(content)
n = len(url)
print(n)

lines = []
for link in url:
    # Open each article link individually; urlopen() does not accept the
    # whole list of URLs.
    sub_web = urllib2.urlopen(link).read()
    sub_title = extract_title(sub_web)
    sub_date = extract_date(sub_web)
    sub_counts = extract_counts(sub_web)
    # Each extractor returns at least [""], so indexing [0] is always safe.
    lines.append(
        sub_title[0]
        + "日期：" + sub_date[0]
        + "点击数：" + sub_counts[0]
        + '\n'
    )
string = "".join(lines)

print(string)

# Write the collected text to output.txt; previously the file was opened and
# closed without anything being written. The with-block closes the handle
# even if the write fails.
with open('output.txt', 'w') as fp:
    fp.write(string)