# Scrape product information from a listing page
library(RCurl)
library(XML)
# Request headers that make the scraper look like a regular browser.
# Passed to getURL() through its `httpheader` option.
myheader <- c(
  "User-Agent" = "Mozilla/5.0(Windows;U;Windows NT 5.1;zh-CN;rv:1.9.1.6",
  # "text/html" (the original had the typo "text/htmal", which is not a
  # valid MIME type and so never advertised HTML as acceptable).
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)
# Test reading the information from a single URL.
testurl <- "http://www.fruitday.com/web/product_list/40"
# `.encoding` is getURL()'s own argument for declaring the charset of the
# response body. A bare `encoding` does not match it (arguments after `...`
# must be named exactly) and is forwarded as a libcurl option instead.
temp <- getURL(testurl, httpheader = myheader, .encoding = "UTF-8")
k <- htmlParse(temp) # parse the downloaded HTML text into a document tree
# Dump the parsed page to disk for inspection. write.table() only handles
# data frames / matrices and fails on a parsed document, so serialize the
# tree with saveXML() instead. invisible() suppresses echoing the file name.
invisible(saveXML(k, file = "k.txt"))
#k
# Query the parsed page `k` for the three kinds of list items of interest and
# echo each resulting node set (echo order: name, style, price).

# Links inside the <li class="cplist-p02"> items.
name.node <- getNodeSet(doc = k, path = '//li[@class="cplist-p02"]/a')
name.node

# The <li class="cplist-p03"> items themselves.
style.node <- getNodeSet(doc = k, path = '//li[@class="cplist-p03"]')
style.node

# The <li class="cplist-p04"> items themselves.
price.node <- getNodeSet(doc = k, path = '//li[@class="cplist-p04"]')
price.node
#price<-xmlValue(price.node1[[1]])
#price
#name.node
#price<-sapply(getNodeSet(k,'//p[@class="price"]/i/text('),xmlValue)
#url.node1
#name<-xmlGetAttr(url.node[[1]],'title')
#name
#name<-xmlGetAttr(url.node1[[1]],'title')
#name
#getNodeSet(k,'//p[@class="title"]/a[@title]')
# Return the text content of every node in `nodes` as a character vector,
# one element per node (character(0) for an empty node set).
node.text <- function(nodes) {
  vapply(
    nodes,
    function(node) {
      txt <- xmlValue(node)
      # Guard against a node that yields no value so that every element
      # handed back to vapply() has length one.
      if (length(txt) == 1L) txt else NA_character_
    },
    character(1)
  )
}

# Extract the text of all matched nodes in one pass. The original looped a
# fixed 200 times without ever using the loop index, re-reading the whole
# node set on every iteration; the number of rows now follows the number of
# nodes actually found on the page.
name <- node.text(name.node)
style <- node.text(style.node)
price <- node.text(price.node)

# The three vectors become the columns of one table, so they must line up
# row by row. Fail with a clear message rather than letting data.frame()
# raise a cryptic error or silently recycle a shorter vector.
counts <- c(name = length(name), style = length(style), price = length(price))
if (length(unique(counts)) != 1L) {
  stop(
    "Node counts differ (",
    paste(names(counts), counts, sep = " = ", collapse = ", "),
    "); cannot align them into rows.",
    call. = FALSE
  )
}

# Convert the names from UTF-8 to GBK (original author's note: needed so
# that Chinese text displays correctly).
name <- iconv(name, "UTF-8", "gbk")
df <- data.frame(na = name, sty = style, pr = price)
write.table(df, "tt.txt")