想咨询一下这个程序哪里出了问题：它读取不到 99 个列表页面内每个商品的 URL 地址，请高手指教！
# Build the URLs of the 99 paginated category listing pages.
# paste0() is vectorised over the page numbers, so the whole vector is
# created in one call instead of being grown with c() inside a loop.
url <- paste0("http://category.dangdang.com/cid4008120-pg", 1:99, ".html")
# 读取99个列表页面内每个商品的URL地址
library(XML)
# Collect the product-page links found on each listing page.
#
# Args:
#   url: character vector of listing-page URLs; each one is handed to
#        htmlParse() directly.
#
# Returns:
#   A character vector with the `href` of every anchor matched by the
#   product-link XPath, in page order. Length 0 when `url` is empty or when
#   no page yields a match.
#
# A page on which the XPath matches nothing contributes no links and raises
# a warning naming that page. Previously `1:length(nodes)` evaluated to
# `1:0` in that case and the function stopped with "subscript out of bounds".
#
# NOTE(review): pages are parsed with encoding = "UTF-8"; if the site serves
# a different charset the extracted text may be garbled -- confirm.
read_url <- function(url) {
  link_xpath <- "//*[@class='shoplist']/ul/li[@name='lb']/div/a"

  links_per_page <- lapply(url, function(page_url) {
    page <- htmlParse(page_url, encoding = "UTF-8")
    anchors <- getNodeSet(page, link_xpath)

    if (length(anchors) == 0) {
      warning("No product links matched on ", page_url, call. = FALSE)
      return(character(0))
    }

    # xmlGetAttr() returns NULL for anchors without an href; unlist() drops
    # those, matching what c() did in the original accumulation.
    unlist(lapply(anchors, xmlGetAttr, name = "href"))
  })

  # as.character() keeps the result type-stable (character(0), not NULL)
  # when nothing was collected.
  as.character(unlist(links_per_page))
}
# Gather the product-page URLs from every listing page in `url`.
urls <- read_url(url)
# Read selected fields (id and price) from individual product pages.
#
# Args:
#   url: character vector of product-page URLs; each one is handed to
#        htmlParse() directly.
#
# Returns:
#   A data.frame with one row per input URL and two columns:
#     id    - text of the first node matching //*[@id='prd_item_id']/i
#     price - numeric value parsed from the text of the first node matching
#             //*[@id='salePriceTag'], after dropping its first character
#   A page where a node is missing gets NA in that column and a warning,
#   instead of aborting the whole run with "subscript out of bounds".
#   Empty input yields a 0-row data.frame that still has both columns.
#
# NOTE(review): the first character of the price text is presumably a
# currency symbol -- confirm against the live page markup.
read_xml <- function(url) {
  n_pages <- length(url)

  # Preallocate with NA so a page lacking a field still produces a row.
  ids <- rep(NA_character_, n_pages)
  prices <- rep(NA_real_, n_pages)

  for (k in seq_len(n_pages)) {
    page <- htmlParse(url[[k]], encoding = "UTF-8")

    # Product id
    id_nodes <- getNodeSet(page, "//*[@id='prd_item_id']/i")
    if (length(id_nodes) > 0) {
      ids[k] <- xmlValue(id_nodes[[1]])
    } else {
      warning("No product id found on ", url[[k]], call. = FALSE)
    }

    # Product price
    price_nodes <- getNodeSet(page, "//*[@id='salePriceTag']")
    if (length(price_nodes) > 0) {
      price_text <- xmlValue(price_nodes[[1]])
      # Strip the leading character before converting to a number.
      prices[k] <- as.numeric(substr(price_text, 2, nchar(price_text)))
    } else {
      warning("No price found on ", url[[k]], call. = FALSE)
    }
  }

  data.frame(id = ids, price = prices)
}
# Scrape id and price for every product URL held in `urls`.
data <- read_xml(urls)