library(rvest)
# Remote location of the sample XML menu document.
url <- "http://www.w3schools.com/xml/simple.xml"
或者先把网页下载到本地，再从本地文件读取：
# Alternative source: a copy of the document saved on the local disk.
# This assignment replaces the remote address assigned to `url` earlier.
url <- "c:/temp/w31.xml"
# Tag names to extract from the document; each one becomes a column of `dat`.
id <- c("name", "price", "calories", "description")

# Parse the document a single time and reuse it for every column, instead of
# re-reading `url` once per tag. read_html() replaces html(), which was
# deprecated in rvest 0.3.0 and removed in later releases.
doc <- read_html(url)

# Collect the text of all nodes matching ONE tag name per iteration. The
# previous code handed the entire `id` vector to html_nodes() every time
# instead of the current element.
dat <- lapply(id, function(tag) {
  doc %>%
    html_nodes(tag) %>%
    html_text()
})
names(dat) <- id

# One row per record, one column per tag.
dat <- as.data.frame(dat)
dat
name price calories
1 Belgian Waffles $5.95 650
2 Strawberry Belgian Waffles $7.95 900
3 Berry-Berry Belgian Waffles $8.95 900
4 French Toast $4.50 600
5 Homestyle Breakfast $6.95 950
description
1 \r\nTwo of our famous Belgian Waffles with plenty of real maple syrup\r\n
2 \r\nLight Belgian waffles covered with strawberries and whipped cream\r\n
3 \r\nLight Belgian waffles covered with an assortment of fresh berries and whipped cream\r\n
4 \r\nThick slices made from our homemade sourdough bread\r\n
5 \r\nTwo eggs, bacon or sausage, toast, and our ever-popular hash browns\r\n
输出看起来有点乱，去掉 description 列中的换行符整理一下：
# Remove every carriage return and line feed from the description column so
# each entry prints on a single line, then show the tidied data frame.
cleaned <- gsub("\r|\n", "", dat[["description"]])
dat[["description"]] <- cleaned
dat
name price calories description
1 Belgian Waffles $5.95 650 Two of our famous Belgian Waffles with plenty of real maple syrup
2 Strawberry Belgian Waffles $7.95 900 Light Belgian waffles covered with strawberries and whipped cream
3 Berry-Berry Belgian Waffles $8.95 900 Light Belgian waffles covered with an assortment of fresh berries and whipped cream
4 French Toast $4.50 600 Thick slices made from our homemade sourdough bread
5 Homestyle Breakfast $6.95 950 Two eggs, bacon or sausage, toast, and our ever-popular hash browns