pulling multiple entries in xml with different data using R

I have a set of XML files that I am reading in, and wanted to know the best way to deal with the following:
<MyDecision>
<Decision>
<DecisionID>X1234</DecisionID>
<DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
<a:Reason>
<a:Description>DOBMismatch</a:Description>
</a:Reason>
<a:Reason>
<a:Description>PrimaryChecksFail</a:Description>
</a:Reason>
<a:Reason>
<a:Description>IncomeReferral</a:Description>
</a:Reason>
</DecisionReasons>
</Decision>
</MyDecision>
At the moment, I am running some R code but get the response:
Error: Duplicate identifiers for rows (2, 3, 4)
The intended output is a dataframe that looks something like:
fieldname                                                                | contents
MyDecision_Decision_DecisionID                                           | X1234
MyDecision_Decision_DecisionReasons_Reason_Description_DOBMismatch       | DOBMismatch
MyDecision_Decision_DecisionReasons_Reason_Description_PrimaryChecksFail | PrimaryChecksFail
MyDecision_Decision_DecisionReasons_Reason_Description_IncomeReferral    | IncomeReferral
My current code is as below:
library(profvis)
library(XML)
library(xml2)
library(plyr)
library(tidyverse)
library(reshape2)
library(foreign)
library(rio)
setwd('c:/temp/xml/t')
df <- data.frame()
transposed.df1 <- data.frame()
allxmldata <- data.frame()
inputfiles <- as.character('test.xml')
findchildren <- function(nodes, df) {
  numchild <- sapply(nodes, function(x) {length(xml_children(x))})
  xmlvalue <- xml_text(nodes[numchild == 0])
  xmlname <- xml_name(nodes[numchild == 0])
  xmlpath <- sapply(nodes[numchild == 0], function(x) {gsub(', ', '_', toString(rev(xml_name(xml_parents(x)))))})
  if (isTRUE(xmlpath == 'MyDecision_Decision_DecisionReasons_Reason')) {
    fieldname <- paste(xmlpath, xmlname, xmlvalue, sep = '_')
  } else {
    fieldname <- paste(xmlpath, xmlname, sep = '_')
  }
  contents <- sapply(xmlvalue, function(f) {is.na(f) <- which(f == ''); f})
  dftemp <- data.frame(fieldname, contents)
  df <- rbind(df, dftemp)
  print(dim(df))
  if (sum(numchild) > 0) {
    findchildren(xml_children(nodes[numchild > 0]), df)
  } else {
    return(df)
  }
}
for (x in inputfiles) {
  df1 <- findchildren(xml_children(read_xml(x)), df)
  xml.df1 <- data.frame(spread(df1, key = fieldname, value = contents), fix.empty.names = TRUE)
  allxmldata <- rbind.fill(allxmldata, xml.df1)
}
I hope that there is someone who can point out what I have done wrong...

I came up with the following solution, which works for your example dataset. Hopefully this approach can also be used for your larger dataset.
# we need these two packages
library(xml2)
library(tidyverse)
# read in the xml-file
xml <- read_xml(
'<MyDecision>
<Decision>
<DecisionID>X1234</DecisionID>
<DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
<a:Reason>
<a:Description>DOBMismatch</a:Description>
</a:Reason>
<a:Reason>
<a:Description>PrimaryChecksFail</a:Description>
</a:Reason>
<a:Reason>
<a:Description>IncomeReferral</a:Description>
</a:Reason>
</DecisionReasons>
</Decision>
</MyDecision>'
)
xml %>%
  # first find all nodes that don't contain any child nodes
  xml_find_all(".//node()[not(node())]") %>%
  # find the parent of each leaf node
  map(xml_parent) %>%
  # extract the name and text of each of those parents
  map(~list(name = xml_name(.x), text = xml_text(.x))) %>%
  # bind rows
  bind_rows()
This produces the following output:
# A tibble: 4 x 2
  name        text
  <chr>       <chr>
1 DecisionID  X1234
2 Description DOBMismatch
3 Description PrimaryChecksFail
4 Description IncomeReferral
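The name/text pairs above lose the path information that your intended output has. If you also need the full underscore-separated fieldname, one possible extension is sketched below; it is untested against your larger files, and the helper leaf_to_row() is my own addition built on xml2's xml_path(), not part of the answer above.
library(xml2)
library(tidyverse)
# one row per leaf node: the full element path as fieldname, the text as contents
leaf_to_row <- function(node) {
  path <- xml_path(node)                  # e.g. "/MyDecision/Decision/DecisionID"
  path <- gsub("\\[[0-9]+\\]", "", path)  # drop positional predicates such as [2]
  path <- gsub("^/|a:", "", path)         # strip the leading slash and the a: namespace prefix
  tibble(fieldname = gsub("/", "_", path), contents = xml_text(node))
}
xml %>%
  xml_find_all(".//node()[not(node())]") %>%
  map(xml_parent) %>%
  map_dfr(leaf_to_row)
To make the three duplicated Reason fieldnames unique, as your original code attempts, you could paste contents onto fieldname wherever the path is duplicated.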

Related

Error in tm package while topic modelling

I am running into an error while trying to make a corpus object from the tm package in R.
The data have been scraped from a website and I have included the full code below so you can run and see how the data were gathered and the tibble was created. The very last line of code is where I am getting stuck! (I have modified the loop so it should run in a few seconds).
Any help would be appreciated. :)
library(tidyverse)
library(rvest)
##########################################
# WEB SCRAPING FROM SCHOLARLYKITCHEN.COM #
##########################################
# create a loop that iteratively adds page numbers onto the base URL
# keep the loop numbers small for testing before the full data is pulled in
output <- character()
for (i in 1:2) {
  article.links <- paste0("https://scholarlykitchen.sspnet.org/archives/page/", i, "/") %>%
    read_html() %>%
    html_nodes(".list-article__title") %>%
    html_nodes("a") %>%
    html_attr("href")
  output <- c(output, article.links)
}
# get all comments
get.comments <- function(output) {
  article.page <- read_html(output)
  article.comments <- article.page %>% html_nodes(".comment") %>% html_text() %>% trimws(which = "both")
  return(article.comments)
}
text <- sapply(output, FUN = get.comments, USE.NAMES = FALSE)
# get all dates
get.dates <- function(output) {
  article.page <- read_html(output)
  article.dates <- article.page %>% html_nodes(".comment__meta__date") %>% html_text() %>% trimws(which = "both")
  return(article.dates)
}
dates <- sapply(output, FUN = get.dates, USE.NAMES = FALSE)
# create the made df for the analysis
df <- tibble(
  text = unlist(text, recursive = TRUE),  # unlist is needed because sapply returns a list when elements differ in length
  dates = unlist(dates, recursive = TRUE)
)
# extract dates from meta data
df$dates <- as.character(gsub(",","",df$dates))
df$dates <- as.Date(df$dates, "%B%d%Y")
###################
# TOPIC MODELLING #
###################
library(tm)
library(topicmodels)
# create df ready for topic modelling
# this needs to have very specifically named columns
df.tm <- df[-2] # create a duplicate for backup (dates not needed for topic modelling yet)
df.tm$doc_id <- row.names(df) # create a unique id for each row, as needed by the tm package
df.tm <- df.tm[c(2, 1)] # reorder the columns
# From the comments text, create the corpus
corpus <- VCorpus(DataframeSource(df))
The error is below:
Error in DataframeSource(df) :
all(!is.na(match(c("doc_id", "text"), names(x)))) is not TRUE
DataframeSource() requires the df to have a document index in its first column, and it must be labeled "doc_id".
Try:
library(tibble) # rowid_to_column() lives here (also attached via tidyverse)
df_with_id <- rowid_to_column(df, var = "doc_id") # alternatively, generate a doc index that better represents your collection of documents
corpus <- VCorpus(DataframeSource(df_with_id))
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 1
Content: documents: 141
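From the corpus, the remaining topic-modelling steps would typically look something like the sketch below. This is not part of the original answer; the cleaning steps and k = 5 are arbitrary choices to illustrate the flow.
library(tm)
library(topicmodels)
# basic cleaning before building the document-term matrix
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
dtm <- DocumentTermMatrix(corpus)
# LDA() cannot fit documents that end up with no terms, so drop empty rows
# (slam is installed as a dependency of tm)
dtm <- dtm[slam::row_sums(dtm) > 0, ]
lda <- LDA(dtm, k = 5, control = list(seed = 1234))
terms(lda, 10) # top 10 terms per topic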

how to loop two loops one in another

I have a list (lst1) which contains 10 data.frames. Each data.frame has a variable ID. I also have an IDlist. Is there a way to write a loop so that I can generate one Excel workbook per ID in IDlist, containing those 10 data.frames, one per sheet, filtered to the matching ID?
The tricky part is that we need to loop through the IDlist as well as through lst1. Any advice?
I have some code that I wrote, but it won't work. Hopefully it gives you some idea of what I want to do.
for (i in IDlist) {
  # create a workbook
  tempwb <- createWorkbook()
  for (j in seq_along(lst1)) {
    # store the ID-specific subset of the dataset
    data.subset <- lst1[[j]] %>% filter(ID == i)
    # add worksheet
    addWorksheet(tempwb, sheetName = lst1[[j]]) # I want sheetName = the data's name; what should I do? This must be wrong
  }
  ## How can I load the subset into each sheet?
  file.name <- paste0(i, ".xlsx")
  ### save workbook
  saveWorkbook(tempwb, paste0(output_dir, file.name), overwrite = TRUE)
}
library(tidyverse)
library(writexl)
lst1 <- list(data1 = mpg, data2 = mpg, data3 = mpg, `data4/\\bad name` = mpg)
# Remove any illegal characters from names:
names(lst1) <- names(lst1) %>%
  stringr::str_replace_all("[:punct:]", " ")
IDlist <- mpg %>% pull(cyl) %>% unique()
make_one_xlsx <- function(this_id) {
  lst1 %>% map(~filter(., cyl == this_id)) %>% write_xlsx(paste0("ID_", this_id, ".xlsx"))
}
IDlist %>% map(make_one_xlsx)
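If you would rather keep your original openxlsx loop, here is a minimal sketch of the fixes (assuming, as in your code, that each data.frame has an ID column and that output_dir is defined): sheetName must be a character string, and writeData() is what loads the subset into each sheet.
library(openxlsx)
library(dplyr)
for (i in IDlist) {
  tempwb <- createWorkbook()
  for (j in seq_along(lst1)) {
    data.subset <- lst1[[j]] %>% filter(ID == i)
    # use the list element's name as the sheet name, not the data.frame itself
    addWorksheet(tempwb, sheetName = names(lst1)[j])
    # write the ID-specific subset into the sheet just created
    writeData(tempwb, sheet = names(lst1)[j], x = data.subset)
  }
  saveWorkbook(tempwb, paste0(output_dir, i, ".xlsx"), overwrite = TRUE)
}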
You should show an example of your data to get better answers.
You could use the xlsx library:
library('xlsx')
for (i in seq_along(lst1)) {
  if (i == 1) {
    append <- FALSE
  } else {
    append <- TRUE
  }
  write.xlsx(lst1[i], file = 'file_name.xlsx',
             sheetName = 'get your id(sheet) name', append = append)
}

How to rename filenames considering their IDs

I'm a beginner with R programming. I have downloaded many pictures which have their ID as the file name, for example "senador588", "senador3", "senador16" and so on. Each picture shows one senator of Brazil. I need the name instead of the ID.
I also have a dataframe which displays only the ID (id_senador) and the name (name_lower).
This first part of the code downloads all the pictures:
library(data.table)
library(rvest)
library(lubridate)
library(stringr)
library(dplyr)
library(RCurl)
library(XML)
library(httr)
library(purrr)
# all the senators of Brazil
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
# get all url on the webpage
url2 <- getURL(url)
parsed <- htmlParse(url2)
links <- xpathSApply(parsed,path = "//a",xmlGetAttr,"href")
links <- do.call(rbind.data.frame, links)
colnames(links)[1] <- "links"
# filtering to get the urls of the senators
links_senador <- links %>%
filter(links %like% "/senadores/senador/")
links_senador <- data.frame(links_senador)
# creating a new directory for the pics
setwd("~/Downloads/")
dir.create("senadores-new")
setwd("~/Downloads/senadores-new")
# running a loop to download all pictures
i <- 1
while (i <= 81) {
  tryCatch({
    # defining the row of each senator
    foto_webpage <- data.frame(links_senador$links[i])
    # renaming the column's name
    colnames(foto_webpage) <- "links"
    # getting all images of the html page and
    # filtering for the photo which we want
    html <- as.character(foto_webpage$links) %>%
      httr::GET() %>%
      xml2::read_html() %>%
      rvest::html_nodes("img") %>%
      map(xml_attrs) %>%
      map_df(~as.list(.)) %>%
      filter(src %like% "senadores/img/fotos-oficiais/") %>%
      as.data.frame(html)
    # downloading the photo
    foto_senador <- html$src
    download.file(foto_senador, basename(foto_senador), mode = "wb", header = TRUE)
    Sys.sleep(3)
  }, error = function(e) return(NULL))
  i <- i + 1
}
This second part creates a dataframe with the ID and name of each senator:
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
file <- read_html(url)
tables <- html_nodes(file, "table")
table1 <- html_table(tables[1], fill = TRUE, header = T)
table1_df <- as.data.frame(table1)[1]
table1_df_sem_acentuacao <- as.data.frame(iconv(table1_df$Nome, from = "UTF-8", to = "ASCII//TRANSLIT"))
colnames(table1_df_sem_acentuacao) <- "senador_lower"
table1_df_lower <- as.data.frame(tolower(table1_df_sem_acentuacao$senador_lower))
colnames(table1_df_lower) <- "senador_lower"
table_name_final <- as.data.frame(gsub(" ", "-", table1_df_lower$senador_lower))
id_split <- as.data.frame(gsub("https://www25.senado.leg.br/web/senadores/senador/-/perfil/", "senador", links_senador$links))
table_dfs_final <- cbind(table_name_final, id_split)
colnames(table_dfs_final)[1] <- "name_lower"
colnames(table_dfs_final)[2] <- "id_senador"
For the loop to replace the ID with the name, I tried this:
for (p in photos) {
  id <- basename(p)
  id <- gsub(".jpg$", "", id)
  name <- table_dfs_final$name_lower[match(id, basename(table_dfs_final$id_senador))]
  fname <- paste0(table_dfs_final$id_senador, ".jpg")
  file.rename(p, fname)
  # optional
  cat("renaming", basename(p), "to", name, "\n")
}
To make it more the "R way", you can use one of the functions from the apply family: create a function that changes the names, then just apply it to the ID and name columns you created.
changeName <- function(old_name, new_name) {
  file.rename(paste0(old_name, '.jpg'), paste0(new_name, '.jpg'))
}
mapply(changeName, table_dfs_final$id_senador, table_dfs_final$name_lower)
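Note that file.rename() is itself vectorised, so the mapply() call can be replaced by a single call if you prefer (a small sketch, assuming the .jpg files sit in the current working directory):
# file.rename() accepts vectors of equal length; it returns a logical
# vector indicating which renames succeeded
ok <- file.rename(
  from = paste0(table_dfs_final$id_senador, ".jpg"),
  to   = paste0(table_dfs_final$name_lower, ".jpg")
)
sum(!ok) # number of files that could not be renamed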

R: Trouble in appending rows in a dataframe from webscraping in r [duplicate]

This question already has answers here:
Error in if/while (condition) {: missing Value where TRUE/FALSE needed
(4 answers)
Closed 5 years ago.
I have a dataframe with 7 rows and 1 column, which contains links to a website. I'm trying to extract data from those various links and store it in a data frame, but I am not able to append the rows. I'm also checking whether a link has no records (which I check through an html attribute of that link); if so, I skip that link and proceed to the next one. I'm also trying to fetch data for multiple pages of each link.
Here is reproducible data:
text1="http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom="
text3="&proptype="
text4="Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment"
text5="&cityName=Thane&BudgetMin="
text6="&BudgetMax="
bhk=c("1","2","3","4","5",">5")
budg_min=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
budg_max=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
eg <- expand.grid(bhk = bhk, budg_min = budg_min, budg_max = budg_max)
eg <- eg[as.integer(eg$budg_min) <= as.integer(eg$budg_max),]
uuu <- sprintf("%s%s%s%s%s%s%s%s", text1,eg[,1],text3,text4,text5,eg[,2],text6,eg[,3])
uuu_df1 <- data.frame(x = uuu[1:7])
dput(uuu_df1)
I have 3 solutions for this, but none seems to work.
Solution #1
urlList <- llply(uuu_df1[,1], function(url) {
  this_pg <- read_html(url)
  results_count <- this_pg %>%
    xml_find_first(".//span[@id='resultCount']") %>%
    xml_text() %>%
    as.integer()
  if (results_count > 0) {
    cards <- this_pg %>%
      xml_find_all('//div[@class="SRCard"]')
    df <- ldply(cards, .fun = function(x) {
      y <- data.frame(wine = x %>% xml_find_first('.//span[@class="agentNameh"]') %>% xml_text(),
                      excerpt = x %>% xml_find_first('.//div[@class="postedOn"]') %>% xml_text(),
                      locality = x %>% xml_find_first('.//span[@class="localityFirst"]') %>% xml_text(),
                      society = x %>% xml_find_first('.//div[@class="labValu"]') %>% xml_text() %>% gsub('\\n', '', .))
      return(y)
    })
  } else {
    df <- NULL
  }
  return(df)
}, .progress = 'text')
names(urlList) <- uuu_df1[,1]
a <- bind_rows(urlList)
The above code gives me the error: Error in if (results_count > 0) { : missing value where TRUE/FALSE needed
Solution #2
urlList <- lapply(uuu_df1[,1], function(url) {
  UrlPage <- html(as.character(url))
  ImgNode <- UrlPage %>% html_node("div.noResultHead")
  u <- paste("No", word(string = as(ImgNode, "character"), start = 4, end = 5), sep = " ")
  cat(".")
  pg <- read_html(url)
  if (u != "No Results Found!") {
    df <- data.frame(wine = html_text(html_nodes(pg, ".agentNameh")),
                     excerpt = html_text(html_nodes(pg, ".postedOn")),
                     locality = html_text(html_nodes(pg, ".localityFirst")),
                     society = html_text(html_nodes(pg, '.labValu .stop-propagation:nth-child(1)')),
                     stringsAsFactors = FALSE)
  } else {
    # ASSIGN EMPTY DATAFRAME (FOR CONSISTENT STRUCTURE)
    df <- data.frame(wine = character(), excerpt = character(), locality = character(), society = character())
  }
  # RETURN NAMED LIST
  return(list(UrlPage = UrlPage, ImgNode = ImgNode, u = u, df = df))
})
# ROW BIND ONLY THE DATAFRAME ELEMENT FROM EACH LIST
wines <- map_df(urlList, function(u) u$df)
The above code gives an empty dataframe.
Solution #3
uuu_df1 <- data.frame(x = uuu_df[1:7,])
wines <- data.frame()
url_test <- c()
UrlPage_test <- c()
u <- c()
ImgNode <- c()
pg <- c()
for (i in 1:dim(uuu_df1)[1]) {
  url_test[i] <- as.character(uuu_df1[i,])
  UrlPage_test[i] <- html(url_test[i])
  ImgNode[i] <- UrlPage_test[i] %>% html_node("div.noResultHead")
  u[i] <- ImgNode[i]
  u[i] <- as(u[i], "character")
  u[i] <- paste("No", word(string = u, start = 4, end = 5), sep = " ")
  if (u[i] == "No Results Found!") next
  {
    map_df(1:5, function(i) { # here 1:5 is the number of webpages of the website
      # simple but effective progress indicator
      cat(".")
      pg[i] <- read_html(sprintf(url_test[i], i))
      data.frame(wine = html_text(html_nodes(pg[i], ".agentNameh")),
                 excerpt = html_text(html_nodes(pg[i], ".postedOn")),
                 locality = html_text(html_nodes(pg[i], ".localityFirst")),
                 society = html_text(html_nodes(pg[i], '.labValu .stop-propagation:nth-child(1)')),
                 stringsAsFactors = FALSE)
    }) -> wines
  }
}
The above code also gives an error:
Error in UseMethod("xml_find_first") :
no applicable method for 'xml_find_first' applied to an object of class "list"
In addition: Warning messages:
1: 'html' is deprecated.
Use 'read_html' instead.
See help("Deprecated")
2: In UrlPage_test[i] <- html(url_test[i]) :
number of items to replace is not a multiple of replacement length
Any suggestions on how the code can be corrected so that my requirement is met? Thanks in advance.
Solution #1
The missing value where TRUE/FALSE needed error is raised when the condition of an if evaluates to NA, as in:
if (NA > 0) {
  do something
}
So replace your if condition
if (results_count > 0)
with
if (!is.na(results_count) && results_count > 0)
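In context, the guard would look like this (a sketch of just the changed part of Solution #1; xml_find_first() returns a missing node when nothing matches, which is why results_count can end up NA):
results_count <- this_pg %>%
  xml_find_first(".//span[@id='resultCount']") %>%
  xml_text() %>%
  as.integer()
# NA means the count element was not found on the page; treat it as "no results"
if (!is.na(results_count) && results_count > 0) {
  # ... parse the cards as before ...
}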

R dplyr mutate with HTML (or XML) documents, nodes, or node sets

I have a file with multiple HTML links in it, and now I want to use dplyr and rvest to get the link to the image for the URL in each row.
When I do it manually it works fine and returns the row, but when the same code is called within a function it fails with the following error:
Error: no applicable method for 'xml_find_all' applied to an object of
class "factor"
I don't know what I'm doing wrong. Any help is appreciated. To make my question clearer, I have added (in comments) a few example rows and also shown the manual approach.
library(rvest)
library(dplyr)
library(httr) # contains function stop_for_status()
#get html links from file
#EXAMPLE
# "_id",url
# 560fc55c65818bee0b77ec33,http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html
# 560fc57e65818bee0b78d8b7,http://www.seriouseats.com/recipes/2008/07/pasta-arugula-tomatoes-recipe.html
# 560fc57e65818bee0b78dcde,http://www.seriouseats.com/recipes/2007/08/cook-the-book-minty-boozy-chic.html
# 560fc57e65818bee0b78de93,http://www.seriouseats.com/recipes/2010/02/chipped-beef-gravy-on-toast-stew-on-a-shingle-recipe.html
# 560fc57e65818bee0b78dfe6,http://www.seriouseats.com/recipes/2011/05/dinner-tonight-quinoa-salad-with-lemon-cream.html
# 560fc58165818bee0b78e65e,http://www.seriouseats.com/recipes/2010/10/dinner-tonight-spicy-quinoa-salad-recipe.html
#
#load into SE
#
SE <- read.csv("~/Desktop/SeriousEats.csv")
#
#function to retrieve imgPath per URL
#using rvest
#
getImgPath <- function(x) {
  imgPath <- x %>% html_nodes(".photo") %>% html_attr("src")
  stop_for_status(res)
  return(imgPath)
}
#This works fine
#UrlPage <- read_html ("http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html")
#imgPath <- UrlPage %>% html_nodes(".photo") %>% html_attr("src")
#
#This throws an error msg
#
S <- mutate(SE, imgPath = getImgPath(SE$url))
This works, because rowwise() makes mutate() pass getImgPath() one url at a time instead of the whole column (which in your case is a factor):
library(rvest)
library(dplyr)
# SE <- data_frame(url = c(
# "http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html",
# "http://www.seriouseats.com/recipes/2008/07/pasta-arugula-tomatoes-recipe.html"
# ))
SE <- read.csv('/path/to/SeriousEats.csv', stringsAsFactors = FALSE)
getImgPath <- function(x) {
  # x must be "a document, a node set or a single node" per the rvest documentation; it cannot be a factor or character, hence read_html() first
  imgPath <- read_html(x) %>% html_nodes(".photo") %>% html_attr("src")
  # httr::stop_for_status(res) # OP said this is not necessary, so I removed it
  return(imgPath)
}
S <- SE %>%
  rowwise() %>%
  mutate(imgPath = getImgPath(url))
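If you prefer to avoid rowwise(), a list-column via purrr is another option (a sketch, not part of the original answer; it also copes with pages that carry more than one .photo node):
library(purrr)
S <- SE %>%
  mutate(imgPath = map(url, getImgPath)) # list-column: one character vector of img srcs per page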
Thanks for the help and patience, @Jubbles. For the benefit of others, here is the complete answer.
library(rvest)
library(dplyr)
library(RCurl) # for url.exists()
SE <- read.csv("~/Desktop/FILE.txt", stringsAsFactors = FALSE)
getImgPath <- function(x) {
  if (isTRUE(try(url.exists(x)))) {
    imgPath <- read_html(x) %>% # html() is deprecated; read_html() is the current rvest function
      html_nodes(".photo") %>%
      html_attr("src")
  } else {
    imgPath <- NA_character_ # a real NA is easier to filter later than the string "NA"
  }
  return(imgPath)
}
SE1 <- SE %>%
  rowwise() %>%
  mutate(imgPath = getImgPath(url))
