I have a dataframe uuu_df with records as links of website
dim(uuu_df)
output
1950 1
uuu_df
1) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=1&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs
2) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=2&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs
3) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=3&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs
.
.
.
1950) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=>5&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=20-Crores&BudgetMax=20-Crores
here I'm trying to scrape data using those multiple links from the dataframe along with the condition i.e. if the text of html attribute is equal to "No Results Found!" then skip that record and move on to next record,
this is the snippet of that scraping
UrlPage <- html("http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=2&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs")
ImgNode <- UrlPage %>% html_node("div.noResultHead")
u=ImgNode
u=as(u,"character")
u=paste("No",word(string = u, start = 4, end = 5),sep = " ")
Here is what I have tried
wines=data.frame()
url_test=c()
UrlPage_test=c()
u=c()
ImgNode=c()
for(i in 1:dim(uuu_df)[1]){
url_test[i]=as.character(uuu_df[i,])
UrlPage_test[i] <- html(url_test[i])
ImgNode[i] <- UrlPage_test[i] %>% html_node("div.noResultHead")
u[i]=ImgNode[i]
u[i]=as(u[i],"character")
u[i]=paste("No",word(string = u, start = 4, end = 5),sep = " ")
if(u[i]=="No Results Found!") next
{
map_df(1:5, function(i) # here 1:5 is number of webpages of a website
{
# simple but effective progress indicator
cat(".")
pg <- read_html(sprintf(url_test, i))
data.frame(wine=html_text(html_nodes(pg, ".agentNameh")),
excerpt=html_text(html_nodes(pg, ".postedOn")),
locality=html_text(html_nodes(pg,".localityFirst")),
society=html_text(html_nodes(pg,'.labValu .stop-propagation:nth-child(1)')),
stringsAsFactors=FALSE)
}) -> wines
}
But the Wines dataframe gives me empty dataframe with empty rows and columns
Why is it not able to append rows inside it.
Any suggestion will be helpful. Thanks in advance
P.S: dput() of reproduciable data
text1="http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom="
text2="1"
text3="&proptype="
text4="Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment"
text5="&cityName=Thane&BudgetMin="
text6="&BudgetMax="
bhk=c("1","2","3","4","5",">5")
budg_min=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
budg_max=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
eg <- expand.grid(bhk = bhk, budg_min = budg_min, budg_max = budg_max)
eg <- eg[as.integer(eg$budg_min) <= as.integer(eg$budg_max),]
uuu <- sprintf("%s%s%s%s%s%s%s%s", text1,eg[,1],text3,text4,text5,eg[,2],text6,eg[,3])
uuu_df=data.frame(Links=uuu)
dput(uuu_df)
You should take advantage of the document tree to consistently find the elements you need and control the flow of the loop or vectorized function. In the example below I check the result count to determine if there are results, then parse each node individually to ensure it's consistent. Finally, you can bind them if needed.
Side Note: llply has the .progress argument which more elegantly handles the progress indicator you were trying to devise using cat().
options(stringsAsFactors = FALSE)
library(plyr)
library(dplyr)
library(xml2)
uuu_df <- data.frame(x = c('http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=1&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs',
'http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=2&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs',
'http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=3&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=90-Lacs'))
urlList <- llply(uuu_df[,1], function(url){
this_pg <- read_html(url)
results_count <- this_pg %>%
xml_find_first(".//span[#id='resultCount']") %>%
xml_text() %>%
as.integer()
if(results_count > 0){
cards <- this_pg %>%
xml_find_all('//div[#class="SRCard"]')
df <- ldply(cards, .fun=function(x){
y <- data.frame(wine = x %>% xml_find_first('.//span[#class="agentNameh"]') %>% xml_text(),
excerpt = x %>% xml_find_first('.//div[#class="postedOn"]') %>% xml_text(),
locality = x %>% xml_find_first('.//span[#class="localityFirst"]') %>% xml_text(),
society = x %>% xml_find_first('.//div[#class="labValu"]') %>% xml_text() %>% gsub('\\n', '', .))
return(y)
})
} else {
df <- NULL
}
return(df)
}, .progress = 'text')
names(urlList) <- uuu_df[,1]
bind_rows(urlList)
Consider working with one large list built using lapply that iterates through url column of dataframe instead of managing many smaller vectors:
urlList <- lapply(uuu_df[1,], function(url){
UrlPage <- html(as.character(url))
ImgNode <- UrlPage %>% html_node("div.noResultHead")
u <- paste("No", word(string = as(ImgNode, "character"), start=4, end=5), sep=" ")
cat(".")
pg <- read_html(url)
if(u!="No Results Found!") {
df <- data.frame(wine=html_text(html_nodes(pg, ".agentNameh")),
excerpt=html_text(html_nodes(pg, ".postedOn")),
locality=html_text(html_nodes(pg,".localityFirst")),
society=html_text(html_nodes(pg,'.labValu .stop-propagation:nth-child(1)')),
stringsAsFactors=FALSE)
} else {
# ASSIGN EMPTY DATAFRAME (FOR CONSISTENT STRUCTURE)
df <- data.frame(wine=c(), excerpt=c(), locality=c(), society=c())
}
# RETURN NAMED LIST
return(list(UrlPage=UrlPage, ImgNode=ImgNode, u=u, df=df))
})
# ROW BIND ONLY DATAFRAME ELEMENT FROM LIST
wines <- map_df(urlList, function(u) u$df)
Related
I'm trying to extract data with the map function from {https://www.mcdonalds.com/de/de-de/produkte/alle-produkte/highlights.html} with all the category links and title and all the nutritional values and allergies from each category.
I'm not sure why mapping function is not working here, for all the links ??
PS: I'm creating a function factory at the end that reads the HTML and then combines the extracted data to a tibble.
library(dplyr)
library(readr)
library(rvest)
library(stringr)
library(tibble)
#function to extract links from website and storing them with product category
extract_links <- function(category, i){
# paste overview URLs categories
url <- paste0('https://www.mcdonalds.com/de/de-de/produkte/alle-produkte/', category,
'.html', i)
#extract urls and store them in a tibble together with the category
tibble(
category = url %>%
html_elements(".category-title") %>%
html_text2(),
links = url %>%
html_elements(".category-link") %>%
html_attr("href") %>%
str_c("https://www.mcdonalds.com", .)
)
}
# function to combine the tibble over all inputs
get_links<- function(categorys, num_pages){
i <- 1
repeat{
if(i == 1){
df <- extract_links(categorys, i)
print(paste(categorys, i))
} else {
temp_df <- extract_links(categorys, i)
if(any(duplicated(rbind(df, temp_df)))) break
df <- rbind(df, temp_df)
print(paste(category, i))
}
if(i >= num_pages) break
i <- i + 1
Sys.sleep(sample(2:10, 1))
}
return(df)
}
# vector of desired categorys
categorys<- c('Beliebte Produkte', ' Highlights', 'McMenü') #etc
data <- map2_dfr(categorys, c(2, 2), get_links)
The Tibble for the Nutritional values not sure how to deposit that with mapping function
library(tidyverse)
library(httr2)
"https://www.mcdonalds.com/dnaapp/itemDetails?country=de&language=de&showLiveData=true&item=201799" %>%
request() %>%
req_perform() %>%
resp_body_json(simplifyVector = TRUE) %>%
.$item %>%
.$nutrient_facts %>%
.$nutrient %>%
as_tibble %>%
select(4:9)
``
I'm trying to wrangle USR files (around 7,000) into a Long data format.
I've created the below, but it takes over 2 hours to run (hence the reason for adding the progress printer).
Does anyone have any idea how I can speed up this code? Are there specific lines that are slowing it down?
Thanks in advance!
for(i in D_flows){
flow <- read.table(i, header = F, fill = T, sep = "|")
for(j in flow){
Flow_name <- i
Timestamp <- ymd_hms(flow[flow$V1 == "ZHV",8])
Date <- ymd(flow[flow$V1 == "ZPD",2])
SR <- as.vector(flow[flow$V1 == "ZPD",3])
SP <- as.integer(as.vector(flow[flow$V1 == "SE1",2]))
EV <- as.numeric(as.character(flow[flow$V1 == "SE1" , 4]))
Flow_data <- tibble(Flow_name, Timestamp, Date, SR, SP, EV)
Flow_data <- Flow_data[complete.cases(Flow_data),]
Flow_data <- Flow_data %>%
group_by(SP) %>%
mutate(MEV = sum(EV)) %>%
select(Flow_name, Timestamp, Date, SR, SP, MEV) %>%
unique() %>%
ungroup()
}
#Append the flow data to the D Flow data file
D_flow_data <- bind_rows(D_flow_data, Flow_data)
#Shows the progress of the for loop
progress <- D_flow_data %>%
select(-Timestamp, -Date, -SR, -SP, -MEV) %>%
unique()
print(nrow(progress))
}
I have the following large dataframes:
Jan_Feb2019
Mar_Apr2019
May_Jun2019
Jul_Aug2019
Sep_Oct2019
Nov_Dec2019
Jan_Feb2020
Mar_2020
And i use the following code to generate other dataframes and fill the columns with the data i want.
#Jan_Feb2019
Jan_Feb2019_df <- as.data.frame(Jan_Feb2019$reactions$summary$total_count)
colnames(Jan_Feb2019_df)[1] <- "Reactions"
Jan_Feb2019_df$Shares <- Jan_Feb2019$shares$count
Jan_Feb2019_df$Comments <- Jan_Feb2019$comments$summary$total_count
Jan_Feb2019_df$Message <- Jan_Feb2019$message
Jan_Feb2019_df$Likes <- Jan_Feb2019$likes$summary$total_count
Jan_Feb2019_df$CreatedDate <- Jan_Feb2019$created_time
Jan_Feb2019_df$PostID <- Jan_Feb2019$id
Jan_Feb2019_df$Love <- Jan_Feb2019$reacts_love$summary$total_count
Jan_Feb2019_df$Angry <- Jan_Feb2019$reacts_angry$summary$total_count
Jan_Feb2019_df$Sad <- Jan_Feb2019$reacts_sad$summary$total_count
Jan_Feb2019_df$HAHA <- Jan_Feb2019$reacts_haha$summary$total_count
Jan_Feb2019_df$WOW <- Jan_Feb2019$reacts_wow$summary$total_count
Jan_Feb2019_df$CreatedDate <- anytime(Jan_Feb2019_df[,6])
Jan_Feb2019_df$insights.data <- Jan_Feb2019$insights$data
Jan_Feb2019_df <- Jan_Feb2019_df %>%
unnest(insights.data) %>%
unnest(values) %>%
select(Message,Shares,Comments,Reactions,Likes,CreatedDate,PostID,Love,Angry,Sad,HAHA,WOW,name,value) %>%
pivot_wider(names_from = name, values_from = value)
Is there a way to iterate between all the above dataframes, so i won't have to repeat the process 8 times?
Thanks
The code below is untested. I have tried to follow the code in the question, making it general. There are 2 functions.
fillNewDf takes the old object as only argument and creates and fills the new data frame.
makeNewDf takes the old object name as an argument and calls fillNewDf returning its value.
If the objects are in the global environment then makeNewDf argument envir default value is used.
fillNewDf <- function(X){
vec <- X[['reactions']][['summary']][['total_count']]
Y <- data.frame(Reactions = vec)
Y[['Shares']] <- X[['shares']][['count']]
Y[['Comments']] <- X[['comments']][['summary']][['total_count']]
Y[['Message']] <- X[['message']]
Y[['Likes']] <- X[['likes']][['summary']][['total_count']]
Y[['CreatedDate']] <- X[['created_time']]
Y[['PostID']] <- X[['id']]
Y[['Love']] <- X[['reacts_love']][['summary']][['total_count']]
Y[['Angry']] <- X[['reacts_angry']][['summary']][['total_count']]
Y[['Sad']] <- X[['reacts_sad']][['summary']][['total_count']]
Y[['HAHA']] <- X[['reacts_haha']][['summary']][['total_count']]
Y[['WOW']] <- X[['reacts_wow']][['summary']][['total_count']]
Y[['CreatedDate']] <- anytime(Y[, 6])
Y[['insights.data']] <- X[['insights']][['data']]
Y %>%
unnest(insights.data) %>%
unnest(values) %>%
select(Message, Shares, Comments, Reactions, Likes, CreatedDate, PostID, Love, Angry, Sad, HAHA, WOW, name, value) %>%
pivot_wider(names_from = name, values_from = value)
}
makeNewDf <- function(X, envir = .GlobalEnv){
DF <- get(X, envir = envir)
filNewDf(DF)
}
Now get the names of the objects to be processed with ls() and create a list with the new data frames.
old_names <- ls(pattern = '\\d{4}$')
new_list <- lapply(old_list, makeNewDf)
names(new_list) <- paste(old_names, "df", sep = "_")
If these new data frames are to become objects in the global environment, list2env(new_list) will create them with the same names as the names attribute of new_list.
I have a following problem:
I run a for loop that web scrapes HTML tables. I have to scrape values from two different tables and bind them together. My R code is here:
html <- data$html
tables <- list()
races <- list()
index <- 1
for (i in html){
try({
url <- i
table <- url %>%
html() %>%
html_nodes(xpath='//*[#id="Form1"]/table[4]') %>%
html_table(fill = TRUE)
race <- url %>%
html() %>%
html_nodes(xpath='//*[#id="Form1"]/table[1]') %>%
html_table(fill = TRUE)
tables[index] <- table
races[index] <- race
raceDF <- data.frame(matrix(unlist(race), nrow=4, ncol = 2))
date <- raceDF$X2[3]
tableDF <- do.call(rbind, tables)
tableDF$Date <- date
index <- index + 1
})
}
My code sucesfully does tableDF, however it does not add Date to each observation. It just put the same date for all rows, which is bad.
How can I fix it please?
I have two columns "name" and "link" which indicates the name of the firm and the link to scrape from. How do I put back the name column so that each row of scraped data will have a column with the corresponding name of the firm?
List.Of.Tabs <- map(pages, ~ {
name <- .x[1]
link <- .x[2]
webpage <- read_html(link)
tbls <- html_nodes(webpage, "table")
tbls_ls <- html_table(tbls,fill = TRUE)
pos1 <- possibly(function(tbls) bind_rows(tbls) %>%
filter_all(any_vars(. %in% c("Ireland", "Japan")))
, otherwise = NA)
pos1(tbls_ls)
Results should be something like that:
results <- data.frame(subsidiaries = c('Microsoft Japan','Microsoft Ireland'),
country = c('Japan', 'Ireland'),
name = c('Microsoft','Microsoft'))
Based on the code, we may need to mutate (i.e. creating a column of 'name' from the already created 'name' in the first line of code)
List.Of.Tabs <- map(pages, ~ {
name <- .x[1]
link <- .x[2]
webpage <- read_html(link)
tbls <- html_nodes(webpage, "table")
tbls_ls <- html_table(tbls,fill = TRUE)
pos1 <- possibly(function(tbls) bind_rows(tbls) %>%
filter_all(any_vars(. %in% c("Ireland", "Japan"))) %>%
mutate(name = name) # changed here
, otherwise = NA)
pos1(tbls_ls)
})