I'm trying to wrangle around 7,000 USR files into a long data format.
I've written the code below, but it takes over two hours to run (hence the progress printer).
Does anyone have any idea how I can speed this code up? Are there specific lines that are slowing it down?
Thanks in advance!
for(i in D_flows){
  flow <- read.table(i, header = F, fill = T, sep = "|")
  for(j in flow){
    Flow_name <- i
    Timestamp <- ymd_hms(flow[flow$V1 == "ZHV", 8])
    Date <- ymd(flow[flow$V1 == "ZPD", 2])
    SR <- as.vector(flow[flow$V1 == "ZPD", 3])
    SP <- as.integer(as.vector(flow[flow$V1 == "SE1", 2]))
    EV <- as.numeric(as.character(flow[flow$V1 == "SE1", 4]))
    Flow_data <- tibble(Flow_name, Timestamp, Date, SR, SP, EV)
    Flow_data <- Flow_data[complete.cases(Flow_data), ]
    Flow_data <- Flow_data %>%
      group_by(SP) %>%
      mutate(MEV = sum(EV)) %>%
      select(Flow_name, Timestamp, Date, SR, SP, MEV) %>%
      unique() %>%
      ungroup()
  }
  # Append the flow data to the D Flow data file
  D_flow_data <- bind_rows(D_flow_data, Flow_data)
  # Show the progress of the for loop
  progress <- D_flow_data %>%
    select(-Timestamp, -Date, -SR, -SP, -MEV) %>%
    unique()
  print(nrow(progress))
}
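For anyone looking at this later, one likely culprit: the inner for(j in flow) loop iterates over every column of flow but never uses j, so the whole extraction is repeated once per column instead of once per file, and calling bind_rows inside the loop re-grows D_flow_data on every iteration. Below is a minimal sketch of a one-pass-per-file version, assuming the same tidyverse/lubridate setup and the same USR column positions as above; read_flow is a hypothetical helper name and this is untested against real files:

read_flow <- function(path) {
  flow <- read.table(path, header = FALSE, fill = TRUE, sep = "|")
  # build the per-file tibble once; length-1 ZHV/ZPD fields recycle across the SE1 rows
  Flow_data <- tibble(
    Flow_name = path,
    Timestamp = ymd_hms(flow[flow$V1 == "ZHV", 8]),
    Date      = ymd(flow[flow$V1 == "ZPD", 2]),
    SR        = as.vector(flow[flow$V1 == "ZPD", 3]),
    SP        = as.integer(as.vector(flow[flow$V1 == "SE1", 2])),
    EV        = as.numeric(as.character(flow[flow$V1 == "SE1", 4]))
  )
  Flow_data <- Flow_data[complete.cases(Flow_data), ]
  Flow_data %>%
    group_by(SP) %>%
    mutate(MEV = sum(EV)) %>%
    select(Flow_name, Timestamp, Date, SR, SP, MEV) %>%
    distinct() %>%
    ungroup()
}
# one bind at the end instead of one per file
D_flow_data <- bind_rows(lapply(D_flows, read_flow))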
So, I've recently been building a music recommender system using collaborative filtering in RStudio. I have a problem with the cosine similarity function: the system reports "subscript out of bounds" on the matrix I want to calculate.
I'm using the cosine similarity approach from this post: https://bgstieber.github.io/post/recommending-songs-using-cosine-similarity-in-r/
I've tried to fix the script, but the output still isn't working.
##cosinesim-crossprod
cosine_sim <- function(a,b) {crossprod(a,b)/sqrt(crossprod(a)*crossprod(b))}
##User data
play_data <- "https://static.turi.com/datasets/millionsong/10000.txt" %>%
read_tsv(col_names = c('user', 'song_id', 'plays'))
##Song data
song_data <- read_csv("D:/3rd Term/DataAnalysis/dataSet/song_data.csv") %>%
distinct(song_id, title, artist_name)
##Grouped
all_data <- play_data %>%
group_by(user, song_id) %>%
summarise(plays = sum(plays, na.rm = TRUE)) %>%
inner_join(song_data)
top_1k_songs <- all_data %>%
group_by(song_id, title, artist_name) %>%
summarise(sum_plays = sum(plays)) %>%
ungroup() %>%
top_n(1000, sum_plays) %>%
distinct(song_id)
all_data_top_1k <- all_data %>%
inner_join(top_1k_songs)
top_1k_wide <- all_data_top_1k %>%
ungroup() %>%
distinct(user, song_id, plays) %>%
spread(song_id, plays, fill = 0)
ratings <- as.matrix(top_1k_wide[,-1])
##Function
calc_cos_sim <- function(song_code = top_1k_songs,
                         rating_mat = ratings,
                         songs = song_data,
                         return_n = 5) {
  song_col_index <- which(colnames(ratings)== song_code) %>%
  cos_sims <- apply(rating_mat, 2, FUN = function(y)
    cosine_sim(rating_mat[,song_col_index], y))
  ##output
  data_frame(song_id = names(cos_sims), cos_sim = cos_sims) %>%
    filter(song_id != song_code) %>% # remove self reference
    inner_join(songs) %>%
    arrange(desc(cos_sim)) %>%
    top_n(return_n, cos_sim) %>%
    select(song_id, title, artist_name, cos_sim)
}
I expect that when I run this script:
shots <- 'SOJYBJZ12AB01801D0'
knitr::kable(calc_cos_sim(shots))
The output would be a data frame of 5 songs.
The pipe at the end of this line looks like a typo:
song_col_index <- which(colnames(ratings)== song_code) %>%
Replace it with:
song_col_index <- which(colnames(ratings)== song_code)
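Two smaller things worth flagging as well: the function body subsets the global ratings rather than its rating_mat argument, and song_code defaults to the whole top_1k_songs data frame. Here is a cleaned-up sketch of the same function, assuming the ratings, song_data and cosine_sim objects defined above:

calc_cos_sim <- function(song_code,
                         rating_mat = ratings,
                         songs = song_data,
                         return_n = 5) {
  # look the song up in the matrix actually being used
  song_col_index <- which(colnames(rating_mat) == song_code)
  cos_sims <- apply(rating_mat, 2,
                    FUN = function(y) cosine_sim(rating_mat[, song_col_index], y))
  data_frame(song_id = names(cos_sims), cos_sim = as.numeric(cos_sims)) %>%
    filter(song_id != song_code) %>% # remove self reference
    inner_join(songs, by = "song_id") %>%
    arrange(desc(cos_sim)) %>%
    top_n(return_n, cos_sim) %>%
    select(song_id, title, artist_name, cos_sim)
}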
I have two kables. Both will output correctly as data frames, but I want them to be kables, as that just looks smarter. The first one works fine as a kable:
#Build tables and convert to df
#First time=N
firstTimeNWork <- as.data.frame(tbl(SDreams, "PassengersInfo") %>% filter(FirstTime == 0 & WorkHoliday == 1)
%>% select(FirstTime,WorkHoliday,Price))
firstTimeNWorkN <- as.data.frame(tbl(SDreams, "PassengersInfo") %>% filter(FirstTime == 0 & WorkHoliday == 0)
%>% select(FirstTime,WorkHoliday,Price))
#First time=Y
firstTimeYWork <- as.data.frame(tbl(SDreams, "PassengersInfo") %>% filter(FirstTime == 1 & WorkHoliday == 1)
%>% select(FirstTime,WorkHoliday,Price))
firstTimeYWorkN <- as.data.frame(tbl(SDreams, "PassengersInfo") %>% filter(FirstTime == 1 & WorkHoliday == 0)
%>% select(FirstTime,WorkHoliday,Price))
#Setup headings for the outputted table
FirstTime <- c('No',' ','Yes',' ')
WorkHoliday<- c('Yes','No','Yes','No')
#Find out average amount paid based on the tables i created
#First time=N
firstTimeNWorkMean <- mean(firstTimeNWork$Price)
firstTimeNWorkNMean <- mean(firstTimeNWorkN$Price)
#First time=Y
firstTimeYWorkMean <- mean(firstTimeYWork$Price)
firstTimeYWorkNMean <- mean(firstTimeYWorkN$Price)
#Add values under heading
MeanPrice <-c(firstTimeNWorkMean,firstTimeNWorkNMean,firstTimeYWorkMean,firstTimeYWorkNMean)
#Create the output df
passengerSummary1 <- data.frame(FirstTime,WorkHoliday,MeanPrice)
#Label columns & display
kable(passengerSummary1,
col.names = c("First time travelling?","As part of work?","Average price paid in (£)"),
align = c("c","c","c"),digits = 0)
However, the second kable won't create:
#Create tables
wouldVisitAgainNPrice <- as.data.frame(tbl(SDreams, "PassengersInfo") %>% filter(Propensity == 1)
%>% select(Propensity,Price))
wouldVisitAgainYPrice <- as.data.frame(tbl(SDreams, "PassengersInfo") %>% filter(Propensity == 2)
%>% select(Propensity,Price))
#Count amount of passengers based on filter
wouldVisitAgainNPriceTab <-tabulate(wouldVisitAgainNPrice$Propensity)
wouldVisitAgainYPriceTab <-tabulate(wouldVisitAgainYPrice$Propensity)[[2]]
#Setup column
WouldVisitAgain <-c('No','Yes')
#Add column with values
Passengers <-c(wouldVisitAgainNPriceTab,wouldVisitAgainYPriceTab)
#Means of prices
wouldVisitAgainNPriceMean <-mean(wouldVisitAgainNPrice$Price)
wouldVisitAgainYPriceMean <-mean(wouldVisitAgainYPrice$Price)
#Add column with price values
MeanPrice2 <-c(wouldVisitAgainNPriceMean,wouldVisitAgainYPriceMean)
#Create df
passengerSummary2 <-data.frame(WouldVisitAgain,Passengers,MeanPrice2)
#Label columns & display
kable(passengerSummary2,
col.names = c("Would visit again?","No of Passengers","Mean Price in (£)"),
align = c("c","c","c"),digits = 0)
And as you can tell, the kable code is exactly the same, so I am very confused as to why it won't work. I have tried updating all my packages, restarting RStudio, and removing objects, but none of that has made any difference.
Sorry in advance for the amount of code and how bad it is. I am new to this, so I am aware I have likely done something pretty stupid; if you ask why I did it, my only answer will be that I'm new, and I'll remember the advice and maybe try it out if I have time.
Thanks in advance if anyone can help with this :)
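A hedged note for later readers, since no error message is shown above: kable() itself rarely fails when handed a valid data frame, so the first thing to check is whether passengerSummary2 was actually built, e.g. with str(passengerSummary2). The tabulate() counting step is also fragile, because tabulate() returns one count per integer value rather than a single number. Counting rows directly is simpler; a minimal sketch assuming the same filtered tables:

# count passengers by counting rows of each filtered table
wouldVisitAgainNPriceTab <- nrow(wouldVisitAgainNPrice)  # Propensity == 1
wouldVisitAgainYPriceTab <- nrow(wouldVisitAgainYPrice)  # Propensity == 2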
I have a dataframe uuu_df whose records are links to a website:
dim(uuu_df)
output
1950 1
uuu_df
1) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=1&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs
2) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=2&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs
3) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=3&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs
.
.
.
1950) http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=>5&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=20-Crores&BudgetMax=20-Crores
Here I'm trying to scrape data using those links from the dataframe, with one condition: if the text of the html attribute equals "No Results Found!", skip that record and move on to the next one.
This is the snippet of that scraping:
UrlPage <- html("http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=2&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs")
ImgNode <- UrlPage %>% html_node("div.noResultHead")
u=ImgNode
u=as(u,"character")
u=paste("No",word(string = u, start = 4, end = 5),sep = " ")
Here is what I have tried
wines=data.frame()
url_test=c()
UrlPage_test=c()
u=c()
ImgNode=c()
for(i in 1:dim(uuu_df)[1]){
url_test[i]=as.character(uuu_df[i,])
UrlPage_test[i] <- html(url_test[i])
ImgNode[i] <- UrlPage_test[i] %>% html_node("div.noResultHead")
u[i]=ImgNode[i]
u[i]=as(u[i],"character")
u[i]=paste("No",word(string = u, start = 4, end = 5),sep = " ")
if(u[i]=="No Results Found!") next
{
map_df(1:5, function(i) # here 1:5 is number of webpages of a website
{
# simple but effective progress indicator
cat(".")
pg <- read_html(sprintf(url_test, i))
data.frame(wine=html_text(html_nodes(pg, ".agentNameh")),
excerpt=html_text(html_nodes(pg, ".postedOn")),
locality=html_text(html_nodes(pg,".localityFirst")),
society=html_text(html_nodes(pg,'.labValu .stop-propagation:nth-child(1)')),
stringsAsFactors=FALSE)
}) -> wines
}
But the wines dataframe comes out empty, with no rows and no columns.
Why is it not able to append rows?
Any suggestion will be helpful. Thanks in advance.
P.S.: dput() of reproducible data:
text1="http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom="
text2="1"
text3="&proptype="
text4="Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment"
text5="&cityName=Thane&BudgetMin="
text6="&BudgetMax="
bhk=c("1","2","3","4","5",">5")
budg_min=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
budg_max=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
eg <- expand.grid(bhk = bhk, budg_min = budg_min, budg_max = budg_max)
eg <- eg[as.integer(eg$budg_min) <= as.integer(eg$budg_max),]
uuu <- sprintf("%s%s%s%s%s%s%s%s", text1,eg[,1],text3,text4,text5,eg[,2],text6,eg[,3])
uuu_df=data.frame(Links=uuu)
dput(uuu_df)
You should take advantage of the document tree to consistently find the elements you need and to control the flow of the loop or vectorized function. In the example below I check the result count to determine whether there are results, then parse each node individually to ensure the output is consistent. Finally, you can bind the results if needed.
Side note: llply has a .progress argument, which handles the progress indicator more elegantly than the cat() approach you were devising.
options(stringsAsFactors = FALSE)
library(plyr)
library(dplyr)
library(xml2)
uuu_df <- data.frame(x = c('http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=1&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs',
'http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=2&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=5-Lacs',
'http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=3&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Thane&BudgetMin=5-Lacs&BudgetMax=90-Lacs'))
urlList <- llply(uuu_df[,1], function(url){
  this_pg <- read_html(url)
  results_count <- this_pg %>%
    xml_find_first(".//span[@id='resultCount']") %>%
    xml_text() %>%
    as.integer()
  if(results_count > 0){
    cards <- this_pg %>%
      xml_find_all('//div[@class="SRCard"]')
    df <- ldply(cards, .fun=function(x){
      y <- data.frame(wine = x %>% xml_find_first('.//span[@class="agentNameh"]') %>% xml_text(),
                      excerpt = x %>% xml_find_first('.//div[@class="postedOn"]') %>% xml_text(),
                      locality = x %>% xml_find_first('.//span[@class="localityFirst"]') %>% xml_text(),
                      society = x %>% xml_find_first('.//div[@class="labValu"]') %>% xml_text() %>% gsub('\\n', '', .))
      return(y)
    })
  } else {
    df <- NULL
  }
  return(df)
}, .progress = 'text')
names(urlList) <- uuu_df[,1]
bind_rows(urlList)
Consider working with one large list built using lapply, iterating through the url column of the dataframe, instead of managing many smaller vectors:
urlList <- lapply(uuu_df[,1], function(url){
  UrlPage <- read_html(as.character(url))   # read_html(), since html() is deprecated in rvest
  ImgNode <- UrlPage %>% html_node("div.noResultHead")
  u <- paste("No", word(string = as(ImgNode, "character"), start=4, end=5), sep=" ")
  cat(".")   # simple progress indicator
  if(u != "No Results Found!") {
    df <- data.frame(wine=html_text(html_nodes(UrlPage, ".agentNameh")),
                     excerpt=html_text(html_nodes(UrlPage, ".postedOn")),
                     locality=html_text(html_nodes(UrlPage, ".localityFirst")),
                     society=html_text(html_nodes(UrlPage, '.labValu .stop-propagation:nth-child(1)')),
                     stringsAsFactors=FALSE)
  } else {
    # ASSIGN EMPTY DATAFRAME (FOR CONSISTENT STRUCTURE)
    df <- data.frame(wine=c(), excerpt=c(), locality=c(), society=c())
  }
  # RETURN NAMED LIST
  return(list(UrlPage=UrlPage, ImgNode=ImgNode, u=u, df=df))
})
# ROW BIND ONLY DATAFRAME ELEMENT FROM LIST (map_df() comes from purrr)
wines <- map_df(urlList, function(u) u$df)
I think this is an interesting task: optimizing a piece of R code.
I have a dataframe df_red containing order details from a webshop. For each product (ean), I want to find the 12 other products most likely to be in a basket with it.
This is the sample code to generate such data set:
library(tidyverse)
# create a vector with 1400 products (characterized by their EANs)
eans <- sample(1e5:1e6, 1400, replace = FALSE)
# create a vector with 200k orders
basket_nr <- 1:2e5
# a basket can have up to 4 items, it's most likely to have 3 items
n_prod_per_basket <- sample(x = 1:4, length(basket_nr), prob = c(0.2, 0.2, 0.5, 0.1), replace = TRUE)
# create df_red, each line of which corresponds to a product with its respective basket number
df <- data_frame(basket_nr, n_prod_per_basket)
df_red <- data_frame(basket_nr = rep(basket_nr, n_prod_per_basket))
df_red$ean <- sample(x = eans, nrow(df_red), replace = TRUE)
The code I am using to accomplish this task is below, but I am sure it's not an efficient one. How can I increase the speed of the program?
ean <- unique(df_red$ean)
out <- list()
for (i in 1:length(ean)){
  ean1 <- ean[i]
  # get all basket_nr that contain the ean in question
  basket_nr <- df_red[df_red$ean == ean1, ]$basket_nr
  # get products that were together in the same basket with the ean in question
  boo <- (df_red$ean != ean1) & (df_red$basket_nr %in% basket_nr)
  prod <- df_red[boo, ]
  # get top most frequent
  top12 <- prod %>%
    group_by(ean) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    filter(row_number() %in% 1:12)
  # skip products that weren't together in a basket with at least 12 different other products
  if(nrow(top12) == 12) out[[i]] <- data_frame(ean = ean1, recom = top12$ean, freq = top12$n)
  if(i %% 100 == 0) print(paste0(round(i/length(ean)*100, 2), '% is complete'))
}
Performance improvements are of course a matter of degree, and how far to go before it is improved "enough" is hard to say. However, we can reduce run time by about 25% by functionalizing your code and cleaning up the subsetting logic. Starting with your code:
#added a timer
start.time <- Sys.time()
for (i in 1:length(ean)){
  ean1 <- ean[i]
  # get all basket_nr that contain the ean in question
  basket_nr <- df_red[df_red$ean == ean1, ]$basket_nr
  # get products that were together in the same basket with the ean in question
  boo <- (df_red$ean != ean1) & (df_red$basket_nr %in% basket_nr)
  prod <- df_red[boo, ]
  # get top most frequent
  top12 <- prod %>%
    group_by(ean) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    filter(row_number() %in% 1:12)
  # skip products that weren't together in a basket with at least 12 different other products
  if(nrow(top12) == 12) out[[i]] <- data_frame(ean = ean1, recom = top12$ean, freq = top12$n)
  if(i %% 100 == 0) print(paste0(round(i/length(ean)*100, 2), '% is complete'))
}
Sys.time() - start.time
This takes between 30 and 34 seconds on my machine. However, we can rewrite it as a function like so:
my.top12.func <- function(id, df_red) {
  # improved subsetting logic - using which() is faster, and we can drop some code
  # by removing the ean being iterated in the filter step below
  prod <- df_red[df_red$basket_nr %in% df_red$basket_nr[which(df_red$ean == id)], ]
  # raise the cutoff from 12 to 13, since the specific ean will always be one of the top 12
  top12 <- prod %>%
    group_by(ean) %>%
    summarise(n = n()) %>%
    arrange(desc(n)) %>%
    filter(row_number() %in% 1:13 & ean != id) # additional filter required
  # skip products that weren't together in a basket with at least 12 different other products
  if(nrow(top12) == 12) return(data_frame(ean = id, recom = top12$ean, freq = top12$n))
}
Now we can test the speed and accuracy of this approach by doing:
start.time <- Sys.time()
my.out <- lapply(ean, my.top12.func, df_red = df_red)
Sys.time() - start.time
#test for equality
all.equal(out, my.out)
That runs in about 24-26 seconds, a 25%+ improvement.
Playing around with data.table, I have the output produced in under 7 seconds (which I guess is about an 80% improvement):
library(data.table)
setDT(df_red)
all_eans <- df_red[, unique(ean)]
k <- lapply(all_eans, function(x) {
  df_red[basket_nr %in% df_red[ean == x, unique(basket_nr)],
         .N,
         by = ean][order(-N)][2:13]
})
names(k) <- all_eans
k <- k[sapply(k, nrow) == 12]
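If more speed is needed, keying the table may help, since keyed subsets can use binary search rather than a full vector scan. This is an untested tweak, not something benchmarked here:

setkey(df_red, basket_nr)   # sort once; subsets on basket_nr can then use the key
setindex(df_red, ean)       # secondary index for the ean == x lookups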
I would consider not using a loop.
df_red$k <- 1
df_s <- left_join(df_red, df_red, by = "k") %>%
  filter(ean.x != ean.y & basket_nr.x == basket_nr.y) %>%
  group_by(ean.x, ean.y) %>%        # count each product pair, not just ean.x
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  filter(row_number() %in% 1:12)    # self-pairs are already excluded, so 12 suffices
df_s.ct <- df_s %>% filter(row_number() == 12)
df_s.fin <- df_s[df_s$ean.x %in% df_s.ct$ean.x, ]
The rate-limiting step here is the left_join, which merges the dataset to itself and creates a quadratically larger dataset (so if you have 50,000 rows, you end up creating 2.5 billion pairs). This suggests that the best way to store and manipulate the data is data.table, which will increase the speed of this procedure, especially when combined with dplyr.
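As an aside, most of that blow-up can be avoided by joining on basket_nr itself rather than on a constant key, so that only products sharing a basket are ever paired. A sketch in plain dplyr, assuming df_red as generated above:

# pair products within baskets only, then take the 12 most frequent partners per ean
df_s <- inner_join(df_red, df_red, by = "basket_nr") %>%
  filter(ean.x != ean.y) %>%
  count(ean.x, ean.y, sort = TRUE) %>%
  group_by(ean.x) %>%
  filter(row_number() %in% 1:12)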
I would like to be able to use more automation when creating SpatialLines objects from otherwise tidy data frames.
library(sp)
# create sample data
sample_data <- data.frame(group_id = rep(c("a", "b", "c"), 10),
                          x = rnorm(10),
                          y = rnorm(10))
# How can I recreate this using dplyr?
a_list <- Lines(list(Line(sample_data %>% filter(group_id == "a") %>% select(x, y))), ID = 1)
b_list <- Lines(list(Line(sample_data %>% filter(group_id == "b") %>% select(x, y))), ID = 2)
c_list <- Lines(list(Line(sample_data %>% filter(group_id == "c") %>% select(x, y))), ID = 3)
SpatialLines(list(a_list, b_list, c_list))
You can see how something like group_by would make the process pretty easy, if only it were clear how the data could be piped into a list.
Using your sample data, a wrapper function, and dplyr::do will give you what you want :)
wrapper <- function(df) {
  df %>% select(x, y) %>% as.data.frame %>% Line %>% list
}
y <- sample_data %>%
  group_by(group_id) %>%
  do(res = wrapper(.))
# and now assign IDs (since we can't do that inside dplyr easily)
ids <- 1:dim(y)[1]
SpatialLines(
  mapply(x = y$res, ids = ids, FUN = function(x, ids) {Lines(x, ID = ids)})
)
I don't use sp so there might be a better way to assign IDs.
For reference, consider reading Hadley's comments on returning non-data-frame objects from dplyr do() calls.
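As a quick sanity check that the IDs came through, sp keeps each ID in the ID slot of its Lines object, so you can pull them back out of the result. A small sketch, assigning the SpatialLines call above to a hypothetical sl first:

sl <- SpatialLines(
  mapply(x = y$res, ids = ids, FUN = function(x, ids) {Lines(x, ID = ids)})
)
# extract the ID of each Lines object; should print "1" "2" "3"
sapply(slot(sl, "lines"), function(l) slot(l, "ID"))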