Subsetting data by condition - r

I am trying to reshape/reduce my data. So far I use a for loop (very slow), but from what I understand this should be quite fast with plyr.
I have many groups (firms, a factor in the dataset), and I want to drop every firm entirely if any of that firm's rows has a value of 0. In other words, I want to create a new data.frame that leaves out all groups showing a 0 value at some point.
Data Creation:
set.seed(1)
mydf <- data.frame(firmname = sample(LETTERS[1:5], 40, replace = TRUE),
                   value = rpois(40, 2))
The for loop:
splitby <- mydf$firmname
new.data <- data.frame()
for (i in 1:length(unique(splitby))) {
  temp <- subset(mydf, splitby == unique(splitby)[i])
  if (all(temp$value > 0)) {
    new.data <- rbind(new.data, temp)
  }
}
Delete all empty firm factor levels:
new.data$firmname <- factor(new.data$firmname)
Is there a way to achieve that with the plyr package? Can the subset function be used in that context?
EDIT: For reproducibility, the data-creation code, as suggested by BenBarnes, has been added. Ben, thanks a lot for that. Furthermore, my code has been altered to comply with the answers provided below.

You could supply an anonymous function to the .fun argument in ddply():
set.seed(1)
mydf <- data.frame(firmname = sample(LETTERS[1:5], 40, replace = TRUE),
                   value = rpois(40, 2))
library(plyr)
ddply(mydf,.(firmname), function(x) if(any(x$value==0)) NULL else x )
Or using [, as suggested by Andrie:
firms0 <- unique(mydf$firmname[which(mydf$value == 0)])
mydf[-which(mydf$firmname %in% firms0), ]
Note that the results of ddply() are sorted according to firmname.
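For completeness, the same subset can also be obtained in base R without plyr; a minimal sketch using ave() on the mydf created above:
## keep only rows of firms whose minimum value is greater than zero
keep <- ave(mydf$value, mydf$firmname, FUN = min) > 0
new.data <- mydf[keep, ]
new.data$firmname <- factor(new.data$firmname) # drop empty firm levels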
EDIT
For the example in your comments (selecting only firms with more than three entries), this table()-based approach is again faster than using ddply() to subset:
firmTable <- table(mydf$firmname)
firmsGT3 <- names(firmTable)[firmTable > 3]
mydf[mydf$firmname %in% firmsGT3, ]

Related

How to efficiently split each row into test and train subsets using R?

I have a data.table that provides the length and composition of given vectors, for example:
library(data.table)
set.seed(1)
dt = data.table(length = c(100, 150),
                n_A = c(30, 30),
                n_B = c(20, 100),
                n_C = c(50, 20))
I need to randomly split each vector into two subsets with 80% and 20% of observations respectively. I can currently do this using a for loop. For example:
dt_80_list <- list() # create output lists
dt_20_list <- list()
for (i in 1:nrow(dt)){ # for each row in the data.table
  # create a randomised vector with the given number of each component
  sample_vec <- sample( c( rep("A", dt$n_A[i]),
                           rep("B", dt$n_B[i]),
                           rep("C", dt$n_C[i]) ) )
  # subset 80% of the vector
  sample_vec_80 <- sample_vec[1:floor(length(sample_vec)*0.8)]
  # count the number of each component in the subset and output to list
  dt_80_list[[i]] <- data.table( length = length(sample_vec_80),
                                 n_A = length(sample_vec_80[which(sample_vec_80 == "A")]),
                                 n_B = length(sample_vec_80[which(sample_vec_80 == "B")]),
                                 n_C = length(sample_vec_80[which(sample_vec_80 == "C")])
                               )
  # subtract the number of each component in the 80% to identify the number in the 20%
  dt_20_list[[i]] <- data.table( length = dt$length[i] - dt_80_list[[i]]$length,
                                 n_A = dt$n_A[i] - dt_80_list[[i]]$n_A,
                                 n_B = dt$n_B[i] - dt_80_list[[i]]$n_B,
                                 n_C = dt$n_C[i] - dt_80_list[[i]]$n_C
                               )
}
dt_80 <- do.call("rbind", dt_80_list) # collapse lists to output data.tables
dt_20 <- do.call("rbind", dt_20_list)
However, the dataset I need to apply this to is very large, and this is too slow. Does anyone have any suggestions for how I could improve performance?
Thanks.
(I assumed your dataset consists of many more rows, but only a few columns.)
Here's a version I came up with, with three main changes:
- use .N and by= to count the number of "A", "B", "C" drawn in each row
- use the size argument in sample()
- join the original dt and dt_80 to calculate dt_20 without a for loop
## draw training data
dt_80 <- dcast(
  dt[, row := 1:nrow(dt)
     ][, .(draw = sample(c(rep("A80", n_A),
                           rep("B80", n_B),
                           rep("C80", n_C)),
                         size = .8 * length))
       , by = row
       ][, .N, by = .(row, draw)],
  row ~ draw, value.var = "N")[, length80 := A80 + B80 + C80]
## draw test data
dt_20 <- dt[dt_80,
            .(A20 = n_A - A80,
              B20 = n_B - B80,
              C20 = n_C - C80), on = "row"][, length20 := A20 + B20 + C20]
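A quick sanity check (just a sketch) that the two splits add back up to the original totals:
## every training count plus its test count should equal the original length
all(dt_80$length80 + dt_20$length20 == dt$length)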
There is probably still room for optimization, but I hope it already helps :)
EDIT
Here I add my initial idea. I did not post it at first because the code above is much faster, but this one might be more memory-efficient, which seems crucial in your case. So even if you already have a working solution, this might be of interest...
library(data.table)
library(Rfast)
## add row numbers
dt[,row:=1:nrow(dt)]
## sampling function
sampfunc <- function(n_A, n_B, n_C){
  draw <- sample(c(rep("A80", n_A),
                   rep("B80", n_B),
                   rep("C80", n_C)),
                 size = .8 * (n_A + n_B + n_C))
  out <- Rfast::Table(draw)
  return(as.list(out))
}
## draw training data
dt_80 <- dt[,sampfunc(n_A,n_B,n_C),by=row]
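For a single row, the sampling function simply returns the per-letter counts of one 80% draw as a named list; a small usage sketch with the counts from the first row of dt:
## example call: 30 A, 20 B and 50 C, of which 80 observations are drawn
sampfunc(n_A = 30, n_B = 20, n_C = 50)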

R Convert loop into function

I would like to clean up my code a bit and start to use more functions for my everyday computations (where I would normally use for loops). I have an example of a for loop that I would like to make into a function. The problem I am having is how to step through the constraint vectors without a loop. Here's what I mean:
## represents spectral data
set.seed(11)
df <- data.frame(Sample = 1:100, replicate(1000, sample(0:1000, 100, rep = TRUE)))
## feature ranges by column number
frm <- c(438,563,953,963)
to <- c(548,803,1000,993)
nm <- c("WL890", "WL1080", "WL1400", "WL1375")
WL.ps <- list()
for (i in 1:length(frm)){
  ## finds the minimum value within the range constraints and returns the corresponding column name
  WL <- colnames(df[frm[i]:to[i]])[apply(df[frm[i]:to[i]], 1, which.min)]
  WL.ps[[i]] <- WL
}
new.df <- data.frame(WL.ps)
colnames(new.df) <- nm
The part where I iterate through the 'frm' and 'to' vector values is what I'm having trouble with. How does one go from frm[1] to frm[2], and so on, in a function (apply or otherwise)?
Any advice would be greatly appreciated.
Thank you.
You could write a function which returns the column name of the minimum value in each row for a particular range of columns. I have used max.col instead of apply(df, 1, which.min) to get the minimum value in a row, since max.col is more efficient than apply.
apply_fun <- function(data, x, y) {
  cols <- x:y
  names(data[cols])[max.col(-data[cols])]
}
Apply this function using Map():
WL.ps <- Map(apply_fun, frm, to, MoreArgs = list(data = df))
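The list returned by Map() can then be turned into the same new.df as in the question (a sketch reusing nm from above):
## combine the per-range results into one data frame with the desired column names
new.df <- data.frame(WL.ps)
colnames(new.df) <- nm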

Optimizing processing time in the nested for loops - R

I have two datasets with 24k and 15k rows. I used nested for loops in order to rewrite some data... however, it takes forever to compute. Does anyone have a suggestion for how to optimize the code and speed up the process?
My code:
for(i in 1:length(data$kolicina)){
  for(j in 1:length(df$kolicina)){
    if(data$LIXcode[i] == df$LIXcode[j]){
      data$kolicina[i] <- df$kolicina[j]
    }
  }
}
The full code with the input looks like this:
df <- data[grepl("Trennscheiben", data$a_naziv) & data$SestavKolicina > 1, ]
for(i in 1:length(df$kolicina)){
  df$kolicina[i] <- df$kolicina[i] / 10
}
for(i in 1:length(data$kolicina)){
  for(j in 1:length(df$kolicina)){
    if(data$LIXcode[i] == df$LIXcode[j]){
      data$kolicina[i] <- df$kolicina[j]
    }
  }
}
The data:
LIXcode        a_naziv                  RacunCenaNaEM  kolicina
LIX2017396957  MINI HVLP Spritzpistole  20,16          1
LIX2017396957  MINI HVLP Spritzpistole  20,16          1
LIX2017396963  Trennscheiben Ø115 Ø12   12,53          30
LIX2017396963  Trennscheiben Ø115 Ø12   12,53          1
I haven't tried this on my own machine, but something like this should work (note that the function has to return the modified copy, since assignments inside a function do not change the data outside of it):
fun <- function(x, y){
  idx <- match(x$LIXcode, y$LIXcode)        # position of each code in y, NA if absent
  hit <- !is.na(idx)
  x$kolicina[hit] <- y$kolicina[idx[hit]]   # overwrite only the matched rows
  x                                         # return the modified copy
}
data <- fun(data, df)
R can operate on whole vectors at once, so the explicit nested loops are not needed.
As far as I understand, the question concerns a table "dt1" with a key column "a", any number of value columns, and any number of observations. Then we have a "dt2" that holds some sort of mapping, meaning that its column "a" has unique values and some column "b" has values that need to be written into "dt1" wherever the "a" columns match.
I would suggest joining tables:
require(data.table)
dt1 <- data.table(a = sample(1:10, 1000, replace = T),
                  b = sample(letters, 1000, replace = T))
dt2 <- data.table(a = 1:10,
                  b = letters[1:10])
output <- merge(dt1, dt2, by = "a", all.x = T)
Also you can try:
dt1[, new_value := dt2$b[match(a, dt2$a)]]
Both of these solutions are vectorized, therefore almost instant.
Base solution (no data.table syntax, although I'd highly recommend you to learn it):
dt1$new_value <- dt2$b[match(dt1$a, dt2$a)]
And that's if I understood the question correctly...
Here's a working solution to accommodate the expected output:
dt1[a %in% dt2$a, b:=dt2$b[match(a, dt2$a)]]
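The same update-join idea can be applied directly to the question's tables; a sketch, assuming data and df are first converted to data.tables and that each LIXcode occurs only once in df:
library(data.table)
setDT(data)
setDT(df)
## overwrite kolicina in `data` with the matching value from `df`
data[df, kolicina := i.kolicina, on = "LIXcode"]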

How to speed up a loop-like function in R

In trying to avoid using the for loop in R, I wrote a function that returns an average value from one data frame given row-specific values from another data frame. I then pass this function to sapply over the range of row numbers. My function works, but it returns ~ 2.5 results per second, which is not much better than using a for loop. So, I feel like I've not fully exploited the vectorized aspects of the apply family of functions. Can anyone help me rethink my approach? Here is a minimally working example. Thanks in advance.
# Creating first dataframe
dates <- seq(as.Date("2013-01-01"), as.Date("2016-07-01"), by = 1)
n <- length(dates)
df1 <- data.frame(date = dates,
                  hour = sample(1:24, n, replace = T),
                  cat = sample(c("a", "b"), n, replace = T),
                  lag = sample(1:24, n, replace = T))
# Creating second dataframe
df2 <- data.frame(date = sort(rep(dates, 24)),
                  hour = rep(1:24, length(dates)),
                  p = runif(length(rep(dates, 24)), min = -20, max = 100))
df2 <- df2[order(df2$date, df2$hour), ]
df2$cat <- "a"
temp <- df2
temp$cat <- "b"
df2 <- rbind(df2, temp)
# function
period_mean <- function(x){
  tmp <- df2[df2$cat == df1[x, ]$cat, ]
  # This line extracts the row index within tmp
  # at which the two dataframes match on date and hour
  he_i <- which(tmp$date == df1[x, ]$date & tmp$hour == df1[x, ]$hour)
  # My lagged period is given by the variable "lag". I want the average
  # over the period hour - (hour - lag). Since df2 is sorted such that hours
  # are consecutive, this method requires that I subset on only the
  # relevant value for cat (hence the creation of tmp in the first line
  # of the function).
  p <- mean(tmp[(he_i - df1[x, ]$lag):he_i, ]$p)
  print(x)
  print(p)
  return(p)
}
#Execute function
out<-sapply(1:length(row.names(df1)), period_mean)
EDIT I have subsequently learned that part of the reason my original problem was iterating so slowly is that my data classes between the two dataframes were not the same. df1$date was a date field, while df2$date was a character field. Of course, this wasn't apparent with the example I posted because the data types were the same by construction. Hope this helps.
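As a quick check for the class mismatch mentioned in the EDIT, comparing the column classes and converting makes both sides of the join comparable (a sketch, assuming df2$date had been read in as character):
str(df1$date)                 # e.g. Date
str(df2$date)                 # e.g. chr - the mismatch that slowed things down
df2$date <- as.Date(df2$date) # convert so both sides compare date to date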
Here's one suggestion:
getIdx <- function(i) {
  date <- df1$date[i]
  hour <- df1$hour[i]
  cat <- df1$cat[i]
  which(df2$date == date & df2$hour == hour & df2$cat == cat)
}
v_getIdx <- Vectorize(getIdx)
df1$index <- v_getIdx(1:nrow(df1))
b_start <- match("b", df2$cat)
out2 <- apply(df1[, c("cat", "lag", "index")], MAR = 1, function(x) {
  flr <- ifelse(x[1] == "a", 1, b_start)
  x <- as.numeric(x[2:3])
  mean(df2$p[max(flr, (x[2] - x[1])):x[2]])
})
We make a function (getIdx) to retrieve the rows from df2 that match the values from each row in df1, and then Vectorize the function.
We then run the vectorized function to get a vector of row indices. We set b_start to be the row where the "b" category starts.
We then iterate through the rows of df1 with apply. In the mean(...) function, we set the "floor" to be either row 1 (if cat=="a") or b_start (if cat=="b"), which eliminates the need to subset (what you were doing with tmp).
Performance:
> system.time(out<-sapply(1:length(row.names(df1)), period_mean))
user system elapsed
11.304 0.393 11.917
> system.time({
+ df1$index <- v_getIdx(1:nrow(df1))
+ b_start <- match("b", df2$cat)
+ out2 <- apply(df1[,c("cat","lag","index")], MAR=1, function(x) {
+ flr <- ifelse(x[1]=="a", 1, b_start)
+ x <- as.numeric(x[2:3])
+ mean(df2$p[max(flr, (x[2]-x[1])):x[2]])
+ })
+ })
user system elapsed
2.839 0.405 3.274
> all.equal(out, out2)
[1] TRUE

Using coefficient of variation in aggregate

I have a data frame with 50000 rows and 200 columns. There are duplicate rows in the data, and I want to aggregate the data by choosing the row with the maximum coefficient of variation among the duplicates, using the aggregate function in R. With aggregate I can use "mean" or "sum" by default, but not the coefficient of variation.
For example
aggregate(data, as.columnname, FUN=mean)
Works fine.
I have a custom function for calculating the coefficient of variation, but I am not sure how to use it with aggregate.
co.var <- function(x)
(
100*sd(x)/mean(x)
)
I have tried
aggregate(data, as.columnname, function (x) max (co.var (x, data[index (x),])
but it gives an error saying object x is not found.
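For reference, a custom function can be passed to aggregate() through FUN exactly like mean; a minimal sketch with hypothetical columns ID and value (this gives the per-group CV of one column, not yet the row selection asked for above):
## hypothetical example data: one grouping column and one value column
ex <- data.frame(ID = rep(1:3, each = 4), value = rpois(12, 5))
aggregate(value ~ ID, data = ex, FUN = co.var)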
Assuming that I understand your problem, I would suggest using tapply() instead of aggregate() (see ?tapply for more info). However, a minimal working example would be very helpful.
co.var <- function(x) ( 100*sd(x)/mean(x) )
## Data with multiple repeated measurements.
## There are three things (ID 1, 2, 3), each measured twice
## (two rows per ID), with two values per measurement (val1 and val2)
myDF <- data.frame(ID = c(1, 2, 3, 1, 2, 3), val1 = c(20, 10, 5, 25, 7, 2),
                   val2 = c(19, 9, 4, 24, 4, 1))
## Calculate coefficient of variation for each measurement set
myDF$coVar <- apply(myDF[, c("val1", "val2")], 1, co.var)
## Use tapply() instead of aggregate
mySel <- tapply(seq_len(nrow(myDF)), myDF$ID, function(x){
  curSub <- myDF[x, ]
  return(x[which(curSub$coVar == max(curSub$coVar))])
})
## The mySel vector is then the vector of rows that correspond to the
## maximum coefficient of variation for each ID
myDF[mySel, ]
EDIT:
There are faster ways, one of which is below. However, with a 40000 by 100 dataset, the above code only took between 16 and 20 seconds on my machine.
# Create a big dataset
myDF <- data.frame(val1 = c(20, 10, 5, 25, 7, 2),
                   val2 = c(19, 9, 4, 24, 4, 1))
myDF <- myDF[sample(seq_len(nrow(myDF)), 40000, replace = TRUE), ]
myDF <- cbind(myDF, rep(myDF, 49))
myDF$ID <- sample.int(nrow(myDF)/5, nrow(myDF), replace = TRUE)
# Define a new function to work (slightly) better with large datasets
co.var.df <- function(x) ( 100*apply(x,1,sd)/rowMeans(x) )
# Create two datasets to benchmark the two methods
# (A second method proved slower than the third, hence the naming)
myDF.firstMethod <- myDF
myDF.thirdMethod <- myDF
Time the original method:
startTime <- Sys.time()
myDF.firstMethod$coVar <- apply(myDF.firstMethod[,
    grep("val", names(myDF.firstMethod))], 1, co.var)
mySel <- tapply(seq_len(nrow(myDF.firstMethod)),
                myDF.firstMethod$ID, function(x) {
  curSub <- myDF.firstMethod[x, ]
  return(x[which(curSub$coVar == max(curSub$coVar))])
}, simplify = FALSE)
endTime <- Sys.time()
R> endTime-startTime
Time difference of 17.87806 secs
Time the second method:
startTime3 <- Sys.time()
coVar3 <- co.var.df(myDF.thirdMethod[,
    grep("val", names(myDF.thirdMethod))])
mySel3 <- tapply(seq_along(coVar3),
                 myDF[, "ID"], function(x) {
  return(x[which(coVar3[x] == max(coVar3[x]))])
}, simplify = FALSE)
endTime3 <- Sys.time()
R> endTime3-startTime3
Time difference of 2.024207 secs
And check to see that we get the same results:
R> all.equal(mySel,mySel3)
[1] TRUE
There is an additional change from the original post, in that the edited code considers that there may be more than one row with the highest CV for a given ID. Therefore, to get the results from the edited code, you must unlist the mySel or mySel3 objects:
myDF.firstMethod[unlist(mySel),]
myDF.thirdMethod[unlist(mySel3),]
