Compute DEA for each ID, Year, Dept, and Class? - r

I have a dataframe with an identifier (IDENT), a year (time), a category (OTEXE), and a department (CDEPT). I want to compute the efficiency score for each observation by year, category, and department. Since efficiency scores are a benchmarking technique, it is better to compute them that way, so each observation is compared against the most similar ones.
I've tried this so far:
my_dea <- function(x) Benchmarking::dea(X = as.matrix(x[, c("A", "B", "C", "D")]),
                                        Y = as.matrix(x[["E"]]),
                                        RTS = "vrs",
                                        ORIENTATION = "out")
score <- test[, .(eff = my_dea(.SD)), list(IDENT, time, OTEXE, CDEPT)]
c("A", "B", "C", "D") are input and "E" is the output. They should be in matrix form.
When I run this code, I get this error:
Error in `[.data.frame`(test, , .(eff = my_dea(.SD)), list(IDENT, time, :
  object 'IDENT' not found
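The `[.data.frame` in the traceback shows that test is still a plain data.frame, so the data.table grouping syntax never dispatches. A minimal sketch of the usual fix (assuming test holds the columns used above):
library(data.table)
# Convert test to a data.table in place so the `by` grouping applies,
# then keep the numeric scores from the returned Farrell object
setDT(test)
score <- test[, .(eff = my_dea(.SD)$eff), by = .(IDENT, time, OTEXE, CDEPT)]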

Yeah sure! I've constructed a function:
my_dea <- function(df) {
  X <- as.matrix(df[, c("A", "B", "C", "D")])
  Y <- as.matrix(df[, "E"])
  Benchmarking::dea.boot(X, Y, NREP = 2000, RTS = "vrs", ORIENTATION = "out")
}
eff <- df1 %>% group_by(IDENT, TIME, CDEPT, OTEXE) %>% my_dea
Thanks @jay.sf!
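Note that piping a grouped data frame straight into my_dea ignores the grouping. For reference, a sketch (not from the thread) of one way to actually run the DEA per group, using dplyr's group_modify() with the column names above:
library(dplyr)

eff <- df1 %>%
  group_by(IDENT, TIME, CDEPT, OTEXE) %>%
  group_modify(~ {
    # One DEA run per group; $eff holds the per-observation scores
    e <- Benchmarking::dea(X = as.matrix(.x[, c("A", "B", "C", "D")]),
                           Y = as.matrix(.x[["E"]]),
                           RTS = "vrs", ORIENTATION = "out")
    tibble::tibble(eff = e$eff)
  }) %>%
  ungroup()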

Related

group by sequence of events and get summary statistics for each sequence

I have a data.frame with a log of sequences of events. Here, sequence 1 is composed of event A, then B, then C, each starting at a specific timestamp (in seconds).
df = data.frame(id = runif(10, 1e6, 1e7),
                sequence = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
                event = c("A", "B", "C", "B", "C", "A", "B", "C", "B", "C"),
                starts_at = c(20, 22, 24, 20, 30, 20, 21, 23, 20, 40))
What I want is to group my data.frame by type of sequence (there are dozens of types, length 2 to 6): A->B->C or B->C, and then to get some results on those types. Desired output would be:
####   sequence_type number.appearances mean.delay.between.events
#### 1           ABC                  2                   1.5 / 2
#### 2            BC                  2                        15
The last column "mean delay" would be a string composed of the mean diff time between successive events in a sequence: in ABC sequence, there is 1.5 seconds in average between A and B, and 2 between B and C.
I also thought of "spreading" each mean difference in a new column diff.1, diff.2..., but seems complicated since sequence have different lengths. Though i'm open to different ways of presenting this information..
So far I've come up with:
library(dplyr)
df %>%
  group_by(sequence) %>%
  arrange(starts_at) %>%
  summarise(sequence_type = paste0(event, collapse = "")) %>%
  group_by(sequence_type) %>%
  tally
I haven't figured out how to achieve the second part. Thanks for the help...
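For reference, a minimal dplyr sketch (not from the thread, assuming dplyr >= 1.0) of one way the full desired output could be computed, taking positional means of the per-step delays:
library(dplyr)

res <- df %>%
  arrange(sequence, starts_at) %>%
  group_by(sequence) %>%
  mutate(sequence_type = paste0(event, collapse = ""),
         step = row_number(),
         delay = lead(starts_at) - starts_at) %>%
  ungroup() %>%
  filter(!is.na(delay)) %>%
  # mean delay for each step position within each sequence type
  group_by(sequence_type, step) %>%
  summarise(mean_delay = mean(delay), n_seq = n_distinct(sequence),
            .groups = "drop") %>%
  group_by(sequence_type) %>%
  summarise(number.appearances = max(n_seq),
            mean.delay.between.events = paste(mean_delay, collapse = " / "))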
This might not be the elegant solution you would get with dplyr, but I think it is general enough that it would work with your real data.
First you just need to get the sequence type corresponding to each row of your data, which is ayuda_seq:
library(zoo)
df=data.frame(id=runif(14, 1e6, 1e7), sequence = c(1,1,1,2,2,3,3,3,4,4,5,5,5,5),
event=c("A", "B", "C", "B", "C", "A", "B", "C", "B", "C","A","B","C","D"),
starts_at=c(20,22,24,20,30,20,21,23,20,40,20,22,21,15))
ayuda_seq = sapply(df$sequence, function(x) paste0(df[df$sequence == x,3],collapse = ""))
Then you just loop through the unique sequence types and generate the sub-sequences of each 2 consecutive elements:
vec_means = NULL
for(x in unique(ayuda_seq)){
  data_temp = df[ayuda_seq == x, ]
  # time gaps between consecutive rows of this sequence type
  diff_temp = diff(data_temp$starts_at)
  # every pair of consecutive events, e.g. "AB", "BC"
  temp_sub = apply(rollapply(data_temp[, 3], FUN = paste0, width = 2), 1, paste0, collapse = "")
  # mean gap per pair
  mean_temp = aggregate(diff_temp, by = list(temp_sub), mean)
  if(all(!duplicated(temp_sub))){
    averages = paste0(mean_temp[, 2], collapse = " / ")
  } else{
    averages = paste0(mean_temp[match(temp_sub[duplicated(temp_sub)], mean_temp[, 1]), 2],
                      collapse = " / ")
  }
  vec_means = c(vec_means, averages)
}
df_res = data.frame(sequence_type = unique(ayuda_seq),
number.appearances = as.numeric(table(ayuda_seq)/nchar(unique(ayuda_seq))),
mean.delay.between.events = vec_means)
The variable temp_sub will have the different pair combinations within the original string you are looping over. In the case of "ABC" there is a possible combination "CA" (spanning two appearances of the sequence), which is not taken into consideration because it is unique.
Not pretty, but it works
tmp <- df %>%
  group_by(sequence) %>%
  dplyr::arrange(sequence, starts_at) %>%
  dplyr::mutate(seq_row_num = dplyr::row_number(),
                lead_starts_at = dplyr::lead(starts_at, n = 1)) %>%
  base::as.data.frame()
tmp <- tmp %>%
  dplyr::group_by(sequence) %>%
  mutate(max_seq_len = max(seq_row_num)) %>%
  base::as.data.frame()
tmp$seq_len_id <- paste0(tmp$sequence, tmp$max_seq_len)
tmp$next_seq_val <- tmp$seq_row_num + 1
tmp$next_seq_val <- base::ifelse(tmp$next_seq_val >= tmp$max_seq_len,
                                 tmp$max_seq_len, tmp$next_seq_val)
tmp_seq_labels <- stats::aggregate(tmp$event, list(tmp$seq_len_id), paste, collapse = '')
tmp <- base::merge(tmp, tmp_seq_labels, by.x = "seq_len_id", by.y = "Group.1")
colnames(tmp)[which(colnames(tmp) == "x")] <- "seq_group"
tmp$within_group_step <- "ZZ"
tmp$within_group_step <- base::ifelse(tmp$seq_row_num != tmp$max_seq_len,
                                      substr(tmp$seq_group, start = tmp$seq_row_num,
                                             stop = tmp$next_seq_val),
                                      tmp$within_group_step)
tmp$within_step_by_group_id <- paste0(tmp$seq_group, tmp$within_group_step)
tmp$time_diff <- 0
tmp$time_diff <- base::ifelse(!is.na(tmp$lead_starts_at),
                              tmp$lead_starts_at - tmp$starts_at, tmp$time_diff)
res <- stats::aggregate(time_diff ~ within_step_by_group_id + seq_group + within_group_step,
                        data = tmp, FUN = mean)
drops <- grep(pattern = "ZZ", x = res$within_step_by_group_id)
if(length(drops) >= 1){
  res <- res[-drops, ]
}
colnames(res) <- c("Full_Group_Pattern", "Group_Pattern", "Sub_Group_Pattern", "Mean_Time_Difference")
res <- res %>%
  dplyr::group_by(Group_Pattern) %>%
  dplyr::mutate(Number_of_Appearances = n()) %>%
  base::as.data.frame()

Speed up date lookup in R

I have a function that takes a vector of dates and matches it with a subset list of dates (based on certain attributes). For example, say my raw data looks like this:
key_1 <- c("A", "A", "B", "B")
date_1 <- as.Date(c("2012-03-31", "2011-01-31", "2011-08-07", "2014-04-09"))
And my lookup data looks like this:
lookup <- data.frame(stringsAsFactors = FALSE,
                     key_2 = c("A", "A", "A", "A", "B", "B", "B", "B"),
                     date_2 = as.Date(c(
                       "2010-05-12", "2011-05-12", "2012-05-12", "2013-05-12",
                       "2010-12-01", "2011-12-01", "2012-12-01", "2013-12-01"
                     )))
For each key, I'm essentially looking for the largest date_2 that date_1 is larger than; that matched date becomes date_3. Basically, it's this in Excel:
date_3 = VLOOKUP(date_1[1], date_2[1:4], 1, TRUE)
Which would produce this:
date_3 <- c("2011-05-12", "2010-05-12", "2010-12-01", "2013-12-01")
My current function (below) works great but my raw data is 220k rows so it takes roughly 12 minutes to run. While that isn't the worst thing in the world, I was hoping there might be a faster way to run it.
library(magrittr)  # for %>%
my_fun <- function(key_1, date_1) {
  # row indices of the lookup table, one column per key
  indices <- sapply(unique(lookup$key_2), function(x) {which(lookup$key_2 == x)})
  periods <- lookup$date_2[indices[, key_1]]
  # position of the last period at or before date_1
  index <- findInterval(x = date_1, vec = periods) %>% as.numeric()
  periods %>% magrittr::extract(index)
}
date_3 = mapply(my_fun, key_1, date_1, USE.NAMES = FALSE) %>%
as.Date(origin = "1970-01-01")
Thanks.
Update: I've tried utilizing both of the answers here but couldn't get them to work.
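For reference, a sketch (not from the thread) of one common way to make this kind of lookup fast: a data.table rolling join, which matches each date_1 to the most recent preceding date_2 per key in one vectorized pass instead of one findInterval call per row. Like findInterval, it matches on date_2 <= date_1:
library(data.table)

raw <- data.table(key = key_1, date = date_1)
lk  <- data.table(key = lookup$key_2, date = lookup$date_2,
                  matched = lookup$date_2)  # keep a copy of the matched date
# roll = Inf rolls each raw date back to the last preceding lookup date
res <- lk[raw, on = .(key, date), roll = Inf]
date_3 <- res$matched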

How to lapply() over selective columns? - R

I am a novice R programmer. I am wondering how to lapply over a dataframe while avoiding certain columns.
# Some dummy dataframe
df <- data.frame(
  grp = c("A", "B", "C", "D"),
  trial = as.factor(c(1, 1, 2, 2)),
  mean = as.factor(c(44, 33, 22, 11)),
  sd = as.factor(c(3, 4, 1, .5)))
df <- lapply(df, function (x) {as.numeric(as.character(x))})
However, the method I used introduces NAs by coercion.
Is there a way to selectively (or deselectively) lapply over the dataframe while maintaining the integrity of the dataframe?
In other words, is there a way to convert only mean and sd to numerics? (In general form.)
Thank you
Try doing this:
df[,3:4] <- lapply(df[,3:4], function (x) {as.numeric(as.character(x))})
You are simply applying the function to the specified columns. You can also provide a condition to select a subset of your columns, for example excluding the ones you don't want to cast:
col <- names(df)[!(names(df) %in% c("grp", "trial"))]  # %in% avoids the recycling pitfall of !=
df[, col] <- lapply(df[, col], function (x) {as.numeric(as.character(x))})
Well, as you might have guessed, there are many ways. Since you seem to be doing in-place substitution, a for loop would actually be suitable.
df <- data.frame(
grp = c("A", "B", "C", "D"),
trial = as.factor(c(1,1,2,2)),
mean = as.factor(c(44,33,22,11)),
sd = as.factor(c(3,4,1,.5)))
my_cols <- c("trial", "mean", "sd")
for (mc in my_cols) {
  df[[mc]] <- as.numeric(as.character(df[[mc]]))
}
If you want to convert selectively by column names:
library(dplyr)
df %>%
  mutate_if(names(.) %in% c("mean", "sd"),
            function(x) as.numeric(as.character(x)))
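As an aside (not part of the original answers): in dplyr 1.0+, the same conversion is usually written with across(), which supersedes mutate_if():
library(dplyr)

df %>%
  mutate(across(c(mean, sd), ~ as.numeric(as.character(.x))))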

Finding occurrence of character from multiple vector or list

I wish to find the number of times a unique/distinct character occurs across multiple vectors or from a list.
Perhaps it's best to describe it with an example.
In this example, let's say the "unique characters" are letters, and the multiple "vectors" are books. I wish to find the occurrence of unique letters as the number of books increases.
# Initial data in the format of a list
book_list <- list(book_A = c("a", "b", "c", "z"),
                  book_B = c("c", "d", "a"),
                  book_C = c("b", "a", "c", "e", "x"))
# Initial data in the format of multiple vectors
book_A <- c("a", "b", "c", "z")
book_B <- c("c", "d", "a")
book_C <- c("b", "a", "c", "e", "x")
# Finding the unique letters in each book
# This is the part I'm struggling to code in a loop fashion
one_book <- length(unique(book_A))
two_book <- length(unique(c(book_A, book_B)))
three_book <- length(unique(c(book_A, book_B, book_C)))
# Plot the desired output
plot(x = c(1, 2, 3),
     y = c(one_book, two_book, three_book),
     ylab = "Number of unique letters", xlab = "Book Number",
     main = "The occurrence of unique letters as the number of books increases")
To note: the real data set is much bigger. Each vector (book_A, book_B, etc.) is about 7000 elements long.
I am attempting to solve the problem with dplyr or a data frame, but I'm not quite there yet.
# Explore data frame option with an example data
library(dplyr)
df <- read.delim("http://m.uploadedit.com/ba3s/148950223626.txt")
# Group them
df_group <- dplyr::group_by(df, book) %>% summarize(occurrence = length(letter))
# Use the cumulative sum
plot(x = 1:length(unique(df$book)), y = cumsum(df_group$occurrence))
But I know the plot is not correct, as it is only plotting the cumulative sum rather than what I intended. Any hints would be most helpful.
To add to the complexity, it would be nice if the books could be plotted starting with the one with the fewest letters. Something along the lines of:
# Example ;
# Find the length of the letters in the book
lapply(book_list, length)
# I know that book_B has the fewest letters (3);
# followed by book_A (4) and then book_C (5)
one_book <- length(unique(book_B))
two_book <- length(unique(c(book_B, book_A)))
three_book <- length(unique(c(book_B, book_A, book_C)))
plot(x=c(1,2,3),
y=c(one_book, two_book, three_book),
ylab = "Number of letters", xlab = "Book Number")
You can use Reduce with accumulate = TRUE, i.e.
sapply(Reduce(c, book_list, accumulate = TRUE), function(i) length(unique(i)))
#[1] 4 5 7
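As an aside (not from the answer), the same idea handles the shortest-book-first ordering asked about above; a sketch assuming book_list is the named list defined in the question:
# Sort the books by length before accumulating
ordered_books <- book_list[order(lengths(book_list))]
counts <- sapply(Reduce(c, ordered_books, accumulate = TRUE),
                 function(i) length(unique(i)))
#[1] 3 5 7
plot(seq_along(counts), counts,
     xlab = "Book Number", ylab = "Number of unique letters")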

apply function to groups within each column of a data frame in R

I want to calculate the mean and standard deviation, by group, for each column in a subset of a large data frame.
I'm trying to understand why some of the answers to similar questions aren't working for me; I'm still pretty new at R and I'm sure there are a lot of subtleties (and not-so-subtle things!) I'm completely missing.
I have a large data frame similar to this one:
mydata <- data.frame(Experiment = rep(c("E1", "E2", "E3", "E4"), each = 9),
                     Treatment = c(rep(c("A", "B", "C"), each = 3),
                                   rep(c("A", "C", "D"), each = 3),
                                   rep(c("A", "D", "E"), each = 3),
                                   rep(c("A", "B", "D"), each = 3)),
                     Day1 = sample(1:100, 36),
                     Day2 = sample(1:100, 36),
                     Day3 = sample(1:150, 36),
                     Day4 = sample(50:150, 36))
I need to subset the data by Experiment and by Treatment, for example:
testB <- mydata[(mydata[, "Experiment"] %in% c("E1", "E4"))
& mydata[, "Treatment"] %in% c("A", "B"),
c("Treatment", "Day1", "Day2", "Day4")]
Then, for each column in testB, I want to calculate the mean and standard deviation for each Treatment group.
I started by trying to use tapply (over just one column to begin with), but get back "NA" for Treatment groups that shouldn't be in testB, which isn't a big problem with this small dataset, but is pretty irksome with my real data:
> tapply(testB$Day1, testB$Treatment, mean)
A B C D E
70.66667 61.00000 NA NA NA
I tried implementing solutions from Compute mean and standard deviation by group for multiple variables in a data.frame. Using aggregate worked:
ag <- aggregate(. ~ Treatment, testB, function(x) c(mean = mean(x), sd = sd(x)))
But I can't get the data.table solutions to work.
library(data.table)
testB[, sapply(.SD, function(x) list(mean=mean(x), sd=sd(x))), by = Treatment]
testB[, c(mean = lapply(.SD, mean), sd = lapply(.SD, sd)), by = Treatment]
both gave me the error message
Error in `[.data.frame`(testB, , c(mean = lapply(.SD, mean), sd = lapply(.SD, :
  unused argument(s) (by = Treatment)
What am I doing wrong?
Thanks in advance for helping a clueless beginner!
Your Treatment column is a factor. Although you've dropped the rows that have the treatments "C", "D", and "E" in your subset testB, those levels still exist. Use levels(testB$Treatment) to see them. You can use the droplevels function when defining your testB subset, which lets you get means for A and B without returning NAs for empty factor levels.
testB <- droplevels(mydata[(mydata[, "Experiment"] %in% c("E1", "E4"))
                           & mydata[, "Treatment"] %in% c("A", "B"),
                           c("Treatment", "Day1", "Day2", "Day4")])
tapply(testB$Day1, testB$Treatment, mean)
       A        B
59.16667 66.00000
Hope this helps!
Ron
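As an aside (not from the answers): the data.table error above happens because testB is a plain data.frame, so `[.data.frame` rejects the by argument. A minimal sketch of the conversion that makes the grouped call work:
library(data.table)

testB_dt <- as.data.table(testB)
testB_dt[, c(mean = lapply(.SD, mean), sd = lapply(.SD, sd)), by = Treatment]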
You could use plyr and reshape2 to tackle this problem as well; I generally prefer to use these libraries because the abstractions they introduce apply to more problems, and are cleaner.
How I would solve it:
library(plyr)
library(reshape2)
# testB from your code above
# make a "long" version of testB
longTestB <- melt(testB, id.vars="Treatment")
# then use ddply for calculating your metrics
# group by Treatment and by the original column (now "variable") to get per-column stats
ddply(longTestB, .(Treatment, variable), summarize, mean = mean(value), stdev = sd(value))
