Better way to produce a data frame using table() in R

Recently, I have found that I am using the following pattern over and over again. The process is:
cross-tabulate numeric variable by factor using table
create data frame from created table
add original numeric values to data frame (from row names (!))
remove row names
reorder columns of aggregated data frame
In R, it looks like this:
# Sample data
df <- data.frame(x = round(runif(100), 1),
                 y = factor(ifelse(runif(100) > .5, 1, 0),
                            labels = c('failure', 'success')))
# Get frequencies
dfSummary <- as.data.frame.matrix(table(df$x, df$y))
# Add column of original values from rownames
dfSummary$x <- as.numeric(rownames(dfSummary))
# Remove rownames
rownames(dfSummary) <- NULL
# Reorder columns
dfSummary <- dfSummary[, c(3, 1, 2)]
Is there anything more elegant in R, preferably using base functions? I know I can use SQL to do this in a single command, and I think it should be possible to achieve similar behavior in R.
sqldf solution:
library(sqldf)
dfSummary <- sqldf("select x,
                           sum(y = 'failure') as failure,
                           sum(y = 'success') as success
                    from df group by x")

An alternative with base R could be:
aggregate(. ~ x, transform(df, success = y == "success",
                           failure = y == "failure", y = NULL), sum)
# x success failure
#1 0.0 2 4
#2 0.1 6 8
#3 0.2 1 7
#4 0.3 5 4
#5 0.4 6 6
#6 0.5 3 3
#7 0.6 4 6
#8 0.7 6 6
#9 0.8 4 5
#10 0.9 6 7
#11 1.0 1 0

Your code, modified as a function, would be efficient compared to the other base R solutions (so far). If you wanted the code in one line, a "reshape/table" combo from base R could be used.
reshape(as.data.frame(table(df)), idvar = 'x', timevar = 'y',
        direction = 'wide')
# x Freq.failure Freq.success
#1 0 3 2
#2 0.1 3 9
#3 0.2 5 5
#4 0.3 8 7
#5 0.4 5 3
#6 0.5 9 4
#7 0.6 3 6
#8 0.7 7 6
#9 0.8 3 1
#10 0.9 4 3
#11 1 0 4
In case you want to try data.table
library(data.table)
dcast.data.table(setDT(df), x~y)
# x failure success
# 1: 0.0 3 2
# 2: 0.1 3 9
# 3: 0.2 5 5
# 4: 0.3 8 7
# 5: 0.4 5 3
# 6: 0.5 9 4
# 7: 0.6 3 6
# 8: 0.7 7 6
# 9: 0.8 3 1
#10: 0.9 4 3
#11: 1.0 0 4
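A note on the defaults here: with duplicate (x, y) combinations and no value.var, dcast falls back to counting with length (it prints a message saying so). The same call written out explicitly, as a sketch:
dcast.data.table(setDT(df), x ~ y, fun.aggregate = length)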
Update
I didn't notice that as.data.frame(table(...)) converts the columns to "factor" (thanks to @Hadley's comment). A workaround is:
res <- transform(reshape(as.data.frame(table(df), stringsAsFactors = FALSE),
                         idvar = 'x', timevar = 'y', direction = 'wide'),
                 x = as.numeric(x))
data
set.seed(24)
df <- data.frame(x = round(runif(100), 1),
                 y = factor(ifelse(runif(100) > .5, 1, 0),
                            labels = c('failure', 'success')))
Benchmarks
set.seed(24)
df <- data.frame(x = round(runif(1e6), 1),
                 y = factor(ifelse(runif(1e6) > .5, 1, 0),
                            labels = c('failure', 'success')))
tomas <- function() {
  dfSummary <- as.data.frame.matrix(table(df$x, df$y))
  dfSummary$x <- as.numeric(rownames(dfSummary))
  dfSummary <- dfSummary[, c(3, 1, 2)]
}
doc <- function() {
  aggregate(. ~ x, transform(df, success = y == "success",
                             failure = y == "failure", y = NULL), sum)
}
akrun <- function() {
  reshape(as.data.frame(table(df)), idvar = 'x', timevar = 'y',
          direction = 'wide')
}
library(microbenchmark)
microbenchmark(tomas(), doc(), akrun(), unit='relative', times=20L)
Unit: relative
#expr min lq mean median uq max neval cld
#tomas() 1.000000 1.0000000 1.000000 1.000000 1.0000000 1.000000 20 a
#doc() 13.451037 11.5050997 13.082074 13.043584 12.8048306 19.715535 20 b
#akrun() 1.019977 0.9522809 1.012332 1.007569 0.9993835 1.533191 20 a
Updated with dcast.data.table
df1 <- copy(df)
akrun2 <- function() {dcast.data.table(setDT(df1), x~y)}
microbenchmark(tomas(), akrun2(), unit='relative', times=20L)
# Unit: relative
# expr min lq mean median uq max neval cld
# tomas() 6.493231 6.345752 6.410853 6.51594 6.502044 5.591753 20 b
# akrun2() 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 20 a

This should be relatively efficient. You cannot really suppress row names in a data frame, since they are a requirement of a valid data frame:
X <- table(df$x,df$y)
cbind( data.frame(x=rownames(X)), unclass(X) )
x failure success
0 0 5 3
0.1 0.1 6 1
0.2 0.2 7 8
0.3 0.3 7 3
0.4 0.4 6 6
0.5 0.5 6 4
0.6 0.6 2 5
0.7 0.7 2 7
0.8 0.8 3 7
0.9 0.9 4 6
1 1 2 0
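One follow-up caveat: rownames(X) is character, so x in this result is not numeric (and becomes a factor under pre-4.0 stringsAsFactors defaults). A sketch with a numeric x and the duplicated row names dropped:
out <- cbind(data.frame(x = as.numeric(rownames(X))), unclass(X))
rownames(out) <- NULL
out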

Related

rowMeans, but ignore certain values when calculating the mean while keeping na.rm=F

Here is a sample dataset:
data <- data.frame(x = c(4, 3, 4, 4, 99),
                   y = c(4, NA, 3, 2, 4),
                   z = c(88, NA, 4, 4, 5),
                   w = c(4, 5, 2, 3, 4))
I would like to create a new column of means using rowMeans. I would like to keep na.rm=F because if a value is truly NA, I do not want to include it in the mean calculation.
But if a value is 88 or 99, I would like R to ignore it while calculating the mean and still use the remaining valid values. So far I have the below.
data$mean <- rowMeans(subset(data, select = c(`x`,`y`,`z`,`w`)), na.rm = T)
But I am not sure how to add in a function where it would just ignore the 88 and 99 from calculations.
This is what I am hoping to get
data <- data.frame(x = c(4, 3, 4, 4, 99),
                   y = c(4, NA, 3, 2, 4),
                   z = c(88, NA, 4, 4, 5),
                   w = c(4, 5, 2, 3, 4),
                   mean = c(4, NA, 3.25, 3.25, 4.3))
Any help is appreciated - thank you!
Use rowMeans with na.rm=TRUE nevertheless, but on a row subset, temporarily replacing 88 and 99 with NA.
s <- rowSums(is.na(data)) == 0  ## store row subset
v <- c("x", "y", "z", "w")      ## col subset to calc. mean
data$mean <- NA                 ## init column
m <- as.matrix(data[v])         ## we'll need a matrix
data$mean[s] <- rowMeans(replace(m[s, v], m[s, v] %in% c(88, 99), NA), na.rm = TRUE)
data
# x y z w mean
# 1 4 4 88 4 4.000000
# 2 3 NA NA 5 NA
# 3 4 3 4 2 3.250000
# 4 4 2 4 3 3.250000
# 5 99 4 5 4 4.333333
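A note on why genuine NAs survive: rows containing any NA are excluded by s up front and keep the NA from the initialization, which is what makes the na.rm=TRUE inside rowMeans safe to use here.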
Or simply using apply, though it is much slower.
f <- \(x) if (any(is.na(x))) NA else mean(x[!x %in% c(88, 99)])
cbind(data[v], mean = apply(data[v], 1, f))
# x y z w mean
# 1 4 4 88 4 4.000000
# 2 3 NA NA 5 NA
# 3 4 3 4 2 3.250000
# 4 4 2 4 3 3.250000
# 5 99 4 5 4 4.333333
From microbenchmark.
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# apply 35.018162 35.941815 38.834333 36.394632 36.960161 212.469412 100 b
# rowMeans 1.097393 1.119396 1.493563 1.193787 1.226691 9.352118 100 a
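The benchmark input is not shown above; a sketch of how such a timing could be reproduced, using an assumed scaled-up copy of the example data and timing just the core of each approach:
library(microbenchmark)
big <- data[rep(seq_len(nrow(data)), 2e4), v]  # assumed size, not the original benchmark data
microbenchmark(
  apply    = apply(big, 1, f),
  rowMeans = rowMeans(replace(as.matrix(big), as.matrix(big) %in% c(88, 99), NA),
                      na.rm = TRUE),
  times = 100L)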
data <- data.frame(x = c(4, 3, 4, 4, 99),
                   y = c(4, NA, 3, 2, 4),
                   z = c(88, NA, 4, 4, 5),
                   w = c(4, 5, 2, 3, 4))
data$mean <- apply(data, 1, function(x) {
  idx <- which(!(x %in% c(88, 99)))
  mean(x[idx], na.rm = TRUE)
})
   x  y  z w     mean
1  4  4 88 4 4.000000
2  3 NA NA 5 4.000000
3  4  3  4 2 3.250000
4  4  2  4 3 3.250000
5 99  4  5 4 4.333333
(Note that because of na.rm = TRUE, row 2 gives 4 rather than the NA requested in the question.)

Compare values to column-wise "leave-one-out" mean

I have a dataframe df:
df <- data.frame(a = 1:5, b = 6:10)
a b
1 1 6
2 2 7
3 3 8
4 4 9
5 5 10
For each column, I want to divide each value by the column mean, where the mean is calculated by excluding the focal value from calculation of the mean ("leave-one-out" mean).
For example, for the first two values in column "a", the calculation is:
1: 1 / ((2 + 3 + 4 + 5) / 4) = 0.2857143
2: 2 / ((1 + 3 + 4 + 5) / 4) = 0.6153846
etc.
"Leave-one-out means":
mean_a mean_b
1 3.5 8.5
2 3.25 8.25
3 3 8
4 2.75 7.75
5 2.5 7.5
The desired result: values / "leave-one-out" means
res_a res_b
1 0.285 0.705
2 0.615 0.848
3 1 1
4 1.454 1.161
5 2 1.333
Many thanks for any help!
If I understand it correctly, the following should do it.
res <- sapply(df, function(x)
  sapply(seq_along(x), function(i) x[i] / mean(x[-i])))
res <- as.data.frame(res)
names(res) <- paste("c", names(res), sep = "_")
res
# c_a c_b
#1 0.2857143 0.7058824
#2 0.6153846 0.8484848
#3 1.0000000 1.0000000
#4 1.4545455 1.1612903
#5 2.0000000 1.3333333
Just use the magic of indexing and vectors in R:
for (i in 1:nrow(df)) {
  print(df$a[i] / mean(df$a[-i]))
}
I have just replicated it for column a. I hope you can do the same for column b and convert the results into a data frame.
Let me know if you need help. Happy to help with R.
A vectorized possibility, which will be faster for larger data.
df / ((rep(colSums(df), each = nrow(df)) - df) / (nrow(df) - 1))
# a b
# 0.2857143 0.7058824
# 0.6153846 0.8484848
# 1.0000000 1.0000000
# 1.4545455 1.1612903
# 2.0000000 1.3333333
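Why this works, briefly: rep(colSums(df), each = nrow(df)) - df is, for every cell, the column sum minus the focal value, and dividing by nrow(df) - 1 turns that into the leave-one-out mean:
n <- nrow(df)
loo_means <- (rep(colSums(df), each = n) - df) / (n - 1)  # matches the "leave-one-out means" table above
df / loo_means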

R moving average function to deal with values less than window size

Following this answer, I have used a moving average function with window sizes of 2, 3 and 4.
require(zoo)
#MOVING AVERAGE FUNCTION
get.mav <- function(df, n = 2){
  if (length(df) < n) {
    return(df)
  }
  c(df[1:(n-1)], rollapply(df, width = n, mean, align = "right"))
}
#DATA FRAME (dummy)
ID <- c("d","b","a","a","c","e","b","d","b","b")
Value <- c(4,5,5,3,2,1,6,9,5,5)
df <-data.frame(ID,Value)
# FUNCTION IMPLEMENTATION
df <- with(df,df[order(ID),])
df$mav2 <- unlist(aggregate(Value~ID,df,get.mav,na.action = NULL,n=2)$Value)
df$mav3 <- unlist(aggregate(Value~ID,df,get.mav,na.action = NULL,n=3)$Value)
df$mav4 <- unlist(aggregate(Value~ID,df,get.mav,na.action = NULL,n=4)$Value)
#OUTPUT
ID Value mav2 mav3 mav4
a 5 5 5 5
a 3 4 3 3
b 5 5 5 5
b 6 5.5 6 6
b 5 5.5 5.3 5
b 5 5 5.3 5.25
c 2 2 2 2
d 4 4 4 4
d 9 6.5 9 9
e 1 1 1 1
The function get.mav works exactly the way it should. I want to change this function such that:
For window size 3, if the group length is 2, it takes the mean of those two elements rather than simply returning df.
Similarly, for window size 4, if the length is 3 or 2, it takes the mean of those three or two elements rather than simply returning df.
I tried if statements, but the comparisons are not working correctly. Any help would be appreciated.
Thanks.
For each width, use ave to invoke rollapplyr by ID. partial = TRUE in rollapplyr causes it to average the partial number of points available near the beginning.
library(zoo)
roll <- function(x, group, w) {
  ave(x, group, FUN = function(x) rollapplyr(x, w, mean, partial = TRUE))
}
transform(df[order(df$ID), ],
          mav2 = roll(Value, ID, 2),
          mav3 = roll(Value, ID, 3),
          mav4 = roll(Value, ID, 4))
or alternatively:
w <- 2:4
names(w) <- paste0("mav", w)
with(df[order(df$ID), ],
     data.frame(ID, Value, lapply(w, roll, x = Value, group = ID),
                check.names = FALSE))
Either gives:
ID Value mav2 mav3 mav4
1 a 5 5.0 5.000000 5.000000
2 a 3 4.0 4.000000 4.000000
3 b 5 5.0 5.000000 5.000000
4 b 6 5.5 5.500000 5.500000
5 b 5 5.5 5.333333 5.333333
6 b 5 5.0 5.333333 5.250000
7 c 2 2.0 2.000000 2.000000
8 d 4 4.0 4.000000 4.000000
9 d 9 6.5 6.500000 6.500000
10 e 1 1.0 1.000000 1.000000
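Note how mav3 and mav4 for the two-row group a now average the available points (4 = mean(5, 3)) instead of echoing the raw values, which is exactly the requested change.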
Update: Fixed.

Replace the value in one table conditional on the value in another table

I have the following data frames:
df_1 <- data.frame(f1= c(1,3,4,5,7,8), f2 = c(2,3,4,1,4,5))
df_2 <- data.frame(f1= c(0.1,0.3,0.04,0.015,0.7,0.8), f2 = c(0.02,0.13,0.4,1.4,0.04,0.5))
so they look like
> df_1
f1 f2
1 1 2
2 3 3
3 4 4
4 5 1
5 7 4
6 8 5
> df_2
f1 f2
1 0.100 0.02
2 0.300 0.13
3 0.040 0.40
4 0.015 1.40
5 0.700 0.04
6 0.800 0.50
The replacement I wish to perform is:
If a figure in df_2 is higher than 0.05, I wish to replace the figure in df_1 at the corresponding position with NA. The resulting data frame df_1 should look like:
f1 f2
1 NA 2
2 NA NA
3 4 NA
4 5 NA
5 NA 4
6 NA NA
I have tried to solve it using a for loop but it will take a long time when applied to my actual large table. I know there could be a quicker way using data.table but I don't actually know how. Can someone help me with this?
You can do it like this
> df_1[df_2 > 0.05] <- NA
> df_1
f1 f2
1 NA 2
2 NA NA
3 4 NA
4 5 NA
5 NA 4
6 NA NA
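This works because comparing a data frame with a scalar returns a logical matrix of the same dimensions, and indexing with a logical matrix selects exactly those cells:
df_2 > 0.05
#      f1    f2
# 1  TRUE FALSE
# 2  TRUE  TRUE
# 3 FALSE  TRUE
# 4 FALSE  TRUE
# 5  TRUE FALSE
# 6  TRUE  TRUE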
If performance is an issue, we can use set from data.table (the OP also mentioned data.table in the post). Using set will be fast as the overhead of [.data.table is avoided.
library(data.table)
setDT(df_1)
for (j in seq_along(df_1)) {
  set(df_1, i = which(df_2[[j]] > 0.05), j = j, value = NA)
}
df_1
# f1 f2
#1: NA 2
#2: NA NA
#3: 4 NA
#4: 5 NA
#5: NA 4
#6: NA NA
Benchmarks
set.seed(49)
df1 <- data.frame(f1 = sample(1:9, 1e7, replace = TRUE),
                  f2 = sample(1:9, 1e7, replace = TRUE))
set.seed(24)
df2 <- data.frame(f1 = rnorm(1e7), f2 = rnorm(1e7))
akrun <- function() {
  DT <- as.data.table(df1)
  for (j in seq_along(DT)) {
    set(DT, i = which(df2[[j]] > 0.05), j = j, value = NA)
  }
}
David <- function() {df1[df2 > 0.05] <- NA}
library(microbenchmark)
microbenchmark(akrun(), David(), unit="relative", times = 20L)
# expr min lq mean median uq max neval
# akrun() 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 20
# David() 2.487825 2.65275 2.428343 2.582355 2.298318 2.126138 20

Summarizing an arbitrary # of columns in a data frame using my own function in R

I am looking for a way to summarize a large flat table of experimental results in R. The summarization is not straightforward since I need to summarize an arbitrary # of columns (cannot hardcode the columns beforehand) and use an arbitrarily defined summary function.
As an example say I have the following flat table, my_table
my_table
id_1 id_2 rep_id value_1 value_2
1 a 1 1 0.0 0.0
2 a 1 2 0.2 0.2
3 a 1 3 0.3 0.3
4 a 1 4 0.4 0.4
5 a 1 5 0.1 0.1
6 a 2 1 0.5 0.0
7 a 2 2 1.5 1.5
8 a 2 3 2.5 2.5
9 a 2 4 3.5 3.5
10 a 2 5 4.5 4.5
I would summarize my_table into a table such as:
> summary_table
id_1 id_2 value_1.min value_1.max value_1.mean_plus_sd value_2.min value_2.max value_2.mean_plus_sd
1 a 1 0.0 0.4 0.3581139 0 0.4 0.3581139
2 a 2 0.5 4.5 4.0811388 0 4.5 4.1464249
The summarization is complicated since I would like to:
Specify the variables to group by, e.g. key_fields = c("id_1","id_2")
Specify the columns to summarize, e.g. fields_to_summarize = c("value_1","value_2")
Use my own summarizing function (that also names the new columns)
Here is the code that I am currently using to do all 3 of these things. This is nice, but it's also really inefficient. Any improvements would be really appreciated:
library(plyr)
# create table
my_table = data.frame("id_1" = c("a","a","a","a","a","a","a","a","a","a"),
                      "id_2" = c("1","1","1","1","1","2","2","2","2","2"),
                      "rep_id" = c(1,2,3,4,5,1,2,3,4,5),
                      "value_1" = c(0.0,0.2,0.3,0.4,0.1,0.5,1.5,2.5,3.5,4.5),
                      "value_2" = c(0.0,0.2,0.3,0.4,0.1,0.0,1.5,2.5,3.5,4.5))
# specify columns to group by / summarize over
key_fields = c("id_1","id_2")
fields_to_summarize = c("value_1","value_2")
# create summary_table
counter = 1
for (fname in fields_to_summarize){
  summary_function = function(D) data.frame(setNames(
    list(min(D[[fname]]),
         max(D[[fname]]),
         mean(D[[fname]]) + sd(D[[fname]])),
    paste(fname, c("min", "max", "mean_plus_sd"), sep = ".")))
  tmp = ddply(.data = my_table,
              .variables = key_fields,
              function(D) summary_function(D))
  if (counter == 1){
    summary_table = tmp
  } else {
    summary_table = join(x = summary_table, y = tmp, by = key_fields,
                         type = "left", match = "all")
  }
  counter = counter + 1
}
Not the final solution, but perhaps a good start with dplyr:
library(dplyr)
mean_plus_sd <- function(x) mean(x) + sd(x)
key_fields = c("id_1","id_2")
my_table %>%
  group_by_(.dots = key_fields) %>%
  summarise_each_(funs(min, max, mean_plus_sd), fields_to_summarize)
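In current dplyr the underscored verbs are deprecated; the same summary can be written with across(), as a sketch (assuming dplyr >= 1.0):
my_table %>%
  group_by(across(all_of(key_fields))) %>%
  summarise(across(all_of(fields_to_summarize),
                   list(min = min, max = max, mean_plus_sd = mean_plus_sd),
                   .names = "{.col}.{.fn}"),
            .groups = "drop")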
Here are two quick functions you could define. The first uses a base R approach, the second a possible data.table approach.
My_func <- function(data, fields_to_summarize, key_fields){
aggregate(data[fields_to_summarize],
data[key_fields],
function(x) c(min = min(x),
max = max(x),
mean_plus_sd = mean(x) + sd(x)))
}
My_func2 <- function(data, fields_to_summarize, key_fields){
as.data.table(data)[, lapply(.SD,
function(x) c(min(x), max(x), mean(x) + sd(x))),
key_fields,
.SDcols = fields_to_summarize][,
Funs := c("min", "max", "mean_plus_sd")][]
}
Testing the first function
key_fields = c("id_1","id_2")
fields_to_summarize = c("value_1","value_2")
My_func(my_table, fields_to_summarize, key_fields)
# id_1 id_2 value_1.min value_1.max value_1.mean_plus_sd value_2.min value_2.max value_2.mean_plus_sd
# 1 a 1 0.0000000 0.4000000 0.3581139 0.0000000 0.4000000 0.3581139
# 2 a 2 0.5000000 4.5000000 4.0811388 0.0000000 4.5000000 4.1464249
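A follow-up caveat: aggregate() with a multi-valued function stores the three statistics as matrix columns, so the result prints as above but is not a flat data frame. Flatten it with do.call if ordinary columns are needed:
res <- My_func(my_table, fields_to_summarize, key_fields)
res_flat <- do.call(data.frame, res)  # value_1.min, value_1.max, ... become plain columns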
Testing the second function
library(data.table)
My_func2(my_table, fields_to_summarize, key_fields)
# id_1 id_2 value_1 value_2 Funs
# 1: a 1 0.0000000 0.0000000 min
# 2: a 1 0.4000000 0.4000000 max
# 3: a 1 0.3581139 0.3581139 mean_plus_sd
# 4: a 2 0.5000000 0.0000000 min
# 5: a 2 4.5000000 4.5000000 max
# 6: a 2 4.0811388 4.1464249 mean_plus_sd
