I have a dataframe with two columns, let's say "left" and "right", which define intervals. I want to test whether a given numeric "x" falls inside any interval defined by the dataframe (if it does, it matches exactly one interval, since the intervals don't overlap). Expected behaviour:
> df <- data.frame(id = c("A", "B", "C"), left = c(0, 50, 150), right = c(15, 78, 190))
> df
  id left right
1  A    0    15
2  B   50    78
3  C  150   190
> my_function(7)
[1] TRUE
> my_function(20)
[1] FALSE
So I did it this way, but it's terribly slow and I'm pretty sure this could be optimized:
my_function <- function(x) {
  test <- df %>%
    dplyr::rowwise() %>%
    dplyr::mutate(test = (x >= left) && (x <= right)) %>%
    ungroup()
  test <- test %>% filter(test == TRUE)
  nrow(test) == 1
}
Then I'd be interested in getting the matching row in case the output is TRUE, but with the current function it'll take forever (the actual dataframe has ~5,000 rows, and I want to test/get coordinates for thousands of x values).
I found a library that manages interval objects, but it seems tailored for time intervals. Any suggestions?
Here is a simple way with an example:
z <- 567 # single dummy value
left <- seq(100, 900, 200)
right <- seq(200, 1000, 200)
df <- data.frame(left, right) # dummy intervals
lo <- z >= df$left
hi <- z <= df$right
check <- lo * hi
introw <- which(check == 1)
introw
[1] 3
z2 <- c(356, 934, 134, 597, 771) # vector of values to check
lo2 <- sapply(z2, function(x) x >= df$left)
hi2 <- sapply(z2, function(x) x <= df$right)
check2 <- lo2 * hi2
introws <- apply(check2, 2, function(x) which(x == 1))
introws # vector of intervals for each input value
[1] 2 5 1 3 4
final <- cbind(value = z2, interval = introws)
final
value interval
[1,] 356 2
[2,] 934 5
[3,] 134 1
[4,] 597 3
[5,] 771 4
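One caveat (not covered in the answer above, so treat it as an add-on sketch): if a value falls outside every interval, which() returns integer(0), and apply() would then return a list rather than a clean vector. A simple TRUE/FALSE check per value, reusing the check2 matrix from above, could look like this:
in_any <- colSums(check2) == 1 # TRUE if the value lies in exactly one (non-overlapping) interval
in_any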
Try this approach using between():
#Code
my_function <- function(x) {
  test <- df %>%
    dplyr::rowwise() %>%
    dplyr::mutate(test = between(x, left, right)) %>%
    ungroup()
  test <- test %>% filter(test == TRUE)
  nrow(test) == 1
}
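If you also want the matching row when the result is TRUE (as mentioned in the question), a minimal sketch along the same lines (match_row is a hypothetical helper name, not from the original answers):
match_row <- function(x) {
  df %>% dplyr::filter(x >= left, x <= right) # returns 0 or 1 rows, since the intervals don't overlap
}
match_row(7)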
I would like to perform a two-sample t-test in R using separate groupings. The t-test must be "unbiased", meaning that for all transactions in the outer group (group 2 below), the t-test must be run for each inner group (group 1 below) as: "inner group A" vs. "inner group not A". The for loop code shown below is probably clearer than a verbal explanation...
My current code is below. Does anyone know a faster/better way to do this? I'm open to using any package, but am currently using data.table.
For context, I have ~1 million rows of transaction data. Group 1 indicates a person (multiple rows mean multiple transactions) and contains ~30k unique values. Group 2 indicates a zip code and contains ~500 unique values.
Thanks!
library(data.table)
# fake data
grp1 <- c('A','A','A','B','B','C','C','D','D','D','D','E','E','E','F','F')
grp2 <- c(1,1,1,1,1,1,1, 2,2,2,2,2,2,2, 2,2)
vals <- c(10,20,30, 40,15, 25,60, 70,100,200,300, 400,1000,2000, 3000,5000)
DT <- data.table(grp1 = grp1, grp2 = grp2, vals = vals)
# "two sample t.test" --------------------------------------------------
# non vectorized, in-place
# runtime is ~50 mins for real data
for (z in DT[, unique(grp2)]) {
  for (c in DT[grp2 == z, unique(grp1)]) {
    res = t.test(
      DT[grp2 == z & grp1 == c, vals],
      DT[grp2 == z & grp1 != c, vals],
      alternative = 'greater'
    )
    DT[grp2 == z & grp1 == c, pval := res$p.value]
    DT[grp2 == z & grp1 == c, tstat := res$statistic]
  }
}
# vectorized, creates new summarized data.table
# runtime is 1-2 mins on real data
vec <- DT[, {
  grp2_vector = vals
  .SD[, .(tstat = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$statistic,
          pval = t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')$p.value), by = grp1]
}, by = grp2]
stats::t.test is generalized and does a number of checks. You can calculate just what you need, i.e. the t-statistic and p-value, and also make use of data.table's internal optimization for computing length, mean and variance. Here is a possible approach:
# combinations of grp1 and grp2, and those not in grp1 for each grp2
comb <- unique(DT[, .(grp1, grp2)])[,
  rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
  .(g2=grp2)]
# this is optimized, switch on verbose to see the output
X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
# calculate length, mean, var for values not in grp1
Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
  .(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
# calculate outputs based on stats:::t.test.default
ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
  stderrx <- sqrt(vx/nx)
  stderry <- sqrt(vy/ny)
  stderr <- sqrt(stderrx^2 + stderry^2)
  df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
  tstat <- (mx - my)/stderr
  .(tstat, pt(tstat, df, lower.tail = FALSE))
}, by=1:Y[,.N]]
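For reference, the block above implements Welch's two-sample formulas, which is what stats:::t.test.default uses when var.equal = FALSE: t = (mx - my) / sqrt(vx/nx + vy/ny), with Welch-Satterthwaite degrees of freedom df = (vx/nx + vy/ny)^2 / ((vx/nx)^2/(nx - 1) + (vy/ny)^2/(ny - 1)); pt(tstat, df, lower.tail = FALSE) then gives the one-sided p-value matching alternative = 'greater'.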
output:
grp1 grp2 nx mx vx ny my vy tstat pval
1: C 1 2 42.500 612.50 5 23.0000 145.0000 1.06500150 0.22800432
2: B 1 2 27.500 312.50 5 29.0000 355.0000 -0.09950372 0.53511601
3: A 1 3 20.000 100.00 4 35.0000 383.3333 -1.31982404 0.87570431
4: F 2 2 4000.000 2000000.00 7 581.4286 489747.6190 3.30491342 0.08072148
5: E 2 3 1133.333 653333.33 6 1445.0000 4323350.0000 -0.32174451 0.62141500
6: D 2 4 167.500 10891.67 5 2280.0000 3292000.0000 -2.59809850 0.97016160
timing code:
library(data.table) #data.table_1.12.4
set.seed(0L)
np <- 4.2e5
nzc <- 4.2e3
DT <- data.table(grp1=rep(1:np, each=5), grp2=rep(1:nzc, each=np/nzc*5),
vals=abs(rnorm(np*5, 5000, 2000)), key=c("grp1", "grp2"))
mtd0 <- function() {
  DT[, {
    grp2_vector <- vals
    .SD[, {
      tres <- t.test(vals, setdiff(grp2_vector, vals), alternative = 'g')
      .(tstat=tres$statistic, pval=tres$p.value)
    }, by=grp1]
  }, by=grp2]
}
mtd1 <- function() {
  comb <- unique(DT[, .(grp1, grp2)])[,
    rbindlist(lapply(1:.N, function(n) .(g1=rep(grp1[n], .N-1L), notIn=grp1[-n]))),
    .(g2=grp2)]
  X <- DT[, .(nx=.N, mx=mean(vals), vx=var(vals)), .(grp1, grp2)] #, verbose=TRUE]
  Y <- DT[comb, on=.(grp2=g2, grp1=notIn), allow.cartesian=TRUE][,
    .(ny=.N, my=mean(vals), vy=var(vals)), by=.(grp1=g1, grp2=grp2)]
  ans <- X[Y, on=.(grp1, grp2)][, c("tstat", "pval") := {
    stderrx <- sqrt(vx/nx)
    stderry <- sqrt(vy/ny)
    stderr <- sqrt(stderrx^2 + stderry^2)
    df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
    tstat <- (mx - my)/stderr
    .(tstat, pt(tstat, df, lower.tail = FALSE))
  }, by=1:Y[,.N]]
}
microbenchmark::microbenchmark(mtd0(), mtd1(), times=1L)
timings:
Unit: seconds
expr min lq mean median uq max neval
mtd0() 65.76456 65.76456 65.76456 65.76456 65.76456 65.76456 1
mtd1() 18.29710 18.29710 18.29710 18.29710 18.29710 18.29710 1
I would suggest looking at the package Rfast. It has commands such as ttest1, ttest2 and ttests for one-sample, two-sample and many t-tests, respectively.
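As a rough illustration only (the original answer gives no code): a single "group A vs. not group A" comparison within one outer group of the question's fake DT might look like the sketch below. This assumes Rfast::ttest2() takes the two numeric samples as its first arguments; check ?Rfast::ttest2 for the exact signature, the return format, and whether/how a one-sided alternative is supported.
library(Rfast)
x <- DT[grp2 == 1 & grp1 == 'A', vals] # values in the inner group
y <- DT[grp2 == 1 & grp1 != 'A', vals] # values not in the inner group
ttest2(x, y) # assumed to return the test statistic and p-value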
I have a dataset with several attributes and a value.
Input (sample)
GRP CAT TYP VAL
X H 5 0.76
X A 2 0.34
X D 3 0.70
X I 3 0.33
X F 4 0.80
X E 1 0.39
I want to:
Determine all combinations of CAT and TYP
For each combination, calculate the average value when the combination is removed
Return a final table of differences
Final Table (sample)
CAT TYP DIFF
1 <NA> NA 0.04000
2 H NA 0.03206
Row 1 means that if no records are removed, the difference between the average value of GRP='X' and GRP='Y' is 0.04. Row 2 means that if records with CAT='H' are removed, the difference is 0.032.
I have working code, but I want to make it faster. I'm open to your suggestions.
Working Code
library(dplyr)
set.seed(777)
# build example data frame
df <- data.frame(GRP = c(rep('X',25),rep('Y',25)),
CAT = sample(LETTERS[1:10], 50, T),
TYP = sample(1:5, 50, T),
VAL = sample(1:100, 50, T)/100,
stringsAsFactors = F)
fcnDiffCalc <- function() {
  # table of all combinations of CAT and TYP
  splits <- expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), stringsAsFactors = F)
  # null data frame to store results
  ans <- data.frame(CAT = character(),
                    TYP = integer(),
                    DIFF = numeric(),
                    stringsAsFactors = F)
  # loop through each combination and calculate the difference between group X and Y
  for(i in 1:nrow(splits)) {
    split.i <- splits[i,]
    # determine non-na columns
    by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
    # anti-join to remove records that match `split.i`
    if(length(by.cols) > 0){
      df.i <- df %>%
        anti_join(split.i, by = by.cols)
    } else {
      df.i <- df
    }
    # calculate average by group
    df.i <- df.i %>%
      group_by(GRP) %>%
      summarize(VAL_MEAN = mean(VAL))
    # calculate difference of averages
    DIFF <- df.i[,2] %>%
      as.matrix() %>%
      diff() %>%
      as.numeric()
    ans.tmp <- cbind(split.i, DIFF)
    # bind to final data frame
    ans <- bind_rows(ans, ans.tmp)
  }
  return(ans)
}
Speed results
> system.time(fcnDiffCalc())
user system elapsed
0.30 0.02 0.31
Consider assigning the DIFF column with sapply rather than growing a data frame in a loop, to avoid the repetitive in-memory copying:
fcnDiffCalc2 <- function() {
  # table of all combinations of CAT and TYP
  splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))),
                                   stringsAsFactors = F))
  # loop through each combination and calculate the difference between group X and Y
  splits$DIFF <- sapply(1:nrow(splits), function(i) {
    split.i <- splits[i,]
    # determine non-na columns
    by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
    # anti-join to remove records that match `split.i`
    df.i <- tryCatch(df %>%
                       anti_join(split.i, by = by.cols), error = function(e) df)
    # calculate average by group
    df.i <- df.i %>%
      group_by(GRP) %>%
      summarize(VAL_MEAN = mean(VAL))
    # calculate difference of averages
    DIFF <- df.i[,2] %>%
      as.matrix() %>%
      diff() %>%
      as.numeric()
  })
  return(splits)
}
Even better: avoid the lapply loop inside expand.grid, use vapply over sapply (the unlist + lapply combination is just sapply, so it can become vapply too), defining the output structure, and avoid pipes in the loop by reverting to base R's aggregate:
fcnDiffCalc3 <- function() {
  # table of all combinations of CAT and TYP
  splits <- data.frame(expand.grid(CAT = c(NA, unique(df$CAT)), TYP = c(NA, unique(df$TYP)),
                                   stringsAsFactors = FALSE))
  # loop through each combination and calculate the difference between group X and Y
  splits$DIFF <- vapply(1:nrow(splits), function(i) {
    split.i <- splits[i,]
    # determine non-na columns
    by.cols <- colnames(split.i)[vapply(split.i, function(x) !all(is.na(x)), logical(1))]
    # anti-join to remove records that match `split.i`
    df.i <- tryCatch(anti_join(df, split.i, by = by.cols), error = function(e) df)
    # calculate average by group
    df.i <- aggregate(VAL ~ GRP, df.i, mean)
    # calculate difference of averages
    diff(df.i$VAL)
  }, numeric(1))
  return(splits)
}
Output
df_op <- fcnDiffCalc()
df_new <- fcnDiffCalc2()
df_new2 <- fcnDiffCalc3()
identical(df_op, df_new)
# [1] TRUE
identical(df_op, df_new2)
# [1] TRUE
library(microbenchmark)
microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())
# Unit: milliseconds
# expr min lq mean median uq max neval
# fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960 100
# fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297 100
# fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758 100
I have a data frame built from two vectors of length 5, plus a separate variable:
x <- seq(1:5)
y <- rep(0,5)
df <- data.frame(x, y)
z <- 10
I need to loop through the data frame and update y based on a condition related to x using z, and I need to update z at every iteration. Using a for loop, I would do this:
for (i in seq(2, nrow(df))){
  if(df$x[i] %% 2 == 0){
    df$y[i] <- df$y[i-1] + z
    z <- z - df$x[i]
  } else{
    df$y[i] <- df$y[i-1]
  }
}
Using data frames is slow, and accessing the ith item via df$x[i] is not efficient, but I am unsure how to vectorize this since both y and z change at each iteration.
Does anyone have a recommendation on the best way to iterate this? I was looking to avoid data frames completely and just use vectors to simplify the lookups, or to use something from the tidyverse with tibbles and the purrr package, but nothing seemed easy to implement. Thanks!
You can use the sapply function:
y=0
z=10
sapply(df$x, function(x) ifelse(x %% 2 == 0, {y <<- y + z; z <<- z - x; y}, y <<- y))
[1] 0 10 10 18 18
Here's a vectorized version
vec_fun <- function(x, z) {
  L <- length(x)
  vec_z <- rep(0, L)
  # even positions get an increment; this relies on x being 1:L, so x[i] == i
  I <- seq(2, L, by=2)
  # z values applied at successive even steps: z, z - 2, z - (2 + 4), ...
  vec_z[I] <- head(z - c(0, cumsum(I)), length(I))
  # y is the running total of the increments
  cumsum(vec_z)
}
The alternative versions - sapply & tidyverse
sapply_fun <- function(x, z) {
  y = 0
  # y and z live in this function's environment and are updated via <<-
  sapply(x, function(xi) ifelse(xi %% 2 == 0, {y <<- y + z; z <<- z - xi; y}, y <<- y))
}
library(tidyverse)
tidy_fun <- function(df) {
  df %>%
    filter(x %% 2 != 0) %>%
    mutate(z = accumulate(c(z, x[-1] - 1), `-`)) %>%
    right_join(df, by = c("x", "y")) %>%
    mutate(z = lag(z), z = ifelse(is.na(z), 0, z)) %>%
    mutate(y = cumsum(z)) %>%
    select(-z) %>%
    pluck("y")
}
Your data
df <- data.frame(x=1:5, y=0)
z <- 10
Let's make sure they all return the same result
identical(vec_fun(df$x, z), sapply_fun(df$x, z)) && identical(vec_fun(df$x, z), tidy_fun(df))
# TRUE
Benchmark with small dataset - sapply_fun appears to be slightly faster
library(microbenchmark)
microbenchmark(vec_fun(df$x, z), sapply_fun(df$x, z), tidy_fun(df), times=100L, unit="relative")
# Unit: relative
# expr min lq mean median uq max neval
# vec_fun(df$x, z) 1.349053 1.316664 1.256691 1.359864 1.348181 1.146733 100
# sapply_fun(df$x, z) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
# tidy_fun(df) 411.409355 378.459005 168.689084 301.029545 270.519170 4.244833 100
Now with larger data.frame
df <- data.frame(x=1:1000, y=0)
z <- 10000
Same result - yes
identical(vec_fun(df$x, z), sapply_fun(df$x, z)) && identical(vec_fun(df$x, z), tidy_fun(df))
# TRUE
Benchmark with larger dataset - now it's obvious vec_fun is faster
library(microbenchmark)
microbenchmark(vec_fun(df$x, z), sapply_fun(df$x, z), tidy_fun(df), times=100L, unit="relative")
# Unit: relative
# expr min lq mean median uq max neval
# vec_fun(df$x, z) 1.00000 1.00000 1.00000 1.00000 1.00000 1.000 100
# sapply_fun(df$x, z) 42.69696 37.00708 32.19552 35.19225 27.82914 27.285 100
# tidy_fun(df) 259.87893 228.06417 201.43230 218.92552 172.45386 380.484 100
Since your data contains solely numbers you could use a matrix rather than a data frame which is slightly faster.
mx <- matrix(c(x, y), ncol = 2, dimnames = list(1:length(x), c("x", "y")))
for (i in seq(2, nrow(mx))){
  if(mx[i, 1] %% 2 == 0){
    mx[i, 2] <- mx[i-1, 2] + z
    z <- z - mx[i, 1]
  } else {
    mx[i, 2] <- mx[i-1, 2]
  }
}
mx
# x y
# 1 1 0
# 2 2 10
# 3 3 10
# 4 4 18
# 5 5 18
microbenchmark() results:
# Unit: milliseconds
# expr min lq mean median uq max neval
# mx 8.675346 9.542153 10.71271 9.925953 11.02796 89.35088 1000
# df 10.363204 11.249255 12.85973 11.785933 13.59802 106.99920 1000
It would be great if we could vectorize the operation on the data frame. My strategy is to calculate the z value for each row and then use cumsum to calculate the y values. The accumulate function from the purrr package is used to calculate the z values; the right_join function from the dplyr package and the fill function from the tidyr package are used to further process the format.
library(tidyverse)
df2 <- df %>%
filter(x %% 2 != 0) %>%
mutate(z = accumulate(c(z, x[-1] - 1), `-`)) %>%
right_join(df, by = c("x", "y")) %>%
mutate(z = lag(z), z = ifelse(is.na(z), 0, z)) %>%
mutate(y = cumsum(z)) %>%
select(-z)
df2
# x y
# 1 1 0
# 2 2 10
# 3 3 10
# 4 4 18
# 5 5 18
I have an example dataframe
df <- data.frame(cust = sample(1:100, 1000, TRUE),
channel = sample(c("WEB", "POS"), 1000, TRUE))
that I'm trying to mutate
get_channels <- function(data) {
  d <- data
  if(unique(d) %>% length() == 2){
    d <- "Both"
  } else {
    if(unique(d) %>% length() < 2 && unique(d) == "WEB") {
      d <- "Web"
    } else {
      d <- "POS"
    }
  }
  return(d)
}
This works without issue, and on small dataframes it takes no time at all.
start.time <- Sys.time()
df %>%
group_by(cust) %>%
mutate(chan = get_channels(channel)) %>%
group_by(cust) %>%
slice(1) %>%
group_by(chan) %>%
summarize(count = n()) %>%
mutate(perc = count/sum(count))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
Time difference of 0.34602 secs
However, when the data frame gets rather large, say on the order of 1,000,000 or more cust values, my basic if/else function takes much, much longer.
How can I streamline this function to make it run more quickly?
You should use a data.table for this.
setDT(df)
t1 = Sys.time()
df = df[ , .(channels = ifelse(uniqueN(channel) == 2, "both", as.character(channel[1]))), by = .(cust)]
> Sys.time() - t1
Time difference of 0.00500083 secs
> head(df)
cust channels
1: 37 both
2: 45 both
3: 74 both
4: 20 both
5: 1 both
6: 68 both
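To get the share of customers per channel (the perc column computed in the question), a small follow-on step in the same data.table style could be:
df[, .(count = .N), by = channels][, perc := count / sum(count)][]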
You can do it in base R using something like this:
web_cust <- unique(df$cust[df$channel=="WEB"])
pos_cust <- unique(df$cust[df$channel=="POS"])
both <- length(intersect(web_cust, pos_cust))
web_only <- length(setdiff(web_cust, pos_cust))
pos_only <- length(setdiff(pos_cust, web_cust))
Data:
set.seed(1)
df <- data.frame(cust = sample(2e6, 1e7, TRUE),
channel = sample(c("WEB", "POS"), 1e7, TRUE),
stringsAsFactors = F)
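To turn the counts above into the shares the question reports (count/sum(count)), a short sketch along the same lines:
counts <- c(both = both, web_only = web_only, pos_only = pos_only)
counts / sum(counts)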
A faster dplyr version that takes about 1/3 the time, but is probably still slower than the data.table version. uniqueN() is borrowed from @Kristoferson's answer.
df %>%
group_by(cust) %>%
summarize(chan = if_else(uniqueN(channel) == 2, "Both", as.character(channel[1]))) %>%
group_by(chan) %>%
summarize(n = n() ) %>%
mutate(perc = n /sum(n))
Also, your original approach can be improved significantly by optimizing your function like this:
get_channels <- function(data) {
  ud <- unique(data)
  udl <- length(ud)
  if(udl == 2) {
    r <- "Both"
  } else {
    if(udl < 2 && ud == "WEB") {
      r <- "Web"
    } else {
      r <- "POS"
    }
  }
  return(r)
}
And some timings...
I tried three different alternatives in both dplyr and data.table: (1) ifelse (see @Kristofersen's answer), (2) if / else (because the test is of length 1), and (3) vector indexing. Unsurprisingly, the main difference is between dplyr and data.table, not among alternatives 1-3.
For 1000 customers, data.table is about 7 times faster. For 10000 customers it's about 30 times faster. For 1e6 customers I only tested data.table; there is not a very large difference between the alternatives.
# 1000 customers, 2*1000 registrations
df <- data.frame(cust = sample(1e3, 2e3, replace = TRUE),
channel = sample(c("WEB", "POS"), 2e3, TRUE))
library(microbenchmark)
library(dplyr)
library(data.table)
microbenchmark(dp1 = df %>%
group_by(cust) %>%
summarise(res = ifelse(n_distinct(channel) == 1, channel[1], "both")),
dp2 = df %>%
group_by(cust) %>%
summarise(res = if(n_distinct(channel) == 1) channel[1] else "both"),
dp3 = df %>%
group_by(cust) %>%
summarise(res = c("both", channel[1])[(n_distinct(channel) == 1) + 1]),
dt1 = setDT(df)[ , .(channels = ifelse(uniqueN(channel) == 2, "both", channel[1])), by = cust],
dt2 = setDT(df)[ , .(channels = if(uniqueN(channel) == 2) "both" else channel[1]), by = cust],
dt3 = setDT(df)[ , .(res = c("both", channel[1])[(uniqueN(channel) == 1) + 1]), by = cust],
times = 5, unit = "relative")
# 1e3 customers
# Unit: relative
# expr min lq mean median uq max neval
# dp1 7.8985477 8.176139 7.9355234 7.676534 8.0359975 7.9166933 5
# dp2 7.8882707 8.018000 7.8965098 8.731935 7.8414478 7.3560530 5
# dp3 8.0851402 8.934831 7.7540060 7.653026 6.8305012 7.6887950 5
# dt1 1.1713088 1.180870 1.0350482 1.209861 1.0523597 0.7650059 5
# dt2 0.8272681 1.223387 0.9311628 1.047773 0.9028017 0.7795579 5
# dt3 1.0000000 1.000000 1.0000000 1.000000 1.0000000 1.0000000 5
# 1e4 customers
# Unit: relative
# expr min lq mean median uq max neval
# dp1 40.8725204 39.5297108 29.5755838 38.996075 38.246103 17.2784642 5
# dp2 40.7396141 39.4299918 27.4476811 38.819577 37.886320 12.7265756 5
# dp3 41.0940358 39.7819673 27.5532964 39.260488 38.317899 12.4685386 5
# dt1 1.0905470 1.0661613 0.7422082 1.053786 1.034642 0.3428945 5
# dt2 0.9052739 0.9008761 1.2813458 2.111642 2.356008 0.9005391 5
# dt3 1.0000000 1.0000000 1.0000000 1.000000 1.000000 1.0000000 5
# 1e6 customers, data.table only
# Unit: relative
# expr min lq mean median uq max neval
# dt1 1.146757 1.147152 1.155497 1.164471 1.156244 1.161660 5
# dt2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5
# dt3 1.084442 1.079734 1.253568 1.106833 1.098766 1.799935 5