Summarizing with data.table R - multiple mathematical operations and conditions - r

I want to summarize a table creating new columns using different mathematical operations and conditions.
I am using data.table because I am used to this package but I accept recommendations on different ones if any (maybe dplyr?).
this is an example of data frame:
id <- c(rep("A", 6), rep("B", 6), rep("C",6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30,6))
lon <- c(rep(0, 6), rep(180, 6), rep(270,6))
hight <- c(rep(seq(0,100, 20),3))
var1 <- rnorm(18, 50, 50)
df <- data.frame(id, lat, lon, hight, var1)
setDT(df)
beside the typical mathematical operations, such as mean, sd, and median, I would like to create a new column showing the value of var1 at a specific condition, such as hight == 0, 100, etc..
df.new <- df[, .(
"var1_avg" = mean(var1, na.rm = T),
"var1_sd" = sd(var1, na.rm = T),
"var1_median" = median(var1, na.rm = T),
"var1_min" = min(var1),
#here I have the problems:
"var1_0" =df[which(hight == 0),
"var1"],
"var1_100" =df[which(hight == 100),
"var1"]
), by = c("lat", "lon")]
I understand the concept of the error:
Error in `[.data.table`(df, , .(var1_avg = mean(var1, na.rm = T), var1_sd = sd(var1, :
All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards.
But I do not find an efficient solution to get my df.new

Here is a data.table version that seems more efficient than the proposed tidyverse approach:
library(data.table)
set.seed(123)
id <- c(rep("A", 6), rep("B", 6), rep("C",6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30,6))
lon <- c(rep(0, 6), rep(180, 6), rep(270,6))
hight <- c(rep(seq(0,100, 20),3))
var1 <- rnorm(18, 50, 50)
df <- data.table(id, lat, lon, hight, var1, key=c("lat", "lon"))
df[, .(
"var1_avg" = mean(var1, na.rm = T),
"var1_sd" = sd(var1, na.rm = T),
"var1_median" = median(var1, na.rm = T),
"var1_min" = min(var1),
"var1_0"= var1[hight==0],
"var1_100"= var1[hight==100]
), by = c("lat", "lon")]
#> lat lon var1_avg var1_sd var1_median var1_min var1_0 var1_100
#> 1: -30 270 52.28133 62.36118 62.78635 -48.33086 70.03857 -48.33086
#> 2: 45 0 72.35764 47.75012 54.99490 21.97622 21.97622 135.75325
#> 3: 50 180 47.06030 45.22337 47.85380 -13.25306 73.04581 67.99069
Created on 2022-04-04 by the reprex package (v2.0.1)

This will calculate the summary statistics e.g. mean or sd for every point (lat, lon) regardless of hight:
library(tidyverse)
id <- c(rep("A", 6), rep("B", 6), rep("C", 6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30, 6))
lon <- c(rep(0, 6), rep(180, 6), rep(270, 6))
hight <- c(rep(seq(0, 100, 20), 3))
var1 <- rnorm(18, 50, 50)
df <- data.frame(id, lat, lon, hight, var1)
df %>%
group_by(lat, lon) %>%
summarise(
var1_avg = mean(var1, na.rm = TRUE),
var1_sd = sd(var1, na.rm = TRUE),
var1_median = median(var1, na.rm = TRUE)
) %>%
left_join(
df %>% filter(hight == 100) %>% transmute(lat, lon, var1_100 = var1)
) %>%
left_join(
df %>% filter(hight == 0) %>% transmute(lat, lon, var1_0 = var1)
)
#> `summarise()` has grouped output by 'lat'. You can override using the `.groups`
#> argument.
#> Joining, by = c("lat", "lon")
#> Joining, by = c("lat", "lon")
#> # A tibble: 3 × 7
#> # Groups: lat [3]
#> lat lon var1_avg var1_sd var1_median var1_100 var1_0
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -30 270 90.6 67.0 81.6 181. 5.51
#> 2 45 0 43.3 40.5 49.6 36.6 -30.1
#> 3 50 180 34.9 47.0 25.3 24.6 0.705
Created on 2022-04-04 by the reprex package (v2.0.0)

Related

how to sum every row within date range in variables

df1:
library(tidyverse)
library(lubridate)
ex1 <- tibble(date = seq.Date(from = ymd('20200101'), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
df2:
cal_c <- tibble(variable = c('a', 'b', 'c','d'),
start = c(ymd('20200101', '20200103', '20200203', '20200103')),
end = c(ymd('20200204', '20200405', '20200301', '20200401')),
total = c('NA', 'NA', 'NA', 'NA'))
I want to calc every row in df2 within the date range in the start and end based on df1, say a$toal between '2020-1-1' to '2020-2-4', b$total between '2020-1-3' to '2020-4-5', any help, thanks a lot.
We can create a sequence of start and end dates for cal_c data, get ex1 in long format and join. We can then sum value for each variable.
library(tidyverse)
cal_c %>%
mutate(date = map2(start, end, seq, by = 'day')) %>%
unnest(date) %>%
left_join(ex1 %>% pivot_longer(cols = -date, names_to = 'variable'),
by = c('variable', 'date')) %>%
group_by(variable, start, end) %>%
summarise(value = sum(value, na.rm = TRUE))
# variable start end value
# <chr> <date> <date> <dbl>
#1 a 2020-01-01 2020-02-04 34.3
#2 b 2020-01-03 2020-04-05 136.
#3 c 2020-02-03 2020-03-01 79.5
#4 d 2020-01-03 2020-04-01 4909.
Base R Solution:
cal_c$total <- sapply(split(cal_c, rownames(cal_c)), function(x){
sum(ex1[((ex1$date >= x$start) & (ex1$date <= x$end)), match(x$variable, names(ex1))])})
An option using data.table:
cal_c[, total :=
ex1[cal_c, on=.(date>=start, date<=end), by=.EACHI,
sum(.SD[[variable]])]$V1
]
output:
variable start end total
1: a 2020-01-01 2020-02-04 34.04780
2: b 2020-01-03 2020-04-05 135.40290
3: c 2020-02-03 2020-03-01 91.10271
4: d 2020-01-03 2020-04-01 4978.59884
data:
set.seed(0L)
library(data.table)
ex1 <- data.table(date = seq.Date(from = as.IDate('20200101', format="%Y%m%d"), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
cal_c <- data.table(variable = c('a', 'b', 'c','d'),
start = as.IDate(c('20200101', '20200103', '20200203', '20200103'), format="%Y%m%d"),
end = as.IDate(c('20200204', '20200405', '20200301', '20200401'), format="%Y%m%d"))

How to use R and dplyr to paste column name as value for lookup

I am using the output coefficients from a glm regression model and I need to create a lookup value, using key paste ([column name].[Factor Level], and then return the corresponding value from another data table. The column names must be dynamic so that I don't have to explicitly name each column one by one.
The returned values from the lookup are then multiplied by 1 (for factors) or by the actual numeric values and all coef_colnames summed into column Total.
I've done some example in excel but cannot replicate it in R.
var_Factor1 combines the column name and the factor level from each row (using paste) to build a key for the next step lookup
var_Number1 is just the column name as it is numeric and has no factor levels
library(dplyr)
# original data
dt = data.table(
Factor1 = c("A","B","C"),
Number1 = c(10, 20,40),
Factor2 = c("D","H","N"),
Number2 = c(2, 5,3)
)
# Lookup table
model_coef = data.table(
Factor1.A = 10,
Factor1.B = 20,
Factor1.C = 30,
Factor2.D = 40,
Factor2.H = 50,
Factor2.N = 60,
Number1 = 200,
Number2 = 500
)
#initial steps
dt <- dt %>% mutate (
var_Factor1 = paste("Factor1", Factor1, sep =".")
, var_Number1 = "Number1"
, var_Factor2 = paste("Factor2", Factor2, sep =".")
, var_Number2 = "Number2"
) %>% mutate (
coef_Factor1 = model_coef[,var_Factor1]
)
#The final output should produce (as replicated from Excel)
final_output = data.table (
Factor1= c("A", "B", "C"),
Number1= c(10, 20, 40),
Factor2= c("D", "H", "N"),
Number2= c(2, 5, 3),
var_Factor1= c("Factor1.A", "Factor1.B", "Factor1.C"),
var_Number1= c("Number1", "Number1", "Number1"),
var_Factor2= c("Factor2.D", "Factor2.H", "Factor2.N"),
var_Number2= c("Number2", "Number2", "Number2"),
coef_Factor1= c(10, 20, 30),
coef_Number1= c(200, 200, 200),
coef_Factor2= c(40, 50, 60),
coef_Number2= c(500, 500, 500),
calc_Factor1= c(10, 20, 30),
calc_Number1= c(2000, 4000, 8000),
calc_Factor2= c(40, 50, 60),
calc_Number2= c(1000, 2500, 1500),
Total= c(3050, 6570, 9590)
)
It's generally a bad idea to try to generate and manipulate dynamic columns.
It will probably be better to use tidy data conventions and make the data "long". Also, it looks like you're trying to mix data.table and dplyr/tidyverse. In particular, this doesn't work: mutate (coef_Factor1 = model_coef[,var_Factor1]
I've tidied your data and modified your code to use dplyr/tidyverse below:
using tibble instead of data.table
re-built lookup table to tidy-long format so it can be left_joined
properly to your table
used mutate to do the calculations that you describe
Beyond your example, if you have more than 2 "Numbers"/"Factors" (your naming/labeling/numbering is confusing btw), there are ways to generalize further so that the code multiplies coef * number generically, for each "number"/combination. Also, your data implies but it isn't clear that A is related to D, B is related to H, etc.
library(tidyverse)
data <- tibble(Factor1 = c("A","B","C"),Number1 = c(10, 20,40),Factor2 = c("D","H","N"),Number2 = c(2, 5,3))
model_coef <- tibble(Factor1.A = 10,Factor1.B = 20,Factor1.C = 30,Factor2.D = 40,Factor2.H = 50,Factor2.N = 60,Number1 = 200,Number2 = 500)
(model_coef_factor1 <- model_coef %>%
select(Factor1.A:Factor1.C) %>%
pivot_longer(cols = everything(), names_to = c("number", "factor"), names_sep = "[.]", values_to = "coef_factor1") %>%
select(-number))
#> # A tibble: 3 x 2
#> factor coef_factor1
#> <chr> <dbl>
#> 1 A 10
#> 2 B 20
#> 3 C 30
(model_coef_factor2 <- model_coef %>%
select(Factor2.D:Factor2.N) %>%
pivot_longer(cols = everything(), names_to = c("number", "factor"), names_sep = "[.]", values_to = "coef_factor2") %>%
select(-number))
#> # A tibble: 3 x 2
#> factor coef_factor2
#> <chr> <dbl>
#> 1 D 40
#> 2 H 50
#> 3 N 60
(final_output <- data %>%
left_join(model_coef_factor1, by = c("Factor1"="factor")) %>%
left_join(model_coef_factor2, by = c("Factor2"="factor")) %>%
mutate(coef_number1 = model_coef$Number1,
coef_number2 = model_coef$Number2,
calc_factor1 = coef_factor1,
calc_number1 = Number1 * coef_number1,
calc_factor2 = coef_factor2,
calc_number2 = Number2 * coef_number2,
total = calc_factor1 + calc_number1 + calc_factor2 + calc_number2) %>%
select(total, everything()))
#> # A tibble: 3 x 13
#> total Factor1 Number1 Factor2 Number2 coef_factor1 coef_factor2
#> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 3050 A 10 D 2 10 40
#> 2 6570 B 20 H 5 20 50
#> 3 9590 C 40 N 3 30 60
#> # ... with 6 more variables: coef_number1 <dbl>, coef_number2 <dbl>,
#> # calc_factor1 <dbl>, calc_number1 <dbl>, calc_factor2 <dbl>,
#> # calc_number2 <dbl>
Created on 2019-10-23 by the reprex package (v0.3.0)

Using the pipe in selfmade function with tidyeval (quo_name)

I have two functions: date_diff and group_stat. So I have read this article tidyverse and I try so create simple functions and use the pipe.
The first function creates a difftime and names them timex_minus_timey but when I pipe this result into the next function I have to look at the name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
tibble::tibble(Hosp = rep("A", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
tibble::tibble(Hosp = rep("B", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
out
}
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_function(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way you could add an attribute containing the new column's name with date_diff, and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
attr(out, 'date_diff_nm') <- name
out
}
group_stat <- function(df, group_var, summary_var, .f) {
if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
summary_var <- attr(df, 'date_diff_nm')
group_var <- rlang::enquo(group_var)
name <- paste0(summary_var, "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9

R: Tidy Aggregation of Sequence Data and Visualization of Stepfunctions

I have some patient data, where the individual patients change treatment groups over time. My goal is to visualize the sequence of group changes and aggregate this data into a "sequence profile" for each treatment group.
For each treatment group I would like to show, when it generally occurs
in the treatment cycle (say rather in the beginning or in the end). To account for the differing sequence length, I would like to standardize these profiles betweenn 0 (very beginning) and 1 (end).
I would like to find an efficient data preparation and visualization.
Mininmal Example
Structure of Data
library(dplyr)
library(purrr)
library(ggplot2)
# minimal data
cj_df_raw <- tibble::tribble(
~id, ~group,
1, "A",
1, "B",
2, "A",
2, "B",
2, "A"
)
# compute "intervals" for each person [start, end]
cj_df_raw %>%
group_by(id) %>%
mutate(pos = row_number(),
len = length(id),
start = (pos - 1) / len,
end = pos / len) %>%
filter(group == "A")
#> # A tibble: 3 x 6
#> # Groups: id [2]
#> id group pos len start end
#> <dbl> <chr> <int> <int> <dbl> <dbl>
#> 1 1 A 1 2 0 0.5
#> 2 2 A 1 3 0 0.333
#> 3 2 A 3 3 0.667 1
(So Id 1 was in group A in the first 50% of their sequence, and Id 2 was in Group A in the first 33% and the last 33% of their sequence. This means, that 2 Ids where between 0-33% of the sequence, 1 between 33-50%, 0 between 50-66% and 1 above 66%.)
This is the result I would like to achieve and I miss a chance to transform my data effectively.
Desired outcome
profile_treatmen_a <- tibble::tribble(
~x, ~y,
0, 0L,
0.33, 2L,
0.5, 1L,
0.66, 0L,
1, 1L,
1, 0L
)
profile_treatmen_a %>%
ggplot(aes(x, y)) +
geom_step(direction = "vh") +
expand_limits(x = c(0, 1), y = 0)
(Ideally the area under the curve would be shaded)
Ideal solution: via ggridges
The goal of the visualization would be to compare the "sequence-profile" of many treatment-groups at the same time. If I could prepare the data accordingly, I would like to use the ggridges-package for a striking visual comparison the treatment groups.
library(ggridges)
data.frame(group = rep(letters[1:2], each=20),
mean = rep(2, each=20)) %>%
mutate(count = runif(nrow(.))) %>%
ggplot(aes(x=count, y=group, fill=group)) +
geom_ridgeline(stat="binline", binwidth=0.5, scale=0.9)
You could build helper intervals and then just plot a histogram. Since each patient is either in Group A or B both groups sum up to 100%. With these helper intervals you could also easily switch to other geoms.
library(tidyverse, warn.conflicts = FALSE)
library(ggplot2)
# create sample data
set.seed(42)
id <- 1:10 %>% map(~ rep(x = .x, times = runif(n = 1, min = 1, max = 6))) %>%
unlist()
group <- sample(x = c("A", "B"), size = length(id), replace = TRUE) %>%
as_factor()
df <- tibble(id, group)
glimpse(df)
#> Observations: 37
#> Variables: 2
#> $ id <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5,...
#> $ group <fct> A, B, B, A, A, B, B, A, A, B, B, A, B, B, A, B, A, B, A,...
# tidy data
df <- df %>%
group_by(id) %>%
mutate(from = (row_number() - 1) / n(),
to = row_number() / n()) %>%
ungroup() %>%
rowwise() %>%
mutate(list = seq(from + 1/60, to, 1/60) %>% list()) %>%
unnest()
# plot
df %>%
ggplot(aes(x = list, fill = group)) +
geom_histogram(binwidth = 1/60) +
ggthemes::theme_hc()
Created on 2018-09-16 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
My attempt at an answer.. although it is probably not the nicest/fastest/most efficient way, I think it might help you in your efforts.
library(data.table)
# compute "intervals" for each person [start, end]
df <- cj_df_raw %>%
group_by(id) %>%
mutate(pos = row_number(),
len = length(id),
from = (pos - 1) / len,
to = pos / len,
value = 1)
dt <- as.data.table(df)
setkey(dt, from, to)
#create intervals
dt.interval <- data.table(from = seq( from = 0, by = 0.01, length.out = 100),
to = seq( from = 0.01, by = 0.01, length.out = 100))
#perform overlap join on intervals
dt2 <- foverlaps( dt.interval, dt, type = "within", nomatch = NA)[, sum(value), by = c("i.from", "group")]
#some melting ans casting to fill in '0' on empty intervals
dt3 <- melt( dcast(dt2, ... ~ group, fill = 0), id.vars = 1 )
#plot
ggplot( dt3 ) +
geom_step( aes( x = i.from, y = value, color = variable ) ) +
facet_grid( .~variable )

Calculate Group Mean and Overall Mean

I have a data frame with several variables I want to get the means of and a variable I want to group by. Then, I would like to get the proportion of each group's mean to the overall mean.
I have put together the following, but it is clumsy.
How would you go about it using dplyr or data.table? Bonus points for the option to return both the intermediate step (group and overall mean) and the final proportions.
library(tidyverse)
set.seed(1)
Data <- data.frame(
X1 = sample(1:10),
X2 = sample(11:20),
X3 = sample(21:30),
Y = sample(c("yes", "no"), 10, replace = TRUE)
)
groupMeans <- Data %>%
group_by(Y) %>%
summarize_all(funs(mean))
overallMeans <- Data %>%
select(-Y) %>%
summarize_all(funs(mean))
index <- sweep(as.matrix(groupMeans[, -1]), MARGIN = 2, as.matrix(overallMeans), FUN = "/")
here is one more dplyr solution
index <- as.data.frame(Data %>%
group_by(Y) %>%
summarise_all(mean) %>%
select(-Y) %>%
rbind(Data %>% select(-Y) %>% summarise_all(mean))%>%
mutate_all(funs( . / .[3])))[1:2,]
Here is one possible dplyr solution that contains everything you want:
Data %>%
group_by(Y) %>%
summarise(
group_avg_X1 = mean(X1),
group_avg_X2 = mean(X2),
group_avg_X3 = mean(X3)
) %>%
mutate(
overall_avg_X1 = mean(group_avg_X1),
overall_avg_X2 = mean(group_avg_X2),
overall_avg_X3 = mean(group_avg_X3),
proportion_X1 = group_avg_X1 / overall_avg_X1,
proportion_X2 = group_avg_X2 / overall_avg_X2,
proportion_X3 = group_avg_X3 / overall_avg_X3
)
# # A tibble: 2 x 10
# Y group_avg_X1 group_avg_X2 group_avg_X3 overall_avg_X1 overall_avg_X2 overall_avg_X3 proportion_X1
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 no 6.6 14.6 25.8 5.5 15.5 25.5 1.2
# 2 yes 4.4 16.4 25.2 5.5 15.5 25.5 0.8
# # ... with 2 more variables: proportion_X2 <dbl>, proportion_X3 <dbl>
Here's a method with data.table:
#data
library(data.table)
set.seed(1)
dt <- data.table(
x1 = sample(1:10),
x2 = sample(11:20),
x3 = sample(21:30),
y = sample(c("yes", "no"), 10, replace = TRUE)
)
# group means
group_means <- dt[ , lapply(.SD, mean), by=y, .SDcols=1:3]
# overall means
overall_means <- dt[ , lapply(.SD, mean), .SDcols=1:3]
# clunky combination (sorry!)
group_means[ , perc_x1 := x1 / overall_means[[1]] ]
group_means[ , perc_x2 := x2 / overall_means[[2]] ]
group_means[ , perc_x3 := x3 / overall_means[[3]] ]

Resources