apply a custom function across certain columns in a dataframe in R

I have the following dataframe:
library(tidyverse)
library(lubridate)
date_data1 <- data.frame(
name = c('groupA'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data2 <- data.frame(
name = c('groupB'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data <- bind_rows(date_data1, date_data2)
I want to apply the following function to date1 through date4 columns:
mad <- function(x, y) abs(mean(x - y, na.rm = TRUE))
However, I want to retain the "name" identifier.
I have asked a similar question in the past and the solution worked. However, when attempting to adapt the code, I'm running into issues.
Here's what I thought should work, based on the previous post.
apply(date_data[, 3:6], function(x) mad(date_data[,7], x))
In other words, I'm attempting to find the mean absolute difference (the custom function "mad") between column 7 ("date5") and columns 3 through 6 (i.e. "date1" through "date4") for each group. The goal is a new dataframe that gives the mean absolute difference for each of the date columns (1-4), with two rows: one for groupA and one for groupB.
I tried mapping the function, but I get an error that "arguments imply differing number of rows."
Here's the code for the map() that does not work:
date_data_test <- date_data %>%
group_by(name) %>%
map_at(c(3:6), function(x) mad(date_data[,7], x)) %>%
data.frame()
Any suggestions are appreciated. Thank you.

Using the across function from dplyr:
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
date_data1 <- data.frame(
name = c('groupA'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data2 <- data.frame(
name = c('groupB'),
number = as.numeric(c(1:10)),
date1 = seq(from = ymd('2019-07-01'), to = ymd('2019-07-10'), by='days'),
date2 = seq(from = ymd('2019-07-02'), to = ymd('2019-07-11'), by='days'),
date3 = seq(from = ymd('2019-06-29'), to = ymd('2019-07-08'), by='days'),
date4 = seq(from = ymd('2019-07-03'), to = ymd('2019-07-12'), by='days'),
date5 = seq(from = ymd('2019-07-05'), to = ymd('2019-07-14'), by='days')
) %>%
mutate(yday = yday(date5))
date_data <- bind_rows(date_data1, date_data2) %>%
as_tibble()
date_data %>%
group_by(name) %>%
summarise(across(
.cols = 2:5,
.fns = ~ abs(mean(interval(.x, date5) %/% days(1))),
.names = "diff_{.col}_date5"
))
#> # A tibble: 2 × 5
#> name diff_date1_date5 diff_date2_date5 diff_date3_date5 diff_date4_date5
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 groupA 4 3 6 2
#> 2 groupB 4 3 6 2
Created on 2021-11-11 by the reprex package (v2.0.1)
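The same per-group summary can also be written without lubridate's interval()/days() helpers. A minimal sketch, assuming plain difftime arithmetic on the Date columns is acceptable; it produces the same diff_* columns as above:
date_data %>%
  group_by(name) %>%
  summarise(across(
    date1:date4,
    # .x - date5 is a difftime in days; convert, average, then take abs()
    ~ abs(mean(as.numeric(.x - date5, units = "days"))),
    .names = "diff_{.col}_date5"
  ))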

Related

Summarizing with data.table R - multiple mathematical operations and conditions

I want to summarize a table, creating new columns using different mathematical operations and conditions.
I am using data.table because I am used to this package, but I am open to recommendations for other packages (maybe dplyr?).
Here is an example data frame:
id <- c(rep("A", 6), rep("B", 6), rep("C",6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30,6))
lon <- c(rep(0, 6), rep(180, 6), rep(270,6))
hight <- c(rep(seq(0,100, 20),3))
var1 <- rnorm(18, 50, 50)
df <- data.frame(id, lat, lon, hight, var1)
setDT(df)
Besides the typical mathematical operations, such as mean, sd, and median, I would like to create new columns showing the value of var1 under a specific condition, such as hight == 0, hight == 100, etc.
df.new <- df[, .(
  "var1_avg" = mean(var1, na.rm = T),
  "var1_sd" = sd(var1, na.rm = T),
  "var1_median" = median(var1, na.rm = T),
  "var1_min" = min(var1),
  # here I have the problems:
  "var1_0" = df[which(hight == 0), "var1"],
  "var1_100" = df[which(hight == 100), "var1"]
), by = c("lat", "lon")]
I understand the concept of the error:
Error in `[.data.table`(df, , .(var1_avg = mean(var1, na.rm = T), var1_sd = sd(var1, :
All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards.
But I cannot find an efficient solution to get my df.new.
Here is a data.table version that seems more efficient than the proposed tidyverse approach:
library(data.table)
set.seed(123)
id <- c(rep("A", 6), rep("B", 6), rep("C",6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30,6))
lon <- c(rep(0, 6), rep(180, 6), rep(270,6))
hight <- c(rep(seq(0,100, 20),3))
var1 <- rnorm(18, 50, 50)
df <- data.table(id, lat, lon, hight, var1, key=c("lat", "lon"))
df[, .(
"var1_avg" = mean(var1, na.rm = T),
"var1_sd" = sd(var1, na.rm = T),
"var1_median" = median(var1, na.rm = T),
"var1_min" = min(var1),
"var1_0"= var1[hight==0],
"var1_100"= var1[hight==100]
), by = c("lat", "lon")]
#> lat lon var1_avg var1_sd var1_median var1_min var1_0 var1_100
#> 1: -30 270 52.28133 62.36118 62.78635 -48.33086 70.03857 -48.33086
#> 2: 45 0 72.35764 47.75012 54.99490 21.97622 21.97622 135.75325
#> 3: 50 180 47.06030 45.22337 47.85380 -13.25306 73.04581 67.99069
Created on 2022-04-04 by the reprex package (v2.0.1)
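One caveat with the subsets above: if a (lat, lon) group had no row with hight == 0 or hight == 100 (or had several), var1[hight == 0] would change length and break the grouped result. A sketch using match() keeps every value length one, returning NA when the level is absent:
df[, .(
  var1_avg = mean(var1, na.rm = TRUE),
  var1_0   = var1[match(0, hight)],    # NA if the group has no hight == 0 row
  var1_100 = var1[match(100, hight)]   # first matching row only
), by = c("lat", "lon")]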
This will calculate the summary statistics (e.g. mean or sd) for every point (lat, lon), regardless of hight:
library(tidyverse)
id <- c(rep("A", 6), rep("B", 6), rep("C", 6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30, 6))
lon <- c(rep(0, 6), rep(180, 6), rep(270, 6))
hight <- c(rep(seq(0, 100, 20), 3))
var1 <- rnorm(18, 50, 50)
df <- data.frame(id, lat, lon, hight, var1)
df %>%
group_by(lat, lon) %>%
summarise(
var1_avg = mean(var1, na.rm = TRUE),
var1_sd = sd(var1, na.rm = TRUE),
var1_median = median(var1, na.rm = TRUE)
) %>%
left_join(
df %>% filter(hight == 100) %>% transmute(lat, lon, var1_100 = var1)
) %>%
left_join(
df %>% filter(hight == 0) %>% transmute(lat, lon, var1_0 = var1)
)
#> `summarise()` has grouped output by 'lat'. You can override using the `.groups`
#> argument.
#> Joining, by = c("lat", "lon")
#> Joining, by = c("lat", "lon")
#> # A tibble: 3 × 7
#> # Groups: lat [3]
#> lat lon var1_avg var1_sd var1_median var1_100 var1_0
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -30 270 90.6 67.0 81.6 181. 5.51
#> 2 45 0 43.3 40.5 49.6 36.6 -30.1
#> 3 50 180 34.9 47.0 25.3 24.6 0.705
Created on 2022-04-04 by the reprex package (v2.0.0)
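The two left_join() calls can also be folded into summarise() itself. A sketch (dplyr 1.0+ for .groups), assuming each (lat, lon) group has at most one row per hight level:
df %>%
  group_by(lat, lon) %>%
  summarise(
    var1_avg    = mean(var1, na.rm = TRUE),
    var1_sd     = sd(var1, na.rm = TRUE),
    var1_median = median(var1, na.rm = TRUE),
    var1_100    = var1[hight == 100][1],  # [1] guards against missing or duplicate matches
    var1_0      = var1[hight == 0][1],
    .groups = "drop"
  )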

Calculate Mean for Each Unique Value up to a certain date

Data for my example.
date1 = seq(as.Date("2019/01/01"), by = "month", length.out = 48)
date2 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date3 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date4 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date = c(date1,date2,date3,date4)
subproducts1=rep("1",48)
subproducts2=rep("2",48)
subproductsx=rep("x",48)
subproductsy=rep("y",48)
b1 <- c(rnorm(48,5))
b2 <- c(rnorm(48,5))
b3 <-c(rnorm(48,5) )
b4 <- c(rnorm(48,5))
dfone <- data.frame(
  "date" = date,
  "subproduct" = c(subproducts1, subproducts2, subproductsx, subproductsy),
  "actuals" = c(b1, b2, b3, b4)
)
This adds the missing Jan 2019 rows (for the subproducts built from date2, date3, and date4) with an actuals value of 0.
dfone <-dfone %>%
complete(date = seq.Date(from = min(date), to = as.Date('2021-06-01'), by = 'month'),
nesting(subproduct), fill = list(actuals = 0))
QUESTION: The code below calculates the mean for each unique subproduct and replaces the 0's with that mean, but how do I apply a hard cutoff so the mean is based only on Jan 2019 to Dec 2020, rather than Jan 2019 to Dec 2022?
library(dplyr)
dfone_new <- dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0], na.rm = TRUE))) %>%
ungroup
We need one more logical condition when subsetting 'actuals': the 'date' should be between January 2019 and December 2020 while calculating the mean:
library(dplyr)
library(tidyr)
dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0 &
between(date, as.Date("2019-01-01"), as.Date("2020-12-31"))],
na.rm = TRUE)))
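If the cutoff may change, the window can be pulled out into arguments. A sketch wrapping the same replacement in a helper (the function name here is made up for illustration):
library(dplyr)

# Hypothetical helper: replace zero actuals with the per-subproduct mean of
# the non-zero actuals that fall inside [win_start, win_end]
fill_zero_with_window_mean <- function(data, win_start, win_end) {
  data %>%
    group_by(subproduct) %>%
    mutate(actuals = replace(
      actuals, actuals == 0,
      mean(actuals[actuals != 0 & between(date, win_start, win_end)], na.rm = TRUE)
    )) %>%
    ungroup()
}

dfone_new <- fill_zero_with_window_mean(dfone, as.Date("2019-01-01"), as.Date("2020-12-31"))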

how to sum every row within date range in variables

df1:
library(tidyverse)
library(lubridate)
ex1 <- tibble(date = seq.Date(from = ymd('20200101'), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
df2:
cal_c <- tibble(variable = c('a', 'b', 'c','d'),
start = c(ymd('20200101', '20200103', '20200203', '20200103')),
end = c(ymd('20200204', '20200405', '20200301', '20200401')),
total = c('NA', 'NA', 'NA', 'NA'))
I want to calculate, for every row in df2, the sum of the corresponding variable in df1 within the date range given by start and end: for example, a's total over '2020-01-01' to '2020-02-04', b's total over '2020-01-03' to '2020-04-05', and so on. Any help is appreciated, thanks a lot.
We can create a sequence of dates between start and end for the cal_c data, get ex1 into long format, and join. We can then sum the values for each variable.
library(tidyverse)
cal_c %>%
mutate(date = map2(start, end, seq, by = 'day')) %>%
unnest(date) %>%
left_join(ex1 %>% pivot_longer(cols = -date, names_to = 'variable'),
by = c('variable', 'date')) %>%
group_by(variable, start, end) %>%
summarise(value = sum(value, na.rm = TRUE))
# variable start end value
# <chr> <date> <date> <dbl>
#1 a 2020-01-01 2020-02-04 34.3
#2 b 2020-01-03 2020-04-05 136.
#3 c 2020-02-03 2020-03-01 79.5
#4 d 2020-01-03 2020-04-01 4909.
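If the goal is to fill the total column of cal_c rather than build a separate summary, the result can be joined back. A sketch, assuming the summarised tibble above is stored as sums (with the sum renamed to total):
sums <- cal_c %>%
  mutate(date = map2(start, end, seq, by = 'day')) %>%
  unnest(date) %>%
  left_join(ex1 %>% pivot_longer(cols = -date, names_to = 'variable'),
            by = c('variable', 'date')) %>%
  group_by(variable, start, end) %>%
  summarise(total = sum(value, na.rm = TRUE), .groups = 'drop')

cal_c %>%
  select(-total) %>%
  left_join(sums, by = c('variable', 'start', 'end'))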
Base R Solution:
cal_c$total <- sapply(split(cal_c, rownames(cal_c)), function(x) {
  sum(ex1[(ex1$date >= x$start) & (ex1$date <= x$end),
          match(x$variable, names(ex1))])
})
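The same row-wise sums can also be computed by looping over row indices, which avoids the split(). A minimal base-R sketch:
cal_c$total <- sapply(seq_len(nrow(cal_c)), function(i) {
  in_range <- ex1$date >= cal_c$start[i] & ex1$date <= cal_c$end[i]
  sum(ex1[[cal_c$variable[i]]][in_range])
})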
An option using data.table:
cal_c[, total :=
ex1[cal_c, on=.(date>=start, date<=end), by=.EACHI,
sum(.SD[[variable]])]$V1
]
output:
variable start end total
1: a 2020-01-01 2020-02-04 34.04780
2: b 2020-01-03 2020-04-05 135.40290
3: c 2020-02-03 2020-03-01 91.10271
4: d 2020-01-03 2020-04-01 4978.59884
data:
set.seed(0L)
library(data.table)
ex1 <- data.table(date = seq.Date(from = as.IDate('20200101', format="%Y%m%d"), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
cal_c <- data.table(variable = c('a', 'b', 'c','d'),
start = as.IDate(c('20200101', '20200103', '20200203', '20200103'), format="%Y%m%d"),
end = as.IDate(c('20200204', '20200405', '20200301', '20200401'), format="%Y%m%d"))

How to put/save all elements of a List into one Excel sheet in R?

I have a list (bbb) with 5 elements, one per year (2010, 2011, ..., 2014):
The first one in the list is this:
> bbb[1]
$`2010`
Date Average
X2010.01.01 2010-01-01 2.079090e-03
X2010.01.02 2010-01-02 5.147627e-04
X2010.01.03 2010-01-03 2.997464e-04
X2010.01.04 2010-01-04 1.375538e-04
X2010.01.05 2010-01-05 1.332109e-04
The second one in the list is this:
> bbb[2]
$`2011`
Date Average
X2011.01.01 2011-01-01 1.546253e-03
X2011.01.02 2011-01-02 1.152864e-03
X2011.01.03 2011-01-03 1.752446e-03
X2011.01.04 2011-01-04 2.639658e-03
X2011.01.05 2011-01-05 5.231150e-03
X2011.01.06 2011-01-06 8.909878e-04
And so on.
Here is my question:
How can I save all of this list's elements in one sheet of an Excel file, with one column per year?
Your help would be highly appreciated.
You can do this using dcast.
bbb <- list(`2010` = data.frame(date = as.Date("2010-01-01") + 0:4,
avg = 1:5),
`2011` = data.frame(date = as.Date("2011-01-01") + 0:5,
avg = 11:16),
`2012` = data.frame(date = as.Date("2012-01-01") + 0:9,
avg = 21:30),
`2013` = data.frame(date = as.Date("2013-01-01") + 0:7,
avg = 21:28))
df <- do.call("rbind", bbb)
df$year <- format(df$date, format = "%Y")
df$month_date <- format(df$date, format = "%b-%d")
library(data.table)
library(openxlsx)
df_dcast <- dcast(df, month_date~year, value.var = "avg")
write.xlsx(df_dcast, "example1.xlsx")
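Depending on the data.table version, dcast() may warn, redirect, or error when handed a plain data.frame; converting with setDT() first avoids the ambiguity (a sketch of the same reshape):
setDT(df)  # make df a data.table so dcast() dispatches to dcast.data.table()
df_dcast <- dcast(df, month_date ~ year, value.var = "avg")
write.xlsx(df_dcast, "example1.xlsx")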
Or using spread
library(dplyr)
library(tidyr)
df2 <- df %>%
select(-date) %>%
spread(key = year, value = avg)
write.xlsx(df2, "example2.xlsx")
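spread() has since been superseded in tidyr; a pivot_wider() sketch of the same reshape (example3.xlsx is just a parallel file name):
df3 <- df %>%
  select(-date) %>%
  pivot_wider(names_from = year, values_from = avg)
write.xlsx(df3, "example3.xlsx")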
This isn't very pretty, but it's the best I could think of right now: loop through the list of data frames and join them by date, like this:
library(tidyverse)
library(lubridate)
bbb <- list(`2010` = tibble(date = c('01-01-2010', '01-02-2010', '01-03-2010', '01-04-2010', '01-05-2010'),
average = 11:15),
`2011` = tibble(date = c('01-01-2011', '01-02-2011', '01-03-2011', '01-04-2011', '01-05-2011'),
average = 1:5),
`2012` = tibble(date = c('01-01-2012', '01-02-2012', '01-03-2012', '01-04-2012', '01-05-2012'),
average = 6:10))
for (i in seq_along(bbb)) {
  if (i == 1) {
    df <- bbb[[i]] %>%
      mutate(
        date = paste(day(as.Date(date, format = '%m-%d-%Y')),
                     month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
                     sep = '-')
      )
    colnames(df) <- c('date', names(bbb[i])) # Assuming your list of dataframes has just 2 columns: date and average
  } else {
    join_df <- bbb[[i]] %>%
      mutate(
        date = paste(day(as.Date(date, format = '%m-%d-%Y')),
                     month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
                     sep = '-')
      )
    colnames(join_df) <- c('date', names(bbb[i]))
    df <- full_join(df, join_df, by = 'date')
  }
}
This loops through the list of data frames, reformats the dates to day-month, and joins them, giving:
# A tibble: 5 x 4
date `2010` `2011` `2012`
<chr> <int> <int> <int>
1 1-Jan 11 1 6
2 2-Jan 12 2 7
3 3-Jan 13 3 8
4 4-Jan 14 4 9
5 5-Jan 15 5 10
You could then write that out with write_xlsx() from the writexl package.
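For instance (the file name here is arbitrary):
library(writexl)
write_xlsx(df, "years_in_one_sheet.xlsx")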

Using the pipe in a self-made function with tidyeval (quo_name)

I have two functions: date_diff and group_stat. I have read an article on programming with the tidyverse and am trying to create simple functions that work with the pipe.
The first function creates a difftime column and names it timex_minus_timey, but when I pipe the result into the next function I have to look up that name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
tibble::tibble(Hosp = rep("A", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
tibble::tibble(Hosp = rep("B", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
out
}
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_function(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way, you could have date_diff add an attribute containing the new column's name and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
attr(out, 'date_diff_nm') <- name
out
}
group_stat <- function(df, group_var, summary_var, .f) {
if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
summary_var <- attr(df, 'date_diff_nm')
group_var <- rlang::enquo(group_var)
name <- paste0(summary_var, "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9
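Note that funs() and summarise_at() used above have since been retired/superseded in dplyr. A sketch of the same group_stat() idea with across(), assuming summary_var arrives as the column-name string stored in the attribute:
group_stat <- function(df, group_var, summary_var, .f) {
  if (!is.null(attr(df, "date_diff_nm")) && missing(summary_var))
    summary_var <- attr(df, "date_diff_nm")
  fn_name <- deparse(substitute(.f))   # e.g. "mean"
  df %>%
    dplyr::group_by({{ group_var }}) %>%
    dplyr::summarise(dplyr::across(
      dplyr::all_of(summary_var),
      ~ .f(.x, na.rm = TRUE),
      .names = paste0("{.col}_", fn_name)
    ), .groups = "drop")
}

data %>%
  date_diff(time2, time1) %>%
  group_stat(Hosp, .f = mean)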
