Calculate the median per week in R

I would like to calculate the median of my PV variable per weekday. In other words, looking at the PV output below, you can see, for example, that it contains three Fridays, so the Friday median should be computed from those three Fridays.
Thanks!
library(dplyr)
df <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-04-07","2021-04-09","2021-04-10","2021-04-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
DTPE = c("Ho","Ho","Ho","Ho","","","","","","","","","","","","","","","","Ho","Ho"),
D1 = c(8,1,9, 3,5,4,7,6,3,8,2,3,4,6,7,8,8,6,16,2,3), DR01 = c(4,1,4,3,3,4,3,6,3,7,2,3,4,6,7,8,9,2,6,7,3),
DR02 = c(4,1,4,3,3,4,1,6,3,7,6,6,4,6,7,8,4,2,6,2,3), DR03 = c(7,5,4,3,6,4,1,5,3,6,2,3,4,9,7,8,4,2,6,4,3),
DR04= c(9,5,6,7,3,2,7,4,2,1,5,3,4,6,7,8,4,7,7,4,3),DR05 = c(9,5,4,3,3,7,1,5,3,7,2,3,4,7,7,8,4,2,6,4,3)),
class = "data.frame", row.names = c(NA, -21L))
df <- df %>%
  group_by(Id, date1, Week) %>%
  select(D1:DR05) %>%
  summarise_all(sum)
x <- subset(df, select = DR01:DR05)
x <- cbind(df, setNames(df$D1 - x, paste0(names(x), "_PV")))
PV <- select(x, date1, Week, ends_with("PV"))
PV
Id date1 Week DR01_PV DR02_PV DR03_PV DR04_PV DR05_PV
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2021-04-02 Friday 7 9 8 14 7
2 1 2021-04-03 Saturday 0 -7 -3 -3 -1
3 1 2021-04-07 Friday 4 4 4 -1 4
4 1 2021-04-08 Thursday -1 4 4 4 4
5 1 2021-04-09 Friday 10 10 10 9 10
6 1 2021-04-10 Saturday -5 0 -2 -2 -2
7 1 2021-07-01 Thursday 9 9 2 -6 0

x %>%
  group_by(Week) %>%
  summarize(across(ends_with("_PV"), median))
# # A tibble: 3 x 6
# Week DR01_PV DR02_PV DR03_PV DR04_PV DR05_PV
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Friday 7 9 8 9 7
# 2 Saturday -2.5 -3.5 -2.5 -2.5 -1.5
# 3 Thursday 4 6.5 3 -1 2
If you want to combine all PV columns into a single median per weekday, one way is
PV %>%
  ungroup() %>%
  select(Week, ends_with("PV")) %>%
  tidyr::pivot_longer(-Week) %>%
  group_by(Week) %>%
  summarize(Med = median(value))
# # A tibble: 3 x 2
# Week Med
# <chr> <dbl>
# 1 Friday 8
# 2 Saturday -2
# 3 Thursday 4
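For reference, the same per-weekday medians can also be computed in base R; this is just a minimal sketch using the PV data frame built above:
# base R: one median per Week for each *_PV column
aggregate(cbind(DR01_PV, DR02_PV, DR03_PV, DR04_PV, DR05_PV) ~ Week,
          data = PV, FUN = median)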

Related

How to adjust the select() inside a function

Could you help me solve the problem below? As you can see in the second part of the code, I exclude the DR columns whose values are all equal to 0. However, in the third part of the code, I need to select from D1 up to the last DR column so the sums can be computed, but it gives an error. Could you help me fix it?
library(dplyr)
df1 <- structure(
list(date1 = c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-04-02","2021-04-02","2021-04-03","2021-04-08","2021-04-09","2021-04-10","2021-07-01","2021-07-02","2021-07-03","2021-07-03"),
Week= c("Friday","Friday","Saturday","Thursday","Friday","Saturday","Thursday","Friday","Saturday","Saturday"),
D1 = c(2,3,4,4,6,3,4,5,6,2), DR01 = c(4,1,4,3,3,4,3,6,3,2), DR02= c(4,2,6,7,3,2,7,4,4,3),DR03= c(9,5,4,3,3,2,1,5,4,3),
DR04 = c(5,4,3,3,3,6,2,1,9,2),DR05 = c(5,4,5,3,6,2,1,9,3,4),
DR06 = c(2,4,4,3,3,5,6,7,8,3),DR07 = c(2,5,4,4,9,4,7,8,3,3),
DR08 = c(0,0,0,0,1,2,0,0,0,0),DR09 = c(0,0,0,0,0,0,0,0,0,0),DR010 = c(0,0,0,0,0,0,0,0,0,0),DR011 = c(0,4,0,0,0,0,0,0,0,0), DR012 = c(0,0,0,0,0,0,0,0,0,0)),
class = "data.frame", row.names = c(NA, -10L))
df1 <- df1 %>%
  select(!where(~ is.numeric(.) && all(. == 0)))
df1 <- df1 %>%
  group_by(date1, date2, Week) %>%
  select(D1:DR012) %>%
  summarise_all(sum)
We can do the select before the group_by:
library(dplyr)
df1 %>%
  select(date1, date2, Week, matches("^D")) %>%
  group_by(date1, date2, Week) %>%
  summarise(across(everything(), sum), .groups = 'drop')
-output
# A tibble: 8 × 13
date1 date2 Week D1 DR01 DR02 DR03 DR04 DR05 DR06 DR07 DR08 DR011
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-06-28 2021-04-02 Friday 5 5 6 14 9 9 6 7 0 4
2 2021-06-28 2021-04-03 Saturday 4 4 6 4 3 5 4 4 0 0
3 2021-06-28 2021-04-08 Thursday 4 3 7 3 3 3 3 4 0 0
4 2021-06-28 2021-04-09 Friday 6 3 3 3 3 6 3 9 1 0
5 2021-06-28 2021-04-10 Saturday 3 4 2 2 6 2 5 4 2 0
6 2021-06-28 2021-07-01 Thursday 4 3 7 1 2 1 6 7 0 0
7 2021-06-28 2021-07-02 Friday 5 6 4 5 1 9 7 8 0 0
8 2021-06-28 2021-07-03 Saturday 8 5 7 7 11 7 11 6 0 0
After we did the select, it is not clear why we have to select again. It is not really needed, as summarise() with across() can take everything() other than the grouping columns:
df1 %>%
  select(!where(~ is.numeric(.) && all(. == 0))) %>%
  group_by(across(date1:Week)) %>%
  summarise(across(everything(), sum), .groups = 'drop')
# A tibble: 8 × 13
date1 date2 Week D1 DR01 DR02 DR03 DR04 DR05 DR06 DR07 DR08 DR011
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-06-28 2021-04-02 Friday 5 5 6 14 9 9 6 7 0 4
2 2021-06-28 2021-04-03 Saturday 4 4 6 4 3 5 4 4 0 0
3 2021-06-28 2021-04-08 Thursday 4 3 7 3 3 3 3 4 0 0
4 2021-06-28 2021-04-09 Friday 6 3 3 3 3 6 3 9 1 0
5 2021-06-28 2021-04-10 Saturday 3 4 2 2 6 2 5 4 2 0
6 2021-06-28 2021-07-01 Thursday 4 3 7 1 2 1 6 7 0 0
7 2021-06-28 2021-07-02 Friday 5 6 4 5 1 9 7 8 0 0
8 2021-06-28 2021-07-03 Saturday 8 5 7 7 11 7 11 6 0 0
We could use summarise with across:
library(dplyr)
df1 %>%
  select(!where(~ is.numeric(.) && all(. == 0))) %>%
  group_by(date1, date2, Week) %>%
  summarise(across(where(is.numeric), sum))
# A tibble: 8 × 13
# Groups: date1, date2 [8]
date1 date2 Week D1 DR01 DR02 DR03 DR04 DR05 DR06 DR07 DR08 DR011
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-06-28 2021-04-02 Friday 5 5 6 14 9 9 6 7 0 4
2 2021-06-28 2021-04-03 Saturday 4 4 6 4 3 5 4 4 0 0
3 2021-06-28 2021-04-08 Thursday 4 3 7 3 3 3 3 4 0 0
4 2021-06-28 2021-04-09 Friday 6 3 3 3 3 6 3 9 1 0
5 2021-06-28 2021-04-10 Saturday 3 4 2 2 6 2 5 4 2 0
6 2021-06-28 2021-07-01 Thursday 4 3 7 1 2 1 6 7 0 0
7 2021-06-28 2021-07-02 Friday 5 6 4 5 1 9 7 8 0 0
8 2021-06-28 2021-07-03 Saturday 8 5 7 7 11 7 11 6 0 0
DR012 is filtered out, so it no longer exists to be selected:
df1 %>%
  select(!where(~ is.numeric(.) && all(. == 0))) %>%
  names()
[1] "date1" "date2" "Week" "D1" "DR01" "DR02" "DR03" "DR04" "DR05"
[10] "DR06" "DR07" "DR08" "DR011"
Change your code to
df1 %>%
  group_by(date1, date2, Week) %>%
  select(D1:DR011) %>%
  summarise_all(sum)
or
df1 %>%
  group_by(date1, date2, Week) %>%
  select(starts_with("D")) %>%
  summarise_all(sum)
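If you would rather not hard-code the name of the last remaining DR column at all, another sketch is to let last_col() span whatever the final column happens to be after the all-zero columns are dropped:
library(dplyr)
df1 %>%
  select(!where(~ is.numeric(.) && all(. == 0))) %>%   # drop the all-zero DR columns
  group_by(date1, date2, Week) %>%
  summarise(across(D1:last_col(), sum), .groups = "drop")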

How to exclude a month from the mean and standard deviation calculation in R

The code below generates a scatter plot with three horizontal lines, which correspond to the mean, the mean plus one standard deviation, and the mean minus one standard deviation. At the moment, all of the dates in my dataset are used to calculate these three values.
However, I would like to exclude the month of April when calculating the mean and standard deviation. How could I do that?
Executable code below:
library(dplyr)
library(tidyr)
library(lubridate)
data <- structure(
list(Id=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
date1 = c("2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20","2021-06-20",
"2021-06-20","2021-06-20","2021-06-20","2021-06-20"),
date2 = c("2021-07-01","2021-07-01","2021-07-01","2021-07-01","2021-04-02",
"2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-02","2021-04-03",
"2021-04-03","2021-04-03","2021-04-03","2021-04-03","2021-04-08","2021-04-08",
"2021-07-09","2021-07-09","2021-07-10","2021-07-10"),
Week= c("Thursday","Thursday","Thursday","Thursday","Friday","Friday","Friday","Friday",
"Friday","Friday","Saturday","Saturday","Saturday","Saturday","Saturday","Thursday",
"Thursday","Friday","Friday","Saturday","Saturday"),
DTPE = c("Ho","Ho","Ho","Ho","","","","","","","","","","","","","","","","Ho","Ho"),
D1 = c(8,1,9, 3,5,4,7,6,3,8,2,3,4,6,7,8,4,2,6,2,3), DR01 = c(4,1,4,3,3,4,3,6,3,7,2,3,4,6,7,8,4,2,6,7,3),
DR02 = c(8,1,4,3,3,4,1,6,3,7,2,3,4,6,7,8,4,2,6,2,3), DR03 = c(7,5,4,3,3,4,1,5,3,3,2,3,4,6,7,8,4,2,6,4,3),
DR04= c(4,5,6,7,3,2,7,4,2,1,2,3,4,6,7,8,4,2,6,4,3),DR05 = c(9,5,4,3,3,2,1,5,3,7,2,3,4,7,7,8,4,2,6,4,3)),
class = "data.frame", row.names = c(NA, -21L))
graph <- function(dt, dta = data) {
  dim_data <- dim(data)
  day <- c(seq.Date(from = as.Date(data$date2[1]), by = "days",
                    length = dim_data[1]))
  data_grouped <- data %>%
    mutate(across(starts_with("date"), as.Date)) %>%
    group_by(date2) %>%
    summarise(Id = first(Id),
              date1 = first(date1),
              Week = first(Week),
              DTPE = first(DTPE),
              D1 = sum(D1)) %>%
    select(Id, date1, date2, Week, DTPE, D1)
  data_grouped %>%
    mutate(DTPE = na_if(DTPE, ""))
  df_OC <- subset(data_grouped, DTPE == "")
  ds_CO <- df_OC %>% filter(weekdays(date2) %in% weekdays(as.Date(dt)))
  mean <- mean(ds_CO$D1)
  sd <- sd(ds_CO$D1)
  dta %>%
    filter(date2 == ymd(dt)) %>%
    summarize(across(starts_with("DR"), sum)) %>%
    pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
    mutate(name = as.numeric(name)) %>%
    plot(xlab = "Days", ylab = "Number", xlim = c(0, 45), cex = 1.5, cex.lab = 1.5,
         cex.axis = 1.5, cex.main = 2, cex.sub = 2, lwd = 2.5,
         ylim = c((min(.$val) %/% 10) * 15, (max(.$val) %/% 10 + 1) * 100))
  abline(h = mean, col = 'blue')
  abline(h = (mean + sd), col = 'green', lty = 2)
  abline(h = (mean - sd), col = 'orange', lty = 2)
}
graph("2021-07-10", data)
data %>%
  filter("04" != format(as.Date(date2), format = "%m"))
# Id date1 date2 Week DTPE D1 DR01 DR02 DR03 DR04 DR05
# 1 1 2021-06-20 2021-07-01 Thursday Ho 8 4 8 7 4 9
# 2 1 2021-06-20 2021-07-01 Thursday Ho 1 1 1 5 5 5
# 3 1 2021-06-20 2021-07-01 Thursday Ho 9 4 4 4 6 4
# 4 1 2021-06-20 2021-07-01 Thursday Ho 3 3 3 3 7 3
# 5 1 2021-06-20 2021-07-09 Friday 2 2 2 2 2 2
# 6 1 2021-06-20 2021-07-09 Friday 6 6 6 6 6 6
# 7 1 2021-06-20 2021-07-10 Saturday Ho 2 7 2 4 4 4
# 8 1 2021-06-20 2021-07-10 Saturday Ho 3 3 3 3 3 3
I recommend you permanently make date1 and date2 proper Date objects in the frame instead of converting them every time you do something. While the conversion is relatively inexpensive, it's also unnecessary, and the consequence of forgetting it might be subtle differences in the results (i.e., treating the date as a categorical variable rather than continuous/discrete-ordinal).
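A minimal sketch of that recommendation, assuming the data frame from the question: convert the date columns once, up front, so everything downstream can rely on them being Date objects.
library(dplyr)
# both columns are ISO-formatted strings, so as.Date() parses them directly
data <- data %>%
  mutate(across(c(date1, date2), as.Date))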
You already use lubridate, so you could apply the month() function from the lubridate package:
data %>%
  filter(month(date2) != 4)
Id date1 date2 Week DTPE D1 DR01 DR02 DR03 DR04 DR05
1 1 2021-06-20 2021-07-01 Thursday Ho 8 4 8 7 4 9
2 1 2021-06-20 2021-07-01 Thursday Ho 1 1 1 5 5 5
3 1 2021-06-20 2021-07-01 Thursday Ho 9 4 4 4 6 4
4 1 2021-06-20 2021-07-01 Thursday Ho 3 3 3 3 7 3
5 1 2021-06-20 2021-07-09 Friday 2 2 2 2 2 2
6 1 2021-06-20 2021-07-09 Friday 6 6 6 6 6 6
7 1 2021-06-20 2021-07-10 Saturday Ho 2 7 2 4 4 4
8 1 2021-06-20 2021-07-10 Saturday Ho 3 3 3 3 3 3
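To feed this into the mean and standard deviation used for the horizontal lines, a minimal sketch could look like the following (data_no_april, mean_D1 and sd_D1 are hypothetical names, not part of the original code):
library(dplyr)
library(lubridate)
data_no_april <- data %>%
  filter(month(date2) != 4)          # drop April before computing the statistics
mean_D1 <- mean(data_no_april$D1)    # these would replace `mean` and `sd`
sd_D1 <- sd(data_no_april$D1)        # inside graph() for the abline() calls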
Using substr:
subset(data, substr(date2, 6, 7) != '04')
-output
Id date1 date2 Week DTPE D1 DR01 DR02 DR03 DR04 DR05
1 1 2021-06-20 2021-07-01 Thursday Ho 8 4 8 7 4 9
2 1 2021-06-20 2021-07-01 Thursday Ho 1 1 1 5 5 5
3 1 2021-06-20 2021-07-01 Thursday Ho 9 4 4 4 6 4
4 1 2021-06-20 2021-07-01 Thursday Ho 3 3 3 3 7 3
18 1 2021-06-20 2021-07-09 Friday 2 2 2 2 2 2
19 1 2021-06-20 2021-07-09 Friday 6 6 6 6 6 6
20 1 2021-06-20 2021-07-10 Saturday Ho 2 7 2 4 4 4
21 1 2021-06-20 2021-07-10 Saturday Ho 3 3 3 3 3 3

R, count non-missing dates in dataframe, return count as column

I have this data:
Times<-structure(list(record_id = c(1, 2, 3, 4, 5, 6), Date1 = structure(c(17385,
17959, 17267, 17204, 17063, 18436), class = "Date"), Date2 = structure(c(17689,
18001, NA, 17255, 17076, 18471), class = "Date"), Date3 = structure(c(NA,
NA, NA, NA, 18052, 18499), class = "Date")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
And what I'd like to do is count the number of dates in each row that are present (not missing) and return that as a count in a new column.
The actual data frame has many more columns in between each target column, so I'd like to be able to explicitly name the columns I'm counting.
Any ideas?
I am not sure about your second requirement, but please check if this works for you:
library(tidyr)
library(dplyr)
Times %>%
  pivot_longer(cols = starts_with('Date'), names_to = 'Date') %>%
  group_by(record_id) %>%
  mutate(Count = sum(!is.na(value))) %>%
  pivot_wider(id_cols = c(record_id, Count), names_from = Date, values_from = value)
# A tibble: 6 x 5
# Groups: record_id [6]
record_id Count Date1 Date2 Date3
<dbl> <int> <date> <date> <date>
1 1 2 2017-08-07 2018-06-07 NA
2 2 2 2019-03-04 2019-04-15 NA
3 3 1 2017-04-11 NA NA
4 4 2 2017-02-07 2017-03-30 NA
5 5 3 2016-09-19 2016-10-02 2019-06-05
6 6 3 2020-06-23 2020-07-28 2020-08-25
With rowSums you can do -
cols <- c('Date1', 'Date2', 'Date3')
Times$Count <- rowSums(!is.na(Times[cols]))
Times
# A tibble: 6 x 5
# record_id Date1 Date2 Date3 Count
# <dbl> <date> <date> <date> <dbl>
#1 1 2017-08-07 2018-06-07 NA 2
#2 2 2019-03-04 2019-04-15 NA 2
#3 3 2017-04-11 NA NA 1
#4 4 2017-02-07 2017-03-30 NA 2
#5 5 2016-09-19 2016-10-02 2019-06-05 3
#6 6 2020-06-23 2020-07-28 2020-08-25 3
If the columns have some pattern you can use dplyr select helper functions.
library(dplyr)
Times %>% mutate(Count = rowSums(!is.na(select(., starts_with('Date')))))
Another base R option using rowSums over the whole frame (subtracting 1 to account for the never-missing record_id column) -
> Times$Count <- rowSums(!is.na(Times)) - 1
> Times
# A tibble: 6 x 5
record_id Date1 Date2 Date3 Count
<dbl> <date> <date> <date> <dbl>
1 1 2017-08-07 2018-06-07 NA 2
2 2 2019-03-04 2019-04-15 NA 2
3 3 2017-04-11 NA NA 1
4 4 2017-02-07 2017-03-30 NA 2
5 5 2016-09-19 2016-10-02 2019-06-05 3
6 6 2020-06-23 2020-07-28 2020-08-25 3
We can use rowwise() with c_across():
library(dplyr)
Times %>%
  rowwise %>%
  mutate(Count = sum(!is.na(c_across(starts_with('Date'))))) %>%
  ungroup
# A tibble: 6 x 5
record_id Date1 Date2 Date3 Count
<dbl> <date> <date> <date> <int>
1 1 2017-08-07 2018-06-07 NA 2
2 2 2019-03-04 2019-04-15 NA 2
3 3 2017-04-11 NA NA 1
4 4 2017-02-07 2017-03-30 NA 2
5 5 2016-09-19 2016-10-02 2019-06-05 3
6 6 2020-06-23 2020-07-28 2020-08-25 3
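One more sketch that names the columns explicitly but avoids rowwise(): across() hands the selected date columns to rowSums() inside a single mutate(). The column selection here is an assumption matching the example data.
library(dplyr)
Times %>%
  mutate(Count = rowSums(!is.na(across(c(Date1, Date2, Date3)))))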

Finding the differences of paired columns using dplyr

set.seed(3)
library(dplyr)
dat <- tibble(Measure = c("Height","Weight","Width","Length"),
AD1_1= rpois(4,10),
AD1_2= rpois(4,9),
AD2_1= rpois(4,10),
AD2_2= rpois(4,9),
AD3_1= rpois(4,10),
AD3_2= rpois(4,9),
AD4_1= rpois(4,10),
AD4_2= rpois(4,9),
AD5_1= rpois(4,10),
AD5_2= rpois(4,9),
AD6_1= rpois(4,10),
AD6_2= rpois(4,9))
Suppose I have data that looks like this. I wish to calculate the difference within each AD pair of underscore-numbered columns, i.e., AD1diff, AD2diff, AD3diff, and so on.
Instead of writing
dat %>%
  mutate(AD1diff = AD1_1 - AD1_2,
         AD2diff = AD2_1 - AD2_2,
         ...)
what would be an efficient way to write this?
One dplyr option could be:
dat %>%
  mutate(across(ends_with("_1"), .names = "{col}_diff") - across(ends_with("_2"))) %>%
  rename_with(~ sub("_\\d+", "", .), ends_with("_diff"))
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1 AD6_2 AD1_diff AD2_diff AD3_diff AD4_diff AD5_diff AD6_diff
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Height 6 10 10 3 12 8 7 5 7 5 8 9 -4 7 4 2 2 -1
2 Weight 8 9 13 6 14 7 8 7 13 11 10 9 -1 7 7 1 2 1
3 Width 10 9 11 5 12 8 7 11 9 5 5 6 1 6 4 -4 4 -1
4 Length 8 9 8 7 8 13 8 7 6 11 14 6 -1 1 -5 1 -5 8
The "tidy" way to do this would be to convert your data from wide to long, do a grouped subtraction, and then go back to wide format:
library(tidyr)
dat_long = dat %>% pivot_longer(
  cols = starts_with("AD"),
  names_sep = "_",
  names_to = c("group", "obs")
)
dat_long %>% head
# # A tibble: 48 x 4
# Measure group obs value
# <chr> <chr> <chr> <int>
# 1 Height AD1 1 6
# 2 Height AD1 2 10
# 3 Height AD2 1 10
# 4 Height AD2 2 3
# 5 Height AD3 1 12
# 6 Height AD3 2 8
dat_long %>%
  group_by(Measure, group) %>%
  summarize(diff = value[obs == 1] - value[obs == 2]) %>%
  pivot_wider(names_from = "group", values_from = "diff") %>%
  rename_with(.fn = ~ paste0(., "diff"), .cols = starts_with("AD"))
# # A tibble: 4 x 7
# # Groups: Measure [4]
# Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
# <chr> <int> <int> <int> <int> <int> <int>
# 1 Height -4 7 4 2 2 -1
# 2 Length -1 1 -5 1 -5 8
# 3 Weight -1 7 7 1 2 1
# 4 Width 1 6 4 -4 4 -1
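For comparison, a plain base R sketch of the same pairing logic, assuming every column ending in _1 has a matching _2 partner:
ones <- grep("_1$", names(dat), value = TRUE)   # AD1_1, AD2_1, ...
twos <- sub("_1$", "_2", ones)                  # AD1_2, AD2_2, ...
# data frame arithmetic subtracts the paired columns position by position
dat[paste0(sub("_1$", "", ones), "diff")] <- dat[ones] - dat[twos]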
Here is a data.table option
setDT(dat)[
,
paste0(
unique(gsub("_\\d+", "", names(dat)[-1])),
"diff"
) := lapply(
split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
function(x) do.call("-", x)
)
]
which gives
> dat
Measure AD1_1 AD1_2 AD2_1 AD2_2 AD3_1 AD3_2 AD4_1 AD4_2 AD5_1 AD5_2 AD6_1
1: Height 6 10 10 3 12 8 7 5 7 5 8
2: Weight 8 9 13 6 14 7 8 7 13 11 10
3: Width 10 9 11 5 12 8 7 11 9 5 5
4: Length 8 9 8 7 8 13 8 7 6 11 14
AD6_2 AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: 9 -4 7 4 2 2 -1
2: 9 -1 7 7 1 2 1
3: 6 1 6 4 -4 4 -1
4: 6 -1 1 -5 1 -5 8
or
setDT(dat)[
,
c(.(Measure = Measure), setNames(lapply(
split.default(.SD[, -1], gsub("_\\d+", "", names(dat)[-1])),
function(x) do.call("-", x)
), paste0(
unique(gsub("_\\d+", "", names(dat)[-1])),
"diff"
)))
]
gives
Measure AD1diff AD2diff AD3diff AD4diff AD5diff AD6diff
1: Height -4 7 4 2 2 -1
2: Weight -1 7 7 1 2 1
3: Width 1 6 4 -4 4 -1
4: Length -1 1 -5 1 -5 8
Use the tidyverse package tidyr to rearrange your data before mutating.
require(dplyr)
require(tidyr)
#> Loading required package: tidyr
First, tidyr::pivot_longer the data frame so that there's a separate row for every column:
new_dat <-
  pivot_longer(dat, cols = starts_with("AD"),            # For columns whose names start with 'AD'...
               names_sep = "_",                          # separate columns using '_' in colname
               names_to = c("AD_number", "observation")) %>%
  arrange(AD_number, Measure, observation)
head(new_dat, 9)
#> # A tibble: 9 x 4
#> Measure AD_number observation value
#> <chr> <chr> <chr> <int>
#> 1 Height AD1 1 6
#> 2 Height AD1 2 10
#> 3 Length AD1 1 8
#> 4 Length AD1 2 9
#> 5 Weight AD1 1 8
#> 6 Weight AD1 2 9
#> 7 Width AD1 1 10
#> 8 Width AD1 2 9
#> 9 Height AD2 1 10
Then, use tidyr::pivot_wider (the functional opposite of pivot_longer) to make a separate column for each value in observation. This will be very compatible with the upcoming mutate operation.
new_dat <-
  pivot_wider(new_dat,
              names_from = observation,
              values_from = value,
              names_prefix = "value_")
head(new_dat, 5)
#> # A tibble: 5 x 4
#> Measure AD_number value_1 value_2
#> <chr> <chr> <int> <int>
#> 1 Height AD1 6 10
#> 2 Length AD1 8 9
#> 3 Weight AD1 8 9
#> 4 Width AD1 10 9
#> 5 Height AD2 10 3
Finally, mutate the data:
new_dat <-
  mutate(new_dat, diff = value_1 - value_2)
head(new_dat, 4)
#> # A tibble: 4 x 5
#> Measure AD_number value_1 value_2 diff
#> <chr> <chr> <int> <int> <int>
#> 1 Height AD1 6 10 -4
#> 2 Length AD1 8 9 -1
#> 3 Weight AD1 8 9 -1
#> 4 Width AD1 10 9 1
Created on 2021-01-22 by the reprex package (v0.3.0)
Getting back to your original data format is possible, but it might not make the data any easier to work with:
rename(new_dat,
       c(`1` = "value_1", `2` = "value_2")) %>%
  pivot_wider(names_from = AD_number,
              values_from = c(`1`, `2`, diff),
              names_glue = "{AD_number}_{.value}") %>%
  {.[, order(names(.))]} %>%
  relocate(Measure)

R - (Tidyverse) Compress multiple observations into one

I have a dataset that has multiple variables, two of which are dates (start date, end date). Sometimes a date interval has been split into sequences so for example you would have:
Start: 1990-12-12, Stop: 1990-12-13
Start: 1990-12-13, Stop: 1990-12-14
Rather than
Start: 1990-12-12, Stop: 1990-12-14
What I want to do is isolate these chains of sequences and collapse each one into a single observation, keeping the values from the last row of the chain (except the start date, which should come from the first row). Below is a basic example:
library(tidyverse)
library(lubridate)
tib_ex <- tibble(
id = rep(1,5),
date1 = ymd(c('1990-11-05', '1990-12-01',
'1990-12-05', '1990-12-08',
'1990-12-15')),
date2 = ymd(c('1990-11-28', '1990-12-05',
'1990-12-08', '1990-12-12',
'1990-12-31')),
var1 = 2:6,
var2 = 7:11,
var3 = 12:16,
var4 = c(0, 1, 0 ,0, 1)
)
This yields the following tibble:
# A tibble: 5 x 7
id date1 date2 var1 var2 var3 var4
<dbl> <date> <date> <int> <int> <int> <dbl>
1 1 1990-11-05 1990-11-28 2 7 12 0
2 1 1990-12-01 1990-12-05 3 8 13 1
3 1 1990-12-05 1990-12-08 4 9 14 0
4 1 1990-12-08 1990-12-12 5 10 15 0
5 1 1990-12-15 1990-12-31 6 11 16 1
Which I want to transform into the following tibble:
# A tibble: 3 x 7
id date1 date2 var1 var2 var3 var4
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 1990-11-05 1990-11-28 2 7 12 0
2 1 1990-12-01 1990-12-12 5 10 15 0
3 1 1990-12-15 1990-12-31 6 11 16 1
I thought about nesting by id, date1 and date2, which packs the rest of the variables into a tibble per row and makes them easy to overwrite; I just don't know how to efficiently collapse the dates from row 2 to row 4.
I've also tried creating a binary variable that tracks whether the end date of one observation matches the start date of the following observation, but I'm running into difficulties there as well.
Find the rows with start and end dates by comparing with the next / previous row and combine the result in a suitable way:
date_info <-
  tib_ex %>%
  ## find indices of start and end dates by comparing with the date in the next / previous row
  mutate(is_startdate = date1 != lag(date2),
         is_enddate = date2 != lead(date1)) %>%
  ## NA's appear at the beginning (start date) and end (end date) and should thus be interpreted as TRUE
  replace_na(list(is_startdate = TRUE, is_enddate = TRUE))

## combine the start and end dates
date_info %>%
  filter(is_enddate) %>%
  mutate(date1 = date_info$date1[date_info$is_startdate]) %>%
  select(-starts_with("is_"))
-------
# A tibble: 3 x 7
id date1 date2 var1 var2 var3 var4
<dbl> <date> <date> <int> <int> <int> <dbl>
1 1.00 1990-11-05 1990-11-28 2 7 12 0
2 1.00 1990-12-01 1990-12-12 5 10 15 0
3 1.00 1990-12-15 1990-12-31 6 11 16 1.00
Here is a different approach which will also work if the dataset contains more than one id. According to the OP's expected result, the additional variables var1 to var4 are aggregated/summarized by picking the value at the end of each collapsed period.
The approach below
uses cumsum() and lag() to identify rows which belong to one period,
uses summarize() to collapse the start and end dates,
and joins with the original dataset to pick the values at the end of each collapsed period.
The last step avoids having to include all the additional variables in the call to summarize().
tib_ex %>%
  arrange(id, date1, date2) %>% # this is important!
  group_by(id) %>%
  mutate(period = cumsum(lag(date2, default = date1[1]) < date1)) %>%
  right_join(
    (.) %>% group_by(id, period) %>%
      summarize(date1 = first(date1), date2 = last(date2)),
    by = c("id", "period", "date2"), suffix = c("", ".y")) %>%
  select(-period, -date1.y)
# A tibble: 3 x 7
# Groups: id [1]
id date1 date2 var1 var2 var3 var4
<dbl> <date> <date> <int> <int> <int> <dbl>
1 1 1990-11-05 1990-11-28 2 7 12 0
2 1 1990-12-08 1990-12-12 5 10 15 0
3 1 1990-12-15 1990-12-31 6 11 16 1
Here is a test showing that the approach also works for multiple ids:
tib_ex %>%
  bind_rows(
    (.) %>% mutate(id = 2))
which duplicates the OP's dataset as id = 2:
# A tibble: 10 x 7
id date1 date2 var1 var2 var3 var4
<dbl> <date> <date> <int> <int> <int> <dbl>
1 1 1990-11-05 1990-11-28 2 7 12 0
2 1 1990-12-01 1990-12-05 3 8 13 1
3 1 1990-12-05 1990-12-08 4 9 14 0
4 1 1990-12-08 1990-12-12 5 10 15 0
5 1 1990-12-15 1990-12-31 6 11 16 1
6 2 1990-11-05 1990-11-28 2 7 12 0
7 2 1990-12-01 1990-12-05 3 8 13 1
8 2 1990-12-05 1990-12-08 4 9 14 0
9 2 1990-12-08 1990-12-12 5 10 15 0
10 2 1990-12-15 1990-12-31 6 11 16 1
tib_ex %>%
  bind_rows(
    (.) %>% mutate(id = 2)) %>%
  arrange(id, date1, date2) %>% # this is important!
  group_by(id) %>%
  mutate(period = cumsum(lag(date2, default = date1[1]) < date1)) %>%
  right_join(
    (.) %>% group_by(id, period) %>%
      summarize(date1 = first(date1), date2 = last(date2)),
    by = c("id", "period", "date2"), suffix = c("", ".y")) %>%
  select(-period, -date1.y)
# A tibble: 6 x 7
# Groups: id [2]
id date1 date2 var1 var2 var3 var4
<dbl> <date> <date> <int> <int> <int> <dbl>
1 1 1990-11-05 1990-11-28 2 7 12 0
2 1 1990-12-08 1990-12-12 5 10 15 0
3 1 1990-12-15 1990-12-31 6 11 16 1
4 2 1990-11-05 1990-11-28 2 7 12 0
5 2 1990-12-08 1990-12-12 5 10 15 0
6 2 1990-12-15 1990-12-31 6 11 16 1
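A compact variant of the same cumsum() idea that skips the join and keeps the first start date of each run; this is only a sketch and assumes the rows within each id are already sorted by date:
library(dplyr)
tib_ex %>%
  arrange(id, date1, date2) %>%
  group_by(id) %>%
  # a new period starts whenever the previous row's end date is before this row's start date
  mutate(period = cumsum(lag(date2, default = first(date1)) < date1)) %>%
  group_by(id, period) %>%
  summarise(date1 = first(date1),
            date2 = last(date2),
            across(var1:var4, last),
            .groups = "drop") %>%
  select(-period)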
