Using the data provided below, I would like to group my data table by Date and, by column reference (colstoCut), apply the cut function in my code. I can achieve this using dplyr, but my actual data is quite large and it's too slow. I've made several attempts based on approaches from SO and elsewhere, but nothing has worked so far.
library(tidyverse)
library(data.table)
cutme <- structure(list(Date = structure(c(18993, 18993, 18993, 18993,
18993, 18994, 18994, 18994, 18994, 18994, 18995, 18995, 18995,
18995, 18995, 18996, 18996, 18996, 18996, 18996, 18997, 18997,
18997, 18997, 18997), class = "Date"), val1 = c(2, 1, 1, 1, 2,
0, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 0, 0, 1, 1, 0, 0, 1),
val2 = c(306, 291, 306, 300, 306, 295, 299, 291, 302, 298,
301, 300, 291, 301, 297, 290, 294, 298, 293, 294, 310, 305,
293, 322, 299), val3 = c(278.115915402059, 275.206632766366,
277.843871977486, 274.375934310537, 271.976342200702, 314.694861131995,
322.55015422103, 312.56565930567, 321.31779178896, 310.742656596237,
294.839125866978, 305.946938215211, 317.090018318496, 319.386088532157,
312.323793703966, 309.29514039576, 313.96520162878, 317.360306029457,
310.212544203034, 320.263145398593, 310.432980834677, 296.638028917156,
294.622602772748, 305.922855022984, 308.30568677617)), row.names = c(NA,
-25L), groups = structure(list(Date = structure(c(18993, 18994,
18995, 18996, 18997), class = "Date"), .rows = structure(list(
1:5, 6:10, 11:15, 16:20, 21:25), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("data.table",
"data.frame"))
# cut these columns [transforms them to bins (factors)]
colstoCut <- colnames(cutme)[-1]
# approach using dplyr (works but too slow on real data)
cutme <- cutme %>%
dplyr::group_by(Date) %>%
dplyr::mutate_at(all_of(colstoCut), ~cut(x = ., breaks = unique(quantile(x = ., probs = seq(0,1,0.025))), include.lowest = TRUE)) %>%
dplyr::ungroup(.)
## several attempts using data.table ##
# no error thrown but columns are not actually cut
cutme[, (colstoCut) := Map(function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
cutme[, (colstoCut) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = colstoCut]
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
in_cols <- colstoCut
out_cols <- paste0(in_cols,"fact")
cutme[, (out_cols) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = in_cols]
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
cutme[, (colstoCut) := lapply(colstoCut, function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Error in .subset2(x, i, exact = exact) : recursive indexing failed at level 2
cutme[, (colstoCut) := lapply(.SD, cut(x = cutme[[colstoCut]], breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
For the given sample dataset with 1 grouping column and 3 value columns to be transformed, the data.table equivalent of the OP's dplyr code is simply:
library(data.table)
mycut <- \(x) cut(x, unique(quantile(x, probs = seq(0, 1, 0.025))), include.lowest = TRUE)
cutme <- setDT(cutme)[, lapply(.SD, mycut), .SDcols = colstoCut, by = Date]
cutme
Date val1 val2 val3
<Date> <fctr> <fctr> <fctr>
1: 2022-01-01 (1.9,2] (305.4,306] (278.09,278.12]
2: 2022-01-01 [1,1.1] [291,291.9] (275.12,275.21]
3: 2022-01-01 [1,1.1] (305.4,306] (277.58,277.84]
4: 2022-01-01 [1,1.1] (299.1,300] (274.14,274.38]
5: 2022-01-01 (1.9,2] (305.4,306] [271.98,272.22]
6: 2022-01-02 [0,0.1] (294.6,295] (314.5,314.7]
7: 2022-01-02 [0,0.1] (298.9,299] (322.4,322.6]
8: 2022-01-02 (0.9,1] [291,291.4] (312.4,312.6]
9: 2022-01-02 [0,0.1] (301.7,302] (320.7,321.3]
10: 2022-01-02 [0,0.1] (297.7,298] [310.7,310.9]
11: 2022-01-03 [0,0.1] (300.9,301] [294.8,295.9]
12: 2022-01-03 (0.9,1] (299.7,300] (304.8,305.9]
13: 2022-01-03 (0.9,1] [291,291.6] (316.6,317.1]
14: 2022-01-03 (1.9,2] (300.9,301] (319.2,319.4]
15: 2022-01-03 (0.9,1] (296.4,297] (311.7,312.3]
16: 2022-01-04 (0.9,1] [290,290.3] [309.3,309.39]
17: 2022-01-04 (1.9,2] (293.9,294] (313.59,313.97]
18: 2022-01-04 (0.9,1] (297.6,298] (317.02,317.36]
19: 2022-01-04 [0,0.1] (292.7,293] (310.12,310.21]
20: 2022-01-04 [0,0.1] (293.9,294] (319.97,320.26]
21: 2022-01-05 (0.9,1] (309.5,310] (310.2,310.4]
22: 2022-01-05 (0.9,1] (304.4,305] (296.4,296.6]
23: 2022-01-05 [0,0.1] [293,293.6] [294.6,294.8]
24: 2022-01-05 [0,0.1] (320.8,322] (305,305.9]
25: 2022-01-05 (0.9,1] (298.4,299] (308.1,308.3]
Date val1 val2 val3
Based on the comment from akrun, the following worked for me:
in_cols <- colstoCut
out_cols <- paste0(in_cols,"_fact")
cutme[, (out_cols) := lapply(.SD, function(.) cut(x = ., breaks = unique(quantile(x = ., probs = seq(0,1,0.025))), include.lowest = TRUE)), by = Date, .SDcols = (in_cols) ]
cutme <- cutme %>%
  dplyr::select(Date, all_of(out_cols))
colnames(cutme) <- gsub("_fact","",colnames(cutme))
Because I wasn't able to directly transform the numeric columns into factor columns, as I can with dplyr::mutate_at(vars(colstoCut), ~cut(...)), I removed the original (uncut) columns from the data table, selected the new cut columns, and renamed them with gsub.
I'm sure there's a cleaner way to do this without resorting to selecting/renaming columns, but it works for me. I'm happy to accept a better answer if and when one is posted.
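For what it's worth, the select()/gsub() step can also stay in data.table and operate by reference; a minimal sketch (run in place of the dplyr lines above, reusing in_cols and out_cols):
# drop the original numeric columns and rename the cut columns back,
# both by reference (no copy of the table is made)
cutme[, (in_cols) := NULL]
setnames(cutme, out_cols, in_cols)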
Following is the dput() output of DT. I would like to sum the values every 3 days, starting from the minimum DATE, grouped by ID.
structure(list(ID = c("pqr", "abc", "ort", "kkg", "ssc", "ccv",
"xyz", "xyz", "xyz"), DATE = c("2022-06-07", "2022-06-24", "2022-06-02",
"2022-06-01", "2022-06-16", "2022-06-07", "2022-06-11", "2022-06-13", "2022-06-27"
), READING_IN = c(150, 2800, 600, 500, 1395.94, 500, 800, 179, 200
), READING_OUT = c(150, 2800, 600, 500, 1400, 501.4, 371.34,
556.47, 462.75)), class = "data.frame", row.names = c(NA, -9L))
Following is an unsuccessful attempt.
DT$DATE = as.Date(DT$DATE, format = "%Y-%m-%d")
DT1 = DT %>%
group_by(ID, group = cut(as.Date(DT$DATE, format = "%Y-%m-%d"), '3 days')) %>%
summarise(date_range = paste(min(DATE), min(DATE) + 2, sep = ' to '),
sum_in = sum(READING_IN),
sum_out = sum(READING_OUT), .groups = 'drop') %>%
select(-group)
Result:
structure(list(ID = c("abc", "ccv", "kkg", "ort", "pqr", "ssc",
"xyz", "xyz", "xyz"), date_range = c("2022-06-24-2022-06-26",
"2022-06-07-2022-06-09", "2022-06-01-2022-06-03", "2022-06-02-2022-06-04",
"2022-06-07-2022-06-09", "2022-06-16-2022-06-18", "2022-06-11-2022-06-13",
"2022-06-13-2022-06-15", "2022-06-27-2022-06-29"), sum_in = c(2800,
500, 500, 600, 150, 1395.94, 800, 179, 200), sum_out = c(2800,
501.4, 500, 600, 150, 1400, 371.34, 556.47, 462.75)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
Desired Output for ID = xyz:
ID     DATE                       READING_IN   READING_OUT
xyz    2022-06-11 to 2022-06-13   979          927.81
xyz    2022-06-27 to 2022-06-29   200          462.75
I understand the issue here is the entry on 2022-06-13, which should be aggregated into the 2022-06-11 + 2 days window. Is there any way to sum the values every 3 days, aligned to the desired output format?
I believe you were tricked by some group_by() details:
Computations are always done on the ungrouped data frame. To perform computations on the grouped data, you need to use a separate mutate() step before the group_by().
With an extra mutate() + group_by() step, it seems to behave as described:
library(tibble)
library(dplyr)
DT %>%
mutate(DATE = as.Date(DATE, format = "%Y-%m-%d")) %>%
group_by(ID) %>%
mutate(date_group = cut(DATE, '3 days')) %>%
group_by(ID, date_group) %>%
summarise(date_range = paste(min(DATE), min(DATE) + 2, sep = ' to '),
sum_in = num(sum(READING_IN), digits = 2),
sum_out = num(sum(READING_OUT),digits = 2), .groups = 'drop') %>%
select(-date_group)
#> # A tibble: 8 × 4
#> ID date_range sum_in sum_out
#> <chr> <chr> <num:.2!> <num:.2!>
#> 1 abc 2022-06-24 to 2022-06-26 2800.00 2800.00
#> 2 ccv 2022-06-07 to 2022-06-09 500.00 501.40
#> 3 kkg 2022-06-01 to 2022-06-03 500.00 500.00
#> 4 ort 2022-06-02 to 2022-06-04 600.00 600.00
#> 5 pqr 2022-06-07 to 2022-06-09 150.00 150.00
#> 6 ssc 2022-06-16 to 2022-06-18 1395.94 1400.00
#> 7 xyz 2022-06-11 to 2022-06-13 979.00 927.81
#> 8 xyz 2022-06-27 to 2022-06-29 200.00 462.75
Input:
DT <- structure(list(
ID = c(
"pqr", "abc", "ort", "kkg", "ssc", "ccv", "xyz", "xyz", "xyz"
),
DATE = c(
"2022-06-07", "2022-06-24", "2022-06-02", "2022-06-01", "2022-06-16",
"2022-06-07", "2022-06-11", "2022-06-13", "2022-06-27"
),
READING_IN = c(150, 2800, 600, 500, 1395.94, 500, 800, 179, 200),
READING_OUT = c(150, 2800, 600, 500, 1400, 501.4, 371.34, 556.47, 462.75)
), class = "data.frame", row.names = c(NA, -9L))
Created on 2023-01-18 with reprex v2.0.2
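Since the object is called DT, a data.table equivalent for reference (a sketch; cut() aligns its breaks to each ID's minimum DATE when applied per group):
library(data.table)
setDT(DT)[, DATE := as.Date(DATE, format = "%Y-%m-%d")]
# per-ID 3-day bins; as.character() sidesteps factor levels differing by group
DT[, date_group := as.character(cut(DATE, "3 days")), by = ID]
DT[, .(date_range = paste(min(DATE), min(DATE) + 2, sep = " to "),
       sum_in = sum(READING_IN),
       sum_out = sum(READING_OUT)),
   by = .(ID, date_group)][, date_group := NULL][]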
I have the following three data.frames:
area1 <- data.frame(ua = c(1, 2, 3),
sub_ua1 = c(0, 100, 0),
sub_ua2 = c(100, 100, 100),
sub_ua3 = c(100, 0, 0))
area2 <- data.frame(ua = c(1, 2, 3),
sub_ua1 = c(100, 100, 0),
sub_ua2 = c(100, 100, 0),
sub_ua3 = c(100, 0, 0))
df <- data.frame(ua = c(rep(1, 5), rep(2, 4), rep(3, 7)),
subua = c(rep("sub_ua1", 3), "sub_ua2", "sub_ua3",
"sub_ua1", "sub_ua1", "sub_ua2", "sub_ua3",
"sub_ua1", c(rep("sub_ua2", 2)), rep("sub_ua3", 4)),
value = c(rep(2, 3), rep(4, 3), rep(2, 2), rep(1, 8)))
What I'm trying to do is, based on the column ua in area1 and area2, keep only the sub_ua columns (1 to 3) that equal 100 in both data frames. For example, the first value of sub_ua2 is 100 in both area1 and area2, so this is a "sub_ua" I want.
Then, with this list of "sub_ua" per "ua", filter df down to those combinations to obtain the filtered values.
The results should be:
For ua == 1, get both sub_ua2 and sub_ua3
For ua == 2, get both sub_ua1 and sub_ua2
For ua == 3, get sub_ua2
EDIT:
I was using the following approach to obtain a data.frame of row and column indices:
library(prodlim)
# Indices for data frame 1 and 2 for values = 100
indices_1 <- which(area1 == 100, arr.ind = TRUE)
indices_2 <- which(area2 == 100, arr.ind = TRUE)
# Rows where indices are matched between the two data frame indices
indices_rows <- na.omit(row.match(as.data.frame(indices_1), as.data.frame(indices_2)))
# Row-column indices where both data frames have values of 100
indices_2[indices_rows, ]
I just don't know how to use this to filter the final dataset df.
If I understood correctly, this should work:
area1 <- data.frame(ua = c(1, 2, 3),
sub_ua1 = c(0, 100, 0),
sub_ua2 = c(100, 100, 100),
sub_ua3 = c(100, 0, 0))
area2 <- data.frame(ua = c(1, 2, 3),
sub_ua1 = c(100, 100, 0),
sub_ua2 = c(100, 100, 0),
sub_ua3 = c(100, 0, 0))
library(dplyr)
library(tidyr)
area1 %>%
left_join(area2, by = "ua", suffix = c(".area1",".area2")) %>%
pivot_longer(cols = -ua,names_to = "var",values_to = "value") %>%
separate(col = var,into = c("var","area"),sep = "\\.") %>%
pivot_wider(names_from = area,values_from = value) %>%
filter(area1 == 100, area2 == 100) %>%
select(-starts_with("area"))
# A tibble: 4 x 2
ua var
<dbl> <chr>
1 1 sub_ua2
2 1 sub_ua3
3 2 sub_ua1
4 2 sub_ua2
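To then pull the matching rows out of df, a semi_join on those (ua, var) pairs should work (a sketch, assuming the result of the pipeline above is stored as matches):
# keep only rows of df whose (ua, subua) pair appears in `matches`
df %>% semi_join(matches, by = c("ua", "subua" = "var"))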
I have been trying to write a loop that splits a dataset into multiple datasets based on a column value. However, the dataset is of a format I haven't handled before (i.e. a list containing both lists and data.tables). The dataset is reproducible by:
table1 <- data.table::data.table(Scenario =
c(rep(
c("A", "B", "C", "D"),
4)),
A = c(
rep("x", 4), rep("b", 4), rep("s", 4),
rep("u", 4)),
Correlation = c(1, 0.125, 0.1, 0,
0.125, 1, 0.2, 0,
0.1, 0.2, 1, 0,
0, 0, 0, 1),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table2 <- data.table::data.table(Scenario =
c(rep(
c("A", "B", "C", "D"),
4)),
A = c(
rep("x", 4), rep("b", 4), rep("s", 4),
rep("u", 4)),
Correlation = c(1, 0.125, 0.1, 0,
0.125, 1, 0.2, 0,
0.1, 0.2, 1, 0,
0, 0, 0, 1),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table3 <- data.table::data.table(Scenario =
c(rep(
c("A", "B", "C", "D"),
4)),
A = c(
rep("x", 4), rep("b", 4), rep("s", 4),
rep("u", 4)),
Correlation = c(1, 0.125, 0.1, 0,
0.125, 1, 0.2, 0,
0.1, 0.2, 1, 0,
0, 0, 0, 1),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
list1 <- list("a" = "2019", "b" = "2020", "c" = "2021")
list2 <- list("a" = "test", "b" = "test", "c" = "test")
input_data <- list("table1" = table1, "table2" = table2, "table3" = table3,
"list1"=list1, "list2" = list2)
I need a loop that splits this dataset based on all unique values in the Scenario column. The first dataset (for Scenario value "A") is reproducible by:
table1 <- data.table::data.table(Scenario =
c(rep(
c("A"),
4)),
A = c(
rep("x", 1), rep("b", 1), rep("s", 1),
rep("u", 1)),
Correlation = c(1, 0.125, 0.1, 0 ),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table2 <- data.table::data.table(Scenario =
c(rep(
c( "A"),
4)),
A = c(
rep("x", 1), rep("b", 1), rep("s", 1),
rep("u", 1)),
Correlation = c(1, 0.125, 0.1, 0),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
table3 <- data.table::data.table(Scenario =
c(rep(
c("A"),
4)),
A = c(
rep("x", 1), rep("b", 1), rep("s", 1),
rep("u", 1)),
Correlation = c(1, 0.125, 0.1, 0),
Matrix = "IM",
stringsAsFactors = FALSE,
check.names = FALSE)
list1 <- list("a" = "2019", "b" = "2020", "c" = "2021")
list2 <- list("a" = "test", "b" = "test", "c" = "test")
input_data <- list("table1" = table1, "table2" = table2, "table3" = table3,
"list1"=list1, "list2" = list2)
Please let me know if additional information is needed.
You can write a function that wraps lapply, using inherits to check the type of each object in the list. If an object inherits from data.frame and contains a column called Scenario, you can simply subset it. Items that are not data frames or data tables, or that have no Scenario column, are left unaltered:
get_scenario <- function(S) {
  lapply(input_data, function(x) {
    if (!inherits(x, "data.frame"))
      return(x)
    else if (!"Scenario" %in% names(x))
      return(x)
    return(x[x$Scenario == S, ])
  })
}
This allows:
get_scenario("A")
#> $table1
#> Scenario A Correlation Matrix
#> 1: A x 1.000 IM
#> 2: A b 0.125 IM
#> 3: A s 0.100 IM
#> 4: A u 0.000 IM
#>
#> $table2
#> Scenario A Correlation Matrix
#> 1: A x 1.000 IM
#> 2: A b 0.125 IM
#> 3: A s 0.100 IM
#> 4: A u 0.000 IM
#>
#> $table3
#> Scenario A Correlation Matrix
#> 1: A x 1.000 IM
#> 2: A b 0.125 IM
#> 3: A s 0.100 IM
#> 4: A u 0.000 IM
#>
#> $list1
#> $list1$a
#> [1] "2019"
#>
#> $list1$b
#> [1] "2020"
#>
#> $list1$c
#> [1] "2021"
#>
#>
#> $list2
#> $list2$a
#> [1] "test"
#>
#> $list2$b
#> [1] "test"
#>
#> $list2$c
#> [1] "test"
And if you want all subgroups as one uber-list, you can do:
lapply(c("A", "B", "C", "D"), get_scenario)
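To keep the scenario labels as names on the outer list, a small variation (a sketch):
# name each subgroup list by its scenario value
scenarios <- unique(input_data$table1$Scenario)
all_scenarios <- setNames(lapply(scenarios, get_scenario), scenarios)
all_scenarios$A$table2  # e.g. the scenario "A" subset of table2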
I have a data table that looks like this:
data <- data.table(time = c(0, 1, 2, 3, 4, 5, 6, 7),
anom = c(0, 0, 1, 1, 1, 0, 0, 0),
gier = c(0, 0, 4, 9, 7, 0, 0, 0))
Now I am calculating some statistical values of the column gier, grouped by the column anom, like this:
cols <- c("gier")
statFun <- function(x) list(mean = mean(x), median = median(x), std = sd(x))
statSum <- data[, unlist(lapply(.SD, statFun), recursive = FALSE), .SDcols = cols, by = anom]
This is fine, but I want to go a step further and add the start and end time points of each run of the anom groups (0 and 1). So in the end I have something like a new time series, but only with the start and end time points. The result should look like this:
res <- data.table(x.start = c(0, 2, 5),
x.end = c(1, 4, 7),
anom = c(0, 1, 0),
gier.mean = c(0, 6.666, 0),
gier.median = c(0, 7, 0),
gier.std = c(0, 2.516, 0))
How is it possible to achieve this?
Addition: is there a way to achieve the result for multiple columns, not only one column like gier? For example, I am able to do the following, but I don't know how to extend it to the mentioned columns. This way there is at least an extra column rn holding the names of the columns for which I calculate the statistical values.
res <- data[, setDT(do.call(rbind.data.frame, lapply(.SD, statFun)), keep.rownames = TRUE), .SDcols = cols, by = anom]
You can include additional calculations outside lapply:
library(data.table)
data[, unlist(c(lapply(.SD, statFun),
anom = first(anom), x.start = first(time), x.end = last(time)),
recursive = FALSE), rleid(anom), .SDcols = cols]
# rleid gier.mean gier.median gier.std anom x.start x.end
#1: 1 0.000000 0 0.000000 0 0 1
#2: 2 6.666667 7 2.516611 1 2 4
#3: 3 0.000000 0 0.000000 0 5 7
In dplyr, we can do this similarly:
library(dplyr)
data %>%
  group_by(grp = rleid(anom)) %>%
  summarise(across(all_of(cols), list(mean = mean, median = median, std = sd)),
            x.start = first(time),
            x.end = last(time))
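Regarding the addition: both versions already generalize, since .SDcols/across take the cols vector. A sketch with a hypothetical second column gier2 (not in the original data):
# gier2 is an illustrative extra column, not part of the original data
data[, gier2 := gier * 2]
cols <- c("gier", "gier2")
data[, unlist(c(lapply(.SD, statFun),
                anom = first(anom), x.start = first(time), x.end = last(time)),
       recursive = FALSE), rleid(anom), .SDcols = cols]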
I would like to calculate the "non-NA value intervals" (the lengths of the NA runs between non-NA values) for different columns.
Here is the dataset:
temp <- data.frame(
date = seq(as.Date("2018-01-01"), by = 'month', length.out = 12),
X1 = c(100, NA, 23, NA, NA, 12, NA, NA, NA, NA, NA, 100),
X2 = runif(12, 50, 100),
X3 = c(24, NA, NA, NA, NA, 31, 1, NA, 44, NA, 100, NA),
X4 = NA
)
For example, X1 has non-NA intervals of 1, 2, and 5: from 100 to 23 there is 1 NA between the two non-NA values, from 23 to 12 there are 2 NAs, and from 12 to 100 there are 5 NAs.
The expected result is:
result <- data.frame(
X1_inv_mean = mean(c(1, 2, 5)),
X1_inv_median = median(c(1, 2, 5)),
X1_inv_sd = sd(c(1, 2, 5)),
X2_inv_mean = mean(0),
X2_inv_median = median(0),
X2_inv_sd = sd(0),
X3_inv_mean = mean(c(4, 1, 1, 1)),
X3_inv_median = median(c(4, 1, 1, 1)),
X3_inv_sd = sd(c(4, 1, 1, 1)),
X4_inv_mean = NA,
X4_inv_median = NA,
X4_inv_sd = NA
)
>result
X1_inv_mean X1_inv_median X1_inv_sd X2_inv_mean X2_inv_median X2_inv_sd X3_inv_mean X3_inv_median X3_inv_sd
1 2.666667 2 2.081666 0 0 NA 1.75 1 1.5
X4_inv_mean X4_inv_median X4_inv_sd
1 NA NA NA
Thanks for the help!
A base R option
out <- lapply(temp[-1], function(x) {
  if (all(is.na(x))) {
    NA
  } else {
    tmp <- with(rle(is.na(x)), lengths[values])
    c(mean = mean(tmp),
      median = median(tmp),
      sd = sd(tmp))
  }
})
as.data.frame(out)
# X1 X2 X3 X4
#mean 2.666667 NaN 1.75 NA
#median 2.000000 NA 1.00 NA
#sd 2.081666 NA 1.50 NA
Using rle, the following line gives you the lengths of the NA runs in a column:
tmp <- with(rle(is.na(x)), lengths[values])
E.g. for column X1
with(rle(is.na(temp$X1)), lengths[values])
#[1] 1 2 5
Then we calculate your summary statistics for each tmp. If all values in a column are NA, the function returns NA.
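If you need the single-row *_inv_* layout from the question, out can be flattened (a rough sketch; note X4 collapses to a single NA column rather than three):
res <- as.data.frame(t(unlist(out)))
names(res) <- sub("\\.", "_inv_", names(res))
res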
Update:
For a variable number of columns:
command <- ""
summaryString <- ""
for (i in colnames(temp)) {
  if (i != "date") {
    print(i)
    summaryString <- paste(summaryString, i, "_inv_mean = mean(", i, ", na.rm = T),", sep = "")
    summaryString <- paste(summaryString, i, "_inv_median = median(", i, ", na.rm = T),", sep = "")
    summaryString <- paste(summaryString, i, "_inv_sd = sd(", i, ", na.rm = T),", sep = "")
  }
  command <- paste("output <- temp %>% summarise(", substr(summaryString, 0, nchar(summaryString) - 1), ")", sep = "")
}
eval(parse(text = command))
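The string-building loop can usually be avoided with across(), which applies a named list of functions to each selected column (a sketch; like the loop above, it summarises the raw values rather than the NA-run lengths):
library(dplyr)
temp %>%
  summarise(across(-date,
                   list(inv_mean = ~ mean(.x, na.rm = TRUE),
                        inv_median = ~ median(.x, na.rm = TRUE),
                        inv_sd = ~ sd(.x, na.rm = TRUE))))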
Using dplyr:
library(dplyr)
output <- temp %>%
  summarise(x1_inv_mean = mean(X1, na.rm = T),
            x1_inv_median = median(X1, na.rm = T),
            x1_inv_sd = sd(X1, na.rm = T),
            x2_inv_mean = mean(X2, na.rm = T),
            x2_inv_median = median(X2, na.rm = T),
            x2_inv_sd = sd(X2, na.rm = T),
            x3_inv_mean = mean(X3, na.rm = T),
            x3_inv_median = median(X3, na.rm = T),
            x3_inv_sd = sd(X3, na.rm = T),
            x4_inv_mean = mean(X4, na.rm = T),
            x4_inv_median = median(X4, na.rm = T),
            x4_inv_sd = sd(X4, na.rm = T))