I'm trying to use a for loop to apply a custom summarise function to every numeric column in a data frame. The loop output seems to ignore the grouping factor; however, if I call the function directly on a single column (without the for loop), it gives the correct output.
#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73",
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56",
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26",
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"),
fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976,
11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436,
28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932,
19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787,
20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608,
16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636,
23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208,
21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436,
22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847,
24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182,
14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361,
17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999,
-0.244843034968998, -0.298887493573998, -0.223614002904,
-1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999,
-0.216792667809003, -0.596428704428, -0.668238310155001,
-1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001,
-0.269967200167002, -0.524678463173998, -0.360871620761998,
0.214251247972999, -0.966030101111, -0.166404124383998),
fm_percchg = c(0.00052913132097943, -0.0217928016671462,
-0.0141722355981437, -0.0259313896588437, -0.00939355370091154,
-0.0656180855522784, 0.00795459423472242, -0.0286337438132355,
-0.0165832282022865, -0.00963877445980213, -0.0408260722701251,
-0.0338607155572751, -0.0328573534170568, -0.0262673392452288,
-0.028017619615079, -0.025419001203338, -0.0120940958619099,
-0.0355048596062299, -0.0168713805332318, 0.0126083147698213,
-0.0683277073949869, -0.00936051767758492)), row.names = c(NA,
-22L), class = "data.frame")
# my function:
# Summarises `x` within each level of `group`: row count, mean, sd,
# standard error and the bounds of a t-based 95% interval.
# The data frame is not an argument; `dexadf` is read from the calling
# environment.
# NOTE(review): `{{x}}` is meant for a bare column name. When the caller
# passes a vector of values instead, presumably that whole vector is
# summarised in every group -- confirm against the identical rows in the
# output below.
summbygrp <- function(x) {
group_by(dexadf, group) %>%
summarise(
count = n(),
mean = mean({{x}}, na.rm = TRUE),
sd = sd({{x}}, na.rm = TRUE)
) %>%
# 11 (group size) and 11 - 1 (degrees of freedom) are hard-coded here
# rather than taken from `count`.
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
# apply function to all numeric columns and print column names before output
# `coln` is a manual position counter used to look up the current column name.
coln = 1
# Iterating over a data frame yields each column as a plain vector, so `col`
# holds the column's values, not its name.
for (col in dexadf) {
# The name is printed for every column, before the numeric check.
print(colnames(dexadf)[coln])
coln = coln + 1
if(is.numeric(col)) {
# The vector of values (not a column name) is handed to summbygrp().
print(summbygrp(col))
} else {next}
}
#output:
[1] "fm_bdc3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.6 5.48 1.65 16.9 24.3
2 e 11 20.6 5.48 1.65 16.9 24.3
[1] "fm_rec3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.1 5.41 1.63 16.5 23.8
2 e 11 20.1 5.41 1.63 16.5 23.8
[1] "fm_chg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.450 0.406 0.122 -0.723 -0.178
2 e 11 -0.450 0.406 0.122 -0.723 -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
2 e 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!
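The loop hands `summbygrp()` each column as a plain vector of values, so `{{x}}` is not a column reference inside `summarise()` and the same full-length vector is summarised in every group. Instead of using a for loop, you can pass the column names as strings and look them up with the `.data` pronoun: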
library(dplyr)
library(rlang)
library(purrr)
library(tibble)
dexadf <- data.frame(
stringsAsFactors = FALSE,
participant = c("pt04","pt75","pt21","pt73",
"pt27","pt39","pt43","pt52","pt69","pt49","pt50",
"pt56","pt62","pt68","pt22","pt64","pt54","pt79",
"pt36","pt26","pt65","pt38"),
fm_bdc3 = c(18.535199635968,23.52996574649,
17.276246451976,11.526088555461,23.805048656112,
23.08597823716,28.691020942436,28.968097858499,
23.378093165331,22.491725344661,14.609015054932,19.734914019306,
31.947412973684,25.152298171274,12.007356801787,
20.836128108938,22.322230884349,14.777652101515,
21.389572717608,16.992853675086,14.138189878472,17.777235203826),
fm_rec3 = c(18.545007190636,
23.017181869742,17.031403417007,11.227201061887,23.581434653208,
21.571120542136,28.919246372213,28.138632765662,
22.990408911436,22.274932676852,14.012586350504,19.066675709151,
30.897705534847,24.491614222412,11.670939246332,
20.306494543464,22.052263684182,14.252973638341,
21.028701096846,17.207104923059,13.172159777361,17.610831079442),
fm_chg = c(0.00980755466799721,
-0.512783876747999,-0.244843034968998,-0.298887493573998,
-0.223614002904,-1.514857695024,0.228225429777002,
-0.829465092836998,-0.387684253894999,-0.216792667809003,
-0.596428704428,-0.668238310155001,-1.049707438837,
-0.660683948862001,-0.336417555455,-0.529633565474001,
-0.269967200167002,-0.524678463173998,-0.360871620761998,
0.214251247972999,-0.966030101111,-0.166404124383998),
fm_percchg = c(0.00052913132097943,
-0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
-0.00939355370091154,-0.0656180855522784,
0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
-0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
-0.0328573534170568,-0.0262673392452288,-0.028017619615079,
-0.025419001203338,-0.0120940958619099,
-0.0355048596062299,-0.0168713805332318,0.0126083147698213,
-0.0683277073949869,-0.00936051767758492),
group = as.factor(c("c","e",
"e","c","c","e","c","e","c","e","e","c",
"e","c","c","e","e","c","e","c","e",
"c")),
sex = as.factor(c("f","m",
"m","m","m","m","m","f","m","f","f","f",
"f","f","f","f","m","f","m","m","f",
"m"))
)
dexadf <- as_tibble(dexadf)
# Summarise one numeric column of `df` within each level of `group`.
#
# Note the use of the .data pronoun, since columns will be passed to this
# function as character strings.
#
# Arguments:
#   df          data frame / tibble containing a `group` column and column `x`.
#   x           name of the column to summarise, as a single character string.
#   conf_level  confidence level of the t-based interval (default 0.95).
#
# Returns a tibble with one row per group and the columns
# group, count, mean, sd, se, lower.ci, upper.ci.
summbygrp <- function(df, x, conf_level = 0.95) {
  stopifnot(is.character(x), length(x) == 1L, x %in% names(df))
  tail_prob <- (1 - conf_level) / 2
  df %>%
    group_by(group) %>%
    summarise(
      # Count only non-missing values, because mean() and sd() drop NAs too.
      count = sum(!is.na(.data[[x]])),
      mean = mean(.data[[x]], na.rm = TRUE), # use of .data
      sd = sd(.data[[x]], na.rm = TRUE) # use of .data
    ) %>%
    mutate(
      # Use each group's own size instead of a hard-coded 11, so the
      # standard error and t quantile stay correct for unbalanced groups.
      se = sd / sqrt(count),
      lower.ci = mean - qt(1 - tail_prob, count - 1) * se,
      upper.ci = mean + qt(1 - tail_prob, count - 1) * se
    )
}
# Collect the names of all numeric columns in the dataset
cols <- colnames(select(dexadf, where(is.numeric)))
cols
#> [1] "fm_bdc3" "fm_rec3" "fm_chg" "fm_percchg"
# Apply the summary to each column name; map() returns one tibble per column
map(cols, function(col_name) summbygrp(dexadf, col_name))
#> [[1]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 e 11 21.9 5.40 1.63 18.2 25.5
#>
#> [[2]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.1 5.54 1.67 15.3 22.8
#> 2 e 11 21.2 5.31 1.60 17.7 24.8
#>
#> [[3]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 2 e 11 -0.645 0.407 0.123 -0.918 -0.371
#>
#> [[4]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 2 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe;
# naming the input vector makes `.id` record the column name rather than
# its position in `cols`
map_dfr(.x = set_names(cols), ~ summbygrp(dexadf, .x), .id = "vars")
#> # A tibble: 8 × 8
#> vars group count mean sd se lower.ci upper.ci
#> <chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
#> 3 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
#> 4 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
#> 5 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 6 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
#> 7 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 8 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
Created on 2022-07-09 by the reprex package (v2.0.1)
# Reshape the fm_* columns to long form (columns `name` and `value`), then
# summarise every measure x group combination in one pass.
out <- df %>%
  pivot_longer(starts_with('fm')) %>%
  group_by(name, group) %>%
  summarise(
    # Count only non-missing values, because mean() and sd() drop NAs too.
    count = sum(!is.na(value)),
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    # Use the observed group size instead of a hard-coded 11 so the
    # interval is also right when groups are unbalanced.
    se = sd / sqrt(count),
    lower.ci = mean - qt(1 - (0.05 / 2), count - 1) * se,
    upper.ci = mean + qt(1 - (0.05 / 2), count - 1) * se
  )
out
# A tibble: 8 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
3 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
4 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
5 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
6 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
7 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
8 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
If you need the result as a list (one tibble per variable), just split it:
split(out, out$name)
$fm_bdc3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
$fm_chg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
2 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
$fm_percchg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
2 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
$fm_rec3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
2 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
A similar answer to the above, but combining across and summarise:
# Summarise every numeric column per group with across(), then reshape so
# that each measure x group combination is one row with its statistics as
# columns.
df |>
  group_by(group) |>
  summarise(
    across(
      where(is.numeric),
      list(
        mean = ~mean(.x, na.rm = TRUE),
        sd = ~sd(.x, na.rm = TRUE),
        # Non-missing count, to match the na.rm = TRUE used above.
        n = ~sum(!is.na(.x))
      ),
      # "<column>.<statistic>"; the dot is the separator split on below.
      .names = "{.col}.{.fn}"
    )
  ) |>
  pivot_longer(
    -group,
    names_to = c("measure", "stat"),
    names_sep = "\\."
  ) |>
  pivot_wider(
    names_from = stat,
    values_from = value
  ) |>
  mutate(
    se = sd / sqrt(n),
    # Degrees of freedom follow the same `n` used for the standard error
    # (previously hard-coded as 11 - 1).
    lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
    upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se
  ) |>
  arrange(measure)
I have a dataframe:
> print(merged)
AgeGroup values ind
1 1 0.2449762 diff_v.ownhigh_avg
2 1 0.2598964 diff_v.ownhigh_avg
3 1 0.2519043 diff_v.ownhigh_avg
4 1 0.2452479 diff_v.ownhigh_avg
5 1 0.2840650 diff_v.ownhigh_avg
6 1 0.2589341 diff_v.ownhigh_avg
7 1 0.3201843 diff_v.ownhigh_avg
8 1 0.3218865 diff_v.ownhigh_avg
9 1 0.2822984 diff_v.ownhigh_avg
10 1 0.3313962 diff_v.ownhigh_avg
There are 8 different types of ind, and there are 2 AgeGroup types.
I am creating a new dataframe that summarises the means and credible intervals based on 2 grouping factors (AgeGroup and ind).
This is the code that I have:
# One summary row per AgeGroup x ind combination; .groups = "keep" leaves
# the result grouped by both variables.
meansCIs <- merged %>%
group_by(AgeGroup, ind) %>%
summarise(means = mean(values), .groups = "keep",
# NOTE(review): `means` here is the per-group mean created on the line
# above (a single number), not the raw `values`, so both quantiles come
# out equal to that mean.
lower_bound = quantile(means,.025),
upper_bound = quantile(means,.975))
This is the output it gives:
# A tibble: 16 x 5
# Groups: AgeGroup, ind [16]
AgeGroup ind means lower_bound upper_bound
<dbl> <fct> <dbl> <dbl> <dbl>
1 1 diff_v.ownhigh_avg 0.290 0.290 0.290
2 1 diff_v.ownlow_avg 0.272 0.272 0.272
3 1 diff_v.otherhigh_avg 0.274 0.274 0.274
4 1 diff_v.otherlow_avg 0.388 0.388 0.388
5 1 diff_v.own_avg 0.281 0.281 0.281
As you can see, something has gone wrong with computing the credible intervals. It is just replicating the mean for each condition. Does anyone know how I could fix this?
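The quantile function is operating on just the single mean value, because `means` has already been reduced to one number per group by the time the quantiles are computed. You need to pass the raw `values` variable to `quantile()` instead.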
merged<- structure(list(AgeGroup = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
values = c(0.2449762, 0.2598964, 0.2519043, 0.2452479, 0.284065,
0.2589341, 0.3201843, 0.3218865, 0.2822984, 0.3313962),
ind = c("diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg",
"diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg",
"diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg" )),
class = "data.frame", row.names = c(NA, -10L))
# Mean plus the 2.5% and 97.5% quantiles of the raw values for each
# AgeGroup x ind combination (the result stays grouped by both).
merged %>%
  group_by(AgeGroup, ind) %>%
  summarise(
    means = mean(values),
    lower_bound = quantile(values, probs = 0.025),
    upper_bound = quantile(values, probs = 0.975),
    .groups = "keep"
  )
# A tibble: 1 × 5
# Groups: AgeGroup, ind [1]
AgeGroup ind means lower_bound upper_bound
<int> <chr> <dbl> <dbl> <dbl>
1 1 diff_v.ownhigh_avg 0.280 0.245 0.329
I am using the separate function from tidyverse to split the first column of this tibble :
# A tibble: 6,951 x 9
Row.names Number_of_analysis~ DL_Minimum DL_Mean DL_Maximum Number_of_measur~ Measure_Minimum Measure_Mean Measure_Maximum
<I<chr>> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2011.FACILITY.PONT-À-CELLES 52 0.6 1.81 16 0 0 0 0
2 2011.FACILITY.PONT-À-CELLES 52 0.07 0.177 1.3 0 0 0 0
3 2011.FACILITY.CHARLEROI 52 0.07 0.212 1.9 0 0 0 0
4 2011.FACILITY.CHARLEROI 52 0.08 0.209 2 0 0 0 0
# Split Row.names on literal dots into Year / Catchment / Locality;
# extra = "drop" discards any pieces beyond the third.
Merge_splitnames <- Merge %>%
separate(Row.names,sep = "\\.",into = c("Year", "Catchment", "Locality"), extra = "drop")
While everything seems correct, the output is a tibble without the first 2 columns (the ones which have a name comprising an accent in French) :
# A tibble: 6,951 x 9
Year Catchment Locality Number_of_analysis~ DL_Minimum DL_Mean DL_Maximum Number_of_measur~ Measure_Minimum Measure_Mean Measure_Maximum
<I<chr>> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
3 2011 FACILITY CHARLEROI 52 0.07 0.212 1.9 0 0 0 0
4 2011 FACILITY CHARLEROI 52 0.08 0.209 2 0 0 0 0
Any idea how to deal with this issue ? I wish to keep the real name in French (with the accent). This is quite surprising for me, I've never got any issue with all the other functions from tidyverse.
NB : this is a simple and reproducible example, my real tibble is about 100 times bigger
separate is retaining the accent for me:
library(tidyverse)
# Minimal example: one character column split on literal dots
tibble(
  names = c(
    "2011.FACILITY.PONT-À-CELLES",
    "2011.FACILITY.PONT-À-CELLES",
    "2011.FACILITY.CHARLEROI",
    "2011.FACILITY.CHARLEROI"
  )
) %>%
  separate(col = names, into = c("Year", "Catchment", "Locality"), sep = "\\.")
#> # A tibble: 4 × 3
#> Year Catchment Locality
#> <chr> <chr> <chr>
#> 1 2011 FACILITY PONT-À-CELLES
#> 2 2011 FACILITY PONT-À-CELLES
#> 3 2011 FACILITY CHARLEROI
#> 4 2011 FACILITY CHARLEROI
Created on 2022-05-06 by the reprex package (v2.0.1)
Assuming DF is as shown reproducibly in the Note at the end, use extra = "merge" in separate(). (You may need to change your locale, although I did not need to. Some things to try are shown in "How to change the locale of R?" or "Using weekdays with any locale under Windows".)
library(tidyr)
# Split Row.names into three columns. The separator is given explicitly as
# a literal dot so the result does not depend on how the default
# non-alphanumeric pattern treats "-" or accented letters; extra = "merge"
# keeps anything after the second split in Locality.
DF %>%
  separate(
    Row.names,
    into = c("Year", "Catchment", "Locality"),
    sep = "\\.",
    extra = "merge"
  )
giving:
Year Catchment Locality Number_of_analysis~ DL_Minimum DL_Mean
1 2011 FACILITY PONT-À-CELLES 52 0.60 1.810
2 2011 FACILITY PONT-À-CELLES 52 0.07 0.177
3 2011 FACILITY CHARLEROI 52 0.07 0.212
4 2011 FACILITY CHARLEROI 52 0.08 0.209
DL_Maximum Number_of_measur~ Measure_Minimum Measure_Mean Measure_Maximum
1 16.0 0 0 0 0
2 1.3 0 0 0 0
3 1.9 0 0 0 0
4 2.0 0 0 0 0
Note
DF <-
structure(list(Row.names = c("2011.FACILITY.PONT-À-CELLES", "2011.FACILITY.PONT-À-CELLES",
"2011.FACILITY.CHARLEROI", "2011.FACILITY.CHARLEROI"), `Number_of_analysis~` = c(52L,
52L, 52L, 52L), DL_Minimum = c(0.6, 0.07, 0.07, 0.08), DL_Mean = c(1.81,
0.177, 0.212, 0.209), DL_Maximum = c(16, 1.3, 1.9, 2), `Number_of_measur~` = c(0L,
0L, 0L, 0L), Measure_Minimum = c(0L, 0L, 0L, 0L), Measure_Mean = c(0L,
0L, 0L, 0L), Measure_Maximum = c(0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
I have 2 R data frames that looks like this:
DATA FRAME 1:
identifier
ef_posterior
position_no
classification
11111
0.260
1
yes
11111
0.0822
2
yes
11111
0.00797
3
yes
11111
0.04
4
no
11111
0.245
5
yes
11111
0.432
6
yes
11112
0.342
1
maybe
11112
0.453
2
yes
11112
0.0032
3
yes
11112
0.241
5
no
11112
0.0422
6
yes
11112
0.311
4
no
DATAFRAME 2:
study_identifier
%LVEF
11111
62
11112
76
I want to merge and rearrange these two data frames into something like this:
Study_identifier and identifier are the same thing (just different column names). Additionally, I would like to recode the classification so that yes = 0, no = 1, maybe = 2
identifier
pos_1
pos_1_class
pos_2
pos_2_class
pos_3
pos_3_class
pos_4
pos_4_class
pos_5
pos_5_class
pos_6
pos_6_class
%LVEF
11111
0.260
0
0.0822
0
0.00797
0
0.04
1
0.245
0
0.432
0
62
11112
0.342
2
0.453
0
0.0032
0
0.311
1
0.241
1
0.0422
0
76
# Prefix the position numbers so they become column names position_1, position_2, ...
df1 %>% mutate(position_no = paste0("position_", position_no)) %>%
# Only ef_posterior is spread into the wide columns; classification is not included here.
pivot_wider(id_cols = identifier, names_from = position_no, values_from = ef_posterior) %>%
# study_identifier is converted to numeric before being matched against identifier.
left_join(df2 %>% mutate(study_identifier = as.numeric(as.character(study_identifier))), by = c("identifier" = "study_identifier"))
This is the code I have right now, but I can't figure out where to put in the code for the classification column
How would I go about doing this?
Any help would be very much appreciated!
You can recode quite easily with dplyr and case_when:
# Recode classification to the numeric codes requested in the question:
# yes = 0, no = 1, maybe = 2 (any other value becomes NA).
df1 %>% mutate(
  classification =
    case_when(
      classification == "yes" ~ 0,
      classification == "no" ~ 1,
      classification == "maybe" ~ 2
    )
)
I would solve it the following way:
library(tidyverse)
df1 <- data.frame(
stringsAsFactors = FALSE,
identifier = c(11111L,11111L,11111L,11111L,
11111L,11111L,11112L,11112L,11112L,11112L,11112L,
11112L),
ef_posterior = c(0.26,0.0822,0.00797,0.04,
0.245,0.432,0.342,0.453,0.0032,0.241,0.0422,0.311),
position_no = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 5L, 6L, 4L),
classification = c("yes","yes","yes","no",
"yes","yes","maybe","yes","yes","no","yes","no")
)
df2 <- data.frame(
check.names = FALSE,
study_identifier = c(11111L, 11112L),
`%LVEF` = c(62L, 76L)
)
# 1. Recode classification as requested: yes = 0, no = 1, maybe = 2.
# 2. Spread both classification and ef_posterior into one column per position.
# 3. Attach %LVEF from df2 (study_identifier is the same key as identifier).
df1 %>% mutate(
  classification =
    case_when(
      classification == "yes" ~ 0,
      classification == "no" ~ 1,
      classification == "maybe" ~ 2
    )
) %>%
  pivot_wider(
    id_cols = c(identifier),
    names_from = c(position_no),
    values_from = c(classification, ef_posterior)
  ) %>%
  left_join(df2, by = c("identifier" = "study_identifier"))
#> # A tibble: 2 x 14
#> identifier classification_1 classification_2 classification_3 classification_4
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 11111 0 0 0 1
#> 2 11112 2 0 0 1
#> # … with 9 more variables: classification_5 <dbl>, classification_6 <dbl>,
#> # ef_posterior_1 <dbl>, ef_posterior_2 <dbl>, ef_posterior_3 <dbl>,
#> # ef_posterior_4 <dbl>, ef_posterior_5 <dbl>, ef_posterior_6 <dbl>,
#> # `%LVEF` <int>
Created on 2021-04-12 by the reprex package (v0.3.0)
I have a matrix (A) containing 211 rows and 6 columns (one per time period) and a different matrix (B) containing 211 rows and 2 columns, the second of which contains categorical information (1-9).
My aim is to create a new matrix (C) where each value in matrix A is the value(A) divided by the mean of (value(A) by category(B)). I managed to compute the means for each category per column with the aggregate function. These are stored in a separate dataframe, column_means, with each time wave in a separate column. This also contains the information about the group in column_means[,1].
I don't understand how to proceed from here and am looking for an elegant solution so I can transfer this knowledge to future projects (and possibly improve my existing code). My guess is that the solution is hidden somewhere in dplyr and rather simple once you know it.
Thank you for any suggestions.
Data example:
##each column here represents a wave:
initialmatrix <- structure(c(0.882647671948723, 0.847932241438909, 0.753052308699317,
0.754977233408875, NA, 0.886095543329695, 0.849625252682829,
0.78893884364632, 0.77111113840682, NA, 0.887255207679895, 0.851503493865384,
0.812107856411831, 0.793982699495818, NA, 0.885212452552841,
0.854894065774315, 0.815265718290737, 0.806766276556325, NA,
0.882027335190646, 0.85386634818439, 0.818052477777012, 0.815997781565393,
NA, 0.88245957310107, 0.855819521951304, 0.830425687228663, 0.820857689847061,
NA), .Dim = 5:6, .Dimnames = list(NULL, c("V1", "V2", "V3", "V4",
"V5", "V6")))
##the first column is unique ID, the 2nd the category:
categories <- structure(c(1L, 2L, 3L, 4L, 5L, 2L, 1L, 2L, 2L, 4L), .Dim = c(5L,
2L), .Dimnames = list(NULL, c("V1", "V2")))
##the first column represents the category, column 1-6 the mean per category for each corresponding wave in "initialmatrix"
column.means <- structure(list(Group.1 = 1:5, x = c(0.805689153058216, 0.815006230419524,
0.832326976776262, 0.794835253329865, 0.773041961434791), asset_means_2...2. = c(0.80050960343197,
0.81923553710203, 0.833814773618545, 0.797834687980729, 0.780028077018158
), asset_means_3...2. = c(0.805053341257357, 0.828691564900149,
0.833953165695685, 0.799381078569563, 0.785813047374534), asset_means_4...2. = c(0.806116664276125,
0.832439754757116, 0.835982197159582, 0.801702200401293, 0.788814840753852
), asset_means_5...2. = c(0.807668548993891, 0.83801834926905,
0.836036508152776, 0.803433961863399, 0.79014026195926), asset_means_6...2. = c(0.808800359101212,
0.840923947682599, 0.839660313992458, 0.804901773257962, 0.793165113115977
)), row.names = c(NA, 5L), class = "data.frame")
Is this what you are trying to do?
options(digits=3)
# Build one row of category means per row of initialmatrix. Each row's
# category is looked up in Group.1, so this does not rely on category k
# happening to sit in row k of column.means.
divisor <- column.means[match(categories[, 2], column.means$Group.1), -1]
divisor
# x asset_means_2...2. asset_means_3...2. asset_means_4...2. asset_means_5...2. asset_means_6...2.
# 2 0.815 0.819 0.829 0.832 0.838 0.841
# 1 0.806 0.801 0.805 0.806 0.808 0.809
# 2.1 0.815 0.819 0.829 0.832 0.838 0.841
# 2.2 0.815 0.819 0.829 0.832 0.838 0.841
# 4 0.795 0.798 0.799 0.802 0.803 0.805
# Element-wise division: each value over the mean of its category and wave
initialmatrix/divisor
# x asset_means_2...2. asset_means_3...2. asset_means_4...2. asset_means_5...2. asset_means_6...2.
# 2 1.083 1.082 1.071 1.063 1.053 1.049
# 1 1.052 1.061 1.058 1.061 1.057 1.058
# 2.1 0.924 0.963 0.980 0.979 0.976 0.988
# 2.2 0.926 0.941 0.958 0.969 0.974 0.976
# 4 NA NA NA NA NA NA
This looks like a job for Superma ... no wait ... map2.
library(dplyr)
library(purrr)
# Divide every value by the mean of its own category within the same wave.
# For each value/category pair, map2_dbl() averages the values of the
# current column that share that category (ignoring NAs) and divides by it;
# the previous version averaged the value with the category *code* itself.
# The means are taken over the rows present in `initialmatrix`, so with the
# 5-row sample they differ from the full-data means in `column.means`.
as_tibble(initialmatrix) %>%
  mutate(
    category = as.double(as_tibble(categories)$V2),
    across(
      starts_with('V'),
      ~ map2_dbl(
        .x, category,
        function(value, k) value / mean(.x[category == k], na.rm = TRUE)
      )
    )
  ) %>%
  select(-category)
# V1 V2 V3 V4 V5 V6
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.11 1.09 1.07 1.06 1.05 1.04
# 2 1 1 1 1 1 1
# 3 0.945 0.968 0.977 0.975 0.975 0.983
# 4 0.947 0.946 0.955 0.965 0.973 0.972
# 5 NA NA NA NA NA NA