Merging output from cor.test in R dataframe - r

I am trying to perform cor.test in R in a dataframe:
For a toy dataset of X and Y, I used the following:
library(dplyr)
library(broom)
X = c(0.88,1.3,5.6,3.1)
Y = c(0,1,1,1)
ft<-cor.test(X,Y)
tidy(ft) %>%
select(estimate, p.value, conf.low, conf.high) %>%
bind_rows(.id = 'grp')
which gives me the following result:
grp estimate p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl>
1 1 0.571 0.429 -0.864 0.989
Now, a short version of my dataframe is like:
df<-structure(list(X_sample1 = c(0.11, 0.98, 0.88), X_sample2 = c(0.13,
0, 1.3), X_sample3 = c(1.5, 3.5, 5.6), X_sample4 = c(3.2, 2.4,
3.1), Y_sample1 = c(0L, 1L, 0L), Y_sample2 = c(0L, 0L, 1L), Y_sample3 = c(1L,
1L, 1L), Y_sample4 = c(1L, 1L, 1L)), class = "data.frame", row.names = c("Product1",
"Product2", "Product3"))
I want to perform cor.test in each row of the df between X and Y groups. Thus, in the above example df, the groups are:
X = c(0.11,0.13,1.5,3.2)
Y = c(0,0,1,1)
---------------
X = c(0.98,0,3.5,2.4)
Y = c(1,0,1,1)
---------------
X = c(0.88,1.3,5.6,3.1)
Y = c(0,1,1,1)
I want a output like:
grp estimate p.value conf.low conf.high
Product1 0.88 0.12 -0.525 0.997
Product2 0.743 0.257 -0.762 0.994
Product3 0.571 0.429 -0.864 0.989
Thanks for your help!

One option could be:
df %>%
rownames_to_column(var = "grp") %>%
rowwise() %>%
transmute(grp,
tidy(cor.test(c_across(starts_with("X")), c_across(starts_with("Y"))))) %>%
select(grp, estimate, p.value, conf.low, conf.high)
grp estimate p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl>
1 Product1 0.880 0.120 -0.525 0.997
2 Product2 0.743 0.257 -0.762 0.994
3 Product3 0.571 0.429 -0.864 0.989

You can use dplyr and broom:
library(dplyr)
library(broom)
df %>%
rownames_to_column() %>%
pivot_longer(-rowname, names_to = c(".value", "sample"),
names_sep = "_sample") %>%
nest_by(rowname) %>%
summarize(cors1 = tidy(cor.test(data$X, data$Y)))
# A tibble: 3 x 2
# Groups: rowname [3]
rowname cors1$estimate $statistic $p.value $parameter $conf.low $conf.high
<chr> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 Produc~ 0.880 2.62 0.120 2 -0.525 0.997
2 Produc~ 0.743 1.57 0.257 2 -0.762 0.994
3 Produc~ 0.571 0.984 0.429 2 -0.864 0.989

Related

How to perform for loop to apply custom function with grouping

I'm trying to perform a forloop to apply a custom summarise function to all the numeric columns in the dataframe. The forloop output seems to ignore the grouping factor- however, if I perform the function alone on a single column (without the for loop), it provides the correct output.
#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73",
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56",
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26",
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"),
fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976,
11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436,
28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932,
19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787,
20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608,
16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636,
23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208,
21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436,
22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847,
24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182,
14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361,
17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999,
-0.244843034968998, -0.298887493573998, -0.223614002904,
-1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999,
-0.216792667809003, -0.596428704428, -0.668238310155001,
-1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001,
-0.269967200167002, -0.524678463173998, -0.360871620761998,
0.214251247972999, -0.966030101111, -0.166404124383998),
fm_percchg = c(0.00052913132097943, -0.0217928016671462,
-0.0141722355981437, -0.0259313896588437, -0.00939355370091154,
-0.0656180855522784, 0.00795459423472242, -0.0286337438132355,
-0.0165832282022865, -0.00963877445980213, -0.0408260722701251,
-0.0338607155572751, -0.0328573534170568, -0.0262673392452288,
-0.028017619615079, -0.025419001203338, -0.0120940958619099,
-0.0355048596062299, -0.0168713805332318, 0.0126083147698213,
-0.0683277073949869, -0.00936051767758492)), row.names = c(NA,
-22L), class = "data.frame")
#my function:
summbygrp <- function(x) {
group_by(dexadf, group) %>%
summarise(
count = n(),
mean = mean({{x}}, na.rm = TRUE),
sd = sd({{x}}, na.rm = TRUE)
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
#apply function to all numeric columns and print column names before output
coln = 1
for (col in dexadf) {
print(colnames(dexadf)[coln])
coln = coln + 1
if(is.numeric(col)) {
print(summbygrp(col))
} else {next}
}
#output:
[1] "fm_bdc3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.6 5.48 1.65 16.9 24.3
2 e 11 20.6 5.48 1.65 16.9 24.3
[1] "fm_rec3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.1 5.41 1.63 16.5 23.8
2 e 11 20.1 5.41 1.63 16.5 23.8
[1] "fm_chg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.450 0.406 0.122 -0.723 -0.178
2 e 11 -0.450 0.406 0.122 -0.723 -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
2 e 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!
So instead of using for-loops you can do better,
library(dplyr)
library(rlang)
library(purrr)
library(tibble)
dexadf <- data.frame(
stringsAsFactors = FALSE,
participant = c("pt04","pt75","pt21","pt73",
"pt27","pt39","pt43","pt52","pt69","pt49","pt50",
"pt56","pt62","pt68","pt22","pt64","pt54","pt79",
"pt36","pt26","pt65","pt38"),
fm_bdc3 = c(18.535199635968,23.52996574649,
17.276246451976,11.526088555461,23.805048656112,
23.08597823716,28.691020942436,28.968097858499,
23.378093165331,22.491725344661,14.609015054932,19.734914019306,
31.947412973684,25.152298171274,12.007356801787,
20.836128108938,22.322230884349,14.777652101515,
21.389572717608,16.992853675086,14.138189878472,17.777235203826),
fm_rec3 = c(18.545007190636,
23.017181869742,17.031403417007,11.227201061887,23.581434653208,
21.571120542136,28.919246372213,28.138632765662,
22.990408911436,22.274932676852,14.012586350504,19.066675709151,
30.897705534847,24.491614222412,11.670939246332,
20.306494543464,22.052263684182,14.252973638341,
21.028701096846,17.207104923059,13.172159777361,17.610831079442),
fm_chg = c(0.00980755466799721,
-0.512783876747999,-0.244843034968998,-0.298887493573998,
-0.223614002904,-1.514857695024,0.228225429777002,
-0.829465092836998,-0.387684253894999,-0.216792667809003,
-0.596428704428,-0.668238310155001,-1.049707438837,
-0.660683948862001,-0.336417555455,-0.529633565474001,
-0.269967200167002,-0.524678463173998,-0.360871620761998,
0.214251247972999,-0.966030101111,-0.166404124383998),
fm_percchg = c(0.00052913132097943,
-0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
-0.00939355370091154,-0.0656180855522784,
0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
-0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
-0.0328573534170568,-0.0262673392452288,-0.028017619615079,
-0.025419001203338,-0.0120940958619099,
-0.0355048596062299,-0.0168713805332318,0.0126083147698213,
-0.0683277073949869,-0.00936051767758492),
group = as.factor(c("c","e",
"e","c","c","e","c","e","c","e","e","c",
"e","c","c","e","e","c","e","c","e",
"c")),
sex = as.factor(c("f","m",
"m","m","m","m","m","f","m","f","f","f",
"f","f","f","f","m","f","m","m","f",
"m"))
)
dexadf <- as_tibble(dexadf)
# Note the use of .data pronoun, since columns will passed to this function as characters
summbygrp <- function(df, x) {
df %>%
group_by(group) %>%
summarise(
count = n(),
mean = mean(.data[[x]], na.rm = TRUE), # use of .data
sd = sd(.data[[x]], na.rm = TRUE) # use of .data
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
# Here we extract the numerical columns of the dataset
cols <- dexadf %>%
select(where(is.numeric)) %>% colnames(.)
cols
#> [1] "fm_bdc3" "fm_rec3" "fm_chg" "fm_percchg"
# Then instead of for loops we can simply use this map function
map(.x = cols, ~ summbygrp(dexadf, .x))
#> [[1]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 e 11 21.9 5.40 1.63 18.2 25.5
#>
#> [[2]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.1 5.54 1.67 15.3 22.8
#> 2 e 11 21.2 5.31 1.60 17.7 24.8
#>
#> [[3]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 2 e 11 -0.645 0.407 0.123 -0.918 -0.371
#>
#> [[4]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 2 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe
map_dfr(.x = cols, ~ summbygrp(dexadf, .x), .id = "vars")
#> # A tibble: 8 × 8
#> vars group count mean sd se lower.ci upper.ci
#> <chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 1 e 11 21.9 5.40 1.63 18.2 25.5
#> 3 2 c 11 19.1 5.54 1.67 15.3 22.8
#> 4 2 e 11 21.2 5.31 1.60 17.7 24.8
#> 5 3 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 6 3 e 11 -0.645 0.407 0.123 -0.918 -0.371
#> 7 4 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 8 4 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
Created on 2022-07-09 by the reprex package (v2.0.1)
out <- df %>%
pivot_longer(starts_with('fm')) %>%
group_by(name, group) %>%
summarise(
count = n(),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
out
# A tibble: 8 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
3 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
4 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
5 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
6 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
7 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
8 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
if you need the list, just split it:
split(out, ~name)
$fm_bdc3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
$fm_chg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
2 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
$fm_percchg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
2 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
$fm_rec3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
2 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
A similar answer to the above, but combining across and summarise:
df |>
group_by(group) |>
summarise(
across(
where(is.numeric),
list(
mean = ~mean(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
n = ~n()
),
.names = "{.col}.{.fn}"
)
) |>
pivot_longer(
-group,
names_to = c("measure", "stat"),
names_sep = "\\."
) |>
pivot_wider(
names_from = stat,
values_from = value
) |>
mutate(
se = sd / sqrt(n),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
) |>
arrange(measure)

computing quantiles in R from means

I have a dataframe:
> print(merged)
AgeGroup values ind
1 1 0.2449762 diff_v.ownhigh_avg
2 1 0.2598964 diff_v.ownhigh_avg
3 1 0.2519043 diff_v.ownhigh_avg
4 1 0.2452479 diff_v.ownhigh_avg
5 1 0.2840650 diff_v.ownhigh_avg
6 1 0.2589341 diff_v.ownhigh_avg
7 1 0.3201843 diff_v.ownhigh_avg
8 1 0.3218865 diff_v.ownhigh_avg
9 1 0.2822984 diff_v.ownhigh_avg
10 1 0.3313962 diff_v.ownhigh_avg
There are 8 different types of ind, and there are 2 AgeGroup types.
I am creating a new dataframe that summarises the means and credble intervals based on 2 group factors (AgeGroup and ind).
This is the code that I have:
meansCIs <- merged %>%
group_by(AgeGroup, ind) %>%
summarise(means = mean(values), .groups = "keep",
lower_bound = quantile(means,.025),
upper_bound = quantile(means,.975))
This is the output it gives:
# A tibble: 16 x 5
# Groups: AgeGroup, ind [16]
AgeGroup ind means lower_bound upper_bound
<dbl> <fct> <dbl> <dbl> <dbl>
1 1 diff_v.ownhigh_avg 0.290 0.290 0.290
2 1 diff_v.ownlow_avg 0.272 0.272 0.272
3 1 diff_v.otherhigh_avg 0.274 0.274 0.274
4 1 diff_v.otherlow_avg 0.388 0.388 0.388
5 1 diff_v.own_avg 0.281 0.281 0.281
As you can see, something has gone wrong with computing the credible intervals. It is just replicating the mean for each condition. Does anyone know how I could fix this?
The quantile function is operating on just the single mean value. I think you need to substitute in the “values” variable.
merged<- structure(list(AgeGroup = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
values = c(0.2449762, 0.2598964, 0.2519043, 0.2452479, 0.284065,
0.2589341, 0.3201843, 0.3218865, 0.2822984, 0.3313962),
ind = c("diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg",
"diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg",
"diff_v.ownhigh_avg", "diff_v.ownhigh_avg", "diff_v.ownhigh_avg" )),
class = "data.frame", row.names = c(NA, -10L))
merged %>%
group_by(AgeGroup, ind) %>%
summarise(means = mean(values), .groups = "keep",
lower_bound = quantile(values,.025),
upper_bound = quantile(values,.975))
# A tibble: 1 × 5
# Groups: AgeGroup, ind [1]
AgeGroup ind means lower_bound upper_bound
<int> <chr> <dbl> <dbl> <dbl>
1 1 diff_v.ownhigh_avg 0.280 0.245 0.329

Fixing function error: 'pairlist' object cannot be coerced to type 'double'

I wrote a function to summarize the same numeric variable across three datasets. This code works outside of the function, when I replace x with the actual variable name.
k1 <- data.frame(variable_name = rnorm(100), year = sample(x = 1990:1995, size = 100, replace = TRUE))
k2 <- data.frame(variable_name = rnorm(100), year = sample(x = 1990:1995, size = 100, replace = TRUE))
k3 <- data.frame(variable_name = rnorm(100), year = sample(x = 1990:1995, size = 100, replace = TRUE))
numeric_var_summary <- function(x) {
x <- enquo(x)
k1_x <- k1 %>%
select(year, !!x) %>%
group_by(year) %>%
summarize(min = min(!!x), Q1 = quantile(!!x, 0.25), median = median(!!x),
Q3 = quantile(!!x, 0.75), max = max(!!x), Qrange = quantile(!!x, 0.75) - quantile(!!x, 0.25),
mean = mean(!!x), sd = sd(x), n = n(),
missing = sum(is.na(!!x)))
k2_x <- k2 %>%
select(xear, !!x) %>%
group_by(year) %>%
summarize(min = min(!!x), Q1 = quantile(!!x, 0.25), median = median(!!x),
Q3 = quantile(!!x, 0.75), max = max(!!x), Qrange = quantile(!!x, 0.75) - quantile(!!x, 0.25),
mean = mean(!!x), sd = sd(!!x), n = n(),
missing = sum(is.na(!!x)))
k3_x <- k3 %>%
select(year, !!x) %>%
group_by(year) %>%
summarize(min = min(!!x), Q1 = quantile(x, 0.25),
median = median(!!x), Q3 = quantile(!!x, 0.75),
max = max(!!x), Qrange = quantile(!!x, 0.75) - quantile(!!x, 0.25),
mean = mean(!!x), sd = sd(!!x), n = n(),
missing = sum(is.na(!!x)))
return(bind_rows(k1_x, k2_x, k3_x), n = Inf)
}
numeric_var_summary(x = variable_name)
But I get this error when I try to run the function:
Error in is.data.frame(x) :
'pairlist' object cannot be coerced to type 'double'
I wonder if it's an issue with the tidy evaluation? Not sure if I did that correctly. Thanks for your help.
As of dplyr 1.0, there are two ways of approaching this:
if you want to pass the column identifier as a bare expression, like you're doing in your example, use {{x}}
alternatively, you can pass it as a character vector and then use .data[[x]]
I'll use the first approach, since that's closest to what you were going for.
Since you're computing the same summary across all three data frames, your function can also be made a lot more concise. Here I use purrr::map_dfr to apply a function that summarizes one of your data frames to all three data frames and then row bind the result:
library(dplyr)
library(purrr)
set.seed(3046)
k1 <- data.frame(variable_name = rnorm(100), year = sample(x = 1990:1995, size = 100, replace = TRUE))
k2 <- data.frame(variable_name = rnorm(100), year = sample(x = 1990:1995, size = 100, replace = TRUE))
k3 <- data.frame(variable_name = rnorm(100), year = sample(x = 1990:1995, size = 100, replace = TRUE))
numeric_var_summary <- function(k, col, na.rm = TRUE) {
k_summary <- k %>%
select(year, {{col}}) %>%
group_by(year) %>%
summarize(min = min({{col}}, na.rm = na.rm),
Q1 = quantile({{col}}, 0.25, na.rm = na.rm),
median = median({{col}}, na.rm = na.rm),
Q3 = quantile({{col}}, 0.75, na.rm = na.rm),
max = max({{col}}, na.rm = na.rm),
Qrange = quantile({{col}}, 0.75, na.rm = na.rm) - quantile({{col}}, 0.25, na.rm = na.rm),
mean = mean({{col}}, na.rm = na.rm),
sd = sd({{col}}, na.rm = na.rm),
n = n(),
missing = sum(is.na({{col}})))
return(k_summary)
}
# compute the individual summaries and combine the results
map_dfr(list(k1, k2, k3), numeric_var_summary, col = variable_name)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 18 x 11
#> year min Q1 median Q3 max Qrange mean sd n missing
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
#> 1 1990 -1.18 -1.06 -0.553 0.312 2.72 1.37 -0.0309 1.38 9 0
#> 2 1991 -2.73 -0.775 -0.245 0.163 0.849 0.938 -0.471 0.954 18 0
#> 3 1992 -1.01 -0.176 0.354 0.735 2.75 0.911 0.344 0.849 21 0
#> 4 1993 -0.681 -0.247 -0.0524 0.567 1.99 0.814 0.256 0.799 14 0
#> 5 1994 -1.84 -1.08 -0.157 0.287 1.80 1.37 -0.280 0.948 18 0
#> 6 1995 -1.20 -0.573 -0.261 0.720 1.97 1.29 0.0397 0.881 20 0
#> 7 1990 -1.76 -0.397 0.283 0.534 1.29 0.931 0.0240 0.870 22 0
#> 8 1991 -2.24 -0.496 -0.112 0.372 1.29 0.868 -0.137 1.01 11 0
#> 9 1992 -1.44 -0.241 0.711 1.17 2.51 1.41 0.449 1.16 12 0
#> 10 1993 -1.92 -0.858 -0.210 0.770 2.31 1.63 -0.0219 1.11 22 0
#> 11 1994 -1.41 -0.207 0.485 0.870 2.23 1.08 0.332 0.987 14 0
#> 12 1995 -2.86 -0.374 0.300 1.05 2.35 1.43 0.221 1.24 19 0
#> 13 1990 -1.49 -1.03 -0.206 0.113 0.851 1.14 -0.292 0.722 14 0
#> 14 1991 -1.67 -0.454 0.139 0.514 1.82 0.968 0.0963 0.956 19 0
#> 15 1992 -2.11 -1.02 -0.217 0.569 1.10 1.59 -0.344 0.986 16 0
#> 16 1993 -1.58 -0.935 -0.0794 0.625 1.26 1.56 -0.160 0.946 10 0
#> 17 1994 -1.93 -0.494 -0.307 0.294 1.60 0.788 -0.186 0.902 22 0
#> 18 1995 -1.49 -0.751 0.374 0.900 2.19 1.65 0.229 1.10 19 0
# verify that the simplified function returns identical results:
numeric_var_summary_manual <- function(x) {
k1_x <- k1 %>%
select(year, {{x}}) %>%
group_by(year) %>%
summarize(min = min({{x}}), Q1 = quantile({{x}}, 0.25), median = median({{x}}),
Q3 = quantile({{x}}, 0.75), max = max({{x}}), Qrange = quantile({{x}}, 0.75) - quantile({{x}}, 0.25),
mean = mean({{x}}), sd = sd({{x}}), n = n(),
missing = sum(is.na({{x}})))
k2_x <- k2 %>%
select(year, {{x}}) %>%
group_by(year) %>%
summarize(min = min({{x}}), Q1 = quantile({{x}}, 0.25), median = median({{x}}),
Q3 = quantile({{x}}, 0.75), max = max({{x}}), Qrange = quantile({{x}}, 0.75) - quantile({{x}}, 0.25),
mean = mean({{x}}), sd = sd({{x}}), n = n(),
missing = sum(is.na({{x}})))
k3_x <- k3 %>%
select(year, {{x}}) %>%
group_by(year) %>%
summarize(min = min({{x}}), Q1 = quantile({{x}}, 0.25),
median = median({{x}}), Q3 = quantile({{x}}, 0.75),
max = max({{x}}), Qrange = quantile({{x}}, 0.75) - quantile({{x}}, 0.25),
mean = mean({{x}}), sd = sd({{x}}), n = n(),
missing = sum(is.na({{x}})))
return(bind_rows(k1_x, k2_x, k3_x))
}
identical(numeric_var_summary_manual(x = variable_name),
map_dfr(list(k1, k2, k3), numeric_var_summary, col = variable_name))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> [1] TRUE

Divide by last row in mutate in Tidyverse

So this is a relatively simple problem, I have a dataset as below
df <- structure(list(term = c("(Intercept)", "overall_quality", "overall_costs",
"wwpf"), estimate = c(0.388607224137536, 0.456477162621961, 0.485612564501229,
NA), std.error = c(0.499812263278414, 0.0987819420575201, 0.108042289289401,
NA), statistic = c(0.777506381273137, 4.62105879995918, 4.49465267438447,
NA), p.value = c(0.440597919486169, 0.0000279867005591494, 0.0000426773877613654,
NA), average = c(NA, 8.09615384615385, 7.86538461538461, 7.90384615384615
), Elasticity = c(NA, 3.69570933584318, 3.81952959386543, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I am trying to use below
df %>% mutate(Elasticity= average*estimate/average[nrow(df)])
Expected output: https://ibb.co/42ptLXx
basically, divide by last row value & since I am trying to incorporate this in function, I need the method to be dynamic & not hard coded value.
Please help !
We can use n() to return the index of last row for subsetting the value of that column
library(dplyr)
df %>%
mutate(Elasticity= average*estimate/average[n()])
If we need a function (using rlang_0.4.0), we can make use {{..}} for evaluation
f1 <- function(dat, col1, col2) {
dat %>%
mutate(Elasticity = {{col1}} * {{col2}}/{{col1}}[n()])
}
f1(df, average, estimate)
# A tibble: 4 x 7
# term estimate std.error statistic p.value average Elasticity
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 0.389 0.500 0.778 0.441 NA NA
#2 overall_quality 0.456 0.0988 4.62 0.0000280 8.10 0.468
#3 overall_costs 0.486 0.108 4.49 0.0000427 7.87 0.483
#4 wwpf NA NA NA NA 7.90 NA

Row products and differences for tibbles created by combn function

I've got a data like below:
df <- structure(list(x1 = c(0.544341260178568, 0.412555423655238, -0.013600925280521,
-0.947831642260442, -0.705819557090343, -0.440052278478676, 0.583360907624305,
-0.548217106316072, -0.381271093402877, 1.66078031000975), x2 = c(-2.17595468838955,
3.73045998213455, 7.88166053118859, 0.295257601073637, -0.503260811313588,
0.118118179398699, 3.77037347523743, 2.92758197923041, 3.40618904087335,
1.45012335878481), x3 = c(14.1085074738418, 9.46630939737492,
7.30026032988652, 10.1473062197382, 11.0336174184083, 7.09744336163716,
16.6871358354018, 13.5030856142587, 14.8384334167838, 1.82381360524456
), x4 = c(-2.78166486821977, -3.14368874900826, -3.70425316743753,
-4.34268218961615, -3.03557313652054, -2.74059520574829, -4.10826186695405,
-1.97243713944283, -3.88803755426516, -2.56315085425652), x5 = c(-0.279614449281486,
-0.480466773938402, -1.43353886424161, 0.286937906279445, 0.701999608919316,
0.591932833840325, 0.994266002713824, 1.03424778687263, 0.462618513817936,
-3.08491622131441)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Now I want to create columns that are sums, products and differences of each pair of column. With sums it went easy:
combn(df, 2, function(x) {
x %>% transmute(!!paste(names(.), collapse = '+') := rowSums(.))},
simplify = FALSE)
But now I need to calculate products and differences. As there are no equivalent to rowSums for difference or product my approach fails here. I was thinking about something like
combn(df, 2, function(x) {
x %>% transmute(!!paste(names(.), collapse = '-') := apply(., 1, `-`)},
simplify = FALSE)
but it doesn't work.
Here's one "tidy" approach. It relies on converting the data to a long format where each row in your original df gets assigned an id and the columns are gathered.
This allows us to do a full join of the data frame with itself. That way you get all pairwise combinations of your columns. Once in this format, applying the sums, products and differences becomes really easy.
Update: Reformat output
library(tidyverse)
df <-
structure(
list(
x1 = c(
0.544341260178568,
0.412555423655238,
-0.013600925280521,-0.947831642260442,
-0.705819557090343,
-0.440052278478676,
0.583360907624305,-0.548217106316072,
-0.381271093402877,
1.66078031000975
),
x2 = c(
-2.17595468838955,
3.73045998213455,
7.88166053118859,
0.295257601073637,
-0.503260811313588,
0.118118179398699,
3.77037347523743,
2.92758197923041,
3.40618904087335,
1.45012335878481
),
x3 = c(
14.1085074738418,
9.46630939737492,
7.30026032988652,
10.1473062197382,
11.0336174184083,
7.09744336163716,
16.6871358354018,
13.5030856142587,
14.8384334167838,
1.82381360524456
),
x4 = c(
-2.78166486821977,
-3.14368874900826,
-3.70425316743753,-4.34268218961615,
-3.03557313652054,
-2.74059520574829,
-4.10826186695405,-1.97243713944283,
-3.88803755426516,
-2.56315085425652
),
x5 = c(
-0.279614449281486,-0.480466773938402,
-1.43353886424161,
0.286937906279445,
0.701999608919316,
0.591932833840325,
0.994266002713824,
1.03424778687263,
0.462618513817936,-3.08491622131441
)
),
row.names = c(NA,-10L),
class = c("tbl_df",
"tbl", "data.frame")
)
# Add an id for each observation and covert to long format
df_wrangled <- df %>%
mutate(id = 1:n()) %>%
gather(col, val, -id)
pairs <- full_join(df_wrangled, df_wrangled, by = "id") %>%
mutate(
sum = val.x + val.y,
prod = val.x * val.y,
diff = val.x - val.y
)
head(pairs)
#> # A tibble: 6 x 8
#> id col.x val.x col.y val.y sum prod diff
#> <int> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 x1 0.544 x1 0.544 1.09 0.296 0
#> 2 1 x1 0.544 x2 -2.18 -1.63 -1.18 2.72
#> 3 1 x1 0.544 x3 14.1 14.7 7.68 -13.6
#> 4 1 x1 0.544 x4 -2.78 -2.24 -1.51 3.33
#> 5 1 x1 0.544 x5 -0.280 0.265 -0.152 0.824
#> 6 2 x1 0.413 x1 0.413 0.825 0.170 0
pairs_wrangled <- pairs %>%
filter(col.x != col.y) %>%
gather(operation, val, sum, prod, diff) %>%
mutate(
label = paste0(
col.x,
case_when(operation == "sum" ~ "+", operation == "diff" ~ "-", operation == "prod" ~ "*"),
col.y
)
) %>%
select(id, label, val) %>%
spread(label, val)
head(pairs_wrangled)
#> # A tibble: 6 x 61
#> id `x1-x2` `x1-x3` `x1-x4` `x1-x5` `x1*x2` `x1*x3` `x1*x4` `x1*x5`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2.72 -13.6 3.33 0.824 -1.18 7.68 -1.51 -0.152
#> 2 2 -3.32 -9.05 3.56 0.893 1.54 3.91 -1.30 -0.198
#> 3 3 -7.90 -7.31 3.69 1.42 -0.107 -0.0993 0.0504 0.0195
#> 4 4 -1.24 -11.1 3.39 -1.23 -0.280 -9.62 4.12 -0.272
#> 5 5 -0.203 -11.7 2.33 -1.41 0.355 -7.79 2.14 -0.495
#> 6 6 -0.558 -7.54 2.30 -1.03 -0.0520 -3.12 1.21 -0.260
#> # … with 52 more variables: `x1+x2` <dbl>, `x1+x3` <dbl>, `x1+x4` <dbl>,
#> # `x1+x5` <dbl>, `x2-x1` <dbl>, `x2-x3` <dbl>, `x2-x4` <dbl>,
#> # `x2-x5` <dbl>, `x2*x1` <dbl>, `x2*x3` <dbl>, `x2*x4` <dbl>,
#> # `x2*x5` <dbl>, `x2+x1` <dbl>, `x2+x3` <dbl>, `x2+x4` <dbl>,
#> # `x2+x5` <dbl>, `x3-x1` <dbl>, `x3-x2` <dbl>, `x3-x4` <dbl>,
#> # `x3-x5` <dbl>, `x3*x1` <dbl>, `x3*x2` <dbl>, `x3*x4` <dbl>,
#> # `x3*x5` <dbl>, `x3+x1` <dbl>, `x3+x2` <dbl>, `x3+x4` <dbl>,
#> # `x3+x5` <dbl>, `x4-x1` <dbl>, `x4-x2` <dbl>, `x4-x3` <dbl>,
#> # `x4-x5` <dbl>, `x4*x1` <dbl>, `x4*x2` <dbl>, `x4*x3` <dbl>,
#> # `x4*x5` <dbl>, `x4+x1` <dbl>, `x4+x2` <dbl>, `x4+x3` <dbl>,
#> # `x4+x5` <dbl>, `x5-x1` <dbl>, `x5-x2` <dbl>, `x5-x3` <dbl>,
#> # `x5-x4` <dbl>, `x5*x1` <dbl>, `x5*x2` <dbl>, `x5*x3` <dbl>,
#> # `x5*x4` <dbl>, `x5+x1` <dbl>, `x5+x2` <dbl>, `x5+x3` <dbl>,
#> # `x5+x4` <dbl>
Created on 2019-04-02 by the reprex package (v0.2.1)

Resources