I want to apply a custom function to all pairs of columns in a dataframe to get a p x p matrix/dataframe of the results. Is there a quick way to do that in the tidyverse?
The output should be the results data frame.
# Custom pairwise statistic: here simply the sum of all elements of both inputs.
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
# Desired output, built by hand: a 3 x 3 grid where cell (i, j) holds
# custom_function(column i, column j). The question asks how to avoid
# enumerating all p^2 pairs manually.
result <- tibble(cols = c("x","y","z"),
x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))
result
You can use the following solution:
# Build every ordered pair of column names, evaluate custom_function on each
# pair, then reshape the long result into a p x p data frame.
library(dplyr)
library(tibble)
library(tidyr)  # pivot_wider() lives in tidyr; it was not loaded before
expand.grid(names(data), names(data)) %>%
  rowwise() %>%
  # Var1/Var2 are factors, so convert to character before indexing the data.
  mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
  # names_from takes the column name directly; wrapping it in unique() was a no-op.
  pivot_wider(names_from = "Var1", values_from = "Res") %>%
  column_to_rownames("Var2")
x y z
x -0.3591433 2.157343 -1.470995
y 2.1573430 4.673829 1.045491
z -1.4709953 1.045491 -2.582847
One idea:
library(dplyr, warn.conflicts = FALSE)
# Pairwise statistic: sum of all elements of both inputs.
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
# Long format: one row per (row id, column name, value).
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
# Self-joining on the row id pairs every column with every other column;
# each (name.x, name.y) group then holds the two full columns to compare.
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = custom_function(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here it is organized as a function:
library(dplyr, warn.conflicts = FALSE)
# Same pairwise statistic and example data as above, repeated so the
# function version below is a self-contained reprex.
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
# Apply f to every ordered pair of columns of `df`.
#
# @param df A data frame / tibble of columns to compare pairwise.
# @param f  A function taking two vectors and returning a scalar.
# @return   A tibble with a `cols` column plus one column per input column;
#           cell (i, j) is f(column_i, column_j).
#
# Fix: the original body referenced the global `data` instead of the `df`
# argument, silently ignoring the parameter. It now uses `df`.
custom_summ <- function(df, f) {
  # Long format: one row per (row id, column name, value).
  data_long <-
    df %>%
    mutate(id = 1:nrow(.)) %>%
    tidyr::pivot_longer(cols = -id)
  # Self-join on the row id to pair every column with every other column.
  result <-
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  result
}
custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here are some benchmarking data for the various options. The tidyverse approach offered in the accepted answer is not a good one if performance is at all a concern. The fastest option here is the sapply-based one offered in a comment to the question.
library(tidyverse)
# Pairwise statistic used throughout the benchmarks.
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
# Benchmark fixture: a 26-column, 1000-row tibble (one column per letter),
# large enough that differences between approaches are measurable.
get_data <- function() {
data <- lapply(letters, function(i) rnorm(1000))
names(data) <- letters
as_tibble(data)
}
# Benchmark variant: long-format self-join approach.
#
# @param df A data frame / tibble of columns to compare pairwise.
# @param f  A function taking two vectors and returning a scalar.
# @return   A tibble; cell (i, j) is f(column_i, column_j).
#
# Fix: the original body referenced the global `data` instead of the `df`
# argument, silently ignoring the parameter. It now uses `df`.
custom_summ <- function(df, f) {
  data_long <-
    df %>%
    mutate(id = 1:nrow(.)) %>%
    pivot_longer(cols = -id)
  result <-
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  result
}
data <- get_data()
system.time(custom_summ(data, custom_function))
#> user system elapsed
#> 0.053 0.007 0.062
# map2() version: builds a list-column of results for every ordered pair of
# column names, then widens to a matrix. The benchmark below shows this is
# by far the slowest option here.
custom_summ_2 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
}
system.time(custom_summ_2(data, custom_function))
#> user system elapsed
#> 26.479 0.317 27.365
# rowwise() version: evaluate f once per pair of column names, then widen.
#
# @param data A data frame / tibble of columns to compare pairwise.
# @param f    A function taking two one-column data frames, returning a scalar.
custom_summ_3 <- function(data, f) {
  expand.grid(names(data), names(data)) %>%
    rowwise() %>%
    # Var1/Var2 are factors; index the data frame by character name.
    mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
    # names_from takes the column name directly; unique("Var1") was a no-op.
    pivot_wider(names_from = "Var1", values_from = "Res") %>%
    column_to_rownames("Var2")
}
system.time(custom_summ_3(data, custom_function))
#> user system elapsed
#> 0.048 0.001 0.049
# Fastest option in the benchmarks: plain nested sapply() over the columns.
# For each pair of columns (first, second), evaluate f(first, second);
# sapply() assembles the results into a named p x p matrix.
custom_summ_4 <- function(data, f) {
  sapply(data, function(second) sapply(data, function(first) f(first, second)))
}
system.time(custom_summ_4(data, custom_function))
#> user system elapsed
#> 0.003 0.000 0.003
# outer() needs a vectorized function, so the scalar f is wrapped with
# Vectorize(). Each cell is f(data[i], data[j]) for a pair of column names.
custom_summ_5 <- function(data, f) {
  cols <- names(data)
  pairwise <- Vectorize(function(i, j) f(data[i], data[j]))
  outer(cols, cols, FUN = pairwise)
}
system.time(custom_summ_5(data, custom_function))
#> user system elapsed
#> 0.044 0.001 0.045
Created on 2021-07-11 by the reprex package (v2.0.0)
We could have used outer() directly if custom_function were vectorized. But it uses sum(), which is a scalar function, so we make it usable by wrapping it in Vectorize() in the FUN = argument of outer(). Do it like this-
outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))
tidyverse strategy: though a little verbose, you can manage this approach in the tidyverse if you want.
library(tidyverse)
# Pairwise statistic: sum of all elements of both inputs.
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
# All ordered pairs of column names -> evaluate the function per pair ->
# widen back into a p x p matrix with row/column names.
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
#> x y z
#> x -0.3591433 2.157343 -1.470995
#> y 2.1573430 4.673829 1.045491
#> z -1.4709953 1.045491 -2.582847
Created on 2021-07-10 by the reprex package (v2.0.0)
Related
I would like to randomly add NA values to my dataframe with the proportion set by group.
library(tidyverse)
set.seed(1)
# Two groups of 100 rows each; the goal is group-specific missingness rates.
dat <- tibble(group = c(rep("A", 100),
rep("B", 100)),
value = rnorm(200))
pA <- 0.5
pB <- 0.2
# Does not work: the idea was to create another column usable with
# case_when() to set value to NA if missing == 1.
# NOTE(review): rbinom(n(), 1, c(pA, pB)) recycles c(pA, pB) across the rows
# *within* each group rather than using one probability per group.
dat %>%
group_by(group) %>%
mutate(missing = rbinom(n(), 1, c(pA, pB))) %>%
summarise(mean = mean(missing))
I'd create a small tibble to keep track of the expected missingness rates, and join it to the first data frame. Then go through row by row to decide whether to set a value to missing or not.
This is easy to generalize to more than two groups as well.
library("tidyverse")
set.seed(1)
dat <- tibble(
group = c(
rep("A", 100),
rep("B", 100)
),
value = rnorm(200)
)
# Lookup table of per-group missingness probabilities; easy to extend to
# more than two groups.
expected_nans <- tibble(
group = c("A", "B"),
p = c(0.5, 0.2)
)
# Join the target probability onto every row, draw a uniform variate per
# row, and blank the value whenever the draw falls below the probability.
dat_with_nans <- dat %>%
inner_join(
expected_nans,
by = "group"
) %>%
mutate(
r = runif(n()),
value = if_else(r < p, NA_real_, value)
) %>%
select(
-p, -r
)
# Sanity check: observed NA rate per group should be near 0.5 / 0.2.
dat_with_nans %>%
group_by(
group
) %>%
summarise(
mean(is.na(value))
)
#> # A tibble: 2 × 2
#> group `mean(is.na(value))`
#> <chr> <dbl>
#> 1 A 0.53
#> 2 B 0.17
Created on 2022-03-11 by the reprex package (v2.0.1)
Nesting and unnesting works
library(tidyverse)
dat <- tibble(group = c(rep("A", 1000),
rep("B", 1000)),
value = rnorm(2000))
pA <- .1
pB <- 0.5
set.seed(1)
# Nest per group, attach the group-specific probability, draw the missing
# indicator inside each nested frame, then unnest and verify the rates.
# NOTE(review): unnest() without a `cols` argument is deprecated in current
# tidyr — consider unnest(data).
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
summarise(mean = mean(missing))
# A tibble: 2 × 2
group mean
<chr> <dbl>
1 A 0.11
2 B 0.481
set.seed(1)
# Same nesting scheme as above, but this time actually blank the values
# flagged as missing and drop the helper columns.
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
ungroup() %>%
mutate(value = case_when(
missing == 1 ~ NA_real_,
TRUE ~ value
)) %>%
select(-p, -missing)
library(tidyverse)
# Make a sample data frame: survey year, gender, and two Yes/No/Unknown items.
a <- c(2000,2000,2000,2000,2001,2001,2001,2001)
b <- c("M","M","M","F","F","M","F","F")
d<- c("Yes","No","Yes","No","No","Unknown","Unknown","Yes")
e <- c("Unknown","No","No","Yes","Unknown","Yes","No","Unknown")
df <- data.frame(a,b,d,e)
colnames(df) <- c("Year","Gender","q1","q2")
# Make a count/percentage cross-table for q1 by Gender: long format,
# count each (Gender, q1 answer) cell, format "n(prop%)", then widen so the
# q1 answers become columns.
myvar <- c("Gender","q1")
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-q1) %>%
group_by(name,q1,value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = "q1", values_from = "summary_str")
# Make the function creating a table.
# NOTE(review): this is the asker's non-working attempt, kept as-is. It fails
# (see the error below) because `x` is passed as a bare symbol (q1), so
# paste0(x) tries to evaluate the object q1 and errors; "gender" also does
# not match the actual column name "Gender" set above.
maketable <- function(df,x){
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
colnames(table1)
}
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
# Build the Gender-by-answer summary table for an unquoted column `x`,
# injected into the dplyr/tidyr verbs with the {{ }} (embrace) operator.
maketable <- function(df,x){
df %>%
select(Gender, {{x}}) %>%
pivot_longer(-{{x}}) %>%
group_by(name,{{x}},value)%>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = {{x}}, values_from = "summary_str")
}
table1 <-maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
# Count-based alternative: counts of `x` by Gender, widened by answer.
# NOTE(review): relies on the global `df` rather than taking the data frame
# as an argument, so it only works when `df` exists in the calling scope.
my_func = function(x)
{
new_df = df %>% group_by(Gender) %>% count({{x}}) %>% pivot_wider(names_from = {{x}}, values_from = n)
return(new_df)
}
I'm not sure that this is what you asked
# Loop over question columns by name (as strings) and build one summary
# table per column; returns a list of tables.
colns <- colnames(df)
lapply(colns[c(3:4)], function(x) {
myvar <- c("Gender", x)
mydf <- df[,myvar]
table1 <- mydf%>%
pivot_longer(-x) %>%
group_by_all %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = x, values_from = "summary_str")
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
You may need to change
lapply(colns[c(3:4)],...
3:4 to 3:102 for q1~q100
Consider the following simple dplyr pipeline in R:
# Three groups of five values, each value ranked within its group.
df <- data.frame(group = rep(LETTERS[1:3],each=5), value = rnorm(15)) %>%
group_by(group) %>%
mutate(rank = rank(value, ties.method = 'min'))
# The repetitive pattern the question wants to avoid: one summarise column
# per rank threshold i, each averaging the values with rank <= i.
df %>%
group_by(group) %>%
summarise(mean_1 = mean(value[rank <= 1]),
mean_2 = mean(value[rank <= 2]),
mean_3 = mean(value[rank <= 3]),
mean_4 = mean(value[rank <= 4]),
mean_5 = mean(value[rank <= 5]))
How can I avoid typing out mean_i = mean(value[rank <= i]) for all i without reverting to a loop over group and i? Specifically, is there a neat way to iteratively create variables with the dplyr::summarise function?
You are actually calculating a cumulative mean here. There is a function cummean in dplyr which we can use here and then cast the data to wide format.
library(tidyverse)
# Within each group, sort by rank so cummean(value) at rank i equals the
# mean of the i smallest values; then widen ranks into mean_1..mean_5.
df %>%
arrange(group, rank) %>%
group_by(group) %>%
mutate(value = cummean(value)) %>%
pivot_wider(names_from = rank, values_from = value, names_prefix = 'mean_')
# group mean_1 mean_2 mean_3 mean_4 mean_5
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.560 -0.395 -0.240 -0.148 0.194
#2 B -1.27 -0.976 -0.799 -0.484 -0.0443
#3 C -0.556 -0.223 -0.0284 0.0789 0.308
If you are asking for a general solution and calculating cumulative mean is just an example in that case you can use map.
# General solution for any per-threshold statistic: one summarise per
# threshold (with a dynamically named column via !! / :=), joined back
# together on group.
n <- max(df$rank)
map(seq_len(n), ~df %>%
group_by(group) %>%
summarise(!!paste0('mean_', .x):= mean(value[rank <= .x]))) %>%
reduce(inner_join, by = 'group')
data
# Reproducible version of the question's data (seed added).
set.seed(123)
df <- data.frame(group = rep(LETTERS[1:3],each=5), value = rnorm(15)) %>%
group_by(group) %>%
mutate(rank = rank(value, ties.method = 'min'))
I am trying to create a function to run chi squared where I have to group by several groups. However, while the method works when it's not a function, I am having trouble turning it into a function. As I'll be repeating the procedure multiple times, it seems worth doing, but I just can't get the function to recognise the "z" variable and always get the "Unknown or uninitialised column" warning.
Example is below.
library(tidyverse)
library(datasets)
# Data: reshape iris to long form and split e.g. "Sepal.Length" into
# type = "Sepal" and attribute = "Length".
data(iris)
df<-iris%>%
gather(Type, value, -Species)%>%
separate(Type, c("type", "attribute"), sep="[.]")
#functions------------
# Count combinations of three user-supplied columns, dropping rows where
# any of them is NA. Columns are captured with enquo() and spliced with !!.
frequency<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data%>%
filter(!is.na(!!x), !is.na(!!y), !is.na(!!z))%>%
count(!!x, !!y, !!z)
}
# Asker's non-working version: per-x-group chi-squared test on the y-by-z
# contingency table. The dat2$'z' line is the source of the warning shown
# below — $ with a literal string looks for a column actually named "z".
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2$'z' #I've done it like this because z <- enquo(z) and dat2$!!z doesn't work. Just having it as z doesn't work either.
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
#applying them--------------------
test<-frequency(df, type, Species, attribute)
chi_test<-group_chi(test, type, Species, attribute)#brings up warning
#> Warning: Unknown or uninitialised column: 'z'.
#> Warning: Unknown or uninitialised column: 'z'.
# Test without the function = no warning: the same pipeline with the column
# names hard-coded (Species, attribute) instead of passed as arguments.
No_function<-test %>%
group_by(type) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(Species, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2$attribute
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
# in the example the results are the same but....the warning message is of concern and the function doesn't output the same in a more complex dataset.
chi_test
#> # A tibble: 2 x 2
#> type pvalue
#> <chr> <dbl>
#> 1 Petal 1
#> 2 Sepal 1
No_function
#> # A tibble: 2 x 2
#> type pvalue
#> <chr> <dbl>
#> 1 Petal 1
#> 2 Sepal 1
# what am I doing wrong?
Created on 2020-01-27 by the reprex package (v0.3.0)
What am I doing wrong here?
You can't use $ for an indirect column reference (as in dat2$'z'), instead use dat2[[z]]. When I replace that, there are no warnings/errors.
Try this version of your function instead:
# Fixed version: indirect column reference via dat2[[z]] instead of dat2$'z'.
# Here z is expected as a *string* (see the call below), so [[ ]] looks the
# column up by name.
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2[[z]]
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
and then call with the string:
chi_test <- group_chi(test, type, Species, "attribute")
Alternatively, you can first z <- enquo(z) then pull(dat2, !!z) (as in #akrun's answer).
# Alternative keeping the unquoted interface: capture z with enquo() and
# extract the row-name column with pull(dat2, !!z).
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- pull(dat2, !!z)
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
group_chi(test, type, Species, attribute)
# # A tibble: 2 x 2
# type pvalue
# <chr> <dbl>
# 1 Petal 1
# 2 Sepal 1
We could also use z <- enquo(z), then make use of the select and pull to extract the column as a vector
# Variant: enquo() z, then select(!!z) %>% pull(1) to extract the row-name
# column as a vector.
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2 %>%
select(!!z) %>%
pull(1)
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
Checking:
chi_test <- group_chi(test, type, Species, attribute)
chi_test
# A tibble: 2 x 2
# type pvalue
# <chr> <dbl>
#1 Petal 1
#2 Sepal 1
With the newer versions of tidyverse, the curly-curly operator ({{}}) can replace the !!/enquo
# Modern tidyverse version: the curly-curly operator {{ }} replaces the
# enquo()/!! pairs for all three column arguments.
group_chi<-function(data, x, y, z){
data %>%
group_by({{x}}) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread({{y}}, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2 %>%
pull({{z}})
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
chi_test <- group_chi(test, type, Species, attribute)
Consider the following: I create a df_tbl with 1,000,000 rows. Large enough such that computation time isn't trivially fast.
I put the dataframe into Spark, and perform computations with the dataframe held in RAM, and the Spark dataframe.
Microbenchmark suggests that the computation with the Spark dataframe is faster, as would be expected, yet, when I'm programming interactively the computation involving the Spark dataframe is noticeably slower to return a result.
I'm curious as to what is going on. Example code given below:
library(sparklyr)
library(dplyr)
sc <- spark_connect(master = "local")
#> * Using Spark: 2.2.0
# main --------------------------------------------------------------------
# One million rows so the computation time is non-trivial.
N <- 1000000
df <- data_frame(
CASENO = 1000001:(1000000 + N),
sex = sample(1:2, N, rep = TRUE),
group = sample(1:5, N, rep = TRUE),
x = abs(rnorm(N)),
y = rnorm(N),
z = rnorm(N)
)
# Same data copied into Spark for comparison.
spark_df <- sdf_copy_to(sc, df, "spark_df", memory = FALSE, overwrite = TRUE)
# Benchmark the identical grouped-summary pipeline on both frames.
# NOTE(review): the Spark pipeline is lazy — the benchmarked expression is
# never printed/collected here, which presumably explains why it looks so
# fast in the benchmark yet feels slow interactively; confirm by forcing
# collect() inside the benchmark.
benchmark <- microbenchmark::microbenchmark(
df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)),
spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
)
summary(benchmark)
#> expr
#> 1 df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
#> 2 spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
#> min lq mean median uq max neval
#> 1 36.92519 39.119954 43.993727 41.522914 45.885576 107.71227 100
#> 2 1.12158 1.279999 1.855679 1.423407 1.551012 20.22911 100
# Wall-clock timing of the same two pipelines when their results are
# actually printed at top level (as in interactive use).
start1 <- proc.time()
df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
end1 <- proc.time() - start1
start2 <- proc.time()
spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
end2 <- proc.time() - start2
end1
#> user system elapsed
#> 0.33 0.04 0.37
end2
#> user system elapsed
#> 0.18 0.00 7.51
Created on 2018-03-27 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).