I am trying to create a function to run chi squared where I have to group by several groups. However, while the method works when it's not a function, I am having trouble turning into a function. As I'll be repeating the procedure multiple times, its seems worth doing, but I just can't get the function to recognise the "z" variable and always get the "Unknown or uninitialised column" warning.
Example is below.
library(tidyverse)
library(datasets)
#data
data(iris)
df<-iris%>%
gather(Type, value, -Species)%>%
separate(Type, c("type", "attribute"), sep="[.]")
#functions------------
frequency<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data%>%
filter(!is.na(!!x), !is.na(!!y), !is.na(!!z))%>%
count(!!x, !!y, !!z)
}
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2$'z' #I've done it like this becasue z <- enquo(z) and dat2$!!z doesn't work. jsut having it a z doesnt work either
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
#aplying them--------------------
test<-frequency(df, type, Species, attribute)
chi_test<-group_chi(test, type, Species, attribute)#brings up warning
#> Warning: Unknown or uninitialised column: 'z'.
#> Warning: Unknown or uninitialised column: 'z'.
#test without the function=no warning.
No_function<-test %>%
group_by(type) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(Species, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2$attribute
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
# in the example the results are the same but....the warning message is of concern and the function doesn't output the same in a more compelx dataset.
chi_test
#> # A tibble: 2 x 2
#> type pvalue
#> <chr> <dbl>
#> 1 Petal 1
#> 2 Sepal 1
No_function
#> # A tibble: 2 x 2
#> type pvalue
#> <chr> <dbl>
#> 1 Petal 1
#> 2 Sepal 1
# what am I doing wrong?
Created on 2020-01-27 by the reprex package (v0.3.0)
What am I doing wrong here?
You can't use $ for an indirect column reference (as in dat2$'z'), instead use dat2[[z]]. When I replace that, there are no warnings/errors.
Try this version of your function instead:
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2[[z]]
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
and then call with the string:
chi_test <- group_chi(test, type, Species, "attribute")
Alternatively, you can first z <- enquo(z) then pull(dat2, !!z) (as in #akrun's answer).
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- pull(dat2, !!z)
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
group_chi(test, type, Species, attribute)
# # A tibble: 2 x 2
# type pvalue
# <chr> <dbl>
# 1 Petal 1
# 2 Sepal 1
We could also use z <- enquo(z), then make use of the select and pull to extract the column as a vector
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2 %>%
select(!!z) %>%
pull(1)
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
-checking
chi_test <- group_chi(test, type, Species, attribute)
chi_test
# A tibble: 2 x 2
# type pvalue
# <chr> <dbl>
#1 Petal 1
#2 Sepal 1
With the newer versions of tidyverse, the curly-curly operator ({{}}) can replace the !!/enquo
group_chi<-function(data, x, y, z){
data %>%
group_by({{x}}) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread({{y}}, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2 %>%
pull({{z}})
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
chi_test <- group_chi(test, type, Species, attribute)
Related
library(tidyverse)
#make a sample data frame
a <- c(2000,2000,2000,2000,2001,2001,2001,2001)
b <- c("M","M","M","F","F","M","F","F")
d<- c("Yes","No","Yes","No","No","Unknown","Unknown","Yes")
e <- c("Unknown","No","No","Yes","Unknown","Yes","No","Unknown")
df <- data.frame(a,b,d,e)
colnames(df) <- c("Year","Gender","q1","q2")
# make a table for q1
myvar <- c("Gender","q1")
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-q1) %>%
group_by(name,q1,value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = "q1", values_from = "summary_str")
#make the function creating a table
maketable <- function(df,x){
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
colnames(table1)
}
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
maketable <- function(df,x){
df %>%
select(Gender, {{x}}) %>%
pivot_longer(-{{x}}) %>%
group_by(name,{{x}},value)%>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = {{x}}, values_from = "summary_str")
}
table1 <-maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
my_func = function(x)
{
new_df = df %>% group_by(Gender) %>% count({{x}}) %>% pivot_wider(names_from = {{x}}, values_from = n)
return(new_df)
}
I'm not sure that this is what you asked
colns <- colnames(df)
lapply(colns[c(3:4)], function(x) {
myvar <- c("Gender", x)
mydf <- df[,myvar]
table1 <- mydf%>%
pivot_longer(-x) %>%
group_by_all %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = x, values_from = "summary_str")
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
You may need to change
lapply(colns[c(3:4)],...
3:4 to 3:102 for q1~q100
I want to apply a custom function to all pairs of columns in a dataframe to get a p x p matrix/dataframe of the results. Is there a quick way to do that in the tidyverse?
The output should be the results data frame.
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
result <- tibble(cols = c("x","y","z"),
x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))
result
You can use the following solution:
library(dplyr)
library(tibble)
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = unique("Var1"), values_from = "Res") %>%
column_to_rownames("Var2")
x y z
x -0.3591433 2.157343 -1.470995
y 2.1573430 4.673829 1.045491
z -1.4709953 1.045491 -2.582847
One idea:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = custom_function(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here it is organized as a function:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
custom_summ <- function(df, f) {
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here are some benchmarking data for the various options. The tidyverse approach offered in the accepted answer is not a good one if performance is at all a concern. The fastest option here is the sapply-based one offered in a comment to the question.
library(tidyverse)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
get_data <- function() {
data <- lapply(letters, function(i) rnorm(1000))
names(data) <- letters
as_tibble(data)
}
custom_summ <- function(df, f) {
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
data <- get_data()
system.time(custom_summ(data, custom_function))
#> user system elapsed
#> 0.053 0.007 0.062
custom_summ_2 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
}
system.time(custom_summ_2(data, custom_function))
#> user system elapsed
#> 26.479 0.317 27.365
custom_summ_3 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = unique("Var1"), values_from = "Res") %>%
column_to_rownames("Var2")
}
system.time(custom_summ_3(data, custom_function))
#> user system elapsed
#> 0.048 0.001 0.049
custom_summ_4 <- function(data, f) {
sapply(data, function(y) sapply(data, f, y = y))
}
system.time(custom_summ_4(data, custom_function))
#> user system elapsed
#> 0.003 0.000 0.003
custom_summ_5 <- function(data, f) {
outer(names(data), names(data),
FUN = Vectorize(function(x, y) f (data[x], data[y])))
}
system.time(custom_summ_5(data, custom_function))
#> user system elapsed
#> 0.044 0.001 0.045
Created on 2021-07-11 by the reprex package (v2.0.0)
We could have used outer directly if the custom_function was a vectorized one. But it is using sum which is an scalar function so we can use it by wrapping it around Vectorize() in FUN = argument in outer. Do it like this-
outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))
tidyverse strategy Though a little verbose but you can manage this approach in tidyverse, if you want.
library(tidyverse)
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
#> x y z
#> x -0.3591433 2.157343 -1.470995
#> y 2.1573430 4.673829 1.045491
#> z -1.4709953 1.045491 -2.582847
Created on 2021-07-10 by the reprex package (v2.0.0)
I cannot figure out why the bang-bang operator in my function is not unquoting my grp argument. Any help would be much appreciated!
library(dplyr)
test_func <- function(dat, grp){
dat %>%
group_by(!!grp) %>%
summarise(N = n())
}
test_func(dat = iris, grp = "Species")
Instead of grouping by species it just produces the summary for the entire data:
If we are passing a string, then convert to symbol and evaluate (!!)
test_func <- function(dat, grp){
dat %>%
group_by(!! rlang::ensym(grp)) %>%
summarise(N = n(), .groups = 'drop')
}
-testing
test_func(dat = iris, grp = "Species")
# A tibble: 3 x 2
# Species N
#* <fct> <int>
#1 setosa 50
#2 versicolor 50
#3 virginica 50
Or another option is to use across
test_func <- function(dat, grp){
dat %>%
group_by(across(all_of(grp))) %>%
summarise(N = n(), .groups = 'drop')
}
My data is below
grp <- paste('group', sample(1:3, 100, replace = T))
x <- rnorm(100, 100)
y <- rnorm(100, 10)
df <- data.frame(grp = grp, x =x , y =y , stringsAsFactors = F)
lag_size <- c(10, 4, 9)
Now when I try to use
df %>% group_by(grp) %>% mutate_all(lag, n = lag_size) %>% arrange(grp)
it gives an error
Error in mutate_impl(.data, dots) :
Expecting a single value:
whereas this works fine
df %>% group_by(grp) %>% mutate_all(lag, n = 10) %>% arrange(grp)
If we need to do the lag based on the 'grp' i.e. to lag the corresponding 'grp' with the value specified in 'lag_size'
library(tidyverse)
res <- map2(split(df[2:3], df$grp) , lag_size, ~.x %>%
mutate_all(lag, n = .y)) %>%
bind_rows(., .id = 'grp')
We can check the lag in 'grp' by the position of the first non-NA element
res %>%
group_by(grp) %>%
summarise(n = which(!is.na(x))[1]-1)
# A tibble: 3 x 2
# grp n
# <chr> <dbl>
#1 group 1 10
#2 group 2 4
#3 group 3 9
I have a dataset with tons of factors and I want to get the relative frequencies of each factor based on another factor. For example, let's use mtcars:
mtcars$am <- as.factor(mtcars$am)
mtcars$cyl <- as.factor(mtcars$cyl)
I want to get the frequencies where am == 1, based on the values of cyl. In this case, I should get three relative frequencies because cyl has three levels (4, 6, and 8). I have this code working:
mtcars %>%
select(am, cyl) %>%
table(.) %>%
prop.table(., 1) %>%
round(., digits = 2) %>%
data.frame() %>%
filter(am == 1) %>%
t() %>%
data.frame() %>%
slice(3)
# # A tibble: 1 x 3
# X1 X2 X3
# <fctr> <fctr> <fctr>
# 1 0.62 0.23 0.15
If you run it, you'll get the three frequencies above. Of course, I built this code so I know that X1 corresponds to the frequency where cyl == 4, X2 is cyl == 6, and X3 is cyl == 8.
Now, I want to do this with tons of factors (other binary factors like am). So, I want to build a custom function, bind all the frequencies later as rows, and create a nice table with these frequencies. Right now, I have this:
pull_freq <- function(mydata, var1, var2){
require(tidyverse)
var1 <- enquo(var1)
var2 <- enquo(var2)
mydata %>%
select(!!var1, !!var2) %>%
table(.) %>%
prop.table(., 1) %>%
round(., digits = 2) %>%
data.frame() %>%
filter(!!var1 == 1) %>%
t() %>%
data.frame() %>%
slice(3)
}
pull_freq(mtcars, am, cyl)
# A tibble: 1 x 0
But as you can see, when I run this function, I don't get any output. Any ideas of why I don't get any output? How can I get this function to work? Thank you!
custom function
myfun <- function(df, col1, col2, col3) {
require(dplyr)
require(tidyr)
col1 <- enquo(col1)
col2 <- enquo(col2)
df %>%
count(!!col1, !!col2) %>%
group_by(!!col1) %>%
mutate(tot = sum(n)) %>%
ungroup() %>%
group_by(!!col2) %>%
mutate(n = n / tot) %>%
select(-tot) %>%
filter(UQ(col1)==1) %>%
spread_(col3, "n") %>%
round(., digits=2)
}
Output
myfun(mtcars, am, cyl, "cyl")
# am `4` `6` `8`
# 1 0.62 0.23 0.15
Maybe I'm completely off, but is this it?
data(mtcars)
agg <- aggregate(mtcars$cyl, list(mtcars$cyl, mtcars$am), FUN = length)
names(agg) <- c("cyl", "am", "count")
agg$freq <- ave(agg$count, agg$am, FUN = function(x) x/sum(x))
agg <- t(agg[-3])
agg
Note that I have not coerced cyl and am to factors with as.factor. This is because when the data frame would be transposed, the result would be a matrix. And since matrices can only have elements of one class, all the values would become of class character. The freq values would no longer be numeric.
How about this,
library(tidyverse)
getFreq <- function(data, group_var, value_var) {
data %>%
group_by_(group_var) %>%
do({
table(.[[value_var]]) %>%
prop.table() %>%
as_tibble()
}) %>%
spread(Var1, n)
}
getFreq(mtcars, "am", "cyl") %>% print()
You can do all filtering afterwards or just include inside the function.