Related
I want to do something like
df1 <- iris %>% distinct(Species, .keep_all = TRUE) %>% group_by(Petal.Width) %>% summarise(Sepal.Length.mean1=mean(Sepal.Length), .groups = "drop")
df2 <- iris %>% distinct(Species, Petal.Width, .keep_all = TRUE) %>% group_by(Petal.Width) %>% summarise(Sepal.Length.mean2 =mean(Sepal.Length), .groups = "drop")
inner_join(df1, df2, by="Petal.Width")
But this is tedious to read because of the repetition. Is it possible to do all in one pipe? I cannot recover the initial dataset after distinct() so I wonder if there's a replacement to that.
A possible solution is to create first a function and then use it inside pipes:
library(tidyverse)
f <- function(df = iris, var1 = Species, var2 = Petal.Width,
var3 = Sepal.Length, i)
{
x <- enquo(var3)
{{df}} %>%
distinct({{var1}}, .keep_all = TRUE) %>% group_by({{var2}}) %>%
summarise(!!str_c(quo_name(x), ".mean", i , sep = "") := mean({{var3}}),
.groups = "drop")
}
inner_join(f(i = 1), f(i = 2), by="Petal.Width")
#> # A tibble: 3 × 3
#> Petal.Width Sepal.Length.mean1 Sepal.Length.mean2
#> <dbl> <dbl> <dbl>
#> 1 0.2 5.1 5.1
#> 2 1.4 7 7
#> 3 2.5 6.3 6.3
A workaround would be to use an expression with {}
Here is the beginning of the solution
iris %>% {
df1 <- distinct(., Species, .keep_all = TRUE)
df2 <- distinct(., Species, Petal.Width, .keep_all = TRUE)
list(df1, df2)} %>%
map(~ group_by(.x, Petal.Width)) # SOLUTION TO BE COMPLETED
I want to create a gt table where I see some metrics like number of observations, mean and median, and I want a column with its histogram. For this question I will use the iris dataset.
I have recently learned how to put a plot in a tibble using this code:
library(dplyr)
library(tidyr)
library(purrr)
library(gt)
my_tibble <- iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
group_by(Vars) %>%
summarise(obs = n(),
mean = round(mean(Values),2),
median = round(median(Values),2),
plots = list(ggplot(cur_data(), aes(Values)) + geom_histogram()))
Now I want to use the plots column for plotting an histogram per variable, so I have tried this:
my_tibble %>%
mutate(ggplot = NA) %>%
gt() %>%
text_transform(
locations = cells_body(vars(ggplot)),
fn = function(x) {
map(.$plots,ggplot_image)
}
)
But it returns me an error:
Error in body[[col]][stub_df$rownum_i %in% loc$rows] <- fn(body[[col]][stub_df$rownum_i %in% :
replacement has length zero
The gt table should be like this:
Any help will be greatly appreciated.
After reviewing the excellent ideas from #akrun and #TarJae, I have this solution that gives the required gt table:
plots <- iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
group_by(Vars) %>%
nest() %>%
mutate(plot = map(data,
function(df) df %>%
ggplot(aes(Values)) +
geom_histogram())) %>%
select(-data)
iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
group_by(Vars) %>%
summarise(obs = n(),
mean = round(mean(Values),2),
median = round(median(Values),2)) %>%
mutate(ggplot = NA) %>%
gt() %>%
text_transform(
locations = cells_body(vars(ggplot)),
fn = function(x) {
map(plots$plot, ggplot_image, height = px(100))
}
)
And this is the table:
I had to create the plot outside the output table, so I could call it in the gt table.
We need to loop over the plots
library(dplyr)
library(tidyr)
library(purrr)
library(gt)
library(ggplot2)
iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
nest_by(Vars) %>%
mutate(n = nrow(data),
mean = round(mean(data$Values), 2),
median = round(median(data$Values), 2),
plots = list(ggplot(data, aes(Values)) + geom_histogram()), .keep = "unused") %>%
ungroup %>%
mutate(ggplot = NA) %>%
{dat <- .
dat %>%
select(-plots) %>%
gt() %>%
text_transform(locations = cells_body(c(ggplot)),
fn = function(x) {
map(dat$plots, ggplot_image, height = px(100))
}
)
}
-check for the output
Update: See comments:
For your purposes in accordance with a shiny app you may use summarytools see here: https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html
it is compatible with r shiny!
Here is a small example:
library(summarytools)
dfSummary(iris,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.75,
valid.col = FALSE,
tmp.img.dir = "/tmp")
view(dfSummary(iris))
Try this:
library(skimr)
skim(iris)
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
* <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 Sepal.Length 0 1 5.84 0.828 4.3 5.1 5.8 6.4 7.9 ▆▇▇▅▂
2 Sepal.Width 0 1 3.06 0.436 2 2.8 3 3.3 4.4 ▁▆▇▂▁
3 Petal.Length 0 1 3.76 1.77 1 1.6 4.35 5.1 6.9 ▇▁▆▇▂
4 Petal.Width 0 1 1.20 0.762 0.1 0.3 1.3 1.8 2.5 ▇▁▇▅▃
library(tidyverse)
#make a sample data frame
a <- c(2000,2000,2000,2000,2001,2001,2001,2001)
b <- c("M","M","M","F","F","M","F","F")
d<- c("Yes","No","Yes","No","No","Unknown","Unknown","Yes")
e <- c("Unknown","No","No","Yes","Unknown","Yes","No","Unknown")
df <- data.frame(a,b,d,e)
colnames(df) <- c("Year","Gender","q1","q2")
# make a table for q1
myvar <- c("Gender","q1")
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-q1) %>%
group_by(name,q1,value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = "q1", values_from = "summary_str")
#make the function creating a table
maketable <- function(df,x){
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
colnames(table1)
}
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
maketable <- function(df,x){
df %>%
select(Gender, {{x}}) %>%
pivot_longer(-{{x}}) %>%
group_by(name,{{x}},value)%>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = {{x}}, values_from = "summary_str")
}
table1 <-maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
my_func = function(x)
{
new_df = df %>% group_by(Gender) %>% count({{x}}) %>% pivot_wider(names_from = {{x}}, values_from = n)
return(new_df)
}
I'm not sure that this is what you asked
colns <- colnames(df)
lapply(colns[c(3:4)], function(x) {
myvar <- c("Gender", x)
mydf <- df[,myvar]
table1 <- mydf%>%
pivot_longer(-x) %>%
group_by_all %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = x, values_from = "summary_str")
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
You may need to change
lapply(colns[c(3:4)],...
3:4 to 3:102 for q1~q100
I am trying to create a function to run chi squared where I have to group by several groups. However, while the method works when it's not a function, I am having trouble turning into a function. As I'll be repeating the procedure multiple times, its seems worth doing, but I just can't get the function to recognise the "z" variable and always get the "Unknown or uninitialised column" warning.
Example is below.
library(tidyverse)
library(datasets)
#data
data(iris)
df<-iris%>%
gather(Type, value, -Species)%>%
separate(Type, c("type", "attribute"), sep="[.]")
#functions------------
frequency<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data%>%
filter(!is.na(!!x), !is.na(!!y), !is.na(!!z))%>%
count(!!x, !!y, !!z)
}
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2$'z' #I've done it like this becasue z <- enquo(z) and dat2$!!z doesn't work. jsut having it a z doesnt work either
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
#aplying them--------------------
test<-frequency(df, type, Species, attribute)
chi_test<-group_chi(test, type, Species, attribute)#brings up warning
#> Warning: Unknown or uninitialised column: 'z'.
#> Warning: Unknown or uninitialised column: 'z'.
#test without the function=no warning.
No_function<-test %>%
group_by(type) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(Species, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2$attribute
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
# in the example the results are the same but....the warning message is of concern and the function doesn't output the same in a more compelx dataset.
chi_test
#> # A tibble: 2 x 2
#> type pvalue
#> <chr> <dbl>
#> 1 Petal 1
#> 2 Sepal 1
No_function
#> # A tibble: 2 x 2
#> type pvalue
#> <chr> <dbl>
#> 1 Petal 1
#> 2 Sepal 1
# what am I doing wrong?
Created on 2020-01-27 by the reprex package (v0.3.0)
What am I doing wrong here?
You can't use $ for an indirect column reference (as in dat2$'z'), instead use dat2[[z]]. When I replace that, there are no warnings/errors.
Try this version of your function instead:
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2[[z]]
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
and then call with the string:
chi_test <- group_chi(test, type, Species, "attribute")
Alternatively, you can first z <- enquo(z) then pull(dat2, !!z) (as in #akrun's answer).
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- pull(dat2, !!z)
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
group_chi(test, type, Species, attribute)
# # A tibble: 2 x 2
# type pvalue
# <chr> <dbl>
# 1 Petal 1
# 2 Sepal 1
We could also use z <- enquo(z), then make use of the select and pull to extract the column as a vector
group_chi<-function(data, x, y, z){
x <- enquo(x)
y <- enquo(y)
z <- enquo(z)
data %>%
group_by(!! x) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread(!! y, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2 %>%
select(!!z) %>%
pull(1)
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
-checking
chi_test <- group_chi(test, type, Species, attribute)
chi_test
# A tibble: 2 x 2
# type pvalue
# <chr> <dbl>
#1 Petal 1
#2 Sepal 1
With the newer versions of tidyverse, the curly-curly operator ({{}}) can replace the !!/enquo
group_chi<-function(data, x, y, z){
data %>%
group_by({{x}}) %>%
nest() %>%
mutate(M = map(data, function(dat){
dat2 <- dat %>% spread({{y}}, n)
M <- as.matrix(dat2[, -1])
row.names(M) <- dat2 %>%
pull({{z}})
return(M)
}))%>%
mutate(pvalue = map_dbl(M, ~chisq.test(.x)$p.value)) %>%
select(-data, -M) %>%
ungroup()
}
chi_test <- group_chi(test, type, Species, attribute)
I am trying to create a user-defined function which carries out some data transformations.
Mock data:
library(tidyverse)
set.seed(1)
sampledata_a <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service1", "service2", "service3", "service4", "service5"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata_b <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service6", "service7", "service8", "service9", "service10"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata1 <- rbind(sampledata_a, sampledata_b)
User-defined function:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(groupbyvar, uniquevar, desc(datevar)) %>%
mutate(orderkey = paste0(groupbyvar, uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(groupbyvar, desc(datevar), uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(groupbyvar, uniquevar) %>%
group_by(groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, uniquevar) %>%
unite(uniquevar, -groupbyvar, sep=" / ")
return(outputdata)
}
When running the function as below:
outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date")
Following error message:
Error in arrange_impl(.data, dots) :
incorrect size (1) at position 1, expecting : 100
However, the code works fine when outside the user-defined function. I wonder if anyone can tell me what is wrong?
testoutputdata <- sampledata1 %>%
distinct() %>%
arrange(patientid, servicetype, desc(date)) %>%
mutate(orderkey = paste0(patientid, servicetype, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(patientid, desc(date), servicetype)
testoutputdata <- testoutputdata %>%
select(patientid, servicetype) %>%
group_by(patientid) %>%
mutate(pos=1:n()) %>%
spread(pos, servicetype) %>%
unite(servicetype, -patientid, sep=" / ")
Try this:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
groupbyvar <- enquo(groupbyvar)
uniquevar <- enquo(uniquevar)
datevar <- enquo(datevar)
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(!! groupbyvar, !! uniquevar, desc(!! datevar)) %>%
mutate(orderkey := paste0(!! groupbyvar, !! uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(!! groupbyvar, desc(!! datevar), !! uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(!! groupbyvar, !! uniquevar) %>%
group_by(!! groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, !! uniquevar) %>%
unite(!! uniquevar, -!! groupbyvar, sep=" / ")
return(outputdata)
}
outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) # No quotation with arguments!
Here is the output:
patientid servicetype
<int> <chr>
1 1 service7 / service3
2 2 service10 / service1
3 3 service4 / service9
4 4 service8 / service3
5 5 service6 / service1
It seems to match your expectations when I compare them:
all.equal(outputdata, testoutputdata)
[1] TRUE
Note that you shouldn't quote the arguments when specifying the function, i.e. outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) will work while outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date") won't.