library(tidyverse)
#make a sample data frame
a <- c(2000,2000,2000,2000,2001,2001,2001,2001)
b <- c("M","M","M","F","F","M","F","F")
d<- c("Yes","No","Yes","No","No","Unknown","Unknown","Yes")
e <- c("Unknown","No","No","Yes","Unknown","Yes","No","Unknown")
df <- data.frame(a,b,d,e)
colnames(df) <- c("Year","Gender","q1","q2")
# make a table for q1
myvar <- c("Gender","q1")
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-q1) %>%
group_by(name,q1,value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = "q1", values_from = "summary_str")
#make the function creating a table
maketable <- function(df,x){
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
colnames(table1)
}
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
maketable <- function(df,x){
df %>%
select(Gender, {{x}}) %>%
pivot_longer(-{{x}}) %>%
group_by(name,{{x}},value)%>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = {{x}}, values_from = "summary_str")
}
table1 <-maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
my_func = function(x)
{
new_df = df %>% group_by(Gender) %>% count({{x}}) %>% pivot_wider(names_from = {{x}}, values_from = n)
return(new_df)
}
I'm not sure that this is what you asked
colns <- colnames(df)
lapply(colns[c(3:4)], function(x) {
myvar <- c("Gender", x)
mydf <- df[,myvar]
table1 <- mydf%>%
pivot_longer(-x) %>%
group_by_all %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = x, values_from = "summary_str")
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
You may need to change
lapply(colns[c(3:4)],...
3:4 to 3:102 for q1~q100
Related
I have a data frame where I want to sum column values with the same prefix to produce a new column. My current problem is that it's not taking into account my group_by variable and returning identical values. Is part of the problem the .cols variable I'm selecting in the across function?
Sample data
library(dplyr)
library(purrr)
set.seed(10)
dat <- data.frame(id = rep(1:2, 5),
var1.pre = rnorm(10),
var1.post = rnorm(10),
var2.pre = rnorm(10),
var2.post = rnorm(10)
) %>%
mutate(index = id)
var_names = c("var1", "var2")
What I've tried
sumfunction <- map(
var_names,
~function(.){
sum(dat[glue("{.x}.pre")], dat[glue("{.x}.post")], na.rm = TRUE)
}
) %>%
setNames(var_names)
dat %>%
group_by(id) %>%
summarise(
across(
.cols = index,
.fns = sumfunction,
.names = "{.fn}"
)
) %>%
ungroup
Desired output
For this and similar problems I made the 'dplyover' package (it is not on CRAN). Here we can use dplyover::across2() to loop over two series of columns, first, all columns ending with "pre" and second all columns ending with "post". To get the names correct we can use .names = "{pre}" to get the common prefix of both series of columns.
library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/
dat %>%
group_by(id) %>%
summarise(across2(ends_with("pre"),
ends_with("post"),
~ sum(c(.x, .y)),
.names = "{pre}"
)
)
#> # A tibble: 2 × 3
#> id var1 var2
#> <int> <dbl> <dbl>
#> 1 1 -2.32 -5.55
#> 2 2 1.11 -9.54
Created on 2022-12-14 with reprex v2.0.2
Whenever operations across multiple columns get complicated, we could pivot:
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(-c(id, index),
names_to = c(".value", "name"),
names_sep = "\\.") %>%
group_by(id) %>%
summarise(var1 = sum(var1), var2=sum(var2))
id var1 var2
<int> <dbl> <dbl>
1 1 -2.32 -5.55
2 2 1.11 -9.54
I want to do something like
df1 <- iris %>% distinct(Species, .keep_all = TRUE) %>% group_by(Petal.Width) %>% summarise(Sepal.Length.mean1=mean(Sepal.Length), .groups = "drop")
df2 <- iris %>% distinct(Species, Petal.Width, .keep_all = TRUE) %>% group_by(Petal.Width) %>% summarise(Sepal.Length.mean2 =mean(Sepal.Length), .groups = "drop")
inner_join(df1, df2, by="Petal.Width")
But this is tedious to read because of the repetition. Is it possible to do all in one pipe? I cannot recover the initial dataset after distinct() so I wonder if there's a replacement to that.
A possible solution is to create first a function and then use it inside pipes:
library(tidyverse)
f <- function(df = iris, var1 = Species, var2 = Petal.Width,
var3 = Sepal.Length, i)
{
x <- enquo(var3)
{{df}} %>%
distinct({{var1}}, .keep_all = TRUE) %>% group_by({{var2}}) %>%
summarise(!!str_c(quo_name(x), ".mean", i , sep = "") := mean({{var3}}),
.groups = "drop")
}
inner_join(f(i = 1), f(i = 2), by="Petal.Width")
#> # A tibble: 3 × 3
#> Petal.Width Sepal.Length.mean1 Sepal.Length.mean2
#> <dbl> <dbl> <dbl>
#> 1 0.2 5.1 5.1
#> 2 1.4 7 7
#> 3 2.5 6.3 6.3
A workaround would be to use an expression with {}
Here is the beginning of the solution
iris %>% {
df1 <- distinct(., Species, .keep_all = TRUE)
df2 <- distinct(., Species, Petal.Width, .keep_all = TRUE)
list(df1, df2)} %>%
map(~ group_by(.x, Petal.Width)) # SOLUTION TO BE COMPLETED
I cannot figure out why the bang-bang operator in my function is not unquoting my grp argument. Any help would be much appreciated!
library(dplyr)
test_func <- function(dat, grp){
dat %>%
group_by(!!grp) %>%
summarise(N = n())
}
test_func(dat = iris, grp = "Species")
Instead of grouping by species it just produces the summary for the entire data:
If we are passing a string, then convert to symbol and evaluate (!!)
test_func <- function(dat, grp){
dat %>%
group_by(!! rlang::ensym(grp)) %>%
summarise(N = n(), .groups = 'drop')
}
-testing
test_func(dat = iris, grp = "Species")
# A tibble: 3 x 2
# Species N
#* <fct> <int>
#1 setosa 50
#2 versicolor 50
#3 virginica 50
Or another option is to use across
test_func <- function(dat, grp){
dat %>%
group_by(across(all_of(grp))) %>%
summarise(N = n(), .groups = 'drop')
}
Below is a sample data frame that I have created along with the expected output.
df = data.frame(color = c("Yellow", "Blue", "Green", "Red", "Magenta"),
values = c(24, 24, 34, 45, 49),
Quarter = c("Period1","Period2" , "Period3", "Period3", "Period1"),
Market = c("Camden", "StreetA", "DansFireplace", "StreetA", "DansFireplace"))
dfXQuarter = df %>% group_by(Quarter) %>% summarise(values = sum(values)) %>%
mutate(cut = "Quarter") %>% data.frame()
colnames(dfXQuarter)[1] = "Grouping"
dfXMarket = df %>% group_by(Market) %>% summarise(values = sum(values)) %>%
mutate(cut = "Market")%>% data.frame()
colnames(dfXMarket)[1] = "Grouping"
df_all = rbind(dfXQuarter, dfXMarket)
Now I for the sake brevity I want to compile this into a function and using lapply.
Below is my attempt at the same-
list = c("Market", "Quarter")
df_all <- do.call(rbind, lapply(list, function(x){
df_l= df %>% group_by(x) %>%
summarise(values = sum(values)) %>%
mutate(cut= x) %>%
data.frame()
colnames(df_l)[df_l$x] = "Grouping"
df_l
}))
This block of code is giving me error.
I need the output to be the exact replica of the 'df_all' output for further operations.
How I do write this function correctly?
We can use purrr::map_dfr
library(dplyr)
library(purrr)
#Don't use the R build-in type e.g. list in variables name
lst <- c("Market", "Quarter")
#Use map if you need the output as a list
map_dfr(lst, ~df %>% group_by("Grouping"=!!sym(.x)) %>%
summarise(values = sum(values)) %>%
mutate(cut = .x) %>%
#To avoid the warning massage from bind_rows
mutate_if(is.factor, as.character))
# A tibble: 6 x 3
Grouping values cut
<chr> <dbl> <chr>
1 Camden 24 Market
2 DansFireplace 83 Market
3 StreetA 69 Market
4 Period1 73 Quarter
5 Period2 24 Quarter
6 Period3 79 Quarter
We can fix the first solution by
change group_by(x) to group_by_at(x), since x is a string here.
Use colnames(df_l)[colnames(df_l)==x] <- "Grouping" in naming the grouping variable.
Not pretty but works and doesn't require tidy functions:
groupwise_summation <- function(df, grouping_vecs){
# Split, apply, combine:
tmpdf <- do.call(rbind, lapply(split(df, df[,grouping_vecs]), function(x){sum(x$values)}))
# Clean up the df:
data.frame(cbind(cut = row.names(tmpdf), value = as.numeric(tmpdf)), row.names = NULL)
}
# Apply and combine:
df_all <- rbind(groupwise_summation(df, c("Quarter")), groupwise_summation(df, c("Market")))
# Note inside the c(), you can use multiple grouping variables.
I am trying to create a user-defined function which carries out some data transformations.
Mock data:
library(tidyverse)
set.seed(1)
sampledata_a <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service1", "service2", "service3", "service4", "service5"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata_b <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service6", "service7", "service8", "service9", "service10"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata1 <- rbind(sampledata_a, sampledata_b)
User-defined function:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(groupbyvar, uniquevar, desc(datevar)) %>%
mutate(orderkey = paste0(groupbyvar, uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(groupbyvar, desc(datevar), uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(groupbyvar, uniquevar) %>%
group_by(groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, uniquevar) %>%
unite(uniquevar, -groupbyvar, sep=" / ")
return(outputdata)
}
When running the function as below:
outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date")
Following error message:
Error in arrange_impl(.data, dots) :
incorrect size (1) at position 1, expecting : 100
However, the code works fine when outside the user-defined function. I wonder if anyone can tell me what is wrong?
testoutputdata <- sampledata1 %>%
distinct() %>%
arrange(patientid, servicetype, desc(date)) %>%
mutate(orderkey = paste0(patientid, servicetype, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(patientid, desc(date), servicetype)
testoutputdata <- testoutputdata %>%
select(patientid, servicetype) %>%
group_by(patientid) %>%
mutate(pos=1:n()) %>%
spread(pos, servicetype) %>%
unite(servicetype, -patientid, sep=" / ")
Try this:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
groupbyvar <- enquo(groupbyvar)
uniquevar <- enquo(uniquevar)
datevar <- enquo(datevar)
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(!! groupbyvar, !! uniquevar, desc(!! datevar)) %>%
mutate(orderkey := paste0(!! groupbyvar, !! uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(!! groupbyvar, desc(!! datevar), !! uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(!! groupbyvar, !! uniquevar) %>%
group_by(!! groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, !! uniquevar) %>%
unite(!! uniquevar, -!! groupbyvar, sep=" / ")
return(outputdata)
}
outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) # No quotation with arguments!
Here is the output:
patientid servicetype
<int> <chr>
1 1 service7 / service3
2 2 service10 / service1
3 3 service4 / service9
4 4 service8 / service3
5 5 service6 / service1
It seems to match your expectations when I compare them:
all.equal(outputdata, testoutputdata)
[1] TRUE
Note that you shouldn't quote the arguments when specifying the function, i.e. outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) will work while outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date") won't.