How can I melt/reshape/rotate my table from this:
profit lost obs fc.mape
mean 3724.743 804.1835 427.8899 0.21037696
std.dev 677.171 406.1391 372.5544 0.06072549
To this:
mean std.dev
profit x
lost x
obs x
fc.mape x
Here is a tidyverse solution. I find it too complicated but it works. Maybe there are simpler ones.
library(dplyr)
library(tidyr)
# Transpose df1: every original column becomes a row (labelled in `stat`)
# and every original row name ("mean", "std.dev") becomes a column.
df1 %>%
  # pivot_longer() returns a tibble, so keep the row names in a real column.
  mutate(id = row.names(.)) %>%
  # One row per (id, original column) pair; the numbers land in `value`.
  pivot_longer(
    cols = -id,
    names_to = "stat"
  ) %>%
  # `stat` alone identifies each output row, so no helper index is required.
  pivot_wider(
    names_from = id,
    values_from = value
  )
## A tibble: 4 x 3
# stat mean std.dev
# <chr> <dbl> <dbl>
#1 profit 3725. 677.
#2 lost 804. 406.
#3 obs 428. 373.
#4 fc.mape 0.210 0.0607
Data
# Example input: one row per summary statistic (stored as row names) and
# one column per measure.
df1 <- data.frame(
  profit = c(3724.743, 677.171),
  lost = c(804.1835, 406.1391),
  obs = c(427.8899, 372.5544),
  fc.mape = c(0.21037696, 0.06072549),
  row.names = c("mean", "std.dev")
)
Related
I have a data frame where I want to sum column values with the same prefix to produce a new column. My current problem is that it's not taking into account my group_by variable and returning identical values. Is part of the problem the .cols variable I'm selecting in the across function?
Sample data
library(dplyr)
library(purrr)
set.seed(10)

# Ten rows alternating between id 1 and id 2, with a pre/post pair of random
# normal draws for each of two variables; `index` is a copy of `id`.
dat <- mutate(
  data.frame(
    id = rep(1:2, times = 5),
    var1.pre = rnorm(10),
    var1.post = rnorm(10),
    var2.pre = rnorm(10),
    var2.post = rnorm(10)
  ),
  index = id
)

# Prefixes shared by each pre/post column pair.
var_names <- c("var1", "var2")
What I've tried
# Build one summing function per prefix in var_names. The lambda returns a
# function whose own argument `.` is never used: it sums the complete
# `<prefix>.pre` and `<prefix>.post` columns of the global `dat`, so the
# value it returns cannot differ between groups.
# NOTE(review): glue() is called here but no library(glue) appears in this
# snippet; confirm that glue is attached.
sumfunction <- map(
var_names,
~function(.){
sum(dat[glue("{.x}.pre")], dat[glue("{.x}.post")], na.rm = TRUE)
}
) %>%
setNames(var_names)
# Apply every function in sumfunction to the `index` column within each id
# group; the output columns take the list names through "{.fn}".
dat %>%
group_by(id) %>%
summarise(
across(
.cols = index,
.fns = sumfunction,
.names = "{.fn}"
)
) %>%
ungroup
Desired output
For this and similar problems I made the 'dplyover' package (it is not on CRAN). Here we can use dplyover::across2() to loop over two series of columns, first, all columns ending with "pre" and second all columns ending with "post". To get the names correct we can use .names = "{pre}" to get the common prefix of both series of columns.
library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/
# For each id, loop over the columns ending in "pre" together with the
# columns ending in "post" and sum both columns of a pair into one value.
# .names = "{pre}" names each result after the pair's common prefix.
# NOTE(review): pairing presumably relies on both selections returning the
# columns in matching order; confirm against the dplyover documentation.
dat %>%
group_by(id) %>%
summarise(across2(ends_with("pre"),
ends_with("post"),
~ sum(c(.x, .y)),
.names = "{pre}"
)
)
#> # A tibble: 2 × 3
#> id var1 var2
#> <int> <dbl> <dbl>
#> 1 1 -2.32 -5.55
#> 2 2 1.11 -9.54
Created on 2022-12-14 with reprex v2.0.2
Whenever operations across multiple columns get complicated, we could pivot:
library(dplyr)
library(tidyr)
# Stack the pre/post measurements so each prefix (var1, var2, ...) becomes a
# single column, then total every such column within each id.
dat %>%
  pivot_longer(
    -c(id, index),
    # Text before the dot becomes the output column; text after it ("pre" or
    # "post") is stored in `name`.
    names_to = c(".value", "name"),
    names_sep = "\\."
  ) %>%
  group_by(id) %>%
  # across() sums every "var*" column, so further prefixes need no edits here.
  summarise(across(starts_with("var"), sum))
id var1 var2
<int> <dbl> <dbl>
1 1 -2.32 -5.55
2 2 1.11 -9.54
I'm trying to compute multiple summary statistics in R, grouped by Team. I used the code below, but the output is not what I want.
Please point me in a better direction. Thanks!
set.seed(77)

# Thirty randomly generated people, each with a team, a gender and an age
# drawn from 0 to 100.
data <- data.frame(
  Team = sample(c("A", "B"), 30, replace = TRUE),
  gender = sample(c("female", "male"), 30, replace = TRUE),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  Age = sample(0:100, 30, replace = TRUE)
)
# Mean, sum and standard deviation of every non-grouping column for each
# Team/gender combination, returned as a plain data frame.
dat <- as.data.frame(
  dplyr::summarize_all(
    group_by(data, Team, gender),
    list(
      my_mean = mean,
      my_sum = sum,
      my_sd = sd
    )
  )
)
# Per-team head count plus mean, maximum, minimum and standard deviation
# of Age.
df <- data %>%
  group_by(Team) %>%
  summarize(
    # n() takes no arguments; it counts the rows of the current group.
    total = n(),
    mean = mean(Age),
    Max_Age = max(Age),
    Min_Age = min(Age),
    sd = sd(Age)
  )
I want to get a result like the one shown in this picture.
You may need to create the dataframe for the summary statistics of age per Team (age_summary in the example below) and that for the count of Team members per gender and Team (gender_summary in the example below), and then merge them into one dataframe (say summary_df).
library(tidyverse)
set.seed(77)

# Thirty randomly generated people, each with a team, a gender and an age
# drawn from 0 to 100.
data <- data.frame(
  Team = sample(c("A", "B"), 30, replace = TRUE),
  gender = sample(c("female", "male"), 30, replace = TRUE),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  Age = sample(0:100, 30, replace = TRUE)
)
# Age statistics per team, laid out with one row per statistic (named in the
# `age_summary` column) and one column per team.
age_summary <- data %>%
  group_by(Team) %>%
  summarize(
    mean = mean(Age),
    Max = max(Age),
    Min = min(Age),
    sd = sd(Age)
  ) %>%
  # Stack the four statistics, then spread the teams back out as columns.
  pivot_longer(-Team, names_to = "age_summary") %>%
  pivot_wider(names_from = Team, values_from = value)
# Number of members of each gender, with one row per gender and one column
# per team.
gender_summary <- pivot_wider(
  count(data, Team, gender),
  names_from = Team,
  values_from = n
)
# Stack the age statistics on top of the gender counts and label every row
# with the item it describes ("Age" or "Sex").
# bind_rows() is used instead of a full_join() without `by`: such a join
# matches on the shared value columns, so an age row and a gender row that
# happened to hold equal values would be merged into one.
summary_df <- bind_rows(age_summary, gender_summary) %>%
  # Rows that came from age_summary have no gender value.
  mutate(item = if_else(is.na(gender), "Age", "Sex")) %>%
  # Exactly one of the two label columns is filled per row; merge them.
  unite("summary", c(age_summary, gender), na.rm = TRUE, remove = FALSE) %>%
  relocate(item, .before = 1) %>%
  select(-c(age_summary, gender))
# # A tibble: 6 × 4
# item summary A B
# <chr> <chr> <dbl> <dbl>
# 1 Age mean 45.6 57.8
# 2 Age Max 92 82
# 3 Age Min 5 14
# 4 Age sd 30.1 22.1
# 5 Sex female 8 9
# 6 Sex male 7 6
library(tidyverse)
#make a sample data frame
# Eight survey responses: year, gender and the answers to two questions.
# Columns are named at construction, so no throw-away vectors remain in the
# workspace afterwards.
df <- data.frame(
  Year = c(2000, 2000, 2000, 2000, 2001, 2001, 2001, 2001),
  Gender = c("M", "M", "M", "F", "F", "M", "F", "F"),
  q1 = c("Yes", "No", "Yes", "No", "No", "Unknown", "Unknown", "Yes"),
  q2 = c("Unknown", "No", "No", "Yes", "Unknown", "Yes", "No", "Unknown")
)
# Cross-tabulate Gender against q1: one row per gender, one column per q1
# answer, each cell formatted as "count(percent%)".
myvar <- c("Gender", "q1")
mydf <- df[, myvar]
table1 <- mydf %>%
  pivot_longer(-q1) %>%
  group_by(name, q1, value) %>%
  summarise(n = n()) %>%
  mutate(
    # Percentages are taken within the groups left over after summarise().
    prop = 100 * round(n / sum(n), 3),
    summary_str = glue::glue("{n}({prop}%)")
  ) %>%
  pivot_wider(
    id_cols = c(name, value),
    names_from = "q1",
    values_from = "summary_str"
  )
# Attempt from the question: wrap the q1 table code in a function so it can
# be reused for other question columns. It fails as written; see the notes.
maketable <- function(df,x){
# NOTE: the sample data frame names this column "Gender" (capital G), so
# "gender" will not match it. paste0(x) evaluates x, so x has to be a
# character string; calling maketable(df, q1) with a bare name is what
# produces "object 'q1' not found".
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
# NOTE(review): get(x) is used to turn the string in x into a column
# reference; whether pivot_longer()/group_by() accept that is not shown
# here, so confirm.
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
# The last expression is the function's value: this returns the column
# names of table1, not the table itself.
colnames(table1)
}
# Both calls pass the column as a bare symbol rather than a string.
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
# Cross-tabulate Gender against one question column.
#
# df: data frame containing a `Gender` column and the column given as `x`.
# x:  the question column, passed unquoted (e.g. q1); it is injected into
#     each verb with the embrace operator {{ }}.
#
# Returns a tibble with one row per gender and one column per distinct value
# of `x`; each cell is a "count(percent%)" string, the percentage being taken
# within the groups that remain after summarise().
maketable <- function(df, x) {
  long <- pivot_longer(select(df, Gender, {{ x }}), -{{ x }})
  counted <- summarise(group_by(long, name, {{ x }}, value), n = n())
  labelled <- mutate(
    counted,
    prop = round(n / sum(n), 3) * 100,
    summary_str = glue::glue("{n}({prop}%)")
  )
  pivot_wider(
    labelled,
    id_cols = c(name, value),
    names_from = {{ x }},
    values_from = "summary_str"
  )
}

# The function only returns the table; assign the result outside of it.
table1 <- maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
# Count the values of one column within each Gender and spread the counts
# into one column per value.
#
# x:    the column to tabulate, passed unquoted (e.g. q1).
# data: the data frame to summarise. Defaults to the global `df`, which is
#       what the function previously always read, so existing calls of the
#       form my_func(q1) keep working.
#
# Returns a tibble, still grouped by Gender, with one row per gender.
my_func <- function(x, data = df) {
  data %>%
    group_by(Gender) %>%
    count({{ x }}) %>%
    pivot_wider(names_from = {{ x }}, values_from = n)
}
I'm not sure that this is what you asked
# Build one Gender-by-answer table per question column and return them as a
# list, in column order.
colns <- colnames(df)
lapply(colns[3:4], function(x) {
  # x holds a column name as a string, so wrap it in all_of() wherever a
  # tidyselect column is expected; a bare external vector is ambiguous there.
  df[, c("Gender", x)] %>%
    pivot_longer(-all_of(x)) %>%
    # Group by every column; replaces the superseded group_by_all().
    group_by(across(everything())) %>%
    summarise(n = n()) %>%
    mutate(
      prop = round(n / sum(n), 3) * 100,
      summary_str = glue::glue("{n}({prop}%)")
    ) %>%
    pivot_wider(
      id_cols = c(name, value),
      names_from = all_of(x),
      values_from = "summary_str"
    )
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
To cover q1 through q100, you may need to change the column range in
lapply(colns[c(3:4)],...
from 3:4 to 3:102.
Consider the following simple dplyr pipeline in R:
# Example data: three groups (A, B, C) of five random normal values each.
# `rank` is each value's rank within its group; tied values share the
# minimum rank.
df <- data.frame(group = rep(LETTERS[1:3],each=5), value = rnorm(15)) %>%
group_by(group) %>%
mutate(rank = rank(value, ties.method = 'min'))
# For each group, mean_i is the mean of the values whose rank is at most i.
# Every threshold from 1 to 5 is written out by hand, which is the
# repetition the question asks how to avoid.
df %>%
group_by(group) %>%
summarise(mean_1 = mean(value[rank <= 1]),
mean_2 = mean(value[rank <= 2]),
mean_3 = mean(value[rank <= 3]),
mean_4 = mean(value[rank <= 4]),
mean_5 = mean(value[rank <= 5]))
How can I avoid typing out mean_i = mean(value[rank <= i]) for all i without reverting to a loop over group and i? Specifically, is there a neat way to iteratively create variables with the dplyr::summarise function?
You are actually calculating a cumulative mean here. dplyr has a cummean function that we can use for this, and then cast the data to wide format.
library(tidyverse)
# Within each group, order the values by rank and replace each with the
# running mean up to that rank, then spread the ranks into mean_1, mean_2,
# ... columns.
df %>%
  group_by(group) %>%
  arrange(rank, .by_group = TRUE) %>%
  mutate(value = cummean(value)) %>%
  pivot_wider(
    names_from = rank,
    values_from = value,
    names_prefix = "mean_"
  )
# group mean_1 mean_2 mean_3 mean_4 mean_5
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.560 -0.395 -0.240 -0.148 0.194
#2 B -1.27 -0.976 -0.799 -0.484 -0.0443
#3 C -0.556 -0.223 -0.0284 0.0789 0.308
If you are asking for a general solution, and calculating the cumulative mean is just an example, you can use map.
# Largest rank present in the data; one summary column is built per rank.
n <- max(df$rank)

# For every threshold k, compute the per-group mean of the values with
# rank <= k in a column named mean_<k>, then join the per-threshold tables
# on `group`.
seq_len(n) %>%
  map(function(k) {
    df %>%
      group_by(group) %>%
      summarise(!!paste0("mean_", k) := mean(value[rank <= k]))
  }) %>%
  reduce(inner_join, by = "group")
data
set.seed(123)

# Three groups (A, B, C) of five random normal values each.
df <- data.frame(
  group = rep(LETTERS[1:3], each = 5),
  value = rnorm(15)
)

# Add each value's rank within its group; the result stays grouped by `group`.
df <- mutate(
  group_by(df, group),
  rank = rank(value, ties.method = "min")
)
I have a df that looks like the following:
ID DATE
12 10-20-20
12 10-22-20
10 10-15-20
9 10-10-20
11 11-01-20
7 11-02-20
I would like to group by month and then create a column for unique id count and repeat id count like below:
MONTH Unique_Count Repeat_Count
10-1-20 2 2
11-1-20 2 0
I am able to get the date down to the first of the month and group by ID but I am not sure how to count unique instances within the months.
# Attempt from the question: derive the first day of each row's month, then
# attach to every row the number of rows that share its ID.
# NOTE(review): as.Date() is called without a format, while the sample DATE
# values look like mm-dd-yy; parsing presumably needs an explicit format.
# Confirm the type of DATE.
# Grouping is by ID only, so the count spans all months instead of being
# computed within each month.
df %>%
mutate(month = floor_date(as.Date(DATE), "month")) %>%
group_by(ID) %>%
mutate(count = n())
Are you perhaps looking for:
# Label every row with the first day of its month (formatted mm-dd-yy), then
# for each month report how many IDs occur exactly once and how many rows
# belong to IDs that occur more than once.
df %>%
  mutate(
    month = strftime(
      floor_date(as.Date(DATE, "%m-%d-%y"), "month"),
      "%m-%d-%y"
    )
  ) %>%
  group_by(month) %>%
  summarize(
    # table(ID) holds the number of rows per ID within the month.
    unique_count = sum(table(ID) == 1),
    repeat_count = sum(table(ID)[table(ID) > 1])
  )
#> # A tibble: 2 x 3
#> month unique_count repeat_count
#> <chr> <int> <int>
#> 1 10-01-20 2 2
#> 2 11-01-20 2 0
Here's a shot at it:
library(lubridate)
library(dplyr)
# Small example: five visits by three ids spread over two months.
ids <- c(12, 12, 13, 13, 14)
dates <- as.Date(
  c("2020-10-15", "2020-10-15", "2020-11-16", "2020-11-16", "2020-11-16")
)
df <- data.frame(dates = dates, ids = ids)
# Per month, count the rows that belong to ids seen more than once
# (count_duplicates) and the ids seen exactly once (count_unique).
# A single count() per (month, id) replaces building the same grouped count
# twice and re-joining the two halves on keys that never overlap.
df_cleaned <- df %>%
  # n = number of rows for each id within each month.
  count(dates_floored = floor_date(dates, unit = "month"), ids) %>%
  group_by(dates_floored) %>%
  summarize(
    count_duplicates = sum(n[n > 1]),
    count_unique = sum(n == 1)
  )
df_cleaned