This question already has answers here:
Summarizing multiple columns with dplyr? [duplicate]
(5 answers)
Closed 1 year ago.
I am trying to calculate the median and mean for a group of columns, but it is calculating them for only one column. What am I doing wrong here?
df <- data.frame(Name = c("ABC", "DCA", "GOL",NA, "MNA",NA, "VAN"),
Goal =c("published", "pending", "not designed",NA, "pending", "pending", "not designed"),
Target_1 = c(3734, 2639, 2604, NA, 2793, 2688, 2403),
Target_2 = c(3322, 2016, 2310, NA, 3236, 3898, 2309),
Target_3 = c(3785, 2585, 3750, NA, 2781, 3589, 2830))
df_summary <- df %>% select(contains("Target")) %>% summarise(
q25 = round(quantile(., type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = 0),
Median = round(quantile(., type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = 0),
Mean = round( mean(., na.rm=TRUE),digits = 0),
q75 = round(quantile(., type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = 0),
N = sum(!is.na(.)))
Use across to apply a function to multiple columns.
library(dplyr)
library(tidyr)
df %>%
summarise(across(contains("Target"), list(
q25 = ~round(quantile(., type=6, probs = 0.25, na.rm=TRUE),digits = 0),
Median = ~round(quantile(., type=6, probs = 0.5, na.rm=TRUE),digits = 0),
Mean = ~round( mean(., na.rm=TRUE),digits = 0),
q75 = ~round(quantile(., type=6, probs = 0.75, na.rm=TRUE),digits = 0),
N = ~sum(!is.na(.)))))
# Target_1_q25 Target_1_Median Target_1_Mean Target_1_q75 Target_1_N Target_2_q25
#1 2554 2664 2810 3028 6 2236
# Target_2_Median Target_2_Mean Target_2_q75 Target_2_N Target_3_q25 Target_3_Median
#1 2773 2848 3466 6 2732 3210
# Target_3_Mean Target_3_q75 Target_3_N
#1 3220 3759 6
Or maybe long format is a better way to display the values.
df %>%
pivot_longer(cols = contains("Target")) %>%
group_by(name) %>%
summarise( q25 = round(quantile(value, type=6, probs = 0.25, na.rm=TRUE),digits = 0),
Median = round(quantile(value, type=6, probs = 0.5, na.rm=TRUE),digits = 0),
Mean = round( mean(value, na.rm=TRUE),digits = 0),
q75 = round(quantile(value, type=6, probs = 0.75, na.rm=TRUE),digits = 0),
N = sum(!is.na(value)))
# name q25 Median Mean q75 N
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
#1 Target_1 2554 2664 2810 3028 6
#2 Target_2 2236 2773 2848 3466 6
#3 Target_3 2732 3210 3220 3759 6
Using map:
df %>%
select(contains('Target'))%>%
map_dfr(~c(quantile(.x, type=6, probs = c(.25, .5,.75), na.rm = TRUE),
mean = mean(.x, na.rm = TRUE),
N = length(na.omit(.x))), .id = 'grp')
grp `25%` `50%` `75%` mean N
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Target_1 2554. 2664. 3028. 2810. 6
2 Target_2 2236. 2773 3466 2848. 6
3 Target_3 2732 3210. 3759. 3220 6
Whatever you are doing seems like a summary:
df %>%
select(contains('Target'))%>%
summary()
Another way could be:
df %>%
summarise(across(contains('Target'),
~list(quantile(.x, type=6, probs = c(.25, .5,.75), na.rm = TRUE),
mean(.x, na.rm = TRUE),
length(na.omit(.x))))
)%>%
unnest(everything())
A tibble: 5 x 3
Target_1 Target_2 Target_3
<dbl> <dbl> <dbl>
1 2554. 2236. 2732
2 2664. 2773 3210.
3 3028. 3466 3759.
4 2810. 2848. 3220
5 6 6 6
If you were to include pivoting:
df %>%
pivot_longer(contains('Target')) %>%
group_by(name) %>%
summarise(a = list(quantile(value, type=6, probs = c(.25, .5,.75), na.rm = TRUE)),
mean = mean(value, na.rm = TRUE), N = length(na.omit(value)))%>%
unnest_wider(a)
# A tibble: 3 x 6
name `25%` `50%` `75%` mean N
<chr> <dbl> <dbl> <dbl> <dbl> <int>
1 Target_1 2554. 2664. 3028. 2810. 6
2 Target_2 2236. 2773 3466 2848. 6
3 Target_3 2732 3210. 3759. 3220 6
Related
I have a data frame like
river
discharge
river1
500
river1
450
river1
200
river1
250
river2
375
river2
235
river2
130
river2
250
I want to apply the following list of function to the column discharge ..
# Named list of summary functions, each taking a numeric vector `x`.
# Every function accepts `...` so that extra arguments supplied by a caller
# are silently ignored.
# NOTE(review): `lfquantile()` is not defined in this snippet; presumably it
# comes from the lfstat package -- confirm before running.
f <- list(
  mean = function(x, ...) mean(x),
  Q50 = function(x, ...) lfquantile(x, exc.freq = 0.5),
  Q95 = function(x, ...) lfquantile(x, exc.freq = 0.95),
  Q90 = function(x, ...) lfquantile(x, exc.freq = 0.9),
  # No trailing comma after the last element: list() errors on an empty argument.
  Q70 = function(x, ...) lfquantile(x, exc.freq = 0.7)
)
in the end I am supposed to have a table like this :
river
mean
Q50
Q95
Q90
Q70
river1
river2
rivern
I do not have any idea how to do that :(
If we have all the functions available, then use
library(dplyr)
library(purrr)
# Apply every function in `f` to `discharge` within each river in one pass.
# across() produces a single `river` key column plus one column per function,
# named after the list element via `.names = "{.fn}"`. Column-binding one
# grouped result per function would instead repeat `river` for every function.
df1 %>%
  group_by(river) %>%
  reframe(across(discharge, f, .names = "{.fn}"))
You could use group_by() function and apply the list of statistics to calculate the summaries and no need to write functions:
library(dplyr)
df %>%
group_by(river)%>%
summarize(
mean = mean(discharge),
q50 = quantile(discharge, 0.50),
q95 = quantile(discharge, 0.95),
q90 = quantile(discharge, 0.90),
q70 = quantile(discharge, 0.70)
)
and the output is:
river mean q50 q95 q90 q70
river1 350 350 492 485 455
river2 248 242 356 338 262
A base R approach. Replacing lfquantile with quantile for this example.
func <- list(mean = function (x, ...) mean(x),
Q50 = function (x, ...) quantile(x, probs = 0.5),
Q95 = function (x, ...) quantile(x, probs = 0.95),
Q90 = function (x, ...) quantile(x, probs = 0.9),
Q70 = function (x, ...) quantile(x, probs = 0.7))
# Summarise `discharge` per river with every function in `func`.
# vapply() enforces exactly one numeric value per function and names the
# result after names(func), so no separate renaming step is needed.
# aggregate() stores the per-group vector as a matrix column; naming that
# column "" makes the printed header show only mean, Q50, Q95, ...
setNames(
  aggregate(
    discharge ~ river, df,
    function(x) vapply(func, function(fn) fn(x), numeric(1))
  ),
  c("river", "")
)
river mean Q50 Q95 Q90 Q70
1 river1 350.00 350.00 492.50 485.00 455.00
2 river2 247.50 242.50 356.25 337.50 262.50
Data
df <- structure(list(river = c("river1", "river1", "river1", "river1",
"river2", "river2", "river2", "river2"), discharge = c(500L,
450L, 200L, 250L, 375L, 235L, 130L, 250L)), class = "data.frame",
row.names = c(NA, -8L))
library(tidyverse)
func <- list(mean = function (x, ...) mean(x),
Q50 = function (x, ...) quantile(x, probs = 0.5),
Q95 = function (x, ...) quantile(x, probs = 0.95),
Q90 = function (x, ...) quantile(x, probs = 0.9),
Q70 = function (x, ...) quantile(x, probs = 0.7))
df %>%
group_by(river) %>%
summarise_at(vars(discharge), func)
#> # A tibble: 2 × 6
#> river mean Q50 Q95 Q90 Q70
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 river1 350 350 492. 485 455
#> 2 river2 248. 242. 356. 338. 262.
df %>%
group_by(river) %>%
summarise(across(discharge, func))
#> # A tibble: 2 × 6
#> river discharge_mean discharge_Q50 discharge_Q95 discharge_Q90 discharge_Q70
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 river1 350 350 492. 485 455
#> 2 river2 248. 242. 356. 338. 262.
Created on 2023-02-08 with reprex v2.0.2
EDIT
since the quantile function is vectorized, you could do:
library(tidyverse)
# Summarise a numeric vector as a one-row data frame.
#
# x: numeric vector.
# Returns a data.frame with the columns mean, Q50, Q95, Q90 and Q70, where the
# Q columns are the 0.5, 0.95, 0.9 and 0.7 quantiles from a single vectorised
# quantile() call.
func1 <- function(x) {
  quantile_values <- quantile(x, probs = c(0.5, 0.95, 0.9, 0.7))
  # Replace the default "50%"-style labels with Q50, Q95, Q90, Q70.
  names(quantile_values) <- c("Q50", "Q95", "Q90", "Q70")
  data.frame(mean = mean(x), as.list(quantile_values))
}
df %>%
summarise(across(discharge, func1, .unpack = TRUE), .by = river)
#> river discharge_mean discharge_Q50 discharge_Q95 discharge_Q90 discharge_Q70
#> 1 river1 350.0 350.0 492.50 485.0 455.0
#> 2 river2 247.5 242.5 356.25 337.5 262.5
Created on 2023-02-08 with reprex v2.0.2
I am currently working with the palmer penguins data set in R and want to summarise data that combines means, median, range and quants, grouping by sex.
My current solution has the quantile data split from the summary data. Is there a way to do this in one go? If not, how do I combine the two data sets? The group quantile result is currently in long format, and I am not sure how to combine them.
group_summary <- penguins %>% group_by(sex) %>% summarize(mean = mean(bill_length_mm,
na.rm = TRUE), meadian = median(bill_length_mm, na.rm = TRUE), range =
max(bill_length_mm, na.rm = TRUE) - min(bill_length_mm, na.rm = TRUE))
group_quant <- penguins %>% group_by(sex) %>% summarize(quantile(bill_length_mm,
probs =seq(.1, 1, by = .1), na.rm =TRUE, .groups = 'drop'))
I had the following solution but it drops the NA values from Sex and I am not sure why.
group_summary <- do.call(data.frame,aggregate(bill_length_mm ~ sex, penguins,
function(x) c(mean = mean(x, na.rm = TRUE), median = median(x, na.rm = TRUE), range =
max(x, na.rm = TRUE) - min(x, na.rm = TRUE), quantile(x, probs = seq(.1, 1, by = .1),
na.rm = TRUE, .groups = 'drop'))))
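You may save the quantiles in a list and then use unnest_wider to create new columns from them. To calculate the range I used diff(range(...)) instead of max(...) - min(...). Both are fine; I included it to show an alternative. As for why your aggregate() attempt loses the NA group: the formula interface of aggregate() uses na.action = na.omit by default, so rows where sex is NA are removed before the summary is computed.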
library(palmerpenguins)
library(dplyr)
library(tidyr)
penguins %>%
group_by(sex) %>%
summarize(mean = mean(bill_length_mm, na.rm = TRUE),
median = median(bill_length_mm, na.rm = TRUE),
range = diff(range(bill_length_mm, na.rm = TRUE)),
quantile = list(quantile(bill_length_mm, probs = seq(.1, 1, by = .1), na.rm = TRUE))) %>%
unnest_wider(quantile)
# sex mean median range `10%` `20%` `30%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 female 42.1 42.8 25.9 35.8 36.7 38.2 40 42.8 45.1 45.7 46.5 47.5 58
#2 male 45.9 46.8 25 38.8 40.5 41.3 43.2 46.8 49.0 50.0 50.8 51.9 59.6
#3 NA 41.3 42 13.2 36.8 37.7 37.8 38.6 42 44 44.5 45.2 46.4 47.3
I have a dataframe that looks like this
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
I am running a code where I group by region and then take average and weighted average for all 'sales' columns by 'Emp'
Result <- df %>% group_by(Region) %>%
summarise(sales18 = mean(Sales18, na.rm = T),
sales19 = mean(Sales19, na.rm = T),
weightedsales18 = weighted.mean(Sales18, .data[[Emp]], na.rm = T),
weightedsales19 = weighted.mean(Sales19, .data[[Emp]], na.rm = T))
However, I get the following error:
Error in splice(dot_call(capture_dots, frame_env = frame_env, named = named, :
object 'Emp' not found
Can't figure out what I am doing wrong
An option could be:
library(tidyverse)
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
df %>%
group_by(Region) %>%
summarise(across(
.cols = starts_with("Sales"),
.fns = list(w_mean = ~ weighted.mean(.x, w = Emp), mean = ~ mean(.x)),
.names = "{.col}_{.fn}")
)
#> # A tibble: 3 x 5
#> Region Sales18_w_mean Sales18_mean Sales19_w_mean Sales19_mean
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Africa 1140 1140 1605 1605
#> 2 Asia 10360 8410 13435 11200
#> 3 Europe 12224. 10872. 3407. 3938.
Created on 2021-05-25 by the reprex package (v2.0.0)
This works. The data masking already takes place, you don't need the .data pronoun.
library(tidyverse)
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
# Per-region plain and employee-weighted mean sales.
# `Emp` is resolved as a column through data masking, so the `.data` pronoun
# is not needed. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
Result <- df %>%
  group_by(Region) %>%
  summarise(
    sales18 = mean(Sales18, na.rm = TRUE),
    sales19 = mean(Sales19, na.rm = TRUE),
    weightedsales18 = weighted.mean(Sales18, Emp, na.rm = TRUE),
    weightedsales19 = weighted.mean(Sales19, Emp, na.rm = TRUE)
  )
Result
#> # A tibble: 3 x 5
#> Region sales18 sales19 weightedsales18 weightedsales19
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Africa 1140 1605 1140 1605
#> 2 Asia 8410 11200 10360 13435
#> 3 Europe 10872. 3938. 12224. 3407.
Created on 2021-05-25 by the reprex package (v2.0.0)
An unquoted Emp inside [[ tells R to look for a string variable called Emp, which is presumed to contain the name of another variable that holds the weights, like here:
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
x = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
Emp <- 'x'
# Per-region plain and weighted mean sales, with the weights column looked up
# by name: `Emp` here is the character variable defined just above, so
# `.data[[Emp]]` selects the column whose name it holds.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
df %>%
  group_by(Region) %>%
  summarise(
    sales18 = mean(Sales18, na.rm = TRUE),
    sales19 = mean(Sales19, na.rm = TRUE),
    weightedsales18 = weighted.mean(Sales18, .data[[Emp]], na.rm = TRUE),
    weightedsales19 = weighted.mean(Sales19, .data[[Emp]], na.rm = TRUE)
  )
# A tibble: 3 x 5
Region sales18 sales19 weightedsales18 weightedsales19
<chr> <dbl> <dbl> <dbl> <dbl>
1 Africa 1140 1605 1140 1605
2 Asia 8410 11200 10360 13435
3 Europe 10872. 3938. 12224. 3407.
Since, you do not have this kind of Emp, R throws an error.
What to do? Just quote Emp inside [[:
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
# Per-region plain and weighted mean sales, selecting the weights column with
# a quoted name: `.data[['Emp']]` refers to the column literally called Emp.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
df %>%
  group_by(Region) %>%
  summarise(
    sales18 = mean(Sales18, na.rm = TRUE),
    sales19 = mean(Sales19, na.rm = TRUE),
    weightedsales18 = weighted.mean(Sales18, .data[['Emp']], na.rm = TRUE),
    weightedsales19 = weighted.mean(Sales19, .data[['Emp']], na.rm = TRUE)
  )
# A tibble: 3 x 5
Region sales18 sales19 weightedsales18 weightedsales19
<chr> <dbl> <dbl> <dbl> <dbl>
1 Africa 1140 1605 1140 1605
2 Asia 8410 11200 10360 13435
3 Europe 10872. 3938. 12224. 3407.
Assume a company for which we have information about total sales and the amount of sales in three states: CA, TX and WI.
How can I calculate the average contribution of each of the three states to total company sales?
Furthermore, I need to find the same average percentages for each year, month of the year and day of the week.
EDITED !!!
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
If I understood your problem correctly, a possible solution would be this:
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
YEAR AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 2011 0.444 0.278 0.278
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
MONTH AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 2 0.444 0.278 0.278
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
WEEKDAY AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 1 0.449 0.286 0.265
2 3 0.435 0.290 0.275
3 4 0.518 0.309 0.173
4 5 0.388 0.308 0.304
5 6 0.437 0.222 0.340
6 7 0.438 0.249 0.313
For this dummy data the shares all add up to 100%, but with a larger dataset this might not be true.
I found a few solutions on here but none seem to work to add a summary row to dplyr output.
#mock up data
df <- data.frame("Market" = sample(c("East", "North", "West"), 100, replace = TRUE, prob = c(0.33, 0.33, 0.34)),
"var1" = sample(c("Y", "N"), 100, replace = TRUE, prob = c(0.4, 0.6)),
"var2" = sample(c("Y", "N"), 100, replace = TRUE, prob = c(0.7, 0.3)),
"var3" = sample(c("Y", "N"), 100, replace = TRUE, prob = c(0.5, 0.5)))
Here is the code:
df_report <- df %>%
group_by(Market) %>%
filter(Market == "East" | Market == "West") %>%
summarise(n = n(),
var1_y = sum(var1 == "Y"),
var1_n = sum(var1 == "N")) %>%
mutate(total = var1_y + var1_n,
var1_y_pct = (var1_y/total),
var1_n_pct = (var1_n/total),
pct_total = total/sum(total))
Here is the output:
# A tibble: 2 x 8
Market n var1_y var1_n total var1_y_pct var1_n_pct pct_total
<fct> <int> <int> <int> <int> <dbl> <dbl> <dbl>
1 East 29 13 16 29 0.448 0.552 0.453
2 West 35 16 19 35 0.457 0.543 0.547
Here are the two solutions I tried:
Option 1
df_report %>%
add_row(Market = "Total", n = sum(n), var1_y = sum(var1_y), var1_n = sum(var1_n),
total = sum(total), var1_y_pct = sum(var1_y_pct), var1_n_pct = sum(varn_y_pct), pct_total = sum(pct_total))
Option 2
df_report %>%
rbind(c("Total", sum(n), sum(var1_y), sum(var1_n), sum(total), sum(var1_y_pct), sum(varn_y_pct), sum(pct_total)))
Both give me the same error: Error in sum(n) : invalid 'type' (closure) of argument
I'm unable to determine why these solutions, while working for others and seeming very reasonable, are not working for me.
You should try
df_report %>% janitor::adorn_totals("row")
Which produces
Market n var1_y var1_n total var1_y_pct var1_n_pct pct_total
East 30 11 19 30 0.3666667 0.6333333 0.4285714
West 40 19 21 40 0.4750000 0.5250000 0.5714286
Total 70 30 40 70 0.8416667 1.1583333 1.0000000
The long way of doing this is going for summarise (watch out, you have a typo in var1_n_pct). Then bind the rows.
# Build a "Total" row from df_report and append it.
# Counts are summed. The two var1 percentage columns are recomputed from the
# summed counts, because adding per-market percentages does not give a
# meaningful overall share (two markets would add up to roughly 2, not 1).
row_to_add <- df_report %>%
  summarise(
    Market = "Total",
    n = sum(n),
    var1_y = sum(var1_y),
    var1_n = sum(var1_n),
    total = sum(total),
    # summarise() evaluates its arguments in order, so var1_y, var1_n and
    # total below are the sums computed just above.
    var1_y_pct = var1_y / total,
    var1_n_pct = var1_n / total,
    pct_total = sum(pct_total)
  )
df_report %>% bind_rows(row_to_add)