I am trying to calculate the median and mean for a group of columns, but it is calculating them for only one column. What am I doing wrong here?
df <- data.frame(Name = c("ABC", "DCA", "GOL",NA, "MNA",NA, "VAN"),
Goal =c("published", "pending", "not designed",NA, "pending", "pending", "not designed"),
Target_1 = c(3734, 2639, 2604, NA, 2793, 2688, 2403),
Target_2 = c(3322, 2016, 2310, NA, 3236, 3898, 2309),
Target_3 = c(3785, 2585, 3750, NA, 2781, 3589, 2830))
df_summary <- df %>% select(contains("Target")) %>% summarise(
q25 = round(quantile(., type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = 0),
Median = round(quantile(., type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = 0),
Mean = round( mean(., na.rm=TRUE),digits = 0),
q75 = round(quantile(., type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = 0),
N = sum(!is.na(.)))
Use across to apply a function to multiple columns. In your code, the . inside summarise() refers to the whole selected data frame piped in, not to each individual column, so the statistics are not computed per column.
library(dplyr)
library(tidyr)
df %>%
summarise(across(contains("Target"), list(
q25 = ~round(quantile(., type=6, probs = 0.25, na.rm=TRUE),digits = 0),
Median = ~round(quantile(., type=6, probs = 0.5, na.rm=TRUE),digits = 0),
Mean = ~round( mean(., na.rm=TRUE),digits = 0),
q75 = ~round(quantile(., type=6, probs = 0.75, na.rm=TRUE),digits = 0),
N = ~sum(!is.na(.)))))
# Target_1_q25 Target_1_Median Target_1_Mean Target_1_q75 Target_1_N Target_2_q25
#1 2554 2664 2810 3028 6 2236
# Target_2_Median Target_2_Mean Target_2_q75 Target_2_N Target_3_q25 Target_3_Median
#1 2773 2848 3466 6 2732 3210
# Target_3_Mean Target_3_q75 Target_3_N
#1 3220 3759 6
Or maybe long format is a better way to display the values.
df %>%
pivot_longer(cols = contains("Target")) %>%
group_by(name) %>%
summarise( q25 = round(quantile(value, type=6, probs = 0.25, na.rm=TRUE),digits = 0),
Median = round(quantile(value, type=6, probs = 0.5, na.rm=TRUE),digits = 0),
Mean = round( mean(value, na.rm=TRUE),digits = 0),
q75 = round(quantile(value, type=6, probs = 0.75, na.rm=TRUE),digits = 0),
N = sum(!is.na(value)))
# name q25 Median Mean q75 N
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
#1 Target_1 2554 2664 2810 3028 6
#2 Target_2 2236 2773 2848 3466 6
#3 Target_3 2732 3210 3220 3759 6
Using map:
library(purrr)
df %>%
  select(contains('Target')) %>%
  map_dfr(~c(quantile(.x, type=6, probs = c(.25, .5, .75), na.rm = TRUE),
             mean = mean(.x, na.rm = TRUE),
             N = length(na.omit(.x))), .id = 'grp')
grp `25%` `50%` `75%` mean N
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Target_1 2554. 2664. 3028. 2810. 6
2 Target_2 2236. 2773 3466 2848. 6
3 Target_3 2732 3210. 3759. 3220 6
Whatever you are doing seems like a summary:
df %>%
select(contains('Target'))%>%
summary()
Another way could be:
df %>%
summarise(across(contains('Target'),
~list(quantile(.x, type=6, probs = c(.25, .5,.75), na.rm = TRUE),
mean(.x, na.rm = TRUE),
length(na.omit(.x))))
)%>%
unnest(everything())
A tibble: 5 x 3
Target_1 Target_2 Target_3
<dbl> <dbl> <dbl>
1 2554. 2236. 2732
2 2664. 2773 3210.
3 3028. 3466 3759.
4 2810. 2848. 3220
5 6 6 6
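As a small variant (just a sketch), since unnest() drops the statistic names, you could add a label column afterwards, assuming the row order shown above (q25, median, q75, mean, N):
df %>%
  summarise(across(contains('Target'),
                   ~list(quantile(.x, type = 6, probs = c(.25, .5, .75), na.rm = TRUE),
                         mean(.x, na.rm = TRUE),
                         length(na.omit(.x))))) %>%
  unnest(everything()) %>%
  # label the rows in the order they are unnested: the three quantiles, then mean, then N
  mutate(stat = c("q25", "median", "q75", "mean", "N"), .before = 1)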
If you were to include pivoting:
df %>%
pivot_longer(contains('Target')) %>%
group_by(name) %>%
summarise(a = list(quantile(value, type=6, probs = c(.25, .5,.75), na.rm = TRUE)),
mean = mean(value, na.rm = TRUE), N = length(na.omit(value)))%>%
unnest_wider(a)
# A tibble: 3 x 6
name `25%` `50%` `75%` mean N
<chr> <dbl> <dbl> <dbl> <dbl> <int>
1 Target_1 2554. 2664. 3028. 2810. 6
2 Target_2 2236. 2773 3466 2848. 6
3 Target_3 2732 3210. 3759. 3220 6
I have a dataframe that looks like this:
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
I am running code where I group by Region and then take the average and the weighted average (weighted by Emp) of all Sales columns:
Result <- df %>% group_by(Region) %>%
summarise(sales18 = mean(Sales18, na.rm = T),
sales19 = mean(Sales19, na.rm = T),
weightedsales18 = weighted.mean(Sales18, .data[[Emp]], na.rm = T),
weightedsales19 = weighted.mean(Sales19, .data[[Emp]], na.rm = T))
However, I get the following error:
Error in splice(dot_call(capture_dots, frame_env = frame_env, named = named, :
object 'Emp' not found
I can't figure out what I am doing wrong.
An option could be:
library(tidyverse)
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
df %>%
group_by(Region) %>%
summarise(across(
.cols = starts_with("Sales"),
.fns = list(w_mean = ~ weighted.mean(.x, w = Emp), mean = ~ mean(.x)),
.names = "{.col}_{.fn}")
)
#> # A tibble: 3 x 5
#> Region Sales18_w_mean Sales18_mean Sales19_w_mean Sales19_mean
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Africa 1140 1140 1605 1605
#> 2 Asia 10360 8410 13435 11200
#> 3 Europe 12224. 10872. 3407. 3938.
Created on 2021-05-25 by the reprex package (v2.0.0)
This works: the data masking already takes place inside summarise(), so you don't need the .data pronoun.
library(tidyverse)
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
Result <- df %>% group_by(Region) %>%
summarise(sales18 = mean(Sales18, na.rm = T),
sales19 = mean(Sales19, na.rm = T),
weightedsales18 = weighted.mean(Sales18, Emp, na.rm = T),
weightedsales19 = weighted.mean(Sales19, Emp, na.rm = T))
Result
#> # A tibble: 3 x 5
#> Region sales18 sales19 weightedsales18 weightedsales19
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Africa 1140 1605 1140 1605
#> 2 Asia 8410 11200 10360 13435
#> 3 Europe 10872. 3938. 12224. 3407.
Created on 2021-05-25 by the reprex package (v2.0.0)
An unquoted Emp inside [[ tells R to search for a character variable called Emp that contains the name of the column holding the weights, like here:
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
x = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
Emp <- 'x'
df %>% group_by(Region) %>%
summarise(sales18 = mean(Sales18, na.rm = T),
sales19 = mean(Sales19, na.rm = T),
weightedsales18 = weighted.mean(Sales18, .data[[Emp]], na.rm = T),
weightedsales19 = weighted.mean(Sales19, .data[[Emp]], na.rm = T))
# A tibble: 3 x 5
Region sales18 sales19 weightedsales18 weightedsales19
<chr> <dbl> <dbl> <dbl> <dbl>
1 Africa 1140 1605 1140 1605
2 Asia 8410 11200 10360 13435
3 Europe 10872. 3938. 12224. 3407.
Since you do not have such an Emp object, R throws an error.
What to do? Just quote Emp inside [[:
df <- data.frame(Region = c("Asia","Asia","Africa","Europe","Europe"),
Emp = c(120,40,10,67,110),
Sales18 = c(12310, 4510, 1140, 5310, 16435),
Sales19 = c(15670, 6730, 1605, 6120, 1755))
df %>% group_by(Region) %>%
summarise(sales18 = mean(Sales18, na.rm = T),
sales19 = mean(Sales19, na.rm = T),
weightedsales18 = weighted.mean(Sales18, .data[['Emp']], na.rm = T),
weightedsales19 = weighted.mean(Sales19, .data[['Emp']], na.rm = T))
# A tibble: 3 x 5
Region sales18 sales19 weightedsales18 weightedsales19
<chr> <dbl> <dbl> <dbl> <dbl>
1 Africa 1140 1605 1140 1605
2 Asia 8410 11200 10360 13435
3 Europe 10872. 3938. 12224. 3407.
Assume a company for which we have information about total sales and the amount of sales in three states: CA, TX and WI.
How can I calculate the average sales contribution of the three states to total company sales?
Furthermore, I need the same average percentages for each year, month of the year and day of the week.
Data:
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
If I understood your problem correctly, a possible solution would be this:
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
YEAR AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 2011 0.444 0.278 0.278
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
MONTH AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 2 0.444 0.278 0.278
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
WEEKDAY AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 1 0.449 0.286 0.265
2 3 0.435 0.290 0.275
3 4 0.518 0.309 0.173
4 5 0.388 0.308 0.304
5 6 0.437 0.222 0.340
6 7 0.438 0.249 0.313
For this dummy data the three shares add up to 100% in every row, but when using a larger dataset this might not be true.
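As a quick sanity check (a sketch; SHARE_SUM is just an illustrative name), assuming Total is the sum of only CA, TX and WI, as in this sample. If Total also contains other components, the shares will sum to less than 1 and the averaged percentages will not add up to 100%:
df1 %>%
  dplyr::mutate(SHARE_SUM = P_CA + P_TX + P_WI) %>%
  dplyr::summarise(MIN_SHARE_SUM = min(SHARE_SUM, na.rm = TRUE),
                   MAX_SHARE_SUM = max(SHARE_SUM, na.rm = TRUE))
# For this sample both values are (numerically) 1, so the averaged shares add up to 100%.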
My data concerns a company and includes total sales and the amount of sales in three states: CA, TX and WI.
Data :
> dput(head(WalData))
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
With the following code I am trying to calculate the average sales share of the three states in the company's total sales.
In addition, I need the same average percentages for each year, month of the year and day of the week.
install.packages("dplyr")
install.packages("lubridate")
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
Output :
> df1 <- df %>%
+ dplyr::mutate(YEAR = lubridate::year(date),
+ MONTH = lubridate::month(date),
+ WEEKDAY = lubridate::wday(date),
+ P_CA = CA / Total,
+ P_TX = TX / Total,
+ P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
> # Average per Year
> df1 %>%
+ dplyr::group_by(YEAR) %>%
+ dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
+ AV_TX = mean(P_TX, na.rm = TRUE),
+ AV_WI = mean(P_WI, na.rm = TRUE))
Error in eval(lhs, parent, parent) : object 'df1' not found
It comes with the error:
Error in UseMethod("mutate_") :
  no applicable method for 'mutate_' applied to an object of class "function"
I can't figure out what's wrong; I double-checked the code and the correctness of the data. Please suggest a solution.
The issue is that df has not been created as an object in the global environment, and there is a base function named df (see ?df):
df(x, df1, df2, ncp, log = FALSE)
Basically, the error comes from applying mutate() to the function df rather than to a data frame.
Checking in a fresh R session with no objects created:
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
Now, we define 'df' as
df <- WalData
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# A tibble: 6 x 15
# CA TX WI Total date event_type snap_CA snap_TX snap_WI YEAR MONTH WEEKDAY P_CA P_TX P_WI
# <dbl> <dbl> <dbl> <dbl> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 11047 7381 6984 25412 2011-02-01 00:00:00 NA 1 1 0 2011 2 3 0.435 0.290 0.275
#2 9925 5912 3309 19146 2011-02-02 00:00:00 NA 1 0 1 2011 2 4 0.518 0.309 0.173
#3 11322 9006 8883 29211 2011-02-03 00:00:00 NA 1 1 1 2011 2 5 0.388 0.308 0.304
#4 12251 6226 9533 28010 2011-02-04 00:00:00 NA 1 0 0 2011 2 6 0.437 0.222 0.340
#5 16610 9440 11882 37932 2011-02-05 00:00:00 NA 1 1 1 2011 2 7 0.438 0.249 0.313
#6 14696 9376 8664 32736 2011-02-06 00:00:00 Sporting 1 1 1 2011 2 1 0.449 0.286 0.265
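A quick way to confirm what df currently refers to (a sketch using only base R):
class(df)
# returns "function" if only the built-in stats::df is visible,
# or the data frame / tibble classes once you have defined df yourself
exists("df", envir = globalenv(), inherits = FALSE)
# TRUE only if an object called df exists in your global environment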
In this problem I want to map a function over multiple data inputs to create output showing frequencies of item responses.
First I create two data sets using the psych package:
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(psych))
set.seed(123)
data_input_sim <-
as_tibble(sim.poly.ideal(nvar = 50, n = 1000, cat = 4, )[["items"]]) %>%
mutate_all(
~ case_when(
.x == 0 ~ "never",
.x == 1 ~ "occasionally",
.x == 2 ~ "frequently",
.x == 3 ~ "always"
)
) %>%
rename_all( ~ str_c("i", str_pad(
as.character(1:50), 2, side = "left", pad = "0"
))) %>%
mutate(
ID = 100001:101000,
age = sample(c(5:12), 1000, replace = TRUE),
age_range = case_when(
age <=8 ~ "5 to 8 yo",
T ~ "9 to 12 yo"
),
gender = sample(
c("female", "male"),
1000,
replace = TRUE,
prob = c(0.53, 0.47)
),
educ = sample(
c("no_HS", "HS_grad", "some_college", "BA_plus"),
1000,
replace = TRUE,
prob = c(0.119, 0.263, 0.306, 0.311)
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
select(ID:clin_status, i01:i50)
data_input_bfi <- bfi %>%
drop_na() %>%
sample_n(1000) %>%
mutate(
ID = 200001:201000,
age_range = case_when(
age <= 18 ~ "18 yo or younger",
between(age, 19, 24) ~ "19 to 24 yo",
between(age, 25, 39) ~ "25 to 39 yo",
T ~ "40 yo or older"
),
gender = case_when(gender == 1 ~ "male",
gender == 2 ~ "female"),
educ = case_when(
education == 1 ~ "no_HS",
education == 2 ~ "HS_grad",
education == 3 ~ "some_college",
T ~ "BA_plus"
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
mutate_at(
vars(A1:O5),
~
case_when(
.x == 1 ~ "very_inaccurate",
.x == 2 ~ "moderately_inaccurate",
.x == 3 ~ "slightly_inaccurate",
.x == 4 ~ "slightly_accurate",
.x == 5 ~ "moderately_accurate",
.x == 6 ~ "very_accurate",
)
) %>%
select(ID, age:clin_status, A1:O5)
Then I extract and sequence elements unique to each data set: the suffix of its name, the names of its item columns, and the names of its item categories:
data_name_suffix <- c("sim", "bfi")
sim_item_cols <- str_c("i", str_pad(as.character(1:50), 2, side = "left", pad = "0"))
bfi_item_cols <- cross(list(c("A", "C", "E", "N", "O"), seq(1:5))) %>%
map_chr(str_c, collapse = "") %>%
sort()
sim_item_cats <- c("never", "occasionally","frequently", "always")
bfi_item_cats <- c("very_inaccurate", "moderately_inaccurate", "slightly_inaccurate",
"slightly_accurate", "moderately_accurate", "very_accurate")
data_name_suffix is a two-element character vector; I then create two-element lists (using quos()) to hold the item column and category names:
item_cols <- quos(sim_item_cols, bfi_item_cols)
item_cats <- quos(sim_item_cats, bfi_item_cats)
Now I attempt to map the output-creating function over the three inputs, using purrr::pmap():
pmap_df(
list(data_name_suffix,
item_cols,
item_cats),
~
eval(as.name(str_c("data_input_", data_name_suffix))) %>%
select(!!!item_cols) %>%
gather(var, value) %>%
group_by(var, value) %>%
count(var, value) %>%
ungroup() %>%
spread(value, n) %>%
arrange(match(var, !!!item_cols)) %>%
select(var, !!!item_cats) %>%
assign(str_c("freq_item_val_", data_name_suffix), ., envir = .GlobalEnv)
)
And it returns this error:
Error: Unknown columns `A1`, `A2`, `A3`, `A4`, `A5` and ...
Which suggests to me that R is seeing the list item_cols as a single long character vector, rather than two separate character vectors to iterate over.
And here we reach the limit of my understanding of and experience with tidyeval techniques. I suspect that I'm doing something wrong with quos() and !!!.
Thanks in advance for any help, and I hope whoever reads this is safe and healthy during this surreal time.
Here, we could use mget to get the values of the objects:
library(stringr)
library(purrr)
library(dplyr)
library(tidyr)
list(mget(str_c('data_input_', data_name_suffix)),
item_cols,
item_cats) %>%
pmap(~ ..1 %>%
select(!!! ..2) %>%
pivot_longer(everything(), names_to = 'var', values_to = 'value') %>%
count(var, value) %>%
pivot_wider(names_from = value, values_from = n) %>%
arrange(match(var, !!!..2)) %>%
select(var, !!! ..3) )
#$data_input_sim
# A tibble: 50 x 5
# var never occasionally frequently always
# <chr> <int> <int> <int> <int>
# 1 i01 465 366 141 28
# 2 i02 489 336 147 28
# 3 i03 457 367 146 30
# 4 i04 433 385 162 20
# 5 i05 418 362 171 49
# 6 i06 420 369 169 42
# 7 i07 405 367 182 46
# 8 i08 361 401 194 44
# 9 i09 346 391 211 52
#10 i10 334 425 203 38
# … with 40 more rows
#$data_input_bfi
# A tibble: 25 x 7
# var very_inaccurate moderately_inaccurate slightly_inaccurate slightly_accurate moderately_accurate very_accurate
# <chr> <int> <int> <int> <int> <int> <int>
# 1 A1 334 278 151 130 75 32
# 2 A2 18 49 48 197 365 323
# 3 A3 32 51 72 210 353 282
# 4 A4 48 69 60 159 243 421
# 5 A5 26 66 89 207 340 272
# 6 C1 17 48 82 213 383 257
# 7 C2 26 85 98 212 361 218
# 8 C3 35 80 102 272 322 189
# 9 C4 296 270 166 163 83 22
#10 C5 197 212 118 207 167 99
# … with 15 more rows
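Since sim_item_cols, bfi_item_cols and the two category vectors are plain character vectors, a variant (a sketch; item_cols_chr and item_cats_chr are illustrative names) is to skip quos() entirely, pass ordinary lists, and select with all_of():
item_cols_chr <- list(sim_item_cols, bfi_item_cols)
item_cats_chr <- list(sim_item_cats, bfi_item_cats)

list(mget(str_c('data_input_', data_name_suffix)),
     item_cols_chr,
     item_cats_chr) %>%
  pmap(~ ..1 %>%
         select(all_of(..2)) %>%
         pivot_longer(everything(), names_to = 'var', values_to = 'value') %>%
         count(var, value) %>%
         pivot_wider(names_from = value, values_from = n) %>%
         arrange(match(var, ..2)) %>%
         select(var, all_of(..3)))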
NOTE: using assign() to create multiple objects is not recommended. Instead, keep the output in a list and make changes to each of the list elements (if needed) by looping over it with map().
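For example (a hypothetical follow-up; freq_list is an assumed name for the pmap() result above):
# Suppose the pmap() output above has been assigned to freq_list
freq_list <- set_names(freq_list, str_c('freq_item_val_', data_name_suffix))
# Transform every element with map() instead of assign():
# here, add the total number of responses per item as an extra column
freq_list <- map(freq_list,
                 function(tbl) mutate(tbl, n_total = rowSums(select(tbl, -var), na.rm = TRUE)))
# Individual tables are still easy to reach by name
freq_list$freq_item_val_sim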