Calculate percentage with group by dplyr

Calculate percentage with group by dplyr - r

I want to calculate, for each character column in my data frame, the percentage of rows taken by each value, but the percentage I get is wrong.
My code :
for(i in names(which((sapply(creditDF,class) == "character")))){
distribution <- creditDF %>%
group_by_at(.vars = i) %>%
summarise(value = n(),
percent = value/sum(value)) %>%
select(label = i, value, percent)
}
Result:
label value percent
<chr> <int> <dbl>
1 chéquier autorisé 415 1
2 chéquier interdit 53 1
For the first row, the percentage should be 415/468*100.
How can I fix my problem ?
Thanks for your help.

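Inside summarise() the data are still grouped, so sum(value) is just each group's own count and every percentage comes out as 1. Here, we need to ungroup to get the sum of the whole 'value' column and compute the percentage in a separate mutate() step, i.e.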
# For every character column of creditDF, count the rows per level and
# express each count as a share of the column total.
for (i in names(which(sapply(creditDF, class) == "character"))) {
  distribution <- creditDF %>%
    group_by_at(.vars = i) %>%
    # summarise() must be closed here: it only counts the rows of each level.
    summarise(value = n()) %>%
    ungroup() %>%
    # Computed after ungrouping, so sum(value) spans all levels and
    # percent is the share of the whole column (multiply by 100 for %).
    mutate(percent = value / sum(value)) %>%
    select(label = i, value, percent)
}

Related

Summarise+Group_by+mean+sd various columns

I need a new data frame with the means and standard deviations of several columns, split by gender, so that I can draw my graphs later. After hours of work I got it done, but in a silly way :D, one column at a time.
As a complete beginner, I proceed by trial and error for hours until something works. But I know that with a teacher or more explanation I would do better.
" mediapi<-rowMeans(datos[1:325,c(31,37)])
Sdbi<-rowSds(as.matrix(datos[1:325,c(6:18)])). "
And later I did it one by one, because I tried to do it in one table and it did not work...
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediaci), # calculates the mean
s = sd(mediaci), # calculates the standard deviation
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediaii), # calculates the mean
s = sd(mediaii), # calculates the standard deviation
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediabi), # calculates the mean
s = sd(mediabi), # calculates the standard deviation
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediaai), # calculates the mean
s = sd(mediaai), # calculates the standard deviation
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediabai), # calculates the mean
s = sd(mediabai), # calculates the standard deviation
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediapi),# calculates the mean
s = sd(mediapi),
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediandi),# calculates the mean
s = sd(mediandi),
n = n()) %>% # calculates the total number of observations
ungroup()
datos %>%
group_by(Sexo) %>%
summarize(m = mean(mediansi),# calculates the mean
s = sd(mediansi),
n = n()) %>% # calculates the total number of observations
ungroup()
Can someone tell me an easier and more efficient way? Thanks a lot

How do I create a table in R with conditional formatting and row and column totals?

Are there any R packages that I can use to replicate the table below?
I would like a table with conditional formatting for the table values but no conditional formatting on the row and column grand totals.
The code can be used to reproduce the values in the table along with the row and column grand totals -
library(tidyverse)
# vectors
dates <- rep(date_vec <- c(as.Date("2022-01-01"), as.Date("2022-02-01"), as.Date("2022-03-01")), 30)
row_groups <- c(rep("row_group1", 20), rep("row_group2", 30), rep("row_group3", 10), rep("row_group4", 30))
col_groups <- c(rep("col_group1", 10), rep("col_group2", 10), rep("col_group3", 30), rep("col_group4", 40))
# dataframe
df <- tibble(dates, row_groups, col_groups)
# column grand totals
col_group_total <- df %>%
group_by(dates, col_groups) %>%
count() %>%
group_by(col_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))
# row grand totals
row_group_total <- df %>%
group_by(dates, row_groups) %>%
count() %>%
group_by(row_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))%>%
ungroup()
# table values
group_total <- df %>%
group_by(dates, row_groups, col_groups) %>%
count() %>%
group_by(row_groups, col_groups) %>%
summarise(count = mean(n)) %>%
ungroup() %>%
mutate(pct = count/sum(count))%>%
ungroup()
red_color <- "#f4cccc"
yellow_color <- "#f3f0ce"
green_color <- "#d9ead3"

library(janitor); library(gt)
# Cross-tabulate row_groups against col_groups, turn each cell into its share
# of the overall total ("all"), and append a totals column ("col").
df %>%
tabyl(row_groups, col_groups) %>%
adorn_percentages("all") %>%
adorn_totals(c("col")) -> df_tabyl
# Render the table with gt.
gt(df_tabyl) %>%
# Color only the columns col_group1 through col_group4 (the totals column is
# left uncolored). The color scale's domain is the range of rows 1-4,
# columns 2-5 of df_tabyl.
data_color(columns = col_group1:col_group4,
colors = scales::col_numeric(
palette = c(red_color, yellow_color, green_color),
domain = range(df_tabyl[1:4,2:5])
)
) %>%
# Show every column except row_groups as a percentage.
fmt_percent(columns = -row_groups,
rows = everything()) %>%
# Add a "Total" summary row that sums each column except row_groups,
# formatted as a percentage as well.
summary_rows(
columns = -row_groups,
fns = list("Total" = "sum"),
formatter = fmt_percent
)
The coloring differs from your example because the col_numeric function maps values linearly along the three provided colors, and 11% is only 1/3 of the way between 0% and 33%. I am not sure which approach you expect.

Numbers of years having rainy days in the range of 81–119% of long term average

A day with precipitation >= 2.5 mm is called a rainy day. I was able to calculate the number of rainy days per month using the following code:
library(seas)
library(tidyverse)
library(zoo)
library(lubridate)
data(mscdata)
dat.int <- (mksub(mscdata, id=1108447))
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
dplyr::select(-date, -year, -yday, -t_max, -t_min, -t_mean) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
pivot_wider(names_from = "variable", values_from = "rainy_days")
Then I have calculated the longterm average using the following code
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
dplyr::select(-date, -year, -yday, -t_max, -t_min, -t_mean) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
mutate(year = year(yearmon)) %>%
group_by(variable) %>%
summarize(value = as.integer(round(mean(rainy_days, na.rm = T)))) %>%
pivot_wider(names_from = "variable", values_from = "value")
Now two thresholds should be calculated as: lower threshold = 0.81*long term average and upper threshold = 1.19*long term average. Then calculate the number of years having rainy days between these two thresholds. Now I want to calculate the number of years having rainy days in the range of 81–119% of long term average (between lower and upper threshold).

Edit: Based on OP's comments and wanting to summarize by total precip, rain and snow.
library(dplyr)
library(lubridate)
# For precip, rain and snow: count the years whose yearly number of "rainy"
# days (daily value >= 2.5) lies strictly between 81% and 119% of the mean
# yearly number of such days.
dat.int %>%
  mutate(month = month(ymd(date))) %>%
  group_by(year, month) %>%
  # Days per month at or above the 2.5 threshold; the named list entry
  # yields the columns precip_days, rain_days and snow_days.
  summarize_at(vars(precip, rain, snow),
               list(days = ~ sum(. >= 2.5, na.rm = TRUE))) %>%
  group_by(year) %>%
  # Yearly totals: precip_days_yearly, rain_days_yearly, snow_days_yearly.
  summarize_at(vars(ends_with("days")), list(yearly = ~ sum(.))) %>%
  # One row: number of years inside the (0.81, 1.19) * mean band per column.
  summarize_at(vars(-year),
               list(~ sum(. > mean(.) * 0.81 & . < mean(.) * 1.19))) %>%
  rename_all(list(~ gsub("days_yearly", "in_range", .)))
# precip_in_range rain_in_range snow_in_range
# <int> <int> <int>
#1 26 24 6

How to data wrangle and barplot the proportion without undesired stripes

Please find the input data and expected output as screenshot below:
However, the current plot with the below code:
I feel I have made it too complicated. I have shared the input data and the expected output, along with the code I have been struggling with. Could you please help us?
Mainly there are 2 issues.
1. If mutate() is used, undesired stripes appear on the plot;
if summarise() is used instead, the proportions do not add up to 100%.
2. How can we extract the top contributors
We have tried both but got stuck somewhere.
# Input data
df <- tibble(
country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","web","web","Other","online","whatsapp",
"sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","shiny","online","Other","online","Other"
),
cust = rep(c("google","Apple","wallmart","pg"),8),
quantity = c(10,20,30,40,50,60,70,80,
90,100,15,25,35,45,55,65,
75,85,95,105,10,15,20,25,
30,35,40,45,50,55,60,65)
)
# Without Customer
df %>%
group_by(country,type) %>%
summarise(kpi_wo_cust = sum(quantity)) %>%
ungroup() -> df_wo_cust
# With Customer
df %>%
group_by(country,type,cust) %>%
summarise(kpi_cust = sum(quantity)) %>%
ungroup() -> df_cust
df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()
# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>%
group_by(country,type) %>%
mutate(kpi_cust_total = sum(kpi_cust),
per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
group_by(country) %>%
# In order to except from repeated counting, selecting unique()
mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4))
#
plt = df_aggr %>% ungroup() %>%#glimpse()
# In order to obtain theTop 2 customers (Major contributor) within country and type
# However, if this code is used, there is an error
# group_by(country, type) %>%
# nest() %>%
# mutate(top_cust = purrr::map_chr(data, function(x){
# x %>% arrange(desc(per_kpi_cust)) %>%
# top_n(2,per_kpi_cust) %>%
# summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
# pull(cust)
# })#,data = NULL
# ) %>%
# unnest(cols = data) %>%
group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot
# Summarize used, then it is not adding to 100%
mutate(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
#summarise(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
ggplot(aes(x = country,
y = avg_kpi_cust,
fill = type,
text = paste('<br>proportion: ', round(avg_kpi_cust,2), "%",
"<br>country:",country
))) +
geom_bar(stat = "identity"#, position=position_dodge()
) +
coord_flip() +
theme_bw()
ggplotly(plt)

The key was to use distinct() after mutate() instead of summarise()
Also, mean() was the wrong function to use earlier; it should have been sum(). Using mean() is what had resulted in the incomplete barplot.
library(tidyverse)
library(plotly)
# Input data
df <- tibble(
country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","web","web","Other","online","whatsapp",
"sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","shiny","online","Other","online","Other"
),
cust = rep(c("google","Apple","wallmart","pg"),8),
quantity = c(10,20,30,40,50,60,70,80,
90,100,15,25,35,45,55,65,
75,85,95,105,10,15,20,25,
30,35,40,45,50,55,60,65)
)
# Without Customer
df %>%
group_by(country,type) %>%
summarise(kpi_wo_cust = sum(quantity)) %>%
ungroup() -> df_wo_cust
# With Customer
df %>%
group_by(country,type,cust) %>%
summarise(kpi_cust = sum(quantity)) %>%
ungroup() -> df_cust
df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()
# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>%
group_by(country,type) %>%
mutate(kpi_cust_total = sum(kpi_cust),
per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
group_by(country) %>%
# In order to except from repeated counting, selecting unique()
mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4))
# Build the stacked, flipped bar chart: one bar per country, one segment per
# type, with the top customers of each segment used as the hover text.
plt <- df_aggr %>%
  ungroup() %>%
  # Display the top 2 customers (major contributors) within country and type:
  # nest each country/type group, collapse its best customers into a single
  # "<br>"-separated label, then unnest back to one row per customer.
  group_by(country, type) %>%
  nest() %>%
  mutate(top_cust = purrr::map_chr(data, function(x) {
    x %>%
      arrange(desc(per_kpi_cust)) %>%
      top_n(2, per_kpi_cust) %>%
      summarise(Cust = paste(cust, round(per_kpi_cust, 2), collapse = "<br>")) %>%
      pull(Cust)
  })) %>%
  unnest(cols = data) %>%
  group_by(country, type) %>%
  # mutate() keeps one row per customer, which was reported to show up as
  # stripes in the plot, and summarise() did not add up to 100%. So the
  # segment total is computed with mutate() and distinct() then keeps a
  # single row per country/type.
  mutate(avg_kpi_cust = per_unq_kpi_cust %>% sum()) %>%
  ungroup() %>%
  distinct(country, type, .keep_all = TRUE) %>%
  ggplot(aes(x = country,
             y = avg_kpi_cust,
             fill = type,
             text = top_cust)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_bw()
# Only the custom text aesthetic is shown in the interactive tooltip.
ggplotly(plt, tooltip = "text")

Moving mean as a function in dplyr

I'd like to create a function that can calculate the moving mean for a variable number of last observations and different variables. Take this as mock data:
df = expand.grid(site = factor(seq(10)),
year = 2000:2004,
day = 1:50)
df$temp = rpois(dim(df)[1], 5)
Calculating for 1 variable and a fixed number of last observations works. E.g. this calculates the average of the temperature of the last 5 days:
library(dplyr)
library(zoo)
df <- df %>%
group_by(site, year) %>%
arrange(site, year, day) %>%
mutate(almost_avg = rollmean(x = temp, 5, align = "right", fill = NA)) %>%
mutate(avg = lag(almost_avg, 1))
So far so good. Now trying to functionalize fails.
avg_last_x <- function(dataframe, column, last_x) {
dataframe <- dataframe %>%
group_by(site, year) %>%
arrange(site, year, day) %>%
mutate(almost_avg = rollmean(x = column, k = last_x, align = "right", fill = NA)) %>%
mutate(avg = lag(almost_avg, 1))
return(dataframe) }
avg_last_x(dataframe = df, column = "temp", last_x = 10)
I get this error:
Error in mutate_impl(.data, dots) : k <= n is not TRUE
I understand this is probably related to the evaluation mechanism in dplyr, but I can't get it fixed.
Thanks in advance for your help.

This should fix it.
library(lazyeval)
# Mean of the previous `last_x` observations of one column, computed within
# each site/year group after sorting by site, year and day.
#
# dataframe: data frame with columns `site`, `year`, `day` and the column
#            named by `column`.
# column:    name of the column to average, given as a string (e.g. "temp").
# last_x:    window width, i.e. how many observations go into each mean.
#
# Returns `dataframe`, grouped by site and year, with two added columns:
#   almost_avg - right-aligned rolling mean that includes the current row
#                (NA until the window is full);
#   avg        - almost_avg shifted down by one row, so it covers only the
#                `last_x` rows before the current one.
avg_last_x <- function(dataframe, column, last_x) {
  dataframe %>%
    group_by(site, year) %>%
    arrange(site, year, day) %>%
    mutate(
      # .data[[column]] looks the column up by its string name; passing the
      # bare string would hand rollmean() a length-1 character vector.
      almost_avg = rollmean(x = .data[[column]], k = last_x,
                            align = "right", fill = NA),
      avg = lag(almost_avg, 1)
    )
}

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Calculate percentage with group by dplyr - r

Here, we need to ungroup to get the sum of the whole 'value' column i.e -- %>% group_by_at(.vars = i) %>% summarise(value = n()) %>% ungroup() %>% mutate(percent = value/sum(value)) %>% select(label = i, value, percent) }

Related

Summarise+Group_by+mean+sd various columns

How do I create a table in R with conditional formatting and row and column totals?

Numbers of years having rainy days in the range of 81–119% of long term average

How to data wrangle and barplot the proportion without undesired stripes

Moving mean as a function in dplyr

Categories

Resources