Related
I have a data set I modified a lot, to the point where the code doesn't look very clean and tidy, and I need some help in order to put everything in a clean dplyr style, this is my code:
ddd_dataset <- read_excel("data/ddd_dataset.xlsx")
new_data = ddd_dataset[ddd_dataset$`Indicator name`=="Population covered by at least a 4G mobile network (%)",]
new_data = new_data[order(new_data$Country),]
new_data = spread(new_data[-c(1553, 1554), c(1,5,6)], Year, value = Value)
# Data imputation
new_data = new_data %>% pivot_longer(-Country, names_to = "year") %>%
mutate(value = value %>% as.numeric()) %>%
group_by(Country) %>%
fill(value, .direction = "updown") %>%
pivot_wider(names_from = year, values_from = value)
# Change column
itu_emi_countries <- read_csv("data/itu-emi-countries.csv")
itu_emi_countries <- itu_emi_countries %>% rename(Country = `ITU Name`)
new_data = left_join(new_data, itu_emi_countries, by.x = "Country", by.y = "Country")
new_data$Country = new_data$`EMI Name`
new_data = new_data[,1:10]
# Turn data into long format
new_long =
new_data %>%
pivot_longer(-Country, names_to = "year", values_to = "x") %>%
mutate(across(year, as.numeric))
Does anyone know how I can rewrite these functions into a single function that has the style of a dplyr function (using %>%)?
Literal, with inference and caveats:
library(dplyr)
library(tidyr) # pivot_*, complete, fill
# library(readr)
# library(readxl)
ddd_dataset <- readxl::read_excel("ddd_dataset.xlsx")
itu_emi_countries <- readr::read_csv("itu-emi-countries.csv") %>%
rename(Country = `ITU Name`)
new_data <- ddd_dataset %>%
filter(`Indicator name` == "Population covered by at least a 4G mobile network (%)") %>%
mutate(Value = suppressWarnings(as.numeric(Value))) %>%
pivot_wider(Country, names_from = Year, values_from = Value) %>%
# we cannot impute before here, since some countries do not have all years, but now they will
pivot_longer(-Country, names_to = "Year", values_to = "Value") %>%
arrange(Country, Year) %>%
group_by(Country) %>%
fill(Value, .direction = "updown") %>%
pivot_wider(Country, names_from = Year, values_from = Value)
new_long <- left_join(new_data, itu_emi_countries, by = "Country") %>%
# inferring that you want to keep names for countries in new_data not present in itu
mutate(Country = coalesce(`EMI Name`, Country)) %>%
# inferring you want all but `EMI Name`, not just hard-coding 1:10
select(-`EMI Name`) %>%
pivot_longer(-Country, names_to = "year", values_to = "x") %>%
mutate(year = as.integer(year))
new_data
# # A tibble: 196 x 10
# Country `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Afghanistan 0 0 0 0 0 4 7 22 26
# 2 Albania 0 0 0 35 80.2 85.3 85.5 95 98.4
# 3 Algeria 0 0 0 0 3.62 30.5 52.8 53.6 76.2
# 4 Andorra 50 50 50 50 50 85 85 85 85
# 5 Angola 7 7 7 7 8 8 8 18 30
# 6 Antigua and Barbuda 65 78.6 80 98 99 99 99 99 99
# 7 Argentina 0 0 0 65 85 85 90.8 91.2 97.7
# 8 Armenia 17.5 44 46 46.5 52.5 90.0 99.1 99.3 100
# 9 Australia 52.2 85 95 94 98 99 99.2 99.4 99.5
# 10 Austria 31.6 58.4 85 98 98 98 98 98 98
# # ... with 186 more rows
new_long
# # A tibble: 1,764 x 3
# Country year x
# <chr> <int> <dbl>
# 1 Afghanistan 2012 0
# 2 Afghanistan 2013 0
# 3 Afghanistan 2014 0
# 4 Afghanistan 2015 0
# 5 Afghanistan 2016 0
# 6 Afghanistan 2017 4
# 7 Afghanistan 2018 7
# 8 Afghanistan 2019 22
# 9 Afghanistan 2020 26
# 10 Albania 2012 0
# # ... with 1,754 more rows
But it seems unnecessary and inefficient to pivot back and forth when you ultimately want it in long format in the end. One-step:
new_long2 <- ddd_dataset %>%
filter(`Indicator name` == "Population covered by at least a 4G mobile network (%)") %>%
left_join(itu_emi_countries, by = "Country") %>%
mutate(
Country = coalesce(`EMI Name`, Country), # some `EMI Name` are missing
Value = suppressWarnings(as.numeric(Value)) # "NULL" -> NA
) %>%
complete(Country, Year) %>%
arrange(Year) %>%
group_by(Country) %>%
fill(Value, .direction = "updown") %>%
ungroup() %>%
select(Country, year = Year, x = Value)
(The only difference in the data, other than order, is that Year is a numeric in this last block and is integer above. This can easily be remedied, over to you.)
I have a data frame named df with five columns :
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data_frame(age, w, time, gender, Q2)
what I want is applying the weighted.mean based on each age to my data frame by using two conditions: 1)gender = 2 and 2) Q2 >=114 & Q2 <= 121
by the code below, I can simply apply weighted.mean but I do not know how to use my two conditions.
df1<-
df %>%
group_by(age) %>%
summarise(weighted_time = weighted.mean(time, weight))
Is the following what you are looking for?
library(tidyverse)
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
df %>%
group_by(age) %>%
filter(gender == 2 & Q2 >=114 & Q2 <= 121) %>%
summarise(weighted_time = weighted.mean(time, weight), .groups = "drop")
#> # A tibble: 3 × 2
#> age weighted_time
#> <dbl> <dbl>
#> 1 10 51.7
#> 2 11 28
#> 3 12 42.4
You can add a filter for those 2 (3) conditions:
df %>% filter(gender == 2 & Q2 >= 114 & Q2 <= 121) %>% group_by(age) %>% summarise(weighted_time = weighted.mean(time, weight))
This gives
# A tibble: 3 x 2
age weighted_time
<dbl> <dbl>
1 10 51.7
2 11 28
3 12 42.4
data.table
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
library(data.table)
setDT(df)[gender == 2 & (Q2 >=114 & Q2 <= 121), list(res = weighted.mean(time, weight)), by = age
][order(age)]
#> age res
#> 1: 10 51.71739
#> 2: 11 28.00000
#> 3: 12 42.42219
Created on 2021-12-10 by the reprex package (v2.0.1)
Context: I am trying to create a docx table with summary statistics by groups.
Question: how to add summary statistics (e.g. sum) on top or bottom of each group and get a "total" line as last row?
So far I got good results using flextable::as_grouped_data() as shown here: https://davidgohel.github.io/flextable/reference/as_grouped_data.html#see-also
Example:
library(dplyr) # feel free to use data.table if you prefer, I am just more used to dplyr
data_co2_2 <- CO2 %>%
group_by(Type, Treatment, conc) %>%
summarise(uptake = mean(uptake)) %>%
pivot_wider(names_from = Type, values_from = uptake)
data_co2_2 <- as_grouped_data(x = data_co2_2, groups = c("Treatment"))
Output:
data_co2
#> Treatment conc Quebec Mississippi
#> 1 nonchilled NA NA NA
#> 3 <NA> 95 15.26667 11.30000
#> 4 <NA> 175 30.03333 20.20000
#> 5 <NA> 250 37.40000 27.53333
#> 6 <NA> 350 40.36667 29.90000
#> 7 <NA> 500 39.60000 30.60000
#> 8 <NA> 675 41.50000 30.53333
#> 9 <NA> 1000 43.16667 31.60000
#> 2 chilled NA NA NA
#> 10 <NA> 95 12.86667 9.60000
#> 11 <NA> 175 24.13333 14.76667
#> 12 <NA> 250 34.46667 16.10000
#> 13 <NA> 350 35.80000 16.60000
#> 14 <NA> 500 36.66667 16.63333
#> 15 <NA> 675 37.50000 18.26667
#> 16 <NA> 1000 40.83333 18.73333
Expected output: instead of NA in the "group" line I would like to display a summary statistic (like the sum of the sub-group). Icing on the cake: display a "Overall total" on the bottom of the table.
instead of NA in the "group" line I would like to display a summary statistic (like the sum of the sub-group).
That's not possible if using as_grouped_data() %>% as_flextable(). The value displayed is the name of the group.
The following is a proposition:
library(flextable)
library(dplyr)
library(tidyr)
CO2 <- CO2 %>%
mutate(conc = as.character(conc))
agg1 <- CO2 %>%
group_by(Type, Treatment, conc) %>%
summarise(uptake = mean(uptake), .groups = "drop")
agg2 <- CO2 %>%
group_by(Type, Treatment) %>%
summarise(uptake = mean(uptake), .groups = "drop") %>%
mutate(conc="Overall")
agg3 <- CO2 %>%
group_by(Type) %>%
summarise(uptake = mean(uptake), .groups = "drop") %>%
mutate(conc="Overall", Treatment = "Overall")
all_data <- bind_rows(agg1, agg2, agg3) %>%
arrange(Type, Treatment, conc) %>%
pivot_wider(names_from = Type, values_from = uptake)
as_grouped_data(x = all_data, groups = c("Treatment")) %>%
as_flextable() %>%
compose(i = ~ is.na(conc) & is.na(Treatment),
j = "conc", value = as_paragraph("avg for all conc")) %>%
compose(i = ~ is.na(conc) & is.na(Treatment),
j = "conc", value = as_paragraph("avg for all conc")) %>%
bold(bold = TRUE, i = ~!is.na(Treatment)) %>%
color(i= ~ conc %in% "Overall", color = "red") %>%
colformat_double(j = c("Quebec", "Mississippi"), digits = 1)
I have dataframe with country, gender, 2013,2014,2014,2015 column names.
City Gender 2013 2014 2015
Aberdeen Female 30 40 50
Aberdeen Male 20 15 16
Aberdeenshire Female 60 80 70
Aberdeenshire Male 50 40 15
.....Includes 425 records.
I want to perform female to male ratio (dividing Female/male for each city) for each city, so this is how i tried to get,
City 2013_ratio 2014_ratio 2015_ration
Aberdeen 1.5 2.66 2.5
Aberdeenshire 1.2 2 4.66
can anyone help me to solve this. I have tried grouping by city but I don't know how to do by getting value by rows in gender.
You can more easily calculate the ratio if the Male and Female are in different columns, which you can change the structure by using tidyr
library(dplyr)
library(tidyr)
df %>%
gather(Year, Value, -City, - Gender) %>%
spread(Gender, Value) %>%
mutate(Ratio = Female/Male, Year = paste0(Year, "_Ratio")) %>%
select(-Female, -Male) %>%
spread(Year, Ratio)
The code from Rob's suggested solution would be (with an additional spread() step:
# data
df = data.frame(City = c("a", "a", "b", "b"),
Gender = c("Female", "Male", "Female", "Male"),
`2013` = c(30, 20, 60, 50),
`2014` = c(40, 15, 80, 40),
`2015` = c(50, 16, 70, 15))
# Actual process
library("dplyr")
library("tidyr")
df %>%
# Transform wide table into tidy
gather("Year", "Number", X2013:X2015) %>%
# Reshape gender columns for easier summaries
spread("Gender", "Number") %>%
# Compute ratios
group_by(City, Year) %>%
summarise(ratio = Female/(Male + Female))
#> # A tibble: 6 x 3
#> # Groups: City [?]
#> City Year ratio
#> <fct> <chr> <dbl>
#> 1 a X2013 0.6
#> 2 a X2014 0.727
#> 3 a X2015 0.758
#> 4 b X2013 0.545
#> 5 b X2014 0.667
#> 6 b X2015 0.824
Created on 2018-10-10 by the reprex package (v0.2.1)
To get exactly your result you can apply back the function spread() to spread the ratios over years, (spread(Year, ratio))
With tidyverse:
df = read.table(text="City Gender 2013 2014 2015
Aberdeen Female 30 40 50
Aberdeen Male 20 15 16
Aberdeenshire Female 60 80 70
Aberdeenshire Male 50 40 15", header = T)
> library(tidyverse)
>
> df %>%
group_by(City) %>%
arrange(City, Gender) %>%
summarise_at(vars(X2013:X2015), .funs = funs(ratio = first(.)/last(.)))
# A tibble: 2 x 4
City X2013_ratio X2014_ratio X2015_ratio
<fct> <dbl> <dbl> <dbl>
1 Aberdeen 1.5 2.67 3.12
2 Aberdeenshire 1.2 2 4.67
or
df %>%
group_by(City) %>%
arrange(City,Gender) %>%
summarise_at(vars(X2013:X2015), .funs = funs(ratio = .[Gender == "Female"]/.[Gender != "Female"]))
How do I use dplyr to create proportions of a level of a factor variable for each state? For example, I'd like to add a variable that indicates the percent of females within each state to the data frame.
# gen data
state <- rep(c(rep("Idaho", 10), rep("Maine", 10)), 2)
student.id <- sample(1:1000,8,replace=T)
gender <- rep( c("Male","Female"), 100*c(0.25,0.75) )
gender <- sample(gender, 40)
school.data <- data.frame(student.id, state, gender)
Here's an attempt that I know is wrong, but gets me access to the information:
middle %>%
group_by(state, gender %in%c("Female")) %>%
summarise(count = n()) %>%
mutate(test_count = count)
I have a hard time with the count and mutate functions, which makes it hard to get much further. It doesn't behave as I'd expect.
To add a new column to your existing data frame:
school.data %>%
group_by(state) %>%
mutate(pct.female = mean(gender == "Female"))
Use summarize rather than mutate if you just want one row per state rather than adding a column to the original data.
school.data %>%
group_by(state) %>%
summarize(pct.female = mean(gender == "Female"))
# # A tibble: 2 x 2
# state pct.female
# <fctr> <dbl>
# 1 Idaho 0.75
# 2 Maine 0.70
Gregor's answer gets to the heart of it. Here's a version that would give you counts and proportions for both genders per state:
library(dplyr)
gender.proportions <- group_by(school.data, state, gender) %>%
summarize(n = length(student.id)) %>% # count per gender
ungroup %>% group_by(state) %>%
mutate(proportion = n / sum(n)) # proportion per gender
# state gender n proportion
# <fctr> <fctr> <int> <dbl>
#1 Idaho Female 16 0.80
#2 Idaho Male 4 0.20
#3 Maine Female 11 0.55
#4 Maine Male 9 0.45
Edit:
In reference to OP's comment/request, the code below would repeat the male and female proportions for each individual student in each state:
gender.proportions <- group_by(school.data, state) %>%
mutate(prop.female = mean(gender == 'Female'), prop.male = mean(gender == 'Male'))
student.id state gender prop.female prop.male
<int> <fctr> <fctr> <dbl> <dbl>
1 479 Idaho Male 0.8 0.2
2 634 Idaho Female 0.8 0.2
3 175 Idaho Female 0.8 0.2
4 527 Idaho Female 0.8 0.2
5 368 Idaho Female 0.8 0.2
6 423 Idaho Male 0.8 0.2
7 357 Idaho Female 0.8 0.2
8 994 Idaho Female 0.8 0.2
9 479 Idaho Female 0.8 0.2
10 634 Idaho Female 0.8 0.2
# ... with 30 more rows
Here is one solution using a left_join.
state <- rep(c(rep("Idaho", 10), rep("Maine", 10)), 2)
student.id <- sample(1:1000,8,replace=T)
gender <- rep( c("Male","Female"), 100*c(0.25,0.75) )
gender <- sample(gender, 40)
school.data <- data.frame(student.id, state, gender)
school.data %>%
group_by(state) %>%
mutate(gender_id = ifelse(gender == "Female", 1, 0)) %>%
summarise(female_count = sum(gender_id)) %>%
left_join(school.data %>%
group_by(state) %>%
summarise(state_count = n()),
by = c("state" = "state")
) %>%
mutate(percent_female = female_count / state_count)