Density plot using population data for a specific year - r

Is it possible to create a density plot using this population data? Age_group is a categorical variable. Does it have to be numeric to create a density plot?
library(tidyverse)
df <- structure(list(year = c(1971, 1971, 1971, 1971, 1971, 1971, 1971,
1971, 1971, 1971, 1971, 1971, 1971, 1971, 1971, 1971, 1971, 1971
), age_group = structure(2:19, .Label = c("All ages", "0 to 4 years",
"5 to 9 years", "10 to 14 years", "15 to 19 years", "20 to 24 years",
"25 to 29 years", "30 to 34 years", "35 to 39 years", "40 to 44 years",
"45 to 49 years", "50 to 54 years", "55 to 59 years", "60 to 64 years",
"65 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years",
"85 to 89 years", "90 to 94 years", "95 to 99 years", "100 years and over",
"Median age"), class = "factor"), population = c(1836149, 2267794,
2329323, 2164092, 1976914, 1643264, 1342744, 1286302, 1284154,
1252545, 1065664, 964984, 785693, 626521, 462065, 328583, 206174,
101117)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-18L))

You can convert the text to numeric ranges, e.g.:
library(tidyverse) # if not already loaded
# Pull the lower and upper bound of each age band out of its label.
# stringr::word() (attached with the tidyverse) returns the n-th
# whitespace-separated token, so tokens 1 and 3 of "0 to 4 years" are the
# two bounds.
df %>%
  mutate(
    age_min = as.numeric(word(age_group, 1)),
    age_max = as.numeric(word(age_group, 3))
  ) %>%
  head()
# A tibble: 6 x 5
year age_group population age_min age_max
<dbl> <fct> <dbl> <dbl> <dbl>
1 1971 0 to 4 years 1836149 0 4
2 1971 5 to 9 years 2267794 5 9
3 1971 10 to 14 years 2329323 10 14
4 1971 15 to 19 years 2164092 15 19
5 1971 20 to 24 years 1976914 20 24
6 1971 25 to 29 years 1643264 25 29
From that, you could display in ggplot a bunch of ways:
# Each example plots population against the lower bound of the age band
# (`age_min`, created by the mutate() step above). `...` stands for that
# pipeline.

# Step chart
... %>%
  ggplot(aes(age_min, population)) +
  geom_step()

# Bar chart
... %>%
  ggplot(aes(age_min, population)) +
  geom_col()

# Density-style filled curve: stat = "identity" makes geom_density() draw the
# supplied y values instead of estimating a kernel density.
... %>%
  ggplot(aes(age_min, y = population)) +
  geom_density(stat = "identity")

Related

How to match two columns then sum the other column variable and add new row with different name for it?

I would like to take the sum of a column variable when the other two columns match and then adding a new row with different name for this sum result. I'm able to get the first part but unsure how to add the new row for the result obtained...
Here's my dataframe:
df <- structure(list(measure_name = c("Prevalence", "Prevalence", "Incidence",
"Incidence", "Deaths", "Deaths", "YLLs (Years of Life Lost)",
"YLLs (Years of Life Lost)", "YLDs (Years Lived with Disability)",
"YLDs (Years Lived with Disability)", "DALYs (Disability-Adjusted Life Years)",
"DALYs (Disability-Adjusted Life Years)", "Prevalence", "Prevalence",
"Incidence", "Incidence", "YLDs (Years Lived with Disability)",
"YLDs (Years Lived with Disability)", "DALYs (Disability-Adjusted Life Years)",
"DALYs (Disability-Adjusted Life Years)"), age_name = c("1-4 years",
"5-9 years", "1-4 years", "5-9 years", "1-4 years", "5-9 years",
"1-4 years", "5-9 years", "1-4 years", "5-9 years", "1-4 years",
"5-9 years", "1-4 years", "5-9 years", "1-4 years", "5-9 years",
"1-4 years", "5-9 years", "1-4 years", "5-9 years"), cause_name = c("Asthma",
"Asthma", "Asthma", "Asthma", "Asthma", "Asthma", "Asthma", "Asthma",
"Asthma", "Asthma", "Asthma", "Asthma", "Attention-deficit/hyperactivity disorder",
"Attention-deficit/hyperactivity disorder", "Attention-deficit/hyperactivity disorder",
"Attention-deficit/hyperactivity disorder", "Attention-deficit/hyperactivity disorder",
"Attention-deficit/hyperactivity disorder", "Attention-deficit/hyperactivity disorder",
"Attention-deficit/hyperactivity disorder"), val = c(21809765.44,
33602368.48, 10004723.65, 6417738.685, 6101.934992, 1699.9247,
524901.7761, 138969.73, 879880.8571, 1355302.883, 1404782.633,
1494272.613, 1367581.312, 14033704.42, 1314270.786, 2654899.128,
16774.31306, 171847.3209, 16774.31306, 171847.3209)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
> df
# A tibble: 20 × 4
measure_name age_name cause_name val
<chr> <chr> <chr> <dbl>
1 Prevalence 1-4 years Asthma 21809765.
2 Prevalence 5-9 years Asthma 33602368.
3 Incidence 1-4 years Asthma 10004724.
4 Incidence 5-9 years Asthma 6417739.
5 Deaths 1-4 years Asthma 6102.
6 Deaths 5-9 years Asthma 1700.
7 YLLs (Years of Life Lost) 1-4 years Asthma 524902.
8 YLLs (Years of Life Lost) 5-9 years Asthma 138970.
9 YLDs (Years Lived with Disability) 1-4 years Asthma 879881.
10 YLDs (Years Lived with Disability) 5-9 years Asthma 1355303.
11 DALYs (Disability-Adjusted Life Years) 1-4 years Asthma 1404783.
12 DALYs (Disability-Adjusted Life Years) 5-9 years Asthma 1494273.
13 Prevalence 1-4 years Attention-deficit/hyperactivity disorder 1367581.
14 Prevalence 5-9 years Attention-deficit/hyperactivity disorder 14033704.
15 Incidence 1-4 years Attention-deficit/hyperactivity disorder 1314271.
16 Incidence 5-9 years Attention-deficit/hyperactivity disorder 2654899.
17 YLDs (Years Lived with Disability) 1-4 years Attention-deficit/hyperactivity disorder 16774.
18 YLDs (Years Lived with Disability) 5-9 years Attention-deficit/hyperactivity disorder 171847.
19 DALYs (Disability-Adjusted Life Years) 1-4 years Attention-deficit/hyperactivity disorder 16774.
20 DALYs (Disability-Adjusted Life Years) 5-9 years Attention-deficit/hyperactivity disorder 171847.
Desired output using prevalence of asthma as an example:
measure_name age_name cause_name val
<chr> <chr> <chr> <dbl>
1 Prevalence 1-4 years Asthma 21809765.
2 Prevalence 5-9 years Asthma 33602368.
3 Prevalence 1–9 years Asthma 55412133.
What I have tried:
# Sum `val` over the age bands for every measure/cause pair, then order the
# rows by cause and, within a cause, by a fixed measure order.
Total <- df %>%
  group_by(measure_name, cause_name) %>%
  summarise(val = sum(val, na.rm = TRUE)) %>%
  arrange(
    cause_name,
    factor(
      measure_name,
      levels = c(
        "Prevalence",
        "Incidence",
        "Deaths",
        "YLLs (Years of Life Lost)",
        "YLDs (Years Lived with Disability)",
        "DALYs (Disability-Adjusted Life Years)"
      )
    )
  )
# A tibble: 10 × 3
# Groups: measure_name [6]
measure_name cause_name val
<chr> <chr> <dbl>
1 Prevalence Asthma 55412134.
2 Incidence Asthma 16422462.
3 Deaths Asthma 7802.
4 YLLs (Years of Life Lost) Asthma 663872.
5 YLDs (Years Lived with Disability) Asthma 2235184.
6 DALYs (Disability-Adjusted Life Years) Asthma 2899055.
7 Prevalence Attention-deficit/hyperactivity disorder 15401286.
8 Incidence Attention-deficit/hyperactivity disorder 3969170.
9 YLDs (Years Lived with Disability) Attention-deficit/hyperactivity disorder 188622.
10 DALYs (Disability-Adjusted Life Years) Attention-deficit/hyperactivity disorder 188622.
I get the total but would like to create new row with new age_name that is 1–9 years (pls see above desired output using prevalence of asthma as an example). Can someone help pls? Thanks.
If I understand your question correctly, you can just sum val, change the value of age_name, and then use bind_rows to tie it up with the original data set. After that it is just a matter of organizing the rows to appear in the way you want it. Here is such a solution:
# 1. Collapse the age bands into one total per measure/cause pair.
# 2. Label that total as the combined "1-9 years" band.
# 3. Stack the original rows underneath and sort so that each total follows
#    the two bands it was built from.
df %>%
  group_by(measure_name, cause_name) %>%
  summarise(val = sum(val, na.rm = TRUE), .groups = "drop") %>%
  mutate(age_name = "1-9 years") %>%
  bind_rows(df) %>%
  arrange(
    cause_name,
    factor(
      measure_name,
      levels = c(
        "Prevalence",
        "Incidence",
        "Deaths",
        "YLLs (Years of Life Lost)",
        "YLDs (Years Lived with Disability)",
        "DALYs (Disability-Adjusted Life Years)"
      )
    ),
    factor(age_name, levels = c("1-4 years", "5-9 years", "1-9 years"))
  )

Using dplyr to create new groups inside a column

This is my dataframe:
mydf<-structure(list(DS_FAIXA_ETARIA = c("Inválido", "16 anos", "17 anos",
"18 anos", "19 anos", "20 anos", "21 a 24 anos", "25 a 29 anos",
"30 a 34 anos", "35 a 39 anos"), n = c(5202L, 48253L, 67401L,
79398L, 88233L, 90738L, 149634L, 198848L, 238406L, 265509L)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I would like to have grouped the observations into one group called: 16 a 20 anos.
"16 anos", "17 anos",
"18 anos", "19 anos", "20 anos"
In other words I would like to "merge" the rows 2-6 and sum its observations on the n column. I would have one row represent the sum of rows 2-6.
Is it possible to do this using group_by and then summarise(sum(DS_FAIXA_ETARIA)) verbs from dplyr?
This would be the output that I want:
mydf<-structure(list(DS_FAIXA_ETARIA = c("Inválido","16 a 20 anos" ,"21 a 24 anos", "25 a 29 anos",
"30 a 34 anos", "35 a 39 anos"), n = c(5202L,374023L , 149634L, 198848L, 238406L, 265509L)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
Many thanks
This should do the job. First, sum rows 2-6 with summarise.
Then use add_row to append that total to the original data frame, followed by slice_tail and arrange.
# One-row tibble whose single column is named after the new group and holds
# the sum of rows 2-6 (the "16 anos" ... "20 anos" rows).
df1 <- summarise(mydf, `16 a 20 anos` = sum(n[2:6]))

# Append the total as a new row, keep the last five rows and sort by label.
# NOTE(review): mydf has 10 rows and add_row() appends an 11th, so
# slice_tail(n = 5) keeps rows 7-11 only; the "Inválido" row is dropped even
# though the requested output keeps it.
mydf %>%
  add_row(DS_FAIXA_ETARIA = names(df1), n = df1[["16 a 20 anos"]][1]) %>%
  slice_tail(n = 5) %>%
  arrange(DS_FAIXA_ETARIA)
Output:
DS_FAIXA_ETARIA n
<chr> <int>
1 16 a 20 anos 374023
2 21 a 24 anos 149634
3 25 a 29 anos 198848
4 30 a 34 anos 238406
5 35 a 39 anos 265509
We create a grouping variable based on the occurrence of 'Inválido' or those elements with only digits (\\d+) followed by space and 'anos', then summarise by pasting the first and last elements while getting the sum of 'n'
library(dplyr)
library(stringr)
# Collapse the consecutive single-year rows ("NN anos") into one summed row.
mydf %>%
# grp: the counter steps up at every row that is NOT a single-year label, so
# single-year rows inherit the id of the last non-matching row before them;
# the "Inválido" row is then forced to id 0 so it forms a group of its own.
group_by(grp = replace(cumsum(!str_detect(DS_FAIXA_ETARIA,
'^\\d+\\s+anos$')), DS_FAIXA_ETARIA == 'Inválido', 0)) %>%
# Multi-row groups get a label built from their first and last entries joined
# by "_"; single-row groups keep their label. n is summed within each group.
summarise(DS_FAIXA_ETARIA = if(n() > 1)
str_c(DS_FAIXA_ETARIA[c(1, n())], collapse="_") else
DS_FAIXA_ETARIA, n = sum(n), .groups = 'drop') %>%
# Drop the helper grouping column.
select(-grp)
-output
# A tibble: 6 x 2
# DS_FAIXA_ETARIA n
# <chr> <int>
#1 Inválido 5202
#2 16 anos_20 anos 374023
#3 21 a 24 anos 149634
#4 25 a 29 anos 198848
#5 30 a 34 anos 238406
#6 35 a 39 anos 265509

Renaming labels of a factor in R

I have census data of Male and Female populations organized by age group:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"

# Download once and reuse the local copy on later runs. The target folder is
# created first because download.file() cannot write into a missing directory.
if (!file.exists("./datafiles/cc-est2018-alldata-54.csv")) {
  dir.create("./datafiles", showWarnings = FALSE, recursive = TRUE)
  download.file(url, destfile = "./datafiles/cc-est2018-alldata-54.csv", mode = "wb")
}

# Keep rows with AGEGRP != 0 and YEAR == 1, and only the columns used in the
# plots below.
popSample <- read.csv("./datafiles/cc-est2018-alldata-54.csv") %>%
  filter(AGEGRP != 0 & YEAR == 1) %>%
  select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")

# Treat the age-group code as a categorical variable.
popSample$AGEGRP <- as.factor(popSample$AGEGRP)
I then plot the Male and Female population relationships, faceted by age group (1-18, which is currently treated as an int):
# Female vs. male population on log10 axes, one panel per age group, with a
# linear-model fit drawn in every panel.
g <- ggplot(data = popSample, mapping = aes(x = TOT_MALE, y = TOT_FEMALE)) +
  geom_point(colour = "darkblue", alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~AGEGRP) +
  stat_smooth(method = "lm", col = "darkred", size = 0.75) +
  labs(
    title = "F vs. M Population across all Age Groups",
    x = "Total Male (log10)",
    y = "Total Female (log10)"
  ) +
  theme_light()
g
Which results in this plot: https://share.getcloudapp.com/v1ur6O4e
The problem: I am trying to convert the column AGEGRP from ‘int’ to ‘factor’, and change the factors labels from “1”, “2”, “3”, … “18” to "AgeGroup1", "AgeGroup2", "AgeGroup3", … "AgeGroup18"
When I try this code, my AGEGRP column's observation values are all replaced with NAs:
popSample$AGEGRP <- factor(popSample$AGEGRP, levels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"))
https://share.getcloudapp.com/qGuo1O4y
Thank you for your help,
# The column holds the codes 1-18, so those are the `levels`; the readable age
# ranges are the `labels` attached to them. Passing the ranges as `levels`
# matches nothing in the data and turns every value into NA.
popSample$AGEGRP <- factor(
  popSample$AGEGRP,
  levels = 1:18,
  labels = c(
    "0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29",
    "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",
    "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"
  )
)
Need to add all levels though.
Alternatively
levels(popSample$AGEGRP) <- c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+")
should work as well.
Read in the csv again:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"

# Read straight from the URL, keep rows with AGEGRP != 0 and YEAR == 1, and
# keep only the columns needed for plotting.
popSample <- read.csv(url) %>%
  filter(AGEGRP != 0, YEAR == 1) %>%
  select(STNAME, CTYNAME, AGEGRP, TOT_POP, TOT_MALE, TOT_FEMALE)
If you just want to add a prefix "AgeGroup" to your facet labels, you do:
# Same scatter plot, but each facet strip shows "AgeGroup<code>": the labeller
# pastes the prefix onto the AGEGRP value when the strips are drawn, leaving
# the data itself unchanged.
ggplot(data = popSample, mapping = aes(x = TOT_MALE, y = TOT_FEMALE)) +
  geom_point(colour = "darkblue", alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(
    ~AGEGRP,
    labeller = labeller(AGEGRP = function(i) paste0("AgeGroup", i))
  ) +
  stat_smooth(method = "lm", col = "darkred", size = 0.75) +
  labs(
    title = "F vs. M Population across all Age Groups",
    x = "Total Male (log10)",
    y = "Total Female (log10)"
  ) +
  theme_light()
If there is a need for new factors, then you need to refactor (like @Annet's answer below):
# Readable label for each of the 18 age-group codes, in code order.
lvls <- c(
  "0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29",
  "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",
  "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"
)
# AGEGRP was already converted to a factor earlier; if you re-read the csv
# instead, that earlier conversion can be skipped.
# Indexing lvls by AGEGRP looks up the label for each row's code.
popSample$AGEGRP <- factor(lvls[popSample$AGEGRP], levels = lvls)
Then plot:
# Scatter of female vs. male population on log10 axes with a linear fit; the
# facet strips show whatever labels AGEGRP currently carries.
ggplot(data = popSample, mapping = aes(x = TOT_MALE, y = TOT_FEMALE)) +
  geom_point(colour = "darkblue", alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~AGEGRP) +
  stat_smooth(method = "lm", col = "darkred", size = 0.75) +
  labs(
    title = "F vs. M Population across all Age Groups",
    x = "Total Male (log10)",
    y = "Total Female (log10)"
  ) +
  theme_light()
To change all the factor labels with one function, you can use forcats::fct_relabel (forcats ships as part of the tidyverse, which you've already got loaded). The changed factor labels will carry over to the plot facets and the order stays the same.
First few entries:
# before relabelling
popSample$AGEGRP[1:4]
#> [1] 1 2 3 4
#> Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# after relabelling
forcats::fct_relabel(popSample$AGEGRP, ~paste0("AgeGroup", .))[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18
Or with base R, reassign the levels:
levels(popSample$AGEGRP) <- paste0("AgeGroup", levels(popSample$AGEGRP))
popSample$AGEGRP[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18

Changing levels in R

I have a field where the levels are broken down as below:
levels(demo$age)
"18 to 24 years old" "25 to 34 years old" "35 to 44 years old" "45 to 54 years old" "55 to 64 years old" "65 to 74 years old" "75 years old or older"
How can I change the levels to
"Total " "18 to 24 years old" "25 plus".
We create a vector of levels that needs to be changed
v1 <- c("25 to 34 years old", "35 to 44 years old", "45 to 54 years old",
"55 to 64 years old", "65 to 74 years old" , "75 years old or older")
then, assign those to new level
levels(demo$age)[levels(demo$age) %in% v1] <- "25 plus"
If we need a 'Total' level as well
# Add "Total" as a new, empty first level by rebuilding the factor with the
# extended level set. Assigning a longer vector through `levels<-` would
# instead rename the existing levels by position ("18 to 24 years old" would
# become "Total" and "25 plus" would become "18 to 24 years old").
demo$age <- factor(demo$age, levels = c("Total", levels(demo$age)))
levels(demo$age)
#[1] "Total" "18 to 24 years old" "25 plus"
data
set.seed(24)
# stringsAsFactors = TRUE: since R 4.0 data.frame() keeps strings as
# character, but the levels() calls above need `age` to be a factor.
demo <- data.frame(
  age = sample(c("18 to 24 years old", v1), 100, replace = TRUE),
  stringsAsFactors = TRUE
)

Combine two data frames by one variable and combining columns under one main header

I want to combine two dataframes T2 and T4 by variable "Industry" and the columns of each data set with one main heading. So in the final output table I want columns Industry, three columns of T2 under one column heading "Executive" and three other columns of T4 as sub-columns of one heading "management".
T2
Industry percentage_Yes percentage_No Total_responses
1 ALL 94 % 6 % 117
2 Banking/Financial Services 83 % 17 % 6
3 Chemicals 100 % 0 % 5
4 Consumer Goods 75 % 25 % 8
5 Energy 89 % 11 % 9
6 High Tech 100 % 0 % 8
7 Insurance/Reinsurance 100 % 0 % 14
8 Life Sciences 100 % 0 % 11
9 Logistics -- -- 3
10 Mining & Metals -- -- 1
11 Other Manufacturing 100 % 0 % 11
12 Other Non-Manufacturing -- -- 3
13 Retail & Wholesale 100 % 0 % 12
14 Services (Non-Financial) 88 % 12 % 24
15 Transportation Equipment -- -- 2
16 <NA> -- -- 0
T4
Industry percentage_Yes percentage_No Total_responses
1 ALL 96 % 4 % 121
2 Banking/Financial Services 86 % 14 % 7
3 Chemicals 100 % 0 % 5
4 Consumer Goods 100 % 0 % 8
5 Energy 100 % 0 % 9
6 High Tech 100 % 0 % 9
7 Insurance/Reinsurance 93 % 7 % 15
8 Life Sciences 91 % 9 % 11
9 Logistics -- -- 3
10 Mining & Metals -- -- 1
11 Other Manufacturing 100 % 0 % 12
12 Other Non-Manufacturing -- -- 3
13 Retail & Wholesale 100 % 0 % 12
14 Services (Non-Financial) 92 % 8 % 24
15 Transportation Equipment -- -- 2
16 <NA> -- -- 0
> dput(T2)
structure(list(Industry = c("ALL", "Banking/Financial Services",
"Chemicals", "Consumer Goods", "Energy", "High Tech", "Insurance/Reinsurance",
"Life Sciences", "Logistics", "Mining & Metals", "Other Manufacturing",
"Other Non-Manufacturing", "Retail & Wholesale", "Services (Non-Financial)",
"Transportation Equipment", NA), percentage_Yes = c("94 %", "83 %",
"100 %", "75 %", "89 %", "100 %", "100 %", "100 %", "--", "--",
"100 %", "--", "100 %", "88 %", "--", "--"), percentage_No = c("6 %",
"17 %", "0 %", "25 %", "11 %", "0 %", "0 %", "0 %", "--", "--",
"0 %", "--", "0 %", "12 %", "--", "--"), Total_responses = c(117,
6, 5, 8, 9, 8, 14, 11, 3, 1, 11, 3, 12, 24, 2, 0)), class = "data.frame", row.names = c(NA,
-16L), .Names = c("Industry", "percentage_Yes", "percentage_No",
"Total_responses"))
> dput(T4)
structure(list(Industry = c("ALL", "Banking/Financial Services",
"Chemicals", "Consumer Goods", "Energy", "High Tech", "Insurance/Reinsurance",
"Life Sciences", "Logistics", "Mining & Metals", "Other Manufacturing",
"Other Non-Manufacturing", "Retail & Wholesale", "Services (Non-Financial)",
"Transportation Equipment", NA), percentage_Yes = c("96 %", "86 %",
"100 %", "100 %", "100 %", "100 %", "93 %", "91 %", "--", "--",
"100 %", "--", "100 %", "92 %", "--", "--"), percentage_No = c("4 %",
"14 %", "0 %", "0 %", "0 %", "0 %", "7 %", "9 %", "--", "--",
"0 %", "--", "0 %", "8 %", "--", "--"), Total_responses = c(121,
7, 5, 8, 9, 9, 15, 11, 3, 1, 12, 3, 12, 24, 2, 0)), class = "data.frame", row.names = c(NA,
-16L), .Names = c("Industry", "percentage_Yes", "percentage_No",
"Total_responses"))
I have tried tabular, but then I'm getting the Industry column twice:
library("tables")
# Stack T2 and T4 into one long data frame. Each half is tagged with its
# Employee_Level ('Exe' / 'Mgmt') and gets a `what` factor built from the
# row names, with levels in the original row order.
st<-rbind(data.frame(T2, Employee_Level = 'Exe', what = factor(rownames(T2), levels = rownames(T2)),
row.names= NULL, check.names = FALSE),
data.frame(T4,Employee_Level = 'Mgmt',what = factor(rownames(T4), levels = rownames(T4)),
row.names = NULL,check.names = FALSE))
# Rows: one per `what` value. Columns: Employee_Level crossed with the four
# listed variables. Industry is one of those four, so it is repeated under
# each Employee_Level, which is the duplicated column described above.
mytable <- tabular(Heading()*what ~ Employee_Level*(`Industry`+`percentage_Yes`+`percentage_No`+`Total_responses`)*Heading()*(identity),data=st)
# Emit the table as LaTeX.
latex(mytable)
Here's one way using (my) huxtable package:
library(huxtable)
# Put T2 and T4 side by side and drop column 5, T4's duplicate Industry
# column (each table has four columns).
# NOTE(review): cbind() pairs rows by position, not by Industry value; this
# relies on both tables listing the industries in the same order, which holds
# for the dput() data shown.
my_data <- cbind(T2, T4)[, c(1:4, 6:8)]
# Convert to a huxtable, keeping the column names as a row of the table.
my_hux <- as_hux(my_data, add_colnames = TRUE)
# Add a blank 7-cell row for the group headings; the code below treats it as
# row 1, with the column-name row as row 2.
my_hux <- insert_row(my_hux, rep("", 7))
# Group headings, each spanning the three columns of its source table.
my_hux[1, 2] <- "Executive"
my_hux[1, 5] <- "Management"
colspan(my_hux)[1, 2] <- 3
colspan(my_hux)[1, 5] <- 3
# Replace the duplicated column names with short sub-headings.
my_hux[2, 2:7] <- rep(c("% yes", "% no", "Total responses"), 2)
# Presumably shows numbers with 0 decimal places - confirm against the
# huxtable documentation.
number_format(my_hux) <- 0
# This should look like what you want:
my_hux

Resources