name variable at the same time using complete - r

I would like to get a sub total by age group. The sample data and the codes are:
set.seed(12345)
#create a numeric variable Age
AGE <- sample(0:110, 100, replace = TRUE)
# Creat Data fame
Sample.data <-data.frame(AGE)
summary_data<- Sample.data %>%
group_by(grp = cut(
AGE,
breaks=c(-Inf, 0,0.001, 0.083, 2, 13, 65,1000),
right=TRUE,
labels = c("Foetus(0 yr)","Neonate (0.001 - 0.082 yr)","Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)", "Adolescent(13-17.999 yrs)","Adult(18-64.999 yrs.)","Elderly(65-199 yrs)")
)) %>% summarise("Total People" = n())%>% complete(grp = levels(grp), fill = list("Total People = 0))
which gave the results that looks like this:
Is it possible to label grp as "Age Group"? and sort it by age?
I tried to define the name as follow and the results are strange. What did I do wrong?
summary_data<- Sample.data %>%
group_by("Age Group" = cut(
AGE,
breaks=c(-Inf, 0,0.001, 0.083, 2, 13, 65,1000),
right=TRUE,
labels = c("Foetus(0 yr)","Neonate (0.001 - 0.082 yr)","Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)", "Adolescent(13-17.999 yrs)","Adult(18-64.999 yrs.)","Elderly(65-199 yrs)")
)) %>% summarise("Total People" = n())%>% complete("Age Group" = levels("Age Group"), fill = list("Total People" = 0))
The new results are:
The variable now has name "Age Group", but table doesn't fill 0 for the age category that do not have observation. What should I do.
The ideal results should look like this:

We can change the double quotes to backquotes in levels to evaluate the column in complete. The column name is non-standard i.e. it includes a space.
...
%>%
complete("Age Group" = levels(`Age Group`), fill = list("Total People" = 0))
# A tibble: 7 x 2
# `Age Group` `Total People`
# <chr> <dbl>
#1 Adolescent(13-17.999 yrs) 14
#2 Adult(18-64.999 yrs.) 37
#3 Child(2-12.999 yrs) 2
#4 Elderly(65-199 yrs) 46
#5 Foetus(0 yr) 1
#6 Infant(0.083-1.999 yrs) 0
#7 Neonate (0.001 - 0.082 yr) 0
If we want to arrange
Sample.data %>%
group_by("Age Group" = cut(
AGE,
breaks=c(-Inf, 0,0.001, 0.083, 2, 13, 65,1000),
right=TRUE,
labels = c("Foetus(0 yr)","Neonate (0.001 - 0.082 yr)","Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)", "Adolescent(13-17.999 yrs)","Adult(18-64.999 yrs.)","Elderly(65-199 yrs)")
)) %>%
summarise("Total People" = n()) %>%
complete("Age Group" = levels(`Age Group`), fill = list("Total People" = 0)) %>%
arrange(`Total People`)

Related

How to use case_when when doing quartiles in R?

If i have this tibble:
tibble(
period = c("2010END", "2011END",
"2010Q1","2010Q2","2011END"),
date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010'),
website = c(
"google",
"google",
"facebook",
"facebook",
"youtube"
),
method = c("website",
"phone",
"website",
"laptop",
"phone"),
values = c(1, NA, 1, 2, 3))
And then i have this dataframe which tells you which quantiles to create along with the rankings to be made from the ranks:
tibble(
method = c(
"phone",
"phone",
"phone",
"website",
"website",
"website",
"laptop",
"laptop",
"laptop"
),
rank = c(3,2,1,3,2,1,3,2,1),
tile_condition = c("lowest 25%", "25 to 50%", "more than 50%",
"highest 25%", "25 to 50%", "less than 25%",
"lowest 25%", "25 to 50%", "more than 50%")
)
How can i use a case_when statement to correctly allow myself to create a ranking column which is based on the quartile calculation from the values column in the first dataframe?
I'm trying to apply the quantiles from the other dataframe to create a ranking column in the original dataframe - stuck on how to use case_when for it.
I would do something like this:
set.seed(124)
left_join(
df1[sample(1:5,1000, replace=T),] %>%
mutate(values=sample(c(df1$values,1:30),1000, replace=T)) %>%
group_by(method) %>%
mutate(q=as.double(cut(values,quantile(values,probs=seq(0,1,0.25), na.rm=T), labels=c(1:4), include.lowest=T))) %>%
ungroup(),
df2 %>% mutate(q = list(1,2,c(3,4),4,c(2,3),1,1,2,c(3,4))) %>% unnest(q),
by=c("method", "q")
) %>% select(-q)
Output:
# A tibble: 1,000 × 7
period date website method values rank tile_condition
<chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 2010END 31-12-2010 google website 7 2 25 to 75%
2 2011END 31-12-2011 google phone 18 1 more than 50%
3 2010Q1 30-04-2010 facebook website 21 2 25 to 75%
4 2011END 30-09-2010 youtube phone 15 1 more than 50%
5 2011END 30-09-2010 youtube phone 26 1 more than 50%
6 2011END 31-12-2011 google phone 3 3 lowest 25%
7 2010END 31-12-2010 google website 1 1 less than 25%
8 2010Q1 30-04-2010 facebook website 2 1 less than 25%
9 2010Q2 31-07-2010 facebook laptop 14 2 25 to 50%
10 2010Q2 31-07-2010 facebook laptop 16 1 more than 50%
# … with 990 more rows
Notice that I updated your input to 1000 rows, and random new values for the purposes of illustration. Also, notice that I fixed df2, so that method website covers the full range of values. In your example the 50% to 75% quartile is missing.
Adjusted df2 input:
structure(list(method = c("phone", "phone", "phone", "website",
"website", "website", "laptop", "laptop", "laptop"), rank = c(3,
2, 1, 3, 2, 1, 3, 2, 1), tile_condition = c("lowest 25%", "25 to 50%",
"more than 50%", "highest 25%", "25 to 75%", "less than 25%",
"lowest 25%", "25 to 50%", "more than 50%")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L))
If I've understood your question correctly, you first have to create a table to compare against, as:
df_quants <-
df1 %>%
drop_na(values) %>%
group_by(method) %>%
summarize(quant25 = quantile(values, probs = 0.25),
quant50 = quantile(values, probs = 0.5),
quant75 = quantile(values, probs = 0.75),
quant100 = quantile(values, probs = 1))
Then, using a join and a case_when statement, you would arrive at:
df2 %>%
left_join(df_quants, by = 'method') %>%
mutate(tiles =
case_when(rank < quant25 ~ 'lowest 25%',
rank < quant50 ~ '25 to 50%',
rank < quant75 ~ 'more than 50%',
rank >= quant75 ~ 'highest 25%'))
Here's a quick version. It doesn't get the exact labels you want. To do that you'd have to parse your tile_condition column, which is a little tricky.
library(santoku)
df |>
group_by(method) |>
mutate(
quantile = chop_quantiles(values, c(0.25, 0.50),
labels = c("Lowest 25%", "25 to 50%", "Above 50%"), extend = TRUE)
)

How to merge similar date in HTML using R

I have a dataframe in R which looks like.
Order Date Sell Sell_pc Order_fm mkt_pc Dealer_pc
2020-01-01 5 14.34 340 11.23 23.43
2020-01-01 1000 14.34 45900 13.23 34.45
2020-01-02 12 12.33 13 15.44 23.66
2020-01-02 13000 11.45 600000 15.21 14.44
2020-01-03 110000 12.33 31 15.34 12.34
2020-01-03 1600 11.45 18000 13.31 24.45
I want to convert the above-mentioned data frame in HTML image in R, Where I want to merge to similar date in on in the same sequence and add a light gray column for every two-column skipping next two-column.
Required output like:
I have tried:
html_image<-df %>% tableHTML(rownames = FALSE,
widths = rep(100, 6),
caption = "Order Book Reported") %>%
add_css_caption(css = list(c("font-weight", "border","font-size"),
c("bold", "1px solid black","16px")))%>%
add_css_row(css = list(c("background-color"), c("lightgray")), rows = 0:2)
The gt-package could be helpful here:
library(gt)
library(tidyverse)
df |>
mutate(Date = as.Date(Date)) |>
group_by(Date) |>
gt() |>
# gt(rowname_col = "Date") |>
tab_stubhead(label = "Date") |>
tab_header(
title = md("Order Book Reported")
) |>
tab_options(
row_group.as_column = F,
row_group.background.color = "gray",
heading.background.color = "orange",
column_labels.background.color = "orange"
) |>
tab_options(row_group.as_column = TRUE) |>
tab_style(
style = list(
cell_fill(color = "grey")
),
locations = cells_body(
rows = Date == "2020-01-02"
)
)

why my new var name show up as "... <- NULL"

I would like to creat a var named"AGEGROUP" for age group. I wrote following code. But it seems like the new variable name in the data set show up as "... <- NULL" instead of "AGEGROUP". Why? How should I fix this.
Here is my codes:
set.seed(12345)
AGE <- sample(0:110, 100, replace = TRUE)
Sample.data <-data.frame(AGE)
Sample.data <- Sample.data %>% dplyr::mutate(AGEGROUP <-cut(AGE,
right=FALSE,
breaks = c(0,1,12,17,64,1000),
labels = c("Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)", "Adolescent(13-17.999 yrs)","Adult(18-64.999 yrs.)","Elderly(65-199 yrs)")))
There seems to be a <- (assignment) instead of "=" (parameter match):
library("dplyr")
set.seed(12345)
AGE <- sample(0:110, 100, replace = TRUE)
Sample.data <-data.frame(AGE)
Sample.data <- Sample.data %>%
dplyr::mutate(AGEGROUP = cut(
AGE,
right = FALSE,
breaks = c(0, 1, 12, 17, 64, 1000),
labels = c(
"Infant(0.083-1.999 yrs)",
"Child(2-12.999 yrs)",
"Adolescent(13-17.999 yrs)",
"Adult(18-64.999 yrs.)",
"Elderly(65-199 yrs)"
)
))

how to built a new data with summarize and cut command for each age group

I want to build new data (age_summary) with a total number of people by age group. I would like to use "cut" and My codes are:
set.seed(12345)
#create a numeric variable Age
AGE <- sample(0:110, 100, replace = TRUE)
# Creat Data fame
Sample.data <-data.frame(AGE)
age_summary <- Sample.data %>% summarize(group_by(Sample.data,
cut(
AGE,
breaks=c(0,0.001, 0.083, 2, 13, 65,1000),
right=TRUE,
labels = c("Foetus(0 yr)","Neonate (0.001 - 0.082 yr)","Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)", "Adolescent(13-17.999 yrs)","Adult(18-64.999 yrs.)","Elderly(65-199 yrs)")
),"Total people" = n())
)
However my codes do not work. I am not sure what went wrong. Any suggestion on how to solve this?
Add:
I was able to get results that look like this:
is it possible for me to achieve something looks like this:
Here is what I get with adorn_totals(.) on a new data set. the total people looks OK, but the ave-age looks strange.
Any idea?
If we remove the summarise wrapping around the group_by, we can find the issue more easily. Here, the cut labels and breaks have different lengths, which can be changed if we add -Inf or Inf in breaks
library(dplyr)
Sample.data %>%
group_by(grp = cut(AGE,
breaks=c(-Inf, 0,0.001, 0.083, 2, 13, 65,1000),
right=TRUE,
labels = c("Foetus(0 yr)",
"Neonate (0.001 - 0.082 yr)","Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)", "Adolescent(13-17.999 yrs)",
"Adult(18-64.999 yrs.)","Elderly(65-199 yrs)")
)) %>%
summarise(TotalPeople = n())
If we need to create a row with different functions applied on different columns, use add_row
library(tibble)
library(tidyr)
Sample.data %>%
group_by(grp = cut( AGE, breaks=c(-Inf, 0,0.001, 0.083, 2, 13, 65,1000),
right=TRUE, labels = c("Foetus(0 yr)","Neonate (0.001 - 0.082 yr)","Infant(0.083-1.999 yrs)","Child(2-12.999 yrs)",
"Adolescent(13-17.999 yrs)","Adult(18-64.999 yrs.)","Elderly(65-199 yrs)") )) %>%
summarise(TotalPeople = n(), Ave_age=mean(AGE))%>%
complete(grp = levels(grp), fill = list(TotalPeople = 0)) %>%
add_row(grp = "Total", TotalPeople = sum(.$TotalPeople),
Ave_age = mean(.$Ave_age, na.rm = TRUE))

Formatting an ftable in R

I have the following 3 way table I created in R.
with(dataset, ftable(xtabs(count ~ dos + sex + edu)))
The output looks like
edu high low medium unknown
dos sex
five-to-ten-years female 247776 44916 127133 23793
male 225403 37858 147821 20383
five-years-or-less female 304851 58018 182152 33649
male 253977 55720 193621 28972
more-than-ten-years female 709303 452605 539403 165675
male 629162 309193 689299 121336
native-born female 1988476 1456792 2094297 502153
male 1411509 1197395 2790522 395953
unknown female 57974 75480 73204 593141
male 40176 57786 93108 605542
I want to rename the variables and format the table so that I can include it in a report. I know that I can use dnn to rename the variables, but are there any other recommendations to rename the variables? And to format the table (similar to using kable)?
You could convert the output to a text matrix using the following function, after which you can style with kable however you choose:
ftab_to_matrix <- function(ft)
{
row_vars <- attr(ft, "row.vars")
for(i in seq_along(row_vars)){
row_vars[[i]] <- c(names(row_vars[i]), row_vars[[i]])}
rowvar_widths <- sapply(row_vars, function(x) max(nchar(x))) + 1
col_vars <- attr(ft, "col.vars")
rowvar_widths <- c(1, cumsum(c(rowvar_widths, max(nchar(names(col_vars))))))
ft_text <- capture.output(print(ft))
row_cols <- sapply(seq_along(rowvar_widths)[-1], function(x)
substr(ft_text, rowvar_widths[x - 1], rowvar_widths[x]))
ft_text <- substr(ft_text, rowvar_widths[length(rowvar_widths)] + 2, 100)
ft_breaks <- c(1, cumsum(lapply(strsplit(ft_text[length(ft_text)], "\\d "),
function(x) nchar(x) + 2)[[1]]))
col_cols <- sapply(seq_along(ft_breaks)[-1], function(x)
substr(ft_text, ft_breaks[x - 1], ft_breaks[x]))
trimws(cbind(row_cols, col_cols))
}
So, for example, using my example data from your last question, you could do something like:
my_tab <- with(`3waydata`, ftable(xtabs(count ~ duration + sex + education)))
as_image(kable_styling(kable(ftab_to_df(my_tab))), file = "kable.png")
Might have been easier had you given the full picture when you asked your first question... You could use gt to make fancy tables for reports. This is an edited version more fully demonstrating some capabilities.
library(dplyr)
library(gt)
way3data <- data %>%
group_by(duration, education, sex) %>%
summarise(count = sum(number)) %>%
ungroup
# Reorder with select and Titlecase with stringr
longer <- tidyr::pivot_wider(way3data,
values_from = count,
names_from = "education") %>%
select(duration, sex, high, medium, low, unknown) %>%
rename_with(stringr::str_to_title)
# Demonstrating some of the features of gt
# obviously could have done some of this
# to the original dataframe
myresults <- longer %>%
group_by(Duration) %>%
gt(rowname_col = "Sex") %>%
row_group_order(
groups = c("native-born",
"more-than-ten-years",
"five-to-ten-years",
"five-years-or-less",
"unknown")
) %>%
tab_spanner(label = "Education",
columns = matches("High|Low|Medium|Unknown")) %>%
tab_stubhead(label = "Duration or something") %>%
tab_style(
style = cell_text(style = "oblique", weight = "bold"),
locations = cells_row_groups()) %>%
tab_style(
style = cell_text(align = "right", style = "italic", weight = "bold"),
locations = cells_column_labels(
columns = vars(High, Low, Medium, Unknown)
)) %>%
tab_style(
style = cell_text(align = "right", weight = "bold"),
locations = cells_stub()) %>%
tab_header(
title = "Fancy table of counts with Duration, Education and Gender") %>%
tab_source_note(md("More information is available at https://stackoverflow.com/questions/62284264."))
# myresults
# Can save in other formats including .rtf
myresults %>%
gtsave(
"tab_1.png", expand = 10
)
You can read about all the formatting choices here
Data compliments of Allan
set.seed(69)
data <- data.frame(education = sample(c("high","low","medium","unknown"), 600, T),
sex = rep(c("Male", "Female"), 300),
duration = sample(c("unknown", "native-born",
"five-years-or-less", "five-to-ten-years",
"more-than-ten-years"), 600, T),
number = rpois(600, 10))

Resources