add summary `n` from one dataframe to another dataframe (tidyverse) - r

I was wondering if there might be a way to replace the column fpc in DATA2 with corresponding fpc obtained from DATA1?
library(tidyverse)
dat <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/su.csv')
## 10000 rows ################
DATA1 <- dat %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat)
dat2 <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/out.csv')
## 200 rows #################
DATA2 <- dat2 %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat2)

You can join the dataframe and use coalesce to select fpc from DATA2.
library(dplyr)
result <- DATA2 %>%
left_join(DATA1 %>% distinct(gender, pre, fpc),
by = c('gender', 'pre')) %>%
mutate(fpc = coalesce(fpc.y, fpc.x)) %>%
select(names(DATA2))
nrow(result)
#[1] 200

It would be more efficient to do this in data.table
library(data.table)
setDT(DATA2)[as.data.table(unique(DATA1[c('gender', 'pre', 'fpc')])),
fpc := i.fpc, on = .(gender, pre)]

Related

How do I create a table in R with conditional formatting and row and column totals?

Are there any R packages that I use to replicate the table below -
I would like a table with conditional formatting for the table values but no conditional formatting on the row and column grand totals.
The code can be used to reproduce the values in the table along with the row and column grand totals -
library(tidyverse)
# vectors
dates <- rep(date_vec <- c(as.Date("2022-01-01"), as.Date("2022-02-01"), as.Date("2022-03-01")), 30)
row_groups <- c(rep("row_group1", 20), rep("row_group2", 30), rep("row_group3", 10), rep("row_group4", 30))
col_groups <- c(rep("col_group1", 10), rep("col_group2", 10), rep("col_group3", 30), rep("col_group4", 40))
# dataframe
df <- tibble(dates, row_groups, col_groups)
# column grand totals
col_group_total <- df %>%
group_by(dates, col_groups) %>%
count() %>%
group_by(col_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))
# row grand totals
row_group_total <- df %>%
group_by(dates, row_groups) %>%
count() %>%
group_by(row_groups) %>%
summarise(mean = mean(n)) %>%
mutate(pct = mean/sum(mean))%>%
ungroup()
# table values
group_total <- df %>%
group_by(dates, row_groups, col_groups) %>%
count() %>%
group_by(row_groups, col_groups) %>%
summarise(count = mean(n)) %>%
ungroup() %>%
mutate(pct = count/sum(count))%>%
ungroup()
red_color <- "#f4cccc"
yellow_color <- "#f3f0ce"
green_color <- "#d9ead3"
library(janitor); library(gt)
df %>%
tabyl(row_groups, col_groups) %>%
adorn_percentages("all") %>%
adorn_totals(c("col")) -> df_tabyl
gt(df_tabyl) %>%
data_color(columns = col_group1:col_group4,
colors = scales::col_numeric(
palette = c(red_color, yellow_color, green_color),
domain = range(df_tabyl[1:4,2:5])
)
) %>%
fmt_percent(columns = -row_groups,
rows = everything()) %>%
summary_rows(
columns = -row_groups,
fns = list("Total" = "sum"),
formatter = fmt_percent
)
The coloring varies with your example b/c the col_numeric function maps the colors linearly along the three provided colors, and 11% is only 1/3 of the way between 0% and 33%. Not sure what approach you expect.

Use dplyr to get index of first column with certain value per group or row

I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The firs option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
set.seed(314)
# example data
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
dat %>%
mutate(drop2 = map_int(seq_len(nrow(dat)), ~ min(which(dat[.x, ] == 0L), 99L)))
It could be that "gathering and grouping" is faster than Looping:
dat %>%
as_tibble() %>%
select(starts_with("step_")) %>%
mutate(row_nr = row_number()) %>%
gather(key = "col", value = "value", -row_nr) %>%
arrange(row_nr, col) %>%
group_by(row_nr) %>%
mutate(col_index = row_number()) %>%
filter(value == 0) %>%
summarise(drop3 = min(col_index)) %>%
ungroup() %>%
right_join(dat %>%
mutate(row_nr = row_number()),
by = "row_nr") %>%
mutate(drop3 = if_else(is.na(drop3), 99, drop3))

How do I parameterize the column names in dplyr function calls

Suppose we have the following setup
library(dplyr)
set.seed(10101)
id <- sample(3,20,replace = TRUE)
x <- sample(2,20,replace = TRUE)
df <- data.frame(id,x)
How do I parameterize the following:
df %>% group_by(id) %>% arrange(id) %>% mutate(x.lag=lag(x,1,default=0))
cl <- "x"
cl.lag <- "x.lag.1"
my naive attempt does not seem to work:
df %>% group_by(id) %>% arrange(id) %>% mutate(cl.lag=lag(cl,1,default=0))

Mutating values of subset of columns into percentage format

I have generated this summary table based on the df below.
set.seed(1)
df <- data.frame(rep(
sample(c(2012,2016),10, replace = T)),
sample(c('Treat','Control'),10,replace = T),
runif(10,0,1),
runif(10,0,1),
runif(10,0,1))
colnames(df) <- c('Year','Group','V1','V2','V3')
summary.table = df %>%
group_by(Year, Group) %>%
group_by(N = n(), add = TRUE) %>%
summarise_all(funs(sd,median)) %>%
ungroup %>%
mutate(Year = ifelse(duplicated(Year),"",Year))
Is there a way I could display the values related to the median columns as percentages?
I did not know how to use mutate() and scales::percent() for only a subset of columns (I dont want to do it individually, since there will be more columns in the original dataset, making this procedure not practical enough.
What should I have done instead if I wanted to mutate according to a subset of rows?
Thank you
EDIT:
And if it was like this?
summary.table = df %>%
group_by(Year, Group) %>%
summarise_all(funs(median,sd)) %>%
gather(key, value, -Year, -Group) %>%
separate(key, into=c("var", "stat")) %>%
unite(stat_Group, stat, Group) %>%
spread(stat_Group, value) %>%
ungroup %>%
mutate(Year = ifelse(duplicated(Year),"",Year))
We need to use the percent wrapped on median
summary.table <- df %>%
group_by(Year, Group) %>%
group_by(N = n(), add = TRUE) %>%
summarise_all(funs(sd=sd(.),median=scales::percent(median(.)))) %>%
ungroup %>%
mutate(Year = ifelse(duplicated(Year),"",Year))

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources