I am writing a function that will create a table from a dataframe using the kable() function, then export it to excel using the writexl library.
However, I get an error from the summarise() and group_by() functions:
#' Percentage cross-table of two columns, rendered with kable
#'
#' Counts the rows of `df` for every combination of `col1` and `col2` (rows
#' with a missing value in either column are dropped), turns the counts into
#' percentages within each level of `col2`, and spreads the levels of `col2`
#' into columns.
#'
#' @param df A data frame.
#' @param col1,col2 Column names given as strings, e.g. "column_one".
#' @return The `kable()` object for the resulting table.
show_data <- function(df, col1, col2) {
  # col1/col2 hold column *names*. Used bare, dplyr would look for columns
  # literally called "col1"/"col2" (or test the string itself for NA), so they
  # are looked up through .data[[...]] and all_of() instead.
  myTable <- df %>%
    filter(!is.na(.data[[col1]]), !is.na(.data[[col2]])) %>%
    group_by(across(all_of(c(col2, col1)))) %>%
    # keep the grouping by col2 so the percentages are computed per col2 level
    summarise(subjects = n(), .groups = "drop_last") %>%
    mutate(percent = round(subjects / sum(subjects, na.rm = TRUE) * 100, 1)) %>%
    ungroup() %>%
    select(-subjects) %>%
    pivot_wider(names_from = all_of(col2), values_from = percent)

  # The fixed three-name relabelling only makes sense for a two-level col2.
  if (ncol(myTable) != 3L) {
    stop("`", col2, "` must have exactly two non-missing levels.", call. = FALSE)
  }
  names(myTable) <- c(col1, "col", col2)

  myTable %>%
    arrange(desc(.data[[col2]])) %>%
    kable(caption = paste(col1, "_", col2))
}
Error messages:
Error: Can't extract columns that don't exist.
x Column foo doesn't exist.
and
***Error: Must group by variables found in .data.
Column columnTwo is not found.
Column columnOne is not found.***
Also, I would like to export a resulting table (via kable()) from R to Excel without having to repeat the same lines of code. Is this possible?
#' Percentage cross-table of two columns, saved to Excel and shown with kable
#'
#' Counts the rows of `df` for every combination of `col1` and `col2` (rows
#' with a missing value in either column are dropped), turns the counts into
#' percentages within each level of `col2`, and spreads the levels of `col2`
#' into columns. The table is built once and reused for both outputs.
#'
#' @param df A data frame.
#' @param col1,col2 Column names given as strings, e.g. "column_one".
#' @param path File the table is written to with `write_xlsx()`.
#' @return The `kable()` object for the resulting table; the Excel file is
#'   written as a side effect.
show_data <- function(df, col1, col2, path = "results.xlsx") {
  # col1/col2 hold column *names*. Used bare, dplyr would look for columns
  # literally called "col1"/"col2" (or test the string itself for NA), so they
  # are looked up through .data[[...]] and all_of() instead.
  myTable <- df %>%
    filter(!is.na(.data[[col1]]), !is.na(.data[[col2]])) %>%
    group_by(across(all_of(c(col2, col1)))) %>%
    # keep the grouping by col2 so the percentages are computed per col2 level
    summarise(subjects = n(), .groups = "drop_last") %>%
    mutate(percent = round(subjects / sum(subjects, na.rm = TRUE) * 100, 1)) %>%
    ungroup() %>%
    select(-subjects) %>%
    pivot_wider(names_from = all_of(col2), values_from = percent)

  # The fixed three-name relabelling only makes sense for a two-level col2.
  if (ncol(myTable) != 3L) {
    stop("`", col2, "` must have exactly two non-missing levels.", call. = FALSE)
  }
  names(myTable) <- c(col1, "col", col2)
  myTable <- arrange(myTable, desc(.data[[col2]]))

  # Export first, then return the kable so it is what the caller sees/prints.
  write_xlsx(myTable, path)
  kable(myTable, caption = paste(col1, "_", col2))
}

show_data(my_data, "column_one", "column_two")
Related
Are there any R packages that I can use to replicate the table below?
I would like a table with conditional formatting for the table values but no conditional formatting on the row and column grand totals.
The code can be used to reproduce the values in the table along with the row and column grand totals -
library(tidyverse)
# vectors
# Three month-start dates, recycled 30 times
date_vec <- as.Date(c("2022-01-01", "2022-02-01", "2022-03-01"))
dates <- rep(date_vec, times = 30)

# Group labels, built from (label, repetition count) pairs
row_groups <- rep(paste0("row_group", 1:4), times = c(20, 30, 10, 30))
col_groups <- rep(paste0("col_group", 1:4), times = c(10, 10, 30, 40))

# One row per observation
df <- tibble(dates, row_groups, col_groups)

# Column grand totals: mean count per date for each column group, and its
# share of the sum of those means
col_group_total <- df %>%
  count(dates, col_groups) %>%
  group_by(col_groups) %>%
  summarise(mean = mean(n)) %>%
  mutate(pct = mean / sum(mean))

# Row grand totals: the same computation per row group
row_group_total <- df %>%
  count(dates, row_groups) %>%
  group_by(row_groups) %>%
  summarise(mean = mean(n)) %>%
  mutate(pct = mean / sum(mean))

# Table values: mean count per date for every row/column group pair, and its
# share of the overall total
group_total <- df %>%
  count(dates, row_groups, col_groups) %>%
  group_by(row_groups, col_groups) %>%
  summarise(count = mean(n)) %>%
  ungroup() %>%
  mutate(pct = count / sum(count))

# Fill colors used for the conditional formatting
red_color <- "#f4cccc"
yellow_color <- "#f3f0ce"
green_color <- "#d9ead3"
library(janitor); library(gt)
# Cross-tabulate row groups against column groups, express every cell as a
# share of the grand total, and append a totals column.
df_tabyl <- df %>%
  tabyl(row_groups, col_groups) %>%
  adorn_percentages("all") %>%
  adorn_totals("col")

# Color scale spanning the range of the body cells only (rows 1-4, columns 2-5)
cell_palette <- scales::col_numeric(
  palette = c(red_color, yellow_color, green_color),
  domain = range(df_tabyl[1:4, 2:5])
)

# Color the four group columns, format every value column as a percentage,
# and add a "Total" summary row.
gt(df_tabyl) %>%
  data_color(columns = col_group1:col_group4, colors = cell_palette) %>%
  fmt_percent(columns = -row_groups, rows = everything()) %>%
  summary_rows(
    columns = -row_groups,
    fns = list("Total" = "sum"),
    formatter = fmt_percent
  )
The coloring differs from your example because the col_numeric function maps the colors linearly along the three provided colors, and 11% is only one third of the way between 0% and 33%. I am not sure which mapping you expect.
I was wondering if there might be a way to replace the column fpc in DATA2 with corresponding fpc obtained from DATA1?
library(tidyverse)
# Adds to every row of `data` the size (`n`) of its gender/pre cell and that
# cell's share (`fpc`) of all rows.
add_cell_share <- function(data) {
  cell_share <- data %>%
    group_by(across(all_of(c("gender", "pre")))) %>%
    summarise(n = n(), .groups = "drop") %>%
    mutate(fpc = n / sum(n))
  # no `by`: joins on all columns the two tables have in common
  right_join(cell_share, data)
}

## 10000 rows
dat <- read.csv("https://raw.githubusercontent.com/rnorouzian/d/master/su.csv")
DATA1 <- add_cell_share(dat)

## 200 rows
dat2 <- read.csv("https://raw.githubusercontent.com/rnorouzian/d/master/out.csv")
DATA2 <- add_cell_share(dat2)
You can join the data frames and use coalesce to take fpc from DATA1, falling back to the value already in DATA2 when there is no match.
library(dplyr)
# One fpc value per gender/pre cell, taken from DATA1
fpc_from_data1 <- distinct(DATA1, gender, pre, fpc)

# After the join, fpc.x is DATA2's own value and fpc.y the one from DATA1;
# coalesce() keeps DATA1's value and falls back to DATA2's when it is missing.
# The final select() restores DATA2's original columns and their order.
result <- DATA2 %>%
  left_join(fpc_from_data1, by = c("gender", "pre")) %>%
  mutate(fpc = coalesce(fpc.y, fpc.x)) %>%
  select(names(DATA2))
nrow(result)
#[1] 200
It would be more efficient to do this in data.table
library(data.table)
# Distinct gender/pre/fpc combinations from DATA1, as a data.table
dt_lookup <- as.data.table(unique(DATA1[c("gender", "pre", "fpc")]))

# Convert DATA2 in place, then overwrite its fpc with the matching value from
# the lookup (i.fpc) for every gender/pre match.
setDT(DATA2)
DATA2[dt_lookup, fpc := i.fpc, on = .(gender, pre)]
I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The first option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
# Fix the random seed so the sampled example data is reproducible.
set.seed(314)
# example data: 9,000,000 values drawn from {0, 1} with weights 5:95,
# arranged in 9 columns (so 1,000,000 rows) named step_1 ... step_9
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
# names of the step columns, in column order; used below to turn a step name
# into its position via match()
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
# Reshape to long format (one row per id/step), then per id:
#   drop   = position of the first step whose status is 0, or 99 if there is none
#   status = 0 for every step at or after `drop`, 1 otherwise
# NOTE(review): group_by(id) creates one group per original row, which is
# presumably why this does not scale well -- confirm by profiling.
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
# stacked bar per step, split by the recomputed status
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
# Compute `drop` row by row on the wide data first: the position of the first
# 0 in the row, or 99 if the row contains no 0.
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
# Then reshape to long format (keeping `drop` on every row) and recompute
# status exactly as in option 1, without any grouping.
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
# Row-wise variant: for every row index, the column position of the first 0
# in that row of `dat`, or 99 when the row contains no 0.
mutate(
  dat,
  drop2 = map_int(
    seq_len(nrow(dat)),
    function(row_idx) min(which(dat[row_idx, ] == 0L), 99L)
  )
)
It could be that "gathering and grouping" is faster than looping:
# Names of the step columns in column order. Positions are looked up by name,
# so the result does not depend on how the reshaped rows are sorted (sorting
# the names alphabetically would put step_10 before step_2).
step_cols <- dat %>%
  select(starts_with("step_")) %>%
  names()

dat %>%
  as_tibble() %>%
  select(starts_with("step_")) %>%
  mutate(row_nr = row_number()) %>%
  gather(key = "col", value = "value", -row_nr) %>%
  mutate(col_index = match(col, step_cols)) %>%
  # keep only the zeros; rows without any zero disappear here
  filter(value == 0) %>%
  group_by(row_nr) %>%
  summarise(drop3 = min(col_index)) %>%
  ungroup() %>%
  # bring back every original row; rows without a zero get drop3 = NA
  right_join(
    dat %>% mutate(row_nr = row_number()),
    by = "row_nr"
  ) %>%
  # 99L (integer) matches the integer drop3, so if_else() also works on dplyr
  # versions that require both branches to have the same type
  mutate(drop3 = if_else(is.na(drop3), 99L, drop3))
This is my code, and I have a problem with group_by:
library(dplyr)
library(lubridate)
# Read the sheet (header on row 2, date cells converted to Date) and coerce
# columns 2 to 19 to numeric.
df <- read.xlsx(
  "Data.xlsx",
  sheet = "Sector-STOXX600",
  startRow = 2,
  colNames = TRUE,
  detectDates = TRUE,
  skipEmptyRows = FALSE
)
df[2:19] <- data.matrix(df[2:19])

# Relative change between the first and the last element of x
percent_change2 <- function(x) last(x) / first(x) - 1

# One row per calendar month, one column per index.
# summarize_at() never applies the function to grouping columns, so `gr` must
# not be listed in vars(): deselecting it there fails with
# "object 'gr' not found". Only Date needs to be excluded.
monthly_return <- df %>%
  group_by(gr = floor_date(Date, unit = "month")) %>%
  summarize_at(vars(-Date), percent_change2) %>%
  ungroup() %>%
  select(-gr) %>%
  as.matrix()
Indeed I have this error :
"Error in is_character(x) : object 'gr' not found"
Here is a sample of the dataset :
Date .SXQR .SXTR .SXNR .SXMR .SXAR .SX3R .SX6R .SXFR .SXOR .SXDR .SX4R .SXRR .SXER
1 2000-01-03 364.94 223.93 489.04 586.38 306.56 246.81 385.36 403.82 283.78 455.39 427.43 498.08 457.57
2 2000-01-04 345.04 218.90 474.05 566.15 301.13 239.24 374.64 390.41 275.93 434.92 414.10 476.17 435.72
UPDATE
#' Annualized volatility of a price series, in percent
#'
#' Standard deviation of the log returns `diff(log(x))`, scaled by
#' `sqrt(trading_days)` and multiplied by 100.
#'
#' @param x Numeric vector of prices.
#' @param trading_days Number of observations per year used for annualizing;
#'   the default of 252 reproduces the previously hard-coded factor.
#' @return A single numeric value (`NA` when `x` has fewer than three values,
#'   because `sd()` needs at least two returns).
volatility_function <- function(x, trading_days = 252) {
  sqrt(trading_days) * sd(diff(log(x))) * 100
}
# One row per calendar year, one column per index.
# summarize_at() never applies the function to grouping columns, so `gr` must
# not be listed in vars(): deselecting it there fails with
# "object 'gr' not found". Only Date needs to be excluded.
annualized_volatility <- df %>%
  mutate(Date = ymd(Date)) %>%
  group_by(gr = floor_date(Date, unit = "year")) %>%
  select(gr, everything()) %>%
  summarize_at(vars(-Date), volatility_function) %>%
  ungroup() %>%
  select(-gr) %>%
  as.matrix()
head(annualized_volatility, 5)
I tried what #NeslonGon told me to do; however, I now get the same error on another function. What should I do?
The idea is that we don't need to summarise_at a grouped variable but use the Date to account for this. The select and mutate calls can be skipped. They're for convenience.
# Step 1: parse the dates and group the rows by calendar month (`gr`),
# moving the grouping column to the front.
by_month <- df %>%
  mutate(Date = ymd(Date)) %>%
  group_by(gr = floor_date(Date, unit = "month")) %>%
  select(gr, everything())

# Step 2: apply percent_change2 to every column except Date (the grouping
# column is skipped automatically), then drop `gr` and return a matrix.
by_month %>%
  summarize_at(vars(-Date), percent_change2) %>%
  ungroup() %>%
  select(-gr) %>%
  as.matrix()
I am trying to produce a formatted html table which has columns for frequency, cumulative frequency, column percentage, and cumulative column percentage. The table should also have the data subsetted by a grouping variable, and including a group total.
I can almost achieve this using a combination of dplyr and tidyr, but the output is a dataframe which doesn't look so pretty. I wonder if there is an easier way using the tables::tabulate command?
# Sample data
# 100 observations with a random two-level group and a random session count
# between 1 and 10, both stored as factors.
# NOTE(review): no seed is set in this snippet, so the data differs per run.
dat <- data.frame(
id = 1:100,
group = factor(sample(c("A", "B"), 100, replace = TRUE)),
sessions = factor(sample(1:10, 100, replace = TRUE))
)
# dplyr/tidyr solution
library(dplyr)
library(tidyr)
# Builds, per group and for all groups together, the frequency, cumulative
# frequency, percentage and cumulative percentage of each sessions value.
dat %>%
# count rows per group/sessions combination (column `n`)
group_by(group, sessions) %>%
tally() %>%
# one column per group level, holding the counts
spread(key = group, value = n) %>%
# "All" = row sum over every column except the first (sessions)
# NOTE(review): a combination that never occurs yields NA here and in the
# cumulative sums below -- confirm whether that can happen with real data
mutate(All = rowSums(.[-1])) %>%
# back to long format so that "All" is treated like one more group
gather(key = group, value = n, -sessions) %>%
group_by(group) %>%
mutate(
# running total of the counts within the group
cum_n = cumsum(n),
# share of the group's total, in percent with one decimal
p = round(n / sum(n)*100,1),
# cumulative share of the group's total, in percent with one decimal
cum_p = round(cum_n / sum(n)*100,1),
) %>%
# reshape() is the base-R function, so convert to a plain data frame first
data.frame() %>%
# wide layout: one row per sessions value, one set of columns per group
reshape(timevar = "group", idvar = "sessions", direction = "wide")
# As far as I get using tables::tabulate
library(tables)
# Partial attempt with tables::tabular(): the left-hand side of the formula
# defines the rows, the right-hand side the columns. The two commented-out
# terms mark the cumulative columns that are still missing.
tabular(
# rows: one per sessions level, labelled "Sessions"
Factor(sessions, "Sessions") ~
# columns: each group (with its heading suppressed) plus one more set for
# all rows -- presumably the "All" total; verify against the tables docs
(Heading()*group + 1) *
(
# count column labelled "n"
(n = 1) +
# (cum_n = ??) +
# column percentage, labelled "%", formatted with 2 digits
Heading("%")*Percent(denom = "col")*Format(digits = 2)
# + Heading("cum_%")*??*Format(digits = 2)
),
data = dat
)
I would recommend using knitr::kable and kableExtra, amazing packages for producing tables. You can also set it up for multiple format outputs, for example using the same code to produce html and latex for pdf.
library(dplyr)
library(tidyr)
library(knitr)
library(kableExtra)
# Builds, per group and for all groups together, the frequency, cumulative
# frequency, percentage and cumulative percentage of each sessions value,
# then renders the wide table as a styled HTML table.
dat %>%
# count rows per group/sessions combination (column `n`)
group_by(group, sessions) %>%
tally() %>%
# one column per group level, holding the counts
spread(key = group, value = n) %>%
# "All" = row sum over every column except the first (sessions)
mutate(All = rowSums(.[-1])) %>%
# back to long format so that "All" is treated like one more group
gather(key = group, value = n, -sessions) %>%
group_by(group) %>%
mutate(
# running total of the counts within the group
cum_n = cumsum(n),
# share of the group's total, in percent with one decimal
p = round(n / sum(n)*100,1),
# cumulative share of the group's total, in percent with one decimal
cum_p = round(cum_n / sum(n)*100,1),
) %>%
# reshape() is the base-R function, so convert to a plain data frame first
data.frame() %>%
# wide layout: one row per sessions value, one set of columns per group
reshape(timevar = "group", idvar = "sessions", direction = "wide") %>%
# HTML output with striped rows and hover highlighting
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover"))