Using both text and argument in same function in R

First of all, we need to create some sample data:
library(tibbletime)
date <- seq(from = as.Date("1979-09-01"), to = as.Date("2019-12-01"), by = "month")
n <- length(date)
df <- matrix(NA, n, 2)
df <- data.frame(df)
var <- sample(1000:5000, n)
df[1] <- date
df[2] <- var
names(df) <- c("date", "var")
df <- as_tbl_time(df, index = date)
How can I use the command below
filter_time(df, ~"1979-09")
with paste0 and eval(parse(text = ...))? I tried this:
filter_time(eval(parse(text = paste0("df",",", "~" , "1979-09",sep = ""))))
but it didn't work. My problem is that the time argument has to be supplied as text. If this can be made to work, it would also be possible to use it in a for loop. Thanks.

You can also accomplish this with as.formula()
filter_time(df, as.formula(paste0("~", '"', year, '"')))
Full reprex:
library(tibbletime)
date <- seq(from = as.Date("1979-09-01"), to = as.Date("2019-12-01"), by = "month")
n <- length(date)
df <- matrix(NA, n, 2)
df <- data.frame(df)
var <- sample(1000:5000, n)
df[1] <- date
df[2] <- var
names(df) <- c("date", "var")
df <- as_tbl_time(df, index = date)
year <- paste("1979-09")
paste0("~", '"', year, '"')
#> [1] "~\"1979-09\""
as.formula(paste0("~", '"', year, '"'))
#> ~"1979-09"
filter_time(df, as.formula(paste0("~", '"', year, '"')))
#> # A time tibble: 1 x 2
#> # Index: date
#>   date         var
#>   <date>     <int>
#> 1 1979-09-01  3600

We could use reformulate to construct the expression
library(tibbletime)
filter_time(df, reformulate(termlabels = dQuote("1979-09", FALSE)))
Output:
# A time tibble: 1 × 2
# Index: date
  date         var
  <date>     <int>
1 1979-09-01  1700
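Since the original question mentions using this inside a for loop, here is a minimal sketch of that step (an assumed workflow, not part of either answer): build the formula from each year-month string and collect one filtered tibble per month.
months <- c("1979-09", "1979-10", "1979-11")
out <- lapply(months, function(m) {
  filter_time(df, as.formula(paste0("~", '"', m, '"')))
})
names(out) <- months
# out is now a named list of one-month time tibbles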

Related

format not being applied to data frame

I am trying to apply formatting to a data frame created from a printed TableOne object, but it won't "stick".
Sample
library(dplyr)
library(survival)   # provides the ovarian data set
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime", "fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars, strata = "rx", data = data)
print(table1, printToggle = F, quote = F) %>% as.data.frame() %>% format(big.mark = ",")
Result:
                                    1                    2       p  test
n                                  13                   13
futime (mean (SD))  51730.77 (34688.14) 68176.92 (32467.63)  0.224
fustat = 1 (%)               7 (53.8)            5 (38.5)    0.694
This behaves similarly to simply creating my own data frame:
c(1213,2,3,1213,2,3) %>% table() %>% as.data.frame() %>% format(big.mark = ",")
Result:
      . Freq
1     2    2
2     3    2
3  1213    2
This is unlike simply using format on a numeric variable or even a one-column data frame:
123321789 %>% format(big.mark = ",")
Result:
[1] "123,321,789"
or
c(1213,2,3,1213,2,3) %>% as.data.frame() %>% format(big.mark = ",")
Result:
1 1,213
2     2
3     3
4 1,213
5     2
6     3
This is linked to the fact that table returns factors.
The examples you provided apply to numeric data.
Try:
library(dplyr)
result <- c(1213,2,3,1213,2,3) %>% table() %>% as.data.frame
class(result$.)
#> [1] "factor"
result %>% mutate_all( ~format(as.numeric(as.character(.x)),big.mark=','))
#>       . Freq
#> 1     2    2
#> 2     3    2
#> 3 1,213    2
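For what it's worth, a sketch of the same conversion with the newer across() syntax (assuming dplyr >= 1.0; mutate_all() is superseded):
result %>%
  mutate(across(everything(), ~ format(as.numeric(as.character(.x)), big.mark = ",")))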
OK, so this one led me down a rabbit hole of text formatting.
I ended up writing a function to address the matter, and also used it to handle some odd rounding issues.
# Besides tableone, the function below uses dplyr (case_when), stringr
# (str_remove, fixed) and magrittr (%<>%).
library(dplyr)
library(stringr)
library(magrittr)

styleTableOne <- function(x){
  # Plain numeric cells: just add the big mark
  if (!is.na(suppressWarnings(as.numeric(x)))) {
    return(format(as.numeric(x), big.mark = ","))
  }
  # Leave empty cells and already-formatted p-values untouched
  if (x == "")       return(x)
  if (x == "<0.001") return(x)
  if (x == "0.0")    return(x)
  if (x == " ")      return(x)
  # "mean (SD)" cells: split on "(", round each part, then reassemble
  if (length(strsplit(x, split = "(", fixed = TRUE)[[1]]) == 2) {
    set1 <- strsplit(x, split = "(", fixed = TRUE)[[1]][1] %>% as.numeric()
    set2 <- strsplit(x, split = "(", fixed = TRUE)[[1]][2] %>%
      str_remove(fixed(")")) %>% as.numeric()
    set1 <- case_when(
      set1 > 100 ~ round(set1, 0),
      set1 > 25  ~ round(set1, 1),
      TRUE       ~ round(set1, 2)
    )
    set2 <- case_when(
      set1 > 100 ~ round(set2, 0),
      set1 > 25  ~ round(set2, 1),
      TRUE       ~ round(set2, 2)
    )
    set1 %<>% format(big.mark = ",")
    set2 %<>% format(big.mark = ",")
    set <- paste(set1, set2, sep = " (")
    set <- paste0(set, ")")
    return(set)
  }
  # "median [IQR]" style cells: split on spaces, strip brackets, round
  x <- strsplit(x, split = " ", fixed = TRUE)[[1]]
  x <- subset(x, x != "")
  set1 <- as.numeric(x[1])
  set2 <- x[2] %>% str_remove(fixed("[")) %>% str_remove(fixed(",")) %>% as.numeric()
  set3 <- x[3] %>% str_remove(fixed("]")) %>% as.numeric()
  set1 <- case_when(
    set1 > 100 ~ round(set1, 0),
    set1 > 25  ~ round(set1, 1),
    TRUE       ~ round(set1, 2)
  )
  set2 <- case_when(
    set1 > 100 ~ round(set2, 0),
    set1 > 25  ~ round(set2, 1),
    TRUE       ~ round(set2, 2)
  )
  set3 <- case_when(
    set1 > 100 ~ round(set3, 0),
    set1 > 25  ~ round(set3, 1),
    TRUE       ~ round(set3, 2)
  )
  set1 %<>% format(big.mark = ",")
  set2 %<>% format(big.mark = ",")
  set3 %<>% format(big.mark = ",")
  paste0(set1, " (", set2, "-", set3, ")")
}
Then you can do:
library(survival)
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F) %>%
as.data.frame() %>%
sapply(sapply, styleTableOne) %>%
as.data.frame(row.names = row.names(print(table1)))
Result:
                                   1                 2       p  test
n                                 13                13
futime (mean (SD))   51,731 (34,688)   68,177 (32,468)   0.224
fustat = 1 (%)              7 (53.8)          5 (38.5)   0.694

How to put/save all elements of a List into one Excel sheet in R?

I have a list (bbb) with 5 elements, one element per year: 2010, 2011, ..., 2014.
The first one in the list is this:
> bbb[1]
$`2010`
                  Date      Average
X2010.01.01 2010-01-01 2.079090e-03
X2010.01.02 2010-01-02 5.147627e-04
X2010.01.03 2010-01-03 2.997464e-04
X2010.01.04 2010-01-04 1.375538e-04
X2010.01.05 2010-01-05 1.332109e-04
The second one in the list is this:
> bbb[2]
$`2011`
                  Date      Average
X2011.01.01 2011-01-01 1.546253e-03
X2011.01.02 2011-01-02 1.152864e-03
X2011.01.03 2011-01-03 1.752446e-03
X2011.01.04 2011-01-04 2.639658e-03
X2011.01.05 2011-01-05 5.231150e-03
X2011.01.06 2011-01-06 8.909878e-04
And so on.
Here is my question:
How can I save all of these list's elements in 1 sheet of an Excel file to have something like this:
Your help would be highly appreciated.
You can do this using dcast.
bbb <- list(`2010` = data.frame(date = as.Date("2010-01-01") + 0:4,
                                avg = 1:5),
            `2011` = data.frame(date = as.Date("2011-01-01") + 0:5,
                                avg = 11:16),
            `2012` = data.frame(date = as.Date("2012-01-01") + 0:9,
                                avg = 21:30),
            `2013` = data.frame(date = as.Date("2013-01-01") + 0:7,
                                avg = 21:28))
df <- do.call("rbind", bbb)
df$year <- format(df$date, format = "%Y")
df$month_date <- format(df$date, format = "%b-%d")
library(data.table)
library(openxlsx)
df_dcast <- dcast(df, month_date~year, value.var = "avg")
write.xlsx(df_dcast, "example1.xlsx")
Or using spread
library(dplyr)
library(tidyr)
df2 <- df %>%
  select(-date) %>%
  spread(key = year, value = avg)
write.xlsx(df2, "example2.xlsx")
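spread() has since been superseded by pivot_wider(); a minimal sketch of the same reshape (assuming tidyr >= 1.0, the same df as above, and an assumed file name):
df3 <- df %>%
  select(-date) %>%
  pivot_wider(names_from = year, values_from = avg)
write.xlsx(df3, "example3.xlsx")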
This isn't very pretty, but it's the best I could think of right now: you could loop through the list of data frames and join them by date, like this:
library(tidyverse)
library(lubridate)
bbb <- list(`2010` = tibble(date = c('01-01-2010', '01-02-2010', '01-03-2010', '01-04-2010', '01-05-2010'),
                            average = 11:15),
            `2011` = tibble(date = c('01-01-2011', '01-02-2011', '01-03-2011', '01-04-2011', '01-05-2011'),
                            average = 1:5),
            `2012` = tibble(date = c('01-01-2012', '01-02-2012', '01-03-2012', '01-04-2012', '01-05-2012'),
                            average = 6:10))

for (i in seq_along(bbb)) {
  if (i == 1) {
    df <- bbb[[i]] %>%
      mutate(
        date = paste(day(as.Date(date, format = '%m-%d-%Y')),
                     month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
                     sep = '-')
      )
    colnames(df) <- c('date', names(bbb[i])) # Assuming your list of dataframes has just 2 columns: date and average
  } else {
    join_df <- bbb[[i]] %>%
      mutate(
        date = paste(day(as.Date(date, format = '%m-%d-%Y')),
                     month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
                     sep = '-')
      )
    colnames(join_df) <- c('date', names(bbb[i]))
    df <- full_join(df, join_df, by = 'date')
  }
}
This loops through the list of dataframes and reformats the dates to Day-Month.
# A tibble: 5 x 4
  date  `2010` `2011` `2012`
  <chr>  <int>  <int>  <int>
1 1-Jan     11      1      6
2 2-Jan     12      2      7
3 3-Jan     13      3      8
4 4-Jan     14      4      9
5 5-Jan     15      5     10
You could then write that out with the write_xlsx() function from the writexl package.
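For example, a one-line sketch (assuming writexl is installed, the joined tibble from the loop above is df, and an assumed file name):
library(writexl)
write_xlsx(df, "all_years_one_sheet.xlsx")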

Using the pipe in selfmade function with tidyeval (quo_name)

I have two functions, date_diff and group_stat. I have read the tidyverse programming article and am trying to create simple functions and use the pipe.
The first function creates a difftime and names it timex_minus_timey, but when I pipe this result into the next function I have to look up that name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
  tibble::tibble(Hosp = rep("A", 1000),
                 drg = sample(letters[1:5], 1000, replace = TRUE),
                 time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
                 time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
  tibble::tibble(Hosp = rep("B", 1000),
                 drg = sample(letters[1:5], 1000, replace = TRUE),
                 time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
                 time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
  stamp1 <- rlang::enquo(stamp1)
  stamp2 <- rlang::enquo(stamp2)
  name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
  out <- df %>%
    dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units = units)))
  out
}
group_stat <- function(df, group_var, summary_var, .f) {
  func <- rlang::as_function(.f)
  group_var <- rlang::enquo(group_var)
  summary_var <- rlang::enquo(summary_var)
  name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
  df %>%
    dplyr::group_by(!!group_var) %>%
    dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#>   Hosp  time2_minus_time1_mean
#>   <chr>                  <dbl>
#> 1 A                       15.1
#> 2 B                       14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way, you could have date_diff add an attribute containing the new column's name and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
  stamp1 <- rlang::enquo(stamp1)
  stamp2 <- rlang::enquo(stamp2)
  name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
  out <- df %>%
    dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units = units)))
  attr(out, 'date_diff_nm') <- name
  out
}
group_stat <- function(df, group_var, summary_var, .f) {
  if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
    summary_var <- attr(df, 'date_diff_nm')
  group_var <- rlang::enquo(group_var)
  name <- paste0(summary_var, "_", deparse(substitute(.f)))
  df %>%
    dplyr::group_by(!!group_var) %>%
    dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
#   Hosp  time2_minus_time1_mean
#   <chr>                  <dbl>
# 1 A                       15.1
# 2 B                       14.9
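As a side note, newer rlang/dplyr versions make the name construction shorter with the embrace operator {{ }} and rlang::englue(). A minimal sketch of that style (assuming dplyr >= 1.0 and rlang >= 1.0; not part of the original answer):
library(dplyr)
library(rlang)

date_diff2 <- function(df, stamp1, stamp2, units = "mins") {
  # englue() deparses the embraced arguments into the new column name
  name <- englue("{{ stamp1 }}_minus_{{ stamp2 }}")
  mutate(df, "{name}" := as.numeric(difftime({{ stamp1 }}, {{ stamp2 }}, units = units)))
}

group_stat2 <- function(df, group_var, summary_var, .f) {
  name <- englue("{{ summary_var }}_{{ .f }}")
  df %>%
    group_by({{ group_var }}) %>%
    summarise("{name}" := .f({{ summary_var }}, na.rm = TRUE))
}

data %>%
  date_diff2(time2, time1) %>%
  group_stat2(Hosp, time2_minus_time1, mean)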

Find max of rows from specific columns and extract column name and corresponding row value from another column

Here is a data structure that I have:
structure(list(UDD_beta = c(1.17136554204268, 0.939587997289016
), UDD_pval = c(0, 0), UDD_R.sq = c(0.749044972637797, 0.516943886705951
), SSX_beta = c(1.05356804780772, 0.927948300464624), SSX_pval = c(0,
0), SSX_R.sq = c(0.60226298037862, 0.629111666509209), SPP_beta = c(0.675765151939885,
0.516425218613404), SPP_pval = c(0, 0), SPP_R.sq = c(0.479849538274406,
0.378266618442121), EEE_beta = c(0.690521022226874, 0.639380962824289
), EEE_pval = c(0, 0), EEE_R.sq = c(0.585610742768951, 0.676073352909597
)), .Names = c("UDD_beta", "UDD_pval", "UDD_R.sq", "SSX_beta",
"SSX_pval", "SSX_R.sq", "SPP_beta", "SPP_pval", "SPP_R.sq",
"EEE_beta", "EEE_pval", "EEE_R.sq"), row.names = c("DDK", "DDL"
), class = "data.frame")
I want to take the R.sq columns and, for each row, find the max and the column name of that max value, then take the corresponding beta. Expected output:
    Name             Value
DDK  UDD  1.17136554204268
DDL  EEE 0.690521022226874
Sorry, the second expected value should be 0.639380962824289.
We could use max.col. Subset the columns of interest, i.e. the columns that have 'R.sq', using grep, then get the column index of the max value with max.col. Use that to get the column names and also the values that correspond to a particular row (row/column indexing).
i1 <- grep("R.sq", names(df1))
i2 <- max.col(df1[i1], "first")
i3 <- grep("beta", names(df1))
res <- data.frame(Names = sub("_.*", "", names(df1)[i1][i2]),
                  Value = df1[i3][cbind(1:nrow(df1), i2)])
row.names(res) <- row.names(df1)
sub_data <- data[grep("R.sq", colnames(data))]
colnames(sub_data) <- gsub("_R.sq", "", colnames(sub_data))
sub_data$Name <- NA
sub_data$Value <- NA
for (i in 1:nrow(sub_data)){
  sub_data$Name[i] <- names(sub_data[i,])[which.max(apply(sub_data[i,], 2, max))]
  sub_data$Value[i] <- max(data[grep(paste0(sub_data$Name[i], "_beta"), colnames(data))], na.rm = T)
}
sub_data[c("Name", "Value")]
# Name Value
#DDK UDD 1.171366
#DDL EEE 0.690521
You can use a tidyverse approach by gathering your df to long format and filtering on both the R.sq variables and the max value, i.e.
library(tidyverse)
df %>%
  rownames_to_column('ID') %>%
  gather(var, val, -ID) %>%
  filter(grepl('R.sq|beta', var)) %>%
  group_by(ID) %>%
  mutate(max1 = as.integer(val == max(val[grepl('R.sq', var)]))) %>%
  group_by(ID, grp = sub('_.*', '', var)) %>%
  filter(!all(max1 == 0) & grepl('beta', var)) %>%
  ungroup() %>% select(-c(max1, grp))
which gives,
# A tibble: 2 x 3
  ID    var           val
  <chr> <chr>       <dbl>
1 DDK   UDD_beta 1.171366
2 DDL   EEE_beta 0.639381
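For completeness, a sketch of the same idea with the newer pivot_longer()/slice_max() verbs (assuming tidyr >= 1.0, dplyr >= 1.0, and the data named df1 as in the first answer; not part of the original answers):
library(dplyr)
library(tidyr)
library(tibble)

df1 %>%
  rownames_to_column("ID") %>%
  # split e.g. "UDD_R.sq" into Name = "UDD" and stat = "R.sq"
  pivot_longer(-ID, names_to = c("Name", "stat"), names_sep = "_") %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  group_by(ID) %>%
  slice_max(R.sq, n = 1) %>%
  select(ID, Name, Value = beta)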
# Need ID for all possible betas and Rsq
ID <- gsub("_R.sq", "", grep("_R.sq$", names(INPUT), value = TRUE))
dummy <- function(x) {
  # Find out which Rsq is largest
  i <- ID[which.max(x[paste0(ID, "_R.sq")])]
  # Extract beta for largest Rsq
  data.frame(Name = i, Value = x[paste0(i, "_beta")])
}
do.call("rbind", apply(INPUT, 1, dummy))

Aggregate if string contains specific text in R

I've seen a lot of posts on this topic, so apologies if this is a duplicate, but I couldn't figure out my problem.
I have
df <- data.frame(name = c('bike+ride','shoe+store','ride','mountian%20bike','ride+along'),
                 count = c(2,5,8,7,6))
and want to sum each count if its name contains a string from group
group <- data.frame(group = c('ride','bike'))
So the end result looks as follows:
Group  Count
bike       9
ride      16
Can anyone help?
A base R idea,
sapply(sapply(as.character(group$group), function(i) grep(i, df$name)), function(i) sum(df$count[i]))
#or make it a function
aggr1 <- function(var1, grp, cnt){
  m1 <- sapply(as.character(grp), function(i) grep(i, var1))
  final_d <- sapply(m1, function(i) sum(cnt[i]))
  return(data.frame(Group = names(final_d),
                    Count = as.integer(final_d),
                    stringsAsFactors = FALSE))
}
aggr1(df$name, group$group, df$count)
# Group Count
#1 ride 16
#2 bike 9
One way is
do.call(rbind, sapply(group$group, FUN = function(x, df) {
  out <- df[grepl(pattern = x, x = df$name), ]
  data.frame(group = x, count = sum(out$count))
}, df = df, simplify = FALSE))

  group count
1  ride    16
2  bike     9
In two steps:
# make a data.frame which locates where each group level is located
grp <- as.data.frame(sapply(group$group, FUN = function(x) grepl(pattern = x, x = df$name)))
names(grp) <- group$group
# based on above location (TRUE/FALSE), sum accordingly
data.frame(count = apply(grp, MARGIN = 2, FUN = function(x, df) {
sum(df[x, "count"])
}, df = df))
count
ride 16
bike 9
A way using tidyverse packages purrr, dplyr and tidyr:
library(tidyverse) # for dplyr, purrr and tidyr
library(lazyeval)  # for interp()
groups <- c('ride','bike')
map_df(groups, ~setNames(summarize_(df, interp(~sum(df$count[grepl(var, name)], na.rm = TRUE),
                                               var = .x)), .x)) %>%
  gather(group, count, na.rm = TRUE)
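summarize_() and interp() belong to the older standard-evaluation interface; a sketch of the same computation with current purrr/stringr idioms (assumed, not part of the original answer):
library(dplyr)
library(purrr)
library(stringr)

map_dfr(c("ride", "bike"), function(g) {
  tibble(group = g,
         count = sum(df$count[str_detect(as.character(df$name), fixed(g))]))
})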
