Related
I'm trying to pass vectors, each with a different number of NA values, through to a map() function but it's returning an error.
I have a tibble of N numeric columns and 1 categorical column. For each numeric column, I want to compare its distributions against each other, split by the values of the categorical column. I use overlapping::overlap() to calculate the overlap of the distributions, and I feed the names of the numeric columns into map_dfr() for the iteration. For example:
# Attach dependencies with library() so that a missing package stops the
# script immediately (require() only returns FALSE and carries on).
library(overlapping)
library(dplyr)
library(purrr)

# Simulate three numeric vectors of length n from different distributions.
set.seed(1)
n <- 100
G1 <- sample(0:30, size = n, replace = TRUE)
G2 <- sample(0:30, size = n, replace = TRUE, prob = dbinom(0:30, 31, .55))
G3 <- sample(0:30, size = n, replace = TRUE, prob = dbinom(0:30, 41, .65))

# `group` has 3 * n values and therefore sets the row count; y, x, z and
# `class` are recycled by data.frame() to that length.
Data <- data.frame(
  y = G1,
  x = G2,
  z = G3,
  group = rep(c("G1", "G2", "G3"), each = n),
  class = rep(c("C1", "C2", "C3"), each = 1)
) %>%
  as_tibble()
Data
## overlap_fcn: overlap between the C1, C2 and C3 distributions of one column.
## .x: column name as a character string (the caller passes 'y', 'x', "z").
## Reads the global tibble `Data`.
## Returns the enframe()d `$OV` element of overlapping::overlap(), times 100 and
## formatted as a percentage string, with columns class (the value of .x),
## comparison and overlap.
overlap_fcn <- function(.x) {
## construct list of vectors
## NOTE: .x holds a string, so is.na(.x) below tests that string and not the
## column it names; rows with NA in the column are therefore kept.
dist_list <- list(
"C1" = Data %>%
filter(class == 'C1', !is.na(.x)) %>%
pull(.x),
"C2" = Data %>%
filter(class == 'C2', !is.na(.x)) %>%
pull(.x),
"C3" = Data %>%
filter(class == 'C3', !is.na(.x)) %>%
pull(.x)
)
## calculate distribution overlaps
return(
enframe(
overlapping::overlap(dist_list)$OV*100
) %>%
mutate(value = paste0(round(value, 2), "%"),
class = .x) %>%
rename(comparison = name, overlap = value) %>%
relocate(class)
)
}
# Run overlap_fcn() once per column name and row-bind the resulting tables.
overlap_table <- purrr::map_dfr(
  c('y', 'x', "z"),
  function(column_name) overlap_fcn(column_name)
)
overlap_table
The above works as intended. However, in practice I have different amounts of missingness in each of x, y, and z. I try to account for this with the filter on !is.na(.x), but it's not working. For example:
# Blank out a different stretch of rows in each numeric column.
Data$x[1:3] <- NA
Data$y[10:20] <- NA
Data$z[100:150] <- NA

# Same iteration as before, now over columns that contain missing values.
overlap_table <- purrr::map_dfr(
  c('x', 'y', "z"),
  function(column_name) overlap_fcn(column_name)
)
returns this error:
Error in density.default(x[[j]], n = nbins, ...): 'x' contains missing values
Error in density.default(x[[j]], n = nbins, ...): 'x' contains missing values
Traceback:
1. purrr::map_dfr(.x = c("x", "y", "z"), .f = ~overlap_fcn(.x))
2. map(.x, .f, ...)
3. .f(.x[[i]], ...)
4. overlap_fcn(.x)
5. enframe(overlapping::overlap(dist_list)$OV * 100) %>% mutate(value = paste0(round(value,
. 2), "%"), class = .x) %>% rename(comparison = name, overlap = value) %>%
. relocate(class) # at line 25-33 of file <text>
6. relocate(., class)
7. rename(., comparison = name, overlap = value)
8. mutate(., value = paste0(round(value, 2), "%"), class = .x)
9. enframe(overlapping::overlap(dist_list)$OV * 100)
10. overlapping::overlap(dist_list)
11. density(x[[j]], n = nbins, ...)
12. density.default(x[[j]], n = nbins, ...)
13. stop("'x' contains missing values")
Can anyone help me out here, please? I'm sure it's something super obvious I'm missing; I just can't see what!
Here, .x is a character string, so is.na(.x) tests the string itself rather than the column it names. We need to convert it to a symbol and unquote it (!!):
# Pairwise overlap (in percent) between the distributions of one numeric
# column of the global tibble `Data`, split by the classes C1, C2 and C3.
#
# .x: name of the column to compare, as a character string (e.g. "x").
#
# Returns a tibble with one row per entry of overlap()$OV and the columns
#   class      - the column name passed in as .x
#   comparison - the label taken from the names of overlap()$OV
#   overlap    - the overlap as a percentage string rounded to 2 decimals
overlap_fcn <- function(.x) {
  class_levels <- c("C1", "C2", "C3")

  ## construct list of vectors: one per class, missing values removed
  dist_list <- lapply(
    stats::setNames(class_levels, class_levels),
    function(level) {
      Data %>%
        # .data[[.x]] looks the column up by its string name; a bare .x is
        # the string itself, so is.na() on it would never drop a row.
        filter(class == level, !is.na(.data[[.x]])) %>%
        pull(.x)
    }
  )

  ## calculate distribution overlaps
  overlap_pct <- overlapping::overlap(dist_list)$OV * 100

  tibble::enframe(overlap_pct, name = "comparison", value = "overlap") %>%
    mutate(
      overlap = paste0(round(overlap, 2), "%"),
      class = .x
    ) %>%
    relocate(class)
}
-testing after creating the NAs in Data
> purrr::map_dfr(
+ .x = c('x', 'y', "z"),
+ .f = ~overlap_fcn(.x))
# A tibble: 9 × 3
class comparison overlap
<chr> <chr> <chr>
1 x C1-C2 98.61%
2 x C1-C3 97.46%
3 x C2-C3 97.5%
4 y C1-C2 95.47%
5 y C1-C3 96.22%
6 y C2-C3 97.14%
7 z C1-C2 90.17%
8 z C1-C3 94.9%
9 z C2-C3 89.24%
All formatting in this flextable is as desired, except that I would prefer the values in row 'n' to show no decimals. Rounding the row values does not produce the desired result.
library(flextable)

# Example data: five dated rows of random values, then an "n" row and a
# "Manster" row.
tab_tbl <- tibble(
  Date = c(
    "2000-12-27", "2000-12-28", "2000-12-29", "2000-12-30", "2000-12-30",
    "n", "Manster"
  ),
  Col1 = c(runif(5), 5, 3.75325),
  Col2 = c(runif(5), 5, 4.3892),
  Col3 = c(runif(5), 5, 5.789)
)

# Round Col1 to 3 decimals, Col2 and Col3 to 2, and row 6 ("n") to 0.
tab_tbl[, 2] <- round(tab_tbl[, 2], 3)
tab_tbl[, 3:4] <- round(tab_tbl[, 3:4], 2)
tab_tbl[6, 2:4] <- round(tab_tbl[6, 2:4], 0)

small_border <- fp_border(color = "gray", width = 1)

# Build the table: fixed column widths, centred value columns, and a rule
# under the fifth body row.
my_table <- flextable(tab_tbl)
my_table %>%
  width(j = 1:4, width = 1.2) %>%
  flextable::align(align = "center", j = c(2:4), part = "all") %>%
  hline(i = 5, border = small_border)
One solution is to use colformat_double(digits = 0):
library(flextable)

# Example data: five dated rows of random values, then an "n" row and a
# "Manster" row.
tab_tbl <- tibble(
  Date = c(
    "2000-12-27", "2000-12-28", "2000-12-29", "2000-12-30", "2000-12-30",
    "n", "Manster"
  ),
  Col1 = c(runif(5), 5, 3.75325),
  Col2 = c(runif(5), 5, 4.3892),
  Col3 = c(runif(5), 5, 5.789)
)

# Round Col1 to 3 decimals, Col2 and Col3 to 2, and row 6 ("n") to 0.
tab_tbl[, 2] <- round(tab_tbl[, 2], 3)
tab_tbl[, 3:4] <- round(tab_tbl[, 3:4], 2)
tab_tbl[6, 2:4] <- round(tab_tbl[6, 2:4], 0)

small_border <- fp_border_default(color = "gray")

# Same layout as before; the final step formats the double columns of the
# row where Date == "n" with zero digits.
my_table <- flextable(tab_tbl)
my_table %>%
  width(j = 1:4, width = 1.2) %>%
  flextable::align(align = "center", j = c(2:4), part = "all") %>%
  hline(i = 5, border = small_border) %>%
  colformat_double(i = ~ Date == "n", digits = 0)
I have created a function for getting a summary of the average and percentiles, but now I want that summary for particular subsets, so I have created the subsets accordingly.
However, my function is not working properly.
I am trying to update my function so that I get one summary per element of a list of data sets, labelled with the element's name, with the summaries for all elements combined via rbind.
I have no idea how I can pass "ALL" and "MM" as the variable names in my function
so that the summaries for both are row-bound automatically.
# Example sales data: one row per sale, with city and department.
df <- data.frame(
  Name = c(
    "asdf", "kjhgf", "cvbnm", "rtyui", "cvbnm", "jhfd",
    "cvbnm", "sdfghj", "cvbnm", "dfghj", "cvbnm"
  ),
  sale = c(27, 28, 27, 16, 14, 25, 14, 14, 19, 18, 28),
  city = c("CA", "TX", "MN", "NY", "TX", "MT", "HU", "KL", "TX", "SA", "TX"),
  Dept = c("HH", "MM", "NN", "MM", "AA", "VV", "MM", "HU", "JJ", "MM", "ZZ")
)

# df1 is the untouched copy; the flag columns are added to df afterwards.
df1 <- df
df$cc1 <- 1

# df2 keeps only the 'MM' rows (taken after cc1 was added, before cc2).
df2 <- df[df$Dept == 'MM', ]
df$cc2 <- ifelse(df$Dept == 'MM', 1, NA)

lst <- list(df$cc1, df$cc2)
listd <- list("ALL" = df1, "MM" = df2)
#I want to run my function for listd so that i can get a combined summary for all variables in listd
# tt2: for each element of the named list `data`, print a "#### <name>" heading
# and build a summary table with t1(); finally print `tt` and a blank line.
# var, footer, decimal and Name_of_variable are passed on to t1() unchanged.
# NOTE(review): knit_print() is not defined or attached in the visible code -
# presumably knitr::knit_print; confirm.
tt2<-function(data,var,footer,Name_of_variable,decimal){
for (d in 1:length(data)) {
cat('\n\n#### ', names(data)[d], '\n\n')
md<-data[[d]]
# table_list is re-created on every pass, so it only holds the current data set.
table_list<-list()
# d is a single index, so length(d) is 1 and the loop below runs exactly once.
for (i in 1:length(d))
table_list[[i]]<-t1(md,var,footer,decimal,Name_of_variable)
# tt is overwritten on every pass; after the loop it holds the last data set only.
tt<- do.call(rbind,table_list)
}
cat(knit_print(tt))
cat('\n\n')
}
# t1: one-row summary of a column: 25th percentile, median, mean,
# 75th percentile (type-6 quantiles) and the number of non-missing values.
# dataset:          data frame to summarise
# var:              column given as a string; parsed with rlang::parse_expr()
# Suff:             not used in the body
# decimal:          TRUE -> values rounded/formatted to 1 decimal, otherwise 0
# Name_of_variable: label written to the " " column of summ_tab
t1<-function(dataset,var,Suff,decimal,Name_of_variable){
numdig <- if (decimal == TRUE) {1} else {0}
var <- rlang::parse_expr(var)
# Rows with a missing value in `var` are dropped before summarising.
summ_tab1<- dataset %>% filter(!is.na(!!var)) %>% summarise(
q25 = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = numdig),nsmall = numdig),
Median = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = numdig),nsmall = numdig),
Average = format(round( mean(!! var, na.rm=TRUE),digits = numdig),nsmall = numdig),
q75 = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = numdig) ,nsmall = numdig),
N = sum(!is.na(!!var)))
# summ_tab adds the label column, renames the percentile columns and moves
# the label and N to the front.
summ_tab<-summ_tab1 %>%
mutate(" "=!!Name_of_variable,
q25 = q25,
Median =Median,
Average =Average,
q75 = q75)%>%
dplyr::rename(
`25th percentile` = q25,
`75th percentile` = q75)%>%select(" ",N,everything())
# NOTE: the value returned is summ_tab1 (no label column, original column
# names), not the summ_tab built above.
summ_tab1
}
# Example call; `footer` is not supplied here.
tt2(data = listd,var = "sale",Name_of_variable = "listd",decimal = TRUE)
Previously I was getting a summary like the one below,
but now the output should have the name of each data set in its own row.
I've slightly rewritten your t1 function and make use of the fact that it returns a dataframe. This can be used together with purrr::map_dfr:
library(dplyr)
# Example sales data: one row per sale, with city and department.
df <- data.frame(
  Name = c(
    "asdf", "kjhgf", "cvbnm", "rtyui", "cvbnm", "jhfd",
    "cvbnm", "sdfghj", "cvbnm", "dfghj", "cvbnm"
  ),
  sale = c(27, 28, 27, 16, 14, 25, 14, 14, 19, 18, 28),
  city = c("CA", "TX", "MN", "NY", "TX", "MT", "HU", "KL", "TX", "SA", "TX"),
  Dept = c("HH", "MM", "NN", "MM", "AA", "VV", "MM", "HU", "JJ", "MM", "ZZ")
)

# df1 is the untouched copy; the flag columns are added to df afterwards.
df1 <- df
df$cc1 <- 1

# df2 keeps only the 'MM' rows (taken after cc1 was added, before cc2).
df2 <- df[df$Dept == 'MM', ]
df$cc2 <- ifelse(df$Dept == 'MM', 1, NA)

lst <- list(df$cc1, df$cc2)
listd <- list("ALL" = df1, "MM" = df2)
# Summarise one column of `dataset` in a single row.
#
# dataset: data frame to summarise.
# var:     column given as a string; parsed with rlang::parse_expr().
# decimal: TRUE -> values rounded and formatted to 1 decimal, otherwise 0.
#
# Returns a data frame with the formatted columns `25th percentile`, Median,
# Average and `75th percentile` (type-6 quantiles) plus N, the number of
# non-missing values.
t1 <- function(dataset, var, decimal) {
  numdig <- if (decimal == TRUE) 1 else 0
  var <- rlang::parse_expr(var)

  # Round to `numdig` digits and pad the text to the same number of decimals.
  as_text <- function(value) {
    format(round(value, digits = numdig), nsmall = numdig)
  }

  # k-th element of the type-6 quantiles at 0%, 25%, 50%, 75%, 100%.
  quartile <- function(values, k) {
    quantile(values, type = 6, probs = seq(0, 1, 0.25), na.rm = TRUE)[k]
  }

  complete_rows <- filter(dataset, !is.na(!!var))

  summarise(
    complete_rows,
    `25th percentile` = as_text(quartile(!!var, 2)),
    Median = as_text(quartile(!!var, 3)),
    Average = as_text(mean(!!var, na.rm = TRUE)),
    `75th percentile` = as_text(quartile(!!var, 4)),
    N = sum(!is.na(!!var))
  )
}
# Summarise `sale` in every data set of listd; the list names end up in the
# " " identifier column.
purrr::map_dfr(
  listd,
  function(dataset) t1(dataset = dataset, var = "sale", decimal = TRUE),
  .id = " "
)
#> 25th percentile Median Average 75th percentile N
#> 1 ALL 14.0 19.0 20.9 27.0 11
#> 2 MM 14.5 17.0 19.0 25.5 4
Created on 2020-09-23 by the reprex package (v0.3.0)
I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
user system elapsed
3.73 0.83 4.55
If possible, I would like to reduce the elapsed time to <= 1 sec and increase n to 720000 matrices. So, I am looking for a way to optimize section 3 only. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1

## Stack the first row of every matrix into one matrix (one row per matrix)
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1, ]))
dt_t <- fillMatrices(
  dt_t_1 = dt_t_1,
  species = c("a_M", "b_M", "c_M"),
  maxDuration = 5,
  matrixColumns = col_mat
)

## Fill the matrices within the list: column i of dt_t becomes the next row
## of matrix i. seq_len() keeps the loop empty when n is 0.
system.time(for (i in seq_len(n)) {
  list_matrices[[i]][indexTime + 1, ] <- dt_t[, i]
})
## test <- list_matrices[[1]]
The code of the section 1 is used to initialize the matrices and the function fillMatrices can be found in the section 2. In my example, the function is used to fill matrices for one species. In reality, the function is used for 3 species (i.e., is applied three times) by changing the argument species = c("a_M", "b_M", "c_M"). How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
## Column labels "<day>_day(s)|<species>" for days 0..5 and the three species
v_date <- as.vector(outer(
  c(
    paste(seq(0, 1, by = 1), "day", sep = "_"),
    paste(seq(2, 5, by = 1), "days", sep = "_")
  ),
  c("a_M", "b_M", "c_M"),
  paste,
  sep = "|"
))
col_mat <- c("year", "day", "time", "ID", "died", v_date)

## Preallocate the list instead of growing it one element at a time
list_matrices <- vector("list", n)
for (i in seq_len(n)) {
  list_matrices[[i]] <- ff(
    -999,
    dim = c(3650, length(col_mat)),
    dimnames = list(NULL, col_mat),
    vmode = "double",
    overwrite = TRUE
  )
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for (i in seq_len(n)) {
  list_matrices[[i]][1, ] <- c(1, 1, 1, i - 1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(
  year = rep(1, n),
  day = rep(1, n),
  time = rep(1, n),
  died = rep(0, n),
  ID = (seq(1, n, 1)) - 1,
  a_M = sample(1:10, n, replace = TRUE),
  b_M = sample(1:10, n, replace = TRUE),
  c_M = sample(1:10, n, replace = TRUE)
))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
## x: data frame or tibble whose first column holds the row labels and whose
##    remaining columns hold values that as.numeric() can convert.
## Returns a numeric matrix built from every column except the first.
convertDFToMat <- function(x) {
  mat <- as.matrix(x[, -1])
  ## Use the first column as row labels. [[1]] yields a plain vector for both
  ## data frames and tibbles; a plain if/else replaces ifelse(), which is
  ## meant for vectors and not for choosing between two assignments.
  rownames(mat) <- if (is.data.frame(x)) x[[1]] else x[, 1]
  ## Convert character matrix into numeric matrix
  ## (as.numeric() returns unnamed vectors, so apply() keeps the column names
  ## but not the row labels in the result)
  mat <- apply(mat, 2, as.numeric)
  mat
}
## Define the function that is used to fill the matrices within the list
## fillMatrices: from one row per individual, build the values of the next row.
## dt_t_1:        matrix/data frame with one row per individual, including an "ID" column
## species:       names of the three species columns (renamed internally to a, b, c)
## maxDuration:   rows whose day, after being shifted by one, is not below maxDuration + 1 are dropped
## matrixColumns: column names used to order the rows of the result
## Returns the numeric matrix produced by convertDFToMat(): rows ordered to
## follow matrixColumns, one column per ID found in `ind`.
## NOTE: reads the global matrix `ind`; it is not passed in as an argument.
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
## Format data
## Long format: one row per (ID, original column), every value as character
dt <- as.data.frame(dt_t_1) %>%
reshape::melt(id = c("ID")) %>%
arrange(ID) %>%
dplyr::mutate_all(as.character)
## summary(dt)
## Break out the variable "variable" into different columns, with one row for each individual-day
## day = text before the first "_" of the column name, col = text after the "|"
dt_reshape_filter_1 <- dt %>%
dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .), col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
dplyr::select(-variable) %>%
tidyr::spread(col, value) %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::arrange(ID, day)
## summary(dt_reshape_filter_1)
## Apply requested transformations and build the data frame
## Shift every day forward by one, drop the days beyond maxDuration, append a
## day-0 row per individual taken from `ind` (a_M and b_M), then recompute c
## as a + b for all rows before restoring the species names
dt_transform <- dt_reshape_filter_1 %>%
dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
dplyr::mutate(day = day + 1) %>%
dplyr::filter(day < maxDuration + 1) %>%
dplyr::bind_rows(tibble(ID = ind[,c("ID")], day = 0, a = ind[,c("a_M")], b = ind[,c("b_M")])) %>%
dplyr::mutate(c = a + b) %>%
dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
dplyr::arrange(ID, day)
## summary(dt_transform)
## Take different columns of the data frame and gather them into a single column
## Labels are rebuilt as "<day>_day|<species>" for days 0 and 1 and "<day>_days|<species>" above 1
dt_gather <- dt_transform %>%
tidyr::gather(variable, value, species) %>%
dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
dplyr::rename(var2 = ID) %>%
dplyr::mutate_all(as.character)
## summary(dt_gather)
## Add the other columns in the data frame and convert the resulting data frame into a matrix
## The year/day/time/ID/died values are overwritten with the matching columns of `ind`
dt_reshape_filter_2 <- dt %>%
dplyr::rename(var2 = ID) %>%
dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
tidyr::spread(variable, value) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate(year = ind[,c("year")],
day = ind[,c("day")],
time = ind[,c("time")],
ID = ind[,c("ID")],
died = ind[,c("died")]) %>%
tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate_all(as.character)
## summary(dt_reshape_filter_2)
## Build the output matrix
## One row per variable (ordered like matrixColumns), one column per ID in `ind`
dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
tidyr::spread(var2, value) %>%
dplyr::arrange(match(variable, matrixColumns)) %>%
dplyr::select("variable", as.character(ind[,c("ID")]))
## summary(dt_bind)
dt_mat <- convertDFToMat(dt_bind)
## summary(dt_mat)
return(dt_mat)
}
Making a 3D array instead of a 2D list of matrices gives you more options
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
## Column labels "<day>_day(s)|<species>" for days 0..5 and the three species
v_date <- as.vector(outer(
  c(
    paste(seq(0, 1, by = 1), "day", sep = "_"),
    paste(seq(2, 5, by = 1), "days", sep = "_")
  ),
  c("a_M", "b_M", "c_M"),
  paste,
  sep = "|"
))
col_mat <- c("year", "day", "time", "ID", "died", v_date)

## One 3D array (row x column x individual) instead of a list of matrices
arr1 <- ff(
  -999L,
  dim = c(n_row, length(col_mat), n),
  dimnames = list(NULL, col_mat, NULL)
)
## Fill the first row of each matrix slice; the ID column (4th) is set to NA
## here and filled in on the next line
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n) - 1L
## Build the matrix "individual"
ind <- as.matrix(data.frame(
  year = rep(1L, n),
  day = rep(1L, n),
  time = rep(1L, n),
  died = rep(0L, n),
  ID = (seq(1L, n, 1L)) - 1L,
  a_M = sample(1L:10L, n, replace = TRUE),
  b_M = sample(1L:10L, n, replace = TRUE),
  c_M = sample(1L:10L, n, replace = TRUE)
))
##fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(
  dt_t_1 = t(arr1[1, , ]),
  species = c("a_M", "b_M", "c_M"),
  maxDuration = 5,
  matrixColumns = col_mat
)
## reassign: a single assignment writes the next row of every slice
system.time(
  arr1[indexTime + 1, , ] <- dt_t
)
user system elapsed
0.05 0.70 0.7
# for comparison
#> system.time(for(i in 1:n){
#+ list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
#+ })
# user system elapsed
# 4.75 1.08 5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.
I am struggling to write a function fun2 that uses fun1, and I keep getting errors. I have written a simplified example below. This is the first time I have dealt with "tidy evaluation", and I am not sure I understand the ins and outs of it.
Example dataframes:
# Expression values, several per sample ID.
d1 <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C", "C", "C", "C"),
  EXPR = c(2, 8, 3, 5, 7, 20, 1, 5, 4)
)

# One row of sample-level data per ID.
d2 <- data.frame(
  ID = c("A", "B", "C"),
  NUM = c(22, 50, 31)
)
First function
# Classify the values of one column against a threshold and summarise the
# result per sample.
#
# df1:      data frame holding the value column and the sample column.
# df2:      data frame joined onto the summary by the sample column.
# t1:       threshold; values greater than t1 are "positive", others "negative".
# expr_col: unquoted name of the value column in df1.
# id_col:   unquoted name of the sample column (present in df1 and df2).
#
# Returns one row per sample and class with count, total and freq, followed
# by the columns of df2.
fun1 <- function(
    df1 = "df 1",
    df2 = "df 2",
    t1 = "threshold 1",
    expr_col = "expr column",
    id_col = "sample column - must be present in df1 and df2") {
  # capture the unquoted column arguments
  id_quo <- enquo(id_col)
  expr_quo <- enquo(expr_col)

  # classify every row against the threshold
  classified <- df1 %>%
    mutate(
      threshold = t1,
      class = ifelse(!!expr_quo > t1, "positive", "negative"),
      class = factor(class, levels = c("positive", "negative"))
    )

  # count per sample and class, fill absent classes with 0, then derive the
  # per-sample total and frequency
  per_sample <- classified %>%
    group_by(!!id_quo, class) %>%
    summarise(count = n()) %>%
    complete(class, fill = list(count = 0)) %>%
    mutate(total = sum(count), freq = count / total)

  # attach the sample-level data
  left_join(per_sample, df2, by = quo_name(id_quo))
}
If I run a test of this, I get a dataframe in return, as expected
# Classify EXPR against a threshold of 3, per sample ID.
test <- fun1(d1, d2, t1 = 3, expr_col = EXPR, id_col = ID)
Second function
Now with fun2, I am trying to use fun1 in a for loop to iterate from ti to tf of the seq vector:
# fun2: intended to call fun1 once per threshold in seq(ti, tf, by = res) and
# stack the results with plyr::ldply().
# NOTE: enquo() is applied below to d1, d2, t1, EXPR and ID, none of which is
# an argument of fun2 (its arguments are df1, df2, expr_col, id_col, ti, tf,
# res), and fun1 is then handed the var1..var5 objects in place of the data
# frames, the threshold and the column names.
fun2 <- function(
df1 = "df 1",
df2 = "df 2",
expr_col = "expr column",
id_col = "sample column - must be present in df1 and df2",
ti = "initial value",
tf = "final value",
res = "resolution") {
# define variables for fun1
var1 <- enquo(d1)
var2 <- enquo(d2)
var3 <- enquo(t1)
var4 <- enquo(EXPR)
var5 <- enquo(ID)
# get sequence of values
seq <- seq(from = ti, to = tf, by = res)
# open list
t.list <- list()
# Loop ----
for (i in seq_along(seq)){
# t1 is assigned here, after enquo(t1) was already taken above
t1 <- seq[i]
t.list[[i]] <- fun1(df1 = var1,
df2 = var2,
t1 = var3,
expr_col = var4,
id_col = var5)
}
df.out <- plyr::ldply(t.list, rbind)
### Return ---
return(df.out)
}
But if I run this
# Thresholds 1 to 10 in steps of 1.
test <- fun2(d1, d2, expr_col = EXPR, id_col = ID, ti = 1, tf = 10, res = 1)
I get an error message
Error in (function (x) : object 'EXPR' not found
I tried various things... and I am kind of stuck here. I guess I am not using enquo() properly. I can get it to work by not using varX and putting directly the actual appropriate name of each element in the fun1 arguments, but the whole point of doing this, to me, is to make it "generalisable" and therefore specify the arguments only in fun2 which will then be passed to fun1.
Any help would be greatly appreciated.
Many thanks for your answer aosmith. I am now sorted using the following code:
# Run fun1 for every threshold between ti and tf and stack the results.
#
# df1, df2: data frames passed on to fun1 unchanged.
# expr_col: unquoted name of the value column, forwarded to fun1.
# id_col:   unquoted name of the sample column, forwarded to fun1.
# ti, tf:   first and last threshold.
# res:      step between consecutive thresholds.
#
# Returns the per-threshold results of fun1 combined by plyr::ldply().
fun2 <- function(
    df1 = "df 1",
    df2 = "df 2",
    expr_col = "expr column",
    id_col = "sample column - must be present in df1 and df2",
    ti = "initial value",
    tf = "final value",
    res = "resolution") {
  # capture the column arguments so they can be unquoted in the fun1 call
  expr_quo <- enquo(expr_col)
  id_quo <- enquo(id_col)

  # thresholds to evaluate
  thresholds <- seq(from = ti, to = tf, by = res)

  # one fun1 result per threshold
  per_threshold <- lapply(thresholds, function(threshold) {
    fun1(
      df1 = df1,
      df2 = df2,
      t1 = threshold,
      expr_col = !!expr_quo,
      id_col = !!id_quo
    )
  })

  # stack the per-threshold tables
  plyr::ldply(per_threshold, rbind)
}

# TEST FUN2
test <- fun2(
  df1 = d1, df2 = d2,
  expr_col = EXPR, id_col = ID,
  ti = 1, tf = 10, res = 1
)