This is the data frame I'm trying to work on:
m <- matrix(rnorm(108, mean = 5000, sd = 1000), nrow = 36)
colnames(m) <- paste('V', 1:3, sep = '')
df <- data.frame(type = factor(rep(c('T1', 'T2', 'T3', 'T4', 'T5',
'T6', 'T7', 'T8', 'T9'), each = 4)),
treatment = factor(rep(rep(c('C','P', 'N', 'S'), each = 1),
9)),
as.data.frame(m))
I want to know how can I perform a t-test between the rows within each "type". Here's an example of t-tests for type T1 I want:
t.test(df[1,3:5], df[2, 3:5])
t.test(df[1,3:5], df[3, 3:5])
t.test(df[1,3:5], df[4, 3:5])
t.test(df[1,3:5], df[3, 3:5])
t.test(df[1,3:5], df[4, 3:5])
I'm trying to figure out how can I loop through all rows and get all the p-values from the t-test (along with the type and treatment for identification), instead of calculating each row manually. Any help or suggestion would be greatly appreciated.
Something like this:
library(dplyr)
t_tests = df %>%
split(.$type) %>%
lapply(function(x){
t(x[3:5]) %>%
data.frame %>%
setNames(x$treatment) %>%
combn(2, simplify = FALSE) %>%
lapply(function(x){
data.frame(treatment = paste0(names(x), collapse = ", "),
p_value = t.test(x[,1], x[,2])$p.value)
}) %>%
do.call(rbind, .)
}) %>%
do.call(rbind, .) %>%
mutate(type = sub("[.].+", "", row.names(.)))
Result:
> head(t_tests, 10)
treatment p_value type
1 C, P 0.6112274 T1
2 C, N 0.6630060 T1
3 C, S 0.5945135 T1
4 P, N 0.9388568 T1
5 P, S 0.8349370 T1
6 N, S 0.9049995 T1
7 C, P 0.3274583 T2
8 C, N 0.9755364 T2
9 C, S 0.7391661 T2
10 P, N 0.3177871 T2
Edits (Added an extra level "file" to the dataset):
library(dplyr)
t_tests = df %>%
split(.$file) %>%
lapply(function(y){
split(y, y$type) %>%
lapply(function(x){
t(x[4:6]) %>%
data.frame %>%
setNames(x$treatment) %>%
combn(2, simplify = FALSE) %>%
lapply(function(x){
data.frame(treatment = paste0(names(x), collapse = ", "),
p_value = t.test(x[,1], x[,2])$p.value)
}) %>%
do.call(rbind, .)
}) %>%
do.call(rbind, .) %>%
mutate(type = sub("[.].+", "", row.names(.)))
}) %>%
do.call(rbind, .) %>%
mutate(file = sub("[.].+", "", row.names(.)))
Result:
treatment p_value type file
1 C, P 0.3903450 T1 file1
2 C, N 0.3288727 T1 file1
3 C, S 0.0638599 T1 file1
4 P, N 0.6927599 T1 file1
5 P, S 0.1159615 T1 file1
6 N, S 0.2184015 T1 file1
7 C, P 0.1147805 T2 file1
8 C, N 0.4961888 T2 file1
9 C, S 0.9048607 T2 file1
10 P, N 0.4203666 T2 file1
11 P, S 0.3425908 T2 file1
12 N, S 0.7262478 T2 file1
13 C, P 0.6300293 T3 file1
14 C, N 0.8255837 T3 file1
15 C, S 0.7140522 T3 file1
16 P, N 0.4768694 T3 file1
17 P, S 0.3992130 T3 file1
18 N, S 0.8740219 T3 file1
19 C, P 0.2434270 T4 file1
20 C, N 0.2713622 T4 file1
Note about edit:
OP wanted an extra top level file to be added to the data, one can simply add another split + lapply and do.call at the end.
New Data:
m <- matrix(rnorm(324, mean = 5000, sd = 1000), nrow = 108)
colnames(m) <- paste('V', 1:3, sep = '')
df <- data.frame(type = factor(rep(c('T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9'), each = 4)),
treatment = factor(rep(rep(c('C','P', 'N', 'S'), each = 1), 9)),
file = factor(rep(c("file1", "file2", "file3"), each = 36)),
as.data.frame(m))
Related
I am looking to hide values from the output table if the frequency of data in the respective variable is less than 4 .
lets say if the number of records in column hp, mpg, qsec is less than 4 than the mean or median should be masked with "--"
i am trying like below but not working showing some error due to NA in database
library(expss)
data <- data.frame(
gender = c(1, 2, 1),
sector = c(3, 3, 1),
col1 = c(12, 15, 22),
col2 = c(33, NA, 41),
col3 = c(1, 1, 0),
col4 = c(NA,NA,NA),
col5 = c(1, 2, 1)
)
data$col3 <- factor(data$col3, levels = 1, labels = "Management")
data$col4 <- factor(data$col4, levels = 1, labels = "HR")
lst <- list(data$col4,data$col3)
fun1 <- function(dataset,var_list,banner1){
perc_25 <- function(x, ...){unname(quantile(x, .25, na.rm=TRUE))}
perc_75 <- function(x, ...){unname(quantile(x, .75, na.rm=TRUE))}
dataset<-dataset[var_list] %>% as.data.frame()
first_col_param <- head(var_list,1)
second_col_param <- tail(var_list,1)
var_lab(colnames(dataset)[ncol(dataset)]) <- ""
mr <- parse(text=paste0("mrset(",
first_col_param ," %to% ",second_col_param,")"))
fun_replace_valid_n <- function(x, n) {
dat <- dplyr::cur_data_all() %>% replace(is.na(.),0)
func_name <- dat$func_name
if(x[func_name == "Valid N"] < n) {
replace(x, func_name %in% c("Mean", "Median"), "--")
} else x
}
t1<- cross_fun(dataset,
eval(mr),
col_vars = banner1,
fun = combine_functions("Mean" = mean,
"Median" = median,
"Max"= max,
"Min"=min,
"25th Perc" = perc_25,
"75th Perc" = perc_75,
"Valid N" = valid_n
))
t1 <- as.data.frame(t1)
t1 <- t1 %>% tidyr::separate(row_labels, into = c('grp', 'func_name'), sep = "\\|")
t1 <- t1 %>% dplyr::group_by(grp)
t1 <- t1 %>% dplyr::mutate(dplyr::across(where(is.numeric), fun_replace_valid_n, n = 4)) %>%
dplyr::ungroup()
t1 <- t1 %>% tidyr::unite(row_labels, grp, func_name, sep = "|") %>%
as.etable
t1
}
debugonce(fun1)
t1 <- fun1(dataset=data,"col1",banner1=lst)
error:
expected output:
this is my example dataframe
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
I would like to get only unique words within group and through groups
So I would like to get this
group word
A car, tree
B sun
I used aggregate and get this
aggregate(word ~ group , data = example, FUN = paste0)
group word
1 A car, car, house, tree
2 B sun ,sun, house
but now i need to select only unique values, but even this does not work out
for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}
with
Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house", :
replacement has 3 rows, data has 1
A base R option using aggregate + subset + ave like below
with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, "[, ]+"))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
)
gives
ind values
1 A car, tree
2 B sun
Here's a dplyr solution:
library(dplyr)
library(tidyr)
example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))
output
group word
1 A car, tree
2 B sun
In base you can use strsplit to get the words, split them by group and use unique the get unique words per group. Use table to get the number of same words and take those which appear only once.
t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))
# group word
#A A car, tree
#B B sun
Or another way starting with the already used aggregate in the question.
t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1
# group word
#1 A car, tree
#2 B sun
And just for fun a Benchmark
library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
bench::mark(check = FALSE,
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
GKi2 = {t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1},
ThomasIsCoding = with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, ", "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
),
Mael = {example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))},
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 GKi 445.13µs 486.26µs 1997. 16.03KB 6.15 974 3
2 GKi2 916.97µs 968.68µs 1023. 7.3KB 6.15 499 3
3 ThomasIsCoding 3.54ms 3.73ms 266. 8.19KB 8.45 126 4
4 Mael 16.07ms 16.48ms 60.1 60.04KB 6.68 27 3
5 Nir Graham 37.29ms 39.49ms 24.0 90.59KB 8.00 9 3
GKi is about 2 times faster than GKi2, 7 times faster than ThomasIsCoding, 30 than Mael and 80 than Nir Graham.
library(tidyverse)
example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
I have a data set df that has been split into int1 and int2. In int1andint2, there is two elements for the IDA and three elements for theID` B.
My goal is to create a 2x2 matrix for ID A and 3x3 for ID B, and have it divided from my example list of matrices l1. Currently, my code is creating a 3x3 matrix for ID A and 2x2 matrix for ID B using a combination of the product from g1 and f2 using map2() resulting to lstmat.
Any suggestions on how I can get the desired output of a 2x2 matrix for ID A and 3x3 matrix for ID B?
Example data:
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2011"), by = "days"), 500)
ID <- rep(c("A","B"), 5000)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$jDate <- julian(as.Date(df$date), origin = as.Date('1970-01-01'))
df$Month <- month(df$date)
df$year <- year(df$date)
t1 <- c(100,150)
t2 <- c(200,250)
mat <- cbind(t1,t2)
t1 <- c(150,150,200)
t2 <- c(250,250,350)
t3 <- c(350,350, 400)
mat2 <- cbind(t1,t2, t3)
l1 <- list(mat, mat2)
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "3") %>%
group_split()
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
sep = '_'))
int1 <- int1[-1]
int2 <- int2[-1]
Any suggestions for changes to this code for the desired result? :
g1 <- as.integer(gl(length(int1), 3, length(int1)))
f2 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate))))
}
lstMat <- map2(split(int1, g1), split(int2, g1), f2)
map2(l1, lstMat, `/`)
As the 'int1', 'int2' have duplicated names, split on the names instead of creating a grouping index with gl
lstMat <- map2(split(int1, names(int1)), split(int2, names(int2)), f2)
map2(l1, lstMat, `/`)
-output
[[1]]
t1 t2
[1,] 3.571429 5.263158
[2,] 8.333333 8.928571
[[2]]
t1 t2 t3
[1,] 5.357143 6.578947 7.291667
[2,] 8.333333 8.928571 9.210526
[3,] 25.000000 19.444444 14.285714
I have the following dataframe:
id <- c("A", "B", "C")
col1 <- c(1, 3, 5)
col2 <- c(6, 12, 9)
col3 <- c(2, 4, 30)
df <- data.frame(id, col1, col2, col3)
Essentially, I want every i to be replaced by 20, 25, 30, 35, 40. This loop works but it works very, very slowly.
library(dplyr)
library(tibble)
library(foreach)
library(tidyverse)
library(purrr)
id <- c("A", "B", "C")
col1 <- c(1, 3, 5)
col2 <- c(6, 12, 9)
col3 <- c(2, 4, 30)
df <- data.frame(id, col1, col2, col3)
vals <- c(seq(from=20, to=40, by=5))
final <- foreach(i = vals, .combine='cbind') %do% {
# if cell is greater than i, then code 0
df_2 <- df %>% mutate(across(starts_with("col"), ~ +(. < i)))
# transpose the dataset
rownames(df_2) <- df_2$id
df_2$id <- NULL
df_2_t <- as.data.frame(t(df_2))
# sum the rows
df_2_t <- cbind(id = rownames(df_2_t), df_2_t)
rownames(df_2_t) <- 1:nrow(df_2_t)
df_2_t <- df_2_t %>%
mutate(sum = rowSums(.[2:ncol(.)]))
# merge a new column
id2 <- c("col1", "col2", "col3")
D <- c(3, 4, 5)
id_d <- data.frame(id2, D)
df_2_t_d <- left_join(df_2_t, id_d, by = c("id" = "id2"))
# divide D by the number of letters (there are 3 letter columns -- A, B, C)
df_2_t_d$letters <- rep(3)
df_2_t_d <- df_2_t_d %>%
mutate(frac = D/letters)
# recode all 1s to the frac
letters <- grep("^A|^B|^C", names(df_2_t_d))
df_2_t_d[letters] <- apply(df_2_t_d[letters], 2, function(x) ifelse(x == 1, df_2_t_d$frac, 0))
# drop two columns
df_2_t_d <- select(df_2_t_d, -c(D, letters))
# transpose again
rownames(df_2_t_d) <- df_2_t_d$id
df_2_t_d$id <- NULL
df_2_t_d2 <- as.data.frame(t(df_2_t_d))
df_2_t_d2_sum <- df_2_t_d2 %>%
mutate(rowSums(.[1:3])) %>%
transmute(!!paste0('sum_', i) := rowSums(select(., starts_with('col'))))
}
df_2_t_d2 <- cbind(list_name = rownames(df_2_t_d2), df_2_t_d2)
rownames(df_2_t_d2) <- 1:nrow(df_2_t_d2)
df_2_t_d2 <- select(df_2_t_d2, list_name)
abc <- cbind(df_2_t_d2, df_2_t_d2_sum)
View(abc)
If there's any way to speed it up, suggestions are welcome!
Here's a way to do this map_dfc :
library(dplyr)
library(purrr)
vals <- seq(from=20, to=40, by=5)
bind_cols(
df, map_dfc(vals, function(x) df %>%
mutate(across(starts_with("col"), ~ +(. < x))) %>%
transmute(!!paste0('sum_', x) := rowSums(select(., starts_with('col'))))))
Or in base R :
cols <- grep('col', names(df))
df[paste0('sum_', vals)] <- lapply(vals, function(x) rowSums(+(df[cols] < x)))
df
# id col1 col2 col3 sum_20 sum_25 sum_30 sum_35 sum_40
#1 A 1 6 2 3 3 3 3 3
#2 B 3 12 4 3 3 3 3 3
#3 C 5 9 30 2 2 2 3 3
I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
user system elapsed
3.73 0.83 4.55
If possible, I would like to reduce the elapsed time to <= 1 sec and increase the n to 720000 matrices. So, I am looking for way to optimize only the section 3. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1,]))
dt_t <- fillMatrices(dt_t_1 = dt_t_1, species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## Fill the matrices within the list
system.time(for(i in 1:n){
list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
})
## test <- list_matrices[[1]]
The code of the section 1 is used to initialize the matrices and the function fillMatrices can be found in the section 2. In my example, the function is used to fill matrices for one species. In reality, the function is used for 3 species (i.e., is applied three times) by changing the argument species = c("a_M", "b_M", "c_M"). How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
rm(list=ls(all=TRUE))
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
list_matrices <- list()
for(i in 1:n){
print(i)
list_matrices[[i]] <- ff(-999, dim=c(3650, length(col_mat)), dimnames=list(NULL, col_mat), vmode="double", overwrite = TRUE)
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for(i in 1:n){
print(i)
list_matrices[[i]][1,] <- c(1, 1, 1, i-1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1, n), day = rep(1, n), time = rep(1, n), died = rep(0, n), ID = (seq(1, n, 1))- 1, a_M = sample(1:10, n, replace = T), b_M = sample(1:10, n, replace = T), c_M = sample(1:10, n, replace = T)))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
convertDFToMat <- function(x){
mat <- as.matrix(x[,-1])
ifelse(is(x[,1], "data.frame"), rownames(mat) <- pull(x[,1]), rownames(mat) <- x[,1])
## Convert character matrix into numeric matrix
mat <- apply(mat, 2, as.numeric)
return(mat)
}
## Define the function that is used to fill the matrices within the list
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
## Format data
dt <- as.data.frame(dt_t_1) %>%
reshape::melt(id = c("ID")) %>%
arrange(ID) %>%
dplyr::mutate_all(as.character)
## summary(dt)
## Break out the variable "variable" into different columns, with one row for each individual-day
dt_reshape_filter_1 <- dt %>%
dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .), col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
dplyr::select(-variable) %>%
tidyr::spread(col, value) %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::arrange(ID, day)
## summary(dt_reshape_filter_1)
## Apply requested transformations and build the data frame
dt_transform <- dt_reshape_filter_1 %>%
dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
dplyr::mutate(day = day + 1) %>%
dplyr::filter(day < maxDuration + 1) %>%
dplyr::bind_rows(tibble(ID = ind[,c("ID")], day = 0, a = ind[,c("a_M")], b = ind[,c("b_M")])) %>%
dplyr::mutate(c = a + b) %>%
dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
dplyr::arrange(ID, day)
## summary(dt_transform)
## Take different columns of the data frame and gather them into a single column
dt_gather <- dt_transform %>%
tidyr::gather(variable, value, species) %>%
dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
dplyr::rename(var2 = ID) %>%
dplyr::mutate_all(as.character)
## summary(dt_gather)
## Add the other columns in the data frame and convert the resulting data frame into a matrix
dt_reshape_filter_2 <- dt %>%
dplyr::rename(var2 = ID) %>%
dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
tidyr::spread(variable, value) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate(year = ind[,c("year")],
day = ind[,c("day")],
time = ind[,c("time")],
ID = ind[,c("ID")],
died = ind[,c("died")]) %>%
tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
dplyr::arrange(as.numeric(var2)) %>%
dplyr::mutate_all(as.character)
## summary(dt_reshape_filter_2)
## Build the output matrix
dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
tidyr::spread(var2, value) %>%
dplyr::arrange(match(variable, matrixColumns)) %>%
dplyr::select("variable", as.character(ind[,c("ID")]))
## summary(dt_bind)
dt_mat <- convertDFToMat(dt_bind)
## summary(dt_mat)
return(dt_mat)
}
Making a 3D array instead of a 2D list of matrices gives you more options
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
arr1 <- ff(-999L, dim = c(n_row, length(col_mat), n), dimnames = list(NULL, col_mat, NULL))
## Fill the first row of each matrix slice
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n)-1L
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1L, n), day = rep(1L, n), time = rep(1L, n), died = rep(0L, n), ID = (seq(1L, n, 1L))- 1L, a_M = sample(1L:10L, n, replace = T), b_M = sample(1L:10L, n, replace = T), c_M = sample(1L:10L, n, replace = T)))
##fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(dt_t_1 = t(arr1[1, ,]), species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## reassign
system.time(
arr1[indexTime + 1, ,] <- dt_t
)
user system elapsed
0.05 0.70 0.7
# for comparison
#> system.time(for(i in 1:n){
#+ list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
#+ })
# user system elapsed
# 4.75 1.08 5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.