Correctly Understanding the Use of the replicate() function - r

I have the following function:
library(dplyr)
var_1 <- rnorm(100, 10, 10)
var_2 <- rnorm(100, 1, 10)
var_3 <- rnorm(100, 5, 10)
response <- rnorm(100, 1, 1)
my_data <- data.frame(var_1, var_2, var_3, response)
my_data$id <- 1:100
simulate <- function() {
results <- list()
results2 <- list()
for (i in 1:100) {
iteration_i <- i
sample_i <- my_data[sample(nrow(my_data), 10), ]
results_tmp <- data.frame(iteration_i, sample_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
test_1 <- data.frame(results_df %>%
group_by(id) %>%
filter(iteration_i == min(iteration_i)) %>%
distinct)
summary_file <- data.frame(test_1 %>%
group_by(iteration_i) %>%
summarise(Count=n()))
cumulative <- cumsum(summary_file$Count)
summary_file$Cumulative <- cumulative
summary_file$unobserved <- 100 - cumulative
return(summary_file)
}
When I call this function, I get the following output:
> head(simulate())
iteration_i Count Cumulative unobserved
1 1 10 10 90
2 2 7 17 83
3 3 10 27 73
4 4 5 32 68
5 5 7 39 61
6 6 8 47 53
I want to try to run this function 10 times and append all the results into a single file. I tried to do this using the "replicate()" function - but this is not working:
# Method 1 : Did not work
n_replicates = 10
iterations_required <- replicate(n_replicates, {
simulate()
})
# Method 2: Did not work
lapply(seq_len(10), simulate(1))
# Method 3: Did Not Work
library(purrr)
rerun(10, simulate(1))
# Method 4: Did Not Work
lapply(seq_len(10), simulate)
Ideally, I would like to get something like this:
# works fine!
results <- list()
for (i in 1:10) {
game_i <- i
s_i <- simulate()
results_tmp <- data.frame(game_i, s_i)
results[[i]] <- results_tmp
}
final_file <- do.call(rbind.data.frame, results)
My Question: Is there a reason that "Method 1, Method 2, Method 3, Method 4" were not working - could someone please show me how to fix this?

# Method 1 :
n_replicates = 10
iterations_required <- replicate(n_replicates, {
simulate()
}, simplify=FALSE)
# Method 2:
iterations_required<-lapply(seq_len(10), function(x) simulate(1))
# Method 4:
iterations_required<-lapply(seq_len(10), simulate)
# to merge into one data.frame
as.data.frame(data.table::rbindlist(iterations_required, idcol=TRUE))
Alternatively, if you modify your function to simulate(i), where i will be the first column in the output (interation index). Then you could use do.call(rbind.data.frame, lapply(seq_len(n_replicates), simulate))

replicate by default tries to simplify the result in a matrix. So the trick is actually just not to simplify.
n_replicates<- 10
iterations_required <- replicate(n_replicates, simulate(), simplify=FALSE)

Related

R: Introducing a WHILE LOOP to a Function

I am working with the R programming language.
I am trying to count the first time a certain pattern (e.g. ABCD) appears in a random string (e.g. ACABCDCDBCABCDBC - answer =6 ). I wrote a function to do this:
library(stringr)
letters <- c("A", "B", "C", "D")
results <- list()
for (i in 1:100)
{
iteration_i = i
letters_i = paste(sample(letters, 100, replace=TRUE, prob=c(0.25, 0.25, 0.25, 0.25)),collapse="")
position_i = str_locate(letters_i, "ADBC")
results_tmp = data.frame(iteration_i , letters_i, position_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
This looks something like this now (note: I don't think this is correct - in row 5, I see ABCD at the beginning of the row, but its being recorded as NA for some reason):
iteration_i letters_i start end
1 1 BACDCCCDCCCDCDDBBCBBAACACBBBBAAABDDDACAABDDABBABADCDDCDACCBBBCABCDABCDCCCDADDDBADBDCADAABDBDCDCAACCB NA NA
2 2 CACACCCCDCCBADACBBAADBCABBAAAAADBDDBCADCAAADADAAABDCABBAABABBCBDADCDDDDCDBADDBDCBCDDDBDCDDAACBBBBACA 20 23
3 3 CDCBDAABDDDDADBAAABBADAADBDDDBDADDCABADDDCDABBBCBCBBACBBDADABBCDCCACDBCDCDDBDBADBCDCADDADDDBDBAAABBD 79 82
4 4 ADBCDBADADBAAACAADACACACACBDDCACBDACCBDAAABDBAAAABBCCDBADADDADCBCABCBAABDCBCDCDACDCCDBADCBDDAADBCDAC 1 4
5 5 D**ABCD**DDCCBCDABADBBBBCDBCADCBBBDCAAACACCCBCBCADBDDABBACACBDABAAACCAAAAACCCCBCBCCABABDDADBABDDDCCDDCCC NA NA
6 6 DDDDDBDDDDBDDDABDDADAADCABCDAABBCCCDAABDDAACBDABBBBBABBCBDADBDCCAAADACCBCDDBDCAADCBBBCACDBBADDDDCABC NA NA
Currently, I am only generating 100 letters and hoping that this is enough to observe the desired pattern (sometimes this doesn't happen, notice the NA's) - is there a way to add a WHILE LOOP to what I have written to keep generating letters until the desired pattern first appears?
Can someone please show me how to do this?
Thanks!
The loop is a repeat loop, not while, that only breaks when the pattern is found. I have set the results list length to 2, there's no point in making it bigger just to test the code.
library(stringr)
Letters <- c("A", "B", "C", "D")
Pattern <- "ADBC"
n <- 2L
set.seed(2022)
results <- vector("list", length = n)
for (i in seq.int(n)) {
repeat {
l <- sample(Letters, 100, replace = TRUE, prob=c(0.25, 0.25, 0.25, 0.25))
letters_i <- paste(l, collapse = "")
position_i <- str_locate(letters_i, pattern = Pattern)
if(any(!is.na(position_i))) break
}
results_tmp <- data.frame(iteration = i, letters = letters_i, position_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
results_df
#> iteration letters start end
#> 1 1 ADBDBDBBCABBBDDBADDAADCBBADACACDCCBBADAADCDDABADCABCDCDDCCCBDDAABACCBDAAAADBDDCCCCADBCBBDABBDCCCBADD 83 86
#> 2 2 DDBDBDBCDDBDBBBDBABBCCBBCCBDBDABBAAABACABADCCBBABADBCCCDABABBDBADCADCABDDDAAACCBDCAACACACBBDDDACCDDC 50 53
Created on 2022-06-11 by the reprex package (v2.0.1)

Efficient Montecarlo simulation over a grid in R

I am running a Montecarlo simulation of a multinomial logit. Therefore I have a function that generates the data and estimates the model. Additionally, I want to generate different datasets over a grid of values. In particular, changing both the number of individuals (n.indiv) and the number of answers by each individual (n.choices).
So far, I have managed to solve it, but at some point, I incurred into a nested for-loop structure over a grid search of the possible values for the number of individuals (n.indiv_list) and the number of answers by each individual(n.choices_list). Finally, I am quite worried about the efficiency of the usage of my last bit of code with the double for-loop structure running on the combinations of the possible values. Probably there is a vectorized way to do it that I am missing (or maybe not?).
Finally, and this is mostly a matter of style, I managed to arrive a multiples objects that contain the models from the combinations of the grid search with informative names, but also would be great if I could collapse all of them in a list but with the current structure, I am not sure how to do it. Thank you in advance!
1) Function that generates data and estimates the model.
library(dplyr)
library(VGAM)
library(mlogit)
#function that generates the data and estimates the model.
mlogit_sim_data <- function(...){
# generating number of (n.alter) X (n.choices)
df <- data.frame(id= rep(seq(1,n.choices ),n.alter ))
# id per individual
df <- df %>%
group_by(id) %>%
mutate(altern = sequence(n()))%>%
arrange(id)
#Repeated scheme for each individual + id_ind
df <- cbind(df[rep(1:nrow(df), n.indiv), ], id_ind = rep(1:n.indiv, each = nrow(df)))
## creating attributes
df<- df %>%
mutate(
x1=rlnorm(n.indiv*n.alter),
x2=rlnorm(n.indiv*n.alter),
)%>%
group_by(altern) %>%
mutate(
id_choice = sequence(n()))%>%
group_by(id_ind) %>%
mutate(
z1 = rpois(1,lambda = 25),
z2 = rlnorm(1,meanlog = 5, sdlog = 0.5),
z3 = ifelse(runif(1, min = 0 , max = 1) > 0.5 , 1 , 0)
)
# Observed utility
df$V1 <- with(df, b1 * x1 + b2 * x2 )
#### Generate Response Variable ####
fn_choice_generator <- function(V){
U <- V + rgumbel(length(V), 0, 1)
1L * (U == max(U))
}
# Using fn_choice_generator to generate 'choice' columns
df <- df %>%
group_by(id_choice) %>%
mutate(across(starts_with("V"),
fn_choice_generator, .names = "choice_{.col}")) %>% # generating choice(s)
select(-starts_with("V")) %>% ##drop V variables.
select(-c(id,id_ind))
tryCatch(
{
model_result <- mlogit(choice_V1 ~ 0 + x1 + x2 |1 ,
data = df,
idx = c("id_choice", "altern"))
return(model_result)
},
error = function(e){
return(NA)
}
)
}
2) Grid search over possible combinations of the data
#List with the values that varies in the simulation
#number of individuals
n.indiv_list <- c(1, 15, 100, 500 )
#number of choice situations
n.choices_list <- c(1, 2, 4, 8, 10)
# Values that remains constant across simulations
#set number of alternatives
n.alter <- 3
## Real parameters
b1 <- 1
b2 <- 2
#Number of reps
nreps <- 10
#Set seed
set.seed(777)
#iteration over different values in the simulation
for(i in n.indiv_list) {
for(j in n.choices_list) {
n.indiv <- i
n.choices <- j
assign(paste0("m_ind_", i, "_choices_", j), lapply(X = 1:nreps, FUN = mlogit_sim_data))
}
}
You can vectorize using the map2 function of the purrr package:
library(tidyverse)
n.indiv_list <- c(1, 15, 100, 500 )
#number of choice situations
n.choices_list <- c(1, 2, 4, 8, 10)
l1 <- length(n.indiv_list)
l2 <- length(n.choices_list)
v1 <- rep(n.indiv_list, each = l2)
v2 <- rep(n.choices_list, l1) #v1, v2 generate all pairs
> v1
[1] 1 1 1 1 1 15 15 15 15 15 100 100 100 100 100 500 500 500 500 500
> v2
[1] 1 2 4 8 10 1 2 4 8 10 1 2 4 8 10 1 2 4 8 10
result <- map2(v1, v2, function(v1, v2) assign(paste0("m_ind_", v1, "_choices_", v2), lapply(X = 1:nreps, FUN = mlogit_sim_data)))
result will be a list of your function outputs.

Alternative nested for loops for counting value occurrence in R dataframe

I am working on a large dataset, i what to count how many time two columns have the same values. Here is an example of the dataset:
id = rep(replicate(4, paste(sample(LETTERS, 3, replace=F), collapse="")), 12500)
names = rep(replicate(3125, paste(sample(letters, 5, replace=T), collapse="")), 16)
times = sample(c(3,6,24), 50000, replace = T)
df = data.frame(id=id, names=names, times=times)
count <- list()
ids <- as.vector(unique(df$id))
nms <- as.vector(unique(df$names))
for(i in 1:length(ids)){
vec <- c()
for(j in 1:length(nms)){
vec[j] <- nrow(df[df$id == ids[i] & df$names == nms[j], ])
}
count[[i]] <- vec
}
My real data have about 50000 x 10 dimension and the id and name fields are randomly scattered. Can anyone suggest a better way to handle this? because my approach is working but too slow. dplyr or plyr methods?
Thanks,
EDIT:
short version of my dataframe:
id = rep(replicate(3, paste(sample(LETTERS, 3, replace=F), collapse="")), 5)
names = rep(replicate(3, paste(sample(letters, 5, replace=T), collapse="")), 5)
times = sample(c(3,6,24), 15, replace = T)
df = data.frame(id=id, names=names, times=times)
df
id names times
1 DEW xxsre 24
2 QHY xkbhr 24
3 DQE tuyfk 6
4 DEW xxsre 24
5 QHY xkbhr 24
6 DQE tuyfk 3
7 DEW xxsre 3
8 QHY xkbhr 24
9 DQE tuyfk 3
10 DEW xxsre 24
11 QHY xkbhr 24
12 DQE tuyfk 3
13 DEW xxsre 24
14 QHY xkbhr 3
15 DQE tuyfk 3
output:
> count
[[1]]
[1] 5 0 0
[[2]]
[1] 0 5 0
[[3]]
[1] 0 0 5
each list item is for id, and the list vec is for names count. in other words as.vector(unique(df$id)) and as.vector(unique(df$names)) respectively.
You can use data.table, which is likely the fastest solution:
library(data.table)
# convert your dataset into a data.table
setDT(df)
output <- df [ , .N, by = .(id, names)]
head(output)
> id names N
> 1: FYG vlrcd 4
> 2: FAL mjhhs 4
> 3: BZU rfnvc 4
> 4: HJA zhssf 4
> 5: FYG pxtne 4
> 6: FAL qgeqr 4
If you want the output to be a list, you can convert the output in different ways:
L1 <- as.list(as.data.frame(t(output))) # or
L2 <- split(output, list(output$id, output$names)) # or
L3 <- split(output, seq(nrow(output)))
Does this do what you want?
library(dplyr)
count <- df %>%
group_by(id, names) %>%
summarise(n=sum(times))
count
Without using plyr and dplyr you can reduce computing time by 25%.
To a reasonnable computing time, I subsetted the first 1000 rows of your data.
library(microbenchmark)
id = rep(replicate(4, paste(sample(LETTERS, 3, replace=F), collapse="")), 12500)
names = rep(replicate(3125, paste(sample(letters, 5, replace=T), collapse="")), 16)
times = sample(c(3,6,24), 50000, replace = T)
df = data.frame(id=id, names=names, times=times)
df = df[1:1000,]
ids <- as.vector(unique(df$id))
nms <- as.vector(unique(df$names))
Then I define 3 functions, default, summation, and sum+preallocation
default<-function(ids,nms,df){
count <- list()
for(i in 1:length(ids)){
vec <- c()
for(j in 1:length(nms)){
vec[j] <- nrow(df[df$id == ids[i] & df$names == nms[j], ])
}
count[[i]] <- vec
}
}
summation<-function(ids,nms,df){
count <- list()
for(i in 1:length(ids)){
vec <- c()
for(j in 1:length(nms)){
vec[j] <- sum(df$id == ids[i] & df$names == nms[j])
}
count[[i]] <- vec
}
}
summation_and_preallocation<-function(ids,nms,df){
count <- list()
for(i in 1:length(ids)){
vec <- integer(length = length(nms))
for(j in 1:length(nms)){
vec[j] <- sum(df$id == ids[i] & df$names == nms[j])
}
count[[i]] <- vec
}
}
Tests with microbenchmark show:
m<-microbenchmark(default(ids,nms,df),summation(ids,nms,df),summation_and_preallocation(ids,nms,df),times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
default(ids, nms, df) 994.5040 1012.1560 1040.7012 1042.5689 1072.4689 1074.8893 10
summation(ids, nms, df) 735.0831 740.6620 741.2254 742.1361 742.9321 743.7806 10
summation_and_preallocation(ids, nms, df) 729.1192 733.0536 753.8661 736.8319 791.5001 804.2335 10
How does it compare with dplyr solution from #Adrian?
dplyr_count(ids, nms, df) 3.154741 3.206819 49.06034 3.275624 3.701375 457.943 10
So about 200 times faster for dplyr!

create and name NA indicator variables for a large data frame

Okay, I'm close. Everything works but the last loop for compound where I get hung up on a data type issue. Copy and run to your heart's content.
x <- c(1:12)
dim(x) <- c(3,4)
x[2,2] <- NA
x[3,3] <- NA
colnames(x) <- c("A","B","C","D")
x
newframe <- data.frame(matrix(0, ncol = 4, nrow = 3))
for (i in 1:3)
for (j in 1:4)
{ newframe[i,j] <- (1 -1*(is.na(x[i,j]))) }
newframe <- as.matrix((newframe))
newframe
compound <- data.frame(matrix(0, ncol = 4, nrow = 3))
for (i in 1:3)
for (j in 1:4 )
{ compound[i,j] <- (as.numeric(x[i,j])*(as.numeric(newframe[i,j])))
}
compound
I'm trying to create an indicator variable for null instances and use it to create a compound variable that will zero out the original variable when null and flash the indicator.
Create indicator var's for missing instances and zero out or impute values for NA instances in original data:
# create data
x <- c(1:12)
dim(x) <- c(3,4)
x[2,2] <- NA
x[3,3] <- NA
x
# create data frame for indicator var's
newframe <- 1*(is.na(x))
newframe
class(newframe)
# zero out NAs in data, or alternatively replaced with imputed values
x[is.na(x)] <- 0
# create data frame for original data and indicator var's
newdata <- cbind(x, newframe)
newdata
Copy and run.
Is this what you're looking for ?
compound <- x
compound[is.na(x)] <- 0
compound
A B C D
[1,] 1 4 7 10
[2,] 2 0 8 11
[3,] 3 6 0 12

Nested for loop in R

I wrote the following code, and I need to repeat this for 100 times, and I know I need to user another for loop, but I don't know how to do it. Here is the code:
mean <- c(5,5,10,10,5,5,5)
x <- NULL
u <- NULL
delta1 <- NULL
w1 <- NULL
for (i in 1:7 ) {
x[i] <- rexp(1, rate = mean[i])
u[i] <- (1/1.2)*runif(1, min=0, max=1)
y1 <- min(x,u)
if (y1 == min(x)) {
delta1 <- 1
}
else {
delta1 <- 0
}
if (delta1 == 0)
{
w1 <- NULL
}
else {
if(y1== x[[1]])
{
w1 <- "x1"
}
}
}
output <- cbind(delta1,w1)
output
I want the final output to be 100 rows* 3 columns matrix representing run number, delta1, and w1.
Any thought will be truly appreciated.
Here's what I gather you're trying to achieve from your code:
Given two vectors drawn from different distributions (Exponential and Uniform)
Find out which distribution the smallest number comes from
Repeat this 100 times.
Theres a couple of problems with your code if you want to achieve this, so here's a cleaned up example:
rates <- c(5, 5, 10, 10, 5, 5, 5) # 'mean' is an inbuilt function
# Initialise the output data frame:
output <- data.frame(number=rep(0, 100), delta1=rep(1, 100), w1=rep("x1", 100))
for (i in 1:100) {
# Generating u doesn't require a for loop. Additionally, can bring in
# the (1/1.2) out the front.
u <- runif(7, min=0, max=5/6)
# Generating x doesn't need a loop either. It's better to use apply functions
# when you can!
x <- sapply(rates, function(x) { rexp(1, rate=x) })
y1 <- min(x, u)
# Now we can store the output
output[i, "number"] <- y1
# Two things here:
# 1) use all.equal instead of == to compare floating point numbers
# 2) We initialised the data frame to assume they always came from x.
# So we only need to overwrite it where it comes from u.
if (isTRUE(all.equal(y1, min(u)))) {
output[i, "delta1"] <- 0
output[i, "w1"] <- NA # Can't use NULL in a character vector.
}
}
output
Here's an alternative, more efficient approach with replicate:
Mean <- c(5, 5, 10, 10, 5, 5, 5)
n <- 100 # number of runs
res <- t(replicate(n, {
x <- rexp(n = length(Mean), rate = Mean)
u <- runif(n = length(Mean), min = 0, max = 1/1.2)
mx <- min(x)
delta1 <- mx <= min(u)
w1 <- delta1 & mx == x[1]
c(delta1, w1)
}))
output <- data.frame(run = seq.int(n), delta1 = as.integer(res[ , 1]),
w1 = c(NA, "x1")[res[ , 2] + 1])
The result:
head(output)
# run delta1 w1
# 1 1 1 <NA>
# 2 2 1 <NA>
# 3 3 1 <NA>
# 4 4 1 x1
# 5 5 1 <NA>
# 6 6 0 <NA>

Resources