How to avoid having duplicate binary values in R - r

I am currently working on building a simulation that simulates user interactions with fake/authentic FB news posts. As I built simulation using for loop, I've ran into the following problem:
First, I wanted to set my loop such that if a person reacts to a FB post (i.e leave a like, love, wow, haha, sad, angry, or care) he/she would leave only one reaction. For instance, if someone leaves a like, he/she shouldn't be able to leave other reactions like love, wow, etc.
This is the code I used for generating simulation data
#Creating empty dataframe
fake_id<-1:1000
like<-rep(NA,max(fake_id))
love<-rep(NA,max(fake_id))
wow<-rep(NA,max(fake_id))
haha<-rep(NA,max(fake_id))
sad<-rep(NA,max(fake_id))
angry<-rep(NA,max(fake_id))
care<-rep(NA,max(fake_id))
comment<-rep(NA,max(fake_id))
shares<-rep(NA,max(fake_id))
fake<-data.frame(fake_id,like,love,wow,haha,sad,angry,care,comment,shares)
#Probability distribution for user interaction with a given FB post
misinformation_prob<-c(0.090637966,0.015194195,0.023018674,0.013500845,0.001573673,0.017003550,0.002058321,0.003093388,0.001312486)
authentic_prob<-c(0.0275070460,0.0103958123,0.0060707537,0.0034785282,0.0007527044,0.0088240139,0.0020064930,0.0019195168,0.0006860144)
prob.dist<-data.frame(misinformation_prob,authentic_prob)
colnames(prob.dist)<-c("Misinformation","Authentic")
rownames(prob.dist)<-c("Likes","Comments","Shares","Loves","Wows","Hahas","Sads","Angrys","Cares")
prob.dist
#For loop used to create a simulated data
for(i in fake_id){
fake$like[i]<-sample(x=c(0,1), size=1,prob=c(1-prob.dist[1,'Misinformation'],prob.dist[1,'Misinformation']))
fake$comment[i]<-sample(x=c(0,1), size=1,prob=c(1-prob.dist[2,'Misinformation'],prob.dist[2,'Misinformation']))
fake$shares[i]<-sample(x=c(0,1), size=1,prob=c(1-prob.dist[3,'Misinformation'],prob.dist[3,'Misinformation']))
if(fake$like[i]==1){
fake[i,3:8]=0
}else for(j in 3:8){
if(is.na(fake[i,j])==TRUE){
fake[i,j]<-sample(x=c(0,1),size=1,prob=c(1-prob.dist[j+1,'Misinformation'],prob.dist[j+1,'Misinformation']))
}
if(fake[i,j]==1){
fake[i,-j]==0
}
}
}
I hoped that by writing
if(fake[i,j]==1){
fake[i,-j]==0
}
I'd be able to avoid having duplicate reactions like certain user liking and loving the post simultaneously. Yet, once I run the simulation, I'd occasionally run into this problem that I wanted to avoid. Any input regarding this problem would be greatly appreciated. Thank you community!

To elaborate a little on my comment, you have structured the data pretty nicely so once we clean the row names all of this can be vectorised easily:
rownames(prob.dist) <- c("like", "comment", "shares", "love", "wow", "haha", "sad", "angry", "care")
We can then create a function to look up the probabilities for each column in your table:
get_prob <- function(col, dat = prob.dist) {
prob <- c(
1 - dat[col, "Misinformation"],
dat[col, "Misinformation"]
)
return(prob)
}
Then it's a simple matter of running the simulation for each column. First the three columns can be sampled from the distribution randomly based on the probabilities but not depending on likes:
set.seed(100) # for reproducibility
n <- length(fake_id)
independent_cols <- c("like", "comment", "shares")
cols_depend_on_like <- c("love", "wow", "haha", "sad", "angry", "care")
fake[independent_cols] <- lapply(independent_cols, \(col) {
sample(x = c(0, 1), size = n, prob = get_prob(col), replace = TRUE)
})
Note that we are sampling with replacement - which is essentially what you were doing when you did a rowwise sampling without replacement. Then we can add the columns which depend on likes:
zeroes <- fake$like == 1
fake[cols_depend_on_like] <- lapply(cols_depend_on_like, \(col) {
values <- sample(0:1, size = n, prob = get_prob(col), replace = TRUE)
values[zeroes] <- 0
values
})
The output is probabilities in the same range as your original code but much quicker:
sapply(fake, sum)
# fake_id like love wow haha sad angry care comment shares
# 500500 100 21 2 19 2 4 1 19 20
# Check that all the columns are zeroes that are supposed to be when like==1
sapply(fake[fake$like == 1, ], sum)
# fake_id like love wow haha sad angry care comment shares
# 51347 100 0 0 0 0 0 0 6 2
Benchmarking
I wanted to compare the performance for fun but also it will hopefully give you a sense of why it is worth doing this in R. This is the benchmarking code:
num_rows <- c(10, 1e2, 1e3, 1e4)
results <- bench::press(
rows = num_rows,
{
# Creating dataframe
fake_id <- 1:rows
like <- rep(NA, max(fake_id))
love <- rep(NA, max(fake_id))
wow <- rep(NA, max(fake_id))
haha <- rep(NA, max(fake_id))
sad <- rep(NA, max(fake_id))
angry <- rep(NA, max(fake_id))
care <- rep(NA, max(fake_id))
comment <- rep(NA, max(fake_id))
shares <- rep(NA, max(fake_id))
fake <- data.frame(fake_id, like, love, wow, haha, sad, angry, care, comment, shares)
# Probability distribution for user interaction with a given FB post
misinformation_prob <- c(0.090637966, 0.015194195, 0.023018674, 0.013500845, 0.001573673, 0.017003550, 0.002058321, 0.003093388, 0.001312486)
authentic_prob <- c(0.0275070460, 0.0103958123, 0.0060707537, 0.0034785282, 0.0007527044, 0.0088240139, 0.0020064930, 0.0019195168, 0.0006860144)
prob.dist <- data.frame(misinformation_prob, authentic_prob)
colnames(prob.dist) <- c("Misinformation", "Authentic")
rownames(prob.dist) <- c("like", "comment", "shares", "love", "wow", "haha", "sad", "angry", "care")
bench::mark(
min_iterations = 10,
check = FALSE,
rowwise = {
set.seed(100) # for reproducibility
for (i in fake_id) {
fake$like[i] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[1, "Misinformation"], prob.dist[1, "Misinformation"]))
fake$comment[i] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[2, "Misinformation"], prob.dist[2, "Misinformation"]))
fake$shares[i] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[3, "Misinformation"], prob.dist[3, "Misinformation"]))
if (fake$like[i] == 1) {
fake[i, 3:8] <- 0
} else {
for (j in 3:8) {
if (is.na(fake[i, j]) == TRUE) {
fake[i, j] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[j + 1, "Misinformation"], prob.dist[j + 1, "Misinformation"]))
}
if (fake[i, j] == 1) {
fake[i, -j] <- 0
}
}
}
}
},
vectorised = {
set.seed(100) # for reproducibility
n <- length(fake_id)
independent_cols <- names(fake)[c(2, 9, 10)]
cols_depend_on_like <- names(fake)[3:8]
fake[independent_cols] <- lapply(independent_cols, \(col) {
sample(x = c(0, 1), size = n, prob = get_prob(col), replace = TRUE)
})
fake[cols_depend_on_like] <- lapply(cols_depend_on_like, \(col) {
values <- fake[[col]]
zeroes <- fake$like == 1
n <- sum(!zeroes)
values[zeroes] <- 0
values[!zeroes] <- sample(0:1, size = n, prob = get_prob(col), replace = TRUE)
values
})
}
)
}
)

Related

R bootstrapping for the two dataframe individual column wise

Want to do Bootstrapping while comparing two dataframe column wise with the different number of rows.
I have two dataframe in which row represent values from experiments and column with the dataset names (data1, data2, data3, data4)
emp.data1 <- data.frame(
data1 = c(234,0,34,0,46,0,0,0,2.26,0, 5,8,93,56),
data2 = c(1.40,1.21,0.83,1.379,2.60,9.06,0.88,1.16,0.64,8.28, 5,8,93,56),
data3 =c(0,34,43,0,0,56,0,0,0,45,5,8,93,56),
data4 =c(45,0,545,34,0,35,0,35,0,534, 5,8,93,56),
stringsAsFactors = FALSE
)
emp.data2 <- data.frame(
data1 = c(45, 0, 0, 45, 45, 53),
data2 = c(23, 0, 45, 12, 90, 78),
data3 = c(72, 45, 756, 78, 763, 98),
data4 = c(1, 3, 65, 78, 9, 45),
stringsAsFactors = FALSE
)
I am trying to do bootstrapping(n=1000). Values are selected at random replacement from emp.data1(14 * 4) without change in the emp.data2(6 * 4). For example from emp.data2 first column (data1) select 6 values colSum and from emp.data1(data1) select 6 random non zero values colSum Divide the values and store in temp repeat the same 1000 times and take a median value et the end. like this i want to do it for each column of the dataframe. sample code I am providing which is working fine but i am not able get the non-zero random values for emp.data1
nboot <- 1e3
boot_temp_emp<- c()
n_data1 <- nrow(emp.data1); n_data2 <- nrow(emp.data2)
for (j in seq_len(nboot)) {
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(emp.data2)/colSums(emp.data1[boot,])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
boot_data<- apply(boot_temp_emp, 2, median)
From the above script i am able get the output but each column emp.data1[boot,] data has zero values and taken sum. I want indivisual ramdomly selected non-zero values column sum so I tried below script not able remove zero values. Not able get desired output please some one help me to correct my script
nboot <- 1e3
boot_temp_emp<- c()
for (i in colnames(emp.data2)){
for (j in seq_len(nboot)){
data1=emp.data1[i]
data2=emp.data2[i]
n_data1 <- nrow(data1); n_data2 <- nrow(data2)
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(data2[i])/colSums(data1[boot, ,drop = FALSE])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
}
boot_data<- apply(boot_temp_emp, 2, median)
Thank you
Here is a solution.
Write a function to make the code clearer. This function takes the following arguments.
x the input data.frame emp.data1;
s2 the columns sums of emp.data2;
n = 6 the number of vector elements to sample from emp.data1's columns with a default value of 6.
The create a results matrix, pre-compute the column sums of emp.data2 and call the function in a loop.
boot_fun <- function(x, s2, n = 6){
# the loop makes sure ther is no divide by zero
nrx <- nrow(x)
repeat{
i <- sample(nrx, n, replace = TRUE)
s1 <- colSums(x[i, ])
if(all(s1 != 0)) break
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results[i, ] <- boot_fun(emp.data1, sums2)
}
ratios_medians <- apply(results, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-24 by the reprex package (v2.0.1)
Edit
Following the comments here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors, before computing their sums.
boot_fun2 <- function(x, s2, n = 6){
nrx <- nrow(x)
ncx <- ncol(x)
s1 <- numeric(ncx)
for(j in seq.int(ncx)) {
repeat{
i <- sample(nrx, n, replace = TRUE)
if(all(x[i, j] != 0)) {
s1[j] <- sum(x[i, j])
break
}
}
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results2[i, ] <- boot_fun2(emp.data1, sums2)
}
ratios_medians2 <- apply(results2, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-27 by the reprex package (v2.0.1)

Improve the performance of this script

Here is a piece of my code that I introduce in an R shiny application but which takes me a lot of time because I execute it in a reactive function which I then call five times for different graphics.
Do you have an idea to improve the speed of this script?
I have already tried to execute this with purr but I do not master this tool well enough.
Here is a reproducible example
library(profvis)
profvis({
#dataframe created for the example
DF<- data.frame("scan"=seq(1:7518),"dye1"=NA,"dye2"=NA,"dye3"=NA,"dye4"=NA,"dye5"=NA,"dye6"=NA)
DF$dye1 <- sample(100, size = nrow(DF), replace = TRUE)
DF$dye2 <- sample(100, size = nrow(DF), replace = TRUE)
DF$dye3 <- sample(100, size = nrow(DF), replace = TRUE)
DF$dye4 <- sample(100, size = nrow(DF), replace = TRUE)
DF$dye5 <- sample(100, size = nrow(DF), replace = TRUE)
DF$dye6 <- sample(100, size = nrow(DF), replace = TRUE)
#slowness begins here
for (d in 3000:7518){
#array of input data
input <- numeric(1206)
for (i in -100:100){
input[1+i+100] <- DF$dye1[d + i]
input[202+i+100] <- DF$dye2[d + i]
input[403+i+100] <- DF$dye3[d + i]
input[604+i+100] <- DF$dye4[d + i]
input[805+i+100] <- DF$dye5[d + i]
input[1006+i+100] <- DF$dye6[d + i]
}
}
})
First: please really reconsider what you want to achieve and whether this approach is the smartest way to achieve it..
Second: use vectorization to improve your performance:
d <- 3000
input <- numeric(1206)
microbenchmark::microbenchmark(
# loop as before
case1 = {for (i in -100:100){
input[1+i+100] <- DF$dye1[d + i]
}},
# use vectorization
case2 = {input[(1-100+100):(1+100+100)] <- DF$dye1[(d -100):(d +100)]}
)

How can I add an event with matrix data in ode solver

I have a differential equation model that is running on a network of interactions. Nodes connect to food and can take food at a rate dependent on the size of the food (see first chunk of code).
changes <- function(t, state_a, parameters){
with(as.list(c(state_a, parameters)),{
r <- rowSums(n_mat * food)
dN <- matrix(r * state_a,3,1)
list(c(dN))
})
}
food <- c(0,0.2,0.5)
n_vec <- c(0,0,1,1,0,0,0,1,0)
n_mat <- matrix(n_vec, 3 ,3)
times <- seq(0, 10, by = 1)
state_a <- runif(3, 0, 1000)
parameters <- c(n_mat, food)
out <- ode (y = state_a,
times = times,
func = changes, parms = parameters)
However, I'd like to be able to change the size of the food over time, whilst the differential equations are runnning. For example, if the food looks like the below code (where each row is a timepoint and each column is a food source). It looks like this is possible with using events in the ode solver, but I can't figure out how to do this when I have a matrix of parameters to change, rather than just a single parameter. Is there a good way to do this?
food <- rep(c(0,0.6,0.1,0.4,0.2,0.1,0.2), 6)
food <- matrix(food[1:30],10,3)
colnames(food) <- 1:3
rownames(food) <- 1:10
Below is a working example of ode events where only a single parameter is being changed:
derivs <- function(t, var, parms) {
list(dvar = rep(0, 2))
}
yini <- c(v1 = 1, v2 = 2)
times <- seq(0, 10, by = 0.1)
eventdat <- data.frame(var = c("v1", "v2", "v2", "v1"),
time = c(1, 1, 5, 9) ,
value = c(1, 2, 3, 4),
method = c("add", "mult", "rep", "add"))
eventdat
out <- vode(func = derivs, y = yini, times = times, parms = NULL,
events = list(data = eventdat))
New, but not working code:
calc_food_mat <- function(t, food_df){
return(food_df[which(food_df$time == floor(t)),2] + ((food_df[which(food_df$time == ceiling(t)),2] - food_df[which(food_df$time == floor(t)),2]) * (t - floor(t))))
}
changes <- function(t, state_a, parameters){
with(as.list(c(t, state_a, parameters)),{
food <- calc_food_mat(t, food_df)
r <- rowSums((n_mat * food)[drop = FALSE])
dN <- r * state_a
list(c(dN))
})
}
seasonl <- 40
foodsize <- 4000
foods <- 3
food_seq <- append(seq(foodsize/5, foodsize, foodsize/5), rev(seq(foodsize/5, foodsize, foodsize/5)))
start <- round(runif(foods, -0.5, seasonl - length(food_seq) + 0.5))
food_mat <- matrix(0, foods, seasonl)
for (i in 1:length(start)){
food_mat[i,(start[i]+1):(start[i]+length(food_seq))] <- food_seq
}
food_mat <- data.frame(food_mat)
colnames(food_mat) <- 1:seasonl
rownames(food_mat) <- 1:foods
food_df <- food_mat %>%
gather (key = time, value = resources)
n_vec <- c(0,0,1,1,0,0,0,1,0)
n_mat <- matrix(n_vec, 3 ,3)
times <- seq(0, 40, by = 1)
state_a <- runif(3, 0, 1000)
parameters <- c(n_mat, food_df)
out <- ode (y = state_a,
times = times,
func = changes, parms = parameters)

Avoiding Loops to Generate a Complex Dataframe with Nested Lists

Here is a kind of DF, I have to generate to store simulations data).
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric() ,
distribution = character(),
simul = numeric() ,
sampleSize = numeric() ,
censoredRate = numeric() ,
dta = I(list()) ,
quantileLD = I(list()) ,
stringsAsFactors = FALSE)
v <- 0 # Scenario indicator
for(k in 1:nbCensoredRates){
for(j in 1:nbSampleSizes){
for(i in 1:nbSimul){
# Scenario Id + Other info
v <- v + 1
df.sampled[v,"cas"] <- v
df.sampled[v,"distribution"] <- "logNormal"
df.sampled[v,"simul"] <- i
df.sampled[v,"sampleSize"] <- sampleSize[j]
df.sampled[v,"censoredRate"] <- censoredRate[k]
X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
estimatedLD <- array(9)
for(w in 1:9){
estimatedLD[w] <- quantile(X, probs=censoredRate[k], type=w)[[1]]
}
df.sampled$dta[v] <- list(X)
df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
}
}
}
Which is quite difficult to read.
I would like to find a way to avoid loops, and to reference easily scenarios (v) and attached variables.
Any idea?

randomly assign teachers to a school with dplyr or similar?

Suppose I have a data frame with 8 schools and its characteristics, and another with 48 teachers and its characteristics. I can generate some fake data with the following code:
library(dplyr)
library(geosphere)
set.seed(6232015)
n.schools <-8
n.teachers <- 48
makeRandomString <- function(pre, n=1, length=12) {
randomString <- c(1:n) # initialize vector
for (i in 1:n) {
randomString[i] <- paste0(pre,'.', paste(sample(c(0:9, letters, LETTERS),
length, replace=TRUE),
collapse=""))
}
return(randomString)
}
gen.teachers <- function(n.teachers){
Teacher.ID <- makeRandomString(pre= 'T', n = n.teachers, length = 20)
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other)
return(Teachers)
}
gen.schools <- function(n.schools){
School.ID <- makeRandomString(pre= 'S', n = n.schools, length = 20)
School.lat <- runif(n = n.schools, min = -2, max = 2)
School.long <- runif(n = n.schools, min = -2, max = 2)
Schools <- data.frame(School.ID, School.lat, School.long) %>%
rowwise() %>% mutate (School.distance = distHaversine(p1 = c(School.long, School.lat),
p2 = c(0, 0), r = 3961))
return(Schools)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
Schools <- gen.schools(n.schools = n.schools)
To each shool, I want to assign 6 teachers (every teacher get 1 and only 1 school). I could use:
Teachers %>% sample_n(6)
To get a list of 6 teachers assign those to a school, remove them from my original pool and keep going with a loop. My guess/hope is that there is a much easier way of doing this.
Thanks for the help!
In the context of your code
sample(rep(Schools$School.ID, each = 6))
gives a random sequence of schools where each school.id appears 6 times. Set Teachers$AssignedSchool to this sample and each teacher has an assigned school

Resources