I am currently building a simulation of user interactions with fake/authentic FB news posts. While building the simulation with a for loop, I ran into the following problem:
First, I wanted to set up my loop so that if a person reacts to an FB post (i.e. leaves a like, love, wow, haha, sad, angry, or care), he/she leaves only one reaction. For instance, if someone leaves a like, he/she shouldn't be able to leave other reactions like love, wow, etc.
This is the code I used for generating the simulation data:
#Creating empty dataframe
fake_id<-1:1000
like<-rep(NA,max(fake_id))
love<-rep(NA,max(fake_id))
wow<-rep(NA,max(fake_id))
haha<-rep(NA,max(fake_id))
sad<-rep(NA,max(fake_id))
angry<-rep(NA,max(fake_id))
care<-rep(NA,max(fake_id))
comment<-rep(NA,max(fake_id))
shares<-rep(NA,max(fake_id))
fake<-data.frame(fake_id,like,love,wow,haha,sad,angry,care,comment,shares)
#Probability distribution for user interaction with a given FB post
misinformation_prob<-c(0.090637966,0.015194195,0.023018674,0.013500845,0.001573673,0.017003550,0.002058321,0.003093388,0.001312486)
authentic_prob<-c(0.0275070460,0.0103958123,0.0060707537,0.0034785282,0.0007527044,0.0088240139,0.0020064930,0.0019195168,0.0006860144)
prob.dist<-data.frame(misinformation_prob,authentic_prob)
colnames(prob.dist)<-c("Misinformation","Authentic")
rownames(prob.dist)<-c("Likes","Comments","Shares","Loves","Wows","Hahas","Sads","Angrys","Cares")
prob.dist
#For loop used to create the simulated data
for(i in fake_id){
  fake$like[i]<-sample(x=c(0,1), size=1,prob=c(1-prob.dist[1,'Misinformation'],prob.dist[1,'Misinformation']))
  fake$comment[i]<-sample(x=c(0,1), size=1,prob=c(1-prob.dist[2,'Misinformation'],prob.dist[2,'Misinformation']))
  fake$shares[i]<-sample(x=c(0,1), size=1,prob=c(1-prob.dist[3,'Misinformation'],prob.dist[3,'Misinformation']))
  if(fake$like[i]==1){
    fake[i,3:8]=0
  }else for(j in 3:8){
    if(is.na(fake[i,j])==TRUE){
      fake[i,j]<-sample(x=c(0,1),size=1,prob=c(1-prob.dist[j+1,'Misinformation'],prob.dist[j+1,'Misinformation']))
    }
    if(fake[i,j]==1){
      fake[i,-j]==0
    }
  }
}
I hoped that by writing
if(fake[i,j]==1){
  fake[i,-j]==0
}
I'd be able to avoid duplicate reactions, such as a user liking and loving the post simultaneously. Yet once I ran the simulation, I would occasionally still run into the very problem I wanted to avoid. Any input regarding this problem would be greatly appreciated. Thank you, community!
To elaborate a little on my comment: the immediate bug is that fake[i,-j]==0 is a comparison, not an assignment, so it never modifies fake (you would need <- instead of ==). Beyond that, you have structured the data pretty nicely, so once we clean up the row names all of this can be vectorised easily:
rownames(prob.dist) <- c("like", "comment", "shares", "love", "wow", "haha", "sad", "angry", "care")
We can then create a function to look up the probabilities for each column in your table:
get_prob <- function(col, dat = prob.dist) {
  prob <- c(
    1 - dat[col, "Misinformation"],
    dat[col, "Misinformation"]
  )
  return(prob)
}
Then it's a simple matter of running the simulation for each column. First, the three columns that do not depend on likes can be sampled directly from their probability distributions:
set.seed(100) # for reproducibility
n <- length(fake_id)
independent_cols <- c("like", "comment", "shares")
cols_depend_on_like <- c("love", "wow", "haha", "sad", "angry", "care")
fake[independent_cols] <- lapply(independent_cols, \(col) {
  sample(x = c(0, 1), size = n, prob = get_prob(col), replace = TRUE)
})
Note that we are sampling with replacement, which is equivalent to the repeated size = 1 draws you were doing row by row. Then we can add the columns which depend on likes:
zeroes <- fake$like == 1
fake[cols_depend_on_like] <- lapply(cols_depend_on_like, \(col) {
  values <- sample(0:1, size = n, prob = get_prob(col), replace = TRUE)
  values[zeroes] <- 0
  values
})
The output has event counts in the same range as your original code but is produced much more quickly:
sapply(fake, sum)
# fake_id like love wow haha sad angry care comment shares
# 500500 100 21 2 19 2 4 1 19 20
# Check that all the columns are zeroes that are supposed to be when like==1
sapply(fake[fake$like == 1, ], sum)
# fake_id like love wow haha sad angry care comment shares
# 51347 100 0 0 0 0 0 0 6 2
Benchmarking
I wanted to compare the performance for fun, but it will hopefully also give you a sense of why it is worth vectorising this in R. This is the benchmarking code:
num_rows <- c(10, 1e2, 1e3, 1e4)
results <- bench::press(
  rows = num_rows,
  {
    # Creating dataframe
    fake_id <- 1:rows
    like <- rep(NA, max(fake_id))
    love <- rep(NA, max(fake_id))
    wow <- rep(NA, max(fake_id))
    haha <- rep(NA, max(fake_id))
    sad <- rep(NA, max(fake_id))
    angry <- rep(NA, max(fake_id))
    care <- rep(NA, max(fake_id))
    comment <- rep(NA, max(fake_id))
    shares <- rep(NA, max(fake_id))
    fake <- data.frame(fake_id, like, love, wow, haha, sad, angry, care, comment, shares)
    # Probability distribution for user interaction with a given FB post
    misinformation_prob <- c(0.090637966, 0.015194195, 0.023018674, 0.013500845, 0.001573673, 0.017003550, 0.002058321, 0.003093388, 0.001312486)
    authentic_prob <- c(0.0275070460, 0.0103958123, 0.0060707537, 0.0034785282, 0.0007527044, 0.0088240139, 0.0020064930, 0.0019195168, 0.0006860144)
    prob.dist <- data.frame(misinformation_prob, authentic_prob)
    colnames(prob.dist) <- c("Misinformation", "Authentic")
    rownames(prob.dist) <- c("like", "comment", "shares", "love", "wow", "haha", "sad", "angry", "care")
    bench::mark(
      min_iterations = 10,
      check = FALSE,
      rowwise = {
        set.seed(100) # for reproducibility
        for (i in fake_id) {
          fake$like[i] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[1, "Misinformation"], prob.dist[1, "Misinformation"]))
          fake$comment[i] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[2, "Misinformation"], prob.dist[2, "Misinformation"]))
          fake$shares[i] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[3, "Misinformation"], prob.dist[3, "Misinformation"]))
          if (fake$like[i] == 1) {
            fake[i, 3:8] <- 0
          } else {
            for (j in 3:8) {
              if (is.na(fake[i, j]) == TRUE) {
                fake[i, j] <- sample(x = c(0, 1), size = 1, prob = c(1 - prob.dist[j + 1, "Misinformation"], prob.dist[j + 1, "Misinformation"]))
              }
              if (fake[i, j] == 1) {
                fake[i, -j] <- 0
              }
            }
          }
        }
      },
      vectorised = {
        set.seed(100) # for reproducibility
        n <- length(fake_id)
        independent_cols <- names(fake)[c(2, 9, 10)]
        cols_depend_on_like <- names(fake)[3:8]
        fake[independent_cols] <- lapply(independent_cols, \(col) {
          sample(x = c(0, 1), size = n, prob = get_prob(col), replace = TRUE)
        })
        fake[cols_depend_on_like] <- lapply(cols_depend_on_like, \(col) {
          values <- fake[[col]]
          zeroes <- fake$like == 1
          n <- sum(!zeroes)
          values[zeroes] <- 0
          values[!zeroes] <- sample(0:1, size = n, prob = get_prob(col), replace = TRUE)
          values
        })
      }
    )
  }
)
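If you want to inspect the press results afterwards, one way (a sketch assuming the standard bench summary columns) is to pull out the grid variable alongside the timings:
# timing and memory for each expression at each number of rows
results[c("expression", "rows", "min", "median", "mem_alloc")]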
I have written cross-validation/grid-search style code in R that tries to find an optimal threshold value for a given value of mtry (using the random forest algorithm). I have posted my code below using the Sonar data from the mlbench library. However, there seem to be some problems with this code.
library(caret)
library(mlbench)
library(randomForest)
res <- matrix(0, nrow = 10, ncol = 6)
colnames(res) <- c("mtry","Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 17, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
rep <- matrix(0, nrow = 10, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg_Accuracy", "Avg_PosPred", "Avg_NegPred", "Avg_F_Value")
data(Sonar)
N=Sonar
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for (mtry in 5:14) {
  K=mtry-4
  for(thresh in seq(1,9,0.5)) {
    J = 2*thresh-1
    dataset<-N[sample(nrow(N)),] #### mix up the dataset N
    for(I in 1:10){
      # Segment your data by fold using the which() function
      testIndexes <- which(folds==I,arr.ind=TRUE)
      N_test <- dataset[testIndexes, ] ### select each fold for test
      N_train <- dataset[-testIndexes, ] ### select rest for training
      rf = randomForest(Class~., data = N_train, mtry=mtry, ntree=500)
      pred = predict(rf, N_test, type="prob")
      label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
      confusion = confusionMatrix(N_test$Class, label)
      res[I,1]=mtry
      res[I,2]=thresh
      res[I,3]=confusion$overall[1]
      res[I,4]=confusion$byClass[3]
      res[I,5]=confusion$byClass[4]
      res[I,6]=confusion$byClass[7]
    }
    print(res)
    out[J,1] = mtry
    out[J,2] = thresh
    out[J,3] = mean(res[,2])
    out[J,4] = mean(res[,3])
    out[J,5] = mean(res[,4])
    out[J,6] = mean(res[,5])
  }
  print(out)
  rep[K,1] = mtry
  rep[K,2] = thresh
  rep[K,3] = mean(out[,2])
  rep[K,4] = mean(out[,3])
  rep[K,5] = mean(out[,4])
  rep[K,6] = mean(out[,5])
}
print(rep)
Earlier, I wrote similar code with the "iris" dataset, and I did not seem to have any problems:
library(caret)
library(randomForest)
data(iris)
N <- iris
N$Species = ifelse(N$Species == "setosa", "a", "b")
N$Species = as.factor(N$Species)
res <- matrix(0, nrow = 10, ncol = 5)
colnames(res) <- c("Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 9, ncol = 5)
colnames(out) <- c("Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for(J in 1:9) {
  thresh = J/10
  dataset<-N[sample(nrow(N)),] #### mix up the dataset N
  for(I in 1:10){
    # Segment your data by fold using the which() function
    testIndexes <- which(folds==I,arr.ind=TRUE)
    N_test <- dataset[testIndexes, ] ### select each fold for test
    N_train <- dataset[-testIndexes, ] ### select rest for training
    rf = randomForest(Species~., data = N_train, mtry=3, ntree=10)
    pred = predict(rf, N_test, type="prob")
    label = as.factor(ifelse(pred[,1]>=thresh,"a","b"))
    confusion = confusionMatrix(N_test$Species, label)
    res[I,1]=thresh
    res[I,2]=confusion$overall[1]
    res[I,3]=confusion$byClass[3]
    res[I,4]=confusion$byClass[4]
    res[I,5]=confusion$byClass[7]
  }
  print(res)
  out[J,1] = thresh
  out[J,2] = mean(res[,2])
  out[J,3] = mean(res[,3])
  out[J,4] = mean(res[,4])
  out[J,5] = mean(res[,5])
}
print(out)
print(out)
Could someone please assist me in debugging the first block of code?
Thanks
You need to close the parenthesis ) in your for loop.
Replace this
for(thresh in seq(1,9,0.5) {
with
for(thresh in seq(1,9,0.5)) {
Update:
Also, it appears that your thresh is always above 1, so label only ever takes the single value R, because pred[,2] (a probability between 0 and 1) is never above thresh.
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
and that creates a problem in the next statement
confusion = confusionMatrix(N_test$Class, label)
I tested with 0.5 and got no error.
label = as.factor(ifelse(pred[,2]>=0.5,"M","R"))
If you define a better thresh that stays between 0 and 1, you should be fine.
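For example (a sketch, not tested against your full pipeline), you could loop over a grid on the probability scale and derive the row index J from the grid position; seq(0.1, 0.9, 0.05) happens to give 17 values, matching the 17 rows of out:
thresh_grid <- seq(0.1, 0.9, 0.05)
for (J in seq_along(thresh_grid)) {
  thresh <- thresh_grid[J]
  # ... run the inner 10-fold loop as before, then store the averages in out[J, ]
}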
I have heard that it is not recommended to use for loops in R, mainly because they are slow. I have heard that I should use lapply instead because it calls C code for efficiency.
Question: Would it be possible to show me how to transform the following example into efficient lapply code (or any other member of the apply/sapply family)?
myFun <- function(loop){
  result = data.frame() #init new df
  for(iteration in 1:loop){
    generateRnorm1 = matrix(data = rnorm(n = 1000000), nrow = 10000, ncol = 10000)
    generateRnorm2 = matrix(data = rnorm(n = 1000000), nrow = 10000, ncol = 10000)
    iterationResult = sum(generateRnorm1, generateRnorm2)
    bindIterationResult = cbind(iteration, iterationResult)
    result = rbind(result, bindIterationResult)
  }
  return(result)
}
test = myFun(loop = 10)
Here is an lapply method:
myFun2 <- function(loop){ # 'loop' is the index supplied by lapply (unused in the body)
  generateRnorm1 = matrix(data = rnorm(n = 1000000), nrow = 10000, ncol = 10000)
  generateRnorm2 = matrix(data = rnorm(n = 1000000), nrow = 10000, ncol = 10000)
  sum(generateRnorm1, generateRnorm2)
}
# run function over 1:10
myList <- lapply(seq.int(10), myFun2)
# rbind the resulting list
result2 <- do.call(rbind, myList)
Note that there isn't much (if any) speed increase, because the body of your function takes a long time to execute; this swamps any potential speed-up from lapply.
On my computer, both methods take about 20 seconds to run.
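If you want the result to keep the iteration column like your original myFun did, one option (a sketch along the same lines, not benchmarked) is to return it from the function:
myFun3 <- function(iteration){
  generateRnorm1 = matrix(data = rnorm(n = 1000000), nrow = 10000, ncol = 10000)
  generateRnorm2 = matrix(data = rnorm(n = 1000000), nrow = 10000, ncol = 10000)
  # keep the iteration index alongside the sum, as in the original loop
  cbind(iteration, iterationResult = sum(generateRnorm1, generateRnorm2))
}
result3 <- do.call(rbind, lapply(seq.int(10), myFun3))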
Here is the kind of data frame I have to generate to store simulation data:
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric(),
                         distribution = character(),
                         simul = numeric(),
                         sampleSize = numeric(),
                         censoredRate = numeric(),
                         dta = I(list()),
                         quantileLD = I(list()),
                         stringsAsFactors = FALSE)
v <- 0 # Scenario indicator
for(k in 1:nbCensoredRates){
  for(j in 1:nbSampleSizes){
    for(i in 1:nbSimul){
      # Scenario Id + Other info
      v <- v + 1
      df.sampled[v,"cas"] <- v
      df.sampled[v,"distribution"] <- "logNormal"
      df.sampled[v,"simul"] <- i
      df.sampled[v,"sampleSize"] <- sampleSize[j]
      df.sampled[v,"censoredRate"] <- censoredRate[k]
      X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
      estimatedLD <- array(9)
      for(w in 1:9){
        estimatedLD[w] <- quantile(X, probs=censoredRate[k], type=w)[[1]]
      }
      df.sampled$dta[v] <- list(X)
      df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
    }
  }
}
This is quite difficult to read.
I would like to find a way to avoid the loops and to reference the scenarios (v) and their attached variables easily.
Any idea?
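One common pattern (a hedged sketch using the variable names above; an illustration, not code from the post) is to build the scenario grid up front with expand.grid() and fill the list columns with lapply, instead of growing the data frame row by row:
# Sketch: one row per scenario, no explicit growing loop
scenarios <- expand.grid(simul = 1:nbSimul,
                         sampleSize = sampleSize,
                         censoredRate = censoredRate)
scenarios$cas <- seq_len(nrow(scenarios))
scenarios$distribution <- "logNormal"
sims <- lapply(seq_len(nrow(scenarios)), function(v) {
  X <- rlnorm(scenarios$sampleSize[v], meanlog = 0, sdlog = 1)
  list(dta = X,
       quantileLD = sapply(1:9, function(w)
         quantile(X, probs = scenarios$censoredRate[v], type = w)[[1]]))
})
scenarios$dta <- I(lapply(sims, `[[`, "dta"))
scenarios$quantileLD <- I(lapply(sims, `[[`, "quantileLD"))
A scenario v can then be referenced by row, e.g. scenarios[v, ], and its attached data via scenarios$dta[[v]].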
How can I run a function (in R) where some of the inputs are pulled from a list (or data frame)? Am I right in thinking that this would be more efficient than running a for-loop?
I am running simulations and want to change the variable values, but as they take a long time to run, I want them to run overnight and just tick through the different values automatically.
Here's the code for the function:
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000 ### REDUCED FOR THIS EXAMPLE ###
var.values <- NULL
var.values.pop <- NULL
hist.fn <- function(n,mu,sd,n.sub,iboot)
{
  Pop <- rnorm(n,mu,sd)
  var.pop <- var(Pop)
  Samp <- sample(Pop, n.sub, replace = FALSE)
  var.samp <- var(Samp)
  for(i in 1:isim) {
    for(j in 1:iboot) {
      Boot <- sample(Samp, n.sub, replace = TRUE)
      var.values[j] <- var(Boot)
    }
    Samp <- sample(Pop, n.sub, replace = FALSE)
    var.values.pop[i] <- var(Samp)
  }
  hist.pop <- hist(var.values.pop,plot=F)
  hist.boot <- hist(var.values,plot=F)
  #mypath = file.path("C:", "Output", paste("hist.boot_n.", n.sub, "_var.", sd^2, "_isim.", isim, "_iboot.", iboot, ".wmf", sep=""))
  #win.metafile(file=mypath)
  plot.new() #### ADDED FOR THIS EXAMPLE INSTEAD OF OUTPUTTING TO FILE ####
  plot(hist.pop, freq=FALSE, xlim=range(var.values.pop, var.values), ylim=range(hist.pop$density, hist.boot$density), main = paste("Histogram of variances \n n=",n.sub," mu=",mu,"var=",sd^2,"\n n.sim=",isim,"n.boot=",iboot,"\n"), cex.main=0.8, xlab="Variance", col="red")
  plot(hist.boot, freq=FALSE, col="blue", border="blue", add=T, density=20, angle=45)
  abline(v=var.pop, lty=2, col="black", lwd=2)
  legend("topright", legend=c("sample","bootstrap"),col=c("red","blue"),lty=1,lwd=2,bty="n",cex=0.7)
  #dev.off()
}
hist.fn(n,mu,sd,n.sub,iboot)
Then I want sd, n.sub, and iboot to change by running through the following values:
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
Perhaps something like this?
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
# hist.fn parameters: n,mu,sd,n.sub,iboot
params <- expand.grid(n = n, mu = mu, sd = sd,
n.sub = n.sub, iboot = iboot)
apply(params, 1, FUN = function(x) do.call(hist.fn, as.list(x) ) )
You probably want to put these:
var.values <- NULL
var.values.pop <- NULL
inside hist.fn, because assigning values to variables defined outside a function doesn't work the way you seem to think: assignments inside the function modify a local copy, not the global vectors.
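A minimal illustration of that scoping behaviour (hypothetical names, just to show the point):
var.values <- NULL
f <- function() {
  var.values[1] <- 42  # modifies a local copy, not the global var.values
  var.values
}
f()          # returns 42
var.values   # still NULL in the global environment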
You should use do.call, which will call the function with the arguments supplied as a list. I have simplified your example to run fewer loops. You can modify the print line in the script to monitor your progress on a larger job:
# The function
hist.fn <- function(n,mu,isim,sd,n.sub,iboot)
{
  Pop <- rnorm(n,mu,sd)
  var.pop <- var(Pop)
  Samp <- sample(Pop, n.sub, replace = FALSE)
  var.samp <- var(Samp)
  var.values <- NaN*seq(isim) # sets up an empty vector for results
  var.values.pop <- NaN*seq(isim) # sets up an empty vector for results
  for(i in seq(isim)) {
    for(j in seq(iboot)) {
      Boot <- sample(Samp, n.sub, replace = TRUE)
      var.values[j] <- var(Boot)
      print(paste("i =", i, "; j =", j))
    }
    Samp <- sample(Pop, n.sub, replace = FALSE)
    var.values.pop[i] <- var(Samp)
  }
  list(var.values=var.values, var.values.pop=var.values.pop) # returns results in the form of a list
}
# Global variables
n = 100
mu = 0
isim = 10
# Changing variables
sd <- c(1,10,20,30)
n.sub <- c(4,10,20,30)
iboot <- c(100,200,300,400)
df <- data.frame(sd=sd, n.sub=n.sub, iboot=iboot)
res <- vector(mode="list", nrow(df)) # sets up an empty list for results
for(i in seq(nrow(df))){
  res[[i]] <- do.call(hist.fn, c(n=n, mu=mu, isim=isim, df[i,]) )
}
res # show results
sd <- 1:3
n.sub <- 4:6
iboot <- 7:9
funct1<-function(x,y,z) print(x+y+z)
for (i in 1:length(sd)){
  funct1(sd[i],n.sub[i],iboot[i])
}
Just an example of doing it with a loop.
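For completeness, the same element-wise call can be written without an explicit loop, for example with mapply (a sketch using the toy funct1 above):
# calls funct1(sd[i], n.sub[i], iboot[i]) for each i, like the loop above
mapply(funct1, sd, n.sub, iboot)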