Increasing weighted mean by random sample in R

I have written code that randomly adds points to a numeric variable in order to increase its weighted mean score by 10%, storing the new scores in the variable S1.
This works by first calculating the total number of weighted points that must be added to raise the mean by 10%. The next step is to randomly select respondents until the weighted sum of their responses reaches the target, never adding points where the score is already 10 so that the maximum value on the scale is not exceeded. The final stage is to check whether the sum just above or just below the target is closer, and add points to that sample.
The code works, but it doesn't look efficient. I am an R novice and have read that loops should be avoided as much as possible, but I cannot work out an alternative. Is it possible to do what I am attempting, but more efficiently?
#Create random data
library(stats)
set.seed(21821)
ncust <- 1000
cust.df <- data.frame(cust.id=as.factor(c(1:ncust)))
wtvar <- rnorm(ncust, mean=1, sd=0.2)
V1 <- floor(rnorm(ncust, mean=7.5, sd=3))
V1[V1 > 10] <- 10
V1[V1 < 1] <- 1
cust.df$V1 <- V1
cust.df$wtvar <- wtvar
#Function to determine sample required
random.sample <- function() {
  # Percentage of resps (weighted) who need points added to make a 10% increase (computed but unused below)
  pctadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1) / sum(cust.df$V1[cust.df$V1 != 10]*cust.df$wtvar[cust.df$V1 != 10])
  # Sum of weights needed to make a 10% increase
  numadd <- sum(cust.df$V1*cust.df$wtvar)*0.1
  wgttot <- vector(mode="numeric", length=0)
  idtot <- vector(mode="numeric", length=0)
  id.ref <- cust.df$cust.id[cust.df$V1 != 10]   # exclude scores already at the maximum
  repeat {
    preidtot <- idtot
    prewgttot <- wgttot
    t.id <- as.numeric(sample(id.ref, 1))
    t.wgt <- cust.df$wtvar[cust.df$cust.id == t.id]
    id.ref <- id.ref[id.ref != t.id]            # sample without replacement
    wgttot <- c(wgttot, t.wgt)
    idtot <- c(idtot, t.id)
    if (sum(wgttot) > numadd) break
  }
  # Keep whichever running total (just below or just above the target) is closer
  prediff <- numadd - sum(prewgttot)
  postdiff <- sum(wgttot) - numadd
  if (prediff < postdiff) {
    x <- preidtot
  } else {
    x <- idtot
  }
  return(x)
}
tempids <- random.sample()
#Apply sample rule
cust.df$S1 = ifelse(cust.df$cust.id %in% tempids, cust.df$V1 + 1, cust.df$V1)
#Check ~10% increase achieved
weighted.mean(cust.df$V1,cust.df$wtvar)
weighted.mean(cust.df$S1,cust.df$wtvar)

random.sample is your original version and random.sample1 is the loop-free version. random.sample1 does a similar thing to random.sample, but their results differ: by your definition the sample required is not unique, so the weighted sums also differ, though both increase the mean by approximately 10%. You can check the code below to see how the result of random.sample1 is used.
#Create random data
library(stats)
set.seed(21821)
ncust <- 1000
cust.df <- data.frame(cust.id=as.factor(c(1:ncust)))
wtvar <- rnorm(ncust, mean=1, sd=0.2)
V1 <- floor(rnorm(ncust, mean=7.5, sd=3))
V1[V1 > 10] <- 10
V1[V1 < 1] <- 1
cust.df$V1 <- V1
cust.df$wtvar <- wtvar
#Function to determine sample required
random.sample <- function() {
  # Percentage of resps (weighted) who need points added to make a 10% increase (computed but unused below)
  pctadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1) / sum(cust.df$V1[cust.df$V1 != 10]*cust.df$wtvar[cust.df$V1 != 10])
  # Sum of weights needed to make a 10% increase
  numadd <- sum(cust.df$V1*cust.df$wtvar)*0.1
  wgttot <- vector(mode="numeric", length=0)
  idtot <- vector(mode="numeric", length=0)
  id.ref <- cust.df$cust.id[cust.df$V1 != 10]   # exclude scores already at the maximum
  repeat {
    preidtot <- idtot
    prewgttot <- wgttot
    t.id <- as.numeric(sample(id.ref, 1))
    t.wgt <- cust.df$wtvar[cust.df$cust.id == t.id]
    id.ref <- id.ref[id.ref != t.id]            # sample without replacement
    wgttot <- c(wgttot, t.wgt)
    idtot <- c(idtot, t.id)
    if (sum(wgttot) > numadd) break
  }
  # Keep whichever running total (just below or just above the target) is closer
  prediff <- numadd - sum(prewgttot)
  postdiff <- sum(wgttot) - numadd
  if (prediff < postdiff) {
    x <- preidtot
  } else {
    x <- idtot
  }
  return(x)
}
random.sample1 <- function() {
  # Sum of weights needed to make a 10% increase
  numadd <- sum(cust.df$V1 * cust.df$wtvar) * 0.1
  # Row positions of respondents who can still be incremented
  id.ref <- which(cust.df$V1 != 10)
  # Randomly permute those positions, then take the cumulative weight along the permutation
  pos <- sample(id.ref, length(id.ref))
  t.wgt <- cust.df$wtvar[pos]
  sumwgttot <- cumsum(t.wgt)
  # Cut the permutation at the point where the running total is closest to the target
  return(pos[1:which.min(abs(sumwgttot - numadd))])
}
system.time(tempids <- random.sample())
## On my computer, this takes about 0.200s.
system.time(tempids1 <- random.sample1())
## On my computer, the loop-free version takes about 0.000s.
#Apply sample rule
cust.df$S1 = ifelse(cust.df$cust.id %in% tempids, cust.df$V1 + 1, cust.df$V1)
## Note that tempids1 is used differently: indexing by row position
## is more efficient than the ifelse() lookup used for tempids.
cust.df$S2 = cust.df$V1
cust.df$S2[tempids1] = cust.df$V1[tempids1] + 1
#Check ~10% increase achieved
weighted.mean(cust.df$V1,cust.df$wtvar)
weighted.mean(cust.df$S1,cust.df$wtvar)
weighted.mean(cust.df$S2,cust.df$wtvar)
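As a quick sanity check (my addition, reusing the objects above), the relative increase achieved by each version can be computed directly; both should come out close to 0.1:
## Relative increase in the weighted mean for S1 and S2
weighted.mean(cust.df$S1, cust.df$wtvar) / weighted.mean(cust.df$V1, cust.df$wtvar) - 1
weighted.mean(cust.df$S2, cust.df$wtvar) / weighted.mean(cust.df$V1, cust.df$wtvar) - 1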

Related

How do I convolve() more than two distributions without doubling the result rowcount every time?

I am attempting to convolve() 36 beta distributions. The result distribution is one of the two input distributions to the next successive call to convolve(). After every convolve(), the result has row count = (nrow(vector1)+nrow(vector2)-1). In effect, the row count of the result distribution almost doubles with every call to convolve(). This is very inefficient - it makes runtime impossibly long and consumes large amounts of memory. Is there any way to keep the row count constant?
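To make the growth concrete, here is a one-line illustration (my addition): an open convolution of two length-n vectors returns length 2n - 1.
length(convolve(1:5, rev(1:5), type = "open"))  # 9, i.e. 5 + 5 - 1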
Code example below ...
# Function from https://stat.ethz.ch/pipermail/r-help/2008-July/168762.html
weighted.var <- function(x, w, na.rm = FALSE) {
  if (na.rm) {
    w <- w[i <- !is.na(x)]
    x <- x[i]
  }
  sum.w <- sum(w)
  sum.w2 <- sum(w^2)
  mean.w <- sum(x * w) / sum(w)
  (sum.w / (sum.w^2 - sum.w2)) * sum(w * (x - mean.w)^2, na.rm = na.rm)
}
# Define beta distribution shape parameters.
s1a <- 3.52; s1b <- 65.35
s2a <- 1.684; s2b <- 189.12
s3a <- 5.696; s3b <- 32.34
s4a <- 1.81; s4b <- 185.5
# Define initial set of quantiles.
mQ1 <- matrix(data=seq(0, 1, 1/1000), ncol=1)
for (i in 1:3) {
  mPDF <- matrix(data=convolve(dbeta(mQ1,s1a,s1b), rev(dbeta(mQ1,s2a,s2b)), type="open"), ncol=1L)
  print(paste(nrow(mPDF), ' rows', sep=''))
  if (i < 3) {
    # Calculate the merged shape parameters directly from mPDF.
    mQ2 <- matrix(data=seq(0, 1L, (1L/(nrow(mPDF)-1L))), ncol=1L)
    wtMean <- weighted.mean(mQ2, mPDF)
    wtStd <- sqrt(weighted.var(mQ2, mPDF))
    s1a <- -1L * ((wtMean*(wtStd^2 + wtMean^2 - wtMean))/wtStd^2)
    s1b <- ((wtStd^2 + wtMean^2 - wtMean)*(wtMean - 1))/wtStd^2
    s2a <- s3a; s2b <- s3b
    mQ1 <- mQ2
  }
} #i
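One possible way to keep the row count constant (a sketch I'm adding, not a definitive fix; it follows the same rescaling convention as the mQ2 line above) is to interpolate each convolution result back onto the original 1001-point grid with approx() before the next pass:
grid <- seq(0, 1, 1/1000)                                  # the original 1001-point grid
pdf1 <- dbeta(grid, 3.52, 65.35)
pdf2 <- dbeta(grid, 1.684, 189.12)
conv <- convolve(pdf1, rev(pdf2), type = "open")           # length 2*1001 - 1 = 2001
xconv <- seq(0, 1, length.out = length(conv))              # rescale the support to [0, 1], as mQ2 does
pdf.new <- approx(xconv, conv, xout = grid)$y              # interpolate back to 1001 points
pdf.new <- pdf.new / (sum(pdf.new) * (grid[2] - grid[1]))  # renormalise so the Riemann sum is 1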

Plot cumulative value for different series

I have run a short simulation and want to plot the outcomes of each simulation in terms of the "running sum" over parameter k. For reference, I want to end up with a plot that looks similar to the ones in this article:
https://www.pinnacle.com/en/betting-articles/Betting-Strategy/betting-bankroll-management/VDM2GY6UX3B552BG
The following is the code for the simulation:
## Simulating returns over k bets.
odds <- 1.5
k <- 100
return <- odds - 1
edge <- 0.04
pw <- 1/(odds/(1-edge))
pl <- 1-pw
nsims <- 10000
set.seed(42)
sims <- replicate(nsims, {
x <- sample(c(-1,return), k, TRUE, prob=c(pl, pw))
})
rownames(sims) <- c(1:k)
colnames(sims) <- c(1:nsims)
If I wasn't being clear in the description let me know.
Okay, so here is how you can achieve the plot of the cumulative value over bets (I set nsims <- 10 so that the plot is readable).
First I generate the data:
## Simulating returns over k bets.
odds <- 1.5
k <- 100
return <- odds - 1
edge <- 0.04
pw <- 1/(odds/(1-edge))
pl <- 1-pw
nsims <- 10
set.seed(42)
sims <- replicate(nsims, {
x <- sample(c(-1,return), k, TRUE, prob=c(pl, pw))
})
rownames(sims) <- c(1:k)
colnames(sims) <- c(1:nsims)
Then I create a data frame containing the results of the n simulations (10 here):
df <- as.data.frame(sims)
What we want to plot is the cumulative sum, not the result of a specific bet, so we iterate through the columns (i.e. the simulations) to compute that value:
library(dplyr)    # for mutate()
library(reshape2) # for melt()
library(ggplot2)
for (i in colnames(df)){
  df[[i]] <- cumsum(df[[i]])
}
# as.numeric so the x-axis is ordered 1, 2, 3, ... rather than alphabetically
df <- mutate(df, bets = as.numeric(rownames(df)))
output <- melt(df, id.vars = "bets", variable.name = 'simulation')
Now we can plot our data:
ggplot(output, aes(bets,value,group=simulation)) + geom_line(aes(colour = simulation))
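As an aside (my addition), the cumsum loop above can be replaced by a single vectorized call that gives the same result, assuming the same sims matrix:
## Column-wise cumulative sums in one step, equivalent to the for loop
df <- as.data.frame(apply(sims, 2, cumsum))
df$bets <- as.numeric(rownames(df))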

How to create a loop to generate increasing sample sizes in a simulation

I'm trying to create a simulation to calculate the confidence interval for a binomial proportion. So far I have a function that calculates the lower and upper bounds and I have generated and stored the type of data I want (in a matrix, I'm not sure about that).
How can I create a loop that generates samples of different sizes? I'd like to test how the formula performs when calculating the intervals with sample sizes n = 10, 11, 12, ... up to 100.
My code so far:
## functions that calculate lower and upper bounds
ll <- function(x, cl=0.95) {
  n <- length(x)
  p.est <- mean(x)
  z <- abs(qnorm((1-cl)/2))
  return(p.est - z*sqrt(p.est*(1-p.est)/n))
}
ul <- function(x, cl=0.95) {
  n <- length(x)
  p.est <- mean(x)
  z <- abs(qnorm((1-cl)/2))
  return(p.est + z*sqrt(p.est*(1-p.est)/n))
}
## my simulation for n=10 and 200 repetitions.
p <- 0.4
n <- 10
rep <- 200
dat <- rbinom(rep*n,1,p)
x <- matrix(dat, ncol=rep)
ll.res <- apply(x, 2, ll)
ul.res <- apply(x, 2, ul)
hits <- ll.res <= p & p <= ul.res
sum(hits==1)/rep
I'm not sure which values you want to compare between the different sample sizes, but wrapping your simulation in a for loop and using lists to store the results should work:
p <- 0.4
nrep <- 200
hits <- list()
value <- numeric(0)
ll.res <- list()
ul.res <- list()
ns <- 10:100
for (i in 1:length(ns)) {
  n <- ns[i]
  dat <- rbinom(nrep*n, 1, p)
  x <- matrix(dat, ncol=nrep)           # one simulated sample of size n per column
  ll.res[[i]] <- apply(x, 2, ll)
  ul.res[[i]] <- apply(x, 2, ul, cl=0.95)
  hits[[i]] <- ll.res[[i]] <= p & p <= ul.res[[i]]
  value[i] <- sum(hits[[i]]==1)/nrep    # coverage for sample size ns[i]
}
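To see how the empirical coverage behaves as n grows, a quick follow-up plot (my addition, reusing ns and value from the loop above):
## Coverage by sample size; the dashed line marks the nominal 0.95 level
plot(ns, value, type = "l", xlab = "sample size n", ylab = "empirical coverage")
abline(h = 0.95, lty = 2)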

replace NA by truncated normal distribution values in r

I am trying to replace NAs by truncated normal distribution values.
First I used sample as follows and the function worked:
v.new <- replace(vector,v, sample(8,length(v),replace =FALSE))
However, when I try to use rtnorm it does not seem to work. I don't get any error messages, but it takes ages to replace the NAs with values from the desired interval. Any suggestion to make this work?
library(msm)
# Some data
data("airquality")
airquality$Ozone
# My function
add.trunc.to.NAvector <- function(vector){
  v <- NULL
  for (i in 1:length(vector)) {
    if (is.na(vector[i]))
      v <- append(v, i)
  }
  mean.val <- mean(vector)
  sd.val <- sd(vector)
  min.val <- mean.val - 4 * sd.val
  max.val <- mean.val + 4 * sd.val
  v.new <- replace(vector, v, rtnorm(length(v), lower = min.val, upper = max.val))
  return(v.new)
}
Shouldn't this work? Note that mean(vector) and sd(vector) return NA when the vector contains NAs, so min.val and max.val in your function end up as NA; computing them from the non-missing values avoids that:
v <- airquality$Ozone
v.new <- v
indices <- which(is.na(v))
m <- mean(v[-indices])
s <- sd(v[-indices])
# Pass mean and sd explicitly; rtnorm defaults to mean = 0, sd = 1 otherwise
v.new[indices] <- rtnorm(length(indices), mean = m, sd = s, lower = m - 4*s, upper = m + 4*s)
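A quick check of the imputation (my addition):
summary(v)      # original vector; note the NA count
summary(v.new)  # NAs replaced; imputed values should lie between m - 4*s and m + 4*s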

Extracting row and column in R

I have to write a one-sample proportion Z-test function in R, where the sample proportion is the proportion of data in the first factor level.
For example,
data <- factor(c(NA, rep("a", 60), rep("b", 40)))
table(data)
a b
60 40
And I need the sample proportion to be 60/100. Here is a portion of my code; it returns an error saying unexpected symbol in mtab <- addmargins(table(data)).
hyp_test <- function(data, hyp_val=NULL, alpha, alternative="two-sided",graph=FALSE) {
n <- sum(!is.na(data))
ifelse(is.factor(data),
mtab <- addmargins(table(data))
phat <- mtab[1]/mtab[3]
qhat <- 1 - phat
if(length(hyp_val) > 0) {
q <- 1-hyp_val
SE.phat <- sqrt((hyp_val*q)/n)
ts.z <- (phat - hyp_val)/SE.phat
p.val <- pnorm(ts.z)*2
if(alternative=="less") {
p.val <- pnorm(ts.z)
}
if(alternative=="greater") {
p.val <- 1 - p.val
}
}
Any help would be much appreciated. I basically need to find out how to compute the sample proportion.
In addition to what r2evans states, you should review if statements and pnorm. This is a guesstimate of what you are trying to accomplish since the code is cut off.
hyp_test <- function(data, hyp_val=NULL, alpha, alternative="two-sided", graph=FALSE) {
  n <- sum(!is.na(data))
  mtab <- addmargins(table(data))
  phat <- mtab[1]/mtab[3]   # proportion in the first factor level
  qhat <- 1 - phat
  q <- 1 - hyp_val
  SE.phat <- sqrt((hyp_val*q)/n)
  ts.z <- (phat - hyp_val)/SE.phat
  # p-values come from the normal CDF (pnorm), not the density (dnorm)
  p.val <- ifelse(alternative=="two-sided", 2*pnorm(-abs(ts.z)),
                  ifelse(alternative=="less", pnorm(ts.z), 1 - pnorm(ts.z)))
  if (graph==TRUE) {plot(...)}
  return(p.val)
}
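A usage sketch with the factor data from the question (my addition; note that alpha has no default, so it must be supplied even though this version does not use it):
data <- factor(c(NA, rep("a", 60), rep("b", 40)))
hyp_test(data, hyp_val = 0.5, alpha = 0.05, alternative = "two-sided")
## phat = 60/100 = 0.6 and ts.z = 2, so the two-sided p-value is about 0.0455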
