I have written a function to create a simulation that demonstrates the central limit theorem. I'm not sure if it's possible, or if I am better off just making separate functions, but currently it only stores the data frame containing the mean values of all the trials.
# create function to perform CLT simulation
# where n = vector of sample sizes, t = vector of trial counts, pop = the population being sampled, popmean = population mean, popsd = population standard deviation, poptitle = population name used in the plot title
cltsim <- function(n, t, pop, popmean, popsd, poptitle){
  popsim <- data.frame()
  # Run the simulation
  for(i in n) { # for each value of n
    col <- c()
    for(j in t) { # loop through each value of t
      trial <- 1:j
      counter <- j # set up an egg timer based on whichever t value we're on
      value <- c()
      while(counter > 0) { # and extract n samples from the population
        bucket <- sample(pop, i, replace = TRUE)
        xbar <- mean(bucket) # calculate the sample mean
        value <- c(value, xbar) # and add it to a vector
        counter <- counter - 1 # egg timer counts down and loops back until it hits 0
      }
      sbar <- sd(value) # calculate the sample standard deviation
      col <- cbind(trial, value, sbar, i, j) # merge all info together
      popsim <- rbind(popsim, col) # attach it to the empty data frame
    }
  }
  # clean up so just the finished data frame is left
  rm(col, bucket, value, counter, i, j, n, sbar, t, xbar, trial)
  # tidy up data frame in order to graph it
  names(popsim) <- c("trial#", "value", "sdev", "samples", "trials")
  # view the rows of data in the popsim data table
  popsim
}
When I try to add any more code that creates data tables, the function doesn't store them. Below are the blocks of code I wish to add to the function:
g1 <- ggplot(popsim, aes(x = value)) + geom_density(fill = "#09AB30") +
facet_grid(samples ~ trials, labeller = label_both) +
ggtitle(paste("Demonstrating The Central Limit Theorem with Simulation using", poptitle)) +
geom_vline(xintercept = popmean, linetype = "dashed")
g1
and
#create data frame of simulated sample standard deviations
sdmatrix <- matrix(unique(popsim$sdev), nrow = 4, ncol = 4)
sdf <- as.data.frame(sdmatrix, row.names = c("t10", "t100", "t1000", "t10000"))
names(sdf) <- c("s1", "s10", "s30", "s50")
sdf <- t(sdf)
rm(sdmatrix)
sdf
exvals <- pop1sd/sqrt(c(1, 10, 30, 50))
dfex <- as.data.frame(exvals, row.names = c("s1", "s10", "s30", "s50"))
names(dfex) <- "Predicted Standard Deviations"
dfex
I've had a look around and I can't find a solution anywhere. Am I better off just writing different functions for them? Any advice or input on how to make this lot of code more effective/efficient would be greatly appreciated.
Thanks in advance.
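One way to keep everything in one function, rather than writing separate ones, is to build each object inside cltsim and return them together in a named list; a rough sketch (assuming ggplot2 is loaded, reusing the popsim, g1, sdf and dfex code above, and using the popsd argument where the snippet above had pop1sd):
cltsim <- function(n, t, pop, popmean, popsd, poptitle){
  # ... simulation loops that build popsim, exactly as above ...
  g1 <- ggplot(popsim, aes(x = value)) + geom_density(fill = "#09AB30") +
    facet_grid(samples ~ trials, labeller = label_both) +
    ggtitle(paste("Demonstrating The Central Limit Theorem with Simulation using", poptitle)) +
    geom_vline(xintercept = popmean, linetype = "dashed")
  # ... build sdf and dfex as above, with popsd in place of pop1sd ...
  list(data = popsim, plot = g1, sds = sdf, predicted = dfex)
}
# Illustrative call, with values consistent with the s1/s10/s30/s50 and t10/.../t10000 labels above
res <- cltsim(n = c(1, 10, 30, 50), t = c(10, 100, 1000, 10000),
              pop = rnorm(1e5), popmean = 0, popsd = 1, poptitle = "a Normal Population")
res$plot # draws the graph
res$data # the simulated means
Only the returned list survives the function call, so anything you want to keep (the data frame, the plot, the tables of standard deviations) has to be part of that return value.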
I have some functional data as an fda object. I have taken the first derivative and now want the coordinates of the maximum value of each individual curve. How is this possible?
For better understanding I include some fictional data I took from here:
library(fdaoutlier);
library(fda);
set.seed(95139);
n_obs <- 50;
n_curves <- 100
mod4 <- simulation_model4(n = n_curves, p = n_obs, outlier_rate = .5, seed = 50, plot = FALSE)
index1 <- mod4$true_outliers;
curves_mat <- mod4$data;
n_order = 4;
knots = c(seq(0,n_obs,5))
n_basis = length(knots) + n_order - 2;
spline_basis = create.bspline.basis(rangeval = c(0,n_obs), nbasis = n_basis, norder = n_order)
df1 <- curves_mat[index1,]
df1_obj <- Data2fd(argvals = 1:n_obs, y = t(df1), basisobj = spline_basis, lambda = 0.5)
So, how can I get the coordinates of the maximum value of each single curve of df1_obj?
Some kind of workaround, maybe somebody can add a better solution:
eval.fd() gives a discrete representation of the curves, so one can take the maximum of each.
fine_df1 <- eval.fd(seq(0, 50, length = 500), df1_obj) # evaluate the curves on a fine grid
max_df1 <- array(NA, 2)
for(c in c(1:dim(fine_df1)[2])){ # loop over the curves (columns)
  cur <- fine_df1[, c]
  m <- max(cur)
  i <- which(cur %in% m) # index of the maximum on the evaluation grid (not the argument value itself)
  max_df1 <- rbind(max_df1, c(i, m))
}
max_df1 <- max_df1[2:dim(max_df1)[1], ] # drop the initial NA row
plot(max_df1)
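For what it's worth, the loop above can be replaced by a couple of vectorised calls; a sketch that assumes the same fine_df1 evaluation grid and also maps the grid index back onto the argument scale:
grid <- seq(0, 50, length = 500)       # the evaluation points used above
idx  <- apply(fine_df1, 2, which.max)  # row index of each curve's maximum
max_xy <- cbind(x = grid[idx],
                y = fine_df1[cbind(idx, seq_len(ncol(fine_df1)))])
plot(max_xy)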
I want to create a comparison of normality tests with the Shapiro-Wilk, Kolmogorov-Smirnov, Anderson-Darling, Cramér-von Mises and Adjusted Jarque-Bera methods, based on the power of the test (1 - beta) for sample sizes n = 10, 20, 30, 40 and 50.
testnormal=function(n,m,alfa)
{
  require(nortest)
  require(normtest)
  require(xlsx)
  pvalue=matrix(0,m,5)
  decision=matrix(0,m,5)
  for (i in 1:m)
  {
    data=runif(n,2,5)
    test1=shapiro.test(data)
    pv1=test1$p.value
    pvalue[i,1]=pv1
    if (pv1<alfa)
    {
      decision[i,1]=1
    }
    test2=ks.test(data,"pnorm",mean=mean(data),sd=sd(data))
    pv2=test2$p.value
    pvalue[i,2]=pv2
    if (pv2<alfa)
    {
      decision[i,2]=1
    }
    test3=ad.test(data)
    pv3=test3$p.value
    pvalue[i,3]=pv3
    if (pv3<alfa)
    {
      decision[i,3]=1
    }
    test4=cvm.test(data)
    pv4=test4$p.value
    pvalue[i,4]=pv4
    if (pv4<alfa)
    {
      decision[i,4]=1
    }
    test5=ajb.norm.test(data)
    pv5=test5$p.value
    pvalue[i,5]=pv5
    if (pv5<alfa)
    {
      decision[i,5]=1
    }
  }
  result1=data.frame(pvalue)
  result2=data.frame(decision)
  colnames(result1)=c("SW","KS","AD","CvM","AJB")
  colnames(result2)=c("SW","KS","AD","CvM","AJB")
  write.xlsx(result1,"testnormal_pvalue.xlsx")
  write.xlsx(result2,"testnormal_decision.xlsx")
  one_min_beta=t(colSums(decision)/m) # power (1 - beta) is the proportion of rejections across the m replications
  test.of.power=data.frame(one_min_beta)
  colnames(test.of.power)=c("SW","KS","AD","CvM","AJB")
  return(test.of.power)
}
simulation=testnormal(10,100,0.05)
simulation2=testnormal(20,100,0.05)
simulation3=testnormal(30,100,0.05)
simulation4=testnormal(40,100,0.05)
simulation5=testnormal(50,100,0.05)
output=rbind(simulation,simulation2,simulation3,simulation4,simulation5)
output
I want to graph the power of the tests to see how the power trends up or down with sample size. Can anyone help, please?
I went through your code and rewrote it along the way to better understand what you want (what is the Excel stuff for?). I have broken it down into smaller functions to give you more control in these kinds of simulation studies. The code is not particularly efficient.
But does this give you what you want?
library("nortest")
library("normtest")
library("dplyr")
library("ggplot2")
# Function for doing all tests and putting it into a data.frame
tests <- function(data) {
  list_of_tests <- list(
    SW = shapiro.test(data),
    KS = ks.test(data, pnorm, mean = mean(data), sd = sd(data)),
    AD = ad.test(data),
    CMV = cvm.test(data),
    AJB = ajb.norm.test(data)
  )
  # Combine to tibble
  res <- bind_rows(lapply(list_of_tests, unclass))
  res[c("method", "p.value")] # Keep only method and p-value cols
}
# Test it with e.g. 'tests(data = runif(8, 2, 5))'
# Function for repeated simulation and testing, combine results and derive power
testnormal <- function(n, m, alpha) {
  # Important that runif is inside replicate
  test_res <-
    bind_rows(replicate(tests(data = runif(n, 2, 5)), n = m,
                        simplify = FALSE))
  test_of_powers <-
    test_res %>%
    group_by(method) %>%
    summarize(power = mean(p.value < alpha)) %>%
    mutate(n = n, m = m, alpha = alpha)
  return(test_of_powers)
}
# Repeat over a number of simulations:
sims <- expand.grid(n = c(10, 20, 30, 40, 50),
m = 1000,
alpha = 0.05)
output <- bind_rows(
mapply(testnormal, n = sims$n, m = sims$m, alpha = sims$alpha,
SIMPLIFY = FALSE)
)
Actually doing the plot:
# Plot it
ggplot(output, aes(x = n, y = power, col = method)) +
geom_line()
This way should make it easier to plot, as well as to run simulations over other grids of values (e.g. varying alpha) or to expand your range of n, etc.
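For example, the same machinery runs over a grid that also varies alpha (a sketch reusing testnormal and the packages loaded above):
sims2 <- expand.grid(n = c(10, 20, 30, 40, 50),
                     m = 1000,
                     alpha = c(0.01, 0.05, 0.10))
output2 <- bind_rows(
  mapply(testnormal, n = sims2$n, m = sims2$m, alpha = sims2$alpha,
         SIMPLIFY = FALSE)
)
ggplot(output2, aes(x = n, y = power, col = method)) +
  geom_line() +
  facet_wrap(~ alpha)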
What I would like to do is create several boxplots (all displayed in a single plot) using only certain values of my original data frame.
My data frame looks as follows:
(screenshot of the data frame)
So now I want R to visualise Parameter ~ Station (the Parameters are all the variables coloured green and Station is the station id).
Is there a way to tell R that I want all my Parameters on the x-axis ONLY for BB0028, for example? That would mean that only the first 6 values of mean_area, mean_area_exc, esd, feret, min and max are taken into account in the boxplot.
That would look like this:
(screenshot of the desired combined boxplot)
I tried a very complicated way, adding single boxplots one by one, but I am sure there must be a simpler way.
This is what I tried:
bb28 <- df[c(1:6),]
bb28area <- boxplot(bb28$mean_area ~ bb28$BBnr)
bb28area_exc <- boxplot(bb28$mean_area_exc ~ bb28$BBnr)
bb28esd <- boxplot(bb28$mean_esd ~ bb28$BBnr)
bb28feret <- boxplot(bb28$mean_feret ~ bb28$BBnr)
bb28min <- boxplot(bb28$mean_min ~ bb28$BBnr)
bb28max <- boxplot(bb28$mean_max ~ bb28$BBnr)
boxplot(bb28$mean_area ~ bb28$BBnr)
boxplot(bb28$mean_area_exc ~ bb28$BBnr, add=TRUE, at = 1:1+0.45)
Also, it doesn't look very nice because the x-axis does not adjust to the added boxplot, which is then cut off:
(screenshot of the cut-off plot)
I hope you can help me with a simple, proper way to get my plot.
Thank you!
Cheers, Merle
Maybe the function multi.boxplot below is what you are looking for. It uses base R only.
Data.
First, make up a dataset, since you have not provided us with one in a copy&paste friendly format.
set.seed(1234)
n <- 50
BBnr <- sort(sprintf("BB%04d", sample(28:30, n, TRUE)))
bb28 <- data.frame(col1 = 1:n, col2 = n:1, BBnr = BBnr)
tmp <- matrix(runif(3*n), ncol = 3)
colnames(tmp) <- paste("mean", c("this", "that", "other"), sep = "_")
bb28 <- cbind(bb28, tmp)
rm(BBnr, tmp)
Code.
multi.boxplot <- function(x, by, col=0, ...){
  x <- as.data.frame(x)
  uniq.by <- unique(by)
  len <- length(uniq.by) - 1
  n <- ncol(x)
  n1 <- n + 1
  col <- rep(col, n)[seq_len(n)]
  boxplot(x[[ 1 ]] ~ by, at = 0:len*n1 + 1,
          xlim = c(0, (len + 1)*n1), ylim = range(unlist(x)), xaxt = "n", col=col[1], ...)
  for(i in seq_len(n)[-1])
    boxplot(x[[i]] ~ by, at = 0:len*n1 + i, xaxt = "n", add = TRUE, col=col[i], ...)
  axis(1, at = 0:len*n1 + n1/2, labels = uniq.by, tick = TRUE)
}
inx <- grep("mean", names(bb28))
multi.boxplot(bb28[inx], by = bb28$BBnr, col = rainbow(length(inx)))
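For comparison, the same grouped view can also be produced with ggplot2 after reshaping to long format (a sketch, assuming the example bb28 data and inx defined above):
library(ggplot2)
long <- data.frame(BBnr = rep(bb28$BBnr, length(inx)), stack(bb28[inx]))
ggplot(long, aes(x = BBnr, y = values, fill = ind)) +
  geom_boxplot()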
I am making various ggplots from a very large dataset (much larger than the example below). I created a binning function on both the x- and y-axes to enable plotting of such a large dataset.
In the following example, memory.size() is recorded at the start. Then the large dataset is simulated as dt. dt's x2 is plotted against x1 with binning. Plotting is repeated with different subsets of dt. The size of the plotted objects is checked by object.size() and stored. After the plot objects have been created, rm(dt) is executed, followed by a double gc(). At this point, memory.size() is recorded again. Finally, the memory.size() at the end is compared to that at the beginning and printed.
In view of the small size of the plotted objects, the memory.size() at the end is expected to be similar to that at the beginning. But no: memory.size() does not go down again until I start a new R session.
REPRODUCIBLE EXAMPLE
library(data.table)
library(ggplot2)
library(magrittr)
# The binning function
# x = column name for x-axis (character)
# y = column name for y-axis (character)
# xNItv = Number of bin for x-axis
# yNItv = Number of bin for y-axis
# Value: A binned data.table
tab_by_bin_idxy <- function(dt, x, y, xNItv, yNItv) {
  # Binning
  xBreaks = dt[, seq(min(get(x), na.rm = T), max(get(x), na.rm = T), length.out = xNItv + 1)]
  yBreaks = dt[, seq(min(get(y), na.rm = T), max(get(y), na.rm = T), length.out = yNItv + 1)]
  xbinCode = dt[, .bincode(get(x), breaks = xBreaks, include.lowest = T)]
  xbinMid = sapply(seq(xNItv), function(i) {return(mean(xBreaks[c(i, i+1)]))})[xbinCode]
  ybinCode = dt[, .bincode(get(y), breaks = yBreaks, include.lowest = T)]
  ybinMid = sapply(seq(yNItv), function(i) {return(mean(yBreaks[c(i, i+1)]))})[ybinCode]
  # Creating table
  tab_match = CJ(xbinCode = seq(xNItv), ybinCode = seq(yNItv))
  tab_plot = data.table(xbinCode, xbinMid, ybinCode, ybinMid)[
    tab_match, .(xbinMid = xbinMid[1], ybinMid = ybinMid[1], N = .N), keyby = .EACHI, on = c("xbinCode", "ybinCode")
  ]
  # Returning table
  return(tab_plot)
}
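As an aside, a quick sanity check of the binning helper on a small table could look like this (the column names a and b are made up purely for illustration):
small <- data.table(a = runif(1e4), b = runif(1e4))
head(tab_by_bin_idxy(small, x = "a", y = "b", xNItv = 10, yNItv = 10))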
before.mem.size <- memory.size()
# Simulation of dataset
nrow <- 6e5
ncol <- 60
dt <- do.call(data.table, lapply(seq(ncol), function(i) {return(runif(nrow))}) %>% set_names(paste0("x", seq(ncol))))
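As an aside, dt itself is fairly large, which helps put the memory figures below in context: roughly 6e5 rows x 60 columns x 8 bytes per double, i.e. about 275 MB.
print(object.size(dt), units = "MB") # ~275 MB for the simulated data alone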
# Graph plotting
dummyEnv <- new.env()
with(dummyEnv, {
  fcn <- function(tab) {
    binned.dt <- tab_by_bin_idxy(dt = tab, x = "x1", y = "x2", xNItv = 50, yNItv = 50)
    plot <- ggplot(binned.dt, aes(x = xbinMid, y = ybinMid)) + geom_point(aes(size = N))
    return(plot)
  }
  lst_plots <- list(
    plot1 = fcn(dt),
    plot2 = fcn(dt[x1 <= 0.7]),
    plot3 = fcn(dt[x5 <= 0.3])
  )
  assign("size.of.plots", object.size(lst_plots), envir = .GlobalEnv)
})
rm(dummyEnv)
# After use, remove and clean up of dataset
rm(dt)
gc();gc()
after.mem.size <- memory.size()
# Memory reports
print(paste0("before.mem.size = ", before.mem.size))
print(paste0("after.mem.size = ", after.mem.size))
print(paste0("plot.objs.size = ", size.of.plots / 1000000))
I have tried the following modifications to the code:
Inside fcn, removing ggplot and returning a NULL instead of a plot object: The memory leakage is totally gone. But this is not a solution. I need the plot.
The fewer plots requested / fewer columns / fewer rows passed to fcn, the smaller the memory leakage.
Memory leakage also exists if I make no subsets and create only one plot object (in the example, I plotted 3).
After the process, even after I call rm(list = ls()), the memory is still non-recoverable.
I wish to know why this happens and how to get rid of it without compromising my need to do binned plots and subset dt to make different plots.
Thanks for attention!
I have two vectors. I need to find the intersection between these two, and do a nice plot of it.
So, here is a very simple data frame example:
df <- data.frame( id <- c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
p <-c(5,7,9,11,13,15,17,19,21,23,20,18,16,14,12,10,8,6,4,2 ),
q <-c(3,5,7,13,19,31,37,53,61,67,6,18,20,24,40,46,66,70,76,78))
colnames(df) <- c("id","price","quantity")
supply <- df[df$id == 1,]
demand <- df[df$id == 2,]
plot( x = supply$quantity, y = supply$price, type = "l", ylab = "price", xlab = "quantity")
lines(x = demand$quantity , y = demand$price, type = "l")
grid()
Now, I can plot them and find the intersection manually, but can you make R calculate the intersection between these two lines?
The data can take huge jumps, and the lines can go from very steep to nearly horizontal.
Be careful creating your data frame. You want =, not <-. Also, make id a factor, for clarity.
df <- data.frame(
id = factor(rep(c("supply", "demand"), each = 10)),
price = c(5,7,9,11,13,15,17,19,21,23,20,18,16,14,12,10,8,6,4,2 ),
quantity = c(3,5,7,13,19,31,37,53,61,67,6,18,20,24,40,46,66,70,76,78)
)
First we define a common, closely spaced set of quantity points to evaluate at.
quantity_points <- with(
df,
seq(min(quantity), max(quantity), length.out = 500)
)
Now split the dataset into supply/demand parts.
by_id <- split(df[, c("price", "quantity")], df$id)
Then we use approx to calculate the price at each of these quantities, for supply and demand separately.
interpolated_price <- lapply(
  by_id,
  function(x)
  {
    with(
      x,
      approx(
        quantity,
        price,
        xout = quantity_points
      )
    )$y
  }
)
Finally, the crossing point is where the absolute value of the supply price minus the demand price is minimised.
index_of_equality <- with(interpolated_price, which.min(abs(supply - demand)))
quantity_points[index_of_equality]
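To put the crossing point back on the plot the question started with, a small follow-up sketch using the objects defined above:
eq_quantity <- quantity_points[index_of_equality]
eq_price    <- interpolated_price$supply[index_of_equality]
plot(price ~ quantity, data = df[df$id == "supply", ], type = "l",
     xlim = range(df$quantity), ylim = range(df$price),
     xlab = "quantity", ylab = "price")
lines(price ~ quantity, data = df[df$id == "demand", ])
points(eq_quantity, eq_price, pch = 19, col = "red") # the interpolated intersection
grid()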