I generate a network with npeople (= 80) people and ncomp (= 4) components, and I want each component to have a density equal to dens (= 0.2).
I want to optimize the 2 lines of the code that take most of the time (especially if I want to have 5k people in the network).
The 2 lines are:
# adjust probability to keep density
nodes[,p:= as.numeric(min(c(1, p * (1/(mean(nodes$p) / c.dens))))), by = c("ID","ALTERID")]
# simulate edges
nodes[, edge := sample(c(0,1),1, prob = c(1-p,p)), by = c("ID","ALTERID")]
I have tried using the lapply() function, but the execution time increased; see the line of code below:
nodes[,lapply(.SD, function(p) min(c(1, p * (1/(mean(nodes$p) / c.dens))))), by = c("ID","ALTERID")]
rm(list=ls())
library(data.table)
library(intergraph)
library(igraph)
library(Matrix)
library(profvis)
library(ggplot2)
# draw a variable correlated with var1 at level rho, using a Cholesky factor
draw.var <- function(n, var1, rho, mean){
C <- matrix(rho, nrow = 2, ncol = 2)
diag(C) <- 1
C <- chol(C) # Cholesky factor of the 2x2 correlation matrix
S <- rnorm(n, mean = mean)
S <- cbind(scale(var1)[1:n],S)
ZS <- S %*% C # mix the columns to induce the target correlation
return(ZS[,2])
}
set.seed(1123)
profvis({
# create empty list to store data
dt.list <- list()
npeople <- 500
dens <- .2
OC.impact <- FALSE
cor_iv_si <- .6
cor_iv_uc <- 0
cor_uc_oc <- 0.6
ncomp <- 4
beta_oc <- 2 # observed characteristics
beta_uc <- 2 # unobserved characteristics
beta_si <- 1
# create data.table
dt.people <- data.table(ego = 1:npeople)
# draw observed characteristics
dt.people[, OC := abs(rt(npeople,2))]
# draw unobserved variable
dt.people[, UC := draw.var(npeople, dt.people$OC, rho = cor_uc_oc,mean = 5)]
# set component identifier
dt.people$group <- cut_number(dt.people$UC, ncomp,labels = F)
for(q in 1:ncomp){
# subset comp
dt.sub <- dt.people[group == q]
# create undirected graph
nodes <- as.data.table(t(combn(dt.sub$ego, 2)))
setnames(nodes,c("ID","ALTERID"))
# add attributes
nodes <- merge(nodes,dt.people[,list(ID = ego, ID.UC = UC, ID.OC = OC)], by = "ID")
nodes <- merge(nodes,dt.people[,list(ALTERID = ego, ALTERID.UC = UC, ALTERID.OC = OC)], by = "ALTERID")
# calculate distance
nodes[,d := abs(ID.UC - ALTERID.UC)]
# estimate the appropriate density per component
n.edges <- (dens * (npeople * (npeople - 1)))/ncomp
n.nodes <- npeople/ncomp
c.dens <- n.edges/(n.nodes * (n.nodes - 1))
# estimate initial probability of tie based on distance
coefficient <- log(c.dens / (1 - c.dens))
alpha <- coefficient / mean(nodes$d)
nodes[,p := exp(alpha * d) / (1 + exp(alpha * d))]
# adjust probability to keep density
nodes[,p:= as.numeric(min(c(1, p * (1/(mean(nodes$p) / c.dens))))), by = c("ID","ALTERID")]
# simulate edges
nodes[, edge := sample(c(0,1),1, prob = c(1-p,p)), by = c("ID","ALTERID")]
# keep the edges
nodes <- nodes[edge == 1,list(ID,ALTERID)]
# bind the networks
if(q == 1){
net <- copy(nodes)
} else{
net <- rbind(net,nodes)
}
}
# create opposite direction
net <- rbind(net,net[,list(ID = ALTERID, ALTERID = ID)])
})
This incorporates @BenBolker's and @DavidArenburg's suggestions and also uses some of data.table's tools.
Non-Equi joins
The OP's code loops through each group. One part of the code also uses combn and multiple joins to get the data into the right format. Using non-equi joins, we can combine all of those steps in one data.table call:
dt_non_sub <- dt.people[dt.people,
on = .(ego < ego, group = group),
allow.cartesian = T,
nomatch = 0L,
.(group,
ALTERID = i.ego, ID = x.ego,
ID.UC = UC, ID.OC = OC,
ALTERID.OC = i.OC, ALTERID.UC = i.UC,
d = abs(UC - i.UC)) #added to be more efficient
]
# dt_non_sub[, d:= abs(ID.UC - ALTERID.UC)]
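If non-equi self-joins are new to you, here is a tiny self-contained sketch (toy data, my own addition) of the same pattern: all unordered pairs of egos within each group, replacing a per-group combn():
library(data.table)
toy <- data.table(ego = 1:4, group = c(1, 1, 2, 2))
# self-join on equal group, keeping only pairs where the x ego precedes the i ego
toy[toy, on = .(group, ego < ego), nomatch = 0L,
    .(group, ID = x.ego, ALTERID = i.ego)]
#    group ID ALTERID
# 1:     1  1       2
# 2:     2  3       4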
Vectorization
The original code was mostly slow because of the two calls with by groupings. Since each call split the data.table into around 8,000 individual groups, there were 8,000 function calls each time. This version eliminates those by using pmin as suggested by @DavidArenburg and then uses runif(N) < p as suggested by @BenBolker. My addition: since your final result doesn't seem to care about p, I only assign edge, using {} so that only the last expression evaluated in the call is returned.
# alpha <- coefficient / mean(nodes$d)
dt_non_sub[,
edge := {
alpha = coefficient / mean(d)
p = exp(alpha * d) / (1 + exp(alpha * d))
p_mean = mean(p)
p = pmin(1, p * (1/(p_mean / c.dens)))
as.numeric(runif(.N)<p)
}
, by = .(group)]
net2 <- rbindlist(list(dt_non_sub[edge == 1, .(group, ALTERID, ID)],
                       dt_non_sub[edge == 1, .(group, ID = ALTERID, ALTERID = ID)]))
One thing to note is that the vectorization is not 100% identical. Your code was recursive: each by-group update changed mean(nodes$p) for the next ID, ALTERID group. If you need that recursive part of the call, there is not much that can be done to make it faster.
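To see the difference on a toy vector (my own sketch; target stands in for c.dens):
# recursive, as in the original: each update changes the running mean
target <- 0.2
p <- c(0.2, 0.4, 0.6)
for (i in seq_along(p)) p[i] <- min(1, p[i] * target / mean(p))
p
# vectorized, as in this answer: one fixed mean for every element
p2 <- c(0.2, 0.4, 0.6)
pmin(1, p2 * target / mean(p2))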
In the end, the modified code runs in 20 ms vs. the 810 ms of your original function. The results, while different, are similar in the total number of edges:
Original:
net
ID ALTERID
1: 5 10
2: 10 14
3: 5 25
4: 10 25
5: 14 25
---
48646: 498 458
48647: 498 477
48648: 498 486
48649: 498 487
48650: 498 493
Modified:
net2
group ALTERID ID
1: 2 4 3
2: 2 6 4
3: 4 7 1
4: 4 8 7
5: 2 9 4
---
49512: 3 460 500
49513: 3 465 500
49514: 3 478 500
49515: 3 482 500
49516: 3 497 500
I'm just trying to calculate the relative angle between the vectors in my x, y, z data frame and a reference vector. So far, I use dplyr to group things and apply my angle function to get the relative angle. However, things are quite slow, even for the dummy data I provide here.
set.seed(12345)
x <- replicate(1,c(replicate(1000,rnorm(50,0,0.01))))
y <- replicate(1,c(replicate(1000,rnorm(50,0,0.01))))
z <- replicate(1,c(replicate(1000,rnorm(50,0.9,0.01))))
ref_vector <- data.frame(ref_x=rep(0,100),ref_y=rep(0,100),ref_z=rep(1,100))
set <- rep(seq(1,1000),each=50)
data_rep <- data.frame(x,y,z,ref_vector,set)
> head(data_rep)
# x y z ref_x ref_y ref_z set
# 1 0.005855288 -0.015472796 0.9059337 0 0 1 1
# 2 0.007094660 -0.013354359 0.9040137 0 0 1 1
# 3 -0.001093033 -0.014661486 0.9047502 0 0 1 1
# 4 -0.004534972 -0.002764655 0.9070553 0 0 1 1
# 5 0.006058875 -0.008339952 0.8926551 0 0 1 1
# 6 -0.018179560 -0.008412400 0.9055541 0 0 1 1
I define the angle between two vectors with this angle function:
angle <- function(x,y){
dot.prod <- x%*%y
norm.x <- norm(x,type="2")
norm.y <- norm(y,type="2")
theta <- acos(dot.prod / (norm.x * norm.y))
as.numeric(theta)
}
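As a quick sanity check, two perpendicular unit vectors should give pi/2:
angle(c(0, 0, 1), c(0, 1, 0))
# [1] 1.570796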
Then let's apply this to our data_rep:
library(dplyr)
system.time(df_angle <- data_rep%>%
rowwise()%>%
do(data.frame(.,angle_rad=angle(unlist(.[1:3]),unlist(.[4:6]))))%>%
group_by(set)%>%
mutate(angle=angle_rad*180/pi, mean_angle=mean(angle)))
# user system elapsed
# 64.22 0.08 64.81
# Warning message:
# Grouping rowwise data frame strips rowwise nature
As you can see, the process took around 1 min, and this is not even my full data set, which has 350,000 rows and takes about 10 min to calculate the relative angles.
I wonder is there any way to speed up this process.
Thanks!
Just discover linear algebra for yourself:
m1 = as.matrix(data_rep[, 1:3])
m2 = as.matrix(data_rep[, 4:6])
system.time( {
m1 = m1 / sqrt(rowSums(m1 ^ 2))
m2 = m2 / sqrt(rowSums(m2 ^ 2))
RESULT <- acos(rowSums(m1 * m2))
})
# user system elapsed
# 0.004 0.001 0.006
all.equal(df_angle$angle_rad, RESULT)
# TRUE
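To reproduce the OP's per-set summary from the vectorized result (a short sketch; it assumes the set column of data_rep from above):
angle_deg <- RESULT * 180 / pi
mean_angle_by_set <- tapply(angle_deg, data_rep$set, mean)
head(mean_angle_by_set)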
Just make a simple mutate statement instead of your do(data.frame()) part. This improves the performance quite a bit, because you no longer have to convert each row into a data.frame:
system.time(df_angle2 <- data_rep%>%
rowwise() %>%
mutate(angle_rad=angle(x = c(x,y,z),y = c(ref_x,ref_y,ref_z))) %>%
group_by(set)%>%
mutate(angle=angle_rad*180/pi, mean_angle=mean(angle)))
## user system elapsed
## 3.72 0.00 3.71
all.equal(df_angle,df_angle2)
## TRUE
I have written the code below to obtain a bootstrap estimate of a mean. My objective is to view the numbers selected from the data set, ideally in the order they are selected, by the function boot in the boot package.
The data set only contains three numbers: 1, 10, and 100 and I am only using two bootstrap samples.
The estimated mean is 23.5 and the R code below indicates that the six numbers included one '1', four '10' and one '100'. However, there are 30 possible combinations of those numbers that would have resulted in a mean of 23.5.
Is there a way for me to determine which of those 30 possible combinations is the combination that actually appeared in the two bootstrap samples?
library(boot)
set.seed(1234)
dat <- c(1, 10, 100)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 2)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 2)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 37 -13.5 19.09188
#
mean(dat) + -13.5
# [1] 23.5
# The two samples must have contained one '1', four '10' and one '100',
# but there are 30 possibilities.
# Which of these 30 possible sequences actual occurred?
# This code shows there must have been one '1', four '10' and one '100'
# and shows the 30 possible combinations
my.combos <- expand.grid(V1 = c(1, 10, 100),
V2 = c(1, 10, 100),
V3 = c(1, 10, 100),
V4 = c(1, 10, 100),
V5 = c(1, 10, 100),
V6 = c(1, 10, 100))
my.means <- apply(my.combos, 1, function(x) {( (x[1] + x[2] + x[3])/3 + (x[4] + x[5] + x[6])/3 ) / 2 })
possible.samples <- my.combos[my.means == 23.5,]
dim(possible.samples)
n.1 <- rowSums(possible.samples == 1)
n.10 <- rowSums(possible.samples == 10)
n.100 <- rowSums(possible.samples == 100)
n.1[1]
n.10[1]
n.100[1]
length(unique(n.1)) == 1
length(unique(n.10)) == 1
length(unique(n.100)) == 1
I think you can determine the numbers sampled and the order in which they are sampled with the code below. You have to extract the function ordinary.array from the boot package and paste that function into your R code. Then specify the values for n, R and strata, where n is the number of observations in the data set and R is the number of replicate samples you want.
I do not know how general this approach is, but it worked with a couple of simple examples I tried, including the example below.
library(boot)
set.seed(1234)
dat <- c(1, 10, 100, 1000)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 3)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 3)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 277.75 -127.5 132.2405
#
#
mean(dat) + -127.5
# [1] 150.25
# boot:::ordinary.array
ordinary.array <- function (n, R, strata)
{
inds <- as.integer(names(table(strata)))
if (length(inds) == 1L) {
output <- sample.int(n, n * R, replace = TRUE)
dim(output) <- c(R, n)
}
else {
output <- matrix(as.integer(0L), R, n)
for (is in inds) {
gp <- seq_len(n)[strata == is]
output[, gp] <- if (length(gp) == 1)
rep(gp, R)
else bsample(gp, R * length(gp))
}
}
output
}
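One caveat: in its strata branch, ordinary.array calls bsample, another boot internal, so with more than one stratum you would need to paste that helper in as well. From my reading of the boot source it is simply (treat this as an assumption):
bsample <- function(x, n) x[sample.int(length(x), n, replace = TRUE)]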
# I think the function ordinary.array determines which elements
# of the data are sampled in each of the R samples
set.seed(1234)
ordinary.array(n=4,R=3,1)
# [,1] [,2] [,3] [,4]
# [1,] 1 3 1 3
# [2,] 3 4 1 3
# [3,] 3 3 3 3
#
# which equals:
((1+100+1+100) / 4 + (100+1000+1+100) / 4 + (100+100+100+100) / 4) / 3
# [1] 150.25
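A possibly simpler route, if you would rather not paste internals: boot.array() in the boot package regenerates the sampling array from a boot object, and with indices = TRUE it returns the actual index matrix, one row per replicate, in sampling order:
boot.array(av.boot, indices = TRUE)
# each row holds the positions of dat drawn in that replicate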
I need to calculate the within-run and between-run variances from some data as part of developing a new analytical chemistry method. I also need confidence intervals from this data, using the R language.
I assume I need to use ANOVA or something?
My data looks like this:
> variance
Run Rep Value
1 1 1 9.85
2 1 2 9.95
3 1 3 10.00
4 2 1 9.90
5 2 2 8.80
6 2 3 9.50
7 3 1 11.20
8 3 2 11.10
9 3 3 9.80
10 4 1 9.70
11 4 2 10.10
12 4 3 10.00
You have four groups of three observations:
> run1 = c(9.85, 9.95, 10.00)
> run2 = c(9.90, 8.80, 9.50)
> run3 = c(11.20, 11.10, 9.80)
> run4 = c(9.70, 10.10, 10.00)
> runs = c(run1, run2, run3, run4)
> runs
[1] 9.85 9.95 10.00 9.90 8.80 9.50 11.20 11.10 9.80 9.70 10.10 10.00
Make some labels:
> n = rep(3, 4)
> group = rep(1:4, n)
> group
[1] 1 1 1 2 2 2 3 3 3 4 4 4
Calculate within-run stats:
> withinRunStats = function(x) c(sum = sum(x), mean = mean(x), var = var(x), n = length(x))
> tapply(runs, group, withinRunStats)
$`1`
sum mean var n
29.800000000 9.933333333 0.005833333 3.000000000
$`2`
sum mean var n
28.20 9.40 0.31 3.00
$`3`
sum mean var n
32.10 10.70 0.61 3.00
$`4`
sum mean var n
29.80000000 9.93333333 0.04333333 3.00000000
You can do some ANOVA here:
> data = data.frame(y = runs, group = factor(group))
> data
y group
1 9.85 1
2 9.95 1
3 10.00 1
4 9.90 2
5 8.80 2
6 9.50 2
7 11.20 3
8 11.10 3
9 9.80 3
10 9.70 4
11 10.10 4
12 10.00 4
> fit = lm(runs ~ group, data)
> fit
Call:
lm(formula = runs ~ group, data = data)
Coefficients:
(Intercept) group2 group3 group4
9.933e+00 -5.333e-01 7.667e-01 -2.448e-15
> anova(fit)
Analysis of Variance Table
Response: runs
Df Sum Sq Mean Sq F value Pr(>F)
group 3 2.57583 0.85861 3.5437 0.06769 .
Residuals 8 1.93833 0.24229
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> degreesOfFreedom = anova(fit)[, "Df"]
> names(degreesOfFreedom) = c("treatment", "error")
> degreesOfFreedom
treatment error
3 8
Error or within-group variance:
> anova(fit)["Residuals", "Mean Sq"]
[1] 0.2422917
Treatment or between-group variance:
> anova(fit)["group", "Mean Sq"]
[1] 0.8586111
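To turn these mean squares into variance components, the standard one-way random-effects identities are var_within = MS_within and var_between = (MS_between - MS_within)/J, where J = 3 replicates per run (a short sketch using the anova(fit) values above):
ms.within  <- anova(fit)["Residuals", "Mean Sq"]  # 0.2422917
ms.between <- anova(fit)["group", "Mean Sq"]      # 0.8586111
J <- 3                                            # replicates per run
var.within  <- ms.within                          # within-run variance
var.between <- (ms.between - ms.within) / J       # between-run variance, ~0.2054
var.total   <- var.within + var.between           # ~0.4477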
This should give you enough confidence to do confidence intervals.
If you want to apply a function (such as var) across a factor such as Run or Rep, you can use tapply:
> with(variance, tapply(Value, Run, var))
1 2 3 4
0.005833333 0.310000000 0.610000000 0.043333333
> with(variance, tapply(Value, Rep, var))
1 2 3
0.48562500 0.88729167 0.05583333
I'm going to take a crack at this when I have more time, but meanwhile, here's the dput() for Kiar's data structure:
structure(list(Run = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4), Rep = c(1,
2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), Value = c(9.85, 9.95, 10, 9.9,
8.8, 9.5, 11.2, 11.1, 9.8, 9.7, 10.1, 10)), .Names = c("Run",
"Rep", "Value"), row.names = c(NA, -12L), class = "data.frame")
... in case you'd like to take a quick shot at it.
I've been looking at a similar problem. I've found a reference to calculating confidence intervals in Burdick and Graybill (Burdick, R. and Graybill, F. 1992, Confidence Intervals on Variance Components, CRC Press).
Using some code I've been trying, I get these values:
> kiaraov = aov(Value~Run+Error(Run),data=kiar)
> summary(kiaraov)
Error: Run
Df Sum Sq Mean Sq
Run 3 2.57583 0.85861
Error: Within
Df Sum Sq Mean Sq F value Pr(>F)
Residuals 8 1.93833 0.24229
> confint = 95
> a = (1-(confint/100))/2
> grandmean = as.vector(kiaraov$"(Intercept)"[[1]][1]) # Grand Mean (I think)
> within = summary(kiaraov)$"Error: Within"[[1]]$"Mean Sq" # S2^2Mean Square Value for Within Run
> dfRun = summary(kiaraov)$"Error: Run"[[1]]$"Df"
> dfWithin = summary(kiaraov)$"Error: Within"[[1]]$"Df"
> Run = summary(kiaraov)$"Error: Run"[[1]]$"Mean Sq" # S1^2Mean Square for between Run
> between = (Run-within)/((dfWithin/(dfRun+1))+1) # (S1^2-S2^2)/J
> total = between+within
> between # Between Run Variance
[1] 0.2054398
> within # Within Run Variance
[1] 0.2422917
> total # Total Variance
[1] 0.4477315
> betweenCV = sqrt(between)/grandmean * 100 # Between Run CV%
> withinCV = sqrt(within)/grandmean * 100 # Within Run CV%
> totalCV = sqrt(total)/grandmean * 100 # Total CV%
> #within confidence intervals
> withinLCB = within/qf(1-a,8,Inf) # Within LCB
> withinUCB = within/qf(a,8,Inf) # Within UCB
> #Between Confidence Intervals
> n1 = dfRun
> n2 = dfWithin
> G1 = 1-(1/qf(1-a,n1,Inf)) # According to Burdick and Graybill this should be a
> G2 = 1-(1/qf(1-a,n2,Inf))
> H1 = (1/qf(a,n1,Inf))-1 # and this should be 1-a, but my results don't agree
> H2 = (1/qf(a,n2,Inf))-1
> G12 = ((qf(1-a,n1,n2)-1)^2-(G1^2*qf(1-a,n1,n2)^2)-(H2^2))/qf(1-a,n1,n2) # again, should be a, not 1-a
> H12 = ((1-qf(a,n1,n2))^2-H1^2*qf(a,n1,n2)^2-G2^2)/qf(a,n1,n2) # again, should be 1-a, not a
> Vu = H1^2*Run^2+G2^2*within^2+H12*Run*within
> Vl = G1^2*Run^2+H2^2*within^2+G12*within*Run
> J = 3 # replicates per run (J is used below but was never defined above)
> betweenLCB = (Run-within-sqrt(Vl))/J # Between LCB
> betweenUCB = (Run-within+sqrt(Vu))/J # Between UCB
> #Total Confidence Intervals
> y = (Run+(J-1)*within)/J
> totalLCB = y-(sqrt(G1^2*Run^2+G2^2*(J-1)^2*within^2)/J) # Total LCB
> totalUCB = y+(sqrt(H1^2*Run^2+H2^2*(J-1)^2*within^2)/J) # Total UCB
> result = data.frame(Name = c("within", "between", "total"),
+                     CV = c(withinCV, betweenCV, totalCV),
+                     LCB = c(sqrt(withinLCB)/grandmean*100, sqrt(betweenLCB)/grandmean*100, sqrt(totalLCB)/grandmean*100),
+                     UCB = c(sqrt(withinUCB)/grandmean*100, sqrt(betweenUCB)/grandmean*100, sqrt(totalUCB)/grandmean*100))
> result
Name CV LCB UCB
1 within 4.926418 3.327584 9.43789
2 between 4.536327 NaN 19.73568
3 total 6.696855 4.846030 20.42647
Here the lower confidence interval for between run CV is less than zero, so reported as NaN.
I'd love to have a better way to do this. If I get time I might try to create a function to do this.
Paul.
--
Edit: I did eventually write a function; here it is (caveat emptor):
#' avar Function
#'
#' Calculate the within, between and total %CV of a dataset by ANOVA, and the
#' associated confidence intervals
#'
#' @param dataf The data frame to use, in long format
#' @param afactor Character string representing the column in dataf that contains the factor
#' @param aresponse Character string representing the column in dataf that contains the response value
#' @param aconfidence What confidence limits to use, default = 95%
#' @param digits Significant digits to report to, default = 3
#' @param debug Boolean, should debug messages be displayed, default = FALSE
#' @return A data frame containing the Mean, Within, Between and Total %CV and the LCB and UCB for each
#' @author Paul Hurley
#' @export
#' @examples
#' # Using the BGBottles data from Burdick and Graybill page 62
#' avar(dataf = BGBottles, afactor = "Machine", aresponse = "weight")
avar<-function(dataf, afactor, aresponse, aconfidence=95, digits=3, debug=FALSE){
# requires plyr for ddply() and colwise()
dataf<-subset(dataf,!is.na(with(dataf,get(aresponse))))
nmissing<-function(x) sum(!is.na(x))
n<-nrow(subset(dataf,is.numeric(with(dataf,get(aresponse)))))
datadesc<-ddply(dataf, afactor, colwise(nmissing,aresponse))
I<-nrow(datadesc)
if(debug){print(datadesc)}
if(min(datadesc[,2])==max(datadesc[,2])){
balance<-TRUE
J<-min(datadesc[,2])
if(debug){message(paste("Dataset is balanced, J=",J,"I is ",I,sep=""))}
} else {
balance<-FALSE
Jh<-I/(sum(1/datadesc[,2], na.rm = TRUE))
J<-Jh
m<-min(datadesc[,2])
M<-max(datadesc[,2])
if(debug){message(paste("Dataset is unbalanced, like me, I is ",I,sep=""))}
if(debug){message(paste("Jh is ",Jh, ", m is ",m, ", M is ",M, sep=""))}
}
if(debug){message(paste("Call afactor=",afactor,", aresponse=",aresponse,sep=""))}
formulatext<-paste(as.character(aresponse)," ~ 1 + Error(",as.character(afactor),")",sep="")
if(debug){message(paste("formula text is ",formulatext,sep=""))}
aovformula<-formula(formulatext)
if(debug){message(paste("Formula is ",as.character(aovformula),sep=""))}
assayaov<-aov(formula=aovformula,data=dataf)
if(debug){
print(assayaov)
print(summary(assayaov))
}
a<-1-((1-(aconfidence/100))/2)
if(debug){message(paste("confidence is ",aconfidence,", alpha is ",a,sep=""))}
grandmean<-as.vector(assayaov$"(Intercept)"[[1]][1]) # Grand Mean (I think)
if(debug){message(paste("n is",n,sep=""))}
#This line commented out, seems to choke with an aov object built from an external formula
#grandmean<-as.vector(model.tables(assayaov,type="means")[[1]]$`Grand mean`) # Grand Mean (I think)
within<-summary(assayaov)[[2]][[1]]$"Mean Sq" # d2e, S2^2 Mean Square Value for Within Machine = 0.1819
dfRun<-summary(assayaov)[[1]][[1]]$"Df" # DF for within = 3
dfWithin<-summary(assayaov)[[2]][[1]]$"Df" # DF for within = 8
Run<-summary(assayaov)[[1]][[1]]$"Mean Sq" # S1^2Mean Square for Machine
if(debug){message(paste("mean square for Run ?",Run,sep=""))}
#Was between<-(Run-within)/((dfWithin/(dfRun+1))+1) but my comment suggests this should be just J, so I'll use J !
between<-(Run-within)/J # d2a (S1^2-S2^2)/J
if(debug){message(paste("S1^2 mean square machine is ",Run,", S2^2 mean square within is ",within))}
total<-between+within
between # Between Run Variance
within # Within Run Variance
total # Total Variance
if(debug){message(paste("between is ",between,", within is ",within,", Total is ",total,sep=""))}
betweenCV<-sqrt(between)/grandmean * 100 # Between Run CV%
withinCV<-sqrt(within)/grandmean * 100 # Within Run CV%
totalCV<-sqrt(total)/grandmean * 100 # Total CV%
n1<-dfRun
n2<-dfWithin
if(debug){message(paste("n1 is ",n1,", n2 is ",n2,sep=""))}
#within confidence intervals
if(balance){
withinLCB<-within/qf(a,n2,Inf) # Within LCB
withinUCB<-within/qf(1-a,n2,Inf) # Within UCB
} else {
withinLCB<-within/qf(a,n2,Inf) # Within LCB
withinUCB<-within/qf(1-a,n2,Inf) # Within UCB
}
#Mean Confidence Intervals
if(debug){message(paste(grandmean,"+/-(sqrt(",Run,"/",n,")*qt(",a,",df=",I-1,"))",sep=""))}
meanLCB<-grandmean+(sqrt(Run/n)*qt(1-a,df=I-1)) # wrong
meanUCB<-grandmean-(sqrt(Run/n)*qt(1-a,df=I-1)) # wrong
if(debug){message(paste("Grandmean is ",grandmean,", meanLCB = ",meanLCB,", meanUCB = ",meanUCB,aresponse,sep=""))}
if(debug){print(summary(assayaov))}
#Between Confidence Intervals
G1<-1-(1/qf(a,n1,Inf))
G2<-1-(1/qf(a,n2,Inf))
H1<-(1/qf(1-a,n1,Inf))-1
H2<-(1/qf(1-a,n2,Inf))-1
G12<-((qf(a,n1,n2)-1)^2-(G1^2*qf(a,n1,n2)^2)-(H2^2))/qf(a,n1,n2)
H12<-((1-qf(1-a,n1,n2))^2-H1^2*qf(1-a,n1,n2)^2-G2^2)/qf(1-a,n1,n2)
if(debug){message(paste("G1 is ",G1,", G2 is ",G2,sep=""))
message(paste("H1 is ",H1,", H2 is ",H2,sep=""))
message(paste("G12 is ",G12,", H12 is ",H12,sep=""))
}
if(balance){
Vu<-H1^2*Run^2+G2^2*within^2+H12*Run*within
Vl<-G1^2*Run^2+H2^2*within^2+G12*within*Run
betweenLCB<-(Run-within-sqrt(Vl))/J # Between LCB
betweenUCB<-(Run-within+sqrt(Vu))/J # Between UCB
} else {
#Burdick and Graybill seem to suggest calculating anova of mean values to find n1S12u/Jh
meandataf<-ddply(.data=dataf,.variable=afactor, .fun=function(df){mean(with(df, get(aresponse)), na.rm=TRUE)})
meandataaov<-aov(formula(paste("V1~",afactor,sep="")), data=meandataf)
sumsquare<-summary(meandataaov)[[1]]$`Sum Sq`
#so maybe S12u is just that bit ?
Runu<-(sumsquare*Jh)/n1
if(debug){message(paste("n1S12u/Jh is ",sumsquare,", so S12u is ",Runu,sep=""))}
Vu<-H1^2*Runu^2+G2^2*within^2+H12*Runu*within
Vl<-G1^2*Runu^2+H2^2*within^2+G12*within*Runu
betweenLCB<-(Runu-within-sqrt(Vl))/Jh # Between LCB
betweenUCB<-(Runu-within+sqrt(Vu))/Jh # Between UCB
if(debug){message(paste("betweenLCB is ",betweenLCB,", between UCB is ",betweenUCB,sep=""))}
}
#Total Confidence Intervals
if(balance){
y<-(Run+(J-1)*within)/J
if(debug){message(paste("y is ",y,sep=""))}
totalLCB<-y-(sqrt(G1^2*Run^2+G2^2*(J-1)^2*within^2)/J) # Total LCB
totalUCB<-y+(sqrt(H1^2*Run^2+H2^2*(J-1)^2*within^2)/J) # Total UCB
} else {
y<-(Runu+(Jh-1)*within)/Jh
if(debug){message(paste("y is ",y,sep=""))}
totalLCB<-y-(sqrt(G1^2*Runu^2+G2^2*(Jh-1)^2*within^2)/Jh) # Total LCB
totalUCB<-y+(sqrt(H1^2*Runu^2+H2^2*(Jh-1)^2*within^2)/Jh) # Total UCB
}
if(debug){message(paste("totalLCB is ",totalLCB,", total UCB is ",totalUCB,sep=""))}
# result<-data.frame(Name=c("within", "between", "total"),CV=c(withinCV,betweenCV,totalCV),
# LCB=c(sqrt(withinLCB)/grandmean*100,sqrt(betweenLCB)/grandmean*100,sqrt(totalLCB)/grandmean*100),
# UCB=c(sqrt(withinUCB)/grandmean*100,sqrt(betweenUCB)/grandmean*100,sqrt(totalUCB)/grandmean*100))
result<-data.frame(Mean=grandmean,MeanLCB=meanLCB, MeanUCB=meanUCB, Within=withinCV,WithinLCB=sqrt(withinLCB)/grandmean*100, WithinUCB=sqrt(withinUCB)/grandmean*100,
Between=betweenCV, BetweenLCB=sqrt(betweenLCB)/grandmean*100, BetweenUCB=sqrt(betweenUCB)/grandmean*100,
Total=totalCV, TotalLCB=sqrt(totalLCB)/grandmean*100, TotalUCB=sqrt(totalUCB)/grandmean*100)
if(!digits=="NA"){
result$Mean<-signif(result$Mean,digits=digits)
result$MeanLCB<-signif(result$MeanLCB,digits=digits)
result$MeanUCB<-signif(result$MeanUCB,digits=digits)
result$Within<-signif(result$Within,digits=digits)
result$WithinLCB<-signif(result$WithinLCB,digits=digits)
result$WithinUCB<-signif(result$WithinUCB,digits=digits)
result$Between<-signif(result$Between,digits=digits)
result$BetweenLCB<-signif(result$BetweenLCB,digits=digits)
result$BetweenUCB<-signif(result$BetweenUCB,digits=digits)
result$Total<-signif(result$Total,digits=digits)
result$TotalLCB<-signif(result$TotalLCB,digits=digits)
result$TotalUCB<-signif(result$TotalUCB,digits=digits)
}
return(result)
}
assayvar<-function(adata, aresponse, afactor, anominal, aconfidence=95, digits=3, debug=FALSE){
result<-ddply(adata,anominal,function(df){
resul<-avar(dataf=df,afactor=afactor,aresponse=aresponse,aconfidence=aconfidence, digits=digits, debug=debug)
resul$n<-nrow(subset(df, !is.na(with(df, get(aresponse)))))
return(resul)
})
return(result)
}
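For example, applied to the kiar data from the dput() above (my own usage sketch; it assumes plyr is loaded and treats Run as a factor):
library(plyr)
kiar <- data.frame(Run = factor(rep(1:4, each = 3)),
                   Rep = rep(1:3, times = 4),
                   Value = c(9.85, 9.95, 10, 9.9, 8.8, 9.5,
                             11.2, 11.1, 9.8, 9.7, 10.1, 10))
avar(dataf = kiar, afactor = "Run", aresponse = "Value")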