I would like to aggregate the following data frame, which contains only binary variables corresponding to "yes/no" answers, in order to reduce the number of constructs. The first 10 rows are shown below; the original data frame contains 169 rows.
outcome <-
structure(list(Q9_Automazione.processi = c(0, 0, 0, 0, 0, 0,
1, 1, 1, 0), Q9_Velocita.Prod = c(1, 0, 0, 1, 0, 0, 1, 1, 1,
0), Q9_Flessibilita.Prod = c(0, 0, 0, 1, 0, 0, 1, 1, 0, 1), Q9_Controllo.processi = c(0,
0, 0, 1, 0, 0, 1, 1, 0, 0), Q9_Effic.Magazzino = c(0, 0, 0, 1,
0, 0, 0, 0, 0, 0), Q9_Riduz.Costi = c(0, 1, 0, 0, 0, 0, 0, 0,
0, 1), Q9_Miglior.Sicurezza = c(0, 0, 0, 0, 0, 0, 1, 0, 1, 1),
Q9_Connett.Interna = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0), Q9_Connett.Esterna = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Virtualizzazione = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), Q9_Innov.Prod = c(0, 0, 0, 0, 0,
1, 0, 0, 0, 1), Q9_Person.Prod = c(0, 1, 0, 1, 0, 1, 0, 0,
0, 1), Q9_Nuovi.Mercati = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Q9_Nuovi.BM = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Perform.Energ = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Perform.SostAmb = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, 10L), class = "data.frame")
I have tried performing factor analysis with the tetrachoric method, both directly on the data frame and on the correlation matrix obtained from tetrachoric() (the value returned by the KMO function turns out to be inadequate); using tetrachoric correlations in the fa function (cor = "tet") I get a negative Tucker-Lewis Index.
I have been reading up on this, but I cannot find a methodology that is adequate and whose correctness I am sure of.
So basically, what I would like to achieve is to aggregate similar constructs: e.g., assess whether column 5 has value 1 (i.e., "yes") almost always when column 11 has value 1, and then aggregate the two.
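For illustration, a minimal sketch of the kind of co-occurrence check I mean (the column pairing here is hypothetical):
# proportion of rows where column a is 1 among the rows where column b is 1
co_occurrence <- function(df, a, b) {
  sum(df[[a]] == 1 & df[[b]] == 1) / sum(df[[b]] == 1)
}
co_occurrence(outcome, "Q9_Effic.Magazzino", "Q9_Innov.Prod")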
Here is the code I tried:
library(psych)
library(corrplot)
library(ggcorrplot)
library(magrittr)  # for the %>% pipe

tet <- tetrachoric(outcome)
corr_matrix <- tet$rho  # tetrachoric correlation matrix
corrplot(corr_matrix, "ellipse", tl.cex = 0.75, tl.col = "black")
par(mfrow = c(1,2))
corr_matrix %>%
  ggcorrplot(show.diag = FALSE,
             type = "lower",
             lab = TRUE,
             lab_size = 2)
KMO(corr_matrix)
cortest.bartlett(corr_matrix, n = 169)  # n = number of observations
fa.parallel(corr_matrix, n.obs = 169, fm = "ml")
factor <- fa(corr_matrix, nfactors = 3, rotate = "oblimin", fm = "ml", n.obs = 169)
print(factor, cut = 0.3, digits = 3)
# -------- Pearson --------
cor(outcome, method = "pearson", use = "pairwise.complete.obs") %>%
  ggcorrplot(show.diag = FALSE,
             type = "lower",
             lab = TRUE,
             lab_size = 2)
KMO(outcome)
cortest.bartlett(outcome)
fa.parallel(outcome)
factor1 <- fa(outcome, nfactors = 3, rotate = "oblimin", cor = "tet", fm = "ml")
print(factor1, cut = 0.3, digits = 3)
I am trying to create two calibration plots (for two different models), but it does not seem to work. My data (a subset):
lo <- structure(list(X1 = c(0.0205881308065423, 0.030107400545467,
0.0224902821967529, 0.067082269138019, 0.0128933436225658, 0.010528298470225,
0.0448801718109416, 0.0147825838164296, 0.00127338570492985,
0.0187288824619526, 0.0166935353708351, 0.000827013756910522,
0.000268624600100464, 0.00022554771787564, 0.000239290116892055,
0.00046320712675918, 0.0127930773405932, 0.123559021969098, 0.00196413334593659,
0.00267343502355055, 0.0119560304531064, 0.0151288958940289,
0.0450932732709064, 0.284128554073485, 0.0435626434150131, 0.00919667587971063,
0.241220354905637, 0.0188148171033879, 0.0116570772346002, 0.0159496690575734,
0.00518918742249186, 0.0319701660388646, 0.100234998067917, 0.0119794232466471,
0.00123658099677804, 0.00178774726967923, 0.00215162606048125,
0.028398874195245, 0.02727277199735, 0.0536089031118459, 0.00567355556708304,
0.00182798929912398, 0.0221311523302337, 0.0317268552025847,
0.241167765332718, 0.201815176728704, 0.00750328900855035, 0.00346824263327472,
0.00859464311717095, 0.00488864781312837), X2 = c(0.0123677690429329,
0.0275541038901166, 0.0166991553536275, 0.0260168210079643, 0.00693728726147325,
0.00464096927279578, 0.0124618831179862, 0.0184586073538044,
0.00569866130459529, 0.00293809224808261, 0.00119326039429657,
0.00316749683866091, 0.00419982136508501, 0.00140900547876921,
0.00110999833888004, 0.00276678547598108, 0.0162868658191231,
0.0649037872628959, 0.00123222675644274, 0.00171687152904065,
0.0152583510689248, 0.0258721612337077, 0.0392641646035583, 0.361960538193137,
0.0357326269142103, 0.0107540980920499, 0.22279499286353, 0.0301267823507665,
0.0144535141006957, 0.0124677305707919, 0.00520476987173168,
0.0320510777198151, 0.0770024283430764, 0.00793648556749427,
0.000401508352378066, 0.000498605187176815, 0.000982487695277534,
0.0399009464278308, 0.0698023981838097, 0.0506533144316593, 0.00462517180839983,
0.00275731807224233, 0.0374332227392187, 0.0582978817333271,
0.121896031487931, 0.236774303454737, 0.0106755443754257, 0.00398238213200619,
0.0113833654830731, 0.00708983623072867), X3 = c(0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), row.names = c(NA, 50L), class = "data.frame")
and the code I use:
library(CalibrationCurves)  # provides val.prob.ci.2

par(mfrow = c(1,2),
oma=c(5,5,0,0) + 0.0,
mar=c(0,0,1,1) + 0.0)
val.prob.ci.2(p = lo$X1, y = lo$X3, smooth="loess", legendloc=F, lwd.smooth=2, lwd.ideal=2, lty.ideal=2, dostats = F, axes = F)
axis(side=2,at=c(0,0.2,0.4,0.6,0.8,1))
box(which="plot")
text(x=0, y=1, adj=0, "LG", cex=1.3)
val.prob.ci.2(p = lo$X2, y = lo$X3, smooth="loess", legendloc=F, lwd.smooth=2, lwd.ideal=2, lty.ideal=2, dostats = F, axes = F)
box(which="plot")
text(x=0, y=1, adj=0, "rf", cex=1.3)
title(xlab = list("Estimated probability",cex=1.5),
ylab = list("Observed proportion",cex=1.5),
outer = TRUE)#, line=3)
The output shows only the first panel. The problem lies with the second plot: why does it not appear?
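To rule out the two-panel layout itself, here is a minimal check with plain base plots under the same par() settings (a sketch with hypothetical scatter plots, not the calibration curves):
par(mfrow = c(1, 2), oma = c(5, 5, 0, 0), mar = c(0, 0, 1, 1))
plot(lo$X1, lo$X3, axes = FALSE); box(); text(0, 1, "LG", adj = 0, cex = 1.3)
plot(lo$X2, lo$X3, axes = FALSE); box(); text(0, 1, "rf", adj = 0, cex = 1.3)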
I have a square matrix that represents directed interactions, with values representing the magnitude of the "flow" from row i to column j.
mat <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.59734154600838,
0.962276996464401, 0.996554553573577, 0.988150008522967, 0.581536975261071,
0.280105566896129, 0.0520717823071291, 0.0443864046117343, 0.0162858335588474,
0, 0, 0, 0, 0, 0, 0, 0.111900863185923, 0.289483837277475, 0.338036619790556,
0.973201117894343, 0.876145758734938, 0.280105566896129, 0.245172586054694,
0.101440228047504, 0.0136022221272776, 0, 0, 0, 0, 0, 0, 0.073088274682518,
0.21588462733217, 0.258134862678946, 0.93528472971792, 0.921844796228768,
0.318790697187933, 0.280105566896129, 0.117928032625428, 0.016073037487081,
0, 0, 0, 0, 0, 0, 0, 0.0119602547215087, 0.0174757225504163,
0.443466799224191, 0.941024455005652, 0.632609306727839, 0.57418820480725,
0.280105566896129, 0.043827579210664, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.0547471528159807, 0.884304818335752, 0.937495721370637,
0.925118019265575, 0.280105566896129, 0.055967839940851, 0.0122649398400715,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0679263578760456, 0.104884821422108,
0.569814755335506, 0.853130344409379, 0.280105566896129, 0.0728699300735904,
0.0339371561178606, 0.012188886551821, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.0219303360220489, 0.843994038605239, 0.759918325154657,
0.280105566896129, 0.143508732965731, 0.0556400089034765, 0.0296286033644999,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.421151438381493, 0.977746695038157,
0.499880491267235, 0.280105566896129, 0.116686808742586, 0.0639605586005988,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0495967410949283, 0.841406989124245,
0.85505217514437, 0.578265483357174, 0.280105566896129, 0.163154497800251,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.499941945587477, 0.993657104473566,
0.807475685951474, 0.45318772928331, 0.280105566896129), .Dim = c(15L,
15L))
I am interested in calculating the weighted linkage similarity (both in and out flows) of all vertices in the network, so taking magnitude into account.
Using igraph, I can calculate the Jaccard similarity, but without considering weights:
library(igraph)
bin <- mat
bin[bin > 0] <- 1
similarity(graph_from_adjacency_matrix(bin),
mode = "all",
method = "jaccard")
# this gives the same result as the unweighted call above (the weights are ignored)
similarity(graph_from_adjacency_matrix(mat, weighted = TRUE),
           mode = "all",
           method = "jaccard")
Using the code from this blogpost, I was able to calculate the Jaccard similarity of outflows and inflows and combine them.
# outflow similarity
sim.jac.out <- matrix(0, nrow = nrow(mat), ncol = nrow(mat))
pairs <- t(combn(1:nrow(mat), 2))
for (i in 1:nrow(pairs)) {
  num <- sum(sapply(1:ncol(mat), function(x) min(mat[pairs[i,1], x], mat[pairs[i,2], x])))
  den <- sum(sapply(1:ncol(mat), function(x) max(mat[pairs[i,1], x], mat[pairs[i,2], x])))
  sim.jac.out[pairs[i,1], pairs[i,2]] <- num/den
  sim.jac.out[pairs[i,2], pairs[i,1]] <- num/den
}
sim.jac.out[which(is.na(sim.jac.out))] <- 0
diag(sim.jac.out) <- 1
# inflow similarity (the same computation on the transposed matrix)
tmat <- t(mat)
sim.jac.in <- matrix(0, nrow = nrow(tmat), ncol = nrow(tmat))
pairs <- t(combn(1:nrow(tmat), 2))
for (i in 1:nrow(pairs)) {
  num <- sum(sapply(1:ncol(tmat), function(x) min(tmat[pairs[i,1], x], tmat[pairs[i,2], x])))
  den <- sum(sapply(1:ncol(tmat), function(x) max(tmat[pairs[i,1], x], tmat[pairs[i,2], x])))
  sim.jac.in[pairs[i,1], pairs[i,2]] <- num/den
  sim.jac.in[pairs[i,2], pairs[i,1]] <- num/den
}
sim.jac.in[which(is.na(sim.jac.in))] <- 0
diag(sim.jac.in) <- 1
# total similarity
sim.jac.all <- (sim.jac.in + sim.jac.out)/2
So the general question is: does this make sense?
More specifically, I would like to know whether there is a way to incorporate link weights in the calculation of similarity with igraph.
In my real dataset I need to do this several times iteratively (swapping individuals), for a large number of networks, so my method would take forever; I believe igraph uses C under the hood, which should be much faster.
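For reference, here is a tighter version of my pairwise loop, using pmin()/pmax() on whole rows instead of sapply() over columns; a sketch that should be equivalent to the blog-post code, though still an R-level loop:
# weighted (Ruzicka-style) Jaccard similarity between all row pairs of m
weighted_jaccard <- function(m) {
  n <- nrow(m)
  out <- matrix(1, n, n)  # diagonal stays 1
  for (i in seq_len(n - 1)) {
    for (j in (i + 1):n) {
      num <- sum(pmin(m[i, ], m[j, ]))
      den <- sum(pmax(m[i, ], m[j, ]))
      out[i, j] <- out[j, i] <- if (den > 0) num / den else 0
    }
  }
  out
}

sim.out <- weighted_jaccard(mat)     # outflow similarity
sim.in  <- weighted_jaccard(t(mat))  # inflow similarity
sim.all <- (sim.out + sim.in) / 2    # combined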
Apologies if this is a repeat question. Many have posted looking for a way to do post-hoc analyses on the conditional model (fixed factors) in glmmTMB. I want to do planned contrasts between certain groups, not test every pairwise comparison (e.g., Tukey).
The approach below worked well with nlme::lme for an LMM; however, with glmmTMB it returns this error:
Error in modelparm.default(model, ...) :
dimensions of coefficients and covariance matrix don't match
Is there a way to do planned contrasts on a glmmTMB?
library(dplyr)
library(glmmTMB)
library(multcomp)

# filtdens is a data frame; TRT, DATE, BURN, and VEG are factors
filtdens <- merged %>% filter(!BLOCK %in% c("JB2","JB4","JB5") & MEAS == "DENS" &
                                group == "TOT" & BURN == "N" & VEG == "C")
filtdens$TD <- interaction(filtdens$TRT, filtdens$DATE)
mod2 <- glmmTMB(count ~ TD + (1|BLOCK),
                data = filtdens,
                zi = ~1,
                family = nbinom1(link = "log"))
k1 <- matrix(c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1), byrow = T, ncol = 12)
summary(glht(mod2, linfct=k1),test=adjusted("bonferroni"))
A reproducible example would be helpful, but: this vignette in the development version offers code that ought to enable multcomp::glht, i.e.:
glht_glmmTMB <- function(model, ..., component = "cond") {
  glht(model, ...,
       coef. = function(x) fixef(x)[[component]],
       vcov. = function(x) vcov(x)[[component]],
       df = NULL)
}

modelparm.glmmTMB <- function(model,
                              coef. = function(x) fixef(x)[[component]],
                              vcov. = function(x) vcov(x)[[component]],
                              df = NULL, component = "cond", ...) {
  multcomp:::modelparm.default(model, coef. = coef., vcov. = vcov.,
                               df = df, ...)
}
Test (this example uses Tukey contrasts, but I don't see why it shouldn't work more generally ...):
library(glmmTMB)
library(multcomp)
data("cbpp", package = "lme4")
cbpp_b1 <- glmmTMB(incidence/size ~ period + (1|herd),
                   weights = size, family = binomial,
                   data = cbpp)
g1 <- glht(cbpp_b1, linfct = mcp(period = "Tukey"))
summary(g1)
This works with the current CRAN version, but the current development version of glmmTMB offers more options (e.g. emmeans(); see the above-linked vignette). You'll need to install it via devtools::install_github("glmmTMB/glmmTMB/glmmTMB") (and you'll need compilation tools installed as well).
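With emmeans support available, planned (non-Tukey) contrasts can then be specified directly; here is a sketch using the cbpp_b1 model above (the contrast weights are illustrative; period has four levels in cbpp):
library(emmeans)
emm <- emmeans(cbpp_b1, ~ period, component = "cond")  # conditional (fixed-effects) component
# one planned contrast: period 2 vs period 1, Bonferroni-adjusted
contrast(emm, method = list(p2_vs_p1 = c(-1, 1, 0, 0)), adjust = "bonferroni")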
I would like to calculate the minimum number of consecutive elements in a vector that, when added (consecutively, in the order they appear), reach at least a given value.
For example, take the following vector:
ev<-c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2.7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.27, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 370.33, 1375.4,
1394.03, 1423.8, 1360, 1269.77, 1378.8, 1350.37, 1425.97, 1423.6,
1363.4, 1369.87, 1365.5, 1294.97, 1362.27, 1117.67, 1026.97,
1077.4, 1356.83, 565.23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 356.83,
973.5, 0, 240.43, 1232.07, 1440, 1329.67, 1096.87, 1331.37, 1305.03,
1328.03, 1246.03, 1182.3, 1054.53, 723.03, 1171.53, 1263.17,
1200.37, 1054.8, 971.4, 936.4, 968.57, 897.93, 1099.87, 876.43,
1095.47, 1132, 774.4, 1075.13, 982.57, 947.33, 1096.97, 929.83,
1246.9, 1398.2, 1063.83, 1223.73, 1174.37, 1248.5, 1171.63, 1280.57,
1183.33, 1016.23, 1082.1, 795.37, 900.83, 1159.2, 992.5, 967.3,
1440, 804.13, 418.17, 559.57, 563.87, 562.97, 1113.1, 954.87,
883.8, 1207.1, 1046.83, 995.77, 803.93, 1036.63, 946.9, 887.33,
727.97, 733.93, 979.2, 1176.8, 1241.3, 1435.6)
What is the minimum number of elements that, when added consecutively (in their order within the vector), sum up to, let's say, 20000?
To be more clear, I need the following:
Start with ev[1] and add consecutively up to 20000. Record the number of elements you had to add in order to reach 20000 as r[1]. Then start with ev[2], add until 20000, and record the number of elements as r[2]. Do this for the entire length of ev, then return min(r).
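In other words, a naive sketch of exactly this procedure (O(n^2), just to pin down what I mean; 20000 is the example threshold):
r <- sapply(seq_along(ev), function(i) {
  k <- which(cumsum(ev[i:length(ev)]) >= 20000)[1]  # elements needed starting at i
  if (is.na(k)) Inf else k                          # Inf if the tail never reaches it
})
min(r)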
For example, with
j <- c(1, 2, 3, 5, 7, 9, 2)
the minimum number of elements that, when added consecutively, give, let's say, > 20 should be 3 (5 + 7 + 9).
Thanks a lot.
Well, I'll give it a shot: this one will find the length of the minimum sequence of numbers that add up to or above max. It makes no claims to be fast, but it has O(2n) time complexity :-) I made it return both the start index and the length.
f <- function(x, max = 10) {
  s <- 0        # running sum of the current window x[j..i]
  len <- Inf    # best (shortest) window length found so far
  start <- 1    # start index of the best window
  j <- 1        # left edge of the current window
  for (i in seq_along(x)) {
    s <- s + x[i]
    # shrink the window from the left while it still reaches max
    while (s >= max) {
      if (i - j + 1 < len) {
        len <- i - j + 1
        start <- j
      }
      s <- s - x[j]
      j <- j + 1
    }
  }
  list(start = start, length = len)
  # uncomment the line below if you don't need the start index...
  #len
}
r <- f(ev, 20000) # list(start=245, length=15)
sum(ev[seq(r$start, len=r$length)]) # 20275.42
# Test speed:
x <- sin(1:1e6)
system.time( r <- f(x, 1.9) ) # 1.54 secs
# Compile the function makes it 9x faster...
g <- compiler::cmpfun(f)
system.time( r <- g(x, 1.9) ) # 0.17 secs
library(zoo)  # needed for rollapply

N <- 20000  # the desired sum we want to achieve
j <- 0
for (i in 1:length(ev)) {
  k <- rollapply(ev, i, sum)  # all consecutive sums of length i
  j[i] <- max(k)
  if (j[i] >= N) {
    break
  }
}
i     # how many consecutive elements you need to sum (15)
j[i]  # the corresponding sum (20275.42)
Currently this doesn't tell you where the specific subset occurs in the vector, but another use of rollapply could get you that information.
There are other ways to do it, but if you have a really long vector this will break out of the loop so you don't calculate more than you need. The basic idea is to use rollapply to create a vector of the consecutive sums of length k and then find the maximum of that. If this is less than what we desire, do the same thing for sums of length k+1, and repeat until we find a sum that exceeds the desired threshold.
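To see the idea on the small example from the question (threshold > 20), a quick illustration (using v for the example vector, to avoid clobbering the j above):
library(zoo)
v <- c(1, 2, 3, 5, 7, 9, 2)
rollapply(v, 2, sum)  # 3 5 8 12 16 11 -> max is 16, still below 20
rollapply(v, 3, sum)  # 6 10 15 21 18  -> max is 21 > 20, so 3 elements suffice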
Edit:
This appears to be about 100x faster. I haven't compared it to Tommy's answer (which is probably faster than this), but it provides a significant speedup compared to my original method.
Edit 2: Moving the [-n] and removing the suppressWarnings() call speeds this up quite a bit.
myfun <- function(ev, N) {
  i <- 1
  n <- length(ev)
  j <- ev
  repeat {
    # update j to hold all consecutive sums of length i + 1
    j <- j[-n] + ev[-c(1:i)]
    i <- i + 1
    n <- n - 1
    if (max(j) >= N | i > length(ev)) {
      break
    }
  }
  return(i)
}
myfun(ev, 20000)
# And stealing the idea from Tommy gives a nice speedup as well
myfuncomp <- compiler::cmpfun(myfun)
myfuncomp(ev, 20000)
myfunc3 <- compiler::cmpfun(myfun, options = list(optimize = 3))
myfunc3(ev, 20000)
library(rbenchmark) # For testing
# If you have Tommy's functions loaded as f and g you can compare
benchmark(f(ev, 20000), g(ev, 20000), myfun(ev, 20000), myfuncomp(ev, 20000), myfunc3(ev, 20000))
Do you mean something like this?
> sum(ifelse(cumsum(ev) <= 200000, 1, 0))
[1] 364
I think this may be a Traveling Salesman Problem in disguise unless you put in some more constraints. You cannot necessarily start at the maximum of ev and go out in either direction, since it may be a local, non-dense maximum.
x <- 1:length(ev)
plot(x, ev)
lxy <- loess(ev ~ x)
lines(x, predict(lxy))
title(main = "loess() fit of ev")
But in the region of the most dense values, the series is fairly flat.
y <- c(356.83,
973.5, 0, 240.43, 1232.07, 1440, 1329.67, 1096.87, 1331.37, 1305.03,
1328.03, 1246.03, 1182.3, 1054.53, 723.03, 1171.53, 1263.17,
1200.37, 1054.8, 971.4, 936.4, 968.57, 897.93, 1099.87, 876.43,
1095.47, 1132, 774.4, 1075.13, 982.57, 947.33, 1096.97, 929.83,
1246.9, 1398.2, 1063.83, 1223.73, 1174.37, 1248.5, 1171.63, 1280.57,
1183.33, 1016.23, 1082.1, 795.37, 900.83, 1159.2, 992.5, 967.3,
1440, 804.13, 418.17, 559.57, 563.87, 562.97, 1113.1, 954.87,
883.8, 1207.1, 1046.83, 995.77, 803.93, 1036.63, 946.9, 887.33,
727.97, 733.93, 979.2, 1176.8, 1241.3, 1435.6)
x <- 1:length(y)
lxyhi <- loess(y ~ x)
plot(x, y)
lines(x, predict(lxyhi))