I'm applying the example here:
https://quantdev.ssri.psu.edu/sites/qdev/files/09_EnsembleMethods_2017_1127.html
to my data, to build a model for classification using the caret package.
I got to the point:
# Resampling scheme: 10-fold cross-validation repeated 3 times.
cvcontrol <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                          allowParallel = TRUE)
# Random forest classifier of `variate` on every other column of train.n.inp.
train.rf <- train(as.factor(variate) ~ .,
                  data = train.n.inp,
                  method = "rf",
                  trControl = cvcontrol,
                  importance = TRUE)
# No newdata is supplied, so these are predictions for the training rows
# themselves (resubstitution), not out-of-sample predictions.
rf.classTrain <- predict(train.rf, type = "raw")
# Confusion matrix of predicted vs. observed classes. confusionMatrix() takes
# the predictions as `data` and the true classes as `reference`; naming both
# arguments keeps the "Prediction"/"Reference" labels in the output correct.
cM <- confusionMatrix(data = rf.classTrain, reference = train.n.inp$variate)
I don't understand the need to use the predict function to calculate the confusion matrix, or, in other words, what is the difference between cM and train.rf$finalModel:
train.rf$finalModel
OOB estimate of error rate: 43.08%
Confusion matrix:
MV UV class.error
MV 25 12 0.3243243
UV 16 12 0.5714286
> cM
Confusion Matrix and Statistics
Reference
Prediction MV UV
MV 37 0
UV 0 28
Accuracy : 1
I am confused by the (large) difference between the two confusion matrices and unsure which one reflects the accuracy of the model. Any help appreciated.
the data:
dput(train.n.inp)
structure(list(variate = structure(c(1L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("MV",
"UV"), class = "factor"), AMB = c(0.148918043959789, 0.137429106929874,
0.13522219247215, 0.152139165429334, 0.193551266136034, 0.1418753904697,
0.132098434875739, 0.256245486778797, 0.136593400352133, 0.0183612037420183,
0.0235701709547339, 0.030539801539972, 0.0532418112925866, 0.0506048730618504,
0.0443005622763673, 0.172991261592386, 0.135717125493919, 0.139092406429261,
0.1225892299329, 0.13579014839877, 0.183709401293317, 0.122207888096455,
0.00542803592726925, 0.0192455922563268, 0.0731446096925737,
0.0150264910871489, 0.0487793004405717, 0.0433918327937752, 0.0122597343588996,
0.0211847560629296, 0.114451232870044, 0.113712890165437, 0.00788647372392488,
-0.03807738805183, 0.00735097242168299, -0.00173226557619129,
0.000279921135262793, 0.0487306185040041, 0.00901021509302318,
0.164378615647997, 0.081505732298031, 0.0337690366656119, 0.0520247628784008,
0.0318461001711981, 0.0467265454486446, 0.0503046677863513, 0.026150313592808,
0.102418680881792, 0.145640126897581, 0.158703113209843, 0.166192017785134,
0.145234444092853, 0.189096868940113, 0.142573164893833, 0.157794383727251,
0.312043099741174, 0.136009217113324, 0.115213916542934, 0.119757563955894,
0.120065882887488, 0.141891617781889, 0.177956819122265, 0.13731551574455,
0.328513821613157, 0.110426859447136), MB = c(-0.73416, -0.67752,
-0.66664, -0.75004, -0.9542, -0.69944, -0.65124, -1.26328, -0.6734,
-0.09052, -0.1162, -0.15056, -0.26248, -0.24948, -0.2184, -0.85284,
-0.66908, -0.68572, -0.60436, -0.66944, -0.90568, -0.60248, -0.02676,
-0.09488, -0.3606, -0.07408, -0.24048, -0.21392, -0.06044, -0.10444,
-0.56424, -0.5606, -0.0388800000000001, 0.18772, -0.0362400000000001,
0.00854000000000001, -0.00138, -0.24024, -0.04442, -0.81038,
-0.40182, -0.16648, -0.25648, -0.157, -0.23036, -0.248, -0.12892,
-0.50492, -0.718, -0.7824, -0.81932, -0.716, -0.93224, -0.70288,
-0.77792, -1.53836, -0.67052, -0.568, -0.5904, -0.59192, -0.69952,
-0.87732, -0.67696, -1.61956, -0.5444), MGE = c(1.58768, 1.6152,
1.53288, 1.52972, 1.12908, 1.50552, 1.48988, 1.67552, 1.55052,
1.23556, 1.27284, 1.21336, 0.84592, 1.30172, 1.14048, 1.26828,
1.20884, 1.21764, 1.22876, 1.22168, 1.27944, 1.22528, 1.26932,
1.25408, 1.183, 1.38032, 1.33416, 0.95584, 1.31188, 1.39796,
1.33848, 1.4458, 1.18416, 1.23868, 1.22968, 1.17838, 1.17278,
1.13368, 1.11374, 1.31642, 1.14034, 1.21984, 1.17128, 1.16364,
1.15036, 1.12984, 1.22484, 1.17244, 1.2768, 1.55744, 1.66964,
1.54848, 1.17416, 1.56424, 1.48928, 1.9326, 1.54588, 1.228, 1.29096,
1.39296, 1.38432, 1.275, 1.32704, 1.9442, 1.35128)), row.names = c(NA,
-65L), class = "data.frame")
Related
I often have groups of people who differ in their nationality and their status. They have to work in groups, and I would like to use block random assignment to create groups of a maximum of 5 individuals. Each group should have at least one person who is "foreign" and one who is "female". I have found the library randomizr which is supposedly able to do block random assignments, but my code does not work as intended.
An example dataset would be:
structure(list(Student = c("Susan", "Ciara", "Carl",
"Paula", "Emil", "Tammy", "Logan", "Anna", "Victor",
"Felix", "Federica", "Jesus", "Jens", "Samira", "Berit", "Yi",
"Lea", "Gordon", "Boris", "Silvester", "Celine", "Thomas", "Eduardo",
"RoY", "Marlene", "Amelie", "Claudius", "Herbert", "Cynthia", "Melanie",
"Leander", "Leona", "Tobias", "Leander", "Peter",
"Lilly", "Roxy", "Joachim"), Nationality = structure(c(2L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L), levels = c("Non-foreign", "Foreign"), class = "factor"),
Gender = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L), levels = c("female",
"male"), class = "factor")), class = "data.frame", row.names = c(NA,
-38L))
UPDATE: I have carefully read the vignette for the randomizr package again. I found that it is possible to create blocks with more than 1 covariate. I am now looking to see if I can assign these blocks to the students to get block random groups. I need to test if the code below works as intended.
# One block label per student: nationality and gender joined by "_"
blocks <- paste(data$Nationality, data$Gender, sep = "_")
# Blocked random assignment of the students to six arms
Z <- block_ra(blocks = blocks, num_arms = 6)
# Cross-tabulate each student against the arm they were assigned to
table(data$Student, Z)
I am running mixed linear models using lmer from lme4. We are testing the effect of family, strain and temperature on several growth factors for brook trout. I have 4 families (variable FAMILLE) from which we sampled our individuals. 2 are from the selected strain and 2 are from the control strain (variable Lignee). For each strain, the 2 families were either marked as resistant (Res) or sensitive (Sens). So my fixed effect variable (FAMILLE) is nested in my variable Lignee. The experiment was conducted at 3 different temperatures.
Here is what my dataframe looks like :
structure(list(BASSIN = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4"), class = "factor"), t.visee = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("15", "17", "19"), class = "factor"), FAMILLE = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L), .Label = c("RES", "SENS"), class = "factor"), Lignee = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("CTRL", "SEL"), class = "factor"), taux.croiss.sp.poids = c(0.8,
1.14285714285714, 1.42857142857143, 0.457142857142857, -0.228571428571429,
0.628571428571429, 0.971428571428571, 0.742857142857143, 1.08571428571429,
0.8, 0.571428571428571, 1.02857142857143, 0.8, 0.285714285714286,
0.285714285714286, 0.571428571428571, 0.742857142857143, 1.14285714285714,
0.628571428571429, 0.742857142857143, 1.02857142857143, 0.285714285714286,
0.628571428571429, 0.628571428571429, 0.857142857142857, 0.8,
1.08571428571429, 1.37142857142857, 0.742857142857143, 1.08571428571429,
0.0571428571428571, 0.571428571428571, 0.171428571428571, 0.8,
0.685714285714286, 0.285714285714286, 0.285714285714286, 0.8,
0.457142857142857, 1.02857142857143, 0.342857142857143, 0.742857142857143,
0.857142857142857, 0.457142857142857, 0.742857142857143, 1.25714285714286,
0.971428571428571, 0.857142857142857, 0.742857142857143, 0.514285714285714
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
Lignee has 2 levels (Sel and Ctrl)
FAMILLE has 2 levels (Sens and Res)
So I have 4 distinct levels :
Lignee Sel and FAMILLE Sens
Lignee Sel and FAMILLE Res
Lignee Ctrl and FAMILLE Sens
Lignee Ctrl and FAMILLE Res
when I run for example this line to test the effect of the variables on the rate of weight gain:
# Linear mixed model fitted by maximum likelihood (REML = FALSE).
# Fixed effects: t.visee plus Lignee/FAMILLE, which R's formula syntax expands
# to Lignee + Lignee:FAMILLE (FAMILLE within each level of Lignee).
# Random effects: (1 | BASSIN) gives each BASSIN its own intercept.
model6 <- lmer((taux.croiss.sp.poids) ~ t.visee + Lignee/FAMILLE + (1 |BASSIN), data = mydata1, REML = FALSE)
and then
summary(model6)
<Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: (taux.croiss.sp.poids) ~ t.visee + Lignee/FAMILLE + (1 | BASSIN)
Data: mydata1
AIC BIC logLik deviance df.resid
115.2 139.5 -50.6 101.2 228
Scaled residuals:
Min 1Q Median 3Q Max
-3.11527 -0.59489 0.05557 0.69775 2.79920
Random effects:
Groups Name Variance Std.Dev.
BASSIN (Intercept) 0.01184 0.1088
Residual 0.08677 0.2946
Number of obs: 235, groups: BASSIN, 4
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.770942 0.209508 194.702337 3.680 0.000302 ***
t.visee -0.019077 0.011682 231.005933 -1.633 0.103809
LigneeSEL 0.214062 0.054471 231.007713 3.930 0.000112 ***
LigneeCTRL:FAMILLESENS -0.008695 0.054487 231.038877 -0.160 0.873358
LigneeSEL:FAMILLESENS -0.205001 0.054242 231.016973 -3.779 0.000200 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) t.vise LgnSEL LCTRL:
t.visee -0.948
LigneeSEL -0.131 0.000
LCTRL:FAMIL -0.124 -0.007 0.504
LSEL:FAMILL 0.000 0.000 -0.498 0.000>
From what I can understand, the model chooses 1 family as the reference group, which won't be in the output. But the problem here is that 2 groups are missing :
LigneeCTRL:FAMILLERES
AND
LigneeSEL:FAMILLERES
Does anybody know why my output is missing not ONE but TWO of the groups?
I'm french canadian so don't hesitate if some things are not clear, I will try to re-explain in other words!
Also, this is my 1st message on Stack, I tried to include everything needed but dont hesitate if I need to include some other things!
Thanks in advance
I am trying to run a poisson regression to predict a common binary outcome.
This is my first attempt at using dput - if I have used it inappropriately, please let me know so I can correct it.
Example data:
df <- structure(list(id = 1:30, sex = structure(c(1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L), .Label = c("Female", "Male"
), class = "factor"), migStat = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Australian-born",
"Migrant"), class = "factor"), mhAreaBi = structure(c(1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L), .Label = c("Metropolitan",
"Regional"), class = "factor"), empStatBi = structure(c(2L, 2L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Student / employed",
"Unemployed"), class = "factor"), pensBenBi = structure(c(1L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c("No benefit",
"In receipt of pension benefit"), class = "factor"), maritStatBi = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("Married (including de facto)",
"Not married"), class = "factor"), cto = structure(c(1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor")), .Names = c("id", "sex", "migStat",
"mhAreaBi", "empStatBi", "pensBenBi", "maritStatBi", "cto"), row.names = c(NA,
-30L), class = "data.frame")
When running the regression using glm in R, I receive an error:
# Poisson GLM of cto on the six two-level predictors in df. The response cto
# is a factor here, which is what produces the error and warning quoted below:
# the poisson family's check `y < 0` is not defined for factors.
fit <- glm(cto ~ sex + migStat + mhAreaBi + empStatBi + pensBenBi + maritStatBi, df, family = poisson)
Error in if (any(y < 0)) stop("negative values not allowed for the 'Poisson' family") :
missing value where TRUE/FALSE needed
In addition: Warning message:
In Ops.factor(y, 0) : ‘<’ not meaningful for factors
The same error has been explained briefly in this thread:
Because the "<" operator is not defined for factors the result that is
passed to if is of length 0. Setting the factor variable on the RHS
and using the integer values on hte LHS succeeds.
The error does not appear when I convert the outcome to an integer; however, this:
seems to defeat the purpose of predicting a binary outcome (unless a numeric variable with range 0-1 is treated the same as a factor variable with two levels); and
does not seem necessary (at least according to this post, which uses geeglm from geepack to predict a binary outcome [unfortunately, I receive the same error when I adapt the code to my own dataset])
Questions:
Could I receive further explanation of the error?
If I convert my outcome to an integer with range 0-1, will glm treat it the same as a binary variable? If not, is there an approach better suited to running a regression for a common binary outcome?
I think the best option here is:
# Encode the outcome explicitly: 1 where cto is "Yes", 0 otherwise
df[["cto_binary"]] <- as.numeric(df[["cto"]] == "Yes")
# Poisson regression of the 0/1 outcome on the six predictors
fit <- glm(
  cto_binary ~ sex + migStat + mhAreaBi + empStatBi + pensBenBi + maritStatBi,
  data = df,
  family = poisson
)
As this way you explicitly show in your code what will be a 1/success in your binary outcome and don't get tripped up by things like the ordering of factor levels. Note that in R as.numeric(c(FALSE, TRUE)) gives c(0, 1), so you always know what you're going to get from a logical comparison.
A factorial combination of 16 treatments (4*2*2) was replicated three times and laid out in a strip-split block. Treatments consisted of eight site preparations (4*2) applied as whole plot treatments and two levels of weeding(weeding/no-weeding) were applied randomly to subplots. The analysis was run in Genstat giving the following results:
Variate: result
Source of variation d.f. s.s. m.s. v.r. F pr.
Rep stratum 2 35.735 17.868
Rep.Burning stratum
Burning 1 0.003 0.003 0.00 0.972
Residual 2 3.933 1.966 1.53
Rep.Site_prep stratum
Site_prep 3 7.981 2.660 0.45 0.727
Residual 6 35.477 5.913 4.61
Rep.Burning.Site_prep stratum
Burning.Site_prep 3 2.395 0.798 0.62 0.626
Residual 6 7.691 1.282 0.60
Rep.Burning.Site_prep.*Units* stratum
Weeding 1 13.113 13.113 6.13 0.025
Burning.Weeding 1 0.486 0.486 0.23 0.640
Site_prep.Weeding 3 17.703 5.901 2.76 0.076
Burning.Site_prep.Weeding 3 3.425 1.142 0.53 0.666
Residual 16 34.248 2.141
Total 47 162.190
I want to repeat these results in R. I used both the base::aov function and the lmerTest::lmer function. I managed to get the correct results with aov using function
result ~ Burning * Weeding * Site.prep + Error(Rep/Burning*Site.prep). With lmer I used the function
result ~ Burning*Site.prep*Weeding+(1|Rep/(Burning:Site.prep)), which gave me only partially correct results. The SS values and the F-values for Burning, Site.prep and Burning:Site.prep deviated (although not by much) from the Genstat results, but Weeding and the Weeding interactions gave the same SS and F-values as the Genstat output.
I would like to know how I should specify the lmer model to reproduce the Genstat and aov results.
Data and code below:
x <- structure(list(
Rep = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"),Burning = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("Burn",
"No-burn"), class = "factor"), Site.prep = structure(c(4L, 4L,4L, 4L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L),
.Label = c("Chop_Pit", "Chop_Rip", "Pit", "Rip"), class = "factor"), Weeding = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L),
.Label = c("Weedfree", "Weedy"), class = "factor"),
Dbh14 = c(27.4, 28.4083333333333, 27.7066666666667, 27.3461538461538, 28.6, 28.3333333333333, 27.0909090909091,
27.8076923076923, 27.1833333333333, 27.5461538461538, 24.3076923076923,
29.3461538461538, 27.4, 25.1, 26.61, 28.0461538461538, 27.71,
25.2533333333333, 25.3833333333333, 24.2307692307692, 24.2533333333333,
24.95, 24.34375, 26.9909090909091, 24.775, 25.9076923076923,
25.1666666666667, 25.9933333333333, 27.0466666666667, 30.5625,
27.36, 25.2636363636364, 29.6846153846154, 27.7, 28.3071428571429,
29.4857142857143, 27.025, 30.1, 31.2454545454545, 24.2888888888889,
28.4875, 29.23, 30, 28.5, 29.3615384615385, 27.45, 28.8153846153846,
29.1866666666667)), .Names = c("Rep", "Burning", "Site.prep",
"Weeding", "result"), class = "data.frame", row.names = c(NA, -48L))
# ANOVA with an explicit Error() term defining the error strata
model1 <- aov(
  result ~ Burning * Weeding * Site.prep + Error(Rep / Burning * Site.prep),
  data = x
)
summary(model1)

# Mixed-model attempt: random intercepts for Rep and for the
# Burning:Site.prep combinations nested within Rep
model2 <- lmer(
  result ~ Burning * Site.prep * Weeding + (1 | Rep / (Burning:Site.prep)),
  data = x
)
anova(model2)
Applying the three-way split-plot-factorial ANOVA example from the site mentioned by @cuttlefish44 leads to:
library(lme4)
library(nlme)
# Reference fit: ANOVA with error strata
m1 <- aov(
  result ~ Weeding * Burning * Site.prep + Error(Rep / Burning * Site.prep),
  data = x
)

# lmer fit with separate random intercepts for Rep, Burning:Rep and
# Site.prep:Rep
m2 <- lmer(
  result ~ Weeding * Burning * Site.prep +
    (1 | Rep) + (1 | Burning:Rep) + (1 | Site.prep:Rep),
  data = x
)

# nlme fit by maximum likelihood with a blocked random-effects structure
# within Rep; only the resulting ANOVA table is stored
m3 <- anova(
  lme(
    result ~ Weeding * Burning * Site.prep,
    random = list(
      Rep = pdBlocked(list(~ 1, pdIdent(~ Burning - 1), pdIdent(~ Site.prep - 1)))
    ),
    method = "ML",
    data = x
  )
)

# Show the three tables for comparison
summary(m1)
anova(m2)
m3
Except for Site.prep, the results match. Moreover, the results between lmer() and lme() are pretty similar (also for Site.prep). I'm not sure whether this is the result of differences in modelling approaches: the multi-level approach takes both within and between effects into account.
I'm new to R and have the following challenge;
I want to create a visualization that basically combines 2 kinds of 'heatmaps' in order to visualize at what times there are truly dark skies (for astronomy). For this I want to have a heatmap that visualizes the brightness of the moon based on the moonrise and moonset times and the phase of the moon. On top of this we can then plot a band-like heatmap, with some transparency, for the time the sun is up.
I'm not sure if this is going to work visually or if I need to find some other solution; however, this seems like a good challenge to get into R some more.
But I could use some pointers, as I'm already stuck loading the matrix of size 24 (hours) x 30 (days) with all 720 values. When trying to create a basic data.frame from the vectors I get an error saying that the numbers of rows are inconsistent.
Furthermore I have some heatmap examples working already, but I'm not sure how to combine 2 of them in the same plot like I described.
As an illustration the current 'heatmap' as it is in excel
And some data:
MOON
moon <- structure(list(X1.9.12 = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L), .Label = c("0%", "100%"), class = "factor"), X2.9.12 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("0%", "98%"), class = "factor"),
X3.9.12 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L
), .Label = c("0%", "94%"), class = "factor"), X4.9.12 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0%", "89%"), class = "factor"),
X5.9.12 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
), .Label = c("0%", "82%"), class = "factor"), X6.9.12 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("0%", "74%"), class = "factor"),
X7.9.12 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("0%", "65%"), class = "factor"), X8.9.12 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0%", "56%"), class = "factor"),
X9.9.12 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("0%", "47%"), class = "factor"), X10.9.12 = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0%", "37%"), class = "factor"),
X11.9.12 = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("0%", "28%"), class = "factor"), X12.9.12 = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0%", "20%"), class = "factor"),
X13.9.12 = structure(c(2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L
), .Label = c("0%", "12%"), class = "factor"), X14.9.12 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0%", "6%"), class = "factor"),
X15.9.12 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("0%", "2%"), class = "factor"), X16.9.12 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0%", class = "factor"),
X17.9.12 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("0%", "1%"), class = "factor")), .Names = c("X1.9.12",
"X2.9.12", "X3.9.12", "X4.9.12", "X5.9.12", "X6.9.12", "X7.9.12",
"X8.9.12", "X9.9.12", "X10.9.12", "X11.9.12", "X12.9.12", "X13.9.12",
"X14.9.12", "X15.9.12", "X16.9.12", "X17.9.12"), class = "data.frame", row.names = c("0:00:00",
"1:00:00", "2:00:00", "3:00:00", "4:00:00", "5:00:00", "6:00:00",
"7:00:00", "8:00:00", "9:00:00", "10:00:00", "11:00:00", "12:00:00",
"13:00:00", "14:00:00", "15:00:00", "16:00:00", "17:00:00", "18:00:00",
"19:00:00", "20:00:00", "21:00:00", "22:00:00", "23:00:00"))
SUN
September
Day Sunrise Sunset
1 6:52 20:26
2 6:54 20:24
3 6:56 20:22
4 6:57 20:20
5 6:59 20:17
6 7:00 20:15
7 7:02 20:13
8 7:04 20:10
9 7:05 20:08
10 7:07 20:06
11 7:08 20:05
12 7:09 20:02
13 7:11 20:00
14 7:13 19:58
15 7:14 19:55
16 7:16 19:53
17 7:17 19:51
18 7:19 19:48
19 7:21 19:46
20 7:22 19:44
21 7:25 19:40
22 7:26 19:38
23 7:28 19:35
24 7:30 19:33
25 7:31 19:31
26 7:33 19:28
27 7:35 19:26
28 7:36 19:24
29 7:38 19:21
30 7:40 19:19
So from what I understood, there are basically two questions:
Data organization
The easiest would be, if you'd have all data in one data.frame in long format. I.e. for each combination of time and date you have one row, with additional columns for the moon and sun intensity.
So we start with melting and fixing the moon data:
library(reshape2)
# Keep the hour labels (currently the row names) as an ordinary column
moon[["time"]] <- rownames(moon)
# Long format: one row per time/date pair, with the percentage text in `moon`
moon <- melt(moon, id.vars = "time", variable.name = "date", value.name = "moon")
# Drop the "X" that prefixes the former column names
moon[["date"]] <- sub("X(.*)", "\\1", moon[["date"]])
# Percentage text -> fraction, stored as one minus that fraction
moon[["moon"]] <- 1 - as.numeric(sub("%", "", moon[["moon"]])) / 100
Now we bring the sun data into a comparable form by at least giving it the same identifier for the date:
# Append ".9.12" to the day number so it matches the moon table's date labels
sun[["Day"]] <- paste(sun[["Day"]], "9.12", sep = ".")
Next step is to merge the data by the date resp. Day and to set a comparable column for the sun intensity as is given already for the moon intensity. This can be done by casting the times to a time format and compare Sunrise and Sunset with the actual time:
# Join the long moon table to the sun table on the shared date label
mdf <- merge(moon, sun, by.x = "date", by.y = "Day")

# Parse the hour labels, and parse Sunrise/Sunset rounded to the nearest hour
mdf$time.tmp <- strptime(mdf$time, format = "%H:%M")
mdf$Sunrise <- round(strptime(mdf$Sunrise, format = "%H:%M"), units = "hours")
mdf$Sunset <- round(strptime(mdf$Sunset, format = "%H:%M"), units = "hours")

# sun is 1 for hours between (rounded) sunrise and sunset inclusive, else 0
mdf$sun <- as.numeric(mdf$time.tmp >= mdf$Sunrise & mdf$time.tmp <= mdf$Sunset)

# Keep only the columns needed for plotting and show a few rows
mdf <- mdf[c("date", "time", "moon", "sun")]
mdf[5:10, ]
date time moon sun
1.9.12 4:00:00 0 0
1.9.12 5:00:00 0 0
1.9.12 6:00:00 0 0
1.9.12 7:00:00 0 1
1.9.12 8:00:00 1 1
1.9.12 9:00:00 1 1
Plotting
Adding multiple layers with different transparencies begs literally for ggplot2. In order to use this in a proper way, there is one more data manipulation necessary, which ensures the proper order on the axes: date and time have to be converted to factors with factor levels ordered not lexically, but by time:
# Convert date and time to factors whose levels follow chronological order
# instead of alphabetical order, so that the plot axes are sorted sensibly.
mdf <- within(mdf, {
  # Levels: the distinct dates sorted by calendar date (day.month.year)
  date <- factor(date, levels = unique(date)[order(as.Date(unique(date), "%d.%m.%y"))])
  # Levels: the distinct times from latest to earliest. The ordering is
  # computed on unique(time) so that its indices refer to the vector being
  # indexed; ordering the full column yields row positions that mostly fall
  # outside unique(time) and only work by accident.
  time <- factor(time, levels = unique(time)[order(strptime(unique(time), format = "%H:%M:%S"),
                                                   decreasing = TRUE)])
})
This can now be plotted:
library( ggplot2 )
# Tile plot with one cell per date/time pair; the sun and moon columns each
# drive the transparency of their own coloured layer
ggplot(mdf, aes(date, time)) +
  geom_tile(aes(alpha = sun), fill = "goldenrod1") +
  geom_tile(aes(alpha = moon), fill = "dodgerblue3") +
  scale_alpha_continuous("moon", range = c(0, 0.5)) +
  theme_bw() +
  # Rotate the date labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Which gives you the following result