Related
I've been working on the following dataset
> dput(db_analysis)
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
107, 108, 109, 110, 111, 112, 113), GROUP = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("CONTROL",
"TRAINING"), class = "factor"), Gender = c(1, 0, 1, 0, 1, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
1, 0), Age = c(74, 76, 81, 74, 69, 72, 75, 83, 78, 72, 82, 68,
72, 72, 73, 80, 69, 72, 70, 80, 75, 80, 78, 74, 82, 74, 80, 82,
78, 81, 66, 71, 70, 79, 78, 73, 72, 77, 77, 71, 83, 74, 70, 71,
77, 69, 67, 64, 79, 71, 77, 77, 73, 67, 68, 79, 81, 67, 84, 75,
80, 73, 68, 74, 77, 79, 79, 72, 73, 78, 76, 78, 77, 74, 78, 77,
77, 82, 77, 70, 77, 81, 79, 75, 74, 78, 69, 77, 73, 77, 70, 79,
70, 72, 77, 72, 71, 71, 73, 81, 70, 72, 68, 70, 73, 82, 81, 73,
76, 85, 70, 77, 65), Education = c(18, 4, 8, 5, 8, 11, 5, 5,
4, 8, 8, 12, 5, 18, 13, 5, 13, 13, 5, 5, 13, 5, 3, 8, 17, 5,
8, 5, 5, 8, 17, 8, 18, 18, 13, 13, 13, 13, 15, 17, 8, 5, 5, 13,
8, 5, 11, 13, 8, 8, 8, 5, 13, 8, 5, 17, 8, 12, 13, 5, 8, 8, 8,
5, 3, 8, 18, 5, 8, 13, 8, 5, 17, 8, 5, 17, 5, 8, 11, 8, 8, 5,
12, 3, 8, 8, 8, 13, 5, 5, 8, 8, 13, 5, 5, 8, 13, 5, 8, 12, 5,
13, 12, 8, 5, 17, 5, 5, 5, 8, 13, 10, 8), ADAS_CogT0 = c(14.66,
15.33, 17.33, 19, 7.66, 12.6, 18.67, 14.99, 17.99, 17.33, 13.66,
16.99, 10.66, 9.66, 14.99, 15.66, 13.33, 4.33, 14.33, 15.99,
16.33, 10.66, 14.66, 10.66, 19.33, 17.66, 15.99, 20.66, 20.6,
17, 10.33, 6.33, 6.66, 19.99, 13.33, 24.33, 12.33, 10.33, 12.33,
9.66, 10.99, 13.99, 23, 6.32, 11.32, 13.99, 14.66, 8.99, 14.33,
9.99, 7.33, 15.66, 14, 7.99, 23.32, 14.66, 9.99, 5.66, 6.99,
11.66, 10.33, 6.99, 19.32, NA, 10, 17.66, 13.66, 10.32, NA, NA,
8.66, 9, 6.99, 14.99, 9.66, 13.66, 15.32, 12, 14, 13.66, 11.99,
15.66, 16, 15, 16.99, 20, 11, 7.99, 8.33, 8.32, 14.99, 18.66,
10.33, 11.99, 9.32, 17, 14.33, 14.66, 16.6, 9.99, NA, 17.66,
18.66, NA, 19, 11.9, 16.66, 9.33, 10, 13.99, 7.66, 8.66, 9.32
), ADAS_CogT7 = c(16, 9.32, 21.33, 17, 8.32, 11, 14.99, 10.99,
17, 18.33, 13.32, 14.34, 8.99, 7, 11.99, 15.33, 6.99, 5.33, 12.32,
13, 21.32, 7.99, 13.33, 11.99, 17.32, 16.32, 16.33, 14.66, 18.99,
17.33, 7.99, 9.33, 10.99, NA, 12.99, 16.33, 21.66, 9, 9.34, 8.66,
8.33, 13.66, 15.66, 6.66, 10.99, 13.33, 13.33, 7.99, 11.99, 11.32,
7.33, 9.66, 6.99, NA, 15.99, 15.66, 14.66, 6.32, 7, 11, 14, 10.33,
24.66, NA, 14.99, NA, 15.99, 9.32, NA, NA, 9.99, 9.33, 7.66,
17.33, 10.32, 16, 17, 12.99, 15, 14.33, 10, 14.99, 19, 13.99,
19.33, NA, 10, 6.99, 11.66, 6.66, 14.33, 16, 8.66, 10, NA, 20,
14.99, 19.66, 26.66, 8.99, NA, 14.99, 20.99, NA, 17.99, 12.33,
19, 11.33, 10.66, 16.66, 11.33, 9.66, 6.99)), row.names = c(NA,
-113L), class = c("tbl_df", "tbl", "data.frame"))
>
ADAS_CogT0 and ADAS_CogT7 are score of psychological test gather at time0 and at time7 both into a CONTROL group as well as into a TRAINING one. Since I need to fit a model fro the variable at time 7 corrected for its initial score and evalue the effect as well of variable TIME, GROUP and TIME*GROUP, I've scripted down the following code:
db_long <- db_analysis %>%
dplyr::select(ID, GROUP, Age, Gender,ADAS_CogT0, ADAS_CogT7,Education) %>%
na.omit() %>%
pivot_longer(
c(ADAS_CogT0, ADAS_CogT7), names_to = "time", values_to = "score"
) %>%
mutate(
time = factor(if_else(
time == "ADAS_CogT0", "0", "7"
), levels = c("0", "7")),
ID = factor(ID)
)
And fitted this model
options(datadist = "dd")
dd <- datadist(db_long)
ols_fit <- ols(
score ~ time * GROUP + Age ,
data = db_long,
x = TRUE,
y = TRUE
)
However the out output does not reflect the I needed to calculate.
Effects Response : score
Factor Low High Diff. Effect S.E. Lower 0.95 Upper 0.95
Age 71.25 78 6.75 0.62149 0.58269 -0.52749 1.770500
time - 7:0 1.00 2 NA -1.40080 0.44476 -2.27770 -0.523760
GROUP - CONTROLLO:TRAINING 2.00 1 NA -1.66800 0.79710 -3.23980 -0.096255
Adjusted to: time=0 GROUP=TRAINING
How should manipulate the dataset to fit the model in a way I obtain evalaution of score time7 as a dependent variables of time, group, time*group and adjusted for its initial score at time0?
The goal of this code to run rpart to create a model to predict quality/wine quality. I then tried to graph the in-sample and out of sample from validating the model. The plot outputted has number of leaves as the x and loss on the y.
Here is my code:
# rm(list=ls())
### read the dataset and make sure to change the diresctory
wine_quality <- read.csv("C:/Users/machu/Documents/data_analytics_program/wine_quality.csv")
### there are some data missing, I have come up several ways to deal with that:
# there are pros and cons for both. I will leave this to you guys which way to choose
## 1. drop rows that include na
wine_quality = na.omit(wine_quality)
## 2. replace the na with the mean of the columns
# for(i in 1:ncol(wine_quality)){
# wine_quality[is.na(wine_quality[,i]), i] <- mean(wine_quality[,i], na.rm = TRUE)
# }
### changing quality to factor type
wine_quality$quality <- factor(wine_quality$quality)
wine_quality$type <- factor(wine_quality$type )
### rpart
library(rpart)
library(tree)
library(RColorBrewer)
library(rattle)
#train, val, test
set.seed(99)
n=nrow(wine_quality)
n1=floor(n/2)
n2=floor(n/4)
n3=n-n1-n2
ii = sample(1:n,n)
train = wine_quality[ii[1:n1],]
train = subset(train, select = -1)
val = wine_quality[ii[n1+1:n2],]
val = subset(val, select = -1)
test = wine_quality[ii[n1+n2+1:n3],]
test = subset(test, select = -1)
### get big tree at first
big.tree = rpart(quality~.,
method="class",
data=train,
control=rpart.control(minsplit=5,
cp=.0001))
nbig = length(unique(big.tree$where))
cat('Number of leaf nodes: ',nbig,'\n')
cpvec = big.tree$cptable[,"CP"] #cp values to try
ntree = length(cpvec) #number of cv values = number of trees fit.
iltree = rep(0,ntree) #in-sample loss
oltree = rep(0,ntree) #out-of-sample loss
sztree = rep(0,ntree) #size of each tree
for(i in 1:ntree) {
if((i %% 5)==0) cat('tree i: ',i, "out of", ntree, '\n')
temptree = prune(big.tree,cp=cpvec[i]) #Pruned tree by cp
sztree[i] = length(unique(temptree$where)) #Number of leaves
predicted_value_is = c()
for(k in nrow(predict(temptree))){
predicted_value_is = c(predicted_value_is, which.max(predict(temptree)[k,]+2))
}
iltree[i] = sum(train$quality!=predicted_value_is) #in-sample loss
ofit = predict(temptree,val) #Validation prediction
predicted_value_val = c()
for(j in nrow(ofit)){
predicted_value_val = c(predicted_value_val, which.max(ofit[j,]+2))
}
oltree[i] = sum(val$quality!=predicted_value_val) #out-of-sample loss
}
### RMSE out-of-sample
oltree=sqrt(oltree/nrow(val))
### RMSE in-sample
iltree = sqrt(iltree/nrow(train))
rgl = range(c(iltree,oltree))
plot(range(sztree),rgl,
type='n', #Type = n removes points from plot
xlab='Number of Leaves',ylab='Loss')
points(sztree,iltree,
pch=15, #Type of point
col='red')
points(sztree,oltree,
pch=16, #Type of point
col='blue')
legend("center", #Position of the legend
legend=c('in-sample','out-of-sample'), #Text in the legend
pch=c(15,16), #Types of points
col=c('red','blue')) #Color of pointsenter
Here is an example of the data:
structure(list(X = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
97, 99, 100, 101, 102, 103, 104), type = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("red", "white"), class = "factor"), fixed.acidity = c(7,
6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7, 6.3, 8.1, 8.1, 8.6, 7.9, 6.6,
8.3, 6.6, 6.3, 7.4, 6.5, 6.2, 6.4, 6.8, 7.6, 6.6, 7, 6.9, 7,
7.4, 7.2, 8.5, 8.3, 7.4, 5.8, 7.3, 6.5, 7, 7.3, 7.3, 6.7, 6.7,
7, 6.6, 6.7, 7.4, 6.2, 6.2, 7, 6.9, 7.2, 6.6, 6.2, 6.4, 6.9,
7.2, 6, 6.6, 7.4, 6.8, 6, 7, 6.6, 7.2, 6.2, 6.4, 6.7, 6.7, 7.4,
6.2, 6.8, 6, 8.6, 6.7, 7.4, 7.1, 7, 7.4, 6.7, 6.8, 7.1, 7.1,
7.3, 7.1, 7.1, 6.8, 7.1, 7.1, 7.2, 6.1, 6.9, 6.9, 7.5, 7.1, 6,
8.6, 6, 7.4, 7.1, 6, 7.5, 7.4), volatile.acidity = c(0.27, 0.3,
0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0.3, 0.22, 0.27, 0.23, 0.18,
0.16, 0.42, 0.17, 0.48, 0.34, 0.31, 0.66, 0.31, 0.26, 0.67, 0.27,
0.25, 0.24, 0.28, 0.27, 0.32, 0.24, 0.14, 0.25, 0.27, 0.28, 0.39,
0.33, 0.24, 0.24, 0.23, 0.24, 0.31, 0.24, 0.23, 0.18, 0.45, 0.46,
0.31, 0.19, 0.19, 0.25, 0.16, 0.18, 0.25, 0.21, 0.19, 0.38, 0.2,
0.22, 0.19, 0.47, 0.38, 0.24, 0.35, 0.26, 0.25, 0.23, 0.24, 0.27,
0.3, 0.27, 0.23, 0.23, 0.24, 0.18, 0.32, 0.18, 0.54, 0.22, 0.2,
0.34, 0.22, 0.43, 0.44, 0.25, 0.43, 0.44, 0.39, 0.27, 0.24, 0.21,
0.17, 0.26, 0.34, 0.265, 0.34, 0.25, 0.12, 0.21, 0.305, 0.25),
citric.acid = c(0.36, 0.34, 0.4, 0.32, 0.32, 0.4, 0.16, 0.36,
0.34, 0.43, 0.41, 0.4, 0.37, 0.4, 0.62, 0.38, 0.04, 0.42,
0.14, 0.48, 0.38, 0.42, 0.14, 0.41, 0.32, 0.35, 0.39, 0.48,
0.36, 0.39, 0.34, 0.36, 0.2, 0.43, 0.23, 0.32, 0.39, 0.39,
0.39, 0.39, 0.26, 0.27, 0.26, 0.31, 0.26, 0.25, 0.26, 0.35,
0.31, 0.29, 0.33, 0.35, 0.35, 0.34, 0.26, 0.15, 0.36, 0.24,
0.26, 0.07, 0.15, 0.27, 0.03, 0.24, 0.13, 0.31, 0.29, 0.43,
0.23, 0.28, 0.46, 0.31, 0.29, 0.36, 0.34, 0.3, 0.28, 0.31,
0.34, 0.2, 0.3, 0.61, 0.62, 0.31, 0.61, 0.62, 0.63, 0.43,
0.33, 0.33, 0.32, 0.29, 0.66, 0.36, 0.66, 0.37, 0.32, 0.24,
0.4, 0.37), residual.sugar = c(20.7, 1.6, 6.9, 8.5, 8.5,
6.9, 7, 20.7, 1.6, 1.5, 1.45, 4.2, 1.2, 1.5, 19.25, 1.5,
1.1, 1.1, 7.5, 1.2, 2.9, 1.7, 1.5, 1.3, 9, 1, 8.7, 1.1, 2,
10.4, 1.1, 2.05, 14.95, 1.7, 5.4, 1.2, 17.95, 17.95, 2.5,
2.9, 7.4, 1.4, 1.4, 1.4, 4.4, 4.4, 7.4, 5, 1.6, 1.1, 1.1,
1, 1.3, 11.9, 12.4, 4.6, 1.2, 4.9, 12.4, 1.1, 4.6, 1.4, 1.2,
6.4, 1.2, 2.1, 10.1, 7.8, 4.6, 4.8, 1, 2.1, 10.1, 1.4, 1.3,
8.8, 5.4, 1.4, 16, 6.1, 8.2, 11.8, 11.8, 13.3, 11.8, 11.8,
11, 7.5, 1.7, 1.8, 1.7, 12.4, 15.9, 1.2, 15.9, 13.5, 9.6,
12.1, 18.9, 13.5), chlorides = c(0.045, 0.049, 0.05, 0.058,
0.058, 0.05, 0.045, 0.045, 0.049, 0.044, 0.033, 0.035, 0.04,
0.044, 0.04, 0.032, 0.046, 0.033, 0.044, 0.029, 0.038, 0.049,
0.074, 0.052, 0.046, 0.052, 0.051, 0.047, 0.033, 0.044, 0.042,
0.05, 0.044, 0.08, 0.051, 0.053, 0.057, 0.057, 0.172, 0.173,
0.069, 0.057, 0.06, 0.058, 0.063, 0.066, 0.069, 0.067, 0.062,
0.068, 0.057, 0.045, 0.039, 0.043, 0.048, 0.044, 0.038, 0.092,
0.048, 0.035, 0.044, 0.038, 0.064, 0.04, 0.041, 0.046, 0.05,
0.056, 0.061, 0.063, 0.054, 0.046, 0.05, 0.043, 0.042, 0.064,
0.06, 0.053, 0.05, 0.063, 0.047, 0.045, 0.044, 0.05, 0.045,
0.044, 0.044, 0.049, 0.035, 0.034, 0.04, 0.044, 0.046, 0.034,
0.046, 0.06, 0.054, 0.05, 0.059, 0.06), free.sulfur.dioxide = c(45,
14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 16, 48, 41, 28,
30, 17, 34, 29, 19, 41, 25, 16, 56, 35, 32, 17, 37, 20, 7,
31, 22, 21, 25, 38, 45, 45, 63, 63, 28, 33, 33, 38, 63, 62,
28, 32, 31, 39, 21, 39, 29, 37, 50, 25, 44, 30, 50, 17, 25,
31, 29, 27, 81, 30, 21, 48, 50.5, 31, 9, 30, 21, 31, 20,
26, 21, 34, 51, 47, 42, 54, 52, 69, 54, 52, 55, 65, 47, 48,
51, 62, 26, 15, 26, 52, 64, 55, 44, 52), total.sulfur.dioxide = c(170,
132, 97, 186, 186, 97, 136, 170, 132, 129, 63, 109, 75, 143,
172, 112, 99, 171, 133, 75, 102, 122, 168, 142, 245, 146,
141, 132, 114, 142, 47, 100, 179, 123, 149, 138, 149, 149,
158, 157, 160, 152, 154, 167, 206, 207, 160, 150, 173, 124,
82, 108, 191, 213, 147, 78, 111, 123, 147, 151, 78, 122,
120, 124, 174, 96, 105, 244, 238.5, 201, 72, 96, 105, 87,
69, 103, 105, 114, 166, 164, 207, 155, 152, 202, 155, 152,
156, 243, 136, 136, 148, 240, 164, 80, 164, 192, 162, 164,
170, 192), density = c(1.001, 0.994, 0.9951, 0.9956, 0.9956,
0.9951, 0.9949, 1.001, 0.994, 0.9938, 0.9908, 0.9947, 0.992,
0.9912, 1.0002, 0.9914, 0.9928, 0.9917, 0.9955, 0.9892, 0.9912,
0.993, 0.9937, 0.9951, 0.9955, 0.993, 0.9961, 0.9914, 0.9906,
0.9974, 0.9934, 0.992, 0.9962, 0.9905, 0.9934, 0.9906, 0.9999,
0.9999, 0.9937, 0.9937, 0.9954, 0.9934, 0.9934, 0.9931, 0.994,
0.9939, 0.9954, 0.995, 0.9917, 0.9914, 0.991, 0.9911, 0.9908,
0.9962, 0.9972, 0.9931, 0.9926, 0.9951, 0.9972, 0.991, 0.9931,
0.9927, 0.9934, 0.9903, 0.992, 0.9926, 0.9962, 0.9956, 0.9958,
0.9964, 0.9941, 0.9926, 0.9962, 0.9898, 0.9912, 0.9961, 0.9949,
0.9929, 0.9985, 0.9946, 0.9966, 0.9974, 0.9975, 0.9972, 0.9974,
0.9975, 0.9974, 0.9957, 0.99, 0.9899, 0.9916, 0.9969, 0.9979,
0.9913, 0.9979, 0.9975, 0.9962, 0.997, 1, 0.9975), pH = c(3,
3.3, 3.26, 3.19, 3.19, 3.26, 3.18, 3, 3.3, 3.22, 2.99, 3.14,
3.18, 3.54, 2.98, 3.25, 3.24, 3.12, 3.22, 3.33, 3.17, 3.47,
3.05, 3.42, 3.25, 3.45, 3.38, 3.19, 3.1, 3.2, 3.47, 3.19,
3.37, 3.19, 3.24, 3.13, 3.21, 3.21, 3.11, 3.1, 3.13, 3.22,
3.24, 3.16, 3.27, 3.25, 3.13, 3.36, 3.35, 3.34, 3.32, 3.31,
3.13, 3.09, 3.3, 3.11, 3.36, 3.03, 3.3, 3.02, 3.11, 3.15,
3.22, 3.22, 3.14, 3.33, 3.13, 3.1, 3.32, 3.69, 2.95, 3.33,
3.13, 3.26, 3.31, 2.94, 3.27, 3.39, 3.21, 3.17, 3.33, 3.11,
3.12, 3.22, 3.11, 3.12, 3.09, 3.12, 3.26, 3.25, 3.21, 3.04,
3.14, 2.95, 3.14, 3, 3.4, 3.34, 2.99, 3), sulphates = c(0.45,
0.49, 0.44, 0.4, 0.4, 0.44, 0.47, 0.45, 0.49, 0.45, 0.56,
0.53, 0.63, 0.52, 0.67, 0.55, 0.36, 0.53, 0.5, 0.39, 0.35,
0.48, 0.51, 0.47, 0.5, 0.44, 0.53, 0.49, 0.71, 0.53, 0.4,
0.44, 0.37, 0.42, 0.35, 0.28, 0.36, 0.36, 0.36, 0.34, 0.46,
0.56, 0.56, 0.53, 0.52, 0.52, 0.46, 0.48, 0.44, 0.58, 0.46,
0.35, 0.52, 0.5, 0.36, 0.38, 0.34, 0.46, 0.36, 0.34, 0.38,
0.46, 0.54, 0.49, 0.42, 0.64, 0.35, 0.51, 0.6, 0.71, 0.49,
0.64, 0.35, 0.37, 0.65, 0.56, 0.37, 0.77, 0.6, 0.42, 0.46,
0.45, 0.46, 0.48, 0.45, 0.46, 0.44, 0.47, 0.4, 0.41, 0.44,
0.42, 0.5, 0.36, 0.5, 0.44, 0.41, 0.39, 0.46, 0.44), alcohol = c(8.8,
9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11, 12, 9.7, 10.8,
12.4, 9.7, 11.4, 9.6, 11.3, 9.5, 12.8, 11, 10.5, 9.3, 10,
10.4, 10, 10.5, 11.6, 12.3, 10, 10.2, 10.8, 10.2, 12.8, 10,
11.2, 8.6, 8.6, 9.4, 9.4, 9.8, 9.5, 9.5, 10, 9.8, 9.8, 9.8,
9.8, 11.7, 11, 10.9, 10.9, 11, 9.6, 8.9, 10.2, 9.9, 8.6,
8.9, 10.5, 10.2, 10.3, 9.1, 12.6, 9.8, 10.7, 9.5, 9, 9.5,
10, 9.1, 10.7, 9.5, 12.7, 12, 9.3, 9, 10.6, 9.2, 10, 9.5,
8.7, 8.7, 9.7, 8.7, 8.7, 8.7, 9, 12.6, 12.6, 11.5, 9.2, 8.8,
11.4, 8.8, 9.1, 9.4, 9.4, 9, 9.1), quality = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 5L, 3L, 5L,
4L, 4L, 3L, 6L, 5L, 6L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 4L, 4L,
4L, 3L, 3L, 3L, 4L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 2L, 3L,
4L, 3L, 4L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
4L, 3L, 5L, 3L, 6L, 3L, 4L, 3L, 3L, 4L, 6L, 3L, 5L, 5L, 3L,
3L, 4L, 4L, 3L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 5L, 5L, 5L,
4L, 4L, 5L, 4L, 3L, 3L, 3L, 3L, 3L), .Label = c("3", "4",
"5", "6", "7", "8", "9"), class = "factor")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31", "32", "33", "35", "36", "37",
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48",
"49", "50", "51", "52", "53", "54", "56", "57", "58", "59", "60",
"61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71",
"72", "73", "74", "75", "76", "77", "78", "79", "80", "81", "82",
"83", "84", "85", "86", "88", "89", "90", "91", "92", "93", "94",
"95", "96", "97", "98", "100", "101", "102", "103", "104", "105"
), class = "data.frame")
The result I'm getting on the chart doesn't seem right based on what I'm used to. I think it should be more rounded in terms of the points on the graph. At the moment, it just doesn't look right to me. Does anyone have any ideas what might going on? Any feedback provided is very appreciated.
I have a dataset I am trying to get only a certain portion of according to specific criteria but am stuck on how to do so. I think that it has something to do with using the which() and intersect() functions, but I am unfamiliar with them and don't know how to use them.
I have a dataset with a bunch of states, the amounts of gun deaths in the state and their "Brady Score". I am trying to get the states with gun deaths under 4 (per 100,000) and Brady Scores under 0, and then arrange them in a table.
This is the data I am working with:
dput(Guns)
structure(list
(Jurisdiction = structure(1:51, .Label = c("Alabama",
"Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
"D.C.", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
"Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
"Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
"Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
"Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island",
"South Carolina", "South Dakota", "Tennessee", "Texas", "Utah",
"Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin",
"Wyoming"), class = "factor"),
Homicide.rate = c(7.1, 4.1, 5.5,
5.9, 5, 3.1, 4.1, 13.9, 6.2, 5.2, 5.9, 2.1, 1.8, 5.8, 4.7, 1.5,
2.9, 4.5, 10.8, 1.9, 6.3, 1.8, 7, 1.8, 7.4, 6.5, 2.7, 2.9, 4.5,
1.1, 4.4, 5.6, 3.5, 4.9, 4, 4.3, 5.7, 2.4, 5.4, 3.2, 6.9, 3,
6, 4.4, 1.8, 1.3, 3.8, 3, 3.9, 3, 2.4),
Gun.accident.rate = c(0.44,
0, 0.1, 0.41, 0.08, 0.18, 0, 0, 0, 0.13, 0.29, 0, 0.38, 0.14,
0.22, 0.11, 0.2, 0.4, 0.75, 0, 0.07, 0, 0.09, 0.08, 0.65, 0.33,
0.43, 0.29, 0.13, 0, 0, 0, 0.02, 0.29, 0, 0.14, 0.39, 0.13, 0.28,
0, 0.41, 0, 0.37, 0.18, 0, 0, 0.13, 0.12, 0.23, 0.09, 0),
Sum = c(7.5,
4.1, 5.6, 6.3, 5.1, 3.3, 4.1, 13.9, 6.2, 5.3, 6.2, 2.1, 2.2,
5.9, 4.9, 1.6, 3.1, 4.9, 11.6, 1.9, 6.4, 1.8, 7.1, 1.9, 8.1,
6.8, 3.1, 3.2, 4.6, 1.1, 4.4, 5.6, 3.5, 5.2, 4, 4.4, 6.1, 2.5,
5.7, 3.2, 7.3, 3, 6.4, 4.6, 1.8, 1.3, 3.9, 3.1, 4.1, 3.1, 2.4
),
Brady.score = c(3.5, -7, -8, 1, 75, 14.5, 70, 50, 34.5, 3,
2, 58.5, 0, 45, 4.5, 14, -4, -3.5, -2, 3, 66.5, 60.5, 15, 19.5,
-4, -0.5, -3, 6.5, 1.5, 5.5, 68.5, 0, 65.5, 1.5, 2, 10, 1, 11,
20, 41.5, 1, -4.5, 2, 1.5, -2, -4, 7, 19.5, 3, 13, -5),
Brady.grade = structure(c(8L,
10L, 10L, 10L, 1L, 5L, 1L, 2L, 3L, 10L, 10L, 4L, 10L, 2L, 8L,
6L, 10L, 10L, 10L, 10L, 1L, 4L, 5L, 5L, 10L, 10L, 10L, 7L, 10L,
8L, 1L, 10L, 1L, 10L, 10L, 7L, 10L, 9L, 5L, 3L, 10L, 10L, 10L,
10L, 10L, 10L, 7L, 5L, 10L, 6L, 10L), .Label = c("A?", "B", "B?",
"B+", "C", "C?", "D", "D?", "D+", "F"), class = "factor")),
class = "data.frame", row.names = c(NA,
-51L))
So far I have this:
LowB=(Guns$Brady.score<0)
LowD=(Guns$Sum<4)
LowB1=Guns[LowB,]$Brady.score
LowD1=Guns[LowD,]$Sum
intersect(LowB1,LowD1)
I have succeeded in converting the Brady Scores and Gun Deaths (Sum) into numerical variables, but now have no idea how to align them into a table where each state matches to its correspondent Brady Score and Gun Death sum.
To reiterate, what I want to get at the end is a table with the states that have both a Brady score below a certain number and a Sum below a certain number, where all three variables correspond to eachother. Is there any way I can do this? Thank you.
We can use subset and select the columns that we need.
subset(Guns, Brady.score < 0 & Sum < 4,
select = c('Jurisdiction', 'Sum', 'Brady.score'))
# Jurisdiction Sum Brady.score
#17 Kansas 3.1 -4.0
#27 Montana 3.1 -3.0
#42 South Dakota 3.0 -4.5
#45 Utah 1.8 -2.0
#46 Vermont 1.3 -4.0
#51 Wyoming 2.4 -5.0
In dplyr, we can use filter and select.
library(dplyr)
Guns %>%
filter(Brady.score < 0 & Sum < 4) %>%
select(Jurisdiction, Sum, Brady.score)
An additional Base R method:
df[which(df$Brady.score < 0 & df$Sum < 4),]
I am stuck with a seemingly simple question. I have a dataframe with monthly values from several weather stations.
here is a sample of the raw data, called "cn":
months station temp_davg_c temp_dmax_c temp_dmin_c rain_mm snow_cm precip_mm date
1 Jan courtney 3.0 5.6 0.3 216.0 15.9 231.8 2010-01-01
2 Feb courtney 3.6 7.1 0.0 134.8 9.3 144.1 2010-02-01
3 Mar courtney 5.7 10.0 1.3 127.0 11.3 138.3 2010-03-01
4 Apr courtney 9.1 14.3 3.9 90.7 0.1 90.7 2010-04-01
5 May courtney 12.5 18.1 6.8 53.0 0.0 53.0 2010-05-01
6 Jun courtney 15.5 21.0 9.9 53.0 0.0 53.0 2010-06-01
I can plot all stations, just based on the raw data by doing this. However, what's the point of 14 different lines...
ggplot(data = cn,
aes(x = factor(months), y = temp_davg_c, colour = station))
geom_line(aes(group = station)) +
xlab("Months")+ ylab("Temperature [°C]")+
scale_x_discrete(limits=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
So I'd like to obtain monthly averages/mmin/max across the stations. This is where I ran into the first problem:
When I use aggregate to obtain the monthly averages, how do I then save the data in a new df that I can then us for plotting, without having df within dfs?
To get monthly averages/min/max I did this:
stats <- aggregate(cn[,3], list(cn$months), FUN=mean,na.rm=TRUE)
names(stats)[1] <- "months" # rename
names(stats)[2] <- "avg" #rename
stats$max <- (aggregate(cn[,4], list(cn$months), FUN=max,na.rm=TRUE)[2])
stats$min <- (aggregate(cn[,5], list(cn$months), FUN=min,na.rm=TRUE)[2])
Assuming that wasn't a problem, how could I reorder the df so that I have the months in order? I know that I can change the order of a factor by doing this:
factor(stats$months, levels=month.name)
But then how do I do it in the df that I created with all the stats in it?
Since I couldn't figure this out I used "scale_x_discrete" in the ggplot function later, but I was wondering how I would do this.
Now to the final question, how would I plot the monthly overall avg/min/max of all the stations so that for temperature I only end up with three lines?
Assuming that the df within df wasn't an issue, I tried this, assuming I would get my df to look like this:
months avg max min
1 Apr 8.561538 14.3 2.6
2 Aug 17.453846 26.1 10.9
3 Dec 3.075000 6.4 -0.8
4 Feb 3.892308 7.8 -0.7
5 Jan 3.269231 6.8 -0.8
6 Jul 17.446154 25.6 10.8
7 Jun 15.069231 21.9 9.0
8 Mar 5.876923 10.6 0.7
9 May 12.076923 18.6 6.0
10 Nov 5.215385 9.0 0.6
11 Oct 9.230769 14.4 3.8
12 Sep 14.100000 22.4 7.4
ggplot(stats,aes(months))+
geom_line(aes(y=avg)) +
geom_line(aes(y=min)) +
geom_line(aes(y=max)) +
xlab("Months")+ ylab("Temperature [°C]")+
scale_x_discrete(limits=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
What am I missing here? Any help is appreciated..
cheers
Sandra
PS: here is a dput of my cn
structure(list(months = structure(c(5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L,
10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L,
4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L,
9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L,
10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L,
4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L,
9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L,
10L, 3L, 5L, 4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L, 5L,
4L, 8L, 1L, 9L, 7L, 6L, 2L, 12L, 11L, 10L, 3L), .Label = c("Apr",
"Aug", "Dec", "Feb", "Jan", "Jul", "Jun", "Mar", "May", "Nov",
"Oct", "Sep"), class = "factor"), station = structure(c(7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L), .Label = c("albernirob", "blackcreeek",
"campbellrivairp", "campbellrivsurf", "capemudge", "comoxairp",
"courtney", "mudbay", "oysterriver", "powriv", "powrivairp",
"qualicumhatch", "qualicumriverres", "stillwater"), class = "factor"),
temp_davg_c = c(3, 3.6, 5.7, 9.1, 12.5, 15.5, 17.9, 17.6,
14.2, 9, 5.1, 3.1, 2.8, 3.4, 5.4, 8.5, 11.7, 14.8, 17.1,
16.9, 13.6, 8.6, 5, 2.8, 2.4, 3.2, 5.2, 8, 11.6, 14.7, 17.3,
17.2, 13.7, 8.6, 4.4, 2.1, 2.6, 3.8, 5.9, 7.4, 11.5, 14.3,
16.2, 17.2, 12.7, 8.1, 4.1, NA, 4.1, 4.6, 6.3, 8.8, 12.1,
14.9, 17.2, 17.1, 14.2, 9.6, 5.8, 3.8, 3.9, 4.3, 6.1, 8.8,
12.4, 15.5, 18, 17.9, 14.5, 9.5, 5.7, 3.5, 3.5, 4, 5.9, 8.6,
12.1, 15.1, 17.5, 17.4, 14.1, 9.3, 5.3, 3.1, 3.3, 3.8, 5.6,
8.3, 12, 15.1, 17.3, 17.2, 13.6, 8.9, 5.2, 3.2, 3.9, 4.2,
5.9, 8.6, 12, 14.9, 17.1, 16.7, 13.6, 9.2, 5.6, 3.5, 2.8,
3.7, 5.8, 8.5, 11.9, 14.9, 17.3, 17.4, 14.1, 9.2, 4.9, 2.6,
2, 3, 5.7, 8.5, 12.3, 15.5, 18.3, 18.5, 15.3, 9.8, 4.6, 1.8,
4.6, 5.1, 7, 9.6, 13, 15.8, 18.4, 18.6, 15.6, 10.8, 6.8,
4.3, 3.6, 3.9, 5.9, 8.6, 11.9, 14.9, 17.2, 17.2, 14.1, 9.4,
5.3, 3.1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
temp_dmax_c = c(5.6, 7.1, 10, 14.3, 18.1, 21, 23.8, 23.7,
20.1, 13, 8, 5.4, 5.8, 7.4, 10, 13.9, 17.3, 20.2, 22.9, 22.8,
19.6, 13, 8.4, 5.4, 5.5, 7.2, 9.7, 13.2, 17, 20.1, 23, 23.3,
19.8, 13.1, 7.7, 4.9, 5.6, 7.5, 10.6, 12.2, 16.7, 19.5, 21.6,
23.2, 18, 12.3, 7.5, NA, 6.6, 7.6, 9.8, 12.9, 16.5, 19.5,
22.1, 22, 18.6, 12.8, 8.5, 6.2, 6.4, 7.4, 9.6, 12.9, 16.6,
19.8, 22.8, 22.7, 19, 12.9, 8.5, 5.9, 6.2, 7.5, 10.1, 13.5,
17.2, 20.3, 23.1, 23.1, 19.5, 13.4, 8.3, 5.6, 6.2, 7.4, 9.8,
13.2, 17.1, 20.2, 22.6, 22.5, 18.9, 12.8, 8.3, 5.8, 6.5,
7.5, 9.9, 12.9, 16.7, 19.6, 22.3, 22.1, 18.7, 13, 8.5, 5.9,
5.5, 7.4, 10.1, 13.5, 17.2, 20.3, 23.1, 23.5, 20, 13.3, 7.8,
5, 4.3, 6.6, 10.5, 14.2, 18.6, 21.9, 25.6, 26.1, 22.4, 14.4,
7.3, 3.8, 6.8, 7.8, 10.4, 13.5, 17.1, 19.8, 22.7, 22.9, 19.5,
13.6, 9, 6.4, 5.8, 6.9, 9.4, 12.8, 16.5, 19.4, 22.1, 22.3,
18.7, 12.6, 7.7, 5.3, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), temp_dmin_c = c(0.3, 0, 1.3, 3.9, 6.8, 9.9,
11.9, 11.5, 8.2, 5, 2.1, 0.7, -0.3, -0.6, 0.9, 3.1, 6.1,
9.3, 11.3, 10.9, 7.5, 4.2, 1.6, 0.2, -0.8, -0.7, 0.7, 2.8,
6.2, 9.3, 11.5, 11.1, 7.6, 4, 1, -0.8, -0.5, 0, 1.3, 2.6,
6.2, 9, 10.8, 11.1, 7.4, 3.8, 0.6, NA, 1.6, 1.5, 2.8, 4.7,
7.7, 10.3, 12.2, 12.2, 9.7, 6.4, 3.1, 1.4, 1.4, 1.2, 2.5,
4.6, 8, 11.1, 13.3, 13, 9.9, 6, 2.9, 0.9, 0.7, 0.5, 1.7,
3.7, 6.9, 9.8, 11.8, 11.7, 8.6, 5.3, 2.3, 0.5, 0.3, 0.1,
1.5, 3.4, 6.9, 9.8, 11.7, 11.7, 8.2, 5, 2, 0.5, 1.2, 0.8,
2, 4.1, 7.3, 10.1, 11.8, 11.3, 8.4, 5.3, 2.7, 0.9, 0.1, 0.1,
1.4, 3.5, 6.6, 9.4, 11.5, 11.2, 8.2, 5, 1.9, 0.2, -0.3, -0.6,
0.7, 2.7, 6, 9, 10.9, 10.9, 8, 5, 1.8, -0.3, 2.3, 2.4, 3.6,
5.6, 8.8, 11.8, 14, 14.3, 11.6, 8, 4.6, 2.2, 1.2, 0.9, 2.3,
4.3, 7.3, 10.4, 12.3, 12.1, 9.4, 6.1, 2.8, 0.9, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), rain_mm = c(216, 134.8,
127, 90.7, 53, 53, 29.9, 35.4, 45.7, 146.9, 232.3, 236.4,
216, 166.8, 149.3, 105, 72.8, 63.2, 42.3, 43.1, 54, 171.8,
256.2, 247.3, 194.6, 135.5, 128.4, 91.6, 68.4, 62.9, 39.4,
44.6, 55.2, 161, 222.1, 204.2, 186, 140.2, 120.3, 87.2, 58.2,
51.3, 35.1, 39, 52.9, 154.8, 228.4, 218.5, 215.2, 135.1,
130.8, 93.6, 70.2, 61.1, 39.5, 45.6, 58.7, 168.6, 241, 220.8,
159.1, 107.8, 95.7, 64.4, 45.6, 42.8, 26.7, 29.2, 41.8, 122.7,
191.9, 168.9, 256.9, 174.1, 151.6, 98, 56.6, 45.2, 26, 37.6,
53.6, 189.7, 285.2, 256.7, 182, 144.2, 139.3, 87.2, 64.6,
54.7, 36.4, 39, 48.9, 152.9, 228.4, 215.9, 200.6, 131.1,
116.3, 79.4, 51.3, 45.3, 26, 34.6, 46.3, 146.8, 214, 180.7,
219.3, 150.4, 141, 101.1, 72.1, 62.8, 41.9, 49.5, 59.3, 180.8,
249.9, 234.2, 317, 222.7, 215.6, 143.6, 87.8, 62.2, 31, 46.4,
61.4, 218.3, 345.2, 323.2, 132, 88.4, 92.4, 70.8, 70.9, 57.4,
36.5, 42.3, 51.4, 117.5, 154.9, 134.5, 145.7, 101.9, 104.2,
83.2, 76.6, 67.6, 37.5, 45.3, 54.7, 125.5, 171.6, 146.5,
185.2, 125.5, 127.8, 99.6, 92.4, 73.7, 46, 50.7, 64.6, 152.1,
212.6, 178.5), snow_cm = c(15.9, 9.3, 11.3, 0.1, 0, 0, 0,
0, 0, 0.2, 6, 12.1, 17.3, 10, 6.7, 0.2, 0, 0, 0, 0, 0, 1.1,
6.4, 16, 23.3, 14.4, 11.7, 0.5, 0, 0, 0, 0, 0, 1.2, 10.5,
22.6, 13.2, 8.4, 7.6, 0, 0, 0, 0, 0, 0, 0.8, 7.3, 14.3, 13.8,
6.4, 6.3, 0.2, 0, 0, 0, 0, 0, 0.6, 6, 14.7, 11.9, 6, 9.9,
0.2, 0, 0, 0, 0, 0, 0.1, 8.2, 18.7, 12.9, 13.3, 8.2, 0, 0,
0, 0, 0, 0, 1.1, 4.8, 15.2, 14.9, 7.8, 4.6, 0, 0, 0, 0, 0,
0, 0.9, 4.1, 8.6, 10.4, 8.8, 4.3, 0, 0, 0, 0, 0, 0, 0.4,
4.2, 9.2, 14.8, 10.1, 7.1, 0.1, 0, 0, 0, 0, 0, 0.5, 7.2,
16.5, 22.6, 16.9, 8.2, 0.6, 0, 0, 0, 0, 0, 1.6, 8, 21.4,
6.1, 4.6, 3.8, 0, 0, 0, 0, 0, 0, 0.2, 3.4, 4.2, 13.6, 7.8,
6.8, 0.1, 0, 0, 0, 0, 0, 0.3, 6.5, 11.5, 8.1, 4.8, 2.7, 0,
0, 0, 0, 0, 0, 0.2, 4.4, 9), precip_mm = c(231.8, 144.1,
138.3, 90.7, 53, 53, 29.9, 35.4, 45.7, 147.1, 238.3, 248.5,
233.3, 176.8, 155.9, 105.2, 72.8, 63.2, 42.3, 43.1, 54, 172.9,
262.6, 263.3, 217.5, 149.5, 140, 92.1, 68.4, 62.9, 39.4,
44.6, 55.2, 162.2, 231.9, 225.7, 198.9, 148.6, 127.9, 87.2,
58.2, 51.3, 35.1, 39, 52.9, 155.6, 235.7, 232.8, 229.1, 141.4,
137.1, 93.8, 70.2, 61.1, 39.5, 45.6, 58.7, 169.2, 246.9,
235.5, 171.9, 114.3, 105.7, 64.6, 45.6, 42.8, 26.7, 29.2,
41.8, 122.8, 200.5, 187.9, 269.9, 187.4, 159.8, 98, 56.6,
45.2, 26, 37.6, 53.6, 190.8, 290, 272, 196.9, 151.9, 143.9,
87.2, 64.6, 54.7, 36.4, 39, 48.9, 153.8, 232.6, 224.5, 211,
139.9, 120.6, 79.4, 51.3, 45.3, 26, 34.6, 46.3, 147.2, 218.1,
189.8, 234.1, 160.4, 148, 101.2, 72.1, 62.8, 41.9, 49.5,
59.3, 181.3, 257.1, 250.7, 339.5, 239.6, 223.8, 144.2, 87.8,
62.2, 31, 46.4, 61.4, 219.8, 353.2, 344.6, 138.1, 93.1, 96.1,
70.8, 70.9, 57.4, 36.5, 42.3, 51.4, 117.7, 158.3, 138.7,
158.9, 109.4, 110.7, 83.3, 76.6, 67.6, 37.5, 45.3, 54.7,
125.8, 178, 157.8, 193.3, 130.3, 130.6, 99.6, 92.4, 73.7,
46, 50.7, 64.6, 152.3, 216.9, 187.5), date = structure(c(14610,
14641, 14669, 14700, 14730, 14761, 14791, 14822, 14853, 14883,
14914, 14944, 14610, 14641, 14669, 14700, 14730, 14761, 14791,
14822, 14853, 14883, 14914, 14944, 14610, 14641, 14669, 14700,
14730, 14761, 14791, 14822, 14853, 14883, 14914, 14944, 14610,
14641, 14669, 14700, 14730, 14761, 14791, 14822, 14853, 14883,
14914, 14944, 14610, 14641, 14669, 14700, 14730, 14761, 14791,
14822, 14853, 14883, 14914, 14944, 14610, 14641, 14669, 14700,
14730, 14761, 14791, 14822, 14853, 14883, 14914, 14944, 14610,
14641, 14669, 14700, 14730, 14761, 14791, 14822, 14853, 14883,
14914, 14944, 14610, 14641, 14669, 14700, 14730, 14761, 14791,
14822, 14853, 14883, 14914, 14944, 14610, 14641, 14669, 14700,
14730, 14761, 14791, 14822, 14853, 14883, 14914, 14944, 14610,
14641, 14669, 14700, 14730, 14761, 14791, 14822, 14853, 14883,
14914, 14944, 14610, 14641, 14669, 14700, 14730, 14761, 14791,
14822, 14853, 14883, 14914, 14944, 14610, 14641, 14669, 14700,
14730, 14761, 14791, 14822, 14853, 14883, 14914, 14944, 14610,
14641, 14669, 14700, 14730, 14761, 14791, 14822, 14853, 14883,
14914, 14944, 14610, 14641, 14669, 14700, 14730, 14761, 14791,
14822, 14853, 14883, 14914, 14944), class = "Date")), .Names = c("months",
"station", "temp_davg_c", "temp_dmax_c", "temp_dmin_c", "rain_mm",
"snow_cm", "precip_mm", "date"), row.names = c(NA, -168L), class = "data.frame")
You can do this in one shot, if you pipe your data directly into ggplot() using dplyr and tidyr :
library(dplyr)
library(tidyr)
library(ggplot2)
correct_order <- c("Jan","Feb","Mar","Apr","May","Jun",
"Jul","Aug","Sep","Oct","Nov","Dec")
cn %>% group_by(months) %>%
summarise(min = min(temp_dmin_c, na.rm = TRUE),
max = max(temp_dmax_c, na.rm = TRUE),
avg = mean(temp_davg_c,na.rm = TRUE)) %>%
gather(metric, value, -months) %>%
ggplot(.,aes(x = months, y = value,
group = metric, color = metric)) +
scale_x_discrete(limits=correct_order) +
geom_line()
Below is a small subset of my data frame. The actual dataframe has an explicit name for each variable; not just "DepVar1, and DepVar2 (2 response variables)" or "IndVar (1-9)" (9 explanatory variables - 1 categorical and 8 continuous variables).
I'd like to adapt the loop written by Bergan by changing the function glm() to lmer() found in the lme4 package to produce a series of generalized linear mixed models (GLMM) containing ALL possible combinations of explanatory variables (Indvar 1-9) with random effects specified using a (1|IndVarType) syntax to explain variance in the response variable (DepVar1 and DepVar2).
Example of glmm models:
DepVar1 ~ Indvar (1-9) + (1|IndVarType)
DepVar2 ~ Indvar (1-9) + (1|IndVarType)
After running the loop to produce all glmm models, my aim to sort the best glmm models by the lowest AICc values using the function aictab() in the AICcmodavg package to display associated statistics: (1) Delta_AICc; (2) AICcWt; and (3) Cum.Wt.
I have been attempting to adapt Bergans code to incorporate random effects (1|IndVarType) but so far I have been unsuccessful. Any suggestions how to do that? I have done some searches and can only find examples for loops containing the glm() function. Many thanks in advance if anyone has a solution.
Code
library(lme4)
ind_vars <- c("Indvar1",
"Indvar2",
"Indvar3",
"Indvar4",
"Indvar4",
"Indvar5",
"Indvar6",
"Indvar7",
"Indvar8",
"Indvar9",
"IndvarType")
dep_vars <- c("Depvar1", "DepVar2")
# create all combinations of ind_vars
ind_vars_comb <-
unlist(sapply( seq_len(length(ind_vars)),
function(i) {
apply( combn(ind_vars,i), 2, function(x) paste(x, collapse = "+"))
}))
# pair with dep_vars:
var_comb <- expand.grid(dep_vars, ind_vars_comb )
# formulas for all combinations
formula_vec <- sprintf("%s ~ %s", var_comb$Var1, var_comb$Var2)
# create models
# create models
glm_mixed <- lapply(formula_vec, function(f) {
fit1 <- lmer(f, (1|IndvarType), data = bats)
fit1$coefficients <- coef(summary(fit1))
return(fit1)
})
names(glm_mixed) <- formula_vec
##Error: No random effects terms specified in formula
# Model selection
# Installed AICcmodavg package for AICc values into R
AICc information
# R code from Mazerolle (2014)
library(AICcmodavg)
mydata.aov <- glm_mixed # list of models
mydata.model.names <- formula_vec # list of model names
# generates AICc values # sort models into order of AIC value
aictab(mydata.aov, mydata.model.names, second.ord = TRUE, sort = TRUE)
Data Structure
structure(list(Indvar1 = c(0, 5, 10, 19, 30, 33, 39, 44, 54,
63, 68, 72, 81, 87, 93, 100, 105, 110, 119, 127, 134, 141, 149,
155, 115, 120, 125, 0, 5, 9, 17, 22, 29, 35, 39, 44, 45, 50,
55, 63), IndvarType = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L), .Label = c("CONTROL", "LED", "Metal Halide", "SOX"), class = "factor"),
`IndvarCat ` = c(26.9, 25.16, 39, 29.81, 21.83, 20.22, 2.9,
2.1, 0.85, 0.62, 0.39, 0.26, 24.7, 21.99, 20.46, 26.32, 0,
0, 0.43, 0.02, 0.02, 0.03, 0.02, 0.03, 2.62, 0.43, 0.44,
25.16, 39, 29.81, 21.83, 20.22, 20.88, 0.63, 0.56, 0.56,
86.63, 87.97, 88.59, 0.31), Indvar2 = c(10.34, 12.56, 15.76,
10.35, 11.15, 14.6, 15.05, 12.54, 15.29, 19.5, 17.12, 17.62,
13.92, 12.7, 12.55, 17.86, 18.86, 18.23, 19.65, 19.59, 18.11,
19.04, 16.92, 18.39, 18.97, 18.96, 17.72, 7.65, 8.61, 8.98,
8.68, 12.25, 11.71, 16.19, 15.73, 16.02, 13.62, 14.89, 14.98,
17.14), Indvar3 = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 11, 11, 11, 11, 11, 11,
11, 11, 11, 13, 13, 13, 13, 13, 8, 8), Indvar4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Full Moon",
"Waning Gibbous", "Waxing Crescent", "Waxing Gibbous"), class = "factor"),
Indvar5 = c(32.2, 32.2, 32.2, 32.2, 32.2, 32.2, 32.2, 32.2,
32.2, 32.2, 32.2, 32.2, 32.2, 32.2, 32.2, 32.2, 32.2, 32.2,
32.2, 32.9, 32.9, 32.9, 32.9, 32.9, 41.4, 41.4, 41.4, 41.4,
41.4, 41.4, 41.4, 41.4, 41.4, 41.1, 41.1, 41.1, 41.1, 41.1,
42.2, 42.2), Indvar6 = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3), Indvar7 = c(18, 18, 18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 14, 14, 14, 14, 14, 14,
14, 14, 13, 13, 13, 14.3, 14.3, 14.3, 14.3, 14.3, 14.3, 14.3,
14.3, 14.3, 15.5, 15.5, 15.5, 15.5, 15.5, 14.6, 14.6), Indvar8 = c(51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 69, 69, 69,
69, 69, 69, 77, 77, 77, 77, 77, 62, 62, 62, 62, 62, 62, 62,
62, 62, 57, 57, 57, 57, 57, 61, 61), Indvar9 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Depvar1 = c(3,
2, 5, 6, 15, 2, 10, 12, 17, 2, 0, 0, 15, 7, 17, 0, 1, 0,
14, 10, 12, 7, 4, 1, 5, 4, 2, 9, 7, 7, 9, 5, 4, 3, 0, 0,
12, 11, 9, 1), DepVar2 = c(0.444444444, 0, 0, 0.027777778,
0, 0, 0.25, 0, 0.08650519, 0, 0, 0, 0.111111111, 0, 0.124567474,
0, 0, 0, 0.25, 0.01, 0.111111111, 0.081632653, 0, 0, 0.04,
0.25, 0.25, 0.790123457, 0.510204082, 2.040816327, 1.777777778,
0, 2.25, 0.111111111, 0, 0, 0.027777778, 0.074380165, 0.012345679,
0)), .Names = c("Indvar1", "IndvarType", "IndvarCat ", "Indvar2",
"Indvar3", "Indvar4", "Indvar5", "Indvar6", "Indvar7", "Indvar8",
"Indvar9", "Depvar1", "DepVar2"), row.names = c(NA, 40L), class = "data.frame")