I am new to R and I got a hold of this program I am trying to run. But I am getting error in the variable "outcome01". It could be that if I (somehow) fix this variable, there would be other similar errors. Any help is appreciated; Here is the code:
library(norm)
# Make appropriate changes in file and variable names.
setwd ("c:\\Users\\Dave Desktop\\Dropbox\\Webs\\StatPages\\More_Stuff\\Missing_Data")
x <- read.table("survrateMissingNA.dat", header = TRUE)
y <- as.matrix(x) #convert table to matrix
cat("Logistic regression using 132 cases and missing data \n\n")
print(summary(glm(formula = outcome01~survrate + gsi + avoid + intrus, binomial, data = x))) #Use original data with missing values
attach(x)
##########
## Important: the following code runs the imputation m times. The imputed
## data sets are concatenated into CombFile and analyzed afterwards.
# Data Augmentation using norm.R
m <- 5 # Number of imputations
k <- 9 # Number of variables in raw data file
l <- 5 # Number of variables actually used in regression
stopifnot(ncol(y) == k) # The raw data must have exactly k columns
s <- prelim.norm(y) # Preliminary statistics; the same for every imputation
thetahat <- em.norm(s) # MLE from EM, used as the start value of every chain
# Seed the generator once, before the loop. Re-seeding inside the loop restarts
# the same random stream on every pass, so all m imputations come out identical
# and the between-imputation variance collapses to zero.
rngseed(25672)
imputations <- vector("list", m)
for (i in seq_len(m)) {
  # Data augmentation: 200 steps starting from the EM estimate
  theta <- da.norm(s, thetahat, steps = 200, showits = TRUE)
  # Values are not auto-printed inside a for loop, so print explicitly
  print(getparam.norm(s, theta))
  impdata <- imp.norm(s, theta, y) # Impute the data
  imputations[[i]] <- impdata
  # Note: the same file name is reused, so only the last imputation is kept
  write(t(impdata), file = "impsurvrate", ncolumns = k, sep = " ")
  z <- data.frame(impdata)
  z$outcome01 <- round(z$outcome01, digits = 0) # Imputed outcome back to 0/1
  # Use imputed data.
  print(summary(glm(
    formula = outcome01 ~ survrate + gsi + avoid + intrus,
    family = binomial, data = z
  )))
}
# Stack the m imputed data sets row-wise
CombFile <- do.call(rbind, imputations)
## Creating the final data file with imputed data 660 rows
nPerImp <- nrow(CombFile) / m # Rows contributed by each imputation
imps <- rep(seq_len(m), each = nPerImp)
# Add a variable representing the imputation number.
data <- as.data.frame(cbind(imps, CombFile))
# Imputed values of the binary outcome are continuous; round them back to 0/1
data$outcome01 <- round(data$outcome01, digits = 0)
# head(data)
# attach(data) was dropped: the models below pass `data = data` explicitly,
# and attaching would only mask the columns of the already attached `x`.
## Set up variables to hold results
# One row per imputation: l coefficients followed by their l standard errors
b <- matrix(NA, nrow = m, ncol = 2 * l)
meanb <- numeric(l)
meanvar <- numeric(l)
varb <- numeric(l)
TT <- numeric(l)
sqrtt <- numeric(l)
t <- numeric(l)
## Run a logistic regression on each of the m imputed data sets and store the
## coefficients and their standard errors.
n <- 2 * l # Length of one row of b: l estimates followed by l standard errors
for (i in seq_len(m)) { # Modify following line appropriately
  model <- glm(outcome01 ~ survrate + gsi + avoid + intrus,
    family = binomial, data = data, subset = (imps == i)
  )
  a <- summary(model)
  # print(a)
  coefs <- a$coefficients
  # Pick the columns by name instead of linear indexing, so a dropped
  # coefficient cannot silently shift test statistics into the stored values.
  row_vals <- c(coefs[, "Estimate"], coefs[, "Std. Error"])
  stopifnot(length(row_vals) == n)
  b[i, ] <- row_vals
}
## Calculate the coefficients, st. errors, and t values across the m imputations
# Columns 1..l of b hold the estimates, columns (l + 1)..(2 * l) their
# standard errors.
est <- b[, seq_len(l), drop = FALSE]
se <- b[, l + seq_len(l), drop = FALSE]
meanb <- colMeans(est) # Pooled coefficient: mean over imputations
meanvar <- colMeans(se^2) # Within-imputation variance
varb <- apply(est, 2, var) # Between-imputation variance
cat("\n\n\nThe mean regression coefficients are: \n\n")
print(meanb)
# Total variance: within + (1 + 1/m) * between. The factor uses m rather than
# a hard-coded 5 so the result stays correct if m is changed.
TT <- meanvar + (1 + 1 / m) * varb
sqrtt <- sqrt(TT)
t <- meanb / sqrtt
cat("The standard errors are: \n\n")
print(sqrtt)
cat("\n The t values are: \n\n")
print(t)
Here is the data ( in survrateMissingNA.dat file):
c(1, 4.405, 17.2, 31.144, 491, 1029, 61, 20.2, 999, 2, 8.963,
17.6, 47.951, 445, 934, 32, 21, 3.85, 3, 4.778, 19.3, 32.175,
448, 944, 27, 21.1, 3.296, 4, 999, 17.1, 28.934, 482, 1005, 66,
20.3, 1.792, 5, 4.992, 24, 41.078, 417, 902, 11, 21, 3.807, 6,
5.443, 18.4, 34.571, 462, 980, 62, 21.5, 3.367, 7, 8.817, 14.4,
999, 431, 908, 3, 21.7, 4.394, 8, 7.03, 16.6, 39.076, 429, 897,
3, 21, 4.22, 9, 5.718, 19.1, 32.588, 420, 889, 36, 20.7, 3.871,
10, 5.193, 16.3, 32.291, 406, 854, 16, 20.2, 4.174, 11, 6.078,
17.9, 38.518, 407, 889, 17, 21.6, 999, 12, 4.21, 19.1, 29.783,
468, 979, 62, 21.4, 2.708, 13, 6.136, 17.3, 39.431, 488, 1048,
69, 21.2, 2.565, 14, 5.826, 17.5, 36.785, 415, 882, 19, 21.2,
4.06, 15, 5.483, 15.8, 31.511, 516, 1099, 64, 22.1, 1.609, 16,
5.817, 15.1, 34.652, 503, 1060, 74, 21.7, 2.197, 17, 999, 17,
32.257, 477, 999, 65, 20.1, 2.398, 18, 4.761, 999, 26.461, 486,
1021, 80, 19.4, 2.197, 19, 6.428, 13.8, 31.972, 427, 896, 2,
21.5, 4.22, 20, 7.245, 17, 40.661, 430, 909, 11, 20.7, 4.159,
21, 7.287, 14.8, 40.795, 430, 907, 6, 21.6, 4.382, 22, 6.994,
20.1, 41.895, 484, 1033, 68, 21.3, 999, 23, 6, 17.5, 35.948,
506, 1085, 60, 22.1, 2.197, 24, 4.08, 17.5, 26.818, 496, 1036,
79, 18.7, 1.386, 25, 5.383, 15.5, 31.189, 495, 1045, 64, 21.5,
2.197, 26, 5.692, 16.3, 28.785, 473, 1009, 55, 21.9, 3.045, 27,
5.935, 14.5, 30.922, 494, 1050, 73, 21.7, 2.197, 28, 999, 18.7,
34.836, 434, 917, 39, 21.3, 3.401, 29, 5.859, 15.6, 999, 444,
935, 4, 22.3, 999, 30, 9.774, 13.8, 46.087, 420, 898, 3, 20.8,
4.248, 31, 4.586, 17.2, 28.493, 485, 1015, 59, 20.3, 2.398, 32,
9.623, 15.2, 47.612, 419, 892, 16, 21.9, 4.304, 33, 5.077, 16.2,
30.793, 411, 865, 11, 19.3, 4.094, 34, 4.775, 15.3, 26.327, 515,
1107, 78, 21.4, 1.609, 35, 6.162, 16.6, 36.802, 460, 975, 60,
21.3, 3.135, 36, 4.845, 15.5, 28.172, 491, 1027, 66, 20.6, 2.197,
37, 6.436, 19.9, 38.555, 448, 947, 12, 22.3, 3.932, 38, 7.109,
17.1, 999, 419, 880, 8, 21, 4.248, 39, 999, 14.7, 40.729, 425,
888, 2, 21.4, 4.248, 40, 4.797, 16.4, 30.279, 401, 844, 13, 18.9,
4.06, 41, 4.775, 14.4, 25.994, 505, 1068, 68, 21.3, 1.609, 42,
4.388, 18.6, 32.477, 497, 1040, 83, 19.7, 2.485, 43, 5.222, 15.7,
31.223, 419, 893, 30, 20.2, 3.85, 44, 3.656, 24.3, 29.082, 513,
1076, 69, 21.5, 1.386, 45, 6.75, 13.8, 35.406, 429, 901, 7, 21.9,
4.22, 46, 5.327, 14.6, 33.987, 428, 896, 6, 20.7, 999, 47, 5.906,
20.2, 36.151, 443, 937, 16, 22.4, 3.871, 48, 6.107, 14.8, 31.944,
448, 932, 57, 20, 2.833, 49, 6.93, 15.9, 37.746, 501, 1073, 64,
22.3, 2.197, 50, 6.16, 14.9, 31.285, 476, 1001, 70, 21.4, 2.303)
Can you please be a bit more specific with your question, in particular, where in the code does the error occur?
With the survrateMissingNA.dat, are there any headers on it, or does it appear exactly as you posted it?
If there are no headers, this line of code will give you an error straight away: print(summary(glm(formula = outcome01~survrate + gsi + avoid + intrus, binomial, data = x)))
That line is reading data from the data.frame x, which contains the survrateMissingNA.dat data. If that data file has no headers, then outcome01 does not exist (nor do survrate, gsi, avoid, or intrus).
To fix this (initial error), make sure that the file contains headings.
Related
I am trying to calculate the value of x where y = 0. I could able to do it for single x using the following code
lm.model <- lm(y ~ x)
cc <- coef(lm.model)
f <- function(x) cc[2]*x + cc[1]
plot(x, y)
abline(coef(lm.model))
abline(h=0, col="blue")
(threshold <- uniroot(f, interval = c(0, 100))$root)
abline(v=threshold, col="blue")
x = c(33.05, 14.22, 15.35, 13.52, 8.7, 13.73, 8.28, 21.02, 9.97,
11.98, 12.87, 5.05, 11.23, 11.65, 10.05, 12.58, 13.88, 9.66,
4.62, 4.56, 5.35, 3.7, 3.29, 4.87, 3.75, 6.55, 4.51, 7.77, 4.7,
4.18, 25.14, 18.08, 10.41)
y = c(16.22699279, 15.78620732, 9.656361014, -17.32805679, -20.85685895,
7.601993251, -4.776053714, 10.50972236, 3.853479771, 7.713563136,
8.579366561, 14.16989395, 7.484692081, -1.2807472, -12.13759458,
-0.29138513, -5.238157067, -2.033194068, -38.12157566, -33.61912493,
-9.763657548, -0.240863712, 9.090638907, 7.345492608, 6.949676888,
-19.94866471, 0.995659732, -1.162616185, 5.497998429, 1.656653092,
2.116687436, 22.23175649, 10.33039543)
But I have multiple x variables. Now how can I apply it for multiple x variables at a time?
Here is an example data
df = structure(list(y = c(16.2269927925813, 15.7862073196372, 9.65636101412767,
-17.3280567922775, -20.8568589521297, 7.6019932507973, -4.77605371404423,
10.5097223644541, 3.85347977129367, 7.71356313645697, 8.57936656085966,
14.1698939499927, 7.4846920807874, -1.28074719969249, -12.1375945758837,
-0.291385130176774, -5.23815706681139, -2.03319406769161, -38.1215756639013,
-33.6191249261727, -9.76365754821171, -0.240863712421707, 9.09063890677045,
7.34549260800693, 6.94967688778232, -19.9486647079697, 0.995659731521127,
-1.16261618452931, 5.49799842947493, 1.65665309209479, 2.11668743610013,
22.2317564898722, 10.3303954315884), x1 = c(8.56, 8.66, 9.09,
8.36, 8.3, 8.63, 8.78, 8.44, 8.34, 8.46, 8.33, 8.19, 8.58, 8.65,
8.75, 8.34, 8.77, 9.06, 9.31, 9.11, 9.26, 9.81, 9.68, 9.79, 9.26,
9.53, 8.89, 8.89, 10.37, 9.58, 10.27, 10.16, 10.27), x2 = c(164,
328.3, 0, 590.2, 406.6, 188.4, 423.8, 355.3, 337.6, 0, 0, 200.1,
0, 315.8, 547.5, 225.6, 655.7, 387.2, 0, 487.4, 400.4, 0, 234.9,
275.5, 0, 0, 613.2, 207.4, 184.4, 162.8, 220, 174.8, 0), x3 = c(4517.7,
2953.4, 2899.3, 2573.8, 3310.7, 3880.3, 3016.8, 3552.3, 2960.1,
323, 2638.5, 3343.1, 3274.7, 3218, 3268.3, 3507.9, 3709.2, 3537.5,
2634.4, 1964.6, 3333.7, 2809.7, 3326.8, 3524.5, 3893.9, 3166.7,
3992.1, 4324.7, 3077.9, 3069.9, 4218.9, 3897.4, 2693.9), x4 = c(14.7,
14.5, 15.5, 17, 16.2, 15.9, 15.7, 15.3, 13.5, 14, 15.4, 16.2,
15.6, 15.7, 15.1, 15.8, 15.3, 14.9, 15.7, 16.3, 15.21000004,
16.7, 15.6, 16.2, 15.7, 16.3, 17.3, 16.9, 15.7, 14.9, 13.81999969,
14.90754509, 12.42847157), x5 = c(28.3, 29.1, 28.3, 29.1, 28.7,
29.3, 28.9, 28.4, 29.3, 29.3, 29.1, 29, 29.9, 29.5, 28.4, 30.3,
29.1, 29.1, 29, 29.5, 29.3, 28.5, 29, 28.7, 29.4, 28.8, 29.2,
30.1, 28.3, 28.7, 24.96999931, 25.79496384, 25.3072052), x6 = c(33.05,
14.22, 15.35, 13.52, 8.7, 13.73, 8.28, 21.02, 9.97, 11.98, 12.87,
5.05, 11.23, 11.65, 10.05, 12.58, 13.88, 9.66, 4.62, 4.56, 5.35,
3.7, 3.29, 4.87, 3.75, 6.55, 4.51, 7.77, 4.7, 4.18, 25.14, 18.08,
10.41), x7 = c(13.8425, 11.1175, 8.95, 13.5375, 5.4025, 13.5625,
13.735, 14.14, 8.0875, 5.565, 12.255, 3.3075, 6.345, 4.8125,
4.0325, 11.475, 10.32, 17.71, 2.3375, 3.92, 5.7, 2.42, 8.3075,
7.4725, 7.7925, 10.8725, 8.005, 11.7475, 13.405, 8.425, 47.155,
26.1, 6.6675), x8 = c(0.95, 3.01, 1.92, 1.51, 2.61, 1.32, 3.55,
1.21, 2.14, 1.1, 1.32, 0.76, 1.34, 5.41, 9.38, 6.55, 4.44, 7.37,
9.84, 12.68, 15.52, 23.01, 18.59, 21.64, 19.69, 25.22, 22.38,
25.03, 37.42, 22.26, 2.1, 3.01, 0.82), x9 = c(26.2, 25.8, 25.8,
25.5, 26, 24.7, 22.9, 25.3, 26.3, 26.1, 22.5, 25.9, 26.4, 25.2,
25.8, 25.4, 25, 23.2, 26.4, 25.8, 26.6, 26.2, 25.8, 26.8, 25,
25.4, 25.6, 26.1, 25.7, 25.8, 24.78000069, 24.98148918, 26.39899826
), x10 = c(35.4, 39, 37.5, 36.4, 37.1, 36.2, 37.3, 36.4, 37.5,
36, 36.6, 35.6, 37.3, 38.3, 37, 37.5, 37.5, 39.6, 37.8, 36.8,
36.6, 38.4, 38.9, 38.4, 38.4, 37.7, 39.1, 37.7, 37.8, 39.4, 36.25,
35.57029343, 35.57416534), x11 = c(653.86191565, 383.1, 457.1,
591.4, 549.2, 475.2, 626.4, 308.8, 652.4, 77, 380.9, 530.5, 393,
712.1, 623.4, 515.7, 706.4, 713.4, 343.7, 559.5, 630.1, 292.3,
578.6, 628.88904574, 480.96959685, 591.35600287, 804.8, 419.6,
403.7, 361.2, 515.07101438, 434.66682808, 299.9531298), x12 = c(163.9793854,
167.9, 135, 215.8, 213, 188.4, 260.6, 191.8, 337.6, 55, 147.6,
200.1, 140.7, 315.8, 189.6, 225.6, 469.3, 201.8, 140, 297.2,
204.6, 142.5, 234.9, 275.494751, 153.7796173, 147.6174622, 433.6,
207.4, 184.4, 162.8, 219.9721832, 174.8355713, 106.8163605),
x13 = c(92, 67, 67, 50, 70, 87, 68, 86, 70, 11, 66, 79, 70,
61, 75, 78, 78, 77, 69, 35, 72, 76, 69, 84, 93, 73, 81, 99,
80, 76, 101, 86, 80), x14 = c(70, 42, 46, 34, 55, 60, 51,
65, 49, 1, 40, 56, 54, 41, 48, 57, 46, 50, 41, 22, 47, 47,
49, 57, 70, 52, 56, 70, 48, 50, 74, 66, 47), x15 = c(21,
12, 13, 10, 14, 16, 10, 13, 10, 0, 9, 14, 16, 20, 14, 14,
13, 15, 10, 7, 17, 8, 14, 14, 14, 11, 17, 19, 12, 11, 17,
17, 9), x16 = c(1076.8, 783.7, 711.8, 1041.9, 957.4, 939.3,
662.9, 768.1, 770.3, 0, 399.2, 606.2, 724.1, 960.8, 943.8,
737.8, 1477.4, 1191.7, 371.3, 956.4, 1251.7, 345.7, 1210.7,
845, 598.1, 821.7, 1310.6, 940.1, 581, 520, 313.5, 606.8,
201.2), x17 = c(163.9793854, 167.9, 128.4, 215.8, 213, 188.4,
260.6, 191.8, 337.6, 55, 147.6, 200.1, 140.7, 315.8, 189.6,
225.6, 469.3, 201.8, 140, 297.2, 204.6, 142.5, 234.9, 157.7472534,
153.7796173, 147.6174622, 133.1873627, 150.2, 184.4, 162.8,
219.9721832, 174.8355713, 106.8163605)), row.names = c(NA,
33L), class = "data.frame")
You can use purrr::map to loop through every x.
library(dplyr)
library(purrr)
# For one predictor column: fit y ~ x, draw the scatter plot with the fitted
# line and the y = 0 reference, and return the x value at which the fitted
# line crosses zero (NA when uniroot() fails on the interval [0, 100]).
find_threshold <- function(x) {
  fit <- lm(df$y ~ x)
  beta <- coef(fit)
  fitted_line <- function(v) beta[2] * v + beta[1]
  plot(x, df$y)
  abline(beta)
  abline(h = 0, col = "blue")
  root <- tryCatch(
    uniroot(fitted_line, interval = c(0, 100))$root,
    error = function(cond) {
      NA
    }
  )
  abline(v = root, col = "blue")
  root
}
# Apply the helper to every column except y
predictors <- select(df, -y)
thresholds <- map_dbl(predictors, find_threshold)
For some x's, uniroot(f, interval = c(0, 100))$root yields an error: Error
in uniroot(f, interval = c(0, 100)) : f() values at end points not of opposite sign
So the tryCatch is used to return NA for the threshold associated with that x, instead of breaking the code.
Result:
> thresholds
x1 x2 x3 x4 x5 x6 x7 x8 x9
9.023314 NA NA 15.459841 28.727293 10.514728 10.493577 9.669244 25.522480
x10 x11 x12 x13 x14 x15 x16 x17
37.370852 NA NA 73.398380 50.239522 13.022176 NA NA
Edit: binding the graphs together
# ggplot(), aes(), the geom_*() layers and labs() come from ggplot2, which the
# earlier snippet did not load.
library(ggplot2)
# Build one plot per predictor: points, fitted line, y = 0 reference, and a
# vertical line at the zero crossing when one was found on [0, 100].
graphs <- df %>%
  select(-y) %>%
  imap(function(x, name) {
    lm.model <- lm(df$y ~ x)
    cc <- coef(lm.model)
    f <- function(x) cc[2] * x + cc[1]
    threshold <- tryCatch(
      uniroot(f, interval = c(0, 100))$root,
      error = function(cond) {
        NA
      }
    )
    g <- ggplot(mapping = aes(x)) +
      geom_point(aes(y = df$y)) +
      geom_line(aes(y = cc[2] * x + cc[1])) +
      geom_hline(yintercept = 0, color = "blue") +
      labs(title = name, y = "y", x = "x")
    # Only draw the vertical marker when a root exists
    if (!is.na(threshold)) {
      g <- g + geom_vline(xintercept = threshold, color = "blue")
    }
    g
  })
ggpubr::ggarrange(plotlist = graphs)
Result:
Obs2: i assumed that you don't need the thresholds vector defined in the first attempt, if you still need it, it's easy to add it back to the answer
Obs1: let me know if you want any aesthetic change on the graphs
Edit 2: graph with common axis
To use a common axis, it is better to use facets instead of ggarrange. To do that, we first need to save the fitted data for all variables and then plot, so the ggplot expression moves outside the map. We also now save the threshold info.
graphs <- df %>%
select(-y) %>%
imap(function(x, name){
lm.model <- lm(df$y ~ x)
cc <- coef(lm.model)
f <- function(x) cc[2]*x + cc[1]
threshold <- tryCatch(uniroot(f, interval = c(0, 100))$root, error = function(cond){NA})
list(threshold = threshold,
data = tibble(y = df$y, "name" = name, "x" = x, "fitted" = cc[2]*x + cc[1]))})
Now we use the purrr::transpose() function to build one dataset for the data and another for the thresholds. This function does something like:
list(x1 = list(treshold, data), x2 = ...) >>> list(treshold = list(x1, x2, ...), data = list(x1, x2, ...))
df2 = graphs %>%
transpose() %>%
`$`(data) %>%
bind_rows() %>%
mutate(name = factor(name, paste0("x", 1:17)))
thresholds = graphs %>%
transpose() %>%
`$`(threshold) %>%
{tibble(int = as.numeric(.), name = names(.))} #both datasets have the name column, to be used inside `facet_wrap()`
ggplot(df2, aes(x)) +
geom_point(aes(y = y)) +
geom_line(aes(y = fitted)) +
facet_wrap(vars(name), scales = "free_x") +
geom_hline(yintercept = 0, color = "blue") +
geom_vline(aes(xintercept = int), thresholds, color = "blue", linetype = 2) +
geom_label(aes(label = round(int, 2), x = int*1, y = min(df$y)), thresholds, size = 4)
Result:
Obs1: the labels position and size can be easily altered. Another option is using the thresholds as a axis break
Obs2: this method can be slow for large datasets. A more efficient option is to save only threshold and cc inside map, and then building the dataset after it.
I am trying to use the function "summaryPlot" from the openair package in R, but it fails every time I use it with the following data (run the code below to recreate it):
structure(list(Fecha = structure(c(1577840400, 1577844000, 1577847600,
1577851200, 1577854800, 1577858400, 1577862000, 1577865600, 1577869200,
1577872800, 1577876400, 1577880000, 1577883600, 1577887200, 1577890800,
1577894400, 1577898000, 1577901600, 1577905200, 1577908800, 1577912400,
1577916000, 1577919600, 1577923200, 1577926800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), PM10_CDAR = c(11.4, 8.3, 13.3, 16,
39.5, 35.4, 31, 48.7, 41, 34, 23.3, 16.5, 21.8, 15.7, 17.8, 12.7,
12.8, 16, 11.3, 7.9, 8.1, 10, 10.4, 7.7, 6.1), PM10_KEN = c(49.7,
72.4, 34.5, 50.3, 65.2, 59, 25.5, 19.6, 17.4, 14.3, 48.2, 34.8,
25.3, 56.7, 26, 45.6, 29, 30.5, 24.1, 22, 26.9, 22.2, 17.3, 19.1,
15.5), PM10_LAF = c(28.8, 69, 72.3, 35.1, 82, 44, 69, 73, 46,
43, 29.9, 25.1, 21.4, 15.8, 11.7, 16, 15, 12, 9, 10.8, 10.1,
11.9, 12.9, 12.4, 11.8), PM10_TUN = c(45, 57, 93, 69, 73, 60,
45, 69, 61, 46, 28, 20, 33, 54, 44, 27, 39, 37, 36, 41, 30, 29,
18, 4, 7), PM2.5_CDAR = c(9, 8, 10, 16, 34, 30, 33, 42, 33, 34,
6, 10, 9, 9, 15, 10, 9, 7, 9, 5, 5, 10, 6, 4, 2), PM2.5_KEN = c(49,
81, 110, 83, 63, 59, 79, 68, 84, 76, 48, 19, 22, 34, 36, 33,
29, 19, 13, 22, 3, 16, 16, 6, 9), PM2.5_LAF = c(35, 65, 53, 30,
60, 62, 64, 67, 36, 43, 21, 16, 11, 11, 10, 15, 15, 12, 9, 6,
6, 10, 10, 9, 10), PM2.5_TUN = c(39, 42, 66, 54, 52, 39, 33,
40, 42, 33, 21, 11, 13, 27, 22, 17, 21, 15, 17, 15, 13, 10, 6,
4, 2)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-25L))
the next error appears:
> summaryPlot(date.zoo_2, pollutant = "Kennedy_PM10")
Error in seq.int(0, to0 - from, by) : 'to' must be a finite number
In addition: Warning messages:
1: In min.default(numeric(0), na.rm = TRUE) :
no non-missing arguments to min; returning Inf
2: In max.default(numeric(0), na.rm = TRUE) :
no non-missing arguments to max; returning -Inf
I tried everything to convert the date column into a date-time, e.g. idx <- as.POSIXct(datos_meterologicos$Fecha); datos_meterologicos$Fecha <- read.zoo(datos_meterologicos, FUN=as.POSIXct, format = "%Y/%m/%d %H:%M", tz="UTC"). And frankly, I don't know what to do because the same error is still appearing.
The whole code is next
date.matrix_2 <- as.data.frame(datos_meterologicos[,-1])
idx_2 <- as.POSIXct(datos_meterologicos$Fecha)
date.xts_2 <- as.xts(date.matrix_2,order.by=idx_2)
date.zoo_2 <- as.zoo(date.xts_2)
I want to choose 100 houses randomly from my dataset, and find the mean value of their total price. Then repeat this action 100 times, and for each time I repeat the action, calculate the mean price. And then plot all the mean values in a histogram. This is my code (rome is the house dataset):
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- mean(run$PRICE)
c <- replicate(100, dun)
I also tried the for loop, which I'm pretty sure I need to use here, but there are mistakes in my code:
d <- for(i in 1:100){
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- mean(run$PRICE)
c <- replicate(100, dun)
}
And finally hist(d) , which doesn't run because of the mistakes. Can you help me?
The data (price values):
good_struct <-
c(
47,
113,
165,
104.3,
62.5,
70,
127.5,
64.5,
145,
63.5,
58.9,
65,
48,
3.5,
12.8,
17.5,
36,
41.9,
53.5,
24.5,
24.5,
55.5,
60,
51,
46,
46,
44,
54.9,
42.5,
44,
44.9,
37.9,
33,
43.9,
49.6,
52,
37.5,
50,
35.9,
42.9,
107,
112,
44.9,
55,
102,
35.5,
62.9,
39,
110,
8,
62,
85.9,
57,
110,
67.7,
89.5,
70,
74,
13,
48,
24,
53.5,
34.5,
53,
87.5,
33.5,
24,
9.6,
30,
41,
30,
38.9,
20.7,
49.9,
18.6,
39,
34,
16,
18.9,
15.2,
41.5,
53,
22,
24.9,
6.7,
32.5,
30,
59,
29.5,
26,
16.5,
39,
48.9,
33.5,
46,
54,
57.9,
37.9,
32,
31,
34,
29,
32.5,
51.9,
31,
41.8,
48,
28,
35,
46.5,
51.9,
35.4,
16,
35,
35,
36.5,
35.9,
45,
40,
35,
38,
37,
23,
25.5,
39.5,
21.5,
9,
67.5,
13.4,
12.5,
28.5,
23,
33.5,
9,
11,
30.9,
31.65,
33,
33.4,
47,
40,
46,
45.5,
57,
29.9,
30,
34,
51,
64.5,
57.5,
85.5,
61,
38,
56.5,
60.4,
51.5,
54,
69,
56,
27.9,
37.5,
32.9,
22,
29.9,
39.9,
32.6,
38.5,
21.5,
25.9,
27.5,
22.9,
31.5,
8.5,
5.5,
33,
57,
47,
43.5,
43.9,
68.5,
44.25,
61,
40,
44.5,
57,
35,
35.1,
64.5,
40,
42.6,
50,
58,
58,
55,
43,
54,
39,
45,
42,
38.9,
43.215,
26.5,
30,
29.5
)
Since replicate is a wrapper to sapply, consider adjusting the call by passing in an expression that subsets a vector then calls mean:
# Draw 100 independent samples of 100 houses each (without replacement) and
# keep the mean PRICE of every sample, then plot the distribution of means.
n_houses <- nrow(rome)
random_mean_prices <- replicate(100, {
  picked <- sample.int(n_houses, 100)
  mean(rome$PRICE[picked])
})
hist(random_mean_prices)
Perhaps something like this?
# Simulated stand-in for the house data: one million normally distributed
# prices with a running ID.
rome <- data.frame(
  PRICE = rnorm(1e6, 3e5, 5e4),
  ID = 1:1e6
)
# Preallocate the result instead of growing it with c() on every pass.
n_rep <- 100
dun <- numeric(n_rep)
for (i in seq_len(n_rep)) {
  # 100 distinct rows chosen at random
  run <- rome[sample(seq_len(nrow(rome)), 100, replace = FALSE), ]
  dun[i] <- mean(run$PRICE)
}
hist(dun)
I need to subset my data frame. I need to include values greater than 95 percentile for each column for every 5 minutes.
My data frame is t:
dput(t)
structure(list(Date = structure(c(1468814400, 1468814700, 1468815000,
1468815300, 1468815600, 1468815900, 1468816200, 1468816800, 1468817400,
1468817700, 1468818000, 1468818300, 1468818600, 1468818900, 1468819200,
1468819500, 1468819800, 1468820100, 1468820400, 1468820700, 1468821000,
1468821300, 1468821600, 1468821900, 1468822200, 1468822500, 1468822800,
1468823100, 1468823400, 1468823700), class = c("POSIXct", "POSIXt"
), tzone = ""), CPU = c(6.09, 4.96, 8.61, 1.07, 5.13, 9.7, 1.97,
4.39, 3.25, 13.5, 1.86, 3.79, 4, 2.68, 8.71, 1.99, 14, 2.96,
2.75, 15.38, 7.97, 4.41, 5.08, 16.26, 12.19, 7.05, 6.97, 17.78,
17.57, 7.23), Trans_A = c(35, 32, 18, 23, 13, 51, 12, 15, 22,
228, 219, 71, 277, 434, 414, 154, 273, 284, 331, 170, 320, 287,
277, 157, 313, 316, 629, 448, 594, 478), Trans_B = c(53, 11,
56, 10, 11, 15, 7, 91, 8, 10, 197, 98, 101, 354, 209, 449, 429,
788, 391, 312, 131, 212, 229, 189, 529, 389, 438, 662, 855, 559
), Heap_A = c(4.58, 7.81, 7.81, 3.3, 3.95, 9.75, 3.01, 10.07,
10.4, 10.64, 3.2, 9.85, 10.56, 7.51, 4.3, 7.31, 10.18, 3.54,
10.64, 9.16, 7.49, 6.61, 10.72, 6.48, 10.48, 9.97, 11.22, 10.8,
10.73, 11.94), Heap_B = c(53, 11, 56, 10, 11, 15, 7, 91, 8, 10,
197, 98, 101, 354, 209, 449, 429, 788, 391, 312, 131, 212, 229,
189, 529, 389, 438, 662, 855, 559)), .Names = c("Date", "CPU",
"Trans_A", "Trans_B", "Heap_A", "Heap_B"), row.names = c(NA,
-30L), class = "data.frame")
I can get the max values like this but I need values greater than 95th percentile for every 5 minutes. How would I do that?
library(dplyr)
ff<-t %>%
mutate(Date = as.POSIXct(Date, format = '%Y-%m-%d %H:%M:%S')
%>% cut('5 min')) %>%
group_by(Date) %>%
dplyr::summarise(mCpu=max(CPU),
mTrans_a=max(Trans_A),
mTrans_b=max(Trans_B),
mHeap_a=max(Heap_A),
mHeap_b=max(Heap_B))
You can't subset with summarise, but you can with filter. After grouping, the quantiles will be calculated for each value of Date.
For example, the following code will give you all rows with a CPU value above the 95% percentile:
library(dplyr)
ff<-t %>%
mutate(
Date = as.POSIXct(Date, format = '%Y-%m-%d %H:%M:%S') %>% cut('5 min')
) %>%
group_by(Date) %>%
filter(CPU > quantile(CPU, 0.95))
If you only want rows that have values > 95% for all columns, use:
# Keep only rows that exceed the 95th percentile in every column. Column names
# are case-sensitive and must match the data frame: Trans_A and Trans_B.
filter(
  CPU > quantile(CPU, 0.95),
  Trans_A > quantile(Trans_A, 0.95),
  Trans_B > quantile(Trans_B, 0.95),
  Heap_A > quantile(Heap_A, 0.95),
  Heap_B > quantile(Heap_B, 0.95)
)
Also see: filter_all.
The Problem
I'm trying to find a solution to overcome a deficient experimental design in establishing sampling points. The aim is to subset the original dataset, forcing sampling points stratification based on 2 factors with several levels.
I need a general formulation of the problem that may allow me to redefine the set of criteria levels.
Note
I've found examples of subseting tables based on criteria, the most relevant is a post from Brian Diggs but I cannot find a general way to apply that solution to my particular case.
The Dataset
My data.frame have 3 columns, sample id and two factors (f1 and f2).
Criteria are based on interval of values for f1 and f2.
dat <- structure(list(id = 1:203, f1 = c(22, 20.8, 20.7, 22, 12.1, 8,
20.6, 22, 22, 21.6, 0, 22, 21.4, 15.9, 21.2, 19.1, 12.5, 16.6,
14, 21.2, 14.7, 20.7, 20.5, 5.4, 19.1, 18.9, 22, 22, 22, 0, 0,
22, 1.3, 1, 0, 9.4, 7.9, 14.5, 0, 1.5, 0, 20.3, 18, 17.3, 1,
22, 0, 15, 17.9, 4.3, 19.5, 21.2, 21.2, 14.6, 2.3, 0, 6.7, 17.9,
9.5, 19, 21.6, 16.6, 11.7, 13.7, 1.5, 1, 7.6, 3.7, 18.5, 13.5,
20.9, 18.2, 11.5, 7.3, 6.5, 21.1, 22, 20.5, 20.5, 20, 16.2, 18.6,
22, 15.1, 14.4, 10.8, 17.1, 5.7, 15.1, 12.8, 14.5, 8.8, 16.8,
18.7, 1, 6.3, 1.8, 14.6, 22, 16.2, 12.9, 9.1, 2, 7.6, 7, 11.7,
1, 1, 9.6, 11, 2, 2, 14, 14.9, 7.8, 11.4, 8.3, 7.6, 9.1, 4.5,
18, 11.4, 3.1, 4.3, 9.3, 8.1, 1.4, 5.2, 14.7, 3.6, 5, 2.7, 10.3,
11.3, 17.9, 5.2, 1, 1.5, 13.2, 0, 1, 7.4, 1.7, 11.5, 20.2, 0,
14.7, 17, 15.2, 22, 22, 22, 17.2, 15.3, 10.9, 18.7, 11.2, 18.5,
20.3, 21, 20.8, 15, 21, 16.9, 18.5, 18.5, 10.3, 12.6, 15, 19.8,
21, 17.2, 16.3, 18.3, 10.3, 17.8, 11.2, 1.5, 1, 0, 1, 14, 19.1,
6.1, 19.2, 17.1, 14.5, 18.4, 22, 20.3, 6, 13, 18.3, 8.5, 15.3,
10.6, 7.2, 6.2, 1, 7.9, 2, 20, 16.3), f2 = c(100, 100, 92.9,
38.5, 100, 90.9, 100, 100, 100, 91.7, 0, 100, 71.4, 100, 100,
53.8, 28.6, 91.7, 100, 100, 64.3, 100, 92.9, 78.6, 100, 100,
27.3, 83.3, 14.3, 0, 0, 9.1, 23.1, 12.5, 0, 100, 81.8, 100, 0,
15.4, 0, 83.3, 100, 75, 7.1, 81.8, 0, 21.4, 84.6, 25, 80, 90.9,
100, 71.4, 50, 0, 46.2, 90.9, 14.3, 66.7, 90.9, 84.6, 46.2, 91.7,
33.3, 7.7, 71.4, 27.3, 46.2, 100, 100, 100, 60, 54.5, 46.2, 53.8,
91.7, 100, 100, 66.7, 45.5, 57.1, 15.4, 75, 75, 76.9, 53.8, 25,
90.9, 84.6, 91.7, 90.9, 100, 54.5, 23.1, 63.6, 30.8, 90.9, 92.9,
100, 92.3, 90.9, 12.5, 38.5, 15.4, 84.6, 27.3, 7.1, 75, 21.4,
7.7, 15.4, 84.6, 100, 69.2, 63.6, 64.3, 53.8, 92.3, 33.3, 11.1,
61.5, 66.7, 23.1, 85.7, 81.8, 41.7, 69.2, 76.9, 38.5, 9.1, 23.1,
85.7, 90, 100, 100, 14.3, 36.4, 84.6, 0, 7.7, 61.5, 25, 50, 100,
0, 63.6, 36.4, 76.9, 100, 100, 100, 100, 90.9, 100, 100, 100,
100, 100, 83.3, 100, 100, 100, 100, 50, 54.5, 71.4, 100, 85.7,
100, 75, 100, 76.9, 83.3, 100, 92.3, 33.3, 76.9, 33.3, 0, 40,
91.7, 100, 53.8, 100, 100, 100, 100, 100, 92.3, 76.9, 23.1, 84.6,
33.3, 100, 92.3, 46.2, 100, 9.1, 53.8, 7.7, 20, 42.9)), .Names = c("id",
"f1", "f2"), class = "data.frame", row.names = c(NA, -203L))
The expected output
Sampling points should ideally be grouped following a crossed design (it is not a complete factorial design).
For Factor f1: 0, 1-15, 30-60, 80-95, 100
For Factor f2: 0, 5-10, 15-20
I need to find points given all combinations of f1 and f2 intervals, something like this fashion:
gr <- expand.grid(f1=c('0', '1-15', '30-60', '80-95', '100'),
f2=c('0', '5-10', '15-20'))
> gr
f1 f2
1 0 0
2 1-15 0
3 30-60 0
4 80-95 0
5 100 0
6 0 5-10
7 1-15 5-10
8 30-60 5-10
9 80-95 5-10
10 100 5-10
11 0 15-20
12 1-15 15-20
13 30-60 15-20
14 80-95 15-20
15 100 15-20
The solution should split dat based on lines of gr.
This is not a complete factorial design since not all combinations will fulfill this particular criteria combination but it is important to identify NA's as well.
Any help will be appreciated. Please let me know if I'm providing sufficient information.
Use cut, to split f1 and f2 into factors based on your breakpoints, paste the factor together, and then split based on the combined factor.
# Bin each factor at the requested breakpoints. cut() builds intervals of the
# form (a, b], so values exactly equal to 0 fall outside the first bin and
# become NA.
dat$f1.group <- cut(dat$f1, c(0, 1, 15, 30, 60, 80, 90, 95, 100))
# f2.group must be derived from f2 (it was previously cut from f1 by mistake)
dat$f2.group <- cut(dat$f2, c(0, 5, 10, 15, 20))
# NOTE(review): in the posted sample f1 spans 0-22 and f2 spans 0-100, so the
# two break vectors may need to be swapped - confirm with the data owner.
# All combinations of the two sets of bins
gr <- expand.grid(levels(dat$f1.group), levels(dat$f2.group))
names(gr) <- c("f1.group", "f2.group")
gr$combined <- paste(gr$f1.group, gr$f2.group)
# merge() joins on the shared f1.group / f2.group columns
dat <- merge(gr, dat)[c("id", "f1", "f2", "combined")]
split(dat, dat$combined)
That will give you a list of data frames, with one element for each combination defined in gr. You can then easily sample from these strata.