R using ggplot2 to plot mixEM data - r

I have a vector of length 370 that I would like to fit to a mixture of Gaussians. I have followed the example here: Any suggestions for how I can plot mixEM type data using ggplot2 to plot the data, but as you can see from the image link, my results are different from those in the example: Plot of a mixture of three Gaussians
Here is a snippet of the code that I used:
library(ggplot2)
library(mixtools)
gg.mixEM <- function(EM) {
require(ggplot2)
x <- with(EM,seq(min(x),max(x),len=1000))
pars <- with(EM,data.frame(comp=colnames(posterior), mu, sigma,lambda))
em.df <- data.frame(x=rep(x,each=nrow(pars)),pars)
em.df$y <- with(em.df,lambda*dnorm(x,mean=mu,sd=sigma))
ggplot(data.frame(x=EM$x),aes(x,y=..density..)) +
geom_histogram(fill=NA,color="black",bins=41)+
geom_polygon(data=em.df,aes(x,y,fill=comp),color="grey50", alpha=0.5)+
scale_fill_discrete("Component\nMeans",labels=format(em.df$mu,digits=3))+
theme_bw()
}
dput(gradesCS)
c(6.5, 22.375, 20.5, 24.25, 33.25, 24, 26.75, 30.75, 35.5, 23.5,
26.875, 24, 35.5, 29.875, 29.75, 31.25, 32.875, 33.75, 34, 29,
33, 24, 12, 26.375, 6.75, 31.25, 21.625, 32.875, 29.25, 27.125,
28.25, 26.25, 24.875, 35.5, 26.5, 37.5, 35.375, 27.5, 33, 27.5,
39.5, 34.25, 28.125, 28, 32.625, 37.625, 34.5, 29.5, 38.5, 37.5,
28.75, 38, 16, 35.75, 30, 33.5, 36, 31.125, 29.75, 32.5, 35,
24.375, 23.375, 28, 32.125, 36, 31.5, 33.5, 1.5, 30.5, 37, 29.5,
29.5, 31.125, 32.5, 20.5, 28.75, 30.25, 32.5, 28, 36, 37.5, 28.5,
35.5, 30.25, 36.375, 36, 23.25, 31.5, 25.125, 33.5, 34, 19.5,
31.75, 39.5, 33.25, 24.875, 26.75, 23.375, 34, 16.5, 37, 33.375,
31.25, 31.75, 35.5, 32, 27.5, 23.375, 20.625, 35.5, 31.5, 25.375,
24.5, 27.25, 25.25, 35.75, 24, 28.25, 33.125, 31.5, 39.5, 39.25,
24.75, 37, 25.5, 34.75, 34, 20.25, 37.625, 30.5, 32.375, 15,
32.75, 33.5, 32.75, 31.5, 29.25, 30, 37.25, 34.5, 23, 32.5, 38.25,
35.625, 33, 35, 31.125, 37, 28.125, 29.25, 31.75, 34.75, 34.625,
36.625, 15.25, 35.5, 37, 33.5, 30.875, 35, 31.625, 22.75, 31,
31.125, 25.125, 35.5, 2, 36.125, 25.25, 32.5, 28, 38.5, 35.5,
38.5, 30.5, 34, 28.125, 38, 29.25, 29.75, 33.25, 25.125, 35,
34.5, 32, 35, 26.875, 20.5, 35.5, 23.25, 26.25, 36, 35.5, 38,
39.25, 22, 38.5, 31, 35.5, 33.5, 31.5, 26, 30.375, 35.75, 29.75,
34, 37.625, 38, 35.5, 34.25, 24.375, 30, 33.75, 39.5, 36.5, 36.5,
32, 36.5, 29.75, 29.75, 25, 32, 29.25, 32.125, 31.25, 38, 33.5,
33.5, 38.5, 37.25, 31.125, 33.5, 31, 28, 29.75, 36, 36, 37, 22,
29, 36.5, 32.25, 30.75, 38.5, 24.125, 28.75, 38.25, 32.5, 34.75,
29, 30.375, 33.5, 31.25, 30, 33, 33.5, 27.5, 26.5, 30.25, 34.75,
33.5, 39, 33.25, 38.5, 27, 39.5, 34.25, 33, 35.125, 38, 31.25,
32.75, 22.75, 31.125, 34.5, 33, 37.125, 31, 18.75, 30.25, 31.75,
34, 30.75, 29, 34.5, 36, 36.5, 31.5, 26, 27.5, 27.5, 36.5, 19.75,
33, 35.125, 16, 19.75, 31.5, 38.5, 34.25, 36.5, 27, 22, 21.75,
36, 31.5, 33, 29.75, 32.5, 26.25, 33.5, 35.75, 33, 39, 35, 34.25,
28.5, 25.5, 30.5, 28, 21.25, 39.125, 22.75, 28.375, 29.125, 30,
34.125, 31.25, 32, 26.25, 36, 24.5, 30.25, 32.75, 29.625, 16,
34, 16.75, 25.25, 33, 38, 28, 24.75, 29.75, 24.5, 19.25, 32.75,
27.5, 24.75, 17.375, 25.25, 30.125, 38, 28, 35, 11.75, 27.75,
38, 28.625, 31.25, 31.25, 32, 17.25, 18.25, 32.625, 25.5, 27.5,
35.25, 35.5)
b <- gradesCS
c <- b[sample(length(b), length(b)) ]
c3 <- normalmixEM(c, lambda=NULL, mu=NULL, sigma=NULL,k=3,maxit=1000,epsilon = 1e-2)
gg.mixEM(c3)

The problem is that polygons freak out if they don't have continuous drawing space (e.g. if you end abruptly at 0, but the polygon function has not reached 0).
In the first line of the ggplot function, add extra spacing on each side of x. I'm going with 5 here, but you just need enough for the function to hit 0.
x <- with(EM,seq(min(x)-5,max(x)+5,len=1000))
In the bottom, we cut off the excess space with
coord_cartesian(xlim = c(0,42),
expand = c(0,0))
This renders the graph with your spacing, and then "zooms in" on the selected x interval.
fit_test <- normalmixEM(
test,
k = 2)
gg.mixEM <- function(EM) {
require(ggplot2)
x <- with(EM,seq(min(x)-5,max(x)+5,len=1000))
pars <- with(EM,data.frame(comp=colnames(posterior), mu, sigma,lambda))
em.df <- data.frame(x=rep(x,each=nrow(pars)),pars)
em.df$y <- with(em.df,lambda*dnorm(x,mean=mu,sd=sigma))
ggplot(data.frame(x=EM$x),aes(x,y=..density..)) +
geom_histogram(fill=NA,color="black",bins=41)+
geom_polygon(data=em.df,aes(x,y,fill=comp),color="grey50", alpha=0.5)+
scale_fill_discrete("Component\nMeans",labels=format(em.df$mu,digits=3))+
theme_bw() +
coord_cartesian(xlim = c(0,42),
expand = c(0,0))
}
gg.mixEM(fit_test)
And we get

Related

Sample, replicate and histogram in R

I want to choose 100 houses randomly from my dataset, and find the mean value of their total price. Then repeat this action 100 times, and for each time I repeat the action, calculate the mean price. And then plot all the mean values in a histogram. This is my code (rome is the house dataset):
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- mean(run$PRICE)
c <- replicate(100, dun)
I also tried the for loop, which I'm pretty sure I need to use here, but there are mistakes in my code:
d <- for(i in 1:100){
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- mean(run$PRICE)
c <- replicate(100, dun)
}
And finally hist(d) , which doesn't run because of the mistakes. Can you help me?
The data (price values):
good_struct <-
c(
47,
113,
165,
104.3,
62.5,
70,
127.5,
64.5,
145,
63.5,
58.9,
65,
48,
3.5,
12.8,
17.5,
36,
41.9,
53.5,
24.5,
24.5,
55.5,
60,
51,
46,
46,
44,
54.9,
42.5,
44,
44.9,
37.9,
33,
43.9,
49.6,
52,
37.5,
50,
35.9,
42.9,
107,
112,
44.9,
55,
102,
35.5,
62.9,
39,
110,
8,
62,
85.9,
57,
110,
67.7,
89.5,
70,
74,
13,
48,
24,
53.5,
34.5,
53,
87.5,
33.5,
24,
9.6,
30,
41,
30,
38.9,
20.7,
49.9,
18.6,
39,
34,
16,
18.9,
15.2,
41.5,
53,
22,
24.9,
6.7,
32.5,
30,
59,
29.5,
26,
16.5,
39,
48.9,
33.5,
46,
54,
57.9,
37.9,
32,
31,
34,
29,
32.5,
51.9,
31,
41.8,
48,
28,
35,
46.5,
51.9,
35.4,
16,
35,
35,
36.5,
35.9,
45,
40,
35,
38,
37,
23,
25.5,
39.5,
21.5,
9,
67.5,
13.4,
12.5,
28.5,
23,
33.5,
9,
11,
30.9,
31.65,
33,
33.4,
47,
40,
46,
45.5,
57,
29.9,
30,
34,
51,
64.5,
57.5,
85.5,
61,
38,
56.5,
60.4,
51.5,
54,
69,
56,
27.9,
37.5,
32.9,
22,
29.9,
39.9,
32.6,
38.5,
21.5,
25.9,
27.5,
22.9,
31.5,
8.5,
5.5,
33,
57,
47,
43.5,
43.9,
68.5,
44.25,
61,
40,
44.5,
57,
35,
35.1,
64.5,
40,
42.6,
50,
58,
58,
55,
43,
54,
39,
45,
42,
38.9,
43.215,
26.5,
30,
29.5
)
Since replicate is a wrapper to sapply, consider adjusting the call by passing in an expression that subsets a vector then calls mean:
random_mean_prices <- replicate(
100, mean(rome$PRICE[sample(1:nrow(rome), 100, replace=FALSE)])
)
hist(random_mean_prices)
Perhaps something like this?
rome <- data.frame(PRICE = rnorm(1e6,3e5,5e4),
ID = 1:1e6)
dun = NULL
for(i in 1:100){
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- c(dun, mean(run$PRICE))
}
hist(dun)

Interact_plot keeps coming back with Error: data must be compatible with existing data

I have been trying to solve this for days, so any help would be appreciated!
I am trying to make an interaction plot for an OLS Regression.
This is the code I am using:
interact <- lm(ele$vt_c ~ ele$Immigrants:ele$X.qual, data = as.data.frame(ele))
interact_plot(model = interact, pred=Immigrants, modx =X.qual, modx.values = NULL, data = ele)
This is the error that is coming up
Error in ecdf(d[[modx]]) : 'x' must have 1 or more non-missing values
In addition: Warning message:
immigrants and X.qual are not included in an interaction with one another in the model.
Reproducible data
if (!"interactions" %in% installed.packages()) install.packages("interactions")
library(interactions)
ele = structure(list(vt_c = c(68.37056, 67.55938, 69.25354, 67.54727,
67.39343, 67.81161, 65.81312, 64.68675, 70.8572, 72.1439, 67.39006,
64.89897, 62.81833, 63.82975, 58.99062, 67.69617, 68.17096, 65.24267,
67.08106, 66.47592, 68.40781, 70.40636, 69.50657, 72.37613, 70.24236,
67.50159, 71.77177, 67.09047, 74.58491, 70.64892, 65.20199, 70.03566,
70.23142, 71.62487, 66.87982, 70.72528, 66.97507, 69.38713, 67.20061,
68.79907, 67.05735, 67.38101, 66.10595, 60.97635, 61.9047, 61.28828,
72.11577, 63.04311, 71.04747, 77.16823, 63.77144, 72.5249, 69.10145,
74.61647, 55.0847, 70.97664, 73.40273, 72.02715, 69.28485, 68.66256,
77.92079, 69.78192, 71.32363, 79.13777, 76.21347, 72.96919, 71.95923,
70.94545, 64.8141, 55.98621, 74.19439, 72.70276, 68.77999, 63.09397,
61.72898), Immigrants = c(57.3, 55.1, 50.6, 45.7, 42.8, 51.7,
51.2, 50.9, 44.9, 44.5, 44.3, 42.7, 50.5, 50.5, 39.2, 50.6, 39.7,
38.9, 39.2, 41.8, 42.5, 43.1, 39.5, 41.1, 44.2, 38.6, 41.8, 40.1,
43.8, 41.9, 38.2, 38.9, 37.5, 40.8, 33.2, 41.6, 38.1, 30, 38.8,
34.4, 36.5, 32.1, 41.3, 30.6, 32.9, 27.8, 35.4, 28.7, 37.1, 33.3,
29.8, 29.8, 33.8, 32.8, 28.8, 32.6, 31.6, 30.7, 28.6, 30.9, 34.7,
24.6, 24.7, 28.4, 26, 26.2, 27.4, 26.1, 22.6, 24.7, 32.4, 22.9,
26.4, 22.2, 22.1), X.qual = c(32.9, 29.8, 30.8, 32.5, 18.3, 47.3,
30.5, 29.8, 32.7, 38.5, 42.5, 25.8, 54.5, 52.2, 24.9, 29.3, 30.5,
23, 37.6, 22.3, 35.2, 54, 39.6, 42.8, 30.4, 41.5, 47.5, 44.5,
48.4, 31.3, 25.9, 28.2, 41.6, 46.5, 24.8, 36.3, 45.2, 27, 48.7,
40, 42.1, 19.7, 53.7, 26, 21.8, 12.1, 51.6, 19.2, 46.6, 54.4,
24.9, 30.1, 47.4, 51.4, 29.7, 57.4, 48.8, 47.6, 34.3, 22.8, 52,
21.8, 29.6, 55.2, 38.6, 37.4, 39.3, 25.9, 15.7, 19.8, 38.2, 39.3,
37.7, 18.3, 32.6)), class = "data.frame", row.names = c(NA, -75L
))
interact <- lm(vt_c ~ Immigrants:X.qual,
data = ele)
interact_plot(model = interact, pred=immigrants,
modx =X.qual, data = ele)
Thank you!
Welcome to SO, Lucia Thomas!
I read this message and it sounded so much more thorough than what usually write about reproducible questions:
Please make this question reproducible. This includes sample code you've attempted (including listing non-base R packages, and any errors/warnings received), sample unambiguous data (e.g., data.frame(x=...,y=...) or the output from dput(head(x))), and intended output given that input. Refs: stackoverflow.com/q/5963269, minimal reproducible example, and stackoverflow.com/tags/r/info.
That being said, I think I can help. Right now you have called each variable as a vector and called a data frame in your call to lm(). This has led to an incompatibility issue between these two functions.
ele = structure(list(vt_c = c(68.37056, 67.55938, 69.25354, 67.54727,
67.39343, 67.81161, 65.81312, 64.68675, 70.8572, 72.1439, 67.39006,
64.89897, 62.81833, 63.82975, 58.99062, 67.69617, 68.17096, 65.24267,
67.08106, 66.47592, 68.40781, 70.40636, 69.50657, 72.37613, 70.24236,
67.50159, 71.77177, 67.09047, 74.58491, 70.64892, 65.20199, 70.03566,
70.23142, 71.62487, 66.87982, 70.72528, 66.97507, 69.38713, 67.20061,
68.79907, 67.05735, 67.38101, 66.10595, 60.97635, 61.9047, 61.28828,
72.11577, 63.04311, 71.04747, 77.16823, 63.77144, 72.5249, 69.10145,
74.61647, 55.0847, 70.97664, 73.40273, 72.02715, 69.28485, 68.66256,
77.92079, 69.78192, 71.32363, 79.13777, 76.21347, 72.96919, 71.95923,
70.94545, 64.8141, 55.98621, 74.19439, 72.70276, 68.77999, 63.09397,
61.72898), immigrants = c(57.3, 55.1, 50.6, 45.7, 42.8, 51.7,
51.2, 50.9, 44.9, 44.5, 44.3, 42.7, 50.5, 50.5, 39.2, 50.6, 39.7,
38.9, 39.2, 41.8, 42.5, 43.1, 39.5, 41.1, 44.2, 38.6, 41.8, 40.1,
43.8, 41.9, 38.2, 38.9, 37.5, 40.8, 33.2, 41.6, 38.1, 30, 38.8,
34.4, 36.5, 32.1, 41.3, 30.6, 32.9, 27.8, 35.4, 28.7, 37.1, 33.3,
29.8, 29.8, 33.8, 32.8, 28.8, 32.6, 31.6, 30.7, 28.6, 30.9, 34.7,
24.6, 24.7, 28.4, 26, 26.2, 27.4, 26.1, 22.6, 24.7, 32.4, 22.9,
26.4, 22.2, 22.1), X.qual = c(32.9, 29.8, 30.8, 32.5, 18.3, 47.3,
30.5, 29.8, 32.7, 38.5, 42.5, 25.8, 54.5, 52.2, 24.9, 29.3, 30.5,
23, 37.6, 22.3, 35.2, 54, 39.6, 42.8, 30.4, 41.5, 47.5, 44.5,
48.4, 31.3, 25.9, 28.2, 41.6, 46.5, 24.8, 36.3, 45.2, 27, 48.7,
40, 42.1, 19.7, 53.7, 26, 21.8, 12.1, 51.6, 19.2, 46.6, 54.4,
24.9, 30.1, 47.4, 51.4, 29.7, 57.4, 48.8, 47.6, 34.3, 22.8, 52,
21.8, 29.6, 55.2, 38.6, 37.4, 39.3, 25.9, 15.7, 19.8, 38.2, 39.3,
37.7, 18.3, 32.6)), class = "data.frame", row.names = c(NA, -75L
))
Since you called the data frame, call the names of the columns, without the data frame appended:
interact <- lm(vt_c ~ immigrants:X.qual,
data = ele)
interact_plot(model = interact, pred=immigrants,
modx =X.qual, data = ele)

R how to lag 4000 columns 50 times

I have a data frame with 4000 columns and daily observations sorted by time. I want to create new columns that lag all existing columns 50 times in the past. So for a column Y create 50 additional columns that are Y-1day,Y-2days,Y-3days...Y-50days.
So far I've wrapped the following loop which does what I need to make.
The issue is that it's not very fast. Is there a more efficient way I can test?
for(i in 2:ncol(Data)){
for(j in 1:50){
Data<- slide(Data, Var = names(Data[i]), slideBy = -j)
}}
I'm attaching a snapshot of my data frame for reproducible example:
structure(list(time = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100), A = c(17.081545, 16.630901,
16.623749, 16.258942, 16.244635, 16.165951, 15.886981, 15.865522,
15.529327, 15.772532, 16.04435, 15.779685, 15.915594, 15.593705,
15.336195, 15.593705, 15.736767, 15.736767, 15.457797, 15.815451,
16.108727, 16.237482, 15.808297, 16.058655, 16.53791, 16.988556,
16.516453, 16.480686, 16.967096, 17.181688, 17.446352, 17.11731,
16.952789, 16.8598, 16.795422, 16.437769, 16.587982, 16.845493,
17.167381, 17.510729, 17.410587, 17.474964, 17.246065, 17.703863,
17.424892, 17.174536, 17.103004, 16.695278, 16.93133, 16.638054,
16.115879, 16.20887, 15.987124, 16.151646, 16.151646, 16.115879,
16.173105, 16.101574, 16.080114, 15.9299, 15.879828, 15.786839,
15.314735, 15.27897, 15.493563, 15.436337, 15.286123, 15.121602,
15.27897, 14.88555, 14.785408, 14.592275, 14.785408, 14.856938,
14.670959, 15.243204, 15.09299, 15.250358, 15.264664, 15.18598,
14.771102, 14.842632, 15, 15.150214, 15.200286, 15.078684, 15.379113,
15.658083, 15.636623, 15.879828, 15.715307, 15.729613, 15.422031,
16.080114, 16.39485, 16.502146, 16.74535, 16.902718, 17.088697,
16.831188), AAP = c(29.033333, 28.84, 28.893333, 28.866667, 28.700001,
28.799999, 28.973333, 28.866667, 28.806667, 28.973333, 29.713333,
29.033333, 28.626667, 28.546667, 28.173334, 28.166666, 28.24,
28.553333, 28.366667, 28.733334, 28.833334, 28.9, 29.166666,
29.846666, 30.08, 30.093334, 29.673334, 29.860001, 30.053333,
30.186666, 29.833334, 29.673334, 34.533333, 33.82, 33.373333,
33.633335, 33.593334, 33.833332, 33.586666, 33.946667, 34.66,
34.599998, 34.84, 34.779999, 34.093334, 33.713333, 33.560001,
33.933334, 33.086666, 33.139999, 33.279999, 33.200001, 33.259998,
32.466667, 32.713333, 32.686668, 33.053333, 33.806667, 33.333332,
33.613335, 33.633335, 33.799999, 34.206665, 34.5, 34.166668,
34.206665, 33.933334, 34, 34.373333, 33.700001, 33.173332, 32.633335,
32.639999, 34.013332, 33.566666, 34.053333, 34.053333, 34.826668,
35.106667, 35.68, 35.653332, 35.566666, 35.380001, 35.419998,
35.966667, 36.573334, 36.673332, 36.486668, 36.286667, 36.099998,
35.433334, 35.419998, 35.84, 36.533333, 36.779999, 38.98, 39.633335,
39.646667, 39.486668, 39.433334), AAPL = c(4.520714, 4.567143,
4.607143, 4.610714, 4.946429, 4.925714, 4.611429, 4.675714, 4.985714,
5.014286, 5.046429, 4.991428, 5.032857, 5.035, 5.054286, 5.146429,
5.160714, 5.188571, 5.284286, 5.492857, 5.537857, 5.687857, 5.557857,
5.631429, 5.638571, 5.778572, 5.624286, 5.597143, 5.800714, 6.045,
6.315, 6.437857, 6.272143, 6.200714, 6.092143, 6.302143, 6.352143,
6.356429, 6.408571, 6.357143, 6.302857, 5.97, 6.115714, 6.107143,
5.79, 5.621428, 5.69, 5.752857, 5.76, 5.851429, 5.882857, 6.035714,
6.137143, 6.242857, 6.118571, 6.078571, 6.071429, 6.075714, 5.964286,
6.114286, 5.952857, 5.841429, 5.87, 5.984286, 6.047143, 6.222857,
6.248571, 5.988572, 6.094285, 5.862857, 5.322857, 5.05, 5.088572,
5.298572, 5.072857, 5.311429, 5.071429, 5.282857, 5.17, 5.135714,
5.077143, 5.151429, 5.204286, 5.172857, 5.307143, 5.24, 5.32,
5.281428, 5.202857, 5.087143, 4.875714, 4.967143, 5.078571, 5.051429,
5.12, 5.364286, 5.364286, 5.68, 5.671429, 5.682857), ABC = c(14.5375,
14.4225, 14.395, 14.5175, 14.475, 14.475, 14.51, 14.515, 14.275,
14.3175, 14.4875, 14.375, 14.5025, 14.2525, 14.3925, 14.13, 14.47,
14.365, 14.5925, 14.57, 14.74, 14.71, 14.995, 14.9, 14.8625,
15.0325, 14.78, 14.875, 15.085, 15.0525, 15.4275, 15.3075, 14.9225,
15, 14.7025, 14.7975, 15, 15, 14.975, 15.3775, 15.435, 15.5325,
15.6625, 15.6575, 15.695, 15.1275, 15.1025, 15.0775, 15.265,
15.0325, 14.905, 15.1975, 15.215, 15.2025, 15.1025, 15.3775,
15.2775, 13.5075, 13.5275, 13.95, 14.3225, 14.09, 14.4275, 14.735,
14.6475, 14.8, 14.4575, 14.62, 14.7525, 14.7, 14.9, 15.125, 14.83,
14.9525, 14.825, 14.9625, 15, 14.975, 14.9675, 15.0975, 15.0875,
15.32, 15.5125, 15.38, 15.51, 15.575, 15.7475, 15.9975, 15.9175,
15.895, 15.955, 15.98, 16.209999, 16.459999, 16.5725, 16.514999,
16.4925, 16.5, 16.495001, 16.4825), ABMD = c(15.01, 14.98, 14.69,
14.52, 14.29, 14.42, 14.31, 14.17, 12.45, 12.05, 11.87, 11.97,
11.41, 11.16, 11.06, 11.2, 11.1, 11.57, 11.43, 11.88, 11.58,
11.12, 11.16, 11.32, 10.97, 10.88, 10.72, 10.3, 10.75, 10.25,
10.29, 10.41, 10.02, 10.05, 10.08, 10, 10.24, 10.89, 10.7, 10.8,
10.66, 10.71, 11.12, 11.18, 11.2, 10.95, 11.07, 11.12, 11.3,
11.19, 10.83, 10.56, 10.37, 10.47, 10.33, 10.17, 10.51, 10.4,
10.56, 10.74, 10.58, 10.6, 10.57, 10.71, 11.23, 11.28, 11.51,
11.15, 10.98, 10.98, 11.05, 10.76, 10.96, 11.1, 10.62, 11.1,
10.53, 10.69, 10.65, 10.73, 10.15, 10.15, 9.52, 9.6, 9.6, 9.52,
9.47, 9.44, 9.35, 9.27, 9.13, 8.92, 9.26, 9.45, 9.97, 10.25,
10.28, 9.99, 10.16, 10.17), ABT = c(22.392265, 22.166759, 21.912466,
22.40666, 22.790501, 23.011208, 22.588984, 22.517014, 22.085194,
22.19075, 22.089993, 22.09479, 21.95085, 22.061205, 22.037214,
22.027618, 22.018023, 21.811708, 21.720547, 21.600595, 21.854891,
21.898071, 21.907667, 21.840496, 21.874083, 21.725344, 21.667768,
21.581404, 22.166759, 22.305902, 22.488226, 22.469034, 22.339487,
22.26272, 21.802113, 21.946053, 22.243528, 22.200346, 22.066002,
22.051607, 22.099588, 22.075598, 22.267517, 22.382669, 22.310699,
22.02282, 22.209942, 22.070801, 22.128376, 21.907667, 21.792517,
21.365494, 21.336706, 21.048826, 20.996048, 21.39908, 21.562212,
21.677364, 21.95085, 22.430651, 22.368277, 22.161963, 22.157164,
22.646561, 22.843279, 23.19833, 22.963228, 22.91045, 22.98242,
23.049591, 23.169542, 23.927626, 23.500605, 23.111965, 22.69454,
23.078381, 22.824085, 22.920046, 23.001612, 23.255905, 23.073582,
23.586967, 23.692524, 23.634949, 23.850859, 23.601362, 23.519796,
23.543785, 23.438231, 23.634949, 23.567776, 23.395048, 23.735706,
23.706919, 23.678129, 23.529392, 23.452623, 23.366261, 23.351866,
23.145552), ACN = c(26.370001, 25.75, 25.65, 25.42, 26.610001,
26.959999, 26.5, 26.389999, 26.18, 26.290001, 26.1, 26, 25.67,
25.16, 24.9, 25.200001, 25.4, 25.68, 25.6, 26.049999, 25.99,
25.83, 25.48, 25.73, 25.77, 25.85, 25.51, 25.42, 25.200001, 24.639999,
24.9, 25.049999, 24.51, 24.9, 24.799999, 24.709999, 24.48, 25.15,
25.549999, 25.59, 25.42, 25.110001, 25.370001, 25.49, 25.32,
25.17, 24.950001, 24.459999, 24.48, 23.98, 24.030001, 23.950001,
23.66, 24.01, 24.280001, 24.299999, 24.4, 24.57, 24.16, 24.559999,
24.15, 24.440001, 24.35, 24.860001, 24.969999, 24.889999, 23.700001,
23.34, 23.440001, 23.120001, 22.860001, 22.5, 22.57, 22.440001,
21.9, 21.959999, 21.75, 21.85, 21.549999, 21.469999, 21.620001,
21.700001, 21.969999, 22.1, 22.1, 21.82, 22, 22.08, 21.860001,
21.92, 21.99, 22.049999, 22.01, 22.049999, 22.5, 22.790001, 22.719999,
22.76, 22.67, 22.34), ADBE = c(30.844999, 30.030001, 29.865,
29.370001, 29.389999, 29.41, 29.059999, 29.49, 29.110001, 29.115,
29.190001, 28.940001, 29.035, 28.535, 27.695, 27.790001, 28.004999,
28.084999, 27.74, 28.450001, 28.950001, 31.145, 31.709999, 31.995001,
31.76, 31.85, 31.295, 31.34, 31.85, 31.735001, 32.455002, 32.299999,
31.535, 31.415001, 30.754999, 30.875, 30.695, 30.715, 30.875,
31.17, 31.174999, 31.174999, 31.885, 32.535, 32.474998, 32.255001,
32.654999, 32.209999, 32.669998, 32.27, 31.594999, 31.945, 33.904999,
33.349998, 33.18, 33.134998, 33.27, 33.555, 33.110001, 33.865002,
33.584999, 33.380001, 33.290001, 33.424999, 34.049999, 34.195,
33.630001, 33.400002, 33.450001, 32.535, 31.74, 30.33, 27.385,
29.049999, 28.625, 29.77, 30.145, 30.02, 29.559999, 29.225, 29.235001,
29.735001, 28.575001, 28.645, 28.775, 28.459999, 28.85, 29.334999,
28.76, 28.965, 28.889999, 29.049999, 29.955, 29.889999, 30.549999,
31.059999, 31.115, 31.360001, 32.419998, 32.759998), ADI = c(36.389999,
35.400002, 35.560001, 35.5, 35.549999, 35.41, 35.080002, 35.560001,
35.099998, 35.639999, 36.07, 35.139999, 34.650002, 34.470001,
34.049999, 34.299999, 34.880001, 34.830002, 34.740002, 35.889999,
35.990002, 36.009998, 35.240002, 37.52, 37.52, 38.02, 37.18,
36.830002, 38.049999, 37.599998, 37.32, 37.130001, 36.700001,
36.299999, 36.5, 36.59, 37.32, 37.5, 36.720001, 38, 37.709999,
36.93, 37.119999, 37.049999, 36.950001, 36.919998, 37.849998,
37.130001, 37.209999, 36.57, 35.919998, 36.02, 35.830002, 35.709999,
35.830002, 36.23, 35.799999, 35.66, 35.119999, 36.330002, 36.139999,
35.709999, 35.599998, 35.310001, 35.41, 36.09, 35.669998, 35.34,
34.93, 34.099998, 33.650002, 32.84, 33.360001, 33.849998, 33.419998,
34.349998, 33.799999, 33.700001, 33.52, 33.360001, 33.52, 34.110001,
33.849998, 33.669998, 34.560001, 34.619999, 34.619999, 34.549999,
34.130001, 34.060001, 34.310001, 35.490002, 36.419998, 36.700001,
36.860001, 36.889999, 37.080002, 36.529999, 36.849998, 36.290001
)), row.names = c(NA, 100L), class = "data.frame")
We can use shift from data.table which can take a vector of values for n
library(data.table)
setDT(Data)
out <- Data[, shift(.SD, n = 1:50), .SDcols = -1]
names(out) <- paste0(rep(names(Data)[-1], each = 50), "_", 1:50, "days")
Data[, names(out) := out][]

Prediction interval around a gam

I have a large data set that consists of thousands of measurements of length and weight. I have provided a subset of 500 observations here:
df <- structure(list(length_cm = c(24.7, 23.8, 21.9, 23.2, 23.5, 22.2,
20.5, 22.6, 24, 21.6, 22.4, 21.2, 20.6, 23.1, 21.4, 23.1, 23.5,
23, 21.8, 22.4, 23, 23.8, 24, 21, 23.4, 23.2, 21.6, 25.9, 22.1,
30.6, 22.1, 21.7, 23.2, 21.1, 23.8, 23.2, 27.2, 23.8, 21.6, 21.1,
21.7, 22.9, 23.3, 24.1, 22.7, 20.4, 22.5, 21.7, 23.2, 22.7, 20.6,
23.7, 24.6, 23.5, 26.3, 23.6, 22.2, 23.6, 21.4, 23.3, 24.7, 24.4,
21.8, 24.9, 22.2, 23.1, 25, 23.5, 22.5, 20.4, 23.9, 23.7, 24,
24.2, 22.9, 36.4, 30, 26, 28.5, 27, 35.7, 24.3, 28.6, 29.8, 18.7,
25.7, 34.7, 31.4, 23.4, 37.7, 26.7, 28.3, 30.8, 29.2, 27.2, 25.6,
39, 35.1, 41.2, 35.7, 29.9, 25.7, 24.6, 24, 24.9, 31, 29.9, 29.4,
25.4, 20.2, 27.8, 32.7, 23.4, 29.1, 26.3, 25.7, 26, 24.9, 26.3,
31.5, 30.1, 25.9, 28.8, 37.9, 38.4, 21.5, 20.5, 21.3, 21.3, 20.9,
20.8, 22.5, 22.4, 21.4, 16.8, 17.3, 22.7, 19.7, 21.2, 18.1, 23.5,
18.1, 22, 18.5, 18.4, 19.2, 19.4, 19.9, 20.5, 18.6, 22.6, 20.9,
20.7, 20.6, 20.6, 21.6, 23.7, 22.8, 22.9, 20.8, 21.3, 23.5, 21.1,
21.6, 24, 21, 23.3, 20.3, 22.4, 23.7, 24.6, 20.7, 23.1, 22.6,
22.7, 19.5, 23, 19.8, 21, 19.8, 19.8, 17.2, 21.8, 25.3, 21.3,
19.2, 22.1, 24.5, 23.2, 22.6, 19, 22, 17.5, 19.9, 24.4, 23.7,
19.9, 23, 20.5, 18.3, 23.2, 21.1, 20.4, 22.2, 19.7, 19.2, 24,
23.3, 23.3, 19, 21.5, 22, 19.1, 23.7, 19.9, 21.2, 23, 27.3, 20.7,
22, 19.3, 24.9, 18.2, 20, 19.3, 25, 18, 21.8, 23.4, 23.9, 25.2,
18.5, 22.2, 24.6, 22, 20.4, 20.7, 21.7, 19.1, 23.1, 21.5, 21.2,
20.6, 22.3, 22.8, 21.3, 21.6, 22, 23, 24.2, 21.3, 19.7, 18.8,
20.9, 20.3, 22.3, 18.9, 19.9, 20.2, 23.9, 19.7, 19.5, 17.6, 23.1,
20.4, 20, 19.7, 20.3, 21.2, 23.9, 24, 25.6, 23.9, 23.5, 20.5,
30.8, 32.8, 28.4, 28.7, 28, 28.9, 29.8, 31, 31.7, 28.6, 28.7,
28.7, 26.7, 24.6, 30, 36.5, 26.5, 32, 29.6, 30.7, 27.7, 24.1,
29.8, 28.8, 26, 22.4, 24, 24.8, 22.7, 22.7, 23.8, 25.3, 32.3,
26.8, 22.1, 24.2, 23.8, 25.3, 24.1, 22.6, 22.9, 24.4, 26.7, 24.4,
24.7, 25, 23.7, 24.3, 22.3, 22.7, 20, 22.5, 24.5, 25.1, 24, 22,
20, 21.9, 18.3, 19.9, 19.4, 23.5, 20.2, 20, 17.8, 20.5, 23.2,
18.5, 21.2, 18.2, 19.1, 22.1, 18.3, 21.6, 19.5, 22.7, 23.6, 24.6,
23.2, 24.4, 19.1, 22.8, 23, 18.8, 22.6, 19, 21.7, 20.8, 23.7,
20.8, 20, 23.2, 22, 21.4, 20.6, 22.6, 23.8, 21, 26.4, 24.5, 32.6,
36.1, 36, 31, 33.1, 31.3, 34.2, 41.9, 35.4, 33.9, 31.9, 29.3,
34.2, 29.9, 36.4, 38.5, 30.7, 40.2, 34.1, 29.7, 37.8, 37.8, 35.3,
39, 39.5, 34.1, 30.5, 33.3, 33.2, 36, 31.6, 35, 34.2, 33.1, 31.5,
33.5, 33.7, 39, 33.2, 35, 34.1, 32.6, 36.2, 34.4, 31.7, 32, 37.5,
31.5, 32.7, 31.7, 35.7, 32.4, 28.5, 33.7, 33.9, 33.6, 34, 32,
29.8, 35, 36, 31.7, 32.5, 32, 31, 29.5, 33.4, 32.5, 26.5, 28,
35.3, 26, 26.5, 38.9, 32.7, 36.4, 35.7, 27.7, 25.8, 25.3, 30.1,
36, 33.4, 37, 33.6, 31.7, 29.7, 35.9, 28.5, 33.1, 33.9, 29, 36.5,
35.5, 29.2, 37.3, 40.3, 35.7, 32.6, 38.8, 40, 38.9, 39, 33.3,
33.5, 34.3, 38.8, 34.4, 36, 35.9, 35.1, 30.7, 38.1, 31.3, 35,
36.3, 32.4, 32.3, 35.5, 36.4, 36, 40.8, 34.2, 30.1, 35.6), wt_kg = c(0.165,
0.1412, 0.1043, 0.1225, 0.1247, 0.1099, 0.087, 0.1176, 0.1431,
0.1041, 0.1213, 0.0937, 0.0856, 0.1255, 0.1099, 0.124, 0.1361,
0.1384, 0.1021, 0.1113, 0.12, 0.1513, 0.1448, 0.0978, 0.138,
0.1232, 0.0942, 0.1881, 0.1038, 0.3498, 0.1122, 0.094, 0.1268,
0.1009, 0.1358, 0.12, 0.2388, 0.1456, 0.0982, 0.0903, 0.1005,
0.1252, 0.1138, 0.1476, 0.1326, 0.0849, 0.108, 0.0996, 0.1229,
0.1279, 0.0874, 0.1492, 0.1416, 0.1187, 0.193, 0.1383, 0.1125,
0.1449, 0.0941, 0.1265, 0.1823, 0.1455, 0.0948, 0.1603, 0.1119,
0.1124, 0.1641, 0.1259, 0.116, 0.086, 0.1361, 0.1284, 0.1403,
0.1461, 0.1195, 0.5985, 0.3099, 0.1829, 0.2688, 0.2244, 0.6214,
0.1554, 0.2475, 0.2976, 0.0683, 0.1731, 0.4751, 0.356, 0.1388,
0.5939, 0.2122, 0.2784, 0.3689, 0.3127, 0.2284, 0.1775, 0.6697,
0.5998, 0.8374, 0.5647, 0.3187, 0.1704, 0.1619, 0.1413, 0.1621,
0.3577, 0.319, 0.2846, 0.1815, 0.0776, 0.2567, 0.4483, 0.1337,
0.2798, 0.202, 0.1847, 0.1758, 0.1659, 0.1828, 0.3669, 0.3211,
0.1863, 0.2559, 0.6901, 0.6483, 0.0922, 0.088, 0.099, 0.0836,
0.094, 0.099, 0.1157, 0.1138, 0.1046, 0.0495, 0.0513, 0.119,
0.0761, 0.0936, 0.0564, 0.1438, 0.0636, 0.1134, 0.0641, 0.0594,
0.0713, 0.0733, 0.0804, 0.0853, 0.0689, 0.118, 0.0892, 0.0875,
0.0837, 0.0807, 0.1065, 0.1385, 0.1163, 0.1305, 0.0923, 0.0974,
0.1176, 0.0848, 0.1059, 0.157, 0.0932, 0.1127, 0.0779, 0.1048,
0.1327, 0.1688, 0.1096, 0.1304, 0.1173, 0.115, 0.0742, 0.129,
0.0629, 0.0992, 0.0758, 0.0722, 0.0535, 0.0958, 0.1721, 0.1017,
0.0766, 0.1099, 0.152, 0.128, 0.1185, 0.065, 0.1176, 0.0565,
0.0866, 0.163, 0.12, 0.0825, 0.1149, 0.0839, 0.0587, 0.1335,
0.0968, 0.0901, 0.1073, 0.0802, 0.0744, 0.1493, 0.1384, 0.1128,
0.0738, 0.1146, 0.1108, 0.08, 0.1285, 0.0829, 0.1116, 0.1368,
0.2348, 0.0995, 0.0989, 0.0748, 0.1484, 0.0629, 0.0823, 0.075,
0.1768, 0.0607, 0.1142, 0.1289, 0.1506, 0.1742, 0.0626, 0.1187,
0.1509, 0.1144, 0.0928, 0.0946, 0.099, 0.0717, 0.1318, 0.1025,
0.093, 0.0972, 0.1325, 0.1209, 0.0943, 0.1006, 0.1073, 0.1336,
0.1439, 0.1066, 0.0765, 0.0673, 0.1082, 0.0923, 0.1139, 0.068,
0.0758, 0.0868, 0.1499, 0.0779, 0.0794, 0.0575, 0.1392, 0.0915,
0.0845, 0.086, 0.084, 0.1049, 0.1486, 0.1573, 0.177, 0.1319,
0.13, 0.0872, 0.388, 0.4751, 0.2898, 0.2931, 0.2663, 0.2838,
0.3494, 0.3675, 0.4342, 0.2907, 0.3072, 0.2815, 0.2761, 0.1945,
0.3512, 0.615, 0.2195, 0.4818, 0.3684, 0.4056, 0.2841, 0.1617,
0.3425, 0.288, 0.1962, 0.1285, 0.1553, 0.1708, 0.1332, 0.1167,
0.1491, 0.2028, 0.1267, 0.2406, 0.1257, 0.1499, 0.1559, 0.1895,
0.1508, 0.1111, 0.1274, 0.1675, 0.2324, 0.1732, 0.1491, 0.1568,
0.1465, 0.1548, 0.1245, 0.1399, 0.0855, 0.1151, 0.1612, 0.1693,
0.1493, 0.1208, 0.088, 0.1106, 0.0654, 0.0827, 0.0794, 0.1331,
0.0834, 0.0837, 0.0619, 0.092, 0.1397, 0.071, 0.1035, 0.0676,
0.0729, 0.0906, 0.064, 0.0985, 0.0823, 0.1206, 0.155, 0.1438,
0.1357, 0.1695, 0.0834, 0.1359, 0.1289, 0.0764, 0.1249, 0.0775,
0.1139, 0.104, 0.1566, 0.1069, 0.0869, 0.1376, 0.1223, 0.105,
0.0996, 0.1356, 0.1335, 0.0951, 0.2162, 0.1744, 0.4547, 0.5789,
0.5555, 0.3899, 0.5037, 0.4281, 0.486, 1.0209, 0.5855, 0.5312,
0.488, 0.3133, 0.5054, 0.3724, 0.59, 0.8119, 0.3811, 0.797, 0.5139,
0.348, 0.7722, 0.743, 0.548, 0.8791, 0.9054, 0.5392, 0.4333,
0.5314, 0.4976, 0.5953, 0.4288, 0.5179, 0.5634, 0.5331, 0.4371,
0.5709, 0.5065, 0.8047, 0.5368, 0.5657, 0.5816, 0.4763, 0.5907,
0.533, 0.4384, 0.4949, 0.7277, 0.4445, 0.4894, 0.4655, 0.5384,
0.5106, 0.3343, 0.5186, 0.5262, 0.5311, 0.495, 0.4691, 0.3465,
0.5558, 0.5975, 0.4768, 0.4802, 0.4573, 0.4037, 0.3316, 0.5152,
0.4673, 0.2356, 0.2905, 0.5672, 0.2097, 0.2216, 0.7384, 0.4089,
0.6159, 0.5219, 0.2866, 0.2443, 0.2071, 0.3658, 0.5861, 0.5021,
0.6953, 0.5053, 0.3978, 0.3853, 0.6207, 0.2944, 0.507, 0.4412,
0.3424, 0.6597, 0.5892, 0.3295, 0.6505, 0.9334, 0.6674, 0.4919,
0.8392, 0.9123, 0.813, 0.8223, 0.5801, 0.5745, 0.5148, 0.8514,
0.5563, 0.6417, 0.6445, 0.5701, 0.4186, 0.8303, 0.46, 0.6041,
0.6537, 0.5221, 0.4782, 0.5657, 0.6499, 0.6667, 0.9074, 0.555,
0.6696, 0.6083)), .Names = c("length_cm", "wt_kg"), row.names = c(NA,
500L), class = "data.frame")
The relationship between length and weight is not linear. Unfortunately I could not include the whole data set here but when the whole data set is used a gam provides the best fit, unlike in this subset where loess is suggested.
I would like to focus on gam since an answer that works for the whole data set is what I am after.
It is obvious, even in the subset provided, that my data has some outliers, in the example data set (df) there are at least two obvious outliers.
library(ggplot2)
ggplot(df, aes(x=wt_kg, y=length_cm))+
geom_point()+
stat_smooth(method = "gam", formula = y ~ s(x), size = 1)
Moving forward with a gam approach I would like to generate the prediction interval so that I can identify which points fall in and out of say the 95% prediction interval.
This is extremely simple to do with a linear regression using predict:
l_model <- lm(wt_kg ~ length_cm, data=df)
df <- cbind(df, predict(l_model, interval = "prediction"))
Then simply plotting the upper and lower bounds of the interval
ggplot(df, aes(y=wt_kg, x=length_cm)) +
geom_ribbon(aes(ymin = lwr, ymax = upr),
fill = "blue", alpha = 0.2) +
geom_point()
But I can't seem to find a similar approach that works when using gam instead of lm. I have tried predict.gam from the mgcv package with no success.
library(mgcv)
df_model <- gam(wt_kg ~ length_cm, data=df)
gam_pred <- cbind(df, mgcv::predict.gam(df_model))
I don't get any errors when running this however what i get back is a single col of data which I am unsure how to interpret. Any help would be much appreciated.
I think that part of your code is:
require(broom)
require(gam)
mod <- gam(wt_kg ~ length_cm, data=df)
pred <- augment(mod)
But i dont understand the second ggplot2. "Pred" has the fitted value and others features about your regression, mainly .resid

Plotting Visual Tables

I am trying to plot a table in R and I am trying to format it so that is visually attractive for presentations.
I am trying to make it look like:
2000-01-01 2000-03-01 2000-06-01 ...
Revenue 3.5 4.6 7.9
Cost 2.3 2.7 5.6
And have the boxes that encapsulate the words, numbers, be the right size such that the column header dates and row labels are not squished . How do I do that??
plot.table(t(z))
z <- structure(c(68.2, 66.1, 64.7, 31.8, 30.9, 25.4, 36.1, 38.3, 38.3,
42.2, 43.3, 40.2, 41.9, 47.7, 50.8, 46.7, 48.2, 55.2, 58.2, 55.3,
58.2, 62.5, 62.2, 59.5, 59.3, 59.4, 58.7, 68.2, 64.9, 94.7, 75.7,
72, 73.5, 77.9, 83.8, 82.6, 83.8, 88.8, 91.5, 91.8, 92.6, 103.4,
100.5, 110.8, 105.4, 113.5, 110, 110.2, 118.9, 125.5, 122.5,
121.4, 122.6, 122.6, 127.4, 133.8, 131.5, 137.6, 142.7, 133,
39.8, 46.3, 38.2, 16.6, 14.5, 17.4, 17.7, 19.1, 19, 21.2, 20.9,
21.2, 19.9, 23.5, 25.2, 25.3, 23.3, 27.9, 29.3, 28.1, 29.6, 32.4,
31.3, 31.1, 31.3, 31.3, 31.5, 36, 36.9, 40.1, 39, 37.4, 38.1,
41.1, 43.1, 42.3, 42.4, 45.3, 46.4, 47.3, 48.2, 54.1, 51.6, 57.8,
54.3, 59.7, 56.1, 56.1, 60.9, 65.8, 62.8, 62.8, 62.1, 63.8, 65.5,
68.2, 66.7, 72.1, 75.1, 71.6), .Dim = c(60L, 2L), .Dimnames = list(
c("11323", "11413", "11504", "11596", "11688", "11778", "11869",
"11961", "12053", "12143", "12234", "12326", "12418", "12509",
"12600", "12692", "12784", "12874", "12965", "13057", "13149",
"13239", "13330", "13422", "13514", "13604", "13695", "13787",
"13879", "13970", "14061", "14153", "14245", "14335", "14426",
"14518", "14610", "14700", "14791", "14883", "14975", "15065",
"15156", "15248", "15340", "15431", "15522", "15614", "15706",
"15796", "15887", "15979", "16071", "16161", "16252", "16344",
"16436", "16526", "16617", "16709"), c("revenue", "cost")))

Resources