Sample, replicate and histogram in R

I want to choose 100 houses at random from my dataset and find the mean of their total price. Then I want to repeat this 100 times, calculating the mean price each time, and plot all the mean values in a histogram. This is my code (rome is the house dataset):
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- mean(run$PRICE)
c <- replicate(100, dun)
I also tried a for loop, which I'm pretty sure I need here, but there are mistakes in my code:
d <- for(i in 1:100){
run <- rome[sample(1:nrow(rome), 100, replace=FALSE),]
dun <- mean(run$PRICE)
c <- replicate(100, dun)
}
And finally hist(d), which doesn't run because of the mistakes. Can you help me?
The data (price values):
good_struct <-
  c(47, 113, 165, 104.3, 62.5, 70, 127.5, 64.5, 145, 63.5, 58.9, 65, 48,
    3.5, 12.8, 17.5, 36, 41.9, 53.5, 24.5, 24.5, 55.5, 60, 51, 46, 46, 44,
    54.9, 42.5, 44, 44.9, 37.9, 33, 43.9, 49.6, 52, 37.5, 50, 35.9, 42.9,
    107, 112, 44.9, 55, 102, 35.5, 62.9, 39, 110, 8, 62, 85.9, 57, 110,
    67.7, 89.5, 70, 74, 13, 48, 24, 53.5, 34.5, 53, 87.5, 33.5, 24, 9.6,
    30, 41, 30, 38.9, 20.7, 49.9, 18.6, 39, 34, 16, 18.9, 15.2, 41.5, 53,
    22, 24.9, 6.7, 32.5, 30, 59, 29.5, 26, 16.5, 39, 48.9, 33.5, 46, 54,
    57.9, 37.9, 32, 31, 34, 29, 32.5, 51.9, 31, 41.8, 48, 28, 35, 46.5,
    51.9, 35.4, 16, 35, 35, 36.5, 35.9, 45, 40, 35, 38, 37, 23, 25.5,
    39.5, 21.5, 9, 67.5, 13.4, 12.5, 28.5, 23, 33.5, 9, 11, 30.9, 31.65,
    33, 33.4, 47, 40, 46, 45.5, 57, 29.9, 30, 34, 51, 64.5, 57.5, 85.5,
    61, 38, 56.5, 60.4, 51.5, 54, 69, 56, 27.9, 37.5, 32.9, 22, 29.9,
    39.9, 32.6, 38.5, 21.5, 25.9, 27.5, 22.9, 31.5, 8.5, 5.5, 33, 57, 47,
    43.5, 43.9, 68.5, 44.25, 61, 40, 44.5, 57, 35, 35.1, 64.5, 40, 42.6,
    50, 58, 58, 55, 43, 54, 39, 45, 42, 38.9, 43.215, 26.5, 30, 29.5)

Since replicate is a wrapper around sapply, consider adjusting the call by passing in an expression that subsets the price vector and then calls mean:
random_mean_prices <- replicate(
  100, mean(rome$PRICE[sample(1:nrow(rome), 100, replace = FALSE)])
)
hist(random_mean_prices)
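Equivalently, since only the PRICE column is used, you could sample that vector directly; a small sketch along the same lines (not from the original answer):
# Sample 100 prices without replacement straight from the PRICE column,
# take their mean, and repeat 100 times
random_mean_prices <- replicate(100, mean(sample(rome$PRICE, 100)))
hist(random_mean_prices)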

Perhaps something like this?
rome <- data.frame(PRICE = rnorm(1e6, 3e5, 5e4),
                   ID = 1:1e6)
dun <- NULL
for (i in 1:100) {
  run <- rome[sample(1:nrow(rome), 100, replace = FALSE), ]
  dun <- c(dun, mean(run$PRICE))
}
hist(dun)
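Growing dun with c() is perfectly fine for 100 iterations; if you prefer, a slightly more idiomatic variant (just a sketch) preallocates the result vector:
dun <- numeric(100)   # one slot per repetition
for (i in 1:100) {
  run <- rome[sample(1:nrow(rome), 100, replace = FALSE), ]
  dun[i] <- mean(run$PRICE)
}
hist(dun)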

Related

R/openair: Error in seq.int(0, to0 - from, by) : 'to' must be a finite number

I am trying to use the function summaryPlot from the openair package in R. But every time I try to use it with the following data matrix (you only need to run this code to recreate the data):
structure(list(Fecha = structure(c(1577840400, 1577844000, 1577847600,
1577851200, 1577854800, 1577858400, 1577862000, 1577865600, 1577869200,
1577872800, 1577876400, 1577880000, 1577883600, 1577887200, 1577890800,
1577894400, 1577898000, 1577901600, 1577905200, 1577908800, 1577912400,
1577916000, 1577919600, 1577923200, 1577926800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), PM10_CDAR = c(11.4, 8.3, 13.3, 16,
39.5, 35.4, 31, 48.7, 41, 34, 23.3, 16.5, 21.8, 15.7, 17.8, 12.7,
12.8, 16, 11.3, 7.9, 8.1, 10, 10.4, 7.7, 6.1), PM10_KEN = c(49.7,
72.4, 34.5, 50.3, 65.2, 59, 25.5, 19.6, 17.4, 14.3, 48.2, 34.8,
25.3, 56.7, 26, 45.6, 29, 30.5, 24.1, 22, 26.9, 22.2, 17.3, 19.1,
15.5), PM10_LAF = c(28.8, 69, 72.3, 35.1, 82, 44, 69, 73, 46,
43, 29.9, 25.1, 21.4, 15.8, 11.7, 16, 15, 12, 9, 10.8, 10.1,
11.9, 12.9, 12.4, 11.8), PM10_TUN = c(45, 57, 93, 69, 73, 60,
45, 69, 61, 46, 28, 20, 33, 54, 44, 27, 39, 37, 36, 41, 30, 29,
18, 4, 7), PM2.5_CDAR = c(9, 8, 10, 16, 34, 30, 33, 42, 33, 34,
6, 10, 9, 9, 15, 10, 9, 7, 9, 5, 5, 10, 6, 4, 2), PM2.5_KEN = c(49,
81, 110, 83, 63, 59, 79, 68, 84, 76, 48, 19, 22, 34, 36, 33,
29, 19, 13, 22, 3, 16, 16, 6, 9), PM2.5_LAF = c(35, 65, 53, 30,
60, 62, 64, 67, 36, 43, 21, 16, 11, 11, 10, 15, 15, 12, 9, 6,
6, 10, 10, 9, 10), PM2.5_TUN = c(39, 42, 66, 54, 52, 39, 33,
40, 42, 33, 21, 11, 13, 27, 22, 17, 21, 15, 17, 15, 13, 10, 6,
4, 2)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-25L))
the following error appears:
> summaryPlot(date.zoo_2, pollutant = "Kennedy_PM10")
Error in seq.int(0, to0 - from, by) : 'to' must be a finite number
In addition: Warning messages:
1: In min.default(numeric(0), na.rm = TRUE) :
no non-missing arguments to min; returning Inf
2: In max.default(numeric(0), na.rm = TRUE) :
no non-missing arguments to max; returning -Inf
I tried everything, including converting the date column into a date with idx <- as.POSIXct(datos_meterologicos$Fecha); datos_meterologicos$Fecha <- read.zoo(datos_meterologicos, FUN=as.POSIXct, format = "%Y/%m/%d %H:%M", tz="UTC"). And frankly, I don't know what to do because the same error keeps appearing.
The whole code is below:
date.matrix_2 <- as.data.frame(datos_meterologicos[,-1])
idx_2 <- as.POSIXct(datos_meterologicos$Fecha)
date.xts_2 <- as.xts(date.matrix_2,order.by=idx_2)
date.zoo_2 <- as.zoo(date.xts_2)
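For what it is worth, one possible direction (a hedged sketch, assuming the usual openair convention that its functions want a plain data frame with a POSIXct column literally named date, and noting that "Kennedy_PM10" does not match any column name in the dput() output above):
library(openair)

# Work from a plain data frame, not a zoo/xts object
datos <- as.data.frame(datos_meterologicos)
names(datos)[names(datos) == "Fecha"] <- "date"   # openair looks for "date"

# Summarise only columns that actually exist, e.g. the Kennedy series
summaryPlot(datos[, c("date", "PM10_KEN", "PM2.5_KEN")])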

Within a simple linear regression in R, how do I rescale age to estimate its beta-coefficient per year/5 years/10 years?

This might be a bit of a dumb question, but roaming around SO and other websites I can't find a straightforward answer: I've got data on the relationship between age and a continuous outcome:
library(dplyr)
library(tidyverse)
library(magrittr)
mydata <-
structure(list(ID = c(104, 157, 52, 152, 114, 221, 320, 125,
75, 171, 80, 76, 258, 82, 142, 203, 37, 92, 202, 58, 194, 38,
4, 137, 25, 87, 40, 117, 21, 255, 277, 315, 96, 134, 185, 94,
3, 153, 172, 65, 279, 209, 60, 13, 154, 160, 24, 29, 159, 213,
127, 74, 48, 126, 184, 132, 61, 141, 27, 49, 8, 39, 164, 162,
34, 205, 179, 119, 77, 135, 138, 165, 103, 253, 14, 20, 310,
84, 30, 273, 22, 105, 262, 116, 86, 83, 145, 31, 95, 51, 81,
271, 36, 50, 189, 2, 115, 7, 197, 54), age = c(67.1, 70.7, 53,
61.7, 66.1, 57.7, 54.1, 67.2, 60.9, 55.8, 40.7, 57.6, 64.1, 70.7,
47.5, 46.3, 66.7, 55, 63.3, 68.2, 61.2, 60.5, 52, 65.3, 48.9,
56.9, 62.7, 75.2, 61.4, 57.9, 53.6, 58.1, 51, 67.3, 63.9, 57,
43.2, 64.7, 62.8, 56.3, 51.7, 39.4, 45.2, 57.8, 55.7, 69.6, 61.5,
50.1, 73.7, 55.5, 65.2, 54.6, 49, 35.2, 52.9, 46.3, 55, 52.5,
54.2, 61, 57.4, 56.5, 53.6, 47.7, 64.2, 53.4, 60.9, 58.2, 60.7,
50.3, 48.3, 74.7, 52.1, 59.9, 52.4, 70.8, 61.2, 66.5, 55.4, 57.5,
59.2, 60.1, 52.3, 60.2, 54.8, 36.3, 61.5, 48.6, 56, 62, 64.8,
40.4, 68.3, 60, 69.1, 56.6, 45.3, 58.5, 52.3, 52), continuous_outcome = c(3636.6,
1128.2, 2007.5, 802.9, 332.3, 2636.1, 169.5, 67.9, 3261.8, 1920.3,
155.2, 1677.2, 198.2, 11189.7, 560.9, 633.1, 196.1, 13.9, 100.7,
7594.5, 1039.8, 83.9, 2646.8, 284.6, 306, 1135.6, 1883.1, 5681.4,
1706.2, 2241.1, 97.7, 1106.8, 1107.1, 290.8, 2123.4, 267, 115.3,
138.5, 152.7, 1338.9, 6709.8, 561.7, 1931.7, 3112.4, 1876.3,
3795.9, 5706.7, 7.4, 1324.9, 4095.4, 205.4, 1886, 177.3, 304.4,
1319.1, 415.9, 537.2, 3141.1, 740, 1976.7, 624.8, 983.1, 1163.5,
1432.6, 3730.4, 2023.4, 498.2, 652.5, 982.7, 1345.3, 138.4, 1505.1,
3528.1, 11.9, 884.5, 10661.6, 1911.4, 2800.8, 81.5, 396.4, 409.1,
417.3, 186, 1892.4, 1689.7, 0, 210.1, 210.5, 3484.5, 3196.8,
57.2, 20.2, 947, 540, 1603.1, 1571.8, 9.1, 149.2, 122, 63.2)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
As you can see in the tibble, age is a continuous variable measured to a precision of 1 decimal place:
head(mydata)
# A tibble: 6 x 3
ID age continuous_outcome
<dbl> <dbl> <dbl>
1 104 67.1 3637.
2 157 70.7 1128.
3 52 53 2008.
4 152 61.7 803.
5 114 66.1 332.
6 221 57.7 2636.
When I fit a simple linear regression (for now assuming no assumptions are violated) I get the following beta-coefficient:
fit <-
lm(formula=continuous_outcome ~ age,
data=mydata)
fit
Call:
lm(formula = continuous_outcome ~ age, data = mydata)
Coefficients:
(Intercept) age
-3400.12 86.06
The beta-coefficient for age is 86.06. Does this mean that, since age is measured to 1 decimal place, for every 0.1-year increase my outcome increases by 86.06? If so, how do I rescale age so that I am measuring the effect of age per, for example, 5 years or 10 years?
Thanks in advance!
The beta coefficient shows the amount that the dependent variable (DV, in this case continuous_outcome) will increase for every one-unit increase in your independent variable (IV, in this case age in years).
If you want to show the relationship per 1/10th of a year, multiply your age column by 10 before fitting the model, or divide the beta coefficient by 10.
For your specific requests, since the beta coefficient is 86.06, you can multiply it by the number of years to get the increase in the continuous variable. So:
1 year increase = +86.06
5 year increase = +430.3 (86.06 * 5)
10 year increase = +860.6 (86.06 * 10)
To answer the last question (the estimate for the effect of age per 5 years): that would be 430.3, which is 86.06 * 5. So for every 5 years that a person's age increases, the continuous_outcome increases by 430.3 on average.
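If you would rather have the model report the rescaled effect directly, you can transform age inside the formula; a minimal sketch using the mydata tibble above:
# Coefficient is the change in continuous_outcome per 5 years of age
fit_5yr <- lm(continuous_outcome ~ I(age / 5), data = mydata)
coef(fit_5yr)    # the I(age/5) term is roughly 86.06 * 5 = 430.3

# Coefficient per 10 years of age
fit_10yr <- lm(continuous_outcome ~ I(age / 10), data = mydata)
coef(fit_10yr)   # the I(age/10) term is roughly 86.06 * 10 = 860.6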

How to find and plot correlated variables in R?

I'm reviewing factors related to cancer and hoping to find out how they're related to one another. After I read my .xlsx file into R, I'm at a loss as to how to use that data to find the z-scores for each measurement and then use that matrix Z of z-scores of X to compute the covariance matrix A using cov(Z). How do I plot and analyze the entries of A (which tell me how correlated the measurements in X are)?
This is an example of the plot I'm trying to achieve with my factors (UrbanPop, Rape, Assault, and Murder are factors of that plot -- you can ignore the states plotted).
Thank you!
The factors I'm reviewing are Age, BMI, Glucose, Insulin, HOMA, Leptin, Adiponectin, Resistin, MCP.1
This is my data -- I used dput(mydata)
structure(list(Age = c(48, 83, 82, 68, 86, 49, 89, 76, 73, 75,
34, 29, 25, 24, 38, 44, 47, 61, 64, 32, 36, 34, 29, 35, 54, 45,
50, 66, 35, 36, 66, 53, 28, 43, 51, 67, 66, 69, 60, 77, 76, 76,
75, 69, 71, 66, 75, 78, 69, 85, 76, 77, 45, 45, 49, 34, 42, 68,
51, 62, 38, 69, 49, 51, 59, 45, 54, 64, 46, 44, 45, 44, 51, 72,
46, 43, 55, 43, 86, 41, 59, 81, 48, 71, 42, 65, 48, 85, 48, 58,
40, 82, 52, 49, 60, 49, 44, 40, 71, 69, 74, 66, 65, 72, 57, 73,
45, 46, 68, 75, 54, 45, 62, 65, 72, 86), BMI = c(23.5, 20.69049,
23.12467, 21.36752, 21.11111, 22.85446, 22.7, 23.8, 22, 23, 21.47,
23.01, 22.86, 18.67, 23.34, 20.76, 22.03, 32.03896, 34.52972,
36.51264, 28.57668, 31.97501, 32.27079, 30.27682, 30.48316, 37.03561,
38.57876, 31.44654, 35.25076, 34.17489, 36.21228, 36.79017, 35.85581,
34.42217, 27.68878, 29.60677, 31.23859, 35.0927, 26.34929, 35.58793,
29.21841, 27.2, 27.3, 32.5, 30.3, 27.7, 25.7, 25.3, 29.4, 26.6,
27.1, 25.9, 21.30395, 20.83, 20.95661, 24.24242, 21.35991, 21.08281,
19.13265, 22.65625, 22.49964, 21.51386, 21.36752, 22.89282, 22.83288,
23.1405, 24.21875, 22.22222, 20.83, 19.56, 20.26, 24.74, 18.37,
23.62, 22.21, 26.5625, 31.97501, 31.25, 26.66667, 26.67276, 28.67263,
31.64037, 32.46191, 25.5102, 29.29688, 29.66655, 28.125, 27.68878,
31.25, 29.15452, 30.83653, 31.21748, 30.80125, 32.46191, 31.23141,
29.77778, 27.88762, 27.63605, 27.91552, 28.44444, 28.65014, 26.5625,
30.91558, 29.13632, 34.83815, 37.10938, 29.38476, 33.18, 35.56,
30.48, 36.05, 26.85, 26.84, 32.05, 25.59, 27.18), Glucose = c(70,
92, 91, 77, 92, 92, 77, 118, 97, 83, 78, 82, 82, 88, 75, 86,
84, 85, 95, 87, 86, 87, 84, 84, 90, 83, 106, 90, 90, 80, 101,
101, 87, 89, 77, 79, 82, 101, 103, 76, 83, 94, 85, 93, 102, 90,
94, 60, 89, 96, 110, 85, 102, 74, 94, 92, 93, 102, 93, 92, 95,
112, 78, 103, 98, 116, 86, 98, 88, 114, 92, 106, 105, 105, 86,
101, 92, 103, 201, 97, 77, 100, 99, 112, 98, 85, 90, 196, 199,
139, 128, 100, 87, 134, 131, 70, 99, 103, 104, 108, 88, 89, 97,
83, 95, 134, 90, 92, 131, 152, 119, 92, 100, 97, 82, 138), Insulin = c(2.707,
3.115, 4.498, 3.226, 3.549, 3.226, 4.69, 6.47, 3.35, 4.952, 3.469,
5.663, 4.09, 6.107, 5.782, 7.553, 2.869, 18.077, 4.427, 14.026,
4.345, 4.53, 5.81, 4.376, 5.537, 6.76, 6.703, 9.245, 6.817, 6.59,
15.533, 10.175, 8.576, 23.194, 3.855, 5.819, 4.181, 5.646, 5.138,
3.881, 5.376, 14.07, 5.197, 5.43, 8.34, 6.042, 8.079, 3.508,
10.704, 4.462, 26.211, 4.58, 13.852, 4.56, 12.305, 21.699, 2.999,
6.2, 4.364, 3.482, 5.261, 6.683, 2.64, 2.74, 6.862, 4.902, 3.73,
5.7, 3.42, 15.89, 3.44, 58.46, 6.03, 4.42, 36.94, 10.555, 16.635,
4.328, 41.611, 22.033, 3.188, 9.669, 28.677, 10.395, 4.172, 14.649,
2.54, 51.814, 12.162, 16.582, 41.894, 18.077, 30.212, 24.887,
30.13, 8.396, 9.208, 2.432, 18.2, 8.808, 3.012, 6.524, 10.491,
10.949, 12.548, 5.636, 4.713, 5.75, 8.15, 7.01, 11.91, 3.33,
4.53, 5.73, 2.82, 19.91), HOMA = c(0.467409, 0.706897, 1.009651,
0.612725, 0.805386, 0.732087, 0.890787, 1.883201, 0.801543, 1.013839,
0.667436, 1.145436, 0.827271, 1.33, 1.06967, 1.6, 0.59, 3.790144,
1.037394, 3.00998, 0.921719, 0.972138, 1.203832, 0.906707, 1.229214,
1.383997, 1.752611, 2.05239, 1.513374, 1.300427, 3.869788, 2.534932,
1.84041, 5.091856, 0.732193, 1.133929, 0.845677, 1.406607, 1.305395,
0.727558, 1.100646, 3.262364, 1.089638, 1.245642, 2.098344, 1.341324,
1.873251, 0.519184, 2.349885, 1.056602, 7.111918, 0.960273, 3.485163,
0.832352, 2.853119, 4.924226, 0.687971, 1.55992, 1.001102, 0.790182,
1.232828, 1.84629, 0.507936, 0.696143, 1.658774, 1.402626, 0.791257,
1.37788, 0.742368, 4.468268, 0.780651, 15.28534, 1.56177, 1.14478,
7.836205, 2.629602, 3.775036, 1.099601, 20.63073, 5.271762, 0.605507,
2.38502, 7.002923, 2.871792, 1.008511, 3.071407, 0.56388, 25.05034,
5.96992, 5.685415, 13.22733, 4.458993, 6.483495, 8.225983, 9.736007,
1.449709, 2.248594, 0.61789, 4.668907, 2.346451, 0.653805, 1.432235,
2.510147, 2.241625, 2.940415, 1.862886, 1.046286, 1.304867, 2.633537,
2.628283, 3.495982, 0.755688, 1.1174, 1.370998, 0.570392, 6.777364
), Leptin = c(8.8071, 8.8438, 17.9393, 9.8827, 6.6994, 6.8317,
6.964, 4.311, 4.47, 17.127, 14.57, 35.59, 20.45, 8.88, 15.26,
14.09, 26.65, 30.7729, 21.2117, 49.3727, 15.1248, 28.7502, 45.6196,
39.2134, 12.331, 39.9802, 46.6401, 45.9624, 50.6094, 10.2809,
74.7069, 27.1841, 68.5102, 31.2128, 20.092, 21.9033, 16.2247,
83.4821, 24.2998, 21.7863, 28.562, 35.891, 10.39, 15.145, 56.502,
24.846, 65.926, 6.633, 45.272, 7.85, 21.778, 13.74, 7.6476, 7.7529,
11.2406, 16.7353, 19.0826, 9.6994, 11.0816, 9.8648, 8.438, 32.58,
6.3339, 8.0163, 14.9037, 17.9973, 8.6874, 12.1905, 12.87, 13.08,
7.65, 18.16, 9.62, 21.78, 10.16, 9.8, 37.2234, 25.7816, 47.647,
44.7059, 17.022, 38.8066, 46.076, 19.0653, 12.2617, 26.5166,
15.5325, 70.8824, 18.1314, 22.8884, 31.0385, 31.6453, 29.2739,
42.3914, 37.843, 51.3387, 12.6757, 14.3224, 53.4997, 14.7485,
31.1233, 14.9084, 44.0217, 26.8081, 33.1612, 41.4064, 23.8479,
18.69, 17.87, 50.53, 89.27, 54.68, 12.45, 61.48, 24.96, 90.28
), Adiponectin = c(9.7024, 5.429285, 22.43204, 7.16956, 4.81924,
13.67975, 5.589865, 13.25132, 10.35873, 11.57899, 13.11, 26.72,
23.67, 36.06, 17.95, 20.32, 38.04, 7.780255, 5.46262, 5.1, 8.6,
7.64276, 6.209635, 9.048185, 9.73138, 4.617125, 4.667645, 10.35526,
6.966895, 5.065915, 7.53955, 20.03, 4.7942, 8.300955, 3.19209,
2.19428, 4.267105, 6.796985, 2.19428, 8.12555, 7.36996, 9.34663,
9.000805, 11.78796, 8.13, 7.652055, 3.74122, 10.5673, 8.2863,
7.9317, 4.935635, 9.75326, 21.05663, 8.237405, 8.412175, 21.82375,
8.462915, 8.574655, 5.80762, 11.23624, 4.77192, 4.138025, 3.886145,
9.349775, 4.230105, 4.294705, 3.70523, 4.783985, 18.55, 20.37,
16.67, 16.1, 12.76, 17.86, 9.76, 6.420295, 11.01846, 12.71896,
5.357135, 13.49487, 16.44048, 10.63653, 21.57, 5.4861, 6.695585,
7.28287, 10.22231, 7.901685, 4.104105, 10.26266, 6.160995, 9.92365,
6.26854, 10.79394, 8.40443, 10.73174, 5.47817, 6.78387, 1.65602,
5.288025, 7.65222, 8.42996, 3.71009, 2.78491, 2.36495, 3.335665,
6.644245, 9.16, 11.9, 10.06, 8.01, 12.1, 21.42, 22.54, 33.75,
14.11), Resistin = c(7.99585, 4.06405, 9.27715, 12.766, 10.57635,
10.3176, 12.9361, 5.1042, 6.28445, 7.0913, 6.92, 4.58, 5.14,
6.85, 9.35, 7.64, 3.32, 13.68392, 6.70188, 17.10223, 9.1539,
5.62592, 24.6033, 16.43706, 10.19299, 8.70448, 11.78388, 23.3819,
22.03703, 15.72187, 22.32024, 10.26309, 21.44366, 6.71026, 10.37518,
4.2075, 3.29175, 82.1, 20.2535, 17.2615, 8.04375, 8.4156, 7.5767,
11.78796, 4.2989, 6.7052, 4.49685, 4.6638, 4.53, 9.6135, 8.49395,
11.774, 23.03408, 28.0323, 23.1177, 12.06534, 17.37615, 13.74244,
5.57055, 10.69548, 15.73606, 15.69876, 22.94254, 11.55492, 8.2049,
5.2633, 10.34455, 13.91245, 13.56, 4.62, 7.84, 5.31, 3.21, 4.82,
5.68, 16.1, 7.16514, 38.6531, 24.3701, 27.8325, 31.6904, 29.5583,
10.15726, 42.7447, 53.6717, 19.46324, 16.11032, 55.2153, 53.6308,
13.97399, 17.55503, 19.94687, 24.24591, 5.768, 11.50005, 20.76801,
23.03306, 26.0136, 49.24184, 16.48508, 18.35574, 14.91922, 20.4685,
14.76966, 9.9542, 6.89235, 15.55625, 8.89, 4.19, 11.73, 5.06,
10.96, 7.32, 10.33, 3.27, 4.35), MCP.1 = c(417.114, 468.786,
554.697, 928.22, 773.92, 530.41, 1256.083, 280.694, 136.855,
318.302, 354.6, 174.8, 313.73, 632.22, 165.02, 63.61, 191.72,
444.395, 252.449, 588.46, 534.224, 572.783, 904.981, 733.797,
1227.91, 586.173, 887.16, 1102.11, 667.928, 581.313, 864.968,
695.754, 358.624, 960.246, 473.859, 585.307, 634.602, 263.499,
378.996, 618.272, 698.789, 377.227, 335.393, 270.142, 200.976,
225.88, 206.802, 209.749, 215.769, 232.006, 45.843, 488.829,
552.444, 382.955, 573.63, 481.949, 321.919, 448.799, 90.6, 703.973,
199.055, 713.239, 737.672, 359.232, 355.31, 518.586, 635.049,
395.976, 301.21, 220.66, 193.87, 244.75, 513.66, 195.94, 312,
806.724, 483.377, 775.322, 1698.44, 783.796, 910.489, 426.175,
738.034, 799.898, 1041.843, 1698.44, 1698.44, 1078.359, 1698.44,
923.886, 638.261, 994.316, 764.667, 656.393, 396.021, 602.486,
407.206, 293.123, 256.001, 353.568, 572.401, 269.487, 396.648,
232.018, 655.834, 788.902, 621.273, 209.19, 198.4, 99.45, 218.28,
268.23, 330.16, 314.05, 392.46, 90.09)), class = "data.frame", row.names = c(NA,
-116L))
To get the z-scores you can use scale(), and to get a covariance matrix you can use cov(). Note that the covariance matrix of the z-scores is just the correlation matrix of the raw data, so there is no need to go through the z-scores at all: cor() on the raw data gives you the same matrix directly. You can then visualize that correlation matrix with the corrplot() function from the corrplot package, which takes the correlation matrix (not the raw data or the covariance matrix) as input.
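Putting that together, a minimal sketch (assuming mydata is the data frame from the dput() above and the corrplot package is installed):
library(corrplot)

Z <- scale(mydata)   # z-scores: centre each column and divide by its SD
A <- cov(Z)          # covariance of the z-scores equals the correlation matrix
all.equal(A, cor(mydata), check.attributes = FALSE)   # essentially TRUE

corrplot(cor(mydata))   # visualise the correlations computed from the raw data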

R using ggplot2 to plot mixEM data

I have a vector of length 370 that I would like to fit to a mixture of Gaussians. I have followed the example here: Any suggestions for how I can plot mixEM type data using ggplot2, but as you can see from the linked image, my results are different from those in the example (a plot of a mixture of three Gaussians).
Here is a snippet of the code that I used:
library(ggplot2)
library(mixtools)
gg.mixEM <- function(EM) {
  require(ggplot2)
  x <- with(EM, seq(min(x), max(x), len = 1000))
  pars <- with(EM, data.frame(comp = colnames(posterior), mu, sigma, lambda))
  em.df <- data.frame(x = rep(x, each = nrow(pars)), pars)
  em.df$y <- with(em.df, lambda * dnorm(x, mean = mu, sd = sigma))
  ggplot(data.frame(x = EM$x), aes(x, y = ..density..)) +
    geom_histogram(fill = NA, color = "black", bins = 41) +
    geom_polygon(data = em.df, aes(x, y, fill = comp), color = "grey50", alpha = 0.5) +
    scale_fill_discrete("Component\nMeans", labels = format(em.df$mu, digits = 3)) +
    theme_bw()
}
dput(gradesCS)
c(6.5, 22.375, 20.5, 24.25, 33.25, 24, 26.75, 30.75, 35.5, 23.5,
26.875, 24, 35.5, 29.875, 29.75, 31.25, 32.875, 33.75, 34, 29,
33, 24, 12, 26.375, 6.75, 31.25, 21.625, 32.875, 29.25, 27.125,
28.25, 26.25, 24.875, 35.5, 26.5, 37.5, 35.375, 27.5, 33, 27.5,
39.5, 34.25, 28.125, 28, 32.625, 37.625, 34.5, 29.5, 38.5, 37.5,
28.75, 38, 16, 35.75, 30, 33.5, 36, 31.125, 29.75, 32.5, 35,
24.375, 23.375, 28, 32.125, 36, 31.5, 33.5, 1.5, 30.5, 37, 29.5,
29.5, 31.125, 32.5, 20.5, 28.75, 30.25, 32.5, 28, 36, 37.5, 28.5,
35.5, 30.25, 36.375, 36, 23.25, 31.5, 25.125, 33.5, 34, 19.5,
31.75, 39.5, 33.25, 24.875, 26.75, 23.375, 34, 16.5, 37, 33.375,
31.25, 31.75, 35.5, 32, 27.5, 23.375, 20.625, 35.5, 31.5, 25.375,
24.5, 27.25, 25.25, 35.75, 24, 28.25, 33.125, 31.5, 39.5, 39.25,
24.75, 37, 25.5, 34.75, 34, 20.25, 37.625, 30.5, 32.375, 15,
32.75, 33.5, 32.75, 31.5, 29.25, 30, 37.25, 34.5, 23, 32.5, 38.25,
35.625, 33, 35, 31.125, 37, 28.125, 29.25, 31.75, 34.75, 34.625,
36.625, 15.25, 35.5, 37, 33.5, 30.875, 35, 31.625, 22.75, 31,
31.125, 25.125, 35.5, 2, 36.125, 25.25, 32.5, 28, 38.5, 35.5,
38.5, 30.5, 34, 28.125, 38, 29.25, 29.75, 33.25, 25.125, 35,
34.5, 32, 35, 26.875, 20.5, 35.5, 23.25, 26.25, 36, 35.5, 38,
39.25, 22, 38.5, 31, 35.5, 33.5, 31.5, 26, 30.375, 35.75, 29.75,
34, 37.625, 38, 35.5, 34.25, 24.375, 30, 33.75, 39.5, 36.5, 36.5,
32, 36.5, 29.75, 29.75, 25, 32, 29.25, 32.125, 31.25, 38, 33.5,
33.5, 38.5, 37.25, 31.125, 33.5, 31, 28, 29.75, 36, 36, 37, 22,
29, 36.5, 32.25, 30.75, 38.5, 24.125, 28.75, 38.25, 32.5, 34.75,
29, 30.375, 33.5, 31.25, 30, 33, 33.5, 27.5, 26.5, 30.25, 34.75,
33.5, 39, 33.25, 38.5, 27, 39.5, 34.25, 33, 35.125, 38, 31.25,
32.75, 22.75, 31.125, 34.5, 33, 37.125, 31, 18.75, 30.25, 31.75,
34, 30.75, 29, 34.5, 36, 36.5, 31.5, 26, 27.5, 27.5, 36.5, 19.75,
33, 35.125, 16, 19.75, 31.5, 38.5, 34.25, 36.5, 27, 22, 21.75,
36, 31.5, 33, 29.75, 32.5, 26.25, 33.5, 35.75, 33, 39, 35, 34.25,
28.5, 25.5, 30.5, 28, 21.25, 39.125, 22.75, 28.375, 29.125, 30,
34.125, 31.25, 32, 26.25, 36, 24.5, 30.25, 32.75, 29.625, 16,
34, 16.75, 25.25, 33, 38, 28, 24.75, 29.75, 24.5, 19.25, 32.75,
27.5, 24.75, 17.375, 25.25, 30.125, 38, 28, 35, 11.75, 27.75,
38, 28.625, 31.25, 31.25, 32, 17.25, 18.25, 32.625, 25.5, 27.5,
35.25, 35.5)
b <- gradesCS
c <- b[sample(length(b), length(b)) ]
c3 <- normalmixEM(c, lambda=NULL, mu=NULL, sigma=NULL,k=3,maxit=1000,epsilon = 1e-2)
gg.mixEM(c3)
The problem is that polygons freak out if they don't have continuous drawing space (e.g. if the data end abruptly at 0 but the polygon function has not reached 0).
In the first line of the plotting function, add extra spacing on each side of x. I'm going with 5 here, but you just need enough for the function to hit 0.
x <- with(EM,seq(min(x)-5,max(x)+5,len=1000))
At the bottom, we cut off the excess space with:
coord_cartesian(xlim = c(0, 42),
                expand = c(0, 0))
This renders the graph with your spacing, and then "zooms in" on the selected x interval.
fit_test <- normalmixEM(test, k = 2)
gg.mixEM <- function(EM) {
  require(ggplot2)
  x <- with(EM, seq(min(x) - 5, max(x) + 5, len = 1000))
  pars <- with(EM, data.frame(comp = colnames(posterior), mu, sigma, lambda))
  em.df <- data.frame(x = rep(x, each = nrow(pars)), pars)
  em.df$y <- with(em.df, lambda * dnorm(x, mean = mu, sd = sigma))
  ggplot(data.frame(x = EM$x), aes(x, y = ..density..)) +
    geom_histogram(fill = NA, color = "black", bins = 41) +
    geom_polygon(data = em.df, aes(x, y, fill = comp), color = "grey50", alpha = 0.5) +
    scale_fill_discrete("Component\nMeans", labels = format(em.df$mu, digits = 3)) +
    theme_bw() +
    coord_cartesian(xlim = c(0, 42), expand = c(0, 0))
}
gg.mixEM(fit_test)
And we get

Fitting Gamma distribution to data in R using optim, ML

I'm kinda new to R. I have a dataset which includes data on family income, and I have to fit a Gamma distribution to this data using maximum likelihood estimation. We were specifically told to use the optim function, and not fitdistr. So this is my code:
t1 <- sum(log(newdata$faminc))
t2 <- sum(newdata$faminc)
obs <- nrow(newdata)
lh.gamma <- function(par) {
-((par[1]-1)*t1 - par[2]*t2 - obs*par[1]*log(par[2]) - obs*lgamma(par[1]))
}
#initial guess for a = mean^2(x)/var(x) and b = mean(x) / var(x)
a1 <- (mean(newdata$faminc))^2/var(newdata$faminc)
b1 <- mean(newdata$faminc)/var(newdata$faminc)
init <- c(a1,b1)
q <- optim(init, lh.gamma, method = "BFGS")
q
Also tried filling in just values in the init vector, and including this piece of code;
dlh.gamma <- function(par){
cbind(obs*digamma(par[1])+obs*log(par[2])-t2,
obs*par[1]/par[2]-1/par[2]^2*t1)
}
and then the optim would look like :
q <- optim(init, lh.gamma, dhl.gamma, method="BFGS")
None of it 'works'. First, when I tried the code on the school computers, it gave me huge numbers for the shape and rate parameters, which is not possible. Now, trying at home, I get this:
> q <- optim(init, lh.gamma, method = "BFGS")
Error in optim(init, lh.gamma, method = "BFGS") :
non-finite finite-difference value [2]
In addition: There were 50 or more warnings (use warnings() to see the first 50)
> q
function (save = "default", status = 0, runLast = TRUE)
.Internal(quit(save, status, runLast))
<bytecode: 0x000000000eaac960>
<environment: namespace:base>
q is not even 'created' (typing q just prints base R's quit function), except when I include the dlh.gamma part above; but then I just get huge numbers again and no convergence.
Anybody who knows what goes wrong/what to do?
Edit:
> dput(sample(newdata$faminc, 500))
c(42.5, 87.5, 22.5, 17.5, 12.5, 30, 30, 17.5, 42.5, 62.5, 62.5,
30, 30, 150, 22.5, 30, 42.5, 30, 17.5, 8.75, 42.5, 42.5, 42.5,
62.5, 42.5, 30, 17.5, 87.5, 62.5, 150, 42.5, 150, 42.5, 42.5,
42.5, 6.25, 62.5, 87.5, 6.25, 87.5, 30, 150, 22.5, 62.5, 42.5,
150, 17.5, 42.5, 42.5, 42.5, 62.5, 22.5, 42.5, 42.5, 30, 62.5,
30, 62.5, 87.5, 87.5, 42.5, 22.5, 62.5, 22.5, 8.75, 30, 30, 17.5,
87.5, 8.75, 62.5, 30, 17.5, 22.5, 62.5, 42.5, 30, 17.5, 62.5,
8.75, 62.5, 42.5, 150, 30, 62.5, 87.5, 17.5, 62.5, 30, 62.5,
87.5, 42.5, 62.5, 30, 62.5, 42.5, 87.5, 150, 12.5, 42.5, 62.5,
42.5, 62.5, 62.5, 150, 30, 87.5, 12.5, 17.5, 42.5, 62.5, 30,
6.25, 62.5, 42.5, 12.5, 62.5, 8.75, 17.5, 42.5, 62.5, 87.5, 8.75,
62.5, 30, 62.5, 87.5, 42.5, 62.5, 62.5, 12.5, 150, 42.5, 62.5,
12.5, 62.5, 42.5, 62.5, 62.5, 87.5, 42.5, 62.5, 30, 42.5, 150,
42.5, 30, 62.5, 62.5, 87.5, 42.5, 30, 62.5, 62.5, 42.5, 42.5,
30, 62.5, 42.5, 42.5, 62.5, 62.5, 150, 42.5, 30, 42.5, 62.5,
17.5, 62.5, 17.5, 150, 8.75, 62.5, 30, 62.5, 42.5, 42.5, 22.5,
150, 62.5, 42.5, 62.5, 62.5, 22.5, 30, 62.5, 30, 150, 42.5, 42.5,
42.5, 62.5, 30, 12.5, 30, 150, 12.5, 8.75, 22.5, 30, 22.5, 30,
42.5, 42.5, 42.5, 30, 12.5, 62.5, 42.5, 30, 22.5, 42.5, 87.5,
22.5, 12.5, 42.5, 62.5, 62.5, 62.5, 30, 42.5, 30, 62.5, 30, 62.5,
12.5, 22.5, 42.5, 22.5, 87.5, 30, 22.5, 17.5, 42.5, 62.5, 17.5,
250, 150, 42.5, 30, 42.5, 30, 62.5, 17.5, 87.5, 22.5, 150, 62.5,
42.5, 6.25, 87.5, 62.5, 42.5, 30, 42.5, 62.5, 42.5, 87.5, 62.5,
150, 42.5, 30, 6.25, 22.5, 30, 42.5, 42.5, 62.5, 250, 8.75, 150,
42.5, 30, 42.5, 30, 42.5, 42.5, 30, 30, 150, 22.5, 62.5, 30,
8.75, 150, 62.5, 87.5, 150, 42.5, 30, 42.5, 42.5, 42.5, 30, 8.75,
42.5, 42.5, 30, 22.5, 62.5, 17.5, 62.5, 62.5, 42.5, 8.75, 42.5,
12.5, 12.5, 150, 42.5, 42.5, 17.5, 42.5, 62.5, 62.5, 42.5, 42.5,
30, 42.5, 62.5, 30, 62.5, 42.5, 42.5, 42.5, 22.5, 62.5, 62.5,
62.5, 22.5, 150, 62.5, 42.5, 62.5, 42.5, 30, 30, 62.5, 22.5,
62.5, 87.5, 62.5, 42.5, 42.5, 22.5, 62.5, 62.5, 30, 42.5, 42.5,
8.75, 87.5, 42.5, 42.5, 87.5, 30, 62.5, 17.5, 62.5, 42.5, 17.5,
22.5, 62.5, 8.75, 62.5, 22.5, 22.5, 22.5, 42.5, 17.5, 22.5, 62.5,
42.5, 42.5, 42.5, 42.5, 42.5, 30, 30, 8.75, 30, 42.5, 62.5, 22.5,
6.25, 30, 42.5, 62.5, 17.5, 62.5, 42.5, 8.75, 22.5, 30, 17.5,
22.5, 62.5, 42.5, 150, 87.5, 22.5, 12.5, 62.5, 62.5, 62.5, 30,
42.5, 22.5, 62.5, 87.5, 30, 42.5, 62.5, 22.5, 87.5, 30, 30, 22.5,
87.5, 87.5, 250, 30, 62.5, 250, 62.5, 42.5, 42.5, 62.5, 62.5,
42.5, 6.25, 62.5, 62.5, 62.5, 42.5, 42.5, 150, 62.5, 62.5, 30,
150, 22.5, 87.5, 30, 150, 17.5, 8.75, 62.5, 42.5, 62.5, 150,
42.5, 22.5, 42.5, 42.5, 17.5, 62.5, 17.5, 62.5, 42.5, 150, 250,
22.5, 42.5, 30, 62.5, 62.5, 42.5, 42.5, 30, 150, 150, 42.5, 17.5,
17.5, 42.5, 8.75, 62.5, 42.5, 42.5, 22.5, 150, 62.5, 30, 250,
62.5, 87.5, 62.5, 8.75, 62.5, 30, 30, 8.75, 17.5, 17.5, 150,
22.5, 62.5, 62.5, 42.5)
The faminc variable is in 1000s
Edit2:
Okay, the code is good, but now I try to fit the distribution over the histogram using the following:
x <- rgamma(500,shape=q$par[1],scale=q$par[2])
hist(newdata$faminc, prob = TRUE)
curve(dgamma(x, shape=q$par[1], scale=q$par[2]), add=TRUE, col='blue')
It just produces a flat blue line at the x-axis.
You've got some things going on I haven't been able to work out, but here's a demonstration of the estimation.
Let's start by generating some data (so we know if the optimization is working). I only changed your optimization function below, and used Nelder-Mead instead of the quasi-Newton.
set.seed(23)
a <- 2 # shape
b <- 3 # rate
require(data.table)
newdata <- data.table(faminc = rgamma(10000, a, b))
t1 <- sum(log(newdata$faminc))
t2 <- sum(newdata$faminc)
obs <- nrow(newdata)
llf <- function(x) {
  a <- x[1]
  b <- x[2]
  # negative log-likelihood of the gamma distribution (shape a, rate b)
  return(-((a - 1) * t1 - b * t2 - obs * a * log(1/b) - obs * log(gamma(a))))
}
# initial guess for a = mean^2(x)/var(x) and b = mean(x) / var(x)
a1 <- (mean(newdata$faminc))^2/var(newdata$faminc)
b1 <- mean(newdata$faminc)/var(newdata$faminc)
q <- optim(c(a1, b1), llf)
q$par
[1] 2.024353 3.019376
I'd say we're pretty close.
With your data:
(est <- q$par)
[1] 2.21333613 0.04243384
theoretical <- data.table(true = rgamma(10000, est[1], est[2]))
library(ggplot2)
ggplot(newdata, aes(x = faminc)) + geom_density() + geom_density(data = theoretical, aes(x = true, colour = "red")) + theme(legend.position = "none")
Not great, but reasonable for 500 obs.
Response to OP Edit 2:
You should look more closely at the functions you're using: optim estimated a shape and a rate, but dgamma's scale argument is 1/rate, so passing the rate in as scale= squeezes all of the density towards zero and the curve looks flat. Either pass the rate explicitly or write the density yourself:
gamma_density = function(x, a, b) ((b^a)/gamma(a)) * (x^(a - 1)) * exp(-b * x)
hist(newdata$faminc, prob = TRUE, ylim = c(0, 0.015))
curve(gamma_density(x, a = q$par[1], b = q$par[2]), add=TRUE, col='blue')
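As a hedged aside (not part of the original answer), dgamma also accepts the rate parameterisation directly, so the estimated parameters can be plotted without a hand-written density:
# q$par[1] is the shape and q$par[2] the rate, so pass rate=, not scale=
hist(newdata$faminc, prob = TRUE, ylim = c(0, 0.015))
curve(dgamma(x, shape = q$par[1], rate = q$par[2]), add = TRUE, col = "blue")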
