Too many geom_points after facetting in ggplot2 - r

Running the following script, I was hoping to have one data point for each of the six terms, with different colors depending on the dataset, faceted by adjustment. However, I get three or four points for each term in each facet. Any idea how this can happen when I only have 24 rows in the dataset?
library(ggplot2)
tb5 <- structure(list(term = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
coef = c(-1.444, 0.035, -0.034, 0.005, 0.001, 2.43, -1.032,
0.032, -0.024, 0.025, 0.003, 1.758, -1.148, 0.02, 0.003,
0.027, 0.003, 12.713, -1.494, 0.028, -0.021, 0.007, 0.004,
13.499), ci.lb = c(-1.826, 0.025, -0.087, -0.011, -0.004,
0.3, -1.293, 0.026, -0.061, 0.016, -0.001, -0.273, -1.48,
0.011, -0.045, 0.014, -0.003, 11.858, -1.931, 0.015, -0.08,
-0.014, -0.002, 12.624), ci.ub = c(-1.071, 0.045, 0.019,
0.022, 0.007, 7.305, -0.775, 0.038, 0.012, 0.035, 0.007,
6.613, -0.816, 0.029, 0.051, 0.039, 0.008, 13.569, -1.056,
0.04, 0.038, 0.027, 0.01, 14.375), Adjusted = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Adjusted", "Unadjusted"
), class = "factor"), Dataset = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor")), .Names = c("term",
"coef", "ci.lb", "ci.ub", "Adjusted", "Dataset"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -24L))
# Plot coef against term, colored by Dataset, one facet per level of Adjusted.
# NOTE(review): geom_point() and geom_jitter() are two separate point layers,
# and each one draws every row of tb5, so every observation appears twice
# (once at its exact position and once jittered).
ggplot(data = tb5,aes(x=term,y=coef,color=Dataset))+geom_point()+
facet_grid(facets = ~Adjusted)+
geom_jitter(height = .8)

Related

Different results in Stata and R with the "same" anova code

I have some Stata code and I want to replicate the results in R. However, even with the same dataset and, I think, the same code, I get different results in R from those in Stata. I think it could be because Stata makes the order of the regression different than keyed in.
Do I need exactly the same order as in Stata to get the same results and how can I do this?
I changed all the variables to factors and tried again but the problem is still there.
I noticed that when I change the order of the explanatory variables I get different results, but I can't find "the right order" to replicate the Stata results.
Stata code:
. anova testm2 c.testm1 i.hptreat c.cortm1 c.cortm2 i.female if inelig == 0 & anyoutv1 == 0
Number of obs =39 R-squared =0.7048
Root MSE= 16.0144 Adj R-squared =0.6601
Source | Partial SS df MS F Prob>F
---------------------------------------------------------------
Model | 20209.281 5 4041.8563 15.76 0.0000
testm1 | 3516.6527 1 3516.6527 13.71 0.0008
hptreat| 1183.5007 1 1183.5007 4.61 0.0391
cortm1 | 8.5753841 1 8.5753841 0.03 0.8560
cortm2 | 2810.9353 1 2810.9353 10.96 0.0023
female | 2557.3444 1 2557.3444 9.97 0.0034
Residual| 8463.2532 33 256.46222
----------------------------------------------------------------
Total | 28672.535 38 754.54038
R code:
FosseTest<-aov(testm2~testm1+hptreat+cortm1+cortm2+female,data=X2data)
summary(FosseTest)
Df Sum Sq Mean Sq F value Pr(>F)
testm1 1 15121 15121 58.962 7.68e-09 ***
hptreat 1 524 524 2.043 0.16228
cortm1 1 23 23 0.089 0.76715
cortm2 1 1984 1984 7.735 0.00888 **
female 1 2557 2557 9.972 0.00339 **
Residuals 33 8463 256
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
You can see that I get totally different values in the replication.
in the X2data Set I already subset the values for if inelig == 0 & anyoutv1 == 0
for the reconstruction of the data:
dput(X2data)
structure(list(id = c(29L, 30L, 31L, 32L, 34L, 35L, 36L, 37L,
39L, 41L, 42L, 43L, 44L, 46L, 47L, 49L, 50L, 51L, 52L, 53L, 54L,
55L, 57L, 58L, 59L, 60L, 61L, 62L, 64L, 65L, 66L, 67L, 68L, 69L,
70L, 71L, 72L, 73L, 74L), inelig = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Analytic sample (keep)", "Ineligible (drop)"
), class = "factor"), ccydrop = c(0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), cortm1v2 = c(0.003, 0.086, 0.047, 0.106, NA, 0.153, 0.086,
0.005, 0.133, 0.036, 0.03, 0.015, 0.014, 0.111, 0.389, 0.298,
0.4, 0.215, 0.062, 0.021, 0.075, 0.073, 0.033, 0.243, 0.126,
0.147, 0.019, 0.048, 0.28, 0.052, 0.039, 0.105, 0.111, 0.133,
0.065, 0.051, 0.143, 0.127, 0.095), cortm2v2 = c(0.025, 0.167,
0.059, 0.112, 0.171, 0.183, 0.102, 0.018, 0.08, 0.015, 0.027,
0.05, 0.025, 0.046, 0.085, 0.144, 0.155, 0.09, 0.057, 0.023,
0.038, 0.205, 0.035, 0.198, 0.112, 0.211, 0.042, 0.142, 0.328,
0.076, 0.067, 0.094, 0.245, 0.153, 0.115, 0.127, 0.257, 0.125,
0.096), cdiffv2 = c(0.022, 0.081, 0.012, 0.006, NA, 0.03, 0.016,
0.013, -0.053, -0.021, -0.003, 0.035, 0.011, -0.065, -0.304,
-0.154, -0.245, -0.125, -0.005, 0.002, -0.037, 0.132, 0.002,
-0.045, -0.014, 0.064, 0.023, 0.094, 0.048, 0.024, 0.028, -0.011,
0.134, 0.02, 0.05, 0.076, 0.114, -0.002, 0.001), testm1v2 = c(38.72,
32.77, 32.32, 17.99, 73.58, 80.69, 48.56, 21.92, 27.24, 40.93,
31.73, 60.05, 38.04, 30.17, 59.07, 26.92, 25.41, 47.81, 63.02,
34.49, 104.38, 38.08, 30.99, 35.23, 104.81, 49.33, 50.03, 11.65,
143.57, 48.31, 90.37, 48.56, 41.67, 75.23, 60.56, 39.03, 18.16,
37.9, 84.5), testm2v2 = c(62.37, 29.23, 27.51, 28.66, 44.67,
105.48, 42.67, 15.01, 21.33, 10.87, 2.14, 44.53, 35.8, 10.43,
47.54, 48.5, 38.98, 91.32, 52.94, 22.43, 58.68, 81.63, 34.79,
38.57, 94.86, 50.83, 55.75, 45.33, 111.62, 65.15, 81.08, 50.08,
44.86, 58.63, 85.85, 58.69, 16.35, 35.97, 99.08), tdiffv2 = c(23.65,
-3.54, -4.81, 10.67, -28.91, 24.79, -5.89, -6.91, -5.91, -30.06,
-29.59, -15.52, -2.24, -19.74, -11.53, 21.58, 13.57, 43.51, -10.08,
-12.06, -45.7, 43.55, 3.8, 3.34, -9.95, 1.5, 5.72, 33.68, -31.95,
16.84, -9.29000000000001, 1.52, 3.19, -16.6, 25.29, 19.66, -1.81,
-1.93, 14.58), testoutv1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Not selected", "Selected"), class = "factor"),
cortoutv1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Not selected", "Selected"), class = "factor"),
anyoutv1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Not selected", "Selected"), class = "factor"),
testoutv2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Not selected", "Selected"), class = "factor"),
cortoutv2 = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Not selected", "Selected"), class = "factor"),
anyoutv2 = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Not selected", "Selected"), class = "factor"),
pose1rate = c(6L, 7L, 6L, 6L, 7L, 7L, 6L, 7L, 5L, 6L, 7L,
4L, 7L, 7L, 7L, 6L, 7L, 7L, 7L, 7L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), pose2rate = c(6L,
6L, 5L, 7L, 7L, 7L, 7L, 7L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 6L,
6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 7L, 6L, 7L, 7L, 7L,
6L, 7L, 7L, 7L, 7L, 7L, 6L, 6L), poseratem = c(6, 6.5, 5.5,
6.5, 7, 7, 6.5, 7, 5.5, 6.5, 7, 5.5, 7, 7, 7, 6, 6.5, 7,
7, 7, 6.5, 7, 7, 7, 7, 6.5, 7, 6.5, 7, 7, 7, 6.5, 7, 7, 7,
7, 7, 6.5, 6.5), saldiff = c(24.30555556, 20.83333333, 29.16666667,
18.75, 23.61111111, 34.02777778, 18.05555556, 19.44444444,
21.52777778, 15.97222222, 22.91666667, 13.88888889, 22.22222222,
25, 22.22222222, 22.22222222, 18.05555556, 17.36111111, 22.22222222,
27.08333333, 20.83333333, 24.30555556, 22.22222222, 28.47222222,
24.30555556, 25, 27.77777778, 22.22222222, 15.97222222, 24.30555556,
21.52777778, 19.44444444, 15.97222222, 15.27777778, 15.97222222,
24.30555556, 19.44444444, 24.30555556, 15.27777778), sal2manip = c(19.80555556,
16.33333333, 24.66666667, 14.25, 19.11111111, 29.52777778,
13.55555556, 14.94444444, 17.02777778, 11.47222222, 18.41666667,
9.38888889, 17.72222222, 20.5, 17.72222222, 17.72222222,
13.55555556, 12.86111111, 17.72222222, 22.58333333, 16.33333333,
19.80555556, 17.72222222, 23.97222222, 19.80555556, 20.5,
23.27777778, 17.72222222, 11.47222222, 19.80555556, 17.02777778,
14.94444444, 11.47222222, 10.77777778, 11.47222222, 19.80555556,
14.94444444, 19.80555556, 10.77777778), hptreat = structure(c(2L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("0", "1"), class = "factor"),
female = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L
), .Label = c("0", "1"), class = "factor"), age = c(19L,
20L, 20L, 18L, 21L, 20L, 18L, 21L, 35L, 20L, 18L, 20L, 20L,
18L, 20L, 25L, 18L, 23L, 21L, 19L, 20L, 20L, 30L, 19L, 22L,
18L, 19L, 22L, 19L, 20L, 28L, 28L, 19L, 19L, 20L, 25L, 20L,
25L, 23L), cort1a1 = c(0.004, 0.085, 0.049, 0.107, 0.486,
0.159, 0.088, 0.004, 0.138, 0.035, 0.03, 0.018, 0.017, 0.111,
0.39, 0.292, 0.396, 0.213, 0.065, 0.022, 0.074, 0.077, 0.035,
0.241, 0.126, 0.154, 0.021, 0.05, 0.296, 0.054, 0.04, 0.109,
0.114, 0.133, 0.063, 0.055, 0.149, 0.134, 0.098), cort1a2 = c(0.001,
0.086, 0.045, 0.105, 0.482, 0.147, 0.085, 0.005, 0.127, 0.037,
0.031, 0.013, 0.011, 0.111, 0.389, 0.304, 0.405, 0.218, 0.059,
0.02, 0.076, 0.069, 0.032, 0.246, 0.126, 0.141, 0.017, 0.046,
0.264, 0.051, 0.038, 0.101, 0.109, 0.133, 0.068, 0.048, 0.137,
0.12, 0.092), cort2a1 = c(0.027, 0.174, 0.056, 0.111, 0.175,
0.179, 0.103, 0.021, 0.079, 0.014, 0.028, 0.051, 0.024, 0.051,
0.083, 0.148, 0.156, 0.086, 0.062, 0.024, 0.038, 0.209, 0.036,
0.199, 0.114, 0.207, 0.041, 0.141, 0.333, 0.078, 0.065, 0.088,
0.238, 0.157, 0.119, 0.132, 0.268, 0.132, 0.099), cort2a2 = c(0.023,
0.161, 0.062, 0.113, 0.166, 0.188, 0.101, 0.016, 0.081, 0.015,
0.026, 0.049, 0.026, 0.041, 0.086, 0.139, 0.154, 0.093, 0.052,
0.022, 0.038, 0.202, 0.034, 0.198, 0.111, 0.215, 0.042, 0.142,
0.324, 0.075, 0.068, 0.101, 0.252, 0.149, 0.111, 0.123, 0.247,
0.118, 0.093), cortm1 = c(0.0024999999, 0.085500002, 0.046999998,
0.106, 0.484, 0.153, 0.086499996, 0.0044999998, 0.13249999,
0.035999998, 0.0305, 0.0155, 0.014, 0.111, 0.38949999, 0.29800001,
0.4005, 0.2155, 0.061999999, 0.021, 0.075000003, 0.072999999,
0.033500001, 0.24349999, 0.126, 0.14749999, 0.018999999,
0.048, 0.28, 0.052499998, 0.039000001, 0.105, 0.1115, 0.133,
0.065499999, 0.0515, 0.14300001, 0.127, 0.094999999), cortm2 = c(0.025,
0.1675, 0.059, 0.112, 0.1705, 0.18350001, 0.102, 0.0185,
0.079999998, 0.0145, 0.027000001, 0.050000001, 0.025, 0.046,
0.0845, 0.1435, 0.155, 0.089500003, 0.057, 0.023, 0.037999999,
0.20550001, 0.035, 0.19850001, 0.1125, 0.211, 0.041499998,
0.1415, 0.3285, 0.076499999, 0.066500001, 0.094499998, 0.245,
0.153, 0.115, 0.1275, 0.25749999, 0.125, 0.096000001), cdiff = c(0.022500001,
0.082000002, 0.012000002, 0.0060000047, -0.31349999, 0.03050001,
0.015500002, 0.014, -0.052499995, -0.021499999, -0.0034999996,
0.034500003, 0.011, -0.064999998, -0.30500001, -0.15450001,
-0.2455, -0.12599999, -0.004999999, 0.0020000003, -0.037000004,
0.13250001, 0.0014999993, -0.044999987, -0.013500005, 0.063500002,
0.022499999, 0.093499996, 0.048500001, 0.024, 0.0275, -0.010499999,
0.13350001, 0.019999996, 0.049500003, 0.075999998, 0.11449999,
-0.0020000041, 0.001000002), test1a1 = c(39.87, 33.22, 32.52,
19.74, 78.85, 83.51, 48.37, 22.31, 28.17, 41.44, 32.92, 61.4,
40.31, 30.36, 59.44, 27.52, 26.14, 46.75, 63.73, 34.03, 98.47,
36.62, 30.26, 37.15, 105.64, 47.99, 50.15, 11.33, 149.12,
48.57, 92.04, 51.22, 42.25, 77.07, 62.75, 38.8, 17.91, 40.28,
88.47), test1a2 = c(37.58, 32.32, 32.12, 16.25, 68.31, 77.88,
48.75, 21.53, 26.32, 40.42, 30.55, 58.7, 35.78, 29.97, 58.7,
26.32, 24.69, 48.87, 62.32, 34.95, 110.29, 39.53, 31.72,
33.32, 103.99, 50.67, 49.9, 11.97, 138.02, 48.05, 88.7, 45.89,
41.08, 73.39, 58.38, 39.25, 18.41, 35.53, 80.54), test2a1 = c(64.22,
29.43, 27.98, 28.17, 46.14, 105.92, 43.68, 16.41, 21.42,
11.35, 1.66, 44.17, 38.58, 11.11, 48.57, 48.31, 39.71, 92.04,
52.73, 22.3, 58.23, 82.01, 35.76, 39.59, 94.06, 50.52, 55.82,
45.91, 115.13, 67.59, 82.97, 49.89, 45.09, 57.86, 86.76,
58.83, 16.53, 36.7, 100.4), test2a2 = c(60.53, 29.04, 27.04,
29.14, 43.2, 105.05, 41.66, 13.62, 21.25, 10.39, 2.63, 44.9,
33.02, 9.75, 46.52, 48.7, 38.25, 90.59, 53.15, 22.57, 59.14,
81.24, 33.81, 37.55, 95.66, 51.14, 55.69, 44.74, 108.1, 62.71,
79.18, 50.27, 44.63, 59.39, 84.94, 58.55, 16.16, 35.24, 97.75
), testm1 = c(38.724998, 32.77, 32.32, 17.995001, 73.580002,
80.695, 48.560001, 21.92, 27.245001, 40.93, 31.735001, 60.049999,
38.044998, 30.165001, 59.07, 26.92, 25.415001, 47.810001,
63.025002, 34.490002, 104.38, 38.075001, 30.99, 35.235001,
104.815, 49.330002, 50.025002, 11.65, 143.57001, 48.310001,
90.370003, 48.555, 41.665001, 75.230003, 60.564999, 39.025002,
18.16, 37.904999, 84.504997), testm2 = c(62.375, 29.235001,
27.51, 28.655001, 44.669998, 105.485, 42.669998, 15.015,
21.334999, 10.87, 2.145, 44.535, 35.799999, 10.43, 47.544998,
48.505001, 38.98, 91.315002, 52.939999, 22.434999, 58.685001,
81.625, 34.785, 38.57, 94.860001, 50.830002, 55.755001, 45.325001,
111.615, 65.150002, 81.074997, 50.080002, 44.860001, 58.625,
85.849998, 58.689999, 16.344999, 35.970001, 99.074997), tdiff = c(23.650002,
-3.5349998, -4.8099995, 10.66, -28.910004, 24.790001, -5.8900032,
-6.9049997, -5.9100018, -30.060001, -29.59, -15.514999, -2.2449989,
-19.735001, -11.525002, 21.585001, 13.564999, 43.505001,
-10.085003, -12.055002, -45.694996, 43.549999, 3.7950001,
3.3349991, -9.9550018, 1.5, 5.7299995, 33.675003, -31.955009,
16.84, -9.2950058, 1.5250015, 3.1949997, -16.605003, 25.285,
19.664997, -1.8150005, -1.9349976, 14.57), feelpower = structure(c(2L,
3L, 1L, 2L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 3L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 3L,
4L, 3L, 1L, 3L, 4L, 2L, 2L, 3L), .Label = c("2", "3", "Not at all",
"Very much"), class = "factor"), incharge = structure(c(1L,
1L, 3L, 4L, 1L, 2L, 3L, 3L, 1L, 1L, 3L, 4L, 3L, 2L, 2L, 1L,
3L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 3L, 1L, 1L, 4L, 3L, 1L, 1L), .Label = c("2", "3", "Not at all",
"Very much"), class = "factor"), powm = structure(c(3L, 1L,
1L, 5L, 2L, 4L, 6L, 6L, 1L, 1L, 6L, 7L, 6L, 3L, 4L, 2L, 1L,
4L, 4L, 3L, 2L, 4L, 2L, 2L, 3L, 3L, 3L, 4L, 1L, 5L, 1L, 4L,
6L, 2L, 1L, 7L, 2L, 3L, 1L), .Label = c("1.5", "2", "2.5",
"3", "3.5", "Not at all", "Very much"), class = "factor"),
diceroll = structure(c(2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L
), .Label = c("No", "Yes"), class = "factor")), row.names = c(2L,
3L, 4L, 5L, 7L, 8L, 9L, 10L, 12L, 14L, 15L, 16L, 17L, 19L, 20L,
22L, 23L, 24L, 25L, 26L, 27L, 28L, 30L, 31L, 32L, 33L, 34L, 35L,
37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L), class = "data.frame")
You can get the same results in R using drop1(FosseTest, test = "F"). This will test the effect of leaving one of the variables off the aov.
drop1(FosseTest, test = "F")
#
# Single term deletions
#
# Model:
# testm2 ~ testm1 + hptreat + cortm1 + cortm2 + female
# Df Sum of Sq RSS AIC F value Pr(>F)
# <none> 8463.3 221.82
# testm1 1 3516.7 11979.9 233.37 13.7122 0.0007751 ***
# hptreat 1 1183.5 9646.8 224.92 4.6147 0.0391333 *
# cortm1 1 8.6 8471.8 219.86 0.0334 0.8560279
# cortm2 1 2810.9 11274.2 231.00 10.9604 0.0022605 **
# female 1 2557.3 11020.6 230.11 9.9716 0.0033895 **
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
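summary(FosseTest) displays sequential sums of squares, i.e. the effect of adding the variables one after another, which is why its results depend on the order of the terms. Stata's anova table reports partial sums of squares instead, which is what drop1() reproduces.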
There was a different way how to access this, but at the moment I can't remember...

shade a facet.grid of kernel density plots with ggplot2

Consider the following df:
df<-structure(list(Trial = structure(c(1L, 5L, 1L, 5L, 1L, 4L, 3L,
2L, 2L, 4L, 3L, 3L, 2L, 5L, 4L, 1L, 2L, 3L, 5L, 1L, 2L, 1L, 4L,
3L, 1L, 3L, 3L, 2L, 3L, 5L, 1L, 3L, 3L, 5L, 5L, 1L, 4L, 3L, 3L,
1L, 1L, 5L, 5L, 1L, 3L, 5L, 2L, 1L, 5L, 3L, 2L, 1L, 4L, 3L, 5L,
3L, 4L, 1L, 2L, 2L, 2L, 2L, 4L, 1L, 4L, 5L, 3L, 1L, 5L, 3L, 3L,
4L, 2L, 2L, 4L, 4L, 1L, 3L, 4L, 5L, 4L, 2L, 3L, 1L, 1L, 4L, 2L,
3L, 5L, 2L, 2L, 4L, 1L, 4L, 4L, 5L, 2L, 4L, 2L, 4L, 1L, 4L, 3L,
5L, 4L, 5L, 2L, 3L, 2L, 2L, 5L, 1L, 3L, 3L, 3L, 1L, 2L, 4L, 5L,
3L, 1L, 2L, 5L, 1L, 4L, 3L, 2L, 2L, 5L, 1L, 5L, 1L, 4L, 5L, 5L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 2L, 5L, 2L, 5L, 2L, 4L, 1L), .Label =
c("ES8-13", "ES14-25", "ES26-38", "SA1-12", "SA14-25"), class = "factor"),
MAF = c(-0.022, 0.141, -0.035, 0.076, -0.019, -0.064, -0.044,
0.088, 0.067, 0.049, 0.088, 0.053, -0.052, -0.078, 0.104,
-0.019, -0.075, -0.049, 0.098, -0.145, 0.094, 0.014, 0.016,
0.00599999999999999, 0.027, 0.117, -0.011, 0.055, 0.079,
0.26, -0.049, 0.065, 0.227, 0.141, -0.091, -0.021, -0.306,
0.162, -0.089, -0.068, 0.00700000000000001, 0.034, 0.02,
0.038, 0.129, 0.099, 0.06, -0.09, 0.104, 0.195, 0.165, -0.047,
0.074, -0.01, 0.002, -0.068, 0.054, 0.012, -0.012, 0.222,
0.046, 0.00700000000000001, -0.022, 0.00499999999999995,
-0.051, 0.126, 0.073, 0.094, -0.254, 0.185, 0.238, 0.099,
0.027, 0.044, -0.018, 0.014, -0.058, -0.005, -0.00999999999999998,
-0.002, 0.061, 0.178, 0.001, 0.105, -0.001, -0.088, 0.113,
0.134, 0.175, 0.06, -0.026, 0.048, 0.003, 0.049, 0.0649999999999999,
-0.135, -0.036, -0.069, 0.015, -0.058, 0.024, 0.093, 0.123,
-0.144, 0.011, 0.343, 0.002, -0.018, 0.055, -0.047, -0.317,
-0.033, -0.018, 0.068, -0.044, 0.05, 0.079, 0.122, -0.071,
0.13, 0.078, 0.085, 0.012, -0.02, -0.088, -0.086, -0.026,
0.046, 0.101, -0.026, 0.005, 0.00700000000000001, 0.064,
0.066, -0.085, 0.114, 0.003, 0.004, -0.003, 0.097, 0.055,
-0.063, -0.089, 0.104, -0.199, 0.01, 0.184, 0.183, 0.129,
-0.059)), row.names = c(1146L, 163986L, 34946L, 168682L,
33356L, 152862L, 103827L, 54557L, 68666L, 141066L, 118349L, 93909L,
67299L, 193633L, 129212L, 39273L, 71459L, 102636L, 176655L, 30543L,
46107L, 32608L, 122906L, 100356L, 37635L, 81566L, 116510L, 61803L,
96219L, 187927L, 9211L, 106999L, 88554L, 181316L, 176250L, 32656L,
150472L, 80615L, 111414L, 16038L, 23319L, 185075L, 175803L, 32648L,
106332L, 185991L, 65155L, 32165L, 189972L, 92486L, 44161L, 404L,
123856L, 80513L, 180030L, 101190L, 145315L, 5498L, 75891L, 77358L,
67571L, 72894L, 127763L, 6584L, 139250L, 163126L, 101492L, 22520L,
181276L, 82673L, 94756L, 142750L, 48377L, 59931L, 140900L, 154339L,
2769L, 110265L, 130494L, 186334L, 138079L, 50754L, 82207L, 24578L,
26393L, 128021L, 69283L, 84549L, 187875L, 76775L, 45715L, 138049L,
1972L, 137218L, 158324L, 200014L, 61611L, 147430L, 60938L, 154928L,
22421L, 159532L, 98190L, 166565L, 151667L, 180407L, 55681L, 89127L,
54396L, 65975L, 172695L, 21969L, 80439L, 81202L, 87282L, 35394L,
53137L, 131886L, 163181L, 84221L, 32007L, 57711L, 160393L, 32843L,
157924L, 104820L, 63993L, 55023L, 160342L, 20800L, 167583L, 15849L,
143476L, 172878L, 195659L, 49812L, 4971L, 44583L, 24399L, 77026L,
16862L, 56500L, 113282L, 65688L, 188635L, 75437L, 190601L, 54633L,
137420L, 27389L), class = "data.frame")
Here is a snippet of the df:
Trial MAF
ES8-13 -0.022
SA14-25 0.141
ES8-13 -0.035
SA14-25 0.076
ES8-13 -0.019
SA1-12 -0.064
I have produced the following kernel density plot :
p <- ggplot(df,aes(x=MAF)) +
geom_density(fill='grey') + facet_grid(Trial ~.)
p
I would like to shade both tail regions, i.e. the areas that fall outside the central 90% of the values (below the 5% quantile and above the 95% quantile). With the following command, for example, I could get the quantiles for the whole df:
qt <- quantile(df$MAF,probs=c(.05,.95))
But I rather need the quantiles for every level of the factor Trial as follows:
# Per-trial tail quantiles of MAF: one row per level of Trial.
# library() is used instead of require() so that a missing dplyr installation
# fails immediately rather than only emitting a warning.
library(dplyr)
# The example data frame defined above is `df`.
qt05 <- df %>%
  group_by(Trial) %>%
  summarise(quantile(MAF, probs = c(.05)))
qt95 <- df %>%
  group_by(Trial) %>%
  summarise(quantile(MAF, probs = c(.95)))
With those quantiles in mind I would need to shade every level of factor Trial for every facet of the graph. I found solutions for this problem but only for a singular plot case.
Could someone help me to get this done for a facet.grid case ?
I use library(ggridges) for distribution viz like this, because it has a lot of nice features, including the ability to customize quantile shading!
Here is an example without the faceting, because with this strategy you might not need to facet anymore:
library(ggridges)
# One ridge per Trial, filled by quantile region. The cut points 0.05 and 0.95
# split each density into three regions (lower tail, middle, upper tail), so
# scale_fill_manual() needs three colors and three labels, in that order.
ggplot(df, aes(x=MAF, y = Trial, fill=factor(..quantile..))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE, quantiles = c(0.05, 0.95), scale = 1) +
scale_fill_manual(values = c("#FDE725FF", "#A0A0A0A0", "#FDE725FF"),
name = NULL,
# The region above the 0.95 quantile is the upper 5%, not the upper 90%.
labels = c("lower 5%", "middle 90%", "upper 5%"))
If you still want to do the facet route, one drawback is that stat_density_ridges requires a y aesthetic. So I would do something like this to tweak the theme a bit and keep the plot looking pretty and clean (no one will ever know there is a y aes lurking in there!):
ggplot(df, aes(x=MAF, y = Trial, fill=factor(..quantile..))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE, quantiles = c(0.05, 0.95), scale = 1) +
scale_fill_manual(values = c("#FDE725FF", "#A0A0A0A0", "#FDE725FF"),
name = NULL,
labels = c("lower 5%", "middle 90%", "upper 5%")) +
facet_grid(Trial~ ., scales = "free_y") +
theme(axis.text.y = element_blank(), # clean up overhead
axis.ticks.y = element_blank())
Obviously you can tweak the colors and labels as you see fit, just make sure they make sense with the quantiles you set in the geom layer. Let me know if you have more questions.

Optimizing degrees of freedom in spline regression

I have two gene-expression time-course data sets:
First, gene expression was measured over 14 time points from 4 groups:
df1 <- structure(list(val = c(-0.1, -0.13, -0.4, -0.3, -0.3, -0.2, -0.24,
0.1, 0.2, 0.13, 0, 0.63, 0.83, 0.85, -0.07, -0.07, -0.27, -0.2,
-0.2, -0.1, 0.2, 0.1, 0.07, 0.17, 0.6, 0.75, 1.1, 1.1, -0.13,
-0.15, -0.26, -0.25, -0.14, 0.04, 0.2, 0.24, 0.23, 0.2, 0.1,
0.73, 1, 1.3, 0, 0.06, -0.24, -0.17, -0.17, -0.04, 0.16, 0.1,
0.14, 0.27, 0.34, 0.9, 0.97, 1.04),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,4L),
.Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("val","time", "group"),
row.names = c(NA, -56L), class = "data.frame")
df1$group <- factor(df1$group,levels=c("a","b","c","d"))
which looks like this (adding a loess smoothed trend line):
library(ggplot2)
ggplot(df1,aes(x=time,y=val,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
Second, gene expression was measured over similar 14 time points but now from 2 different groups, each represented by the two sexes:
df2 <- structure(list(val = c(-0.23, -0.01, -0.14, -0.01, -0.21, -0.16,
-0.24, -0.11, 0.02, -0.11, -0.01, -0.25, -0.47, -1.25, 0.02,
-0.3, -0.02, 0.14, 0.25, -0.05, 0.15, 0.11, -0.24, -0.18, -0.39,
-0.49, -0.5, -0.65, -0.06, 0.09, 0.1, 0.15, 0.08, 0.15, 0.4,
0.24, 0.07, 0.08, -0.18, -0.35, -0.19, -0.81, -0.16, 0.29, -0.05,
0.14, 0.14, 0.48, 0.34, 0.11, -0.07, -0.13, -0.41, -0.22, -0.54,
-0.76, 0.35, 0.34, -0.06, 0.21, 0.14, 0.14, 0.25, 0.22, 0.25,
0.16, 0.3, 0.44, 0.08, 0.48, 0.1, 0.16, -0.03, -0.22, 0.2, 0.01,
-0.09, -0.02, -0.01, 0.06, -0.13, 0.19, 0.11, -0.04, -0.39, 0.03,
-0.01, 0.09, 0.1, -0.14, -0.12, -0.1, 0.36, 0.08, 0.09, 0.09,
0.42, 0.37, -0.14, 0.12, 0.09, 0.03, 0.06, -0.25, 0.2, -0.06,
-0.44, 0.23, 0.03, 0.16, 0.81, 0.83),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0,1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58,5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17,4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39),
sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("a", "b"), class = "factor")), .Names = c("val", "time", "sex", "group"), row.names = c(NA, -112L), class = "data.frame")
df2$sex <- ordered(df2$sex,levels=c("M","F"))
df2$group <- ordered(df2$group,levels=c("a","b"))
df2$col <- factor(paste0(df2$group,":",df2$sex))
which looks like this (adding a loess smoothed trend line):
ggplot(df2,aes(x=time,y=val,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
For df1, I would like to estimate the effect of time on val, adjusting for group.
For df2, I would like to estimate the effect of time:group on val, adjusting for sex.
Looking at the data I thought using spline regressions would be appropriate so I used the gam function from the mgcv package, which as far as I understand optimizes the degrees of freedom of the splines fitted to the data.
This is what I fitted for df1:
mgcv1.fit <- mgcv::gam(val ~ group+s(time),data=df1)
Which gives:
Family: gaussian
Link function: identity
Formula:
val ~ group + s(time)
Estimated degrees of freedom:
7.18 total = 11.18
GCV score: 0.01258176
But 7.18 degrees of freedom seems too much for these data.
For df2:
mgcv2.fit <- mgcv::gam(val ~ sex+s(time,by=group),data=df2)
which gives:
Family: gaussian
Link function: identity
Formula:
val ~ sex + s(time, by = group)
Estimated degrees of freedom:
1.72 total = 3.72
GCV score: 0.08522094
I guess that in this case I'd imagine the degrees of freedom to be slightly higher.
One more point. Plotting the fitted values for these two data sets:
df1$mgcv <- mgcv1.fit$fitted.values
ggplot(df1,aes(x=time,y=mgcv,color=group))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
which looks fine.
But for df2
df2$mgcv <- mgcv2.fit$fitted.values
ggplot(df2,aes(x=time,y=mgcv,color=col))+geom_point()+theme_minimal()+geom_smooth(se=F)+theme(legend.position="top",legend.title=element_blank())
Looks like it flipped the group labels.
So my questions are:
Am I using mgcv::gam correctly for optimizing the spline degrees of freedom for my questions?
Does mgcv reorder the samples in its fitted.values?
First of all, mgcv does the right thing on the factor levels. If you check str(df2$sex), you will see that "M" (male) is the first level and "F" (female) is the second. But it seems from str(df2$col) that "F" is the first, so you get some mislabeling when making plot.
Secondly, your second model has not been specified correctly.
The spline s(time) is under a centering constraint when there is no "by" variable, or when the "by" variable is a factor. So you need to provide your "by" variable group as a separate term in your model formula to capture its marginal effect;
Since the "by" variable group is an ordered variable, mgcv applies contrasts on it, dropping the first level "a" when constructing the s(time, by = group). So you need to provide a separate s(time) as the baseline smooth.
Your current mgcv2.fit is a rather poor model (not surprising), giving an explained deviance of 9%. But if you do the following you get 64%.
gam(val ~ sex + s(time) + group + s(time, by = group), data = df2, method = "REML")
The ggplot now looks right (I haven't changed df2$col so the coloring is still probably reversed).
gam defaults to use "GCV.Cp" as smoothing parameter selection method. But it is recommended to use "REML" as it is less prone to overfitting.
Remark 1
If the "by" variable group is a (non-ordered) factor, it is not subject to contrasts. So the model formula should be:
val ~ sex + group + s(time, by = group)
The following is quoted from 'by' variables section of ?gam.models:
If a ‘by’ variable is a ‘factor’ then it generates an indicator
vector for each level of the factor, unless it is an ‘ordered’
factor. In the non-ordered case, the model matrix for the smooth
term is then replicated for each factor level, and each copy has
its rows multiplied by the corresponding rows of its indicator
variable. The smoothness penalties are also duplicated for each
factor level. In short a different smooth is generated for each
factor level (the ‘id’ argument to ‘s’ and ‘te’ can be used to
force all such smooths to have the same smoothing parameter).
‘ordered’ ‘by’ variables are handled in the same way, except that
no smooth is generated for the first level of the ordered factor
(see ‘b3’ example below). This is useful for setting up
identifiable models when the same smooth occurs more than once in
a model, with different factor ‘by’ variables.
Remark 2
I am not to judge your model, but there seems to be a clear within-group difference between "F" and "M". From your data we see that "F" and "M" has a bigger difference in group "b" than in group "a". At the moment the effect of sex is identical in both groups, and it is just a vertical shift. You can observe this in the above ggplot in this answer. It is up to you to decide the model in the end, but just in case that you want to model this sex-group interaction, you can do
df2$sex_group <- with(df2, interaction(sex, group)) ## the new variable is unordered
test <- gam(val ~ sex + group + s(time, by = sex_group), data = df2, method = "REML")
Note how I provide two factor variables to by. An auxiliary variable sex_group is created.

How to get geom_smooth() ignore my colour grouping

I'm trying to make a plot with fitted lines for the two levels of my factor (grouped by color). I used shapes to group by another variable, but when I try to fit a smoother, I end up with 4 lines while I only need two lines in total (1 per color).
Here is the data and code I use:
# Example data (48 rows): factors K ("2s"/"4s") and metric ("APD"/"ED"), numeric q and rarity
data <- structure(list(K = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2s", "4s"), class = "factor"),
q = c(0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04,
0.03, 0.02, 0.01, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06,
0.05, 0.04, 0.03, 0.02, 0.01, 0.12, 0.11, 0.1, 0.09, 0.08,
0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0.12, 0.11, 0.1,
0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01), rarity = c(0.907,
0.9206, 0.9359, 0.9321, 0.9405, 0.9344, 0.9449, 0.9106, 0.8844,
0.8829, 0.8989, 0.798, 0.7464, 0.8225, 0.877, 0.8521, 0.9127,
0.9317, 0.9245, 0.9595, 0.9628, 0.9573, 0.9423, 0.9428, 0.5802,
0.6414, 0.5123, 0.57, 0.587, 0.5655, 0.5231, 0.517, 0.4694,
0.5459, 0.3745, 0.3274, 0.7936, 0.7821, 0.7297, 0.7227, 0.6814,
0.6608, 0.6721, 0.6202, 0.5924, 0.5659, 0.5448, 0.6138),
metric = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("APD", "ED"
), class = "factor")), .Names = c("K", "q", "rarity", "metric"
), class = "data.frame", row.names = c(NA, -48L))
library(ggplot2)
# colour and shape are both mapped in the top-level aes(), so every layer inherits both
ggplot(data=data, aes(x=q, y=rarity, colour=metric, shape=K))+
ggtitle("Relationship")+
xlab("rate of character change")+
ylab("Correlation coefficient to average rarity")+
geom_point()+
# linear-model fit per group, drawn without a confidence band
geom_smooth(method=lm,se=FALSE)
Any advice on that?
You're getting two lines for each group because it's being split by both metric and K. You really want the shape aesthetic to only apply to the point layer, not the smooth layer. It's better just to move the aes() for that property there.
# Only colour is mapped globally, so geom_smooth() is grouped by metric alone
ggplot(data=data, aes(x=q, y=rarity, colour=metric))+
ggtitle("Relationship")+
xlab("rate of character change")+
ylab("Correlation coefficient to average rarity")+
# shape is mapped in the point layer only, so it does not split the smoother's groups
geom_point(aes(shape=K))+
# one linear fit per colour, drawn without a confidence band
geom_smooth(method=lm,se=FALSE)

Passing a String Via Index in R

I'm trying to use a for loop to pull subsets of data out of a dataframe with R.
I have a little vector to hold all the possible occurrences of the names in that column
meter_class<-c("one_s_120","nine_s_120", "nine_s_480","fortyfive_s_120", "fortyfive_s_480")
Whenever I try to address it by index reference, it fails. Either nothing in the data subset survives (NULLs everywhere), or R complains about not passing the right argument by using meter_class[1]
attach(meter_class[1])
Error in attach(meter_class[1]) : file 'one_s_120' not found
subset(cal, cal$Form==as.character(meter_class[1]))
[1] Test Amps Type Accuracy Voltage Form
<0 rows> (or 0-length row.names)
Also, here's the output of dput on the data frame cal:
structure(list(Test = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Burst", "ESD", "Inspection",
"Surge"), class = "factor"), Amps = c(15, 15, 1.5, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 15, 15,
1.5, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 15, 15, 1.5, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 15, 15, 1.5, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25), Type = structure(c(2L,
1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L,
3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L,
1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L), .Label = c("Lag - 0.5",
"Unity - Full", "Unity - Light"), class = "factor"), Accuracy = c(-0.011,
0.012, 0.027, 0.033, 0.076, 0.006, 0.052, 0.046, -0.016, 0.021,
0.008, 0.023, 0.034, 0.036, 0.038, 0.002, 0.012, 0.097, 0.055,
0.093, 0.033, 0.068, 0.048, -0.016, 0.042, 0.03, 0.035, 0.041,
0.024, 0.027, 0.004, -0.012, 0.002, 0.038, 0.084, 0.015, 0.049,
0.045, -0.009, 0.025, 0.002, 0.029, 0.03, 0.032, 0.064, 0.011,
0.024, 0.033, 0.054, 0.085, 0.027, 0.071, 0.059, 0.01, 0.051,
0.012, 0.051, 0.048, 0.04, 0.051), Voltage = c(120, 120, 120,
120, 120, 120, 480, 480, 480, 120, 120, 120, 480, 480, 480, 120,
120, 120, 120, 120, 120, 480, 480, 480, 120, 120, 120, 480, 480,
480, 120, 120, 120, 120, 120, 120, 480, 480, 480, 120, 120, 120,
480, 480, 480, 120, 120, 120, 120, 120, 120, 480, 480, 480, 120,
120, 120, 480, 480, 480), Form = structure(c(3L, 3L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("45S", "9S", "i210plus"
), class = "factor")), .Names = c("Test", "Amps", "Type", "Accuracy",
"Voltage", "Form"), class = "data.frame", row.names = c(NA, -60L
))
I know this is a simple thing to do if you know how to do it...Can anyone light the way?
Thanks!
It seems that none of the values of "meter_class" are represented in "Form" in your data frame.
unique(df$Form)
# [1] i210plus 9S 45S
meter_class %in% unique(df$Form)
# [1] FALSE FALSE FALSE FALSE FALSE
Just try two forms of subsetting, using values of "Form" actually present in the data:
subset(df, Form == "9S")
df[df$Form == "9S", ]
I also note that you wish to "pull subsets of data out of a dataframe". I don't know the full story or your objectives in doing so, but please note that there are loads of functions that allow you to perform calculations, plotting, or whatever, on subsets of your data.
Update following comment
You can subset a data frame by combining logical conditions with logical operators (see e.g. ?Extract, ?"&")
# values of Form that actually occur in the data
meter_class <- c("i210plus", "9S", "45S")
# keep only rows where both conditions hold; '&' combines them element-wise
df[df$Form == "9S" & df$Voltage == 120, ]
# or
subset(df, Form == "9S" & Voltage == 120)

Resources