I have been collaborating on this code that creates an NMDS plot and I want to add shaded polygons of the points. However, the ordihull code keeps returning the following error. Why would the argument be of length zero?
Error in if (n < 4) return(colMeans(x[-n, , drop = FALSE])) : argument is of length zero
> m1 <- metaMDS(d1)
> m2 <- metaMDS(d2)
> m3 <- metaMDS(d3)
> mdat <- data.frame(m3$points)
> mdat$site <- substr(rownames(mdat), 1, 1) mdat$col <- ifelse(mdat$site == "D", "red",
ifelse(mdat$site == "H", "blue", "green"))
> plot(mdat[,1], mdat[,2], pch=16, col=mdat$col, display = "sites",
xlab="NMDS1", ylab="NMDS2", xlim=c(-0.2, 0.2),
ylim=c(-0.2, 0.2), main= "Phylum")
> ordihull(mdat[,1], mdat[,2], display="sites", label=T,
lwd=2, draw="polygon",col= c("blue", "red", "green"))
Here is the Dput:
> structure(list(p__Proteobacteria = c(44.807, 40.907, 36.558,36.811,
39.401, 40.114, 45.911, 43.133, 30.137, 27.734, 26.722,
31.261), p__Actinobacteria = c(26.819, 34.651, 40.904, 38.847,
39.446, 37.523, 29.881, 29.251, 31.783, 23.641, 34.918, 31.308
), p__Acidobacteria = c(8.48, 6.6, 5.934, 6.609, 5.89, 7.567,
5.795, 6.666, 10.616, 10.709, 8.988, 11.794), p__Bacteroidetes =
c(7.56, 8.189, 5.363, 6.223, 4.716, 3.613, 4.65, 5.2, 4.281, 2.785,
2.808, 3.271), p__Gemmatimonadetes = c(3.529, 2.108, 1.213, 1.193,
1.541, 1.439, 1.006, 1.171, 5.794, 4.107, 4.001, 2.747),
p__Chloroflexi = c(2.686, 2.987, 2.979, 3.049, 4.128, 4.564, 5.304,
4.624, 3.669, 2.775, 4.534, 4.94), p__Bacteria_unclassified =
c(2.38, 1.869, 1.579, 1.247, 2.3, 2.108, 1.36, 1.193, 3.126, 1.885,
2.987, 2.37), p__Firmicutes = c(0.998, 0.807, 2.76, 2.962, 0.866,
1.32, 1.651, 2.073, 1.099, 1.046, 1.3, 1.302), p__Verrucomicrobia =
c(0.676, 0.404, 0.32, 0.35, 0.293, 0.239, 0.188, 0.261, 0.521,
0.726, 0.52, 0.397), p__Nitrospirae = c(0.464, 0.244, 0.198, 0.208,
0.016, 0.032, 0.024, 0.042, 0.296, 0.103, 0.229, 0.211),
p__Candidatus_Saccharibacteria = c(0.421, 0.511, 0.456, 0.552,
0.523, 0.6, 0.842, 1.016, 0.672, 0.636, 0.465, 0.736),
p__Planctomycetes = c(0.392, 0.267, 0.354, 0.285, 0.275, 0.356,
0.285, 0.276, 0.33, 0.438, 0.552, 0.365), p__Fibrobacteres = c(0.14,
0.074, 0.007, 0.009, 0.072, 0.044, 0.136, 0.079, 0.117, 0.018,
0.167, 0.065), p__Candidatus_Latescibacteria = c(0.113, 0.059,
0.017, 0.005, 0.004, 0.017, 0.015, 0.009, 0, 0.011, 0.007, 0.018
), p__Latescibacteria = c(0.085, 0.04, 0.01, 0.004, 0.012, 0.015,
0.033, 0.015, 0.012, 0.016, 0.011, 0.018), p__Cyanobacteria =
c(0.079, 0.048, 1.071, 1.372, 0.32, 0.19, 2.629, 4.689, 7.133,
22.963, 11.417, 8.767), p__Thermodesulfobacteria = c(0.068, 0.057,
0.115, 0.103, 0.008, 0.01, 0.015, 0.007, 0.01, 0.003, 0.002, 0.013),
p__Elusimicrobia = c(0.059, 0.021, 0.012, 0.001, 0.004, 0.002,
0.015, 0.017, 0, 0.002, 0.005, 0.006), p__Chlorobi = c(0.052,
0.025, 0.002, 0.012, 0.029, 0.046, 0.033, 0.04, 0.05, 0.02,
0.046, 0.025), p__Armatimonadetes = c(0.046, 0.053, 0.051,
0.072, 0.076, 0.095, 0.048, 0.053, 0.197, 0.159, 0.128, 0.125
), p__Spirochaetes = c(0.035, 0.021, 0.002, 0.001, 0, 0.002,
0.024, 0.039, 0, 0, 0, 0), p__Parcubacteria = c(0.03, 0.013,
0, 0, 0.01, 0.015, 0.042, 0.037, 0.032, 0.059, 0.053, 0.011
), p__Chlamydiae = c(0.028, 0.017, 0.046, 0.05, 0.014, 0.007,
0.021, 0.022, 0.07, 0.074, 0.08, 0.152)), class = "data.frame",
row.names = c("D15B", "D610B", "D15F", "D610F", "HR15B", "HR610B",
"HR15F", "HR610F", "C15B", "C610B", "C15F", "C610F"))
Here are the codes:
> phylum.dat <- dput
> x <- data.frame(tax=names(phylum.dat), nsites=apply(phylum.dat, 2, function(x){length(which(x>0))}))
> d1 <- vegdist(phylum.dat, method = "jaccard", binary = TRUE)
> d2 <- vegdist(log1p(phylum.dat, method = "jaccard"))
> logit_phylum <- as.matrix(phylum.dat+1)/100
> d3 <- qlogis(logit_phylum)
> d3 <- d3+abs(min(d3))
> d3 <- vegdist(d3, method = "jaccard")
> m1 <- metaMDS(d1)
> m2 <- metaMDS(d2)
> m3 <- metaMDS(d3)
> e1 <- envfit(m3, phylum.dat)
> exy <- data.frame(tax=names(phylum.dat),
> x=e1$vectors$arrows[,1],
> y=e1$vectors$arrows[,2],
> pval=e1$vectors$pvals,
> r=e1$vectors$r)
> rownames(exy) <- NULL
> exy <- exy[order(-exy$r),]
> mdat <- data.frame(m3$points)
> mdat$site <- substr(rownames(mdat), 1, 1)
> mdat$col <- ifelse(mdat$site == "D", "red",
> ifelse(mdat$site == "H", "blue", "green"))
> mdat$rad <- sqrt((mdat$MDS1^2) + (mdat$MDS2^2))
> max(mdat$rad)
> exy$x2 <- 0.17 * exy$r * exy$x
> exy$y2 <- 0.17 * exy$r * exy$y
> exy$adj <- ifelse(exy$x < 0, 1, 0)
> plot(mdat[,1], mdat[,2], pch=16, col=mdat$col,
> xlab="NMDS1", ylab="NMDS2", xlim=c(-0.2, 0.2),
> ylim=c(-0.2, 0.2), main= "Phylum")
> ordihull(mdat[,1], mdat[,2], display="sites", label=T,
> lwd=2, draw="polygon",col= c("blue", "red", "green"))
Related
I am a research student coming to grips with R for the first time.
I am trying to make a PCA plot from a series of body measurements, the specimens names and a subspecies tag (BIN) are in sperate columns. The BIN column contains the BIN ID for each sample.
The difficulty I am facing is filtering out individuals with certain BIN's.
My desired output is to produce a PCA plot identical to the one below but only displaying the named BIN's ("ACZ5516", "ADF3772") and not the remaining BIN's.
Revised image
#import data set
Anotylus<-read.csv("DataSO.csv", header = TRUE, sep = ",",
row.names = 1)
#row.names sets specimen ID as specimen name
#set BIN as factor
Anotylus$BIN<-as.factor(Anotylus$BIN)
# Number of BINs and number of individuals in each
table(Anotylus["BIN"])
#create PCA of data set, excludes column for BIN (column 12)
Ano.pca<-PCA(Anotylus[,c(1:11)], graph = FALSE)
#visualise PCA with all individuals in the d.f.
fviz_pca_ind(Ano.pca,
geom.ind = "point",
col.ind = Anotylus$BIN,
repel = TRUE,
legend.title = "BIN",
addEllipses = TRUE)
#With individuals from selected BINs
top<-list(name=c("ACZ5516", "ADF3772"))
fviz_pca_ind(Ano.pca,
geom.ind = "point",
col.ind = Anotylus$BIN,#
select.ind = top,
repel = TRUE,
legend.title = "BIN",
addEllipses = TRUE)
#no samples visible at all
#wouild like to see only the two named
I have tried using a subset of the data but the Principal Components variation changes and produces different a result.
How do I filter the individuals displayed to a curated list?
Any advice or guidance is deeply appreciated!
Best,
Dante
Sample data set below
> dput(Anotylus)
structure(list(Total.Anten.Length..mm. = c(0.66, 0.635, 0.676,
0.559, 1.249, 0.675, 0.704, 0.649, 0.661, 0.795, 0.836, 0.888,
0.941, 0.781, 0.899, 0.918, 0.854, 0.834, 0.888, 0.884, 0.879,
0.776, 0.954, 0.853, 0.96, 0.527, 0.515, 0.653, 0.491, 0.474,
0.538, 0.694, 1.01, 0.53, 0.641, 0.509, 0.918, 0.849, 0.452,
0.536), Body.Length...mm. = c(1.842, 1.664, 1.901, 1.917, 3.061,
1.961, 1.862, 1.99, 1.85, 1.449, 2.455, 2.077, 2.578, 2.478,
2.798, 2.589, 2.291, 2.882, 2.472, 2.55, 2.53, 2.757, 2.689,
2.166, 2.894, 1.944, 1.48, 2.385, 1.715, 1.674, 1.532, 2.27,
2.598, 1.677, 1.67, 1.68, 2.374, 2.877, 1.699, 1.656),
Eye.Area..mm2. = c(0.01,
0.009, 0.01, 0.006, 0.026, 0.007, 0.01, 0.01, 0.009, 0.006, 0.016,
0.014, 0.015, 0.018, 0.02, 0.016, 0.019, 0.015, 0.013, 0.011,
0.015, 0.014, 0.017, 0.014, 0.012, 0.007, 0.006, 0.02, 0.007,
0.006, 0.005, 0.013, 0.013, 0.006, 0.007, 0.005, 0.013, 0.006,
0.008, 0.005), Eye.Width..mm. = c(0.046, 0.036, 0.054, 0.033,
0.071, 0.04, 0.046, 0.047, 0.044, 0.05, 0.059, 0.053, 0.073,
0.063, 0.068, 0.051, 0.044, 0.07, 0.064, 0.061, 0.054, 0.042,
0.038, 0.059, 0.059, 0.043, 0.046, 0.079, 0.037, 0.035, 0.037,
0.054, 0.047, 0.045, 0.045, 0.028, 0.05, 0.037, 0.043, 0.045),
Head.Width..mm. = c(0.359, 0.362, 0.377, 0.317, 0.731, 0.456,
0.38, 0.414, 0.359, 0.453, 0.568, 0.449, 0.519, 0.517, 0.516,
0.515, 0.512, 0.513, 0.511, 0.456, 0.503, 0.474, 0.598, 0.453,
0.574, 0.309, 0.306, 0.574, 0.314, 0.298, 0.295, 0.386, 0.557,
0.289, 0.318, 0.306, 0.505, 0.291, 0.298, 0.263),
Pronotum.Width..mm. = c(0.413,
0.455, 0.439, 0.352, 0.741, 0.462, 0.467, 0.461, 0.442, 0.493,
0.573, 0.549, 0.584, 0.617, 0.632, 0.61, 0.614, 0.624, 0.631,
0.533, 0.587, 0.562, 0.609, 0.522, 0.621, 0.342, 0.341, 0.598,
0.336, 0.314, 0.331, 0.467, 0.547, 0.343, 0.342, 0.317, 0.545,
0.328, 0.329, 0.284), Pronotum.Length..mm. = c(0.304, 0.326,
0.334, 0.24, 0.48, 0.317, 0.303, 0.329, 0.302, 0.36, 0.418,
0.383, 0.424, 0.428, 0.399, 0.442, 0.404, 0.461, 0.435, 0.376,
0.393, 0.403, 0.373, 0.41, 0.435, 0.259, 0.247, 0.403, 0.257,
0.252, 0.23, 0.387, 0.388, 0.248, 0.26, 0.215, 0.336, 0.223,
0.231, 0.247), Elytra.Width..mm. = c(0.558, 0.552, 0.586,
0.43, 0.854, 0.506, 0.528, 0.586, 0.548, 0.54, 0.75, 0.716,
0.794, 0.816, 0.746, 0.82, 0.786, 0.8, 0.722, 0.69, 0.758,
0.766, 0.736, 0.668, 0.852, 0.468, 0.462, 0.741, 0.461, 0.323,
0.406, 0.637, 0.617, 0.41, 0.366, 0.422, 0.718, 0.42, 0.408,
0.278), Elytra.Length..mm. = c(0.469, 0.437, 0.386, 0.346,
0.631, 0.428, 0.464, 0.451, 0.445, 0.532, 0.583, 0.543, 0.558,
0.62, 0.625, 0.623, 0.613, 0.605, 0.623, 0.588, 0.606, 0.48,
0.568, 0.568, 0.598, 0.373, 0.352, 0.516, 0.365, 0.326, 0.327,
0.502, 0.464, 0.346, 0.344, 0.319, 0.519, 0.346, 0.329, 0.346
), Pronotum.Value = c(0.288, 0.319, 0.306, 0.331, 0.179,
0.278, 0.224, 0.211, 0.204, 0.273, 0.26, 0.33, 0.241, 0.218,
0.203, 0.209, 0.241, 0.227, 0.31, 0.236, 0.341, 0.288, 0.283,
0.263, 0.279, 0.173, 0.162, 0.22, 0.183, 0.209, 0.193, 0.185,
0.236, 0.181, 0.172, 0.227, 0.275, 0.164, 0.21, 0.217),
Elytra.Value = c(0.314,
0.319, 0.393, 0.243, 0.205, 0.297, 0.21, 0.205, 0.244, 0.359,
0.288, 0.335, 0.375, 0.291, 0.243, 0.238, 0.288, 0.283, 0.351,
0.271, 0.48, 0.415, 0.325, 0.294, 0.193, 0.182, 0.271, 0.237,
0.216, 0.246, 0.214, 0.193, 0.233, 0.205, 0.18, 0.262, 0.225,
0.176, 0.303, 0.251), BIN = structure(c(1L, 1L, 1L, 3L, 8L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 7L, 3L, 3L, 6L, 3L, 3L, 3L, 2L, 5L, 3L, 3L,
3L, 5L, 3L, 3L, 3L), .Label = c("ACZ5516", "ACZ5742", "ADF3772",
"ADF4138", "ADG1201", "ADH9095", "ADI3175", "ADR2790"), class =
"factor")), row.names = c("CCDB-22214-D03",
"CCDB-22214-D06", "CCDB-22214-D08", "CCDB-22214-G09", "CCDB-22214-
H02",
"CCDB-22214-H09", "CCDB-22215-A11", "CCDB-22215-A12", "CCDB-22215-
F04",
"CCDB-23850-B07", "CCDB-23851-C04", "CCDB-23851-C05", "CCDB-23851-
C11",
"CCDB-23851-C12", "CCDB-23851-D02", "CCDB-23851-D03", "CCDB-23851-
D04",
"CCDB-23851-D06", "CCDB-23851-E08", "CCDB-23851-E09", "CCDB-23851-
E11",
"CCDB-23851-F03", "CCDB-23851-G05", "CCDB-23851-G09", "CCDB-23858-
B08",
"CCDB-23858-G12", "CCDB-23858-H01", "CCDB-23859-B10", "CCDB-23859-
E07",
"CCDB-23859-E10", "CCDB-23859-E11", "CCDB-25504-E04", "CCDB-25505-
E02",
"CCDB-25510-B12", "CCDB-25510-D02", "CCDB-25510-E09", "CCDB-25511-
B06",
"CCDB-25511-B12", "CCDB-25511-E11", "CCDB-25512-E12"), class =
"data.frame")
Apparently factoextra "produces ggplot2-based elegant data visualization with less typing". From what I can tell, fviz_pca_ind is essentially plotting the PCA coordinate for each individual point, and compute a multivariate normal distribution as an ellipse.
Here's the replication of the plot you have attached in stripped down ggplot code:
#constructing a plotting data frame with the BIN identifier and each pca qualitative coordinates
df <- cbind.data.frame(BIN = Anotylus$BIN, Ano.pca$ind$coord)
ggplot(df, aes(x=Dim.1, y=Dim.2, color=BIN)) +
geom_point() +
stat_ellipse(type="norm")
Note that as there are only 1 or 2 points for all BIN other than ACZ5516 and ADF3772, there will be "Too few points to calculate an ellipse" and as such no ellipse is plotted.
In order to "hide" the other BIN in your figure, you can either just plot the BIN you wanted or you can create a new grouping (ACZ5516, ADF3772 and others) in the plotting data and set the points you do not want to focus on in less visible colour.
library(dplyr)
# Plot only BIN ACZ5516 and ADF3772
df %>%
filter(BIN %in% c("ACZ5516", "ADF3772")) %>%
ggplot(aes(x=Dim.1, y=Dim.2, color=BIN)) +
geom_point() +
stat_ellipse(type="norm")
# Create a new grouping for BIN other than ACZ5516 and ADF3772
df2 <- df %>%
mutate(BIN = ifelse(BIN %in% c("ACZ5516", "ADF3772"), as.character(BIN), "Others"))
df2 %>%
ggplot(aes(x=Dim.1, y=Dim.2, color=BIN)) +
geom_point() +
stat_ellipse(data = df %>% filter(BIN %in% c("ACZ5516", "ADF3772")), type="norm") +
scale_colour_manual(values = c("darkgreen", "orange", "gray"))
I have a dataset of volumes of 12 brain structures generated by two different methods. The raw data looks like this (just a small sample):
> dput(WT_MD_Raw[sample(nrow(WT_MD_Raw), 20), ])
structure(list(Method = c("ITKSNAP", "Stereology", "ITKSNAP",
"Stereology", "Stereology", "Stereology", "ITKSNAP", "Stereology",
"ITKSNAP", "ITKSNAP", "ITKSNAP", "Stereology", "ITKSNAP", "ITKSNAP",
"ITKSNAP", "ITKSNAP", "ITKSNAP", "ITKSNAP", "Stereology", "ITKSNAP"
), HCH_L = c(0.11, 0.157834822, 0.128, 0.119263065, 0.177391743,
0.14736469, 0.12, 0.175141504, 0.09, 0.13, 0.1, 0.171363868,
0.1159875, 0.13, 0.103, 0.122, 0.11, 0.125, 0.143858524, 0.123
), HCH_R = c(0.12, 0.16579533, 0.133, 0.124396906, 0.207531117,
0.127146973, 0.116, 0.146630096, 0.1, 0.13, 0.12, 0.170502743,
0.1241375, 0.15, 0.12, 0.133, 0.102, 0.133, 0.190865816, 0.123
), HCB_L = c(0.05, 0.076765729, 0.077, 0.063580641, 0.068459435,
0.073682345, 0.066, 0.072409924, 0.06, 0.05, 0.05, 0.073195622,
0.0585125, 0.05, 0.057, 0.061, 0.055, 0.049, 0.081047056, 0.063
), HCB_R = c(0.07, 0.081317642, 0.083, 0.085300735, 0.074917872,
0.078175171, 0.07, 0.074672734, 0.06, 0.05, 0.05, 0.082237434,
0.061475, 0.05, 0.056, 0.052, 0.049, 0.063, 0.05835388, 0.064
), HCT_L = c(0.03, 0.041923225, 0.049, 0.042650368, 0.037028374,
0.043580411, 0.05, 0.039372896, 0.04, 0.03, 0.02, 0.038750623,
0.0379125, 0.05, 0.035, 0.024, 0.046, 0.037, 0.052680586, 0.037
), HCT_R = c(0.03, 0.036264039, 0.044, 0.018560808, 0.027125436,
0.035493325, 0.049, 0.03348959, 0.05, 0.02, 0.04, 0.039181186,
0.0344, 0.04, 0.032, 0.026, 0.048, 0.027, 0.055922468, 0.033),
ERC_L = c(0.095, 0.193585925, 0.124, 0.140588249, 0.200211554,
0.172524515, 0.108, 0.206368284, 0.113, 0.1, 0.116, 0.289768551,
0.112275, 0.129, 0.114, 0.103, 0.128, 0.113, 0.166551699,
0.139), ERC_R = c(0.094, 0.191115764, 0.121, 0.160728701,
0.183419618, 0.204872861, 0.109, 0.234427129, 0.117, 0.117,
0.113, 0.304407675, 0.113825, 0.16, 0.104, 0.099, 0.12, 0.111,
0.184787287, 0.142), PRC_L = c(0.138, 0.124528754, 0.154,
0.109785206, 0.140363369, 0.115016343, 0.204, 0.143914724,
0.131, 0.122, 0.164, 0.096876559, 0.1371125, 0.195, 0.139,
0.117, 0.167, 0.118, 0.106171643, 0.112), PRC_R = c(0.151,
0.129179281, 0.124, 0.131900211, 0.145099557, 0.121755582,
0.146, 0.122644309, 0.115, 0.117, 0.164, 0.102904433, 0.1369875,
0.181, 0.154, 0.12, 0.18, 0.117, 0.141021877, 0.131), PHC_L = c(0.182,
0.212439273, 0.237, 0.182448795, 0.196767055, 0.200829318,
0.184, 0.17197357, 0.153, 0.134, 0.147, 0.185141868, 0.1674875,
0.171, 0.174, 0.193, 0.199, 0.148, 0.20423858, 0.201), PHC_R = c(0.172,
0.193777133, 0.205, 0.190347011, 0.201933804, 0.160843167,
0.178, 0.196411919, 0.151, 0.122, 0.146, 0.182989055, 0.1556125,
0.153, 0.14, 0.169, 0.182, 0.157, 0.18965011, 0.192)), row.names = c(54L,
161L, 14L, 123L, 148L, 81L, 13L, 93L, 50L, 56L, 39L, 91L, 80L,
5L, 72L, 64L, 33L, 58L, 135L, 18L), class = "data.frame")
Where the columns HCH_L, HCH_R, HCB_L, etc... are the structures, and Method is the method used to generate the volumes for each participant. I wanted a plot which would show the mean volumes for each method, paired together, for each structure. I had already calculated the means for each structure per method, so I just melt this and the plot is exactly what I want:
ggplot(data = reshape2::melt(WT_MD)) +
geom_bar(mapping = aes(x = variable, y = value, fill = Method), stat = "identity", position = "dodge")
I conducted some pairwise comparisons between the volumes each method generated for each structure using Mann-Whitney U tests. I'd like to show with an asterisk above each pair of bars (e.g., between HCH_L for ITK and Stereo) whether the volumes are significantly different, but am struggling with this. I tried with ggsignif, but I don't know how to express what I want:
ggplot(data = reshape2::melt(WT_MD)) +
geom_bar(mapping = aes(x = variable, y = value, fill = Method), stat = "identity", position = "dodge") +
geom_signif(comparisons = list(c("ITKSNAP", "Stereology")), map_signif_level = TRUE)
This gives me an error saying that it can't do comparisons for variables mapped to other aesthetics than the x-axis, which is fair enough:
Error in f(...) :
Can only handle data with groups that are plotted on the x-axis
I'm trying to compare across levels of one variable (method) within levels of another variable (structure), so it's unavoidable that one of these will be mapped to a different aesthetic. I've been looking at this all morning and am starting to get tunnel-vision - can anybody help with this please?
My data looks like this:
> dput(head(CORt, 5))
structure(list(rDate = structure(c(1438019100, 1438019400, 1438019700,
1438020000, 1438020300), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
I630 = c(0.536, 0.506, 0.419, 0.456, 0.427), I800 = c(0.414,
0.388, 0.339, 0.351, 0.331), I532 = c(0.547, 0.534, 0.463,
0.488, 0.464), I570 = c(0.522, 0.508, 0.467, 0.468, 0.445
), WR630 = c(0.0127, 0.0573, 0.0083, 0.0057, 0.0053), WR800 = c(0.0144,
0.0506, 0.0249, 0.0163, 0.0159), WR532 = c(0.0139, 0.0394,
0.006, 0.005, 0.0049), WR570 = c(0.0176, 0.0379, 0.0094,
0.0054, 0.0049), NR630 = c(0.006, 0.034, 0.006, 0.004, 0.004
), NR800 = c(0.007, 0.04, 0.019, 0.02, 0.019), NR532 = c(0.007,
0.072, 0.01, 0.007, 0.007), NR570 = c(0.009, 0.077, 0.008,
0.007, 0.007), ER630 = c(0.0351, 0.0746, 0.0116, 0.0055,
0.0052), ER800 = c(0.0278, 0.0596, 0.03, 0.0324, 0.0303),
ER532 = c(0.04, 0.085, 0.013, 0.008, 0.008), ER570 = c(0.034,
0.083, 0.013, 0.009, 0.008)), row.names = c(NA, 5L), class = "data.frame")
In the CORt dataframe when the values of WR630 > I630 I want to turn all values of that row(s) into NA but I want to preserve the rDate column dates and the ER532 values of that row(s).
I have been using this code (example):
which(CORt$WR630>CORt$I630)
CORt[c(7632, 12530, 13684, 14260, 18295, 19735, 23770, 24634, 27529, 44055), setdiff(names(CORt), c("rDate", "ER532"))] <- NA
but this is not handy when I have 200 lines, for example. I'm looking for a code that will turn the row values when WR630 > I630 into NA directly.
Any help is much appreciated.
You can use the which command instead of typing output all the row numbers manually.
CORt[which(CORt$WR630>CORt$I630),setdiff(names(CORt), c("rDate", "ER532"))] <- NA
If you don't have any missing values in the data you can also skip which.
CORt[CORt$WR630>CORt$I630,setdiff(names(CORt), c("rDate", "ER532"))] <- NA
How does this work for you?
nrow(data) %>% map(
.f = function(i) {
if (data[i,"WR630"] > data[i,"I630"] ) {
data[i,-c(1,16)] <- NA
}
data
}
)
Please note that it uses index numbering, instead of names to avoid setting rDate and ER532 to NA. In the data you provided, I didnt find any cases where you condition held true, so I tested it reversely to be certain it works.
I am trying to plot a histogram with a custom colour palette. The problem arises when I set the xlim of the histogram.
Please see below the reproducible example:
# sample dataframe
test_dt <- structure(list(col_1 = c(0.057, -0.063, -0.319, 0.02, 0.079,
0.007, -0.105, -0.084, 0.019, 0.28, -0.064, -0.243, -0.116, 0.079,
0.07, -0.187, -0.725, 0.134, 0.062, -0.056, -0.074, 0.392, -0.014,
-0.062, 0.214, 0.371, 0.069, -0.03, 0.036, -0.175, 0.097, 0.358,
0.153, -0.092, -0.038, -0.051, 0.017, -0.108, 0.133, 0.105, 0.187,
-0.056, -0.316, 0.15, -0.142, 0.076, 0.242, -0.069, 0.155, 0.214,
0.162, -0.037, -0.109, 0.111, -0.077, -0.435, 0.003, 0.187, 0.134,
0.027, 0.107, 0.175, -0.355, -0.572, 0.038, -0.209, -0.263, -0.147,
-0.23, -0.174, 0.203, -0.118, 0.008, -0.268, -0.001, 0.227, -0.019,
0.08, 0.044, -0.065, -0.131, 0.093, 0.127, -0.131, 0.039, 0.045,
0.032, 0.343, 0.053, -0.033, 0.453, 0.07, -0.225, 0.094, 0.002,
-0.119, 0.014, -0.125, 0.003, -0.48)), row.names = c(NA, -100L
), class = "data.frame")
# colour palette
RBW <- colorRampPalette(c("darkred","white","darkblue"))
# plot histogram without xlim
ggplot(test_dt) +
geom_histogram(aes(x=col_1),
position = "identity",
bins = 60,
color = "grey10",
fill = RBW(60))
When I run the following lines is when I get the error:
Aesthetics must be either length 1 or the same as the data
# plot histogram with xlim
ggplot(test_dt) +
geom_histogram(aes(x=col_1),
position = "identity",
bins = 60,
color = "grey10",
fill = RBW(60)) +
xlim(-2,2)
instead of xlim, add + coord_cartesian(xlim = c(-2,2))
library(ggplot2)
``` r
ggplot(test_dt) +
geom_histogram(aes(x=col_1),
position = "identity",
bins = 60,
color = "grey10",
fill = RBW(60)) +
coord_cartesian(xlim = c(-2,2))
Created on 2020-02-11 by the reprex package (v0.3.0)
This is my data frame, with two columns Y (response) and X (covariate):
## Editor edit: use `dat` not `data`
dat <- structure(list(Y = c(NA, -1.793, -0.642, 1.189, -0.823, -1.715,
1.623, 0.964, 0.395, -3.736, -0.47, 2.366, 0.634, -0.701, -1.692,
0.155, 2.502, -2.292, 1.967, -2.326, -1.476, 1.464, 1.45, -0.797,
1.27, 2.515, -0.765, 0.261, 0.423, 1.698, -2.734, 0.743, -2.39,
0.365, 2.981, -1.185, -0.57, 2.638, -1.046, 1.931, 4.583, -1.276,
1.075, 2.893, -1.602, 1.801, 2.405, -5.236, 2.214, 1.295, 1.438,
-0.638, 0.716, 1.004, -1.328, -1.759, -1.315, 1.053, 1.958, -2.034,
2.936, -0.078, -0.676, -2.312, -0.404, -4.091, -2.456, 0.984,
-1.648, 0.517, 0.545, -3.406, -2.077, 4.263, -0.352, -1.107,
-2.478, -0.718, 2.622, 1.611, -4.913, -2.117, -1.34, -4.006,
-1.668, -1.934, 0.972, 3.572, -3.332, 1.094, -0.273, 1.078, -0.587,
-1.25, -4.231, -0.439, 1.776, -2.077, 1.892, -1.069, 4.682, 1.665,
1.793, -2.133, 1.651, -0.065, 2.277, 0.792, -3.469, 1.48, 0.958,
-4.68, -2.909, 1.169, -0.941, -1.863, 1.814, -2.082, -3.087,
0.505, -0.013, -0.12, -0.082, -1.944, 1.094, -1.418, -1.273,
0.741, -1.001, -1.945, 1.026, 3.24, 0.131, -0.061, 0.086, 0.35,
0.22, -0.704, 0.466, 8.255, 2.302, 9.819, 5.162, 6.51, -0.275,
1.141, -0.56, -3.324, -8.456, -2.105, -0.666, 1.707, 1.886, -3.018,
0.441, 1.612, 0.774, 5.122, 0.362, -0.903, 5.21, -2.927, -4.572,
1.882, -2.5, -1.449, 2.627, -0.532, -2.279, -1.534, 1.459, -3.975,
1.328, 2.491, -2.221, 0.811, 4.423, -3.55, 2.592, 1.196, -1.529,
-1.222, -0.019, -1.62, 5.356, -1.885, 0.105, -1.366, -1.652,
0.233, 0.523, -1.416, 2.495, 4.35, -0.033, -2.468, 2.623, -0.039,
0.043, -2.015, -4.58, 0.793, -1.938, -1.105, 0.776, -1.953, 0.521,
-1.276, 0.666, -1.919, 1.268, 1.646, 2.413, 1.323, 2.135, 0.435,
3.747, -2.855, 4.021, -3.459, 0.705, -3.018, 0.779, 1.452, 1.523,
-1.938, 2.564, 2.108, 3.832, 1.77, -3.087, -1.902, 0.644, 8.507
), X = c(0.056, 0.053, 0.033, 0.053, 0.062, 0.09, 0.11, 0.124,
0.129, 0.129, 0.133, 0.155, 0.143, 0.155, 0.166, 0.151, 0.144,
0.168, 0.171, 0.162, 0.168, 0.169, 0.117, 0.105, 0.075, 0.057,
0.031, 0.038, 0.034, -0.016, -0.001, -0.031, -0.001, -0.004,
-0.056, -0.016, 0.007, 0.015, -0.016, -0.016, -0.053, -0.059,
-0.054, -0.048, -0.051, -0.052, -0.072, -0.063, 0.02, 0.034,
0.043, 0.084, 0.092, 0.111, 0.131, 0.102, 0.167, 0.162, 0.167,
0.187, 0.165, 0.179, 0.177, 0.192, 0.191, 0.183, 0.179, 0.176,
0.19, 0.188, 0.215, 0.221, 0.203, 0.2, 0.191, 0.188, 0.19, 0.228,
0.195, 0.204, 0.221, 0.218, 0.224, 0.233, 0.23, 0.258, 0.268,
0.291, 0.275, 0.27, 0.276, 0.276, 0.248, 0.228, 0.223, 0.218,
0.169, 0.188, 0.159, 0.156, 0.15, 0.117, 0.088, 0.068, 0.057,
0.035, 0.021, 0.014, -0.005, -0.014, -0.029, -0.043, -0.046,
-0.068, -0.073, -0.042, -0.04, -0.027, -0.018, -0.021, 0.002,
0.002, 0.006, 0.015, 0.022, 0.039, 0.044, 0.055, 0.064, 0.096,
0.093, 0.089, 0.173, 0.203, 0.216, 0.208, 0.225, 0.245, 0.23,
0.218, -0.267, 0.193, -0.013, 0.087, 0.04, 0.012, -0.008, 0.004,
0.01, 0.002, 0.008, 0.006, 0.013, 0.018, 0.019, 0.018, 0.021,
0.024, 0.017, 0.015, -0.005, 0.002, 0.014, 0.021, 0.022, 0.022,
0.02, 0.025, 0.021, 0.027, 0.034, 0.041, 0.04, 0.038, 0.033,
0.034, 0.031, 0.029, 0.029, 0.029, 0.022, 0.021, 0.019, 0.021,
0.016, 0.007, 0.002, 0.011, 0.01, 0.01, 0.003, 0.009, 0.015,
0.018, 0.017, 0.021, 0.021, 0.021, 0.022, 0.023, 0.025, 0.022,
0.022, 0.019, 0.02, 0.023, 0.022, 0.024, 0.022, 0.025, 0.025,
0.022, 0.027, 0.024, 0.016, 0.024, 0.018, 0.024, 0.021, 0.021,
0.021, 0.021, 0.022, 0.016, 0.015, 0.017, -0.017, -0.009, -0.003,
-0.012, -0.009, -0.008, -0.024, -0.023)), .Names = c("Y", "X"
), row.names = c(NA, -234L), class = "data.frame")
With this I run a OLS regression: lm(dat[,1] ~ dat[,2]).
At a set of values: X = quantile(dat[,2], c(0.1, 0.5, 0.7)), I would like to plot a graph similar to the following, with conditional density P(Y|X) displaying along the regression line.
How can I do this in R? Is it even possible?
I call your dataset dat. Don't use data as it masks R function data.
dat <- na.omit(dat) ## retain only complete cases
## use proper formula rather than `$` or `[,]`;
## otherwise you get trouble in prediction with `predict.lm`
fit <- lm(Y ~ X, dat)
## prediction point, as given in your question
xp <- quantile(dat$X, probs = c(0.1, 0.5, 0.7), names = FALSE)
## make prediction and only keep `$fit` and `$se.fit`
pred <- predict.lm(fit, newdata = data.frame(X = xp), se.fit = TRUE)[1:2]
#$fit
# 1 2 3
#0.20456154 0.14319857 0.00678734
#
#$se.fit
# 1 2 3
#0.2205000 0.1789353 0.1819308
To understand the theory behind the following, read Plotting conditional density of prediction after linear regression. Now I am to use mapply function to apply the same computation to multiple points:
## a function to make 101 sample points from conditional density
f <- function (mu, sig) {
x <- seq(mu - 3.2 * sig, mu + 3.2 * sig, length = 101)
dx <- dnorm(x, mu, sig)
cbind(x, dx)
}
## apply `f` to all `xp`
lst <- mapply(f, pred[[1]], pred[[2]], SIMPLIFY = FALSE)
## To plot rotated density curve, we basically want to plot `(dx, x)`
## but scaling `(alpha * dx, x)` is needed for good scaling with regression line
## Also to plot rotated density along the regression line,
## a shift is needed: `(alpha * dx + xp, x)`
## The following function adds rotated, scaled density to a regression line
## a "for-loop" is used for readability, with no loss of efficiency.
## (make sure there is an existing plot; otherwise you get `plot.new` error!!)
addrsd <- function (xp, lst, alpha = 1) {
for (i in 1:length(xp)) {
x0 <- xp[i]; mat <- lst[[i]]
dx. <- alpha * mat[, 2] + x0 ## rescale and shift
x. <- mat[, 1]
lines(dx., x., col = "gray") ## rotate and plot
segments(x0, x.[1], x0, x.[101], col = "gray") ## a local axis
}
}
Now let's see the picture:
## This is one simple way to draw the regression line
## A better way is to generate and grid and predict on the grid
## In later example I will show this
plot(dat$X, fit$fitted, type = "l", ylim = c(-0.6, 1))
## we try `alpha = 0.01`;
## you can also try `alpha = 1` in raw scale to see what it looks like
addrsd(xp, lst, 0.01)
Note, we have only scaled the height of the density, not its span. The span sort of implies confidence band, and should not be scaled. Consider further overlaying confidence band on the plot. If the use of matplot is not clear, read How do I change colours of confidence interval lines when using matlines for prediction plot?.
## A grid is necessary for nice regression plot
X.grid <- seq(min(dat$X), max(dat$X), length = 101)
## 95%-CI based on t-statistic
CI <- predict.lm(fit, newdata = data.frame(X = X.grid), interval = "confidence")
## use `matplot`
matplot(X.grid, CI, type = "l", col = c(1, 2, 2), lty = c(1, 2, 2))
## add rotated, scaled conditional density
addrsd(xp, lst, 0.01)
You see that the span of the density curve agrees with the confidence ribbon.