Averaging the replicate data in omics / biostatistics in R

I have a dataframe for gene expression data. Samples are named as Genotype_Time_Replicate (e.g. AOX_1h_4).
Example data set:
df <- structure(list(ID = c("AT5G54740.1", "AT5G55730.2", "AT5G57655.2", "AT5G64100.1", "AT5G64260.1", "AT5G67360.1", "AT1G30630.1", "AT1G62380.1", "AT1G70830.1", "AT3G14990.1", "AT4G18800.1", "AT4G24510.1", "AT5G15650.1", "AT5G19820.1", "AT5G59840.1", "AT5G47200.1", "AT1G12840.1", "AT1G76030.1", "AT1G78900.2", "AT3G42050.1", "AT4G11150.1", "AT1G11860.2", "AT1G17290.1" ),
Location = c("extracellular", "extracellular", "extracellular", "extracellular", "extracellular", "extracellular", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "ER", "ER", "ER", "mitochondrion", "mitochondrion", "mitochondrion", "mitochondrion", "mitochondrion"),
AOX_1h_1 = c(0.844651873, 0.50954096, 1.12e-08, 0.012981372, 0.978148381, 0.027579578, 0.068010151, 0.410629215, 0.253838635, 0.033631788, 0.335713512, 0.982799013, 0.025910457, 0.793810264, 0.762431665, 0.152154436, 0.027114103, 0.000227, 1.07e-05, 0.721209032, 0.086281162, 0.483130711, 0.014795515),
AOX_1h_2 = c(0.894623378, 0.011521413, 1.62e-06, 0.085249729, 0.02863972, 0.956962154, 0.225208718, 0.932679767, 0.002574192, 0.071700671, 0.233682544, 0.936572874, 1.12e-05, 0.241658735, 0.865205515, 0.000537, 0.103471292, 8.66e-07, 1.22e-08, 0.950878446, 0.145012176, 0.092919172, 0.599713247),
AOX_1h_3 = c(0.880951025, 0.00145276, 8.59e-10, 0.087023475, 0.675527672, 0.765543306, 0.305860948, 0.899172011, 0.020973476, 0.542988545, 0.735571562, 0.157569324, 0.025488075, 0.071006507, 0.262324019, 0.080470612, 0.0436526, 6.65e-09, 5.63e-10, 0.020557091, 0.069577215, 0.005502212, 0.852099232),
AOX_1h_4 = c(0.980823252, 0.158123518, 0.00210702, 0.006317657, 0.30496173, 0.489709702, 0.091469807, 0.958443361, 0.015583593, 0.566165972, 0.66746161, 0.935102341, 0.087733288, 0.744313619, 0.021169383, 0.633250945, 0.257489406, 0.024345088, 0.000355, 0.226279179, 0.004038493, 0.479275204, 0.703522761),
AOX_2h_1 = c(0.006474022, 0.246530998, 5.38e-06, 0.47169153, 0.305973663, 0.466202566, 0.191733645, 0.016121487, 0.234839116, 0.043866023, 0.089819656, 0.107934599, 2.09e-06, 0.413229678, 0.464078018, 0.004118766, 0.774970986, 3.79e-07, 2.3e-10, 0.428591262, 0.002326292, 0.385580707, 0.106216066),
AOX_2h_2 = c(0.166169729, 0.005721199, 7.77e-08, 0.099146712, 0.457164663, 0.481987525, 7.4e-05, 0.969805081, 0.100894997, 0.062103337, 0.095718425, 0.001686206, 0.009710516, 0.134651787, 0.887036569, 0.459218152, 0.074576369, 3.88e-09, 3.31e-15, 0.409645805, 0.064874307, 0.346371524, 0.449444779),
AOX_2h_3 = c(1.06e-05, 0.576589898, 4.03e-08, 0.787468189, 0.971119601, 0.432593753, 0.000274, 0.86932399, 0.08657663, 4.22e-06, 0.071190008, 0.697384316, 0.161623604, 0.422628778, 0.299545652, 0.767867006, 0.00295567, 0.078724176, 4.33e-09, 0.988576028, 0.080278831, 0.66505527, 0.014158693),
AOX_2h_4 = c(0.010356719, 0.026506539, 9.48e-09, 0.91009296, 0.302464488, 0.894377768, 0.742233323, 0.75032613, 0.175841127, 0.000721, 0.356904918, 0.461234653, 1.08e-05, 0.65800831, 0.360085919, 0.004814238, 0.174670947, 0.004246734, 7.31e-11, 0.778725214, 0.051334623, 0.10212841, 0.155831664 ),
AOX_6h_1 = c(0.271681878, 0.004822226, 1.87e-11, 0.616969208, 0.158860224, 0.684690326, 0.011798791, 0.564591916, 0.000314, 4.79e-06, 0.299871385, 0.001909713, 0.00682428, 0.039107415, 0.574143284, 0.061532691, 0.050483892, 2.28e-08, 1.92e-12, 0.058747794, 0.027147473, 0.196608218, 0.513693112),
AOX_6h_2 = c(5.72e-12, 0.719814288, 0.140016259, 0.927094438, 0.841229414, 0.224510089, 0.026567282, 0.242981965, 0.459311076, 0.038295888, 0.127935565, 0.453746728, 0.005023732, 0.554532387, 0.280899096, 0.336458018, 0.002024021, 0.793915731, 0.012838565, 0.873716549, 0.10097853, 0.237426815, 0.003711539),
AOX_6h_3 = c(3.16e-12, 0.780424491, 0.031315419, 0.363891436, 0.09562579, 0.104833988, 3.52e-05, 0.104196756, 0.870952423, 0.002036134, 0.016480622, 0.671475063, 2.3e-05, 0.00256744, 0.66263641, 0.005026601, 0.57280276, 0.058724117, 6.4e-10, 0.030965264, 0.005301006, 0.622027012, 0.371659724),
AOX_6h_4 = c(7.99e-10, 0.290847169, 0.001319424, 0.347344795, 0.743846306, 0.470908425, 0.00033, 0.016149973, 0.080036584, 0.020899676, 0.00723071, 0.187288769, 0.042514886, 0.00150443, 0.059344154, 0.06554177, 0.112601764, 0.000379, 2.36e-10, 0.78131093, 0.105861995, 0.174370801, 0.05570041 ),
WT_1h_1 = c(0.857, 0.809, 2.31e-05, 0.286, 0.87, 0.396, 0.539, 0.787, 0.73, 0.427, 0.764, 0.87, 0.386, 0.852, 0.848, 0.661, 0.393, 0.0415, 0.00611, 0.843, 0.576, 0.804, 0.304 ),
WT_1h_2 = c(0.898, 0.509, 0.0192, 0.729, 0.616, 0.902, 0.811, 0.9, 0.343, 0.712, 0.814, 0.901, 0.0446, 0.816, 0.896, 0.217, 0.747, 0.0143, 0.000964, 0.901, 0.776, 0.737, 0.876 ),
WT_1h_3 = c(0.939, 0.627, 0.0104, 0.867, 0.932, 0.935, 0.91, 0.939, 0.803, 0.926, 0.934, 0.888, 0.813, 0.859, 0.905, 0.864, 0.838, 0.0223, 0.00917, 0.802, 0.858, 0.724, 0.938 ),
WT_1h_4 = c(0.911, 0.782, 0.298, 0.396, 0.837, 0.871, 0.727, 0.91, 0.506, 0.88, 0.89, 0.909, 0.723, 0.896, 0.547, 0.887, 0.824, 0.566, 0.175, 0.814, 0.348, 0.869, 0.893),
WT_2h_1 = c(0.748, 0.911, 0.231, 0.929, 0.917, 0.928, 0.903, 0.801, 0.909, 0.849, 0.878, 0.884, 0.183, 0.925, 0.928, 0.719, 0.941, 0.108, 0.00817, 0.926, 0.678, 0.923, 0.884),
WT_2h_2 = c(0.935, 0.851, 0.163, 0.925, 0.951, 0.952, 0.63, 0.963, 0.926, 0.916, 0.925, 0.804, 0.868, 0.931, 0.961, 0.951, 0.92, 0.0706, 0.000265, 0.95, 0.917, 0.947, 0.951),
WT_2h_3 = c(0.0197, 0.894, 0.000613, 0.911, 0.922, 0.877, 0.122, 0.916, 0.739, 0.0125, 0.718, 0.905, 0.801, 0.875, 0.852, 0.91, 0.302, 0.729, 0.00015, 0.923, 0.731, 0.902, 0.504),
WT_2h_4 = c(0.696, 0.765, 0.0142, 0.931, 0.893, 0.931, 0.925, 0.925, 0.87, 0.45, 0.899, 0.908, 0.144, 0.921, 0.899, 0.631, 0.87, 0.62, 0.0014, 0.926, 0.807, 0.844, 0.865),
WT_6h_1 = c(0.898, 0.727, 0.00395, 0.921, 0.881, 0.924, 0.776, 0.919, 0.542, 0.234, 0.901, 0.67, 0.747, 0.83, 0.919, 0.848, 0.841, 0.056, 0.00144, 0.846, 0.815, 0.888, 0.916),
WT_6h_2 = c(2.38e-09, 0.88, 0.708, 0.898, 0.891, 0.768, 0.443, 0.777, 0.843, 0.505, 0.695, 0.842, 0.208, 0.859, 0.794, 0.813, 0.14, 0.887, 0.326, 0.894, 0.661, 0.775, 0.182),
WT_6h_3 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
WT_6h_4 = c(0.0357, 0.953, 0.792, 0.956, 0.967, 0.96, 0.711, 0.892, 0.931, 0.899, 0.866, 0.946, 0.917, 0.799, 0.925, 0.927, 0.938, 0.72, 0.025, 0.967, 0.936, 0.945, 0.923)),
class = "data.frame", row.names = c(NA, -23L))
I want to summarize the data for each organelle (averaged over the organelle and the samples' replicates) and plot the wild-type (WT) and mutant (AOX) data side by side, with standard errors, for each time point.
library(reshape2)   # melt()
library(stringr)    # str_replace_all()
library(ggpubr)     # ggbarplot(), stat_compare_means()

melted <- melt(df)
head(melted)
melted$variable <- str_replace_all(melted$variable, '_[0-9]$', '')
melted$variable <- factor(melted$variable,
                          levels = c("WT_1h", "AOX_1h", "WT_2h", "AOX_2h", "WT_6h", "AOX_6h"))
my_comparisons <- list(c("WT_1h", "AOX_1h"), c("WT_2h", "AOX_2h"), c("WT_6h", "AOX_6h"))
ggbarplot(melted, x = "variable", y = "value", add = "mean_se",
          color = "variable", palette = c("grey", "black", "grey", "black", "grey", "black"),
          facet.by = "Location") +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")
How can I use the tidyverse (dplyr / tidyr) to follow this workflow instead of the scripts above?

You can use different functions to reshape this data. In this example I use gather() alongside stringr functions to split the sample names, which encode three variables (genotype, time point, replicate), into separate columns.
library(tidyverse)

df %>%
  gather(key, value, -ID, -Location) %>%
  mutate(type = map_chr(str_split(key, "_"), ~ .x[1]),
         hour = map_chr(str_split(key, "_"), ~ .x[2]),
         n    = map_chr(str_split(key, "_"), ~ .x[3])) %>%
  group_by(type, hour) %>%
  summarise(mean = mean(value))
This gives:
# A tibble: 6 x 3
# Groups: type [?]
type hour mean
<chr> <chr> <dbl>
1 AOX 1h 0.3235302
2 AOX 2h 0.2709910
3 AOX 6h 0.2226648
4 WT 1h 0.6633866
5 WT 2h 0.7263108
6 WT 6h 0.7915662
You can use this in ggplot() to make a nice barplot.
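For example, a minimal sketch of such a plot (assuming df from the question and the tidyverse is loaded; the se column is something I am adding here, computed as sd/sqrt(n)):

df %>%
  gather(key, value, -ID, -Location) %>%
  mutate(type = map_chr(str_split(key, "_"), ~ .x[1]),
         hour = map_chr(str_split(key, "_"), ~ .x[2])) %>%
  group_by(type, hour) %>%
  summarise(mean = mean(value),
            se   = sd(value) / sqrt(n())) %>%   # standard error of the mean
  ggplot(aes(x = hour, y = mean, fill = type)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
                position = position_dodge(width = 0.9), width = 0.2)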
To get it as a wide table you can use:
df %>%
  gather(key, value, -ID, -Location) %>%
  mutate(type = map_chr(str_split(key, "_"), ~ .x[1]),
         hour = map_chr(str_split(key, "_"), ~ .x[2]),
         n    = map_chr(str_split(key, "_"), ~ .x[3])) %>%
  group_by(type, hour) %>%
  summarise(mean = mean(value)) %>%
  spread(type, mean)
to get
# A tibble: 3 x 3
hour AOX WT
* <chr> <dbl> <dbl>
1 1h 0.3235302 0.6633866
2 2h 0.2709910 0.7263108
3 6h 0.2226648 0.7915662
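Since the question also asks for summaries per organelle, here is a hedged sketch of the same pipeline grouped by Location as well, using the newer pivot_longer()/separate() replacements for gather() and the str_split() calls:

df %>%
  pivot_longer(-c(ID, Location), names_to = "sample") %>%
  separate(sample, into = c("type", "hour", "rep"), sep = "_") %>%
  group_by(Location, type, hour) %>%
  summarise(mean = mean(value),
            se   = sd(value) / sqrt(n()),   # standard error across genes and replicates
            .groups = "drop")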

Another version going from the df object:
The df object here is a list, and the expression values are character type after cbind, so you can do:
tb <- as_tibble(do.call(cbind, df)) %>%
  mutate_at(3:14, as.numeric)
NB: for gene expression data it is usually easier to read in the count data with read_tsv() or read.table() and combine it into a matrix, data.frame, or tibble.
NB also: the df object as specified has no "WT" samples (from my copy/paste, anyway), so I renamed the last 4 samples in tb as "WT_1h" replicates:
colnames(tb)[11:14] <- paste0("WT_1h_", 1:4)
Create means from the replicates with a helper function:
rowMeanNrep <- function(tb, nm){
  varname <- paste0(nm, "_mean")
  selectn <- grep(nm, colnames(tb))
  tb %>%
    dplyr::mutate(!!varname := rowMeans(dplyr::select(., !!selectn)))
}
Specify which timepoints to use, and apply:
tps <- c("AOX_1h", "WT_1h")
tb_1h_mean <- cbind(tb[, 1:2],
                    do.call(cbind, lapply(tps, function(f){
                      rowMeanNrep(tb = tb, nm = f) %>%
                        dplyr::select(paste0(f, "_mean"))
                    })))
A final NB: think about using boxplots instead of barplots; see this paper.
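For instance, a quick boxplot version of the question's faceted plot (a sketch reusing the melted data from the question):

library(ggplot2)

ggplot(melted, aes(x = variable, y = value)) +
  geom_boxplot() +
  facet_wrap(~ Location)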

Related

Change distance between ticks on axis with DateTime variable

I have made a plot with ggplot where the x-axis is a DateTime variable. I would like to increase the distance between the ticks on the x-axis, essentially making the plot longer without changing the actual limits (the limits are two dates, which appear correctly on the plot). For visibility reasons I need more distance between each point/tick on the x-axis, so that the plotted change is easier to follow.
plot2 <- ggplot(Data, aes(DateTime, group = 1)) +
geom_line(aes(y = Change, colour = "Change"))
I have tried using a function I found online that makes ggplot display all of the ticks over a specified interval. However, this did not expand the actual distance between the ticks; it just added more ticks. The function used is shown below.
breaks = function(x)
seq.Date(from = min(x),
to = max(x),
by = "10 days")
minor_breaks = function(x)
seq.Date(from = min(x),
to = max(x),
by = "1 day")
plot2 +
  scale_x_datetime(date_breaks = "15 days",
                   date_minor_breaks = "1 day")
Any ideas how I can expand the distance between ticks rather than simply adding more ticks or expanding the limits of the plot?
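For what it's worth, saving the plot to a wider graphics device does produce the stretching I mean (a sketch; the file name and dimensions are just examples):

# A wider device stretches the x-axis, so the ticks sit farther apart
ggsave("plot2_wide.png", plot = plot2, width = 20, height = 5, units = "in")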
Here is the data:
structure(list(DateTime = structure(c(1670415374, 1670506704,
1670511629, 1670517043, 1670523367, 1670528144, 1670584731, 1670587219,
1670594506, 1670598044, 1670602687, 1670612571, 1670615016, 1670669321,
1670674548, 1670685813, 1670696903, 1670700710, 1670765570, 1670773501,
1670783869, 1670789931, 1670841137, 1670846470, 1670847141, 1670852707,
1670862620, 1670868387, 1670873869, 1670928407, 1670933500, 1670939207,
1670943626, 1670950125, 1670955461, 1670966455, 1671019197, 1671043928,
1671116424, 1671127891, 1671193896, 1671193896, 1671205681, 1671219303,
1671284825, 1671290071, 1671312145, 1671373499, 1671379886, 1671387493,
1671462033, 1671474322, 1671542373, 1671563513, 1671624638, 1671649144,
1671725712, 1671732040, 1671827734, 1671888249, 1671893712, 1671915669,
1671915669, 1671915669, 1671994322, 1671994322, 1672076098, 1672130014,
1672152035, 1672167136, 1672255279, 1672333309, 1672337893, 1672404995,
1672413431, 1672424101, 1672488057, 1672523537, 1672589460, 1672596553,
1672662188, 1672671457, 1672691857, 1672748172, 1672760508, 1672772744,
1672935802, 1672945396, 1673007435, 1673019876, 1673028304, 1673092863,
1673108795, 1673120023, 1673179769, 1673190069, 1673201163, 1673265620,
1673274863, 1673287229, 1673356126, 1673365774, 1673387010, 1673442864,
1673462481, 1673538703, 1673549051, 1673620368, 1673622111, 1673634853,
1673699438, 1673719261, 1673786770, 1673794936, 1673807902, 1673876509,
1673884159, 1673895512, 1673957880, 1673967545, 1673968363, 1673978875,
1674052099, 1674065378, 1674144571, 1674151393, 1674217307, 1674227431,
1674248909, 1674308920, 1674318010, 1674325431, 1674398675, 1674400192,
1674410504, 1674475288, 1674486319, 1674497762, 1674563116, 1674583639,
1674648195, 1674671450, 1674902477, 1674902477, 1674908963, 1674912648,
1674920521, 1674923572, 1674929410, 1674934232, 1674989081, 1674993673,
1675001115, 1675024579, 1675076219, 1675076219, 1675086360, 1675095923,
1675096891, 1675101964, 1675107017, 1675161022, 1675166442, 1675172546,
1675178387, 1675183108, 1675188015, 1675195910, 1675247464, 1675252883,
1675263629, 1675269012, 1675278293, 1675288565, 1675334193, 1675339936,
1675347483, 1675350039, 1675355417, 1675362687, 1675368937), class = c("POSIXct",
"POSIXt"), tzone = ""), Change = c(0.677, 0.522,
0.252, 0.759, 0.733, 0.331, 0.658, 0.661, 0.245, 0.5, 0.5, 0.679,
0.703, 0.5, 0.5, 0.391, 0.688, 0.702, 0.824, 0.718, 0.5, 0.778,
0.295, 0.263, 0.249, 0.297, 0.737, 0.76, 0.755, 0.704, 0.492,
0.333, 0.5, 0.774, 0.899, 0.5, 0.5, 0.822, 0.649, 0.684, 0.72,
0.72, 0.694, 0.813, 0.318, 0.257, 0.739, 0.74, 0.691, 0.786,
0.735, 0.5, 0.5, 0.834, 0.5, 0.452, 0.706, 0.735, 0.733, 0.74,
0.778, 0.887, 0.887, 0.887, 0.688, 0.688, 0.728, 0.726, 0.803,
0.922, 0.788, 0.764, 0.736, 0.724, 0.5, 0.763, 0.286, 0.831,
0.838, 0.743, 0.701, 0.5, 0.316, 0.5, 0.5, 0.774, 0.741, 0.732,
0.902, 0.5, 0.749, 0.592, 0.699, 0.72, 0.785, 0.702, 0.764, 0.74,
0.329, 0.181, 0.902, 0.5, 0.654, 0.79, 0.705, 0.729, 0.766, 0.26,
0.5, 0.728, 0.767, 0.673, 0.832, 0.804, 0.88, 0.274, 0.792, 0.732,
0.741, 0.834, 0.272, 0.745, 0.726, 0.76, 0.305, 0.83, 0.741,
0.284, 0.691, 0.771, 0.755, 0.669, 0.662, 0.696, 0.684, 0.721,
0.5, 0.775, 0.5, 0.66, 0.699, 0.711, 0.743, 0.743, 0.654, 0.717,
0.767, 0.714, 0.724, 0.683, 0.68, 0.771, 0.699, 0.733, 0.21,
0.21, 0.717, 0.739, 0.774, 0.823, 0.73, 0.5, 0.508, 0.5, 0.5,
0.5, 0.785, 0.469, 0.887, 0.732, 0.5, 0.702, 0.696, 0.862, 0.142,
0.815, 0.5, 0.5, 0.694, 0.775, 0.883)), row.names = c(NA, -181L
), class = c("tbl_df", "tbl", "data.frame"))

How to implement k-fold cross-validation while forcing linear regression of predicted to real values to 1:1 line

I'm trying to train y as a polynomial function of x so that when the predicted y values are linearly regressed against the real y values, the relationship is on the 1:1 line (diagram; the image on the right uses geom_smooth(method = "lm") for demonstration, but with SMA from the lmodel2() function the regression line is 1:1). I'm something of a stats amateur, so I'm aware there might be problems with this approach, but without the forcing the model tends to overestimate low values and underestimate high values. My question is: how do I introduce k-fold cross-validation using an existing package like caret or cvms? These packages seem to need a model object to be returned, and I can't figure out how to code my problem that way. Is there some way I can train the model by minimising my custom metric and still return a model object with ypred that I can use in k-fold CV?
This is my code for calculating the coefficients without k-fold CV:
data <- data.frame(
x = c(1.514, 1.514, 1.825, 1.281, 1.118, 1.279, 1.835, 1.819, 0.462, 1.53, 1.004, 1.19, 1.275, 0.428, 0.313, 0.909, 0.995, 0.995, 0.706, 0.563, 0.827, 0.65, 0.747, 1.013, 1.013, 1.163, 1.091, 1.163, 1.091, 0.955, 0.955, 2.044, 2.044, 1.777, 1.777, 1.434, 1.393, 1.324, 0.981, 0.845, 1.595, 1.595, 1.517, 1.517, 1.403, 1.403, 0.793, 0.793, 1.016, 0.901, 0.847, 1.054, 0.877, 1.639, 1.639, 1.268, 1.268, 0.842, 0.842, 0.827, 0.777, 1.024, 1.238, 1.238, 1.702, 1.702, 0.673, 0.673, 1.256, 1.256, 0.898, 0.898, 0.66, 0.933, 0.827, 0.836, 1.122, 1.5, 1.5, 1.44, 1.44, 0.671, 0.671, 0.486, 0.486, 1.051, 1.051, 0.971, 0.538, 0.971, 0.538, 1.012, 1.012, 0.776, 0.776, 0.854, 0.854, 0.74, 0.989, 0.989),
y = c(0.19, 0.18, 0.816, 2.568, 0.885, 0.521, 0.268, 0.885, 4.781, 1.648, 0.989, 1.614, 1.492, 0.679, 2.256, 3.17, 1.926, 1.631, 0.462, 2.48, 0.658, 0.355, 0.373, 2.31, 3.263, 1.374, 1.374, 2.637, 2.637, 2.073, 2.298, 0.257, 0.292, 0.359, 0.329, 1.329, 1.272, 3.752, 1.784, 0.76, 0.458, 0.488, 0.387, 0.387, 3.401, 1.458, 8.945, 9.12, 0.308, 0.386, 0.405, 6.444, 3.17, 0.458, 0.47, 0.572, 0.589, 1.961, 1.909, 0.636, 0.32, 1.664, 0.756, 0.851, 0.403, 0.232, 23.112, 22.042, 0.745, 0.477, 2.349, 3.01, 0.39, 0.246, 0.43, 1.407, 1.358, 0.235, 0.215, 0.595, 0.685, 2.539, 2.128, 8.097, 5.372, 0.644, 0.626, 17.715, 17.715, 6.851, 6.851, 2.146, 1.842, 3.147, 2.95, 1.127, 1.019, 8.954, 0.796, 0.758),
stringsAsFactors = FALSE)
library(dplyr)  # for %>% and mutate()

optim_results <- optim(
  par = c(a0 = 0.3, a1 = -3.8, a2 = -1, a3 = 1, a4 = 1),
  fn = function(params, x, y) {
    params <- as.list(params)
    ypred <- with(params, a0 + (a1*x) + (a2*x^2) + (a3*x^3) + (a4*x^4))
    mod <- suppressMessages(lmodel2::lmodel2(ypred ~ y))$regression.results[3, ]
    line <- mod$Slope * y + mod$Intercept
    sum((y - line)^2)
  },
  x = log10(data$x),
  y = log10(data$y))

cf <- as.numeric(optim_results$par)
data <- data %>%
  dplyr::mutate(ypred = 10^(cf[1] + cf[2]*log10(x) + cf[3]*log10(x)^2 +
                              cf[4]*log10(x)^3 + cf[5]*log10(x)^4))
str(data)
Great question!
cvms::cross_validate_fn() allows you to cross-validate custom functions. You just have to wrap your code in a model function and a predict function, like so:
EDIT: Added extraction of model parameters from the optim() output. optim() returns a list, which we convert to a class and then tell coef() how to extract the coefficients for that class.
library(dplyr)
library(groupdata2)
library(cvms)
# Set seed for reproducibility
set.seed(2)
data <- data.frame(
x = c(1.514, 1.514, 1.825, 1.281, 1.118, 1.279, 1.835, 1.819, 0.462, 1.53, 1.004, 1.19, 1.275, 0.428, 0.313, 0.909, 0.995, 0.995, 0.706, 0.563, 0.827, 0.65, 0.747, 1.013, 1.013, 1.163, 1.091, 1.163, 1.091, 0.955, 0.955, 2.044, 2.044, 1.777, 1.777, 1.434, 1.393, 1.324, 0.981, 0.845, 1.595, 1.595, 1.517, 1.517, 1.403, 1.403, 0.793, 0.793, 1.016, 0.901, 0.847, 1.054, 0.877, 1.639, 1.639, 1.268, 1.268, 0.842, 0.842, 0.827, 0.777, 1.024, 1.238, 1.238, 1.702, 1.702, 0.673, 0.673, 1.256, 1.256, 0.898, 0.898, 0.66, 0.933, 0.827, 0.836, 1.122, 1.5, 1.5, 1.44, 1.44, 0.671, 0.671, 0.486, 0.486, 1.051, 1.051, 0.971, 0.538, 0.971, 0.538, 1.012, 1.012, 0.776, 0.776, 0.854, 0.854, 0.74, 0.989, 0.989),
y = c(0.19, 0.18, 0.816, 2.568, 0.885, 0.521, 0.268, 0.885, 4.781, 1.648, 0.989, 1.614, 1.492, 0.679, 2.256, 3.17, 1.926, 1.631, 0.462, 2.48, 0.658, 0.355, 0.373, 2.31, 3.263, 1.374, 1.374, 2.637, 2.637, 2.073, 2.298, 0.257, 0.292, 0.359, 0.329, 1.329, 1.272, 3.752, 1.784, 0.76, 0.458, 0.488, 0.387, 0.387, 3.401, 1.458, 8.945, 9.12, 0.308, 0.386, 0.405, 6.444, 3.17, 0.458, 0.47, 0.572, 0.589, 1.961, 1.909, 0.636, 0.32, 1.664, 0.756, 0.851, 0.403, 0.232, 23.112, 22.042, 0.745, 0.477, 2.349, 3.01, 0.39, 0.246, 0.43, 1.407, 1.358, 0.235, 0.215, 0.595, 0.685, 2.539, 2.128, 8.097, 5.372, 0.644, 0.626, 17.715, 17.715, 6.851, 6.851, 2.146, 1.842, 3.147, 2.95, 1.127, 1.019, 8.954, 0.796, 0.758),
stringsAsFactors = FALSE)
# Fold data
# Will do 10-fold repeated cross-validation (10 reps)
data <- fold(
data = data,
k = 10, # Num folds
num_fold_cols = 10 # Num repetitions
)
# Write a model function from your code
# This ignores the formula and hyperparameters but
# you could pass values through those if you wanted
# to try different formulas or hyperparameter values
model_fn <- function(train_data, formula, hyperparameters){
  out <- optim(
    par = c(a0 = 0.3, a1 = -3.8, a2 = -1, a3 = 1, a4 = 1),
    fn = function(params, x, y) {
      params <- as.list(params)
      ypred <- with(params, a0 + (a1*x) + (a2*x^2) + (a3*x^3) + (a4*x^4))
      mod <- suppressMessages(lmodel2::lmodel2(ypred ~ y))$regression.results[3, ]
      line <- mod$Slope * y + mod$Intercept
      sum((y - line)^2)
    },
    x = log10(train_data$x),
    y = log10(train_data$y))

  # Convert output to an S3 class
  # so we can extract parameters with coef()
  class(out) <- "OptimModel"
  out
}
# Tell coef() how to extract the parameters.
# This can be modified if you need more info from the optim() output;
# just return a named list.
coef.OptimModel <- function(object, ...) {
  object$par
}
# Write a predict function from your code
predict_fn <- function(test_data, model, formula, hyperparameters, train_data){
  cf <- as.numeric(model$par)
  test_data %>%
    dplyr::mutate(
      ypred = 10^(cf[1] + cf[2]*log10(x) + cf[3]*log10(x)^2 +
                    cf[4]*log10(x)^3 + cf[5]*log10(x)^4)
    ) %>%
    .[["ypred"]]
}
# Cross-validate the model
cv <- cross_validate_fn(
  data = data,
  model_fn = model_fn,
  predict_fn = predict_fn,
  formulas = c("y ~ x"),  # Not currently used by the model function
  fold_cols = paste0(".folds_", seq_len(10)),
  type = "gaussian"
)
#> Will cross-validate 1 models. This requires fitting 100 model instances.
# Check output
cv
# A tibble: 1 × 17
Fixed RMSE MAE NRMSE(I…¹ RRSE RAE RMSLE Predic…² Results Coeffi…³ Folds
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <list> <int>
1 x 4.00 2.31 2.66 1.47 1.17 0.662 <tibble> <tibble> <tibble> 100
# … with 6 more variables: `Fold Columns` <int>, `Convergence Warnings` <int>,
# `Other Warnings` <int>, `Warnings and Messages` <list>, Process <list>,
# Dependent <chr>, and abbreviated variable names ¹​`NRMSE(IQR)`,
# ²​Predictions, ³​Coefficients
# ℹ Use `colnames()` to see all variable names
Created on 2022-10-15 with reprex v2.0.2
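The per-fold predictions and the coef() extractions are available in the output's list columns (a usage sketch based on the columns shown in the printed tibble above):

cv$Predictions[[1]]   # predictions for each fold
cv$Coefficients[[1]]  # parameters extracted from each fitted model instance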

How do you filter out individuals on a figure after creating a PCA plot in Factoextra?

I am a research student coming to grips with R for the first time.
I am trying to make a PCA plot from a series of body measurements; the specimen names and a subspecies tag (BIN) are in separate columns. The BIN column contains the BIN ID for each sample.
The difficulty I am facing is filtering out individuals with certain BINs.
My desired output is a PCA plot identical to the one below, but displaying only the named BINs ("ACZ5516", "ADF3772") and not the remaining ones.
[revised image: PCA plot showing all BINs]
library(FactoMineR)   # PCA()
library(factoextra)   # fviz_pca_ind()

# Import data set; row.names = 1 sets the specimen ID as row names
Anotylus <- read.csv("DataSO.csv", header = TRUE, sep = ",",
                     row.names = 1)

# Set BIN as factor
Anotylus$BIN <- as.factor(Anotylus$BIN)

# Number of BINs and number of individuals in each
table(Anotylus["BIN"])

# Create PCA of the data set, excluding the BIN column (column 12)
Ano.pca <- PCA(Anotylus[, 1:11], graph = FALSE)

# Visualise PCA with all individuals in the data frame
fviz_pca_ind(Ano.pca,
             geom.ind = "point",
             col.ind = Anotylus$BIN,
             repel = TRUE,
             legend.title = "BIN",
             addEllipses = TRUE)

# With individuals from selected BINs
top <- list(name = c("ACZ5516", "ADF3772"))
fviz_pca_ind(Ano.pca,
             geom.ind = "point",
             col.ind = Anotylus$BIN,
             select.ind = top,
             repel = TRUE,
             legend.title = "BIN",
             addEllipses = TRUE)
# No samples visible at all;
# I would like to see only the two named BINs
I have tried using a subset of the data, but the principal components variation changes and produces a different result.
How do I filter the individuals displayed down to a curated list?
Any advice or guidance is deeply appreciated!
Best,
Dante
Sample data set below
> dput(Anotylus)
structure(list(Total.Anten.Length..mm. = c(0.66, 0.635, 0.676,
0.559, 1.249, 0.675, 0.704, 0.649, 0.661, 0.795, 0.836, 0.888,
0.941, 0.781, 0.899, 0.918, 0.854, 0.834, 0.888, 0.884, 0.879,
0.776, 0.954, 0.853, 0.96, 0.527, 0.515, 0.653, 0.491, 0.474,
0.538, 0.694, 1.01, 0.53, 0.641, 0.509, 0.918, 0.849, 0.452,
0.536), Body.Length...mm. = c(1.842, 1.664, 1.901, 1.917, 3.061,
1.961, 1.862, 1.99, 1.85, 1.449, 2.455, 2.077, 2.578, 2.478,
2.798, 2.589, 2.291, 2.882, 2.472, 2.55, 2.53, 2.757, 2.689,
2.166, 2.894, 1.944, 1.48, 2.385, 1.715, 1.674, 1.532, 2.27,
2.598, 1.677, 1.67, 1.68, 2.374, 2.877, 1.699, 1.656),
Eye.Area..mm2. = c(0.01,
0.009, 0.01, 0.006, 0.026, 0.007, 0.01, 0.01, 0.009, 0.006, 0.016,
0.014, 0.015, 0.018, 0.02, 0.016, 0.019, 0.015, 0.013, 0.011,
0.015, 0.014, 0.017, 0.014, 0.012, 0.007, 0.006, 0.02, 0.007,
0.006, 0.005, 0.013, 0.013, 0.006, 0.007, 0.005, 0.013, 0.006,
0.008, 0.005), Eye.Width..mm. = c(0.046, 0.036, 0.054, 0.033,
0.071, 0.04, 0.046, 0.047, 0.044, 0.05, 0.059, 0.053, 0.073,
0.063, 0.068, 0.051, 0.044, 0.07, 0.064, 0.061, 0.054, 0.042,
0.038, 0.059, 0.059, 0.043, 0.046, 0.079, 0.037, 0.035, 0.037,
0.054, 0.047, 0.045, 0.045, 0.028, 0.05, 0.037, 0.043, 0.045),
Head.Width..mm. = c(0.359, 0.362, 0.377, 0.317, 0.731, 0.456,
0.38, 0.414, 0.359, 0.453, 0.568, 0.449, 0.519, 0.517, 0.516,
0.515, 0.512, 0.513, 0.511, 0.456, 0.503, 0.474, 0.598, 0.453,
0.574, 0.309, 0.306, 0.574, 0.314, 0.298, 0.295, 0.386, 0.557,
0.289, 0.318, 0.306, 0.505, 0.291, 0.298, 0.263),
Pronotum.Width..mm. = c(0.413,
0.455, 0.439, 0.352, 0.741, 0.462, 0.467, 0.461, 0.442, 0.493,
0.573, 0.549, 0.584, 0.617, 0.632, 0.61, 0.614, 0.624, 0.631,
0.533, 0.587, 0.562, 0.609, 0.522, 0.621, 0.342, 0.341, 0.598,
0.336, 0.314, 0.331, 0.467, 0.547, 0.343, 0.342, 0.317, 0.545,
0.328, 0.329, 0.284), Pronotum.Length..mm. = c(0.304, 0.326,
0.334, 0.24, 0.48, 0.317, 0.303, 0.329, 0.302, 0.36, 0.418,
0.383, 0.424, 0.428, 0.399, 0.442, 0.404, 0.461, 0.435, 0.376,
0.393, 0.403, 0.373, 0.41, 0.435, 0.259, 0.247, 0.403, 0.257,
0.252, 0.23, 0.387, 0.388, 0.248, 0.26, 0.215, 0.336, 0.223,
0.231, 0.247), Elytra.Width..mm. = c(0.558, 0.552, 0.586,
0.43, 0.854, 0.506, 0.528, 0.586, 0.548, 0.54, 0.75, 0.716,
0.794, 0.816, 0.746, 0.82, 0.786, 0.8, 0.722, 0.69, 0.758,
0.766, 0.736, 0.668, 0.852, 0.468, 0.462, 0.741, 0.461, 0.323,
0.406, 0.637, 0.617, 0.41, 0.366, 0.422, 0.718, 0.42, 0.408,
0.278), Elytra.Length..mm. = c(0.469, 0.437, 0.386, 0.346,
0.631, 0.428, 0.464, 0.451, 0.445, 0.532, 0.583, 0.543, 0.558,
0.62, 0.625, 0.623, 0.613, 0.605, 0.623, 0.588, 0.606, 0.48,
0.568, 0.568, 0.598, 0.373, 0.352, 0.516, 0.365, 0.326, 0.327,
0.502, 0.464, 0.346, 0.344, 0.319, 0.519, 0.346, 0.329, 0.346
), Pronotum.Value = c(0.288, 0.319, 0.306, 0.331, 0.179,
0.278, 0.224, 0.211, 0.204, 0.273, 0.26, 0.33, 0.241, 0.218,
0.203, 0.209, 0.241, 0.227, 0.31, 0.236, 0.341, 0.288, 0.283,
0.263, 0.279, 0.173, 0.162, 0.22, 0.183, 0.209, 0.193, 0.185,
0.236, 0.181, 0.172, 0.227, 0.275, 0.164, 0.21, 0.217),
Elytra.Value = c(0.314,
0.319, 0.393, 0.243, 0.205, 0.297, 0.21, 0.205, 0.244, 0.359,
0.288, 0.335, 0.375, 0.291, 0.243, 0.238, 0.288, 0.283, 0.351,
0.271, 0.48, 0.415, 0.325, 0.294, 0.193, 0.182, 0.271, 0.237,
0.216, 0.246, 0.214, 0.193, 0.233, 0.205, 0.18, 0.262, 0.225,
0.176, 0.303, 0.251), BIN = structure(c(1L, 1L, 1L, 3L, 8L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 7L, 3L, 3L, 6L, 3L, 3L, 3L, 2L, 5L, 3L, 3L,
3L, 5L, 3L, 3L, 3L), .Label = c("ACZ5516", "ACZ5742", "ADF3772",
"ADF4138", "ADG1201", "ADH9095", "ADI3175", "ADR2790"), class =
"factor")), row.names = c("CCDB-22214-D03",
"CCDB-22214-D06", "CCDB-22214-D08", "CCDB-22214-G09", "CCDB-22214-
H02",
"CCDB-22214-H09", "CCDB-22215-A11", "CCDB-22215-A12", "CCDB-22215-
F04",
"CCDB-23850-B07", "CCDB-23851-C04", "CCDB-23851-C05", "CCDB-23851-
C11",
"CCDB-23851-C12", "CCDB-23851-D02", "CCDB-23851-D03", "CCDB-23851-
D04",
"CCDB-23851-D06", "CCDB-23851-E08", "CCDB-23851-E09", "CCDB-23851-
E11",
"CCDB-23851-F03", "CCDB-23851-G05", "CCDB-23851-G09", "CCDB-23858-
B08",
"CCDB-23858-G12", "CCDB-23858-H01", "CCDB-23859-B10", "CCDB-23859-
E07",
"CCDB-23859-E10", "CCDB-23859-E11", "CCDB-25504-E04", "CCDB-25505-
E02",
"CCDB-25510-B12", "CCDB-25510-D02", "CCDB-25510-E09", "CCDB-25511-
B06",
"CCDB-25511-B12", "CCDB-25511-E11", "CCDB-25512-E12"), class =
"data.frame")
Apparently factoextra "produces ggplot2-based elegant data visualization with less typing". From what I can tell, fviz_pca_ind essentially plots the PCA coordinates of each individual and computes a multivariate normal distribution as an ellipse for each group.
Here is a replication of the plot you attached, in stripped-down ggplot code:
library(ggplot2)

# Construct a plotting data frame with the BIN identifier
# and each individual's PCA coordinates
df <- cbind.data.frame(BIN = Anotylus$BIN, Ano.pca$ind$coord)
ggplot(df, aes(x = Dim.1, y = Dim.2, color = BIN)) +
  geom_point() +
  stat_ellipse(type = "norm")
Note that since there are only 1 or 2 points for every BIN other than ACZ5516 and ADF3772, you will get "Too few points to calculate an ellipse" warnings, and no ellipse is plotted for those groups.
To "hide" the other BINs in your figure, you can either plot only the BINs you want, or create a new grouping (ACZ5516, ADF3772, and Others) in the plotting data and give the points you do not want to focus on a less visible colour. (Incidentally, your select.ind = list(name = ...) attempt likely showed nothing because name is matched against the individuals' row names, i.e. the specimen IDs, rather than the BIN labels.)
library(dplyr)

# Plot only BINs ACZ5516 and ADF3772
df %>%
  filter(BIN %in% c("ACZ5516", "ADF3772")) %>%
  ggplot(aes(x = Dim.1, y = Dim.2, color = BIN)) +
  geom_point() +
  stat_ellipse(type = "norm")

# Create a new grouping for BINs other than ACZ5516 and ADF3772
df2 <- df %>%
  mutate(BIN = ifelse(BIN %in% c("ACZ5516", "ADF3772"), as.character(BIN), "Others"))

df2 %>%
  ggplot(aes(x = Dim.1, y = Dim.2, color = BIN)) +
  geom_point() +
  stat_ellipse(data = df %>% filter(BIN %in% c("ACZ5516", "ADF3772")), type = "norm") +
  scale_colour_manual(values = c("darkgreen", "orange", "gray"))

Find the 3 nearest neighbours (dist()?) and calculate mean in new column

This is a sample of the data:
structure(list(Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2018/2019",
"2019/2020"), class = "factor"), Date2 = structure(c(17860, 17888,
17916, 17940, 17945, 17952, 17953, 17954, 17978, 17999, 18005,
18188, 18209, 18223, 18237, 18320, 18322, 18334, 18447, 18476
), class = "Date"), HT.av.points = c(0.57, 1.5, 1.67, 1.8, 1.09,
2.18, 1.42, 1.45, 1.79, 1.35, 1.14, 1.83, 2, 1.17, 1.88, 1.83,
1.33, 0.92, 1.31, 1.06), AT.av.points = c(1.14, 2.33, 0.56, 1.2,
1.09, 1.6, 1.08, 1.9, 1.17, 0.9, 1.38, 0.67, 2.14, 1.33, 0.62,
1.08, 2.17, 1.38, 0.56, 0.94), HT_av.PointsTotal = c(0.86, 1.16,
1.18, 1.23, 0.86, 1.86, 1.2, 1.18, 1.5, 1.1, 1.07, 1.46, 1.6,
1.08, 1.75, 1.4, 1.16, 0.92, 1.03, 0.97), AT_av.PointsTotal = c(2.07,
2.21, 0.76, 1.42, 1.59, 1.5, 1.2, 1.91, 1.65, 1.43, 1.38, 0.54,
1.87, 1.58, 0.8, 1.6, 2.32, 1.42, 1.12, 1.32), DIFF.AV.POINTS.PREDICTION = c(-0.28,
-0.43, 0.51, 0.52, -0.36, 0.56, 0.28, -0.38, -0.2, 0.03, -0.43,
1.24, -0.32, -0.29, 1.44, 0.28, -0.85, -0.38, 1.01, 0.22), Over2.5G = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1)), row.names = c(NA,
-20L), class = c("data.table",
"data.frame"), .Names = c("Season", "Date2", "HT.av.points",
"AT.av.points", "HT_av.PointsTotal", "AT_av.PointsTotal", "DIFF.AV.POINTS.PREDICTION",
"Over2.5G"))
What I want to do:
Group by Season.
After the group by, find the 3 previous rows that are most similar to the current row according to the following columns: HT.av.points, AT.av.points, HT_av.PointsTotal, AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION. I guess the dist() function is a possibility.
Finally, create a new column with the mean of the Over2.5G values of those 3 most similar rows.
New column: the first 3 rows (of each Season) are NA. In the fourth row (of each Season), the 3 nearest neighbours (and their Over2.5G values) will always be the first 3 rows.
Breaking the code below into steps:
a helper function that returns the row indices of the nearest neighbours, using a ready-made function, e.g. get.knn from package FNN
calling this function on increasingly large slices of the input data df (from row one to the current row) and storing the result as an extra column
extracting the row indices as integers from the result string, to index the desired column of the input data for the aggregation (mean, in your case)
here we go:
library(FNN)    # for get.knn()
library(dplyr)

## helper function: returns row indices of the 3 nearest neighbours
## as a comma-separated string
find_nearest_predecessors <- function(df, ...){
ifelse(nrow(df) < 4, ## can't calculate n neighbours for <n rows:
paste(1:3, collapse = ','),
## otherwise = if sufficient rows,
## get row indices of 3 nearest neighbours:
get.knn(data = df,
k = 3,
algo = 'CR'
) %>%
.[['nn.index']] %>%
tail(1) %>% paste(collapse = ',')
)
}
## df being your input data:
df %>%
mutate(rownum = row_number()) %>%
rowwise %>%
mutate(nearest_neighbours = find_nearest_predecessors(
df = ## use previous data up to current row:
slice(df, 1:rownum) %>%
## choose features/dimensions of distance:
select(HT.av.points, AT.av.points, HT_av.PointsTotal,
AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
),
## calculate mean of OVER2.5G
mean_Over2.5G = mean(df$Over2.5G[
strsplit(nearest_neighbours,',') %>%
unlist %>% as.integer
], na.rm = TRUE)
)
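The above runs over the whole table; to honour the "group by Season" requirement, one hedged option is to apply the same logic per season via group_modify() (add_knn_mean is a hypothetical wrapper around the code above):

## hypothetical wrapper: same logic as above, for one season's rows
add_knn_mean <- function(d) {
  d %>%
    mutate(rownum = row_number()) %>%
    rowwise() %>%
    mutate(
      nearest_neighbours = find_nearest_predecessors(
        df = slice(d, 1:rownum) %>%
          select(HT.av.points, AT.av.points, HT_av.PointsTotal,
                 AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)),
      mean_Over2.5G = mean(
        d$Over2.5G[as.integer(unlist(strsplit(nearest_neighbours, ",")))],
        na.rm = TRUE)) %>%
    ungroup()
}

df %>%
  group_by(Season) %>%
  group_modify(~ add_knn_mean(.x)) %>%
  ungroup()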

Concatenating two cells in R with a ± sign

I want to calculate the mean and standard deviation from pairwise mirrored matrices in a list and write a table for further text processing:
mean_SG<- as.data.frame(lapply(list_SG, function(x) mean(x[upper.tri(x)])))
sd_SG <- as.data.frame(lapply(list_SG, function(x) sd(x[upper.tri(x)])))
write.table(t(rbind(round(mean_SG,3),round(sd_SG,3))), "SG.txt")
My idea is to directly concatenate the numeric values from mean_SG and sd_SG with the plus-minus symbol ± and write this as a single column with write.table. Is that possible in R?
Here is some data:
SG <- structure(c(85, 84.016, 82.9, 79, 85.167, 83.467, 78.5, 83.051,
80.064, 81.436, 79.94, 83.731, 83.468, 82.775, 83.294, 81.608,
82.176, 84.138, 82.6, 85.325, 82.297, 81.546, 83.569, 84.561,
87.039, 92.45, 86.35, 83.153, 84.447, 81.899, 81.972, 81.32,
81.949, 82.101, 0.656, 0.966, 1.833, NA, 0.643, 0.459, 0.608,
1.189, 1.024, 0.848, 1.207, 0.66, 0.757, 1.235, 0.872, 1.308,
0.958, 1.151, 0.914, 1.302, 0.708, 0.79, 1.349, 0.799, 1.297,
2.554, 0.55, 1.041, 1.216, 1.065, 0.981, 0.937, 1.133, 1.302), .Dim = c(34L,
2L), .Dimnames = list(c("X19_vs_11B.2", "X19_vs_AT.s3.28", "X19_vs_B276.D12",
"X19_vs_BP.U1C.1g10", "X19_vs_d142", "X19_vs_FFCH5909", "X19_vs_GBS.L1.B05",
"X19_vs_SG01", "X19_vs_SG02", "X19_vs_SG03", "X19_vs_SG04", "X19_vs_SG05",
"X19_vs_SG06", "X19_vs_SG07a", "X19_vs_SG07b", "X19_vs_SG08.Aca",
"X19_vs_SG08.Holo", "X19_vs_SG09", "X19_vs_SG10", "X19_vs_SG11",
"X19_vs_SG12", "X19_vs_SG13", "X19_vs_SG15", "X19_vs_SG17", "X19_vs_SG18",
"X19_vs_SG19", "X19_vs_SG20", "X19_vs_SG21", "X19_vs_SG22", "X19_vs_SG23",
"X19_vs_SG25", "X19_vs_SG26", "X19_vs_ThAna", "X19_vs_TPD.58"
), c("1", "2"))
I like sprintf for this. It allows you to specify the number of digits.
sprintf("%.3f \U00B1 %.3f", SG[,1], SG[,2])
#[1] "85.000 ± 0.656" "84.016 ± 0.966" "82.900 ± 1.833" "79.000 ± NA" "85.167 ± 0.643" "83.467 ± 0.459" ...
