Clustering around fixed vector of values - r

I have a dataset of brands with different features like calories, sugar content, fiber content, etc.
for eg
Using dput():
structure(list(Row = 1:30, Brands = structure(c(1L, 112L, 223L, 242L,
253L, 264L, 275L, 286L, 297L, 2L, 13L, 24L, 35L, 46L, 57L, 68L, 79L,
90L, 101L, 113L, 124L, 135L, 146L, 157L, 168L, 179L, 190L, 201L,
212L, 224L), .Label = c("Brand 1", "Brand 10", "Brand 100", "Brand
101", "Brand 102", "Brand 103", "Brand 104", "Brand 105", "Brand
106", "Brand 107", "Brand 108", "Brand 109", "Brand 11", "Brand 110",
"Brand 111", "Brand 112", "Brand 113", "Brand 114", "Brand 115",
"Brand 116", "Brand 117", "Brand 118", "Brand 119", "Brand 12",
"Brand 120", "Brand 121", "Brand 122", "Brand 123", "Brand 124",
"Brand 125", "Brand 126", "Brand 127", "Brand 128", "Brand 129",
"Brand 13", "Brand 130", "Brand 131", "Brand 132", "Brand 133",
"Brand 134", "Brand 135", "Brand 136", "Brand 137", "Brand 138",
"Brand 139", "Brand 14", "Brand 140", "Brand 141", "Brand 142",
"Brand 143", "Brand 144", "Brand 145", "Brand 146", "Brand 147",
"Brand 148", "Brand 149", "Brand 15", "Brand 150", "Brand 151",
"Brand 152", "Brand 153", "Brand 154", "Brand 155", "Brand 156",
"Brand 157", "Brand 158", "Brand 159", "Brand 16", "Brand 160",
"Brand 161", "Brand 162", "Brand 163", "Brand 164", "Brand 165",
"Brand 166", "Brand 167", "Brand 168", "Brand 169", "Brand 17",
"Brand 170", "Brand 171", "Brand 172", "Brand 173", "Brand 174",
"Brand 175", "Brand 176", "Brand 177", "Brand 178", "Brand 179",
"Brand 18", "Brand 180", "Brand 181", "Brand 182", "Brand 183",
"Brand 184", "Brand 185", "Brand 186", "Brand 187", "Brand 188",
"Brand 189", "Brand 19", "Brand 190", "Brand 191", "Brand 192",
"Brand 193", "Brand 194", "Brand 195", "Brand 196", "Brand 197",
"Brand 198", "Brand 199", "Brand 2", "Brand 20", "Brand 200", "Brand
201", "Brand 202", "Brand 203", "Brand 204", "Brand 205", "Brand
206", "Brand 207", "Brand 208", "Brand 209", "Brand 21", "Brand 210",
"Brand 211", "Brand 212", "Brand 213", "Brand 214", "Brand 215",
"Brand 216", "Brand 217", "Brand 218", "Brand 219", "Brand 22",
"Brand 220", "Brand 221", "Brand 222", "Brand 223", "Brand 224",
"Brand 225", "Brand 226", "Brand 227", "Brand 228", "Brand 229",
"Brand 23", "Brand 230", "Brand 231", "Brand 232", "Brand 233",
"Brand 234", "Brand 235", "Brand 236", "Brand 237", "Brand 238",
"Brand 239", "Brand 24", "Brand 240", "Brand 241", "Brand 242",
"Brand 243", "Brand 244", "Brand 245", "Brand 246", "Brand 247",
"Brand 248", "Brand 249", "Brand 25", "Brand 250", "Brand 251",
"Brand 252", "Brand 253", "Brand 254", "Brand 255", "Brand 256",
"Brand 257", "Brand 258", "Brand 259", "Brand 26", "Brand 260",
"Brand 261", "Brand 262", "Brand 263", "Brand 264", "Brand 265",
"Brand 266", "Brand 267", "Brand 268", "Brand 269", "Brand 27",
"Brand 270", "Brand 271", "Brand 272", "Brand 273", "Brand 274",
"Brand 275", "Brand 276", "Brand 277", "Brand 278", "Brand 279",
"Brand 28", "Brand 280", "Brand 281", "Brand 282", "Brand 283",
"Brand 284", "Brand 285", "Brand 286", "Brand 287", "Brand 288",
"Brand 289", "Brand 29", "Brand 290", "Brand 291", "Brand 292",
"Brand 293", "Brand 294", "Brand 295", "Brand 296", "Brand 297",
"Brand 298", "Brand 299", "Brand 3", "Brand 30", "Brand 300", "Brand
301", "Brand 302", "Brand 303", "Brand 304", "Brand 305", "Brand
306", "Brand 307", "Brand 31", "Brand 32", "Brand 33", "Brand 34",
"Brand 35", "Brand 36", "Brand 37", "Brand 38", "Brand 39", "Brand
4", "Brand 40", "Brand 41", "Brand 42", "Brand 43", "Brand 44",
"Brand 45", "Brand 46", "Brand 47", "Brand 48", "Brand 49", "Brand
5", "Brand 50", "Brand 51", "Brand 52", "Brand 53", "Brand 54",
"Brand 55", "Brand 56", "Brand 57", "Brand 58", "Brand 59", "Brand
6", "Brand 60", "Brand 61", "Brand 62", "Brand 63", "Brand 64",
"Brand 65", "Brand 66", "Brand 67", "Brand 68", "Brand 69", "Brand 7",
"Brand 70", "Brand 71", "Brand 72", "Brand 73", "Brand 74", "Brand
75", "Brand 76", "Brand 77", "Brand 78", "Brand 79", "Brand 8",
"Brand 80", "Brand 81", "Brand 82", "Brand 83", "Brand 84", "Brand
85", "Brand 86", "Brand 87", "Brand 88", "Brand 89", "Brand 9",
"Brand 90", "Brand 91", "Brand 92", "Brand 93", "Brand 94", "Brand
95", "Brand 96", "Brand 97", "Brand 98", "Brand 99"), class =
"factor"), Fiber = c(82L, 36L, 51L, 86L, 26L, 98L, 91L, 28L, 1L, 88L,
35L, 84L, 27L, 58L, 9L, 43L, 49L, 56L, 66L, 43L, 62L, 73L, 20L, 33L,
17L, 88L, 57L, 45L, 89L, 16L), Sugar = c(77L, 87L, 40L, 69L, 9L, 1L,
54L, 64L, 24L, 52L, 29L, 14L, 76L, 24L, 39L, 54L, 18L, 72L, 54L, 9L,
45L, 65L, 43L, 90L, 40L, 93L, 75L, 50L, 1L, 44L), Calories = c(94L,
14L, 36L, 34L, 40L, 91L, 58L, 82L, 91L, 19L, 60L, 79L, 44L, 60L, 80L,
27L, 17L, 5L, 10L, 89L, 63L, 43L, 29L, 99L, 92L, 19L, 9L, 38L, 43L,
9L), Feature.4 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
1L), Feature.5 = c(1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L,
0L), Feature.6 = c(7L, 11L, 45L, 45L, 35L, 28L, 56L, 52L, 1L, 49L,
28L, 68L, 99L, 70L, 62L, 73L, 97L, 2L, 41L, 14L, 68L, 84L, 76L, 2L,
53L, 38L, 3L, 52L, 12L, 70L), Feature.7 = c(54L, 22L, 11L, 67L, 22L,
67L, 69L, 67L, 89L, 24L, 32L, 25L, 90L, 62L, 82L, 100L, 53L, 50L,
75L, 79L, 53L, 4L, 31L, 96L, 55L, 35L, 69L, 74L, 88L, 9L)), row.names
= c(NA, 30L), class = "data.frame")
I want to pick 5 brands among them, say Brand 1,2,3,4&5 and then form clusters or groups of the brands similar in features to each of these 5 brands and keep all the rest of all the brands which are not similar as a separate cluster.
So, I will have 1 cluster for Brand 1, 1 for Brand 2, 1 for Brand 3, and similarly for Brand 4 & 5. And then there will be 1 cluster of those brands which are not similar to any of these 5 brands. The features may be dummy or continuous.
I think this should be easy, however, I couldn't find any package for this in "R".

Here is a simple example using the data you included which I am calling dta. First we compute z-scores for the values except for the dichotomies:
# Standardise the continuous features so each contributes equally to the
# Euclidean distance; rdist() used below comes from the fields package.
library(fields)
dta.zscores <- dta
# Columns 3:5 (Fiber, Sugar, Calories) and 8:9 (Feature.6, Feature.7) are
# continuous and get z-scored; the 0/1 dummies in columns 6:7 are left as-is.
dta.zscores[, c(3:5, 8:9)] <- scale(dta[, c(3:5, 8:9)])
Now dta.zscores contains the z-scores of the original data so that each variable will be weighted equally. Next we compute the distances from each row to rows 1 - 5 (Brands 1 through 5) using columns 3 through 9:
# Euclidean distances from each of the 5 reference brands (rows 1-5) to every
# observation, using feature columns 3:9; rdist() returns a 5 x 30 matrix
# (rows = reference brands, columns = observations).
dta.dist <- rdist(dta.zscores[1:5, 3:9], dta.zscores[, 3:9])
# For each observation (column): distance to its nearest reference brand ...
dta.mindist <- apply(dta.dist, 2, min)
# ... and which of the 5 reference brands that nearest one is.
dta.brand <- apply(dta.dist, 2, which.min)
# Spread of nearest-brand distances, excluding the 5 reference rows themselves
# (their own distance is 0 by construction).
quantile(dta.mindist[-c(1:5)])
# 0% 25% 50% 75% 100%
# 1.131532 1.952891 2.383079 2.908602 3.475676
# How many observations fall nearest to each reference brand.
table(dta.brand)
# dta.brand
# 1 2 3 4 5
# 4 2 7 11 6
The matrix dta.dist is 30 columns (each of the observations) by 5 rows (distance to each of the brands). The vector dta.mindist is the minimum distance for each observation. That will be 0 for the first 5 observations. The vector dta.brand indicates which of the 5 brands is the nearest for that observation. The quantile() function shows the range of distances to the nearest brand after excluding the first 5 brands which were used to define the groups. Finally the table shows how many observations are assigned to each brand.
You would still need to decide how far is too far to assign an observation to a brand and move these observations to another cluster, e.g. cluster 6. Based on the quantiles, 25% of the distances are greater than 2.9. You can specify other quantiles using the probs= argument in quantile(), e.g. .90 or .95 depending on how many observations you want to be in cluster 6.

Related

is there a way to place significance results in my ggplot?

I'm looking for a way to place my dunn's test results on my ggplot:
either directly by somehow implementing the necessary code lines on the plot regarding the results of the dunn test
or indirectly by placing just the letters regarding significance by the order in which i know they appear
In the example graphic I attached, I put in the letters myself through a 2nd party drawing software, to give you an idea of what I'm looking for - ideally I would want the lower case letters to represent significant differences only within the same variable called "type" but a different "day", and capitals for the opposite. I do however recognize this is a tall order, and would be happy with just the differences within the same type.
here is my dput and str:
> dput(table5)
structure(list(day = c("day 00", "day 00", "day 00", "day 00",
"day 00", "day 00", "day 00", "day 00", "day 07", "day 07", "day 07",
"day 07", "day 07", "day 07", "day 07", "day 07", "day 14", "day 14",
"day 14", "day 14", "day 14", "day 14", "day 14", "day 14", "day 21",
"day 21", "day 21", "day 21", "day 21", "day 21", "day 21", "day 21",
"day 28", "day 28", "day 28", "day 28", "day 28", "day 28", "day 28",
"day 28", "day 00", "day 00", "day 00", "day 00", "day 00", "day 00",
"day 00", "day 00", "day 07", "day 07", "day 07", "day 07", "day 07",
"day 07", "day 07", "day 07", "day 14", "day 14", "day 14", "day 14",
"day 14", "day 14", "day 14", "day 14", "day 21", "day 21", "day 21",
"day 21", "day 21", "day 21", "day 21", "day 21", "day 28", "day 28",
"day 28", "day 28", "day 28", "day 28", "day 28", "day 28", "day 00",
"day 00", "day 00", "day 00", "day 00", "day 00", "day 00", "day 00",
"day 07", "day 07", "day 07", "day 07", "day 07", "day 07", "day 07",
"day 07", "day 14", "day 14", "day 14", "day 14", "day 14", "day 14",
"day 14", "day 14", "day 21", "day 21", "day 21", "day 21", "day 21",
"day 21", "day 21", "day 21", "day 28", "day 28", "day 28", "day 28",
"day 28", "day 28", "day 28", "day 28", "day 00", "day 00", "day 00",
"day 00", "day 00", "day 00", "day 00", "day 00", "day 07", "day 07",
"day 07", "day 07", "day 07", "day 07", "day 07", "day 07", "day 14",
"day 14", "day 14", "day 14", "day 14", "day 14", "day 14", "day 14",
"day 21", "day 21", "day 21", "day 21", "day 21", "day 21", "day 21",
"day 21", "day 28", "day 28", "day 28", "day 28", "day 28", "day 28",
"day 28", "day 28", "day 00", "day 00", "day 00", "day 00", "day 00",
"day 00", "day 00", "day 00", "day 07", "day 07", "day 07", "day 07",
"day 07", "day 07", "day 07", "day 07", "day 14", "day 14", "day 14",
"day 14", "day 14", "day 14", "day 14", "day 14", "day 21", "day 21",
"day 21", "day 21", "day 21", "day 21", "day 21", "day 21", "day 28",
"day 28", "day 28", "day 28", "day 28", "day 28", "day 28", "day 28",
"day 00", "day 00", "day 00", "day 00", "day 00", "day 00", "day 00",
"day 00", "day 07", "day 07", "day 07", "day 07", "day 07", "day 07",
"day 07", "day 07", "day 14", "day 14", "day 14", "day 14", "day 14",
"day 14", "day 14", "day 14", "day 21", "day 21", "day 21", "day 21",
"day 21", "day 21", "day 21", "day 21", "day 28", "day 28", "day 28",
"day 28", "day 28", "day 28", "day 28", "day 28", "day 00", "day 00",
"day 00", "day 00", "day 00", "day 00", "day 00", "day 00", "day 07",
"day 07", "day 07", "day 07", "day 07", "day 07", "day 07", "day 07",
"day 14", "day 14", "day 14", "day 14", "day 14", "day 14", "day 14",
"day 14", "day 21", "day 21", "day 21", "day 21", "day 21", "day 21",
"day 21", "day 21", "day 28", "day 28", "day 28", "day 28", "day 28",
"day 28", "day 28", "day 28"), type = c("control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "control", "control", "control", "control",
"control", "control", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_",
"nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_",
"nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_",
"nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_",
"nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_",
"nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_", "nZn1_",
"nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_",
"nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_",
"nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_",
"nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_",
"nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_",
"nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn10_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_", "nZn100_",
"nZn100_", "nZn100_", "nZn100_", "Zn1_", "Zn1_", "Zn1_", "Zn1_",
"Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_",
"Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_",
"Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_",
"Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn1_",
"Zn1_", "Zn1_", "Zn1_", "Zn1_", "Zn10_", "Zn10_", "Zn10_", "Zn10_",
"Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_",
"Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_",
"Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_",
"Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_",
"Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_", "Zn10_",
"Zn10_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_",
"Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_",
"Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_",
"Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_",
"Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_",
"Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_", "Zn100_"),
TAC = c(0.0134723395589115, 0.0161888871061509, 0.0146337654145718,
0.0153067871292595, 0.012800314735395, 0.0160841665978896,
0.0140621616691814, 0.0135425580967982, 0.0132198270328205,
0.0138496077219653, 0.0135775493518084, 0.0126333962864469,
0.0164821881641534, 0.0132516331108305, 0.0157791571175251,
0.0129960024291699, 0.0146323678504021, 0.0134451215151322,
0.0143262838325461, 0.0153573779185249, 0.0139773746147923,
0.0159350865128266, 0.0156720782857077, 0.0155096081292032,
0.013476349735956, 0.0140104181996115, 0.0129878390010014,
0.0147239859165112, 0.015160930718777, 0.0148955399340424,
0.013274378116328, 0.0153663044374496, 0.0145472559523844,
0.0132898660703847, 0.0139871399975842, 0.0124985111701027,
0.0149240276338179, 0.0129573902698069, 0.0147729343794709,
0.0128674264777598, 0.0147815872982594, 0.0139767796824041,
0.0144185398405766, 0.0155799146991459, 0.0135417909851351,
0.015988596586438, 0.0139603963976125, 0.0126397298299191,
0.013297964384596, 0.012347536157165, 0.0152573470818857,
0.0136566619097667, 0.0125192707022401, 0.0141156296691061,
0.0139603724286662, 0.0141388938152221, 0.0127749097766803,
0.0142082519110294, 0.0149398326676766, 0.0143207529313558,
0.0144381103787128, 0.0149147414885484, 0.0139224295866318,
0.0161358891403436, 0.0151690152511571, 0.0120945286936824,
0.0153132383654698, 0.0131770823852777, 0.0136750345235747,
0.0129352436377984, 0.0162120454010317, 0.0155409171425954,
0.0135940425474181, 0.0142951343511937, 0.0143779323175896,
0.0136891451722703, 0.0140286347004686, 0.0122667606250391,
0.0152446224172418, 0.013442306549535, 0.0129068996979612,
0.0147404146947943, 0.013688825582269, 0.0130193063055386,
0.01285971255513, 0.0151660181611206, 0.0138280467330508,
0.0135147736966651, 0.0158580706409006, 0.0149366602534351,
0.0106554950909403, 0.0179654260106192, 0.0120425346368713,
0.0145387164119486, 0.0139546280207597, 0.0121871897075845,
0.0150418870034593, 0.0148117380734173, 0.0139690179111281,
0.0170751257982307, 0.0129661477952429, 0.0144612227917873,
0.0146065893466387, 0.0126241343210384, 0.0170751257982307,
0.0130964557093226, 0.0134570968344701, 0.0165480203562944,
0.0151921149184481, 0.0130666062376204, 0.012722050697886,
0.0155582048904096, 0.0125288074742436, 0.016985639190516,
0.0176528351294189, 0.0138432089287227, 0.013890319218671,
0.017035215335001, 0.0168839977227436, 0.0133203267470888,
0.013892777179513, 0.0155216139064973, 0.0130076218759369,
0.013903958340264, 0.0135000204009635, 0.0148519977852621,
0.0153029154169557, 0.0141832966293512, 0.0176005510379328,
0.0180687740940438, 0.0177789446952697, 0.0182099087520794,
0.0184723827329167, 0.022483746075728, 0.0196648164641345,
0.0170131886149416, 0.0215058343136062, 0.0211259597744559,
0.0196373761289472, 0.0206737739206, 0.020532594441278, 0.0193494766153245,
0.0211617300063814, 0.0213333413267872, 0.0202163436360403,
0.0236752367085596, 0.0231873026647459, 0.0228522660496144,
0.0238366734630018, 0.0264524093818515, 0.0268093919646026,
0.0252668406573153, 0.0258403852690662, 0.0223986018317785,
0.0272147558779617, 0.0225116847733454, 0.0247724813762193,
0.022691182948792, 0.0235805783268122, 0.0270689051186104,
0.0126334908832258, 0.0164665820507107, 0.0129386884401034,
0.0119158011756844, 0.0130928729787235, 0.0149940706645974,
0.0129535502638655, 0.0162831996423606, 0.0176755444192191,
0.0161755659998132, 0.0174173101524856, 0.0155714069341957,
0.01433383826834, 0.0143819293817603, 0.0185494616259894,
0.0140319779691521, 0.0144114680062016, 0.0174497227904159,
0.0180907703704672, 0.0157478259355293, 0.0158958906812569,
0.0147163839619763, 0.0146701443994308, 0.0180369287296324,
0.0149336258279806, 0.0186097801562105, 0.0137231521985133,
0.0153650910635747, 0.0138998273293687, 0.0155199902217533,
0.0163903022171882, 0.015754928008943, 0.0171808546793322,
0.0154244829039175, 0.0134954450270778, 0.0147187179502944,
0.0160939056001929, 0.0145497150558122, 0.0154571534643691,
0.015511148172344, 0.0132885919777709, 0.0138910418368534,
0.0152496449072613, 0.0132820365830201, 0.013480084079182,
0.016683045565325, 0.0176337406920335, 0.0151657804062655,
0.0125455114843902, 0.0118102856445592, 0.0116410665300014,
0.0146556231989517, 0.014464999427952, 0.0121229802720933,
0.0146834533301593, 0.0121645122630423, 0.0136816673389857,
0.0135984961089614, 0.0164906141382343, 0.0149265724276527,
0.0163311308492402, 0.017967595623527, 0.0143263172313383,
0.0145117513172078, 0.0149694356038913, 0.0136478358101476,
0.0148523043836901, 0.0140267859486034, 0.0136857372651645,
0.0161384954212, 0.0171836598216303, 0.0165288287203719,
0.0163703032374203, 0.0149628937118673, 0.0167639896711626,
0.0144140290861155, 0.0164700832677882, 0.017097353142466,
0.0177233791174971, 0.016410406871025, 0.0145656397252108,
0.0127795571441824, 0.0139787766512734, 0.0145603577832239,
0.0130325210010334, 0.0157142193796273, 0.0165295708322065,
0.0154878492755022, 0.0176888974165639, 0.0186435561581489,
0.0177330425080685, 0.0182856446463086, 0.0219973970170363,
0.0217533371623466, 0.0176290655250839, 0.0202192044566584,
0.01917805317661, 0.0186277616395779, 0.0170154664932417,
0.0195884686724334, 0.0201420675026667, 0.0183148068985733,
0.020836323932372, 0.0207067552945439, 0.018534989031893,
0.019680916901509, 0.0219673944081694, 0.0236890701508884,
0.0235543150426157, 0.0234233849979097, 0.0210565415662947,
0.0232511101944444, 0.0227186732866978, 0.0225332903957415,
0.0234773944195847, 0.0229988542468931, 0.022618525386521,
0.0197686090869307, 0.0186686467858637, 0.0189525178016395
), conditions = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 23L,
23L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 26L,
26L, 26L, 26L, 26L, 26L, 26L, 26L, 27L, 27L, 27L, 27L, 27L,
27L, 27L, 27L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 29L,
29L, 29L, 29L, 29L, 29L, 29L, 29L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 32L,
32L, 32L, 32L, 32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L,
33L, 33L, 33L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 35L,
35L, 35L, 35L, 35L, 35L, 35L, 35L), levels = c("controlday 00",
"controlday 07", "controlday 14", "controlday 21", "controlday 28",
"nZn1_day 00", "nZn1_day 07", "nZn1_day 14", "nZn1_day 21",
"nZn1_day 28", "nZn10_day 00", "nZn10_day 07", "nZn10_day 14",
"nZn10_day 21", "nZn10_day 28", "nZn100_day 00", "nZn100_day 07",
"nZn100_day 14", "nZn100_day 21", "nZn100_day 28", "Zn1_day 00",
"Zn1_day 07", "Zn1_day 14", "Zn1_day 21", "Zn1_day 28", "Zn10_day 00",
"Zn10_day 07", "Zn10_day 14", "Zn10_day 21", "Zn10_day 28",
"Zn100_day 00", "Zn100_day 07", "Zn100_day 14", "Zn100_day 21",
"Zn100_day 28"), class = "factor")), class = "data.frame", row.names = c(NA,
-280L))
> str(table5)
'data.frame': 280 obs. of 4 variables:
$ day : chr "day 00" "day 00" "day 00" "day 00" ...
$ type : chr "control" "control" "control" "control" ...
$ TAC : num 0.0135 0.0162 0.0146 0.0153 0.0128 ...
$ conditions: Factor w/ 35 levels "controlday 00",..: 1 1 1 1 1 1 1 1 2 2 ...
and my ggplot plus dunn codes:
# Summarise TAC by day and type, then draw dodged bars with SE error bars.
# NOTE: the dput()/str() of table5 above show a TAC column (there is no CAT),
# so the aggregation formulas and dunn.test() must reference TAC or they error.
library(plotrix)
library(ggplot2)

aa <- aggregate(TAC ~ day + type, data = table5, FUN = mean)       # group means
bb <- aggregate(TAC ~ day + type, data = table5, FUN = sd)         # group SDs (not used below)
ee <- aggregate(TAC ~ day + type, data = table5, FUN = std.error)  # standard errors (plotrix)
cc <- merge(aa, ee, by = c("day", "type"))
colnames(cc)[3:4] <- c("mean", "se")

ggplot(cc, aes(x = type, y = mean, fill = day)) +
  geom_col(position = "dodge") +  # geom_col() == geom_bar(stat = "identity")
  scale_fill_brewer(palette = "Paired") +
  theme_minimal() +
  labs(x = "", y = "ratio", title = "TAC") +
  theme(panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        panel.grid = element_blank(),
        plot.title = element_text(size = 40, hjust = 0),
        legend.text = element_text(size = 30),
        legend.title = element_text(size = 32),
        axis.title.x = element_text(size = 36),
        axis.text.x = element_text(size = 34),
        axis.title.y = element_text(size = 36),
        axis.text.y = element_text(size = 28)) +
  # position_dodge(0.9) matches the bars' default dodge width so each error
  # bar sits centred on its own bar; width sets the cap width.
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
                position = position_dodge(width = 0.9), width = 0.25)

# Dunn's post-hoc test across all day:type conditions with
# Benjamini-Hochberg adjusted p-values.
library(dunn.test)
dunn.test(table5$TAC, table5$conditions, method = "bh")
All my thanks for any help provided.

tidyr summarize and mutate by multiple groups - calculation

I have some data, see subset below. For each Method I want to calculate the difference in the mean Cq between the 2uL and 4 uL and the 4 uL and the 8uL.
I have a function to calculate the mean for each Method, grouped by Volume. But I can't figure out how to add another column with the difference. I think I might have to summarize the summary table, but I'm getting confused. Any help appreciated. Thanks
# Per Volume/Method summary of Cq: mean, standard deviation, and the
# coefficient of variation (CV, as a percentage of the mean).
dat_summ <- dat %>%
  group_by(Volume, Method) %>%
  summarise(
    mean_Cq = mean(Cq, na.rm = TRUE),
    sd_Cq   = sd(Cq, na.rm = TRUE),
    # summarise() can reference columns created just above it.
    CV      = (sd_Cq / mean_Cq) * 100
  )
what I want but know is wrong:
dat_summ<-
dat %>%
group_by(Volume,Method) %>%
summarise(mean_Cq = mean(Cq,na.rm=T), sd_Cq=sd(Cq,na.rm=T),
CV=(sd(Cq,na.rm=T)/mean(Cq,na.rm=T))*100)+
**mutate(delta_doub=mean_Cq_for2uL-meanCq_for4uL)**
current output:
> dat_summ
# A tibble: 12 × 5
# Groups: Volume [3]
Volume Method mean_Cq sd_Cq CV
<chr> <fct> <dbl> <dbl> <dbl>
1 2ul 2ew 20.0 0.295 1.47
2 2ul 3ew 21.9 1.79 8.18
3 2ul Manual 22.2 0.248 1.12
4 2ul WN2ew 20.5 0.604 2.94
5 4ul 2ew 19.3 0.278 1.44
6 4ul 3ew 21.2 1.33 6.29
7 4ul Manual 22.2 0.139 0.627
8 4ul WN2ew 19.9 0.493 2.48
9 8ul 2ew 18.8 0.270 1.43
10 8ul 3ew 20.8 1.21 5.81
11 8ul Manual 23.7 1.50 6.35
12 8ul WN2ew 19.5 0.463 2.38
subset of dat:
sample Method Volume Cq
1 Sample 1 2ew 2ul 20.11
2 Sample 2 2ew 2ul 20.12
3 Sample 3 2ew 2ul 19.76
17 Sample 1 WN2ew 2ul 19.89
18 Sample 2 WN2ew 2ul 20.62
19 Sample 3 WN2ew 2ul 21.07
20 Sample 4 WN2ew 2ul 20.08
52 Sample 1 2ew 4ul 19.30
53 Sample 2 2ew 4ul 19.33
54 Sample 3 2ew 4ul 19.16
68 Sample 1 WN2ew 4ul 19.49
69 Sample 2 WN2ew 4ul 19.46
70 Sample 3 WN2ew 4ul 20.42
103 Sample 1 2ew 8ul 18.91
104 Sample 2 2ew 8ul 18.60
105 Sample 3 2ew 8ul 18.42
119 Sample 1 WN2ew 8ul 18.66
120 Sample 2 WN2ew 8ul 19.13
121 Sample 3 WN2ew 8ul 19.52
> dput(dat)
structure(list(sample = c("Sample 1", "Sample 2", "Sample 3",
"Sample 4", "Sample 5", "Sample 6", "Sample 7", "Sample 8", "Sample 9",
"Sample 10", "Sample 11", "Sample 12", "Sample 13", "Sample 14",
"Sample 15", "Sample 16", "Sample 1", "Sample 2", "Sample 3",
"Sample 4", "Sample 5", "Sample 6", "Sample 7", "Sample 8", "Sample 9",
"Sample 10", "Sample 11", "Sample 12", "Sample 13", "Sample 14",
"Sample 15", "Sample 16", "Sample 1", "Sample 2", "Sample 3",
"Sample 4", "Sample 5", "Sample 6", "Sample 7", "Sample 8", "Sample 10",
"Sample 11", "Sample 12", "Sample 13", "Sample 14", "Sample 15",
"Sample 16", "Sample 1", "Sample 2", "Sample 3", "Sample 4",
"Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5", "Sample 6",
"Sample 7", "Sample 8", "Sample 9", "Sample 10", "Sample 11",
"Sample 12", "Sample 13", "Sample 14", "Sample 15", "Sample 16",
"Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5", "Sample 6",
"Sample 7", "Sample 8", "Sample 9", "Sample 10", "Sample 11",
"Sample 12", "Sample 13", "Sample 14", "Sample 15", "Sample 16",
"Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5", "Sample 6",
"Sample 7", "Sample 8", "Sample 10", "Sample 11", "Sample 12",
"Sample 13", "Sample 14", "Sample 15", "Sample 16", "Sample 1",
"Sample 2", "Sample 3", "Sample 4", "Sample 1", "Sample 2", "Sample 3",
"Sample 4", "Sample 5", "Sample 6", "Sample 7", "Sample 8", "Sample 9",
"Sample 10", "Sample 11", "Sample 12", "Sample 13", "Sample 14",
"Sample 15", "Sample 16", "Sample 1", "Sample 2", "Sample 3",
"Sample 4", "Sample 5", "Sample 6", "Sample 7", "Sample 8", "Sample 9",
"Sample 10", "Sample 11", "Sample 12", "Sample 13", "Sample 14",
"Sample 15", "Sample 16", "Sample 1", "Sample 2", "Sample 3",
"Sample 4", "Sample 5", "Sample 6", "Sample 7", "Sample 8", "Sample 10",
"Sample 11", "Sample 12", "Sample 13", "Sample 14", "Sample 15",
"Sample 16", "Sample 1", "Sample 2", "Sample 3", "Sample 4"),
Method = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("2ew", "3ew",
"Manual", "WN2ew"), class = "factor"), Volume = c("2ul",
"2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul",
"2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul",
"2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul",
"2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul",
"2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul", "2ul",
"2ul", "2ul", "2ul", "2ul", "2ul", "4ul", "4ul", "4ul", "4ul",
"4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul",
"4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul",
"4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul",
"4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul",
"4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul", "4ul",
"4ul", "4ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul",
"8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul",
"8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul",
"8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul",
"8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul",
"8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul", "8ul"),
Cq = c(20.11, 20.12, 19.76, 20.07, 20.19, 19.87, 20.33, 19.81,
20.15, 19.79, 19.67, 20.23, 19.9, 20.9, 19.93, 19.96, 19.89,
20.62, 21.07, 20.08, 21.32, 21.15, 21.07, 20.85, 21.16, 21.03,
20.79, 19.39, 20.25, 19.6, 20.14, 20.32, 26.35, 21.36, 21.67,
21.13, 21.28, 21.27, 21.36, 21.08, 20.56, 26.18, 21.31, 21.35,
21.06, 21.15, 21.28, 22.2, 22.18, 21.96, 22.56, 19.3, 19.33,
19.16, 19.27, 19.42, 19.16, 19.53, 19.1, 19.38, 19.08, 19.2,
19.44, 19.18, 20.11, 19.43, 18.81, 19.49, 19.46, 20.42, 19.21,
20.69, 20.39, 20.19, 20.13, 20.29, 20.49, 20.09, 19.19, 19.63,
19.27, 19.82, 19.76, 25.57, 20.45, 20.83, 20.68, 20.72, 21.25,
21.14, 21.06, 20.47, 22.51, 20.49, 20.9, 20.47, 20.24, 20.71,
22.09, 22.07, 22.13, 22.37, 18.91, 18.6, 18.42, 18.64, 19.14,
18.77, 18.77, 18.71, 19.39, 18.7, 18.67, 19.18, 18.79, 19.22,
18.73, NA, 18.66, 19.13, 19.52, 19.02, 20.25, 19.66, 19.78,
19.71, 19.89, 20.25, 19.47, 19.06, 19.49, 18.84, 19.27, 19.22,
24.97, 20.05, 20.33, 20.05, 20.59, 20.39, 20.08, 20.73, 20.3,
20.76, 21.12, 20.81, 20.22, 20.32, 20.69, 22.15, 25.2, 24.69,
22.63)), row.names = c(NA, -153L), class = "data.frame")
If the mean delta should be for each 'Method', then create the column first grouped by 'Method' (or if it is based across all the Method, then we do not need any grouping), get the mean difference of 'Cq' where 'Volume is '2ul' and '4ul' respectively, use that in grouping for calculating the rest of the summarised columns
library(dplyr)
# Attach a per-Method delta (mean Cq at 2ul minus mean Cq at 4ul) to every
# row of that Method, then produce the usual per-Volume/Method summary.
dat %>%
# One delta per Method, broadcast to all of that Method's rows by mutate().
group_by(Method) %>%
mutate(delta_doub =mean(Cq[Volume == '2ul'], na.rm = TRUE) -
mean(Cq[Volume=='4ul'], na.rm = TRUE) ) %>%
# delta_doub is constant within each Method, so adding it to the grouping
# keys simply carries it through into the summarised output.
group_by(Volume, Method, delta_doub) %>%
summarise(mean_Cq = mean(Cq,na.rm=TRUE), sd_Cq=sd(Cq,na.rm=TRUE),
CV=(sd(Cq,na.rm=TRUE)/mean(Cq,na.rm=TRUE))*100, .groups = "drop")
-output
# A tibble: 12 × 6
Volume Method delta_doub mean_Cq sd_Cq CV
<chr> <fct> <dbl> <dbl> <dbl> <dbl>
1 2ul 2ew 0.743 20.0 0.295 1.47
2 2ul 3ew 0.727 21.9 1.79 8.18
3 2ul Manual 0.0600 22.2 0.248 1.12
4 2ul WN2ew 0.638 20.5 0.604 2.94
5 4ul 2ew 0.743 19.3 0.278 1.44
6 4ul 3ew 0.727 21.2 1.33 6.29
7 4ul Manual 0.0600 22.2 0.139 0.627
8 4ul WN2ew 0.638 19.9 0.493 2.48
9 8ul 2ew 0.743 18.8 0.270 1.43
10 8ul 3ew 0.727 20.8 1.21 5.81
11 8ul Manual 0.0600 23.7 1.50 6.35
12 8ul WN2ew 0.638 19.5 0.463 2.38
Or it can be
# Alternative: summarise first, then compute the deltas from the summary
# table itself. NOTE: after .groups = 'drop' the mutate() runs ungrouped, so
# each delta is a single value averaged across ALL Methods — hence the same
# number repeated on every row of the output below.
dat %>%
group_by(Volume,Method) %>%
summarise(mean_Cq = mean(Cq,na.rm=TRUE), sd_Cq=sd(Cq,na.rm=TRUE),
CV=(sd(Cq,na.rm=TRUE)/mean(Cq,na.rm=TRUE))*100,
.groups = 'drop') %>%
# Mean of the per-group means at 2ul minus at 4ul (and 4ul minus 8ul).
mutate(delta_doub_2_4 = mean(mean_Cq[Volume == '2ul']) -
mean(mean_Cq[Volume == '4ul']),
delta_doub_4_8 = mean(mean_Cq[Volume == '4ul']) -
mean(mean_Cq[Volume == '8ul']))
-output
# A tibble: 12 × 7
Volume Method mean_Cq sd_Cq CV delta_doub_2_4 delta_doub_4_8
<chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2ul 2ew 20.0 0.295 1.47 0.542 -0.0443
2 2ul 3ew 21.9 1.79 8.18 0.542 -0.0443
3 2ul Manual 22.2 0.248 1.12 0.542 -0.0443
4 2ul WN2ew 20.5 0.604 2.94 0.542 -0.0443
5 4ul 2ew 19.3 0.278 1.44 0.542 -0.0443
6 4ul 3ew 21.2 1.33 6.29 0.542 -0.0443
7 4ul Manual 22.2 0.139 0.627 0.542 -0.0443
8 4ul WN2ew 19.9 0.493 2.48 0.542 -0.0443
9 8ul 2ew 18.8 0.270 1.43 0.542 -0.0443
10 8ul 3ew 20.8 1.21 5.81 0.542 -0.0443
11 8ul Manual 23.7 1.50 6.35 0.542 -0.0443
12 8ul WN2ew 19.5 0.463 2.38 0.542 -0.0443

Calculating Intake energy with loops

I am trying to run through an old coworkers script and I am hoping someone can help inform me of what exactly he did during this code segment. Earlier in the script we calculated the intake rate for several prey species and now it appears that we are grouping them based on unique locations. The section of code after this requires that there be 41 rows ( 1 row for each unique location in the complete dataset). I believe that the code subsets the data based on latitude and then adds an 'alpha' column. The main issue that I am having is what is this line calculating: x= x + d$Intakerate_kjday[j]*d$alpha[j]. For locations that had several prey items (profit.fall.38.4.959) is this code summing up "intakerate_kjday" and "alpha" and then multiplying them together? When the code is performed I receive the error Error in
`[<-.data.frame`(`*tmp*`, k, , value = c("2", "Bishop's Head", : replacement has 6 items, need 7
I would really appreciate any insight into what he was trying to calculate and a potential work around. Thank you.
dput(profit)
structure(list(Sample.ID = structure(c(5L, 19L, 27L, 28L, 30L,
38L, 12L, 62L, 49L, 29L, 25L, 17L, 61L, 67L, 27L, 26L, 32L, 9L,
47L, 45L, 5L, 26L, 27L, 44L, 45L, 4L, 1L, 43L, 19L, 35L), .Label = c("Barren Island Mud 1",
"BH High 1", "BH High 2", "BH Low 1", "BH Low 2", "BH Low 3",
"BHH 1 C", "BHH 2 E", "BHL 1 E", "BHL 2", "BHL 3 (B)", "BHM 1 C",
"BI High 1", "BI Low 1", "BI Low 2C", "BI Low 3", "BI Mud", "BIHI High B",
"BIL1 (low) E", "BIL1E", "BIL2 E", "BIL2E", "BW Fresh 1", "BW Fresh 2",
"BW High 1", "BW High 2", "BW High 5", "BW Low 3", "BW Money Stump",
"BW Mud 1", "BW SAV 1", "BW SAV 2", "BWH 1 D", "BWH 2", "BWH 3",
"BWH 5", "BWL 1", "BWL 2", "BWL 3", "BWM 1", "BWMS D", "EN High 2",
"EN High 4", "EN High 5", "EN Low 1", "EN Low 2", "EN Mud 2",
"ENH3 A High", "ENH4 A High", "ENH5 A High", "ENL1 Low E", "ENM1 A Mud",
"ENS1 SAV", "ENS2 SAV 2C", "ENS3 SAV 3E", "High 3C", "MWP 29 Low 1",
"MWP 30 Mud 1", "MWP 31 Low 2", "MWP 32 Mud 2", "MWP 33 Low 3",
"MWP 34 Low 4", "PWRC Fresh", "WP 27 HM-MARC", "WP 28 HM-MARC",
"WP 30 IT MARE", "WP29 LM-MARC"), class = "factor"), Season = structure(c(2L,
3L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 3L, 3L), .Label = c("",
"Fall", "Spring", "Spring?"), class = "factor"), Refuge = structure(c(3L,
2L, 5L, 5L, 5L, 5L, 4L, 7L, 6L, 5L, 5L, 2L, 7L, 7L, 5L, 5L, 5L,
4L, 6L, 6L, 3L, 5L, 5L, 6L, 6L, 3L, 2L, 6L, 2L, 5L), .Label = c("",
"Barren Island", "Bishop's Head", "Bishops Head", "Blackwater",
"Eastern Neck", "Martin", "PWRC"), class = "factor"), Habitat.Type = structure(c(3L,
3L, 2L, 3L, 4L, 3L, 4L, 3L, 2L, 3L, 2L, 4L, 3L, 3L, 2L, 2L, 5L,
3L, 4L, 3L, 3L, 2L, 2L, 2L, 3L, 3L, 4L, 2L, 3L, 2L), .Label = c("Fresh",
"High", "Low", "Mud", "SAV"), class = "factor"), Longitude = c(-76.03896,
-76.26205, -76.05714, -76.06332, -76.14641, -76.23522, -76.03869,
-75.99733, -76.21661, -76.23491, -76.22003, -76.26163, -75.99354,
-76.01407, -76.05714, -76.01762, -76.10363, -76.04883, -76.21547,
-76.23986, -76.03896, -76.01762, -76.05714, -76.2181, -76.23986,
-76.04883, -76.26163, -76.21661, -76.26205, -76.0235), Latitude = c(38.22447,
38.33905, 38.40959, 38.39708, 38.41795, 38.43055, 38.23255, 37.99369,
39.03264, 38.43141, 38.41026, 38.33606, 37.98833, 38.01108, 38.40959,
38.41913, 38.40351, 38.22694, 39.04036, 39.02677, 38.22447, 38.41913,
38.40959, 39.03887, 39.02677, 38.22694, 38.33606, 39.03264, 38.33905,
38.39138), Prey = structure(c(11L, 41L, 35L, 30L, 41L, 41L, 41L,
3L, 18L, 31L, 40L, 9L, 41L, 38L, 30L, 13L, 35L, 41L, 20L, 27L,
4L, 40L, 13L, 35L, 41L, 5L, 5L, 15L, 22L, 20L), .Label = c("Hydrobia",
"Hydrobia genus", "Hydrobia sp.", "Hydrobia spp", "Melampus bidentatus",
"Ruppia (maritima or rostellata)", "Ruppia genus", "Ruppia maritima",
"Schoenoplectus pungens", "Schoenoplectus robustus", "Schoenoplectus spp",
"Schoenoplectus spp.", "Scirpus acutus", "Scirpus acutus?", "Scirpus americanus",
"Scirpus fluviatilis", "Scirpus genus", "Scirpus genus 1", "Scirpus genus 1?",
"Scirpus genus 2", "Scirpus genus 3", "Scirpus genus?", "Scirpus heterochaetus",
"Scirpus meterochaetus", "Scirpus mevadensis", "Scirpus olney?",
"Scirpus olneyi", "Scirpus paludosis", "Scirpus paludosus", "Scirpus robustus",
"Scirpus robustus?", "Scirpus species", "Scirpus subterminalis",
"Scirpus subtermiralis", "Scirpus validus", "Spartina alterniflora",
"Spartina genus", "Spartina genus?", "Spartina patens", "Spartina pectinata",
"Zannichallia palustris"), class = "factor"), Density = c(2.36e-05,
0.000101477, 0.000335244, 1.17e-05, 1.91e-06, 2.8e-06, 1.72e-05,
1.34e-05, 2.71e-05, 0.000107843, 2.16e-06, 4.46e-06, 1.22e-05,
6.61e-05, 0.000263052, 3.91e-05, 0.00034925, 3.69e-06, 8.02e-06,
2.04e-05, 2.9e-05, 2.05e-05, 0.000564046, 0.001912535, 2.04e-05,
0.001117905, 0.00255132, 9.03e-05, 4.23e-05, 0.000248282), Intakerate_kcals = c(-3.5399430250046e-07,
7.6382794280604e-14, -5.02872205332896e-06, -1.7549698484651e-07,
2.70599529637464e-17, 5.81535679492809e-17, 2.19440708445348e-15,
4.34155540862746e-08, -4.06493587341127e-07, -1.61763139817e-06,
-3.23994151550826e-08, -6.68988064422799e-08, 1.10402768540446e-15,
-9.91487886840506e-07, -3.94580269988612e-06, -5.8649138992111e-07,
-5.23882134070119e-06, 1.00998060784975e-16, -1.2029789281118e-07,
-3.05994985702607e-07, 9.3958523768985e-08, -3.07494963928282e-07,
-8.46097103856411e-06, -2.86925082960488e-05, 3.08688633134856e-15,
3.62058033172122e-06, 8.25888178764606e-06, -1.35448644277712e-06,
-6.34490870510011e-07, -3.72424640639279e-06), Intakerate_kjs = c(-1.48111216166192e-06,
3.19585611270047e-13, -2.10401730711284e-05, -7.34279384597799e-07,
1.13218843200315e-16, 2.43314528299791e-16, 9.18139924135334e-15,
1.81650678296973e-07, -1.70076916943527e-06, -6.76816976994329e-06,
-1.35559153008866e-07, -2.79904606154499e-07, 4.61925183573226e-15,
-4.14838531854068e-06, -1.65092384963235e-05, -2.45387997542992e-06,
-2.19192284894938e-05, 4.22575886324335e-16, -5.03326383521979e-07,
-1.28028302017971e-06, 3.93122463449433e-07, -1.28655892907593e-06,
-3.54007028253523e-05, -0.000120049454710668, 1.29155324103624e-14,
1.51485081079216e-05, 3.45551613995111e-05, -5.66717127657947e-06,
-2.65470980221389e-06, -1.55822469643474e-05), Intakerate_kjday = c(-0.12796809076759,
2.76121968137321e-08, -1.81787095334549, -0.0634417388292498,
9.78210805250721e-12, 2.1022375245102e-11, 7.93272894452929e-10,
0.0156946186048585, -0.146946456239208, -0.584769868123101, -0.011712310819966,
-0.0241837579717487, 3.99103358607267e-10, -0.358420491521915,
-1.42639820608235, -0.212015229877145, -1.89382134149226, 3.65105565784226e-11,
-0.043487399536299, -0.110616452943527, 0.033965780842031, -0.111158691472161,
-3.05862072411043, -10.3722728870017, 1.11590200025531e-09, 1.30883110052443,
2.98556594491776, -0.489643598296466, -0.22936692691128, -1.34630613771962
)), row.names = c(NA, -30L), class = "data.frame")
lat=unique(profit$Latitude)
## for each location I am calculating the weight for Fall only
# For every unique latitude with at least one Fall row, a per-location data
# frame is created in the global environment via assign(), named
# "profit.fall.<latitude rounded to 5 decimals>".
nfall=0
latfall<-c(double())
for(i in lat){
name = paste0("profit.fall.",round(i,5))
x = subset(profit,Latitude==i & Season=="Fall")
if(nrow(x)>=1){
# alpha is a per-row weight; it is currently forced to 1 for every row
# (the commented-out alternative weighted each prey item by its share of
# the location's total Density), so the later weighted sum degenerates to
# a plain sum of Intakerate_kjday.
for(j in 1:nrow(x)){
x$alpha[j]<- 1 # used to be this x$Density[j]/sum(x$Density)
}
nfall= nfall+1
assign(name, data.frame(x))
# latfall records the rounded latitudes actually kept, in creation order;
# the second loop below uses it to reconstruct the data-frame names.
latfall<-c(latfall,round(i,5))
print(name)
}
}
View(profit.fall.38.4.959)
# Pre-allocate one summary row per Fall location and label all seven
# columns in a single assignment.
profit.fall.all <- data.frame(matrix(ncol = 7, nrow = nfall))
names(profit.fall.all) <- c("Id", "Refuge", "Season", "HType",
                            "Lat", "Long", "IntakeEnergy")
View(profit.fall.all)
k=0
lat=latfall
# Collapse each per-location data frame (created by assign() above) into a
# single summary row of profit.fall.all.
for(i in lat){
df=as.name(paste0('profit.fall.',i))
d=get(as.character(df))
x=0
# Accumulates sum(d$Intakerate_kjday * d$alpha): an alpha-weighted total
# intake rate for the location. With alpha fixed at 1 above, this is just
# the plain sum of Intakerate_kjday over all prey items there.
for(j in 1:nrow(d)){
x= x + d$Intakerate_kjday[j]*d$alpha[j]
}
k=k+1
# BUG: d$Longtitude is a misspelling of d$Longitude (see the dput: the
# column is named Longitude). `$` on a missing column returns NULL, and
# as.numeric(NULL) is a length-0 vector, so new_row ends up with 6
# elements instead of 7 -- the direct cause of the
# "replacement has 6 items, need 7" error on the assignment below.
new_row <- c(k,as.character(d$Refuge[1]),as.character(d$Season[1]),as.character(d$Habitat.Type[1]),as.numeric(d$Latitude[1]),as.numeric(d$Longtitude[1]),as.numeric(x))
#names(new_row)<-c("id","Refuge","Season","HType","Lat","Long","Intakerate_kjday")
#profit.spring.all <- rbind(profit.spring.all, new_row)
profit.fall.all[k,] <- new_row
}
View(profit.fall.all)
The code in question apparently computes (very inefficiently and inaccurately)
sum(d$Intakerate_kjday * d$alpha)
Your error, however, suggests that a column is missing in one of the data frames: the code reads d$Longtitude, but the dput shows the column is spelled Longitude, so that term contributes nothing (length 0) and new_row only has 6 items.
Take a look at new_row here:
for(i in lat){
  # Recover the per-location data frame created by assign() earlier.
  d <- get(paste0("profit.fall.", i))
  # Alpha-weighted total intake rate. The original scalar loop computed
  # exactly sum(d$Intakerate_kjday * d$alpha); with alpha == 1 this is a
  # plain sum, so vectorize it directly.
  x <- sum(d$Intakerate_kjday * d$alpha)
  k <- k + 1
  # FIX: the original used d$Longtitude (misspelled). That column does not
  # exist, `$` returned NULL, and as.numeric(NULL) has length 0, which is
  # why new_row had 6 items instead of 7 and triggered
  # "replacement has 6 items, need 7". d$Longitude is the real column.
  new_row <- c(k,
               as.character(d$Refuge[1]),
               as.character(d$Season[1]),
               as.character(d$Habitat.Type[1]),
               as.numeric(d$Latitude[1]),
               as.numeric(d$Longitude[1]),
               as.numeric(x))
  if (length(new_row) != ncol(profit.fall.all)) {
    # Defensive guard: drop into the debugger if any location still
    # produces a malformed row.
    browser()
  }
  profit.fall.all[k, ] <- new_row
}

Is there a way to find the median using beta distribution parameters in R?

I am working with CSV dataset called productQuality where every row represents a weld type and the beta distribution parameters (α and β) for that specific weld. I was wondering if there's a way to calculate and list the medians for each weld type?
Here's a dput of my dataset:
structure(list(weld.type.ID = 1:33, weld.type = structure(c(29L,
11L, 16L, 4L, 28L, 17L, 19L, 5L, 24L, 27L, 21L, 32L, 12L, 20L,
26L, 25L, 3L, 7L, 13L, 22L, 33L, 1L, 9L, 10L, 18L, 15L, 31L,
8L, 23L, 2L, 14L, 6L, 30L), .Label = c("1,40,Material A", "1,40S,Material C",
"1,80,Material A", "1,STD,Material A", "1,XS,Material A", "10,10S,Material C",
"10,160,Material A", "10,40,Material A", "10,40S,Material C",
"10,80,Material A", "10,STD,Material A", "10,XS,Material A",
"13,40,Material A", "13,40S,Material C", "13,80,Material A",
"13,STD,Material A", "13,XS,Material A", "14,40,Material A",
"14,STD,Material A", "14,XS,Material A", "15,STD,Material A",
"15,XS,Material A", "2,10S,Material C", "2,160,Material A", "2,40,Material A",
"2,40S,Material C", "2,80,Material A", "2,STD,Material A", "2,XS,Material A",
"4,80,Material A", "4,STD,Material A", "6,STD,Material A", "6,XS,Material A"
), class = "factor"), alpha = c(281L, 196L, 59L, 96L, 442L, 98L,
66L, 30L, 68L, 43L, 35L, 44L, 23L, 14L, 24L, 38L, 8L, 8L, 5L,
19L, 37L, 38L, 6L, 11L, 29L, 6L, 16L, 6L, 16L, 3L, 4L, 9L, 12L
), beta = c(7194L, 4298L, 3457L, 2982L, 4280L, 3605L, 2229L,
1744L, 2234L, 1012L, 1096L, 1023L, 1461L, 1303L, 531L, 233L,
630L, 502L, 328L, 509L, 629L, 554L, 358L, 501L, 422L, 566L, 403L,
211L, 159L, 268L, 167L, 140L, 621L)), class = "data.frame", row.names = c(NA,
-33L))
According to Wikipedia there is an approximate solution for the median for alpha, beta >1, but no general closed-form solution. Below I implement the brute-force exact solution and the approximate solution:
## Exact median of Beta(alpha, beta): numerically invert the regularized
## incomplete beta function at p = 1/2, i.e. solve
## pbeta(x, alpha, beta) = 0.5 on (0, 1).
med_exact0 <- function(alpha, beta, eps = 1e-12) {
  half_mass_gap <- function(x) pbeta(x, alpha, beta) - 1/2
  uniroot(half_mass_gap, interval = c(eps, 1 - eps))$root
}
## Vectorized wrapper so whole columns of parameters can be passed at once.
med_exact <- Vectorize(med_exact0, vectorize.args = c("alpha", "beta"))
## Closed-form approximation (valid for alpha, beta > 1):
## median ~ (alpha - 1/3) / (alpha + beta - 2/3).
med_approx <- function(alpha, beta) (alpha - 1/3)/(alpha + beta - 2/3)
Edit: comments point out that the inverse ('brute force') solution is already implemented in base R as qbeta(p=0.5,...)! It is almost certainly more robust and computationally efficient than my solution ...
I called your data dd:
# Compare exact and approximate medians on the question's data (dd).
evals <- with(dd,med_exact(alpha,beta))
avals <- with(dd,med_approx(alpha,beta))
# Base R already implements the exact inverse as qbeta(0.5, alpha, beta).
evals2 <- with(dd,qbeta(0.5,alpha,beta))
# Largest relative discrepancy between exact and approximate medians.
max(abs((evals-avals)/evals)) ## 0.0057
In the worst case in your data the exact and approximate solutions are off by about 0.6% ...

How do I separate these apparently linked columns?

Running into something strange as I try to use dplyr's select command to reduce the number of columns I have. I name three columns but I keep getting 4. Aside from the star trek chain of command flashbacks I find this behaviour odd and not sure how to get around it. Also, why is this happening?
Here is my dataframe expressing the number of occurrences in each block of time in a day. It's also pretty verbose for just 6 rows of data.
library(dplyr)
library(tidyr)
test <- structure(list(Day = c("Dec 10", "Dec 10", "Dec 10", "Dec 10",
"Dec 11", "Dec 11"), Number = c(10L, 10L, 10L, 10L, 11L, 11L),
time = c("08:30", "12:00", "15:30", "19:00", "08:30", "12:00"
), Start = structure(c(1544430600, 1544443200, 1544455800,
1544468400, 1544517000, 1544529600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), n = c(29L, 74L, 20L, 26L, 29L,
32L)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), vars = c("Day", "Number", "time"), drop = TRUE, indices = list(
0L, 1L, 2L, 3L, 4L, 5L), group_sizes = c(1L, 1L, 1L, 1L,
1L, 1L), biggest_group_size = 1L, labels = structure(list(Day = c("Dec 10",
"Dec 10", "Dec 10", "Dec 10", "Dec 11", "Dec 11"), Number = c(10L,
10L, 10L, 10L, 11L, 11L), time = c("08:30", "12:00", "15:30",
"19:00", "08:30", "12:00")), class = "data.frame", row.names = c(NA,
-6L), vars = c("Day", "Number", "time"), drop = TRUE, indices = list(
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L,
38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L,
50L, 51L, 52L), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
Day = c("Dec 10", "Dec 10", "Dec 10", "Dec 10", "Dec 11",
"Dec 11", "Dec 11", "Dec 11", "Dec 12", "Dec 12", "Dec 12",
"Dec 12", "Dec 13", "Dec 13", "Dec 13", "Dec 13", "Dec 14",
"Dec 14", "Dec 14", "Dec 14", "Dec 15", "Dec 15", "Dec 15",
"Dec 17", "Dec 17", "Dec 17", "Dec 17", "Dec 18", "Dec 18",
"Dec 18", "Dec 18", "Dec 19", "Dec 19", "Dec 19", "Dec 4",
"Dec 4", "Dec 4", "Dec 4", "Dec 5", "Dec 5", "Dec 5", "Dec 5",
"Dec 6", "Dec 6", "Dec 6", "Dec 6", "Dec 7", "Dec 7", "Dec 7",
"Dec 7", "Dec 8", "Dec 8", "Dec 8"), Number = c(10L, 10L,
10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L,
13L, 13L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 17L, 17L, 17L,
17L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L),
time = c("08:30", "12:00", "15:30", "19:00", "08:30", "12:00",
"15:30", "19:00", "08:30", "12:00", "15:30", "19:00", "08:30",
"12:00", "15:30", "19:00", "08:30", "12:00", "15:30", "19:00",
"08:30", "12:00", "15:30", "08:30", "12:00", "15:30", "19:00",
"08:30", "12:00", "15:30", "19:00", "08:30", "12:00", "15:30",
"08:30", "12:00", "15:30", "19:00", "08:30", "12:00", "15:30",
"19:00", "08:30", "12:00", "15:30", "19:00", "08:30", "12:00",
"15:30", "19:00", "08:30", "12:00", "15:30")), class = "data.frame", row.names = c(NA,
-53L), vars = c("Day", "Number", "time"), drop = TRUE)))
You can see in the output that there's only 3 variables listed but oddly shows more. And when I select for specific variables or subtract others it won't work.
# On a grouped_df, select() silently re-adds any grouping variable that was
# not requested (here `Number`), which is why 4 columns come back.
test %>%
select(Day, time, n)
The tibble should be ungrouped before selecting variables, as described in the answer to '"Adding missing grouping variables" message in dplyr in R':
Without ungrouping:
test %>%
select(Day, time, n)
> test %>%
+ select(Day, time, n)
Adding missing grouping variables: `Number`
# A tibble: 6 x 4
# Groups: Day, Number, time [6]
Number Day time n
<int> <chr> <chr> <int>
1 10 Dec 10 08:30 29
2 10 Dec 10 12:00 74
3 10 Dec 10 15:30 20
4 10 Dec 10 19:00 26
5 11 Dec 11 08:30 29
6 11 Dec 11 12:00 32
With ungrouping
# Drop the grouping structure first so select() is not forced to retain
# the grouping variable `Number`.
test %>%
ungroup() %>%
select(Day, time, n)
> test %>%
+ ungroup() %>%
+ select(Day, time, n)
# A tibble: 6 x 3
Day time n
<chr> <chr> <int>
1 Dec 10 08:30 29
2 Dec 10 12:00 74
3 Dec 10 15:30 20
4 Dec 10 19:00 26
5 Dec 11 08:30 29
6 Dec 11 12:00 32
Ungrouping the dataframe fixes it all.

Resources