multifactor anova keeps giving error codes - r

I am working with a dataset in which I need to compare ordinal data to continuous data in a different column, i.e., individuals were categorized (by age, actually) and I need to compare different age ranges to two different test values. I have been attempting to run a multifactor anova, and have had no luck.
First, I subset each age category and tried this:
aov.first.molar<-aov(carbon.combo~first.m.cat.1+first.m.cat.2+first.m.cat.3+first.m.cat.4+first.m.cat.5)
Error in model.frame.default(formula = carbon.combo ~ first.m.cat.1 + :
invalid type (list) for variable 'first.m.cat.1'
So the subsets didn't work, so I tried just using the column headers, just to see if it would magically organize by category...
> aov.albania.first<-aov(albania$AgeCat_first~albania$juv_deltaC_dentine+albania$Adult_deltaC_collagen)
Warning messages:
1: In model.response(mf, "numeric") :
using type = "numeric" with a factor response will be ignored
2: In Ops.factor(y, z$residuals) : ‘-’ not meaningful for factors
> summary(aov.albania.first)
Error in levels(x)[x] : only 0's may be mixed with negative subscripts
That obviously didn't work either, and I am not sure what I am doing wrong. I set everything as a factor, and I don't understand why the code is not working.
I am wondering if it has something to do with the fact that the nature of my test data is negative. I am not sure how to fix that without altering the data
Here is my data, as requested. I am sorry it's so messy, I am not sure how to format it better. Turning it into a matrix helped, but I am still having problems with aov and ggplot not being able to find certain things that I already turned into factors...
structure(list(Number = structure(1:10, .Label = c("142-c-1",
"142-c-3", "142-c-5", "156-c-1", "156-c-4", "156-c-6", "157-c-1",
"157-c-3", "157-c-5", "157-c-6", "158-c-3", "158-c-6", "178-c-1/A",
"178-c-2/A", "178-c-2/b", "178-c-3/b", "178-c-4/b", "186-c-2/a",
"186-c-2/b", "186-c-3/b", "186-c-4/b", "186-c-5/b", "186-c-6/b",
"192-c-1", "192-c-2", "192-c-3", "192-c-4", "192-c-5", "205-c-1",
"205-c-2", "205-c-3", "205-c-4", "205-c-5", "205-c-6", "210-c-1",
"210-c-2", "210-c-3", "210-c-4", "210-c-5", "215-c-1", "215-c-2",
"215-c-3", "215-c-4", "215-c-5", "215-c-6", "215-c-7", "270-c-1",
"270-c-2", "270-c-3", "270-c-4", "270-c-5", "295-c-1", "295-c-3",
"295-c-4", "353-c-2", "353-c-3", "353-c-4", "353-c-5", "353-c-6",
"382-c-1", "390-c-1", "390-c-2", "390-c-3"), class = "factor"),
ToothID = structure(c(3L, 3L, 3L, 8L, 8L, 8L, 7L, 7L, 7L,
7L), .Label = c("LI2", "LM1", "LM1-2", "LM3", "LP1-2", "M2",
"RM1-2", "RM2"), class = "factor"), sex = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"),
Al.Qahtani.category = structure(c(2L, 5L, 8L, 2L, 5L, 8L,
2L, 6L, 7L, 8L), .Label = c("AC", "CR 1/2", "CR 3/4", "CRC",
"R 1/2", "R 1/4", "R 3/4", "RC", "Ri ", "unk"), class = "factor"),
AgeCat_first = structure(c(1L, 2L, 3L, 2L, 3L, 4L, 1L, 2L,
2L, 3L), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
AgeCat_second = c(2L, 3L, 4L, 2L, 3L, 4L, 2L, 3L, 4L, 4L),
sample_age_first = structure(c(9L, 18L, 23L, 17L, 27L, 6L,
10L, 13L, 21L, 23L), .Label = c("10.5 to 16.5", "11.5 to 14.5",
"11.5 to 15.5", "11.5 to 18.5", "11.5 to 19.5", "12.5 to 15.5",
"12.5 to 19.5", "15.5 to 20.5", "1.5 to 2.5", "1.5 to 3.5",
"17.5 to 22.5", "2.5 to 4.5", "3.5 to 6.5", "3.5 to 7.5",
" 4.5 to 6.5 ", "4.5 to 6.5", "4.5 to 7.5", "4.5 to 8.5",
"6.5 to 11.5", "6.5 to 8.5", "6.5 to 9.5", "7.5 to 10.5",
"8.5 to 10.5", "8.5 to 11.5", "8.5 to 12.5", "9.5 to 12.5",
"9.5 to 13.5", "9.5 to 15.5", "unk"), class = "factor"),
sample_age_second = structure(c(16L, 25L, 7L, 15L, 26L, 7L,
15L, 22L, 2L, 7L), .Label = c("10.5 to 16.5", "11.5 to 13.5",
"11.5 to 14.5", "11.5 to 15.5", "11.5 to 18.5", "11.5 to 19.5",
"12.5 to 15.5", "12.5 to 19.5", "14.5 to 17.5", "15.5 to 20.5",
"1.5 to 3.5", "17.5 to 22.5", "3.5 to 6.5", "4.5 to 6.5",
"4.5 to 7.5", "4.5 to7.5", " 5.5 to 6.5 ", "6.5 to 11.5",
"6.5 to 8.5", "6.5 to 9.5", "7.5 to 11.5", "7.5 to 12.5",
"8.5 to 12.5", "9.5 to 12.5", "9.5 to12.5", "9.5 to 13.5",
"9.5 to 15.5", "unk"), class = "factor"), AgeCat_adult = c(9L,
9L, 9L, 8L, 8L, 8L, 7L, 7L, 7L, 7L), age_at_death = structure(c(3L,
3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("18-30",
"31-45", ">45", "Adolescent", "Ind"), class = "factor"),
weight_percent_.N = c(11.5, 6.6, 6.8, 7.8, 8.7, 9.4, 5.6,
5.6, 9.1, 3.9), weight_percent_C = c(37.8, 26.2, 29.5, 32.7,
34.7, 34.4, 22, 30.7, 46.8, 22.7), juv_deltaN_dentine = c(4.54,
4.45, NA, 4.03, 5.73, 6.81, 5.03, 4.58, 0.3, NA), juv_deltaC_dentine = c(-22.042,
-22.865, -24.345, -23.557, -23.24, -22.282, -22.85, -22.697,
-25.439, -25.776), juv_proxy = c(7.958, 7.135, 5.655, 6.443,
6.76, 7.718, 7.15, 7.303, 4.561, 4.224), Adult_deltaC_collagen = c(-18.62,
-18.62, -18.62, -18.9, -18.9, -18.9, -18.64, -18.64, -18.64,
-18.64), adult_proxy = c(11.38, 11.38, 11.38, 11.1, 11.1,
11.1, 11.36, 11.36, 11.36, 11.36), Adult_deltaC_apatite = c(12.29,
12.29, 12.29, -10.23, -10.23, -10.23, -10.73, -10.73, -10.73,
-10.73), Adult_deltaN = c(-18.62, -18.62, -18.62, -18.9,
-18.9, -18.9, -18.64, -18.64, -18.64, -18.64), apatite_collagen_spacing = c(8.66,
8.66, 8.66, 7.67, 7.67, 7.67, 7.74, 7.74, 7.74, 7.74), Adult_percent_C = structure(c(2L,
2L, 2L, 6L, 6L, 6L, 7L, 7L, 7L, 7L), .Label = c("14.31%",
"22.35%", "33.96%", "34.58%", "36.60%", "39.07%", "39.51%",
"42.12%", "42.17%", "42.29%", "42.81%", "44.01%", "44.72%",
"45.52%"), class = "factor"), Adult_percent_N = structure(c(14L,
14L, 14L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), .Label = c("12.16%",
"12.30%", "13.04%", "13.78%", "14.20%", "14.89%", "14.97%",
"15.13%", "15.18%", "15.66%", "15.85%", "16.10%", "4.60%",
"7.98%"), class = "factor"), Adult_CN_ratio = c(3.27, 3.27,
3.27, 3.31, 3.31, 3.31, 3.25, 3.25, 3.25, 3.25), delta_18O = c(-5.5,
-5.5, -5.5, -4.79, -4.79, -4.79, -5.39, -5.39, -5.39, -5.39
), CP = c(0.17, 0.17, 0.17, 0.21, 0.21, 0.21, 0.2, 0.2, 0.2,
0.2), IR_SF = c(3.33, 3.33, 3.33, 3.12, 3.12, 3.12, 3.19,
3.19, 3.19, 3.19), adult_bone_sampled = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("femur", "humerus",
"occipital", "temporal", "tibia"), class = "factor")), .Names = c("Number",
"ToothID", "sex", "Al.Qahtani.category", "AgeCat_first", "AgeCat_second",
"sample_age_first", "sample_age_second", "AgeCat_adult", "age_at_death",
"weight_percent_.N", "weight_percent_C", "juv_deltaN_dentine",
"juv_deltaC_dentine", "juv_proxy", "Adult_deltaC_collagen", "adult_proxy",
"Adult_deltaC_apatite", "Adult_deltaN", "apatite_collagen_spacing",
"Adult_percent_C", "Adult_percent_N", "Adult_CN_ratio", "delta_18O",
"CP", "IR_SF", "adult_bone_sampled"), row.names = c(NA, 10L), class = "data.frame")

Your data corresponds to the second question, and so does this answer.
The way the aov function works is by measuring response as dependent on the categories. The formula thus needs to be designed as variable ~ factor.
aov.albania.first <- aov(juv_deltaC_dentine + Adult_deltaC_collagen ~ AgeCat_first,
data = albania)
summary(aov.albania.first)
Df Sum Sq Mean Sq F value Pr(>F)
AgeCat_first 3 6.480 2.160 1.667 0.272
Residuals 6 7.773 1.296
The problem with the first question might be similar to this. Further, check str(first.m.cat.1) and reformat the variable to vector.

Related

How to rearrange output from dplyr

When I write the following code
ddply(milkers, .(dim_cat, lact_cat), function(x) mean(x$milkyield))
I get the following output
The mean calculations regarding milk production by class of stock (1 vs 2) are correct. I would like to end up with a table more like the one below.
Effectively I am trying to get the number of animals in each time period and calculate their mean milk production. The problem is that it is calculating the total number of animals for all time periods and mean milk production for all time periods.
The code I used to generate this data is below.
heiferdat <- subset(milkers, lact_cat== 1)
cowdat <- subset(milkers, lact_cat== 2)
ddply(milkers, .(dim_cat), function(x) c(Heifers = sum(milkers$lact_cat==1), H_Milk= mean(heiferdat$milkyield), Cows = sum(milkers$lact_cat==2), C_Milk= mean(cowdat$milkyield)))
I had anticipated that in this code the .(dim_cat) variable would be applied to the function to restrict the sum and mean functions to only include animals in the correct time period.
I am looking for advice as to how I can get the output with one row per time period with the number of animals for each class lact_cat and the mean milk production for each lact_cat
Thank you
The following is a subset of the data that i am working with.
dput(milkers[180:200, c(11, 25, 26)])
dput(heiferdat[1:20, c(11, 25, 26)])
dput(cowdat[1:20, c(11, 25, 26)])
> dput(milkers[180:200, c(11, 25, 26)])
structure(list(milkyield = structure(c(8.42, 38.32, 14.27, 7.68,
16.59, 17.19, 24.45, 33.47, 36.16, 25.88, 11.61, 18.96, 11.27,
33.6, 21.57, 20.87, 9.62, 7.93, 21.02, 17.75, 22.01), label = "Milk (L)", class = c("labelled",
"numeric")), dim_cat = structure(c(5L, 3L, 7L, 7L, 2L, 7L, 2L,
2L, 2L, 3L, 6L, 6L, 2L, 3L, 6L, 6L, 6L, 6L, 6L, 7L, 6L), .Label = c("<31",
"31-90", "91-150", "151-210", "211-270", "271-330", ">330"), class = c("labelled",
"factor"), label = "Days in Milk"), lact_cat = structure(c(2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), row.names = 180:200, class = "data.frame")
> dput(heiferdat[1:20, c(11, 25, 26)])
structure(list(milkyield = structure(c(14.27, 17.19, 11.61, 18.96,
11.27, 21.57, 20.87, 9.62, 7.93, 21.02, 17.75, 22.01, 25.15,
11.75, 12.6, 15.62, 19.29, 8.85, 15.52, 11.62), label = "Milk (L)", class = c("labelled",
"numeric")), dim_cat = structure(c(7L, 7L, 6L, 6L, 2L, 6L, 6L,
6L, 6L, 6L, 7L, 6L, 6L, 6L, 6L, 7L, 6L, 6L, 6L, 6L), .Label = c("<31",
"31-90", "91-150", "151-210", "211-270", "271-330", ">330"), class = c("labelled",
"factor"), label = "Days in Milk"), lact_cat = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), row.names = c(182L,
185L, 190L, 191L, 192L, 194L, 195L, 196L, 197L, 198L, 199L, 200L,
201L, 202L, 203L, 204L, 205L, 206L, 207L, 208L), class = "data.frame")
> dput(cowdat[1:20, c(11, 25, 26)])
structure(list(milkyield = structure(c(15.73, 14.56, 16.94, 16.25,
39.09, 9.79, 8.41, 3.05, 38.89, 11.7, 29.89, 19.73, 18.2, 20.63,
20.32, 52.99, 10.11, 8.08, 10.84, 33.75), label = "Milk (L)", class = c("labelled",
"numeric")), dim_cat = structure(c(3L, 6L, 6L, 2L, 3L, 7L, 6L,
7L, 3L, 7L, 3L, 6L, 3L, 6L, 2L, 2L, 7L, 6L, 7L, 7L), .Label = c("<31",
"31-90", "91-150", "151-210", "211-270", "271-330", ">330"), class = c("labelled",
"factor"), label = "Days in Milk"), lact_cat = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("1", "2"), class = "factor")), row.names = c(NA,
20L), class = "data.frame")
Following on from @DanChaltiel's advice to use dplyr, here is a dplyr approach:
library(dplyr)
# One row per (dim_cat, lact_cat) combination, holding the mean milk yield
# and the number of animals in that combination.
all_summary <- milkers %>%
  group_by(dim_cat, lact_cat) %>%
  summarise(
    avg = mean(milkyield),
    num = n()
  )
At this point you have all the summary information calculated. The following code is just formatting/presentation.
# Presentation only: split the summary by lactation class, rename the
# count/mean columns per class, then place both classes side by side with
# one row per dim_cat.
heifer_summary <- all_summary %>%
  filter(lact_cat == 1) %>%
  select(dim_cat, Heifers = num, H_Milk = avg)

cow_summary <- all_summary %>%
  filter(lact_cat == 2) %>%
  select(dim_cat, Cows = num, C_Milk = avg)

arranged_summary <- heifer_summary %>%
  full_join(cow_summary, by = "dim_cat") %>%
  select(dim_cat, Heifers, H_Milk, Cows, C_Milk) %>%
  arrange(dim_cat)

Flexdashboard checkboxGroupInput losing ggplot data

I have an issue that is related to this one, but was unable to come to a solution for mine.
I have a reactive ggplot that I would like to update using a check box based on group data.
Currently, when I have ONE box selected, the data displays correctly. If I select more than one check box, I lose data points. See pictures below. I think I have to change the way I'm filtering my data and use droplevels somewhere but not sure how to integrate that (I'm new to shiny!). Any suggestions are appreciated!
WHOC_Sum_CMJ <- structure(list(Athlete = structure(c(1L, 1L, 1L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L), .Label = c("Athlete 1", "Athlete 10",
"Athlete 11", "Athlete 12", "Athlete 13", "Athlete 14", "Athlete 2",
"Athlete 3", "Athlete 4", "Athlete 5", "Athlete 6", "Athlete 7",
"Athlete 8", "Athlete 9"), class = "factor"), Date = structure(c(1L,
4L, 5L, 1L, 3L, 5L, 7L, 2L, 3L, 5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L,
5L, 7L, 1L, 3L, 6L, 7L, 2L, 4L, 5L, 8L, 1L, 3L, 5L, 7L, 1L, 3L,
5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L,
6L, 7L, 1L, 3L, 5L, 7L), .Label = c("2020-01-06", "2020-01-07",
"2020-01-13", "2020-01-14", "2020-01-21", "2020-01-23", "2020-01-27",
"2020-01-28"), class = "factor"), Position = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("DEF", "FWD", "GOALIE"), class = "factor"),
Program = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L,
4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Navy", "Red", "RTP", "White"), class = "factor"),
mRSI = c(0.36, 0.38, 0.42, 0.46, 0.46, 0.47, 0.48, 0.31,
0.3, 0.24, 0.3, 0.29, 0.26, 0.28, 0.28, 0.36, 0.35, 0.43,
0.43, 0.28, 0.31, 0.28, 0.3, 0.33, 0.36, 0.35, 0.37, 0.37,
0.36, 0.37, 0.36, 0.3, 0.36, 0.34, 0.37, 0.26, 0.28, 0.34,
0.3, 0.39, 0.4, 0.43, 0.43, 0.43, 0.47, 0.46, 0.48, 0.34,
0.36, 0.33, 0.37, 0.28, 0.28, 0.34, 0.33), SystemWeight = c(617.21,
612.4, 620.45, 672.08, 682.23, 670.5, 663.41, 517.33, 515.23,
511.62, 517.85, 697.55, 703.92, 689.43, 691.33, 859.06, 845.9,
850.97, 851.84, 655.79, 665.09, 673.91, 667.92, 626.78, 632.92,
634.52, 624.88, 637.55, 645.6, 648.78, 646.64, 558.03, 563.23,
569.58, 560.95, 693.63, 695.54, 684.37, 684.58, 641.18, 660.8,
663.95, 660, 594.92, 596.97, 591.36, 585.64, 522.35, 518.17,
530.95, 523.5, 780.65, 789.81, 775.84, 775.48), FTCT = c(0.61,
0.62, 0.67, 0.74, 0.75, 0.77, 0.77, 0.54, 0.55, 0.44, 0.53,
0.53, 0.49, 0.53, 0.56, 0.6, 0.58, 0.68, 0.68, 0.53, 0.57,
0.54, 0.55, 0.61, 0.63, 0.64, 0.65, 0.59, 0.58, 0.59, 0.59,
0.51, 0.59, 0.59, 0.59, 0.53, 0.57, 0.63, 0.59, 0.76, 0.76,
0.79, 0.78, 0.67, 0.72, 0.72, 0.74, 0.63, 0.65, 0.61, 0.63,
0.49, 0.5, 0.53, 0.57), JumpHeight_cm = c(28.97, 29.78, 31.43,
35.83, 35.41, 36.59, 36.92, 27.56, 26.11, 26.15, 26.82, 26.15,
25.08, 24.98, 24.62, 29.39, 30.17, 32.42, 32.56, 26.6, 27.25,
25.58, 27.88, 29.17, 31.58, 28.48, 31.24, 33.73, 32.78, 33.09,
33.43, 29.73, 31.91, 30.65, 32.98, 24.15, 24.24, 27.57, 25.44,
26.68, 26.39, 27.43, 28.87, 35.44, 36.29, 35.71, 36.06, 26.79,
27.76, 26.82, 29.71, 28.69, 26.9, 31.12, 29.77), EJH = c(17.6,
18.58, 21.11, 26.66, 26.69, 28.08, 28.38, 14.99, 14.39, 11.41,
14.33, 13.8, 12.34, 13.29, 13.67, 17.58, 17.5, 22.03, 22.19,
14.03, 15.59, 13.92, 15.39, 17.7, 19.75, 18.37, 20.3, 19.99,
18.9, 19.62, 19.61, 15.09, 18.8, 18.18, 19.6, 12.78, 13.87,
17.28, 15.06, 20.44, 20.12, 21.74, 22.52, 23.8, 26.25, 25.68,
26.73, 16.99, 18.13, 16.42, 18.82, 14.09, 13.43, 16.61, 16.9
), Weight = c(62.94, 62.45, 63.27, 68.54, 69.57, 68.38, 67.65,
52.76, 52.54, 52.17, 52.81, 71.13, 71.78, 70.31, 70.5, 87.61,
86.26, 86.78, 86.87, 66.88, 67.82, 68.72, 68.11, 63.92, 64.54,
64.71, 63.72, 65.02, 65.84, 66.16, 65.94, 56.91, 57.44, 58.09,
57.2, 70.74, 70.93, 69.79, 69.81, 65.39, 67.39, 67.71, 67.31,
60.67, 60.88, 60.31, 59.72, 53.27, 52.84, 54.15, 53.39, 79.61,
80.54, 79.12, 79.08)), class = "data.frame", row.names = c(NA,
-55L))
```
checkboxGroupInput("Program", label = "Program", choices = unique(WHOC_Sum_CMJ$Program), selected = "Red", inline = TRUE)
# (Note: for the code I cut out some of the styling to make it more readable. That's why it looks different than the pictures).
```
```
renderPlot({
f <- WHOC_Sum_CMJ %>%
select(Date, Athlete, JumpHeight_cm, Program)%>%
filter(Program == input$Program)
p <- ggplot(f)+
geom_line(aes(x=Date, y=JumpHeight_cm, colour = Athlete))+
geom_point(aes(x=Date, y=JumpHeight_cm, colour = Athlete))+
theme_bw() +
labs(title = "Team Jump Height",
x = "Date",
y = "Jump Height (cm)")+
scale_x_date(limits = c(min = min(WHOC_Sum_CMJ$Date), max = max(WHOC_Sum_CMJ$Date)), labels = date_format("%m/%d"),
date_breaks = "2 weeks", expand = c(.08,0))+
guides(col = guide_legend(nrow = 3))+
geom_text_repel(data= subset(f, Date == min(Date)), aes(x=Date, y=JumpHeight_cm,label = unique(Athlete)),
force = .1,
nudge_x = -2,
direction = "y",
hjust = 1,
)
p
})
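The issue in your code is indeed caused by the filter call. You need to use %in% instead of == when filtering against a vector of values: with several boxes checked, == recycles the selected values element-wise along the column, so rows are silently dropped. Please see the following: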
---
title: "Test"
output: flexdashboard::flex_dashboard
runtime: shiny
---
```{r global, include=FALSE}
library(ggplot2)
library(dplyr)
library(scales)
library(ggrepel)
WHOC_Sum_CMJ <- structure(list(Athlete = structure(c(1L, 1L, 1L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L), .Label = c("Athlete 1", "Athlete 10",
"Athlete 11", "Athlete 12", "Athlete 13", "Athlete 14", "Athlete 2",
"Athlete 3", "Athlete 4", "Athlete 5", "Athlete 6", "Athlete 7",
"Athlete 8", "Athlete 9"), class = "factor"), Date = structure(c(1L,
4L, 5L, 1L, 3L, 5L, 7L, 2L, 3L, 5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L,
5L, 7L, 1L, 3L, 6L, 7L, 2L, 4L, 5L, 8L, 1L, 3L, 5L, 7L, 1L, 3L,
5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L, 5L, 7L, 1L, 3L,
6L, 7L, 1L, 3L, 5L, 7L), .Label = c("2020-01-06", "2020-01-07",
"2020-01-13", "2020-01-14", "2020-01-21", "2020-01-23", "2020-01-27",
"2020-01-28"), class = "factor"), Position = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("DEF", "FWD", "GOALIE"), class = "factor"),
Program = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L,
4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Navy", "Red", "RTP", "White"), class = "factor"),
mRSI = c(0.36, 0.38, 0.42, 0.46, 0.46, 0.47, 0.48, 0.31,
0.3, 0.24, 0.3, 0.29, 0.26, 0.28, 0.28, 0.36, 0.35, 0.43,
0.43, 0.28, 0.31, 0.28, 0.3, 0.33, 0.36, 0.35, 0.37, 0.37,
0.36, 0.37, 0.36, 0.3, 0.36, 0.34, 0.37, 0.26, 0.28, 0.34,
0.3, 0.39, 0.4, 0.43, 0.43, 0.43, 0.47, 0.46, 0.48, 0.34,
0.36, 0.33, 0.37, 0.28, 0.28, 0.34, 0.33), SystemWeight = c(617.21,
612.4, 620.45, 672.08, 682.23, 670.5, 663.41, 517.33, 515.23,
511.62, 517.85, 697.55, 703.92, 689.43, 691.33, 859.06, 845.9,
850.97, 851.84, 655.79, 665.09, 673.91, 667.92, 626.78, 632.92,
634.52, 624.88, 637.55, 645.6, 648.78, 646.64, 558.03, 563.23,
569.58, 560.95, 693.63, 695.54, 684.37, 684.58, 641.18, 660.8,
663.95, 660, 594.92, 596.97, 591.36, 585.64, 522.35, 518.17,
530.95, 523.5, 780.65, 789.81, 775.84, 775.48), FTCT = c(0.61,
0.62, 0.67, 0.74, 0.75, 0.77, 0.77, 0.54, 0.55, 0.44, 0.53,
0.53, 0.49, 0.53, 0.56, 0.6, 0.58, 0.68, 0.68, 0.53, 0.57,
0.54, 0.55, 0.61, 0.63, 0.64, 0.65, 0.59, 0.58, 0.59, 0.59,
0.51, 0.59, 0.59, 0.59, 0.53, 0.57, 0.63, 0.59, 0.76, 0.76,
0.79, 0.78, 0.67, 0.72, 0.72, 0.74, 0.63, 0.65, 0.61, 0.63,
0.49, 0.5, 0.53, 0.57), JumpHeight_cm = c(28.97, 29.78, 31.43,
35.83, 35.41, 36.59, 36.92, 27.56, 26.11, 26.15, 26.82, 26.15,
25.08, 24.98, 24.62, 29.39, 30.17, 32.42, 32.56, 26.6, 27.25,
25.58, 27.88, 29.17, 31.58, 28.48, 31.24, 33.73, 32.78, 33.09,
33.43, 29.73, 31.91, 30.65, 32.98, 24.15, 24.24, 27.57, 25.44,
26.68, 26.39, 27.43, 28.87, 35.44, 36.29, 35.71, 36.06, 26.79,
27.76, 26.82, 29.71, 28.69, 26.9, 31.12, 29.77), EJH = c(17.6,
18.58, 21.11, 26.66, 26.69, 28.08, 28.38, 14.99, 14.39, 11.41,
14.33, 13.8, 12.34, 13.29, 13.67, 17.58, 17.5, 22.03, 22.19,
14.03, 15.59, 13.92, 15.39, 17.7, 19.75, 18.37, 20.3, 19.99,
18.9, 19.62, 19.61, 15.09, 18.8, 18.18, 19.6, 12.78, 13.87,
17.28, 15.06, 20.44, 20.12, 21.74, 22.52, 23.8, 26.25, 25.68,
26.73, 16.99, 18.13, 16.42, 18.82, 14.09, 13.43, 16.61, 16.9
), Weight = c(62.94, 62.45, 63.27, 68.54, 69.57, 68.38, 67.65,
52.76, 52.54, 52.17, 52.81, 71.13, 71.78, 70.31, 70.5, 87.61,
86.26, 86.78, 86.87, 66.88, 67.82, 68.72, 68.11, 63.92, 64.54,
64.71, 63.72, 65.02, 65.84, 66.16, 65.94, 56.91, 57.44, 58.09,
57.2, 70.74, 70.93, 69.79, 69.81, 65.39, 67.39, 67.71, 67.31,
60.67, 60.88, 60.31, 59.72, 53.27, 52.84, 54.15, 53.39, 79.61,
80.54, 79.12, 79.08)), class = "data.frame", row.names = c(NA,
-55L))
WHOC_Sum_CMJ$Date <- as.Date(WHOC_Sum_CMJ$Date)
```
Column {.sidebar}
-----------------------------------------------------------------------
```{r}
checkboxGroupInput("Program", label = "Program", choices = unique(WHOC_Sum_CMJ$Program), selected = "Red", inline = TRUE)
# (Note: for the code I cut out some of the styling to make it more readable. That's why it looks different than the pictures).
```
Column
-----------------------------------------------------------------------
```{r}
renderPlot({
f <- WHOC_Sum_CMJ %>%
dplyr::select(Date, Athlete, JumpHeight_cm, Program) %>%
filter(Program %in% input$Program)
p <- ggplot(f) +
geom_line(aes(x=Date, y=JumpHeight_cm, colour = Athlete)) +
geom_point(aes(x=Date, y=JumpHeight_cm, colour = Athlete)) +
theme_bw() +
labs(title = "Team Jump Height",
x = "Date",
y = "Jump Height (cm)") +
scale_x_date(limits = c(min = min(WHOC_Sum_CMJ$Date), max = max(WHOC_Sum_CMJ$Date)), labels = date_format("%m/%d"),
date_breaks = "2 weeks", expand = c(.08,0)) +
guides(col = guide_legend(nrow = 3)) +
geom_text_repel(data= subset(f, Date == min(Date)), aes(x=Date, y=JumpHeight_cm,label = unique(Athlete)),
force = .1,
nudge_x = -2,
direction = "y",
hjust = 1,
)
p
})
```

problem with paired t-test: not all arguments have the same length

I want to do paired t-test with a data frame. I think I grouped them right but do not know why it reports the error:
Error in complete.cases(x, y) : not all arguments have the same length.
centre_g is my data frame containing all the info I want to use in my analysis. Paired t-test is a right way to do it.
str(centre_g)
# Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':
# 24 obs. of 17 variables
# (I will only list the two variables that are used for my analysis):
# $ BA: Factor w/ 2 levels "after","before": 2 1 2 1 2 1 2 1 2 1 ...
# $ Pb: num 437 1183 1465 3105 NA ...
I used to extract "before" and "after" for "Pb", i.e. I extracted two vectors in the data frame, and did paired t-test, it works fine
(tResult <- t.test(before$Pb, after$Pb, paired = TRUE))
but when I tried to do the paired t-test directly on my data frame, it has the error message mentioned in the question
(tResult <- t.test(Pb ~ BA, data = centre_g, paired = TRUE))
I tried several times, with grouped data or sorted data. I do not know what is wrong with the second method. Is it because the NA values I have got in my data frame? but the first method is fine?
Since I have quite a lot more information in my data frame waiting to be analysed, I do not want to extract vectors for every single of them. I hope to do my paired t-test on my data frame. Could anyone help me?
the detail of centre_g is:
structure(list(day = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), SAMPLE.No = structure(c(1L,
13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L, 3L, 5L, 7L, 9L, 11L,
1L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L, 3L), .Label = c("s1",
"s1.2", "s10", "s10.2", "s11", "s11.2", "s12", "s12.2", "s13",
"s13.2", "s14", "s14.2", "s2", "s2.2", "s3", "s3.2", "s4", "s4.2",
"s5", "s5.2", "s6", "s6.2", "s7", "s7.2", "s8", "s8.2", "s9",
"s9.2"), class = "factor"), weir = c(1L, 1L, 2L, 2L, 3L, 3L,
4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L,
11L, 12L, 12L), BA = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L), .Label = c("after", "before"), class = "factor"), centre.bank = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("bank", "centre"), class = "factor"),
Pb = c(436.65, 1182.93, 1465.21, 3105.36, 39.1, 1493.91,
NA, 165.28, 38.83, 351.48, 80.26, 47.39, 151.27, 434.01,
-97.58, 240.83, 56.8, 40.24, 38.8, NA, 41.13, 38.93, 44.39,
39.05), Pb.Error = c(16.41, 30.01, 51.26, 102.44, 27.21,
79.63, NA, 13.82, 48.78, 16.71, 19.1, 21.43, 18.65, 21.41,
232.7, 18.83, 12.19, 15.28, 11.94, NA, 22.24, 14.01, 10.56,
9.63), Zn = c(542.52, 981.83, 1234.78, 7554.41, 529.38, 5240.01,
NA, 542.65, 526.08, 820.87, 649.7, 793.42, 707.23, 1204.3,
-34.56, 209.86, 172.5, 130.29, 187.96, NA, 234.57, 137.38,
165.21, 135.05), Zn.Error = c(19.5, 29.31, 48.12, 161.54,
42.36, 144.56, NA, 23.37, 52.5, 26.18, 33.33, 39.87, 31.89,
35.79, 44.83, 17.24, 15.11, 21.25, 19.76, NA, 26.65, 18.67,
15.12, 13.97), Fe = c(3731.23, 14239.54, 23774.52, 52349.37,
3896.63, 13311.26, NA, 2756.96, 3511.06, 2664.12, 2383.16,
2785.75, 2834.59, 6288.39, -321.14, 14704.05, 3825.8, 5017.52,
13181.67, NA, 31190.39, 8516.23, 14130, 18348.01), Fe.Error = c(106.82,
229.87, 432.59, 884.29, 239.03, 496.1, NA, 111.92, 283.9,
102.44, 137.69, 161.02, 137.66, 172.32, 187.37, 274.6, 140.64,
240.97, 310.62, NA, 565.41, 265.57, 260.75, 291.45), Mn = c(110.65,
1337.08, 1126.82, 3495.03, 410.99, 5267.34, NA, 314.42, 338.8,
591.99, 308.46, 427.59, 573.87, 896.23, 277.82, 421.17, 969.72,
535.07, 879.97, NA, 742.39, 350.62, 379.98, 834.36), Mn.Error = c(43.39,
93.86, 133.34, 297.53, 125.08, 410.14, NA, 63.25, 155.08,
68.16, 82.1, 96.34, 88.97, 89.89, 1470.88, 78, 92.24, 118.6,
112.32, NA, 134.87, 91.97, 72.7, 91.12), Cr = c(-38.15, 50.8,
25.9, 53.32, 21.52, 132.82, NA, 8.13, 5.46, 35.07, 93.78,
88.18, 71.23, 47.26, 32.91, 25.49, 10.36, 19.99, 5.13, NA,
32.61, 22.13, 47.5, -5.82), Cr.Error = c(9.05, 16.41, 7.7,
9.99, 4.58, 33.88, NA, 7.84, 2.86, 9.18, 8.75, 7.55, 7.98,
9.62, 6.38, 5.54, 6.72, 4.6, 6.5, NA, 6.64, 4.62, 9.51, 11.3
), Ca = c(32195.21, 46510.98, 21723.24, 17820.74, 14639.01,
45937.9, NA, 37840.08, 4704.64, 37705.36, 28625.21, 25115.24,
41579.19, 91829.16, 19752.96, 14605.4, 34654.73, 15798.87,
13873.07, NA, 22901.14, 4097.09, 12053.38, 276525.69), Ca.Error = c(211.2,
326.69, 160.54, 142.76, 120.63, 304.76, NA, 219.4, 66.28,
225.41, 187.03, 169.88, 226.15, 378.53, 149.92, 125.47, 208.18,
127.73, 127.4, NA, 168.31, 64.51, 128.02, 908.61)), row.names = c(1L,
4L, 6L, 8L, 10L, 12L, 13L, 16L, 17L, 19L, 21L, 23L, 26L, 28L,
29L, 32L, 34L, 36L, 38L, 39L, 42L, 43L, 46L, 48L), class = "data.frame")
I am interested in doing paired t test on "Pb" column, trying to compare "before" and "after" (as shown in column "BA"). Each "weir" would be an individual.
I worked it out after a day. I found it is because of a row of NA data. There are some places where I did not manage to take samples, so there appears to be a whole row of NA data (except the factor columns).
To make sure the data frame has the whole length (24 instead of 23) and does not omit NA data, add na.rm = FALSE when subsetting the data frame into centre_g.
centre_g <- subset(HM_selected, centre.bank == "centre", na.rm = FALSE)
(I think I gave the right centre_g in my question dataset, but occasionally I just got 23 rows. Adding na.rm makes explicit how NA data are processed.)
When doing the paired t-test, also add na.rm = FALSE.
# Paired t-test of Pb by BA (before/after) on the centre_g data frame;
# the outer parentheses print the result as well as assigning it.
(tRESULT <- t.test(Pb ~ BA, data = centre_g, paired = TRUE, na.rm = FALSE))
and that works perfectly for me.
sorry if there is any confusion in the question

Boxplot with multiple x variables

I'm new to R and having a few issues with using ggplot2.
This is an example of my data (subset of larger data set) :
df <-
structure(list(logpvalue = c(22.36, 6.93, 16.78, 1.78, 17.75,
20.99, 21.03, 9.19, 15.01, 22.25, 13.4, 6.47, 1.34, 13.4, 3.21,
0.37, 0.5, 0.12, 1.8, 0.71, 1.15, 6.73, 0.12, 6.97, 0.64, 9.85,
1.45, 1.67, 2.6, 1.8, 1.35, 4.69, 0.37, 1.91, 0.31, 0, 2.45,
1.68, 2.31, 1.35, 6.48, 4.68), SNP = structure(c(1L, 7L, 6L,
5L, 11L, 1L, 9L, 5L, 8L, 11L, 7L, 5L, 8L, 11L, 1L, 7L, 1L, 4L,
2L, 3L, 10L, 7L, 1L, 4L, 2L, 3L, 10L, 4L, 2L, 3L, 10L, 4L, 2L,
3L, 10L, 4L, 2L, 3L, 7L, 9L, 5L, 1L), .Label = c("rs10244", "rs10891244",
"rs10891245", "rs11213821", "rs12296076", "rs138567267", "rs45615536",
"rs6589218", "rs7103178", "rs7127721", "rs7944895"), class = "factor"),
X173 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("het", "hom"), class = "factor")), .Names = c("logpvalue",
"SNP", "X173"), class = "data.frame", row.names = c(NA, -42L))
I want to plot a boxplot of logpvalue on y axis, with SNP on the x-axis but with each SNP also categorized by whether the patient is het or hom for X173. So from this data I'd imagine 4 boxes on my boxplot.
If possible I'd also like to incorporate the individual data points (dotplot-boxplot overlay) with jitter.
This is the usual code I'd use for a boxplot of logpvalue vs SNP:
qplot(logpvalue, SNP, data = mydata, geom="boxplot")
+ geom_jitter(position=position_jitter(w=0.1, h=0.1)) + theme_bw()
How do I add the extra x variable into this code?
Try this:
boxplot(df$logpvalue~paste(df$SNP,df$X173))
Or using ggplot2:
library(ggplot2)
ggplot(data=df,aes(SNP,logpvalue,colour=SNP)) +
geom_boxplot() +
geom_jitter() +
facet_grid(.~X173)

Bar graph in ggplot2 with width as a variable and even spacing between bars

So I am trying to make a stacked bar graph with bar width mapped to a variable; but I want the spacing between my bars to be constant.
Does anyone know how to make the spacing constant between the bars?
Right now I've got this:
p<-ggplot(dd, aes(variable, value.y, fill=Date, width=value.x / 15))+ coord_flip() + opts(ylab="")
p1<-p+ geom_bar(stat="identity") + scale_fill_brewer(palette="Dark2") + scale_fill_hue(l=55,c=55)
p2<-p1 + opts(axis.title.x = theme_blank(), axis.title.y = theme_blank())
p2
Thanks in advance.
Here's my data by the way (sorry for the long, bulky dput):
> dput(dd)
structure(list(variable = structure(c(1L, 1L, 1L, 1L, 1L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 7L, 7L, 7L, 7L, 7L, 2L, 2L,
2L, 2L, 2L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 9L, 9L, 9L,
9L, 9L, 8L, 8L, 8L, 8L, 8L), .Label = c("Alcohol and Tobacco",
"Health and Personal Care", "Clothing", "Energy", "Recreation and Education",
"Household", "Food", "Transportation", "Shelter"), class = "factor", scores = structure(c(2.91,
5.31, 10.08, 15.99, 4.95, 11.55, 11.2, 27.49, 20.6), .Dim = 9L, .Dimnames = list(
c("Alcohol and Tobacco", "Clothing", "Energy", "Food", "Health and Personal Care",
"Household", "Recreation and Education", "Shelter", "Transportation"
)))), value.x = c(2.91, 2.91, 2.91, 2.91, 2.91, 5.31, 5.31,
5.31, 5.31, 5.31, 10.08, 10.08, 10.08, 10.08, 10.08, 15.99, 15.99,
15.99, 15.99, 15.99, 4.95, 4.95, 4.95, 4.95, 4.95, 11.55, 11.55,
11.55, 11.55, 11.55, 11.2, 11.2, 11.2, 11.2, 11.2, 27.49, 27.49,
27.49, 27.49, 27.49, 20.6, 20.6, 20.6, 20.6, 20.6), Date = structure(c(5L,
4L, 3L, 2L, 1L, 5L, 4L, 3L, 2L, 1L, 5L, 4L, 3L, 2L, 1L, 5L, 4L,
3L, 2L, 1L, 5L, 4L, 3L, 2L, 1L, 5L, 4L, 3L, 2L, 1L, 5L, 4L, 3L,
2L, 1L, 5L, 4L, 3L, 2L, 1L, 5L, 4L, 3L, 2L, 1L), .Label = c("1993-2001",
"2001-2006", "2007-2010", "2010-2011", "2012 Jan - May"), class = "factor"),
value.y = c(2.1, 2.5, 7.6, 21.7, 2.8, 1.5, 0.3, -4.1, -4.2,
4.7, 3, 16.9, 1.9, 32.8, 23.9, 3.2, 4.6, 11.3, 8.9, 12.9,
1.7, 2, 7.8, 5.9, 10, 1.9, 2.1, 5.6, 2.2, 9.9, 1.4, 1.3,
2.2, 0.6, 17.3, 1.1, 2.3, 6.4, 13.1, 10, 4.3, 7.6, 0.9, 15.2,
20.5)), .Names = c("variable", "value.x", "Date", "value.y"
), row.names = c(NA, -45L), class = "data.frame")
For a categorical or "discrete" scale you can adjust the width, but it needs to be between 0 and 1. Your value.x values put it over 1, hence the overlap. You can use rescale, from the scales package, to adjust this quickly so that the within-category width of each bar is representative of some other variable (in this case value.x).
install.packages("scales")
library(scales)
# Stacked bars per category; each bar's width is value.x rescaled into
# [0.5, 1] so that it stays within the discrete-scale slot.
ggplot(dd, aes(x = variable, y = value.y, fill = Date)) +
  geom_bar(aes(width = rescale(value.x, c(0.5, 1))),
           stat = "identity", position = "stack") +
  coord_flip()
Play with rescaling for optimal "view" change 0.5 to 0.25... etc.
Personally, I think something like this is more informative:
ggplot(dd,aes(x=variable,y=value.y,fill=Date)) +
geom_bar(aes(width=rescale(value.x,c(0.2,1))),stat="identity") +
coord_flip() + facet_grid(~Date) + opts(legend.position="none")
Attempt # 2.
I'm tricking ggplot2 into writing a continuous scale as categorical.
# Place the bars on a continuous axis so each bar can be as wide as its
# value.x while the gap between neighbouring bars stays constant.
# Bar centres follow
#   centre[i] = centre[i - 1] + width[i - 1] / 2 + width[i] / 2
# starting from centre[1] = width[1], plus a cumulative gap per category.
gap <- 5  # change this to adjust the between-category spacing

# Categories in the order they appear in the data, with one width each.
bar_vars <- unique(as.character(dd$variable))
bar_widths <- dd$value.x[match(bar_vars, dd$variable)]
n_bars <- length(bar_widths)

tmp <- bar_widths[1] +
  cumsum(c(0, (bar_widths[-n_bars] + bar_widths[-1]) / 2)) +
  cumsum(rep(gap, n_bars))

# Look positions up by category rather than assuming 5 consecutive rows each.
dd$x.pos1 <- tmp[match(dd$variable, bar_vars)]

# The labels must be in the same order as the breaks (data order); the
# factor's level order can differ and would mislabel the bars.
ggplot(dd, aes(x = x.pos1, y = value.y, fill = Date)) +
  geom_bar(aes(width = value.x), stat = "identity", position = "stack") +
  scale_x_continuous(breaks = tmp, labels = bar_vars) +
  coord_flip()
For good measure you're probably going to want to adjust the text size. That's done with ... + opts(axis.text.y=theme_text(size=12))

Resources