How can I do a Tukey's HSD test on this data? - r

I am trying to perform a Tukey's HSD test or an LSD test on my data. I have two factors, Collection (2 treatments) and Irrigation (5 treatments), and want to do the test on the Sucrose responses from each combination, so 10 total treatments.
Data:
structure(list(Collection = structure(c(1L, 1L, 1L, 1L, 1L, 2L
), .Label = c("1", "2"), class = "factor"), Irrigation = structure(c(1L,
2L, 3L, 4L, 5L, 1L), .Label = c("Rate1", "Rate2", "Rate3", "Rate4",
"Rate5"), class = "factor"), meanSuc = c(0.585416666666667, 0.5032,
0.61375, 0.602775, 0.688466666666667, 0.545133333333333)), row.names =
c(NA,
-6L), groups = structure(list(Collection = structure(1:2, .Label = c("1",
"2"), class = "factor"), .rows = list(1:5, 6L)), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE), class =
c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Attempt at combining treatments into a column and using Agricolae to perform test:
Tukey_data <- dataAvgSucCI %>%
mutate(Tukey_ID = paste(Collection, Irrigation, sep="_"))
TukeyAov <- aov(meanSuc ~ Tukey_ID,Tukey_data)
HSD.test(TukeyAov, "Tukey_ID", group=TRUE)
Error message:
Error in if (pvalue[k] <= 0.001) sig[k] <- "***" else if (pvalue[k] <=
:
missing value where TRUE/FALSE needed
In addition: Warning message:
In qtukey(1 - alpha, ntr, DFerror) : NaNs produced
How should I edit my code to make it work?
Or would I be better off writing something entirely different?

The data have to look like this (One way ANOVA):
Collection = rep(1:2, times = 1, each = 5)
Irrigation = rep(1:5, times = 2, each = 1)
meanSuc = rnorm(10, mean = 0, sd = 1)
d = data.frame(Collection, Irrigation, meanSuc)
fit = aov(meanSuc ~ as.factor(Collection), data=d)
TukeyHSD(fit)
or Two way ANOVA:
fit2 = aov(meanSuc ~ as.factor(Collection) + as.factor(Irrigation), data = d)
TukeyHSD(fit2)
I think you like to perform a two way ANOVA. Like AkselA said, there is no variation in your target variable (meanSuc), If you perform a one way ANOVA like you did.

Related

connect points within position_dodged factor x-axis in ggplot2

I'm trying to add significance annotations to an errorbar plot with a factor x-axis and dodged groups within each level of the x-axis. It is a similar but NOT identical use case to this
My base errorbar plot is:
library(ggplot2)
library(dplyr)
pres_prob_pd = structure(list(x = structure(c(1, 1, 1, 2, 2, 2, 3, 3, 3), labels = c(`1` = 1,
`2` = 2, `3` = 3)), predicted = c(0.571584427222816, 0.712630712634987,
0.156061969566517, 0.0162388386564817, 0.0371877245103279, 0.0165022541901018,
0.131528946944238, 0.35927812866896, 0.0708662221985375), std.error = c(0.355802875027348,
0.471253661425626, 0.457109887762665, 0.352871728451576, 0.442646879181155,
0.425913568532558, 0.376552208691762, 0.48178172708116, 0.451758041335245
), conf.low = c(0.399141779923204, 0.496138837620712, 0.0701919316506831,
0.00819832576725402, 0.0159620304815404, 0.00722904089045731,
0.0675129352870401, 0.17905347369819, 0.030504893442457), conf.high = c(0.728233665534388,
0.861980236164486, 0.311759350126477, 0.031911364587827, 0.0842227723261319,
0.0372248587668487, 0.240584344249407, 0.590437963881823, 0.156035177669385
), group = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain",
"neutral", "uncertain"), class = "factor"), group_col = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain", "neutral",
"uncertain"), class = "factor"), language = structure(c(2L, 2L,
2L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("english", "dutch", "german"
), class = "factor"), top = c(0.861980236164486, 0.861980236164486,
0.861980236164486, 0.0842227723261319, 0.0842227723261319, 0.0842227723261319,
0.590437963881823, 0.590437963881823, 0.590437963881823)), row.names = c(NA,
-9L), groups = structure(list(language = structure(1:3, .Label = c("english",
"dutch", "german"), class = "factor"), .rows = structure(list(
4:6, 1:3, 7:9), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
#dodge
pd = position_dodge(.75)
#plot
p = ggplot(pres_prob_pd,aes(x=language,y=predicted,color=group,shape=group)) +
geom_point(position=pd,size=2) +
geom_errorbar(aes(ymax=conf.high,ymin=conf.low),width=.125,position=pd)
p
What I want to do is annotate the plot such that the contrasts between group within each level of language are annotated for significance. I've plotted points representing the relevant contrasts and (toy) sig. annotations as follows:
#bump function
f = function(x){
v = c()
bump=0.025
constant = 0
for(i in x){
v = c(v,i+constant+bump)
bump = bump + 0.075
}
v
}
#create contrasts
combs = data.frame(gtools::combinations(3, 2, v=c("certain", "neutral", "uncertain"), set=F, repeats.allowed=F)) %>%
mutate(contrast=c("cont_1","cont_2","cont_3"))
combs = rbind(combs %>% mutate(language = 'english'),
combs %>% mutate(language='dutch'),
combs %>% mutate(language = "german")) %>%
left_join(select(pres_prob_pd,language:top)%>%distinct(),by='language') %>%
group_by(language)
#long transform and calc y_pos
combs_long = mutate(combs,y_pos=f(top)) %>% gather(long, probability, X1:X2, factor_key=TRUE) %>% mutate(language=factor(language,levels=c("english","dutch","german"))) %>%
arrange(language,contrast)
#back to wide
combs_wide =combs_long %>% spread(long,probability)
combs_wide$p = rep(c('***',"*","ns"),3)
#plot
p +
geom_point(data=combs_long,
aes(x = language,
color=probability,
shape=probability,
y=y_pos),
inherit.aes = T,
position=pd,
size=2) +
geom_text(data=combs_wide,
aes(x=language,
label=p,
y=y_pos+.025,
group=X1),
color='black',
position=position_dodge(.75),
inherit.aes = F)
What I am failing to achieve is plotting a line connecting each of the contrasts of group within each level of language, as is standard when annotating significant group-wise differences. Any help much appreciated!

how to reshape the matrix and fill the missing value as 0

I have a question about matrix structure manipulation in R, here I need to first transpose the matrix and combine the month and status columns, filling the missing values with 0. Here I have an example, currently my data is like belows. It seems very tricky. I would appreciate if anyone could help on this. Thank you.
Hi, my data looks like the follows:
structure(list(Customer = c("1096261", "1096261", "1169502",
"1169502"), Phase = c("2", "3", "1", "2"), Status = c("Ontime",
"Ontime", "Ontime", "Ontime"), Amount = c(21216.32, 42432.65,
200320.05, 84509.24)), .Names = c("Customer", "Phase", "Status",
"Amount"), row.names = c(NA, -4L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("Customer", "Phase"), drop = TRUE, indices
= list(
0L, 1L, 2L, 3L), group_sizes = c(1L, 1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
Customer = c("1096261", "1096261", "1169502", "1169502"),
Phase = c("2", "3", "1", "2")), row.names = c(NA, -4L), class =
"data.frame", vars = c("Customer",
"Phase"), drop = TRUE, .Names = c("Customer", "Phase")))
I need to have the reshaped matrix with the following columns:
Customer Phase1earlyTotal Phase2earlyTotal....Phase4earlyTotal...Phase1_ Ontimetotal...Phase4_Ontimetotal...Phase1LateTotal_Phase4LateTotal. For example Phase1earlytotal includes the sum of the amount with the Phase=1 and Status=Early.
Currently I use the following scripts, which does not work, coz I dont know
how to combine Phase and Stuatus Column.
mydata2<-data.table(mydata2,V3,V4)
mydata2$V4<-NULL
datacus <- data.frame(mydata2[-1,],stringsAsFactors = F);
datacus <- datacus %>% mutate(Phase= as.numeric(Phase),Amount=
as.numeric(Amount)) %>%
complete(Phase = 1:4,fill= list(Amount = 0)) %>%
dcast(datacus~V3, value.var = 'Amount',fill = 0) %>% select(Phase, V3)
%>%t()
I believe you are looking for somethink like this?
sample data
df <- structure(list(Customer = c("1096261", "1096261", "1169502",
"1169502"), Phase = c("2", "3", "1", "2"), Status = c("Ontime",
"Ontime", "Ontime", "Ontime"), Amount = c(21216.32, 42432.65,
200320.05, 84509.24)), .Names = c("Customer", "Phase", "Status",
"Amount"), row.names = c(NA, -4L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("Customer", "Phase"), drop = TRUE, indices
= list(
0L, 1L, 2L, 3L), group_sizes = c(1L, 1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
Customer = c("1096261", "1096261", "1169502", "1169502"),
Phase = c("2", "3", "1", "2")), row.names = c(NA, -4L), class =
"data.frame", vars = c("Customer",
"Phase"), drop = TRUE, .Names = c("Customer", "Phase")))
# Customer Phase Status Amount
# 1: 1096261 2 Ontime 21216.32
# 2: 1096261 3 Ontime 42432.65
# 3: 1169502 1 Ontime 200320.05
# 4: 1169502 2 Ontime 84509.24
code
library( data.table )
dcast( setDT( df ), Customer ~ Phase + Status, fun = sum, value.var = "Amount" )[]
output
# Customer 1_Ontime 2_Ontime 3_Ontime
# 1: 1096261 0 21216.32 42432.65
# 2: 1169502 200320 84509.24 0.00

Error in decompose(ts(x[1L:wind], start = start(x), frequency = f), seasonal) : in R

I can't understand why i get the error mentioned in post title.
so what i do.
data example
mydat=structure(list(date = structure(c(3L, 2L, 6L, 1L, 7L, 5L, 4L), .Label = c("apr-15",
"feb-15", "jan-15", "jul15", "jun-15", "march-15", "may-15"), class = "factor"),
x1 = c(653411L, 620453L, 742567L, 578548L, 720100L, 553740L,
588145L), x2 = c(242108L, 210841L, 255046L, 185243L, 257159L,
182594L, 246051L), x3 = c(234394L, 289563L, 341791L, 293608L,
306807L, 285190L, 279252L), x4 = c(309228L, 226175L, 292387L,
183745L, 223322L, 161218L, 201499L)), .Names = c("date",
"x1", "x2", "x3", "x4"), class = "data.frame", row.names = c(NA,
-7L))
mydat<- read.csv("path.csv", sep=";",dec=",")
mydat <- stats::ts(mydat[,-1], frequency = 12, start = c(2015,1))
library("forecast")
my_forecast <- function(x){
model <- HoltWinters(x,beta = FALSE, seasonal = "additive")
fcast <- forecast(model, 5) # 5 month
return(fcast)
}
progn=lapply(mydat[1:34], my_forecast)
and the error
Error in decompose(ts(x[1L:wind], start = start(x), frequency = f), seasonal) :
time series has no or less than 2 periods
5.
stop("time series has no or less than 2 periods")
4.
decompose(ts(x[1L:wind], start = start(x), frequency = f), seasonal)
3.
HoltWinters(x, beta = FALSE, seasonal = "additive")
2.
FUN(X[[i]], ...)
1.
lapply(d2[1:34], my_forecast)
How to fix it?
The main idea it is perform HoltWinters analysis for all 34 variable.
the problem is the lapply function. lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X. You can just run your 'my_forecast' function.
mydat=structure(list(date = structure(c(3L, 2L, 6L, 1L, 7L, 5L, 4L), .Label = c("apr-15",
"feb-15", "jan-15", "jul15", "jun-15", "march-15", "may-15"), class = "factor"),
x1 = c(653411L, 620453L, 742567L, 578548L, 720100L, 553740L,
588145L), x2 = c(242108L, 210841L, 255046L, 185243L, 257159L,
182594L, 246051L), x3 = c(234394L, 289563L, 341791L, 293608L,
306807L, 285190L, 279252L), x4 = c(309228L, 226175L, 292387L,
183745L, 223322L, 161218L, 201499L)), .Names = c("date",
"x1", "x2", "x3", "x4"), class = "data.frame", row.names = c(NA,
-7L))
mydat<- read.csv("path.csv", sep=";",dec=",")
mydat <- stats::ts(mydat[,-1], frequency = 12, start = c(2015,1))
library("forecast")
my_forecast <- function(x){
model <- HoltWinters(x,beta = FALSE, seasonal = "additive")
fcast <- forecast(model, 5) # 5 month
return(fcast)
}
my_forecast(ts(mydat, start=c(2015,1), end=c(2015,7), frequency=7))

Collapse and aggregate several row values by date

I've got a data set that looks like this:
date, location, value, tally, score
2016-06-30T09:30Z, home, foo, 1,
2016-06-30T12:30Z, work, foo, 2,
2016-06-30T19:30Z, home, bar, , 5
I need to aggregate these rows together, to obtain a result such as:
date, location, value, tally, score
2016-06-30, [home, work], [foor, bar], 3, 5
There are several challenges for me:
The resulting row (a daily aggregate) must include the rows for this day (2016-06-30 in my above example
Some rows (strings) will result in an array containing all the values present on this day
Some others (ints) will result in a sum
I've had a look at dplyr, and if possible I'd like to do this in R.
Thanks for your help!
Edit:
Here's a dput of the data
structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat<-structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat$date <- as.Date(mydat$date)
require(data.table)
mydat.dt <- data.table(mydat)
mydat.dt <- mydat.dt[, lapply(.SD, paste0, collapse=" "), by = date]
cbind(mydat.dt, aggregate(mydat[,c("tally", "score")], by=list(mydat$date), FUN = sum, na.rm=T)[2:3])
which gives you:
date location value tally score
1: 2016-06-30 home work home foo foo bar 3 5
Note that if you wanted to you could probably do it all in one step in the reshaping of the data.table but I found this to be a quicker and easier way for me to achieve the same thing in 2 steps.

Manipulate data before running a linear regression

My data looks something like this:
example <- structure(list(ID = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), .Label = c("A1", "A2", "A3"), class = "factor"), y = c(44.1160205053166,
33.0574407376116, 50.5295183433918, 44.1160205053166, 33.0574407376116,
50.5295183433918, 44.1160205053166, 33.0574407376116, 50.5295183433918
), day = structure(c(1392647220, 1392733620, 1392820020, 1392647220,
1392733620, 1392820020, 1392647220, 1392733620, 1392820020), class = c("POSIXct",
"POSIXt"), tzone = ""), P = c(16.345885329647, 6.21615618292708,
9.89848991157487, 14.4955473870505, 8.47820783441421, 2.36668747442309,
10.4325918923132, 9.26802998466883, 14.8380589560838), o = c(25.6364896567538,
10.5067015672103, 12.0306829502806, 25.6364896567538, 10.5067015672103,
12.0306829502806, 25.6364896567538, 10.5067015672103, 12.0306829502806
)), .Names = c("ID", "y", "day", "P", "x"), row.names = c(NA,
-9L), class = "data.frame")
I want to ran a regression of Y on P on day 1, 2, and 3. That is
y ~ p[1] + p[2] + p[3] + x
What is the best way of doing this? Do I need to create a new data frame with these variables before running lm? Or there is a better way?
Thanks!
Use substet parameter in lm function
lm(Y ~ P, data=df, subset=df$P %in% 1:3)

Resources