Related
I'm trying to add significance annotations to an errorbar plot with a factor x-axis and dodged groups within each level of the x-axis. It is a similar but NOT identical use case to this
My base errorbar plot is:
library(ggplot2)
library(dplyr)
pres_prob_pd = structure(list(x = structure(c(1, 1, 1, 2, 2, 2, 3, 3, 3), labels = c(`1` = 1,
`2` = 2, `3` = 3)), predicted = c(0.571584427222816, 0.712630712634987,
0.156061969566517, 0.0162388386564817, 0.0371877245103279, 0.0165022541901018,
0.131528946944238, 0.35927812866896, 0.0708662221985375), std.error = c(0.355802875027348,
0.471253661425626, 0.457109887762665, 0.352871728451576, 0.442646879181155,
0.425913568532558, 0.376552208691762, 0.48178172708116, 0.451758041335245
), conf.low = c(0.399141779923204, 0.496138837620712, 0.0701919316506831,
0.00819832576725402, 0.0159620304815404, 0.00722904089045731,
0.0675129352870401, 0.17905347369819, 0.030504893442457), conf.high = c(0.728233665534388,
0.861980236164486, 0.311759350126477, 0.031911364587827, 0.0842227723261319,
0.0372248587668487, 0.240584344249407, 0.590437963881823, 0.156035177669385
), group = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain",
"neutral", "uncertain"), class = "factor"), group_col = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain", "neutral",
"uncertain"), class = "factor"), language = structure(c(2L, 2L,
2L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("english", "dutch", "german"
), class = "factor"), top = c(0.861980236164486, 0.861980236164486,
0.861980236164486, 0.0842227723261319, 0.0842227723261319, 0.0842227723261319,
0.590437963881823, 0.590437963881823, 0.590437963881823)), row.names = c(NA,
-9L), groups = structure(list(language = structure(1:3, .Label = c("english",
"dutch", "german"), class = "factor"), .rows = structure(list(
4:6, 1:3, 7:9), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
#dodge
pd = position_dodge(.75)
#plot
p = ggplot(pres_prob_pd,aes(x=language,y=predicted,color=group,shape=group)) +
geom_point(position=pd,size=2) +
geom_errorbar(aes(ymax=conf.high,ymin=conf.low),width=.125,position=pd)
p
What I want to do is annotate the plot such that the contrasts between group within each level of language are annotated for significance. I've plotted points representing the relevant contrasts and (toy) sig. annotations as follows:
#bump function
f = function(x){
v = c()
bump=0.025
constant = 0
for(i in x){
v = c(v,i+constant+bump)
bump = bump + 0.075
}
v
}
#create contrasts
combs = data.frame(gtools::combinations(3, 2, v=c("certain", "neutral", "uncertain"), set=F, repeats.allowed=F)) %>%
mutate(contrast=c("cont_1","cont_2","cont_3"))
combs = rbind(combs %>% mutate(language = 'english'),
combs %>% mutate(language='dutch'),
combs %>% mutate(language = "german")) %>%
left_join(select(pres_prob_pd,language:top)%>%distinct(),by='language') %>%
group_by(language)
#long transform and calc y_pos
combs_long = mutate(combs,y_pos=f(top)) %>% gather(long, probability, X1:X2, factor_key=TRUE) %>% mutate(language=factor(language,levels=c("english","dutch","german"))) %>%
arrange(language,contrast)
#back to wide
combs_wide =combs_long %>% spread(long,probability)
combs_wide$p = rep(c('***',"*","ns"),3)
#plot
p +
geom_point(data=combs_long,
aes(x = language,
color=probability,
shape=probability,
y=y_pos),
inherit.aes = T,
position=pd,
size=2) +
geom_text(data=combs_wide,
aes(x=language,
label=p,
y=y_pos+.025,
group=X1),
color='black',
position=position_dodge(.75),
inherit.aes = F)
What I am failing to achieve is plotting a line connecting each of the contrasts of group within each level of language, as is standard when annotating significant group-wise differences. Any help much appreciated!
I have a question about matrix structure manipulation in R, here I need to first transpose the matrix and combine the month and status columns, filling the missing values with 0. Here I have an example, currently my data is like belows. It seems very tricky. I would appreciate if anyone could help on this. Thank you.
Hi, my data looks like the follows:
structure(list(Customer = c("1096261", "1096261", "1169502",
"1169502"), Phase = c("2", "3", "1", "2"), Status = c("Ontime",
"Ontime", "Ontime", "Ontime"), Amount = c(21216.32, 42432.65,
200320.05, 84509.24)), .Names = c("Customer", "Phase", "Status",
"Amount"), row.names = c(NA, -4L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("Customer", "Phase"), drop = TRUE, indices
= list(
0L, 1L, 2L, 3L), group_sizes = c(1L, 1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
Customer = c("1096261", "1096261", "1169502", "1169502"),
Phase = c("2", "3", "1", "2")), row.names = c(NA, -4L), class =
"data.frame", vars = c("Customer",
"Phase"), drop = TRUE, .Names = c("Customer", "Phase")))
I need to have the reshaped matrix with the following columns:
Customer Phase1earlyTotal Phase2earlyTotal....Phase4earlyTotal...Phase1_ Ontimetotal...Phase4_Ontimetotal...Phase1LateTotal_Phase4LateTotal. For example Phase1earlytotal includes the sum of the amount with the Phase=1 and Status=Early.
Currently I use the following scripts, which does not work, coz I dont know
how to combine Phase and Stuatus Column.
mydata2<-data.table(mydata2,V3,V4)
mydata2$V4<-NULL
datacus <- data.frame(mydata2[-1,],stringsAsFactors = F);
datacus <- datacus %>% mutate(Phase= as.numeric(Phase),Amount=
as.numeric(Amount)) %>%
complete(Phase = 1:4,fill= list(Amount = 0)) %>%
dcast(datacus~V3, value.var = 'Amount',fill = 0) %>% select(Phase, V3)
%>%t()
I believe you are looking for somethink like this?
sample data
df <- structure(list(Customer = c("1096261", "1096261", "1169502",
"1169502"), Phase = c("2", "3", "1", "2"), Status = c("Ontime",
"Ontime", "Ontime", "Ontime"), Amount = c(21216.32, 42432.65,
200320.05, 84509.24)), .Names = c("Customer", "Phase", "Status",
"Amount"), row.names = c(NA, -4L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("Customer", "Phase"), drop = TRUE, indices
= list(
0L, 1L, 2L, 3L), group_sizes = c(1L, 1L, 1L, 1L), biggest_group_size = 1L,
labels = structure(list(
Customer = c("1096261", "1096261", "1169502", "1169502"),
Phase = c("2", "3", "1", "2")), row.names = c(NA, -4L), class =
"data.frame", vars = c("Customer",
"Phase"), drop = TRUE, .Names = c("Customer", "Phase")))
# Customer Phase Status Amount
# 1: 1096261 2 Ontime 21216.32
# 2: 1096261 3 Ontime 42432.65
# 3: 1169502 1 Ontime 200320.05
# 4: 1169502 2 Ontime 84509.24
code
library( data.table )
dcast( setDT( df ), Customer ~ Phase + Status, fun = sum, value.var = "Amount" )[]
output
# Customer 1_Ontime 2_Ontime 3_Ontime
# 1: 1096261 0 21216.32 42432.65
# 2: 1169502 200320 84509.24 0.00
I can't understand why i get the error mentioned in post title.
so what i do.
data example
mydat=structure(list(date = structure(c(3L, 2L, 6L, 1L, 7L, 5L, 4L), .Label = c("apr-15",
"feb-15", "jan-15", "jul15", "jun-15", "march-15", "may-15"), class = "factor"),
x1 = c(653411L, 620453L, 742567L, 578548L, 720100L, 553740L,
588145L), x2 = c(242108L, 210841L, 255046L, 185243L, 257159L,
182594L, 246051L), x3 = c(234394L, 289563L, 341791L, 293608L,
306807L, 285190L, 279252L), x4 = c(309228L, 226175L, 292387L,
183745L, 223322L, 161218L, 201499L)), .Names = c("date",
"x1", "x2", "x3", "x4"), class = "data.frame", row.names = c(NA,
-7L))
mydat<- read.csv("path.csv", sep=";",dec=",")
mydat <- stats::ts(mydat[,-1], frequency = 12, start = c(2015,1))
library("forecast")
my_forecast <- function(x){
model <- HoltWinters(x,beta = FALSE, seasonal = "additive")
fcast <- forecast(model, 5) # 5 month
return(fcast)
}
progn=lapply(mydat[1:34], my_forecast)
and the error
Error in decompose(ts(x[1L:wind], start = start(x), frequency = f), seasonal) :
time series has no or less than 2 periods
5.
stop("time series has no or less than 2 periods")
4.
decompose(ts(x[1L:wind], start = start(x), frequency = f), seasonal)
3.
HoltWinters(x, beta = FALSE, seasonal = "additive")
2.
FUN(X[[i]], ...)
1.
lapply(d2[1:34], my_forecast)
How to fix it?
The main idea it is perform HoltWinters analysis for all 34 variable.
the problem is the lapply function. lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X. You can just run your 'my_forecast' function.
mydat=structure(list(date = structure(c(3L, 2L, 6L, 1L, 7L, 5L, 4L), .Label = c("apr-15",
"feb-15", "jan-15", "jul15", "jun-15", "march-15", "may-15"), class = "factor"),
x1 = c(653411L, 620453L, 742567L, 578548L, 720100L, 553740L,
588145L), x2 = c(242108L, 210841L, 255046L, 185243L, 257159L,
182594L, 246051L), x3 = c(234394L, 289563L, 341791L, 293608L,
306807L, 285190L, 279252L), x4 = c(309228L, 226175L, 292387L,
183745L, 223322L, 161218L, 201499L)), .Names = c("date",
"x1", "x2", "x3", "x4"), class = "data.frame", row.names = c(NA,
-7L))
mydat<- read.csv("path.csv", sep=";",dec=",")
mydat <- stats::ts(mydat[,-1], frequency = 12, start = c(2015,1))
library("forecast")
my_forecast <- function(x){
model <- HoltWinters(x,beta = FALSE, seasonal = "additive")
fcast <- forecast(model, 5) # 5 month
return(fcast)
}
my_forecast(ts(mydat, start=c(2015,1), end=c(2015,7), frequency=7))
I've got a data set that looks like this:
date, location, value, tally, score
2016-06-30T09:30Z, home, foo, 1,
2016-06-30T12:30Z, work, foo, 2,
2016-06-30T19:30Z, home, bar, , 5
I need to aggregate these rows together, to obtain a result such as:
date, location, value, tally, score
2016-06-30, [home, work], [foor, bar], 3, 5
There are several challenges for me:
The resulting row (a daily aggregate) must include the rows for this day (2016-06-30 in my above example
Some rows (strings) will result in an array containing all the values present on this day
Some others (ints) will result in a sum
I've had a look at dplyr, and if possible I'd like to do this in R.
Thanks for your help!
Edit:
Here's a dput of the data
structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat<-structure(list(date = structure(1:3, .Label = c("2016-06-30T09:30Z",
"2016-06-30T12:30Z", "2016-06-30T19:30Z"), class = "factor"),
location = structure(c(1L, 2L, 1L), .Label = c("home", "work"
), class = "factor"), value = structure(c(2L, 2L, 1L), .Label = c("bar",
"foo"), class = "factor"), tally = c(1L, 2L, NA), score = c(NA,
NA, 5L)), .Names = c("date", "location", "value", "tally",
"score"), class = "data.frame", row.names = c(NA, -3L))
mydat$date <- as.Date(mydat$date)
require(data.table)
mydat.dt <- data.table(mydat)
mydat.dt <- mydat.dt[, lapply(.SD, paste0, collapse=" "), by = date]
cbind(mydat.dt, aggregate(mydat[,c("tally", "score")], by=list(mydat$date), FUN = sum, na.rm=T)[2:3])
which gives you:
date location value tally score
1: 2016-06-30 home work home foo foo bar 3 5
Note that if you wanted to you could probably do it all in one step in the reshaping of the data.table but I found this to be a quicker and easier way for me to achieve the same thing in 2 steps.
My data looks something like this:
example <- structure(list(ID = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), .Label = c("A1", "A2", "A3"), class = "factor"), y = c(44.1160205053166,
33.0574407376116, 50.5295183433918, 44.1160205053166, 33.0574407376116,
50.5295183433918, 44.1160205053166, 33.0574407376116, 50.5295183433918
), day = structure(c(1392647220, 1392733620, 1392820020, 1392647220,
1392733620, 1392820020, 1392647220, 1392733620, 1392820020), class = c("POSIXct",
"POSIXt"), tzone = ""), P = c(16.345885329647, 6.21615618292708,
9.89848991157487, 14.4955473870505, 8.47820783441421, 2.36668747442309,
10.4325918923132, 9.26802998466883, 14.8380589560838), o = c(25.6364896567538,
10.5067015672103, 12.0306829502806, 25.6364896567538, 10.5067015672103,
12.0306829502806, 25.6364896567538, 10.5067015672103, 12.0306829502806
)), .Names = c("ID", "y", "day", "P", "x"), row.names = c(NA,
-9L), class = "data.frame")
I want to ran a regression of Y on P on day 1, 2, and 3. That is
y ~ p[1] + p[2] + p[3] + x
What is the best way of doing this? Do I need to create a new data frame with these variables before running lm? Or there is a better way?
Thanks!
Use substet parameter in lm function
lm(Y ~ P, data=df, subset=df$P %in% 1:3)