How to plot exponential decay in geom_smooth in ggplot2 in R? - r

Data
> dput(new.gapdata.cc)
structure(list(gap.interval = structure(c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 34L
), .Label = c("[0.0568,10.1]", "(10.1,20.1]", "(20.1,30.1]",
"(30.1,40.1]", "(40.1,50.1]", "(50.1,60.1]", "(60.1,70.1]", "(70.1,80.1]",
"(80.1,90.1]", "(90.1,100]", "(100,110]", "(110,120]", "(120,130]",
"(130,140]", "(140,150]", "(150,160]", "(160,170]", "(170,180]",
"(180,190]", "(190,200]", "(200,210]", "(210,220]", "(220,230]",
"(230,240]", "(240,250]", "(250,260]", "(260,270]", "(270,280]",
"(280,290]", "(290,300]", "(300,310]", "(310,320]", "(320,330]",
"(330,340]", "(340,350]", "(350,360]", "(360,370]", "(370,380]",
"(380,390]", "(390,400]", "(400,410]", "(410,420]", "(420,430]",
"(430,440]", "(440,450]", "(450,460]", "(460,470]", "(470,480]",
"(480,490]", "(490,500]", "(500,510]", "(510,520]", "(520,530]",
"(530,540]", "(540,550]", "(550,560]", "(560,570]", "(570,580]",
"(580,590]", "(590,600]", "(600,610]", "(610,620]", "(620,630]",
"(630,640]", "(640,650]", "(650,660]", "(660,670]", "(670,680]",
"(680,690]", "(690,700]", "(700,710]", "(710,720]", "(720,730]",
"(730,740]", "(740,750]"), class = "factor"), Vehicle.class = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Car following",
"Heavy-Vehicle following"), class = "factor"), PrecVehClass = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Car",
"Heavy-Vehicle"), class = "factor"), sd.speed = c(8.10631184218832,
11.4437550056097, 11.8038327709683, 10.8543703246156, 9.99720748006444,
9.44865875583687, 8.96606665646703, 8.49351869704553, 7.93669264490773,
8.13551032227591, 7.84202528436342, 8.0475744381228, 7.91648183675322,
7.43125313026708, 7.35410275703108, 7.60500908370333, 7.0498555301719,
7.55232413932399, 8.06598948864824, 6.76873032867712, 9.5638441069889,
8.04863015016668, 6.3210319215341, 4.64833690603376, 6.62719482422681,
6.64056528224281, 4.73744287133819, 7.47515815690314, 7.69289983159388,
0.306328206216196, 0.686563613792699), m.speed = c(7.49142882761648,
14.9015932672865, 23.2183766318976, 29.4281833927603, 33.2698195905316,
35.8151829762138, 37.5490804914733, 38.5477371278585, 39.3540677299243,
40.6919294171912, 41.1003756008852, 41.8182626555034, 43.0467747414578,
42.8363357874289, 43.4938190765401, 43.3542212600658, 45.4415004558705,
46.0292158248193, 45.2411112123218, 45.3142872888847, 45.8483490730252,
44.9081708678314, 48.91998889291, 47.3070826500395, 47.6670737425671,
46.3952054632908, 43.9972157634013, 51.2984320152685, 60.9675201903266,
44.7204961417801, 49.3765339447783), m.gapdist = c(7.7653843749647,
16.1638754974281, 25.4776617248361, 35.2445820779774, 44.9431006950918,
54.8030747287456, 64.7488740187079, 74.7493853439047, 84.7618392182203,
94.6265821702835, 104.858371321352, 114.633780836178, 124.562176064196,
134.473095135859, 144.806940411055, 154.554692908294, 164.982952591097,
174.906212522406, 185.553895860064, 194.461299821333, 204.825162321106,
215.128853160835, 225.333436194581, 235.137188240688, 244.880475531984,
255.160919142993, 264.314402521448, 274.575498681999, 285.224335149303,
293.119840359603, 337.618758706201)), .Names = c("gap.interval",
"Vehicle.class", "PrecVehClass", "sd.speed", "m.speed", "m.gapdist"
), row.names = c(3L, 8L, 13L, 18L, 24L, 31L, 37L, 43L, 49L, 55L,
61L, 66L, 71L, 76L, 81L, 85L, 88L, 91L, 94L, 96L, 98L, 100L,
102L, 105L, 107L, 109L, 112L, 114L, 116L, 118L, 121L), class = "data.frame")
What I want to achieve
I have 'sd.speed/m.speed' as the dependent variable and 'm.gapdist' as the explanatory variable. When I make a scatter plot, the trend appears to be an exponential decay. I want to get the summary statistics as well as a plot of the fitted curve over the data points. I used the following code:
# Scatter of the speed ratio (sd.speed / m.speed) against the mean gap
# distance, with a straight-line fit of log(y) on x drawn per vehicle-class
# combination.
# NOTE(review): the smoother's predictions come from a model of log(y), so they
# are presumably on the log scale rather than the scale of the plotted points.
ggplot(new.gapdata.cc, aes(x = m.gapdist, y = sd.speed / m.speed)) +
  geom_point(
    aes(shape = interaction(Vehicle.class, PrecVehClass)),
    size = 3
  ) +
  geom_smooth(
    aes(linetype = interaction(Vehicle.class, PrecVehClass)),
    method = "lm",
    formula = log(y) ~ x,
    colour = "black",
    size = 1,
    se = FALSE
  )
Question
This does not plot the exponential decay curve on the points. How can I fit the curve on points?

Related

How to insert blank space on x-axis in boxplot using ggplot /R?

I have My Data stored in p, which can be found below.
I have four specific categories for a group of tumor patients. Three of the groups correspond to the tumor stage and are stored as p$WHO.Grade=1,2,3. The last group is all tumor patients combined.
I am producing a specific plot consisting of multiple boxplots demonstrating the distribution of a continuous covariate (p$ki67pro) in the four groups described as above and in relation to the event of recurrence (p$recurrence==0 for no and p$recurrence==1 for yes).
As it turns out, there are no events for p$WHO.Grade==3, which means that I want my plot to look exactly like this (manipulated in Photoshop):
However, I get the picture below when I use the following script:
library(ggplot2)
library(dplyr)
# Stack a copy of the data relabelled as grade 4 (displayed as "All") under the
# original rows, then draw dodged boxplots with jittered points for each grade,
# split by recurrence status.
bind_rows(p, mutate(p, WHO.Grade = 4)) %>%
  mutate(
    recurrence = factor(recurrence),
    WHO.Grade = factor(WHO.Grade)
  ) %>%
  ggplot(aes(x = WHO.Grade, y = ki67pro,
             colour = recurrence, fill = recurrence)) +
  geom_boxplot(
    position = position_dodge(width = 0.78, preserve = "single"),
    outlier.alpha = 0
  ) +
  geom_point(
    position = position_jitterdodge(),
    shape = 21,
    size = 3
  ) +
  # Axis labels are assigned to the factor levels in order of appearance.
  scale_x_discrete(
    labels = c("WHO-I", "WHO-II", "WHO-III", "All"),
    name = ""
  ) +
  scale_y_continuous(
    name = "x",
    limits = c(0, 30),
    breaks = seq(0, 30, by = 5)
  ) +
  scale_fill_manual(
    name = "",
    labels = c("", ""),
    values = c("#edf1f9", "#fcebeb")
  ) +
  scale_colour_manual(
    name = "",
    labels = c("", ""),
    values = c("#1C73C2", "red")
  ) +
  theme(
    axis.line = element_line(colour = "black"),
    panel.background = element_blank(),
    legend.position = "none"
  )
It seems like p$WHO.Grade==All is automatically inserted in the p$WHO.Grade==3 slot, which should be left blank.
Therefore, my question is: how can I graphically insert a blank space at p$WHO.Grade==3 given my
script above?
p <- structure(list(WHO.Grade = c(1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), ki67pro = c(1L, 12L, 3L, 3L, 5L, 3L,
25L, 7L, 4L, 5L, 12L, 3L, 15L, 4L, 5L, 7L, 8L, 3L, 12L, 10L,
4L, 10L, 7L, 3L, 2L, 3L, 7L, 4L, 7L, 10L, 4L, 5L, 5L, 3L, 5L,
2L, 5L, 3L, 3L, 3L, 4L, 4L, 3L, 2L, 5L, 1L, 5L, 2L, 3L, 1L, 2L,
3L, 3L, 5L, 4L, 20L, 5L, 0L, 4L, 3L, 0L, 3L, 4L, 1L, 2L, 20L,
2L, 3L, 5L, 4L, 8L, 1L, 4L, 5L, 4L, 3L, 6L, 12L, 3L, 4L, 4L,
2L, 5L, 3L, 3L, 3L, 2L, 5L, 4L, 2L, 3L, 4L, 3L, 3L, 2L, 2L, 4L,
7L, 4L, 3L, 4L, 2L, 3L, 6L, 2L, 3L, 10L, 5L, 10L, 3L, 10L, 3L,
4L, 5L, 2L, 4L, 3L, 4L, 4L, 4L, 5L, 3L, 12L, 5L, 4L, 3L, 2L,
4L, 3L, 4L, 2L, 1L, 6L, 1L, 4L, 12L, 3L, 4L, 3L, 2L, 6L, 5L,
4L, 3L, 4L, 4L, 4L, 3L, 5L, 4L, 5L, 4L, 1L, 3L, 3L, 4L, 0L, 3L
), recurrence = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L)), class = "data.frame", row.names = c(1L, 2L, 3L,
9L, 10L, 11L, 13L, 14L, 15L, 16L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 40L, 41L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L, 57L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L,
67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L,
80L, 81L, 82L, 83L, 84L, 85L, 87L, 89L, 90L, 91L, 92L, 93L, 94L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L,
107L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L,
119L, 120L, 121L, 123L, 124L, 125L, 126L, 127L, 128L, 130L, 131L,
132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L,
143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L, 153L,
154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L, 164L,
165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L, 173L, 174L, 175L
))
The simplest way is to adjust your WHO.Grade factor to include all 4 levels - c("WHO-I","WHO-II","WHO-III","All"). Here's the first adjustment on line 3:
p %>%
  bind_rows(mutate(p, WHO.Grade = 4)) %>%
  mutate(WHO.Grade = factor(WHO.Grade, levels = 1:4,
                            labels = c("WHO-I", "WHO-II", "WHO-III", "All")),
         # Declaring all four levels keeps "WHO-III" as a level even when no
         # row uses it.
         recurrence = factor(recurrence))
Now that we've named our factors, we can modify the scale_x_discrete() call to remove the label and add drop = FALSE:
# The tick labels now come from the factor's own level labels, so no explicit
# label vector is passed; drop = FALSE keeps unused levels on the axis.
scale_x_discrete(drop = FALSE, name = "")
Putting everything together we get:
# Same plot as in the question, but WHO.Grade is built as a four-level factor
# and the x scale keeps unused levels, so the "WHO-III" slot stays on the axis.
bind_rows(p, mutate(p, WHO.Grade = 4)) %>%
  as_tibble() %>%
  mutate(
    recurrence = factor(recurrence),
    WHO.Grade = factor(
      WHO.Grade,
      levels = 1:4,
      labels = c("WHO-I", "WHO-II", "WHO-III", "All")
    )
  ) %>%
  ggplot(aes(x = WHO.Grade, y = ki67pro,
             colour = recurrence, fill = recurrence)) +
  geom_boxplot(
    position = position_dodge(width = 0.78, preserve = "single"),
    outlier.alpha = 0
  ) +
  geom_point(
    position = position_jitterdodge(),
    shape = 21,
    size = 3
  ) +
  # No label vector here: the factor levels already carry the display names.
  scale_x_discrete(drop = FALSE, name = "") +
  scale_y_continuous(
    name = "x",
    limits = c(0, 30),
    breaks = seq(0, 30, by = 5)
  ) +
  scale_fill_manual(
    name = "",
    labels = c("", ""),
    values = c("#edf1f9", "#fcebeb")
  ) +
  scale_colour_manual(
    name = "",
    labels = c("", ""),
    values = c("#1C73C2", "red")
  ) +
  theme(
    axis.line = element_line(colour = "black"),
    panel.background = element_blank(),
    legend.position = "none"
  )
I could not reproduce the exact result shown in your Photoshop image, but you can get the following image:
It is close to what you want, except that all the entries of "WHO-III" are zeros.
The code that generates it is:
library(ggplot2)
library(dplyr)
# Append two relabelled copies of the data: one as grade 3 and one as grade 4
# (displayed as "All").
p <- bind_rows(
  p,
  mutate(p, WHO.Grade = 3),
  mutate(p, WHO.Grade = 4)
)
# Zero out the measurement (second column, ki67pro) for every grade-3 row.
p[p$WHO.Grade == 3, "ki67pro"] <- 0

p %>%
  mutate(
    recurrence = factor(recurrence),
    WHO.Grade = factor(WHO.Grade)
  ) %>%
  ggplot(aes(x = WHO.Grade, y = ki67pro,
             colour = recurrence, fill = recurrence)) +
  geom_boxplot(
    position = position_dodge(width = 0.78, preserve = "single"),
    outlier.alpha = 0
  ) +
  geom_point(
    position = position_jitterdodge(),
    shape = 21,
    size = 3
  ) +
  scale_x_discrete(
    labels = c("WHO-I", "WHO-II", "WHO-III", "All"),
    drop = FALSE,
    name = ""
  ) +
  scale_y_continuous(
    name = "x",
    limits = c(0, 30),
    breaks = seq(0, 30, by = 5)
  ) +
  scale_fill_manual(
    name = "",
    labels = c("", ""),
    values = c("#edf1f9", "#fcebeb")
  ) +
  scale_colour_manual(
    name = "",
    labels = c("", ""),
    values = c("#1C73C2", "red")
  ) +
  theme(
    axis.line = element_line(colour = "black"),
    panel.background = element_blank(),
    legend.position = "none"
  )
Hope this helps

How can I remove the legend from this boxplot in ggplot? [duplicate]

This question already has answers here:
Remove legend ggplot 2.2
(4 answers)
Closed 3 years ago.
Please find My Data below.
How can I remove the red, encircled legend from my boxplot?
I wish to keep the same colors and design. I have tried numerous different solutions, but this has unfortunately not solved the problem.
This might be kinda basic, but simply can't figure out how to solve this. I hope you can help - thanks in advance!
My script is:
# Plotting frame: grade as a factor on x, Ki-67 value on y, and a constant
# series label `f` that drives the fill/colour scales (and hence the legend).
df <- data.frame(
  x = as.factor(p$WHO.Grade),
  y = p$ki67pro,
  f = rep("Ki67pro", nrow(p))
)

# Boxplot with whisker end caps and jittered points, one box per WHO grade.
ggplot(df, aes(x, y, colour = f)) +
  geom_boxplot(
    aes(fill = f),
    position = position_dodge(width = 0.78),
    outlier.alpha = 0
  ) +
  scale_x_discrete(
    labels = c("WHO-I\nn=108", "WHO-II\nn=34", "WHO-III\nn=1"),
    name = ""
  ) +
  scale_y_continuous(
    name = "Ki-67 proliferative index",
    limits = c(0, 30),
    breaks = seq(0, 30, by = 5)
  ) +
  stat_boxplot(
    geom = "errorbar",
    position = position_dodge(0.7753),
    width = 0.3
  ) +
  geom_point(
    aes(fill = f),
    position = position_jitterdodge(),
    shape = 21,
    size = 3
  ) +
  scale_fill_manual(
    name = "",
    labels = c("\nTotal cohort\nn=159\n "),
    values = c("#52C1C76D")
  ) +
  scale_colour_manual(
    name = "",
    labels = c("\nTotal cohort\nn=159\n "),
    values = c("#51BFC4")
  )
And My Data
p <- structure(list(WHO.Grade = c(1L, 2L, 1L, 1L, 1L, 1L, 3L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), ki67pro = c(1L, 12L, 3L, 3L, 5L,
3L, 20L, 25L, 7L, 4L, 5L, 12L, 3L, 15L, 4L, 5L, 7L, 8L, 3L, 12L,
10L, 4L, 10L, 7L, 3L, 2L, 3L, 7L, 4L, 7L, 10L, 4L, 5L, 5L, 3L,
5L, 2L, 5L, 3L, 3L, 3L, 4L, 4L, 3L, 2L, 5L, 1L, 5L, 2L, 3L, 1L,
2L, 3L, 3L, 5L, 4L, 20L, 5L, 0L, 4L, 3L, 0L, 3L, 4L, 1L, 2L,
20L, 2L, 3L, 5L, 4L, 8L, 1L, 4L, 5L, 4L, 3L, 6L, 12L, 3L, 4L,
4L, 2L, 5L, 3L, 3L, 3L, 2L, 5L, 4L, 2L, 3L, 4L, 3L, 3L, 2L, 2L,
4L, 7L, 4L, 3L, 4L, 2L, 3L, 6L, 2L, 3L, 10L, 5L, 10L, 3L, 10L,
3L, 4L, 5L, 2L, 4L, 3L, 4L, 4L, 4L, 5L, 3L, 12L, 5L, 4L, 3L,
2L, 4L, 3L, 4L, 2L, 1L, 6L, 1L, 4L, 12L, 3L, 4L, 3L, 2L, 6L,
5L, 4L, 3L, 4L, 4L, 4L, 3L, 5L, 4L, 5L, 4L, 1L, 3L, 3L, 4L, 0L,
3L)), class = "data.frame", row.names = c(1L, 2L, 3L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 18L, 19L, 20L, 21L, 22L, 23L, 24L,
25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L,
38L, 39L, 40L, 41L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L,
53L, 54L, 55L, 57L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L,
68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L,
81L, 82L, 83L, 84L, 85L, 87L, 89L, 90L, 91L, 92L, 93L, 94L, 96L,
97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 121L, 123L, 124L, 125L, 126L, 127L, 128L, 130L, 131L, 132L,
133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L, 143L,
144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L, 153L, 154L,
155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L, 164L, 165L,
166L, 167L, 168L, 169L, 170L, 171L, 172L, 173L, 174L, 175L))
You can use theme() as follows:
... + theme(legend.position = "none")
This should eliminate the legend
reference: https://www.datanovia.com/en/blog/ggplot-legend-title-position-and-labels/

Speeding up a loop (extracting specific values from a data frame)

My task is to extract all values in a column "2" after sorting by factor level in another column "3" (for the interested, i am sorting fasta sequences by organism). I am using this very simple code to get what i need.
# Read the raw table. fill = TRUE pads short rows, because the original output
# file includes many empty cells.
df <- read.table("outfile.txt", fill = TRUE)
# df is available at the bottom of this post

# Split the rows into one data frame per distinct value of column V3.
list1 <- split(df, df$V3)

# Keep only column V2 of each group.
list2 <- lapply(list1, function(x) as.data.frame(x$V2))

# Write each group to "<V3 value>.txt" as bare values (no quotes, row names or
# header). paste0() is used because paste() would insert its default " "
# separator and produce file names such as "926550 .txt".
for (x in names(list2)) {
  write.table(
    list2[[x]],
    file = paste0(x, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  )
}
This works well on a small df. However, the real output file contains several gigabytes of data. I tried a subset (500,000 rows on my local machine with 8GB RAM), but the second command is extremely slow (or R just hangs).
So i wondered and am asking the community, if there is a better way to solve this. Thank you.
Here is df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))
Use the data.table package combined with write.table:
order by V3, then write the V2 column to a separate file for each group in V3.
library('data.table')
# Convert df to a data.table by reference, order the rows by V3, and for each
# V3 group write its V2 values to "<V3>.txt". Inside `j`, the grouping column
# V3 is a single value, so paste0() yields one file name per group.
# quote/row.names/col.names are switched off so the files contain only the
# bare V2 values, matching the output format used in the question.
setDT(df)[
  order(V3),
  write.table(
    V2,
    file = paste0(V3, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  ),
  by = V3
]
This worked for me but I cannot speak for how fast it would be on your machine.
# One pass per distinct V3 value: select the V2 entries of the matching rows
# and write them, unquoted and without row or column names, to "<V3>.txt".
lapply(unique(df$V3), function(grp) {
  write.table(
    df$V2[which(df$V3 == grp)],
    file = paste0(grp, ".txt"),
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )
})

statistic test on univariate time series without replicates in R

I have the following data from an experiment in which I want to find out how a bacterium reacts, on two similar levels (nucleic acids), to a treatment.
Treatment happened after the sampling on day 0 (vertical dashed line). As you can see, it got more abundant (line is average, dots are measured triplicates). I have 3 technical replicates (doing the lab work 3 times on the same sample) but no biological replicates.
For publication purposes, I want to show that the induced change is significant. So far I used a two tailed t test for heteroscedastic samples, using the 3 sample points day -25 to 0 as sample group 1 and 5 sample points day 3 to 17 as sample group 2 (this is the range where most of my bacteria reacted).
Afterwards I performed the Bonferroni correction on the p values to correct for multiple testing. But is this the correct way and is it possible with only technical replicates?
I'm finding many hints on fitting models to my graph, but I only want to test for statistical significance of the difference between before and after treatment. So I'm searching for the correct statistics and also how to apply them in R. Any help appreciated!
here is the plot:
require(ggplot2)
require(scales)
# Abundance over time, with the x axis shifted by 69 so that 0 falls on the
# dashed vertical line. Black points are the individual measurements; the
# black lines join the per-day means, one line type per nucleic acid.
ggplot(
  data = sample_data,
  aes(
    x = days - 69,
    y = value,
    colour = nucleic_acid,
    group = nucleic_acid,
    lty = nucleic_acid
  )
) +
  geom_vline(aes(xintercept = 0), size = 1.2, linetype = "dashed") +
  geom_point(colour = "black") +
  stat_summary(fun.y = "mean", geom = "line", colour = "black", size = 1.5) +
  scale_linetype_manual(
    name = "Nucleic acid ",
    values = c("dna" = 1, "cdna" = 4),
    breaks = c("cdna", "dna"),
    labels = c("16S rRNA", "16S rDNA")
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20)) +
  theme_bw() +
  # Render tick labels as "a x 10^b" plotmath expressions; zero is shown as a
  # plain "0".
  scale_y_continuous(
    labels = function(x) {
      ifelse(
        x == 0,
        "0",
        parse(
          text = gsub(
            "[+]", "",
            gsub("e", " %*% 10^", scientific_format()(x))
          )
        )
      )
    }
  ) +
  theme(
    axis.title.y = element_text(angle = 90, vjust = 0.5),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 11),
    panel.grid.major = element_line(colour = NA, size = 0.2),
    panel.grid.minor = element_line(colour = NA, size = 0.5),
    legend.position = "bottom",
    legend.background = element_rect(fill = "grey90", linetype = "solid")
  ) +
  labs(
    x = "Days",
    y = expression(
      atop(
        "Absolute abundance in cell equivalents",
        bgroup("[", relative~abundance~x~cells~mL^{-1}, "]")
      )
    )
  )
and here is my data:
sample_data<-structure(list(time = c(10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L,
11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L,
15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L,
18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 9L), days = c(83L, 83L, 83L, 83L, 83L, 83L, 86L, 86L, 86L,
86L, 86L, 86L, 91L, 91L, 91L, 91L, 91L, 91L, 98L, 98L, 98L, 98L,
98L, 98L, 105L, 105L, 105L, 105L, 105L, 105L, 112L, 112L, 112L,
112L, 112L, 112L, 119L, 119L, 119L, 119L, 119L, 119L, 126L, 126L,
126L, 126L, 133L, 133L, 133L, 133L, 133L, 133L, 140L, 140L, 140L,
140L, 140L, 140L, 44L, 44L, 44L, 44L, 44L, 44L, 62L, 62L, 62L,
62L, 62L, 62L, 69L, 69L, 69L, 69L, 69L, 69L, 72L, 72L, 72L, 72L,
72L, 72L, 76L, 76L, 76L, 76L, 76L, 76L, 79L, 79L, 79L, 79L, 79L,
79L), parallel = c(3L, 1L, 2L, 2L, 3L, 1L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 1L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 1L,
1L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 3L,
1L, 1L, 3L, 2L, 3L, 1L, 1L, 2L, 3L, 1L, 2L, 3L, 3L, 1L, 2L, 2L,
3L, 3L, 1L, 1L, 2L, 2L, 3L, 1L, 1L, 3L, 2L, 1L, 2L, 3L, 3L, 1L,
2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 3L,
3L, 1L, 2L), nucleic_acid = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("cdna", "dna"), class = "factor"),
habitat = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "water", class = "factor"),
value = c(5316639.62, 6402573.912, 6294710.95, 2369809.996,
2679661.691, 2105693.166, 2108794.224, 2487177.041, 6021765.438,
5524939.499, 6016021.786, 2628427.206, 3164229.113, 896068.7656,
2966515.364, 4436008.425, 1860580.149, 3911309.508, 888489.0268,
1004334.365, 1141636.992, 961140.0729, 1072009.18, 1134997.852,
668013.4333, 459645.1058, 645944.1129, 702293.6865, 590620.3693,
642136.7523, 932531.1588, 1224299.065, 1502344.5, 1545034.46,
1122002.798, 1411050.57, 1465061.711, 1378876.488, 810348.2823,
1361496.248, 1056558.288, 897876.4169, 931519.9524, 1165768.09,
957873.9045, 746011.7558, 624116.5603, 522209.2283, 551120.1371,
440096.4446, 565108.4447, 373304.8604, 266595.7171, 333767.4042,
185612.6681, 144899.8736, 173739.3969, 211490.827, 223815.0867,
296455.4243, 1278759.217, 247292.4355, 1171554.199, 1146278.577,
227443.8462, 233542.6719, 253224.2629, 875040.4892, 1151921.616,
1285744.479, 355381.9156, 110724.7928, 252238.9632, 912865.3372,
608269.6498, 500307.5301, 774955.9598, 1374106.94, 3121909.308,
1071086.757, 3033665.589, 2984567.998, 1396313.444, 1356465.773,
4480581.956, 4273141.231, 4957691.655, 1910056.657, 5520085.32,
5094686.657, 5990052.759, 2272441.566, 1513268.608, 1821716.75
), treatment2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Treatment", class = "factor")), .Names = c("time",
"days", "parallel", "nucleic_acid", "habitat", "value", "treatment2"
), class = "data.frame", row.names = c(51243L, 51244L, 51245L,
51246L, 51247L, 51248L, 51255L, 51256L, 51257L, 51258L, 51259L,
51260L, 51267L, 51268L, 51269L, 51270L, 51271L, 51272L, 51279L,
51280L, 51281L, 51282L, 51283L, 51284L, 51291L, 51292L, 51293L,
51294L, 51295L, 51296L, 51303L, 51304L, 51305L, 51306L, 51307L,
51308L, 51315L, 51316L, 51317L, 51318L, 51319L, 51320L, 51326L,
51327L, 51328L, 51329L, 51336L, 51337L, 51338L, 51339L, 51340L,
51341L, 51348L, 51349L, 51350L, 51351L, 51352L, 51353L, 51360L,
51361L, 51362L, 51363L, 51364L, 51365L, 51372L, 51373L, 51374L,
51375L, 51376L, 51377L, 51384L, 51385L, 51386L, 51387L, 51388L,
51389L, 51396L, 51397L, 51398L, 51399L, 51400L, 51401L, 51408L,
51409L, 51410L, 51411L, 51412L, 51413L, 51420L, 51421L, 51422L,
51423L, 51424L, 51425L))
If you want to test for significance of the effect of your treatment and you know how to fit model(s) on your data, you can simply fit a model which includes your treatment effect and a model which doesn't. Then compare the models by means of a likelihood ratio test.
In R it is pretty straightforward (I assume for simplicity a linear model, which anyway may not be the best choice, based on your data):
# Fit two nested linear models: one without and one with the Treatment term
model_null <- lm(y ~ Time, data = data)
model_effect <- lm(y ~ Time + Treatment, data = data)
# Compare the two fits
anova(model_effect, model_null)

Calculating seasonal index from tbats components

I have aggregated retail weekly data with seasonal periods of 52.2 (a 53rd week every five years). I want to use this aggregated data to calculate a seasonal index that can be applied to each item within the category to derive its de-seasonalised demand.
Using stl, I would calculate the seasonal index as "seasonal" / "trend" + 1 (normalised to 52). I switched to tbats because my seasonality was not an integer and I have multiple seasonal periods (52.2 and 261)
I am using tbats with seasonal.periods = 52.2 and extract the components using tbats.components. The components are "observed", "level" and "season". Google has not revealed much in terms of what these components are and how to consume them. I also extracted the residuals
I noticed that "observed" is the log of my data. I also notice that season is changing over time (which is exactly what I want)
My questions are:
1.Is "season" a natural log too?
2.How can I extract the future "season" values? I can run a forecast on the data so I am assuming that there must be a projected "season"
3. What would be the best approach to calculating an "index" considering that it will be divided into the granular data. I am currently using: exp("season") / centered moving average(exp("season"))
My Data:
weeklyu <-structure(list(V1 = c(8L, 5L, 7L, 3L, 1L, 2L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 8L, 2L,
4L, 8L, 6L, 7L, 8L, 9L, 15L, 15L, 13L, 9L, 16L, 19L, 16L, 16L,
10L, 31L, 45L, 90L, 185L, 34L, 8L, 19L, 11L, 19L, 21L, 8L, 5L,
7L, 6L, 3L, 10L, 2L, 2L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 16L, 22L, 18L, 23L, 11L, 5L, 8L, 21L, 18L, 11L, 26L,
28L, 9L, 3L, 6L, 3L, 6L, 1L, 5L, 3L, 3L, 2L, 1L, 4L, 1L, 1L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L,
19L, 11L, 17L, 23L, 50L, 52L, 23L, 18L, 22L, 44L, 37L, 22L, 30L,
32L, 47L, 34L, 30L, 26L, 25L, 44L, 87L, 65L, 30L, 17L, 12L, 2L,
16L, 14L, 17L, 6L, 7L, 3L, 6L, 7L, 8L, 11L, 12L, 4L, 1L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = "V1", class = "data.frame", row.names = c(NA,
-188L))
My Code:
# Wrap the weekly data in a multi-seasonal time series with a period of 52.2
# and fit a TBATS model to it, with parallel processing switched off.
wklytbat <- tbats(
  msts(weeklyu, ts.frequency = 52.2, seasonal.periods = 52.2),
  use.parallel = FALSE
)
extract season:
# Take the "season" column of the fitted model's components, convert it to a
# plain numeric vector, exponentiate it, and store it in a data.table.
seasu <- data.table(
  exp(as.numeric(tbats.components(wklytbat)[, "season"]))
)

Resources