I have a dataset that looks like the following:
ID RECN EXSTDAT EXSTPDAT EXONGO EX2LD DOSEA DOSFRM DOSFRQ ADURN STUDYST EXSTDAY EXSTPDAY
<int> <dbl> <date> <date> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <date> <dbl> <dbl>
1 1 1 2022-07-08 2022-07-27 0 0 50 Capsule QD 19 2022-07-08 0 19
2 1 2 2022-07-28 2022-08-14 0 1 50 Capsule QD 17 2022-07-08 20 37
3 2 2 2022-06-09 2022-06-09 0 0 50 Capsule QD 0 2022-06-09 0 0
4 2 1 2022-06-14 2022-08-02 0 0 50 Capsule QD 49 2022-06-09 5 54
5 2 3 2022-08-03 2022-08-14 0 0 0 Capsule QD 11 2022-06-09 55 66
6 2 5 2022-08-15 2022-09-26 0 0 50 Capsule QD 42 2022-06-09 67 109
7 2 4 2022-09-27 2023-02-15 1 0 100 Capsule QD 141 2022-06-09 110 251
8 3 1 2022-06-30 2022-08-03 0 1 50 Capsule QD 34 2022-06-30 0 34
9 4 1 2022-08-24 2022-10-04 0 1 100 Capsule QD 41 2022-08-24 0 41
10 5 1 2022-12-30 2023-01-19 0 1 200 Capsule QD 20 2022-12-30 0 20
I would like to generate an observation for each day between the intervals of EXSTDAY and EXSTPDAY, keeping ID, DOSEA, DOSFRM, and DOSFRQ. Below is an example of the desired results for up to study day (STDAY <= 8) for ID 1 & 2:
ID DOSEA DOSFRM DOSFRQ STDAY
1 50 Capsule QD 0
1 50 Capsule QD 1
1 50 Capsule QD 2
1 50 Capsule QD 3
1 50 Capsule QD 4
1 50 Capsule QD 5
1 50 Capsule QD 6
1 50 Capsule QD 7
1 50 Capsule QD 8
2 50 Capsule QD 0
2 50 Capsule QD 5
2 50 Capsule QD 6
2 50 Capsule QD 7
2 50 Capsule QD 8
I have no idea where to start, so any advice is much appreciated!
dput of original dataset:
structure(list(ID = c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 8L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 12L,
12L, 13L, 14L, 14L, 14L, 14L, 14L, 14L), RECN = c(1, 2, 2, 1,
3, 5, 4, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 3, 4, 1, 2, 3, 1, 2,
1, 1, 2, 3, 4, 5, 6), EXSTDAT = structure(c(19181, 19201, 19152,
19157, 19207, 19219, 19262, 19173, 19228, 19356, 19356, 19377,
19303, 19326, 19363, 19216, 19220, 19346, 19362, 19365, 19264,
19277, 19282, 19219, 19226, 19310, 19345, 19351, 19352, 19354,
19355, 19370), class = "Date"), EXSTPDAT = structure(c(19200,
19218, 19152, 19206, 19218, 19261, 19403, 19207, 19269, 19376,
19376, 19403, 19325, 19328, 19383, 19216, 19361, 19366, 19364,
19403, 19275, 19281, 19403, 19225, 19226, 19338, 19350, 19351,
19353, 19354, 19369, 19370), class = "Date"), EXONGO = c(0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0), EX2LD = c(0, 1, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0), DOSEA = c(50, 50, 50, 50, 0, 50, 100, 50, 100, 200,
200, 100, 100, 100, 200, 100, 100, 100, 100, 100, 100, 100, 100,
50, 0, 100, 200, 0, 200, 0, 200, 0), DOSFRM = c("Capsule", "Capsule",
"Capsule", "Capsule", "Capsule", "Capsule", "Capsule", "Capsule",
"Capsule", "Capsule", "Capsule", "Capsule", "Capsule", "Tablet",
"Capsule", "Capsule", "Capsule", "Capsule", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Capsule", "Capsule", "Capsule",
"Capsule", "Capsule", "Capsule", "Capsule", "Capsule", "Capsule"
), DOSFRQ = c("QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD",
"QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD",
"QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD", "QD",
"QD", "QD"), ADURN = c(19, 17, 0, 49, 11, 42, 141, 34, 41, 20,
20, 26, 22, 2, 20, 0, 141, 20, 2, 38, 11, 4, 121, 6, 0, 28, 5,
0, 1, 0, 14, 0), STUDYST = structure(c(19181, 19181, 19152, 19152,
19152, 19152, 19152, 19173, 19228, 19356, 19356, 19377, 19303,
19303, 19363, 19216, 19216, 19216, 19216, 19216, 19264, 19264,
19264, 19219, 19219, 19310, 19345, 19345, 19345, 19345, 19345,
19345), class = "Date"), EXSTDAY = c(0, 20, 0, 5, 55, 67, 110,
0, 0, 0, 0, 0, 0, 23, 0, 0, 4, 130, 146, 149, 0, 13, 18, 0, 7,
0, 0, 6, 7, 9, 10, 25), EXSTPDAY = c(19, 37, 0, 54, 66, 109,
251, 34, 41, 20, 20, 26, 22, 25, 20, 0, 145, 150, 148, 187, 11,
17, 139, 6, 7, 28, 5, 6, 8, 9, 24, 25)), row.names = c(NA, -32L
), class = c("tbl_df", "tbl", "data.frame"))
If I understand you correctly, you want one row in your data frame for each day that each subject was participating in your study. If that's the case then you can do:
library(tidyverse)

# Expand every dosing record into one row per study day. The date sequence
# runs from the start date through the stop date inclusive, so the last day
# of each record is kept and single-day records need no special case.
# STDAY is the number of days since the subject's study start (STUDYST).
df %>%
  rowwise() %>%
  reframe(
    ID, DOSEA, DOSFRM, DOSFRQ,
    STDAY = as.numeric(seq(EXSTDAT, EXSTPDAT, by = "day") - STUDYST)
  )
#># A tibble: 887 x 5
#> ID DOSEA DOSFRM DOSFRQ STDAY
#> <int> <dbl> <chr> <chr> <dbl>
#> 1 1 50 Capsule QD 0
#> 2 1 50 Capsule QD 1
#> 3 1 50 Capsule QD 2
#> 4 1 50 Capsule QD 3
#> 5 1 50 Capsule QD 4
#> 6 1 50 Capsule QD 5
#> 7 1 50 Capsule QD 6
#> 8 1 50 Capsule QD 7
#> 9 1 50 Capsule QD 8
#>10 1 50 Capsule QD 9
#># ... with 877 more rows
#># i Use `print(n = ...)` to see more rows
If you want to generate a sequence between EXSTDAY and EXSTPDAY one approach could be using map2 from purrr as follows. The final select will indicate which columns you wish to retain in the end.
library(tidyverse)

# Build the per-record sequence of study days as a list column, expand it to
# one row per day, then keep only the columns of interest.
df %>%
  mutate(
    STDAY = map2(EXSTDAY, EXSTPDAY, function(first_day, last_day) {
      seq(first_day, last_day)
    })
  ) %>%
  unnest(cols = STDAY) %>%
  select(ID, DOSEA, DOSFRM, DOSFRQ, STDAY)
Output
ID DOSEA DOSFRM DOSFRQ STDAY
<int> <dbl> <chr> <chr> <int>
1 1 50 Capsule QD 0
2 1 50 Capsule QD 1
3 1 50 Capsule QD 2
4 1 50 Capsule QD 3
5 1 50 Capsule QD 4
6 1 50 Capsule QD 5
7 1 50 Capsule QD 6
8 1 50 Capsule QD 7
9 1 50 Capsule QD 8
10 1 50 Capsule QD 9
# … with 877 more rows
Related
I would like to calculate Day.Before_nextCLS with 3 columns below
tibble::tribble(
~Day, ~CLS, ~BAL.D,
0, 0, NA,
3, 0, 15000,
6, 0, 10000,
20, 0, 2000,
25, 0, -4771299,
26, 0, -1615637,
27, 0, -920917,
31, 1, -923089,
32, 1, -81863,
33, 1, 19865,
34, 1, 9865,
37, 1, 609865
)
Desired output is below tribble.
For Day 27, Day.Before_nextCLS is 4,
because when CLS becomes 1, Day is 31, and the interval between 27 and 31 is 4.
# Desired output: Day.Before_nextCLS is the number of days from each row's Day
# to the first Day of the next CLS value.
tibble::tribble(
  ~Day, ~CLS, ~BAL.D, ~Day.Before_nextCLS,
  0, 0, NA, 31,
  3, 0, 15000, 28,
  6, 0, 10000, 25,
  20, 0, 2000, 11,
  25, 0, -4771299, 6,
  26, 0, -1615637, 5,
  27, 0, -920917, 4,
  31, 1, -923089, NA, # NA: the data hold no Day for the next CLS (CLS == 2)
  32, 1, -81863, NA,
  33, 1, 19865, NA,
  34, 1, 9865, NA,
  37, 1, 609865, NA
)
How can I achieve this?
Thank you very much!!
We create a lead column of Day, then group by CLS and, within each group, subtract Day from the last value of the lead column (which is the first Day of the next CLS group).
library(dplyr)

# next_day holds the Day of the following row, so the last next_day within a
# CLS group is the first Day of the next CLS group (NA for the final group).
df1 %>%
  mutate(next_day = lead(Day)) %>%
  group_by(CLS) %>%
  mutate(Day.Before_nextCLS = last(next_day) - Day) %>%
  ungroup() %>%
  select(-next_day)
-output
# A tibble: 12 × 4
Day CLS BAL.D Day.Before_nextCLS
<dbl> <dbl> <dbl> <dbl>
1 0 0 NA 31
2 3 0 15000 28
3 6 0 10000 25
4 20 0 2000 11
5 25 0 -4771299 6
6 26 0 -1615637 5
7 27 0 -920917 4
8 31 1 -923089 NA
9 32 1 -81863 NA
10 33 1 19865 NA
11 34 1 9865 NA
12 37 1 609865 NA
I have a large dataset of mineral nitrogen values from different plots, which includes some missing data because on some dates we could not take samples. It is known that mineral N values in soil change linearly between samplings.
for the sake of simplification I have created a data frame that has 10 plots with 4 dates (with different distances between them) with missing data in one of the dates:
# Example data: 10 plots sampled on 4 unevenly spaced dates; Nmin is missing
# for every plot on the third date (2020-10-29).
df <- data.frame(
  plot = rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), times = 4),
  date = rep(
    c("2020-10-01", "2020-10-08", "2020-10-29", "2020-11-05"),
    each = 10
  ),
  Nmin = c(
    100, 120, 50, 60, 70, 80, 100, 70, 30, 50,
    90, 130, 60, 60, 60, 90, 105, 60, 25, 40,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    50, 170, 100, 60, 20, 130, 125, 20, 5, 0
  )
)
# The date strings are year-month-day, so the format must say so; a
# day.month.year format does not match them and yields NA for every row.
df$date <- as.Date(df$date, format = "%Y-%m-%d")
df$Nmin <- as.numeric(df$Nmin)
Is there a function that can calculate the missing values of Nmin plot-wise and take into consideration the time between samplings (date)?
Using approx.
# Mark the rows whose Nmin is missing so the filled-in values can be
# identified afterwards.
df$flag <- ifelse(is.na(df$Nmin), 1, 0)

# Linearly interpolate Nmin over date within each plot, then put the per-plot
# pieces back together in the original row order.
res <- unsplit(
  lapply(split(df, df$plot), function(one_plot) {
    one_plot$Nmin <- approx(one_plot$date, one_plot$Nmin,
                            xout = one_plot$date)$y
    one_plot
  }),
  df$plot
)
res
# plot date Nmin flag
# 1 1 2020-10-01 100 0
# 2 2 2020-10-01 120 0
# 3 3 2020-10-01 50 0
# 4 4 2020-10-01 60 0
# 5 5 2020-10-01 70 0
# 6 6 2020-10-01 80 0
# 7 7 2020-10-01 100 0
# 8 8 2020-10-01 70 0
# 9 9 2020-10-01 30 0
# 10 10 2020-10-01 50 0
# 11 1 2020-10-08 90 0
# 12 2 2020-10-08 130 0
# 13 3 2020-10-08 60 0
# 14 4 2020-10-08 60 0
# 15 5 2020-10-08 60 0
# 16 6 2020-10-08 90 0
# 17 7 2020-10-08 105 0
# 18 8 2020-10-08 60 0
# 19 9 2020-10-08 25 0
# 20 10 2020-10-08 40 0
# 21 1 2020-10-29 60 1
# 22 2 2020-10-29 160 1
# 23 3 2020-10-29 90 1
# 24 4 2020-10-29 60 1
# 25 5 2020-10-29 30 1
# 26 6 2020-10-29 120 1
# 27 7 2020-10-29 120 1
# 28 8 2020-10-29 30 1
# 29 9 2020-10-29 10 1
# 30 10 2020-10-29 10 1
# 31 1 2020-11-05 50 0
# 32 2 2020-11-05 170 0
# 33 3 2020-11-05 100 0
# 34 4 2020-11-05 60 0
# 35 5 2020-11-05 20 0
# 36 6 2020-11-05 130 0
# 37 7 2020-11-05 125 0
# 38 8 2020-11-05 20 0
# 39 9 2020-11-05 5 0
# 40 10 2020-11-05 0 0
Let's take a look at the plot.
# One colour per plot (10 plots).
clr <- rainbow(10)
# Set up the axes from the full data without drawing any points (type='n').
with(res, plot(Nmin ~ date, type='n'))
# Add each plot's series separately. Rows with flag == 1 (Nmin was missing
# and has been filled in) use symbol 21 with a white background; all other
# rows use symbol 16. jitter() adds a little noise to Nmin, presumably so
# that overlapping series remain distinguishable.
by(res, res$plot, with, points(jitter(Nmin) ~ date, type='b', pch=ifelse(flag == 1, 21, 16), col=clr[plot], bg='white'))
# Legend mapping the line colours to plot numbers, laid out in 4 columns.
legend('topleft', legend=paste('plot', 1:10), lty=1, col=clr, ncol=4, bty='n', cex=.7)
Note: For non-linear inter/extrapolation, see this answer.
Data:
df <- structure(list(plot = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10), date = structure(c(18536, 18536, 18536,
18536, 18536, 18536, 18536, 18536, 18536, 18536, 18543, 18543,
18543, 18543, 18543, 18543, 18543, 18543, 18543, 18543, 18564,
18564, 18564, 18564, 18564, 18564, 18564, 18564, 18564, 18564,
18571, 18571, 18571, 18571, 18571, 18571, 18571, 18571, 18571,
18571), class = "Date"), Nmin = c(100, 120, 50, 60, 70, 80, 100,
70, 30, 50, 90, 130, 60, 60, 60, 90, 105, 60, 25, 40, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 50, 170, 100, 60, 20, 130, 125,
20, 5, 0), flag = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA, -40L
))
So here is what I want: I want to plot 4 columns (Standing, Sitting, Stepping, Cycling) vs Time, with 1 plot per date. I also want the Y scale to run from 0.5 to 4.5, BUT with the Y axis invisible and a legend saying which color is which.
Here is a sample of my data:
> head(graph_pre,30)
Date Time Axis1 Axis2 Axis3 VM Standing Stepping Cycling New_Sitting Counter
1 2022-05-10 2022-05-10 09:01:00 21 40 2 45.22 0 0 2 0 0
2 2022-05-10 2022-05-10 09:01:01 0 36 1 36.01 0 0 0 1 1
3 2022-05-10 2022-05-10 09:01:02 24 1 0 24.02 0 0 0 1 0
4 2022-05-10 2022-05-10 09:01:03 48 31 4 57.28 0 0 2 0 1
5 2022-05-10 2022-05-10 09:01:04 0 6 0 6.00 0 0 0 1 1
6 2022-05-10 2022-05-10 09:01:05 0 0 0 0.00 0 0 0 1 0
7 2022-05-10 2022-05-10 09:01:06 0 0 0 0.00 0 0 0 1 0
8 2022-05-10 2022-05-10 09:01:07 0 0 0 0.00 0 0 0 1 0
9 2022-05-10 2022-05-10 09:01:08 0 5 2 5.39 0 0 0 1 0
10 2022-05-10 2022-05-10 09:01:09 20 33 3 38.70 0 0 0 1 0
11 2022-05-10 2022-05-10 09:01:10 14 26 29 41.39 0 0 2 0 1
12 2022-05-10 2022-05-10 09:01:11 11 0 4 11.70 0 0 0 1 1
13 2022-05-10 2022-05-10 09:01:12 0 0 0 0.00 0 0 0 1 0
14 2022-05-10 2022-05-10 09:01:13 0 0 0 0.00 0 0 0 1 0
15 2022-05-10 2022-05-10 09:01:14 82 126 113 188.07 0 3 0 0 1
16 2022-05-10 2022-05-10 09:01:15 60 64 47 99.52 0 0 2 0 1
17 2022-05-10 2022-05-10 09:01:16 98 140 236 291.38 0 0 2 0 0
18 2022-05-10 2022-05-10 09:01:17 151 118 221 292.52 0 0 2 0 0
19 2022-05-10 2022-05-10 09:01:18 44 13 99 109.11 0 0 2 0 0
20 2022-05-10 2022-05-10 09:01:19 6 6 53 53.67 0 0 2 0 0
21 2022-05-10 2022-05-10 09:01:20 39 8 65 76.22 0 0 2 0 0
22 2022-05-10 2022-05-10 09:01:21 17 20 57 62.75 0 0 2 0 0
23 2022-05-10 2022-05-10 09:01:22 51 46 269 277.63 0 0 2 0 0
24 2022-05-10 2022-05-10 09:01:23 15 45 82 94.73 0 3 0 0 1
25 2022-05-10 2022-05-10 09:01:24 22 34 4 40.69 0 0 2 0 1
26 2022-05-10 2022-05-10 09:01:25 114 93 41 152.73 0 0 2 0 0
27 2022-05-10 2022-05-10 09:01:26 74 67 92 135.75 0 0 2 0 0
28 2022-05-10 2022-05-10 09:01:27 117 9 40 123.98 0 0 2 0 0
29 2022-05-10 2022-05-10 09:01:28 33 15 0 36.25 0 0 0 1 1
30 2022-05-10 2022-05-10 09:01:29 0 0 0 0.00 0 0 0 1 0
I have the code to separate by date, and to "kinda" plot, but I need it for the 4 columns.
# Add a day-of-month column taken from Date; it is used to facet the plot.
graph_pre <- mutate(graph_pre, day = lubridate::day(Date))
# NOTE(review): the sample of graph_pre shown with this snippet has no Posture
# column, so y = Posture only works once the four posture columns have been
# combined or reshaped into one.
ggplot(graph_pre, aes(x = Time, y = Posture))+
geom_point()+
# One panel per day, each with its own x-axis range.
facet_wrap(~day, scales = "free_x")
dput(head(graph_pre,30))
structure(list(Date = structure(c(19122, 19122, 19122, 19122,
19122, 19122, 19122, 19122, 19122, 19122, 19122, 19122, 19122,
19122, 19122, 19122, 19122, 19122, 19122, 19122, 19122, 19122,
19122, 19122, 19122, 19122, 19122, 19122, 19122, 19122), class = "Date"),
Time = structure(c(1652187660, 1652187661, 1652187662, 1652187663,
1652187664, 1652187665, 1652187666, 1652187667, 1652187668,
1652187669, 1652187670, 1652187671, 1652187672, 1652187673,
1652187674, 1652187675, 1652187676, 1652187677, 1652187678,
1652187679, 1652187680, 1652187681, 1652187682, 1652187683,
1652187684, 1652187685, 1652187686, 1652187687, 1652187688,
1652187689), class = c("POSIXct", "POSIXt"), tzone = ""),
Axis1 = c(21, 0, 24, 48, 0, 0, 0, 0, 0, 20, 14, 11, 0, 0,
82, 60, 98, 151, 44, 6, 39, 17, 51, 15, 22, 114, 74, 117,
33, 0), Axis2 = c(40, 36, 1, 31, 6, 0, 0, 0, 5, 33, 26, 0,
0, 0, 126, 64, 140, 118, 13, 6, 8, 20, 46, 45, 34, 93, 67,
9, 15, 0), Axis3 = c(2, 1, 0, 4, 0, 0, 0, 0, 2, 3, 29, 4,
0, 0, 113, 47, 236, 221, 99, 53, 65, 57, 269, 82, 4, 41,
92, 40, 0, 0), VM = c(45.22, 36.01, 24.02, 57.28, 6, 0, 0,
0, 5.39, 38.7, 41.39, 11.7, 0, 0, 188.07, 99.52, 291.38,
292.52, 109.11, 53.67, 76.22, 62.75, 277.63, 94.73, 40.69,
152.73, 135.75, 123.98, 36.25, 0), Standing = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), Stepping = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
0, 0, 0, 0), Cycling = c(2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2,
0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0),
New_Sitting = c(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1), Counter = c(0L,
1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L)), row.names = c(NA,
30L), class = "data.frame")
First thing, we should pivot_longer to pull the four posture columns into name-value pairs. Here I've put the names into the "Posture" column. Then we can map that to color and use the values for the y axis.
I've specified the range in scale_y_continuous, but it could also be done with coord_cartesian(ylim = c(0.5,4.5)) -- the difference will be that the out of range points are filtered out in this way, but are in some sense "still there" if you use the coord_cartesian option. That can make a difference if you are doing a summary step, like geom_boxplot or geom_smooth.
Finally, I use theme to specify the y-axis related elements that should be hidden.
library(tidyverse)

# Reshape the four posture columns to long form, colour the points by
# posture, draw one panel per day and hide everything related to the y axis.
# The input is graph_pre, the data frame posted in the question.
graph_pre %>%
  mutate(day = lubridate::day(Date)) %>%
  pivot_longer(Standing:New_Sitting, names_to = "Posture") %>%
  ggplot(aes(x = Time, y = value, color = Posture)) +
  geom_point() +
  # Scale limits drop the values outside 0.5-4.5 before drawing.
  scale_y_continuous(limits = c(0.5, 4.5), expand = expansion(0)) +
  facet_wrap(~day, scales = "free_x") +
  labs(title = "Posture vs. Time") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )
Here you go:
library(tidyverse)

# Derive the day-of-month used for faceting (so the snippet also runs on the
# data as posted, which has no day column) and reshape the four posture
# columns into Posture/value pairs.
graph_pre_long <- graph_pre %>%
  mutate(day = lubridate::day(Date)) %>%
  pivot_longer(
    c(Standing, New_Sitting, Stepping, Cycling),
    names_to = "Posture"
  )

# One panel per day, points coloured by posture, y range 0.5-4.5 with the
# y-axis title, labels and ticks hidden.
ggplot(graph_pre_long, aes(x = Time, y = value, color = Posture)) +
  geom_point() +
  facet_wrap(~day, scales = "free_x") +
  ylim(.5, 4.5) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
I want to merge 4 columns together, (Standing, Stepping, Cycling, New_Sitting). In this case, I want to create a new column (called "Posture"). This new column (as per the example below) should be like:
Posture
<dbl>
2
3
2
1
1
1
3
4
4
4
Here is an example of my data:
> head(graph_pre,30)
# A tibble: 30 × 11
# Groups: Date [1]
Date Time Axis1 Axis2 Axis3 VM Standing Stepping Cycling New_Sitting
<date> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2022-03-14 2022-03-14 09:51:00 89 41 39 105. 0 0 2 0
2 2022-03-14 2022-03-14 09:51:01 88 135 117 199. 0 3 0 0
3 2022-03-14 2022-03-14 09:51:02 0 61 8 61.5 0 0 2 0
4 2022-03-14 2022-03-14 09:51:03 0 25 0 25 0 0 0 1
5 2022-03-14 2022-03-14 09:51:04 0 0 0 0 0 0 0 1
6 2022-03-14 2022-03-14 09:51:05 0 0 0 0 0 0 0 1
7 2022-03-14 2022-03-14 09:51:06 0 24 35 42.4 0 3 0 0
8 2022-03-14 2022-03-14 09:51:07 0 28 0 28 4 0 0 0
9 2022-03-14 2022-03-14 09:51:08 4 96 20 98.1 4 0 0 0
10 2022-03-14 2022-03-14 09:51:09 0 11 0 11 4 0 0 0
# … with 20 more rows, and 1 more variable: Counter <int>
Please let me know if you need more information as I'm new to this.
EDIT
> dput(head(graph_pre,30))
structure(list(Date = structure(c(19065, 19065, 19065, 19065,
19065, 19065, 19065, 19065, 19065, 19065, 19065, 19065, 19065,
19065, 19065, 19065, 19065, 19065, 19065, 19065, 19065, 19065,
19065, 19065, 19065, 19065, 19065, 19065, 19065, 19065), class = "Date"),
Time = structure(c(1647265860, 1647265861, 1647265862, 1647265863,
1647265864, 1647265865, 1647265866, 1647265867, 1647265868,
1647265869, 1647265870, 1647265871, 1647265872, 1647265873,
1647265874, 1647265875, 1647265876, 1647265877, 1647265878,
1647265879, 1647265880, 1647265881, 1647265882, 1647265883,
1647265884, 1647265885, 1647265886, 1647265887, 1647265888,
1647265889), tzone = "", class = c("POSIXct", "POSIXt")),
Axis1 = c(89, 88, 0, 0, 0, 0, 0, 0, 4, 0, 3, 9, 5, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 11, 3, 0), Axis2 = c(41,
135, 61, 25, 0, 0, 24, 28, 96, 11, 91, 44, 8, 8, 29, 1, 17,
0, 0, 0, 15, 0, 0, 0, 0, 28, 47, 28, 48, 0), Axis3 = c(39,
117, 8, 0, 0, 0, 35, 0, 20, 0, 22, 2, 16, 21, 48, 3, 35,
0, 5, 29, 32, 0, 0, 0, 0, 4, 26, 68, 5, 0), VM = c(105.47,
199.14, 61.52, 25, 0, 0, 42.44, 28, 98.14, 11, 93.67, 44.96,
18.57, 22.47, 56.09, 3.16, 38.91, 0, 5, 29, 35.34, 0, 0,
0, 0, 28.28, 55.26, 74.36, 48.35, 0), Standing = c(0, 0,
0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 0, 4, 0, 4, 4, 0, 0,
4, 4, 4, 4, 4, 0, 0, 4, 4), Stepping = c(0, 3, 0, 0, 0, 0,
3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0,
0, 3, 3, 0, 0), Cycling = c(2, 0, 2, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), New_Sitting = c(0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Counter = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -30L), groups = structure(list(
Date = structure(19065, class = "Date"), .rows = structure(list(
1:30), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))
What you can do is first replace the zeros with NA and after that unite the columns together. You can use the following code:
library(dplyr)
library(tidyr)

graph_pre %>%
  # Turn the 0 placeholders into NA so that only the non-zero posture code
  # survives the unite() step. The function is wrapped explicitly because
  # passing extra arguments through across() is deprecated.
  mutate(across(Standing:New_Sitting, function(x) na_if(x, 0))) %>%
  # Paste the four columns together, skipping the NA entries, and drop the
  # source columns.
  unite(Posture, Standing:New_Sitting, na.rm = TRUE, sep = "", remove = TRUE) %>%
  # unite() produces character values; convert back to numeric.
  mutate(Posture = as.numeric(Posture))
Output:
# A tibble: 30 × 8
# Groups: Date [1]
Date Time Axis1 Axis2 Axis3 VM Posture Counter
<date> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 2022-03-14 2022-03-14 14:51:00 89 41 39 105. 2 0
2 2022-03-14 2022-03-14 14:51:01 88 135 117 199. 3 1
3 2022-03-14 2022-03-14 14:51:02 0 61 8 61.5 2 1
4 2022-03-14 2022-03-14 14:51:03 0 25 0 25 1 1
5 2022-03-14 2022-03-14 14:51:04 0 0 0 0 1 0
6 2022-03-14 2022-03-14 14:51:05 0 0 0 0 1 0
7 2022-03-14 2022-03-14 14:51:06 0 24 35 42.4 3 1
8 2022-03-14 2022-03-14 14:51:07 0 28 0 28 4 1
9 2022-03-14 2022-03-14 14:51:08 4 96 20 98.1 4 0
10 2022-03-14 2022-03-14 14:51:09 0 11 0 11 4 0
# … with 20 more rows
If you just want to merge them by summing the values for each row, you can do this:
library(tidyverse)

your_dataframe %>%
  # Add the columns element-wise to get one value per row; sum() would
  # collapse every value of the four columns into a single total that is then
  # repeated on each row.
  mutate(Posture = Standing + Stepping + Cycling + New_Sitting)
Which will add an extra column called Posture at the end of your dataframe
I want to compute various indices, with their confidence intervals, according to factors and display them in a graph using ggplot2.
In the columns, 1 = positive and 0 = negative; "individual=1" means that 1 individual was tested.
The following indices have to be computed per species+population+pathogen+dpi
...
example: AL: yu: dv: 21dpi infectrate =(2/3)*100; dissemrate = (2/2)*100;
transrate = (2/2)*100; st=(220+100)/2 ##mean for the population, the
pathogen and the dpi
AL: ti dv: 21dpi infectrate = (2/4)*100
infectrate = (number positif/number of individuals tested)*100;
dissemrate = (number positif$dissem/number positif$infect)*100;
transrate = (number positif$trans/number positif$dissem)*100;
strate = mean($st);
species population individual pathogen dpi infect dissem trans st
AL yu 1 dv 21 1 1 1 220
AL yu 2 dv 21 1 1 1 100
AL yu 3 dv 21 0 0 0 0
AL ti 1 dv 21 0 0 0 0
AL ti 2 dv 21 1 1 1 60
AL ti 3 dv 21 1 1 0 0
AL ti 4 dv 21 0 0 0 0
AA dla 1 dv 21 1 1 1 180
AA dla 2 dv 21 1 1 0 0
AA dla 3 dv 21 1 1 1 360
AL yu 1 zk 21 0 0 0 0
AL yu 2 zk 21 0 0 0 0
AA mra 1 zk 14 1 1
AA mra 2 zk 14 1 1
AA yu 1 yv 21 0 0 0 0
AA yu 2 yv 21 1 1 0 0
AL bz 1 zk 14 1 1
AL bz 2 zk 14 1 1
I've tried to use the dplyr package, but I didn't succeed.
...
When I run the code, it gives the same value for every population for a given index.
Any help is needed, Thanks in advance.
I am not sure I fully understood the calculations. I think this is what you are looking for.
library(tidyverse)
df <-
data.frame(stringsAsFactors=FALSE,
species = c("AL", "AL", "AL", "AL", "AL", "AL", "AL", "AA", "AA", "AA",
"AL", "AL", "AA", "AA", "AA", "AA", "AL", "AL"),
population = c("yu", "yu", "yu", "ti", "ti", "ti", "ti", "dla", "dla",
"dla", "yu", "yu", "mra", "mra", "yu", "yu", "bz", "bz"),
individual = c(1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2),
pathogen = c("dv", "dv", "dv", "dv", "dv", "dv", "dv", "dv", "dv", "dv",
"zk", "zk", "zk", "zk", "yv", "yv", "zk", "zk"),
dpi = c(21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 14, 14, 21,
21, 14, 14),
infect = c(1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1),
dissem = c(1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1),
trans = c(1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, NA, NA, 0, 0, NA, NA),
st = c(220, 100, 0, 0, 60, 0, 0, 180, 0, 360, 0, 0, NA, NA, 0, 0,
NA, NA)
)
# infectrate = (number positif/number of individuals tested)*100;
# dissemrate = (number positif$dissem/number positif$infect)*100;
# transrate = (number positif$trans/number positif$dissem)*100;
# strate = mean($st) over the individuals that are positive for trans, as in
#   the worked example for AL/yu/dv/21: st = (220 + 100) / 2
df %>%
  group_by(species, population, pathogen, dpi) %>%
  summarise(
    infectrate = sum(infect) / n() * 100,
    # A rate whose denominator would be 0 is reported as 0.
    dissemrate = ifelse(infectrate == 0, 0, sum(dissem) / sum(infect) * 100),
    transrate = ifelse(dissemrate == 0, 0, sum(trans) / sum(dissem) * 100),
    # Average st over trans-positive individuals only. When transrate is 0 or
    # NA there is nothing to average, so that value (0 or NA) is passed on.
    # NOTE(review): assumes st is only meaningful when trans == 1 -- confirm.
    strate = if (isTRUE(transrate > 0)) mean(st[trans == 1]) else transrate
  ) %>%
  ungroup()
#> df
# A tibble: 7 x 8
# species population pathogen dpi infectrate dissemrate transrate strate
# <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 AA dla dv 21 100 100 66.7 270
#2 AA mra zk 14 100 100 NA NA
#3 AA yu yv 21 50 100 0 0
#4 AL bz zk 14 100 100 NA NA
#5 AL ti dv 21 50 100 50 60
#6 AL yu dv 21 66.7 100 100 160
#7 AL yu zk 21 0 0 0 0