I would like to sum up the rows in Col1 for which the Col2 is equal to 0. And add the sum to the next value in Col1. I show an example below. I have many products in the data frame. I started with this:
library(dplyr)
TD %>% group_by(Product,Date) %>% mutate(Sum = sum(Col1[Col2 == 0])[1]) %>%
mutate(Expected = Col1)
Date <- seq(as.Date("2021-01-01"), as.Date("2021-01-07"), by = "day")
Product<-rep("A",7)
Col1 <- c(13, 10, 15, 7, 9, 4, 3)
Col2 <- c(7, 0, 0, 8, 0, 0, 27)
Expected <- c(13, 10, 15, 32, 9, 4, 16)
TD <- data.frame(Date, Product, Col1, Col2, Expected)
Other data:
Date Product Col1 Col2 Expected2
<date> <chr> <dbl> <dbl> <dbl>
1 2021-02-12 831 15 384 631
2 2021-02-13 831 11 373 631
3 2021-02-14 831 13 360 631
4 2021-02-15 831 14 826 631
5 2020-12-03 832 10 11 20
6 2020-12-04 832 10 1 20
7 2020-12-05 832 7 0 7
8 2020-12-06 832 11 0 11
9 2020-12-07 832 13 0 13
10 2020-12-08 832 10 0 10
structure(list(Date = structure(c(18670, 18671, 18672, 18673,
18599, 18600, 18601, 18602, 18603, 18604, 18605, 18606, 18607,
18608, 18609, 18610, 18611, 18612, 18613, 18614, 18615, 18616,
18617, 18618, 18619, 18620, 18621, 18622, 18623, 18624), class = "Date"),
Product = c("831", "831", "831", "831", "832", "832", "832",
"832", "832", "832", "832", "832", "832", "832", "832", "832",
"832", "832", "832", "832", "832", "832", "832", "832", "832",
"832", "832", "832", "832", "832"), Col1 = c(15, 11, 13,
14, 10, 10, 7, 11, 13, 10, 8, 11, 9, 8, 10, 17, 15, 17, 16,
16, 14, 14, 15, 17, 18, 16, 17, 18, 18, 8), Col2 = c(384,
373, 360, 826, 11, 1, 0, 0, 0, 0, 0, 70, 61, 53, 43, 26,
11, 0, 0, 84, 70, 56, 41, 24, 6, 0, 0, 0, 0, 0), Expected2 = c(631,
631, 631, 631, 20, 20, 7, 11, 13, 10, 8, 119, 119, 119, 119,
119, 119, 17, 16, 127, 127, 127, 127, 127, 127, 16, 17, 18,
18, 8)), row.names = c(NA, -30L), class = c("tbl_df", "tbl",
"data.frame"))
We can do
library(dplyr)
TD %>%
group_by(Product) %>%
group_by(grp = cumsum(lag(Col2 != 0, default = FALSE)), .add = TRUE) %>%
mutate(Expected2 = sum(Col1)) %>%
group_by(Product) %>%
mutate(Expected2 = case_when(Col2 == 0 | !any(Col2 == 0) ~
Col1, TRUE ~ Expected2)) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 7 x 6
# Date Product Col1 Col2 Expected Expected2
# <date> <chr> <dbl> <dbl> <dbl> <dbl>
#1 2021-01-01 A 13 7 13 13
#2 2021-01-02 A 10 0 10 10
#3 2021-01-03 A 15 0 15 15
#4 2021-01-04 A 7 8 32 32
#5 2021-01-05 A 9 0 9 9
#6 2021-01-06 A 4 0 4 4
#7 2021-01-07 A 3 27 16 16
-testing on the second dataset
TD2 %>%
group_by(Product) %>%
group_by(grp = cumsum(lag(Col2 != 0, default = FALSE)), .add = TRUE)%>%
mutate(tmp = sum(Col1), Expected2 = case_when(any(Col2 == 0) &
(row_number() == n() & Col2 != 0) ~ tmp, TRUE ~ Col1)) %>%
ungroup %>%
select(-grp, -tmp)
-output
# A tibble: 15 x 5
# Date Product Col1 Col2 Expected2
# <chr> <chr> <int> <int> <int>
# 1 2020-12-03 00:00:00 B 10 206 10
# 2 2020-12-04 00:00:00 B 5 364 5
# 3 2020-12-05 00:00:00 B 10 354 10
# 4 2020-12-06 00:00:00 B 8 346 8
# 5 2020-12-07 00:00:00 B 5 341 5
# 6 2020-12-08 00:00:00 B 8 333 8
# 7 2020-12-09 00:00:00 B 12 321 12
# 8 2020-12-10 00:00:00 B 5 316 5
# 9 2020-12-11 00:00:00 B 7 309 7
#10 2020-12-12 00:00:00 B 13 296 13
#11 2020-12-13 00:00:00 B 9 287 9
#12 2020-12-14 00:00:00 B 11 276 11
#13 2020-12-15 00:00:00 B 10 266 10
#14 2020-12-16 00:00:00 B 17 249 17
#15 2020-12-17 00:00:00 B 14 235 14
Or the image data
TD3 %>%
group_by(Product) %>%
group_by(grp = cumsum(lag(Col2 != 0, default = FALSE)), .add = TRUE) %>%
mutate(tmp = sum(Col1), Expected2 = case_when(any(Col2 == 0) &
(row_number() == n() & Col2 != 0) ~ tmp, TRUE ~ Col1)) %>%
ungroup %>%
select(-grp, -tmp)
-output
# A tibble: 21 x 4
# Product Col1 Col2 Expected2
# <chr> <dbl> <dbl> <dbl>
# 1 C 11 52 11
# 2 C 7 45 7
# 3 C 6 39 6
# 4 C 15 24 15
# 5 C 14 10 14
# 6 C 10 0 10
# 7 C 8 0 8
# 8 C 10 125 28
# 9 C 12 113 12
#10 C 11 102 11
# … with 11 more rows
Or using the OP's latest dput data
TD4 %>%
group_by(Product) %>%
group_by(grp = cumsum(lag(Col2 != 0, default = FALSE)), .add = TRUE) %>%
mutate(tmp = sum(Col1), Expected2 = case_when(any(Col2 == 0) &
(row_number() == n() & Col2 != 0) ~ tmp, TRUE ~ Col1)) %>%
ungroup %>%
select(-grp, -tmp) %>%
as.data.frame
-output
# Date Product Col1 Col2 Expected2
#1 2021-02-12 831 15 384 15
#2 2021-02-13 831 11 373 11
#3 2021-02-14 831 13 360 13
#4 2021-02-15 831 14 826 14
#5 2020-12-03 832 10 11 10
#6 2020-12-04 832 10 1 10
#7 2020-12-05 832 7 0 7
#8 2020-12-06 832 11 0 11
#9 2020-12-07 832 13 0 13
#10 2020-12-08 832 10 0 10
#11 2020-12-09 832 8 0 8
#12 2020-12-10 832 11 70 60
#13 2020-12-11 832 9 61 9
#14 2020-12-12 832 8 53 8
#15 2020-12-13 832 10 43 10
#16 2020-12-14 832 17 26 17
#17 2020-12-15 832 15 11 15
#18 2020-12-16 832 17 0 17
#19 2020-12-17 832 16 0 16
#20 2020-12-18 832 16 84 49
#21 2020-12-19 832 14 70 14
#22 2020-12-20 832 14 56 14
#23 2020-12-21 832 15 41 15
#24 2020-12-22 832 17 24 17
#25 2020-12-23 832 18 6 18
#26 2020-12-24 832 16 0 16
#27 2020-12-25 832 17 0 17
#28 2020-12-26 832 18 0 18
#29 2020-12-27 832 18 0 18
#30 2020-12-28 832 8 0 8
data
TD2 <- structure(list(Date = c("2020-12-03 00:00:00", "2020-12-04 00:00:00",
"2020-12-05 00:00:00", "2020-12-06 00:00:00", "2020-12-07 00:00:00",
"2020-12-08 00:00:00", "2020-12-09 00:00:00", "2020-12-10 00:00:00",
"2020-12-11 00:00:00", "2020-12-12 00:00:00", "2020-12-13 00:00:00",
"2020-12-14 00:00:00", "2020-12-15 00:00:00", "2020-12-16 00:00:00",
"2020-12-17 00:00:00"), Product = c("B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B"), Col1 = c(10L,
5L, 10L, 8L, 5L, 8L, 12L, 5L, 7L, 13L, 9L, 11L, 10L, 17L, 14L
), Col2 = c(206L, 364L, 354L, 346L, 341L, 333L, 321L, 316L, 309L,
296L, 287L, 276L, 266L, 249L, 235L), Expected2 = c(144L, 144L,
144L, 144L, 144L, 144L, 144L, 144L, 144L, 144L, 144L, 144L, 144L,
144L, 144L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"
))
TD3 <- structure(list(Product = c("C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C"), Col1 = c(11, 7, 6, 15, 14, 10, 8, 10, 12, 11, 10, 20, 20,
22, 19, 23, 21, 20, 26, 26, 27), Col2 = c(52, 45, 39, 24, 10,
0, 0, 125, 113, 102, 92, 72, 52, 30, 11, 138, 117, 97, 71, 45,
18)), class = "data.frame", row.names = c(NA, -21L))
Related
I have a data frame which is a time series data but multiple items, their data starts from different dates.
I want to figure out a way to dynamically take the data from the month in which the first data point is visible and ignore the 0 values in the start of it and perform outlier cleansing . This is because if I fix the the start of the time frame and end date of time frame, the results are wrong.
I was planning on using a for loop and perform an outlier identification but the issue is I need to find the start date and end date.
The start date is for the cases where there is 0 for at least 3M before we see the first data point and then select the date of the first data point as the start year and month. The end case is for cases when the value is 0 value for 3M after the last data point and the last data point will be selected as the End year and end month. For cases where there is no 0 at the start or end, we can fix the dates.
structure(list(`Row Labels` = c("2019-01-01", "2019-02-01", "2019-03-01",
"2019-04-01", "2019-05-01", "2019-06-01", "2019-07-01", "2019-08-01",
"2019-09-01", "2019-10-01", "2019-11-01", "2019-12-01", "2020-01-01",
"2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01",
"2020-07-01", "2020-08-01", "2020-09-01", "2020-10-01", "2020-11-01",
"2020-12-01", "2021-01-01", "2021-02-01", "2021-03-01", "2021-04-01",
"2021-05-01", "2021-06-01", "2021-07-01", "2021-08-01", "2021-09-01",
"2021-10-01", "2021-11-01", "2021-12-01", "2022-01-01", "2022-02-01",
"2022-03-01", "2022-04-01", "2022-05-01", "2022-06-01", "2022-07-01",
"2022-08-01", "2022-09-01", "2022-10-01"), `XYZ|146` = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 18, 16, 16, 17, 12, 22, 6,
7, 6, 0, 15, 0, 17, 17, 5, 19, 16, 7, 25, 19, 34, 26, 41, 50,
29, 42, 20, 14, 16, 27, 10, 28, 21), `XYZ|666` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 45,
9, 21, 33, 3, 8, 11, 16, 3, 17, 14, 59, 26, 35, 26, 15, 7, 4,
4, 2, 7, 6, 2), `XYZ|574` = c(0, 0, 0, 0, 0, 0, 0, 0, 74, 179,
464, 880, 324, 184, 90, 170, 140, 96, 78, 83, 83, 121, 245, 740,
332, 123, 117, 138, 20, 42, 70, 70, 42, 103, 490, 641, 488, 245,
142, 95, 63, 343, 57, 113, 100, 105), `XYZ|851` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 206, 1814, 2324, 772, 1116, 1636, 1906, 957,
829, 911, 786, 938, 1313, 2384, 1554, 1777, 1635, 1534, 1015,
827, 982, 685, 767, 511, 239, 1850, 1301, 426, 261, 201, 33,
0, 0, 0, 0, 0, 0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -46L))
So can someone help me identify a method to identify the start date and end dates dynamically in each columns.
The code I have written is.
library(readxl)
library(dplyr)
library(forecast)
Book2 <- read_excel("C:/X/X/X- Y/X/Book5.xlsx")
View(Book2)
dput(Book2)
Dates <- Book2 %>%
select(`Row Labels`)
for(i in 2:ncol(Book2))
{
Start_Year =
Start_Month =
End_Year =
End_Month =
Y <- ts(data = Book2[,i],
frequency = 12,
start = c(Start_Year,Start_Month),
end = c(End_Year,End_Month))
autoplot(tsclean(Y),series = "Clean", color = 'red', lwd=0.9) +
autolayer(Y,series = "original",color = "grey",lwd=1)
}
Can someone help me to figure out how to set out the Start_Year, Start_Month and End_Year, End_Month dynamically based on the above mentioned logic.
Is this doable or is it too difficult?
I believe that NAs are much easier to deal with (as far as auto-removal) than 0s, so let's do a rolling-window on the data to NA-ize where three or more are all 0s. (Also, since the rows to remove will vary between columns, there's no way to remove some rows from one column and keep them for another. This way, the frame never changes dims, so it retains its data.frame properties nicely.)
The biggest weakness with this is that it assumes that each row is a month; if you have gaps, you will need to adapt the width= argument to rollapply based on the time spans.
(There is no strict need to define this fun, you can use the rollapply directly as across(.., ~ zoo::rollapply(z, 3, ...)). The reason I defined fun was for terse code.)
fun <- function(z) zoo::rollapply(z, 3, align = "right", partial = TRUE, FUN = function(z) if (all(z %in% c(NA, 0))) z[length(z)][NA] else z[length(z)])
dplyr
library(dplyr)
quux %>%
mutate(across(-`Row Labels`, ~ fun(.))) %>%
print(n=99)
# # A tibble: 46 × 5
# `Row Labels` `XYZ|146` `XYZ|666` `XYZ|574` `XYZ|851`
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2019-01-01 NA NA NA NA
# 2 2019-02-01 NA NA NA NA
# 3 2019-03-01 NA NA NA NA
# 4 2019-04-01 NA NA NA NA
# 5 2019-05-01 NA NA NA NA
# 6 2019-06-01 NA NA NA NA
# 7 2019-07-01 NA NA NA NA
# 8 2019-08-01 NA NA NA NA
# 9 2019-09-01 NA NA 74 NA
# 10 2019-10-01 NA NA 179 206
# 11 2019-11-01 NA NA 464 1814
# 12 2019-12-01 NA NA 880 2324
# 13 2020-01-01 12 NA 324 772
# 14 2020-02-01 18 NA 184 1116
# 15 2020-03-01 16 NA 90 1636
# 16 2020-04-01 16 NA 170 1906
# 17 2020-05-01 17 NA 140 957
# 18 2020-06-01 12 NA 96 829
# 19 2020-07-01 22 NA 78 911
# 20 2020-08-01 6 NA 83 786
# 21 2020-09-01 7 NA 83 938
# 22 2020-10-01 6 NA 121 1313
# 23 2020-11-01 0 17 245 2384
# 24 2020-12-01 15 45 740 1554
# 25 2021-01-01 0 9 332 1777
# 26 2021-02-01 17 21 123 1635
# 27 2021-03-01 17 33 117 1534
# 28 2021-04-01 5 3 138 1015
# 29 2021-05-01 19 8 20 827
# 30 2021-06-01 16 11 42 982
# 31 2021-07-01 7 16 70 685
# 32 2021-08-01 25 3 70 767
# 33 2021-09-01 19 17 42 511
# 34 2021-10-01 34 14 103 239
# 35 2021-11-01 26 59 490 1850
# 36 2021-12-01 41 26 641 1301
# 37 2022-01-01 50 35 488 426
# 38 2022-02-01 29 26 245 261
# 39 2022-03-01 42 15 142 201
# 40 2022-04-01 20 7 95 33
# 41 2022-05-01 14 4 63 0
# 42 2022-06-01 16 4 343 0
# 43 2022-07-01 27 2 57 NA
# 44 2022-08-01 10 7 113 NA
# 45 2022-09-01 28 6 100 NA
# 46 2022-10-01 21 2 105 NA
base
quux[-1] <- lapply(quux[-1], fun)
quux
# # A tibble: 46 × 5
# `Row Labels` `XYZ|146` `XYZ|666` `XYZ|574` `XYZ|851`
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 2019-01-01 NA NA NA NA
# 2 2019-02-01 NA NA NA NA
# 3 2019-03-01 NA NA NA NA
# 4 2019-04-01 NA NA NA NA
# 5 2019-05-01 NA NA NA NA
# 6 2019-06-01 NA NA NA NA
# 7 2019-07-01 NA NA NA NA
# 8 2019-08-01 NA NA NA NA
# 9 2019-09-01 NA NA 74 NA
# 10 2019-10-01 NA NA 179 206
# # … with 36 more rows
(all output the same)
I have data on repeated measurements (5 or less) which include blood pressure measurements. I formatted the data in a long format, but seeing as I've done this for the first time, I now no longer see how to get descriptive statistics of my variables.
My example data:
questiondata <- structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3,
3, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 8, 8),
time = c("time1", "time2", "time3", "time5", "time1", "time2", "time3", "time5",
"time1", "time2", "time3", "time5", "time4", "time5", "time4", "time5",
"time4", "time5", "time4", "time5", "time4", "time5", "time4", "time5"),
cohort = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
systolicBP = c(102, 137, 132, 150, 152, 146, 160.5, 159.5, 144, 138, 137.5, 163,
137, 147, 125, 141, 147, 150, 148, 167.5, 153.5, 164.5, 159, 123),
diastolicBP = c(56, 99, 78, 90, 77, 78, 80.5, 82, 72, 70, 67.5, 61, 86, 90, 80.5,
84, 75, 81, 91, 96, 80, 87.5, 87, 79),
egfr = c(78.2, NA, 55.8, NA, NA, NA, 87.6, NA, NA, NA, 75.6, 70.9, 71.9, 71.8,
47.9, 36.6, 93.7, 81.5, 93.2, 82.1, 92.9, 79.1, 66.6, 55.2)),
row.names = c(NA, -24L), class = c("tbl_df", "tbl", "data.frame"))
Which corresponds to the following tibble:
# A tibble: 24 x 6
id time cohort systolicBP diastolicBP egfr
<dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 time1 1 102 56 78.2
2 1 time2 1 137 99 NA
3 1 time3 1 132 78 55.8
4 1 time5 1 150 90 NA
5 2 time1 1 152 77 NA
6 2 time2 1 146 78 NA
7 2 time3 1 160. 80.5 87.6
8 2 time5 1 160. 82 NA
9 3 time1 1 144 72 NA
10 3 time2 1 138 70 NA
11 3 time3 1 138. 67.5 75.6
12 3 time5 1 163 61 70.9
13 4 time4 2 137 86 71.9
14 4 time5 2 147 90 71.8
15 5 time4 2 125 80.5 47.9
16 5 time5 2 141 84 36.6
17 6 time4 2 147 75 93.7
18 6 time5 2 150 81 81.5
19 7 time4 2 148 91 93.2
20 7 time5 2 168. 96 82.1
21 8 time4 2 154. 80 92.9
22 8 time5 2 164. 87.5 79.1
23 9 time4 2 159 87 66.6
24 9 time5 2 123 79 55.2
So for example now I want to
Get how many persons are in each cohort
What is the mean of the systolicBP, in total, for each cohort, and at each timepoint.
I'm sure this is very easy, but I can't seem to get any viable results.
library(tidyverse)
questiondata %>%
group_by(cohort) %>%
summarise(n = n_distinct(id),
mean_systolic = mean(systolicBP, na.rm = TRUE), .groups = "drop")
#> # A tibble: 2 x 3
#> cohort n mean_systolic
#> <dbl> <int> <dbl>
#> 1 1 3 143.
#> 2 2 6 147.
questiondata %>%
group_by(cohort, time) %>%
summarise(mean_systolic_time = mean(systolicBP, na.rm = TRUE), .groups = "drop")
#> # A tibble: 6 x 3
#> cohort time mean_systolic_time
#> <dbl> <chr> <dbl>
#> 1 1 time1 133.
#> 2 1 time2 140.
#> 3 1 time3 143.
#> 4 1 time5 158.
#> 5 2 time4 145.
#> 6 2 time5 149.
Created on 2021-06-25 by the reprex package (v2.0.0)
questiondata %>%
group_by(cohort, time) %>%
summarise(mean_systolic_time = str_c(round(mean(systolicBP, na.rm = TRUE), digits = 0), " mmHg"),
.groups = "drop")
# A tibble: 6 x 3
cohort time mean_systolic_time
<dbl> <chr> <chr>
1 1 time1 133 mmHg
2 1 time2 140 mmHg
3 1 time3 143 mmHg
4 1 time5 158 mmHg
5 2 time4 145 mmHg
6 2 time5 149 mmHg
I have a large dataset of 70 000 rows that I want to perform some operations on, but I can't find an appropriate solution.
bib sta run course finish comment day
1 42 9 1 SG 19.88 99 1
2 42 17 2 A 19.96 11 1
3 42 27 3 B 20.92 22 1
4 42 39 4 A 19.60 11 1
5 42 48 5 SG 20.24 99 1
6 42 61 6 C 22.90 33 1
7 42 76 7 B 20.70 22 1
8 42 86 8 C 22.74 33 1
9 42 93 9 C 22.75 33 1
10 42 103 10 A 19.79 11 1
11 42 114 11 B 20.67 22 1
12 42 120 12 SG 20.10 99 1
I want to end up with a tibble that:
calculates the mean finish time in SG course for each bib number on one particular day. For example, 19.88 + 20.24 + 20.10 / 3
calculate a difference score for each observation in the dataset by subtracting finish from this mean SG score. For example, 19.88 - mean(SG), 19.96 - mean(SG).
I have tried the following approach:
First group by day, bib and course. Then filter by SG and calculate the mean:
avg.sgtime <- df %>%
group_by(day, bib, course) %>%
filter(course == 'SG') %>%
mutate(avg.sg = mean(finish))
Resulting in the following tibble
bib sta run course finish comment day avg.sg
<int> <int> <int> <chr> <dbl> <int> <chr> <dbl>
1 42 9 1 SG 19.9 99 1 20.1
2 42 48 5 SG 20.2 99 1 20.1
3 42 120 12 SG 20.1 99 1 20.1
4 42 6 1 SG 20.0 99 2 19.9
5 42 42 5 SG 19.8 77 2 19.9
6 42 130 15 SG 19.9 99 2 19.9
7 42 6 1 SG 20.6 99 3 20.5
8 42 68 12 SG 20.6 77 3 20.5
9 42 90 15 SG 20.4 77 3 20.5
Finally I join the two tibbles together using the following syntax:
df %>% full_join(avg.sgtime) %>%
mutate(diff = finish - avg.sg)
However, this doesn't work. It only works for the SG course but not for course A, B and C. Is there a way to fix this or is there a better solution to the problem?
bib sta run course finish comment day avg.sg diff
1 42 9 1 SG 19.88 99 1 20.07333 -0.193333333
2 42 17 2 A 19.96 11 1 NA NA
3 42 27 3 B 20.92 22 1 NA NA
4 42 39 4 A 19.60 11 1 NA NA
5 42 48 5 SG 20.24 99 1 20.07333 0.166666667
You can filter your values for finish within the mutate() and calculate the mean based on those:
df %>%
group_by(day,bib) %>%
mutate(
avg.sg = mean(finish[course=="SG"]),
diff = finish - avg.sg)
Is the following what you are aiming for?
(note that I added a few random values for a second bib just to make sure the join is done properly)
The difference to your attempt is using summarise() instead of mutate() to consolidate the avg.sgtime data frame, and also dropping a few columns so that the join is not populated with NAs. Instead of dropping you can also set the relevant columns to join by passing the by argument to the left_join() function.
library(dplyr)
library(tidyr) # for join
avg.sgtime <- df %>%
group_by(day, bib, course) %>%
filter(course == 'SG') %>%
summarise(avg.sg = mean(finish), .groups = "drop") %>%
select(c(bib, day, avg.sg))
avg.sgtime
#> # A tibble: 3 x 3
#> bib day avg.sg
#> <dbl> <dbl> <dbl>
#> 1 42 1 20.1
#> 2 43 1 19.1
#> 3 44 2 19.3
df %>% left_join(avg.sgtime) %>%
mutate(diff = finish - avg.sg)
#> Joining, by = c("bib", "day")
#> # A tibble: 36 x 9
#> bib sta run course finish comment day avg.sg diff
#> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 42 9 1 SG 19.9 99 1 20.1 -0.193
#> 2 42 17 2 A 20.0 11 1 20.1 -0.113
#> 3 42 27 3 B 20.9 22 1 20.1 0.847
#> 4 42 39 4 A 19.6 11 1 20.1 -0.473
#> 5 42 48 5 SG 20.2 99 1 20.1 0.167
#> 6 42 61 6 C 22.9 33 1 20.1 2.83
#> 7 42 76 7 B 20.7 22 1 20.1 0.627
#> 8 42 86 8 C 22.7 33 1 20.1 2.67
#> 9 42 93 9 C 22.8 33 1 20.1 2.68
#> 10 42 103 10 A 19.8 11 1 20.1 -0.283
#> # … with 26 more rows
Created on 2021-07-04 by the reprex package (v2.0.0)
data
df <- tribble(~bib, ~sta, ~run, ~course, ~finish, ~comment, ~day,
42, 9, 1, "SG", 19.88, 99, 1,
42, 17, 2, "A", 19.96, 11, 1,
42, 27, 3, "B", 20.92, 22, 1,
42, 39, 4, "A", 19.60, 11, 1,
42, 48, 5, "SG", 20.24, 99, 1,
42, 61, 6, "C", 22.90, 33, 1,
42, 76, 7, "B", 20.70, 22, 1,
42, 86, 8, "C", 22.74, 33, 1,
42, 93, 9, "C", 22.75, 33, 1,
42, 103, 10, "A", 19.79, 11, 1,
42, 114, 11, "B", 20.67, 22, 1,
42, 120, 12, "SG", 20.10, 99, 1,
43, 9, 1, "SG", 19.12, 99, 1,
43, 17, 2, "A", 19.64, 11, 1,
43, 27, 3, "B", 20.62, 22, 1,
43, 39, 4, "A", 19.23, 11, 1,
43, 48, 5, "SG", 20.11, 99, 1,
43, 61, 6, "C", 22.22, 33, 1,
43, 76, 7, "B", 20.33, 22, 1,
43, 86, 8, "C", 22.51, 33, 1,
43, 93, 9, "C", 22.78, 33, 1,
43, 103, 10, "A", 19.98, 11, 1,
43, 114, 11, "B", 20.11, 22, 1,
43, 120, 12, "SG", 18.21, 99, 1,
44, 9, 1, "SG", 19.18, 99, 2,
44, 17, 2, "A", 19.56, 11, 2,
44, 27, 3, "B", 20.62, 22, 2,
44, 39, 4, "A", 19.20, 11, 2,
44, 48, 5, "SG", 20.74, 99, 2,
44, 61, 6, "C", 22.50, 33, 2,
44, 76, 7, "B", 20.60, 22, 2,
44, 86, 8, "C", 22.74, 33, 2,
44, 93, 9, "C", 22.85, 33, 2,
44, 103, 10, "A", 19.59, 11, 2,
44, 114, 11, "B", 20.27, 22, 2,
44, 120, 12, "SG", 18.10, 99, 2,
)
Thanks #Marcelo Avila for providing me with a very good hint:
I hope this is what you are looking for:
library(dplyr)
df %>%
group_by(bib, day) %>%
mutate(across(finish, ~ mean(.x[course == "SG"]), .names = "avg_{.col}"),
diff = finish - avg_finish,
avg_finish = ifelse(course == "SG", avg_finish, NA))
# A tibble: 12 x 9
# Groups: bib, day [1]
bib sta run course finish comment day avg_finish diff
<int> <int> <int> <chr> <dbl> <int> <int> <dbl> <dbl>
1 42 9 1 SG 19.9 99 1 20.1 -0.193
2 42 17 2 A 20.0 11 1 NA -0.113
3 42 27 3 B 20.9 22 1 NA 0.847
4 42 39 4 A 19.6 11 1 NA -0.473
5 42 48 5 SG 20.2 99 1 20.1 0.167
6 42 61 6 C 22.9 33 1 NA 2.83
7 42 76 7 B 20.7 22 1 NA 0.627
8 42 86 8 C 22.7 33 1 NA 2.67
9 42 93 9 C 22.8 33 1 NA 2.68
10 42 103 10 A 19.8 11 1 NA -0.283
11 42 114 11 B 20.7 22 1 NA 0.597
12 42 120 12 SG 20.1 99 1 20.1 0.0267
I also added another alternative solution with a minor change, using dear #Marcelo Avila's data set:
df %>%
group_by(bib, day) %>%
mutate(across(finish, ~ mean(.x[select(cur_data(), course) == "SG"]), .names = "avg_{.col}"),
diff = finish - avg_finish,
avg_finish = ifelse(course == "SG", avg_finish, NA))
# A tibble: 36 x 9
# Groups: bib, day [3]
bib sta run course finish comment day avg_finish diff
<dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 42 9 1 SG 19.9 99 1 20.1 -0.193
2 42 17 2 A 20.0 11 1 NA -0.113
3 42 27 3 B 20.9 22 1 NA 0.847
4 42 39 4 A 19.6 11 1 NA -0.473
5 42 48 5 SG 20.2 99 1 20.1 0.167
6 42 61 6 C 22.9 33 1 NA 2.83
7 42 76 7 B 20.7 22 1 NA 0.627
8 42 86 8 C 22.7 33 1 NA 2.67
9 42 93 9 C 22.8 33 1 NA 2.68
10 42 103 10 A 19.8 11 1 NA -0.283
# ... with 26 more rows
I'm having trouble using the pivot_longer on blocks of variables. Suppose I have this:
and I want this:
dfwide <- structure(list(date = structure(c(1577836800, 1577923200, 1578009600,
1578096000, 1578182400, 1578268800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), x1_a = c(20, 15, 12, NA, 25, 27), x1_b = c(33,
44, 85, 10, 12, 3), x1_c = c(70, 20, 87, 11, 20, 5), x2_a = c(85,
65, 33, 46, 82, 9), x2_b = c(87, 25, 55, 64, 98, 5), x2_c = c(77,
51, 92, 20, 37, 98)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
##Tried:
dfwide %>%
pivot_longer(cols = -date,
names_sep = c("x1", "x2"),
names_to = c("a", "b", "c"),
values_to = "value")
This line is taking advantage of the name separation option of the pivot_longer function.
pivot_longer(dfwide, -date, names_sep = "_",
names_to=c("which", ".value")) %>%
arrange(which)
# A tibble: 12 x 5
date which a b c
<dttm> <chr> <dbl> <dbl> <dbl>
1 2020-01-01 00:00:00 x1 20 33 70
2 2020-01-02 00:00:00 x1 15 44 20
3 2020-01-03 00:00:00 x1 12 85 87
4 2020-01-04 00:00:00 x1 NA 10 11
5 2020-01-05 00:00:00 x1 25 12 20
6 2020-01-06 00:00:00 x1 27 3 5
7 2020-01-01 00:00:00 x2 85 87 77
8 2020-01-02 00:00:00 x2 65 25 51
9 2020-01-03 00:00:00 x2 33 55 92
10 2020-01-04 00:00:00 x2 46 64 20
11 2020-01-05 00:00:00 x2 82 98 37
12 2020-01-06 00:00:00 x2 9 5 98
You can try this code:
library(tidyverse)
dfwide %>%
pivot_longer(cols = -date,
names_to = "which",
values_to = "value") %>%
separate(which, into = c("which","letter"), sep = "_") %>%
pivot_wider(names_from = "letter", values_from = "value") %>%
arrange(which)
This is the result:
# A tibble: 12 x 5
date which a b c
<dttm> <chr> <dbl> <dbl> <dbl>
1 2020-01-01 00:00:00 x1 20 33 70
2 2020-01-02 00:00:00 x1 15 44 20
3 2020-01-03 00:00:00 x1 12 85 87
4 2020-01-04 00:00:00 x1 NA 10 11
5 2020-01-05 00:00:00 x1 25 12 20
6 2020-01-06 00:00:00 x1 27 3 5
7 2020-01-01 00:00:00 x2 85 87 77
8 2020-01-02 00:00:00 x2 65 25 51
9 2020-01-03 00:00:00 x2 33 55 92
10 2020-01-04 00:00:00 x2 46 64 20
11 2020-01-05 00:00:00 x2 82 98 37
12 2020-01-06 00:00:00 x2 9 5 98
If you're okay with doing it in more than one step, this works. First gather the columns, separate by underscore, and then spread the values.
pivot_longer(dfwide, x1_a:x2_c, names_to="which") %>%
extract(which, into=c("var", "letter"), regex="(.*)_(.*)") %>%
pivot_wider(names_from=letter, values_from=value)
I have a dataframe and have duplicated rows based on the difference between two dates: Date (date of collection of samples) and hatch (date of birth) using the following code. Difference (basically lifespan in months) ranges from 4-12 months:
library(zoo)
test$difference <- 12 * as.numeric(as.yearmon(test$Date) - as.yearmon(test$hatch))
test$difference <- ceiling(test$difference)
test2 <- test[rep(row.names(test), test$difference),]
I need to create a sequence for each specimen (each of which has a unique serial number, sn) starting from the hatching date, increasing in 1 month increments and with the length out equal to the value in the differences column. I tried a loop but I could not work out how to phrase it as I'm quite new to R.
At the moment the dates are in the format %Y-%m-%d but I'm only interested in the month and year groupings.
Any insights would be really useful :)
structure(list(Sex = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("F", "J", "M"), class = "factor"),
Maturity = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L), XLength = c(12, 12, 12, 12, 12, 12, 12,
12, 12, 16.5, 16.5, 16.5, 16.5, 16.5, 16.5), Weight = c(44.1,
44.1, 44.1, 44.1, 44.1, 44.1, 44.1, 44.1, 44.1, 73.6, 73.6,
73.6, 73.6, 73.6, 73.6), Ringcount = c(232L, 232L, 232L,
232L, 232L, 232L, 232L, 232L, 232L, 225L, 225L, 225L, 225L,
225L, 225L), Date = structure(c(10480, 10480, 10480, 10480,
10480, 10480, 10480, 10480, 10480, 10480, 10480, 10480, 10480,
10480, 10480), class = "Date"), hatch = structure(c(10248,
10248, 10248, 10248, 10248, 10248, 10248, 10248, 10248, 10255,
10255, 10255, 10255, 10255, 10255), class = "Date"), sn = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2), difference = c(9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9)), .Names = c("Sex",
"Maturity", "XLength", "Weight", "Ringcount", "Date", "hatch",
"sn", "difference"), row.names = c(NA, 15L), class = "data.frame")
1) Assuming you want to add a new column with the dates, using test we can write the following. It uses no packages.
Seq <- function(h) seq(h[1], length = length(h), by = "month")
transform(test, dates = ave(hatch, sn, FUN = Seq))
giving:
Sex Maturity XLength Weight Ringcount Date hatch sn difference dates
1 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-01-22
2 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-02-22
3 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-03-22
4 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-04-22
5 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-05-22
6 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-06-22
7 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-07-22
8 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-08-22
9 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 1998-09-22
10 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 1998-01-29
11 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 1998-03-01
12 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 1998-03-29
13 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 1998-04-29
14 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 1998-05-29
15 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 1998-06-29
2) Alternately if you just want year and month use yearmon class from zoo.
library(zoo)
Seq_ym <- function(h) h[1] + (seq_along(h) - 1) / 12
transform(test, dates = ave(as.yearmon(hatch), sn, FUN = Seq_ym))
giving:
Sex Maturity XLength Weight Ringcount Date hatch sn difference dates
1 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Jan 1998
2 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Feb 1998
3 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Mar 1998
4 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Apr 1998
5 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 May 1998
6 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Jun 1998
7 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Jul 1998
8 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Aug 1998
9 M 5 12.0 44.1 232 1998-09-11 1998-01-22 1 9 Sep 1998
10 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 Jan 1998
11 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 Feb 1998
12 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 Mar 1998
13 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 Apr 1998
14 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 May 1998
15 M 5 16.5 73.6 225 1998-09-11 1998-01-29 2 9 Jun 1998