Analyse multiple rows depending on the value of the first? - r

I have this table (the first column is the row number):
   record_id result date_start   date_end
1          1 pos    NA           NA
2          1 NA     26/06/2019   28/06/2019
3          1 NA     27/06/2019   29/06/2019
4          1 NA     28/06/2019   30/06/2019
5          1 NA     29/06/2019   01/07/2019
6          2 neg    NA           NA
7          2 NA     01/07/2019   03/07/2019
8          2 NA     02/07/2019   04/07/2019
9          2 NA     03/07/2019   05/07/2019
10         2 NA     04/07/2019   06/07/2019
11         2 NA     05/07/2019   07/07/2019
12         3 pos    NA           NA
13         3 NA     07/07/2019   09/07/2019
14         3 NA     08/07/2019   10/07/2019
I want to calculate the date difference for each row; no problem with that. What I want to do after that is analyse the "pos" and "neg" groups separately, but the result value is missing on the rows that carry the dates. The data are imported from REDCap, with repeating instruments.
I use the tidyverse, and I think dplyr could help. Is it a pivot_wider() I need? I have tried, without success.
Thanks if anyone can help.

Like this, e.g. to calculate the mean date difference per group?
library(tidyverse)
library(lubridate)

df %>%
  fill(result, .direction = "down") %>%
  filter(!is.na(date_start)) %>%
  mutate(date_start = dmy(date_start),
         date_end = dmy(date_end)) %>%
  group_by(result) %>%
  summarise(mean_date_dif = mean(date_end - date_start))
#`summarise()` ungrouping output (override with `.groups` argument)
## A tibble: 2 x 2
#  result mean_date_dif
#  <chr>  <drtn>
#1 neg    2 days
#2 pos    2 days
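If you then want to analyse the "pos" and "neg" groups separately rather than just summarise them, one option is to keep the filled data and split it with group_split() (a sketch building on the same pipeline; the intermediate name df_filled is mine):
library(tidyverse)
library(lubridate)

# carry each record's result down to its date rows, then compute the differences
df_filled <- df %>%
  fill(result, .direction = "down") %>%
  filter(!is.na(date_start)) %>%
  mutate(date_diff = dmy(date_end) - dmy(date_start))

# one tibble per result group, ready for separate analyses
df_filled %>% group_split(result)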
Data
df <- tibble::tribble(
  ~record_id, ~result, ~date_start,  ~date_end,
          1L,   "pos",           NA,           NA,
          1L,      NA, "26/06/2019", "28/06/2019",
          1L,      NA, "27/06/2019", "29/06/2019",
          1L,      NA, "28/06/2019", "30/06/2019",
          1L,      NA, "29/06/2019", "01/07/2019",
          2L,   "neg",           NA,           NA,
          2L,      NA, "01/07/2019", "03/07/2019",
          2L,      NA, "02/07/2019", "04/07/2019",
          2L,      NA, "03/07/2019", "05/07/2019",
          2L,      NA, "04/07/2019", "06/07/2019",
          2L,      NA, "05/07/2019", "07/07/2019",
          3L,   "pos",           NA,           NA,
          3L,      NA, "07/07/2019", "09/07/2019",
          3L,      NA, "08/07/2019", "10/07/2019"
)

Related

Collapsing data per family

I have this data set, with values for twins within families:
zyg    fid x_t1 x_t2 y_t1 y_t2
  1 499474   NA    1    1   NA
  1 499474   NA   NA   NA   NA
  1 499474   NA   NA   NA    1
  1 499474   NA   NA   NA   NA
  1 499540   NA   NA    1   NA
  1 499540   NA   NA   NA   NA
  2 499874   NA   NA   NA   NA
  2 499874   NA   NA    1   NA
  2 499874   NA   NA   NA    1
  2 499874    2   NA   NA    1
How do I collapse the families, retaining the phenotype information for x and y when these are present?
The expected output for family 499474 is:
zyg    fid x_t1 x_t2 y_t1 y_t2
  1 499474   NA    1    1    1
and for family 499874, it should be:
  2 499874    2   NA    1    1
You can use the following code:
library(dplyr)

df %>%
  group_by(fid) %>%
  summarise_all(~ first(na.omit(.)))
Output:
# A tibble: 3 × 6
     fid   zyg  x_t1  x_t2  y_t1  y_t2
   <int> <int> <int> <int> <int> <int>
1 499474     1    NA     1     1     1
2 499540     1    NA    NA     1    NA
3 499874     2     2    NA     1     1
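summarise_all() is superseded in current dplyr; an equivalent with across() would be (a sketch of the same idea):
library(dplyr)

df %>%
  group_by(fid) %>%
  summarise(across(everything(), ~ first(na.omit(.x))))
Note that first(na.omit(.x)) returns NA when a column is entirely NA within a group, because first() falls back to its default on an empty vector.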
Your data:
df<-structure(list(zyg = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), fid = c(499474L, 499474L, 499474L, 499474L, 499540L, 499540L,
499874L, 499874L, 499874L, 499874L), x_t1 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, 2L), x_t2 = c(1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA), y_t1 = c(1L, NA, NA, NA, 1L, NA, NA, 1L, NA, NA),
y_t2 = c(NA, NA, 1L, NA, NA, NA, NA, NA, 1L, 1L)), class = "data.frame", row.names = c(NA,
-10L))
If there is only one non-NA element per column per group:
library(dplyr)

df1 %>%
  group_by(zyg, fid) %>%
  summarise(across(everything(), ~ .x[complete.cases(.x)][1]), .groups = "drop")
Output:
# A tibble: 3 × 6
    zyg    fid  x_t1  x_t2  y_t1  y_t2
  <int>  <int> <int> <int> <int> <int>
1     1 499474    NA     1     1     1
2     1 499540    NA    NA     1    NA
3     2 499874     2    NA     1     1
Data
df1 <- structure(list(zyg = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), fid = c(499474L, 499474L, 499474L, 499474L, 499540L, 499540L,
499874L, 499874L, 499874L, 499874L), x_t1 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, 2L), x_t2 = c(1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA), y_t1 = c(1L, NA, NA, NA, 1L, NA, NA, 1L, NA, NA),
y_t2 = c(NA, NA, 1L, NA, NA, NA, NA, NA, 1L, 1L)),
class = "data.frame", row.names = c(NA,
-10L))
Another possible solution:
library(dplyr)

df %>%
  group_by(zyg, fid) %>%
  summarise(across(everything(),
                   ~ if (all(is.na(.x))) NA else max(.x, na.rm = TRUE)),
            .groups = "drop")
#> # A tibble: 3 × 6
#>     zyg    fid  x_t1  x_t2  y_t1  y_t2
#>   <int>  <int> <int> <int> <int> <int>
#> 1     1 499474    NA     1     1     1
#> 2     1 499540    NA    NA     1    NA
#> 3     2 499874     2    NA     1     1
Very similar to the other answers, but I wanted to give my own solution too.
df %>%
  group_by(zyg, fid) %>%
  summarise(across(everything(), ~ sum(., na.rm = TRUE)))
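One caveat: sum(., na.rm = TRUE) returns 0, not NA, for a column that is entirely missing within a group (e.g. x_t1 for families 499474 and 499540). A guarded variant (a sketch) keeps those as NA:
df %>%
  group_by(zyg, fid) %>%
  summarise(across(everything(),
                   ~ if (all(is.na(.x))) NA_integer_ else sum(.x, na.rm = TRUE)),
            .groups = "drop")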
You essentially want to apply, within each group, what coalesce() does across vectors: take the first non-missing value.
Here is how you could do it:
library(dplyr)

coalesce_by_column <- function(x) {
  # splice the group's values into coalesce(), returning the first non-NA
  dplyr::coalesce(!!!as.list(x))
}

df %>%
  group_by(fid) %>%
  summarise(across(everything(), coalesce_by_column))
     fid   zyg  x_t1  x_t2  y_t1  y_t2
   <int> <int> <int> <int> <int> <int>
1 499474     1    NA     1     1     1
2 499540     1    NA    NA     1    NA
3 499874     2     2    NA     1     1
Here is another possible option using fill and slice:
library(tidyverse)

df %>%
  group_by(zyg, fid) %>%
  fill(everything(), .direction = "downup") %>%
  slice(1)
Output
    zyg    fid  x_t1  x_t2  y_t1  y_t2
  <int>  <int> <int> <int> <int> <int>
1     1 499474    NA     1     1     1
2     1 499540    NA    NA     1    NA
3     2 499874     2    NA     1     1
Data
df <- structure(list(zyg = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), fid = c(499474L, 499474L, 499474L, 499474L, 499540L, 499540L,
499874L, 499874L, 499874L, 499874L), x_t1 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, 2L), x_t2 = c(1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA), y_t1 = c(1L, NA, NA, NA, 1L, NA, NA, 1L, NA, NA),
y_t2 = c(NA, NA, 1L, NA, NA, NA, NA, NA, 1L, 1L)), class = "data.frame", row.names = c(NA,
-10L))

Time series plot for missing data

I have some sequence event data for which I want to plot the trend of missingness in value across time. Example below:
   id     time value
1  aa122     1     1
2  aa2142    1     1
3  aa4341    1     1
4  bb132     1     2
5  bb2181    2     1
6  bb3242    2     3
7  bb3321    2    NA
8  cc122     2     1
9  cc2151    2     2
10 cc3241    3     1
11 dd161     3     3
12 dd2152    3    NA
13 dd3282    3    NA
14 ee162     3     1
15 ee2201    4     2
16 ee3331    4    NA
17 ff1102    4    NA
18 ff2141    4    NA
19 ff3232    5     1
20 gg142     5     3
21 gg2192    5    NA
22 gg3311    5    NA
23 gg4362    5    NA
24 ii111     5    NA
The NAs are supposed to increase over time (the behaviors are fading). How do I plot the NAs across time?
I think this is what you're looking for? You want to see how many NAs appear over time. Assuming this is correct, if each time is a group, then you can count the number of NAs that appear in each group.
data:
df <- structure(list(id = structure(1:24, .Label = c("aa122", "aa2142",
"aa4341", "bb132", "bb2181", "bb3242", "bb3321", "cc122", "cc2151",
"cc3241", "dd161", "dd2152", "dd3282", "ee162", "ee2201", "ee3331",
"ff1102", "ff2141", "ff3232", "gg142", "gg2192", "gg3311", "gg4362",
"ii111"), class = "factor"), time = c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 5L), value = c(1L, 1L, 1L, 2L, 1L, 3L, NA, 1L, 2L, 1L, 3L,
NA, NA, 1L, 2L, NA, NA, NA, 1L, 3L, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-24L))
library(tidyverse)  # dplyr and ggplot2 are loaded with the tidyverse

df %>%
  group_by(time) %>%
  summarise(sumNA = sum(is.na(value)))
# A tibble: 5 × 2
   time sumNA
  <int> <int>
1     1     0
2     2     1
3     3     2
4     4     3
5     5     4
You can then plot this using ggplot2:
df %>%
  group_by(time) %>%
  summarise(sumNA = sum(is.na(value))) %>%
  ggplot(aes(x = time)) +
  geom_line(aes(y = sumNA))
As you can see, as time increases, the number of NAs also increases.
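Since the number of observations per time point varies in this data (between four and six), it may also be worth plotting the proportion of missing values rather than the raw count; a sketch of the same pipeline:
df %>%
  group_by(time) %>%
  summarise(propNA = mean(is.na(value))) %>%  # share of NAs per time point
  ggplot(aes(x = time, y = propNA)) +
  geom_line()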

Dividing data into quintiles using group_by

I am looking for a way to change my code so that it sorts the data into quintiles instead of just the top 5 and bottom 5. My current code looks like this:
CombData <- CombData %>%
  group_by(Date) %>%
  mutate(
    R = min_rank(Value),
    E_P = case_when(
      R < 6 ~ "5w",
      R > max(R, na.rm = TRUE) - 5 ~ "5b",
      TRUE ~ NA_character_
    )
  ) %>%
  ungroup() %>%
  arrange(Date, E_P)
My dataset is quite large, therefore I will just provide sample data. The data I use is more complex, and the code should therefore allow for varying lengths of the column Date and also for multiple missing values (NAs):
df <- data.frame(Date = c(rep("2010-01-31", 16), rep("2010-02-28", 14)),
                 Value = c(rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA, NA, 15), 2)))
Afterwards, I would also like to test the minimum size of the quintiles, i.e. the minimum number of data points in each quintile across the entire dataset.
The expected output would look like this:
structure(list(Date = structure(c(14640, 14640, 14640, 14640,
14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640,
14640, 14640, 14640, 14668, 14668, 14668, 14668, 14668, 14668,
14668, 14668, 14668, 14668, 14668, 14668, 14668, 14668), class = "Date"),
Value = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA,
NA, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA, NA), R = c(1L,
1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, NA, NA, NA, NA,
NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, NA, NA, NA, NA, NA
), S_P = c("Worst", "Worst", "Worst", NA, NA, NA, NA, "Best",
"Best", "Best", NA, NA, NA, NA, NA, NA, "Worst", "Worst", NA, NA,
NA, NA, NA, "Best", "Best", NA, NA, NA, NA, NA)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Probably you could use something like this with quantile():
library(dplyr)

out <- CombData %>%
  group_by(Date) %>%
  mutate(S_P = case_when(
    Value <= quantile(Value, 0.2, na.rm = TRUE) ~ "Worst",
    Value >= quantile(Value, 0.8, na.rm = TRUE) ~ "Best"
  ))
You could change the value of quantile according to your preference.
To get the minimum number of "Best" and "Worst" values we can do:
out %>%
  count(Date, S_P) %>%
  na.omit() %>%
  ungroup() %>%
  select(-Date) %>%
  group_by(S_P) %>%
  top_n(-1, n)
#  S_P       n
#  <chr> <int>
#1 Best      2
#2 Worst     2
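If you want all five quintiles rather than only the extreme groups, dplyr::ntile() assigns each value to a bucket directly (NAs stay NA); a sketch, including the minimum quintile size you asked about:
library(dplyr)

out5 <- CombData %>%
  group_by(Date) %>%
  mutate(quintile = ntile(Value, 5)) %>%
  ungroup()

# smallest number of data points in any quintile, across all dates
out5 %>%
  filter(!is.na(quintile)) %>%
  count(Date, quintile) %>%
  summarise(min_n = min(n))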
If I understand you correctly, you want to rank your column Value and mark the ranks below the 20% quantile with "worst" and those above the 80% quantile with "best". After that you want a table.
You could use ave for both the ranking and the quantile identification. The quantile() cut points yield three groups, which you can identify with findInterval, code as a factor variable and label at will. I'm not sure, though, which ranks should be included in the quantiles; I therefore put the E_P coding in two separate columns for comparison purposes.
dat2 <- within(dat, {
  R <- ave(Value, Date, FUN = function(x) rank(x, na.last = "keep"))
  E_P <- ave(R, Date, FUN = function(x) {
    findInterval(x, quantile(R, c(.2, .8), na.rm = TRUE))
  })
  E_P.fac <- factor(E_P, labels = c("worst", NA, "best"))
})
dat2 <- dat2[order(dat2$Date, dat2$E_P), ]  ## order by date and E_P
Yields:
dat2
# Date Value E_P.fac E_P R
# 1 2010-01-31 1 worst 0 1.5
# 16 2010-01-31 1 worst 0 1.5
# 2 2010-01-31 2 <NA> 1 3.0
# 3 2010-01-31 3 <NA> 1 4.0
# 4 2010-01-31 4 <NA> 1 5.0
# 5 2010-01-31 5 <NA> 1 6.0
# 6 2010-01-31 6 <NA> 1 7.0
# 7 2010-01-31 7 <NA> 1 8.0
# 8 2010-01-31 8 best 2 9.0
# 9 2010-01-31 9 best 2 10.0
# 15 2010-01-31 15 best 2 11.0
# 10 2010-01-31 NA <NA> NA NA
# 11 2010-01-31 NA <NA> NA NA
# 12 2010-01-31 NA <NA> NA NA
# 13 2010-01-31 NA <NA> NA NA
# 14 2010-01-31 NA <NA> NA NA
# 17 2010-02-28 2 worst 0 1.0
# 18 2010-02-28 3 worst 0 2.0
# 19 2010-02-28 4 <NA> 1 3.0
# 20 2010-02-28 5 <NA> 1 4.0
# 21 2010-02-28 6 <NA> 1 5.0
# 22 2010-02-28 7 <NA> 1 6.0
# 23 2010-02-28 8 <NA> 1 7.0
# 24 2010-02-28 9 <NA> 1 8.0
# 30 2010-02-28 15 best 2 9.0
# 25 2010-02-28 NA <NA> NA NA
# 26 2010-02-28 NA <NA> NA NA
# 27 2010-02-28 NA <NA> NA NA
# 28 2010-02-28 NA <NA> NA NA
# 29 2010-02-28 NA <NA> NA NA
When I check the quantiles of the rank column R, it appears to be right.
quantile(dat2$R, c(.2, .8), na.rm=TRUE)
# 20% 80%
# 2.8 8.2
After that you could just make a table to get the numbers of each category.
with(dat2, table(Date, E_P.fac))
# E_P.fac
# Date worst <NA> best
# 2010-01-31 2 6 3
# 2010-02-28 2 6 1
Data
dat <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2010-01-31", "2010-02-28"
), class = "factor"), Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA,
NA, NA, NA, NA, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA,
NA, 15)), row.names = c(NA, -30L), class = "data.frame")

How do I use approx() inside mutate_at() with a conditional statement in dplyr?

I want to interpolate missing values using dplyr, piping, and approx().
Data:
test <- structure(list(site = structure(c(3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L), .Label = c("lake", "stream", "wetland"), class = "factor"),
depth = c(0L, -3L, -4L, -8L, 0L, -1L, -3L, -5L, 0L, -2L,
-4L, -6L), var1 = c(1L, NA, 3L, 4L, 1L, 2L, NA, 4L, 1L, NA,
NA, 4L), var2 = c(1L, NA, 3L, 4L, NA, NA, NA, NA, NA, 2L,
NA, NA)), .Names = c("site", "depth", "var1", "var2"), class = "data.frame", row.names = c(NA,
-12L))
This code works:
library(tidyverse)

# interpolate missing var1 values for each site using approx()
test_int <- test %>%
  group_by(site) %>%
  mutate_at(vars(var1),
            funs("i" = approx(depth, ., depth, rule = 1, method = "linear")[["y"]]))
But the code no longer works if it encounters a grouping (site & var) that doesn't have at least 2 non-NA values, e.g.,
# here I'm trying to interpolate missing values for var1 & var2
test_int2 <- test %>%
  group_by(site) %>%
  mutate_at(vars(c(var1, var2)),
            funs("i" = approx(depth, ., depth, rule = 1, method = "linear")[["y"]]))
R appropriately throws this error:
Error in mutate_impl(.data, dots) :
Evaluation error: need at least two non-NA values to interpolate.
How do I include a conditional statement or filter so that it only tries to interpolate cases where the site has at least 2 non-NA values and skips the rest or returns NA for those?
This will do what you are looking for...
test_int2 <- test %>%
  group_by(site) %>%
  mutate_at(vars(c(var1, var2)),
            funs("i" = if (sum(!is.na(.)) > 1)
                         approx(depth, ., depth, rule = 1, method = "linear")[["y"]]
                       else
                         NA))
test_int2
# A tibble: 12 x 6
# Groups:   site [3]
      site depth  var1  var2 var1_i var2_i
    <fctr> <int> <int> <int>  <dbl>  <dbl>
 1 wetland     0     1     1    1.0    1.0
 2 wetland    -3    NA    NA    2.5    2.5
 3 wetland    -4     3     3    3.0    3.0
 4 wetland    -8     4     4    4.0    4.0
 5    lake     0     1    NA    1.0     NA
 6    lake    -1     2    NA    2.0     NA
 7    lake    -3    NA    NA    3.0     NA
 8    lake    -5     4    NA    4.0     NA
 9  stream     0     1    NA    1.0     NA
10  stream    -2    NA     2    2.0     NA
11  stream    -4    NA    NA    3.0     NA
12  stream    -6     4    NA    4.0     NA
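funs() and mutate_at() have since been deprecated/superseded in dplyr; an equivalent of the same guard with across() would be (a sketch):
test_int2 <- test %>%
  group_by(site) %>%
  mutate(across(c(var1, var2),
                ~ if (sum(!is.na(.x)) > 1)
                    approx(depth, .x, depth, rule = 1, method = "linear")[["y"]]
                  else
                    NA_real_,
                .names = "{.col}_i")) %>%
  ungroup()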

Calculate weighted average in R dataframe

"f","index","values","lo.80","lo.95","hi.80","hi.95"
"auto.arima",2017-07-31 16:40:00,2.81613884762163,NA,NA,NA,NA
"auto.arima",2017-07-31 16:40:10,2.83441637197378,NA,NA,NA,NA
"auto.arima",2017-07-31 20:39:10,3.18497899649267,2.73259824384436,2.49312233904087,3.63735974914098,3.87683565394447
"auto.arima",2017-07-31 20:39:20,3.16981166809297,2.69309866988864,2.44074205235297,3.64652466629731,3.89888128383297
"ets",2017-07-31 16:40:00,2.93983529828936,NA,NA,NA,NA
"ets",2017-07-31 16:40:10,3.09739640066054,NA,NA,NA,NA
"ets",2017-07-31 20:39:10,3.1951571771414,2.80966705285567,2.60560090776504,3.58064730142714,3.78471344651776
"ets",2017-07-31 20:39:20,3.33876776870274,2.93593322313957,2.72268549604222,3.7416023142659,3.95485004136325
"bats",2017-07-31 16:40:00,2.82795253090081,NA,NA,NA,NA
"bats",2017-07-31 16:40:10,2.96389759682623,NA,NA,NA,NA
"bats",2017-07-31 20:39:10,3.1383560278272,2.76890864400062,2.573335012715,3.50780341165378,3.7033770429394
"bats",2017-07-31 20:39:20,3.3561357998535,2.98646195085452,2.79076843614824,3.72580964885248,3.92150316355876
I have a dataframe like the above, with the column names "f", "index", "values", "lo.80", "lo.95", "hi.80", "hi.95".
What I want to do is calculate the weighted average of the forecast results from the different models for each timestamp. What I mean is:
for every row in auto.arima there is a corresponding row in ets and bats with the same timestamp, so the weighted average should be calculated like this:
values_arima*1/3 + values_ets*1/3 + values_bats*1/3; similarly for lo.80 and the other columns.
The result should be stored in a new dataframe with all the weighted average values.
The new dataframe could look something like:
index (timestamp from the above dataframe), avg, avg_lo_80, avg_lo_95, avg_hi_80, avg_hi_95
I think I need to use spread() and mutate() to achieve this. Being new to R, I'm unable to proceed after forming this dataframe.
Please help.
The example you provide is not a weighted average but a simple average; what you want is a simple aggregate.
The first part below is your dataset as provided by dput() (better for sharing here):
d <- structure(list(f = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
2L, 2L, 2L, 2L), .Label = c("auto.arima", "bats", "ets"), class = "factor"),
index = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L), .Label = c("2017-07-31 16:40:00", "2017-07-31 16:40:10",
"2017-07-31 20:39:10", "2017-07-31 20:39:20"), class = "factor"),
values = c(2.81613884762163, 2.83441637197378, 3.18497899649267,
3.16981166809297, 2.93983529828936, 3.09739640066054, 3.1951571771414,
3.33876776870274, 2.82795253090081, 2.96389759682623, 3.1383560278272,
3.3561357998535), lo.80 = c(NA, NA, 2.73259824384436, 2.69309866988864,
NA, NA, 2.80966705285567, 2.93593322313957, NA, NA, 2.76890864400062,
2.98646195085452), lo.95 = c(NA, NA, 2.49312233904087, 2.44074205235297,
NA, NA, 2.60560090776504, 2.72268549604222, NA, NA, 2.573335012715,
2.79076843614824), hi.80 = c(NA, NA, 3.63735974914098, 3.64652466629731,
NA, NA, 3.58064730142714, 3.7416023142659, NA, NA, 3.50780341165378,
3.72580964885248), hi.95 = c(NA, NA, 3.87683565394447, 3.89888128383297,
NA, NA, 3.78471344651776, 3.95485004136325, NA, NA, 3.7033770429394,
3.92150316355876)), .Names = c("f", "index", "values", "lo.80",
"lo.95", "hi.80", "hi.95"), class = "data.frame", row.names = c(NA,
-12L))
> aggregate(d[,3:7], by = d["index"], FUN = mean)
index values lo.80 lo.95 hi.80 hi.95
1 2017-07-31 16:40:00 2.861309 NA NA NA NA
2 2017-07-31 16:40:10 2.965237 NA NA NA NA
3 2017-07-31 20:39:10 3.172831 2.770391 2.557353 3.575270 3.788309
4 2017-07-31 20:39:20 3.288238 2.871831 2.651399 3.704646 3.925078
You can save this output in an object and change the column names as you want.
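Since you mention the tidyverse, a dplyr equivalent of that aggregate() call would be (a sketch; NAs propagate through mean() exactly as above):
library(dplyr)

d %>%
  group_by(index) %>%
  summarise(across(values:hi.95, mean))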
If you really want a weighted average, this is a way to obtain it (here bats has a weight of 0.8 and the two others 0.1):
> d$weight <- d$f
> levels(d$weight)  # check the levels
[1] "auto.arima" "bats"       "ets"
> levels(d$weight) <- c(0.1, 0.8, 0.1)
> # transform the factor into numbers
> # warning: as.numeric(d$weight) alone would be wrong !!
> d$weight <- as.numeric(as.character(d$weight))
>
> # the result is saved in a data.frame called "result"
> result <- aggregate(d[,3:7] * d$weight, by = d["index"], FUN = sum)
> result
index values lo.80 lo.95 hi.80 hi.95
1 2017-07-31 16:40:00 2.837959 NA NA NA NA
2 2017-07-31 16:40:10 2.964299 NA NA NA NA
3 2017-07-31 20:39:10 3.148698 2.769353 2.568540 3.528043 3.728857
4 2017-07-31 20:39:20 3.335767 2.952073 2.748958 3.719460 3.922576
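The same weighted aggregate can also be written with weighted.mean() in a dplyr pipeline (a sketch, reusing the weight column defined above; because the weights sum to 1, weighted.mean() matches the sum of products):
library(dplyr)

d %>%
  group_by(index) %>%
  summarise(across(values:hi.95, ~ weighted.mean(.x, w = weight)))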
