I have sparse data which has a score taken at periodic intervals and a measurement taken at a more regular interval for multiple subjects, along with corresponding dates. I would like to generate date ranges based on the score dates for each subject ID, i.e. starting at a score date and ending at the next score date (or starting/ending at the subject's first/last observation if a score doesn't fall on those dates).
I would then like to average the measurement variable within these date ranges. The averaging step should be straightforward but I am stuck on generating the date ranges.
Below is a sample of the data and an example of how I would envision the resulting data
sample data:
structure(list(ID = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "B", "C", "C", "C", "D", "D", "D", "D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "D"), date = c("1/21/2020", "1/27/2020",
"2/1/2020", "2/3/2020", "2/5/2020", "2/6/2020", "2/8/2020", "2/9/2020",
"2/11/2020", "2/12/2020", "2/13/2020", "2/15/2020", "2/18/2020",
"2/20/2020", "2/21/2020", "2/22/2020", "2/25/2020", "2/1/2020",
"2/5/2020", "2/7/2020", "2/8/2020", "2/11/2020", "2/12/2020",
"1/30/2020", "2/10/2020", "2/11/2020", "2/6/2020", "2/7/2020",
"2/8/2020", "2/9/2020", "2/11/2020", "2/13/2020", "2/14/2020",
"2/16/2020", "2/17/2020", "2/20/2020", "2/23/2020", "2/26/2020",
"3/1/2020", "3/3/2020", "3/5/2020"), score = c(0.5, 2, NA, NA,
3, NA, NA, NA, NA, NA, 2.5, NA, NA, 1.5, NA, NA, NA, 3, NA, NA,
2.5, NA, 1, 0.5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14,
NA, NA, 11.5, NA, 9.5, NA), measure = c(0.394160734, 0.722462998,
0.82984815, 0.738432745, 0.321792398, 0.167492308, 0.218020898,
0.929210786, 0.686818585, 0.939678073, 0.708172942, 0.299863884,
0.48216267, 0.290307369, 0.801947902, 0.579418467, 0.78101844,
0.219494852, 0.875129822, 0.517971003, 0.475625007, 0.723003744,
0.257473477, 0.629818537, 0.817369151, 0.628573413, 0.364660834,
0.5971024, 0.002274261, 0.318937617, 0.983917106, 0.685933928,
0.487922831, 0.151769304, 0.392413694, 0.012429414, 0.149627658,
0.011724992, 0.536998203, 0.798399999, 0.763353822)), class = "data.frame", row.names = c(NA,
-41L))
answer data:
structure(list(ID = c("A", "A", "A"), startDate = c("1/21/2020",
"1/27/2020", "2/5/2020"), endDate = c("1/27/2020", "2/5/2020",
"2/13/2020"), score = c(0.5, 2, 3), measure = c(0.394160734,
0.763581298, 0.543835508)), class = "data.frame", row.names = c(NA,
-3L))
Here's a way with dplyr :
library(dplyr)
# Split each subject's rows into "score periods": cumsum(!is.na(score))
# increments at every row that carries a score, so every row up to (but not
# including) the next score shares one group id.
df %>%
group_by(ID, grp = cumsum(!is.na(score))) %>%
# One row per (ID, period): the period starts on its first date, carries the
# score recorded there, and averages `measure` over the whole period.
summarise(start_date = first(date),
score = first(score),
measure = mean(measure)) %>%
# After summarise() the result is still grouped by ID, so lead() looks at the
# next period of the SAME subject; the final period of each subject has no
# successor and falls back to its own start date via `default`.
mutate(end_date = lead(start_date, default = last(start_date))) %>%
select(-grp)
# ID start_date score measure end_date
# <chr> <chr> <dbl> <dbl> <chr>
# 1 A 1/21/2020 0.5 0.394 1/27/2020
# 2 A 1/27/2020 2 0.764 2/5/2020
# 3 A 2/5/2020 3 0.544 2/13/2020
# 4 A 2/13/2020 2.5 0.497 2/20/2020
# 5 A 2/20/2020 1.5 0.613 2/20/2020
# 6 B 2/1/2020 3 0.538 2/8/2020
# 7 B 2/8/2020 2.5 0.599 2/12/2020
# 8 B 2/12/2020 1 0.257 2/12/2020
# 9 C 1/30/2020 0.5 0.692 1/30/2020
#10 D 2/6/2020 NA 0.449 2/17/2020
#11 D 2/17/2020 14 0.185 2/26/2020
#12 D 2/26/2020 11.5 0.274 3/3/2020
#13 D 3/3/2020 9.5 0.781 3/3/2020
Using data.table
library(data.table)
# data.table equivalent: group rows into score periods (grp increments at each
# non-NA score) and aggregate one row per (ID, period).
setDT(df)[, .(start_date = first(date),
              score = first(score),
              measure = mean(measure)),
          by = .(ID, grp = cumsum(!is.na(score)))
          # end_date must be shifted WITHIN each ID: without `by = ID` the
          # last period of one subject would pick up the next subject's first
          # start_date instead of closing on its own start_date (which is what
          # the dplyr version above produces).
          ][, end_date := shift(start_date, type = 'lead', fill = last(start_date)), by = ID
          ][, grp := NULL][]
Related
I am trying to perform a calculation that is fairly simple in Excel. But I am having difficulty in figuring out a way to do the same in R.
This is the data:
structure(list(Industry = c("A ", "B", "C", "A ", "B", "C", "A ",
"B", "C", "A ", "B", "C"), Date = c("06-01-2022", "06-01-2022",
"06-01-2022", "07-01-2022", "07-01-2022", "07-01-2022", "08-01-2022",
"08-01-2022", "08-01-2022", "09-01-2022", "09-01-2022", "09-01-2022"
), Value = c(43496, 159927, 42428, 44895, 162891, 43091, NA,
NA, NA, NA, NA, NA), Growth = c(NA, NA, NA, NA, NA, NA, 0.05,
0.04, 0.03, 0.01, 0.02, 0.06)), row.names = c(NA, 12L), class = "data.frame")
I want to create the missing values, where the value in a given month for a group is calculated by multiplying the growth rate by the previous month's value. So for month 8, the value in month 7 is multiplied by the growth rate, and for month 9, the value of month 8 is multiplied by the growth rate in the Growth column.
Output should look like this:
structure(list(Industry = c("A ", "B", "C", "A ", "B", "C", "A ",
"B", "C", "A ", "B", "C"), Date = c("06-01-2022", "06-01-2022",
"06-01-2022", "07-01-2022", "07-01-2022", "07-01-2022", "08-01-2022",
"08-01-2022", "08-01-2022", "09-01-2022", "09-01-2022", "09-01-2022"
), Value = c(43496, 159927, 42428, 44895, 162891, 43091, 47139.75,
169406.64, 44383.73, 47611.1475, 172794.7728, 47046.7538), Growth = c(NA,
NA, NA, NA, NA, NA, 0.05, 0.04, 0.03, 0.01, 0.02, 0.06)), row.names = c(NA,
12L), class = "data.frame")
Is there any neat dplyr/data.table solution for this?
Data table answer is way better, I just don't understand the syntax of data.table.
# original data
df <- structure(list(Industry = c("A ", "B", "C", "A ", "B", "C", "A ",
"B", "C", "A ", "B", "C"), Date = c("06-01-2022", "06-01-2022",
"06-01-2022", "07-01-2022", "07-01-2022", "07-01-2022", "08-01-2022",
"08-01-2022", "08-01-2022", "09-01-2022", "09-01-2022", "09-01-2022"
), Value = c(43496, 159927, 42428, 44895, 162891, 43091, NA,
NA, NA, NA, NA, NA), Growth = c(NA, NA, NA, NA, NA, NA, 0.05,
0.04, 0.03, 0.01, 0.02, 0.06)), row.names = c(NA, 12L), class = "data.frame")
# basic cleaning of names, industry, arrangement
# Tidy the raw frame before filling: snake_case column names, strip the
# padded industry labels ("A " -> "A"), order rows so lag() sees the previous
# month, and group by industry for the downstream fill.
df <- df %>%
  janitor::clean_names() %>%
  mutate(industry = trimws(industry)) %>%
  arrange(industry, date) %>%
  group_by(industry)
# create a custom function
# Fill NA `value` rows by compounding the previous month's value with `growth`.
# Expects a data frame grouped by `industry`, sorted by date within industry,
# with columns `value` and `growth`. Returns the frame with `value` filled and
# `growth` rewritten as a multiplier (1 where it was NA, 1 + growth otherwise).
replace_na_growth <- function(df){
  # Fail fast instead of merely warning: an ungrouped frame would let lag()
  # leak values across industries and silently produce wrong results.
  if (!dplyr::is_grouped_df(df)) {
    stop("`df` must be a grouped data frame (group_by(industry))", call. = FALSE)
  }
  # Number of fill passes needed: the largest cumulative count of NA values in
  # any group bounds the longest consecutive-NA run. max() yields a single
  # scalar (the original unique()-based version could return a vector, and
  # `1:count_na` would then silently use only its first element).
  count_na <- max(df %>%
                    mutate(n = cumsum(is.na(value))) %>%
                    pull(n))
  # Convert growth to a multiplier: 1 (no change) where growth is NA.
  df <- df %>%
    mutate(growth = ifelse(is.na(growth), 1, 1 + growth))
  # Each pass resolves one more element of a consecutive-NA run:
  # value[i] <- value[i - 1] * growth[i]. seq_len() also guards the
  # zero-NA case, where 1:0 would run the loop backwards.
  for (i in seq_len(count_na)) {
    df <- df %>%
      group_by(industry) %>%
      mutate(value = ifelse(is.na(value), lag(value, 1) * growth, value))
  }
  df
}
# Run the fill on the cleaned, grouped frame.
replace_na_growth(df)
Here is a two-step data.table approach in which we first carry the Growth value backward in time (for convenience) and then make use of Reduce() setting accumulate = TRUE to carry the Value column forward in time:
library(data.table)
setDT(dat)
## Step 1: back-fill Growth on the observed months (where Growth is NA) with
## the realised month-over-month rate, Value / lag(Value) - 1; the first row
## of each industry uses fill = Value[1], i.e. a growth of 0.
dat[is.na(Growth), Growth := Value / shift(Value, type = "lag", fill = Value[1]) - 1, by = "Industry"]
## Step 2: rebuild Value as a cumulative product: starting from Value[1],
## each step multiplies the running value by (1 + Growth). accumulate = TRUE
## keeps every intermediate result and [-1] drops the extra leading `init`
## element so lengths line up with the group.
dat[, Value := Reduce(\(val, gro) (1 + gro) * val, x = Growth, init = Value[1], accumulate = TRUE)[-1], by = "Industry"]
#> Industry Date Value Growth
#> 1: A 06-01-2022 43496.00 0.00000000
#> 2: B 06-01-2022 159927.00 0.00000000
#> 3: C 06-01-2022 42428.00 0.00000000
#> 4: A 07-01-2022 44895.00 0.03216388
#> 5: B 07-01-2022 162891.00 0.01853346
#> 6: C 07-01-2022 43091.00 0.01562647
#> 7: A 08-01-2022 47139.75 0.05000000
#> 8: B 08-01-2022 169406.64 0.04000000
#> 9: C 08-01-2022 44383.73 0.03000000
#> 10: A 09-01-2022 47611.15 0.01000000
#> 11: B 09-01-2022 172794.77 0.02000000
#> 12: C 09-01-2022 47046.75 0.06000000
Data
dat <- structure(list(Industry = c("A ", "B", "C", "A ", "B", "C", "A ",
"B", "C", "A ", "B", "C"), Date = c("06-01-2022", "06-01-2022",
"06-01-2022", "07-01-2022", "07-01-2022", "07-01-2022", "08-01-2022",
"08-01-2022", "08-01-2022", "09-01-2022", "09-01-2022", "09-01-2022"
), Value = c(43496, 159927, 42428, 44895, 162891, 43091, NA,
NA, NA, NA, NA, NA), Growth = c(NA, NA, NA, NA, NA, NA, 0.05,
0.04, 0.03, 0.01, 0.02, 0.06)), row.names = c(NA, 12L), class = "data.frame")
This question already has answers here:
How to combine multiple conditions to subset a data-frame using "OR"?
(5 answers)
Closed 1 year ago.
I have a data set that includes a name, a date, and an earliest_date, in which some names have an earliest_date. Now I want to remove all the data after the earliest_date, based on name, and ignore the rows that have NA in earliest_date. And since different names have different earliest_date values, I am pretty sure I can't use filter() with a fixed date. Any help will be much appreciated.
Part of the data is below:
dput(mydata[1:10,])
structure(list(name = c("a", "b", "c",
"d", "e", "f", "g",
"a", "h", "i"), Date = structure(c(13214,
17634, 15290, 18046, 16326, 18068, 10234, 12647, 15485, 15182
), class = "Date"), earliest_date = structure(c(12647, NA, NA,
NA, NA, NA, NA, 12647, NA, 15552), class = "Date")), row.names = c(NA,
10L), class = "data.frame")
Desired output:
The first row will be removed as the Date recorded after earliest_date
dput(mydata[2:10,])
structure(list(name = c("b", "c",
"d", "e", "f", "g",
"a", "h", "i"), Date = structure(c(17634, 15290,
18046, 16326, 18068, 10234, 12647, 15485, 15182), class = "Date"),
earliest_date = structure(c(NA, NA, NA, NA, NA, NA, 12647,
NA, 15552), class = "Date")), row.names = 2:10, class = "data.frame")
This may help:
# Keep rows that either have no cutoff (earliest_date is NA) or were
# recorded on or before their cutoff.
filter(mydata, is.na(earliest_date) | Date <= earliest_date)
name Date earliest_date
1 b 2018-04-13 <NA>
2 c 2011-11-12 <NA>
3 d 2019-05-30 <NA>
4 e 2014-09-13 <NA>
5 f 2019-06-21 <NA>
6 g 1998-01-08 <NA>
7 a 2004-08-17 2004-08-17
8 h 2012-05-25 <NA>
9 i 2011-07-27 2012-07-31
Or try:
library(data.table)
# Inside a data.table's `[ ]`, bare column names are resolved within the
# table itself, so the `mydata$` prefixes were redundant (and mixing them
# with bare names like `earliest_date` was inconsistent); the trailing
# comma is not needed for a row filter.
setDT(mydata)[is.na(earliest_date) | Date <= earliest_date]
Is there an elegant/tidy way to fill in the data if there are non-null values to the right? I have a wonky work-around but wanted to know if there was a nice dplyr way to do this.
actual <-
tibble(
a = c("A", NA, NA, NA, NA, NA, NA, "B", NA, NA, NA),
b = c(NA, "A", NA, NA, NA, "C", NA, NA, "E", NA, NA),
c = c(NA, NA, "B", NA, NA, NA, "D", NA, NA, "F", "G"),
d = c(NA, NA, NA, "C", "D", NA, NA, NA, NA, NA, NA)
)
desired <-
tibble(
w = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B"),
x = c(NA, "A", "A", "A", "A", "C", "C", NA, "E", "E", "E"),
y = c(NA, NA, "B", "B", "B", NA, "D", NA, NA, "F", "G"),
z = c(NA, NA, NA, "C", "D", NA, NA, NA, NA, NA, NA)
)
We can use fill from tidyr together with dplyr like the following.
library(dplyr)
library(tidyr)
# Cascade of downward fills, left column to right. Each fill() after the
# first is scoped by grouping on the previously-filled column, so a value
# only propagates into rows that share the same value in the column to its
# left. Note each group_by() REPLACES the previous grouping (no .add).
dat <- actual %>%
fill(a) %>%          # a: plain downward fill over the whole frame
group_by(a) %>%
fill(b) %>%          # b: filled only among rows with the same a
group_by(b) %>%
fill(c) %>%          # c: filled only among rows with the same b
group_by(c) %>%
fill(d) %>%          # d: filled only among rows with the same c
ungroup()
print(dat)
# # A tibble: 11 x 4
# a b c d
# <chr> <chr> <chr> <chr>
# 1 A NA NA NA
# 2 A A NA NA
# 3 A A B NA
# 4 A A B C
# 5 A A B D
# 6 A C NA NA
# 7 A C D NA
# 8 B NA NA NA
# 9 B E NA NA
# 10 B E F NA
# 11 B E G NA
I spread a column using pivot_wider so I could compare two groups (var1 vs var2) using an xy plot. But I can't compare them because there is a corresponding NA in the column.
Here is an example dataframe:
df <- data.frame(group = c("a", "a", "b", "b", "c", "c"), var1 = c(3, NA, 1, NA, 2, NA),
var2 = c(NA, 2, NA, 4, NA, 8))
I would like it to look like:
df2 <- data.frame(group = c("a", "b", "c"), var1 = c(3, 1, 2),
var2 = c( 2, 4, 8))
You can use summarize. But this treats the symptom not the cause. You may have a column in id_cols which is one-to-one with your variable in values_from.
library(dplyr)
# Collapse the two complementary rows per group into one. sum() with
# na.rm = TRUE ignores the NA and returns the single present value.
# summarize_all() is superseded; across() inside summarise() is the current
# idiom, and na.rm is spelled TRUE rather than the reassignable shorthand T.
df %>%
  group_by(group) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))
# A tibble: 3 x 3
group var1 var2
<fct> <dbl> <dbl>
1 a 3 2
2 b 1 4
3 c 2 8
This solution is a bit more robust, with a slightly more general data.frame to begin with:
# A slightly more general example: several value columns, possibly more than
# one non-NA entry per group and column.
df <- data.frame(col_1 = c("A", "A", "A", "A", "A", "A", "B", "B", "B"),
                 col_2 = c(1, 3, NA, NA, NA, NA, 4, NA, NA),
                 col_3 = c(NA, NA, 2, 5, NA, NA, NA, 5, NA),
                 col_4 = c(NA, NA, NA, NA, 5, 6, NA, NA, 7))

# discard() drops the NA entries per group and column. Because a group can
# keep more than one value, the result has multiple rows per group;
# summarise() returning multiple rows is deprecated (and summarise_all() is
# superseded), so the modern spelling uses reframe() + across().
df %>% dplyr::group_by(col_1) %>%
  dplyr::reframe(dplyr::across(dplyr::everything(), \(x) purrr::discard(x, is.na)))
Here is a way to do it, assuming you only have two rows per group and one row with NA:
library(dplyr)
# One row per group: with a single non-NA entry per column, max() over the
# non-NA values simply returns that entry.
df %>%
  group_by(group) %>%
  summarise(across(c(var1, var2), \(v) max(v, na.rm = TRUE)))
The na.rm=TRUE drops the NAs, so max() is computed over the single remaining value (the one that is not NA).
I have paired data for 10 subjects (with some missing and some ties). My goal is to select the eye with the best disc_grade (A > B > C) and label ties accordingly from the data frame below.
I'm stuck on how to use R code to select the rows with the best disc_grade for each subject.
df <- structure(list(patientID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10), eye = c("R", "L", "R", "L", "R",
"L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
"R", "L"), disc_grade = c(NA, "B", "C", "B", "B", "C", "B", "C",
"B", "A", "B", "B", "C", "B", NA, NA, "B", "C", "B", "C")), .Names = c("patientID", "eye", "disc_grade"), class = c("tbl_df", "data.frame"), row.names = c(NA, -20L))
The desired output is:
patientID eye disc_grade
2 1 L B
4 2 L B
5 3 R B
7 4 R B
10 5 L A
11 6 Tie B
14 7 L B
17 9 R B
19 10 R B
This seems to work:
# Character min() orders alphabetically, so "A" < "B" < "C" matches the grade
# ranking. For patient 8 (all grades NA) min(..., na.rm=TRUE) warns and
# returns NA; the `==` comparison against NA is NA, so filter() drops those
# rows — which is why group 8 vanishes despite the warning.
df %>%
group_by(patientID) %>%
filter(disc_grade == min(disc_grade, na.rm=TRUE)) %>%
summarise(eye = if (n()==1) eye else "Tie", disc_grade = first(disc_grade))
patientID eye disc_grade
(dbl) (chr) (chr)
1 1 L B
2 2 L B
3 3 R B
4 4 R B
5 5 L A
6 6 Tie B
7 7 L B
8 9 R B
9 10 R B
There is a warning for group 8, but we get the desired result thanks to how filter works on NAs.
With data.table:
# data.table version: within each patientID, keep the rows whose grade equals
# the group minimum ("A" < "B" < "C" alphabetically), then collapse to one
# row — "Tie" when both eyes share the best grade. Unlike dplyr::filter, `[`
# does NOT drop NA comparisons, so the all-NA group still yields a row here.
setDT(df)[,
.SD[ disc_grade == min(disc_grade, na.rm=TRUE) ][,
.( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
]
, by=patientID]
Again, there's a warning, but now we do get a row for group 8, since [ does not ignore NAs. To get around this, you could filter the NAs before or after the operation (as in other answers). My best idea for doing it during the main operation is pretty convoluted:
# Variant that also drops the all-NA group during the main operation: which()
# discards NA comparison results (leaving zero rows for patient 8), and the
# `if (.N >= 1)` guard returns NULL for empty groups, which removes them from
# the grouped result.
setDT(df)[,
.SD[ which(disc_grade == min(disc_grade, na.rm=TRUE)) ][,
if (.N >= 1) list( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
]
, by=patientID]
One option with data.table
library(data.table)
# Drop NA grades up front, label a patient "Tie" when both remaining eyes
# share one grade, then order by grade rank (A best) and keep the first row
# per patient before restoring patient order.
# NOTE(review): the condition uses elementwise `&` where scalar `&&` is
# conventional inside `if` — harmless here since both sides are length 1.
na.omit(setDT(df))[, eye:=if(uniqueN(disc_grade)==1 &
.N >1) 'Tie' else eye, patientID
][order(factor(disc_grade, levels=c('A', 'B', 'C'))),
.SD[1L] ,patientID][order(patientID)]
# patientID eye disc_grade
#1: 1 L B
#2: 2 L B
#3: 3 R B
#4: 4 R B
#5: 5 L A
#6: 6 Tie B
#7: 7 L B
#8: 9 R B
#9: 10 R B
library(dplyr)
df <- structure(list(patientID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10), eye = c("R", "L", "R", "L", "R",
"L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
"R", "L"), disc_grade = c(NA, "B", "C", "B", "B", "C", "B", "C",
"B", "A", "B", "B", "C", "B", NA, NA, "B", "C", "B", "C")), .Names = c("patientID", "eye", "disc_grade"), class = c("tbl_df", "data.frame"), row.names = c(NA, -20L))
df %>%
filter(!is.na(disc_grade)) %>% ## drop ungraded eyes (this removes patient 8 entirely)
group_by(patientID) %>% ## work within each patient
filter(disc_grade == min(disc_grade)) %>% ## keep the eye(s) with the best grade ("A" < "B" < "C" alphabetically)
mutate(eye_upd = ifelse(n() > 1, "tie", eye)) %>% ## both eyes kept => same grade => a tie
select(patientID,eye_upd,disc_grade) %>%
distinct() ## collapse the two identical rows a tie produces into one
# patientID eye_upd disc_grade
# (dbl) (chr) (fctr)
# 1 1 L B
# 2 2 L B
# 3 3 R B
# 4 4 R B
# 5 5 L A
# 6 6 tie B
# 7 7 L B
# 8 9 R B
# 9 10 R B
There's certainly a better way to do this, but this gets the job done...need more coffee...
df_orig <- df
library(dplyr)
# Find each patient's best grade, join back to df_orig to recover which eye
# carries it, then collapse ties. Character min() works because "A" < "B" <
# "C" alphabetically.
df %>%
  filter(!is.na(disc_grade)) %>%
  group_by(patientID) %>%
  summarise(best = min(disc_grade)) %>%
  left_join(., df_orig, by = c("patientID" = "patientID",
                               "best" = "disc_grade")) %>%
  group_by(patientID) %>%
  mutate(eye = ifelse(n() > 1, "tie", eye)) %>%
  # .keep_all = TRUE is required: since dplyr 0.5, distinct(patientID) keeps
  # ONLY the patientID column, so the following select() would fail.
  distinct(patientID, .keep_all = TRUE) %>%
  select(patientID, eye, best)
Note: I am able to get away with min(disc_grade) because of type conversion. Consider looking at as.numeric(as.factor(df$disc_grade)).