Fill subset of rows with values from row above - r

I have a long format dataset with longitudinal data and for one variable I want to fill in the missings in timepoint 0 with the values in timepoint 1, but I do not want to fill in the missings from timepoint 1 with values from timepoint 2 and so on.
My dataset is ordered by id and timepoint.
I have used the fill function successfully in cases where I just needed to fill missings from all timepoints for a specific id.
Example dataframe:
df <- data.frame(id=c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4),
timepoint=c(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3),
var1=c(NA,9,8,10, NA, 10, NA, 12, NA, NA, 12, 11, NA, 12, 12, NA))
> df
id timepoint var1
1 1 0 NA
2 1 1 9
3 1 2 8
4 1 3 10
5 2 0 NA
6 2 1 10
7 2 2 NA
8 2 3 12
9 3 0 NA
10 3 1 NA
11 3 2 12
12 3 3 11
13 4 0 NA
14 4 1 12
15 4 2 12
16 4 3 NA
This is what works when I just need to fill any missing no matter the timepoint:
library(dplyr)
library(tidyr)
df <- df %>%
group_by(id) %>%
fill(`var9`:`var12`, .direction = "up") %>%
as.data.frame
But now I have trouble specifying to only fill in the missings in rows at timepoint 0. Any help is appreciated.
My expected output:
> df
id timepoint var1
1 1 0 9
2 1 1 9
3 1 2 8
4 1 3 10
5 2 0 10
6 2 1 10
7 2 2 NA
8 2 3 12
9 3 0 NA
10 3 1 NA
11 3 2 12
12 3 3 11
13 4 0 12
14 4 1 12
15 4 2 12
16 4 3 NA

This might be an oversimplification, but you can just call the fill function again, but this time with direction down. Then your entire data frame will be complete.
df <- data.frame(id=c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4),
timepoint=c(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3),
var1=c(NA,9,8,10, NA, 10, NA, 12, NA, NA, 12, 11, NA, 12, 12, NA))
In this case I will use an ifelse statement together with the lead function.
library(dplyr); library(tidyr);
df %>%
group_by(id) %>%
mutate(var1 = ifelse(is.na(var1) & timepoint == 0,
lead(var1, 1), var1))
Yields:
# A tibble: 16 x 3
# Groups: id [4]
id timepoint var1
<dbl> <dbl> <dbl>
1 1 0 9
2 1 1 9
3 1 2 8
4 1 3 10
5 2 0 10
6 2 1 10
7 2 2 NA
8 2 3 12
9 3 0 NA
10 3 1 NA
11 3 2 12
12 3 3 11
13 4 0 12
14 4 1 12
15 4 2 12
16 4 3 NA

We can group_by id and use replace to change the values where timepoint == 0 and var1 is NA to the corresponding value of var1 where timepoint == 1 in each group.
library(dplyr)
df %>%
group_by(id) %>%
mutate(var2 = replace(var1, timepoint == 0 & is.na(var1), var1[timepoint == 1]))
# id timepoint var1 var2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 0 NA 9
# 2 1 1 9 9
# 3 1 2 8 8
# 4 1 3 10 10
# 5 2 0 NA 10
# 6 2 1 10 10
# 7 2 2 NA NA
# 8 2 3 12 12
# 9 3 0 NA NA
#10 3 1 NA NA
#11 3 2 12 12
#12 3 3 11 11
#13 4 0 NA 12
#14 4 1 12 12
#15 4 2 12 12
#16 4 3 NA NA

Related

Creating an indexed column in R, grouped by user_id, and not increase when NA

I want to create a column (in R) that indexes the presence of a number in another column grouped by a user_id column. And when the other column is NA, the new desired column should not increase.
The example should bring clarity.
I have this df:
data <- data.frame(user_id = c(1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
one=c(1,NA,3,2,NA,0,NA,4,3,4,NA))
user_id tobeindexed
1 1 1
2 1 NA
3 1 3
4 2 2
5 2 NA
6 2 0
7 2 NA
8 3 4
9 3 3
10 3 4
11 3 NA
I want to make a new column looking like "desired" in the following df:
> cbind(data,data.frame(desired = c(1,1,2,1,1,2,2,1,2,3,3)))
user_id tobeindexed desired
1 1 1 1
2 1 NA 1
3 1 3 2
4 2 2 1
5 2 NA 1
6 2 0 2
7 2 NA 2
8 3 4 1
9 3 3 2
10 3 4 3
11 3 NA 3
How can I solve this?
Using cumsum and group_by gets me close, but the count does not start over from 1 when the user_id changes...
> data %>% group_by(user_id) %>% mutate(desired = cumsum(!is.na(tobeindexed)))
user_id tobeindexed desired
<dbl> <dbl> <int>
1 1 1 1
2 1 NA 1
3 1 3 2
4 2 2 3
5 2 NA 3
6 2 0 4
7 2 NA 4
8 3 4 5
9 3 3 6
10 3 4 7
11 3 NA 7
Given the sample data you provided (with the `one` column), this works unchanged. The code is retained below for demonstration.
base R
data$out <- ave(data$one, data$user_id, FUN = function(z) cumsum(!is.na(z)))
data
# user_id one out
# 1 1 1 1
# 2 1 NA 1
# 3 1 3 2
# 4 2 2 1
# 5 2 NA 1
# 6 2 0 2
# 7 2 NA 2
# 8 3 4 1
# 9 3 3 2
# 10 3 4 3
# 11 3 NA 3
dplyr
library(dplyr)
data %>%
group_by(user_id) %>%
mutate(out = cumsum(!is.na(one))) %>%
ungroup()
# # A tibble: 11 × 3
# user_id one out
# <dbl> <dbl> <int>
# 1 1 1 1
# 2 1 NA 1
# 3 1 3 2
# 4 2 2 1
# 5 2 NA 1
# 6 2 0 2
# 7 2 NA 2
# 8 3 4 1
# 9 3 3 2
# 10 3 4 3
# 11 3 NA 3

Fill missing values (NA) before the first non-NA value by group

I have a data frame grouped by 'id' and a variable 'age' which contains missing values, NA.
Within each 'id', I want to replace missing values of 'age', but only "fill up" before the first non-NA value.
data <- data.frame(id=c(1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3),
age=c(NA,6,NA,8,NA,NA,NA,NA,3,8,NA,NA,NA,7,NA,9))
id age
1 1 NA
2 1 6 # first non-NA in id = 1. Fill up from here
3 1 NA
4 1 8
5 1 NA
6 1 NA
7 2 NA
8 2 NA
9 2 3 # first non-NA in id = 2. Fill up from here
10 2 8
11 2 NA
12 3 NA
13 3 NA
14 3 7 # first non-NA in id = 3. Fill up from here
15 3 NA
16 3 9
Expected output:
1 1 6
2 1 6
3 1 NA
4 1 8
5 1 NA
6 1 NA
7 2 3
8 2 3
9 2 3
10 2 8
11 2 NA
12 3 7
13 3 7
14 3 7
15 3 NA
16 3 9
I tried using fill with .direction = "up" like this:
library(dplyr)
library(tidyr)
data1 <- data %>% group_by(id) %>%
fill(!is.na(age[1]), .direction = "up")
You could use cumall(is.na(age)) to find the positions before the first non-NA value.
library(dplyr)
data %>%
group_by(id) %>%
mutate(age2 = replace(age, cumall(is.na(age)), age[!is.na(age)][1])) %>%
ungroup()
# A tibble: 16 × 3
id age age2
<dbl> <dbl> <dbl>
1 1 NA 6
2 1 6 6
3 1 NA NA
4 1 8 8
5 1 NA NA
6 1 NA NA
7 2 NA 3
8 2 NA 3
9 2 3 3
10 2 8 8
11 2 NA NA
12 3 NA 7
13 3 NA 7
14 3 7 7
15 3 NA NA
16 3 9 9
Another option (agnostic about where the missing and non-missing values start) could be:
data %>%
group_by(id) %>%
mutate(rleid = with(rle(is.na(age)), rep(seq_along(lengths), lengths)),
age2 = ifelse(rleid == min(rleid[is.na(age)]),
age[rleid == (min(rleid[is.na(age)]) + 1)][1],
age))
id age rleid age2
<dbl> <dbl> <int> <dbl>
1 1 NA 1 6
2 1 6 2 6
3 1 NA 3 NA
4 1 8 4 8
5 1 NA 5 NA
6 1 NA 5 NA
7 2 NA 1 3
8 2 NA 1 3
9 2 3 2 3
10 2 8 2 8
11 2 NA 3 NA
12 3 NA 1 7
13 3 NA 1 7
14 3 7 2 7
15 3 NA 3 NA
16 3 9 4 9

Replace NA values when they are in two adjacent columns

Hi this is an example of a similar dataframe I am working with. I have an experiment with 10 samples and two replicates
df <- data.frame("ID" = c(1,2,3,4,5,6,7,8,9,10),
"Rep1" = c(6,5,3,"Na","Na",9,4,"Na","Na",2),
"Rep2" = c(8,4,4,"Na",3,"Na",6,"Na",2,1))
I have several Na values; however, I only want to replace them with zeros in samples 4 and 8, because they are the only ones which have NA in both replicates. The other samples should keep their "NA".
You can also use the following solution. Here we iterate over each row and detect the index or indices whose value equals Na; if there is more than one such index we replace those values with 0, otherwise the row remains as it is:
library(dplyr)
library(purrr)
df %>%
pmap_df(., ~ {ind <- which(c(...) == "Na");
if(length(ind) > 1) {
replace(c(...), ind, "0")
} else {
c(...)
}
}
) %>%
mutate(across(ID, as.integer))
# A tibble: 10 x 3
ID Rep1 Rep2
<int> <chr> <chr>
1 1 6 8
2 2 5 4
3 3 3 4
4 4 0 0
5 5 Na 3
6 6 9 Na
7 7 4 6
8 8 0 0
9 9 Na 2
10 10 2 1
P.S.: I almost went crazy wondering why I could not get it to work, only to realize your NAs are in fact the string Na.
We create an index where the 'Rep' columns are both "Na" with rowSums on a logical matrix. Use the row, column index/names to subset the data and assign the values to 0
nm1 <- grep("Rep", names(df), value = TRUE)
i1 <- rowSums(df[nm1] == "Na") == length(nm1)
df[i1, nm1] <- 0
-output
df
ID Rep1 Rep2
1 1 6 8
2 2 5 4
3 3 3 4
4 4 0 0
5 5 Na 3
6 6 9 Na
7 7 4 6
8 8 0 0
9 9 Na 2
10 10 2 1
As the OP created string "Na", the column types are not numeric. We can convert this to numeric as
df[-1] <- lapply(df[-1], as.numeric)
This forces the "Na" strings to be converted to NA (R emits an "NAs introduced by coercion" warning).
-output
df
ID Rep1 Rep2
1 1 6 8
2 2 5 4
3 3 3 4
4 4 0 0
5 5 NA 3
6 6 9 NA
7 7 4 6
8 8 0 0
9 9 NA 2
10 10 2 1
With dplyr we could:
library(dplyr)
# Replace "Na" with "0" only in the Rep columns of samples 4 and 8, the
# samples where both replicates are missing.
# `%in%` keeps the ID test attached to the "Na" test. The original condition
# `. == "Na" & ID == 4 | ID == 8` parsed as `(. == "Na" & ID == 4) | ID == 8`
# because `&` binds tighter than `|`, so every Rep value of ID 8 would have
# been overwritten whether or not it was "Na".
df %>%
  mutate(across(starts_with("Rep"), ~ case_when(
    . == "Na" & ID %in% c(4, 8) ~ "0",
    TRUE ~ .
  )))
Output:
ID Rep1 Rep2
1 1 6 8
2 2 5 4
3 3 3 4
4 4 0 0
5 5 Na 3
6 6 9 Na
7 7 4 6
8 8 0 0
9 9 Na 2
10 10 2 1
Though it has been marked as solved, yet I propose a simple answer
df <- data.frame("ID" = c(1,2,3,4,5,6,7,8,9,10),
"Rep1" = c(6,5,3,"Na","Na",9,4,"Na","Na",2),
"Rep2" = c(8,4,4,"Na",3,"Na",6,"Na",2,1))
library(dplyr)
df %>% group_by(ID) %>%
mutate(replace(cur_data(), all(cur_data() == 'Na'), '0'))
#> # A tibble: 10 x 3
#> # Groups: ID [10]
#> ID Rep1 Rep2
#> <dbl> <chr> <chr>
#> 1 1 6 8
#> 2 2 5 4
#> 3 3 3 4
#> 4 4 0 0
#> 5 5 Na 3
#> 6 6 9 Na
#> 7 7 4 6
#> 8 8 0 0
#> 9 9 Na 2
#> 10 10 2 1
OR
df %>% rowwise() %>%
mutate(replace(cur_data()[-1], all(cur_data()[-1] == 'Na'), '0'))

Get a value based on the value of another column in R - dplyr

i got this df:
df <- data.frame(month = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4),
day = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),
flow = c(2,5,7,8,5,4,6,7,9,2,NA,1,6,10,2,NA,NA,NA,NA,NA))
and i want to reach this result:
month day flow dayofminflow
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
11 3 1 NA 2
12 3 2 1 2
13 3 3 6 2
14 3 4 10 2
15 3 5 2 2
16 4 1 NA NA
17 4 2 NA NA
18 4 3 NA NA
19 4 4 NA NA
20 4 5 NA NA
I was using this solution, but it returns NA when the first value is NA:
newdf <- df %>% group_by(month) %>% mutate(Val=day[flow==min(flow)][1])
And this solution returns an error when all data is NA:
library(dplyr)
df <- df %>%
group_by(month) %>%
mutate(dayminflowofthemonth = day[which.min(flow)]) %>%
ungroup
You could just set na.rm = TRUE in min() (its default is FALSE) in the first solution so that NAs are ignored:
df %>%
group_by(month) %>%
mutate(dayofminflow = day[which(min(flow, na.rm = TRUE) == flow)][1])
# A tibble: 20 x 4
# Groups: month [4]
month day flow dayofminflow
<dbl> <dbl> <dbl> <dbl>
1 1 1 2 1
2 1 2 5 1
3 1 3 7 1
4 1 4 8 1
5 1 5 5 1
6 2 1 4 5
7 2 2 6 5
8 2 3 7 5
9 2 4 9 5
10 2 5 2 5
11 3 1 NA 2
12 3 2 1 2
13 3 3 6 2
14 3 4 10 2
15 3 5 2 2
16 4 1 NA NA
17 4 2 NA NA
18 4 3 NA NA
19 4 4 NA NA
20 4 5 NA NA
Though you get a warning no non-missing arguments to min; returning Inf from month 4 since all flow values are NA.

Rolling sum in dplyr

set.seed(123)
df <- data.frame(x = sample(1:10, 20, replace = T), id = rep(1:2, each = 10))
For each id, I want to create a column which has the sum of previous 5 x values.
df %>% group_by(id) %>% mutate(roll.sum = c(x[1:4], zoo::rollapply(x, 5, sum)))
# Groups: id [2]
x id roll.sum
<int> <int> <int>
3 1 3
8 1 8
5 1 5
9 1 9
10 1 10
1 1 36
6 1 39
9 1 40
6 1 41
5 1 37
10 2 10
5 2 5
7 2 7
6 2 6
2 2 2
9 2 39
3 2 32
1 2 28
4 2 25
10 2 29
The 6th row should be 35 (3 + 8 + 5 + 9 + 10), the 7th row should be 33 (8 + 5 + 9 + 10 + 1) and so on.
However, the above function is also including the row itself for calculation. How can I fix it?
library(zoo)
# For each id, sum the 5 values preceding the current row. The offsets
# -(1:5) exclude the current row; rows with fewer than 5 predecessors get NA
# because fill = NA and partial = FALSE.
df %>%
  group_by(id) %>%
  mutate(Sum_prev = rollapply(x, list(-(1:5)), sum, fill = NA, align = "right", partial = FALSE))
# You can use rollapply(x, list((1:5)), sum, fill = NA, align = "left", partial = FALSE)
# to sum the next 5 elements, skipping the current one.
x id Sum_prev
1 3 1 NA
2 8 1 NA
3 5 1 NA
4 9 1 NA
5 10 1 NA
6 1 1 35
7 6 1 33
8 9 1 31
9 6 1 35
10 5 1 32
11 10 2 NA
12 5 2 NA
13 7 2 NA
14 6 2 NA
15 2 2 NA
16 9 2 30
17 3 2 29
18 1 2 27
19 4 2 21
20 10 2 19
There is the rollify function in the tibbletime package that you could use. You can read about it in this vignette: Rolling calculations in tibbletime.
library(tibbletime)
library(dplyr)
rollig_sum <- rollify(.f = sum, window = 5)
df %>%
group_by(id) %>%
mutate(roll.sum = lag(rollig_sum(x))) #added lag() here
# A tibble: 20 x 3
# Groups: id [2]
# x id roll.sum
# <int> <int> <int>
# 1 3 1 NA
# 2 8 1 NA
# 3 5 1 NA
# 4 9 1 NA
# 5 10 1 NA
# 6 1 1 35
# 7 6 1 33
# 8 9 1 31
# 9 6 1 35
#10 5 1 32
#11 10 2 NA
#12 5 2 NA
#13 7 2 NA
#14 6 2 NA
#15 2 2 NA
#16 9 2 30
#17 3 2 29
#18 1 2 27
#19 4 2 21
#20 10 2 19
If you want the NAs to be some other value, you can use, for example, if_else
df %>%
group_by(id) %>%
mutate(roll.sum = lag(rollig_sum(x))) %>%
mutate(roll.sum = if_else(is.na(roll.sum), x, roll.sum))

Resources