replacing missing values with non-missing values in grouped data using tidyverse - r

For each id, I am trying to replace missing values with data that is available.
library(tidyverse)
df <- data.frame(id = c(1, 1, 1, 2, 2, 2, 3),
                 a = c(NA, NA, 10, NA, 12, NA, 10),
                 b = c(10, NA, NA, NA, 13, NA, NA))
> df
  id  a  b
1  1 NA 10
2  1 NA NA
3  1 10 NA
4  2 NA NA
5  2 12 13
6  2 NA NA
7  3 10 NA
I have tried:
df %>%
  dplyr::group_by(id) %>%
  dplyr::mutate_at(vars(a:b), fill(., direction = "up"))
and get the following error:
Error: 1 components of `...` had unexpected names.
We detected these problematic arguments:
* `direction`
Did you misspecify an argument?
Desired output:
  id  a  b
1  1 10 10
2  1 10 NA
3  1 10 NA
4  2 12 13
5  2 12 13
6  2 12 13
7  3 10 NA

We don't use fill() inside mutate_at(); fill() is applied to the data frame itself. According to ?fill:
data - A data frame.
... - A selection of columns. If empty, nothing happens. You can supply bare variable names, select all variables between x and z with x:z, exclude y with -y.
library(dplyr)
library(tidyr)
df %>%
  group_by(id) %>%
  fill(a:b, .direction = 'up')
# A tibble: 7 x 3
# Groups:   id [3]
#     id     a     b
#  <dbl> <dbl> <dbl>
#1     1    10    10
#2     1    10    NA
#3     1    10    NA
#4     2    12    13
#5     2    12    13
#6     2    NA    NA
#7     3    10    NA
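Note that .direction = 'up' only pulls values upward within each group, which is why row 6 (the trailing NA row of id 2) stays NA and differs from the desired output. If filling in both directions is acceptable, tidyr (>= 1.0.0) also accepts .direction = 'downup'; a hedged variant (it matches row 6 of the desired output, but would additionally carry b = 10 down through rows 2-3 of id 1, which the desired output leaves as NA):
df %>%
  group_by(id) %>%
  fill(a:b, .direction = 'downup')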

Related

Group_by id and count consecutive NA's, restarting the count when a new series of NA's is encountered

I have a dataframe like this:
df <- tibble(id = c(rep('A', 10), rep('B', 10)),
             value = c(1:3, rep(NA, 2), 1:2, rep(NA, 3), 1, rep(NA, 4), 1:3, rep(NA, 2)))
I need to count the number of consecutive NA's in the value column. The count needs to be grouped by id, and it needs to restart at 1 every time a new NA or a new series of NA's is encountered. The expected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things but nothing is giving any sort of sensible result. Thanks in advance!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)
df2 <- df %>%
  group_by(id) %>%
  mutate(info = rleid(value)) %>%
  group_by(id, info) %>%
  mutate(expected_output = row_number()) %>%
  ungroup() %>%
  mutate(expected_output = ifelse(!is.na(value), NA, expected_output)) %>%
  select(-info)
df2
# # A tibble: 20 x 3
#    id    value expected_output
#    <chr> <dbl>           <int>
#  1 A         1              NA
#  2 A         2              NA
#  3 A         3              NA
#  4 A        NA               1
#  5 A        NA               2
#  6 A         1              NA
#  7 A         2              NA
#  8 A        NA               1
#  9 A        NA               2
# 10 A        NA               3
# 11 B         1              NA
# 12 B        NA               1
# 13 B        NA               2
# 14 B        NA               3
# 15 B        NA               4
# 16 B         1              NA
# 17 B         2              NA
# 18 B         3              NA
# 19 B        NA               1
# 20 B        NA               2
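If you would rather avoid the data.table dependency, dplyr (>= 1.1.0) provides consecutive_id(), which plays the same role as data.table::rleid(). A minimal sketch of the same idea, assuming dplyr >= 1.1.0:
library(dplyr)
df %>%
  group_by(id, run = consecutive_id(is.na(value))) %>%
  mutate(expected_output = if_else(is.na(value), row_number(), NA_integer_)) %>%
  ungroup() %>%
  select(-run)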
We can use rle() to get the lengths of runs that are or are not NA, and purrr::map2() to apply seq() over the NA runs (producing the growing count) or fill the non-NA runs with NA via rep(). Note this is applied to the whole column rather than per id; it still matches the expected output here because no NA run crosses the id boundary.
library(tidyverse)
count_na <- function(x) {
  r <- rle(is.na(x))
  consec <- map2(r$lengths, r$values, ~ if (.y) seq(.x) else rep(NA, .x))
  unlist(consec)
}
df %>%
  mutate(expected_output = count_na(value))
#> # A tibble: 20 × 3
#>    id    value expected_output
#>    <chr> <dbl>           <int>
#>  1 A         1              NA
#>  2 A         2              NA
#>  3 A         3              NA
#>  4 A        NA               1
#>  5 A        NA               2
#>  6 A         1              NA
#>  7 A         2              NA
#>  8 A        NA               1
#>  9 A        NA               2
#> 10 A        NA               3
#> 11 B         1              NA
#> 12 B        NA               1
#> 13 B        NA               2
#> 14 B        NA               3
#> 15 B        NA               4
#> 16 B         1              NA
#> 17 B         2              NA
#> 18 B         3              NA
#> 19 B        NA               1
#> 20 B        NA               2
Here is a solution using rle:
x <- rle(is.na(df$value))
df$new[is.na(df$value)] <- sequence(x$lengths[x$values])
# A tibble: 20 x 3
   id    value   new
   <chr> <dbl> <int>
 1 A         1    NA
 2 A         2    NA
 3 A         3    NA
 4 A        NA     1
 5 A        NA     2
 6 A         1    NA
 7 A         2    NA
 8 A        NA     1
 9 A        NA     2
10 A        NA     3
11 B         1    NA
12 B        NA     1
13 B        NA     2
14 B        NA     3
15 B        NA     4
16 B         1    NA
17 B         2    NA
18 B         3    NA
19 B        NA     1
20 B        NA     2
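As with the previous answer, this runs over the whole column rather than per id (it works here because no NA run crosses the id boundary). To make the per-id restart explicit, the same trick can be wrapped in a grouped apply, e.g. with base R's ave(); a hedged sketch, where count_na_runs is a hypothetical helper name:
count_na_runs <- function(x) {
  out <- rep(NA_integer_, length(x))
  r <- rle(is.na(x))
  out[is.na(x)] <- sequence(r$lengths[r$values])
  out
}
df$new <- ave(df$value, df$id, FUN = count_na_runs)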
Yet another solution:
library(tidyverse)
df %>%
  mutate(aux = data.table::rleid(value)) %>%
  group_by(id, aux) %>%
  mutate(eout = ifelse(is.na(value), row_number(), NA_real_)) %>%
  ungroup() %>%
  select(-aux)
#> # A tibble: 20 × 4
#>    id    value expected_output  eout
#>    <chr> <dbl>           <int> <dbl>
#>  1 A         1              NA    NA
#>  2 A         2              NA    NA
#>  3 A         3              NA    NA
#>  4 A        NA               1     1
#>  5 A        NA               2     2
#>  6 A         1              NA    NA
#>  7 A         2              NA    NA
#>  8 A        NA               1     1
#>  9 A        NA               2     2
#> 10 A        NA               3     3
#> 11 B         1              NA    NA
#> 12 B        NA               1     1
#> 13 B        NA               2     2
#> 14 B        NA               3     3
#> 15 B        NA               4     4
#> 16 B         1              NA    NA
#> 17 B         2              NA    NA
#> 18 B         3              NA    NA
#> 19 B        NA               1     1
#> 20 B        NA               2     2

Is there a way to group values in a column between data gaps in R?

I want to group my data into chunks wherever the data is continuous (no NAs). I am trying to produce the group column from dummy data like this:
       a     b group
   <dbl> <dbl> <dbl>
 1     1     1     1
 2     2     2     1
 3     3     3     1
 4     4    NA    NA
 5     5    NA    NA
 6     6    NA    NA
 7     7    12     2
 8     8    15     2
 9     9    NA    NA
10    10    25     3
I tried using
test %>%
  mutate(test = complete.cases(.)) %>%
  group_by(group = cumsum(test == TRUE)) %>%
  select(group, everything())
But it doesn't work as expected:
   group     a     b test
   <int> <dbl> <dbl> <lgl>
 1     1     1     1 TRUE
 2     2     2     2 TRUE
 3     3     3     3 TRUE
 4     3     4    NA FALSE
 5     3     5    NA FALSE
 6     3     6    NA FALSE
 7     4     7    12 TRUE
 8     5     8    15 TRUE
 9     5     9    NA FALSE
10     6    10    25 TRUE
Any advice?
Using rle in base R -
transform(df, group1 = with(rle(!is.na(b)), rep(cumsum(values), lengths))) |>
  transform(group1 = replace(group1, is.na(b), NA))
#     a  b group group1
#1    1  1     1      1
#2    2  2     1      1
#3    3  3     1      1
#4    4 NA    NA     NA
#5    5 NA    NA     NA
#6    6 NA    NA     NA
#7    7 12     2      2
#8    8 15     2      2
#9    9 NA    NA     NA
#10  10 25     3      3
A couple of approaches to consider if you wish to use dplyr for this.
First, you could look at the transition from non-complete to complete cases (using lag).
library(dplyr)
test %>%
  mutate(test = complete.cases(.)) %>%
  group_by(group = cumsum(test & !lag(test, default = FALSE))) %>%
  mutate(group = replace(group, !test, NA))
Alternatively, you could add row numbers to your data.frame, filter to complete cases, enumerate the groups with cumsum over gaps in the row numbers, and then join back to the original data.
test$rn <- seq.int(nrow(test))
test %>%
  filter(complete.cases(.)) %>%
  group_by(group = c(0, cumsum(diff(rn) > 1)) + 1) %>%
  right_join(test) %>%
  arrange(rn) %>%
  dplyr::select(-rn)
Output
       a     b group
   <int> <int> <dbl>
 1     1     1     1
 2     2     2     1
 3     3     3     1
 4     4    NA    NA
 5     5    NA    NA
 6     6    NA    NA
 7     7    12     2
 8     8    15     2
 9     9    NA    NA
10    10    25     3
Using data.table: get rleid(), blank out the group IDs for incomplete rows, then renumber the remaining groups with a factor-to-integer conversion:
library(data.table)
setDT(test)[, group1 := {
  x <- complete.cases(test)
  grp <- rleid(x)
  grp[!x] <- NA
  as.integer(factor(grp))
}]
#      a  b group group1
#  1:  1  1     1      1
#  2:  2  2     1      1
#  3:  3  3     1      1
#  4:  4 NA    NA     NA
#  5:  5 NA    NA     NA
#  6:  6 NA    NA     NA
#  7:  7 12     2      2
#  8:  8 15     2      2
#  9:  9 NA    NA     NA
# 10: 10 25     3      3

na.approx only when fewer than 3 consecutive NAs in a column

mydata <- data.frame(group = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
                     score = c(10, NA, NA, 20, 30, 5, NA, NA, NA, 40))
From 'mydata' I am trying to use dplyr to interpolate 'score' with na.approx, but only when there are fewer than 3 consecutive NAs between the closest non-NA entries. The interpolated values are stored in 'score_approx'.
Without the condition on the number of consecutive NAs I use this code:
library(zoo)
mydata %>%
  group_by(group) %>%
  mutate(score_approx = na.approx(score)) %>%
  mutate(score_approx = coalesce(score_approx, score))
# A tibble: 10 x 3
# Groups:   group [2]
   group score score_approx
   <dbl> <dbl>        <dbl>
 1     1    10         10
 2     1    NA         13.3
 3     1    NA         16.7
 4     1    20         20
 5     1    30         30
 6     2     5          5
 7     2    NA         13.8
 8     2    NA         22.5
 9     2    NA         31.2
10     2    40         40
However, the desired data frame is:
# A tibble: 10 x 3
# Groups:   group [2]
   group score score_approx
   <dbl> <dbl>        <dbl>
 1     1    10         10
 2     1    NA         13.3
 3     1    NA         16.7
 4     1    20         20
 5     1    30         30
 6     2     5          5
 7     2    NA         NA
 8     2    NA         NA
 9     2    NA         NA
10     2    40         40
You can use the maxgap argument of na.approx - runs of more than maxgap consecutive NAs are left unfilled:
library(dplyr)
library(zoo)
mydata %>%
  group_by(group) %>%
  mutate(score_approx = na.approx(score, maxgap = 2)) %>%
  ungroup()
#    group score score_approx
#    <dbl> <dbl>        <dbl>
# 1      1    10         10
# 2      1    NA         13.3
# 3      1    NA         16.7
# 4      1    20         20
# 5      1    30         30
# 6      2     5          5
# 7      2    NA         NA
# 8      2    NA         NA
# 9      2    NA         NA
#10      2    40         40
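As a quick check of the maxgap semantics (gaps longer than maxgap are left untouched), raising the limit to 3 would interpolate the three-NA run in group 2 as well; a hedged illustration on the same data:
mydata %>%
  group_by(group) %>%
  mutate(score_approx = na.approx(score, maxgap = 3)) %>%
  ungroup()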

Filtering data relative to the first and last occurrence of an event

I have a data frame from an experiment in which stimuli are shown to participants and time is measured continuously.
# reprex
df <-
tibble(stim = c(NA, NA, NA, NA, "a", "b", NA, "c", NA, "d", NA, NA, NA),
time = 0:12)
# A tibble: 13 x 2
   stim   time
   <chr> <int>
 1 NA        0
 2 NA        1
 3 NA        2
 4 NA        3
 5 a         4
 6 b         5
 7 NA        6
 8 c         7
 9 NA        8
10 d         9
11 NA       10
12 NA       11
13 NA       12
I want a generalized solution using tidyverse functions that keeps only the data from 1 second before the first marker to 2 seconds after the last marker, dropping the rest. I thought this would work, but it throws an uninformative error.
df %>%
  # store times for first and last stim
  mutate(first_stim = drop_na(stim) %>% pull(time) %>% first(),
         last_stim = drop_na(stim) %>% pull(time) %>% last()) %>%
  # filter df based on new variables
  filter(time >= first(first_stim) - 1 &
         time <= first(last_stim) + 2)
Error in mutate_impl(.data, dots) : bad value
So I wrote some pretty ugly base R code to work around the issue by changing the mutate:
df2 <- df %>%
  mutate(first_stim = .[!is.na(.$stim), "time"][1, 1],
         last_stim = .[!is.na(.$stim), "time"][nrow(.[!is.na(.$stim), "time"]), 1])
# A tibble: 13 x 4
   stim   time first_stim last_stim
   <chr> <int>   <tibble>  <tibble>
 1 NA        0          4         9
 2 NA        1          4         9
 3 NA        2          4         9
 4 NA        3          4         9
 5 a         4          4         9
 6 b         5          4         9
 7 NA        6          4         9
 8 c         7          4         9
 9 NA        8          4         9
10 d         9          4         9
11 NA       10          4         9
12 NA       11          4         9
13 NA       12          4         9
Now I would only need to filter based on the new variables first_stim - 1 and last_stim + 2. But filter fails too:
df2 %>%
  filter(time >= first(first_stim) - 1 &
         time <= first(last_stim) + 2)
Error in filter_impl(.data, quo) :
  Not compatible with STRSXP: [type=NULL].
I was able to do it in base R, but it is really ugly:
df2[(df2$time >= (df2[[1, "first_stim"]] - 1)) &
    (df2$time <= (df2[[1, "last_stim"]] + 2)), ]
The desired output should look like this:
# A tibble: 9 x 2
   stim   time
   <chr> <int>
 4 NA        3
 5 a         4
 6 b         5
 7 NA        6
 8 c         7
 9 NA        8
10 d         9
11 NA       10
12 NA       11
I believe the errors are related to dplyr::nth() and related functions. I've found some old issues describing this behavior, which should no longer exist: https://github.com/tidyverse/dplyr/issues/1980
I would really appreciate if someone could highlight what is the problem, and how to do this in a tidy way.
You could use a combination of is.na and which...
library(dplyr)
df <- tibble(stim = c(NA, NA, NA, NA, "a", "b", NA, "c", NA, "d", NA, NA, NA),
             time = 0:12)
df %>%
  filter(row_number() >= first(which(!is.na(stim))) - 1 &
         row_number() <= last(which(!is.na(stim))) + 2)
# # A tibble: 9 x 2
#   stim   time
#   <chr> <int>
# 1 NA        3
# 2 a         4
# 3 b         5
# 4 NA        6
# 5 c         7
# 6 NA        8
# 7 d         9
# 8 NA       10
# 9 NA       11
You could also make your first attempt work with a little modification: drop_na() needs the piped data as its first argument, so pass in . explicitly...
df %>%
  mutate(first_stim = first(drop_na(., stim) %>% pull(time)),
         last_stim = last(drop_na(., stim) %>% pull(time))) %>%
  filter(time >= first(first_stim) - 1 &
         time <= first(last_stim) + 2)
We can create a cumulative sum of non-NA values, find the row indices where we encounter the first and the last non-NA value, and then select rows based on the requirement (-1 from the start and +2 from the end).
library(tidyverse)
df %>%
  mutate(count_cumsum = cumsum(!is.na(stim))) %>%
  slice((which.max(count_cumsum == 1) - 1):(which.max(count_cumsum) + 2)) %>%
  select(-count_cumsum)
#   stim   time
#   <chr> <int>
#1  NA        3
#2  a         4
#3  b         5
#4  NA        6
#5  c         7
#6  NA        8
#7  d         9
#8  NA       10
#9  NA       11
Just to give an idea how count_cumsum looks:
df %>%
  mutate(count_cumsum = cumsum(!is.na(stim)))
# A tibble: 13 x 3
#    stim   time count_cumsum
#    <chr> <int>        <int>
# 1  NA        0            0
# 2  NA        1            0
# 3  NA        2            0
# 4  NA        3            0
# 5  a         4            1
# 6  b         5            2
# 7  NA        6            2
# 8  c         7            3
# 9  NA        8            3
#10  d         9            4
#11  NA       10            4
#12  NA       11            4
#13  NA       12            4
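For completeness, the same window can be expressed directly on time with dplyr::between(), since the stimulus times are what define the window; a hedged one-liner equivalent under that assumption:
df %>%
  filter(between(time,
                 min(time[!is.na(stim)]) - 1,
                 max(time[!is.na(stim)]) + 2))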

tidyr spread does not aggregate data

I have data like the following:
> data <- data.frame(unique = 1:9, grouping = rep(c('a', 'b', 'c'), each = 3), value = sample(1:30, 9))
> data
  unique grouping value
1      1        a    15
2      2        a    21
3      3        a    26
4      4        b     8
5      5        b     6
6      6        b     4
7      7        c    17
8      8        c     1
9      9        c     3
I would like to create a table that looks like this:
   a b  c
1 15 8 17
2 21 6  1
3 26 4  3
I am using tidyr::spread and not getting the correct result:
> data %>% spread(grouping, value)
  unique  a  b  c
1      1 15 NA NA
2      2 21 NA NA
3      3 26 NA NA
4      4 NA  8 NA
5      5 NA  6 NA
6      6 NA  4 NA
7      7 NA NA 17
8      8 NA NA  1
9      9 NA NA  3
Or
> data %>% select(grouping, value) %>% spread(grouping, value)
Error: Duplicate identifiers for rows (1, 2, 3), (4, 5, 6), (7, 8, 9)
Is there a way to do this also when one group (c) has a different length than the others?
We need to create a sequence column per group to avoid the duplicate identifiers error.
library(tidyr)
library(dplyr)
data %>%
  group_by(grouping) %>%
  mutate(id = row_number()) %>%
  select(-unique) %>%
  spread(grouping, value) %>%
  select(-id)
#      a     b     c
#  (int) (int) (int)
#1    15     8    17
#2    21     6     1
#3    26     4     3
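spread() has since been superseded; with tidyr (>= 1.0.0) the same reshape can be written with pivot_wider(), which also answers the follow-up: when one group (such as c) is shorter, the missing cells are simply padded with NA instead of raising the duplicate-identifiers error. A hedged equivalent:
library(dplyr)
library(tidyr)
data %>%
  group_by(grouping) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_wider(id_cols = id, names_from = grouping, values_from = value) %>%
  select(-id)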
