Vectorialised column addition [duplicate] - r

This question already has answers here:
How can I automatically create n lags in a timeseries?
(3 answers)
Closed 1 year ago.
Given this tibble:
tibble(x = c(1:9))
I want to add a column x_lag_1 = c(NA,1:8), a column x_lag_2 = c(NA,NA,1:7), etc.
Up to x_lag_n.

This can be quick with data.table:
library(data.table)
n <- seq(4)
setDT(df)[, paste0('x_lag_', n) := shift(x, n)]
df
x x_lag_1 x_lag_2 x_lag_3 x_lag_4
1: 1 NA NA NA NA
2: 2 1 NA NA NA
3: 3 2 1 NA NA
4: 4 3 2 1 NA
5: 5 4 3 2 1
6: 6 5 4 3 2
7: 7 6 5 4 3
8: 8 7 6 5 4
9: 9 8 7 6 5

You may use map_dfc to add n new columns.
library(dplyr)
library(purrr)
df <- tibble(x = c(1:9))
n <- 3
bind_cols(df, map_dfc(seq_len(n), ~df %>%
transmute(!!paste0('x_lag', .x) := lag(x, .x))))
# x x_lag1 x_lag2 x_lag3
# <int> <int> <int> <int>
#1 1 NA NA NA
#2 2 1 NA NA
#3 3 2 1 NA
#4 4 3 2 1
#5 5 4 3 2
#6 6 5 4 3
#7 7 6 5 4
#8 8 7 6 5
#9 9 8 7 6

Edit 2: Reworked the answer to contemplate the case of a grouped df.
library(tidyverse)
set.seed(123)
df <- tibble(group = sample(letters[1:3], 30, replace = TRUE), x = c(1:30))
formulas <- seq(3, 12, 3) %>%
map(~ as.formula(str_glue("~lag(.,n={.x})"))) %>%
set_names(str_c("lag", seq(3, 12, 3)))
df %>%
summarise(x, across(x, lst(!!!formulas)))
#> # A tibble: 30 × 5
#> x x_lag3 x_lag6 x_lag9 x_lag12
#> <int> <int> <int> <int> <int>
#> 1 1 NA NA NA NA
#> 2 2 NA NA NA NA
#> 3 3 NA NA NA NA
#> 4 4 1 NA NA NA
#> 5 5 2 NA NA NA
#> 6 6 3 NA NA NA
#> 7 7 4 1 NA NA
#> 8 8 5 2 NA NA
#> 9 9 6 3 NA NA
#> 10 10 7 4 1 NA
#> # … with 20 more rows
df %>%
group_by(group) %>%
summarise(x, across(x, lst(!!!formulas)), .groups = "drop")
#> # A tibble: 30 × 6
#> group x x_lag3 x_lag6 x_lag9 x_lag12
#> <chr> <int> <int> <int> <int> <int>
#> 1 a 10 NA NA NA NA
#> 2 a 13 NA NA NA NA
#> 3 a 16 NA NA NA NA
#> 4 a 19 10 NA NA NA
#> 5 a 20 13 NA NA NA
#> 6 a 21 16 NA NA NA
#> 7 a 22 19 10 NA NA
#> 8 a 27 20 13 NA NA
#> 9 b 4 NA NA NA NA
#> 10 b 6 NA NA NA NA
#> # … with 20 more rows
Created on 2021-12-30 by the reprex package (v2.0.1)

Related

Match and re-order rows in multiple columns in R (tidyverse)

I have a dataset like this (in the actual dataset, I have more columns like subj01):
# A tibble: 10 x 4
item subj01 subj02 subj03
<int> <dbl> <dbl> <dbl>
1 1 1 1 1
2 2 2 2 6
3 3 5 5 9
4 4 9 6 NA
5 5 10 8 NA
6 6 NA 9 NA
7 7 NA 10 NA
8 8 NA NA NA
9 9 NA NA NA
10 10 NA NA NA
I created the dataset using the code below.
data = tibble(item = 1:10, subj01 = c(1,2,5,9,10,NA,NA,NA,NA,NA), subj02 = c(1,2,5,6,8,9,10,NA,NA,NA), subj03 = c(1,6,9,NA,NA,NA,NA,NA,NA,NA))
I would like to reorder all the columns beginning with "subj" so that the position of the values match that in the item column.
That is, for this example dataset, I would like to end up with this:
# A tibble: 10 x 4
item subj01 subj02 subj03
<int> <dbl> <dbl> <dbl>
1 1 1 1 1
2 2 2 2 NA
3 3 NA NA NA
4 4 NA NA NA
5 5 5 5 NA
6 6 NA 6 6
7 7 NA NA NA
8 8 NA 8 NA
9 9 9 9 9
10 10 10 10 NA
I've figured that I can match and re-order one column by running this:
data$subj01[match(data$item,data$subj01)]
[1] 1 2 NA NA 5 NA NA NA 9 10
But I am struggling to apply this across multiple columns (ideally I'd like to embed the command in a dplyr pipe).
I tried the command below, but this gave me an error "Error in mutate(x. = x.[match(item, x.)]) : object 'x.' not found".
data = data %>% across(mutate(x.=x.[match(item,x.)]))
I'd appreciate any suggestions! Thank you.
library(tidyverse)
data %>%
pivot_longer(-item) %>%
filter(!is.na(value)) %>%
mutate(item = value) %>%
complete(item = 1:10, name) %>%
pivot_wider(names_from = name, values_from = value)
# A tibble: 10 × 4
item subj01 subj02 subj03
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 2 2 2 NA
3 3 NA NA NA
4 4 NA NA NA
5 5 5 5 NA
6 6 NA 6 6
7 7 NA NA NA
8 8 NA 8 NA
9 9 9 9 9
10 10 10 10 NA

Group_by id and count the consective NA's and then restart counting when a new series of NA's is encountered

I have a dataframe like this:
df <- data_frame(id = c(rep('A', 10), rep('B', 10)),
value = c(1:3, rep(NA, 2), 1:2, rep(NA, 3), 1, rep(NA, 4), 1:3, rep(NA, 2)))
I need to count the number of consective NA's in the value column. The count needs to be grouped by ID, and it needs to restart at 1 every time a new NA or new series of NA's is encountered. The exptected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things but nothing is giving any sort of sensical result. Thanks in advance^!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)
df2 <- df %>%
group_by(id) %>%
mutate(info = rleid(value)) %>%
group_by(id, info) %>%
mutate(expected_output = row_number()) %>%
ungroup() %>%
mutate(expected_output = ifelse(!is.na(value), NA, expected_output)) %>%
select(-info)
df2
# # A tibble: 20 x 3
# id value expected_output
# <chr> <dbl> <int>
# 1 A 1 NA
# 2 A 2 NA
# 3 A 3 NA
# 4 A NA 1
# 5 A NA 2
# 6 A 1 NA
# 7 A 2 NA
# 8 A NA 1
# 9 A NA 2
# 10 A NA 3
# 11 B 1 NA
# 12 B NA 1
# 13 B NA 2
# 14 B NA 3
# 15 B NA 4
# 16 B 1 NA
# 17 B 2 NA
# 18 B 3 NA
# 19 B NA 1
# 20 B NA 2
We can use rle to get length of groups that are or are not na, and use purrr::map2 to apply seq if they are NA and get the growing count or just fill in with NA values using rep.
library(tidyverse)
count_na <- function(x) {
r <- rle(is.na(x))
consec <- map2(r$lengths, r$values, ~ if (.y) seq(.x) else rep(NA, .x))
unlist(consec)
}
df %>%
mutate(expected_output = count_na(value))
#> # A tibble: 20 × 3
#> id value expected_output
#> <chr> <dbl> <int>
#> 1 A 1 NA
#> 2 A 2 NA
#> 3 A 3 NA
#> 4 A NA 1
#> 5 A NA 2
#> 6 A 1 NA
#> 7 A 2 NA
#> 8 A NA 1
#> 9 A NA 2
#> 10 A NA 3
#> 11 B 1 NA
#> 12 B NA 1
#> 13 B NA 2
#> 14 B NA 3
#> 15 B NA 4
#> 16 B 1 NA
#> 17 B 2 NA
#> 18 B 3 NA
#> 19 B NA 1
#> 20 B NA 2
Here is a solution using rle:
x <- rle(is.na(df$value))
df$new[is.na(df$value)] <- sequence(x$lengths[x$values])
# A tibble: 20 x 3
id value new
<chr> <dbl> <int>
1 A 1 NA
2 A 2 NA
3 A 3 NA
4 A NA 1
5 A NA 2
6 A 1 NA
7 A 2 NA
8 A NA 1
9 A NA 2
10 A NA 3
11 B 1 NA
12 B NA 1
13 B NA 2
14 B NA 3
15 B NA 4
16 B 1 NA
17 B 2 NA
18 B 3 NA
19 B NA 1
20 B NA 2
Yet another solution:
library(tidyverse)
df %>%
mutate(aux =data.table::rleid(value)) %>%
group_by(id, aux) %>%
mutate(eout = ifelse(is.na(value), row_number(), NA_real_)) %>%
ungroup %>% select(-aux)
#> # A tibble: 20 × 4
#> id value expected_output eout
#> <chr> <dbl> <int> <dbl>
#> 1 A 1 NA NA
#> 2 A 2 NA NA
#> 3 A 3 NA NA
#> 4 A NA 1 1
#> 5 A NA 2 2
#> 6 A 1 NA NA
#> 7 A 2 NA NA
#> 8 A NA 1 1
#> 9 A NA 2 2
#> 10 A NA 3 3
#> 11 B 1 NA NA
#> 12 B NA 1 1
#> 13 B NA 2 2
#> 14 B NA 3 3
#> 15 B NA 4 4
#> 16 B 1 NA NA
#> 17 B 2 NA NA
#> 18 B 3 NA NA
#> 19 B NA 1 1
#> 20 B NA 2 2

Is there a way to group values in a column between data gaps in R?

I want to group my data in different chunks when the data is continuous. Trying to get the group column from dummy data like this:
a b group
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 3 1
4 4 NA NA
5 5 NA NA
6 6 NA NA
7 7 12 2
8 8 15 2
9 9 NA NA
10 10 25 3
I tried using
test %>% mutate(test = complete.cases(.)) %>%
group_by(group = cumsum(test == TRUE)) %>%
select(group, everything())
But it doesn't work as expected:
group a b test
<int> <dbl> <dbl> <lgl>
1 1 1 1 TRUE
2 2 2 2 TRUE
3 3 3 3 TRUE
4 3 4 NA FALSE
5 3 5 NA FALSE
6 3 6 NA FALSE
7 4 7 12 TRUE
8 5 8 15 TRUE
9 5 9 NA FALSE
10 6 10 25 TRUE
Any advice?
Using rle in base R -
transform(df, group1 = with(rle(!is.na(b)), rep(cumsum(values), lengths))) |>
transform(group1 = replace(group1, is.na(b), NA))
# a b group group1
#1 1 1 1 1
#2 2 2 1 1
#3 3 3 1 1
#4 4 NA NA NA
#5 5 NA NA NA
#6 6 NA NA NA
#7 7 12 2 2
#8 8 15 2 2
#9 9 NA NA NA
#10 10 25 3 3
A couple of approaches to consider if you wish to use dplyr for this.
First, you could look at transition from non-complete cases (using lag) to complete cases.
library(dplyr)
test %>%
mutate(test = complete.cases(.)) %>%
group_by(group = cumsum(test & !lag(test, default = F))) %>%
mutate(group = replace(group, !test, NA))
Alternatively, you could add row numbers to your data.frame. Then, you could filter to include only complete cases, and group_by enumerating with cumsum based on gaps in row numbers. Then, join back to original data.
test$rn <- seq.int(nrow(test))
test %>%
filter(complete.cases(.)) %>%
group_by(group = c(0, cumsum(diff(rn) > 1)) + 1) %>%
right_join(test) %>%
arrange(rn) %>%
dplyr::select(-rn)
Output
a b group
<int> <int> <dbl>
1 1 1 1
2 2 2 1
3 3 3 1
4 4 NA NA
5 5 NA NA
6 6 NA NA
7 7 12 2
8 8 15 2
9 9 NA NA
10 10 25 3
Using data.table, get rleid then remove group IDs for NAs, then fix the sequence with factor to integer conversion:
library(data.table)
setDT(test)[, group1 := {
x <- complete.cases(test)
grp <- rleid(x)
grp[ !x ] <- NA
as.integer(factor(grp))
}]
# a b group group1
# 1: 1 1 1 1
# 2: 2 2 1 1
# 3: 3 3 1 1
# 4: 4 NA NA NA
# 5: 5 NA NA NA
# 6: 6 NA NA NA
# 7: 7 12 2 2
# 8: 8 15 2 2
# 9: 9 NA NA NA
# 10: 10 25 3 3

dplyr conditional column if not null to calculate overall percent

Hello a really simple question but I have just got stuck, how do I add a conditional column containing number 1 where completed column is not NA?
id completed
<chr> <chr>
1 abc123sdf 35929
2 124cv NA
3 125xvdf 36295
4 126v NA
5 127sdsd 43933
6 128dfgs NA
7 129vsd NA
8 130sdf NA
9 131sdf NA
10 123sdfd NA
I need this to calculate an overall percent of completed column/id.
(Additional question - how can I do this in dplyr without using a helper column?)
Thanks
You can use is.na to check for NA values.
library(dplyr)
df %>% mutate(newcol = as.integer(!is.na(completed)))
# id completed newcol
#1 abc123sdf 35929 1
#2 124cv NA 0
#3 125xvdf 36295 1
#4 126v NA 0
#5 127sdsd 43933 1
#6 128dfgs NA 0
#7 129vsd NA 0
#8 130sdf NA 0
#9 131sdf NA 0
#10 123sdfd NA 0
library("dplyr")
df <- data.frame(id = 1:10,
completed = c(35929, NA, 36295, NA, 43933, NA, NA, NA, NA, NA))
df %>%
mutate(is_na = as.integer(!is.na(completed)))
#> id completed is_na
#> 1 1 35929 1
#> 2 2 NA 0
#> 3 3 36295 1
#> 4 4 NA 0
#> 5 5 43933 1
#> 6 6 NA 0
#> 7 7 NA 0
#> 8 8 NA 0
#> 9 9 NA 0
#> 10 10 NA 0
But you shouldn't need this extra column to calculate a percentage, you can just use na.rm:
df %>%
mutate(pct = completed / sum(completed, na.rm = TRUE))
#> id completed pct
#> 1 1 35929 0.3093141
#> 2 2 NA NA
#> 3 3 36295 0.3124650
#> 4 4 NA NA
#> 5 5 43933 0.3782209
#> 6 6 NA NA
#> 7 7 NA NA
#> 8 8 NA NA
#> 9 9 NA NA
#> 10 10 NA NA
We can also do
library(dplyr)
df %>%
mutate(newcol = +(!is.na(completed)))

R - Replace missing values with highest of 4 previous values

This is a variation of the last observation carried forward problem in a vector with some missing values. Instead of filling in NA values with the last non NA observation, I would like to fill in NA values with the highest value in the 4 observations preceding it. If all 4 observations preceding are also NA, the NA missing value should be retained. Would also appreciate it this can be done by groups in a data frame/data table.
Example:
Original DF:
ID Week Value
a 1 5
a 2 1
a 3 NA
a 4 NA
a 5 3
a 6 4
a 7 NA
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 NA
Output DF:
ID Week Value
a 1 5
a 2 1
a 3 5
a 4 5
a 5 3
a 6 4
a 7 4
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 1
lag shifts the column by n steps and lets you peek at previous values. pmax is element-wise maximum and lets to pick the highest value for each set/row of the observations.
To abstract away notion of 4 and maintain vectorized performance, you may use quasiquotes from rlang: http://dplyr.tidyverse.org/articles/programming.html#quasiquotation
It can look a little cryptic at first but is very precise and expressive.
df <- readr::read_table(
" ID Week Value
a 1 5
a 2 1
a 3 NA
a 4 NA
a 5 3
a 6 4
a 7 NA
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 NA")
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df %>%
group_by(ID) %>%
mutate(
Value = if_else(is.na(Value), pmax(lag(Value, 1), lag(Value, 2), lag(Value, 3), lag(Value, 4), na.rm = TRUE), Value)
)
#> # A tibble: 14 x 3
#> # Groups: ID [2]
#> ID Week Value
#> <chr> <int> <int>
#> 1 a 1 5
#> 2 a 2 1
#> 3 a 3 5
#> 4 a 4 5
#> 5 a 5 3
#> 6 a 6 4
#> 7 a 7 4
#> 8 b 1 NA
#> 9 b 2 NA
#> 10 b 3 NA
#> 11 b 4 NA
#> 12 b 5 NA
#> 13 b 6 1
#> 14 b 7 1
# or if you are an rlang ninja
library(purrr)
pmax_lag_n <- function(column, n) {
column <- enquo(column)
1:n %>%
map(~quo(lag(!!column, !!.x))) %>%
{ quo(pmax(!!!., na.rm = TRUE)) }
}
df %>%
group_by(ID) %>%
mutate(Value = if_else(is.na(Value), !!pmax_lag_n(Value, 4), Value))
#> # A tibble: 14 x 3
#> # Groups: ID [2]
#> ID Week Value
#> <chr> <int> <int>
#> 1 a 1 5
#> 2 a 2 1
#> 3 a 3 5
#> 4 a 4 5
#> 5 a 5 3
#> 6 a 6 4
#> 7 a 7 4
#> 8 b 1 NA
#> 9 b 2 NA
#> 10 b 3 NA
#> 11 b 4 NA
#> 12 b 5 NA
#> 13 b 6 1
#> 14 b 7 1
Define function Max which accepts a vector x and returns NA if all its elements are NA. Otherwise, if the last value is NA it returns the maximum of all non-NA elements and if the last value is not NA then it returns it.
Also define na.max which runs Max on a rolling window of length n (given by the second argument to na.max -- default 5).
Finally apply na.max to Value by ID using ave.
library(zoo)
Max <- function(x) {
last <- tail(x, 1)
if (all(is.na(x))) NA
else if (is.na(last)) max(x, na.rm = TRUE)
else last
}
na.max <- function(x, n = 5) rollapplyr(x, n, Max, partial = TRUE)
transform(DF, Value = ave(Value, ID, FUN = na.max))
giving:
ID Week Value
1 a 1 5
2 a 2 1
3 a 3 5
4 a 4 5
5 a 5 3
6 a 6 4
7 a 7 4
8 b 1 NA
9 b 2 NA
10 b 3 NA
11 b 4 NA
12 b 5 NA
13 b 6 1
14 b 7 1
Note: Input DF in reproducible form:
Lines <- "
ID Week Value
a 1 5
a 2 1
a 3 NA
a 4 NA
a 5 3
a 6 4
a 7 NA
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 NA"
DF <- read.table(text = Lines, header = TRUE)

Resources