dplyr conditional column if not null to calculate overall percent - r

Hello a really simple question but I have just got stuck, how do I add a conditional column containing number 1 where completed column is not NA?
id completed
<chr> <chr>
1 abc123sdf 35929
2 124cv NA
3 125xvdf 36295
4 126v NA
5 127sdsd 43933
6 128dfgs NA
7 129vsd NA
8 130sdf NA
9 131sdf NA
10 123sdfd NA
I need this to calculate an overall percent of completed column/id.
(Additional question - how can I do this in dplyr without using a helper column?)
Thanks

You can use is.na to check for NA values.
library(dplyr)
df %>% mutate(newcol = as.integer(!is.na(completed)))
# id completed newcol
#1 abc123sdf 35929 1
#2 124cv NA 0
#3 125xvdf 36295 1
#4 126v NA 0
#5 127sdsd 43933 1
#6 128dfgs NA 0
#7 129vsd NA 0
#8 130sdf NA 0
#9 131sdf NA 0
#10 123sdfd NA 0

library("dplyr")
df <- data.frame(id = 1:10,
completed = c(35929, NA, 36295, NA, 43933, NA, NA, NA, NA, NA))
df %>%
mutate(is_na = as.integer(!is.na(completed)))
#> id completed is_na
#> 1 1 35929 1
#> 2 2 NA 0
#> 3 3 36295 1
#> 4 4 NA 0
#> 5 5 43933 1
#> 6 6 NA 0
#> 7 7 NA 0
#> 8 8 NA 0
#> 9 9 NA 0
#> 10 10 NA 0
But you shouldn't need this extra column to calculate a percentage, you can just use na.rm:
df %>%
mutate(pct = completed / sum(completed, na.rm = TRUE))
#> id completed pct
#> 1 1 35929 0.3093141
#> 2 2 NA NA
#> 3 3 36295 0.3124650
#> 4 4 NA NA
#> 5 5 43933 0.3782209
#> 6 6 NA NA
#> 7 7 NA NA
#> 8 8 NA NA
#> 9 9 NA NA
#> 10 10 NA NA

We can also do
library(dplyr)
df %>%
mutate(newcol = +(!is.na(completed)))

Related

Vectorialised column addition [duplicate]

This question already has answers here:
How can I automatically create n lags in a timeseries?
(3 answers)
Closed 1 year ago.
Given this tibble:
tibble(x = c(1:9))
I want to add a column x_lag_1 = c(NA,1:8), a column x_lag_2 = c(NA,NA,1:7), etc.
Up to x_lag_n.
This can be quick with data.table:
library(data.table)
n <- seq(4)
setDT(df)[, paste0('x_lag_', n) := shift(x, n)]
df
x x_lag_1 x_lag_2 x_lag_3 x_lag_4
1: 1 NA NA NA NA
2: 2 1 NA NA NA
3: 3 2 1 NA NA
4: 4 3 2 1 NA
5: 5 4 3 2 1
6: 6 5 4 3 2
7: 7 6 5 4 3
8: 8 7 6 5 4
9: 9 8 7 6 5
You may use map_dfc to add n new columns.
library(dplyr)
library(purrr)
df <- tibble(x = c(1:9))
n <- 3
bind_cols(df, map_dfc(seq_len(n), ~df %>%
transmute(!!paste0('x_lag', .x) := lag(x, .x))))
# x x_lag1 x_lag2 x_lag3
# <int> <int> <int> <int>
#1 1 NA NA NA
#2 2 1 NA NA
#3 3 2 1 NA
#4 4 3 2 1
#5 5 4 3 2
#6 6 5 4 3
#7 7 6 5 4
#8 8 7 6 5
#9 9 8 7 6
Edit 2: Reworked the answer to contemplate the case of a grouped df.
library(tidyverse)
set.seed(123)
df <- tibble(group = sample(letters[1:3], 30, replace = TRUE), x = c(1:30))
formulas <- seq(3, 12, 3) %>%
map(~ as.formula(str_glue("~lag(.,n={.x})"))) %>%
set_names(str_c("lag", seq(3, 12, 3)))
df %>%
summarise(x, across(x, lst(!!!formulas)))
#> # A tibble: 30 × 5
#> x x_lag3 x_lag6 x_lag9 x_lag12
#> <int> <int> <int> <int> <int>
#> 1 1 NA NA NA NA
#> 2 2 NA NA NA NA
#> 3 3 NA NA NA NA
#> 4 4 1 NA NA NA
#> 5 5 2 NA NA NA
#> 6 6 3 NA NA NA
#> 7 7 4 1 NA NA
#> 8 8 5 2 NA NA
#> 9 9 6 3 NA NA
#> 10 10 7 4 1 NA
#> # … with 20 more rows
df %>%
group_by(group) %>%
summarise(x, across(x, lst(!!!formulas)), .groups = "drop")
#> # A tibble: 30 × 6
#> group x x_lag3 x_lag6 x_lag9 x_lag12
#> <chr> <int> <int> <int> <int> <int>
#> 1 a 10 NA NA NA NA
#> 2 a 13 NA NA NA NA
#> 3 a 16 NA NA NA NA
#> 4 a 19 10 NA NA NA
#> 5 a 20 13 NA NA NA
#> 6 a 21 16 NA NA NA
#> 7 a 22 19 10 NA NA
#> 8 a 27 20 13 NA NA
#> 9 b 4 NA NA NA NA
#> 10 b 6 NA NA NA NA
#> # … with 20 more rows
Created on 2021-12-30 by the reprex package (v2.0.1)

Group_by id and count the consective NA's and then restart counting when a new series of NA's is encountered

I have a dataframe like this:
df <- data_frame(id = c(rep('A', 10), rep('B', 10)),
value = c(1:3, rep(NA, 2), 1:2, rep(NA, 3), 1, rep(NA, 4), 1:3, rep(NA, 2)))
I need to count the number of consective NA's in the value column. The count needs to be grouped by ID, and it needs to restart at 1 every time a new NA or new series of NA's is encountered. The exptected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things but nothing is giving any sort of sensical result. Thanks in advance^!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)
df2 <- df %>%
group_by(id) %>%
mutate(info = rleid(value)) %>%
group_by(id, info) %>%
mutate(expected_output = row_number()) %>%
ungroup() %>%
mutate(expected_output = ifelse(!is.na(value), NA, expected_output)) %>%
select(-info)
df2
# # A tibble: 20 x 3
# id value expected_output
# <chr> <dbl> <int>
# 1 A 1 NA
# 2 A 2 NA
# 3 A 3 NA
# 4 A NA 1
# 5 A NA 2
# 6 A 1 NA
# 7 A 2 NA
# 8 A NA 1
# 9 A NA 2
# 10 A NA 3
# 11 B 1 NA
# 12 B NA 1
# 13 B NA 2
# 14 B NA 3
# 15 B NA 4
# 16 B 1 NA
# 17 B 2 NA
# 18 B 3 NA
# 19 B NA 1
# 20 B NA 2
We can use rle to get length of groups that are or are not na, and use purrr::map2 to apply seq if they are NA and get the growing count or just fill in with NA values using rep.
library(tidyverse)
count_na <- function(x) {
r <- rle(is.na(x))
consec <- map2(r$lengths, r$values, ~ if (.y) seq(.x) else rep(NA, .x))
unlist(consec)
}
df %>%
mutate(expected_output = count_na(value))
#> # A tibble: 20 × 3
#> id value expected_output
#> <chr> <dbl> <int>
#> 1 A 1 NA
#> 2 A 2 NA
#> 3 A 3 NA
#> 4 A NA 1
#> 5 A NA 2
#> 6 A 1 NA
#> 7 A 2 NA
#> 8 A NA 1
#> 9 A NA 2
#> 10 A NA 3
#> 11 B 1 NA
#> 12 B NA 1
#> 13 B NA 2
#> 14 B NA 3
#> 15 B NA 4
#> 16 B 1 NA
#> 17 B 2 NA
#> 18 B 3 NA
#> 19 B NA 1
#> 20 B NA 2
Here is a solution using rle:
x <- rle(is.na(df$value))
df$new[is.na(df$value)] <- sequence(x$lengths[x$values])
# A tibble: 20 x 3
id value new
<chr> <dbl> <int>
1 A 1 NA
2 A 2 NA
3 A 3 NA
4 A NA 1
5 A NA 2
6 A 1 NA
7 A 2 NA
8 A NA 1
9 A NA 2
10 A NA 3
11 B 1 NA
12 B NA 1
13 B NA 2
14 B NA 3
15 B NA 4
16 B 1 NA
17 B 2 NA
18 B 3 NA
19 B NA 1
20 B NA 2
Yet another solution:
library(tidyverse)
df %>%
mutate(aux =data.table::rleid(value)) %>%
group_by(id, aux) %>%
mutate(eout = ifelse(is.na(value), row_number(), NA_real_)) %>%
ungroup %>% select(-aux)
#> # A tibble: 20 × 4
#> id value expected_output eout
#> <chr> <dbl> <int> <dbl>
#> 1 A 1 NA NA
#> 2 A 2 NA NA
#> 3 A 3 NA NA
#> 4 A NA 1 1
#> 5 A NA 2 2
#> 6 A 1 NA NA
#> 7 A 2 NA NA
#> 8 A NA 1 1
#> 9 A NA 2 2
#> 10 A NA 3 3
#> 11 B 1 NA NA
#> 12 B NA 1 1
#> 13 B NA 2 2
#> 14 B NA 3 3
#> 15 B NA 4 4
#> 16 B 1 NA NA
#> 17 B 2 NA NA
#> 18 B 3 NA NA
#> 19 B NA 1 1
#> 20 B NA 2 2

Is there a way to group values in a column between data gaps in R?

I want to group my data in different chunks when the data is continuous. Trying to get the group column from dummy data like this:
a b group
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 3 1
4 4 NA NA
5 5 NA NA
6 6 NA NA
7 7 12 2
8 8 15 2
9 9 NA NA
10 10 25 3
I tried using
test %>% mutate(test = complete.cases(.)) %>%
group_by(group = cumsum(test == TRUE)) %>%
select(group, everything())
But it doesn't work as expected:
group a b test
<int> <dbl> <dbl> <lgl>
1 1 1 1 TRUE
2 2 2 2 TRUE
3 3 3 3 TRUE
4 3 4 NA FALSE
5 3 5 NA FALSE
6 3 6 NA FALSE
7 4 7 12 TRUE
8 5 8 15 TRUE
9 5 9 NA FALSE
10 6 10 25 TRUE
Any advice?
Using rle in base R -
transform(df, group1 = with(rle(!is.na(b)), rep(cumsum(values), lengths))) |>
transform(group1 = replace(group1, is.na(b), NA))
# a b group group1
#1 1 1 1 1
#2 2 2 1 1
#3 3 3 1 1
#4 4 NA NA NA
#5 5 NA NA NA
#6 6 NA NA NA
#7 7 12 2 2
#8 8 15 2 2
#9 9 NA NA NA
#10 10 25 3 3
A couple of approaches to consider if you wish to use dplyr for this.
First, you could look at transition from non-complete cases (using lag) to complete cases.
library(dplyr)
test %>%
mutate(test = complete.cases(.)) %>%
group_by(group = cumsum(test & !lag(test, default = F))) %>%
mutate(group = replace(group, !test, NA))
Alternatively, you could add row numbers to your data.frame. Then, you could filter to include only complete cases, and group_by enumerating with cumsum based on gaps in row numbers. Then, join back to original data.
test$rn <- seq.int(nrow(test))
test %>%
filter(complete.cases(.)) %>%
group_by(group = c(0, cumsum(diff(rn) > 1)) + 1) %>%
right_join(test) %>%
arrange(rn) %>%
dplyr::select(-rn)
Output
a b group
<int> <int> <dbl>
1 1 1 1
2 2 2 1
3 3 3 1
4 4 NA NA
5 5 NA NA
6 6 NA NA
7 7 12 2
8 8 15 2
9 9 NA NA
10 10 25 3
Using data.table, get rleid then remove group IDs for NAs, then fix the sequence with factor to integer conversion:
library(data.table)
setDT(test)[, group1 := {
x <- complete.cases(test)
grp <- rleid(x)
grp[ !x ] <- NA
as.integer(factor(grp))
}]
# a b group group1
# 1: 1 1 1 1
# 2: 2 2 1 1
# 3: 3 3 1 1
# 4: 4 NA NA NA
# 5: 5 NA NA NA
# 6: 6 NA NA NA
# 7: 7 12 2 2
# 8: 8 15 2 2
# 9: 9 NA NA NA
# 10: 10 25 3 3

trying to calculate sum of row with dataframe having NA values

I am trying to sum the row of values if any column have values but not working for me like below
df=data.frame(
x3=c(2,NA,3,5,4,6,NA,NA,3,3),
x4=c(0,NA,NA,6,5,6,NA,0,4,2))
df$summ <- ifelse(is.na(c(df[,"x3"] & df[,"x4"])),NA,rowSums(df[,c("x3","x4")], na.rm=TRUE))
the output should be like
An alternative solution:
library(data.table)
setDT(df)[!( is.na(x3) & is.na(x4)),summ:=rowSums(.SD, na.rm = T)]
You can do :
df <- transform(df, summ = ifelse(is.na(x3) & is.na(x4), NA,
rowSums(df, na.rm = TRUE)))
df
# x3 x4 summ
#1 2 0 2
#2 NA NA NA
#3 3 NA 3
#4 5 6 11
#5 4 5 9
#6 6 6 12
#7 NA NA NA
#8 NA 0 0
#9 3 4 7
#10 3 2 5
In general for any number of columns :
cols <- c('x3', 'x4')
df <- transform(df, summ = ifelse(rowSums(is.na(df[cols])) == length(cols),
NA, rowSums(df, na.rm = TRUE)))
Try the code below with rowSums + replace
df$summ <- replace(rowSums(df, na.rm = TRUE), rowSums(is.na(df)) == 2, NA)
which gives
> df
x3 x4 summ
1 2 0 2
2 NA NA NA
3 3 NA 3
4 5 6 11
5 4 5 9
6 6 6 12
7 NA NA NA
8 NA 0 0
9 3 4 7
10 3 2 5
This is not much different from already posted answers, however, it contains some useful functions:
library(dplyr)
df %>%
rowwise() %>%
mutate(Count = ifelse(all(is.na(cur_data())), NA,
sum(c_across(everything()), na.rm = TRUE)))
# A tibble: 10 x 3
# Rowwise:
x3 x4 Count
<dbl> <dbl> <dbl>
1 2 0 2
2 NA NA NA
3 3 NA 3
4 5 6 11
5 4 5 9
6 6 6 12
7 NA NA NA
8 NA 0 0
9 3 4 7
10 3 2 5

How do I create new rows based on cell value?

I have a dataframe df where:
Days Treatment A Treatment B Treatment C
0 5 1 1
1 0 2 3
2 1 1 0
For example, there were 5 individuals receiving Treatment A that survived 0 days and 1 who survived 2, etc. However, I would like it where those 5 individuals now become a unique row, with that cell representing the days they survived:
Patient # A B C
1 0
2 0
3 0
4 0
5 0
6 2
7 0
8 1
9 1
10 2
11 0
12 1
13 1
14 1
Let Patient # = an arbitrary value.
I am sorry if this is not descriptive enough, but I appreciate any and all help you have to offer! I have the dataset in Excel at the moment, but I can place it into R if that's easier.
We can replicate values the 'Days' with each of the 'Patient' column values in a list, then create a list of the sequence, use Map to construct a data.frame and finally use bind_rows
library(dplyr)
lst1 <- lapply(df[-1], function(x) rep(df$Days, x))
bind_rows(Map(function(x, y, z) setNames(data.frame(x, y),
c("Patient", z)), relist(seq_along(unlist(lst1)),
skeleton = lst1), lst1, sub("Treatment\\s+", "", names(lst1))))
-output
# Patient A B C
#1 1 0 NA NA
#2 2 0 NA NA
#3 3 0 NA NA
#4 4 0 NA NA
#5 5 0 NA NA
#6 6 2 NA NA
#7 7 NA 0 NA
#8 8 NA 1 NA
#9 9 NA 1 NA
#10 10 NA 2 NA
#11 11 NA NA 0
#12 12 NA NA 1
#13 13 NA NA 1
#14 14 NA NA 1
Or another option with reshaping into 'long' and then to 'wide'
library(tidyr)
df %>%
pivot_longer(cols = -Days) %>%
separate(name, into = c('name1', 'name2')) %>%
group_by(name2) %>%
summarise(value = rep(Days, value), .groups = 'drop') %>%
mutate(Patient = row_number()) %>%
pivot_wider(names_from = name2, values_from = value)
-output
# A tibble: 14 x 4
# Patient A B C
# <int> <int> <int> <int>
# 1 1 0 NA NA
# 2 2 0 NA NA
# 3 3 0 NA NA
# 4 4 0 NA NA
# 5 5 0 NA NA
# 6 6 2 NA NA
# 7 7 NA 0 NA
# 8 8 NA 1 NA
# 9 9 NA 1 NA
#10 10 NA 2 NA
#11 11 NA NA 0
#12 12 NA NA 1
#13 13 NA NA 1
#14 14 NA NA 1
data
df <- structure(list(Days = 0:2, `Treatment A` = c(5L, 0L, 1L),
`Treatment B` = c(1L,
2L, 1L), `Treatment C` = c(1L, 3L, 0L)), class = "data.frame", row.names = c(NA,
-3L))

Resources