Replace all zero columns with NA - r

I have the following dataframe in R
library(dplyr)
library(tidyr)
df= data.frame("ID"= c("A", "A", "A", "A", "B", "B", "B"),
"A1"= c(0,0, 0, 0, 1,0,1), "B1"= c(1,0, 1,0, 0, 0, 0))
The dataframe appears as follows
ID A1 B1
1 A 0 1
2 A 0 0
3 A 0 1
4 A 0 0
5 B 1 0
6 B 0 0
7 B 1 0
I would like to obtain the following dataframe
ID A1 B1
1 A NA 1
2 A NA 0
3 A NA 1
4 A NA 0
5 B 1 NA
6 B 0 NA
7 B 1 NA
I have tried the following code
df%>%group_by(ID)%>%
mutate(A1=case_when(sum(A1)==0~NA))%>%
mutate(B1=case_when(sum(B1)==0~NA))
This converts A1 and B1 completely to NA values.
I request some help here.

We can group_by ID and use mutate_all with replace
library(dplyr)
df %>%
group_by(ID) %>%
mutate_all(~replace(., all(. == 0), NA))
# ID A1 B1
# <fct> <dbl> <dbl>
#1 A NA 1
#2 A NA 0
#3 A NA 1
#4 A NA 0
#5 B 1 NA
#6 B 0 NA
#7 B 1 NA
If there are other columns and we want to apply this only to specific columns we can use mutate_at
df %>%
group_by(ID) %>%
mutate_at(vars(A1,B1), ~replace(., all(. == 0), NA))
Using case_when we can do this as
df %>%
group_by(ID) %>%
mutate_all(~case_when(all(. == 0) ~ NA_real_, TRUE ~ .))
The problem in OP's attempt was there was no TRUE case defined in case_when so when no condition is matched it returns NA by default. From ?case_when
If no cases match, NA is returned.
So if we define the TRUE case it would work as expected. Also we should not check for sum(A1)==0 because if there are negative and positive values in the column (like -2 , +2) they would add up to 0 giving unexpected results.
df%>%
group_by(ID) %>%
mutate(A1 = case_when(all(A1 == 0) ~ NA_real_, TRUE ~ A1),
B1 = case_when(all(B1 == 0) ~ NA_real_, TRUE ~ B1))

With tidyverse, we can use if/else
library(tidyverse)
df %>%
group_by(ID) %>%
mutate_all(list(~ if(all(.==0)) NA_integer_ else .))
# ID A1 B1
# <fct> <dbl> <dbl>
#1 A NA 1
#2 A NA 0
#3 A NA 1
#4 A NA 0
#5 B 1 NA
#6 B 0 NA
#7 B 1 NA
Or without any if/else
df %>%
group_by(ID) %>%
mutate_all(~ NA^all(!.) * .)
or using data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) replace(x, all(x == 0), NA)), ID]
Or using base R
by(df[-1], df$ID, FUN = function(x) x * (NA^ !colSums(!!x))[col(x)])

Related

Counting observations and considering condition

I have a database like this:
id <- c(rep(1,3), rep(2, 3), rep(3, 3))
condition <- c(0, 0, 1, 0, 0, 1, 1, 1, 0)
time_point1 <- c(1, 1, NA)
time_point2 <- c(NA, 1, NA)
time_point3 <- c(NA, NA, NA)
time_point4 <- c(1, NA, NA, 1, NA, NA, NA, NA, 1)
data <- data.frame(id, condition, time_point1, time_point2, time_point3, time_point4)
data
id condition time_point1 time_point2 time_point3 time_point4
1 1 0 1 NA NA 1
2 1 0 1 1 NA NA
3 1 1 NA NA NA NA
4 2 0 1 NA NA 1
5 2 0 1 1 NA NA
6 2 1 NA NA NA NA
7 3 1 1 NA NA NA
8 3 1 1 1 NA NA
9 3 0 NA NA NA 1
I want to make a table with how many have the condition == 1 (n_x) and also how many are in each time point (n_t). In case there is none also I want a 0. I tried this:
data %>%
pivot_longer(cols = contains("time_point")) %>%
filter (!is.na(value)) %>%
group_by(name) %>%
mutate(n_t = n_distinct(id)) %>%
ungroup() %>%
filter(condition == 1) %>%
group_by(name) %>%
summarise(n_x = n_distinct(id), n_t = first(n_t))
Obtaining this:
name n_x n_t
<chr> <int> <int>
1 time_point1 1 3
2 time_point2 1 3
Desired Outcome: I want this type of table that considers the cases with condition and without it:
name n_x n_t
1 time_point1 2 6
2 time_point2 1 3
3 time_point3 0 0
4 time_point4 0 3
Thank you!
You can pivot_longer() to be able to group_by() time_points and then summarise just adding up the values. For conditions only sum values where the column values != NA.
data %>%
pivot_longer(cols=c(3:6),names_to = 'point', values_to='values') %>%
group_by(point) %>%
summarise(n_x = sum(condition[!is.na(values)]), n_t = sum(values, na.rm = TRUE))
Output:
# A tibble: 4 x 3
point n_x n_t
<chr> <dbl> <dbl>
1 time_point1 2 6
2 time_point2 1 3
3 time_point3 0 0
4 time_point4 0 3

How do I efficiently recode groups of dummies conditional on one dummy?

I'm trying to recode several dummy variables at once but am struggling to come up with a functioning vectorized solution (alternatively a for loop).
reprex:
library(tidyverse)
library(magrittr)
library(dummies)
library(janitor)
df_raw <- data.frame(
species = as.factor(c("cat", "dog", NA, "dog", "dog")),
weight = rnorm(5, mean = 5, sd = 1),
sex = as.factor(c("m", NA, "f", "f", "m"))
)
df_raw
species weight sex
1 cat 3.025896 m
2 dog 3.223064 <NA>
3 <NA> 5.230367 f
4 dog 4.231511 f
5 dog 5.819032 m
I split the factor variables (species and sex) into dummies but the NA get their own indicators (species_na and sex_na)
df_dummy <- dummies::dummy.data.frame(df_raw,
dummy.classes = "factor",
sep = "_",
omit.constants = TRUE,
all = TRUE) %>%
janitor::clean_names()
species_cat species_dog species_na weight sex_f sex_m sex_na
1 1 0 0 3.025896 0 1 0
2 0 1 0 3.223064 0 0 1
3 0 0 1 5.230367 1 0 0
4 0 1 0 4.231511 1 0 0
5 0 1 0 5.819032 0 1 0
My problem: how do I efficiently recode all of the factor dummies ("indexed" by the prefix, e.g. species_) to NA conditional on value of the _na dummy in the respective group of dummies? In other words, I need to mutate all dummies with the prefix species_ as NA whenever the species_na == 1 etc.
I have come up with the solution below, but I haven't been able to generalize the last step to the entire dataset
factor_vars <- dplyr::select_if(df_raw, is.factor) %>% colnames()
na_labs <- paste(factor_vars,
"na",
sep = "_")
df_dummy <- df_dummy %>%
dplyr::mutate(across(all_of(na_labs),
.fns = list(var = ~ . == 1),
.names = "{fn}_{col}" ))
# --- trial run for one variable only
test <- df_dummy %>%
mutate(species_cat = ifelse(var_species_na == TRUE,
NA,
species_cat))
Any help is appreciated!
How about this?
df_dummy <- df_dummy %>%
mutate(across(c(starts_with("species")), ~ factor(ifelse(species_na == 1, NA, .)))) %>%
mutate(across(c(starts_with("sex")), ~ factor(ifelse(sex_na == 1, NA, .))))
df_dummy
species_cat species_dog species_na weight sex_f sex_m sex_na
1 1 0 0 4.879161 0 1 0
2 0 1 0 5.960176 <NA> <NA> <NA>
3 <NA> <NA> <NA> 5.189566 1 0 0
4 0 1 0 5.165760 1 0 0
5 0 1 0 5.952365 0 1 0
You can try -
library(dplyr)
library(purrr)
df_dummy <- dummies::dummy.data.frame(df_raw,
dummy.classes = "factor",
sep = "_",
omit.constants = TRUE,
all = TRUE) %>%
janitor::clean_names()
factor_vars <- dplyr::select_if(df_raw, is.factor) %>% colnames()
na_labs <- paste(factor_vars,
"na",
sep = "_")
map_dfc(factor_vars, ~df_dummy %>%
select(contains(.x)) %>%
mutate(across(.fns = ~ifelse(.data[[paste0(.x, '_na')]] == 1, NA, .))))
# species_cat species_dog species_na sex_f sex_m sex_na
#1 1 0 0 0 1 0
#2 0 1 0 NA NA NA
#3 NA NA NA 1 0 0
#4 0 1 0 1 0 0
#5 0 1 0 0 1 0
I have a package on github {dplyover} which can create dummy variables in an across-like manner. Below we select all factor variables with where(is.factor) and apply to each column dist_value which is a wrapper around unique which returns all non-NA values. The function in .fns takes each selected column as .x and applies to it each of the unique values from dist_values as .y.
library(dplyr)
library(dplyover) # https://github.com/TimTeaFan/dplyover
df_raw %>%
mutate(crossover(where(is.factor),
dist_values,
.fns = ~ if_else(.y == .x, 1, 0)))
#> species weight sex species_cat species_dog sex_f sex_m
#> 1 cat 5.281178 m 1 0 0 1
#> 2 dog 4.343656 <NA> 0 1 NA NA
#> 3 <NA> 4.555380 f NA NA 1 0
#> 4 dog 4.990039 f 0 1 1 0
#> 5 dog 4.988497 m 0 1 0 1
Created on 2021-09-13 by the reprex package (v2.0.1)

Replacing all values to 1 after a condition

My current data is like below,
df<-data.frame(id=c(1:5),t1=c(NA,1,0,0,0),t2=c(0,1,0,1,0),
t3=c(NA,0,0,0,1),t4=c(NA,NA,NA,0,0))
And the way I'm trying to restructure this is,
for each id, if there's a "1" in that row, all the 0s in the subsequent columns would change to 1. (but leaving the NA as an NA).
So for id#1, nothing would change since there's no 1 in that row, but for id#2, after 1 in the column t2, any 0s afterwards would be replaced by 1.
i.e., this is what I'm trying to get at the end:
final<-data.frame(id=c(1:5),t1=c(0,1,0,0,0),t2=c(0,1,0,1,0),
t3=c(NA,1,0,1,1),t4=c(NA,NA,NA,1,1))
I've been trying different ways but nothing seems to work... I'd really appreciate any help!!!
In base R we can apply the cummax by row after changing the NA to a lower value and then replace the value back to NA
df[-1] <- t(apply(replace(df[-1], is.na(df[-1]), -999), 1, cummax)) *
NA^(is.na(df[-1]))
df
# id t1 t2 t3 t4
#1 1 NA 0 NA NA
#2 2 1 1 1 NA
#3 3 0 0 0 NA
#4 4 0 1 1 1
#5 5 0 0 1 1
Or use rowCummaxs from matrixStats
library(matrixStats)
df[-1] <- rowCummaxs(as.matrix(replace(df[-1], is.na(df[-1]), -999))) *
NA^(is.na(df[-1]))
With tidyverse you can try:
library(tidyverse)
df %>%
pivot_longer(cols = starts_with("t"), names_to = "Time", values_to = "Value") %>%
group_by(id) %>%
mutate(Cummax = cummax(Value)) %>%
mutate(Value = replace(Value, Value == 0 & Cummax == 1, 1)) %>%
pivot_wider(id_cols = id, names_from = "Time", values_from = "Value")
Output
# A tibble: 5 x 5
# Groups: id [5]
id t1 t2 t3 t4
<int> <dbl> <dbl> <dbl> <dbl>
1 1 NA 0 NA NA
2 2 1 1 1 NA
3 3 0 0 0 NA
4 4 0 1 1 1
5 5 0 0 1 1
Another approach in base R using apply row-wise could be to find out column number where first 1 occurs and replace all the 0 values after it with 1.
df[-1] <- t(apply(df[-1], 1, function(x) {
a_id <- which(x == 1)[1]
if(length(a_id) > 0)
replace(x, x == 0 & seq_along(x) > a_id, 1)
else x
}))
df
# id t1 t2 t3 t4
#1 1 NA 0 NA NA
#2 2 1 1 1 NA
#3 3 0 0 0 NA
#4 4 0 1 1 1
#5 5 0 0 1 1

Mutating based on multiple columns in a data frame

Ok so my dataframe looks like this let's call if df
KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4
1 120 100 NA 110 1 1 NA 1 NA NA NA NA
2 100 NA 115 NA NA NA NA NA Y N Y N
So what I'm trying to do is make it so that when an A columns has a value of 100 and the corresponding B or C column has a value of 1 or "Y" respectively that makes a new column with a X with a value of 1. In Row 1 that would be A2 and B2 and in row that would be A1 and C1.
I tried doing three sets of gather and then using the mutate function using case_when. like so
df<- df %>%
gather(key="A",value="code",dx)%>%
gather(key="B",value="number",dxadm)%>%
gather(key="C",value="character",dxpoa) %>%
mutate(X=case_when(
code == 100 & present >0 ~ 1,
code ==100 & character == "Y"~1)
)
Except my spread function of these rows came back with rows all array and my X out of place.
Alternatively, I considered something like
df <- df %>%
mutate(X=case_when(
A1 == 100 & B1 >0 ~ 1,
A1 ==100 & C1 == "Y"~1,
A2 == 100 & B2 >0 ~ 1,
A2 ==100 & C2 == "Y"~1,)
and so on for all permutations. The two problems with this are that I have a lot of columns and I'd like to this for multiple different values of A.
Can anyone recommend an alternative or at least a way to make the second solution into something that would only require one annoying long piece of code that I could make into a more generalizable function? Thanks!
A suggestion
require(read.so) #awesome package to read from Stackoverflow,
# available on GitHub [https://alistaire47.github.io/read.so/][1]
require(tidyr)
require(reshape2)
require(dplyr)
dat <- read.so()
dat %>% gather(var, value, 2:13) %>% #make it long
mutate(var = gsub('([A-Z])', '\\1_', .[['var']])) %>% #add underscore
separate(var, c('var', 'number') ) %>% #separate your column
dcast(KEY+number ~ var) %>% #dcast is a bit complex but quite powerful
group_by(KEY) %>%
filter(A == 100)
# A tibble: 2 x 5
# Groups: KEY [2]
KEY number A B C
<int> <chr> <chr> <chr> <chr>
1 1 2 100 1 <NA>
2 2 1 100 <NA> Y
A solution using dplyr and tidyr. We can gather all the columns except KEY, separate the letters and numbers, and then spread the letter so that we can create the X column without specifying the numbers. Notice that I assume if the condition is not met, X would be 0, and based on your description, I used any(A %in% 100 & (B %in% 1 | C %in% "Y")) to test the condition as any given numbers met the condition, X would be 1.
library(dplyr)
library(tidyr)
df2 <- df %>%
gather(Column, Value, -KEY) %>%
separate(Column, into = c("Letter", "Number"), sep = 1) %>%
spread(Letter, Value, convert = TRUE) %>%
group_by(KEY) %>%
mutate(X = ifelse(any(A %in% 100 & (B %in% 1 | C %in% "Y")), 1L, 0L))
df2 %>% as.data.frame()
# KEY Number A B C X
# 1 1 1 120 1 <NA> 1
# 2 1 2 100 1 <NA> 1
# 3 1 3 NA NA <NA> 1
# 4 1 4 110 1 <NA> 1
# 5 2 1 100 NA Y 1
# 6 2 2 NA NA N 1
# 7 2 3 115 NA Y 1
# 8 2 4 NA NA N 1
I think the structure of df2 is good, but if you really want the original structure, we can do the following.
df3 <- df2 %>%
gather(Letter, Value, A:C) %>%
unite(Column, Letter, Number, sep = "") %>%
spread(Column, Value) %>%
select(names(df), X)
df3 %>% as.data.frame()
# KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4 X
# 1 1 120 100 <NA> 110 1 1 <NA> 1 <NA> <NA> <NA> <NA> 1
# 2 2 100 <NA> 115 <NA> <NA> <NA> <NA> <NA> Y N Y N 1
df3 is the final output.
DATA
df <- read.table(text = "KEY A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4
1 120 100 NA 110 1 1 NA 1 NA NA NA NA
2 100 NA 115 NA NA NA NA NA Y N Y N",
header = TRUE, stringsAsFactors = FALSE)
Same idea as Tjebo, but sticking to the tidyverse....
library(tidyverse)
dat <- data.frame(stringsAsFactors=FALSE,
KEY = c(1L, 2L),
A1 = c(120L, 100L),
A2 = c(100L, NA),
A3 = c(NA, 115L),
A4 = c(110L, NA),
B1 = c(1L, NA),
B2 = c(1L, NA),
B3 = c(NA, NA),
B4 = c(1L, NA),
C1 = c(NA, "Y"),
C2 = c(NA, "N"),
C3 = c(NA, "Y"),
C4 = c(NA, "N"))
dat %>%
gather(var, value, -KEY) %>% #make it long
extract(var, regex = "(.)(.)", into = c("var", "number") ) %>%
spread(var, value) %>%
filter( A %in% 100 )
#> KEY number A B C
#> 1 1 2 100 1 <NA>
#> 2 2 1 100 <NA> Y
Created on 2018-02-27 by the reprex package (v0.2.0).

Run length sequence by time and ID

This problem does not seem to have been put out here before.
I want to find the number of subjects that score 1 for 6 consecutive hours.
The subjects have not been scored every hour so if an hour is missing the hours are not consecutive and the output for that 6-hour period should be NA.
The reason for assigning NA would be that we do not know how the subject has scored on the missing hour. This problem can be used to count consecutive hits, but only count it if a subject has participated.
My dataframe looks like this:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)
I have tried to use the rle function (I am sure its possible) but I cannot get it to condition on both hour and ID.
The output would be like this:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
Thank you in advance.
I believe the original data set I gave was too small to make the solutions more generalizable.
I just tried the codes with this dataset and found that this will result in a wrong result.
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)
For the new dataset the output should be:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
Here is an approach in tidyverse with the updated data set:
library(tidyverse)
df %>%
group_by(ID) %>%
expand(hour = seq(min(hour), max(hour))) %>%
left_join(df) %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
filter(!is.na(A)) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2
check requested output:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
all.equal(df, df_out2)
#output
TRUE
The old answer:
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2
Lets check if the result is like requested:
ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)
df1 is requested output
all.equal(df1, df_out2)
#output
TRUE
some benchmarking:
library(microbenchmark)
library(data.table)
akrun <- function(df){
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
missuse <- function(df){
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six)
}
Mike <- function(df){
ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))})
}
microbenchmark(Mike(df),
akrun(df),
missuse(df))
#output
Unit: microseconds
expr min lq mean median uq max neval
Mike(df) 491.291 575.7115 704.2213 597.7155 629.0295 9578.684 100
akrun(df) 6568.313 6725.5175 7867.4059 6843.5790 7279.2240 69790.755 100
missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935 100
way to go Mike H.!
To get the groupings you can compare the current hour to the lagged hour to see if they are "consecutive" or 1 integer apart and then take the cumsum of that. Once you have the groupings you can use a simple ave to get the output you want.
library(data.table)
df$six <- ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))}
)
df
# ID hour A six
#1 1 1 0 NA
#2 1 2 1 NA
#3 1 3 0 NA
#4 1 7 1 0
#5 1 8 1 0
#6 1 9 1 0
#7 1 10 1 0
#8 1 11 1 0
#9 1 12 1 1
#10 1 17 0 NA
#11 1 18 0 NA
#12 1 19 0 NA
#13 2 1 1 0
#14 2 2 1 0
#15 2 3 1 0
#16 2 4 1 0
#17 2 5 1 0
#18 2 6 1 1
#19 2 8 1 NA
#20 2 9 1 NA
#21 2 15 1 NA
If you only want to select a patient at most once, this will select the last time period:
df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
if(all(x==1)) return(c(rep(0, length(x) - 1), 1))
else return(x)}
)
We can use rleid from data.table. Create a grouping column with run-length-id of 'A' ('grp'). Take the difference of 'hour' with the next value of 'hour', check if it is equal to 1 and multiply with 'A' to create 'Anew'. Grouped by run-length-id of 'Anew' and 'grp', if the sum of 'A' is greater than a particular value, replicate with 0s and 1s or else return NA. In the last phase, assign some spillover NAs to 0 by creating an index ('i1')
library(data.table)
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
# ID hour A six sixnew
# 1: 1 1 0 NA NA
# 2: 1 2 1 NA NA
# 3: 1 3 0 NA NA
# 4: 1 7 1 0 0
# 5: 1 8 1 0 0
# 6: 1 9 1 0 0
# 7: 1 10 1 0 0
# 8: 1 11 1 0 0
# 9: 1 12 1 1 1
#10: 1 17 0 NA NA
#11: 1 18 0 NA NA
#12: 1 19 0 NA NA
#13: 2 1 1 0 0
#14: 2 2 1 0 0
#15: 2 3 1 0 0
#16: 2 4 1 0 0
#17: 2 5 1 0 0
#18: 2 6 1 1 1
#19: 2 8 1 NA NA
#20: 2 9 1 NA NA
#21: 2 15 1 NA NA
Or slightly more compact option would be
i1 <- setDT(df)[, if(sum(A)>= 5) .I[.N] , rleid(c(TRUE, diff(hour)==1), A)]$V1
df[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
df
# ID hour A six sixnew
# 1: 1 1 0 NA NA
# 2: 1 2 1 NA NA
# 3: 1 3 0 NA NA
# 4: 1 7 1 0 0
# 5: 1 8 1 0 0
# 6: 1 9 1 0 0
# 7: 1 10 1 0 0
# 8: 1 11 1 0 0
# 9: 1 12 1 1 1
#10: 1 17 0 NA NA
#11: 1 18 0 NA NA
#12: 1 19 0 NA NA
#13: 2 1 1 0 0
#14: 2 2 1 0 0
#15: 2 3 1 0 0
#16: 2 4 1 0 0
#17: 2 5 1 0 0
#18: 2 6 1 1 1
#19: 2 8 1 NA NA
#20: 2 9 1 NA NA
#21: 2 15 1 NA NA
Benchmarks
Created a slightly bigger dataset and tested the solutions
-data
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
ID <- rep(1:5000, rep(c(12, 9), 2500))
A <- rep(A, 2500)
hour <- rep(hour, 2500)
dftest <- data.frame(ID, hour, A)
-functions
akrun <- function(df){
df1 <- copy(df)
setDT(df1)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1))
else NA_real_,.(rleid(Anew), grp)]
i1 <- df1[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df1[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
akrun2 <- function(df) {
df1 <- copy(df)
i1 <- setDT(df1)[, if(sum(A)>= 5) .I[.N] , rleid(c(TRUE, diff(hour)==1), A)]$V1
df1[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
}
missuse <- function(df){
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six)
}
Mike <- function(df){
ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))})
}
-benchmark
microbenchmark(Mike(dftest),
akrun(dftest),
akrun2(dftest),
missuse(dftest),
times = 10L, unit = 'relative')
-output
#Unit: relative
# expr min lq mean median uq max neval cld
# Mike(dftest) 1.682794 1.754494 1.673811 1.68806 1.632765 1.640221 10 a
# akrun(dftest) 13.159245 12.950117 12.176965 12.33716 11.856271 11.095228 10 b
# akrun2(dftest) 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 10 a
# missuse(dftest) 37.899905 36.773837 34.726845 34.87672 33.155939 30.665840 10 c

Resources