Counting observations and considering condition - r

I have a database like this:
id <- c(rep(1,3), rep(2, 3), rep(3, 3))
condition <- c(0, 0, 1, 0, 0, 1, 1, 1, 0)
time_point1 <- c(1, 1, NA)
time_point2 <- c(NA, 1, NA)
time_point3 <- c(NA, NA, NA)
time_point4 <- c(1, NA, NA, 1, NA, NA, NA, NA, 1)
data <- data.frame(id, condition, time_point1, time_point2, time_point3, time_point4)
data
id condition time_point1 time_point2 time_point3 time_point4
1 1 0 1 NA NA 1
2 1 0 1 1 NA NA
3 1 1 NA NA NA NA
4 2 0 1 NA NA 1
5 2 0 1 1 NA NA
6 2 1 NA NA NA NA
7 3 1 1 NA NA NA
8 3 1 1 1 NA NA
9 3 0 NA NA NA 1
I want to make a table with how many have the condition == 1 (n_x) and also how many are in each time point (n_t). In case there is none also I want a 0. I tried this:
data %>%
pivot_longer(cols = contains("time_point")) %>%
filter (!is.na(value)) %>%
group_by(name) %>%
mutate(n_t = n_distinct(id)) %>%
ungroup() %>%
filter(condition == 1) %>%
group_by(name) %>%
summarise(n_x = n_distinct(id), n_t = first(n_t))
Obtaining this:
name n_x n_t
<chr> <int> <int>
1 time_point1 1 3
2 time_point2 1 3
Desired Outcome: I want this type of table that considers the cases with condition and without it:
name n_x n_t
1 time_point1 2 6
2 time_point2 1 3
3 time_point3 0 0
4 time_point4 0 3
Thank you!

You can pivot_longer() to be able to group_by() time_points and then summarise just adding up the values. For conditions only sum values where the column values != NA.
data %>%
pivot_longer(cols=c(3:6),names_to = 'point', values_to='values') %>%
group_by(point) %>%
summarise(n_x = sum(condition[!is.na(values)]), n_t = sum(values, na.rm = TRUE))
Output:
# A tibble: 4 x 3
point n_x n_t
<chr> <dbl> <dbl>
1 time_point1 2 6
2 time_point2 1 3
3 time_point3 0 0
4 time_point4 0 3

Related

summarise by group returns 0 instead of NA if all values are NA

library(dplyr)
dat <-
data.frame(id = rep(c(1,2,3,4), each = 3),
value = c(NA, NA, NA, 0, 1, 2, 0, 1, NA, 1, 2,3))
dat %>%
dplyr::group_by(id) %>%
dplyr::summarise(value_sum = sum(value, na.rm = T))
# A tibble: 4 x 2
id value_sum
1 0
2 3
3 1
4 6
Is there any way I can return NA if all the entries in a group are NA. For e.g. id 1 has all the entries as NA so I want the value_sum to be NA as well.
# A tibble: 4 x 2
id value_sum
1 NA
2 3
3 1
4 6
One way is to use an if/else statement: If all is Na return NA else return sum():
dat %>%
dplyr::group_by(id) %>%
#dplyr::summarise(value_sum = sum(value, na.rm = F)) %>%
summarise(number = if(all(is.na(value))) NA_real_ else sum(value, na.rm = TRUE))
id number
<dbl> <dbl>
1 1 NA
2 2 3
3 3 1
4 4 6
We could use fsum
library(collapse)
fsum(dat$value, g = dat$id)
1 2 3 4
NA 3 1 6
Or with dplyr
library(dplyr)
dat %>%
group_by(id) %>%
summarise(number = fsum(value))
# A tibble: 4 × 2
id number
<dbl> <dbl>
1 1 NA
2 2 3
3 3 1
4 4 6

Get the rowwise minimum of certain columns excluding 0 and NA

I have made a very complex solution to something I feel should have a much simpler solution.
In short what I want:
I want to compute a new column containing the minimum value across 3 columns
I want to ignore zeros and NAs
If I only have zeros and NAs I want a zero
If I have only NAs I want a NA
Here is my solution, it works, but it is very complex and produces a warning.
> library(dplyr)
> df <- data.frame(
+ id = c(1, 2, 3, 4),
+ test1 = c( NA, NA, 2 , 3),
+ test2 = c( NA, 0, 1 , 1),
+ test3 = c(NA, NA, 0 , 2)
+ )
> df2 <- df %>%
+ mutate(nieuw = apply(across(test1:test3), 1, function(x) min(x[x>0]))) %>%
+ rowwise() %>%
+ mutate(nieuw = if_else(is.na(nieuw), max(across(test1:test3), na.rm = TRUE), nieuw)) %>%
+ mutate(nieuw = ifelse(is.infinite(nieuw), NA, nieuw))
> df
id test1 test2 test3
1 1 NA NA NA
2 2 NA 0 NA
3 3 2 1 0
4 4 3 1 2
> df2
# A tibble: 4 x 5
# Rowwise:
id test1 test2 test3 nieuw
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA
2 2 NA 0 NA 0
3 3 2 1 0 1
4 4 3 1 2 1
Warning message:
Problem while computing `nieuw = if_else(...)`.
i no non-missing arguments to max; returning -Inf
i The warning occurred in row 1.
You can create a helper function and then apply it rowwise:
library(dplyr)
safe <- function(x, f, ...) ifelse(all(is.na(x)), NA,
ifelse(all(is.na(x) | x == 0),
0, f(x[x > 0], na.rm = TRUE, ...)))
df %>%
rowwise() %>%
mutate(a = safe(c_across(test1:test3), min))
# A tibble: 4 × 5
# Rowwise:
id test1 test2 test3 a
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA
2 2 NA 0 NA 0
3 3 2 1 0 1
4 4 3 1 2 1
Here is another option. It leverages making zeros and NA's very large and then recodes them at the end:
library(tidyverse)
get_min <- function(data, cols){
data[is.na(data)] <- 1e6
data[data == 0] <- 1e5
nums <- do.call(pmin, select(data, all_of(cols)))
recode(nums, `1e+06` = NA_real_, `1e+05` = 0.)
}
df %>%
mutate(nieuw = get_min(., c("test1", "test2", "test3")))
#> id test1 test2 test3 nieuw
#> 1 1 NA NA NA NA
#> 2 2 NA 0 NA 0
#> 3 3 2 1 0 1
#> 4 4 3 1 2 1

A computation efficient way to find the IDs of the Type 1 rows just above and below each Type 2 rows?

I have the following data
df <- tibble(Type=c(1,2,2,1,1,2),ID=c(6,4,3,2,1,5))
Type ID
1 6
2 4
2 3
1 2
1 1
2 5
For each of the type 2 rows, I want to find the IDs of the type 1 rows just below and above them. For the above dataset, the output will be:
Type ID IDabove IDbelow
1 6 NA NA
2 4 6 2
2 3 6 2
1 2 NA NA
1 1 NA NA
2 5 1 NA
Naively, I can write a for loop to achieve this, but that would be too time consuming for the dataset I am dealing with.
One approach using dplyr lead,lag to get next and previous value respectively and data.table's rleid to create groups of consecutive Type values.
library(dplyr)
library(data.table)
df %>%
mutate(IDabove = ifelse(Type == 2, lag(ID), NA),
IDbelow = ifelse(Type == 2, lead(ID), NA),
grp = rleid(Type)) %>%
group_by(grp) %>%
mutate(IDabove = first(IDabove),
IDbelow = last(IDbelow)) %>%
ungroup() %>%
select(-grp)
# Type ID IDabove IDbelow
# <dbl> <dbl> <dbl> <dbl>
#1 1 6 NA NA
#2 2 4 6 2
#3 2 3 6 2
#4 1 2 NA NA
#5 1 1 NA NA
#6 2 5 1 NA
A dplyr only solution:
You could create your own rleid function then apply the logic provided by Ronak(Many thanks. Upvoted).
library(dplyr)
my_func <- function(x) {
x <- rle(x)$lengths
rep(seq_along(x), times=x)
}
# this part is the same as provided by Ronak.
df %>%
mutate(IDabove = ifelse(Type == 2, lag(ID), NA),
IDbelow = ifelse(Type == 2, lead(ID), NA),
grp = my_func(Type)) %>%
group_by(grp) %>%
mutate(IDabove = first(IDabove),
IDbelow = last(IDbelow)) %>%
ungroup() %>%
select(-grp)
Output:
Type ID IDabove IDbelow
<dbl> <dbl> <dbl> <dbl>
1 1 6 NA NA
2 2 4 6 2
3 2 3 6 2
4 1 2 NA NA
5 1 1 NA NA
6 2 5 1 NA

Last observation carried forward conditional on value and colums

I have a longhitudinal dataframe with a lot of missing values that looks like this.
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)
I would like to carry forward the last observation based on two conditions:
1) when cond=0 it should carry on the observation the higher value of the variable of interest.
2) when cond=1 it should carry forward the lower value of the variable of interest.
Does anyone have an idea on how I could do this in an elegant way?
The final dataset should look like this
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, 1 , 2, 0, 0, NA, 3, 3, 0, 0,2,2,2,2,2)
final = data.frame(ID, date, cond,var)
So far I was able to carry forward the last observation, but I was unable to impose the conditions
library(zoo)
df <- df %>%
group_by(ID) %>%
mutate(var =
na.locf(var, na.rm = F))
any suggestion is welcomed
This is the use of accumulate2 ie
df%>%
group_by(ID)%>%
mutate(d = unlist(accumulate2(var,cond[-1],function(z,x,y) if(y) min(z,x,na.rm=TRUE) else max(z,x,na.rm=TRUE))))
# A tibble: 15 x 5
# Groups: ID [3]
ID date cond var d
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 2
14 3 4 0 NA 2
15 3 5 0 NA 2
I think, if I understand what you are after is this?
ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)
Using case_when you can do some conditional checks. I'm unsure if you mean to return the minimum for all of the "ID" field, but this will look at the condition and then lag or lead to find a non missing value
library(dplyr)
df %>%
mutate(var_imput = case_when(
cond == 0 & is.na(var)~lag(x = var, n = 1, default = NA),
cond == 1 & is.na(var)~lead(x = var, n = 1, default = NA),
TRUE~var
))
Which yields:
ID date cond var var_imput
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 1
14 3 4 0 NA 1
15 3 5 0 NA NA
If you want to group by ID then you could generate an impute table by ID, then join it with the original table like this:
# enerate input table
input_table <- df %>%
group_by(ID) %>%
summarise(min = min(var, na.rm = T),
max = max(var, na.rm = T)) %>%
gather(cond, value, -ID) %>%
mutate(cond = ifelse(cond == "min", 0, 1))
# Join and impute missing
df %>%
left_join(input_table,by = c("ID", "cond")) %>%
mutate(var_imput = ifelse(is.na(var), value, var))

How to merge variables looping through by variable number in R

I have a dataframe with a lot of variables seen in multiple conditions. I'd like to merge each variable by condition.
The example data frame is a simplified version of what I have (3 variables over 2 conditions).
VAR.B_1 <- c(1, 2, 3, 4, 5, 'NA', 'NA', 'NA', 'NA', 'NA')
VAR.B_2 <- c(2, 2, 3, 4, 5,'NA', 'NA', 'NA', 'NA', 'NA')
VAR.B_3 <- c(1, 1, 1, 1, 1,'NA', 'NA', 'NA', 'NA', 'NA')
VAR.E_1 <- c(NA, NA, NA, NA, NA, 1, 1, 1, 1, 1)
VAR.E_2 <- c(NA, NA, NA, NA, NA, 1, 2, 3, 4, 5)
VAR.E_3 <- c(NA, NA, NA, NA, NA, 1, 1, 1, 1, 1)
Condition <- c("B", "B","B","B","B","E","E","E","E","E")
#Example dataset
data<-as.data.frame(cbind(VAR.B_1,VAR.B_2,VAR.B_3, VAR.E_1,VAR.E_2, VAR.E_3, Condition))
I want to end up with this, appended to the original data frame:
VAR_1 VAR_2 VAR_3
1 2 1
2 2 1
3 3 1
4 4 1
5 5 1
1 1 1
1 2 1
1 3 1
1 4 1
1 5 1
I understand that R won't work with i inside the variable name, but I have an example of the kind of for loop I was trying to do. I would rather not call variables by column location, since there will be a lot of variables.
##Example of how I want to merge - this code does not work
for(i in 1:3) {
data$VAR_[,i] <-ifelse(data$Condition == "B", VAR.B_[,i],
ifelse(data$Condition == "E", VAR.E_[,i], NA))
}
This might work for your situation:
library(tidyverse)
library(stringr)
data %>%
mutate_all(as.character) %>%
gather(key, value, -Condition) %>%
filter(!is.na(value), value != "NA") %>%
mutate(key = str_replace(key, paste0("\\.", Condition), "")) %>%
group_by(Condition, key) %>%
mutate(rowid = 1:n()) %>%
spread(key, value) %>%
bind_cols(data)
#> # A tibble: 10 x 12
#> # Groups: Condition [2]
#> Condition rowid VAR_1 VAR_2 VAR_3 VAR.B_1 VAR.B_2 VAR.B_3 VAR.E_1
#> <chr> <int> <chr> <chr> <chr> <fctr> <fctr> <fctr> <fctr>
#> 1 B 1 1 2 1 1 2 1 NA
#> 2 B 2 2 2 1 2 2 1 NA
#> 3 B 3 3 3 1 3 3 1 NA
#> 4 B 4 4 4 1 4 4 1 NA
#> 5 B 5 5 5 1 5 5 1 NA
#> 6 E 1 1 1 1 NA NA NA 1
#> 7 E 2 1 2 1 NA NA NA 1
#> 8 E 3 1 3 1 NA NA NA 1
#> 9 E 4 1 4 1 NA NA NA 1
#> 10 E 5 1 5 1 NA NA NA 1
#> # ... with 3 more variables: VAR.E_2 <fctr>, VAR.E_3 <fctr>,
#> # Condition1 <fctr>
data.frame(lapply(split.default(data[-NCOL(data)], gsub("\\D+", "", head(names(data), -1))),
function(a){
a = sapply(a, function(x) as.numeric(as.character(x)))
rowSums(a, na.rm = TRUE)
}))
# X1 X2 X3
#1 1 2 1
#2 2 2 1
#3 3 3 1
#4 4 4 1
#5 5 5 1
#6 1 1 1
#7 1 2 1
#8 1 3 1
#9 1 4 1
#10 1 5 1
#Warning messages:
#1: In FUN(X[[i]], ...) : NAs introduced by coercion
#2: In FUN(X[[i]], ...) : NAs introduced by coercion
#3: In FUN(X[[i]], ...) : NAs introduced by coercion
Your data appears to have two kinds of NA values in it. It has NA, or R's NA value, and it also has the string 'NA'. In my solution below, I replace both with zero, cast each column in the data frame to numeric, and then just sum together like-numbered VAR columns. Then, drop the original columns which you don't want anymore.
data <- as.data.frame(cbind(VAR.B_1,VAR.B_2,VAR.B_3, VAR.E_1,VAR.E_2, VAR.E_3),
stringsAsFactors=FALSE)
data[is.na(data)] <- 0
data[data == 'NA'] <- 0
data <- as.data.frame(lapply(data, as.numeric))
data$VAR_1 <- data$VAR.B_1 + data$VAR.E_1
data$VAR_2 <- data$VAR.B_2 + data$VAR.E_2
data$VAR_3 <- data$VAR.B_3 + data$VAR.E_3
data <- data[c("VAR_1", "VAR_2", "VAR_3")]
Demo

Resources