Rowwise indicator if all observations are NA - r

I want to indicate if all observations in a given row are NA. For example, with the following data:
# Example data: four rows and three numeric columns; rows 3 and 4 are all NA.
dat <- tibble::tibble(
  x = c(1, 1, NA, NA),
  y = c(2, 2, NA, NA),
  z = c(NA, 3, NA, NA)
)
dat
# A tibble: 4 x 3
x y z
<dbl> <dbl> <dbl>
1 1 2 NA
2 1 2 3
3 NA NA NA
4 NA NA NA
I want to create a new column (allisna) to indicate if all observations are NA. Note: I want to do this using dplyr (if needed, I can use other tidyverse functions, but not base R functions like apply()).
I have the following solution, but I prefer a solution that uses rowwise() and another dplyr function call inside of mutate.
library(dplyr)
# Flag each row: 1 when every value in the row is NA, 0 otherwise.
# apply() iterates over the rows of `dat`; the earlier version passed an
# object named `tmp`, which is never defined.
dat %>%
  mutate(allisna = apply(dat, 1, function(x) {
    case_when(
      all(is.na(x)) ~ 1,
      TRUE ~ 0
    )
  }))
The final product should be:
# A tibble: 4 x 4
x y z allisna
<dbl> <dbl> <dbl> <dbl>
1 1 2 NA 0
2 1 2 3 0
3 NA NA NA 1
4 NA NA NA 1

in base R without using apply you can do
# A row is all-NA when its count of NA cells equals the number of columns;
# the unary plus turns the logical result into 0/1.
dat$allisna <- +(rowSums(is.na(dat)) == ncol(dat))

Figured out one solution (but am open to and will reward more!):
# Row by row: 1 when every value in the row is NA, otherwise 0.
dat %>%
  rowwise() %>%
  mutate(allisna = as.numeric(all(is.na(c_across(everything())))))
EDIT to include @RonakShah's answer:
# Vectorised version: a row is all-NA when its NA count equals the column count.
dat %>%
  mutate(allisna = as.numeric(rowSums(is.na(.)) == ncol(.)))

Related

How to rank a variable in a column based on a conditional, when there are NAs in the column

I have a longitudinal data set with two people in which the rows of data are numbered as 'episodes', and some episodes have a test 'result'. The goal of the below code is to:
Create binary variable 'sup' to evaluate a 'result'. If result == NA, then sup == NA. This code works.
Create sup_rank to enumerate the occurrence of sup == 1 within people who had an occurrence of sup==1. In other words, I want to know if this is the first time, second time, etc. that sup==1. Problem: This code currently does not work since person 2's first sup==1 is ranked as '2' (when it should be ranked as '1').
Create an event variable that:
equals 1 if sup_rank==1
equals 0 if sup == 0 OR sup_rank does not equal 1
equals NA if sup (and thus sup_rank) equals NA
Currently I tried to do #3 in two steps with event and event final. Problem: it does not work because 'sup_rank' does not work, but regardless, it would be ideal to create 'event' as one variable (and not need an 'event_final').
# Reproducible example for the question: builds a small longitudinal data set
# and derives sup, sup_rank, event and event_final per person.
# Load packages
pacman::p_load(dplyr)
# Create variables for data set
person <- c(1, 1, 2, 2, 2, 2, 2, 2, 2, 2)
episode <- c(1, 2, 1, 2, 3, 4, 5, 6, 7, 8)
result <- c(NA, NA, NA, 1, NA, 2, NA, 2, NA, 2)
# Populate data frame with variables
d <- cbind(person, episode, result)
d <- as.data.frame(d)
# Manipulate data frame to create 4 new variables
d1 <- d %>%
# Need to create new variables within each person
group_by(person) %>%
# Need to correctly order the rows of data before creating the variables
arrange(person, episode) %>%
# Create variable to evaluate 'result': 1 if result == 2, 0 otherwise, NA if result is NA
mutate(sup = if_else(result == 2, 1, 0, NA_real_)) %>%
# If sup == 1, rank it.
# NOTE: rank() is applied to the logical vector `sup == 1`. FALSE sorts before
# TRUE, so a row with sup == 0 takes rank 1 and the first sup == 1 row of
# person 2 is ranked 2 instead of 1 (the problem described in the question).
mutate(sup_rank = ifelse(sup == 1, rank(sup == 1, na.last = 'keep', ties.method = 'first'), NA_real_)) %>%
# Create an event if the rank of the sup == 1 is equal to 1 (we want the initial suppression)
mutate(event = if_else(sup_rank == 1, 1, 0, NA_real_)) %>%
# Now override the value of event to be equal to 0 if sup == 0
mutate(event_final = if_else(sup == 0, 0, event)) %>%
arrange(person, episode)
print(d1)
#> # A tibble: 10 x 7
#> # Groups: person [2]
#> person episode result sup sup_rank event event_final
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 NA NA NA NA NA
#> 2 1 2 NA NA NA NA NA
#> 3 2 1 NA NA NA NA NA
#> 4 2 2 1 0 NA NA 0
#> 5 2 3 NA NA NA NA NA
#> 6 2 4 2 1 2 0 0
#> 7 2 5 NA NA NA NA NA
#> 8 2 6 2 1 3 0 0
#> 9 2 7 NA NA NA NA NA
#> 10 2 8 2 1 4 0 0
Created on 2022-04-20 by the reprex package (v2.0.0)
There is a more efficient way to do this for sure, but in the meantime, here's a solution I created:
# Load dplyr through pacman
pacman::p_load(dplyr)

# Raw inputs: one row per person/episode, with an optional test result
person <- c(1, 1, 2, 2, 2, 2, 2, 2, 2, 2)
episode <- c(1, 2, 1, 2, 3, 4, 5, 6, 7, 8)
result <- c(NA, NA, NA, 1, NA, 2, NA, 2, NA, 2)

# Bind the vectors column-wise and turn the matrix into a data frame
d <- as.data.frame(cbind(person, episode, result))

# Derive the five new variables within each person, in episode order
d1 <- d %>%
  group_by(person) %>%
  arrange(person, episode) %>%
  mutate(
    # 1 when result is 2, 0 for any other result, NA when result is missing
    sup = if_else(result == 2, 1, 0, missing = NA_real_),
    # 1 only where sup == 1; everything else becomes NA so it is not ranked
    sup_flag = if_else(sup == 1, 1, NA_real_, missing = NA_real_),
    # order of occurrence of sup == 1 within the person (ties broken by position)
    sup_rank = ifelse(
      sup == 1,
      rank(sup_flag, na.last = "keep", ties.method = "first"),
      NA_real_
    ),
    # 1 for the first sup == 1, 0 for later ones, NA where sup_rank is NA
    event = if_else(sup_rank == 1, 1, 0, missing = NA_real_),
    # rows with sup == 0 count as "no event" rather than NA
    event_final = if_else(sup == 0, 0, event)
  ) %>%
  arrange(person, episode)

print(d1)
#> # A tibble: 10 x 8
#> # Groups: person [2]
#> person episode result sup sup_flag sup_rank event event_final
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 NA NA NA NA NA NA
#> 2 1 2 NA NA NA NA NA NA
#> 3 2 1 NA NA NA NA NA NA
#> 4 2 2 1 0 NA NA NA 0
#> 5 2 3 NA NA NA NA NA NA
#> 6 2 4 2 1 1 1 1 1
#> 7 2 5 NA NA NA NA NA NA
#> 8 2 6 2 1 1 2 0 0
#> 9 2 7 NA NA NA NA NA NA
#> 10 2 8 2 1 1 3 0 0
Created on 2022-04-22 by the reprex package (v2.0.0)

compare sets of columns in R dataframe and keep one value from each set of two columns

Basically, I have a large dataset with many different variables. The columns come in pairs (2019 and 2020). For some variables data is missing in both years, for some it is available only in 2019, and for some only in 2020. I would like the 2020 value to 'override' the 2019 value whenever a 2020 value is available; if only the 2019 value is available it should be kept, and if no data is available for either year the result should stay missing. I currently do this with a little helper function, but it needs to be more scalable so that I can apply it to 200+ column pairs. What am I missing in mutate(across(...))?
# Create data
# Example data: three variables, each recorded as a (2019, 2020) column pair.
# Logical values are spelled TRUE/FALSE; T and F are ordinary variables that
# can be reassigned, so they are not safe shorthands.
mydf <- tibble(
  ID = 1:5,
  var1_2019 = c(9, NA, 3, 2, NA),
  var1_2020 = c(NA, NA, 3, 2, 4),
  var2_2019 = c("A", "B", NA, "D", "C"),
  var2_2020 = c(NA, "B", NA, "R", NA),
  var3_2019 = c(TRUE, FALSE, NA, NA, NA),
  var3_2020 = c(NA, NA, NA, NA, FALSE)
)
# Choose which value of a (2019, 2020) pair of scalars to keep:
#   * y when y is present (it wins whether or not x is present)
#   * x when only x is present
#   * NA when both are missing
# Kept as its own function so the rule can become more elaborate later,
# for example keeping the larger of two numeric values.
which_to_keep_f <- function(x, y) {
  if (!is.na(y)) {
    return(y)
  }
  if (!is.na(x)) {
    return(x)
  }
  NA
}

# Element-wise version for use on whole columns
which_to_keep_f_vec <- Vectorize(which_to_keep_f)
# use function inside mutate
# Build one merged column per pair, then drop every year-suffixed column.
mydf %>%
  mutate(
    var1 = which_to_keep_f_vec(var1_2019, var1_2020),
    var2 = which_to_keep_f_vec(var2_2019, var2_2020),
    var3 = which_to_keep_f_vec(var3_2019, var3_2020)
  ) %>%
  select(-contains("_20"))
Solution
Thanks to TarJae and micahkimel I got to 99% of the solution. This is the complete solution (including dropping the variables that are no longer needed and renaming the variables to their desired format)
# For every *_2019 column, look up its *_2020 partner by name and keep the
# value chosen by which_to_keep_f_vec(); then drop the *_2020 columns and
# strip the "_2019" suffix from the remaining names.
# Changes from the earlier version:
#   * the stray `%>% unnest(cols = c())` inside mutate() is gone: it selected
#     no columns and required tidyr to be attached;
#   * stringr functions are namespace-qualified throughout (regex() was not);
#     the case-insensitive flag is dropped because "_2019" has no letters;
#   * rename_with() replaces the superseded rename_all().
mydf %>%
  mutate(across(
    ends_with("_2019"),
    ~ which_to_keep_f_vec(
      .,
      get(stringr::str_replace(cur_column(), "_2019$", "_2020"))
    )
  )) %>%
  select(-contains("_2020")) %>%
  rename_with(~ stringr::str_replace(., "_2019$", ""))
Update: Thanks to micahkimel removing list to not duplicate the data:
Is this what you are looking for? Here we apply your function to each pair of columns:
library(dplyr)
library(stringr)
# Overwrite each *_2019 column with the value chosen by which_to_keep_f_vec()
# from that column and its *_2020 partner (found by renaming the current
# column). The earlier version never closed the mutate() call, so it did not
# parse; the trailing unnest(cols = c()) selected no columns and is dropped.
mydf %>%
  mutate(across(
    ends_with("_2019"),
    ~ which_to_keep_f_vec(
      .,
      get(str_replace(cur_column(), "_2019$", "_2020"))
    )
  ))
ID var1_2019 var1_2020 var2_2019 var2_2020 var3_2019 var3_2020
<int> <dbl> <dbl> <chr> <chr> <lgl> <lgl>
1 1 9 NA A NA TRUE NA
2 2 NA NA B B FALSE NA
3 3 3 3 NA NA NA NA
4 4 2 2 R R NA NA
5 5 4 4 C NA FALSE FALSE
Here's an approach that results in just one variable for each pair of variables in your input table. First, use pivot_longer() to collapse the pairs into single variables, and add year as a column (with twice as many observations).
# Reshape to one row per ID and year: the part of each name before "_"
# becomes the column (".value"), the part after it becomes `year`.
mydf_long <- pivot_longer(
  mydf,
  cols = matches("_20"),
  names_to = c(".value", "year"),
  names_sep = "_"
)
ID year var1 var2 var3
<int> <chr> <dbl> <chr> <lgl>
1 1 2019 9 A TRUE
2 1 2020 NA NA NA
3 2 2019 NA B FALSE
4 2 2020 NA B NA
5 3 2019 3 NA NA
6 3 2020 3 NA NA
7 4 2019 2 D NA
8 4 2020 2 R NA
9 5 2019 NA C NA
10 5 2020 4 NA FALSE
Next, use fill() to populate later NA values with earlier non-missing values. Then we can just filter to the most recent year (2020). For each variable, that year will have its own value if it had one before; otherwise, it will carry over the value from the previous year.
# Within each ID, carry the last non-missing value forward (fill() fills
# downwards by default), then keep only the 2020 rows.
# `year` is a character column, so it is compared with the string "2020"
# rather than relying on coercion of a number; ungroup() makes sure the
# result does not stay grouped by ID.
mydf_long %>%
  group_by(ID) %>%
  fill(var1, var2, var3) %>%
  ungroup() %>%
  filter(year == "2020")
ID year var1 var2 var3
<int> <chr> <dbl> <chr> <lgl>
1 1 2020 9 A TRUE
2 2 2020 NA B FALSE
3 3 2020 3 NA NA
4 4 2020 2 R NA
5 5 2020 4 C FALSE

Why does case_when() compute false condition?

I have a data.frame with a group variable and an integer variable, with missing data.
# Three groups of two rows; `a` is an integer column with missing values.
df <- data.frame(
  group = rep(c(1, 2, 3), each = 2),
  a = c(1L, 2L, NA, NA, 1L, NA)
)
I want to compute the maximum available value of variable a within each group : in my example, I should get 2 for group 1, NA for group 2 and 1 for group 3.
# Intended: per-group maximum of `a`, or NA when the group has no non-missing value.
# NOTE: case_when() evaluates every right-hand side regardless of its condition,
# so max(a, na.rm = T) also runs for the all-NA group, where it yields -Inf
# (not an integer) and clashes with NA_integer_ -- this is the reported error.
df %>% group_by(group) %>% mutate(max.a=case_when(sum(!is.na(a))==0 ~ NA_integer_,
T ~ max(a,na.rm=T)))
The above code generates an error, seemingly because in group 2 all values of a are missing so max(a,na.rm=T) is set to -Inf, which is not an integer.
Why is this case computed for group 2 whereas the condition is false, as the following verification confirms ?
# TRUE for every row of a group in which all values of `a` are missing.
df %>%
  group_by(group) %>%
  mutate(test = all(is.na(a)))
I found a workaround converting a to double, but I still get a warning and dissatisfaction not to have found a better solution.
case_when evaluates every right-hand side (RHS), regardless of whether its condition is satisfied, hence the error. You may use hablar::max_, which returns NA if all the values are NA.
library(dplyr)
# Per-group maximum via hablar::max_(), then drop the grouping.
df %>%
  group_by(group) %>%
  mutate(max.a = hablar::max_(a)) %>%
  ungroup()
# group a max.a
# <dbl> <int> <int>
#1 1 1 2
#2 1 2 2
#3 2 NA NA
#4 2 NA NA
#5 3 1 1
#6 3 NA 1
Instead of making use of case_when I would suggest to use an if () statement like so:
library(dplyr)
df <- data.frame(
  group = c(1, 1, 2, 2, 3, 3),
  a = as.integer(c(1, 2, NA, NA, 1, NA))
)

# A plain if/else evaluates only the branch that is taken, so max() is never
# called on a group whose values are all NA.
# na.rm is spelled TRUE: T is an ordinary variable that can be reassigned.
df %>%
  group_by(group) %>%
  mutate(max.a = if (all(is.na(a))) NA_real_ else max(a, na.rm = TRUE))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group a max.a
#> <dbl> <int> <dbl>
#> 1 1 1 2
#> 2 1 2 2
#> 3 2 NA NA
#> 4 2 NA NA
#> 5 3 1 1
#> 6 3 NA 1
This code gives a warning but it works.
library(dplyr)
# One row per group with the maximum of `a`, ignoring NAs. For a group whose
# values are all NA, max() warns and returns -Inf.
dplyr::summarise(
  group_by(df, group),
  max.a = max(a, na.rm = TRUE)
)
Output:
group max.a
<dbl> <dbl>
1 1 2
2 2 -Inf
3 3 1

Creating an index/numeral sequence for subsequent N/As in a data frame

I have a column in a data frame (here named "a") where the start of a sequence is marked with 1, while subsequent incidents belonging to the same sequence are marked with NA. Now I would like to create a new column ("b") that indexes the incidents within each sequence (1:n), and then a third column ("c") with a number indicating which sequence each incident belongs to.
I am sure the solution is very easy and striking once I see it, however, at the moment I just don't manage to come up with an idea myself how to best solve this. Also other questions did not cover my question, as far as I have seen.
Usually I am using dplyr (I also need to do some group_by with my data, which in reality is more complex than I outlined here), so I would be very happy about a dplyr solution if possible!
Example code to start with:
# 1 marks the start of a sequence; NA marks a continuation of the previous one.
df <- data.frame(a = c(1, rep(NA, 3), 1, NA, rep(1, 3)))
How it should look in the end:
# Expected result: b counts positions within a sequence, c numbers the sequences.
df_final <- data.frame(
  a = c(1, NA, NA, NA, 1, NA, 1, 1, 1),
  b = c(1, 2, 3, 4, 1, 2, 1, 1, 1),
  c = c(1, 1, 1, 1, 2, 2, 3, 4, 5)
)
EDIT
Since the question has changed now, getting expected output is more simple now
library(dplyr)
# Every non-NA value opens a new sequence, so the running count of non-NA
# values is the sequence id (c); b numbers the rows inside each sequence.
df %>%
  group_by(c = cumsum(!is.na(a))) %>%
  mutate(b = seq_len(n()))
# a c b
# <dbl> <int> <int>
#1 1 1 1
#2 NA 1 2
#3 NA 1 3
#4 NA 1 4
#5 1 2 1
#6 NA 2 2
#7 1 3 1
#8 1 4 1
#9 1 5 1
And using base R that would be :
# c: running count of non-NA values, i.e. the sequence id
df[["c"]] <- cumsum(!is.na(df[["a"]]))
# b: position of each row within its sequence
df[["b"]] <- ave(df[["a"]], df[["c"]], FUN = seq_along)
Original Answer
Unfortunately, the grouping needed to create b differs from the one needed to create c. For b, we group by the cumulative sum of the non-NA indicator and generate a row number within every group. For c, we run rle() on the non-NA indicator, take the cumulative sum of the run values, and repeat each of them `lengths` times.
library(dplyr)
# b: row position within each group opened by a non-NA value.
# c: built from the run-length encoding of the non-NA indicator; the
#    cumulative sum of the run values is repeated over each run's length.
df %>%
  group_by(group = cumsum(!is.na(a))) %>%
  mutate(b = seq_len(n())) %>%
  ungroup() %>%
  select(-group) %>%
  mutate(c = {
    runs <- rle(!is.na(a))
    rep(cumsum(runs$values), runs$lengths)
  })
# A tibble: 9 x 3
# a b c
# <dbl> <int> <int>
#1 1 1 1
#2 NA 2 1
#3 NA 3 1
#4 NA 4 1
#5 1 1 2
#6 NA 2 2
#7 1 1 3
#8 1 1 3
#9 1 1 3
Of course this is not dplyr specific answer and can be answered with base R as well
# b: position within each group opened by a non-NA value
df$b <- ave(df$a, cumsum(!is.na(df$a)), FUN = seq_along)
# c: cumulative sum of the run values, repeated over each run's length
df$c <- local({
  runs <- rle(!is.na(df$a))
  rep(cumsum(runs$values), runs$lengths)
})

How to merge variables looping through by variable number in R

I have a dataframe with a lot of variables seen in multiple conditions. I'd like to merge each variable by condition.
The example data frame is a simplified version of what I have (3 variables over 2 conditions).
# Condition "B" variables: values for the first five rows, the string "NA"
# (not a real NA) for the rest, so these vectors are character.
VAR.B_1 <- c(1:5, rep("NA", 5))
VAR.B_2 <- c(2, 2:5, rep("NA", 5))
VAR.B_3 <- c(rep(1, 5), rep("NA", 5))
# Condition "E" variables: real NA for the first five rows, values afterwards.
VAR.E_1 <- c(rep(NA, 5), rep(1, 5))
VAR.E_2 <- c(rep(NA, 5), as.numeric(1:5))
VAR.E_3 <- c(rep(NA, 5), rep(1, 5))
Condition <- rep(c("B", "E"), each = 5)
# Example dataset
data <- as.data.frame(
  cbind(VAR.B_1, VAR.B_2, VAR.B_3, VAR.E_1, VAR.E_2, VAR.E_3, Condition)
)
I want to end up with this, appended to the original data frame:
VAR_1 VAR_2 VAR_3
1 2 1
2 2 1
3 3 1
4 4 1
5 5 1
1 1 1
1 2 1
1 3 1
1 4 1
1 5 1
I understand that R won't work with i inside the variable name, but I have an example of the kind of for loop I was trying to do. I would rather not call variables by column location, since there will be a lot of variables.
## Merge each numbered variable across conditions: take VAR.B_i for rows in
## condition "B", VAR.E_i for rows in condition "E", and NA otherwise.
## `i` cannot be spliced into a name after `$`, so the column names are built
## with paste0() and the columns are accessed with [[ ]].
for (i in seq_len(3)) {
  # as.character() keeps the values (not factor codes) if the columns are factors
  b_values <- as.character(data[[paste0("VAR.B_", i)]])
  e_values <- as.character(data[[paste0("VAR.E_", i)]])
  data[[paste0("VAR_", i)]] <- ifelse(
    data$Condition == "B", b_values,
    ifelse(data$Condition == "E", e_values, NA)
  )
}
This might work for your situation:
library(tidyverse)
library(stringr)
# Go long, drop both kinds of missing value (real NA and the string "NA"),
# strip the ".<Condition>" part from each variable name, then go wide again
# and attach the original columns.
data %>%
  mutate_all(as.character) %>%
  gather(key, value, -Condition) %>%
  filter(!is.na(value) & value != "NA") %>%
  mutate(key = str_remove(key, paste0("\\.", Condition))) %>%
  group_by(Condition, key) %>%
  # row counter inside each Condition/key group, used to line rows up in spread()
  mutate(rowid = row_number()) %>%
  spread(key, value) %>%
  bind_cols(data)
#> # A tibble: 10 x 12
#> # Groups: Condition [2]
#> Condition rowid VAR_1 VAR_2 VAR_3 VAR.B_1 VAR.B_2 VAR.B_3 VAR.E_1
#> <chr> <int> <chr> <chr> <chr> <fctr> <fctr> <fctr> <fctr>
#> 1 B 1 1 2 1 1 2 1 NA
#> 2 B 2 2 2 1 2 2 1 NA
#> 3 B 3 3 3 1 3 3 1 NA
#> 4 B 4 4 4 1 4 4 1 NA
#> 5 B 5 5 5 1 5 5 1 NA
#> 6 E 1 1 1 1 NA NA NA 1
#> 7 E 2 1 2 1 NA NA NA 1
#> 8 E 3 1 3 1 NA NA NA 1
#> 9 E 4 1 4 1 NA NA NA 1
#> 10 E 5 1 5 1 NA NA NA 1
#> # ... with 3 more variables: VAR.E_2 <fctr>, VAR.E_3 <fctr>,
#> # Condition1 <fctr>
# Group the VAR columns by the digits in their names (the last column,
# Condition, is left out), convert each group to numeric and add its columns
# row by row. as.numeric() turns the "NA" strings into NA (hence the coercion
# warnings) and na.rm = TRUE then ignores them.
data.frame(lapply(
  split.default(
    data[-ncol(data)],
    gsub("\\D+", "", names(data)[-ncol(data)])
  ),
  function(cols) {
    numeric_cols <- sapply(cols, function(col) as.numeric(as.character(col)))
    rowSums(numeric_cols, na.rm = TRUE)
  }
))
# X1 X2 X3
#1 1 2 1
#2 2 2 1
#3 3 3 1
#4 4 4 1
#5 5 5 1
#6 1 1 1
#7 1 2 1
#8 1 3 1
#9 1 4 1
#10 1 5 1
#Warning messages:
#1: In FUN(X[[i]], ...) : NAs introduced by coercion
#2: In FUN(X[[i]], ...) : NAs introduced by coercion
#3: In FUN(X[[i]], ...) : NAs introduced by coercion
Your data appears to have two kinds of NA values in it. It has NA, or R's NA value, and it also has the string 'NA'. In my solution below, I replace both with zero, cast each column in the data frame to numeric, and then just sum together like-numbered VAR columns. Then, drop the original columns which you don't want anymore.
# Rebuild the data without Condition, keeping the columns as character
data <- as.data.frame(
  cbind(VAR.B_1, VAR.B_2, VAR.B_3, VAR.E_1, VAR.E_2, VAR.E_3),
  stringsAsFactors = FALSE
)
# Replace both kinds of missing value (real NA, then the string "NA") with 0
data[is.na(data)] <- 0
data[data == "NA"] <- 0
# Convert every column to numeric
data <- as.data.frame(lapply(data, as.numeric))
# With the missing side set to 0, the sum of a B/E pair is the observed value
data[["VAR_1"]] <- data[["VAR.B_1"]] + data[["VAR.E_1"]]
data[["VAR_2"]] <- data[["VAR.B_2"]] + data[["VAR.E_2"]]
data[["VAR_3"]] <- data[["VAR.B_3"]] + data[["VAR.E_3"]]
# Keep only the merged columns
data <- data[paste0("VAR_", 1:3)]
Demo

Resources