Remove groups if all NA - r

Let's say I have a table like so:
df <- data.frame("Group" = c("A","A","A","B","B","B","C","C","C"),
"Num" = c(1,2,3,1,2,NA,NA,NA,NA))
Group Num
1 A 1
2 A 2
3 A 3
4 B 1
5 B 2
6 B NA
7 C NA
8 C NA
9 C NA
In this case, because group C has Num as NA for all entries, I would like to remove rows in group C from the table. Any help is appreciated!

You could group_by on you Group and filter the groups with all values that are NA. You can use the following code:
library(dplyr)
df %>%
group_by(Group) %>%
filter(!all(is.na(Num)))
#> # A tibble: 6 × 2
#> # Groups: Group [2]
#> Group Num
#> <chr> <dbl>
#> 1 A 1
#> 2 A 2
#> 3 A 3
#> 4 B 1
#> 5 B 2
#> 6 B NA
Created on 2023-01-18 with reprex v2.0.2

In base R you could index based on all the groups that have at least one non-NA value:
idx <- df$Group %in% unique(df[!is.na(df$Num),"Group"])
idx
df[idx,]
# or in one line
df[df$Group %in% unique(df[!is.na(df$Num),"Group"]),]
output
Group Num
1 A 1
2 A 2
3 A 3
4 B 1
5 B 2
6 B NA

Using ave.
df[with(df, !ave(Num, Group, FUN=\(x) all(is.na(x)))), ]
# Group Num
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 1
# 5 B 2
# 6 B NA

Related

Roll max in R. From first row to current row

I would like to calculate max value from first row to current row
df <- data.frame(id = c(1,1,1,1,2,2,2), value = c(2,5,3,2,4,5,4), result = c(NA,2,5,5,NA,4,5))
I have tried grouping by id with dplyr and using rollmax function from zoo but did not success
1) rollmax is used with a fixed width but here we have a variable width so using rollapplyr, which seems close to the approach of the question, we have:
library(dplyr)
library(zoo)
df %>%
group_by(id) %>%
mutate(out = lag(rollapplyr(value, 1:n(), max))) %>%
ungroup
giving:
# A tibble: 7 x 4
# Groups: id [2]
id value result out
<dbl> <dbl> <dbl> <dbl>
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
2) It is also possible to perform the grouping via the width (second) argument of rollapplyr like this eliminating dplyr. In this case the widths are 1, 2, 3, 4, 1, 2, 3 and Max is like max except it does not use the last element of its argument x. (An alternate expression for the width would be seq_along(id) - match(id, id) + 1).
library(zoo)
Max <- function(x) if (length(x) == 1) NA else max(head(x, -1))
transform(df, out = rollapplyr(value, sequence(rle(id)$lengths), Max))
giving:
id value result out
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
A data.table option using shift + cummax
> setDT(df)[, result2 := shift(cummax(value)), id][]
id value result result2
1: 1 2 NA NA
2: 1 5 2 2
3: 1 3 5 5
4: 1 2 5 5
5: 2 4 NA NA
6: 2 5 4 4
7: 2 4 5 5
library(dplyr)
df |>
group_by(id) |>
mutate(result = lag(cummax(value)))
# # A tibble: 7 x 3
# # Groups: id [2]
# id value result
# <dbl> <dbl> <dbl>
# 1 1 2 NA
# 2 1 5 2
# 3 1 3 5
# 4 1 2 5
# 5 2 4 NA
# 6 2 5 4
# 7 2 4 5
Here is a base R solution. This would just get you the cumulative maximum:
df$result = ave(df$value, df$i, FUN=cummax)
To get the cumulative maximum with the lag you wanted:
df$result = ave(df$value, df$i, FUN=function(x) c(NA,cummax(x[-(length(x))])))

Filter data.frame with all colums NA but keep when some are NA

I want to remove rows from a data.frame where all cols are NA. But I'd like to keep rows that have some values NA.
I know how to do this with base R but I'm trying to figure out how to make it work with tidyverse. I'm trying the across operator.
library(tidyverse)
teste <- data.frame(a = c(1,NA,3, NA), b = c(NA, NA, 3, 4), c = c(1, NA, 3, 4))
teste
#> a b c
#> 1 1 NA 1
#> 2 NA NA NA
#> 3 3 3 3
#> 4 NA 4 4
# I whant to remove rows where all values are NA
# that is, remove only line 2
# here I can get the lines with all values NA
teste %>%
filter(across(a:c, is.na))
#> a b c
#> 1 NA NA NA
# If I negate the filter, it does not work
# the last line (NA, 4, 4) is missing
teste %>%
filter(!across(a:c, is.na))
#> a b c
#> 1 1 NA 1
#> 2 3 3 3
# This is what I'm expecting
# a b c
# 1 NA 1
# 3 3 3
# NA 4 4
# Using base I can do this with
teste[apply(teste, 1, function(x) sum(is.na(x))) < 3,]
#> a b c
#> 1 1 NA 1
#> 3 3 3 3
#> 4 NA 4 4
How can I do this using tidyverse?
Created on 2020-08-18 by the reprex package (v0.3.0)
We can use base R
teste[rowSums(!is.na(teste)) >0,]
# a b c
#1 1 NA 1
#3 3 3 3
#4 NA 4 4
Or using apply and any
teste[apply(!is.na(teste), 1, any),]
which can be also used within filter
teste %>%
filter(rowSums(!is.na(.)) >0)
Or using c_across from dplyr, we can directly remove the rows with all NA
library(dplyr)
teste %>%
rowwise %>%
filter(!all(is.na(c_across(everything()))))
# A tibble: 3 x 3
# Rowwise:
# a b c
# <dbl> <dbl> <dbl>
#1 1 NA 1
#2 3 3 3
#3 NA 4 4
NOTE: filter_all is getting deprecated
Previously in dplyr, you could use filter_all (for all columns)/filter_at (for specific columns) which had any_vars :
library(dplyr)
teste %>% filter_all(any_vars(!is.na(.)))
However, across does not have direct replacement of any_vars so you can use this with Reduce :
teste %>% filter(Reduce(`|`, across(.fns = Negate(is.na))))
# a b c
#1 1 NA 1
#2 3 3 3
#3 NA 4 4
Using data.table, you can produce the same outcome.
teste2 <- teste[-which(is.na(teste$a)&is.na(teste$b)&is.na(teste$c)),]

Remove trailing NA by group in a data.frame

I have a data.frame with a grouping variable, and some NAs in the value column.
df = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,6,2,NA,NA,1,NA))
I can use zoo::na.trim to remove NA at the end of a column: this will remove the last line of the data.frame:
library(zoo)
library(dplyr)
df %>% na.trim(sides="right")
Now I want to remove the trailing NAs by group; how can I achieve this using dplyr?
Expected output for value2 column: c(NA, 4,9,6,2,1)
You could write a little helper function that checks for trailing NAs of a vector and then use group_by and filter.
f <- function(x) { rev(cumsum(!is.na(rev(x)))) != 0 }
library(dplyr)
df %>%
group_by(group) %>%
filter(f(value2))
# A tibble: 6 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 1 NA
2 1 2 4
3 2 3 9
4 2 4 6
5 2 5 2
6 3 8 1
edit
If we need to remove both leading and trailing zero we need to extend that function a bit.
f1 <- function(x) { cumsum(!is.na(x)) != 0 & rev(cumsum(!is.na(rev(x)))) != 0 }
Given df1
df1 = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,NA,2,NA,NA,1,NA))
df1
# group value1 value2
#1 1 1 NA
#2 1 2 4
#3 2 3 9
#4 2 4 NA
#5 2 5 2
#6 2 6 NA
#7 2 7 NA
#8 3 8 1
#9 3 9 NA
We get this result
df1 %>%
group_by(group) %>%
filter(f1(value2))
# A tibble: 5 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 2 4
2 2 3 9
3 2 4 NA
4 2 5 2
5 3 8 1
Using lapply, loop through group:
do.call("rbind", lapply(split(df, df$group), na.trim, sides = "right"))
# group value1 value2
# 1.1 1 1 NA
# 1.2 1 2 4
# 2.3 2 3 9
# 2.4 2 4 6
# 2.5 2 5 2
# 3 3 8 1
Or using by, as mentioned by #Henrik:
do.call("rbind", by(df, df$group, na.trim, sides = "right"))

R - Replace missing values with highest of 4 previous values

This is a variation of the last observation carried forward problem in a vector with some missing values. Instead of filling in NA values with the last non NA observation, I would like to fill in NA values with the highest value in the 4 observations preceding it. If all 4 observations preceding are also NA, the NA missing value should be retained. Would also appreciate it this can be done by groups in a data frame/data table.
Example:
Original DF:
ID Week Value
a 1 5
a 2 1
a 3 NA
a 4 NA
a 5 3
a 6 4
a 7 NA
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 NA
Output DF:
ID Week Value
a 1 5
a 2 1
a 3 5
a 4 5
a 5 3
a 6 4
a 7 4
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 1
lag shifts the column by n steps and lets you peek at previous values. pmax is element-wise maximum and lets to pick the highest value for each set/row of the observations.
To abstract away notion of 4 and maintain vectorized performance, you may use quasiquotes from rlang: http://dplyr.tidyverse.org/articles/programming.html#quasiquotation
It can look a little cryptic at first but is very precise and expressive.
df <- readr::read_table(
" ID Week Value
a 1 5
a 2 1
a 3 NA
a 4 NA
a 5 3
a 6 4
a 7 NA
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 NA")
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df %>%
group_by(ID) %>%
mutate(
Value = if_else(is.na(Value), pmax(lag(Value, 1), lag(Value, 2), lag(Value, 3), lag(Value, 4), na.rm = TRUE), Value)
)
#> # A tibble: 14 x 3
#> # Groups: ID [2]
#> ID Week Value
#> <chr> <int> <int>
#> 1 a 1 5
#> 2 a 2 1
#> 3 a 3 5
#> 4 a 4 5
#> 5 a 5 3
#> 6 a 6 4
#> 7 a 7 4
#> 8 b 1 NA
#> 9 b 2 NA
#> 10 b 3 NA
#> 11 b 4 NA
#> 12 b 5 NA
#> 13 b 6 1
#> 14 b 7 1
# or if you are an rlang ninja
library(purrr)
pmax_lag_n <- function(column, n) {
column <- enquo(column)
1:n %>%
map(~quo(lag(!!column, !!.x))) %>%
{ quo(pmax(!!!., na.rm = TRUE)) }
}
df %>%
group_by(ID) %>%
mutate(Value = if_else(is.na(Value), !!pmax_lag_n(Value, 4), Value))
#> # A tibble: 14 x 3
#> # Groups: ID [2]
#> ID Week Value
#> <chr> <int> <int>
#> 1 a 1 5
#> 2 a 2 1
#> 3 a 3 5
#> 4 a 4 5
#> 5 a 5 3
#> 6 a 6 4
#> 7 a 7 4
#> 8 b 1 NA
#> 9 b 2 NA
#> 10 b 3 NA
#> 11 b 4 NA
#> 12 b 5 NA
#> 13 b 6 1
#> 14 b 7 1
Define function Max which accepts a vector x and returns NA if all its elements are NA. Otherwise, if the last value is NA it returns the maximum of all non-NA elements and if the last value is not NA then it returns it.
Also define na.max which runs Max on a rolling window of length n (given by the second argument to na.max -- default 5).
Finally apply na.max to Value by ID using ave.
library(zoo)
Max <- function(x) {
last <- tail(x, 1)
if (all(is.na(x))) NA
else if (is.na(last)) max(x, na.rm = TRUE)
else last
}
na.max <- function(x, n = 5) rollapplyr(x, n, Max, partial = TRUE)
transform(DF, Value = ave(Value, ID, FUN = na.max))
giving:
ID Week Value
1 a 1 5
2 a 2 1
3 a 3 5
4 a 4 5
5 a 5 3
6 a 6 4
7 a 7 4
8 b 1 NA
9 b 2 NA
10 b 3 NA
11 b 4 NA
12 b 5 NA
13 b 6 1
14 b 7 1
Note: Input DF in reproducible form:
Lines <- "
ID Week Value
a 1 5
a 2 1
a 3 NA
a 4 NA
a 5 3
a 6 4
a 7 NA
b 1 NA
b 2 NA
b 3 NA
b 4 NA
b 5 NA
b 6 1
b 7 NA"
DF <- read.table(text = Lines, header = TRUE)

How to substitute NA by 0 in 20 columns?

I want to substitute NA by 0 in 20 columns. I found this approach for 2 columns, however I guess it's not optimal if the number of columns is 20. Is there any alternative and more compact solution?
mydata[,c("a", "c")] <-
apply(mydata[,c("a","c")], 2, function(x){replace(x, is.na(x), 0)})
UPDATE:
For simplicity lets take this data with 8 columns and substitute NAs in columns b, c, e, f and d
a b c d e f g d
1 NA NA 2 3 4 7 6
2 g 3 NA 4 5 4 Y
3 r 4 4 NA t 5 5
The result must be this one:
a b c d e f g d
1 0 0 2 3 4 7 6
2 g 3 NA 4 5 4 Y
3 r 4 4 0 t 5 5
The replace_na function from tidyr can be applied over a vector as well as a dataframe (http://tidyr.tidyverse.org/reference/replace_na.html).
Use it with a mutate_at variation from dplyr to apply it to multiple columns at the same time:
my_data %>% mutate_at(vars(b,c,e,f), replace_na, 0)
or
my_data %>% mutate_at(c('b','c','e','f'), replace_na, 0)
Here is a tidyverse way to replace NA with different values based on the data type of the column.
library(tidyverse)
dataset %>% mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.character, replace_na, "")
Another option:
library(tidyr)
v <- c('b', 'c', 'e', 'f')
replace_na(df, as.list(setNames(rep(0, length(v)), v)))
Which gives:
# a b c d e f g d.1
#1 1 0 0 2 3 4 7 6
#2 2 g 3 NA 4 5 4 Y
#3 3 r 4 4 0 t 5 5
We can use NAer from qdap to convert the NA to 0. If there are multiple column, loop using lapply.
library(qdap)
nm1 <- c('b', 'c', 'e', 'f')
mydata[nm1] <- lapply(mydata[nm1], NAer)
mydata
# a b c d e f g d.1
#1 1 0 0 2 3 4 7 6
#2 2 g 3 NA 4 5 4 Y
#3 3 r 4 4 0 t 5 5
Or using dplyr
library(dplyr)
mydata %>%
mutate_each_(funs(replace(., which(is.na(.)), 0)), nm1)
# a b c d e f g d.1
#1 1 0 0 2 3 4 7 6
#2 2 g 3 NA 4 5 4 Y
#3 3 r 4 4 0 t 5 5
Another strategy using tidyr::replace_na()
library(tidyverse)
df <- read.table(header = T, text = 'a b c d e f g h
1 NA NA 2 3 4 7 6
2 g 3 NA 4 5 4 Y
3 r 4 4 NA t 5 5')
df %>%
mutate(across(everything(), ~replace_na(., 0)))
#> a b c d e f g h
#> 1 1 0 0 2 3 4 7 6
#> 2 2 g 3 0 4 5 4 Y
#> 3 3 r 4 4 0 t 5 5
Created on 2021-08-22 by the reprex package (v2.0.0)
Knowing that replace_na() accepts a named list for the replace argument, using purrr::map() is a good option here to reduce the amount of code. It is also possible to replace different values in each column using 'map2()'.
code:
library(data.table)
library(tidyverse)
tbl <-read_table("a b c d e f g d
1 NA NA 2 3 4 7 6
2 g 3 NA 4 5 4 Y
3 r 4 4 NA t 5 5")
#> Warning: Duplicated column names deduplicated: 'd' => 'd_1' [8]
nms <- c('b', 'c', 'e', 'f', 'g')
imap_dfc(tbl, ~ if(any(.y == nms)) replace_na(.x, 0) else .x)
#> # A tibble: 3 × 8
#> a b c d e f g d_1
#> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 1 0 0 2 3 4 7 6
#> 2 2 g 3 NA 4 5 4 Y
#> 3 3 r 4 4 0 t 5 5
#using data.table
tblDT <- as.data.table(tbl)
#Further explanation here: https://stackoverflow.com/questions/16846380
tblDT[, (nms) := map(.SD, ~replace_na(., 0)), .SDcols = nms]
tblDT %>%
as_tibble()
#> # A tibble: 3 × 8
#> a b c d e f g d_1
#> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 1 0 0 2 3 4 7 6
#> 2 2 g 3 NA 4 5 4 Y
#> 3 3 r 4 4 0 t 5 5
#to replace na's in every column:
tbl %>%
replace_na(map(., ~0))
#> # A tibble: 3 × 8
#> a b c d e f g d_1
#> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 1 0 0 2 3 4 7 6
#> 2 2 g 3 0 4 5 4 Y
#> 3 3 r 4 4 0 t 5 5
Created on 2021-09-25 by the reprex package (v2.0.1)

Resources