R Equality while ignoring NAs - r

Is there an equivalent of == but with the result that x != NA if x is not NA?
The following does what I want, but it's clunky:
mapply(identical, vec1, vec2)

Just replace "==" with %in%.
Example:
> df <- data.frame(col1= c("a", "b", NA), col2= 1:3)
> df
col1 col2
1 a 1
2 b 2
3 <NA> 3
> df[df$col1=="a", ]
col1 col2
1 a 1
NA <NA> NA
> df[df$col1%in%"a", ]
col1 col2
1 a 1
> "x"==NA
[1] NA
> "x"%in%NA
[1] FALSE

1 == NA returns a logical NA rather than TRUE or FALSE. If you want to call NA FALSE, you could add a second conditional:
set.seed(1)
x <- 1:10
x[4] <- NA
y <- sample(1:10, 10)
x <= y
# [1] TRUE TRUE TRUE NA FALSE TRUE TRUE FALSE TRUE FALSE
x <= y & !is.na(x)
# [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
You could also use a second processing step to convert all the NA values from your equality test to FALSE.
foo <- x <= y
foo[is.na(foo)] <- FALSE
foo
# [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
Also, for what it's worth, NA == NA returns NA, as does NA != NA.

The == operator is often used in combination with filtering data.frames.
In that situation, dplyr::filter will retain only rows where your condition evaluates to TRUE, unlike [. That effectively implements == but where 1 == NA evaluates as FALSE.
Example:
> df <- data.frame(col1= c("a", "b", NA), col2= 1:3)
> df
col1 col2
1 a 1
2 b 2
3 <NA> 3
> dplyr::filter(df, col1=="a")
col1 col2
1 a 1

Why not use base R:
df <- data.frame(col1 = c("a", "b", NA), col2 = 1:3, col3 = 11:13)
df
subset(x = df, subset = col1=="a", select = c(col1, col2))
# col1 col2
# 1 a 1
or with arrays:
df <- c("a", "b", NA)
subset(x = df, subset = df == "a")

Related

Get vector of column names depending on logicals of same dataframe

I have a named dataframe containing logicals with missings and I want to get a vector with the column names where values are TRUE (going down the rows and, if multiple TRUEs in one row, going from left to right). Here is an example:
df <- data.frame(a= c(FALSE, NA, TRUE, TRUE),
b= c(TRUE, FALSE, FALSE, NA),
c= c(TRUE, TRUE, NA, NA))
df
# a b c
# 1 FALSE TRUE TRUE
# 2 NA FALSE TRUE
# 3 TRUE FALSE NA
# 4 TRUE NA NA
expected <- c("b", "c", "c", "a", "a")
Going from the first to the last row, we see TRUE in the first row. Here there are multiple TRUEs, thus we go from left to right and get "b" and "c". In the second row we get "c", and so on.
How to do this (in an elegant way)?
You can do in base R:
pos <- which(t(df) == TRUE, arr.ind = TRUE)
names(df)[pos[, "row"]]
[1] "b" "c" "c" "a" "a"
You can also try using apply
unlist(apply(df, 1, function(x){na.omit(names(df)[x])}))
[1] "b" "c" "c" "a" "a"
Here is a tidyverse way:
library(dplyr)
library(tidyr)
vector <- df %>%
mutate(across(, ~case_when(.==TRUE ~ cur_column()), .names = 'new_{col}')) %>%
unite(New_Col, starts_with('new'), na.rm = TRUE, sep = ', ') %>%
separate_rows(New_Col) %>%
pull(New_Col)
Or:
library(dplyr)
library(tidyr)
df %>%
mutate(across(, ~case_when(.==TRUE ~ cur_column()))) %>%
pivot_longer(everything()) %>%
na.omit() %>%
pull(value)
[1] "b" "c" "c" "a" "a"
Another possible solution, based on purrr::pmap:
library(tidyverse)
pmap(df, ~ names(df)[c(...)] %>% na.omit) %>% unlist
#> [1] "b" "c" "c" "a" "a"
You can use %%(modulo) to identify the column indices.
names(df)[(which(t(df)) - 1) %% ncol(df) + 1]
# [1] "b" "c" "c" "a" "a"
Benchmark
df <- as.data.frame(matrix(sample(c(TRUE, FALSE, NA), 1e7, TRUE), 1e5, 1e2))
# A data.frame: 100,000 × 100
# V1 V2 V3 V4 V5 ...
# 1 TRUE TRUE NA FALSE FALSE ...
# 2 NA TRUE TRUE TRUE NA ...
# 3 NA FALSE FALSE FALSE TRUE ...
# 4 NA FALSE FALSE TRUE FALSE ...
# 5 NA FALSE FALSE FALSE TRUE ...
library(microbenchmark)
bm <- microbenchmark(
Darren = {
x1 <- names(df)[(which(t(df)) - 1) %% ncol(df) + 1]
}, Clemsang = {
x2 <- names(df)[which(t(df) == TRUE, arr.ind = TRUE)[, "row"]]
})
all(x1 == x2)
# [1] TRUE
bm
# Unit: milliseconds
# expr min lq mean median uq max neval
# Darren 140.5595 153.3333 163.7934 159.4783 167.5418 284.4146 100
# Clemsang 219.7802 242.6169 254.9226 250.8673 264.0462 356.9299 100

R return FALSE when NA when using == operator

I have an if statement that needs a logical check.
However the check is x==2
so that
> x<-2
> x==2
[1] TRUE
However when
> x<-NA
> x==2
[1] NA
How can I deal with this problem? I use this check within a for loop and the loop stops executing. When x<-NA, then x==2 should return FALSE
1) First ensure that x is not NA and then AND that with the desired check. If x were NA then it reduces to FALSE & NA and that equals FALSE.
x <- c(1, NA, 2) # test input
(!is.na(x)) & x == 2
## [1] FALSE FALSE TRUE
2) If x is a scalar then this also works, because isTRUE(x) is only TRUE if the argument is TRUE; if it is anything else, including NA, it is FALSE.
x <- 2
isTRUE(x == 2)
## [1] TRUE
x <- 0
isTRUE(x == 2)
## [1] FALSE
x <- NA
isTRUE(x == 2)
## [1] FALSE
2a) or if x is not necessarily a scalar we can modify it like this
x <- c(1, NA, 2) # test input
sapply(x == 2, isTRUE)
## [1] FALSE FALSE TRUE
2b) or
x <- c(1, NA, 2) # test input
Vectorize(isTRUE)(x == 2)
## [1] FALSE FALSE TRUE
You may want to try:
x <- 2
ifelse(is.na(x), FALSE, x == 2)
#> [1] TRUE
x <- NA
ifelse(is.na(x), FALSE, x == 2)
#> [1] FALSE
Use %in% instead of == which may fix this problem.
x <- 2
x %in% 2
#[1] TRUE
x <- NA
x %in% 2
#[1] FALSE

Count repetitions of a set of characters

How can I count repetitions of a set of characters in a vector? Imagine the following vector consisting of "A" and "B":
x <- c("A", "A", "A", "B", "B", "A", "A", "B", "A")
In this example, the first set would be the sequence of "A" and "B" from index 1 to 5, the second set is the sequence of "A" and "B" from index 6 to 8, and then the third set is the last single "A":
x <- c("A", "A", "A", "B", "B", # set 1
"A", "A", "B", # set 2
"A") # set 3
How can set a counter for each set of variables? I need a vector like this:
c(1, 1, 1, 1, 1, 2, 2, 2, 3)
thanks
Use rle:
x <- c("A", "A", "A", "B", "B", "A", "A", "B", "A")
tmp <- rle(x)
#Run Length Encoding
# lengths: int [1:5] 3 2 2 1 1
# values : chr [1:5] "A" "B" "A" "B" "A"
Now change the values:
tmp$values <- ave(rep(1L, length(tmp$values)), tmp$values, FUN = cumsum)
and inverse the run length encoding:
y <- inverse.rle(tmp)
#[1] 1 1 1 1 1 2 2 2 3
Alternative 1.
cumsum(c(TRUE, diff(match(x, c("A", "B"))) == -1))
# [1] 1 1 1 1 1 2 2 2 3
Step by step:
match(x, c("A", "B"))
# [1] 1 1 1 2 2 1 1 2 1
diff(match(x, c("A", "B")))
# [1] 0 0 1 0 -1 0 1 -1
diff(match(x, c("A", "B"))) == -1
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
c(TRUE, diff(match(x, c("A", "B"))) == -1)
# [1] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
Alternative 2.
Using data.table::rleid:
library(data.table)
cumsum(c(TRUE, diff(rleid(x) %% 2) == 1))
# [1] 1 1 1 1 1 2 2 2 3
Step by step:
rleid(x)
# [1] 1 1 1 2 2 3 3 4 5
rleid(x) %% 2
# [1] 1 1 1 0 0 1 1 0 1
diff(rleid(x) %% 2)
# [1] 0 0 -1 0 1 0 -1 1
diff(rleid(x) %% 2) == 1
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
c(TRUE, diff(rleid(x) %% 2) == 1)
# [1] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
We can use only base R methods
x1 <- split(x, cumsum(c(TRUE, x[-1]!= x[-length(x)])))
x2 <- sapply(x1, `[`, 1)
as.numeric(rep(ave(x2, x2, FUN = seq_along), lengths(x1)))
#[1] 1 1 1 1 1 2 2 2 3

replace <NA> with NA

I have a data frame containing "<NA>" entries. It appears that these values are not treated as NA, since is.na returns FALSE. I would like to convert these values to NA but could not find a way.
Use dfr[dfr=="<NA>"]=NA where dfr is your dataframe.
For example:
> dfr<-data.frame(A=c(1,2,"<NA>",3),B=c("a","b","c","d"))
> dfr
A B
1 1 a
2 2 b
3 <NA> c
4 3 d
> is.na(dfr)
A B
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] FALSE FALSE
[4,] FALSE FALSE
> dfr[dfr=="<NA>"] = NA **key step**
> is.na(dfr)
A B
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] TRUE FALSE
[4,] FALSE FALSE
The two classes where this is likely to be an issue are character and factor. This should loop over a dataframe and convert the "NA" values into true <NA>'s, but just for those two classes:
# Turn the literal string "NA" into a real missing value.
# Only character vectors and factors are touched; any other input is
# returned unchanged. For factors the set of levels is left as it is.
make.true.NA <- function(x) {
  # Guard clause: nothing to do for non-text columns.
  if (!(is.character(x) || is.factor(x))) {
    return(x)
  }
  is.na(x) <- x == "NA"
  x
}
df[] <- lapply(df, make.true.NA)
(Untested in the absence of a data example.) The use of the form: df_name[] will attempt to retain the structure of the original dataframe which would otherwise lose its class attribute. I see that ujjwal thinks your spelling of NA has flanking "<>" characters, so you might try this function as a more general version:
# Turn placeholder strings into real missing values.
#
# x          A vector, typically one column of a data frame.
# na_strings Character vector of values to treat as missing. Defaults to
#            c("NA", "<NA>"), so calling make.true.NA(x) behaves as before;
#            pass e.g. na_strings = "#N/A" for other spellings.
#
# Returns x with every element found in na_strings set to NA. Only
# character vectors and factors are modified; other types are returned
# unchanged. Factor levels are not altered.
make.true.NA <- function(x, na_strings = c("NA", "<NA>")) {
  if (is.character(x) || is.factor(x)) {
    # %in% never yields NA, so values that are already missing stay as is.
    is.na(x) <- x %in% na_strings
  }
  x
}
You can do this with the naniar package as well, using replace_with_na and associated functions.
dfr <- data.frame(A = c(1, 2, "<NA>", 3), B = c("a", "b", "c", "d"))
library(naniar)
# dev version - devtools::install_github('njtierney/naniar')
is.na(dfr)
#> A B
#> [1,] FALSE FALSE
#> [2,] FALSE FALSE
#> [3,] FALSE FALSE
#> [4,] FALSE FALSE
dfr %>% replace_with_na(replace = list(A = "<NA>")) %>% is.na()
#> A B
#> [1,] FALSE FALSE
#> [2,] FALSE FALSE
#> [3,] TRUE FALSE
#> [4,] FALSE FALSE
# You can also specify how to do this for many variables
dfr %>% replace_with_na_all(~.x == "<NA>")
#> # A tibble: 4 x 2
#> A B
#> <int> <int>
#> 1 2 1
#> 2 3 2
#> 3 NA 3
#> 4 4 4
You can read more about using replace_with_na here

How do i use grepl on each column in a data frame?

I have some values in my data frames, #N/A, that I want to convert to NA. I'm trying what seems like a straightforward grepl via lapply on the data frame, but it's not working. Here's a simple example...
a = c("#N/A", "A", "B", "#N/A", "C")
b = c("d", "#N/A", "e", "f", "123")
df = as.data.frame(cbind(a,b))
lapply(df, function(x){x[grepl("#N/A", x)]=NA})
Which outputs:
$a
[1] NA
$b
[1] NA
Can someone point me in the right direction? I'd appreciate it.
Your function needs to return x as the return value.
Try:
lapply(df, function(x){x[grepl("#N/A", x)] <- NA; x})
$a
[1] <NA> A B <NA> C
Levels: #N/A A B C
$b
[1] d <NA> e f 123
Levels: #N/A 123 d e f
But you should really use gsub instead of grep:
lapply(df, function(x)gsub("#N/A", NA, x))
$a
[1] NA "A" "B" NA "C"
$b
[1] "d" NA "e" "f" "123"
A better (more flexible and possibly easier to maintain) solution might be:
# Swap every element of x that equals one of the strings in ptn for NA.
# Note: this definition masks base::replace() for the rest of the session.
# Note: for factor input, ifelse() returns the integer level codes rather
# than the labels, so convert factors with as.character() first if the
# labels are needed.
replace <- function(x, ptn = "#N/A") {
  is_marker <- x %in% ptn
  ifelse(is_marker, NA, x)
}
lapply(df, replace)
$a
[1] NA 2 3 NA 4
$b
[1] 3 NA 4 5 2
You need to return x, and it's probably best to use apply in this case. Creating a data.frame with cbind is best avoided as well.
a = c("#N/A", "A", "B", "#N/A", "C")
b = c("d", "#N/A", "e", "f", "123")
df = data.frame(a=a, b=b, stringsAsFactors = FALSE)
str(df)
apply(df, 2, function(x){x[grepl("#N/A", x)] <- NA; return(x)})
If you are reading this data in from a CSV/tab delimited file, just set na.strings = "#N/A".
read.table("my file.csv", na.strings = "#N/A")
Update from comment: or maybe na.strings = c("#N/A", "#N/A#N/A").
Even if you are stuck with the case you described in your question, you still don't need grepl.
df <- data.frame(
a = c("#N/A", "A", "B", "#N/A", "C"),
b = c("d", "#N/A", "e", "f", "123")
)
df[] <- lapply(
df,
function(x)
{
x[x == "#N/A"] <- NA
x
}
)
df
## a b
## 1 <NA> d
## 2 A <NA>
## 3 B e
## 4 <NA> f
## 5 C 123
As per your example in the question, you don't need any types of apply loops, just do
df[df == "#N/A"] <- NA
As per cases when you have #N/A#N/A (although you didn't provide such data), another way to solve this would be
df[sapply(df, function(x) grepl("#N/A", x))] <- NA
In both cases the data itself will be updated, rather than just printed to the console.

Resources