Comparing 2 data frame with different size - r

I would like to check if the values in my dataframe df is larger than the threshold in df2. I tried making df2 to have the size with df to test on the threshold, but is an alternate way to do this?
> df
A B C
5 12 -5
4 4 0
15 5 9
1 11 1
11 1 -3
> df2
A B C
5 6 3
I tried replicating df2 into and then checking if df > df2
> df2
A B C
5 6 3
5 6 3
5 6 3
5 6 3
5 6 3
dput
> dput(df)
structure(list(A = c(5, 4, 15, 1, 11), B = c(12, 4, 5, 11, 1),
C = c(-5, 0, 9, 1, -3)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
> dput(df2)
structure(list(A = 5, B = 6, C = 3), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))

You can try using sweep :
sweep(df, 2, unlist(df2), `>`)
# A B C
#[1,] FALSE TRUE FALSE
#[2,] FALSE FALSE FALSE
#[3,] TRUE FALSE TRUE
#[4,] FALSE TRUE FALSE
#[5,] TRUE FALSE FALSE

Using tidyverse
library(dplyr)
df %>%
mutate(across(everything(), ~ . > df2[[cur_column()]]))
# A tibble: 5 x 3
# A B C
# <lgl> <lgl> <lgl>
#1 FALSE TRUE FALSE
#2 FALSE FALSE FALSE
#3 TRUE FALSE TRUE
#4 FALSE TRUE FALSE
#5 TRUE FALSE FALSE
Or using map2
library(purrr)
map2_df(df, df2, `>`)
# A tibble: 5 x 3
# A B C
# <lgl> <lgl> <lgl>
#1 FALSE TRUE FALSE
#2 FALSE FALSE FALSE
#3 TRUE FALSE TRUE
#4 FALSE TRUE FALSE
#5 TRUE FALSE FALSE

You can use the follwing code
setNames(data.frame(do.call("cbind",lapply(names(df), function(nam) {
df[[nam]] > df2[[nam]]
}))),names(df))
# A B C
#1 FALSE TRUE FALSE
#2 FALSE FALSE FALSE
#3 TRUE FALSE TRUE
#4 FALSE TRUE FALSE
#5 TRUE FALSE FALSE
If its enough to get the result as named matrix (coercing to data.frame is quite time consuming if not really needed), you can just do:
comparedMatrix <- do.call("cbind",lapply(names(df), function(nam) {
df[[nam]] > df2[[nam]]
}))
colnames(comparedMatrix) <- names(df)
comparedMatrix

A base R option using t + unlist
> t(t(df)> unlist(df2))
A B C
[1,] FALSE TRUE FALSE
[2,] FALSE FALSE FALSE
[3,] TRUE FALSE TRUE
[4,] FALSE TRUE FALSE
[5,] TRUE FALSE FALSE

Related

How to convert columns to multiple boolean columns with tidyverse

I have a group of columns for each time and I want to convert it to a lot of boolean columns (one by category) with mutate() and across() like that :
data <- data.frame(category_t1 = c("A","B","C","C","A","B"),
category_t2 = c("A","C","B","B","B",NA),
category_t3 = c("C","C",NA,"B",NA,"A"))
data %>% mutate(across(starts_with("category"),
~case_when(.x == "A" ~ TRUE, !is.na(.x) ~ FALSE),
.names = "{str_replace(.col, 'category', 'A')}"),
across(starts_with("category"),
~case_when(.x == "B" ~ TRUE, !is.na(.x) ~ FALSE),
.names = "{str_replace(.col, 'category', 'B')}"),
across(starts_with("category"),
~case_when(.x == "C" ~ TRUE, !is.na(.x) ~ FALSE),
.names = "{str_replace(.col, 'category', 'C')}"))
Which makes :
category_t1 category_t2 category_t3 A_t1 A_t2 A_t3 B_t1 B_t2 B_t3 C_t1 C_t2
1 A A C TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
2 B C C FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE NA FALSE TRUE NA TRUE FALSE
4 C B B FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
5 A B <NA> TRUE FALSE NA FALSE TRUE NA FALSE FALSE
6 B <NA> A FALSE NA TRUE TRUE NA FALSE FALSE NA
It works but I would like to know if there is a better idea because here I am doing the same code 3 times instead of one big code (and imagine if I had 10 times to repeat it...). I though I could do it with map() but I didn't manage to make it work.
I think there is a problem because of .names argument in across() that cannot connect with the string I use in case_when().
I think maybe there is something to do in the ... argument, like :
data %>% mutate(across(starts_with("category"),
~case_when(.x == mod ~ TRUE, !is.na(.x) ~ FALSE),
mod = levels(as.factor(data$category_t1)),
.names = "{str_replace(.col, 'category', mod)}"))
But of course that doesn't work here. Do you know how to do that ?
Thanks a lot.
We may use table in across
library(dplyr)
library(stringr)
library(tidyr)
data %>%
mutate(across(everything(), ~ as.data.frame.matrix(table(row_number(), .x) *
NA^(is.na(.x)) > 0),
.names = "{str_remove(.col, 'category_')}")) %>%
unpack(where(is.data.frame), names_sep = ".")
-output
# A tibble: 6 × 12
category_t1 category_t2 category_t3 t1.A t1.B t1.C t2.A t2.B t2.C t3.A t3.B t3.C
<chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE FALSE
Or use model.matrix from base R
data1 <- replace(data, is.na(data), "NA")
lvls <- lapply(data1, \(x) levels(factor(x, levels = c("NA", "A", "B", "C"))))
m1 <- model.matrix(~ 0 + ., data = data1, xlev = lvls)
out <- cbind(data, m1[, -grep("NA", colnames(m1))] > 0)
-output
out
category_t1 category_t2 category_t3 category_t1A category_t1B category_t1C category_t2A category_t2B category_t2C category_t3A category_t3B category_t3C
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
6 B <NA> A FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
> colnames(out)
[1] "category_t1" "category_t2" "category_t3"
[4] "category_t1A" "category_t1B" "category_t1C"
[7] "category_t2A" "category_t2B" "category_t2C"
[10] "category_t3A"
[11] "category_t3B" "category_t3C"
Or another option with table
cbind(data, do.call(cbind.data.frame,
lapply(data, \(x) (table(seq_along(x), x)* NA^is.na(x)) > 0)))
-output
category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE
category_t3.C
1 TRUE
2 TRUE
3 NA
4 FALSE
5 NA
6 FALSE
Not a tidyverse option (although pipe-compatible), it is very easily doable with package fastDummies:
fastDummies::dummy_cols(data, ignore_na = TRUE)
category_t1 category_t2 category_t3 category_t1_A category_t1_B category_t1_C category_t2_A category_t2_B category_t2_C category_t3_A category_t3_B category_t3_C
1 A A C 1 0 0 1 0 0 0 0 1
2 B C C 0 1 0 0 0 1 0 0 1
3 C B <NA> 0 0 1 0 1 0 NA NA NA
4 C B B 0 0 1 0 1 0 0 1 0
5 A B <NA> 1 0 0 0 1 0 NA NA NA
6 B <NA> A 0 1 0 NA NA NA 1 0 0
purrr's map_dfc could match well with your current approach:
library(dplyr)
library(purrr)
bind_cols(data,
map_dfc(LETTERS[1:3], \(letter) { mutate(data,
across(starts_with("category"),
~ case_when(.x == letter ~ TRUE, !is.na(.x) ~ FALSE),
.names = paste0("{str_replace(.col, 'category', '", letter, "')}")),
.keep = "none") }
)
)
Or skip the bind_cols and use .keep = ifelse(letter == "A", "all", "none").
Output:
category_t1 category_t2 category_t3 A_t1 A_t2 A_t3 B_t1 B_t2 B_t3 C_t1 C_t2 C_t3
1 A A C TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
3 C B <NA> FALSE FALSE NA FALSE TRUE NA TRUE FALSE NA
4 C B B FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
5 A B <NA> TRUE FALSE NA FALSE TRUE NA FALSE FALSE NA
6 B <NA> A FALSE NA TRUE TRUE NA FALSE FALSE NA FALSE
A base solution with nested lapply():
cbind(data, lapply(data, \(x) {
lev <- levels(factor(x))
sapply(setNames(lev, lev), \(y) x == y)
}))
category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B category_t3.C
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE FALSE

Find similar groups of numbers across rows R

I'm trying to find similar patterns of numbers across a dataframe. I have a dataframe with 5 columns and some columns have a random number between 3 and 50. However, for some rows 2 or 3 columns don't have a number.
A B C D E
5 23 6
9 33 7 8 12
33 7 14
6 18 23 48
8 44 33 7 9
I want to know what are the recurring numbers, so I'm interested in:
Row 1 and 4 that have the number 23 and 6,
Row 2 and 5 that have number 9, 33 and 8,
Row 2, 3 and 5 that have number 33 and 7.
Basically I'm trying to get the number of different combinations.
I'm a bit stuck about how to do this. I've tried to join the numbers in a list.
for (i in 1:dim(knots_all)[1]) {
knots_all$list_knots <- list(sort(knots_all[i,1:5]))
}
I've also tried intersect but it doesn't seem very efficient as R also considers the NAs which I want to disregard.
I would like to hear some ideas about the best way to achieve this. I've been thinking about this problem but I'm not able to understand how to get to the answer. My mind is stuck so any idea is much appreciated!
Thank you!
There's no specific/target pattern you want to capture. It seems like you need a process to identify the numbers that appear more often in your dataset and then see in which rows they appear.
I'll modify your example dataset to have number 23 appearing twice in the same row in order to illustrate some useful differences in counts.
df = read.table(text = "
A B C D E
5 23 6 23 NA
9 33 7 8 12
33 7 14 NA NA
6 18 23 48 NA
8 44 33 7 9
", header=T)
library(dplyr)
library(tidyr)
df %>%
mutate(row_id = row_number()) %>% # add a row flag
gather(col_name,value,-row_id) %>% # reshape
filter(!is.na(value)) %>% # exclude NAs
group_by(value) %>% # for each number value
summarise(NumOccurences = n(), # count occurences
rows = paste(sort(row_id), collapse = "_"), # capture rows
NumRowOccurences = n_distinct(row_id), # count occurences in unique rows
unique_rows = paste(sort(unique(row_id)), collapse = "_")) %>% # capture unique rows
arrange(desc(NumOccurences)) # order by number popularity (occurences)
# # A tibble: 12 x 5
# value NumOccurences rows NumRowOccurences unique_rows
# <int> <int> <chr> <int> <chr>
# 1 7 3 2_3_5 3 2_3_5
# 2 23 3 1_1_4 2 1_4
# 3 33 3 2_3_5 3 2_3_5
# 4 6 2 1_4 2 1_4
# 5 8 2 2_5 2 2_5
# 6 9 2 2_5 2 2_5
# 7 5 1 1 1 1
# 8 12 1 2 1 2
# 9 14 1 3 1 3
# 10 18 1 4 1 4
# 11 44 1 5 1 5
# 12 48 1 4 1 4
Make a list of lists:
List = [1[],2[],...,n[]].
Loop through your data frame and for your example ad A to List = [1[],2[],.5[A]..,[n]] (at index = 5). And so on for every column.
after this loop through list check if the list (in the list) are filled and have multiple columns.
this should get you started.
good luck
This is an algorithm which can detect numbers presents in two columns.
df <- data.frame(A = c(5, 23, 6, NA, NA),
B = c(9, 33, 7, 8, 12),
C = c(33, 7, 14, NA, NA),
D = c(6, 18, 23, 48, NA),
E = c(8, 44, 33, 7, 9))
L <- as.list(df)
LL <- rep(list(rep(list(NA), length(L))), length(L))
for(i in 1:length(L)){
for(j in 1:length(L))
LL[[i]][[j]] <- intersect(L[[i]], L[[j]])
}
To see the overlapping numbers in columns 1 and 4:
LL[[1]][[4]]
[1] 23 6 NA
To see all overapping numbers:
unique(unlist(LL))
[1] 5 23 6 NA 9 33 7 8 12 14 18 48 44
It could be changed a little bit (by adding a level in the nested loop and if the for loop) to see the pesence in 3 different columns etc
One example for dealing with the NA would be to temporarily fill them with randomly generated numbers:
# data
df <- data.frame(A = c(5,9,33,6,8),
B = c(23,33,7,18,44),
C = c(6,7,14,23,33),
D = c(NA, 8, NA, 48, 7),
E = c(NA, 12, NA, NA, 9))
# fill NA with random numbers
set.seed(1)
df2 <- as.data.frame(do.call(cbind, lapply(df, function(x) ifelse(is.na(x), rnorm(1), x))))
> df2
A B C D E
1 5 23 6 -0.6264538 0.1836433
2 9 33 7 8.0000000 12.0000000
3 33 7 14 -0.6264538 0.1836433
4 6 18 23 48.0000000 0.1836433
5 8 44 33 7.0000000 9.0000000
# split data by rows
df2 <- split(df2, seq_along(df2))
# compare rows with each other
temp <- lapply(lapply(df2, function(x) lapply(df2, function(y) x %in% y)), function(x) do.call(rbind, x))
# delete self comparisons
output <- lapply(1:5, function(x) temp[[x]] <- temp[[x]][-x,])
Result:
[[1]]
[,1] [,2] [,3] [,4] [,5]
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE TRUE TRUE
4 FALSE TRUE TRUE FALSE TRUE
5 FALSE FALSE FALSE FALSE FALSE
[[2]]
[,1] [,2] [,3] [,4] [,5]
1 FALSE FALSE FALSE FALSE FALSE
3 FALSE TRUE TRUE FALSE FALSE
4 FALSE FALSE FALSE FALSE FALSE
5 TRUE TRUE TRUE TRUE FALSE
[[3]]
[,1] [,2] [,3] [,4] [,5]
1 FALSE FALSE FALSE TRUE TRUE
2 TRUE TRUE FALSE FALSE FALSE
4 FALSE FALSE FALSE FALSE TRUE
5 TRUE TRUE FALSE FALSE FALSE
[[4]]
[,1] [,2] [,3] [,4] [,5]
1 TRUE FALSE TRUE FALSE TRUE
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE TRUE
5 FALSE FALSE FALSE FALSE FALSE
[[5]]
[,1] [,2] [,3] [,4] [,5]
1 FALSE FALSE FALSE FALSE FALSE
2 TRUE FALSE TRUE TRUE TRUE
3 FALSE FALSE TRUE TRUE FALSE
4 FALSE FALSE FALSE FALSE FALSE

split vector or data.frame into intervals by condition and print interval's first and last value

I have data.frame which looks like this:
v1 <- c(1:10)
v2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE)
dfb <- data.frame(v1, v2)
> dfb
v1 v2
1 1 FALSE
2 2 FALSE
3 3 TRUE
4 4 FALSE
5 5 FALSE
6 6 FALSE
7 7 TRUE
8 8 FALSE
9 9 FALSE
10 10 FALSE
I need those operations:
split data.frame into intervals according to V2 if is TRUE
rows where V2 is TRUE will be last interval element
if the last element is not TRUE it will be treated as if is (this can be easily achieved by adding TRUE to last vector position)
print V1 as first and last element from created intervals
after this operations my results should look like this:
> df_final
Vx Vy
1 3
4 7
8 10
I've tried cumsum on v2 vector but TRUE values are treated as first interval element not last
> split(v2, cumsum(v2==TRUE))
$`0`
[1] FALSE FALSE
$`1`
[1] TRUE FALSE FALSE FALSE
$`2`
[1] TRUE FALSE FALSE FALSE
You can still use cumsum, you just have to slightly adjust v2:
v3 <- c(TRUE,v2[-length(v2)])
v3
[1] TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
res <- split(v2,cumsum(v3))
res[[length(res)]][length(last(res))] <- T
res
$`1`
[1] FALSE FALSE TRUE
$`2`
[1] FALSE FALSE FALSE TRUE
$`3`
[1] FALSE FALSE TRUE
df_final <- data.frame(Vx=which(v3),Vy=which(unlist(res,use.names=F)))
df_final
Vx Vy
1 1 3
2 4 7
3 8 10
Get df_final
Vy <- c(which(dfb$v2 %in% T),nrow(dfb))
Vx <- c(1,Vy[-length(Vy)]+1)
df_final <- data.frame(Vx,Vy)
Split Df
library(data.table)
split_ind <- rleid(dfb$v2)-!(rleid(dfb$v2) %% 2)
split(dfb,split_ind)
I will also post my answer heavily inspired by Eldioo, this one is useful also when V1 are non numeric values and avoids using split and cumsum functions.
Input:
v1 <- letters[1:10]
v2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE)
dfb <- data.frame(v1, v2)
> dfb
v1 v2
1 a FALSE
2 b FALSE
3 c TRUE
4 d FALSE
5 e FALSE
6 f FALSE
7 g TRUE
8 h FALSE
9 i FALSE
10 j FALSE
Solution:
# data wrangling
library(data.table)
dfb["v3"] <- c(TRUE,dfb$v2[-length(dfb$v2)])
dfb["v4"] <- dfb$v2
dfb$v4[length(dfb$v4)] <- T
Vx <- which(dfb$v3)
Vy <- which(dfb$v4)
Vx <- dfb[Vx, ]$v1
Vy <- dfb[Vy, ]$v1
# for debugging purposes
dfb
v1 v2 v3 v4
1 a FALSE TRUE FALSE
2 b FALSE FALSE FALSE
3 c TRUE FALSE TRUE
4 d FALSE TRUE FALSE
5 e FALSE FALSE FALSE
6 f FALSE FALSE FALSE
7 g TRUE FALSE TRUE
8 h FALSE TRUE FALSE
9 i FALSE FALSE FALSE
10 j FALSE FALSE TRUE
# final results
data.frame(Vx, Vy)
Vx Vy
1 a c
2 d g
3 h j

how to loop through columns in R

I have a very large data set including 250 string and numeric variables. I want to compare one after another columns together. For example, I am going to compare (difference) the first variable with second one, third one with fourth one, fifth one with sixth one and so on.
For example (The structure of the data set is something like this example), I want to compare number.x with number.y, day.x with day.y, school.x with school.y and etc.
number.x<-c(1,2,3,4,5,6,7)
number.y<-c(3,4,5,6,1,2,7)
day.x<-c(1,3,4,5,6,7,8)
day.y<-c(4,5,6,7,8,7,8)
school.x<-c("a","b","b","c","n","f","h")
school.y<-c("a","b","b","c","m","g","h")
city.x<- c(1,2,3,7,5,8,7)
city.y<- c(1,2,3,5,5,7,7)
You mean, something like this?
> number.x == number.y
[1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE
> length(which(number.x==number.y))
[1] 1
> school.x == school.y
[1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE
> test.day <- day.x == day.y
> test.day
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE
EDIT: Given your example variables above, we have:
df <- data.frame(number.x,
number.y,
day.x,
day.y,
school.x,
school.y,
city.x,
city.y,
stringsAsFactors=FALSE)
n <- ncol(df) # no of columns (assumed EVEN number)
k <- 1
comp <- list() # comparisons will be stored here
while (k <= n-1) {
l <- (k+1)/2
comp[[l]] <- df[,k] == df[,k+1]
k <- k+2
}
After which, you'll have:
> comp
[[1]]
[1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[[2]]
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE
[[3]]
[1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE
[[4]]
[1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE
To get the comparison result between columns k and k+1, you look at the (k+1)/2 element of comp - i.e to get the comparison results between columns 7 & 8, you look at the comp element 8/2=4:
> comp[[4]]
[1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE
EDIT 2: To have the comparisons as new columns in the dataframe:
new.names <- rep('', n/2)
for (i in 1:(n/2)) {
new.names[i] <- paste0('V', i)
}
cc <- as.data.frame(comp, optional=TRUE)
names(cc) <- new.names
df.new <- cbind(df, cc)
After which, you have:
> df.new
number.x number.y day.x day.y school.x school.y city.x city.y V1 V2 V3 V4
1 1 3 1 4 a a 1 1 FALSE FALSE TRUE TRUE
2 2 4 3 5 b b 2 2 FALSE FALSE TRUE TRUE
3 3 5 4 6 b b 3 3 FALSE FALSE TRUE TRUE
4 4 6 5 7 c c 7 5 FALSE FALSE TRUE FALSE
5 5 1 6 8 n m 5 5 FALSE FALSE FALSE TRUE
6 6 2 7 7 f g 8 7 FALSE TRUE FALSE FALSE
7 7 7 8 8 h h 7 7 TRUE TRUE TRUE TRUE

replace <NA> with NA

I have a data frame containing entries; It appears that these values are not treated as NA since is.na returns FALSE. I would like to convert these values to NA but could not find the way.
Use dfr[dfr=="<NA>"]=NA where dfr is your dataframe.
For example:
> dfr<-data.frame(A=c(1,2,"<NA>",3),B=c("a","b","c","d"))
> dfr
A B
1 1 a
2 2 b
3 <NA> c
4 3 d
> is.na(dfr)
A B
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] FALSE FALSE
[4,] FALSE FALSE
> dfr[dfr=="<NA>"] = NA **key step**
> is.na(dfr)
A B
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] TRUE FALSE
[4,] FALSE FALSE
The two classes where this is likely to be an issue are character and factor. This should loop over a dtaframe and convert the "NA" values into true <NA>'s but just for those two classes:
make.true.NA <- function(x) if(is.character(x)||is.factor(x)){
is.na(x) <- x=="NA"; x} else {
x}
df[] <- lapply(df, make.true.NA)
(Untested in the absence of a data example.) The use of the form: df_name[] will attempt to retain the structure of the original dataframe which would otherwise lose its class attribute. I see that ujjwal thinks your spelling of NA has flanking "<>" characters so you might try this functions as more general:
make.true.NA <- function(x) if(is.character(x)||is.factor(x)){
is.na(x) <- x %in% c("NA", "<NA>"); x} else {
x}
You can do this with the naniar package as well, using replace_with_na and associated functions.
dfr <- data.frame(A = c(1, 2, "<NA>", 3), B = c("a", "b", "c", "d"))
library(naniar)
# dev version - devtools::install_github('njtierney/naniar')
is.na(dfr)
#> A B
#> [1,] FALSE FALSE
#> [2,] FALSE FALSE
#> [3,] FALSE FALSE
#> [4,] FALSE FALSE
dfr %>% replace_with_na(replace = list(A = "<NA>")) %>% is.na()
#> A B
#> [1,] FALSE FALSE
#> [2,] FALSE FALSE
#> [3,] TRUE FALSE
#> [4,] FALSE FALSE
# You can also specify how to do this for many variables
dfr %>% replace_with_na_all(~.x == "<NA>")
#> # A tibble: 4 x 2
#> A B
#> <int> <int>
#> 1 2 1
#> 2 3 2
#> 3 NA 3
#> 4 4 4
You can read more about using replace_with_na here

Resources