Changing NA value if matching a defined length - r

I have this kind of data :
daynight
[1] NA NA NA NA 2 1 NA NA
I want R to detect if there is a series of at least x NA and replace these by another value.
For example if x=3 and the replacement value is 3 I want R to give me in output :
daynight
[1] 3 3 3 3 2 1 NA NA
Would you have any ideas?

We can use rle
daynight <- c(NA, NA, NA, NA, 2, 1, NA, NA)
x <- 3  # minimum number of consecutive NAs a run needs in order to be replaced
r <- 3  # value written over every qualifying run
# Run-length encode the NA indicator: one entry per stretch of NA / non-NA.
na_runs <- rle(is.na(daynight))
# Keep only the runs that are NA runs and at least x elements long.
long_na_run <- na_runs$values & na_runs$lengths >= x
# Expand the per-run flags back to one flag per element and overwrite.
daynight[rep(long_na_run, times = na_runs$lengths)] <- r
daynight
#[1] 3 3 3 3 2 1 NA NA
Taking another example :
daynight <- c(NA, NA, NA, 3,2,1, NA, NA, 1, NA, NA, NA, 1, NA, NA)
daynight[with(rle(is.na(daynight)), rep(lengths >= x & values, lengths))] <- r
#[1] 3 3 3 3 2 1 NA NA 1 3 3 3 1 NA NA

And here is another solution using the zoo package
library(zoo)
#' Replace runs of consecutive NAs that are at least `nrNAs` long
#'
#' @param x A vector that may contain `NA`s.
#' @param nrNAs Minimum number of consecutive `NA`s a run must have in order
#'   to be replaced.
#' @param replaceBy Value written over every element of a qualifying run.
#' @return `x` with the qualifying `NA` runs replaced; every other element
#'   (including shorter `NA` runs) is returned unchanged.
replace_consecutive_NAs <- function(x, nrNAs = 3, replaceBy = nrNAs) {
  # A vector shorter than the window cannot contain a long enough run.
  if (length(x) < nrNAs) {
    return(x)
  }
  # Keep the NA indicator in its own variable so the values of x survive.
  is_missing <- as.numeric(is.na(x))
  # window_full[i] is 1 when x[i], ..., x[i + nrNAs - 1] are all NA.
  window_full <- zoo::rollapply(
    is_missing, nrNAs, prod,
    fill = 0, align = "left"
  )
  # Element j belongs to a qualifying run when any window starting in
  # (j - nrNAs + 1):j is full; partial = TRUE shortens the look-back near the
  # start of the vector instead of dropping those positions.
  in_long_run <- zoo::rollapply(
    window_full, nrNAs, max,
    partial = TRUE, align = "right"
  ) != 0
  x[in_long_run] <- replaceBy
  x
}
x <- c(NA, NA, NA, NA ,2 ,1, NA, NA)
replace_consecutive_NAs(x, 3, 999)
[1] 999 999 999 999 2 1 NA NA

Related

Forming a new column from whichever of two columns isn’t NA [duplicate]

This question already has answers here:
Replace a value NA with the value from another column in R
(5 answers)
Closed last month.
I have a simplified dataframe:
test <- data.frame(
x = c(1,2,3,NA,NA,NA),
y = c(NA, NA, NA, 3, 2, NA),
a = c(NA, NA, NA, NA, NA, TRUE)
)
I want to create a new column rating that has the value of the number in either column x or column y. The dataset is structured such that whenever there's a numeric value in x, there's an NA in y. If both columns are NA, then the value in rating should be NA.
In this case, the expected output is: 1,2,3,3,2,NA
With coalesce:
library(dplyr)
test %>%
mutate(rating = coalesce(x, y))
x y a rating
1 1 NA NA 1
2 2 NA NA 2
3 3 NA NA 3
4 NA 3 NA 3
5 NA 2 NA 2
6 NA NA TRUE NA
library(dplyr)
test %>%
mutate(rating = if_else(is.na(x),
y, x))
x y a rating
1 1 NA NA 1
2 2 NA NA 2
3 3 NA NA 3
4 NA 3 NA 3
5 NA 2 NA 2
6 NA NA TRUE NA
Here are several solutions.
# Input
test <- data.frame(
x = c(1,2,3,NA,NA,NA),
y = c(NA, NA, NA, 3, 2, NA),
a = c(NA, NA, NA, NA, NA, TRUE)
)
# Base R solution
test$rating <- ifelse(!is.na(test$x), test$x,
ifelse(!is.na(test$y), test$y, NA))
# dplyr solution
library(dplyr)
test <- test %>%
mutate(rating = case_when(!is.na(x) ~ x,
!is.na(y) ~ y,
TRUE ~ NA_real_))
# data.table solution
library(data.table)
setDT(test)
test[, rating := ifelse(!is.na(x), x, ifelse(!is.na(y), y, NA))]
Created on 2022-12-23 with reprex v2.0.2
test <- data.frame(
x = c(1,2,3,NA,NA,NA),
y = c(NA, NA, NA, 3, 2, NA),
a = c(NA, NA, NA, NA, NA, TRUE)
)
test$rating <- dplyr::coalesce(test$x, test$y)

Merge survey columns across variables in R

I am analyzing a very large survey in which I want to combine four parts of the survey, through several combinations of 4 questions. Below I have created a small example. A little background: a respondent answered either q2, q5, q8 or q9, because they only filled in 1 of 4 parts of the survey based on their answer in q1 (not shown here). Therefore, only one of the four columns contains an answer (1 or 2), while the others contain NAs. q2, q5, q8 and q9 are similar questions that have the same answer options, which is why I want to combine them to make my dataset less wide and make it easier to further analyze the data.
q2_1 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_1 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_1 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_1 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
q2_2 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_2 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_2 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_2 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
df <- data.frame(q2_1, q5_1, q8_1, q9_1, q2_2, q5_2, q8_2, q9_2)
df
# running df shows:
q2_1 q5_1 q8_1 q9_1 q2_2 q5_2 q8_2 q9_2
1 NA NA NA 1 NA NA NA 1
2 NA NA NA 2 NA NA NA 2
3 NA NA 1 NA NA NA 1 NA
4 NA NA 2 NA NA NA 2 NA
5 NA 1 NA NA NA 1 NA NA
6 NA 2 NA NA NA 2 NA NA
7 1 NA NA NA 1 NA NA NA
8 2 NA NA NA 2 NA NA NA
My desired end result would be a dataframe with only columns for questions starting with q2_ (so, in the example that would be q2_1 and q2_2; in reality there are about 20 for this question), but with the NAs replaced by the answers from the corresponding q5_, q8_, and q9_ columns.
# desired end result
q2_1 q2_2
1 1 1
2 2 2
3 1 1
4 2 2
5 1 1
6 2 2
7 1 1
8 2 2
For single questions, I've done this using the code below, but this is very manual and, because q2, q5, q8, and q9 all go up to _20, I'm looking for a way to automate this more.
# example single question
library(tidyverse)
df <- df %>%
mutate(q2_1 = case_when(!is.na(q2_1) ~ q2_1,
!is.na(q5_1) ~ q5_1,
!is.na(q8_1) ~ q8_1,
!is.na(q9_1) ~ q9_1))
I hope I explained myself well enough and looking forward for some directions!
Here's one way, using coalesce:
df %>%
mutate(q2_1 = do.call(coalesce, across(ends_with('_1'))),
q2_2 = do.call(coalesce, across(ends_with('_2')))) %>%
select(q2_1, q2_2)
#> q2_1 q2_2
#> 1 1 1
#> 2 2 2
#> 3 1 1
#> 4 2 2
#> 5 1 1
#> 6 2 2
#> 7 1 1
#> 8 2 2
q2_1 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_1 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_1 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_1 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
q2_2 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_2 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_2 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_2 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
df <- data.frame(q2_1, q5_1, q8_1, q9_1, q2_2, q5_2, q8_2, q9_2)
df
#> q2_1 q5_1 q8_1 q9_1 q2_2 q5_2 q8_2 q9_2
#> 1 NA NA NA 1 NA NA NA 1
#> 2 NA NA NA 2 NA NA NA 2
#> 3 NA NA 1 NA NA NA 1 NA
#> 4 NA NA 2 NA NA NA 2 NA
#> 5 NA 1 NA NA NA 1 NA NA
#> 6 NA 2 NA NA NA 2 NA NA
#> 7 1 NA NA NA 1 NA NA NA
#> 8 2 NA NA NA 2 NA NA NA
library(tidyverse)
# One suffix per sub-question; each suffix selects the q2/q5/q8/q9 columns
# that belong together.
suffix <- str_c("_", 1:2)
# Build one output column per suffix and bind the columns side by side.
map_dfc(
  .x = suffix,
  .f = ~ transmute(
    df,
    # Summing with na.rm = TRUE returns the single non-NA answer in each row.
    # Note that a row whose columns are all NA would yield 0, not NA.
    !!str_c("q2", .x) := rowSums(across(ends_with(.x)), na.rm = TRUE)
  )
)
#> q2_1 q2_2
#> 1 1 1
#> 2 2 2
#> 3 1 1
#> 4 2 2
#> 5 1 1
#> 6 2 2
#> 7 1 1
#> 8 2 2
Created on 2022-04-04 by the reprex package (v2.0.1)

Mutate and ifelse with NA and values from other variable in R [duplicate]

This question already has answers here:
R: coalescing a large data frame
(2 answers)
How to implement coalesce efficiently in R
(9 answers)
Closed 2 years ago.
I have a df that looks something like this:
id <- c(1:8)
born.swis <- c(0, 1, NA, NA, NA, 2, NA, NA)
born2005 <- c(NA, NA, 2, NA, NA, NA, NA, NA)
born2006 <- c(NA, NA, NA, 1, NA, NA, NA, NA)
born2007 <- c(NA, NA, NA, NA, NA, NA, NA, 1)
born2008 <- c(NA, NA, NA, NA, NA, NA, 2, NA)
born2009 <- c(NA, NA, NA, NA, NA, NA, NA, NA)
df <- data.frame(id, born.swis, born2005, born2006, born2007, born2008, born2009)
I'm trying to mutate born.swis based on the values of the other variables. Basically, I want born.swis to be filled with the value of one of the other variables IF born.swis is NA and IF that other variable is not NA. Something like this:
id <- c(1:8)
born.swis <- c(0, 1, 2, 1, NA, 2, 2,1)
df.desired <- data.frame(id, born.swis)
I tried several things with mutate and ifelse, like this:
df <- df%>%
mutate(born.swis = ifelse(is.na(born.swis), born2005, NA,
ifelse(is.na(born.swis), born2006, NA,
ifelse(is.na(born.swis), born2007, NA,
ifelse(is.na(born.swis), born2008, NA,
ifelse(is.na(born.swis), born2009, NA,)
)))))
and similar things, but I'm not able to reach my desired outcome.
Any ideas?
Many thanks!
One dplyr option could be:
df %>%
mutate(born.swis_res = coalesce(!!!select(., starts_with("born"))))
id born.swis born2005 born2006 born2007 born2008 born2009 born.swis_res
1 1 0 NA NA NA NA NA 0
2 2 1 NA NA NA NA NA 1
3 3 NA 2 NA NA NA NA 2
4 4 NA NA 1 NA NA NA 1
5 5 NA NA NA NA NA NA NA
6 6 2 NA NA NA NA NA 2
7 7 NA NA NA NA 2 NA 2
8 8 NA NA NA 1 NA NA 1
Or with dplyr 1.0.0:
df %>%
mutate(born.swis_res = Reduce(coalesce, across(starts_with("born"))))
In base R, you can use max.col :
# For every row, locate the first non-NA column among the born* columns.
# ties.method = "first" is required: the default ("random") picks an arbitrary
# column whenever a row has several non-NA entries. The + 1 shifts the position
# back into df's own column numbering, because df[-1] drops the id column.
df[cbind(seq_len(nrow(df)), max.col(!is.na(df[-1]), ties.method = "first") + 1)]
#[1] 0 1 2 1 NA 2 2 1
max.col gives the column position of the first non-NA value in each row (excluding the first column); we create a matrix with the row index and that column position and use it to subset df.
base R
# Row-wise over every column except id: NA when all values are missing,
# otherwise the sum of the non-missing values (which equals the recorded value
# when exactly one column per row is filled in).
df$born.swis <- apply(df[-1], 1, function(x) {
  if (all(is.na(x))) NA_real_ else sum(x, na.rm = TRUE)
})

Combining two matching Variables in a Dataframe

I'm working with survey data and I have two factor variables in a data frame which are the same measure but for different groups in the experiment (it was a mistake in the survey).
So now I have:
df$a1 <- c(NA, NA, NA, 1, 0, 1)
df$a2 <- c(1, 1, 0, NA, NA, NA)
How can I combine those two columns such that the values of one fill in the NAs of the other?
Using R base:
df <- data.frame(a1 = c(NA, NA, NA, 1, 0, 1),
a2 = c(1, 1, 0, NA, NA, NA))
df$a_comb <- ifelse(is.na(df$a1), df$a2, df$a1)
df
a1 a2 a_comb
1 NA 1 1
2 NA 1 1
3 NA 0 0
4 1 NA 1
5 0 NA 0
6 1 NA 1
Could also do:
df$combined <- dplyr::coalesce(df$a1, df$a2)
Output:
a1 a2 combined
1 NA 1 1
2 NA 1 1
3 NA 0 0
4 1 NA 1
5 0 NA 0
6 1 NA 1

In R: How to replace NA in a Vector found between two integers

I have the following vector:
A:(NA NA NA NA 1 NA NA 4 NA NA 1 NA NA NA NA NA 4 NA 1 NA 4)
I would like to replace all the Nas between 1 and 4 with 2 (but not the Nas between 4 and 1)
Are there any approaches you would recommend/use for this task?
It may also be managed as a dataframe:
A
----
NA
NA
NA
NA
1
NA
NA
4
NA
NA
1
NA
NA
NA
NA
NA
4
NA
1
NA
4
----
Edit:
1. I changed the string "Na" to NA.
SOLUTION/UPDATE
Thank you to everyone for your insights. I learnt from them to come up with the following solution to my case. I hope it is useful to someone else:
A <- c(df$A)
index.1<-which(df$A %in% c(1)) # define location for 1s in A
index.14<-which(df$A %in% c(1,4)) # define location for 1s and 4s in A
loc.1<-which(index.14 %in% index.1) # location of 1s in index.14
loc.4<-loc.1+1 # location of 4s relative to 1s in index.14
start.i<-((index.14[loc.1])+1) # starting index for replacing with 2
end.i<-((index.14[loc.4])-1) # ending index for replacing with 2 in index
fill.v<-sort(c(start.i, end.i))# sequence of indexes to fill-in with # 2
# create matrix of beginning and ending sequence
fill.m<-matrix(fill.v,nrow = (length(fill.v)/2),ncol = 2, byrow=TRUE)
# create a list with indexes to replace
list.1<-apply(fill.m, MARGIN=1,FUN=function(x) seq(x[1],x[2]))
# unlist list to use as the indexes for replacement
list.2<-unlist(list.1)
df$A[list.2] <- 2 # replace indexed location with 2
Assuming A is as shown reproducibly in the Note at the end, the difference of cumsum's shown gives TRUE for the elements between 1 and 4 inclusive and the next condition eliminates the endpoints. Finally we replace the positions having TRUE in what is left with 2.
replace(A, (cumsum(A == 1) - cumsum(A == 4)) & (A == "Na"), 2)
giving:
[1] "Na" "Na" "Na" "Na" "1" "2" "2" "4" "Na" "Na" "1" "2" "2" "2" "2"
[16] "2" "4" "Na" "1" "2" "4"
NA values
R is case sensitive and Na is not the same as NA. The sample data in the question showed Na values and not NA values but if what was actually meant was a numeric vector with NA values as in AA in the Note below then modify the expression to be as shown here:
replace(AA, cumsum(!is.na(AA) & AA == 1) - cumsum(!is.na(AA) & AA == 4) & is.na(AA), 2)
giving:
[1] NA NA NA NA 1 2 2 4 NA NA 1 2 2 2 2 2 4 NA 1 2 4
Note
A <- c("Na", "Na", "Na", "Na", "1", "Na", "Na", "4", "Na", "Na",
"1", "Na", "Na", "Na", "Na", "Na", "4", "Na", "1", "Na", "4")
AA <- as.numeric(replace(A, A == "Na", NA))
I'm sure there's a better solution to this problem but this should do the trick:
A <-
  c(NA, NA, NA, NA, 1, NA, NA, 4, NA, NA, 1, NA, NA, NA, NA, NA, 4, NA, 1, NA, 4)
# TRUE while a 1 has been seen and its closing 4 has not been reached yet.
in_range <- FALSE
for (i in seq_along(A)) {
  if (!is.na(A[i])) {
    if (A[i] == 1) {
      # The fill would begin on the element right after the 1.
      start <- i + 1
      in_range <- TRUE
    }
    if (A[i] == 4 && in_range) {
      # When the 4 directly follows the 1 there is nothing to fill; without
      # this guard start:(i - 1) counts backwards and overwrites the 1 and 4.
      if (start < i) {
        A[start:(i - 1)] <- 2
      }
      in_range <- FALSE
    }
  }
}
EDIT: if you only want to replace the NAs if there's nothing else (for example a 3) between the 1 and the 4 you could use this:
A <-
  c(NA, NA, NA, NA, 1, NA, 3, 4, NA, NA, 1, NA, NA, NA, NA, NA, 4, NA, 1, NA, 4)
# TRUE while a 1 has been seen and only NAs have followed it so far.
in_range <- FALSE
for (i in seq_along(A)) {
  if (!is.na(A[i])) {
    if (A[i] == 1) {
      # The fill would begin on the element right after the 1.
      start <- i + 1
      in_range <- TRUE
    } else if (A[i] == 4) {
      if (in_range) {
        # When the 4 directly follows the 1 there is nothing to fill; without
        # this guard start:(i - 1) counts backwards and overwrites the 1 and 4.
        if (start < i) {
          A[start:(i - 1)] <- 2
        }
        in_range <- FALSE
      }
    } else {
      # Any other value between the 1 and the 4 cancels the pending fill.
      in_range <- FALSE
    }
  }
}
Output:
> A
[1] NA NA NA NA 1 NA 3 4 NA NA 1 2 2 2 2 2 4 NA 1 2 4
And if you only want to replace NAs but keep other values between 1 and 4 use this:
A <-
  c(NA, NA, NA, NA, 1, NA, 3, 4, NA, NA, 1, NA, NA, NA, NA, NA, 4, NA, 1, NA, 4)
# TRUE while a 1 has been seen and its closing 4 has not been reached yet.
in_range <- FALSE
for (i in seq_along(A)) {
  if (!is.na(A[i])) {
    if (A[i] == 1) {
      # The fill would begin on the element right after the 1.
      start <- i + 1
      in_range <- TRUE
    }
    if (A[i] == 4 && in_range) {
      # Nothing lies between the 1 and the 4 when they are adjacent.
      if (start < i) {
        gap <- start:(i - 1)
        # Only the NAs inside the gap become 2; other values are kept.
        A[gap[is.na(A[gap])]] <- 2
      }
      in_range <- FALSE
    }
  }
}
Output:
> A
[1] NA NA NA NA 1 2 3 4 NA NA 1 2 2 2 2 2 4 NA 1 2 4
This should work as well; I assumed you were referring to NAs and not the string "Na". It would work for either, though (or a mix).
> A <- c(NA, NA, NA, NA, 1, NA, NA, 4, NA, NA, 1, NA, NA, NA, NA, NA, 4, NA, 1, NA, 4)
>
> btw_1_4 <- unlist(lapply(Map(`:`, which(A == 1), which(A == 4)), function(x) x[2:(length(x)-1)]))
>
> A[btw_1_4] <- 2
>
> A
[1] NA NA NA NA 1 2 2 4 NA NA 1 2 2 2 2 2 4 NA 1 2 4
Map(`:`, which(A == 1), which(A == 4))
Creates a list of positions for 1-4 ranges in the vector (in order)
lapply(Map_List, function(x) x[2:(length(x)-1)])
Removes the first and last element of each vector in the list (the position of 1 and 4)
unlist makes all the remaining positions (NA's between 1 and 4) a single vector

Resources