trying to calculate sum of row with dataframe having NA values - r

I am trying to sum the row of values if any column have values but not working for me like below
df=data.frame(
x3=c(2,NA,3,5,4,6,NA,NA,3,3),
x4=c(0,NA,NA,6,5,6,NA,0,4,2))
df$summ <- ifelse(is.na(c(df[,"x3"] & df[,"x4"])),NA,rowSums(df[,c("x3","x4")], na.rm=TRUE))
the output should be like

An alternative solution:
library(data.table)
setDT(df)[!( is.na(x3) & is.na(x4)),summ:=rowSums(.SD, na.rm = T)]

You can do :
df <- transform(df, summ = ifelse(is.na(x3) & is.na(x4), NA,
rowSums(df, na.rm = TRUE)))
df
# x3 x4 summ
#1 2 0 2
#2 NA NA NA
#3 3 NA 3
#4 5 6 11
#5 4 5 9
#6 6 6 12
#7 NA NA NA
#8 NA 0 0
#9 3 4 7
#10 3 2 5
In general for any number of columns :
cols <- c('x3', 'x4')
df <- transform(df, summ = ifelse(rowSums(is.na(df[cols])) == length(cols),
NA, rowSums(df, na.rm = TRUE)))

Try the code below with rowSums + replace
df$summ <- replace(rowSums(df, na.rm = TRUE), rowSums(is.na(df)) == 2, NA)
which gives
> df
x3 x4 summ
1 2 0 2
2 NA NA NA
3 3 NA 3
4 5 6 11
5 4 5 9
6 6 6 12
7 NA NA NA
8 NA 0 0
9 3 4 7
10 3 2 5

This is not much different from already posted answers, however, it contains some useful functions:
library(dplyr)
df %>%
rowwise() %>%
mutate(Count = ifelse(all(is.na(cur_data())), NA,
sum(c_across(everything()), na.rm = TRUE)))
# A tibble: 10 x 3
# Rowwise:
x3 x4 Count
<dbl> <dbl> <dbl>
1 2 0 2
2 NA NA NA
3 3 NA 3
4 5 6 11
5 4 5 9
6 6 6 12
7 NA NA NA
8 NA 0 0
9 3 4 7
10 3 2 5

Related

Is there a way to group values in a column between data gaps in R?

I want to group my data in different chunks when the data is continuous. Trying to get the group column from dummy data like this:
a b group
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 3 1
4 4 NA NA
5 5 NA NA
6 6 NA NA
7 7 12 2
8 8 15 2
9 9 NA NA
10 10 25 3
I tried using
test %>% mutate(test = complete.cases(.)) %>%
group_by(group = cumsum(test == TRUE)) %>%
select(group, everything())
But it doesn't work as expected:
group a b test
<int> <dbl> <dbl> <lgl>
1 1 1 1 TRUE
2 2 2 2 TRUE
3 3 3 3 TRUE
4 3 4 NA FALSE
5 3 5 NA FALSE
6 3 6 NA FALSE
7 4 7 12 TRUE
8 5 8 15 TRUE
9 5 9 NA FALSE
10 6 10 25 TRUE
Any advice?
Using rle in base R -
transform(df, group1 = with(rle(!is.na(b)), rep(cumsum(values), lengths))) |>
transform(group1 = replace(group1, is.na(b), NA))
# a b group group1
#1 1 1 1 1
#2 2 2 1 1
#3 3 3 1 1
#4 4 NA NA NA
#5 5 NA NA NA
#6 6 NA NA NA
#7 7 12 2 2
#8 8 15 2 2
#9 9 NA NA NA
#10 10 25 3 3
A couple of approaches to consider if you wish to use dplyr for this.
First, you could look at transition from non-complete cases (using lag) to complete cases.
library(dplyr)
test %>%
mutate(test = complete.cases(.)) %>%
group_by(group = cumsum(test & !lag(test, default = F))) %>%
mutate(group = replace(group, !test, NA))
Alternatively, you could add row numbers to your data.frame. Then, you could filter to include only complete cases, and group_by enumerating with cumsum based on gaps in row numbers. Then, join back to original data.
test$rn <- seq.int(nrow(test))
test %>%
filter(complete.cases(.)) %>%
group_by(group = c(0, cumsum(diff(rn) > 1)) + 1) %>%
right_join(test) %>%
arrange(rn) %>%
dplyr::select(-rn)
Output
a b group
<int> <int> <dbl>
1 1 1 1
2 2 2 1
3 3 3 1
4 4 NA NA
5 5 NA NA
6 6 NA NA
7 7 12 2
8 8 15 2
9 9 NA NA
10 10 25 3
Using data.table, get rleid then remove group IDs for NAs, then fix the sequence with factor to integer conversion:
library(data.table)
setDT(test)[, group1 := {
x <- complete.cases(test)
grp <- rleid(x)
grp[ !x ] <- NA
as.integer(factor(grp))
}]
# a b group group1
# 1: 1 1 1 1
# 2: 2 2 1 1
# 3: 3 3 1 1
# 4: 4 NA NA NA
# 5: 5 NA NA NA
# 6: 6 NA NA NA
# 7: 7 12 2 2
# 8: 8 15 2 2
# 9: 9 NA NA NA
# 10: 10 25 3 3

R: creating multiple new variables based on conditions of selection of other variables with similar names

I have a data frame where each condition (in the example: hope, dream, joy) has 5 variables (in the example, coded with suffixes x, y, z, a, b - the are the same for each condition).
df <- data.frame(matrix(1:16,5,16))
names(df) <- c('ID','hopex','hopey','hopez','hopea','hopeb','dreamx','dreamy','dreamz','dreama','dreamb','joyx','joyy','joyz','joya','joyb')
df[1,2:6] <- NA
df[3:5,c(7,10,14)] <- NA
This is how the data looks like:
ID hopex hopey hopez hopea hopeb dreamx dreamy dreamz dreama dreamb joyx joyy joyz joya joyb
1 1 NA NA NA NA NA 15 4 9 14 3 8 13 2 7 12
2 2 7 12 1 6 11 16 5 10 15 4 9 14 3 8 13
3 3 8 13 2 7 12 NA 6 11 NA 5 10 15 NA 9 14
4 4 9 14 3 8 13 NA 7 12 NA 6 11 16 NA 10 15
5 5 10 15 4 9 14 NA 8 13 NA 7 12 1 NA 11 16
I want to create a new variable for each condition (hope, dream, joy) that codes whether all of the variables x...b for that condition are NA (0 if all are NA, 1 if any is non-NA). And I want the new variables to be stored in the data frame. Thus, the output should be this:
ID hopex hopey hopez hopea hopeb dreamx dreamy dreamz dreama dreamb joyx joyy joyz joya joyb hope joy dream
1 1 NA NA NA NA NA 15 4 9 14 3 8 13 2 7 12 0 1 1
2 2 7 12 1 6 11 16 5 10 15 4 9 14 3 8 13 1 1 1
3 3 8 13 2 7 12 NA 6 11 NA 5 10 15 NA 9 14 1 1 1
4 4 9 14 3 8 13 NA 7 12 NA 6 11 16 NA 10 15 1 1 1
5 5 10 15 4 9 14 NA 8 13 NA 7 12 1 NA 11 16 1 1 1
The code below does it, but I'm looking for a more elegant solution (e.g., for a case where I have even more conditions). I've tried with various combinations of all(), select(), mutate(), but while they all seem useful, I cannot figure out how to combine them to get what I want. I'm stuck and would be interested in learning to code more efficiently. Thanks in advance!
df$hope <- 0
df[is.na(df$hopex) == FALSE | is.na(df$hopey) == FALSE | is.na(df$hopez) == FALSE | is.na(df$hopea) == FALSE | is.na(df$hopeb) == FALSE, "hope"] <- 1
df$dream <- 0
df[is.na(df$dreamx) == FALSE | is.na(df$dreamy) == FALSE | is.na(df$dreamz) == FALSE | is.na(df$dreama) == FALSE | is.na(df$dreamb) == FALSE, "dream"] <- 1
df$joy<- 0
df[is.na(df$joyx) == FALSE | is.na(df$joyy) == FALSE | is.na(df$joyz) == FALSE | is.na(df$joya) == FALSE | is.na(df$joyb) == FALSE, "joy"] <- 1
Here is an option with tidyverse
library(dplyr)
library(purrr)
library(magrittr)
df %>%
mutate(hope = select(., starts_with('hope')) %>%
is.na %>%
`!` %>%
rowSums %>%
is_greater_than(0) %>%
as.integer)
# hopex hopey hopez hopea hopeb dreamx dreamy dreamz dreama dreamb joyx joyy joyz joya joyb hope
#1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 0
#2 1 1 4 3 2 3 5 4 5 2 5 NA 4 3 1 1
#3 2 NA 4 4 4 3 5 NA 5 5 4 NA 4 5 1 1
#4 4 3 NA 1 1 1 5 2 NA 5 1 2 1 1 1 1
#5 1 NA 4 NA NA 2 1 5 1 2 NA 3 1 2 5 1
Or with rowSums
df %>%
mutate(hope = +(rowSums(!is.na(select(., starts_with('hope'))))!= 0))
For multiple columns, we can create a function
f1 <- function(dat, colSubstr) {
dplyr::select(dat, starts_with(colSubstr)) %>%
is.na %>%
`!` %>%
rowSums %>%
is_greater_than(0) %>%
as.integer
}
df %>%
mutate(hope = f1(., 'hope'),
dream = f1(., 'dream'),
joy = f1(., 'joy'))
Or using base R
cbind(df, sapply(split.default(df, sub(".$", "", names(df))),
function(x) +(rowSums(!is.na(x)) != 0)))
If we want to subset columns
nm1 <- setdiff(names(df), "ID")
cbind(df, sapply(split.default(df[nm1], sub(".$", "", names(df[nm1]))),
function(x) +(rowSums(!is.na(x)) != 0)))
data
set.seed(24)
df <- as.data.frame(matrix(sample(c(NA, 1:5), 5 * 15, replace = TRUE),
ncol = 15, dimnames = list(NULL, paste0(rep(c("hope", "dream", "joy"),
each = 5), c('x', 'y', 'z', 'a', 'b')))))
df[1,] <- NA

Applying custom function to each row uses only first value of argument

I am trying to recode NA values to 0 in a subset of columns using the following dataset:
set.seed(1)
df <- data.frame(
id = c(1:10),
trials = sample(1:3, 10, replace = T),
t1 = c(sample(c(1:9, NA), 10)),
t2 = c(sample(c(1:7, rep(NA, 3)), 10)),
t3 = c(sample(c(1:5, rep(NA, 5)), 10))
)
Each row has a certain number of trials associated with it (between 1-3), specified by the trials column. columns t1-t3 represent scores for each trial.
The number of trials indicates the subset of columns in which NAs should be recoded to 0: NAs that are within the number of trials represent missing data, and should be recoded as 0, while NAs outside the number of trials are not meaningful, and should remain NAs. So, for a row where trials == 3, an NA in column t3 would be recoded as 0, but in a row where trials == 2, an NA in t3 would remain an NA.
So, I tried using this function:
replace0 <- function(x, num.sun) {
x[which(is.na(x[1:(num.sun + 2)]))] <- 0
return(x)
}
This works well for single vectors. When I try applying the same function to a data frame with apply(), though:
apply(df, 1, replace0, num.sun = df$trials)
I get a warning saying:
In 1:(num.sun + 2) :
numerical expression has 10 elements: only the first used
The result is that instead of having the value of num.sun change every row according to the value in trials, apply() simply uses the first value in the trials column for every single row. How could I apply the function so that the num.sun argument changes according to the value of df$trials?
Thanks!
Edit: as some have commented, the original example data had some non-NA scores that didn't make sense according to the trials column. Here's a corrected dataset:
df <- data.frame(
id = c(1:5),
trials = c(rep(1, 2), rep(2, 1), rep(3, 2)),
t1 = c(NA, 7, NA, 6, NA),
t2 = c(NA, NA, 3, 7, 12),
t3 = c(NA, NA, NA, 4, NA)
)
Another approach:
# create an index of the NA values
w <- which(is.na(df), arr.ind = TRUE)
# create an index with the max column by row where an NA is allowed to be replaced by a zero
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
# subset 'w' such that only the NA's which fall in the scope of 'm' remain
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
# use 'i' to replace the allowed NA's with a zero
df[i] <- 0
which gives:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
You could easily wrap this in a function:
replace.NA.with.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
df[i] <- 0
return(df)
}
Now, using replace.NA.with.0(df) will produce the above result.
As noted by others, some rows (1, 3 & 10) have more values than trails. You could tackle that problem by rewriting the above function to:
replace.with.NA.or.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
df[w] <- 0
v <- tapply(m[,2], m[,1], FUN = function(x) tail(x:5,-1))
ina <- matrix(as.integer(unlist(stack(v)[2:1])), ncol = 2)
df[ina] <- NA
return(df)
}
Now, using replace.with.NA.or.0(df) produces the following result:
id trials t1 t2 t3
1 1 1 3 NA NA
2 2 2 2 2 NA
3 3 2 6 6 NA
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 NA
9 9 2 1 3 NA
10 10 1 9 NA NA
Here I just rewrite your function using double subsetting x[paste0('t',x['trials'])], which overcome the problem in the other two solutions with row 6
replace0 <- function(x){
#browser()
x_na <- x[paste0('t',x['trials'])]
if(is.na(x_na)){x[paste0('t',x['trials'])] <- 0}
return(x)
}
t(apply(df, 1, replace0))
id trials t1 t2 t3
[1,] 1 1 3 NA 5
[2,] 2 2 2 2 NA
[3,] 3 2 6 6 4
[4,] 4 3 NA 1 2
[5,] 5 1 5 NA NA
[6,] 6 3 7 NA 0
[7,] 7 3 8 7 0
[8,] 8 2 4 5 1
[9,] 9 2 1 3 NA
[10,] 10 1 9 4 3
Here is a way to do it:
x <- is.na(df)
df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
The output looks like this:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
> x <- is.na(df)
> df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
Note: row 1/3/10, is problematic since there are more non-NA values than the trials.
Here's a tidyverse way, note that it doesn't give the same output as other solutions.
Your example data shows results for trials that "didn't happen", I assumed your real data doesn't.
library(tidyverse)
df %>%
nest(matches("^t\\d")) %>%
mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
unnest
# id trials t1 t2 t3
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA
Using the more commonly used gather strategy this would be:
df %>%
gather(k,v,matches("^t\\d")) %>%
arrange(id) %>%
group_by(id) %>%
slice(1:first(trials)) %>%
mutate_at("v",~replace(.,is.na(.),0)) %>%
spread(k,v)
# # A tibble: 10 x 5
# # Groups: id [10]
# id trials t1 t2 t3
# <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA

Filtering data relative to first and last occurance of an event

I have a dataframe of an experiment, where stimulus is shown to participants, and time is measured continuously.
# reprex
df <-
tibble(stim = c(NA, NA, NA, NA, "a", "b", NA, "c", NA, "d", NA, NA, NA),
time = 0:12)
# A tibble: 13 x 2
stim time
<chr> <int>
1 NA 0
2 NA 1
3 NA 2
4 NA 3
5 a 4
6 b 5
7 NA 6
8 c 7
9 NA 8
10 d 9
11 NA 10
12 NA 11
13 NA 12
I want to create a generalized solution, using tidyverse functions to drop the data 1 second before and 2 seconds after the first and last marker, respectively. Using tidyverse, I thought this will work, but it throws an uninformative error.
df %>%
# store times for first and last stim
mutate(first_stim = drop_na(stim) %>% pull(time) %>% first(),
last_stim = drop_na(stim) %>% pull(time) %>% last()) %>%
# filter df based on new variables
filter(time >= first(first_stim) - 1 &
time <= first(last_stim) + 2)
Error in mutate_impl(.data, dots) : bad value
So I made a pretty ugly base r code to overcome this issue by changing the mutate:
df2 <- df %>%
mutate(first_stim = .[!is.na(.$stim), "time"][1,1],
last_stim = .[!is.na(.$stim), "time"][nrow(.[!is.na(.$stim), "time"]), 1])
# A tibble: 13 x 4
stim time first_stim last_stim
<chr> <int> <tibble> <tibble>
1 NA 0 4 9
2 NA 1 4 9
3 NA 2 4 9
4 NA 3 4 9
5 a 4 4 9
6 b 5 4 9
7 NA 6 4 9
8 c 7 4 9
9 NA 8 4 9
10 d 9 4 9
11 NA 10 4 9
12 NA 11 4 9
13 NA 12 4 9
Now I would only need to filter based on the new variables first_stim - 1 and last_stim + 2. But filter fails too:
df2 %>%
filter(time >= first(first_stim) - 1 &
time <= first(last_stim) + 2)
Error in filter_impl(.data, quo) :
Not compatible with STRSXP: [type=NULL].
I was able to do it in base R, but it is really ugly:
df2[(df2$time >= (df2[[1, "first_stim"]] - 1)) &
(df2$time <= (df2[[1, "last_stim"]] + 2))
,]
The desired output should look like this:
# A tibble: 13 x 2
stim time
<chr> <int>
4 NA 3
5 a 4
6 b 5
7 NA 6
8 c 7
9 NA 8
10 d 9
11 NA 10
12 NA 11
I believe that the errors are related to dplyr::nth() and related functions. And I've found some old issues that are related to this behavior, but should no longer exist https://github.com/tidyverse/dplyr/issues/1980
I would really appreciate if someone could highlight what is the problem, and how to do this in a tidy way.
You could use a combination of is.na and which...
library(dplyr)
df <-
tibble(stim = c(NA, NA, NA, NA, "a", "b", NA, "c", NA, "d", NA, NA, NA),
time = 0:12)
df %>%
filter(row_number() >= first(which(!is.na(stim))) - 1 &
row_number() <= last(which(!is.na(stim))) + 2)
# # A tibble: 9 x 2
# stim time
# <chr> <int>
# 1 NA 3
# 2 a 4
# 3 b 5
# 4 NA 6
# 5 c 7
# 6 NA 8
# 7 d 9
# 8 NA 10
# 9 NA 11
you could also make your first attempt work with a little modification...
df %>%
mutate(first_stim = first(drop_na(., stim) %>% pull(time)),
last_stim = last(drop_na(., stim) %>% pull(time))) %>%
filter(time >= first(first_stim) - 1 &
time <= first(last_stim) + 2)
We can create a cumulative sum of non-NA values and then find the row indices where the we encounter the first non-NA value and the last one. We then select rows based on the requirement. (-1 from start and +2 from end).
library(tidyverse)
df %>%
mutate(count_cumsum = cumsum(!is.na(stim))) %>%
slice((which.max(count_cumsum == 1) -1):(which.max(count_cumsum) + 2)) %>%
select(-count_cumsum)
# stim time
# <chr> <int>
#1 NA 3
#2 a 4
#3 b 5
#4 NA 6
#5 c 7
#6 NA 8
#7 d 9
#8 NA 10
#9 NA 11
Just to give an idea how count_cumsum looks:
df %>%
mutate(count_cumsum = cumsum(!is.na(stim)))
# A tibble: 13 x 3
# stim time count_cumsum
# <chr> <int> <int>
#1 NA 0 0
#2 NA 1 0
#3 NA 2 0
#4 NA 3 0
#5 a 4 1
#6 b 5 2
#7 NA 6 2
#8 c 7 3
#9 NA 8 3
#10 d 9 4
#11 NA 10 4
#12 NA 11 4
#13 NA 12 4

Fill missing values with new data R-Python

I have two dataset x and y
> x
a index b
1 1 1 5
2 NA 2 6
3 2 3 NA
4 NA 4 9
> y
index a
1 2 100
2 4 101
>
I would like to fill the missing values of x with the values contained in y.
I have tried to use the merge function but the result is not what I want.
> merge(x,y, by = 'index', all=T)
index a.x b a.y
1 1 1 5 NA
2 2 NA 6 100
3 3 2 7 NA
4 4 NA 9 101
In the real problem there are additional limitations:
1 - y does not fill all the missing values
2 - x and y have in common more variables (so not only a and index)
EDIT : More realistic example
> x
a index b c
1 1 1 5 NA
2 NA 2 6 NA
3 2 3 NA 5
4 NA 4 9 NA
5 NA 5 10 6
> y
index a c
1 2 100 4
2 4 101 NA
>
The solution would be accepted both in python or R
I used your merge idea and did the following using dplyr. I am sure there will be better ways of doing this task.
index <- 1:5
a <- c(1, NA, 2, NA, NA)
b <- c(5,6,NA,9,10)
c <- c(NA,NA,5,NA,6)
ana <- data.frame(index, a,b,c, stringsAsFactors=F)
index <- c(2,4)
a <- c(100, 101)
c <- c(4, NA)
bob <- data.frame(index, a,c, stringsAsFactors=F)
> ana
index a b c
1 1 1 5 NA
2 2 NA 6 NA
3 3 2 NA 5
4 4 NA 9 NA
5 5 NA 10 6
> bob
index a c
1 2 100 4
2 4 101 NA
ana %>%
merge(., bob, by = "index", all = TRUE) %>%
mutate(a.x = ifelse(a.x %in% NA, a.y, a.x)) %>%
mutate(c.x = ifelse(c.x %in% NA, c.y, c.x))
index a.x b c.x a.y c.y
1 1 1 5 NA NA NA
2 2 100 6 4 100 4
3 3 2 NA 5 NA NA
4 4 101 9 NA 101 NA
5 5 NA 10 6 NA NA
I overwrote a.x (ana$$a) using a.y (bob$a) using mutate. I did a similar thing for c.x (ana$c). If you remove a.y and c.y in the end, that will be the outcome you expect, I think.
Try:
xa = x[,c(1,2)]
m1 = merge(y,xa,all=T)
m1 = m1[!duplicated(m1$index),]
m1$b = x$b[match(m1$index, x$index)]
m1$c = x$c[match(m1$index, x$index)]
m1
index a b c
1 1 1 5 NA
2 2 100 6 NA
4 3 2 NA 5
5 4 101 9 NA
7 5 NA 10 6
or, if there many other columns like b and c:
xa = x[,c(1,2)]
m1 = merge(y,xa,all=T)
m1 = m1[!duplicated(m1$index),]
for(nn in names(x)[3:4]) m1[,nn] = x[,nn][match(m1$index, x$index)]
m1
index a b c
1 1 1 5 NA
2 2 100 6 NA
4 3 2 NA 5
5 4 101 9 NA
7 5 NA 10 6
If there are multiple columns to replace, you could try converting from wide to long form as shown in the first two methods and replace in one step
m1 <- merge(x,y, by="index", all=TRUE)
m1L <- reshape(m1, idvar="index", varying=grep("\\.", colnames(m1)), direction="long", sep=".")
row.names(m1L) <- 1:nrow(m1L)
lst1 <- split(m1L, m1L$time)
indx <- is.na(lst1[[1]][,4:5])
lst1[[1]][,4:5][indx] <- lst1[[2]][,4:5][indx]
res <- lst1[[1]][,c(4,1,2,5)]
res
# a index b c
#1 1 1 5 NA
#2 100 2 6 4
#3 2 3 NA 5
#4 101 4 9 NA
#5 NA 5 10 6
Or you could use dplyr with tidyr
library(dplyr)
library(tidyr)
z <- left_join(x, y, by="index") %>%
gather(Var, Val, matches("\\.")) %>%
separate(Var, c("Var1", "Var2"))
indx1 <- which(is.na(z$Val) & z$Var2=="x")
z$Val[indx1] <- z$Val[indx1+nrow(z)/2]
z %>%
spread(Var1, Val) %>%
filter(Var2=="x") %>%
select(-Var2)
# index b a c
#1 1 5 1 NA
#2 2 6 100 4
#3 3 NA 2 5
#4 4 9 101 NA
#5 5 10 NA 6
Or split the columns by matching names before the . and use lapply to replace the NA's.
indx <- grep("\\.", colnames(m1),value=TRUE)
res <- cbind(m1[!names(m1) %in% indx],
sapply(split(indx, gsub("\\..*", "", indx)), function(x) {
x1 <- m1[x]
indx1 <- is.na(x1[,1])
x1[,1][indx1] <- x1[,2][indx1]
x1[,1]} ))
res
# index b a c
#1 1 5 1 NA
#2 2 6 100 4
#3 3 NA 2 5
#4 4 9 101 NA
#5 5 10 NA 6

Resources