Join list to dataframe in R

I have the following dataframe df and list l (dput below):
> df
  group value
1     A     1
2     B     2
3     C     3
> l
$A
[1] 999

$B
[1] 55
I would like to join the values of the list to the dataframe by matching the names of the list against the group variable of the dataframe, and call the new column "value_l". The expected output should look like this:
  group value value_l
1     A     1     999
2     B     2      55
3     C     3      NA
So I was wondering if anyone knows how to join a list to a dataframe based on their names?
dput of df and l:
df <- structure(list(group = c("A", "B", "C"), value = c(1, 2, 3)), class = "data.frame", row.names = c(NA,
-3L))
l <- list(A = 999, B = 55)

You can use match. If df$group is a character vector (which is the case here), it can also be used directly to subset the list.
df$value_l <- l[match(df$group, names(l))]
#df$value_l <- l[df$group] #Short alternative by @akrun; works only if df$group is a character, not a factor or numeric
#df$value_l <- l[as.character(df$group)] #Safer variant
df
#  group value value_l
#1     A     1     999
#2     B     2      55
#3     C     3    NULL
If you need NA instead of NULL, additionally use:
df$value_l[vapply(df$value_l, is.null, TRUE)] <- NA #TRUE gives vapply its logical(1) template
df
#  group value value_l
#1     A     1     999
#2     B     2      55
#3     C     3      NA
Or do it in single steps:
. <- match(df$group, names(l))
df$value_l <- l[.]
is.na(df$value_l) <- is.na(.)
Here we have joined a list to a data.frame.
str(df)
#'data.frame': 3 obs. of 3 variables:
# $ group : chr "A" "B" "C"
# $ value : num 1 2 3
# $ value_l:List of 3
# ..$ A : num 999
# ..$ B : num 55
# ..$ NA: logi NA
If the list can be transformed to a vector, you can use unlist first (thanks to @G. Grothendieck for the comment). But then we have joined a vector to the data.frame.
df$value_l <- unlist(l)[match(df$group, names(l))]
#df$value_l <- unlist(l)[as.character(df$group)] #Variant as shown above
df
#  group value value_l
#1     A     1     999
#2     B     2      55
#3     C     3      NA
str(df)
#'data.frame': 3 obs. of 3 variables:
# $ group : chr "A" "B" "C"
# $ value : num 1 2 3
# $ value_l: num 999 55 NA
Another option, which also joins a vector to the data.frame, is merge (by.y = 0 matches on the row names, i.e. the names of the unlisted vector).
merge(df, unlist(l), by.x="group", by.y=0, all.x = TRUE)
#  group value   y
#1     A     1 999
#2     B     2  55
#3     C     3  NA
Note: For the given list the results look similar, but this will not be the case if the list looks e.g. like:
l <- list(A = 999, B = c(7, 55), A = 9)
Potential solutions might be:
Taking the first match:
df$value_l <- l[as.character(df$group)]
df
#  group value value_l
#1     A     1     999
#2     B     2   7, 55
#3     C     3    NULL
Making a left join:
merge(df, list2DF(list(group = names(l), value_l = l)), all.x=TRUE)
#merge(df, data.frame(group = names(l), value_l = I(l)), all.x=TRUE) #Alternative
#  group value value_l
#1     A     1     999
#2     A     1       9
#3     B     2   7, 55
#4     C     3      NA
Other options:
merge(df, list2DF(list(group = names(l), value_l = l))) #Inner
merge(df, list2DF(list(group = names(l), value_l = l)), all=TRUE) #Outer
merge(df, list2DF(list(group = names(l), value_l = l)), all.y=TRUE) #Right
For other options please have a look at How to join (merge) data frames (inner, outer, left, right).

Use merge from base R
merge(df, stack(l), by.x = 'group', by.y = 'ind', all.x = TRUE)
  group value values
1     A     1    999
2     B     2     55
3     C     3     NA
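The heavy lifting is done by stack(), which turns a named list into a two-column data.frame of values and ind; a quick illustration (my own toy call, not from the answer):
stack(list(A = 999, B = 55))
#   values ind
# 1    999   A
# 2     55   B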
Or with dplyr
library(dplyr)
df %>%
  rowwise %>%
  mutate(value_l = if (group %in% names(l)) l[[group]] else NA) %>%
  ungroup
Output:
# A tibble: 3 × 3
  group value value_l
  <chr> <dbl>   <dbl>
1 A         1     999
2 B         2      55
3 C         3      NA
Or using enframe/unnest
library(tidyr)
library(tibble)
enframe(l, name = 'group', value = 'value_l') %>%
  unnest(value_l) %>%
  left_join(df, .)
  group value value_l
1     A     1     999
2     B     2      55
3     C     3      NA
Or if it can be a list column
df$value_l <- l[df$group]
> df
  group value value_l
1     A     1     999
2     B     2      55
3     C     3    NULL

You can do:
library(tidyverse)
l |>
  as.data.frame() |>
  pivot_longer(cols = everything(),
               names_to = "group",
               values_to = "value_1") |>
  left_join(x = df,
            y = _,
            by = "group")
which gives:
  group value value_1
1     A     1     999
2     B     2      55
3     C     3      NA

Update:
Maybe this one:
library(dplyr)
stack(unlist(l)) %>%
  full_join(df, by = c("ind" = "group"))
  values ind value
1    999   A     1
2     55   B     2
3     NA   C     3
First answer:
Slightly different:
library(dplyr)
library(tidyr)
bind_rows(l) %>%
  pivot_longer(everything()) %>%
  full_join(df, by = c("name" = "group")) %>%
  select(name, value = value.y, value_l = value.x)
  name  value value_l
  <chr> <dbl>   <dbl>
1 A         1     999
2 B         2      55
3 C         3      NA

This is a simpler version of what GKi suggested with unlist(). If your list always maps a name to a single numeric value, you can convert it to a named vector and then use it as a lookup vector, which is simpler than doing merges or matches:
temp_vec = unlist(l)
df$l_value = temp_vec[df$group]
df
  group value l_value
1     A     1     999
2     B     2      55
3     C     3      NA
Without the intermediate variable, as a single-line solution:
df$l_value = unlist(l)[df$group]
df
  group value l_value
1     A     1     999
2     B     2      55
3     C     3      NA
Depending on what else you need the list for, it may even make sense just to use a named vector instead of a list in the first place.
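For instance, a minimal sketch of that named-vector-first idea (the lookup name is mine):
lookup <- c(A = 999, B = 55) #named vector instead of a list
df$value_l <- unname(lookup[df$group]) #missing names yield NA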

Related

R: How can I merge two columns into one (to get a longer column, not merge by column)?

For example, I have this data frame:
Id Age
1  14
2  28
and I want to make a long column like this:
Id new column
1  1
2  2
   14
   28
What should I do?
We may unlist the data and create the columns by padding with NA up to the max length:
lst1 <- list(df1$id, unlist(df1))
out <- data.frame(lapply(lst1, `length<-`, max(lengths(lst1))))
names(out) <- c("id", "new_column")
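The `length<-` call is what does the padding: assigning a longer length to a vector fills the new positions with NA. A quick illustration of just that step:
x <- 1:2
length(x) <- 4 #pad to length 4
x
# [1]  1  2 NA NA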
Here is another approach:
df1 <- data.frame(New_column = c(df[,"Id"], df[,"Age"]))
merge(df$Id, df1, by="row.names", all=TRUE)[,-1]
Output:
   x New_column
1  1          1
2  2          2
3 NA         14
4 NA         28
An approach with dplyr
library(dplyr)
df %>%
  mutate(Age = Id) %>%
  bind_rows(
    df %>%
      mutate(Id = NA)
  ) %>%
  rename(new_column = Age)
# A tibble: 4 x 2
     Id new_column
  <int>      <int>
1     1          1
2     2          2
3    NA         14
4    NA         28

Update column of dataframe1 based on column of dataframe2 + create new row if column1 is not empty

I have a dataframe that I want to update with information from another dataframe, a lookup dataframe.
In particular, I'd like to update the cells of df1$value with the cells of df2$value based on the columns id and id2.
If the cell of df1$value is NA, I know how to do it using the package data.table
BUT
If the cell of df1$value is not empty, data.table will update it with the cell of df2$value anyway.
I don't want that. I'd like to have that:
IF the cell of df1$value is NOT empty (in this case the row in which df1$id is c), do not update the cell; instead create a duplicate row of df1 in which the cell of df1$value takes the value from the cell of df2$value.
I already looked for solutions online but I couldn't find any. Is there a way to do it easily with tidyverse or data.table or an sql-like package?
Thank you for your help!
edit: I've just realized that I forgot to put the corner case in which in both dataframes the row is NA. With the replies I had so far (07/08/19 14:42) the row e is removed from the last dataframe. But I really need to keep it!
Outline:
> df1
  id id2 value
1  a   1   100
2  b   2   101
3  c   3    50
4  d   4    NA
5  e   5    NA
> df2
  id id2 value
1  c   3   200
2  d   4   201
3  e   5    NA
# I'd like:
> df5
  id id2 value
1  a   1   100
2  b   2   101
3  c   3    50
4  c   3   200
5  d   4   201
6  e   5    NA
This is how I managed to solve my problem but it's quite cumbersome.
# I create the dataframes
df1 <- data.frame(id=c('a', 'b', 'c', 'd'), id2=c(1,2,3,4),value=c(100, 101, 50, NA))
df2 <- data.frame(id=c('c', 'd', 'e'),id2=c(3,4, 5), value=c(200, 201, 300))
# I first do a left_join so I'll have two value columnes: value.x and value.y
df3 <- dplyr::left_join(df1, df2, by = c("id","id2"))
# > df3
#   id id2 value.x value.y
# 1  a   1     100      NA
# 2  b   2     101      NA
# 3  c   3      50     200
# 4  d   4      NA     201
# I keep only the rows in which value.x is NA, so the 4th row
df4 <- df3 %>%
  filter(is.na(value.x)) %>%
  dplyr::select(id, id2, value.y)
# > df4
#   id id2 value.y
# 1  d   4     201
# I rename the column "value.y" to "value". (I don't do it with dplyr because the function dplyr::replace doesn't work in my R version)
colnames(df4)[colnames(df4) == "value.y"] <- "value"
# > df4
#   id id2 value
# 1  d   4   201
# I update the df1 with the df4$value. This step is necessary to update only the rows of df1 in which df1$value is NA
setDT(df1)[setDT(df4), on = c("id","id2"), `:=`(value = i.value)]
# > df1
#    id id2 value
# 1:  a   1   100
# 2:  b   2   101
# 3:  c   3    50
# 4:  d   4   201
# I keep only the rows in which both value.x and value.y are not NA, i.e. the 3rd row
df3 <- as_tibble(df3) %>%
  filter(!is.na(value.x), !is.na(value.y)) %>%
  dplyr::select(id, id2, value.y)
# > df3
# # A tibble: 1 x 3
#   id      id2 value.y
#   <chr> <dbl>   <dbl>
# 1 c         3     200
# I rename column df3$value.y to value
colnames(df3)[colnames(df3) == "value.y"] <- "value"
# I bind by rows df1 and df3 and I order by the column id
df5 <- rbind(df1, df3) %>%
  arrange(id)
# > df5
#   id id2 value
# 1  a   1   100
# 2  b   2   101
# 3  c   3    50
# 4  c   3   200
# 5  d   4   201
A left join with data.table:
library(data.table)
setDT(df1); setDT(df2)
df2[df1, on=.(id, id2), .(value =
  if (.N == 0) i.value
  else na.omit(c(i.value, x.value))
), by=.EACHI]
   id id2 value
1:  a   1   100
2:  b   2   101
3:  c   3    50
4:  c   3   200
5:  d   4   201
How it works: The syntax is x[i, on=, j, by=.EACHI]: for each row of i = df1 do j.
In this case j = .(value = expr) where .() is a shortcut to list() since in general j should return a list of columns.
Regarding the expression, .N is the number of rows of x = df2 that are found for each row of i = df1, so if no matches are found we keep values from i; and otherwise we keep values from both tables, dropping missing values.
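A toy illustration of that .N behavior (tables x and i here are mine, not from the question):
library(data.table)
x <- data.table(k = c("a", "a", "b"), v = 1:3)
i <- data.table(k = c("a", "c"))
x[i, on = .(k), .(n = .N), by = .EACHI] #j runs once per row of i
#    k n
# 1: a 2
# 2: c 0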
A dplyr way:
bind_rows(df1, semi_join(df2, df1, by=c("id", "id2"))) %>%
  group_by(id, id2) %>%
  do(if (nrow(.) == 1) . else na.omit(.))
# A tibble: 5 x 3
# Groups: id, id2 [4]
  id      id2 value
  <chr> <dbl> <dbl>
1 a         1   100
2 b         2   101
3 c         3    50
4 c         3   200
5 d         4   201
Comment. The dplyr way is kind of awkward because do() is needed to get a dynamically determined number of rows, but do() is typically discouraged and does not support n() and other helper functions. The data.table way is kind of awkward because there is no simple semi join functionality.
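For reference, one common data.table semi-join idiom looks like this (my sketch, not part of the answer above):
#rows of df2 that have a match in df1
w <- df2[df1, on = .(id, id2), which = TRUE, nomatch = 0L]
df2[sort(unique(w))]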
Data:
df1 <- data.frame(id=c('a', 'b', 'c', 'd'), id2=c(1,2,3,4),value=c(100, 101, 50, NA))
df2 <- data.frame(id=c('c', 'd', 'e'),id2=c(3,4, 5), value=c(200, 201, 300))
> df1
  id id2 value
1  a   1   100
2  b   2   101
3  c   3    50
4  d   4    NA
> df2
  id id2 value
1  c   3   200
2  d   4   201
3  e   5   300
Another idea via base R is to remove the rows from df2 that do not match in df1, bind the two data frames rowwise (rbind) and omit the NAs, i.e.
na.omit(rbind(df1, df2[do.call(paste, df2[1:2]) %in% do.call(paste, df1[1:2]),]))
#  id id2 value
#1  a   1   100
#2  b   2   101
#3  c   3    50
#5  c   3   200
#6  d   4   201
To answer your new requirements, we can keep the same rbind method and filter based on your conditions, i.e.
dd <- rbind(df1, df2[do.call(paste, df2[1:2]) %in% do.call(paste, df1[1:2]),])
dd[!!with(dd, ave(value, id, id2, FUN = function(i)(all(is.na(i)) & !duplicated(i)) | !is.na(i))),]
#  id id2 value
#1  a   1   100
#2  b   2   101
#3  c   3    50
#5  e   5    NA
#6  c   3   200
#7  d   4   201
A possible approach with data.table using update join then full outer merge:
merge(df1[is.na(value), value := df2[.SD, on=.(id, id2), x.value]], df2, all=TRUE)
output:
   id id2 value
1:  a   1   100
2:  b   2   101
3:  c   3    50
4:  c   3   200
5:  d   4   201
6:  e   5    NA
data:
library(data.table)
df1 <- data.table(id=c('a', 'b', 'c', 'd', 'e'), id2=c(1,2,3,4,5),value=c(100, 101, 50, NA, NA))
df2 <- data.table(id=c('c', 'd', 'e'), id2=c(3,4, 5), value=c(200, 201, NA))
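The subtle part above is the update join: df1[is.na(value), value := df2[.SD, on=.(id, id2), x.value]] subsets df1 to its NA rows, looks each of them up in df2, and writes the matched x.value back by reference. A minimal sketch of the same pattern on toy tables (names are mine):
library(data.table)
DT <- data.table(k = c("a", "b"), v = c(NA, 2))
ref <- data.table(k = "a", v = 10)
DT[is.na(v), v := ref[.SD, on = .(k), x.v]] #fill NAs from ref by reference
DT
#    k  v
# 1: a 10
# 2: b  2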
Here is one way using left_join and gather
library(dplyr)
left_join(df1, df2, by = c("id","id2")) %>%
  tidyr::gather(key, value, starts_with("value"), na.rm = TRUE) %>%
  select(-key)
#  id id2 value
#1  a   1   100
#2  b   2   101
#3  c   3    50
#7  c   3   200
#8  d   4   201
For the updated case, we can do
left_join(df1, df2, by = c("id","id2")) %>%
  tidyr::gather(key, value, starts_with("value")) %>%
  group_by(id, id2) %>%
  filter((all(is.na(value)) & !duplicated(value)) | !is.na(value)) %>%
  select(-key)
#  id    id2 value
#  <chr> <int> <int>
#1 a         1   100
#2 b         2   101
#3 c         3    50
#4 e         5    NA
#5 c         3   200
#6 d         4   201

Replace all NA values for variable with one row equal to 0

Slightly difficult to phrase; as far as I saw, none of the similar questions answered my problem.
I have a data.frame such as:
df1 <- data.frame(id = rep(c("a", "b", "c"), each = 4),
                  val = c(NA, NA, NA, NA, 1, 2, 2, 3, NA, 2, NA, 3))
df1
   id val
1   a  NA
2   a  NA
3   a  NA
4   a  NA
5   b   1
6   b   2
7   b   2
8   b   3
9   c  NA
10  c   2
11  c  NA
12  c   3
and I want to get rid of all the NA values (easy enough using e.g. filter()), but make sure that if this removes every row for some id value (in this case it removes every instance of "a"), one extra row is inserted of (e.g.) a = 0,
so that:
  id val
1  a   0
2  b   1
3  b   2
4  b   2
5  b   3
6  c   2
7  c   3
obviously easy enough to do this in a roundabout way but I was wondering if there's a tidy/elegant way to do this. I thought tidyr::complete() might help but not entirely sure how to apply it to a case like this
I don't care about the order of the rows
Cheers!
edit: updated with clearer desired output. might make desired answers submitted before that a bit less clear
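For what it's worth, tidyr::complete() can indeed be made to work here; a sketch, assuming id is a factor so the lost level comes back and fill supplies the 0:
library(dplyr)
library(tidyr)
df1 %>%
  filter(!is.na(val)) %>%
  complete(id, fill = list(val = 0))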
Another idea using dplyr,
library(dplyr)
df1 %>%
  group_by(id) %>%
  mutate(val = ifelse(row_number() == 1 & all(is.na(val)), 0, val)) %>%
  na.omit()
which gives,
# A tibble: 5 x 2
# Groups: id [2]
  id      val
  <fct> <dbl>
1 a         0
2 b         1
3 b         2
4 b         2
5 b         3
We may do
df1 %>% group_by(id) %>% do(if(all(is.na(.$val))) replace(.[1, ], 2, 0) else na.omit(.))
# A tibble: 5 x 2
# Groups: id [2]
#   id      val
#   <fct> <dbl>
# 1 a         0
# 2 b         1
# 3 b         2
# 4 b         2
# 5 b         3
After grouping by id, if everything in val is NA, then we leave only the first row with the second element replaced by 0, otherwise the same data is returned after applying na.omit.
In a more readable format that would be
df1 %>% group_by(id) %>%
  do(if(all(is.na(.$val))) data.frame(id = .$id[1], val = 0) else na.omit(.))
(Here I presume that you indeed want to get rid of all NA values; otherwise there is no need for na.omit.)
df1[is.na(df1)] <- 0
df1[!(duplicated(df1$id) & df1$val == 0), ]
  id val
1  a   0
5  b   1
6  b   2
7  b   2
8  b   3
A base R option is to find the groups with all NAs, transform them by changing their val to 0, and select only unique rows so that there is only one row per group. We rbind this dataframe with the groups which are !all_NA.
all_NA <- with(df1, ave(is.na(val), id, FUN = all))
rbind(unique(transform(df1[all_NA, ], val = 0)), df1[!all_NA, ])
#  id val
#1  a   0
#5  b   1
#6  b   2
#7  b   2
#8  b   3
The dplyr option looks ugly, but one way is to make two groups of dataframes: one with the groups of all-NA values and the other with the groups of all non-NA values. For the groups with all NA values we add a row with the id and val = 0 and bind this to the other group.
library(dplyr)
bind_rows(df1 %>%
            group_by(id) %>%
            filter(all(!is.na(val))),
          df1 %>%
            group_by(id) %>%
            filter(all(is.na(val))) %>%
            ungroup() %>%
            summarise(id = unique(id),
                      val = 0)) %>%
  arrange(id)
#  id      val
#  <fct> <dbl>
#1 a         0
#2 b         1
#3 b         2
#4 b         2
#5 b         3
I changed the df to make the example more exhaustive:
df1 <- data.frame(id = rep(c("a", "b", "c"), each = 4),
                  val = c(NA, NA, NA, NA, 1, 2, 2, 3, NA, 2, NA, 3))
library(dplyr)
df1 %>%
  group_by(id) %>%
  mutate(case = sum(is.na(val)) == n(), row_num = row_number()) %>%
  mutate(val = ifelse(is.na(val) & case, 0, val)) %>%
  filter(!(case & row_num != 1)) %>%
  select(id, val)
Output
  id      val
  <fct> <dbl>
1 a         0
2 b         1
3 b         2
4 b         2
5 b         3
6 c        NA
7 c         2
8 c        NA
9 c         3
Another base approach, one that doesn't maintain the order of the rows and takes advantage of factors remembering lost values:
df1 <- na.omit(df1)
df1 <- rbind(
  df1,
  data.frame(
    id = levels(df1$id)[!levels(df1$id) %in% df1$id],
    val = 0)
)
I do personally prefer the dplyr approach given by Sotos, as I don't like rbind-ing data.frames back together, so it's a matter of taste; but this isn't unbearably complicated to my eye. It's easy enough to adapt to a character id column with a unique(df1$id) variable, as sketched below.
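A sketch of that adaptation for a character id column (capturing the ids up front; the ids variable is mine):
ids <- unique(df1$id) #capture before na.omit drops "a" entirely
df1 <- na.omit(df1)
df1 <- rbind(df1, data.frame(id = setdiff(ids, df1$id), val = 0)) #assumes at least one id was lost, like the original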
Here is an option too:
df1 %>%
  mutate_if(is.factor, as.character) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>%
  slice(4:nrow(.))
This gives:
  id val
1  a   0
2  b   1
3  b   2
4  b   2
5  b   3
Alternative:
df1 %>%
  mutate_if(is.factor, as.character) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>%
  unique()
UPDATE based on other requirements:
Some users suggested to test on this dataframe. This answer assumes you'll look at everything by hand, which makes it less useful at scale, but here goes:
df1 <- data.frame(id = rep(c("a", "b","c"), each = 4), val = c(NA, NA, NA, NA, 1, 2, 2, 3,NA,2,NA,3))
df1 %>%
  mutate_if(is.factor, as.character) %>%
  mutate(val = ifelse(id == "a", 0, val)) %>%
  slice(4:nrow(.))
This yields:
  id val
1  a   0
2  b   1
3  b   2
4  b   2
5  b   3
6  c  NA
7  c   2
8  c  NA
9  c   3
Here is a base R solution.
res <- lapply(split(df1, df1$id), function(DF){
  if(anyNA(DF$val)) {
    i <- is.na(DF$val)
    DF$val[i] <- 0
    DF <- rbind(DF[i & !duplicated(DF[i, ]), ], DF[!i, ])
  }
  DF
})
res <- do.call(rbind, res)
row.names(res) <- NULL
res
#  id val
#1  a   0
#2  b   1
#3  b   2
#4  b   2
#5  b   3
Edit.
A dplyr solution could be the following.
It was tested with the original dataset posted by the OP, with the dataset in Vivek Kalyanarangan's answer and with the dataset in markus' comment, renamed df2 and df3, respectively.
library(dplyr)
na2zero <- function(DF){
  DF %>%
    group_by(id) %>%
    mutate(val = ifelse(is.na(val), 0, val),
           crit = val == 0 & duplicated(val)) %>%
    filter(!crit) %>%
    select(-crit)
}
na2zero(df1)
na2zero(df2)
na2zero(df3)
One may try this:
df1 = data.frame(id = rep(c("a", "b", "c"), each = 4),
                 val = c(NA, NA, NA, NA, 1, 2, 2, 3, NA, 2, NA, 3))
df1
#   id val
#1   a  NA
#2   a  NA
#3   a  NA
#4   a  NA
#5   b   1
#6   b   2
#7   b   2
#8   b   3
#9   c  NA
#10  c   2
#11  c  NA
#12  c   3
The task is to remove all rows corresponding to an id IFF val for that id is all NAs, and to add a new row with this id and val = 0.
In this example, id = a.
Note: val for c also has NAs, but not all of the val values for c are NA; therefore we only need to remove the rows for c where val = NA.
So let's create another column, say val2, where 0 indicates that the group is all NAs and 1 otherwise.
library(dplyr)
df1 = df1 %>%
  group_by(id) %>%
  mutate(val2 = if_else(condition = all(is.na(val)), true = 0, false = 1))
df1
# A tibble: 12 x 3
# Groups: id [3]
#   id      val  val2
#   <fct> <dbl> <dbl>
#1  a        NA     0
#2  a        NA     0
#3  a        NA     0
#4  a        NA     0
#5  b         1     1
#6  b         2     1
#7  b         2     1
#8  b         3     1
#9  c        NA     1
#10 c         2     1
#11 c        NA     1
#12 c         3     1
Get the list of ids with corresponding val = NA for all.
all_na = unique(df1$id[df1$val2 == 0])
Then remove the rows with val = NA from the dataframe df1.
df1 = na.omit(df1)
df1
# A tibble: 6 x 3
# Groups: id [2]
#   id      val  val2
#   <fct> <dbl> <dbl>
# 1 b         1     1
# 2 b         2     1
# 3 b         2     1
# 4 b         3     1
# 5 c         2     1
# 6 c         3     1
And create a new dataframe with the ids in all_na and val = 0:
all_na_df = data.frame(id = all_na, val = 0)
all_na_df
#   id val
# 1  a   0
Then combine these two dataframes.
df1 = bind_rows(all_na_df, df1[,c('id', 'val')])
df1
#   id val
# 1  a   0
# 2  b   1
# 3  b   2
# 4  b   2
# 5  b   3
# 6  c   2
# 7  c   3
Hope this helps and Edits are most welcomed :-)

R: Retain row names with order()

I want to produce output that shows my df sorted by the number of NAs in each row (as in the df_rows_sorted_by_NAs column below) but that keeps the original row name/number (df col). The combination would look like column 3 below:
# df_rows_sorted_by_NAs   df               desired_output
# Row 1 : 38              Row 442 : 37     Row 3112 : 38
# Row 2 : 38              Row 3112 : 38    Row 3113 : 38
# Row 3 : 37              Row 3113 : 38    Row 442 : 37
# Row 18 : 30             Row 1128 : 30    Row 1128 : 30
I get the first output with this:
# Sort df by num of NAs
df_rows_sorted_by_NAs <- df[order(rowSums(is.na(df)), decreasing = TRUE), drop = FALSE, ]
# View obs with >=30 NAs
for (row_name in row.names(df_rows_sorted_by_NAs)) {
  if (rowSums(is.na(df_rows_sorted_by_NAs[row_name, ])) >= 30) {
    cat("Row ", row_name, ": ",
        rowSums(is.na(df_rows_sorted_by_NAs[row_name, ])), "\n")
  }
}
I get the second output with this:
for (row_name in row.names(df)) {
  if (rowSums(is.na(df[row_name, ])) >= 30) {
    cat("Row ", row_name, ": ", rowSums(is.na(df[row_name, ])), "\n")
  }
}
I tried drop = FALSE for order but got the same result. Any suggestions on how to keep the row names when I create the new df?
This seems to work for me:
a <- c(1, 2, 3)
b<- c(1, NA, 3)
c <- c(NA, NA, 3)
d <- c(1, NA, NA)
e <- c(NA, 2, 3)
df <- data.frame(a, b, c, d, e)
df
df <- df[order(rowSums(is.na(df)), decreasing = TRUE),]
df
gives
  a  b  c  d  e
1 1  1 NA  1 NA
2 2 NA NA NA  2
3 3  3  3 NA  3
then
  a  b  c  d  e
2 2 NA NA NA  2
1 1  1 NA  1 NA
3 3  3  3 NA  3
and then
df[rowSums(is.na(df)) > 1, ]
  a  b  c  d  e
2 2 NA NA NA  2
1 1  1 NA  1 NA
Is the actual question how do you put "Row:" in front?
sub <- df[rowSums(is.na(df)) > 1, ]
paste0("Row ", row.names(sub), ": ", rowSums(is.na(sub)))
Gives you the vector with the strings (note the counts are computed on the same subset, so nothing recycles); you can make that print vertically, but that's a different question than getting the sort done.
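For the vertical printing, writeLines() would do; a quick sketch reusing sub from above:
writeLines(paste0("Row ", row.names(sub), ": ", rowSums(is.na(sub))))
# Row 2: 3
# Row 1: 2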
The tidyverse package is good for these tasks:
library(tidyverse)
An example dataframe:
df <- tribble(
  ~Length, ~Width, ~Mass, ~Date,
  10.3,    3.1,    0.021, "2018-11-28",
  NA,      3.3,    NA,    "2018-11-29",
  10.5,    NA,     0.025, "2018-11-30"
)
With package dplyr, you can create an ID column and "number of NAs" column with row_number() and rowSums. Of course, if you already have a row ID column, then you can remove ID = row_number() from mutate:
df %>%
  mutate(ID = row_number(), noNAs = rowSums(is.na(.)))
... results in ...
# A tibble: 3 x 6
  Length Width  Mass Date          ID noNAs
   <dbl> <dbl> <dbl> <chr>      <int> <dbl>
1   10.3   3.1 0.021 2018-11-28     1     0
2   NA     3.3 NA    2018-11-29     2     2
3   10.5  NA   0.025 2018-11-30     3     1
... adding select by ID and noNAs, arranging by noNAs (in descending order):
df <- df %>%
  mutate(ID = row_number(), noNAs = rowSums(is.na(.))) %>%
  select(ID, noNAs) %>%
  arrange(desc(noNAs))
... results in ...
# A tibble: 3 x 2
     ID noNAs
  <int> <dbl>
1     2     2
2     3     1
3     1     0
Finally, if you wanted to filter for rows with more than 30 NAs, then:
df %>% filter(noNAs > 30)

R: Conditionally replacing values based on column prefixes and suffixes

I have two data frames. Data frame A has many observations/rows, an ID for each observation, and many additional columns. For a subset of observations X, the values for a set of columns are missing/NA. Data frame B contains a subset of the observations in X (which can be matched across data frames using the ID) and variables with identical names as in data frame A, but containing values to replace the missing values in the set of columns with missing/NA.
My code below (using a join operation) merely adds columns rather than replacing missing values. For each of the additional variables (let's name them W) in B, the resulting table produces W.x and W.y.
library(dplyr)
foo <- data.frame(id = seq(1:6), x = c(NA, NA, NA, 1, 3, 8), z = seq_along(10:15))
bar <- data.frame(id = seq(1:2), x = c(10, 9))
dplyr::left_join(x = foo, y = bar, by = "id")
I am trying to replace the missing values in A using the values in B based on the ID, but do so in an efficient manner since I have many columns and many rows. My goal is this:
  id  x z
1  1 10 1
2  2  9 2
3  3 NA 3
4  4  1 4
5  5  3 5
6  6  8 6
One thought was to use ifelse() after joining, but typing out ifelse() functions for all of the variables is not feasible. Is there a way to do this simply without the database join or is there a way to apply a function across all columns ending in .x to replace the values in .x with the value in .y if the value in .x is missing?
Another attempt, which should essentially be only one assignment operation, using @alistaire's data again. (Note that pmax keeps the larger of the two values, so this only behaves like an NA-fill when the non-missing values never conflict, as is the case here.)
vars <- c("x","y")
foo[vars] <- Map(pmax, foo[vars], bar[match(foo$id, bar$id), vars], na.rm=TRUE)
foo
#  id  x y z
#1  1 10 1 1
#2  2  9 2 2
#3  3 NA 3 3
#4  4  1 4 4
#5  5  3 5 5
#6  6  8 6 6
EDIT
Updating the answer using @alistaire's example dataframe.
We can extend the same answer given below using mapply so that it can handle multiple columns for both foo and bar.
Finding out common columns between two dataframes and sorting them so they are in the same order.
vars <- sort(intersect(names(foo), names(bar))[-1])
foo[vars] <- mapply(function(x, y) {
  ind = is.na(x)
  replace(x, ind, y[match(foo$id[ind], bar$id)])
}, foo[vars], bar[vars])
foo
#  id  x y z
#1  1 10 1 1
#2  2  9 2 2
#3  3 NA 3 3
#4  4  1 4 4
#5  5  3 5 5
#6  6  8 6 6
Original Answer
I think this does what you are looking for :
foo[-1] <- sapply(foo[-1], function(x) {
  ind = is.na(x)
  replace(x, ind, bar$x[match(foo$id[ind], bar$id)])
})
foo
#  id  x z
#1  1 10 1
#2  2  9 2
#3  3 NA 3
#4  4  1 4
#5  5  3 5
#6  6  8 6
For every column (except id) we find the missing value in foo and replace it with corresponding values from bar.
If you don't mind verbose base R approaches, you can easily accomplish this using merge() and careful subsetting of your data frame.
df <- merge(foo, bar, by="id", all.x=TRUE)
names(df) <- c("id", "x", "z", "y")
df$x[is.na(df$x)] <- df$y[is.na(df$x)]
df <- df[c("id", "x", "z")]
> df
  id  x z
1  1 10 1
2  2  9 2
3  3 NA 3
4  4  1 4
5  5  3 5
6  6  8 6
You can iterate dplyr::coalesce over the intersect of non-grouping columns. It's not elegant, but it should scale reasonably well:
library(tidyverse)
foo <- data.frame(id = seq(1:6),
                  x = c(NA, NA, NA, 1, 3, 8),
                  y = 1:6, # add extra shared variable
                  z = seq_along(10:15))
bar <- data.frame(id = seq(1:2),
                  y = c(1L, NA),
                  x = c(10, 9))
# names of non-grouping variables in both
vars <- intersect(names(foo), names(bar))[-1]
foobar <- left_join(foo, bar, by = 'id')
foobar <- vars %>%
  map(paste0, c('.x', '.y')) %>%  # make list of columns to coalesce
  map(~foobar[.x]) %>%            # for each set, subset foobar to a two-column data.frame
  invoke_map(.f = coalesce) %>%   # ...and coalesce it into a vector
  set_names(vars) %>%             # add names to list elements
  bind_cols(foobar) %>%           # bind into data.frame and cbind to foobar
  select(union(names(foo), names(bar)))  # drop duplicated columns
foobar
#> # A tibble: 6 x 4
#>      id     x     y     z
#>   <int> <dbl> <int> <int>
#> 1     1    10     1     1
#> 2     2     9     2     2
#> 3     3    NA     3     3
#> 4     4     1     4     4
#> 5     5     3     5     5
#> 6     6     8     6     6
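As an aside, if a recent dplyr (>= 1.0) is available, rows_patch() expresses this pattern directly, overwriting only the NA cells of x with values from matching rows of y; a sketch using foo and bar from above (it assumes bar's ids all exist in foo):
library(dplyr)
rows_patch(foo, bar, by = "id") #only NA cells in foo are overwritten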
