Replace NA each column based on another vector using dplyr - r

I am trying to replace NAs in a data.frame of many columns using another vector in which the replacement values for each column are given. I know how I could replace each value using a function, but not to find the value in another vector. I am searching for a dplyr approach:
For example:
require(dplyr)
test <- data.frame(A = c(1,2,3,NA), B = c(4,5,NA,2), C = c(NA,2,2,NA), D = c(1,2,3,4))
replace_na <- c(A = 100, B = 200, C = 300)
# Replace with median should be replace with look up value in vector based on the name of the vector or position
test %>% mutate_each_(funs(replace(., is.na(.), median(.,na.rm = T))), names(replace_na))
expected_result <- data.frame(A = c(1,2,3,100), B = c(4,5,200,2), C = c(300,2,2,300), D = c(1,2,3,4))
> expected_result
A B C D
1 1 4 300 1
2 2 5 2 2
3 3 200 2 3
4 100 2 300 4

It is as easy as using replace_na function from tidyr-package:
library(tidyr)
test %>% replace_na(as.list(replacements))
The output:
A B C D
1 1 4 300 1
2 2 5 2 2
3 3 200 2 3
4 100 2 300 4
This function needs a list for which columns the NA's you want to replace. So, it is possible to replace for only selected columns. Example:
replacements2 <- list(B = 200, C = 300)
test %>% replace_na(replacements2)
output:
A B C D
1 1 4 300 1
2 2 5 2 2
3 3 200 2 3
4 NA 2 300 4
As you can see, only the NA's for the B and C columns are replaced.
Data:
test <- data.frame(A = c(1,2,3,NA), B = c(4,5,NA,2), C = c(NA,2,2,NA), D = c(1,2,3,4))
replacements <- c(A = 100, B = 200, C = 300)

We can use Map from base R
test[names(replace_na)] <- Map(function(x,y)
replace(x, is.na(x), y), test[names(replace_na)], replace_na)
test
# A B C D
#1 1 4 300 1
#2 2 5 2 2
#3 3 200 2 3
#4 100 2 300 4
Or with tidyverse
library(tidyverse)
test %>%
select_at(names(replace_na)) %>%
map2_df(., replace_na, ~replace(., is.na(.), .y)) %>%
bind_cols(., select_at(test, setdiff(names(test), names(replace_na))))
# A tibble: 4 x 4
# A B C D
# <dbl> <dbl> <dbl> <dbl>
#1 1 4 300 1
#2 2 5 2 2
#3 3 200 2 3
#4 100 2 300 4
Or with set from data.table
library(data.table)
setDT(test)
for(j in names(replace_na)){
set(test, i = which(is.na(test[[j]])), j = j, value = replace_na[j])
}
test
# A B C D
#1: 1 4 300 1
#2: 2 5 2 2
#3: 3 200 2 3
#4: 100 2 300 4

Related

is it possible to filter rows of one dataframe based on another dataframe?

is it possible to filter rows of one dataframe based on another dataframe?
I have this 2 dataframe:
df_node <- data.frame( id= c("a","b","c","d","e","f","g","h","i"),
group= c(1,1,1,2,2,2,3,3,3))
df_link <- data.frame(from = c("a","d","f","i","b"),
to = c("d","f","i","b","h"))
I would like to delete the lines with characters that are not present in the second dataframe, like this:
here is a basic way to do that:
df_node <- data.frame( id= c("a","b","c","d","e","f","g","h","i"),
group= c(1,1,1,2,2,2,3,3,3))
df_link <- data.frame(from = c("a","d","f","i","b"),
to = c("d","f","i","b","h"))
library(dplyr)
df_result <- df_node%>%
filter(id%in%c(df_link$from,df_link$to))
df_result
# > df_result
# id group
# 1 a 1
# 2 b 1
# 3 d 2
# 4 f 2
# 5 h 3
# 6 i 3
We could use a semi_join:
library(dplyr)
df_node |>
semi_join(tibble(id = c(df_link$from, df_link$to)))
Output:
id group
1 a 1
2 b 1
3 d 2
4 f 2
5 h 3
6 i 3
Here is a oneliner with base R:
df_node[df_node$id %in% unlist(df_link),]
id group
1 a 1
2 b 1
4 d 2
6 f 2
8 h 3
9 i 3
But you could also use a join:
library(dplyr)
df_uniqueID <- data.frame(id = unique(c(df_link$from,df_link$to)) )
right_join(df_node,df_uniqueID)
Joining, by = "id"
id group
1 a 1
2 b 1
3 d 2
4 f 2
5 h 3
6 i 3

bind_rows to each group of tibble

Consider the following two tibbles:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value = 1:6)
So a and b have the same columns and b has an additional column called id.
I want to do the following: group b by id and then add tibble a on top of each group.
So the output should look like this:
# A tibble: 10 x 3
id time value
<chr> <int> <int>
1 a -1 100
2 a 0 200
3 a 1 1
4 a 2 2
5 a 3 3
6 b -1 100
7 b 0 200
8 b 1 4
9 b 2 5
10 b 3 6
Of course there are multiple workarounds to achieve this (like loops for example). But in my case I have a large number of IDs and a very large number of columns.
I would be thankful if anyone could point me towards the direction of a solution within the tidyverse.
Thank you
We can expand the data frame a with id from b and then bind_rows them together.
library(tidyverse)
a2 <- expand(a, id = b$id, nesting(time, value))
b2 <- bind_rows(a2, b) %>% arrange(id, time)
b2
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
split from base R will divide a data frame into a list of subsets based on an index.
b %>%
split(b[["id"]]) %>%
lapply(bind_rows, a) %>%
lapply(select, -"id") %>%
bind_rows(.id = "id")
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a 1 1
# 2 a 2 2
# 3 a 3 3
# 4 a -1 100
# 5 a 0 200
# 6 b 1 4
# 7 b 2 5
# 8 b 3 6
# 9 b -1 100
# 10 b 0 200
An idea (via base R) is to split your data frame and create a new one with id + the other data frame and rbind, i.e.
df = do.call(rbind, lapply(split(b, b$id), function(i)rbind(data.frame(id = i$id[1], a), i)))
which gives
id time value
a.1 a -1 100
a.2 a 0 200
a.3 a 1 1
a.4 a 2 2
a.5 a 3 3
b.1 b -1 100
b.2 b 0 200
b.3 b 1 4
b.4 b 2 5
b.5 b 3 6
NOTE: You can remove the rownames by simply calling rownames(df) <- NULL
We can nest and add the relevant rows to each nested item :
library(tidyverse)
b %>%
nest(-id) %>%
mutate(data= map(data,~bind_rows(a,.x))) %>%
unnest
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
Maybe not the most efficient way, but easy to follow:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value =
1:6)
a.a <- a %>% add_column(id = rep("a",length(a)))
a.b <- a %>% add_column(id = rep("b",length(a)))
joint <- bind_rows(b,a.a,a.b)
(joint <- arrange(joint,id))

R group by key get max value for multiple columns

I want to do something like this:
How to make a unique in R by column A and keep the row with maximum value in column B
Except my data.table has one key column, and multiple value columns. So say I have the following:
a b c
1: 1 1 1
2: 1 2 1
3: 1 2 2
4: 2 1 1
5: 2 2 5
6: 2 3 3
7: 3 1 4
8: 3 2 1
If the key is column a, I want for each unique a to return the row with the maximum b, and if there is more than one unique max b, get the one with the max c and so on for multiple columns. So the result should be:
a b c
1: 1 2 2
2: 2 3 3
3: 3 2 1
I'd also like this to be done for an arbitrary number of columns. So if my data.table had 20 columns, I'd want the max function to be applied in order from left to right.
Here is a suggested data.table solution. You might want to consider using data.table::frankv as follows:
DT[, .SD[frankv(.SD, ties.method="first")[.N],], by=a]
frankv returns the order. Then [.N] will take the largest rank. Then .SD[ subset to that particular row.
Please let me know if it fails for your larger dataset.
to make this work for any number of columns, a possible dplyr solution would be to use arrange_all
df <- data.frame(a = c(1,1,1,2,2,2,3,3), b = c(1,2,2,1,2,3,1,2),
c = c(1,1,2,1,5,3,4,1))
df %>% group_by(a) %>% arrange_all() %>% filter(row_number() == n())
# A tibble: 3 x 3
# Groups: a [3]
# a b c
# 1 1 2 2
# 2 2 3 3
# 3 3 2 1
The generic solution can be achieved for arbitrary number of column using mutate_at. In the below example c("a","b","c") are arbitrary columns.
library(dplyr)
df %>% arrange_at(.vars = vars(c("a","b","c"))) %>%
mutate(changed = ifelse(a != lead(a), TRUE, FALSE)) %>%
filter(is.na(changed) | changed ) %>%
select(-changed)
a b c
1 1 2 2
2 2 3 3
3 3 2 1
Another option could be using max and dplyr as below. The approach is to first group_by on a and then filter for max value of b. The again group_by on both a and b and filter for rows with max value of c.
library(dplyr)
df %>% group_by(a) %>%
filter(b == max(b)) %>%
group_by(a, b) %>%
filter(c == max(c))
# Groups: a, b [3]
# a b c
# <int> <int> <int>
#1 1 2 2
#2 2 3 3
#3 3 2 1
Data
df <- read.table(text = "a b c
1: 1 1 1
2: 1 2 1
3: 1 2 2
4: 2 1 1
5: 2 2 5
6: 2 3 3
7: 3 1 4
8: 3 2 1", header = TRUE, stringsAsFactors = FALSE)
dat <- data.frame(a = c(1,1,1,2,2,2,3,3),
b = c(1,2,2,1,2,3,1,2),
c = c(1,1,2,1,5,3,4,1))
library(sqldf)
sqldf("with d as (select * from 'dat' group by a order by b, c desc) select * from d order by a")
a b c
1 1 2 2
2 2 3 3
3 3 2 1

Fill NA with character in list

I have some data as follows:
library(tidyr)
library(data.table)
thisdata <- data.frame(numbers = c(1,3,4,5,6,1,2,4,5,6)
,letters = c('A','A','A','A','A','B','B','B','B','B'))
otherdata <- data.frame(numbers = c(1,2,3,4,5,6))
I am looking to split 'thisdata' by the letters column, merge the two lists to 'otherdata' by the numbers column, then fill letters NA with the corresponding letter in that list. So:
out <- split(thisdata , f = thisdata$letters )
out2 <- lapply(out, function(x) merge(x,otherdata,by="numbers",all = TRUE))
However, I can't get the 'fill' function in tidyr to work within the lapply
out3 <- lapply(out2,function(x) fill(x$channel))
Error in UseMethod("fill_") :
no applicable method for 'fill_' applied to an object of class "NULL"
This is the output I'm after, but would rather perform the calculation within the list format:
out4 <- rbindlist(out2)
out5 <- out4 %>%
fill(letters) %>% #default direction down
fill(letters,.direction = "up")
numbers letters
1: 1 A
2: 2 A
3: 3 A
4: 4 A
5: 5 A
6: 6 A
7: 1 B
8: 2 B
9: 3 B
10: 4 B
11: 5 B
12: 6 B
fill expects a data frame as first parameter, try fill(x, letters) or x %>% fill(letters) with magrittr pipe:
out3 <- lapply(out2,function(x) fill(x, letters))
out3
#$A
# numbers letters
#1 1 A
#2 2 A
#3 3 A
#4 4 A
#5 5 A
#6 6 A
#$B
# numbers letters
#1 1 B
#2 2 B
#3 3 B
#4 4 B
#5 5 B
#6 6 B
A simpler method is use tidyr::complete:
thisdata %>%
complete(numbers = otherdata$numbers, letters) %>%
arrange(letters)
# A tibble: 12 x 2
# numbers letters
# <dbl> <fctr>
# 1 1 A
# 2 2 A
# 3 3 A
# 4 4 A
# 5 5 A
# 6 6 A
# 7 1 B
# 8 2 B
# 9 3 B
#10 4 B
#11 5 B
#12 6 B

replace NAs with division of two columns

I have a data.frame that looks like:
a b c d
1 2 NA 1
NA 2 2 1
3 2 NA 1
NA NA 20 2
And I want to replace the NAs with c / d (and delete c and d) to look like:
a b
1 2
2 2
3 2
10 10
Some background: d is a sum of NAs in that particular row.
I don't know the names of the columns, so I tried a few variations of things like:
df2[, 1:(length(colnames(df2)) - 2)][is.na(df2[, 1:(length(colnames(df2)) - 2)])] = df2$c / df2$d
but got:
Error in `[<-.data.frame`(`*tmp*`, is.na(df2[, 1:(length(colnames(df2)) - :
'value' is the wrong length
Here's a way you can do this with dplyr.
library(dplyr)
df <- tibble(
a = c(1, NA, 3, NA),
b = c(2, 2, 2, NA),
c = c(NA, 2, NA, 20L),
d = c(1, 1, 1, 2)
)
df %>%
mutate_at(vars(-c, -d), funs(if_else(is.na(.), c / d, .))) %>%
select(-c, -d)
#> # A tibble: 4 x 2
#> a b
#> <dbl> <dbl>
#> 1 1 2
#> 2 2 2
#> 3 3 2
#> 4 10 10
You can specify the variables in the vars() call using any of the functions from ?dplyr::select_helpers. These could be regex, a simple vector of names, or you can just use all columns except c and d (as I've changed this example to now).
library(data.table)
data<-fread("a b c d
1 2 NA 1
NA 2 2 1
3 2 NA 1
NA NA 20 2")
names_to_loop<-names(data)
names_to_loop<-names_to_loop[names_to_loop!="c"&names_to_loop!="d"]
for (ntl in names_to_loop){
set(data,j=ntl,value=ifelse(is.na(data[[ntl]]),data[["c"]]/data[["d"]],data[[ntl]]))
}
data[,c:=NULL]
data[,d:=NULL]
> data
a b
1: 1 2
2: 2 2
3: 3 2
4: 10 10

Resources