How to fill NA rows by conditions from columns in R - r

Here is an example:
# Example data: within each v2 group only the first two rows carry values in
# v4-v6; the remaining rows are NA and are the ones to be filled.
df <- data.frame(
  v1 = rep(1:2, times = 4),
  v2 = rep(c("a", "b"), each = 4),
  v3 = paste0(rep(1:2, each = 4), rep(c("m", "n", "o", "p"), each = 2)),
  v4 = c(1, 2, NA, NA, 3, 4, NA, NA),
  v5 = c(5, 6, NA, NA, 7, 8, NA, NA),
  v6 = c(9, 10, NA, NA, 11, 12, NA, NA)
)
df
v1 v2 v3 v4 v5 v6
1 1 a 1m 1 5 9
2 2 a 1m 2 6 10
3 1 a 1n NA NA NA
4 2 a 1n NA NA NA
5 1 b 2o 3 7 11
6 2 b 2o 4 8 12
7 1 b 2p NA NA NA
8 2 b 2p NA NA NA
What I want is: if columns v1, v2 and v3 are the same once the last letter of v3 is ignored, fill the NAs from the matching rows that are not NA. In this case, the NAs in row 3 should be filled from row 1, because both rows share the key 1-a-1 once the trailing letter of v3 (m or n) is ignored. So the desired output would be:
v1 v2 v3 v4 v5 v6
1 1 a 1m 1 5 9
2 2 a 1m 2 6 10
3 1 a 1n 1 5 9
4 2 a 1n 2 6 10
5 1 b 2o 3 7 11
6 2 b 2o 4 8 12
7 1 b 2p 3 7 11
8 2 b 2p 4 8 12

I don't know but I think this is a simpler way of producing your results
library(tidyverse)
df %>%
group_by(v1,v2) %>%
fill(v4:v6)
Adding the v3 logic
df %>%
  # Helper key: the numeric part of v3, so "1m" and "1n" fall in one group.
  mutate(v7 = v3 %>% as.character() %>% parse_number()) %>%
  group_by(v1, v2, v7) %>%
  # Carry the last non-NA value downward within each group.
  fill(v4:v6) %>%
  # select() cannot drop a grouping column (dplyr silently adds it back),
  # so the grouping has to be removed before the helper key is discarded.
  ungroup() %>%
  select(-v7)

Here is a solution that recodes v3 into a variable that only takes into account the numeric part.
library(dplyr)
library(stringr)
library(tidyr) # fill() lives in tidyr, not in dplyr
# Group on v1, v2 and the numeric part of v3 (held in the temporary column
# v7), then carry the last non-NA value of v4-v6 downward within each group.
df %>%
  mutate(v7 = str_extract(v3, "[[:digit:]]+")) %>%
  group_by(v1, v2, v7) %>%
  fill(v4:v6) %>%
  # Ungroup before dropping v7: select() cannot remove a grouping column.
  ungroup() %>%
  select(-v7)

Here's a solution using data.table and zoo which ignores v3 column's last letter:
library(data.table)
# Convert df to a data.table by reference.
setDT(df)
fill_cols <- c("v4", "v5", "v6")
# Group by v1, v2 and v3 without its last character. Grouping on the separate
# columns avoids the accidental collisions a pasted-together key can produce.
# na.rm = FALSE keeps leading NAs so every group keeps its original length,
# which the by-reference assignment requires.
df[,
   (fill_cols) := lapply(.SD, zoo::na.locf, na.rm = FALSE),
   by = .(v1, v2,
          v3_prefix = substr(as.character(v3), 1, nchar(as.character(v3)) - 1)),
   .SDcols = fill_cols]
df
# v1 v2 v3 v4 v5 v6
#1: 1 a 1m 1 5 9
#2: 2 a 1m 2 6 10
#3: 1 a 1n 1 5 9
#4: 2 a 1n 2 6 10
#5: 1 b 2o 3 7 11
#6: 2 b 2o 4 8 12
#7: 1 b 2p 3 7 11
#8: 2 b 2p 4 8 12

Using na.locf from zoo
library(zoo)
library(data.table)
setDT(df)[, na.locf(.SD),.(v1, v2)]
# v1 v2 v3 v4 v5 v6
#1: 1 a 1m 1 5 9
#2: 1 a 1n 1 5 9
#3: 2 a 1m 2 6 10
#4: 2 a 1n 2 6 10
#5: 1 b 2o 3 7 11
#6: 1 b 2p 3 7 11
#7: 2 b 2o 4 8 12
#8: 2 b 2p 4 8 12
If we want to add the condition in 'v3'
# Fill v4-v6 within groups defined by v1, v2 and v3 with its first run of
# non-digit characters removed.
# - The columns are named explicitly instead of being picked by position.
# - na.rm = FALSE stops na.locf() from dropping rows that are still NA after
#   filling (e.g. a group starting with NA), which would make the result
#   shorter than the group being assigned to.
fill_cols <- c("v4", "v5", "v6")
setDT(df)[,
          (fill_cols) := na.locf(.SD, na.rm = FALSE),
          by = .(v1, v2, sub("\\D+", "", v3)),
          .SDcols = fill_cols][]
# v1 v2 v3 v4 v5 v6
#1: 1 a 1m 1 5 9
#2: 2 a 1m 2 6 10
#3: 1 a 1n 1 5 9
#4: 2 a 1n 2 6 10
#5: 1 b 2o 3 7 11
#6: 2 b 2o 4 8 12
#7: 1 b 2p 3 7 11
#8: 2 b 2p 4 8 12

Related

Removing rows with all NA's, from data.frames within a list

Example Data:
# Two example data frames (default column names V1-V3); each one contains
# exactly one row in which every value is NA.
df1 <- as.data.frame(matrix(
  c(1, 2, 3,
    1, NA, 4,
    NA, NA, NA,
    4, 6, 7,
    4, 8, NA),
  ncol = 3, byrow = TRUE
))
df2 <- as.data.frame(matrix(
  c(1, 2, 3,
    1, NA, 4,
    4, 6, 7,
    NA, NA, NA,
    4, 8, NA),
  ncol = 3, byrow = TRUE
))
dfList <- list(df1, df2)
colnames <- c("A", "B", "C")
dfList[[1]]
V1 V2 V3
1 1 2 3
2 1 NA 4
3 NA NA NA
4 4 6 7
5 4 8 NA
dfList[[2]]
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
4 NA NA NA
5 4 8 NA
How do I remove the rows that are empty/have ALL values NA, within each of the data.frames in the list?
Desired outcome:
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
4 4 8 NA
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
4 4 8 NA
You can use lapply to iterate over the list and rowSums to drop rows with all NA values.
# For each data frame, keep the rows that have at least one non-NA value.
# drop = FALSE keeps the result a data frame even when the input has a single
# column (otherwise the subset would collapse to a plain vector).
lapply(dfList, function(x) x[rowSums(!is.na(x)) != 0, , drop = FALSE])
#[[1]]
# V1 V2 V3
#1 1 2 3
#2 1 NA 4
#4 4 6 7
#5 4 8 NA
#[[2]]
# V1 V2 V3
#1 1 2 3
#2 1 NA 4
#3 4 6 7
#5 4 8 NA
Using the tidyverse (purrr's map) together with janitor's remove_empty:
library(tidyverse)
library(janitor)
map(dfList, remove_empty, which = c("rows"))
[[1]]
V1 V2 V3
1 1 2 3
2 1 NA 4
4 4 6 7
5 4 8 NA
[[2]]
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
5 4 8 NA
Here is another solution with all()
# For each data frame, drop the rows in which every value is NA.
# drop = FALSE keeps the result a data frame even for one-column inputs.
lapply(dfList, function(d) d[!apply(is.na(d), 1, all), , drop = FALSE])

Iteratively shift variables in data

Some example of my data:
library(tidyverse)
set.seed(1234)
df <- tibble(
v1 = c(1:6),
v2 = rnorm(6, 5, 2) %>% round,
v3 = rnorm(6, 4, 2) %>% round,
v4 = rnorm(6, 4, 1) %>% round %>% lag(1),
v5 = rnorm(6, 6, 2) %>% round %>% lag(2),
v6 = rnorm(6, 5, 3) %>% round %>% lag(3),
v7 = rnorm(6, 5, 3) %>% round %>% lag(4))
v1 v2 v3 v4 v5 v6 v7
1 1 3 3 NA NA NA NA
2 2 6 3 3 NA NA NA
3 3 7 3 4 4 NA NA
4 4 0 2 5 11 3 NA
5 5 6 3 4 6 1 8
6 6 6 2 3 5 7 4
I want to shift each column along the diagonal that separates the NA values from the filled data.
So, desired output looks like this:
v1 v2 v3 v4 v5 v6 v7
1 NA NA 3 3 4 3 8
2 NA 3 3 4 11 1 4
3 1 6 3 5 6 7 NA
4 2 7 2 4 5 NA NA
5 3 0 3 4 NA NA NA
6 4 6 2 NA NA NA NA
7 5 6 NA NA NA NA NA
8 6 NA NA NA NA NA NA
Relative to v3, each column is shifted by 1, 2, 3, ... rows: the columns to its left (v2, v1) move down and the columns to its right (v4 to v7) move up.
Tried to achieve this inside dplyr::mutate_all() but I failed to iterate it with a lag() and lead() functions.
EDIT: following @wibeasley's advice, I came up with this:
df %>%
  # dummy1 = target row of each v1 value (v1 ends up shifted down by two rows).
  mutate(dummy1 = c(3:8)) %>%
  # Long format: one row per (dummy1, column name, value).
  gather("var", "val", -dummy1) %>%
  mutate(
    # Column number k taken from the name "vk".
    dummy2 = sub("v", "", var, fixed = TRUE),
    # Target row for this value: each later column sits one row higher.
    dummy3 = dummy1 - as.numeric(dummy2) + 1) %>%
  select(-dummy1, -dummy2) %>%
  # Back to wide format, one row per target row (sorted by dummy3).
  spread(var, val) %>%
  # The first four rows (target rows -3 to 0) hold only the lag-induced NAs.
  slice(-c(1:4)) %>% select(-dummy3)
Looks ugly, but works.
We can use lapply to handle each column, putting NA to the back.
# Within every column, list the non-NA values first (original order kept)
# and append the NAs at the end; df[] keeps the data frame structure.
df[] <- lapply(df, function(column) {
  missing <- is.na(column)
  c(column[!missing], column[missing])
})
df
# # A tibble: 6 x 7
# v1 v2 v3 v4 v5 v6 v7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 3 3 3 4 3 8
# 2 2 6 3 4 11 1 4
# 3 3 7 3 5 6 7 NA
# 4 4 0 2 4 5 NA NA
# 5 5 6 3 3 NA NA NA
# 6 6 6 2 NA NA NA NA

fill column with previous column if NA

I have a data frame like this
df <- data.frame(v1 = 10:14, v2 = c(NA, 1, NA, 3, 6), v3 = c(1, NA, NA, 9, 4))
v1 v2 v3
1 10 NA 1
2 11 1 NA
3 12 NA NA
4 13 3 9
5 14 6 4
I now want to fill the NAs with the value of the previous column, so it looks like this:
v1 v2 v3
1 10 10 1
2 11 1 1
3 12 12 12
4 13 3 9
5 14 6 4
I know how to do this manually, like this:
df$v2 <- ifelse(is.na(df$v2), df$v1, df$v2)
How can I automate this for a full data frame with many columns?
You can do this with fill from tidyr:
library(dplyr)
library(tidyr)
data.frame(t(df)) %>%
fill(., names(.)) %>%
t()
Result:
v1 v2 v3
X1 10 10 1
X2 11 1 1
X3 12 12 12
X4 13 3 9
X5 14 6 4
Note:
Basically, I transposed df, filled every column downward, then transposed it back to the original orientation
# Walk the columns left to right and replace each NA with the value from the
# column immediately to its left. Because earlier columns are already filled
# when later ones are processed, values propagate across runs of NA columns.
# seq_len(ncol(df))[-1] is empty for a one-column data frame, whereas
# 2:ncol(df) would count backwards (2, 1) and fail.
for (i in seq_len(ncol(df))[-1]) {
  # [[ ]] always extracts a plain vector (df[, i] does not for tibbles).
  df[[i]] <- ifelse(is.na(df[[i]]), df[[i - 1]], df[[i]])
}
This will propagate values across streaks of NA columns. If you don't want this, just reverse the order of the indexes in the for loop declaration.
Another option using Reduce with ifelse:
df[] <- Reduce(function(x, y) ifelse(is.na(y), x, y), df, accumulate = TRUE)
df
# v1 v2 v3
#1 10 10 1
#2 11 1 1
#3 12 12 12
#4 13 3 9
#5 14 6 4
You could use apply but note that the output will be a matrix
# Row-wise "last observation carried forward": every NA takes the nearest
# non-NA value to its left. The result is a matrix, not a data frame.
t(apply(df, 1, function(x) {
  present <- !is.na(x)
  # cumsum(present) counts the non-NA values seen so far, so it must index the
  # vector of non-NA values (not x itself), otherwise NAs that follow an
  # interior gap (e.g. 10, NA, 3, NA) are left unfilled. The leading NA slot
  # (index 1) covers rows that start with NA, which stay NA.
  carried <- c(NA, x[present])[cumsum(present) + 1]
  replace(x, !present, carried[!present])
}))
# v1 v2 v3
#[1,] 10 10 1
#[2,] 11 1 1
#[3,] 12 12 12
#[4,] 13 3 9
#[5,] 14 6 4
By using zoo na.locf
# Row-wise last-observation-carried-forward using zoo::na.locf().
# na.rm = FALSE keeps leading NAs in place; with the default (TRUE) a row that
# starts with NA would come back shorter than the others and the rows could no
# longer be combined into a rectangular result.
data.frame(t(apply(df, 1, function(x) zoo::na.locf(x, na.rm = FALSE))))
v1 v2 v3
1 10 10 1
2 11 1 1
3 12 12 12
4 13 3 9
5 14 6 4

Rank within groups in R with special NA handling

I have a dataframe like this:
df <- data.frame(
A = rep(c("A", "B", "C", "D"), each = 3),
B = rep(c("V1", "V2", "V3"), 4),
C = c(1,2,3,5,2,NA,4,6,7,3,7,8)
)
# Output
A B C
1 A V1 1
2 A V2 2
3 A V3 3
4 B V1 5
5 B V2 2
6 B V3 NA
7 C V1 4
8 C V2 6
9 C V3 7
10 D V1 3
11 D V2 7
12 D V3 8
My goal is to receive the ranks of the values in column C, grouped by column B. If a value is NA, it should not take part in the ranking at all; the RANK column should then contain NA (or NULL, or something similar) for that row. Ties should be resolved by averaging the ranks.
The result should look like:
A B C RANK
1 A V1 1 4
2 A V2 2 3.5
3 A V3 3 3
4 B V1 5 1
5 B V2 2 3.5
6 B V3 NA NA
7 C V1 4 2
8 C V2 6 2
9 C V3 7 2
10 D V1 3 3
11 D V2 7 1
12 D V3 8 1
We can group by 'B', rank on 'C', specify the i with a logical condition to select only the non-NA elements from 'C' and assign (:=) the rank values to create the 'RANK' column. By default, the rows that are not used i.e. NA will be NA in the new column
library(data.table)
setDT(df)[!is.na(C), RANK := rank(-C) , B]
df
# A B C RANK
# 1: A V1 1 4.0
# 2: A V2 2 3.5
# 3: A V3 3 3.0
# 4: B V1 5 1.0
# 5: B V2 2 3.5
# 6: B V3 NA NA
# 7: C V1 4 2.0
# 8: C V2 6 2.0
# 9: C V3 7 2.0
#10: D V1 3 3.0
#11: D V2 7 1.0
#12: D V3 8 1.0
Using the ave() function from baseR for ranking the C values within the groups B
First approach:(an improved version of the second approach) Credit: Henrik
df$Rank <- with(df, ave(C, B, FUN=function(x) rank(-x, na.last = "keep",
ties.method = "average")))
Second approach:
df$Rank <- with(df, ave(C, B, FUN=function(x) rank(-x, ties.method = "average")))
df$Rank[is.na(df$C)] <- NA
Output for both approaches:
df
# A B C Rank
# 1 A V1 1 4.0
# 2 A V2 2 3.5
# 3 A V3 3 3.0
# 4 B V1 5 1.0
# 5 B V2 2 3.5
# 6 B V3 NA NA
# 7 C V1 4 2.0
# 8 C V2 6 2.0
# 9 C V3 7 2.0
# 10 D V1 3 3.0
# 11 D V2 7 1.0
# 12 D V3 8 1.0
Finally, the dplyr approach with same output
# Rank C in descending order within each group of B. na.last = "keep" leaves
# NA values unranked (their rank is NA); ties receive the average rank.
# The column is called Rank so the result matches the output shown above,
# and ungroup() prevents the grouping from leaking into later steps.
df %>%
  group_by(B) %>%
  mutate(Rank = rank(-C, na.last = "keep", ties.method = "average")) %>%
  ungroup()

Maintaining order when subsetting a dataframe in R

I have a dataframe df :
V1 V2 V3
1 227 Day1
2 288 Day2
3 243 Day3
4 258 Day4
5 274 Day5
6 245 Day6
7 254 Day7
8 249 Day8
9 230 Day9
10 244 Day10
I want to subset df where V1 contains 5,1,7,3 in order. I used subset(df,V1 %in% c(5,1,7,3)) but what I am getting is:
V1 V2 V3
1 227 Day1
3 243 Day3
5 274 Day5
7 254 Day7
I want to maintain the order of rows in V1 to be 5,1,7,3 and not 1,3,5,7. How can I have the output to be:
V1 V2 V3
5 274 Day5
1 227 Day1
7 254 Day7
3 243 Day3
This is perhaps overcomplicated, but I would approach this task as follows:
An exemplary data frame:
(data2 <- data.frame(V1=1:10, V2=letters[1:10]))
## V1 V2
## 1 1 a
## 2 2 b
## 3 3 c
## 4 4 d
## 5 5 e
## 6 6 f
## 7 7 g
## 8 8 h
## 9 9 i
## 10 10 j
Let's find in which row we have a match (NA - no match, i - i-th value was matched):
(m <- match(data2$V1, c(5,1,7,3)))
## [1] 2 NA 4 NA 1 NA 3 NA NA NA
Now we select the matching rows and permute them accordingly:
data2[!is.na(m),][order(na.omit(m)),]
## V1 V2
## 5 5 e
## 1 1 a
## 7 7 g
## 3 3 c
On the other hand, if you know that V1 consists of consecutive natural numbers (starting from 1), the solution is easy as 1-2-3:
data2[c(5,1,7,3),]
## V1 V2
## 5 5 e
## 1 1 a
## 7 7 g
## 3 3 c
You can use merge with sort=FALSE and it seems to return in the order that it matches. It will also work if you have repeated values in V1. E.g.:
dat <- data.frame(V1=1:10, V2=letters[1:10])
dat$V1[8] <- 5
dat
# V1 V2
#1 1 a
#2 2 b
#3 3 c
#4 4 d
#5 5 e
#6 6 f
#7 7 g
#8 5 h
#9 9 i
#10 10 j
merge(data.frame(V1=c(5,1,7,3)),dat,by="V1",sort=FALSE)
# V1 V2
#1 5 e
#2 5 h
#3 1 a
#4 7 g
#5 3 c

Resources