Iterate from one data frame to another, error - r

I want to fill the NA in df with the values in data frame dat and iterate over columns, but it doesn't work, why? Or is there a better solution?
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,6,8,9,0,6,7,9)
B <- c(5,6,1,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,8,3,2,9,NA,2,6,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
dat <- data.frame(col=c("A","B","C","D"), value=c(23,45,26,89))
dat
col value
1 A 23
2 B 45
3 C 26
4 D 89
# Fill the NAs of column i with the value that dat lists for that column.
# i: column name as a string; it is compared against dat$col.
# The assignment below modifies a copy of df that is local to the function;
# the function then returns that whole data frame, not just column i.
test <- function(i){
df[,i][is.na(df[,i])] <- dat$value[dat$col==i]
return(df)
}
# Drop the first column (id) so that only A..D are looped over.
df <-df[,-1]
for(i in colnames(df)){
# NOTE(review): test(i) returns an entire data frame, and this line stores it
# inside the single column i. That is why the result is not the expected
# table; the function would have to return only df[, i] for this to work.
df[[i]] <- test(i)
}
df #DOESN'T WORK
Should look like:
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8

the replace_na function from tidyr should do what you want.
library(tidyverse)
# Build the replacement list from dat instead of hard-coding it, so the fill
# values always match the lookup table. For the dat shown above this yields
# list(A = 23, B = 45, C = 26, D = 89).
# as.character() keeps this working when dat$col is a factor.
fill_values <- as.list(setNames(dat$value, as.character(dat$col)))
df %>%
  replace_na(fill_values)

Related

Extract positions in a data frame based on a vector

In a dataset I want to know where the missing values are, so I use which(is.na(df)). Then I impute the dataset, for example, and afterwards I want to extract the values at the imputed positions. But I don't know how to extract these data. Does anyone have suggestions? Thanks!
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,89,3,2,9,NA,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 31 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
pos_na <- which(is.na(df))
pos_na
[1] 13 27 34 35 47
# after imputation
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,4,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,65,9,7,4)
C <- c(2,3,5,8,2,2,7,6,4,6)
D <- c(6,5,89,3,2,9,6,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 4 31 5 89
4 a 67 9 8 3
5 a 8 8 2 2
6 b 9 1 2 9
7 b 0 65 7 6
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
Wanted output: 4, 65, 8, 2, 6
To store positions of NA use which with arr.ind = TRUE which gives row and column numbers.
pos_na <- which(is.na(df), arr.ind = TRUE)
pos_na
# row col
#[1,] 3 2
#[2,] 7 3
#[3,] 4 4
#[4,] 5 4
#[5,] 7 5
So that after imputation you can extract the values directly.
as.numeric(df[pos_na])
[1] 4 65 8 2 6
Instead of wrapping with which, we can keep it as a logical matrix
i1 <- is.na(df[-1])
Then, after the imputation, just use the i1
df[-1][i1]
#[1] 4 65 8 2 6
Note, the -1 indexing for columns removes the first column ('id'), which is a factor rather than a numeric column

Make values not adjacent to each other NA

Some of the values >= 10 in the data frame below (31, 89, 12, 69) are made of digits that are adjacent in the sequence 123456789, such as 89 and 12. I would like to set to NA the values whose digits are not adjacent to each other: 31 (the digit 2 is missing in between) and 69 (the digits 7 and 8 are missing in between). How can I code this? Imagine a big dataset! :)
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,89,3,2,9,NA,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 31 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
It should look like:
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8
Another solution defining a vector of the values to keep beforehand (only up to two-digit numbers, but could be extended):
# Build the whitelist of values to keep: every single digit 0-9 plus each
# two-digit number whose digits are neighbours (12, 21, 23, 32, ..., 89, 98).
numerals <- 1:9
lower <- numerals[-length(numerals)]  # 1..8: smaller digit of each pair
upper <- numerals[-1]                 # 2..9: its successor
# rbind() stacks the ascending forms (12, 23, ...) over the descending forms
# (21, 32, ...); c() then reads the matrix column by column, which interleaves
# them as 12, 21, 23, 32, ... without growing the vector inside a loop.
vector <- c(0:9, rbind(lower * 10 + upper, upper * 10 + lower))
vector
[1] 0 1 2 3 4 5 6 7 8 9 12 21 23 32 34 43 45 54 56 65 67 76 78 87 89 98
Function to replace number if not in vector:
# Set every element of `x` that is not one of the allowed values to NA.
#
# x:       vector of values to screen (e.g. one data frame column).
# allowed: values to keep. Defaults to the global whitelist `vector`, so
#          existing calls that pass only `x` behave as before.
# Returns `x` with the same type and length; entries not found in `allowed`
# (including existing NAs, unless NA is itself listed) are NA.
#
# NOTE(review): this name masks base::replace() for the rest of the session.
replace <- function(x, allowed = vector) {
  # Sub-assignment rather than ifelse(): ifelse() returns a logical vector
  # when every element is replaced (or when x is empty), which would silently
  # change the column type.
  x[!x %in% allowed] <- NA
  x
}
Result:
df %>% mutate_at(c("A", "B", "C", "D"), replace)
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8
Here is a function that tests individual numbers
# Keep a single number only if each of its digits is exactly one greater than
# the digit before it (e.g. 12, 67, 89, 123); otherwise return NA.
#
# A: a single value (one cell of a column). Single-digit values and NA have no
#    neighbouring digits to compare and are returned unchanged.
# Returns A itself when it passes the check, NA otherwise.
MyFunction <- function(A){
  # Split the printed form of A into its individual digits
  digits <- as.integer(strsplit(as.character(A), "")[[1]])
  # diff() is empty for a single digit, which avoids the 2:length(x) pitfall
  # of iterating over c(2, 1) when there is only one digit
  steps <- diff(digits)
  # A step that is NA (non-digit character) does not count as a failure
  if (any(steps != 1L, na.rm = TRUE)) NA else A
}
Which can then be applied to your entire df as follows
df[,2:ncol(df)] <- lapply(2:ncol(df), function(X) unlist(lapply(df[,X],MyFunction)))
to get the following result
> df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8
df[] <- lapply(df, function(values) {
  # One vector of single characters per element of the column
  symbols <- strsplit(as.character(values), split = "")
  to_blank <- sapply(symbols, function(s) {
    # Steps between neighbouring digits; non-digit characters turn into NA
    # (with a coercion warning)
    steps <- diff(as.integer(s))
    # Flag the value unless every step is +1 or every step is -1
    !(all(steps == 1) || all(steps == -1))
  })
  # Blank out the flagged values and hand the column back
  values[to_blank] <- NA
  values
})
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
#> ...
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
df
#> id A B C D
#> 1 a 1 5 2 6
#> 2 a 2 6 3 5
#> 3 a NA NA 5 89
#> 4 a 67 9 NA 3
#> 5 a 8 8 NA 2
#> 6 b 9 1 2 9
#> 7 b 0 NA 7 NA
#> 8 b 6 9 6 12
#> 9 b 7 7 4 NA
#> 10 b 9 4 6 8
Created on 2020-03-31 by the reprex package (v0.3.0)

Create a function to impute values from one data frame into another

The NA values in column A should be filled by the A value from the dat data frame and so on for the other variables.
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,6,8,9,0,6,7,9)
B <- c(5,6,1,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,8,3,2,9,NA,2,6,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
dat <- data.frame(col=c("A","B","C","D"), value=c(23,45,26,89))
dat
dat
col value
1 A 23
2 B 45
3 C 26
4 D 89
It should look like:
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
I was thinking of something like this, but I don't know how to connect those data frames in a function...
test <- function(i){
df[,i][is.na(df[,i])] <- dat$value
}
test(2)
If you want it in your format
# Fill the NAs of one column of df with the value that dat lists for it.
# i: column name as a string (e.g. "A"); pass the name rather than a numeric
#    index, because it is compared against dat$col to pick the fill value.
test <- function(i){
# `<<-` assigns to the df defined outside the function, so df is changed as a
# side effect and the function does not need to return it.
df[,i][is.na(df[,i])] <<- dat$value[dat$col==i]
}
test("A")
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
One approach is to iterate over the columns and values and use coalesce():
library(dplyr)
library(purrr)
# Look up each column's fill value by name rather than relying on the rows of
# dat being in the same order as the columns of df. A column with no entry in
# dat gets NA as its fill value, which leaves it unchanged.
fill_values <- dat$value[match(names(df)[-1], dat$col)]
df[-1] <- map2_df(df[-1], fill_values, coalesce)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
Or same using replace():
map2_df(df[-1], dat$value, ~ replace(.x, is.na(.x), .y))

Keep rows with specific string and the following row

This is my data frame
df <- data.frame(
id = 1:14,
group_id = c(rep(1:2, each = 3), rep(3:4, each = 4)),
type = rep("A", 14), stringsAsFactors = FALSE)
df[c(2,4,8,12),"type"] <- "B"
id group_id type
1 1 1 A
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
6 6 2 A
7 7 3 A
8 8 3 B
9 9 3 A
10 10 3 A
11 11 4 A
12 12 4 B
13 13 4 A
14 14 4 A
I'd like to keep all rows with type B as well as the following row.
I could do...
B <- which(df$type=="B")
afterB <- B+1
df_sel <- df[c(B, afterB), ]
df_sel <- df_sel[order(df_sel$id),]
df_sel
...to get what I want.
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A
How can this be done in a more generic way.
Another way, very similar to what you do but in one step and without the need to reorder:
# Row numbers of every "B" row, each immediately followed by the next row
# number (argument spelled out as `each` instead of the partial match `e`).
idx <- rep(which(df$type == "B"), each = 2) + c(0, 1)
# Drop an index past the last row (final row is a "B") and repeated indices
# (two "B" rows in a row), which would otherwise give an all-NA row or
# duplicated rows.
idx <- unique(idx[idx <= nrow(df)])
df_sel <- df[idx, ]
df_sel
# id group_id type
# 2 2 1 B
# 3 3 1 A
# 4 4 2 B
# 5 5 2 A
# 8 8 3 B
# 9 9 3 A
# 12 12 4 B
# 13 13 4 A
Using lag from dplyr
library(dplyr)
df[df$type == "B" | lag(df$type == "B", default = FALSE), ]
# id group_id type
#2 2 1 B
#3 3 1 A
#4 4 2 B
#5 5 2 A
#8 8 3 B
#9 9 3 A
#12 12 4 B
#13 13 4 A
using grep will provide a row index of all instances of B - rows; concatenate (c()) this with rows + 1 to select from df will work.
rows <- grep("B", df[, "type"])
df[sort(c(rows, rows + 1)), ]
gives:
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A

Subset data.frame by column

I have this data.frame:
a <- c(rep("1", 3), rep("2", 3), rep("3",3), rep("4",3), rep("5",3))
b <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
df <-data.frame(a,b)
a b
1 1 1
2 1 2
3 1 3
4 2 4
5 2 5
6 2 6
7 3 7
8 3 8
9 3 9
10 4 10
11 4 11
12 4 12
13 5 13
14 5 14
15 5 15
I want to have something like this:
a <- c(rep("2", 3), rep("3", 3))
b <- c(4,5,6,7,8,9)
dffinal<-data.frame(a,b)
a b
1 2 4
2 2 5
3 2 6
4 3 7
5 3 8
6 3 9
I could use the "subset" function, but it's not working
sub <- subset(df,c(2,3) == a )
a b
5 2 5
8 3 8
This command only takes one row of "2" and "3" in column "a".
Any Help?
You're confusing == with %in%:
subset(df, a %in% c(2,3))
# a b
# 4 2 4
# 5 2 5
# 6 2 6
# 7 3 7
# 8 3 8
# 9 3 9
what about this?
library(dplyr)
df %>% filter(a == 2 | a==3)
a b
1 2 4
2 2 5
3 2 6
4 3 7
5 3 8
6 3 9
We can use data.table. We convert the 'data.frame' to 'data.table' (setDT(df)), and set the 'key' as column 'a', then we subset the rows.
library(data.table)
setDT(df, key= 'a')[c('2','3')]
# a b
#1: 2 4
#2: 2 5
#3: 2 6
#4: 3 7
#5: 3 8
#6: 3 9

Resources