Make values not adjacent to each other NA - r

The values >=10 in the data frame below (values 31,89,12,69) does sometimes come in order like 89 and 12. By that I mean de order 123456789, they are adjacent to eachother. I would like to make the values which are not adjacent to each other(31,69, in 31 nr 2 is missing in between to be in order, for 69, nr 7 and8 are missing to be in order) NA. How to code this? Imagine a big dataset! :)
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,89,3,2,9,NA,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 31 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
It should look like:
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8

Another solution defining a vector of the values to keep beforehand (only up to two-digit numbers, but could be extended):
numerals <- 1:9
vector <- 0:9
for (i in numerals) {
j <- numerals[i+1]
if (!is.na(j)) {
number <- as.numeric(paste(c(i, j), collapse = ""))
number_reverse <- as.numeric(paste(c(j, i), collapse = ""))
vector <- c(vector, number, number_reverse)
}
}
vector
[1] 0 1 2 3 4 5 6 7 8 9 12 21 23 32 34 43 45 54 56 65 67 76 78 87 89 98
Function to replace number if not in vector:
replace <- function(x) {
x <- ifelse(!x %in% vector, NA, x)
return(x)
}
Result:
df %>% mutate_at(c("A", "B", "C", "D"), replace)
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8

Here is a function that tests individual numbers
MyFunction <- function(A){
NumbersToCheck <- lapply(strsplit(as.character(A),""),as.integer)
check <- lapply(2:length(unlist(NumbersToCheck)), function(X) ifelse(NumbersToCheck[[1]][X]-NumbersToCheck[[1]][X-1]==1,TRUE,FALSE))
return(ifelse(FALSE %in% check,NA,A))
}
Which can then be applied to your entire df as follows
df[,2:ncol(df)] <- lapply(2:ncol(df), function(X) unlist(lapply(df[,X],MyFunction)))
to get the following result
> df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8

df[] <- lapply(df, function(col) {
# Split each value character by character
NAs <- sapply(strsplit(as.character(col), split = ""), function(chars) {
# Convert them back to integer to compare with `diff`
# and verify the increment is always 1 or -1
diff <- diff(as.integer(chars))
!all(diff == 1) && !all(diff == -1)
})
# If not, replace those values with NA
col[NAs] <- NA
col
})
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
#> ...
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
df
#> id A B C D
#> 1 a 1 5 2 6
#> 2 a 2 6 3 5
#> 3 a NA NA 5 89
#> 4 a 67 9 NA 3
#> 5 a 8 8 NA 2
#> 6 b 9 1 2 9
#> 7 b 0 NA 7 NA
#> 8 b 6 9 6 12
#> 9 b 7 7 4 NA
#> 10 b 9 4 6 8
Created on 2020-03-31 by the reprex package (v0.3.0)

Related

conditionally adding columns to a list of dataframes

I have a list of dataframes with either 2 or 4 columns.
a <- data.frame(a=1:10,
b=1:10,
c=1:10,
d=1:10)
b <- data.frame(a=1:10,
b=1:10)
list_of_df <- list(a,b)
I want to add 2 empty columns to each dataframe with only 2 columns.
I've tried this lapply approach:
lapply(list_of_df, function(x) ifelse(ncol(x) < 4,x%>%add_column(empty=NA),x <- x))
Which does not work unfortunately. How can I fix this?
I came up with something similar:
add_col <- function(x){
col_to_add <- 4 - ncol(x)
if(col_to_add == 0) return(x)
z <- rep(NA, nrow(x))
for (i in 1:col_to_add){
x <- cbind(x, z)
}
x
}
lapply(list_of_df, add_col)
I would use a for loop to avoid copying the whole list:
for (i in seq_along(list_of_df)) {
n_columns = ncol(list_of_df[[i]])
if (n_columns == 2L) {
list_of_df[[i]][c('empty1', 'empty2')] <- NA
}
}
Result:
> list_of_df
[[1]]
a b c d
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
7 7 7 7 7
8 8 8 8 8
9 9 9 9 9
10 10 10 10 10
[[2]]
a b empty1 empty2
1 1 1 NA NA
2 2 2 NA NA
3 3 3 NA NA
4 4 4 NA NA
5 5 5 NA NA
6 6 6 NA NA
7 7 7 NA NA
8 8 8 NA NA
9 9 9 NA NA
10 10 10 NA NA
We could use bind_rows and then group_split and map from purrr to remove the id_Group column:
library(dplyr)
library(purrr)
bind_rows(list_of_df) %>%
group_split(id_Group =cumsum(a==1)) %>%
map(., ~ (.x %>% ungroup() %>%
select(-id_Group)))
[[1]]
# A tibble: 10 x 4
a b c d
<int> <int> <int> <int>
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
7 7 7 7 7
8 8 8 8 8
9 9 9 9 9
10 10 10 10 10
[[2]]
# A tibble: 10 x 4
a b c d
<int> <int> <int> <int>
1 1 1 NA NA
2 2 2 NA NA
3 3 3 NA NA
4 4 4 NA NA
5 5 5 NA NA
6 6 6 NA NA
7 7 7 NA NA
8 8 8 NA NA
9 9 9 NA NA
10 10 10 NA NA

Extract positions in a data frame based on a vector

In a dataset I want to know where there are missing values, therefore i use which(is.na(df)). Then I do for example imputation in this dataset and thereafter I want to extract the imputed positions. But I dont know how to extract these data. Does anyone have suggestions? Thanks!
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,89,3,2,9,NA,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 31 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
pos_na <- which(is.na(df))
pos_na
[1] 13 27 34 35 47
# after imputation
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,4,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,65,9,7,4)
C <- c(2,3,5,8,2,2,7,6,4,6)
D <- c(6,5,89,3,2,9,6,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 4 31 5 89
4 a 67 9 8 3
5 a 8 8 2 2
6 b 9 1 2 9
7 b 0 65 7 6
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
Wanted output: 4,65,8,2 6
To store positions of NA use which with arr.ind = TRUE which gives row and column numbers.
pos_na <- which(is.na(df), arr.ind = TRUE)
pos_na
# row col
#[1,] 3 2
#[2,] 7 3
#[3,] 4 4
#[4,] 5 4
#[5,] 7 5
So that after imputation you can extract the values directly.
as.numeric(df[pos_na])
[1] 4 65 8 2 6
Instead of wrapping with which, we can keep it as a logical matrix
i1 <- is.na(df[-1])
Then, after the imputation, just use the i1
df[-1][i1]
#[1] 4 65 8 2 6
Note, the -1 indexing for columns is to remove the first column which is 'character'

Iterate from one data frame to another, error

I want to fill the NA in df with the values in data frame dat and iterate over columns, but it doesn't work, why? Or is there a better solution?
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,6,8,9,0,6,7,9)
B <- c(5,6,1,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,8,3,2,9,NA,2,6,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
dat <- data.frame(col=c("A","B","C","D"), value=c(23,45,26,89))
dat
col value
1 A 23
2 B 45
3 C 26
4 D 89
test <- function(i){
df[,i][is.na(df[,i])] <- dat$value[dat$col==i]
return(df)
}
df <-df[,-1]
for(i in colnames(df)){
df[[i]] <- test(i)
}
df #DOESN'T WORK
Should look like:
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
the replace_na function from tidyr should do what you want.
library(tidyverse)
df %>%
replace_na(list(
"A" = 23,
"B" = 45,
"C" = 26,
"D" = 89
))

Create a function to Impute values form one data frame into another

The NA values in column A should be filled by the A value from the dat data frame and so on for the other variables.
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,6,8,9,0,6,7,9)
B <- c(5,6,1,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,8,3,2,9,NA,2,6,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
dat <- data.frame(col=c("A","B","C","D"), value=c(23,45,26,89))
dat
dat
col value
1 A 23
2 B 45
3 C 26
4 D 89
It should look like:
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
I was thinking something like this but I dont know how to connect those data frames in a function...
test <- function(i){
df[,i][is.na(df[,i])] <- dat$value
}
test(2)
If you want it in your format
test <- function(i){
df[,i][is.na(df[,i])] <<- dat$value[dat$col==i]
}
test("A")
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
One approach is to iterate over the columns and values and use coalesce():
library(dplyr)
library(purrr)
df[-1] <- map2_df(df[-1], dat$value, coalesce)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
Or same using replace():
map2_df(df[-1], dat$value, ~ replace(.x, is.na(.x), .y))

How do I select rows in a data frame before and after a condition is met?

I'm searching the web for a few a days now and I can't find a solution to my (probably easy to solve) problem.
I have huge data frames with 4 variables and over a million observations each. Now I want to select 100 rows before, all rows while and 1000 rows after a specific condition is met and fill the rest with NA's. I tried it with a for loop and if/ifelse but it doesn't work so far. I think it shouldn't be a big thing, but in the moment I just don't get the hang of it.
I create the data using:
foo<-data.frame(t = 1:15, a = sample(1:15), b = c(1,1,1,1,1,4,4,4,4,1,1,1,1,1,1), c = sample(1:15))
My Data looks like this:
ID t a b c
1 1 4 1 7
2 2 7 1 10
3 3 10 1 6
4 4 2 1 4
5 5 13 1 9
6 6 15 4 3
7 7 8 4 15
8 8 3 4 1
9 9 9 4 2
10 10 14 1 8
11 11 5 1 11
12 12 11 1 13
13 13 12 1 5
14 14 6 1 14
15 15 1 1 12
What I want is to pick the value of a (in this example) 2 rows before, all rows while and 3 rows after the value of b is >1 and fill the rest with NA's. [Because this is just an example I guess you can imagine that after these 15 rows there are more rows with the value for b changing from 1 to 4 several times (I did not post it, so I won't spam the question with unnecessary data).]
So I want to get something like:
ID t a b c d
1 1 4 1 7 NA
2 2 7 1 10 NA
3 3 10 1 6 NA
4 4 2 1 4 2
5 5 13 1 9 13
6 6 15 4 3 15
7 7 8 4 15 8
8 8 3 4 1 3
9 9 9 4 2 9
10 10 14 1 8 14
11 11 5 1 11 5
12 12 11 1 13 11
13 13 12 1 5 NA
14 14 6 1 14 NA
15 15 1 1 12 NA
I'm thankful for any help.
Thank you.
Best regards,
Chris
here is the same attempt as missuse, but with data.table:
library(data.table)
foo<-data.frame(t = 1:11, a = sample(1:11), b = c(1,1,1,4,4,4,4,1,1,1,1), c = sample(1:11))
DT <- setDT(foo)
DT[ unique(c(DT[,.I[b>1] ],DT[,.I[b>1]+3 ],DT[,.I[b>1]-2 ])), d := a]
t a b c d
1: 1 10 1 2 NA
2: 2 6 1 10 6
3: 3 5 1 7 5
4: 4 11 4 4 11
5: 5 4 4 9 4
6: 6 8 4 5 8
7: 7 2 4 8 2
8: 8 3 1 3 3
9: 9 7 1 6 7
10: 10 9 1 1 9
11: 11 1 1 11 NA
Here
unique(c(DT[,.I[b>1] ],DT[,.I[b>1]+3 ],DT[,.I[b>1]-2 ]))
gives you your desired indixes : the unique indices of the line for your condition, the same indices+3 and -2.
Here is an attempt.
Get indexes that satisfy the condition b > 1
z <- which(foo$b > 1)
get indexes for (z - 2) : (z + 3)
ind <- unique(unlist(lapply(z, function(x){
g <- pmax(x - 2, 1) #if x - 2 is negative
g : (x + 3)
})))
create d column filled with NA
foo$d <- NA
replace elements with appropriate indexes with foo$a
foo$d[ind] <- foo$a[ind]
library(dplyr)
library(purrr)
# example dataset
foo<-data.frame(t = 1:15,
a = sample(1:15),
b = c(1,1,1,1,1,4,4,4,4,1,1,1,1,1,1),
c = sample(1:15))
# function to get indices of interest
# for a given index x go 2 positions back and 3 forward
# keep only positive indices
GetIDsBeforeAfter = function(x) {
v = (x-2) : (x+3)
v[v > 0]
}
foo %>% # from your dataset
filter(b > 1) %>% # keep rows where b > 1
pull(t) %>% # get the positions
map(GetIDsBeforeAfter) %>% # for each position apply the function
unlist() %>% # unlist all sets indices
unique() -> ids_to_remain # keep unique ones and save them in a vector
foo$d = foo$c # copy column c as d
foo$d[-ids_to_remain] = NA # put NA to all positions not in our vector
foo
# t a b c d
# 1 1 5 1 8 NA
# 2 2 6 1 14 NA
# 3 3 4 1 10 NA
# 4 4 1 1 7 7
# 5 5 10 1 5 5
# 6 6 8 4 9 9
# 7 7 9 4 15 15
# 8 8 3 4 6 6
# 9 9 7 4 2 2
# 10 10 12 1 3 3
# 11 11 11 1 1 1
# 12 12 15 1 4 4
# 13 13 14 1 11 NA
# 14 14 13 1 13 NA
# 15 15 2 1 12 NA

Resources