conditionally adding columns to a list of dataframes - r

I have a list of dataframes with either 2 or 4 columns.
a <- data.frame(a=1:10,
b=1:10,
c=1:10,
d=1:10)
b <- data.frame(a=1:10,
b=1:10)
list_of_df <- list(a,b)
I want to add 2 empty columns to each dataframe with only 2 columns.
I've tried this lapply approach:
lapply(list_of_df, function(x) ifelse(ncol(x) < 4,x%>%add_column(empty=NA),x <- x))
Which does not work unfortunately. How can I fix this?

I came up with something similar:
add_col <- function(x){
col_to_add <- 4 - ncol(x)
if(col_to_add == 0) return(x)
z <- rep(NA, nrow(x))
for (i in 1:col_to_add){
x <- cbind(x, z)
}
x
}
lapply(list_of_df, add_col)

I would use a for loop to avoid copying the whole list:
for (i in seq_along(list_of_df)) {
n_columns = ncol(list_of_df[[i]])
if (n_columns == 2L) {
list_of_df[[i]][c('empty1', 'empty2')] <- NA
}
}
Result:
> list_of_df
[[1]]
a b c d
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
7 7 7 7 7
8 8 8 8 8
9 9 9 9 9
10 10 10 10 10
[[2]]
a b empty1 empty2
1 1 1 NA NA
2 2 2 NA NA
3 3 3 NA NA
4 4 4 NA NA
5 5 5 NA NA
6 6 6 NA NA
7 7 7 NA NA
8 8 8 NA NA
9 9 9 NA NA
10 10 10 NA NA

We could use bind_rows and then group_split and map from purrr to remove the id_Group column:
library(dplyr)
library(purrr)
bind_rows(list_of_df) %>%
group_split(id_Group =cumsum(a==1)) %>%
map(., ~ (.x %>% ungroup() %>%
select(-id_Group)))
[[1]]
# A tibble: 10 x 4
a b c d
<int> <int> <int> <int>
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
7 7 7 7 7
8 8 8 8 8
9 9 9 9 9
10 10 10 10 10
[[2]]
# A tibble: 10 x 4
a b c d
<int> <int> <int> <int>
1 1 1 NA NA
2 2 2 NA NA
3 3 3 NA NA
4 4 4 NA NA
5 5 5 NA NA
6 6 6 NA NA
7 7 7 NA NA
8 8 8 NA NA
9 9 9 NA NA
10 10 10 NA NA

Related

Finding equal rows between dataframes in R

I have the following data set as example:
df1 <- data.frame(V1 = 1:10, V2 = 1:10, V3 = 1:10)
df2 <- data.frame(V1 = 5:1, V2 = 5:1, v3 = c(1, 4, 5, 2, 3))
If a row in df1 are present in df2, I would create a column in df1 that indicates the corresponding row to the df2 and for other rows showed FALSE or NULL or NA or 0 or ...
output expected:
V1 V2 V3 rows_matched
1 1 1 1 FALSE
2 2 2 2 4
3 3 3 3 FALSE
4 4 4 4 2
5 5 5 5 FALSE
6 6 6 6 FALSE
7 7 7 7 FALSE
8 8 8 8 FALSE
9 9 9 9 FALSE
10 10 10 10 FALSE
in Base R:
cbind(df1, matched = match(interaction(df1), interaction(df2)))
V1 V2 V3 matched
1 1 1 1 NA
2 2 2 2 4
3 3 3 3 NA
4 4 4 4 2
5 5 5 5 NA
6 6 6 6 NA
7 7 7 7 NA
8 8 8 8 NA
9 9 9 9 NA
10 10 10 10 NA
You can do a simple left join. Note: I fixed the column name in df2 from v3 to V3 to match the names of df1
left_join(
df1,
df2 %>% mutate(rows_matched=row_number())
)
Output:
V1 V2 V3 rows_matched
1 1 1 1 NA
2 2 2 2 4
3 3 3 3 NA
4 4 4 4 2
5 5 5 5 NA
6 6 6 6 NA
7 7 7 7 NA
8 8 8 8 NA
9 9 9 9 NA
10 10 10 10 NA
Here is another way of solving your problem using data.table
library(data.table)
setDT(df1)
setDT(df2)
df1[, rows_matched := df2[df1, on=.(V1,V2,V3), which=TRUE]]
#
# V1 V2 V3 rows_matched
# 1: 1 1 1 NA
# 2: 2 2 2 4
# 3: 3 3 3 NA
# 4: 4 4 4 2
# 5: 5 5 5 NA
# 6: 6 6 6 NA
# 7: 7 7 7 NA
# 8: 8 8 8 NA
# 9: 9 9 9 NA
# 10: 10 10 10 NA
Another possible solution, based on dplyr::left_join (we have to previously capitalize V3 in df2):
library(dplyr)
df1 %>%
left_join(df2 %>% mutate(new = row_number()))
#> Joining, by = c("V1", "V2", "V3")
#> V1 V2 V3 new
#> 1 1 1 1 NA
#> 2 2 2 2 4
#> 3 3 3 3 NA
#> 4 4 4 4 2
#> 5 5 5 5 NA
#> 6 6 6 6 NA
#> 7 7 7 7 NA
#> 8 8 8 8 NA
#> 9 9 9 9 NA
#> 10 10 10 10 NA

Create run-length ID for subset of values

In this type of dataframe:
df <- data.frame(
x = c(3,3,1,12,2,2,10,10,10,1,5,5,2,2,17,17)
)
how can I create a new column recording the run-length ID of only a subset of x values, say, 3-20?
My own attempt only succeeds at inserting NA where the run-length count should be interrupted; but internally it seems the count is uninterrupted:
library(data.table)
df %>%
mutate(rle = ifelse(x %in% 3:20, rleid(x), NA))
x rle
1 3 1
2 3 1
3 1 NA
4 12 3
5 2 NA
6 2 NA
7 10 5
8 10 5
9 10 5
10 1 NA
11 5 7
12 5 7
13 2 NA
14 2 NA
15 17 9
16 17 9
The expected result:
x rle
1 3 1
2 3 1
3 1 NA
4 12 2
5 2 NA
6 2 NA
7 10 3
8 10 3
9 10 3
10 1 NA
11 5 4
12 5 4
13 2 NA
14 2 NA
15 17 5
16 17 5
In base R:
df[df$x %in% 3:20, "rle"] <- data.table::rleid(df[df$x %in% 3:20, ])
x rle
1 3 1
2 3 1
3 1 NA
4 12 2
5 2 NA
6 2 NA
7 10 3
8 10 3
9 10 3
10 1 NA
11 5 4
12 5 4
13 2 NA
14 2 NA
15 17 5
16 17 5
With left_join:
left_join(df, df %>%
filter(x %in% 3:20) %>%
distinct() %>%
mutate(rle = row_number()))
Joining, by = "x"
x rle
1 3 1
2 3 1
3 1 NA
4 12 2
5 2 NA
6 2 NA
7 10 3
8 10 3
9 10 3
10 1 NA
11 5 4
12 5 4
13 2 NA
14 2 NA
15 17 5
16 17 5
With data.table:
library(data.table)
setDT(df)
df[x %between% c(3,20),rle:=rleid(x)][]
x rle
<num> <int>
1: 3 1
2: 3 1
3: 1 NA
4: 12 2
5: 2 NA
6: 2 NA
7: 10 3
8: 10 3
9: 10 3
10: 1 NA
11: 5 4
12: 5 4
13: 2 NA
14: 2 NA
15: 17 5
16: 17 5

How can I make some row values NA if other is NA in R?

I have a dataframe with three columns Time, observed value (Obs.Value), and an interpolated value (Interp.Value). If the value of Obs.Value is NA then the value of Interp.Value should also be NA. I can make the whole row NA but I need to keep the Time value.
Here is the repex:
dat <- data.frame(matrix(ncol = 3, nrow = 10))
x <- c("Time", "Obs.Value", "Interp.Value")
colnames(dat) <- x
dat$Time <- seq(1,10,1)
dat$Obs.Value <- c(5,6,7,NA,NA,5,4,3,NA,2)
interp <- approx(dat$Time,dat$Obs.Value,dat$Time)
dat$Interp.Value <- round(interp$y,1)
Here is the code that makes the whole row NA
dat[with(dat, is.na(Obs.Value)|is.na("Interp.Value")),] <- NA
Here is what the output should look like:
Time Obs.Value Interp.Value
1 1 5 5
2 2 6 6
3 3 7 7
4 4 NA NA
5 5 NA NA
6 6 5 5
7 7 4 4
8 8 3 3
9 9 NA NA
10 10 2 2
dat$Interp.Value[is.na(dat$Obs.Value)] <- NA
dat
# Time Obs.Value Interp.Value
# 1 1 5 5
# 2 2 6 6
# 3 3 7 7
# 4 4 NA NA
# 5 5 NA NA
# 6 6 5 5
# 7 7 4 4
# 8 8 3 3
# 9 9 NA NA
# 10 10 2 2
Or if either column being NA is sufficient, then
dat[!complete.cases(dat[,-1]),-1] <- NA
If there is only one column to change #r2evans' answer is pretty straightforward and way to go. If there are more than one column that you want to change you can use across in dplyr.
library(dplyr)
dat %>%
mutate(across(-c(Time,Obs.Value), ~replace(., is.na(Obs.Value), NA)))
# Time Obs.Value Interp.Value
#1 1 5 5
#2 2 6 6
#3 3 7 7
#4 4 NA NA
#5 5 NA NA
#6 6 5 5
#7 7 4 4
#8 8 3 3
#9 9 NA NA
#10 10 2 2

Mutate, across, and case_when

I am having some trouble getting mutate, across, and case_when to function properly, I've recreated a simple version of my problem here:
a <- c(1:10)
b <- c(2:11)
c <- c(3:12)
test <- tibble(a, b, c)
# A tibble: 10 x 3
a b c
<int> <int> <int>
1 1 2 3
2 2 3 4
3 3 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
My goal is to replace all of the 3's with 4's, and keep everything else the same. I have the following code:
test_1 <-
test %>%
mutate(across(a:c, ~ case_when(. == 3 ~ 4)))
# A tibble: 10 x 3
a b c
<dbl> <dbl> <dbl>
1 NA NA 4
2 NA 4 NA
3 4 NA NA
4 NA NA NA
5 NA NA NA
6 NA NA NA
7 NA NA NA
8 NA NA NA
9 NA NA NA
10 NA NA NA
It's close but I get NA values where I want to maintain the value in the original tibble. How do I maintain the original values using the mutate across structure?
Thank you in advance!
What about this?
> test %>%
+ mutate(across(a:c, ~ case_when(. == 3 ~ 4, TRUE ~ 1 * (.))))
# A tibble: 10 x 3
a b c
<dbl> <dbl> <dbl>
1 1 2 4
2 2 4 4
3 4 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
or
> test %>%
+ replace(. == 3, 4)
# A tibble: 10 x 3
a b c
<int> <int> <int>
1 1 2 4
2 2 4 4
3 4 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
In base R, we can do
test[test ==3] <- 4
This also works:
a <- c(1:10)
b <- c(2:11)
c <- c(3:12)
tibble(a, b, c) %>%
modify(~ ifelse(. == 3, 4, .))
# A tibble: 10 x 3
a b c
<dbl> <dbl> <dbl>
1 1 2 4
2 2 4 4
3 4 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12

Make values not adjacent to each other NA

The values >=10 in the data frame below (values 31,89,12,69) does sometimes come in order like 89 and 12. By that I mean de order 123456789, they are adjacent to eachother. I would like to make the values which are not adjacent to each other(31,69, in 31 nr 2 is missing in between to be in order, for 69, nr 7 and8 are missing to be in order) NA. How to code this? Imagine a big dataset! :)
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,67,8,9,0,6,7,9)
B <- c(5,6,31,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,89,3,2,9,NA,12,69,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 31 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 69
10 b 9 4 6 8
It should look like:
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8
Another solution defining a vector of the values to keep beforehand (only up to two-digit numbers, but could be extended):
numerals <- 1:9
vector <- 0:9
for (i in numerals) {
j <- numerals[i+1]
if (!is.na(j)) {
number <- as.numeric(paste(c(i, j), collapse = ""))
number_reverse <- as.numeric(paste(c(j, i), collapse = ""))
vector <- c(vector, number, number_reverse)
}
}
vector
[1] 0 1 2 3 4 5 6 7 8 9 12 21 23 32 34 43 45 54 56 65 67 76 78 87 89 98
Function to replace number if not in vector:
replace <- function(x) {
x <- ifelse(!x %in% vector, NA, x)
return(x)
}
Result:
df %>% mutate_at(c("A", "B", "C", "D"), replace)
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8
Here is a function that tests individual numbers
MyFunction <- function(A){
NumbersToCheck <- lapply(strsplit(as.character(A),""),as.integer)
check <- lapply(2:length(unlist(NumbersToCheck)), function(X) ifelse(NumbersToCheck[[1]][X]-NumbersToCheck[[1]][X-1]==1,TRUE,FALSE))
return(ifelse(FALSE %in% check,NA,A))
}
Which can then be applied to your entire df as follows
df[,2:ncol(df)] <- lapply(2:ncol(df), function(X) unlist(lapply(df[,X],MyFunction)))
to get the following result
> df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA NA 5 89
4 a 67 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 12
9 b 7 7 4 NA
10 b 9 4 6 8
df[] <- lapply(df, function(col) {
# Split each value character by character
NAs <- sapply(strsplit(as.character(col), split = ""), function(chars) {
# Convert them back to integer to compare with `diff`
# and verify the increment is always 1 or -1
diff <- diff(as.integer(chars))
!all(diff == 1) && !all(diff == -1)
})
# If not, replace those values with NA
col[NAs] <- NA
col
})
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
#> ...
#> Warning in diff(as.integer(chars)): NAs introduced by coercion
df
#> id A B C D
#> 1 a 1 5 2 6
#> 2 a 2 6 3 5
#> 3 a NA NA 5 89
#> 4 a 67 9 NA 3
#> 5 a 8 8 NA 2
#> 6 b 9 1 2 9
#> 7 b 0 NA 7 NA
#> 8 b 6 9 6 12
#> 9 b 7 7 4 NA
#> 10 b 9 4 6 8
Created on 2020-03-31 by the reprex package (v0.3.0)

Resources