Iterating variables over a pipeline with for loop in R - r

I have a dataset like the following simplified one:
x_1 <- c(1, NA, 2, 3, NA, 4, 5)
x_2 <- c(2, 1, NA, NA, NA, 4, 6)
y_1 <- c(2, 4, 6, 8, NA, 10, NA)
y_2 <- c(NA, 4, NA, 8, 10, 11, 13)
df <- data.frame(x_1, x_2, y_1, y_2)
x_1 x_2 y_1 y_2
1 1 2 2 NA
2 NA 1 4 4
3 2 NA 6 NA
4 3 NA 8 8
5 NA NA NA 10
6 4 4 10 11
7 5 6 NA 13
The goal is to coalesce each of the two corresponding variables (x and y) and to replace the values that are not the same (e.g. first row of x_1 and x_2) with NA. I did this with the following:
df <- df %>%
mutate(x = coalesce(x_1, x_2)) %>%
mutate(x = ifelse(!is.na(x) &
!is.na(x_2) &
x != x_2,
NA,
x)) %>%
select(!c(x_1, x_2))
Now, I have to do this with 21 variables so I thought I put the variables in a list and feed them through the pipeline with a for loop like this:
cols <- c("x", "y")
for(i in cols){
var_1 <- paste(i, "1", sep = "_")
var_2 <- paste(i, "2", sep = "_")
df <- df %>%
mutate(i = coalesce(var_1, var_2)) %>%
mutate(i = ifelse(!is.na(i) &
!is.na(var_2) &
i != var_2,
NA,
i)) %>%
select(!c(var_1, var_2))
}
What happens is that the code is executed, but instead of the new variables there is only the variable "i" with empty values. It seems as if R does not recognise the "i" in the pipeline as the iterator, however it does recognize "var_1" and "var_2" (because they are being removed from the dataset).
Does anyone know why that is and how I can fix it?
Thanks a lot in advance.

fun <- function(x, var) {
var_1 <- sym(paste(var, "1", sep = "_"))
var_2 <- sym(paste(var, "2", sep = "_"))
x %>%
mutate(!!var := ifelse((!!var_1 != !!var_2) %in% TRUE,
NA, coalesce(!!var_1, !!var_2))) %>%
select(!c(var_1, var_2))
}
cols <- c("x", "y")
Reduce(fun, cols, init = df)
# x y
# 1 NA 2
# 2 1 4
# 3 2 6
# 4 3 8
# 5 NA 10
# 6 4 NA
# 7 NA 13

If you want to avoid rlang:
library(tidyverse)
library(stringr)
x_1 <- c(1, NA, 2, 3, NA, 4, 5)
x_2 <- c(2, 1, NA, NA, NA, 4, 6)
y_1 <- c(2, 4, 6, 8, NA, 10, NA)
y_2 <- c(NA, 4, NA, 8, 10, 11, 13)
df <- data.frame(x_1, x_2, y_1, y_2)
my_coalesce <- function(d) {
vec_1 <- select(d, 1) %>% pull()
vec_2 <- select(d, 2) %>% pull()
res <- coalesce(vec_1, vec_2)
res[vec_1 != vec_2] <- NA
res
}
cols <- c("x", "y")
map(cols, ~df %>%
select(starts_with(.x)) %>% # or:
#select(str_c(.x, "_", 1:2)) %>%
my_coalesce()) %>%
set_names(cols) %>%
as_tibble()

Related

R: Set next row to NA in group_by

I want to set the next row i+1 in the same column to NA if there is already an NA in row i and then do this by groups. Here is my attempt:
dfeg <- tibble(id = c(rep("A", 5), rep("B", 5)),
x = c(1, 2, NA, NA, 3, 5, 6, NA, NA, 7))
setNextrowtoNA <- function(x){
for (j in 1:length(x)){
if(is.na(x[j])){x[j+1] <- NA}
}
}
dfeg <- dfeg %>% group_by(id) %>% mutate(y = setNextrowtoNA(x))
However my attempt doesn't create the column y that am looking for. Can anyone help with this? Thanks!
EDIT: In my actual data I have multiple values in a row that need to be set to NA, for example my data is more like this:
dfeg <- tibble(id = c(rep("A", 6), rep("B", 6)),
x = c(1, 2, NA, NA, 3, 4, 15, 16, NA, NA, 17, 18))
And need to create a column like this:
y = c(1, 2, NA, NA, NA, NA, 15, 16, NA, NA, NA, NA)
Any ideas? Thanks!
EDIT 2:
I figured it out on my own, this seems to work:
dfeg <- tibble(id = c(rep("A", 6), rep("B", 6)),
x = c(1, 2, NA, NA, 3, 4, 15, 16, NA, NA, 17, 18))
setNextrowtoNA <- function(x){
for (j in 1:(length(x))){
if(is.na(x[j]))
{
x[j+1] <- NA
}
lengthofx <- length(x)
x <- x[-lengthofx]
print(x[j])
}
return(x)
}
dfeg <- dfeg %>% group_by(id) %>% mutate(y = NA,
y = setNextrowtoNA(x))
Use cumany:
library(dplyr)
dfeg %>%
group_by(id) %>%
mutate(y = ifelse(cumany(is.na(x)), NA, x))
id x y
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 2
3 A NA NA
4 A NA NA
5 A 3 NA
6 A 4 NA
7 B 15 15
8 B 16 16
9 B NA NA
10 B NA NA
11 B 17 NA
12 B 18 NA
Previous answer:
Use an ifelse statement with lag:
library(dplyr)
dfeg %>%
group_by(id) %>%
mutate(y = ifelse(is.na(lag(x, default = 0)), NA, x))

How to automatically fill in a blank column

I am trying to get the list of sums of two columns from my original data set, from left to right
I have made a loop:
for (i in 1:ncol(df)) {
m = i
n = i + 1
if (i %% 2 != 0) {
df_cum$V1 <- sum(df[,m] + df[,n])
}
}
But, the way to add value to the new list is wrong:
df_cum$V1 <- sum(df[,m] + df[,n])
would be really appreciated if anyone knows how to do that in R
You can try split.default(), i.e.
sapply(split.default(df, gsub('\\d+', '', names(df))), sum)
A B
17 12
A base R option using tapply -
tapply(unlist(df),
rep(1:ncol(df), each = nrow(df) * 2, length.out = nrow(df) * ncol(df)),
sum)
# 1 2 3
#17 12 13
The logic here is to create group of every 2 columns and sum them.
data
It is easier to help if you provide data in a reproducible format
df <- data.frame(A1 = c(0, 3, 2), A2 = c(2, 6, 4),
B1 = c(3, 0, 1), B2 = c(2, 3, 3),
C1 = c(7, 3, 2), C2 = c(1, 0, 0))
We can do this in tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(everything(), names_to = c(".value", "grp"),
names_sep ="(?<=[A-Z])(?=[0-9])") %>%
select(-grp) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 1 x 3
A B C
<dbl> <dbl> <dbl>
1 17 12 13
Or using base R
aggregate(values ~ ., transform(stack(df1),
ind = sub("\\d+", "", ind)), FUN = sum)
ind values
1 A 17
2 B 12
3 C 13
Or another option with rowsum from base R
with(stack(df1), rowsum(values, group = trimws(ind, whitespace = "\\d+")))
[,1]
A 17
B 12
C 13
Or another option is with colSums and rowsum
{tmp <- colSums(df1); rowsum(tmp, group = substr(names(tmp), 1, 1))}
[,1]
A 17
B 12
C 13
data
df1 <- structure(list(A1 = c(0, 3, 2), A2 = c(2, 6, 4), B1 = c(3, 0,
1), B2 = c(2, 3, 3), C1 = c(7, 3, 2), C2 = c(1, 0, 0)),
class = "data.frame", row.names = c(NA,
-3L))

Take Symmetrical Mean of a tibble (ignoring the NAs)

I have a tibble where the rows and columns are the same IDs and I would like to take the mean (ignoring the NAs) to make the df symmetrical. I am struggling to see how.
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
I would normally do
A <- as.matrix(data[-1])
(A + t(A))/2
But this does not work because of the NAs.
Edit: below is the expected output.
output <- tibble(group = LETTERS[1:4],
A = c(NA, 12.5, 20, 10),
B = c(12.5, NA, 25, 21),
C = c(20, 25, NA, 12.5),
D = c(10, 21, 12.5, NA))
Here is a suggestion using tidyverse code.
library(tidyverse)
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
A <- data %>%
pivot_longer(-group, values_to = "x")
B <- t(data) %>%
as.data.frame() %>%
setNames(LETTERS[1:4]) %>%
rownames_to_column("group") %>%
pivot_longer(-group, values_to = "y") %>%
left_join(A, by = c("group", "name")) %>%
mutate(
mean = if_else(!(is.na(x) | is.na(y)), (x + y)/2, x),
mean = if_else(is.na(mean) & !is.na(y), y, mean)
) %>%
select(-x, -y) %>%
pivot_wider(names_from = name, values_from = mean)
B
## A tibble: 4 x 5
# group A B C D
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 A NA 12.5 20 10
#2 B 12.5 NA 25 21
#3 C 20 25 NA 12.5
#4 D 10 21 12.5 NA
Okay so this is how I ended up doing this. I would have preferred if I didnt use a for loop because the actual data I have is much bigger but beggars cant be choosers!
A <- as.matrix(data[-1])
for (i in 1:nrow(A)){
for (j in 1:ncol(A)){
if(is.na(A[i,j])){
A[i,j] <- A[j, i]
}
}
}
output <- (A + t(A))/2
output %>%
as_tibble() %>%
mutate(group = data$group) %>%
select(group, everything())
# A tibble: 4 x 5
group A B C D
<chr> <dbl> <dbl> <dbl> <dbl>
1 A NA 12.5 20 10
2 B 12.5 NA 25 21
3 C 20 25 NA 12.5
4 D 10 21 12.5 NA

replace NA values of two data frames with matching ID and dates in R

I have two data frames with different lengths in rows and columns
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(5, 8, 9),
France = c(4, NA, 7),
Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(1, 9, 7, NA),
France = c(10, 3, 9, 6),
Luxembourg = c(10, NA, NA, 7),
Belgium = c(NA, 8, 1, 9)
) -> df2
I have to create a third df (df3) where,
NA values of df1 are replaced with the values of df2 by matching IDs and Dates and viceversa (the NA from df2 replaced by df1)
The values of df1 are priority (=TRUE)
All those columns that are not in one data frame (like Belgium in this case) should also be included in the df3
df3 should look like this:
Any help would be greatly appreciated
We can do a join on the 'Date' and use fcoalesce to replace the NA with the corresponding non-NA
library(data.table)
nm2 <- intersect(names(df2)[-1], names(df1)[-1])
df3 <- copy(df2)
setDT(df3)[df1, (nm2) := Map(fcoalesce, mget(nm2),
mget(paste0('i.', nm2))), on = .(Date)]
-output
df3
# Date Germany France Luxembourg Belgium
#1: 01/01/1990 1 10 10 NA
#2: 01/01/2000 9 3 10 8
#3: 01/01/2010 7 9 6 1
#4: 01/01/2020 9 6 7 9
Or this can be done with tidyverse
library(dplyr)
library(stringr)
left_join(df2, df1, by = 'Date') %>%
mutate(Date, across(ends_with(".x"),
~ coalesce(., get(str_replace(cur_column(), "\\.x$", ".y"))))) %>%
select(-ends_with('.y')) %>%
rename_with(~ str_remove(., "\\.x$"), ends_with('.x'))
Here is another data.table option
cols <- setdiff(intersect(names(df1), names(df2)), "Date")
setDT(df1)[setDT(df2),
on = "Date"
][
,
c(cols) :=
Map(
fcoalesce,
.SD[, cols, with = FALSE],
.SD[, paste0("i.", cols), with = FALSE]
)
][,
.SD,
.SDcols = patterns("^[^i]")
]
giving
Date Germany France Luxembourg Belgium
1: 01/01/1990 1 10 10 NA
2: 01/01/2000 5 4 10 8
3: 01/01/2010 8 9 6 1
4: 01/01/2020 9 7 3 9
An approach with dplyr only using mutate(across...
I also propose use of full_join instead of left_join or right_join as full_join will take all rows from df1 or df2 as opposed to left or right joins.
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(5, 8, 9),
France = c(4, NA, 7),
Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
stringsAsFactors = FALSE,
Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
Germany = c(1, 9, 7, NA),
France = c(10, 3, 9, 6),
Luxembourg = c(10, NA, NA, 7),
Belgium = c(NA, 8, 1, 9)
) -> df2
library(dplyr)
df1 %>% full_join(df2, by = 'Date', suffix = c('_x', '_y')) %>%
mutate(across(ends_with('_x'), ~coalesce(., get(sub('_x', '_y', cur_column()))),
.names = '{sub("_x", "", {.col})}')) %>%
select(!ends_with('_x') & !ends_with('_y'))
#> Date Belgium Germany France Luxembourg
#> 1 01/01/2000 8 5 4 10
#> 2 01/01/2010 1 8 9 6
#> 3 01/01/2020 9 9 7 3
#> 4 01/01/1990 NA 1 10 10
Created on 2021-05-18 by the reprex package (v2.0.0)
library(tidyverse)
library(lubridate)
df1 <- tibble::tribble(
~Date, ~Germany, ~France, ~Luxembourg,
"01/01/2000", 5, 4, 10,
"01/01/2010", 8, NA, 6,
"01/01/2020", 9, 7, 3
)
df2 <- tibble::tribble(
~Date, ~Germany, ~France, ~Luxembourg, ~Belgium,
"01/01/1990", 1, 10, 10, NA,
"01/01/2000", 9, 3, NA, 8,
"01/01/2010", 7, 9, NA, 1,
"01/01/2020", NA, 6, 7, 9
)
bind_rows(df1 %>%
mutate(priority = 1),
df2 %>%
mutate(priority = 2)) %>%
mutate(Date = lubridate::dmy(Date)) %>%
group_by(Date) %>%
arrange(priority) %>%
summarise(across(-priority, ~ first(na.omit(.))))
#> # A tibble: 4 x 5
#> Date Germany France Luxembourg Belgium
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1990-01-01 1 10 10 NA
#> 2 2000-01-01 5 4 10 8
#> 3 2010-01-01 8 9 6 1
#> 4 2020-01-01 9 7 3 9
Base R solution:
# Store as a variable a list denoting each data.frame's column names:
# cnames => character vector
cnames <- list(names(df1), names(df2))
# Determine which vector of names is required in the resulting data.frame
# required_vecs => character vector
required_vecs <- cnames[[which.max(lengths(cnames))]]
# Merge the data: full_data => data.frame
full_data <- merge(
df1,
df2,
by = "Date",
all = TRUE
)
# Resolve the vector names of vectors requiring coalescing:
# clsce_required_vecs=> character vector
clsce_required_vecs <- setdiff(intersect(names(df1), names(df2)), c("Date"))
# Resolve the vector names of vectors not requiring coalescing:
# nt_rqrd_vecs => character vector
nt_rqrd_vecs <- setdiff(unlist(cnames), clsce_required_vecs)
# Split-Apply-Combine data requiring coalescing: coalesced_data => data.frame
coalesced_data <- setNames(
data.frame(
do.call(
cbind,
lapply(
clsce_required_vecs,
function(x) {
# Subset the data to only contain relevant vectors: sbst => data.frame
sbst <- full_data[,grepl(x, names(full_data))]
# Split each column (of the same data) into a data.frame in a list:
# same_vecs => list of data.frames
same_vecs <- split.default(sbst, seq_len(ncol(sbst)))
# Rename the data.frames as required and row-bind them into a single df:
# vector => GlobalEnv()
Reduce(
function(y, z){
replace(y, is.na(y), z[is.na(y)])
},
do.call(cbind, same_vecs)
)
}
)
), row.names = NULL),
clsce_required_vecs)
# Column bind and order the columns:
res <- cbind(full_data[, nt_rqrd_vecs], coalesced_data)[,required_vecs]

Insert specified values in R grouped df and fill up missing values using another df (R)

I have 2 dfs : df & xdf.
df <- tibble(id = c("a", "a", "a", "a", "b", "b", "b", "b"),
x = c(1, 2, 3, 4, 1, 2, 3, 4),
y = c(0.2, 0, 0.9, 7, 1, 0.3, 5, 5.1))
xdf <- tibble(id = c("a", "b"),
x = c(2, 3.5))
In df, within "id" column, for the groups (a & b), I would like to insert only that row of xdf which matches the same id name as in df. How can I make it ? I have tried following commands but all of the values of xdf$x are inserted for each group.
ndf <- df %>%
group_by(id) %>%
do(add_row(., id = .$id[1], x = xdf$x))
> ndf
# A tibble: 12 x 3
# Groups: id [2]
id x y
<chr> <dbl> <dbl>
1 a 1 0.2
2 a 2 0
3 a 3 0.9
4 a 4 7
5 a 2 NA
6 a 3.5 NA
7 b 1 1
8 b 2 0.3
9 b 3 5
10 b 4 5.1
11 b 2 NA
12 b 3.5 NA
# expected result should be : ndf <- ndf[c(-6,-11),]
My end goal is to fill these newborns NA of ndf with the approx() function. But my issue remains because I'm using xout = xdf$x that calls supernumerary values. How can I overcome this? Can you help to write a function that makes xout varies?
f <- function(z)
{
fdf <- approx(z$x, z$y, xout = xdf$x, method = "linear")
return(data.frame(nx= fdf$x, y.out = fdf$y, id = unique(z$id)))
}
jdf <- as.data.frame(ddply(ndf, .(id), f))
zdf <- subset(jdf, select = c(id, nx, y.out))
> zdf
id nx y.out
1 a 2.0 0.00
2 a 3.5 3.95
3 b 2.0 0.30
4 b 3.5 5.05
# expected results
id nx y.out
1 a 2.0 0.00
2 b 3.5 5.05
Any helpful tips to this is welcome. Many thanks!
library(dplyr)
df <- tibble(id = c("a", "a", "a", "a", "b", "b", "b", "b"),
x = c(1, 2, 3, 4, 1, 2, 3, 4),
y = c(0.2, 0, 0.9, 7, 1, 0.3, 5, 5.1))
xdf <- tibble(id = c("a", "b"),
x = c(2, 3.5))
ndf <- df %>%
bind_rows(xdf) %>%
arrange(id)
zdf <- ndf %>%
group_by(id) %>%
group_modify(~mutate(., y_approx = approx(.$x, .$y, .$x, method = "linear")[["y"]])) %>%
ungroup() %>%
filter(is.na(y)) %>%
select(id, y_approx)

Resources