Put column name as a value across several columns, under condition - r

I have a dataset looking like that:
set.seed(123)
test_data <- data.frame(
id = c("a", "b", "c", "d", "e"),
x = sample(c(0,1), 5, replace = T),
y = sample(c(0,1), 5, replace = T)
)
> test_data
id x y
1 a 0 1
2 b 0 1
3 c 0 1
4 d 1 0
5 e 0 0
For the columns x and y, if the value is equal to 1, the value is replaced by the name of the column. In my example, I would like to have:
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>
The thing that I don't know how many columns should be treated this way. Basically, I know that the first column ("id") is not concerned, but after this column, I could have any number of columns (even 0) that need to be treated this way.
I tried something like that but it doesn't work:
library(dplyr)
test_data %>%
mutate(
across(
.cols = 1:last_col(),
.funs = function(x) {
ifelse(x == 1, as.character(x), NA)
}
)
)
How can I do that? A dplyr answer is preferred.

You can do this way also:
library(tidyverse)
## define a function for your job
fn <- function(x, name){
return(ifelse(x ==1, name, NA))
}
test_data %>%
select(-id) %>%
map2_dfr(., names(.), ~fn(.x, .y)) %>%
bind_cols('id'= test_data$id, .)
Another version could be:
fn <- function(x, name){
if(name != 'id'){
return(ifelse(x ==1, name, NA))
} else {
return(x)
}
}
test_data %>%
map2_dfr(., names(.), ~fn(.x, .y))

Here is a try using purrr
library(dplyr)
library(purrr)
col_to_fix <- names(test_data)[2:length(test_data)]
walk(.x = col_to_fix, .f = function(x) {
# Note that I used <<- assigment here to change the test_data in global
test_data[[x]] <<- case_when(
test_data[[x]] == 1 ~ x,
TRUE ~ NA_character_
)
})
Output
> test_data
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>

Related

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine colums B and C, using mutate and unify, but how would I scale up this function so that I can reuse this for multiple columns (think 100+), instead of having to write out the variables each time? Or is there a function that's already built in to do this?
My current solution is this:
library(tidyverse)
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the column, replace the value that corresponds to 'x' with column name (cur_column())
library(dplyr)
library(tidyr)
Data %>%
mutate(across(B:C, ~ replace(., .== 'x', cur_column()))) %>%
unite(Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are couple of options.
Using dplyr -
library(dplyr)
cols <- names(Data)[-1]
Data %>%
rowwise() %>%
mutate(Unified = paste0(cols[!is.na(c_across(B:C))], collapse = '_')) %>%
ungroup -> Data
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
Data$Unified <- apply(Data[cols], 1, function(x)
paste0(cols[!is.na(x)], collapse = '_'))

Finding min turning point in a data frame

I have a data frame df1. I would like to find the minimum turning point at each column, where the value before and after the minimum point is larger than it. For example in x=c(2,5,3,6,1,1,1), I would like to determine that the minimum turning point is at 3, but with the min function, I am only able to find the minimum point which is 1. If there is no minimum point, I would like to get NA. Thanks.
> df
structure(list(x = c(2, 5, 3, 6, 1, 1, 1), y = c(6, 9, 3, 6,
3, 1, 1), z = c(9, 3, 5, 1, 4, 6, 2)), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"))
df1>
x y z
2 6 9
5 9 3
3 3 5
6 6 1
1 3 4
1 1 6
1 1 2
Desired result as shown below.
df2>
x y z
3 3 1
You can use lead and lag to compare current value with previous and next value.
library(dplyr)
df %>% summarise(across(.fns = ~min(.x[which(lag(.x) > .x & lead(.x) > .x)])))
# x y z
# <dbl> <dbl> <dbl>
#1 3 3 1
You can use diff, get the sign than diff again to get the valleys. Use min to get the lowest valey.
#Value
sapply(df, function(x) min(x[1+which(diff(sign(diff(x))) == 2)]))
#x y z
#3 3 1
#Position
sapply(df, function(x) {
tt <- 1+which(diff(sign(diff(x))) == 2)
tt[which.min(x[tt])] })
#x y z
#3 3 4
But this will work only in case the valley is one position wide.
Am more robust solution will be using the function from Finding local maxima and minima:
peakPosition <- function(x, inclBorders=TRUE) {
if(inclBorders) {y <- c(min(x), x, min(x))
} else {y <- c(x[1], x)}
y <- data.frame(x=sign(diff(y)), i=1:(length(y)-1))
y <- y[y$x!=0,]
idx <- diff(y$x)<0
(y$i[c(idx,F)] + y$i[c(F,idx)] - 1)/2
}
#Value
sapply(df, function(x) min(x[ceiling(peakPosition(-x, FALSE))]))
#x y z
#3 3 1
#Position
sapply(df, function(x) {
tt <- peakPosition(-x, FALSE)
tt[which.min(x[floor(tt)])] })
#x y z
#3 3 4
An alternative would be to use rle:
x <- c(8,9,3,3,8,1,1)
y <- rle(x)
i <- 1 + which(diff(sign(diff(y$values))) == 2)
min(y$values[i]) #Value
#[1] 3
j <- which.min(y$values[i])
1+sum(y$lengths[seq(i[j])-1]) #First Position
#[1] 3
sum(y$lengths[seq(i[j])]) #Last Position
#[1] 4
Alternate approach
df %>% summarise_all(~ifelse(min(.)==last(.) | min(.) == first(.), min(.[. != last(.) & . != first(.)]), min(.)))
x y z
1 3 3 1
For returning the row_nums
df %>% mutate_all(~ifelse(min(.)==last(.) | min(.) == first(.), min(.[. != last(.) & . != first(.)]), min(.))) %>%
mutate(id = row_number()) %>% left_join(df %>% mutate(id = row_number()), by = "id") %>%
mutate(x_r = ifelse(x.x == x.y, row_number(), 0),
y_r = ifelse(y.x == y.y, row_number(), 0),
z_r = ifelse(z.x == z.y, row_number(), 0)) %>%
select(ends_with("r")) %>% summarise_all(~min(.[. != 0]))
x_r y_r z_r
1 3 3 4
```

`r`/`rlang`/`dplyr`: How to render `sym` resilient to `NULL`?

Edited in response to #akrun's insight:
This works:
require("magrittr")
requireNamespace("dplyr")
df <- data.frame(a = 1:5)
b_column <- c_column <- "a"
df %>% dplyr::mutate(
b = !!dplyr::sym(b_column),
c = !!dplyr::sym(c_column))
But when any one of the *_columns is NULL it doesn't:
c_column <- NULL
df %>% dplyr::mutate(
b = !!dplyr::sym(b_column),
c = !!dplyr::sym(c_column))
The resulting error is:
Error: Only strings can be converted to symbols
Run `rlang::last_error()` to see where the error occurred.
How would I make the call to ANY of the ensymboled *_column variables resilient to it being NULL?
If we need to check for NULL case, use an if condition
df1 <- if(!is.null(c)) {
df %>%
dplyr::mutate(b = !!dplyr::sym(c))
} else df
With multiple columns, an option is map
library(purrr)
b_column <- c_column <- "a"
map2_dfc(list(b_column, c_column), c("b", "c"), ~
if(!is.null(.x)) df %>%
transmute(!! .y := !! sym(.x))) %>%
bind_cols(df, .)
-output
# a b c
#1 1 1 1
#2 2 2 2
#3 3 3 3
#4 4 4 4
#5 5 5 5
If one of them is NULL
c_column <- NULL
map2_dfc(list(b_column, c_column), c("b", "c"), ~
if(!is.null(.x)) df %>%
transmute(!! .y := !! sym(.x))) %>%
bind_cols(df, .)
# a b
#1 1 1
#2 2 2
#3 3 3
#4 4 4
#5 5 5
Another option is mutate with across, but make sure that we need to rename only the columns that are not NULL
nm1 <- c("b", "c")
i1 <- !map_lgl(list(b_column, c_column), is.null)
nm2 <- nm1[i1]
df %>%
mutate(across(all_of(c(b_column, c_column)), ~ .)) %>%
rename_at(vars(everything()), ~ nm2) %>%
bind_cols(df, .)

Returning a tibble: how to vectorize with case_when?

I have a function which returns a tibble. It runs OK, but I want to vectorize it.
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
if(xx >= 4){
tibble(x = NA, y = NA)
} else if(xx == 3){
tibble(x = as.integer(), y = as.integer())
} else if (xx == 2){
tibble(x = xx^2 - 1, y = yy^2 -1)
} else {
tibble(x = xx^2, y = yy^2)
}
}
It runs OK in a mutate when I call it with map2, giving me the result I wanted:
tibTest %>%
mutate(sq = map2(argX, argY, square_it)) %>%
unnest()
## A tibble: 3 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 2 6 3 35
# 3 4 4 NA NA
My first attempt to vectorize it failed, and I can see why - I can't return a vector of tibbles.
square_it2 <- function(xx, yy){
case_when(
x >= 4 ~ tibble(x = NA, y = NA),
x == 3 ~ tibble(x = as.integer(), y = as.integer()),
x == 2 ~ tibble(x = xx^2 - 1, y = yy^2 -1),
TRUE ~ tibble(x = xx^2, y = yy^2)
)
}
# square_it2(4, 2) # FAILS
My next attempt runs OK on a simple input. I can return a list of tibbles, and that's what I want for the unnest
square_it3 <- function(xx, yy){
case_when(
xx >= 4 ~ list(tibble(x = NA, y = NA)),
xx == 3 ~ list(tibble(x = as.integer(), y = as.integer())),
xx == 2 ~ list(tibble(x = xx^2 - 1, y = yy^2 -1)),
TRUE ~ list(tibble(x = xx^2, y = yy^2))
)
}
square_it3(4, 2)
# [[1]]
# # A tibble: 1 x 2
# x y
# <lgl> <lgl>
# 1 NA NA
But when I call it in a mutate, it doesn't give me the result I had with square_it. I can sort of see what's
wrong. In the xx == 2 clause, xx acts as an atomic value of 2. But in
building the tibble, xx is a length-4 vector.
tibTest %>%
mutate(sq = square_it3(argX, argY)) %>%
unnest()
# # A tibble: 9 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 1 7 4 36
# 3 1 7 9 25
# 4 1 7 16 16
# 5 2 6 0 48
# 6 2 6 3 35
# 7 2 6 8 24
# 8 2 6 15 15
# 9 4 4 NA NA
How do I get the same result as I did with square_it, but from a vectorized function using case_when ?
We define row_case_when which has a similar formula interface as case_when except it has a first argument of .data, acts by row and expects that the value of each leg to be a data frame. It returns a data.frame/tibble. Wrapping in a list, rowwise and unnest are not needed.
case_when2 <- function (.data, ...) {
fs <- dplyr:::compact_null(rlang:::list2(...))
n <- length(fs)
if (n == 0) {
abort("No cases provided")
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- rlang:::caller_env()
quos_pairs <- purrr::map2(fs, seq_along(fs), dplyr:::validate_formula,
rlang:::default_env, rlang:::current_env())
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- rlang::eval_tidy(pair$lhs, data = .data, env = default_env)
value[[i]] <- rlang::eval_tidy(pair$rhs, data = .data, env = default_env)
if (!is.logical(query[[i]])) {
abort_case_when_logical(pair$lhs, i, query[[i]])
}
if (query[[i]]) return(value[[i]])
}
}
row_case_when <- function(.data, ...) {
.data %>%
group_by(.group = 1:n(), !!!.data) %>%
do(case_when2(., ...)) %>%
mutate %>%
ungroup %>%
select(-.group)
}
Test run
It is used like this:
library(dplyr)
tibTest <- tibble(argX = 1:4, argY = 7:4) # test data from question
tibTest %>%
row_case_when(argX >= 4 ~ tibble(x = NA, y = NA),
argX == 3 ~ tibble(x = as.integer(), y = as.integer()),
argX == 2 ~ tibble(x = argX^2 - 1, y = argY^2 -1),
TRUE ~ tibble(x = argX^2, y = argY^2)
)
giving:
# A tibble: 3 x 4
argX argY x y
<int> <int> <dbl> <dbl>
1 1 7 1 49
2 2 6 3 35
3 4 4 NA NA
mutate_cond and mutate_when
These are not quite the same as row_case_when since they don't run through conditions taking the first true one but by using mutually exclusive conditions they can be used for certain aspects of this problem. They do not handle changing the number of rows in the result but we can use dplyr::filter to remove rows for a particular condition.
mutate_cond defined in dplyr mutate/replace several columns on a subset of rows is like mutate except the second argument is a condition and the subsequent arguments are applied only to rows for which that condition is TRUE.
mutate_when defined in
dplyr mutate/replace several columns on a subset of rows is similar to case_when except it applies to rows, the replacement values are provided in a list and alternate arguments are conditions and lists. Also all legs are always run applying the replacement values to the rows satisfying the conditions (as opposed to, for each row, performing the replacement on just the first true leg). To get a similar effect to row_case_when be sure that the conditions are mutually exclusive.
# mutate_cond example
tibTest %>%
filter(argX != 3) %>%
mutate(x = NA_integer_, y = NA_integer_) %>%
mutate_cond(argX == 2, x = argX^2 - 1L, y = argY^2 - 1L) %>%
mutate_cond(argX < 2, x = argX^2, y = argY^2)
# mutate_when example
tibTest %>%
filter(argX != 3) %>%
mutate_when(TRUE, list(x = NA_integer_, y = NA_integer_),
argX == 2, list(x = argX^2 - 1L, y = argY^2 - 1L),
argX < 2, list(x = argX^2, y = argY^2))
You need to ensure you are creating a 1-row tibble with each call of the function, then vectorize that.
This works whether you have rowwise groups or not.
You can do this with switch wrapped in a map2:
Here's a reprex:
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
map2(xx, yy, function(x, y){
switch(which(c(x >= 4,
x == 3,
x == 2,
x < 4 & x != 3 & x != 2)),
tibble(x = NA, y = NA),
tibble(x = as.integer(), y = as.integer()),
tibble(x = x^2 - 1, y = y^2 -1),
tibble(x = x^2, y = y^2))})
}
tibTest %>% mutate(sq = square_it(argX, argY)) %>% unnest(cols = sq)
#> # A tibble: 3 x 4
#> argX argY x y
#> <int> <int> <dbl> <dbl>
#> 1 1 7 1 49
#> 2 2 6 3 35
#> 3 4 4 NA NA
Created on 2020-05-16 by the reprex package (v0.3.0)

R: Extract columns from list of data.frames in a tibble

I am wondering how to manipulate a list containing data.frames stored in a tibble.
Specifically, I would like to extract two columns from a data.frame that are stored in a tibble list column.
I would like to go from this tibble c
random_data<-list(a=letters[1:10],b=LETTERS[1:10])
x<-as.data.frame(random_data, stringsAsFactors=FALSE)
y<-list()
y[[1]]<-x[1,,drop=FALSE]
y[[3]]<-x[2,,drop=FALSE]
c<-tibble(z=c(1,2,3),my_data=y)
to this tibble d
d<-tibble(z=c(1,2,3),a=c('a',NA,'b'),b=c('A',NA,'B'))
thanks
Iain
c2 is the final output.
library(tidyverse)
c2 <- c %>%
filter(!map_lgl(my_data, is.null)) %>%
unnest() %>%
right_join(c, by = "z") %>%
select(-my_data)
You could create a function f to change out the NULL values, then apply it to the my_data column and finish with unnest.
library(dplyr); library(tidyr)
unnest(mutate(c, my_data = lapply(my_data, f)))
# # A tibble: 3 x 3
# z a b
# <dbl> <chr> <chr>
# 1 1 a A
# 2 2 <NA> <NA>
# 3 3 b B
Where f is a helper function to change out the NULL values, and is defined as
f <- function(x) {
if(is.null(x)) data.frame(a = NA, b = NA) else x
}
I think this does the trick with d the requested tibble:
library(dplyr)
new.y <- lapply(y, function(x) if(is.null(x)) data.frame(a = NA, b = NA) else x)
d <- cbind(z = c(1, 2, 3), bind_rows(new.y)) %>% tbl_df()
# # A tibble: 3 x 3
# z a b
# <dbl> <fctr> <fctr>
# 1 1 a A
# 2 2 NA NA
# 3 3 b B
Do you know your column names ahead of time?
extract_column <- function( d, column_name ) {
if( is.null(d) ) {
NA_character_
} else {
as.character(d[[column_name]])
}
}
cc %>%
dplyr::mutate(
a = purrr::map_chr(.$my_data, extract_column, column_name="a"),
b = purrr::map_chr(.$my_data, extract_column, column_name="b")
) %>%
dplyr::select(-my_data)
(I renamed your c tibble to cc so it can't collide with c().)

Resources