Finding min turning point in a data frame - r

I have a data frame df1. I would like to find the minimum turning point at each column, where the value before and after the minimum point is larger than it. For example in x=c(2,5,3,6,1,1,1), I would like to determine that the minimum turning point is at 3, but with the min function, I am only able to find the minimum point which is 1. If there is no minimum point, I would like to get NA. Thanks.
> df
structure(list(x = c(2, 5, 3, 6, 1, 1, 1), y = c(6, 9, 3, 6,
3, 1, 1), z = c(9, 3, 5, 1, 4, 6, 2)), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"))
df1>
x y z
2 6 9
5 9 3
3 3 5
6 6 1
1 3 4
1 1 6
1 1 2
Desired result as shown below.
df2>
x y z
3 3 1

You can use lead and lag to compare current value with previous and next value.
library(dplyr)
df %>% summarise(across(.fns = ~min(.x[which(lag(.x) > .x & lead(.x) > .x)])))
# x y z
# <dbl> <dbl> <dbl>
#1 3 3 1

You can use diff, get the sign than diff again to get the valleys. Use min to get the lowest valey.
#Value
sapply(df, function(x) min(x[1+which(diff(sign(diff(x))) == 2)]))
#x y z
#3 3 1
#Position
sapply(df, function(x) {
tt <- 1+which(diff(sign(diff(x))) == 2)
tt[which.min(x[tt])] })
#x y z
#3 3 4
But this will work only in case the valley is one position wide.
Am more robust solution will be using the function from Finding local maxima and minima:
peakPosition <- function(x, inclBorders=TRUE) {
if(inclBorders) {y <- c(min(x), x, min(x))
} else {y <- c(x[1], x)}
y <- data.frame(x=sign(diff(y)), i=1:(length(y)-1))
y <- y[y$x!=0,]
idx <- diff(y$x)<0
(y$i[c(idx,F)] + y$i[c(F,idx)] - 1)/2
}
#Value
sapply(df, function(x) min(x[ceiling(peakPosition(-x, FALSE))]))
#x y z
#3 3 1
#Position
sapply(df, function(x) {
tt <- peakPosition(-x, FALSE)
tt[which.min(x[floor(tt)])] })
#x y z
#3 3 4
An alternative would be to use rle:
x <- c(8,9,3,3,8,1,1)
y <- rle(x)
i <- 1 + which(diff(sign(diff(y$values))) == 2)
min(y$values[i]) #Value
#[1] 3
j <- which.min(y$values[i])
1+sum(y$lengths[seq(i[j])-1]) #First Position
#[1] 3
sum(y$lengths[seq(i[j])]) #Last Position
#[1] 4

Alternate approach
df %>% summarise_all(~ifelse(min(.)==last(.) | min(.) == first(.), min(.[. != last(.) & . != first(.)]), min(.)))
x y z
1 3 3 1
For returning the row_nums
df %>% mutate_all(~ifelse(min(.)==last(.) | min(.) == first(.), min(.[. != last(.) & . != first(.)]), min(.))) %>%
mutate(id = row_number()) %>% left_join(df %>% mutate(id = row_number()), by = "id") %>%
mutate(x_r = ifelse(x.x == x.y, row_number(), 0),
y_r = ifelse(y.x == y.y, row_number(), 0),
z_r = ifelse(z.x == z.y, row_number(), 0)) %>%
select(ends_with("r")) %>% summarise_all(~min(.[. != 0]))
x_r y_r z_r
1 3 3 4
```

Related

Put column name as a value across several columns, under condition

I have a dataset looking like that:
set.seed(123)
test_data <- data.frame(
id = c("a", "b", "c", "d", "e"),
x = sample(c(0,1), 5, replace = T),
y = sample(c(0,1), 5, replace = T)
)
> test_data
id x y
1 a 0 1
2 b 0 1
3 c 0 1
4 d 1 0
5 e 0 0
For the columns x and y, if the value is equal to 1, the value is replaced by the name of the column. In my example, I would like to have:
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>
The thing that I don't know how many columns should be treated this way. Basically, I know that the first column ("id") is not concerned, but after this column, I could have any number of columns (even 0) that need to be treated this way.
I tried something like that but it doesn't work:
library(dplyr)
test_data %>%
mutate(
across(
.cols = 1:last_col(),
.funs = function(x) {
ifelse(x == 1, as.character(x), NA)
}
)
)
How can I do that? A dplyr answer is preferred.
You can do this way also:
library(tidyverse)
## define a function for your job
fn <- function(x, name){
return(ifelse(x ==1, name, NA))
}
test_data %>%
select(-id) %>%
map2_dfr(., names(.), ~fn(.x, .y)) %>%
bind_cols('id'= test_data$id, .)
Another version could be:
fn <- function(x, name){
if(name != 'id'){
return(ifelse(x ==1, name, NA))
} else {
return(x)
}
}
test_data %>%
map2_dfr(., names(.), ~fn(.x, .y))
Here is a try using purrr
library(dplyr)
library(purrr)
col_to_fix <- names(test_data)[2:length(test_data)]
walk(.x = col_to_fix, .f = function(x) {
# Note that I used <<- assigment here to change the test_data in global
test_data[[x]] <<- case_when(
test_data[[x]] == 1 ~ x,
TRUE ~ NA_character_
)
})
Output
> test_data
id x y
1 a <NA> y
2 b <NA> y
3 c <NA> y
4 d x <NA>
5 e <NA> <NA>

Finding the index for 2nd Min value in a data frame

I have a data frame df1. I would like to find the index for the second smallest value from this dataframe. With the function which.min I was able to get the row index for the smallest value but is there a way to get the index for the second smallest value?
> df1
structure(list(x = c(1, 2, 3, 4, 3), y = c(2, 3, 2, 4, 6), z = c(1,
4, 2, 3, 11)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
>df1
x y z
1 2 1
2 3 4
3 2 2
4 4 3
3 6 11
This is my desired output. For example, in x, the value 2 in row 2 is the second smallest value. Thank you.
>df2
x 2
y 2
z 3
Updated answer
You can write a function like the following, using factor:
which_min <- function(x, pos) {
sapply(x, function(y) {
which(as.numeric(factor(y, sort(unique(y)))) == pos)[1]
})
}
which_min(df1, 2)
# x y z
# 2 2 3
Testing it out with other data:
df2 <- df1
df2$new <- c(1, 1, 1, 2, 3)
which_min(df2, 2)
# x y z new
# 2 2 3 4
Original answer
Instead of sort, you can use order:
sapply(df1, function(x) order(unique(x))[2])
# x y z
# 2 2 3
Or you can make use of the index.return argument in sort:
sapply(df1, function(x) sort(unique(x), index.return = TRUE)$ix[2])
# x y z
# 2 2 3
You can do :
sapply(df1, function(x) which.max(x == sort(unique(x))[2]))
#x y z
#2 2 3
Or with dplyr :
library(dplyr)
df1 %>%
summarise(across(.fns = ~which.max(. == sort(unique(.))[2])))
# x y z
# <int> <int> <int>
#1 2 2 3
Another base R version using rank
> sapply(df1, function(x) which(rank(unique(x)) == 2))
x y z
2 2 3
You could try something like:
sort(unique(unlist(df1)))[2]

Returning a tibble: how to vectorize with case_when?

I have a function which returns a tibble. It runs OK, but I want to vectorize it.
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
if(xx >= 4){
tibble(x = NA, y = NA)
} else if(xx == 3){
tibble(x = as.integer(), y = as.integer())
} else if (xx == 2){
tibble(x = xx^2 - 1, y = yy^2 -1)
} else {
tibble(x = xx^2, y = yy^2)
}
}
It runs OK in a mutate when I call it with map2, giving me the result I wanted:
tibTest %>%
mutate(sq = map2(argX, argY, square_it)) %>%
unnest()
## A tibble: 3 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 2 6 3 35
# 3 4 4 NA NA
My first attempt to vectorize it failed, and I can see why - I can't return a vector of tibbles.
square_it2 <- function(xx, yy){
case_when(
x >= 4 ~ tibble(x = NA, y = NA),
x == 3 ~ tibble(x = as.integer(), y = as.integer()),
x == 2 ~ tibble(x = xx^2 - 1, y = yy^2 -1),
TRUE ~ tibble(x = xx^2, y = yy^2)
)
}
# square_it2(4, 2) # FAILS
My next attempt runs OK on a simple input. I can return a list of tibbles, and that's what I want for the unnest
square_it3 <- function(xx, yy){
case_when(
xx >= 4 ~ list(tibble(x = NA, y = NA)),
xx == 3 ~ list(tibble(x = as.integer(), y = as.integer())),
xx == 2 ~ list(tibble(x = xx^2 - 1, y = yy^2 -1)),
TRUE ~ list(tibble(x = xx^2, y = yy^2))
)
}
square_it3(4, 2)
# [[1]]
# # A tibble: 1 x 2
# x y
# <lgl> <lgl>
# 1 NA NA
But when I call it in a mutate, it doesn't give me the result I had with square_it. I can sort of see what's
wrong. In the xx == 2 clause, xx acts as an atomic value of 2. But in
building the tibble, xx is a length-4 vector.
tibTest %>%
mutate(sq = square_it3(argX, argY)) %>%
unnest()
# # A tibble: 9 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 1 7 4 36
# 3 1 7 9 25
# 4 1 7 16 16
# 5 2 6 0 48
# 6 2 6 3 35
# 7 2 6 8 24
# 8 2 6 15 15
# 9 4 4 NA NA
How do I get the same result as I did with square_it, but from a vectorized function using case_when ?
We define row_case_when which has a similar formula interface as case_when except it has a first argument of .data, acts by row and expects that the value of each leg to be a data frame. It returns a data.frame/tibble. Wrapping in a list, rowwise and unnest are not needed.
case_when2 <- function (.data, ...) {
fs <- dplyr:::compact_null(rlang:::list2(...))
n <- length(fs)
if (n == 0) {
abort("No cases provided")
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- rlang:::caller_env()
quos_pairs <- purrr::map2(fs, seq_along(fs), dplyr:::validate_formula,
rlang:::default_env, rlang:::current_env())
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- rlang::eval_tidy(pair$lhs, data = .data, env = default_env)
value[[i]] <- rlang::eval_tidy(pair$rhs, data = .data, env = default_env)
if (!is.logical(query[[i]])) {
abort_case_when_logical(pair$lhs, i, query[[i]])
}
if (query[[i]]) return(value[[i]])
}
}
row_case_when <- function(.data, ...) {
.data %>%
group_by(.group = 1:n(), !!!.data) %>%
do(case_when2(., ...)) %>%
mutate %>%
ungroup %>%
select(-.group)
}
Test run
It is used like this:
library(dplyr)
tibTest <- tibble(argX = 1:4, argY = 7:4) # test data from question
tibTest %>%
row_case_when(argX >= 4 ~ tibble(x = NA, y = NA),
argX == 3 ~ tibble(x = as.integer(), y = as.integer()),
argX == 2 ~ tibble(x = argX^2 - 1, y = argY^2 -1),
TRUE ~ tibble(x = argX^2, y = argY^2)
)
giving:
# A tibble: 3 x 4
argX argY x y
<int> <int> <dbl> <dbl>
1 1 7 1 49
2 2 6 3 35
3 4 4 NA NA
mutate_cond and mutate_when
These are not quite the same as row_case_when since they don't run through conditions taking the first true one but by using mutually exclusive conditions they can be used for certain aspects of this problem. They do not handle changing the number of rows in the result but we can use dplyr::filter to remove rows for a particular condition.
mutate_cond defined in dplyr mutate/replace several columns on a subset of rows is like mutate except the second argument is a condition and the subsequent arguments are applied only to rows for which that condition is TRUE.
mutate_when defined in
dplyr mutate/replace several columns on a subset of rows is similar to case_when except it applies to rows, the replacement values are provided in a list and alternate arguments are conditions and lists. Also all legs are always run applying the replacement values to the rows satisfying the conditions (as opposed to, for each row, performing the replacement on just the first true leg). To get a similar effect to row_case_when be sure that the conditions are mutually exclusive.
# mutate_cond example
tibTest %>%
filter(argX != 3) %>%
mutate(x = NA_integer_, y = NA_integer_) %>%
mutate_cond(argX == 2, x = argX^2 - 1L, y = argY^2 - 1L) %>%
mutate_cond(argX < 2, x = argX^2, y = argY^2)
# mutate_when example
tibTest %>%
filter(argX != 3) %>%
mutate_when(TRUE, list(x = NA_integer_, y = NA_integer_),
argX == 2, list(x = argX^2 - 1L, y = argY^2 - 1L),
argX < 2, list(x = argX^2, y = argY^2))
You need to ensure you are creating a 1-row tibble with each call of the function, then vectorize that.
This works whether you have rowwise groups or not.
You can do this with switch wrapped in a map2:
Here's a reprex:
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
map2(xx, yy, function(x, y){
switch(which(c(x >= 4,
x == 3,
x == 2,
x < 4 & x != 3 & x != 2)),
tibble(x = NA, y = NA),
tibble(x = as.integer(), y = as.integer()),
tibble(x = x^2 - 1, y = y^2 -1),
tibble(x = x^2, y = y^2))})
}
tibTest %>% mutate(sq = square_it(argX, argY)) %>% unnest(cols = sq)
#> # A tibble: 3 x 4
#> argX argY x y
#> <int> <int> <dbl> <dbl>
#> 1 1 7 1 49
#> 2 2 6 3 35
#> 3 4 4 NA NA
Created on 2020-05-16 by the reprex package (v0.3.0)

How to make an aggregation function based on calculation between 2 columns?

My dataframe is
df <- data.frame(x = c(4,4,4,2,2,2), y = c(1,2,3,1,2,3), y_share = c(0.2,0.4,0.2,0.5,0.3,0.2))
I want to have an aggregation df with 2 columns of y and z with
z = sum(x*y_share)/sum(y_share).
In this case, the resulted dataframe should be like this:
result = data.frame(y = c(1,2,3), z = c(2.57, 3.14, 3))
I tried this
func = function(x) {y=sum(vector(x[1])*vector(x[3]))/sum(vector(x[3]))
return(y)}
agg = aggregate(df, by=list(df$y), FUN=func)
but it doesn't work.
Thank you
We can use data.table
library(data.table)
setDT(df)[, .(z = sum(x * y_share)/sum(y_share)), by = y]
# y z
#1: 1 2.571429
#2: 2 3.142857
#3: 3 3.000000
Or if we want to use base R, here is an option with by
stack(by(df, list(df$y), FUN = function(z)
with(z, sum(x * y_share)/sum(y_share))))[2:1]
data
df <- data.frame(x=c(4,4,4,2,2,2), y=c(1,2,3,1,2,3),
y_share=c(0.2,0.4,0.2,0.5,0.3,0.2))
Tidyverse approach (using dplyr):
library(dplyr)
result <- df %>%
group_by(y) %>%
summarise(z = sum(x*y_share)/sum(y_share)) %>%
ungroup()
Result
result
# A tibble: 3 x 2
# y z
# <dbl> <dbl>
# 1 1 2.57
# 2 2 3.14
# 3 3 3.
Data
df <- data.frame(x = c(4,4,4,2,2,2),
y = c(1,2,3,1,2,3),
y_share = c(0.2,0.4,0.2,0.5,0.3,0.2))
result <- data.frame(y = c(1,2,3),
z = c(2.57, 3.14, 3))

Create a new tibble using the previous row value of a column as a parameter of the current row value

I want to manually create a tibble where one column values are calculated depending on the previous value of the same column.
For example:
tibble(
x = 1:5,
y = x + lag(y, default = 0)
)
I expect the following result:
# A tibble: 5 x 2
x y
<int> <dbl>
1 1 1
2 2 3
3 3 6
4 4 10
5 5 15
But I obtain the error:
Error in lag(y, default = 0) : object 'y' not found
Update - more real example:
tibble(
years = 1:5,
salary = 20000 * (1.01) ^ lag(years, default = 0),
qta = salary * 0.06
) %>%
mutate(
total = ifelse(row_number() == 1,
(qta + 50000) * (1.02),
(qta + lag(total, default = 0)) * (1.02))
)
In this example I have a tibble, and I want to add a column 'total' that is defined depending on its previous value, but the lag(total, default = 0) doesn't work.
We can use accumulate
library(tidyverse)
tibble(x = 1:5, y = accumulate(x, `+`))
# A tibble: 5 x 2
# x y
# <int> <int>
#1 1 1
#2 2 3
#3 3 6
#4 4 10
#5 5 15
For a general function, it would be
tibble(x = 1:5, y = accumulate(x, ~ .x + .y))
We can also specify the initialization value
tibble(x = 1:5, y = accumulate(x[-1], ~ .x + .y, .init = x[1]))
You're missing x instead of y in the lag() function to run without an error:
tibble(
x = 1:5,
y = x + lag(x, default = 0)
)
But as per #Ronak Shah's comment, you need the cumsum() function to get the same result as your example:
tibble(
x = 1:5,
y = cumsum(x)
)

Resources