dplyr summarise() with special functions for edge cases - r

I would like to make a groupwise summarise() operation in dplyr, but where a different function applies if edge cases are met.
I have count data that looks like this. The concentration and the standard deviation are calculated like this:
library(dplyr)
testdata <- data_frame(sample = sort(rep(1:3, 4)),
volume = rep(c(1e-1, 1e-1, 1e-2, 1e-2), 3),
count = c(400, 400, 40, 40, 0, 0, 0, 0, 400, 400, 400, 400))
testdata %>%
group_by(sample) %>%
summarise(concentration = sum(count) / sum(volume),
sd = sqrt(sum(count)))
However, when making the calculation only counts with values between 25-250 are to be included. which I could achieve with:
testdata %>%
group_by(sample) %>%
filter((count >= 25) & (count <= 250)) %>%
summarise(concentration = sum(count) / sum(volume),
sd = sqrt(sum(count)))
But then samples 2 & 3 have no concentration.
The edge cases for each group might be calculated with something like:
if (all(count <= 25)){
summarise(concentration = 25 / min(volume),
sd = NA)
}
else if (all(count >= 250)){
summarise(concentration = 250 / max(volume),
sd = NA)
}
Can such edge cases be integrated into the summarise() function?
I would ideally also like a flag to indicate an edge case which returns result = "OK" for all cases except edge cases that return:
if (all(count <= 25)){
summarise(concentration = 25 / min(volume),
sd = NA,
result = "LOW")
}
else if (all(count >= 250)){
summarise(concentration = 250 / max(volume),
sd = NA,
result = "HIGH")
}

One way is to encode your logic within summarise using ifelse:
library(dplyr)
result <- testdata %>% group_by(sample) %>%
summarise(concentration = ifelse(all(count <= 25),
25 / min(volume),
ifelse(all(count >= 250),
250 / max(volume),
sum(count) / sum(volume))),
sd = ifelse(all(count <= 25),
NA,
ifelse(all(count >= 250),
NA,
sqrt(sum(count)))),
result = ifelse(all(count <= 25),
"LOW",
ifelse(all(count >= 250),
"HIGH",
"OK")))
print(result)
### A tibble: 3 x 4
## sample concentration sd result
## <int> <dbl> <dbl> <chr>
##1 1 4000 29.66479 OK
##2 2 2500 NA LOW
##3 3 2500 NA HIGH
Updated approach
Another approach, which is hopefully closer to what the OP asks, is to define a function:
summarise.func <- function(count, volume) {
if (all(count <= 25)) {
concentration <- 25 / min(volume)
sd <- NA
result <- "LOW"
} else if (all(count >= 250)) {
concentration <- 250 / max(volume)
sd <- NA
result <- "HIGH"
} else {
concentration <- sum(count) / sum(volume)
sd <- sqrt(sum(count))
result <- "OK"
}
data.frame(concentration=concentration, sd=sd, result=result, stringsAsFactors=FALSE)
}
that handles both the regular case and the edge cases. The key is that this function return a data.frame containing the summarized results. Then, summarise will create a column that is a list containing these data frames that can then be tidyr::unnested:
library(dplyr)
library(tidyr)
result <- testdata %>% group_by(sample) %>%
summarise(csr=list(f(count, volume))) %>%
unnest(csr)
print(result)
### A tibble: 3 x 4
## sample concentration sd result
## <int> <dbl> <dbl> <chr>
##1 1 4000 29.66479 OK
##2 2 2500 NA LOW
##3 3 2500 NA HIGH

Related

How to force minimum number of groups in linear programming problem using ompr

The following problem seeks to maximize the weight across any 3 items while being under a cost of 20. I gave group "a" a large weight so that the model will only select the 3 items from group "a". How do I force the model to include at least 2 groups?
library(ompr)
library(ROI.plugin.glpk)
library(ompr.roi)
library(dplyr)
set.seed(1)
d <- data.frame(
id = seq_len(10),
weight = c(rep(100, 3), runif(7, 1, 50)),
cost = runif(10, 1, 10),
group = c(rep("a", 3), rep("b", 3), rep("c", 3), "d")
)
m <- ompr::MIPModel() %>%
ompr::add_variable(x[i],
i = d$id,
type = "binary") %>%
# set objective to maximize the weight
ompr::set_objective(
ompr::sum_over(d$weight[i] * x[i],
i = d$id), "max"
) %>%
# cost must be less than 20
ompr::add_constraint(
ompr::sum_over(d$cost[i] * x[i],
i = d$id) <= 20
) %>%
# can only include 3 items
ompr::add_constraint(
ompr::sum_over(
x[i],
i = d$id
) == 3
)
res <- ompr::solve_model(m, ompr.roi::with_ROI(solver = "glpk"))
res %>%
ompr::get_solution(x[i]) %>%
dplyr::filter(.data$value > 0) %>%
dplyr::inner_join(d, by = c("i" = "id"))
#> variable i value weight cost group
#> 1 x 1 1 100 6.947180 a
#> 2 x 2 1 100 6.662026 a
#> 3 x 3 1 100 1.556076 a
Created on 2023-02-05 with reprex v2.0.2
Here is one way to do it:
library(ompr)
library(ROI.plugin.glpk)
library(ompr.roi)
library(dplyr)
set.seed(1)
d <- data.frame(
id = seq_len(10),
weight = c(rep(100, 3), runif(7, 1, 50)),
cost = runif(10, 1, 10),
group = c(rep("a", 3), rep("b", 3), rep("c", 3), "d")
)
m <- ompr::MIPModel() %>%
ompr::add_variable(x[i],
i = d$id,
type = "binary") %>%
ompr::add_variable(group[j],
j = unique(d$group),
type = "binary") %>%
# set objective to maximize the weight
ompr::set_objective(
ompr::sum_over(d$weight[i] * x[i],
i = d$id, j = unique(d$group)), "max"
) %>%
# cost must be less than 20
ompr::add_constraint(
ompr::sum_over(d$cost[i] * x[i],
i = d$id) <= 20
) %>%
# can only include 3 items
ompr::add_constraint(
ompr::sum_over(
x[i],
i = d$id
) == 3
) %>%
# force group binary variables to be 1 if item is in group
ompr::add_constraint(
ompr::sum_over(
x[i],
i = d$id[which(d$group == j)]
) - 10000 * group[j] <= 0,
j = unique(d$group)
) %>%
# force at least one binary variable for item inclusion to be 1 across all items in group
# if group binary is 1
ompr::add_constraint(
group[j] - 10000 * ompr::sum_over(
x[i],
i = d$id[which(d$group == j)]
) <= 0,
j = unique(d$group)
) %>%
# force at least two groups
ompr::add_constraint(
ompr::sum_over(
group[j],
j = unique(d$group)
) >= 2
)
res <- ompr::solve_model(m, ompr.roi::with_ROI(solver = "glpk"))
res %>%
ompr::get_solution(x[i]) %>%
dplyr::filter(.data$value > 0) %>%
dplyr::inner_join(d, by = c("i" = "id"))
#> variable i value weight cost group
#> 1 x 1 1 100.00000 6.947180 a
#> 2 x 3 1 100.00000 1.556076 a
#> 3 x 10 1 47.28909 7.458567 d
Created on 2023-02-05 with reprex v2.0.2
The key is to link group[j] variables to the decision variables, x[i], by setting constraints such that if any x[i] inside group[j] is 1 then group[j] is 1. And when group[j] is one, then at least one x[i] in that group must also be 1. Then it's straightforward to set another constraint that the sum of group[j] is greater than or equal to 2.

Filter using Dataset Position in R

I'm not really familiar with dplyr function in R. However, I want to filter my dataset into certain conditions.
Let's say I've more than 100 of attributes in my dataset. And I want to perform filter with multiple condition.
Can I put my coding filter the position of the column instead of their name as follow:
y = filter(retag, c(4:50) != 8 & c(90:110) == 8)
I've tried few times similar with this coding, however still haven't get the result.
I also did tried coding as follow, but not sure how to add another conditions into the rowSums function.
retag[rowSums((retag!=8)[,c(4:50)])>=1,]
The only example that I found was using the dataset names instead of the position.
Or is there any way to filter using the dataset position as my data quite huge.
You can use a combination of filter() and across(). I didn't have your version of the retag dataframe so I created my own as an example
set.seed(2000)
retag <- tibble(
col1 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col2 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col3 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col4 = runif(n = 1000, min = 0, max = 10) %>% round(0),
col5 = runif(n = 1000, min = 0, max = 10) %>% round(0)
)
# filter where the first, second, and third column all equal 5 and the fourth column does not equal 5
retag %>%
filter(
across(1:3, function(x) x == 5),
across(4, function(x) x != 5)
)
if_all() and if_any() were recently introduced into the tidyverse for the purpose of filtering across multiple variables.
library(dplyr)
filter(retag, if_all(X:Y, ~ .x > 10 & .x < 35))
# # A tibble: 5 x 2
# X Y
# <int> <int>
# 1 11 30
# 2 12 31
# 3 13 32
# 4 14 33
# 5 15 34
filter(retag, if_any(X:Y, ~ .x == 2 | .x == 25))
# # A tibble: 2 x 2
# X Y
# <int> <int>
# 1 2 21
# 2 6 25
Data
retag <- structure(list(X = 1:20, Y = 20:39), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Here's a base R option.
This will select rows where there is no 8 in column 4 to 50 and there is at least one 8 in column 90 to 110.
result <- retag[rowSums(retag[4:50] == 8, na.rm = TRUE) == 0 &
rowSums(retag[90:110] == 8,na.rm = TRUE) > 0, ]

Conditional statement that modifies two variables

I have the following data frame:
dat<-data.frame(site=c(rep("A", 3), rep("B", 3)),
landuse= rep(c("urban", "dev", "undev"),2),
percent= c(30,30,40, 50, 30, 20))
For each site, I want to filter for rows where the percent is greater than or equal to 50, but for sites where all landuse categories do not meet the minimum criteria, the landuse entry is changed to "mixed" and the percent is changed to 100.
The result data frame would look like this:
result<- data.frame(site= c("A", "B"), landuse=c("mixed", "urban"), percent= c(100, 50))
With dplyr you can try :
library(dplyr)
dat %>%
group_by(site) %>%
summarise(landuse = if (all(percent < 50)) "mixed" else landuse[percent >= 50],
percent = ifelse(landuse == 'mixed', 100, percent))
# site landuse percent
# <chr> <chr> <dbl>
#1 A mixed 100
#2 B urban 50
Does this work:
library(dplyr)
dat %>% group_by(site) %>% mutate(landuse = case_when(all(percent < 50) ~ 'mixed', TRUE ~ landuse),
percent= case_when(landuse == 'mixed' ~ 100, TRUE ~ percent)) %>%
filter(percent >= 50) %>% distinct()
# A tibble: 2 x 3
# Groups: site [2]
site landuse percent
<chr> <chr> <dbl>
1 A mixed 100
2 B urban 50

finding outliers and counting number of occurrence

I have a data frame as below
raw_data <- data.frame(
"id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
if a salary greater than 15000 would be flaged as an outlier, and if an expenditure is greater than 10000, it should be flaged as an outlier. But the problem is now, how to count how many times an outlier (both spearately) occured by a specific id. The output should look like the following df
output <- data.frame(
"id"=c(1,1,1,2,2,2,2),
"question_name"=c("expenditure", "salary","expenditure","salary","expenditure","salary","expenditure"),
"values"=c(15000,20000,20000,30000,40000,500000,40000),
"count"=c(1,1,1,1,1,1,1))
Here's a dplyr solution:
raw_data %>%
mutate(salary_flag =
ifelse(salary > 15000, 1, 0),
expenditure_flag = ifelse(expenditure > 10000, 1, 0)) %>%
group_by(id) %>%
mutate(total_outlier = sum(salary_flag) + sum(expenditure_flag))
You are flagging for salary and expenditure, then grouping by id and calculating the sum of all salary_flag and the sum of all expenditure_flag for each id.
id salary expenditure salary_flag expenditure_flag total_outlier
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10000 10000 0 0 0
2 2 15000 15000 0 1 1
3 3 20000 20000 1 1 2
4 4 40000 30000 1 1 2
5 5 50000 40000 1 1 2
If you're only concerned with the total outliers, #MartinGal provided a very nice option:
raw_data %>%
group_by(id) %>%
mutate(total_outlier = sum(salary>15000, expenditure>10000))
Gives us:
id salary expenditure total_outlier
<int> <dbl> <dbl> <int>
1 1 10000 10000 0
2 2 15000 15000 1
3 3 20000 20000 2
4 4 40000 30000 2
5 5 50000 40000 2
edit:
This seems to get the end result that you're looking for:
raw_data %>%
group_by(id) %>%
summarise(count = sum(salary>15000, expenditure>10000),
value = min(salary)) %>%
mutate(title = "salary") %>%
select(id, title, value, count)
Which gives you:
id title value count
<int> <chr> <dbl> <int>
1 1 salary 10000 0
2 2 salary 15000 1
3 3 salary 20000 2
4 4 salary 40000 2
5 5 salary 50000 2
Raw data is :
raw_data <- data.frame("id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,15000,30000,40000))
And the solution is :
raw_data <- raw_data %>% filter(salary>15000 | expenditure>10000)
entry_variables <- raw_data %>%select(id,salary,expenditure) %>%
pivot_longer(cols = -id,
names_to = "Question_name", values_to= "Value",
values_drop_na = TRUE) %>%
count(id, Question_name, Value)
You can try the following
raw_data <- data.frame("id" = 1:5,
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
raw_data$SaleryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
You can then use aggregate function to summarize the data, e.g. for each id by using FUN=sum. This should look like
aggregate(raw_data, by=list(id = raw_data$id), FUN=sum)
This works because TRUE=1.
I hope this helps.
EDIT
Based on your comment, I guess you are looking for
raw_data <- data.frame("id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
raw_data$SaleryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
raw_data_aggregate <- aggregate(raw_data, by=list(id = raw_data$id), FUN=sum)
raw_data_aggregate$count <- raw_data_aggregate$SaleryOutlier + raw_data_aggregate$ExpenditureOutlier
EDIT TWO
If you want to aggregate over two variables, just exchange the above aggregate with
raw_data_aggregate <- aggregate(
SalaryOutlier + ExpenditureOutlier ~ id + salary + expenditure, raw_data, FUN=sum)
EDIT THREE
Based on the comments below, I created the following code
raw_data <- data.frame(
"id" = c(1, 1, 1, 2, 2),
"salary" = c(10000,15000,20000,40000,50000),
"expenditure" = c(10000,15000,20000,30000,40000))
# Identify salary outliers
raw_data$SalaryOutlier <- ifelse(
raw_data$salary > 15000, TRUE, FALSE)
# Identify expenditure outliers
raw_data$ExpenditureOutlier <- ifelse(
raw_data$expenditure > 10000, TRUE, FALSE)
# Aggregate over id + salay
raw_data_aggregate_salary <- aggregate(
SalaryOutlier ~ id + salary, raw_data, FUN=sum)
# Aggregate over id + expenditure
raw_data_aggregate_expenditure <- aggregate(
ExpenditureOutlier ~ id + expenditure, raw_data, FUN=sum)
# Just some renaming to fit with desired output data frame.
raw_data_aggregate_salary$question_name <- "salary"
raw_data_aggregate_expenditure$question_name <- "expenditure"
colnames(raw_data_aggregate_salary)[2] <- "values"
colnames(raw_data_aggregate_expenditure)[2] <- "values"
colnames(raw_data_aggregate_salary)[3] <- "count"
colnames(raw_data_aggregate_expenditure)[3] <- "count"
# Bind result together into one df.
raw_data_aggregate <- rbind(
raw_data_aggregate_salary, raw_data_aggregate_expenditure)
# Only select entries where we actually have a count.
raw_data_aggregate <- subset(
raw_data_aggregate,
raw_data_aggregate$count > 0)
# Order to fit with desired output
raw_data_aggregate <- raw_data_aggregate[ order(raw_data_aggregate$id), ]
In a data.table this would look like
raw_data[, flag0 := (salary > 15000) + (expenditure > 10000)]
raw_data[, flag := sum(flag0), by = "id"]
Here flag0 is the flag by row (which can later be deleted if you like) and flag would be the final result.
Edit: Seeing your reply to #Matt, you seem to want the total amount by salary and expenditure seperately. You can do something like
raw_data[, flag_salary := as.integer(salary > 15000)]
raw_data[, flag_expenditure := as.integer(expenditure > 10000)]
raw_data[, flag_salary := sum(flag_salary), by = "id"]
raw_data[, flag_expenditure := sum(flag_expenditure), by = "id"]

cumulative sum to zigzag indicator

The following code provides example data:
library(TTR)
set.seed(15)
r <- rnorm(1000, 0, .01)
P_1 <- 100
P <- P_1*cumprod(1+r)
zz <- ZigZag(P, change = 5, percent = TRUE)
set.seed(15)
volume <- round(runif(1000, 50, 550), digits = 0)
data <- as.data.frame(cbind(P, zz, volume))
plot(P, type = "l")
lines(zz, col = "red")
in the end I would like to create cumulative sum of volume in new column, where reset happens when zigzag line (zz) changes direction. I have tried to play with s <- sign(diff(data$zz, lag = 1)), which would show those turning points, but haven't been able to use cumsum with it.
Here is a solution that uses dplyr:
library(dplyr)
data %>%
mutate(
zz_up = (zz - lag(zz) > 0),
zz_switch = zz_up != lag(zz_up),
zz_switch = ifelse(is.na(zz_switch), FALSE, zz_switch),
group = cumsum(zz_switch)
) %>%
group_by(group) %>%
mutate(cum_volume = cumsum(volume))
Attempt with RcppRoll:
Code
Vectorize(require)(package = c("magrittr", "dplyr", "RcppRoll"),
char = TRUE)
data %<>%
# Create difference for ZigZag
mutate(diffZZ = c(0,diff(zz))) %>%
# Use it as a group
group_by(diffZZ) %>%
# Use RcppRoll to compute that sum
mutate(sumVolByDiff = roll_sum(x = volume, n = 2, fill = NA)) %>%
# Clean / not important
ungroup()
Preview
> head(data)
Source: local data frame [6 x 5]
P zz volume diffZZ sumVolByDiff
(dbl) (dbl) (dbl) (dbl) (dbl)
1 100.2588 100.2588 351 0.000000 NA
2 102.0947 100.5596 148 0.300785 523
3 101.7480 100.8604 533 0.300785 1077
4 102.6608 101.1612 375 0.300785 609
5 103.1618 101.4620 234 0.300785 692
6 101.8668 101.7627 544 0.300785 938

Resources