Using mclapply over a list of lists - r

Suppose I have the below list of lists :
users_days_delays <- list(users, days, delays)
where
users <- list(1, 2, 3)
days <- list(1, 2, 3, 4)
delays <- list(0, 100)
I have the below function
compute_time <- function(user_day_delay){
user <- user_day_delay[[1]]
days <- user_day_delay[[2]]
delays <- user_day_delay[[3]]
time_per_user <- days+delays - 180*user
return(time_per_user)
}
Is there a way for me to use mclapply on the users_days_delays list of lists?
For ex. can I do :
time_per_users <- mclapply(users_days_delays, compute_time)
Thank you!

library(tidyverse)
compute_time <- function(user, day, delay){
time_per_user <- day + delay - 180*user
return(time_per_user)
}
df <- expand_grid(
user = c(1, 2, 3),
day = c(1, 2, 3, 4),
delay = c(0, 100),
)
df$time_per_user <- pmap_dbl(df, compute_time)
print(df)
#> # A tibble: 24 x 4
#> user day delay time_per_user
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 -179
#> 2 1 1 100 -79
#> 3 1 2 0 -178
#> 4 1 2 100 -78
#> 5 1 3 0 -177
#> 6 1 3 100 -77
#> 7 1 4 0 -176
#> 8 1 4 100 -76
#> 9 2 1 0 -359
#> 10 2 1 100 -259
#> # ... with 14 more rows
Created on 2022-03-04 by the reprex package (v2.0.1)

Related

How to apply a function to each group of IDs based on conditions with dplyr

I am trying to apply a custom function over a group of IDs only when some conditions are met for each column of that group (data should be only numeric and their sum non-zero). Here is the reproducible example:
dat <- as.Date("2021/08/04")
len <- 5
seq(dat, by = "day", length.out = len)
input <- data.frame(
date = c(seq(dat, by = "day", length.out = len) , seq(dat, by = "day", length.out = len)),
id = c("aa", "aa","aa","aa","aa","bb","bb","bb","bb","bb"),
var1 = c(2,3,4,6,7,8,9,3,5,6),
var2 = c(0, 0, 0, 0, 0, 1, 2, 3 ,4, 5),
var3 = c("hi", "hi", "hi", "hi", "hi", 1, 2, 3 ,4, 5)
)
Here is my custom Rescale function:
rescale = function(x,max_range=100){
return(((x-min(x))/(max(x)-min(x)))*max_range)
}
And this is the desired output:
output <- data.frame(
date = c(seq(dat, by = "day", length.out = len) , seq(dat, by = "day", length.out = len)),
id = c("aa", "aa","aa","aa","aa","bb","bb","bb","bb","bb"),
var1 = c(0, 20, 40, 80, 100, 83.3, 100, 0, 33.3, 50),
var2 = c(0, 0, 0, 0, 0, 0, 25, 50 ,75, 100),
var3 = c("hi", "hi", "hi", "hi", "hi", 0, 25, 50 ,75, 100)
)
I am using the following lines to solve this, using dplyr:
out = input %>%
dplyr::group_by(id) %>%
dplyr::mutate_if(~is.numeric(.) && sum(.x) != 0 ,rescale) %>%
dplyr::arrange(date, .by_group = TRUE) %>%
dplyr::ungroup()
The problem with these lines is that conditions do not refer to the columns of each group exclusively, but to the whole column of the input table. Hence, the function is applied to the whole table-column although the conditions are met only for one ID. In this example function was applied for aa~var2 (which is not desired) and wasn't applied for bb~var3 (which is desired).
Could you please help me correct the code of out ? Thank you.
This gives you the desired output, but it is much harder than it needs to be because you want to preserve the "hi" values. A numeric column cannot have text in it, so you have to convert to numeric, handle NA values, rescale the non-NA values, then convert to character and rewrite the "hi" values in place. Furthermore, you need to go back at the end and re-convert the columns without "hi" in them back to numeric. You could avoid all this by having NA values instead of "hi", but anyway, if you really have to preserve the "hi" values, you can do:
library(dplyr)
input %>%
group_by(id) %>%
mutate(across(contains("var"), function(x) {
x_n <- suppressWarnings(as.numeric(x))
if(all(is.na(x_n))) return(x)
if(all(x_n == 0)) return(as.character(x_n))
x_n[!is.na(x_n)] <- rescale(x_n[!is.na(x_n)])
if(any(is.na(x_n))) {
x_n <- as.character(x_n)
x_n[is.na(x_n)] <- x[is.na(x_n)]
}
as.character(x_n)
})) %>%
ungroup() %>%
mutate(across(contains("var"), function(x) {
if(any(is.na(suppressWarnings(as.numeric(x))))) x else as.numeric(x)
}))
#> # A tibble: 10 x 5
#> date id var1 var2 var3
#> <date> <chr> <dbl> <dbl> <chr>
#> 1 2021-08-04 aa 0 0 hi
#> 2 2021-08-05 aa 20 0 hi
#> 3 2021-08-06 aa 40 0 hi
#> 4 2021-08-07 aa 80 0 hi
#> 5 2021-08-08 aa 100 0 hi
#> 6 2021-08-04 bb 83.3 0 0
#> 7 2021-08-05 bb 100 25 25
#> 8 2021-08-06 bb 0 50 50
#> 9 2021-08-07 bb 33.3 75 75
#> 10 2021-08-08 bb 50 100 100
If you are prepared to have NA values instead of "hi" (and thereby have numeric columns that you can actually perform calculations on), you can simplify to
input %>%
group_by(id) %>%
mutate(across(contains("var"), function(x) {
x_n <- suppressWarnings(as.numeric(x))
if(all(is.na(x_n))) return(NA)
if(all(x_n == 0)) return(x_n)
x_n[!is.na(x_n)] <- rescale(x_n[!is.na(x_n)])
x_n
}))
#> # A tibble: 10 x 5
#> # Groups: id [2]
#> date id var1 var2 var3
#> <date> <chr> <dbl> <dbl> <dbl>
#> 1 2021-08-04 aa 0 0 NA
#> 2 2021-08-05 aa 20 0 NA
#> 3 2021-08-06 aa 40 0 NA
#> 4 2021-08-07 aa 80 0 NA
#> 5 2021-08-08 aa 100 0 NA
#> 6 2021-08-04 bb 83.3 0 0
#> 7 2021-08-05 bb 100 25 25
#> 8 2021-08-06 bb 0 50 50
#> 9 2021-08-07 bb 33.3 75 75
#> 10 2021-08-08 bb 50 100 100
Edit
Removing the complication of having a non-numeric column altogether by having var3 = c(1, 2, 3 ,4, 5,1, 2, 3 ,4, 5) as suggested in the comments by the OP makes this far easier:
input %>%
group_by(id) %>%
mutate(across(contains("var"), ~ if(all(.x == 0)) .x else rescale(.x)))
#> # A tibble: 10 x 5
#> # Groups: id [2]
#> date id var1 var2 var3
#> <date> <chr> <dbl> <dbl> <dbl>
#> 1 2021-08-04 aa 0 0 0
#> 2 2021-08-05 aa 20 0 25
#> 3 2021-08-06 aa 40 0 50
#> 4 2021-08-07 aa 80 0 75
#> 5 2021-08-08 aa 100 0 100
#> 6 2021-08-04 bb 83.3 0 0
#> 7 2021-08-05 bb 100 25 25
#> 8 2021-08-06 bb 0 50 50
#> 9 2021-08-07 bb 33.3 75 75
#> 10 2021-08-08 bb 50 100 100
rescale = function(x,max_range=100){
if(min(x) == max(x)) return(x)
return(((x-min(x))/(max(x)-min(x)))*max_range)
}
input %>%
group_by(id)%>%
mutate(across(where(is.numeric), rescale))
# A tibble: 10 × 5
# Groups: id [2]
date id var1 var2 var3
<date> <chr> <dbl> <dbl> <chr>
1 2021-08-04 aa 0 0 hi
2 2021-08-05 aa 20 0 hi
3 2021-08-06 aa 40 0 hi
4 2021-08-07 aa 80 0 hi
5 2021-08-08 aa 100 0 hi
6 2021-08-04 bb 83.3 0 1
7 2021-08-05 bb 100 25 2
8 2021-08-06 bb 0 50 3
9 2021-08-07 bb 33.3 75 4
10 2021-08-08 bb 50 100 5

R dataframe with special cumsum

I have a dateframe like this:
df <- data.frame(grp = c(rep("a", 5), rep("b", 5)), t = c(1:5, 1:5), value = c(-1, 5, 9, -15, 6, 5, 1, 7, -11, 9))
# Limits for desired cumulative sum (CumSum)
maxCumSum <- 8
minCumSum <- 0
What I would like to calculate is a cumulative sum of value by group (grp) within the values of maxCumSum and minCumSum. The respective table dt2 should look something like this:
grp t value CumSum
a 1 -1 0
a 2 5 5
a 3 9 8
a 4 -15 0
a 5 6 6
b 1 5 5
b 2 1 6
b 3 7 8
b 4 -11 0
b 5 9 8
Think of CumSum as a water storage with has a certain maximum capacity and the level of which cannot sink below zero.
The normal cumsum does obviously not do the trick since there are no limitations to maximum or minimum. Has anyone a suggestion how to achieve this? In the real dataframe there are of course more than 2 groups and far more than 5 times.
Many thanks!
What you can do is create a function which calculate the cumsum until it reach the max value and start again at the min value like this:
df <- data.frame(grp = c(rep("a", 5), rep("b", 5)), t = c(1:5, 1:5), value = c(-1, 5, 9, -15, 6, 5, 1, 7, -11, 9))
library(dplyr)
maxCumSum <- 8
minCumSum <- 0
f <- function(x, y) max(min(x + y, maxCumSum), minCumSum)
df %>%
group_by(grp) %>%
mutate(CumSum = Reduce(f, value, 0, accumulate = TRUE)[-1])
#> # A tibble: 10 × 4
#> # Groups: grp [2]
#> grp t value CumSum
#> <chr> <int> <dbl> <dbl>
#> 1 a 1 -1 0
#> 2 a 2 5 5
#> 3 a 3 9 8
#> 4 a 4 -15 0
#> 5 a 5 6 6
#> 6 b 1 5 5
#> 7 b 2 1 6
#> 8 b 3 7 8
#> 9 b 4 -11 0
#> 10 b 5 9 8
Created on 2022-07-04 by the reprex package (v2.0.1)

Counting of conditional frequency in R

I have a table with only one column and more than 200 rows. It includes three values, 0, 1 and 3. I´m interested in only these incidents, where an 1 follwos a 0. Can R count all X=1 if X-1 = =, given that X is the value of any row.
It would be great, if someone could help !
Best, Anna
Do you mean something like this?
# Create some sample data
set.seed(2020)
df <- data.frame(incident = sample(c(0, 1, 3), 10, replace = TRUE))
# incident
#1 3
#2 1
#3 0
#4 0
#5 1
#6 1
#7 0
#8 0
#9 1
#10 1
sum(c(df$incident[-1] == 1, FALSE) * (df$incident == 0))
# Or: with(df, sum(c(incident[-1] == 1, FALSE) * (incident == 0)))
#[1] 2
Here, c(incident[-1] == 1, FALSE) * (incident == 0) is the logical AND of x[i-1] = 0 and x[i] = 1. sum then sums the number of occurrences (in this case there are 2: one in rows 4/5 and one in rows 8/9).
library(tidyverse)
set.seed(123)
(df <- tibble(value = sample(c(0, 1, 3),size = 200, replace = TRUE)))
#> # A tibble: 200 x 1
#> value
#> <dbl>
#> 1 3
#> 2 3
#> 3 3
#> 4 1
#> 5 3
#> 6 1
#> 7 1
#> 8 1
#> 9 3
#> 10 0
#> # … with 190 more rows
count <- 0
#use map instead of walk to view the process row by row
walk(2:nrow(df), ~ {
if (df$value[[.x - 1]] == 0 && df$value[[.x]] == 1) count <<- count + 1
})
count
#> [1] 26
#some rows where the pattern is happening
df[86:87, ]
#> # A tibble: 2 x 1
#> value
#> <dbl>
#> 1 0
#> 2 1
df[93:94, ]
#> # A tibble: 2 x 1
#> value
#> <dbl>
#> 1 0
#> 2 1
Created on 2021-06-28 by the reprex package (v2.0.0)
Using dplyr:
transmute(df, dif = c(NA, diff(value))) %>%
count(dif) %>%
filter(dif == 1)
#> # A tibble: 1 x 2
#> dif n
#> <dbl> <int>
#> 1 1 26
Created on 2021-06-28 by the reprex package (v2.0.0)

High and low objects using R's car library

I'm trying to add a column which reads my dataframe's column and outputs a 1 if the element is bigger than a certain number (and a zero if the condition isn't met). However, this code doesn't seem to work: df is an existing dataframe.
df2 <- data.frame(df2, C=Recode(df$numbers, "hi:200=1; else=0")) ##C = numbers > 200 = 1
I'm using R's car library.
Does this achieve what you need?
df2 <- tibble(numbers = c(1, 200, 201))
df2$recoded <- ifelse(df2$numbers > 200, 1, 0)
df2
# # A tibble: 3 x 2
# numbers recoded
# <dbl> <dbl>
# 1 1 0
# 2 200 0
# 3 201 1
In base R we can also do
df2$recoded <- as.integer(df2$numbers > 200)
In data.table we could do:
library(data.table)
df <- datasets::cars
setDT(df)
df[, numbers := ifelse(df$dist > 10, 1, 0)][1:10, ]
#> speed dist numbers
#> 1: 4 2 0
#> 2: 4 10 0
#> 3: 7 4 0
#> 4: 7 22 1
#> 5: 8 16 1
#> 6: 9 10 0
#> 7: 10 18 1
#> 8: 10 26 1
#> 9: 10 34 1
#> 10: 11 17 1
Created on 2021-03-17 by the reprex package (v0.3.0)

compress / summarize string start and length data in R

I have a data.frame of (sub)string positions within a larger string. The data contains the start of a (sub)string and it's length. The end position of the (sub)string can be easily calculated.
data1 <- data.frame(start = c(1,3,4,9,10,13),
length = c(2,1,3,1,2,1)
)
data1$end <- (data1$start + data1$length - 1)
data1
#> start length end
#> 1 1 2 2
#> 2 3 1 3
#> 3 4 3 6
#> 4 9 1 9
#> 5 10 2 11
#> 6 13 1 13
Created on 2019-12-10 by the reprex package (v0.3.0)
I would like to 'compress' this data.frame by summarizing continuous (sub)strings (strings that are connected with each other) so that my new data looks like this:
data2 <- data.frame(start = c(1,9,13),
length = c(6,3,1)
)
data2$end <- (data2$start + data2$length - 1)
data2
#> start length end
#> 1 1 6 6
#> 2 9 3 11
#> 3 13 1 13
Created on 2019-12-10 by the reprex package (v0.3.0)
Is there preferably a base R solution which gets me from data1 to data2?
f = cumsum(with(data1, c(0, start[-1] - head(end, -1))) != 1)
do.call(rbind, lapply(split(data1, f), function(x){
with(x, data.frame(start = start[1],
length = tail(end, 1) - start[1] + 1,
end = tail(end, 1)))}))
# start length end
#1 1 6 6
#2 9 3 11
#3 13 1 13
Using dplyr we can do the following:
library(dplyr)
data1 %>%
group_by(consecutive = cumsum(start != lag(end, default = 0) + 1)) %>%
summarise(start = min(start), length=sum(length), end=max(end)) %>%
ungroup %>% select(-consecutive)
#> # A tibble: 3 x 3
#> start length end
#> <dbl> <dbl> <dbl>
#> 1 1 6 6
#> 2 9 3 11
#> 3 13 1 13

Resources