Index list within map function - r

This is a continuation from the previous question:
Apply function over every entry one table to every entry of another
I have the following tables loss.tib and bandstib and function bandedlossfn:
library(tidyverse)
set.seed(1)
n <- 5
loss.tib <- tibble(lossid = seq(n),
loss = rbeta(n, 1, 10) * 100)
bandstib <- tibble(bandid = seq(4),
start = seq(0, 75, by = 25),
end = seq(25, 100, by = 25))
bandedlossfn <- function(loss, start, end) {
pmin(end - start, pmax(0, loss - start))
}
It is possible to apply this function over loss.tib using bandstib as arguments:
loss.tib %>%
mutate(
result = map(
loss, ~ tibble(result = bandedlossfn(.x, bandstib$start,
bandstib$end))
)
) %>% unnest
However, I would like to add an index within map as follows:
loss.tib %>%
mutate(
result = map(
loss, ~ tibble(result = bandedlossfn(.x, bandstib$start,
bandstib$end)) %>%
mutate(bandid2 = row_number())
)
) %>% unnest
But it does not seem to work as intended.
I also want to add filter(!near(result,0)) within the map function too for efficient memory management.
The result I'm expecting is:
lossid loss bandid result
1 21.6691088 1 21.6691088
2 6.9390647 1 6.9390647
3 0.5822383 1 0.5822383
4 5.5671643 1 5.5671643
5 27.8237244 1 25.0000000
5 27.8237244 2 2.8237244
Thank you.

Here is one possibility:
you first nest bandstib and add it to loss.tib. This way the id sticks to your calculations:
bandstib <- tibble(bandid = seq(4),
start = seq(0, 75, by = 25),
end = seq(25, 100, by = 25)) %>%
nest(.key = "data")
set.seed(1)
n <- 5
result <- tibble(loss = rbeta(n, 1, 10) * 100) %>%
bind_cols(., slice(bandstib, rep(1, n))) %>%
mutate(result = map2(loss, data, ~bandedlossfn(.x, .y$start, .y$end))) %>%
unnest()

Related

R: Creating a Function with a "Dynamic" Structure

I am working with the R programming language.
Suppose there is a classroom of students - each student flips the same coin many times (the students don't flip the coin the same number of times). Here is a simulate dataset to represent this example:
library(tidyverse)
library(dplyr)
set.seed(123)
ids = 1:100
student_id = sample(ids, 1000, replace = TRUE)
coin_result = sample(c("H", "T"), 1000, replace = TRUE)
my_data = data.frame(student_id, coin_result)
my_data = my_data[order(my_data$student_id),]
I want to count the number of "3 Flip Sequences" recorded by each student (e.g. Student 1 got HHHTH : HHH 1 time, HHT 1 time, HTH 1 time)
And the probability of the 3rd Flip based on the previous 2 flips (e.g. in general, over all students, the probability of a H following HH was 0.54)
Here is some R code that performs these tasks:
results = my_data %>%
group_by(student_id) %>%
summarize(Sequence = str_c(coin_result, lead(coin_result), lead(coin_result, 2)), .groups = 'drop') %>%
filter(!is.na(Sequence)) %>%
count(Sequence)
final = results %>%
mutate(two_seq = substr(Sequence, 1, 2)) %>%
group_by(two_seq) %>%
mutate(third = substr(Sequence, 3, 3)) %>%
group_by(two_seq, third) %>%
summarize(sums = sum(n)) %>%
mutate(prob = sums / sum(sums))
My Question: Suppose I want to now extend this problem to "4 Flip Sequences" (e.g. probability of H given HHH) - I can manually extend this code:
results = my_data %>%
group_by(student_id) %>%
summarize(Sequence = str_c(coin_result, lead(coin_result), lead(coin_result, 2), lead(coin_result, 3)), .groups = 'drop') %>%
filter(!is.na(Sequence)) %>%
count(Sequence)
final = results %>%
mutate(three_seq = substr(Sequence, 1, 3)) %>%
group_by(three_seq) %>%
mutate(fourth = substr(Sequence, 4, 4)) %>%
group_by(three_seq, fourth) %>%
summarize(sums = sum(n)) %>%
mutate(prob = sums / sum(sums))
Is it possible to convert the above code into a function such that I can repeat this for arbitrary combinations? For example:
results <- function(i) {return(my_data %>%
group_by(student_id) %>%
summarize(Sequence = str_c(coin_result, lead(coin_result), lead(coin_result, i+1), lead(coin_result, i+2) .....### insert code here ####), .groups = 'drop') %>%
filter(!is.na(Sequence)) %>%
count(Sequence))}
final <- function(i)
return(results %>%
mutate(three_seq = substr(Sequence, 1, i)) %>%
group_by(three_seq) %>%
mutate(fourth = substr(Sequence, i+1, i+1)) %>%
group_by(three_seq, fourth) %>%
summarize(sums = sum(n)) %>%
mutate(prob = sums / sum(sums)))
}
I am not sure how exactly I would do this, seeing as the first function would require to be "dynamically changed" depending on the value of "i".
Can someone please show me how to do this?
Thanks!
Here's a way you can do it in base R:
# Returns a vector of 0's and 1's, bit more efficient than sample
tosses <- floor(runif(1e3, 0, 2))
count_seqs <- function(x, seq_length) {
vec_length <- length(x)
rolling_window_indices <- rep(1:seq_length, vec_length - seq_length + 1) +
rep(0:(vec_length - seq_length), each = seq_length)
mat <- matrix(x[rolling_window_indices], nrow = seq_length)
sequences <- apply(mat, 2, paste0, collapse = "")
table(sequences)
}
count_seqs(tosses, 3)
Notice I didn't include any ids in the code above. The reason is that, if all students have the same probability of tossing heads or tails, we can treat them as independent (or, more precisely, treat the design as ignorable). However, it's easy to expand the code for situations where the tosses are not independent, e.g. where each participant has a different probability of tossing heads:
ids <- floor(runif(1e3, 1, 101))
probs <- runif(1e2, 0, 1)
tosses_by_id <- lapply(ids, function(i) rbinom(10, 1, probs[i]))
lapply(tosses_by_id, function(x) count_seqs(x, 3))

Identifying entries that have any overlap with reference times using dplyr R

I have a set of audio clips from different sites and files that I have manually extracted and a set that have been extracted automatically. I want to identify any clips in the automatically extracted set at the same site and file that overlap at all with those in the manually extracted set.
For example:
library(dplyr)
set.seed(123)
group <- as.factor(round(runif(100, 1, 10),0))
file <- sample(LETTERS, 100, TRUE)
start <- round(runif(100, 0, 100), 2)
end <- round(start + (runif(100, 0, 1)), 2)
auto <- bind_cols(group, file, start, end) %>%
set_names(., nm = c("group", "file", "start", "end"))
man <- bind_rows(sample_n(auto, 10) %>%
mutate(start = round(start - runif(10, 0, 1), 2),
end = round(end - runif(10,0, 0.5), 2)),
sample_n(auto, 10) %>%
mutate(start = round(start + runif(10, 0, 1), 2),
end = round(start + runif(10,0, 0.5), 2)))
I have been trying to use dplyr::between like this:
res <- auto %>%
left_join(., man, by = c("group", "file")) %>%
drop_na() %>%
mutate(pos = (between(start.x, start.y, end.y) | between(end.x, start.y, end.y)))
But it is just returning FALSE. The code doesn't throw an error so I am guessing there is in issue in the way I have written it.
If you split out the or in the last mutate to check if either between function is the issue, like this:
res <- auto %>%
left_join(., man, by = c("group", "file")) %>%
drop_na() %>%
mutate(x = between(start.x, start.y, end.y),
y = between(end.x, start.y, end.y),
pos = x | y == "TRUE")
you can see that both return FALSE.
It seems like it should be a relatively simple thing, but I am struggling to figure it out.

How to fill a vector from a for-loop containing a numeric sequence with a given step?

I have the following piece of code:
library(dplyr)
Q = 10000
span = 1995:2016
time = rep(span,times = Q, each= Q)
id = rep(1:Q,times=length(span))
s1 = rep(rnorm(Q,0,1),times=length(span))
gdp = rep(rnorm(Q,0,1),times=length(span))
e = rep(rnorm(Q,0,1),times=length(span))
dfA = data.frame(id,time,s1,e,gdp)
mgr = double()
stp = 10
for(K in seq(10,Q,stp)){
gr = double()
for(t in span){
wt1 = dfA %>% filter(time == t-1) %>%
arrange(desc(s1)) %>% mutate(w= s1/gdp)
zt1 = dfA %>% filter(time == t-1) %>% mutate(z1 = log(s1/e))
zt = dfA %>% filter(time == t) %>% mutate(z = log(s1/e))
gt = left_join(zt1,zt,by="name") %>%
mutate(g = z-z1) %>% select(name,g) %>% na.omit()
a = left_join(wt1,gt,by="name") %>% na.omit()
a = a %>% mutate(id = 1:length(a$name)) %>%
filter(id <= Q) %>% mutate(gbar = mean(g)) %>%
filter(id <= K) %>% mutate(sck = g-gbar,
gamma = w*sck)
gr = append(gr, sum(a$gamma))
}
mgr = append(mgr,mean(gr))
}
where dfA is a data frame containing an id variable and a time variable, among others. Since the time variable ranges from 1995 to 2016 and K is a sequence with step 10, I resorted to append() to store gr and mgr, respectively. The problem is that it takes too long to compute.
So my question is: Is there any way to avoid using append() to fill the vectors gr and mgr and thus reduce the time spent to compute the code?
You could initiate the 'gr' and 'mgr' vectors with a set length rather than just initiate them as a double and have R extend them every iteration. The advantage is that the memory for the vector is allocated beforehand and you don't have to redefine the entire variable mgr/gr.
## initiate vectors with set length
mgr <- double(length = length(seq(10,Q,stp)))
gr <- double(length = length(1995:2016))
# fill the positions in each iteration
outerIteration <- (K - 10) / stp
innerIteration <- t - 1994
gr[innerIteration] <- sum(a$gamma)
# take the mean for each block of length 21 (2016 - 1995)
mgr[outerIteration] <- mean(gr[(outerIteraion -1)*21 + 1 : outerIteration*21])

How to add new rows with different value in one column in R

Basically I have a vector names of all names, and a dataframe df with a BIN (0/1) field and a NAME field. For every row with BIN==0, I want to create a duplicate row but with 1 instead and add it to the bottom of df with a different name. Here's what I have to select a new name, given the current name:
sample(names[names!=name], 1)
But I'm not sure how to vectorize this and furthermore add it to df with the same data from BIN.
EDIT:
Sample data:
df = data.frame(BIN=c(1,0,1), NAME=c("alice","bob","cate"))
names = c("alice","bob","cate","dan")
I got closer with something like this:
rbind(df, df %>% filter(BIN == 0) %>%
mutate(NAME = sample(names[names!=NAME],1)))
But I get an error: In binattr(e1, e2): length(e1) not a multiple of length(e2).
Here's a simple approach. I think it's pretty straightforward, let me know if you have questions:
rename = subset(df, BIN == 0)
rename$NEW_NAME = sample(names, size = nrow(rename), replace = TRUE)
while(any(rename$NAME == rename$NEW_NAME)) {
matches = rename$NAME == rename$NEW_NAME
rename$NEW_NAME[matches] = sample(names, size = sum(matches), replace = TRUE)
}
rename$BIN = 1
rename$NAME = rename$NEW_NAME
rename$NEW_NAME = NULL
result = rbind(df, rename)
result
# BIN NAME
# 1 1 alice
# 2 0 bob
# 3 1 cate
# 21 1 alice
Here's another approach, less clear but more efficient. This is the "right" way to do it, but it requires a little bit more thought and explanation.
df$NAME = factor(df$NAME, levels = names)
rename = subset(df, BIN == 0)
n = length(names)
# we will increment each level number with a random integer from
# 1 to n - 1 (with a mod to make it cyclical)
offset = sample(1:(n - 1), size = nrow(rename), replace = TRUE)
adjusted = (as.integer(rename$NAME) + offset) %% n
# reconcile 1-indexed factor levels with 0-indexed mod operator
adjusted[adjusted == 0] = n
rename$NAME = names[adjusted]
rename$BIN = 1
result = rbind(df, rename)
(or, rewritten for dplyr)
df = mutate(df, NAME = factor(NAME, levels = names))
n = length(names)
df %>% filter(BIN == 0) %>%
mutate(
offset = sample(1:(n - 1), size = n(), replace = TRUE),
adjusted = (as.integer(NAME) + offset) %% n,
adjusted = if_else(adjusted == 0, n, adjusted),
NAME = names[adjusted],
BIN = 1
) %>%
select(-offset, -adjusted) %>%
rbind(df, .)
Since your issue is the vectorization part, I'd recommend testing answer on a sample case with more than one BIN 0 row, I used this:
df = data.frame(BIN=c(1,0,1,0,0,0,0,0,0), NAME=rep(c("alice","bob","cate"), 3))
And, because I was curious, here's a benchmark for 10k rows with 26 names. Results first, code below:
# Unit: milliseconds
# expr min lq mean median uq max neval
# while_loop 34.070438 34.327020 37.53357 35.548047 39.922918 46.206454 10
# increment 1.397617 1.458592 1.88796 1.526512 2.123894 3.196104 10
# increment_dplyr 24.002169 24.681960 25.50568 25.374429 25.750548 28.054954 10
# map_char 346.531498 347.732905 361.82468 359.736403 374.648635 383.575265 10
The "clever" way is by far the fastest. My guess is the dplyr slowdown is because we can't do the direct replacement of only the relevant bits of adjusted, and instead have to add the overhead of if_else. That and we are actually adding columns to the data frame for adjusted and offset rather than dealing with vectors. This is enough to make it almost as slow as the while loop approach, which is still 10x faster than the map_chr which has to go one row at a time.
nn = 10000
df = data.frame(
BIN = sample(0:1, size = nn, replace = TRUE, prob = c(0.7, 0.3)),
NAME = factor(sample(letters, size = nn, replace = TRUE), levels = letters)
)
get.new.name <- function(c){
return(sample(names[names!=c],1))
}
microbenchmark::microbenchmark(
while_loop = {
rename = subset(df, BIN == 0)
rename$NEW_NAME = sample(names, size = nrow(rename), replace = TRUE)
while (any(rename$NAME == rename$NEW_NAME)) {
matches = rename$NAME == rename$NEW_NAME
rename$NEW_NAME[matches] = sample(names, size = sum(matches), replace = TRUE)
}
rename$BIN = 1
rename$NAME = rename$NEW_NAME
rename$NEW_NAME = NULL
result = rbind(df, rename)
},
increment = {
rename = subset(df, BIN == 0)
n = length(names)
# we will increment each level number with a random integer from
# 1 to n - 1 (with a mod to make it cyclical)
offset = sample(1:(n - 1), size = nrow(rename), replace = TRUE)
adjusted = (as.integer(rename$NAME) + offset) %% n
# reconcile 1-indexed factor levels with 0-indexed mod operator
adjusted[adjusted == 0] = n
rename$NAME = names[adjusted]
rename$BIN = 1
},
increment_dplyr = {
n = length(names)
df %>% filter(BIN == 0) %>%
mutate(
offset = sample(1:(n - 1), size = n(), replace = TRUE),
adjusted = (as.integer(NAME) + offset) %% n,
adjusted = if_else(adjusted == 0, n, adjusted),
NAME = names[adjusted],
BIN = 1
) %>%
select(-offset,-adjusted)
},
map_char = {
new.df <- df %>% filter(BIN == 0) %>%
mutate(NAME = map_chr(NAME, get.new.name)) %>%
mutate(BIN = 1)
},
times = 10
)
Well I didn't intend to answer my own question but I did find a simpler solution. I think it's better than using rowwise() but I don't know if it's necessarily the most efficient way.
library(tidyverse)
get.new.name <- function(c){
return(sample(names[names!=c],1))
}
new.df <- rbind(df, df %>% filter(BIN == 0) %>%
mutate(NAME = map_chr(NAME, get.new.name)) %>%
mutate(BIN = 1)
map_char ended up being pretty important instead of just map since the latter would return a weird list of lists.
A little weird but I think this should be what you want:
library(tidyverse)
df <- data.frame(BIN=c(1,0,1), NAME=c("alice","bob","cate"), stringsAsFactors = FALSE)
names <- c("alice","bob","cate","dan")
df %>%
mutate(NAME_new = ifelse(BIN == 0, sample(names, n(), replace = TRUE), NA)) %>%
gather(name_type, NAME, NAME:NAME_new, na.rm = TRUE) %>%
mutate(BIN = ifelse(name_type == "NAME_new", 1, BIN)) %>%
select(-name_type)
Output:
BIN NAME
1 1 alice
2 0 bob
3 1 cate
4 1 alice

Combine pipeable functions in code blocks

I would like to summarise pipeable functions (of magrittr/dplyr) into shorter function-"blocks" (to hopefully have more readable code). For example:
library(dplyr)
d <- tbl_df(data.frame(A = rep(LETTERS[2:5], each = 5),
M = rep(letters[1:2], times = 10),
X = round(rnorm(20, 10, 2), 1)))
# I want to replace this
# ----------------------
d %>%
group_by(A) %>%
summarise(X = mean(X)) -> d_test_1
# with this
# ---------
my_mean <- function(d, by_var, x) {
expr <- substitute(by_var) # group variable, seems ok
expr_2 <- substitute(expression(x = mean(x))) # calculate mean
print(deparse(expr_2))
# problem: x = mean(x) is only substituted to x = mean(X) .. 1 capital x, should be 2
expr_3 <- parse(text = paste(deparse(substitute(x)), "=mean(",
deparse(substitute(x)), ")"))
print(deparse(expr_3))
# expr_3 does not work either
d %>%
group_by(eval(expr)) %>%
#summarise(X = mean(X)) -> d # uses right group variable
summarise(eval(expr_3)) -> d # uses wrong group variable <> side-effect of "expr"?
invisible(d)
}
# this is the short version I am after
d %>%
my_mean(A, X) -> d_test_2
d_test_1
d_test_2
Thx & kind regards
Maybe somebody else does not know where to look either:
library(dplyr)
d <- tbl_df(data.frame(A = rep(LETTERS[2:5], each = 5),
M = rep(letters[1:2], times = 10),
X = round(rnorm(20, 10, 2), 1),
stringsAsFactors = F))
my_mean <- function(d, by_var, x) {
d %>%
group_by(!!enquo(by_var)) %>%
summarise(!!quo_name(enquo(x)) := mean(!!enquo(x)))
}
d %>%
my_mean(A, X) -> want
want

Resources