Using a vector as a variable in a formula - r

I've got a function which acts on several columns, but I'd like to adapt it to use a different value of the main variable, mode, for each column. I have put a simplified example below.
My data is a cross-tabulation of frequencies, i.e. in column A01 there are 6485 counts of 13 CAGs, 35 counts of 14 CAGs, etc. The modal value for column A01 is therefore 13.
I need to calculate:
1) Skewness using (mean - mode)/sd
2) Proportion of each column where CAG is greater than the mode
The code below works for that. However, I now need to compare each sample to the mode of a control sample, and I'm a bit stuck with the code. The sample against which each column needs to be compared is defined in the ctrls table. Could I please ask for help adapting my code so that skewmode and prop are calculated using the appropriate control mode for each column? I hope that makes sense!
#Data set
data <- data.frame(CAG = c(13, 14, 15, 17),
                   A01 = c(6485, 35, 132, 12),
                   A02 = c(0, 42, 56, 4))
#Mode
mode <- data[sapply(data[2:ncol(data)], which.max), ]$CAG
#Summary statistics
library(psych)
sumstats <- sapply(data[, 2:ncol(data)], function(x) {
  data_e <- rep(data$CAG, x)
  data.frame(
    describe(data_e)
  )
})
sumstats <- as.data.frame(t(sumstats))
sumstats[] <- lapply(sumstats, function(x) as.numeric(x))
# Results table
results <- data.frame(mode, sumstats)
# Skewness - I'd like to replace 'results$mode' here
# with the relevant mode from the controls table
skewmode <- (results$mean - results$mode) / results$sd
# Proportion > mode I'd like to replace 'mod' here
# with the relevant mode from the controls table
prop <- lapply(data[, 2:ncol(data)], function(x) {
  mod <- data$CAG[which.max(x)]
  B <- sum(x[data$CAG >= mod])
  A <- sum(x[data$CAG <= mod])
  B / (A + B)
})
prop <- as.data.frame(prop)
prop <- t(prop)
results <- data.frame(mode, sumstats, skewmode, prop)
# Controls
ctrls <- data.frame(samples = c('A01', 'A02', 'A03', 'A04'),
                    ctrl = c('A01', 'A01', 'A03', 'A03'))

Consider Map (the wrapper to mapply), which passes each sample's column name and its control's column name iteratively into a defined function, prop_skew_calc(), to calculate skewmode and prop. At the end it returns a list of data frames for a final row bind.
NOTE: the code below demonstrates with base R's summary() since I do not have the psych package; however, I leave a comment in the code on how to integrate psych::describe(), whose documentation indicates it returns a data frame of summary statistics useful for psychometrics.
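As a quick aside, here is a minimal illustration (made-up strings, not the author's data) of how Map() pairs its arguments: it walks both vectors in parallel and calls the function once per pair, returning a named list.
Map(function(s, ctl) paste("sample", s, "vs control", ctl),
    c("A01", "A02"),
    c("A01", "A01"))
# $A01
# [1] "sample A01 vs control A01"
# $A02
# [1] "sample A02 vs control A01"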
Data (adding A03 and A04)
#Data set
data <- data.frame(CAG = c(13, 14, 15, 17),
                   A01 = c(6485, 35, 132, 12),
                   A02 = c(0, 42, 56, 4),
                   A03 = c(33, 5014, 2221, 18),
                   A04 = c(106, 89, 436, 11))
#Controls
ctrls <- data.frame(samples = c('A01', 'A02', 'A03', 'A04'),
                    ctrl = c('A01', 'A01', 'A03', 'A03'))
Function (removes any l/sapply looping since scalar values will be passed iteratively by Map)
library(psych)
prop_skew_calc <- function(x, y) {
  #Mode
  samplemode <- data$CAG[which.max(data[[x]])]
  cntrlmode <- data$CAG[which.max(data[[y]])]
  #Summary statistics
  sumstats <- summary(rep(data$CAG, data[[x]]))    # base R's summary()
  sumstats <- as.data.frame(t(unclass(sumstats)))
  #sumstats <- describe(rep(data$CAG, data[[x]]))  # psych's describe()
  #sumstats <- as.data.frame(t(sumstats))
  # Results table
  results <- data.frame(cntrlmode, sumstats)
  # Skewness (base summary() has no sd column, so results$Min is only a
  # stand-in divisor here; with psych::describe() divide by results$sd)
  skewmode <- (results$Mean - results$cntrlmode) / results$Min
  # Proportion
  B <- sum(data[data$CAG >= cntrlmode, x])
  A <- sum(data[data$CAG <= cntrlmode, x])
  prop <- B / (A + B)
  results <- data.frame(samplemode, cntrlmode, sumstats, skewmode, prop = prop)
}
Map (calling the above function, passing the columns of the ctrls data frame)
dfList <- Map(prop_skew_calc, ctrls$samples, ctrls$ctrl)
finaldf <- do.call(rbind, dfList)
finaldf
# samplemode cntrlmode Min. X1st.Qu. Median Mean X3rd.Qu. Max. skewmode prop
# 1 17 17 13 14 15 14.90 17 17 -0.1615385 0.223684211
# 2 13 17 13 13 13 13.05 13 17 -0.3038462 0.001797484
# 3 15 13 14 14 15 14.67 15 17 0.1192857 1.000000000
# 4 14 13 13 14 14 14.31 15 17 0.1007692 0.995491187
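One caveat, which depends on your R version: if ctrls$samples and ctrls$ctrl come in as factors (the data.frame() default before R 4.0), data[[x]] inside prop_skew_calc() will index by the factor's underlying integer code rather than by column name (factor subscripts use the codes, not the levels). A defensive variant of the Map call converts them to character first:
# Guard against factor columns: pass plain character column names into Map()
dfList <- Map(prop_skew_calc, as.character(ctrls$samples), as.character(ctrls$ctrl))
finaldf <- do.call(rbind, dfList)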

Related

How to optimize my function by dropping loops

I have the following function that uses nested loops, and honestly I'm not sure how to proceed with making the code run more efficiently. It runs fine for 100 sims, but when I ran it for 2000 sims it took almost 12 seconds.
This code will generate any n Brownian motion simulations and works well; the issue is that once the simulation count is increased to, say, 500+, it starts to bog down, and when it hits 2k it's pretty slow, i.e. around 12 seconds.
Here is the function:
ts_brownian_motion <- function(.time = 100, .num_sims = 10, .delta_time = 1,
                               .initial_value = 0) {

  # TidyEval ----
  T <- as.numeric(.time)
  N <- as.numeric(.num_sims)
  delta_t <- as.numeric(.delta_time)
  initial_value <- as.numeric(.initial_value)

  # Checks ----
  if (!is.numeric(T) | !is.numeric(N) | !is.numeric(delta_t) | !is.numeric(initial_value)) {
    rlang::abort(
      message = "All parameters must be numeric values.",
      use_cli_format = TRUE
    )
  }

  # Initialize empty data.frame to store the simulations
  sim_data <- data.frame()

  # Generate N simulations
  for (i in 1:N) {
    # Initialize the current simulation with a starting value of 0
    sim <- c(initial_value)

    # Generate the brownian motion values for each time step
    for (t in 1:(T / delta_t)) {
      sim <- c(sim, sim[t] + rnorm(1, mean = 0, sd = sqrt(delta_t)))
    }

    # Bind the time steps, simulation values, and simulation number together
    # in a data.frame and add it to the result
    sim_data <- rbind(
      sim_data,
      data.frame(
        t = seq(0, T, delta_t),
        y = sim,
        sim_number = i
      )
    )
  }

  # Clean up
  sim_data <- sim_data %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(sim_number = forcats::as_factor(sim_number)) %>%
    dplyr::select(sim_number, t, y)

  # Return ----
  attr(sim_data, ".time") <- .time
  attr(sim_data, ".num_sims") <- .num_sims
  attr(sim_data, ".delta_time") <- .delta_time
  attr(sim_data, ".initial_value") <- .initial_value

  return(sim_data)
}
Here is some output of the function:
> ts_brownian_motion(.time = 10, .num_sims = 25)
# A tibble: 275 × 3
sim_number t y
<fct> <dbl> <dbl>
1 1 0 0
2 1 1 -2.13
3 1 2 -1.08
4 1 3 0.0728
5 1 4 0.562
6 1 5 0.255
7 1 6 -1.28
8 1 7 -1.76
9 1 8 -0.770
10 1 9 -0.536
# … with 265 more rows
# ℹ Use `print(n = ...)` to see more rows
As suggested in the comments, if you want speed, you should use cumsum. You need to be clear what type of Brownian motion you want (arithmetic or geometric). For geometric Brownian motion, you'll need to correct the approximation error by adjusting the mean. As an example, the NMOF package (which I maintain) contains a function gbm that implements geometric Brownian motion through cumsum. Here is an example call for 2000 paths with 100 timesteps each.
library("NMOF")
library("zoo") ## for plotting
timesteps <- 100
system.time(b <- NMOF::gbm(2000, tau = 1, timesteps = 100, r = 0, v = 1))
## user system elapsed
## 0.013 0.000 0.013
dim(b) ## each column is one path, starting at time zero
## [1] 101 2000
plot(zoo(b[, 1:5], 0:timesteps), plot.type = "single")
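For the arithmetic Brownian motion that the original loop simulates, a vectorised base-R sketch along these lines drops both loops by drawing every increment up front and taking cumsum per column. This is an illustration, not the NMOF implementation; ts_brownian_motion_fast is a made-up name, and the column names simply mirror the original output.
ts_brownian_motion_fast <- function(.time = 100, .num_sims = 10,
                                    .delta_time = 1, .initial_value = 0) {
  steps <- .time / .delta_time
  # draw every increment at once: rows = time steps, columns = simulations
  increments <- matrix(rnorm(steps * .num_sims, mean = 0, sd = sqrt(.delta_time)),
                       nrow = steps, ncol = .num_sims)
  # cumulative sums give the paths; prepend the starting value as row one
  paths <- rbind(rep(.initial_value, .num_sims),
                 .initial_value + apply(increments, 2, cumsum))
  data.frame(
    sim_number = factor(rep(seq_len(.num_sims), each = steps + 1)),
    t          = rep(seq(0, .time, by = .delta_time), times = .num_sims),
    y          = as.vector(paths)
  )
}
# example call: the single vectorised rnorm() draw avoids the per-step rnorm()
# and repeated rbind() overhead of the looped version
# x <- ts_brownian_motion_fast(.time = 100, .num_sims = 2000)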

Elbow/knee in a curve in R

I've got this data processing:
library(text2vec)
## Using perplexity for hold-out set
t1 <- Sys.time()
perplex <- c()
for (i in 3:25) {
  set.seed(17)
  lda_model2 <- LDA$new(n_topics = i)
  doc_topic_distr2 <- lda_model2$fit_transform(x = dtm, progressbar = F)
  set.seed(17)
  sample.dtm2 <- itoken(rawsample$Abstract,
                        preprocessor = prep_fun,
                        tokenizer = tok_fun,
                        ids = rawsample$id,
                        progressbar = F) %>%
    create_dtm(vectorizer, vtype = "dgTMatrix", progressbar = FALSE)
  set.seed(17)
  new_doc_topic_distr2 <- lda_model2$transform(sample.dtm2, n_iter = 1000,
                                               convergence_tol = 0.001,
                                               n_check_convergence = 25,
                                               progressbar = FALSE)
  perplex[i] <- text2vec::perplexity(sample.dtm2,
                                     topic_word_distribution = lda_model2$topic_word_distribution,
                                     doc_topic_distribution = new_doc_topic_distr2)
}
print(difftime(Sys.time(), t1, units = 'sec'))
I know there are a lot of questions like this, but I haven't been able to find an answer that exactly fits my situation. Above you see the perplexity calculation for topic numbers from 3 to 25 for a Latent Dirichlet Allocation model. I want to find the most suitable value among those, meaning that I want to find the elbow or knee. The values can be treated as a simple numeric vector, which looks like this:
1 NA
2 NA
3 222.6229
4 210.3442
5 200.1335
6 190.3143
7 180.4195
8 174.2634
9 166.2670
10 159.7535
11 153.7785
12 148.1623
13 144.1554
14 141.8250
15 138.8301
16 134.4956
17 131.0745
18 128.8941
19 125.8468
20 123.8477
21 120.5155
22 118.4426
23 116.4619
24 113.2401
25 114.1233
plot(perplex)
This is what the plot looks like (plot not reproduced here).
I would say that the elbow would be at 13 or 16, but I'm not completely sure, and I want the exact number as an outcome. I saw in this paper that the curvature f''(x) / (1 + f'(x)^2)^1.5 is the knee formula, which I tried like this, and it says it's 18:
> d1 <- diff(perplex) # first derivative
> d2 <- diff(d1) / diff(perplex[-1]) # second derivative
> knee <- (d2)/((1+(d1)^2)^1.5)
Warning message:
In (d2)/((1 + (d1)^2)^1.5) :
longer object length is not a multiple of shorter object length
> which.min(knee)
[1] 18
I can't fully figure this out. Would someone like to share how I could get the exact ideal number of topics according to perplexity as an outcome?
Found this: "The LDA model with the optimal coherence score, obtained with an elbow method (the point with maximum absolute second derivative) (...)" in this paper, so this code does the work:
d1 <- diff(perplex)
k <- which.max(abs(diff(d1) / diff(perplex[-1])))
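For a self-contained check (a sketch, not the exact code above): with evenly spaced topic numbers the discrete second derivative is simply diff(..., differences = 2), and the elbow is the point where its absolute value is largest, after dropping the leading NAs.
topics <- 3:25
p <- perplex[topics]                  # drop the NA entries for 1 and 2 topics
d2 <- diff(p, differences = 2)        # discrete second derivative (unit spacing)
topics[which.max(abs(d2)) + 1]        # d2[i] is centred on the (i + 1)-th point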

BTYD Individual Level Estimations For All Observations

I am using the BTYD package's BG/NBD model in R and did the individual-level estimates.
For instance, following the documentation on page 20 of:
BTYD Walkthrough
Code for Data Prep:
library(BTYD)
library(dplyr)
library(lubridate)

system.file("data/cdnowElog.csv", package = "BTYD") %>%
  dc.ReadLines(., cust.idx = 2, date.idx = 3, sales.idx = 5) %>%
  dc.MergeTransactionsOnSameDate() %>%
  mutate(date = parse_date_time(date, "%Y%m%d")) -> elog

end.of.cal.period <- as.Date("1997-09-30")
elog.cal <- elog[which(elog$date <= end.of.cal.period), ]

split.data <- dc.SplitUpElogForRepeatTrans(elog.cal)
birth.periods <- split.data$cust.data$birth.per
last.dates <- split.data$cust.data$last.date
clean.elog <- split.data$repeat.trans.elog

freq.cbt <- dc.CreateFreqCBT(clean.elog)
tot.cbt <- dc.CreateFreqCBT(elog)
cal.cbt <- dc.MergeCustomers(tot.cbt, freq.cbt)

cal.cbs.dates <- data.frame(birth.periods, last.dates, end.of.cal.period)
cal.cbs <- dc.BuildCBSFromCBTAndDates(cal.cbt, cal.cbs.dates, per = "week")

params <- pnbd.EstimateParameters(cal.cbs)
one could get estimates for a particular observation.
Code for Individual Level Estimation:
cal.cbs["1516",]
# x t.x T.cal
# 26.00 30.86 31.00
x <- cal.cbs["1516", "x"]
t.x <- cal.cbs["1516", "t.x"]
T.cal <- cal.cbs["1516", "T.cal"]
bgnbd.ConditionalExpectedTransactions(params, T.star = 52,
x, t.x, T.cal)
# [1] 25.76
My question is: is it possible to run this over every row, so that I get a data frame containing the expectation for each customer, instead of hard-coding a particular ID number such as "1516"?
Thanks!
Yes, it is straightforward with dplyr's mutate()
cal.cbs %>%
  data.frame() %>%
  mutate(`Conditional Expectation` = bgnbd.ConditionalExpectedTransactions(
    params, T.star = 52, x, t.x, T.cal))
x t.x T.cal Conditional Expectation
1 2 30.428571 38.85714 2.3224971
2 1 1.714286 38.85714 1.0646350
3 0 0.000000 38.85714 0.5607707
4 0 0.000000 38.85714 0.5607707
5 0 0.000000 38.85714 0.5607707
6 7 29.428571 38.85714 6.0231497
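A base-R variant of the same idea works because bgnbd.ConditionalExpectedTransactions() is applied element-wise over x, t.x and T.cal. This is a sketch assuming the params and cal.cbs objects built above; cbs_df and the new column name are just illustrative.
cbs_df <- as.data.frame(cal.cbs)
cbs_df$cond_expected_52wk <- bgnbd.ConditionalExpectedTransactions(
  params, T.star = 52,
  x = cbs_df$x, t.x = cbs_df$t.x, T.cal = cbs_df$T.cal)
head(cbs_df)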

apply.rolling window / loop with a rank function

How can I make a rolling window / loop (look-back period of 30 days / data points) while ranking the data with base::rank? As shown below, the apply.rolling function does not seem to work.
See example below:
# example data
require(xts)
set.seed(3)
A <- matrix(runif(900, max=30), ncol=3)
Data <- xts(A, Sys.Date()-300:1)
names(Data) <- c("C1", "C2", "C3")
This results in (only last 7 days / data points are shown):
2016-06-20 16.71131510 12.80074552 19.27525535
2016-06-21 22.92512330 25.11613536 17.45237229
2016-06-22 20.09403965 17.20945809 28.06481040
2016-06-23 28.68593738 4.84698272 18.36108782
2016-06-24 15.52956209 25.54946621 3.97892474
2016-06-25 25.76582707 18.14117193 8.17883282
2016-06-26 25.23925100 16.07418907 15.35118717
I select only the last 30 data points:
rolldata30 <- tail(Data[,2:3], 30)
rollindex30 <- tail(Data[,1], 30)
I rank the data (the last 30 data points) of vectors C2 and C3 based on their original values, i.e. the period 2016-05-28 until 2016-06-26. Then I make a new vector which calculates an average of the two.
factorx shows the result I am interested in.
rank30 <- as.xts(apply(-rolldata30, 2, rank, na.last= "keep"))
factor <- cbind(rollindex30, global = rowMeans(rank30))
factorx <- last(factor)
Which results in:
2016-06-20 16.711315 14.5
2016-06-21 22.925123 9.5
2016-06-22 20.094040 9.0
2016-06-23 28.685937 19.0
2016-06-24 15.529562 15.0
2016-06-25 25.765827 18.5
2016-06-26 25.239251 17.0
with data on the last day:
C1 global
2016-06-26 25.23925 17
How can I make the calculation rolling, in order to make the same calculation for 2016-05-27 until 2016-06-26, 2016-05-26 until 2016-06-25, etc.?
Using PerformanceAnalytics::apply.rolling gives an error:
Error in xts(x, order.by = order.by, frequency = frequency, .CLASS = "double", :
order.by requires an appropriate time-based object
require(PerformanceAnalytics)
test1 <- apply.rolling(Data, width=30, gap=30, by=1, FUN=function(x) as.xts(-x, 2, rank))
I made the following function. factorz gives the same result. Perhaps the function helps to make it rolling?
rollrank <- function(x) {
  a <- tail(x, 30)
  b <- as.xts(apply(-a, 2, rank, na.last = "keep"))
  c <- cbind(a, global = rowMeans(b))
  d <- last(c)
  return(d)
}
factorz <- rollrank(Data[,2:3])
The FUN argument to apply.rolling doesn't make sense. I suspect you meant FUN = function(x) as.xts(apply(-x, 2, rank, na.last="keep")). But that still will not work because FUN returns an object with more than one row.
Your rollrank function comes very close to what you need, and I recommend you use rollapply instead of apply.rolling. I suggest that you make a function based on your first example, then pass that function to rollapply.
myrank <- function(x) {
  rolldata30 <- x[, 2:3]
  rollindex30 <- x[, 1]
  rank30 <- as.xts(apply(-rolldata30, 2, rank, na.last = "keep"))
  factor <- cbind(rollindex30, global = rowMeans(rank30))
  factorx <- last(factor)
  return(factorx)
}
test1 <- rollapply(Data, 30, myrank, by.column=FALSE)
tail(test1)
# C1 global
# 2016-06-23 7.806336 19.5
# 2016-06-24 17.456436 17.5
# 2016-06-25 29.196350 12.5
# 2016-06-26 25.185687 11.0
# 2016-06-27 19.775105 6.5
# 2016-06-28 12.067774 16.0
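If the intent is a trailing window (the 30 observations ending on each date, as in the tail() example above), a right-aligned variant may be what you want. This is a sketch using zoo's rollapplyr(), which is rollapply() with align = "right"; test2 is just an illustrative name.
test2 <- rollapplyr(Data, 30, myrank, by.column = FALSE)
tail(test2, 3)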

view values used by function boot to bootstrap estimates

I have written the code below to obtain a bootstrap estimate of a mean. My objective is to view the numbers selected from the data set, ideally in the order they are selected, by the function boot in the boot package.
The data set only contains three numbers: 1, 10, and 100 and I am only using two bootstrap samples.
The estimated mean is 23.5 and the R code below indicates that the six numbers included one '1', four '10' and one '100'. However, there are 30 possible combinations of those numbers that would have resulted in a mean of 23.5.
Is there a way for me to determine which of those 30 possible combinations is the combination that actually appeared in the two bootstrap samples?
library(boot)
set.seed(1234)
dat <- c(1, 10, 100)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 2)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 2)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 37 -13.5 19.09188
#
mean(dat) + -13.5
# [1] 23.5
# The two samples must have contained one '1', four '10' and one '100',
# but there are 30 possibilities.
# Which of these 30 possible sequences actual occurred?
# This code shows there must have been one '1', four '10' and one '100'
# and shows the 30 possible combinations
my.combos <- expand.grid(V1 = c(1, 10, 100),
                         V2 = c(1, 10, 100),
                         V3 = c(1, 10, 100),
                         V4 = c(1, 10, 100),
                         V5 = c(1, 10, 100),
                         V6 = c(1, 10, 100))
my.means <- apply(my.combos, 1, function(x) {
  ((x[1] + x[2] + x[3]) / 3 + (x[4] + x[5] + x[6]) / 3) / 2
})
possible.samples <- my.combos[my.means == 23.5, ]
dim(possible.samples)
n.1 <- rowSums(possible.samples == 1)
n.10 <- rowSums(possible.samples == 10)
n.100 <- rowSums(possible.samples == 100)
n.1[1]
n.10[1]
n.100[1]
length(unique(n.1)) == 1
length(unique(n.10)) == 1
length(unique(n.100)) == 1
I think you can determine the numbers sampled and the order in which they are sampled with the code below. You have to extract the function ordinary.array from the boot package and paste that function into your R code. Then specify the values for n, R and strata, where n is the number of observations in the data set and R is the number of replicate samples you want.
I do not know how general this approach is, but it worked with a couple of simple examples I tried, including the example below.
library(boot)
set.seed(1234)
dat <- c(1, 10, 100, 1000)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 3)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 3)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 277.75 -127.5 132.2405
#
#
mean(dat) + -127.5
# [1] 150.25
# boot:::ordinary.array
ordinary.array <- function(n, R, strata) {
  inds <- as.integer(names(table(strata)))
  if (length(inds) == 1L) {
    output <- sample.int(n, n * R, replace = TRUE)
    dim(output) <- c(R, n)
  } else {
    # note: bsample() is internal to the boot package (boot:::bsample);
    # this branch is only reached when there is more than one stratum
    output <- matrix(as.integer(0L), R, n)
    for (is in inds) {
      gp <- seq_len(n)[strata == is]
      output[, gp] <- if (length(gp) == 1) rep(gp, R) else bsample(gp, R * length(gp))
    }
  }
  output
}
# I think the function ordinary.array determines which elements
# of the data are sampled in each of the R samples
set.seed(1234)
ordinary.array(n=4,R=3,1)
# [,1] [,2] [,3] [,4]
# [1,] 1 3 1 3
# [2,] 3 4 1 3
# [3,] 3 3 3 3
#
# which equals:
((1+100+1+100) / 4 + (100+1000+1+100) / 4 + (100+100+100+100) / 4) / 3
# [1] 150.25
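An alternative sketch that avoids digging into boot internals: have the statistic itself record the index vector it receives on each call (picked is just an illustrative name, shown here with the original three-number example). Note that boot() evaluates the statistic once on the original data before drawing the replicates, so the first recorded element is simply 1:n.
library(boot)
set.seed(1234)
dat <- c(1, 10, 100)
picked <- list()
av <- function(dat, i) {
  picked[[length(picked) + 1]] <<- i   # record the indices drawn for this call
  sum(dat[i]) / length(dat[i])
}
av.boot <- boot(dat, av, R = 2)
picked                                 # first element is the original 1:3; the rest are the replicates
lapply(picked, function(i) dat[i])     # the actual values, in the order they were drawn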
