apply.rolling window / loop with a rank function - r

How can I make a rolling window / loop (look-back period 30 days / data points) while ranking the data with base::rank? See below that the apply.rolling function seems not to work.
See example below:
# example data
require(xts)
set.seed(3)
A <- matrix(runif(900, max=30), ncol=3)
Data <- xts(A, Sys.Date()-300:1)
names(Data) <- c("C1", "C2", "C3")
This results in (only last 7 days / data points are shown):
2016-06-20 16.71131510 12.80074552 19.27525535
2016-06-21 22.92512330 25.11613536 17.45237229
2016-06-22 20.09403965 17.20945809 28.06481040
2016-06-23 28.68593738 4.84698272 18.36108782
2016-06-24 15.52956209 25.54946621 3.97892474
2016-06-25 25.76582707 18.14117193 8.17883282
2016-06-26 25.23925100 16.07418907 15.35118717
I select only the last 30 data points:
rolldata30 <- tail(Data[,2:3], 30)
rollindex30 <- tail(Data[,1], 30)
I rank the data (last 30 data points) of vector C2 and C3 based on their original values. Thus this is the period 2016-05-28 until 2016-6-26. Then I make a new vector which calculates an average of the two.
factorx shows the result I am interested in.
rank30 <- as.xts(apply(-rolldata30, 2, rank, na.last= "keep"))
factor <- cbind(rollindex30, global = rowMeans(rank30))
factorx <- last(factor)
Which results in:
2016-06-20 16.711315 14.5
2016-06-21 22.925123 9.5
2016-06-22 20.094040 9.0
2016-06-23 28.685937 19.0
2016-06-24 15.529562 15.0
2016-06-25 25.765827 18.5
2016-06-26 25.239251 17.0
with data on the last day:
C1 global
2016-06-26 25.23925 17
How can I make the calculation rolling in order to make the same calculation for 2016-5-27 until 2016-06-26, 2016-05-26 until 2016-06-25, etc.?
Using PerformanceAnalytics::apply.rolling gives an error:
Error in xts(x, order.by = order.by, frequency = frequency, .CLASS = "double", :
order.by requires an appropriate time-based object
require(PerformanceAnalytics)
test1 <- apply.rolling(Data, width=30, gap=30, by=1, FUN=function(x) as.xts(-x, 2, rank))
I made the following function. factorz gives the same result. Perhaps the function helps to make it rolling?
rollrank <- function(x)
{
a <- tail(x, 30)
b <- as.xts(apply(-a, 2, rank, na.last= "keep"))
c <- cbind(a, global = rowMeans(b))
d <- last(c)
return(d)
}
factorz <- rollrank(Data[,2:3])

The FUN argument to apply.rolling doesn't make sense. I suspect you meant FUN = function(x) as.xts(apply(-x, 2, rank, na.last="keep")). But that still will not work because FUN returns an object with more than one row.
Your rollrank function comes very close to what you need, and I recommend you use rollapply instead of apply.rolling. I suggest that you make a function based on your first example, then pass that function to rollapply.
myrank <- function(x) {
rolldata30 <- x[,2:3]
rollindex30 <- x[,1]
rank30 <- as.xts(apply(-rolldata30, 2, rank, na.last= "keep"))
factor <- cbind(rollindex30, global = rowMeans(rank30))
factorx <- last(factor)
return(factorx)
}
test1 <- rollapply(Data, 30, myrank, by.column=FALSE)
tail(test1)
# C1 global
# 2016-06-23 7.806336 19.5
# 2016-06-24 17.456436 17.5
# 2016-06-25 29.196350 12.5
# 2016-06-26 25.185687 11.0
# 2016-06-27 19.775105 6.5
# 2016-06-28 12.067774 16.0

Related

How can I get the coefficients from nlsList into a dataframe?

Is there a way to extract just the estimates from nlsList()?
Sample data:
library(nlme)
dat<-read.table(text="time gluc starch solka
1 6.32 7.51 1.95
2 20.11 25.49 6.43
3 36.03 47.53 10.39
6 107.52 166.31 27.01
12 259.28 305.19 113.72
24 283.40 342.56 251.14
48 297.55 353.66 314.22", header = TRUE)
long <- tidyr::pivot_longer(dat, -1, values_to = "y")
long$name <- factor(long$name)
st0 <- list(Max = 200, k = 0.1, Lag = 0.5)
kinetics<-nlsList(y ~ (time > Lag) * Max * (1-exp(-k * (time - Lag))) | name, long, start = st0)
I would like to end up with a data frame like the image below of the samples and their estimates for Max, k, and Lag but cannot figure out how.
We could extract the coef and then loop over the 3d array with apply
library(nlme)
m1 <- apply(summary(kinetics)$coef, 3, function(x) x[,1])
dat <- transform(as.data.frame(m1), name = row.names(m1))[c(4, 1:3)]
row.names(dat) <- NULL
-output
dat
name Max k Lag
1 gluc 299.6637 0.16155846 2.426204
2 solka 337.5416 0.06583197 4.966971
3 starch 353.7206 0.18416048 2.276593
Here is a simple way, just coerce the coefficients appropriate dimension to class "data.frame".
cf_smry <- coef(summary(kinetics))[, 1, ]
as.data.frame(cf_smry)
# Max k Lag
#gluc 299.6637 0.16155846 2.426204
#solka 337.5416 0.06583197 4.966971
#starch 353.7206 0.18416048 2.276593
coef(kinetics) gives a data frame so any of these would work and differ only in whether the names appear as row names (first one) or as a column (others).
coef(kinetics)
data.frame(name = names(kinetics), coef(kinetics), row.names = NULL)
tibble::rownames_to_column(coef(kinetics), var = "name")

R: Error in calculating the average of a variable at different time intervals for many factors using for loop

I have a data frame in which a variable(var1) is expressed over time in seconds. I want to calculate the mean of var1 for each sample at different time intervals (10 seconds interval until 500 seconds).
the dataframe looks like this:
sample time var1
S1 1 3.5
S1 2 6.3
S1 3 7.8
S1 4 20.5
S1 … ...
S1 530 4.5
S2 1 6.7
S2 2 20.3
S2 3 5.4
S2 … ...
S2 710 70.3
...
The data frame that I want to obtain looks like this
Sample var1_mean10:20sec var1_mean20:30sec .... var1_mean490:500sec
S1
S2
..
So I wrote this code:
setwd("…")
A <- read_excel("dati.xlsx")
for (cat in unique(A$sample))
{
A.s <- subset(A, A$sample == cat)
cuts <- cut (A.s$time, breaks=seq.int(from = 0, to = 500, by = 10))
d <- by (A.s$var1, cuts, mean)
Y<-data.frame(d)
j <- t(Y)
write.csv(Y, file = paste(cat, "var1", sep = "_"))
}
But when I run it I get Error message: Error in as.data.frame.default(x[[i]], optional = TRUE) : cannot coerce class ""by"" to a data.frame
The plan is to eventually merge all the different csv.
If I understood your problem correctly you are trying to average your data in 10 second interval. I would like to propose an alternative approach using the function aggregate to compute the mean across the 10 seconds interval. The 10 seconds interval would be created through a fictitious 'time' array used to group your 10 seconds interval and then averaging.
# try to create some data similar to yours
A <- data.frame(sample = c(rep('A1', 530), rep('A2', 710)),
time = c(1 : 530, 1:710), var1 = runif(530+710))
A$times <- ceiling(A$time / 10)
Y <- aggregate(var1 ~ sample + times, data = A, FUN = mean)
Then you could export tmp straightaway.
HTH
Solved :
A <- read_excel("data.xlsx")
n <- subset(A, time <= 500)
d<-data.frame(sample= n$sample, time= n$time, ms=n$var1)
storage.data<-data.frame(matrix(nrow = n, ncol = n))
for(cat in unique(d$sample)){
g <- subset(d, d$sample == cat)
cuts <- cut (g$time, breaks=seq.int(from = 0, to = 500, by = 10))
p <- by (g$ms, cuts, mean)
storage.data[cat] = p}
View(storage.data)
storage.data_t <- t(storage.data)
View(storage.data_t)
write.csv(storage.data_t, file = "filename.csv")

BTYD Individual Level Estimations For All Observations

I am using BTYD BG NBD in R and did the individual level estimates.
For instance following the documentation in page 20 of:
BTYD Walkthrough
Code for Data Prep:
system.file("data/cdnowElog.csv", package = "BTYD")%>%
dc.ReadLines(., cust.idx = 2, date.idx = 3, sales.idx = 5)%>%
dc.MergeTransactionsOnSameDate()%>%
mutate(date = parse_date_time(date, "%Y%m%d")) -> elog
end.of.cal.period <- as.Date("1997-09-30")
elog.cal <- elog[which(elog$date <= end.of.cal.period), ]
split.data <- dc.SplitUpElogForRepeatTrans(elog.cal);
birth.periods <- split.data$cust.data$birth.per
last.dates <- split.data$cust.data$last.date
clean.elog <- split.data$repeat.trans.elog;
freq.cbt <- dc.CreateFreqCBT(clean.elog);
tot.cbt <- dc.CreateFreqCBT(elog)
cal.cbt <- dc.MergeCustomers(tot.cbt, freq.cbt)
cal.cbs.dates <- data.frame(birth.periods, last.dates, end.of.cal.period)
cal.cbs <- dc.BuildCBSFromCBTAndDates(cal.cbt, cal.cbs.dates,per="week")
params <- pnbd.EstimateParameters(cal.cbs);
one could get estimates for a particular observation.
Code for Individual Level Estimation:
cal.cbs["1516",]
# x t.x T.cal
# 26.00 30.86 31.00
x <- cal.cbs["1516", "x"]
t.x <- cal.cbs["1516", "t.x"]
T.cal <- cal.cbs["1516", "T.cal"]
bgnbd.ConditionalExpectedTransactions(params, T.star = 52,
x, t.x, T.cal)
# [1] 25.76
My question is, is it possible to recursively run this such that I could get a data frame containing the expectations for each row instead of hard coding a particular ID number such as "1516" in this case?
Thanks!
Yes, it is straightforward with dplyr's mutate()
cal.cbs%>%
data.frame()%>%
mutate(`Conditional Expectation` = bgnbd.ConditionalExpectedTransactions(params, T.star = 52, x, t.x, T.cal))
x t.x T.cal Conditional Expectation
1 2 30.428571 38.85714 2.3224971
2 1 1.714286 38.85714 1.0646350
3 0 0.000000 38.85714 0.5607707
4 0 0.000000 38.85714 0.5607707
5 0 0.000000 38.85714 0.5607707
6 7 29.428571 38.85714 6.0231497

Using a vector as a variable in a formula

I've got a function which acts on several columns, but I'd like to adapt it to use a different value of the main variable, mode, for each column. I have put a simplified example below.
My data is a cross tabulation of frequency i.e. in column A01 there are 6485 counts of 13 CAGs, 35 counts of 14 CAGs etc. The modal value for column 1 is therefore 13.
I need to calculate:
1) Skewness using (mean - mode)/sd
2) Proportion of each column where CAG is > than the mode
The code below works for that. However, I now need to compare each sample to the mode of a control sample and I'm a bit stuck with the code. The sample against which each needs to be compared is defined in the table controls. Could I please ask for help adapting my code so that skewmode and prop are calculated using the appropriate control mode for each column? I hope that makes sense!
#Data set
data <- data.frame(CAG = c(13, 14, 15, 17),
A01 = c(6485,35,132, 12),
A02 = c(0,42,56, 4))
#Mode
mode <- data[sapply(data[2:ncol(data)], which.max), ]$CAG
#Summary statistics
sumstats <- sapply(data[, 2:ncol(data)], function(x) {
data_e <- rep(data$CAG, x)
library(psych)
data.frame(
describe(data_e)
)
})
sumstats <- as.data.frame(t(sumstats))
sumstats[] <- lapply(sumstats, function(x) {
as.numeric(x)
})
# Results table
results <- data.frame(mode, sumstats)
# Skewness - I'd like to replace 'results$mode' here
# with the relevant mode from the controls table
skewmode <- (results$mean - results$mode) / results$sd
# Proportion > mode I'd like to replace 'mod' here
# with the relevant mode from the controls table
prop <- lapply(data[, 2:ncol(data)], function(x) {
mod <- data$CAG[which.max(x)]
B <- sum(x[data$CAG >= mod])
A <- sum(x[data$CAG <= mod])
B/(A+B)
})
prop <- as.data.frame(prop)
prop <- t(prop)
results <- data.frame(mode, sumstats, skewmode, prop)
# Controls
ctrls <- data.frame(samples = c('A01', 'A02', 'A03', 'A04'),
ctrl = c('A01','A01', 'A03', 'A03'))
Consider Map (the wrapper to mapply) which passes both sample mode and control mode iteratively into a defined function, prop_skew_calc(), to calculate skewmode and prop. At the end, outputs a list of dataframes for final row bind.
NOTE: below demonstrates with base R's summary() since I do not have the psyche package. However, I leave a comment in code on how to integrate psych::describe() which docs indicate returns a dataframe of summary stats useful for psychometrics:
Data (adding A03 and A04)
#Data set
data <- data.frame(CAG = c(13, 14, 15, 17),
A01 = c(6485,35,132, 12),
A02 = c(0,42,56, 4),
A03 = c(33,5014,2221, 18),
A04 = c(106,89,436, 11))
#Controls
ctrls <- data.frame(samples = c('A01', 'A02', 'A03', 'A04'),
ctrl = c('A01','A01', 'A03', 'A03'))
Function (removes any l/sapply looping since scalar values will be passed iteratively by Map)
library(psych)
prop_skew_calc <- function(x, y) {
#Mode
samplemode <- data$CAG[which.max(data[[x]])]
cntrlmode <- data$CAG[which.max(data[[y]])]
#Summary statistics
sumstats <- summary(rep(data$CAG, data[[x]])) # R base's summary()
sumstats <- as.data.frame(t(unclass(sumstats)))
#sumstats <- describe(rep(data$CAG, data[[x]])) # pysche's describe()
#sumstats <- as.data.frame(t(sumstats))
# Results table
results <- data.frame(cntrlmode, sumstats)
# Skewness
skewmode <- (results$Mean - results$cntrlmode) / results$Min
# Proportion
B <- sum(data[data$CAG >= cntrlmode, x])
A <- sum(data[data$CAG <= cntrlmode, x])
prop <- B/(A+B)
results <- data.frame(samplemode, cntrlmode, sumstats, skewmode, prop=prop)
}
Map (calling above function, passing columns of ctrl dataframe)
dfList <- Map(prop_skew_calc, ctrls$samples, ctrls$ctrl)
finaldf <- do.call(rbind, dfList)
finaldf
# samplemode cntrlmode Min. X1st.Qu. Median Mean X3rd.Qu. Max. skewmode prop
# 1 17 17 13 14 15 14.90 17 17 -0.1615385 0.223684211
# 2 13 17 13 13 13 13.05 13 17 -0.3038462 0.001797484
# 3 15 13 14 14 15 14.67 15 17 0.1192857 1.000000000
# 4 14 13 13 14 14 14.31 15 17 0.1007692 0.995491187

Using rollapply and lm over multiple columns of data

I have a data frame similar to the following with a total of 500 columns:
Probes <- data.frame(Days=seq(0.01, 4.91, 0.01), B1=5:495,B2=-100:390, B3=10:500,B4=-200:290)
I would like to calculate a rolling window linear regression where my window size is 12 data points and each sequential regression is separated by 6 data points. For each regression, "Days" will always be the x component of the model, and the y's would be each of the other columns (B1, followed by B2, B3, etc). I would then like to save the co-efficients as a dataframe with the existing column titles (B1, B2, etc).
I think my code is close, but is not quite working. I used rollapply from the zoo library.
slopedata<-rollapply(zoo(Probes), width=12, function(Probes) {
coef(lm(formula=y~Probes$Days, data = Probes))[2]
}, by = 6, by.column=TRUE, align="right")
If possible, I would also like to have the "xmins" saved to a vector to add to the dataframe. This would mean the smallest x value used in each regression (basically it would be every 6 numbers in the "Days" column.)
Thanks for your help.
1) Define a zoo object z whose data contains Probes and whose index is taken from the first column of Probes, i.e. Days. Noting that lm allows y to be a matrix define a coefs function which computes the regression coefficients. Finally rollapply over z. Note that the index of the returned object gives xmin.
library(zoo)
z <- zoo(Probes, Probes[[1]])
coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1])))))
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")
giving:
> head(rz)
B11 B12 B21 B22 B31 B32 B41 B42
0.01 4 100 -101 100 9 100 -201 100
0.07 4 100 -101 100 9 100 -201 100
0.13 4 100 -101 100 9 100 -201 100
0.19 4 100 -101 100 9 100 -201 100
0.25 4 100 -101 100 9 100 -201 100
0.31 4 100 -101 100 9 100 -201 100
Note that DF <- fortify.zoo(rz) could be used if you needed a data frame representation of rz.
2) An alternative somewhat similar approch would be to rollaplly over the row numbers:
library(zoo)
y <- as.matrix(Probes[-1])
Days <- Probes$Days
n <- nrow(Probes)
coefs <- function(ix) c(unlist(as.data.frame(coef(lm(y ~ Days, subset = ix)))),
xmins = Days[ix][1])
r <- rollapply(1:n, 12, by = 6, coefs)
try this:
# here are the xmin values you wanted
xmins <- Probes$Days[seq(1,nrow(Probes),6)]
# here we build a function that will run regressions across the columns
# y1 vs x, y2 vs x, y3 vs x...
# you enter the window and by (12/6) in order to limit the interval being
# regressed. this is later called in do.call
runreg <- function(Probes,m,window=12,by=6){
# beg,end are used to specify the interval
beg <- seq(1,nrow(Probes),by)[m]
end <- beg+window-1
# this is used to go through all the columns
N <- ncol(Probes)-1
tmp <- numeric(N)
# go through each column and store the coefficients in tmp
for(i in 1:N){
y <- Probes[[i+1]][beg:end]
x <- Probes$Days[beg:end]
tmp[i] <- coef(lm(y~x))[2][[1]]
}
# put all our column regressions into a dataframe
res <- rbind('coeff'=tmp)
colnames(res) <- colnames(Probes)[-1]
return(res)
}
# now that we've built the function to do the column regressions
# we just need to go through all the window-ed regressions (row regressions)
res <- do.call(rbind,lapply(1:length(xmins),function(m) runreg(Probes,m)))
# these rownames are the index of the xmin values
rownames(res) <- seq(1,nrow(Probes),6)
res <- data.frame(res,xmins)
You can also use the rollRegres package as follows
# setup data
Probes <- data.frame(
# I changed the days to be intergers
Days=seq(1L, 491L, 1L),
B1=5:495, B2=-100:390, B3=10:500 , B4=-200:290)
# setup grp argument
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)
# estimate coefs. width argument is realtive in grp units
library(rollRegres)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)
# only keep the complete.cases and the unique values
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out)
#R B10 B11 B20 B21 B30 B31 B40 B41
#R [1,] 4 100 -101 100 9 100 -201 100
#R [2,] 4 100 -101 100 9 100 -201 100
#R [3,] 4 100 -101 100 9 100 -201 100
#R [4,] 4 100 -101 100 9 100 -201 100
#R [5,] 4 100 -101 100 9 100 -201 100
#R [6,] 4 100 -101 100 9 100 -201 100
The solution is a lot faster than e.g., the zoo solution
library(zoo) coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1]))))) microbenchmark::microbenchmark( rollapply = {
z <- zoo(Probes, Probes[[1]])
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left") }, roll_regres = {
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out) } )
#R Unit: microseconds
#R expr min lq mean median uq max neval
#R rollapply 53392.614 56330.492 59793.106 58363.2825 60902.938 119206.76 100
#R roll_regres 865.186 920.297 1074.161 983.9015 1047.705 5071.41 100
At present you though need to install the package from Github due to an error in the validation in version 0.1.0. Thus, run
devtools::install_github("boennecd/rollRegres", upgrade_dependencies = FALSE,
build_vignettes = TRUE)

Resources