I have a data frame with more then 300mil rows and I want to detect anomaly in each group, which is consist of country and IDs (Each group) then I wrote the following code to detect anomaly points however it takes a lot time. Could you please suggest any other option that make it faster. The data frame format:
df <- data.frame("id" = 1:n,"country"= ("US",..),"date"=("2021-01-01",..),"value"=c(10,....))
registerDoParallel()
groupColumns <- c("country","id")
system.time(temp_anom <- ddply(df, groupColumns, function(x){
x <- x[,c('date','value')]
resid.q <- quantile(x$value,prob = c(0.1,0.90))
iqr <- diff(resid.q)
limits <- resid.q + 3 * iqr * c(-1,1)
lower_bound <- limits[1]
upper_bound <- limits[2]
outlier_dip_index <- dplyr::filter(x, value < lower_bound) %>% data.frame()
if (nrow(outlier_dip_index) > 0) {
outlier_dip_index$status <- "dip"}
outlier_spike_index <- dplyr::filter(x, value > upper_bound) %>% data.frame()
if (nrow(outlier_spike_index) > 0) {
outlier_spike_index$status <- "spike"
outlier <- rbind(outlier_spike_index,outlier_dip_index)
outlier
}
},.paralle = T))
For increasing the speed of parallel computing we need to ind the optimum number of cores in Doparallel, in this case the optimum is 5. only by modifying the code like below we can see tremendous improvement in the seed.
doParallel::registerDoParallel(cores = 5)
system.time(temp_anom <- plyr::ldply(df$id, function(ids){
title_dataset <- df[which(df$short_id == ids),]
result_dataset <- plyr::ldply(title_dataset$country, function(iso){
country_dataset <- title_dataset[which(title_dataset$country == iso),]
resid.q <- quantile(country_dataset$raw_de, prob = c(0.1, 0.90))
iqr <- diff(resid.q)
limits <- resid.q + 3 * iqr * c(-1,1)
temp_dataset <- data.frame(country = iso, lower_bound = limits[1], upper_bound = limits[2])
temp_dataset
})
result_dataset$id <- ids
result_dataset
}, .parallel = T))
Related
require(deSolve)
require(reshape2)
require(ggplot2)
library(minpack.lm) # library for least squares fit using levenberg-marquart algorithm
N = 10000000
delta1=0.005
sigma1=0.15
#initial value of population 5 states
initial_state_values <- c(S = 1634485, V = 6300185, E = 701660, I = 637873, R = 616197)
#initial parameters
parameters <- c(beta1 = 0.0000000001, alpha1 = 0.1, gamma1 = 0.33, kappa1 = 0.0022)
#time intervals for specific period of days
time <- seq(from = 1, to = 60, by = 0.1) #to have a detailed solution, I've specified time
#plot observed data
data <- read.csv("observeddata3acc.csv")
tmp = melt(data,id.vars=c("time"),variable.name="States",value.name="Number_of_individuals")
ggplot(data=tmp, aes(x=time, y=Number_of_individuals, color=States)) +
geom_line(size=1)
# Input function: from the Differential equations that we have
SVEIR_fn <- function(time, initial_state_values, parameters) {
with(as.list(c(initial_state_values, parameters)), {
N <- S+V+E+I+R
dS <- delta1*R - kappa1*S-beta1*S*I/N
dV <- kappa1*S - beta1*sigma1*V*I/N
dE <- beta1*S*I/N + beta1*V*sigma1*I/N -gamma1*E
dI <- gamma1*E - I*alpha1
dR <- alpha1*I - delta1*R
return(list(c(dS, dV, dE, dI, dR)))
})
}
result <- as.data.frame(ode(y = initial_state_values,
times = time,
func = SVEIR_fn,
parms = parameters)
)
result
result$total_prevalenceI <- result$I
result$total_prevalenceR <- result$R
result$total_prevalenceS <- result$S
result$total_prevalenceE <- result$E
result$total_prevalenceV <- result$V
# Distance Function measured with data
data <- read.csv("observeddata3acc.csv")
SVEIR_SSQ <- function(parameters, data) {
result <- as.data.frame(ode(y = initial_state_values,
times = time,
func = SVEIR_fn,
parms = parameters)
)
result$total_prevalenceI <- result$I
result$total_prevalenceR <- result$R
result$total_prevalenceS <- result$S
result$total_prevalenceE <- result$E
result$total_prevalenceV <- result$V
deltas_squareI <- (result$total_prevalenceI[result$time %in% data$time] - data$II)^2
deltas_squareR <- (result$total_prevalenceR[result$time %in% data$time] - data$RR)^2
deltas_squareS <- (result$total_prevalenceS[result$time %in% data$time] - data$SS)^2
deltas_squareE <- (result$total_prevalenceE[result$time %in% data$time] - data$EE)^2
deltas_squareV <- (result$total_prevalenceV[result$time %in% data$time] - data$VV)^2
SSQ <- sum(deltas_squareI+deltas_squareR+deltas_squareS+deltas_squareE+deltas_squareV)
return(SSQ)
}
SSQ
#fivalue=nls.lm(par=c(beta1 = 0.0000000001, alpha1 = 0.1, gamma1 = 0.33, kappa1 = 0.0022), `enter code here`fn=SVEIR_SSQ)
fitval=nls.lm(par=parms,lower=c(0.000000001, 1/15, 1/5, 0.0011), upper=c(1.0, 1/7 , 1/2, `enter code here`0.05), fn=SVEIR_SSQ, data)
I couldn't get output or result due to the error:
Error in result$time %in% data$time :
argument "data" is missing, with no default
How to solve the issue and could I use melt function my function to be optimized is the square of residuals (difference between the observed and simulated ones.
Regads
I used the following r code to determine the coverage probability.
theta <- seq(0,1, length = 100)
CD_theta <- function(y, p, n){
1 - pbinom (y, size = n, prob = p) + 1/2*dbinom(y, size = n, prob = p)
}
y <- 5
n <- 100
phat <- y/n
mytheta <- CD_theta(5, theta, 100)
set.seed(650)
ci <- list()
n <- 100
B <- 1000
result = rep(NA, B)
all_confInt <- function(B) {
for (i in 1:B){
boot.sample <- sample(mytheta, replace = TRUE)
lower <- theta[which.min(abs(boot.sample - .025))]
upper <- theta[which.min(abs(boot.sample - .975))]
ci[[i]] <- data.frame(lowerCI = lower, upperCI = upper)
intervals <- unlist(ci)
}
return(intervals)
}
df <- data.frame(matrix(all_confInt(B), nrow=B, byrow=T))
colnames(df)[1] <- "Lower"
colnames(df)[2] <- "Upper"
names(df)
dim(df)
mean(df$Lower < phat & df$Upper > phat)*100
However, I obtained 6.4% which is too low. Why am I getting really lower percentage?. Is there any problem in the r function?
I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame( SID=sample(1:150,400,replace=TRUE),age=sample(50:70,400,replace=TRUE), sex=sample(c("Male","Female"),200, replace=TRUE),time= seq(0.7, 6.2, length.out=400), Vol =rnorm(400),HCD =rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to compute and plot marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects-package, there's a ggpredict()-function, where you can compute marginal effects at specific values. Once you know the sd of your model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
Edit
Here are the results, based on your reproducible example (note that GitHub-Version is currently required!):
# requires at least the GitHub-Versiob 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
Vol ~ age + sex * time + time * HCD,
random = ~ time |
SID,
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
Vol ~ age + sex * time + time * HCD + (time |
SID),
control = lmerControl(
check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE = "ignore"
),
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# lmer also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots
I'm trying to create a dataframe computing 10 percentiles based on the Recency, Frequency and Monetary. I have most of it set up, but I can't figure out why my code is returning three NTILES, when I'm asking for 10. I'm currently at a stand still. The next step will be calculating the percentage of customers in each ntile.
Here is my code:
rm(list = ls())
setwd("/Users/a76475/Documents/Customer_Analytics")
rfm<-read.csv("cdnow_students_transaction.csv")
#Set up test and calibration samples
rfm$DATE <- as.Date(rfm$DATE, format = "%m/%d/%y")
calib <- subset(rfm, rfm$DATE<"1997-09-29")
valid <- subset(rfm, rfm$DATE>"1997-09-30")
#Aggregate for frequency, monetary, and recency -- Calibration
recency<- aggregate(DATE ~ ID, data =calib, FUN = max)
colnames(recency) <- c("ID","Recency")
frequency <- aggregate(DOLLARS ~ ID, data =calib, FUN = length)
colnames(frequency) <- c("ID","Frequency")
monetary <- aggregate(DOLLARS ~ ID, data =calib, FUN = mean)
colnames(frequency) <- c("ID","Monetary")
calib <- merge(frequency, monetary, by = "ID")
calib <- merge(calib, recency, by = "ID")
#Aggregate for frequency, monetary, and recency -- Validation
recency<- aggregate(DATE ~ ID, data =valid, FUN = max)
colnames(recency) <- c("ID","Recency")
frequency <- aggregate(DOLLARS ~ ID, data =valid, FUN = length)
colnames(frequency) <- c("ID","Frequency")
monetary <- aggregate(DOLLARS ~ ID, data =valid, FUN = mean)
colnames(frequency) <- c("ID","Monetary")
valid <- merge(frequency, monetary, by = "ID")
valid <- merge(valid, recency, by = "ID")
colnames(valid) <- c("ID","FREQ","MONETARY","RECENCY")
colnames(calib) <- c("ID","FREQ","MONETARY","RECENCY")
calib$RECENCY <- NULL
#Create recency score
#For validation
for (i in 1:nrow(valid)) {
valid$RECENCY1[i] = as.numeric(max(valid$RECENCY) - valid$RECENCY[i])
}
valid$RECENCY <- valid$RECENCY1
valid$RECENCY1 <- NULL
#For calibration
for (i in 1:nrow(calib)) {
calib$RECENCY1[i] = as.numeric(max(valid$RECENCY) - calib$RECENCY[i])
}
calib$RECENCY <- calib$RECENCY1
calib$RECENCY1 <- NULL
#Merge datasets
rfm <- merge(calib,valid, by="ID", all.x = TRUE)
#Create Column for retention%
require(dplyr)
rfm$monetary.ntile <- ntile(rfm$MONETARY.y,10)
rfm$freq.ntile <- ntile(rfm$FREQ.y,10)
rfm$recency.ntile <- ntile(rfm$RECENCY,10)
For example, if you want 10 buckets for Recency, Frequency and Monetary Ratio:
set.seed(1)
n <- 100
df <- data.frame(
R = runif(n, 1, 365),
F = runif(n, 1, 5),
M = runif(n, 0, 100)
)
apply(df, 2, function(col) {
breaks <- quantile(col, probs=seq(0, 1, length.out = 10))
findInterval(col, breaks)
})
I am trying to reproduce some results from the book "Financial Risk Modelling and Portfolio Optimisation with R" and I get an error that I can't seem to get my head around.
I get the following error in the COPPosterior function:
error in abs(alpha) : non-numeric argument to mathematical function
Is anyone able to see why I get the error?
The error is from the following script:
library(urca)
library(vars)
library(fMultivar)
## Loading data set and converting to zoo
data(EuStockMarkets)
Assets <- as.zoo(EuStockMarkets)
## Aggregating as month-end series
AssetsM <- aggregate(Assets, as.yearmon, tail, 1)
head(AssetsM)
## Applying unit root tests for sub-sample
AssetsMsub <- window(AssetsM, start = start(AssetsM),
end = "Jun 1996")
## Levels
ADF <- lapply(AssetsMsub, ur.df, type = "drift",
selectlags = "AIC")
ERS <- lapply(AssetsMsub, ur.ers)
## Differences
DADF <- lapply(diff(AssetsMsub), ur.df, selectlags = "AIC")
DERS <- lapply(diff(AssetsMsub), ur.ers)
## VECM
VEC <- ca.jo(AssetsMsub, ecdet = "none", spec = "transitory")
summary(VEC)
## Index of time stamps in back test (extending window)
idx <- index(AssetsM)[-c(1:60)]
ANames <- colnames(AssetsM)
NAssets <- ncol(AssetsM)
## Function for return expectations
f1 <- function(x, ci, percent = TRUE){
data <- window(AssetsM, start = start(AssetsM), end = x)
Lobs <- t(tail(data, 1))
vec <- ca.jo(data, ecdet = "none", spec = "transitory")
m <- vec2var(vec, r = 1)
fcst <- predict(m, n.ahead = 1, ci = ci)
LU <- matrix(unlist(fcst$fcst),
ncol = 4, byrow = TRUE)[, c(2, 3)]
RE <- rep(0, NAssets)
PView <- LU[, 1] > Lobs
NView <- LU[, 2] < Lobs
RE[PView] <- (LU[PView, 1] / Lobs[PView, 1] - 1)
RE[NView] <- (LU[NView, 1] / Lobs[NView, 1] - 1)
names(RE) <- ANames
if(percent) RE <- RE * 100
return(RE)
}
ReturnEst <- lapply(idx, f1, ci = 0.5)
qv <- zoo(matrix(unlist(ReturnEst),
ncol = NAssets, byrow = TRUE), idx)
colnames(qv) <- ANames
tail(qv)
library(BLCOP)
library(fPortfolio)
## Computing returns and EW-benchmark returns
R <- (AssetsM / lag(AssetsM, k = -1) -1.0) * 100
## Prior distribution
## Fitting of skewed Student's t distribution
MSTfit <- mvFit(R, method = "st")
mu <- c(MSTfit#fit[["beta"]])
S <- MSTfit#fit[["Omega"]]
skew <- c(MSTfit#fit[["alpha"]])
df <- MSTfit#fit[["df"]]
CopPrior <- mvdistribution("mvst", dim = NAssets, mu = mu,
Omega = S, alpha = skew, df = df)
## Pick matrix and view distributions for last forecast
RetEstCop <- ReturnEst[[27]]
RetEstCop
PCop <- matrix(0, ncol = NAssets, nrow = 3)
colnames(PCop) <- ANames
PCop[1, ANames[1]] <- 1
PCop[2, ANames[2]] <- 1
PCop[3, ANames[4]] <- 1
Sds <- apply(R, 2, sd)
RetViews <- list(distribution("norm", mean = RetEstCop[1],
sd = Sds[1]),
distribution("norm", mean = RetEstCop[2],
sd = Sds[2]),
distribution("norm", mean = RetEstCop[4],
sd = Sds[4])
)
CopViews <- COPViews(pick = PCop, viewDist = RetViews,
confidences = rep(0.5, 3),
assetNames = ANames)
## Simulation of posterior
NumSim <- 10000
CopPost <- COPPosterior(CopPrior, CopViews,
numSimulations = NumSim)
print(CopPrior)
print(CopViews)
slotNames(CopPost)
look at the structure of MSTfit:
str(MSTfit)
You can see that if you want the estimated alpha value, you need to access it via:
MSTfit#fit$estimated[['alpha']]
rather than
MSTfit#fit[['alpha']]