I need to create plots sequentially from all files in a folder, add certain information from calculated values to the plot (e.g. Mean SOC, TNPP, ANPP, BNPP), and arrange them over as many pages as needed to array them in a 5 x 3 layout.
Please, find a sample folder with trial files here:
https://www.dropbox.com/sh/evty00a0t9a062b/AAABG-rIq3Uhtlf-yOWo2fNGa?dl=0
Following different online sources and other threads, I have tried:
path <- "C:/Users/.../trialFiles"
dfs <- dir(path, "*.csv", full.names = FALSE, ignore.case = TRUE, all.files = TRUE)
plotModel <- function(df) {
dat <- read.csv(paste(path, df, sep = "/"), header = TRUE, sep = ",")
Time <- dat$time
SOC <- dat$somtc
AGBM <- dat$agcprd
BGBM <- dat$bgcjprd
time_frame <- Time >= oT & Time <= fT
sTime <- Time[time_frame]
sSOC <- SOC[sTime]
sAGBM <- AGBM[sTime]
sBGBM <- BGBM[sTime]
iM_AGBM <- mean(sAGBM)
iM_BGBM <- mean(sBGBM)
iMSOC <- mean(sSOC)
iTNPP <- sum(iM_AGBM, iM_BGBM)
plot(Time, SOC)
legend("bottomright", bty = "n", legend = paste(df, "\n\n",
"SOC =", format(iMSOC, digits = 6), "\n",
"TNPP =", format(iTNPP, digits = 6), "\n",
"ANPP =", format(iM_AGBM, digits = 5), "\n",
"BNPP =", format(iM_BGBM, digits = 5), sep = ""))
}
eq_plot <- lapply(dfs, plotModel)
nPlot <- length(eq_plot)
cols <- 3
layout <- matrix(seq(1, cols * ceiling(nPlot/cols)),
ncol = cols, nrow = ceiling(nPlot/cols))
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:nPlot) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(eq_plot[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
It does not give me an error, but is not showing the graphs either, just displays the text specified in the legend.
I have also tried with ggplot as follows:
dfs <- dir(period, "*.csv", full.names = FALSE, ignore.case = TRUE, all.files = TRUE)
plotModel <- function(df) {
dat <- read.csv(paste(period, df, sep = "/"), header = TRUE, sep = ",")
Time <- dat$time
SOC <- dat$somtc
AGBM <- dat$agcprd
BGBM <- dat$bgcjprd
time_frame <- Time >= oT & Time <= fT
sTime <- Time[time_frame]
sSOC <- SOC[sTime]
sAGBM <- AGBM[sTime]
sBGBM <- BGBM[sTime]
iM_AGBM <- mean(sAGBM)
iM_BGBM <- mean(sBGBM)
iMSOC <- mean(sSOC)
iTNPP <- sum(iM_AGBM, iM_BGBM)
ggplot(dat, aes(x=Time, y=SOC)) +
geom_line() +
ggtitle(df, subtitle = paste("SOC =", format(iMSOC, digits = 6), "\n",
"TNPP =", format(iTNPP, digits = 6), "\n",
"ANPP =", format(iM_AGBM, digits = 5), "\n",
"BNPP =", format(iM_BGBM, digits = 5), sep = ""))
}
eq_plot <- lapply(dfs, plotModel)
multi.page <- ggarrange(eq_plot, nrow = 5, ncol = 3)
ggexport(multi.page, filename = "diag_plots")
But I get the following error message: ggplot2 doesn't know how to deal with data of class uneval.
Please, forgive if cross posting. I have tried following the examples but there's something I am doing wrong.
Thanks
You need to save the plot within your function in an object and then return it.
Here is the solution for the first 3 files in that directory. You will need to adjust the last 2 lines of this code to handle all your plots:
library(ggplot2)
library(ggpubr)
path <- "C:/Users/.../path/to/your/dir"
dfs <- dir(path, "*.csv", full.names = FALSE, ignore.case = TRUE, all.files = TRUE)
# define oT and fT
oT=100
fT=1000
plotModel <- function(df) {
dat <- read.csv(paste(path, df, sep = "/"), header = TRUE, sep = ",")
# This part can be optimized, see below the simplified version of the function
Time <- dat$time
SOC <- dat$somtc
AGBM <- dat$agcprd
BGBM <- dat$bgcjprd
time_frame <- Time >= oT & Time <= fT
sTime <- Time[time_frame]
sSOC <- SOC[sTime]
sAGBM <- AGBM[sTime]
sBGBM <- BGBM[sTime]
iM_AGBM <- mean(sAGBM)
iM_BGBM <- mean(sBGBM)
iMSOC <- mean(sSOC)
iTNPP <- sum(iM_AGBM, iM_BGBM)
# save graph in an object
g<- ggplot(dat, aes(x=Time, y=SOC)) +
geom_line() +
ggtitle(df,
subtitle = paste("SOC =", format(iMSOC, digits=6), "\n",
"TNPP =", format(iTNPP, digits=6), "\n",
"ANPP =", format(iM_AGBM, digits=5), "\n",
"BNPP =", format(iM_BGBM, digits=5), sep = ""))
return(g)
}
eq_plot <- lapply(dfs, plotModel)
ggarrange(eq_plot[[1]], eq_plot[[2]], eq_plot[[3]], nrow = 1, ncol = 3) %>%
ggexport(filename = "diag_plots.png")
dev.off()
The body of the function can be simplified for clarity and efficiency:
plotModel <- function(df) {
dat <- read.csv(paste(path, df, sep = "/"), header = TRUE, sep = ",")
var.means <- colMeans(dat[dat$time >= oT & dat$time <= fT, c("agcprd","bgcjprd","somtc")])
# save graph in an object
g<- ggplot(dat, aes(x=time, y=somtc)) +
geom_line() +
ggtitle(df,
subtitle = paste("SOC =", format(var.means["somtc"], digits=6), "\n",
"TNPP =", format(var.means["agcprd"] + var.means["bgcjprd"], digits=6), "\n",
"ANPP =", format(var.means["agcprd"], digits=5), "\n",
"BNPP =", format(var.means["bgcjprd"], digits=5), sep = ""))
return(g)
}
Related
I have a function that generates a figure of a table:
plot_covariate_means_by_ntile <- function(.df, .ntile = "ntile", n_top = 10, directory) {
.df <- as.data.frame(.df)
covariate_names <- covariate_names
#.df[, .ntile] <- as.factor(.df[, .ntile])
.df[, .ntile] <- as_factor(.df[, .ntile], levels = "both")
# Regress each covariate on ntile/subgroup assignment to means p
cov_means <- lapply(covariate_names, function(covariate) {
lm_robust(as.formula(paste0(covariate, " ~ 0 + ", .ntile)), data = .df, se_type = "stata")
})
# Extract the mean and standard deviation of each covariate per ntile/subgroup
cov_table <- lapply(cov_means, function(cov_mean) {
means <- as.data.frame(t(coef(summary(cov_mean))[,c("Estimate", "Std. Error")]))
means
})
# Preparation to color the chart
temp_standardized <- sapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
.standardized
})
colnames(temp_standardized) <- covariate_names
ordering <- order(apply(temp_standardized, MARGIN = 2, function(x) {.range <- range(x); abs(.range[2] - .range[1])}), decreasing = TRUE)
# fwrite(tibble::rownames_to_column(as.data.frame(t(temp_standardized)[ordering,])),
# paste0(directory$data, "/covariate_standardized_means_by_", .ntile, ".csv"))
color_scale <- max(abs(c(max(temp_standardized, na.rm = TRUE), min(temp_standardized, na.rm = TRUE))))
color_scale <- color_scale * c(-1,1)
max_std_dev <- floor(max(color_scale))
breaks <- -max_std_dev:max_std_dev
labels <- c(" ", breaks, " ")
breaks <- c(min(color_scale), breaks, max(color_scale))
# Little trick to display the standard errors
table <- lapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
return(data.frame(covariate = covariate_name,
group = c(1,2,5) ,
estimate = m["Estimate",], std.error = m["Std. Error",],
standardized = .standardized))
})
# table <- do.call(rbind, table)
table <- rbindlist(table)
setnames(table, "group", .ntile)
table[, covariate := factor(covariate, levels = rev(covariate_names[ordering]), ordered = TRUE)]
table[covariate %in% head(covariate_names[ordering], n_top)] %>%
mutate(info = paste0(estimate, "\n(", std.error, ")")) %>%
ggplot(aes_string(x = .ntile, y = "covariate")) +
# Add coloring
geom_raster(aes(fill = standardized)
, alpha = 0.9
) +
scale_fill_distiller(palette = "RdBu",
direction = 1,
breaks = breaks,
labels = labels,
limits = color_scale,
name = "Standard\nDeviation on\nNormalized\nDistribution"
) +
# add numerics
geom_text(aes(label = info), size=2.1) +
# reformat
labs(title = paste0("Covariate averages within ", ifelse(tolower(.ntile) == "leaf", .ntile, "Assigned Group")),
y = "within covariate") +
scale_x_continuous(position = "top") #+
#cowplot::theme_minimal_hgrid(16)
}
But the output shows all 5 columns, I want it to show only 1 , 2 and 5.
I can adjust the line
groups = 1:ncol(m)
But then that incorrectly labels the groups, the third column is actually group 5:
Is there any way to adjust the function to present the correct columns and the correct labels for them?
Maybe you could use facet_wrap as a workaround?
library(tidyverse)
data.frame(X = rep(1:5, each = 25),
Y = rep(factor(rev(LETTERS[-26]),
levels = rev(LETTERS[-26])), 5),
Z = rnorm(125, 5, 1)) %>%
mutate(X = ifelse(X %in% c(1,2,5), X, NA)) %>%
na.omit() %>%
ggplot(aes(x = X, y = Y, fill = Z)) +
geom_raster() +
facet_wrap(~X, ncol=3, scales="free_x") +
theme_minimal() +
theme(axis.text.x = element_blank())
I tried to figure out a solution using scale_x_discrete (e.g. something like scale_x_discrete(limits = c("1", "2", "5"), breaks = c("1", "2", "5"))) and it 'feels' like it could work, but I gave up - maybe something worth pursuing.
I put together a function to identify outliers. It takes a dataframe and then shows plots of the data with lines to indicate potential outliers. It'll give a table with outliers marked, too.
But, it is SLOOOW. The problem is it takes a really long time for the plots to load.
I was curious if you might have advice on how to speed this up.
Related: Is the default plotting system faster than ggplot?
I'll start with the dependencies
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up<-xU+t*Q
down<-xU-t*Q
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
This is the function
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
Example
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
It'll show you the outliers for each numeric variable.
It'll plot the variable density and then a scatterplot with identifier lines
It will also accept input so you can mark some outliers as reasonable and other as extreme
# Function to construct a mortality demogdata object from HMD
hmd.mx <- function(country, username, password, label=country){
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Mx_1x1.txt", sep = "")
}
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
mx <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(mx)=="try-error")
stop("Connection error at www.mortality.org. Please check username, password and country label.")
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Exposures_1x1.txt", sep = "")
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
pop <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(pop)=="try-error")
stop("Exposures file not found at www.mortality.org")
obj <- list(type="mortality",label=label,lambda=0)
obj$year <- sort(unique(mx[, 1]))
#obj$year <- ts(obj$year, start=min(obj$year))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
obj$rate <- obj$pop <- list()
for (i in 1:n.mort)
{ obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
obj$rate[[i]][obj$rate[[i]] < 0] <- NA
obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
obj$pop[[i]][obj$pop[[i]] < 0] <- NA
dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age, obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m])) {
obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2] }
return(structure(obj, class = "demogdata"))
}
Above is the code that we are using to import our population data into r.
NLdata <- hmd.mx(country = "NLD",username = "username",password="password")
This would be the specific code to obtain the Dutch data.
Would anyone happen to know how to add multiple countries into one, and put that data into one dataframe (same format as the demography data packages that we download)? So for example the mortality rates for the (Netherlands + France + Norway) / 3 into one package.
You can try this code. However I could not run your demography package. So you might need to edit the code a bit. Perhaps someone else can fill in the second part? I saw that no one has reacted yet.
C1 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5), Country = "France")
C2 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5),Country = "England")
C3 <- data.frame(Year = 1970:2018, value1 = rnorm(49), value2 = rnorm(49), Cat =rbinom(49,1,0.5),Country = "Netherlands")
C1 <- split(C1, C1$Cat)
C2 <- split(C2, C2$Cat)
C3 <- split(C3, C3$Cat)
list_all <- list(rbind(C1[[1]],C2[[1]],C3[[1]]),rbind(C1[[2]],C2[[2]],C3[[2]]))
Final_list <- lapply(list_all, function(x) x %>% group_by(Year) %>% summarise(Val1 = mean(value1), Val2 = mean(value2), Country = "All") %>% as.data.frame)
I am currently trying to run goodness of fit tests for data in the unmarked package. To do this I am using code written in the associated google group. This relies on parboot to assess the goodness of fit of the model. It then produces a Chi squared P value and C-hat value.
Strangely when I only perform >90 simulations of the model do I get the following error:
Error in checkForRemoteErrors(val) : 3 nodes produced errors;
first error: could not find function "mb.chisq.RN"
Below this number of simulations, the error is not encountered and the statistic can be computed.
I first run; mb.chisq.RN
mb.chisq.RN <- function (mod, print.table = TRUE, maxK=50,
...){
y.raw <- mod#data#y
N.raw <- nrow(y.raw)
na.raw <- apply(X = y.raw, MARGIN = 1, FUN = function(i) all(is.na(i)))
y.data <- y.raw[!na.raw, ]
N <- N.raw - sum(na.raw)
T <- ncol(y.data)
K <- 0:maxK
det.hist <- apply(X = y.data, MARGIN = 1, FUN = function(i) paste(i,
collapse = ""))
preds.lam <- predict(mod, type = "state")$Predicted
preds.p <- matrix(data = predict(mod, type = "det")$Predicted,
ncol = T, byrow = TRUE)
out.hist <- data.frame(det.hist, preds.lam)
un.hist <- unique(det.hist)
n.un.hist <- length(un.hist)
na.vals <- length(grep(pattern = "NA", x = un.hist)) > 0
if (na.vals) {
id.na <- grep(pattern = "NA", x = un.hist)
id.det.hist.na <- grep(pattern = "NA", x = det.hist)
cohort.na <- sort(un.hist[id.na])
n.cohort.na <- length(cohort.na)
unique.na <- gsub(pattern = "NA", replacement = "N",
x = cohort.na)
na.visits <- sapply(strsplit(x = unique.na, split = ""),
FUN = function(i) paste(ifelse(i == "N", 1, 0), collapse = ""))
names(cohort.na) <- na.visits
n.hist.missing.cohorts <- table(na.visits)
n.missing.cohorts <- length(n.hist.missing.cohorts)
out.hist.na <- out.hist[id.det.hist.na, ]
out.hist.na$det.hist <- droplevels(out.hist.na$det.hist)
just.na <- sapply(X = out.hist.na$det.hist, FUN = function(i) gsub(pattern = "1",
replacement = "0", x = i))
out.hist.na$coh <- sapply(X = just.na, FUN = function(i) gsub(pattern = "NA",
replacement = "1", x = i))
freqs.missing.cohorts <- table(out.hist.na$coh)
na.freqs <- table(det.hist[id.det.hist.na])
preds.p.na <- preds.p[id.det.hist.na, ]
cohort.not.na <- sort(un.hist[-id.na])
out.hist.not.na <- out.hist[-id.det.hist.na, ]
out.hist.not.na$det.hist <- droplevels(out.hist.not.na$det.hist)
n.cohort.not.na <- length(cohort.not.na)
n.sites.not.na <- length(det.hist) - length(id.det.hist.na)
preds.p.not.na <- preds.p[-id.det.hist.na, ]
}
else {
cohort.not.na <- sort(un.hist)
out.hist.not.na <- out.hist
preds.p.not.na <- preds.p
n.cohort.not.na <- length(cohort.not.na)
n.sites.not.na <- length(det.hist)
}
if (n.cohort.not.na > 0) {
exp.freqs <- rep(NA, n.cohort.not.na)
names(exp.freqs) <- cohort.not.na
for (i in 1:n.cohort.not.na) {
eq.solved <- rep(NA, n.sites.not.na)
select.hist <- cohort.not.na[i]
strip.hist <- unlist(strsplit(select.hist, split = ""))
hist.mat <- new.hist.mat <- new.hist.mat1 <- new.hist.mat0 <- matrix(NA, nrow = n.sites.not.na, ncol = T)
for (j in 1:n.sites.not.na) {
if (n.sites.not.na == 1) {
hist.mat[j,] <- preds.p.not.na
} else {hist.mat[j,] <- preds.p.not.na[j,]}
#Pr(y.ij=1|K)
p.k.mat <- sapply(hist.mat[j,],function(r){1-(1-r)^K})
new.hist.mat1[j,] <- dpois(K,out.hist.not.na[j, "preds.lam"]) %*% p.k.mat
new.hist.mat0[j,] <- dpois(K,out.hist.not.na[j, "preds.lam"]) %*% (1-p.k.mat)
new.hist.mat[j,] <- ifelse(strip.hist == "1",
new.hist.mat1[j,], ifelse(strip.hist == "0",
new.hist.mat0[j,], 0))
combo.lam.p <- paste(new.hist.mat[j, ], collapse = "*")
eq.solved[j] <- eval(parse(text = as.expression(combo.lam.p)))
}
exp.freqs[i] <- sum(eq.solved, na.rm = TRUE)
}
freqs <- table(out.hist.not.na$det.hist)
out.freqs <- matrix(NA, nrow = n.cohort.not.na, ncol = 4)
colnames(out.freqs) <- c("Cohort", "Observed", "Expected",
"Chi-square")
rownames(out.freqs) <- names(freqs)
out.freqs[, 1] <- 0
out.freqs[, 2] <- freqs
out.freqs[, 3] <- exp.freqs
out.freqs[, 4] <- ((out.freqs[, "Observed"] - out.freqs[,
"Expected"])^2)/out.freqs[, "Expected"]
}
if (na.vals) {
missing.cohorts <- list()
if (!is.matrix(preds.p.na)) {
preds.p.na <- matrix(data = preds.p.na, nrow = 1)
}
for (m in 1:n.missing.cohorts) {
select.cohort <- out.hist.na[which(out.hist.na$coh ==
names(freqs.missing.cohorts)[m]), ]
select.preds.p.na <- preds.p.na[which(out.hist.na$coh ==
names(freqs.missing.cohorts)[m]), ]
if (!is.matrix(select.preds.p.na)) {
select.preds.p.na <- matrix(data = select.preds.p.na,
nrow = 1)
}
select.preds.p.na[, gregexpr(pattern = "N", text = gsub(pattern = "NA",
replacement = "N", x = select.cohort$det.hist[1]))[[1]]] <- 1
n.total.sites <- nrow(select.cohort)
freqs.na <- table(droplevels(select.cohort$det.hist))
cohort.na.un <- sort(unique(select.cohort$det.hist))
n.hist.na <- length(freqs.na)
exp.na <- rep(NA, n.hist.na)
names(exp.na) <- cohort.na.un
for (i in 1:n.hist.na) {
n.sites.hist <- freqs.na[i]
eq.solved <- rep(NA, n.total.sites)
select.hist <- gsub(pattern = "NA", replacement = "N",
x = cohort.na.un[i])
strip.hist <- unlist(strsplit(select.hist, split = ""))
hist.mat <- new.hist.mat <- new.hist.mat1 <-new.hist.mat0 <- matrix(NA, nrow = n.total.sites, ncol = T)
for (j in 1:n.total.sites) {
hist.mat[j, ] <- select.preds.p.na[j, ]
#Pr(y.ij=1|K)
p.k.mat <- sapply(hist.mat[j,],function(r){1-(1-r)^K})
new.hist.mat1[j,] <- dpois(K,select.cohort[j, "preds.lam"]) %*% p.k.mat
new.hist.mat0[j,] <- dpois(K,select.cohort[j, "preds.lam"]) %*% (1-p.k.mat)
new.hist.mat[j,] <- ifelse(strip.hist == "1",
new.hist.mat1[j,], ifelse(strip.hist == "0",
new.hist.mat0[j,], 1))
combo.lam.p <- paste(new.hist.mat[j, ], collapse = "*")
eq.solved[j] <- eval(parse(text = as.expression(combo.lam.p)))
}
exp.na[i] <- sum(eq.solved, na.rm = TRUE)
}
out.freqs.na <- matrix(NA, nrow = n.hist.na, ncol = 4)
colnames(out.freqs.na) <- c("Cohort", "Observed",
"Expected", "Chi-square")
rownames(out.freqs.na) <- cohort.na.un
out.freqs.na[, 1] <- m
out.freqs.na[, 2] <- freqs.na
out.freqs.na[, 3] <- exp.na
out.freqs.na[, 4] <- ((out.freqs.na[, "Observed"] -
out.freqs.na[, "Expected"])^2)/out.freqs.na[,
"Expected"]
missing.cohorts[[m]] <- list(out.freqs.na = out.freqs.na)
}
}
if (na.vals) {
chisq.missing <- do.call("rbind", lapply(missing.cohorts,
FUN = function(i) i$out.freqs.na))
if (n.cohort.not.na > 0) {
chisq.unobs.det <- N - sum(out.freqs[, "Expected"]) -
sum(chisq.missing[, "Expected"])
chisq.table <- rbind(out.freqs, chisq.missing)
}
else {
chisq.unobs.det <- N - sum(chisq.missing[, "Expected"])
chisq.table <- chisq.missing
}
}
else {
chisq.unobs.det <- N - sum(out.freqs[, "Expected"])
chisq.na <- 0
chisq.table <- out.freqs
}
chisq <- sum(chisq.table[, "Chi-square"]) + chisq.unobs.det
if (print.table) {
out <- list(chisq.table = chisq.table, chi.square = chisq,
model.type = "single-season")
}
else {
out <- list(chi.square = chisq, model.type = "single-season")
}
class(out) <- "mb.chisq"
return(out)
}
Which will successfuly compute a Chi squared value.
I then run the test.
mb.gof.test.RN <- function (mod, nsim = 100, plot.hist = TRUE, ...){
mod.table <- mb.chisq.RN(mod)
out <- parboot(mod, statistic = function(i) mb.chisq.RN(i)$chi.square,
nsim = nsim)
p.value <- sum(out#t.star >= out#t0)/nsim
if (p.value == 0) {
p.display <- paste("<", 1/nsim)
}
else {
p.display = paste("=", round(p.value, digits = 4))
}
if (plot.hist) {
hist(out#t.star, main = paste("Bootstrapped MacKenzie and Bailey fit statistic (",
nsim, " samples)", sep = ""), xlim = range(c(out#t.star,
out#t0)), xlab = paste("Simulated statistic ", "(observed = ",
round(out#t0, digits = 2), ")", sep = ""))
title(main = bquote(paste(italic(P), " ", .(p.display))),
line = 0.5)
abline(v = out#t0, lty = "dashed", col = "red")
}
c.hat.est <- out#t0/mean(out#t.star)
gof.out <- list(model.type = mod.table$model.type, chisq.table = mod.table$chisq.table,
chi.square = mod.table$chi.square, t.star = out#t.star,
p.value = p.value, c.hat.est = c.hat.est, nsim = nsim)
class(gof.out) <- "mb.chisq"
return(gof.out)
}
>mb.gof.test.RN(fm9)
which produces the following error:
Error in checkForRemoteErrors(val) : 3 nodes produced errors;
first error: could not find function "mb.chisq.RN"
I'm not entirely sure why this error only occurs above a certain number of simulations so any pointers would be greatly received.
I'm having some trouble when plotting the "bgnbd.PlotFrequencyInCalibration" in the "BTYD" package.
There is no NA in the dataset and other plots works without error.
Below is my code for the plots:
CustData<- read.csv("~/ltv/CustData")
> cal.cbs<-cbind(CustData$t.x,CustData$x,CustData$T.cal,CustData$x.star)
> colnames(cal.cbs)<-c("t.x","x","T.cal","x.star")
est.params<-c(0.0313,0.9165,1.088,0.7903)
bgnbd.PlotFrequencyInCalibration(est.params,cal.cbs,7)
Error in plot.window(xlim, ylim, log = log, ...) :
need finite 'ylim' values
Any help would be appreciated. Thank you.
Kara
subset of the data
I fixed for pnbd.pnbd.PlotFrequencyInCalibration. Repeat the same for bgnbd. If you look at the actual function for pnbd.PlotFrequencyInCalibration :
"https://github.com/cran/BTYD/blob/master/R/pnbd.R" (check here)
pnbd.PlotFrequencyInCalibration <- function(params, cal.cbs, censor, plotZero = TRUE,
xlab = "Calibration period transactions", ylab = "Customers", title = "Frequency of Repeat Transactions") {
tryCatch(x <- cal.cbs[, "x"], error = function(e) stop("Error in pnbd.PlotFrequencyInCalibration: cal.cbs must have a frequency column labelled \"x\""))
tryCatch(T.cal <- cal.cbs[, "T.cal"], error = function(e) stop("Error in pnbd.PlotFrequencyInCalibration: cal.cbs must have a column for length of time observed labelled \"T.cal\""))
dc.check.model.params(c("r", "alpha", "s", "beta"), params, "pnbd.PlotFrequencyInCalibration")
if (censor > max(x))
stop("censor too big (> max freq) in PlotFrequencyInCalibration.")
n.x <- rep(0, max(x) + 1)
custs = nrow(cal.cbs)
for (ii in unique(x)) {
n.x[ii + 1] <- sum(ii == x)
}
n.x.censor <- sum(n.x[(censor + 1):length(n.x)])
n.x.actual <- c(n.x[1:censor], n.x.censor)
T.value.counts <- table(T.cal)
T.values <- as.numeric(names(T.value.counts))
n.T.values <- length(T.values)
total.probability <- 0
n.x.expected <- rep(0, length(n.x.actual))
for (ii in 1:(censor)) {
this.x.expected <- 0
for (T.idx in 1:n.T.values) {
T <- T.values[T.idx]
if (T == 0)
next
n.T <- T.value.counts[T.idx]
expected.given.x.and.T <- n.T * pnbd.pmf(params, T, ii - 1)
this.x.expected <- this.x.expected + expected.given.x.and.T
total.probability <- total.probability + expected.given.x.and.T/custs
}
n.x.expected[ii] <- this.x.expected
}
n.x.expected[censor + 1] <- custs * (1 - total.probability)
col.names <- paste(rep("freq", length(censor + 1)), (0:censor), sep = ".")
col.names[censor + 1] <- paste(col.names[censor + 1], "+", sep = "")
censored.freq.comparison <- rbind(n.x.actual, n.x.expected)
colnames(censored.freq.comparison) <- col.names
cfc.plot <- censored.freq.comparison
if (plotZero == FALSE)
cfc.plot <- cfc.plot[, -1]
n.ticks <- ncol(cfc.plot)
if (plotZero == TRUE) {
x.labels <- 0:(n.ticks - 1)
x.labels[n.ticks] <- paste(n.ticks - 1, "+", sep = "")
} else {
x.labels <- 1:(n.ticks)
x.labels[n.ticks] <- paste(n.ticks, "+", sep = "")
}
ylim <- c(0, ceiling(max(cfc.plot,na.rm = TRUE) * 1.1))
barplot(cfc.plot, names.arg = x.labels, beside = TRUE, ylim = ylim, main = title,
xlab = xlab, ylab = ylab, col = 1:2)
legend("topright", legend = c("Actual", "Model"), col = 1:2, lwd = 2)
return(censored.freq.comparison)
}
There is a line:
ylim <- c(0, ceiling(max(cfc.plot) * 1.1))
Add to it, na.rm=TRUE
ylim <- c(0, ceiling(max(cfc.plot,na.rm = TRUE) * 1.1))
Run the function again, should work now