snapPointsToLines can't keep attributes in R - r

I recently find a problem of snapPointsToLines. It can't keep the attributes of the spatial point dataframe. The example is as below:
# Generate a spatial line dataframe
l1 = cbind(c(1,2,3),c(3,2,2))
l1a = cbind(l1[,1]+.05,l1[,2]+.05)
l2 = cbind(c(1,2,3),c(1,1.5,1))
Sl1 = Line(l1)
Sl1a = Line(l1a)
Sl2 = Line(l2)
S1 = Lines(list(Sl1, Sl1a), ID="a")
S2 = Lines(list(Sl2), ID="b")
Sl = SpatialLines(list(S1,S2))
df = data.frame(z = c(1,2), row.names=sapply(slot(Sl, "lines"), function(x) slot(x, "ID")))
Sldf = SpatialLinesDataFrame(Sl, data = df)
# Generate a spatial point dataframe
xc = c(1.2,1.5,2.5)
yc = c(1.5,2.2,1.6)
Spoints = SpatialPoints(cbind(xc, yc))
Spdf <- SpatialPointsDataFrame(Spoints, data = data.frame(value = 1:length(Spoints)))
#use the function SpatialPointsDataFrame
res <- snapPointsToLines(Spdf, Sldf)
res only has "nearest_line_id" and "snap_dist". It doesn't have "value" field from Spdf, which I need.
#use the function SpatialPointsDataFrame with "withAttrs = TRUE" parameter
res <- snapPointsToLines(Spdf, Sldf, withAttrs = TRUE)
It reports error:
"Error in snapPointsToLines(Spdf, Sldf, withAttrs = TRUE) :
A SpatialPoints object has no attributes! Please set withAttrs as FALSE."
But Spdf is the spatialpointdataframe with attribute.
I don't know what problem it is. When I used this function several weeks ago, it didn't have this problem.

I think the problem may be due to the function itself. When you look at the codes of this function, we can see the codes at the beginning part as below.
if (class(points) == "SpatialPoints" && missing(withAttrs))
withAttrs = FALSE
if (class(points) == "SpatialPoints" && withAttrs == TRUE)
stop("A SpatialPoints object has no attributes! Please set withAttrs as FALSE.")
Sometimes a SpatialPointsDataFrame could be identified as SpatialPoints. So the function will treat your SpatialPointsDataFrame as SpatialPoints and will not keep the attributes in the function.
You can make a little modification in the the codes of the function as below.
snapPointsToLines1 <- function (points, lines, maxDist = NA, withAttrs = TRUE, idField = NA)
{
if (rgeosStatus()) {
if (!requireNamespace("rgeos", quietly = TRUE))
stop("package rgeos required for snapPointsToLines")
}
else stop("rgeos not installed")
if (is(points, "SpatialPointsDataFrame")==FALSE && missing(withAttrs))
withAttrs = FALSE
if (is(points, "SpatialPointsDataFrame")==FALSE && withAttrs == TRUE)
stop("A SpatialPointsDataFrame object is needed! Please set withAttrs as FALSE.")
d = rgeos::gDistance(points, lines, byid = TRUE)
if (!is.na(maxDist)) {
distToLine <- apply(d, 2, min, na.rm = TRUE)
validPoints <- distToLine <= maxDist
distToPoint <- apply(d, 1, min, na.rm = TRUE)
validLines <- distToPoint <= maxDist
points <- points[validPoints, ]
lines = lines[validLines, ]
d = d[validLines, validPoints, drop = FALSE]
distToLine <- distToLine[validPoints]
if (!any(validPoints)) {
if (is.na(idField)) {
idCol = character(0)
}
else {
idCol = lines#data[, idField][0]
}
newCols = data.frame(nearest_line_id = idCol, snap_dist = numeric(0))
if (withAttrs)
df <- cbind(points#data, newCols)
else df <- newCols
res <- SpatialPointsDataFrame(points, data = df,
proj4string = CRS(proj4string(points)), match.ID = FALSE)
return(res)
}
}
else {
distToLine = apply(d, 2, min, na.rm = TRUE)
}
nearest_line_index = apply(d, 2, which.min)
coordsLines = coordinates(lines)
coordsPoints = coordinates(points)
mNewCoords = vapply(1:length(points), function(x) nearestPointOnLine(coordsLines[[nearest_line_index[x]]][[1]],
coordsPoints[x, ]), FUN.VALUE = c(0, 0))
if (!is.na(idField)) {
nearest_line_id = lines#data[, idField][nearest_line_index]
}
else {
nearest_line_id = sapply(slot(lines, "lines"),
function(i) slot(i, "ID"))[nearest_line_index]
}
if (withAttrs)
df = cbind(points#data, data.frame(nearest_line_id, snap_dist = distToLine))
else df = data.frame(nearest_line_id, snap_dist = distToLine,
row.names = names(nearest_line_index))
SpatialPointsDataFrame(coords = t(mNewCoords), data = df,
proj4string = CRS(proj4string(points)))
}
Then using this new function snapPointsToLines1, you can get the attributes what you want.

Related

How can i start this code found on github?

I'm following this code on github and in line 51 i have a problem with option[i,]<- skew.raw why? Said: object "i" not found. Why? What should i put?
It also fails to take values as after starting the get.option function I have NA values.
# Define function for formating/retrieving options data from json obj
get.options = function(symbols, date){
options = matrix(ncol = 11, nrow = length(symbols))
colnames(options) = c('Cl_price', "call_strike",
"call_lastPrice","call_vol","call_openInt", "call_ImpVoli",
"put_strike","put_lastPrice", 'put_vol',"put_openInt", 'put_ImpVoli')
rownames(options) = symbols
for(u in 1:length(symbols)){
s = symbols[u]
d = as.numeric(as.POSIXct(date, origin = '1970-01-01', tz = 'GMT'))
json_file <- sprintf('https://query2.finance.yahoo.com/v7/finance/options/%s?
date=%d&formatted=true&crumb=UNus6VhY1bn&lang=en-US&region=US&corsDomain=finance.yahoo.com',s,d)
json_data <- suppressWarnings(fromJSON(paste(readLines(json_file), collapse = "")))
# CALLS
n = length(json_data$optionChain$result[[1]]$options[[1]]$calls)
if (n < 1) next
calls = matrix(ncol = 6, nrow = n)
for(i in 1:n) calls[,2][i] = json_data$optionChain$result[[1]]$options[[1]]$calls[[i]]$strike$raw
Cl.price = json_data$optionChain$result[[1]]$quote$regularMarketPrice
x <- which.min(abs((calls[,2]/Cl.price) -1))
calls = calls[x,]
calls[1] = Cl.price
calls[3] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$lastPrice$raw
calls[4] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$volume$raw
calls[5] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$openInterest$raw
calls[6] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$impliedVolatility$raw
# PUTS
n = length(json_data$optionChain$result[[1]]$options[[1]]$puts)
if(n < 1) next
puts = matrix(ncol = 5, nrow = n)
for(i in 1:n) puts[,1][i] = json_data$optionChain$result[[1]]$options[[1]]$puts[[i]]$strike$raw
x <- which.min(abs((puts[,1]/Cl.price) - 0.95))
puts = puts[x,]
puts[2] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$lastPrice$raw
puts[3] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$volume$raw
puts[4] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$openInterest$raw
puts[5] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$impliedVolatility$raw
options[u,] = c(calls, puts)
}
return(options)
}
# Define stocks and gather options data
date = '2017-04-21'
symbols <- c('DIS','CAT','TSLA')
daily.options = as.data.frame(get.options(symbols, date))
which(is.na(daily.options))
skew.raw = daily.options$put_ImpVoli - daily.options$call_ImpVoli # SKEW(i,t)
options[i,] <- skew.raw
write.table(options, 'DISCATTSLA', sep = ",")
options = read.table('DISCATTSLA', sep = ",")**
I’m following this code because I read the paper by Rhui Zhao but in the paper I did not talk about how to implement the skew volatility on a software and then I was able to find this code on github.

Calculating p-values for divisive hierarchical clustering in R (pvclust package)

Would anyone know how to calculate significance values for clusters in the R package pvclust using a divisive hierarchical clustering method (e.g. diana from the cluster package)? The pvclust package supports only agglomerative hierarchical clustering methods (implemented by the hclust function), but I have been hoping that forcing pvclust to use diana instead of hclust might be possible. I tried modifying some of the internal pvclust functions as follows, but the only result was an error:
library(pvclust)
library(cluster)
pvclust.nonparallel <- function (data, method.hclust, method.dist, use.cor, nboot, r,
store, weight, iseed, quiet)
{
if (!is.null(iseed))
set.seed(seed = iseed)
n <- nrow(data)
p <- ncol(data)
if (is.function(method.dist)) {
distance <- method.dist(data)
}
else {
distance <- pvclust:::dist.pvclust(data, method = method.dist,
use.cor = use.cor)
}
####### replace hclust with diana
# data.hclust <- hclust(distance, method = method.hclust) # original version
data.hclust <- diana(distance, diss = T)
if (method.hclust == "ward" && getRversion() >= "3.1.0") {
method.hclust <- "ward.D"
}
size <- floor(n * r)
rl <- length(size)
if (rl == 1) {
if (r != 1)
warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n")
r <- list(1)
}
else r <- as.list(size/n)
mboot <- lapply(r, boot.hclust, data = data, object.hclust = data.hclust,
nboot = nboot, method.dist = method.dist, use.cor = use.cor,
method.hclust = method.hclust, store = store, weight = weight,
quiet = quiet)
result <- pvclust:::pvclust.merge(data = data, object.hclust = data.hclust,
mboot = mboot)
return(result)
}
boot.hclust <- function (r, data, object.hclust, method.dist, use.cor, method.hclust,
nboot, store, weight = FALSE, quiet = FALSE)
{
n <- nrow(data)
size <- round(n * r, digits = 0)
if (size == 0)
stop("invalid scale parameter(r)")
r <- size/n
pattern <- pvclust:::hc2split(object.hclust)$pattern
edges.cnt <- table(factor(pattern)) - table(factor(pattern))
st <- list()
rp <- as.character(round(r, digits = 2))
if (r == 1)
rp <- paste(rp, ".0", sep = "")
if (!quiet)
cat(paste("Bootstrap (r = ", rp, ")... ",
sep = ""))
w0 <- rep(1, n)
na.flag <- 0
for (i in 1:nboot) {
if (weight && r > 10) {
w1 <- as.vector(rmultinom(1, size, w0))
suppressWarnings(distance <- distw.pvclust(data,
w1, method = method.dist, use.cor = use.cor))
}
else {
smpl <- sample(1:n, size, replace = TRUE)
if (is.function(method.dist)) {
suppressWarnings(distance <- method.dist(data[smpl,
]))
}
else {
suppressWarnings(distance <- pvclust:::dist.pvclust(data[smpl,
], method = method.dist, use.cor = use.cor))
}
}
if (all(is.finite(distance))) {
####### replace hclust with diana
# x.hclust <- hclust(distance, method = method.hclust) # original version
x.hclust <- diana(distance, diss = T)
pattern.i <- pvclust:::hc2split(x.hclust)$pattern
edges.cnt <- edges.cnt + table(factor(pattern.i,
levels = pattern))
}
else {
x.hclust <- NULL
na.flag <- 1
}
if (store)
st[[i]] <- x.hclust
}
if (!quiet)
cat("Done.\n")
if (na.flag == 1)
warning(paste("inappropriate distance matrices are omitted in computation: r = ",
r), call. = FALSE)
boot <- list(edges.cnt = edges.cnt, method.dist = method.dist,
use.cor = use.cor, method.hclust = method.hclust, nboot = nboot,
size = size, r = r, store = st)
class(boot) <- "boot.hclust"
return(boot)
}
assignInNamespace("pvclust.nonparallel",pvclust.nonparallel,ns="pvclust")
assignInNamespace("boot.hclust",boot.hclust,ns="pvclust")
data(lung)
res.pv <- pvclust(t(lung), method.dist = "euclidean")
plot(res.pv)
# Error in barplot.default(w, xlab = xlab, horiz = TRUE, space = 0, axes = FALSE, :
# argument 9 matches multiple formal arguments

issue with disag_model() function from disaggregation R package

I was trying to use the disaggregation package to evaluate if it could be used on the dataset I have. My original data are disaggregated, so I've aggregated them to use the disag_model function from disaggregation package and compare "fitted values" with actual values.
However when I run the function the R session aborts.
I tried to execute the disag_model function step by step and I saw that the problem is due to the use of nlminb() to optimize the a posteriori density function, but I cannot understand why it's happening and how to solve it.
Thanks for your help.
You can find the data I used at this link: https://www.dropbox.com/sh/au7l0e11trzfo19/AACpfRSUpd4gRCveUsh5JX6Ea?dl=0
Please download the folder to run the code.
This is the code I used:
library(tidyverse)
library(raster)
library(disaggregation)
library(sp)
path<- "yourPath/Data"
load(file.path(path, "myRS"))
load(file.path(path, "RAST"))
Data <- read.csv(file = paste(path, "/sim_data.csv", sep = ""))
Data$HasRes <- ifelse(Data$PN50 > runif(nrow(Data)), 1, 0)
for (i in 1:nlayers(myRS)) {
myRS#layers[[i]]#file#name<-file.path(path, "predStackl10")
}
DFCov <-
as.data.frame(raster::extract(myRS, Data[c("XCoord", "YCoord")]))
Data <- cbind(Data, DFCov)
# Remove NA
NAs <- which(is.na(rowSums(Data[names(myRS)])))
Data <- Data[-NAs, ]
Data$ISO3 <- as.factor(Data$ISO3)
world_shape <-
shapefile(file.path(path, "World.shp"))
lmic_shape <-
world_shape[(world_shape#data$ISO3 %in% levels(Data$ISO3)),]
plot(lmic_shape)
# I would like to convert Data in a SpatialPointsDataFrame object
PN50 <- Data
coordinates(PN50) <- c("XCoord", "YCoord")
is.projected(PN50) # see if a projection is defined
proj4string(PN50) <- CRS("+proj=longlat +datum=WGS84")
# compute the mean P50 within each state
PN50_mean <- aggregate(x = PN50,
by = list(Data$ISO3),
FUN = mean)
# compute the centroid of the observations coordinates for each state
PN50_centroid <-
Data %>% group_by(ISO3) %>% summarise(meanX = mean(XCoord), meanY = mean(YCoord))
# assign to each mean the centroid coordinates
PN50_agg <-
as.data.frame(
cbind(
PN50_mean = PN50_mean#data$PN50,
XCoord = PN50_centroid$meanX,
YCoord = PN50_centroid$meanY
)
)
PN50_agg$XCoord <- as.numeric(PN50_agg$XCoord)
PN50_agg$YCoord <- as.numeric(PN50_agg$YCoord)
PN50_agg$ISO3 <- as.character(PN50_centroid$ISO3)
samsiz <-
Data %>% group_by(ISO3) %>% summarise(sz = sum(SampleSize))
PN50_agg$sample_size <- as.numeric(samsiz$sz)
PN50_agg$case <- round(PN50_agg$PN50_mean * PN50_agg$sample_size)
# I would like having data in a SpatialPolygonsDataFrame format to use the disaggrgation package
library(sp)
coordinates(PN50_agg) <- c("XCoord", "YCoord")
proj4string(PN50_agg) <- CRS("+proj=longlat +datum=WGS84")
PN50_polyg <- lmic_shape
PN50_polyg#data <-
full_join(PN50_polyg#data, PN50_agg#data, by = "ISO3")
# covariates raster
covariate_stack <-
getCovariateRasters(path, shape = raster(x = paste0(path, '/multi.tif')))
names(covariate_stack)
covariate_stack2 <- dropLayer(covariate_stack, nlayers(covariate_stack))
names(covariate_stack2)
plot(covariate_stack2)
covariate_stack2 <- raster::stack(covariate_stack2)
covariate_stack2<-brick(covariate_stack2)
# population raster
extracted <- raster::extract(raster(x = paste0(path, '/multi.tif')), PN50_polyg)
n_cells <- sapply(extracted, length)
PN50_polyg#data$pop_per_cell <- PN50_polyg#data$sample_size / n_cells
population_raster <-
rasterize(PN50_polyg, covariate_stack2, field = 'pop_per_cell')
# prepare data for disag_model()
dis_data <- prepare_data(
polygon_shapefile = PN50_polyg,
covariate_rasters = covariate_stack2,
aggregation_raster = population_raster,
mesh.args = list(
max.edge = c(5, 40),
cut = 0.0005,
offset = 1
),
id_var = "ISO3",
response_var = "case",
sample_size_var = "sample_size",
na.action = TRUE,
ncores = 8
)
# Rho and p(Rho<Rho_min)
dist <- pointDistance(PN50_agg#coords, lonlat = F, allpairs = T)
rownames(dist) <- PN50_agg$ISO3
colnames(dist) <- PN50_agg$ISO3
flattenDist <- function(dist) {
up <- upper.tri(dist)
flat <- data_frame(row = rownames(dist)[row(dist)[up]],
column = rownames(dist)[col(dist)[up]],
dist = dist[up])
return(flat)
}
pair_dist <- flattenDist(dist)
d <- pair_dist$dist
k <- 0.036
CorMatern <- k * d * besselK(k * d, 1)
limits <- sp::bbox(PN50_polyg)
hypontenuse <-
sqrt((limits[1, 2] - limits[1, 1]) ^ 2 + (limits[2, 2] - limits[2, 1]) ^
2)
prior_rho <- hypontenuse / 3
p_rho <- sum(d[CorMatern <= 0.1] < prior_rho) / length(d[CorMatern <= 0.1])
# sigma and p(sigma>sigma_max)
sigma_boost <- function(data, i) {
sd(data[i] / mean(data[i]))
}
sigma <-
boot(data = dis_data$polygon_data$response,
statistic = sigma_boost,
10000)
prior_sigma <- sigma$t0
p_sigma <- sum(sigma$t >= sigma$t0) / length(sigma$t)
default_priors <-
list(
priormean_intercept = 0,
priorsd_intercept = 4,
priormean_slope = 0,
priorsd_slope = 2,
prior_rho_min = prior_rho,
prior_rho_prob = p_rho,
prior_sigma_max = prior_sigma,
prior_sigma_prob = p_sigma,
prior_iideffect_sd_max = 0.1,
prior_iideffect_sd_prob = 0.01
)
fitted_model <- disag_model(
data = dis_data,
iterations = 1000,
family = "binomial",
link = "logit",
# priors = default_priors,
field = TRUE,
iid = TRUE,
silent = TRUE
)
I was able to run the disag_model function using your dis_data object. There were no errors or crashes. I ran the following lines.
fitted_model <- disag_model(
data = dis_data,
iterations = 1000,
family = "binomial",
link = "logit",
field = TRUE,
iid = TRUE,
silent = TRUE
)
I am running on a Windows machine with 64GB RAM and 8 cores. It took over an hour and used all of my RAM for a while and up to 50% of my CPU, which is not surprising as you are fitting 5.5M pixels over the whole world. Therefore, I suspect it is related to your computer running out of resources. I suggest you try a smaller example to test it out first. Try fewer polygons and fewer pixels in each polygon.

R: incorporating fisher.test into Hmisc's summaryM leads to error

catTestfisher <-
function (tab)
{
st <- if (!is.matrix(tab) || nrow(tab) < 2 | ncol(tab) <
2)
list(p.value = NA, statistic = NA, parameter = NA)
else {
rowcounts <- tab %*% rep(1, ncol(tab))
tab <- tab[rowcounts > 0, ]
if (!is.matrix(tab))
list(p.value = NA, statistic = NA, parameter = NA)
else fisher.test(tab)
}
list(P = st$p.value, stat = "", df = "",
testname = "Fisher's Exact", statname = "", latexstat = "", namefun = "",
plotmathstat = "")
}
I wanted to use library(Hmisc)'s summaryM function but with Fisher's exact test, so I wrote a catTestfisher function and set catTest = catTestfisher in my own summaryM2 function, which is exactly the same as summaryM, except for catTest = catTestfisher
summaryM2 <-
function (formula, groups = NULL, data = NULL, subset, na.action = na.retain,
overall = FALSE, continuous = 10, na.include = FALSE, quant = c(0.025,
0.05, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 0.95,
0.975), nmin = 100, test = FALSE, conTest = conTestkw,
catTest = catTestfisher, ordTest = ordTestpo)
{
marg <- length(data) && ".marginal." %in% names(data)
if (marg)
formula <- update(formula, . ~ . + .marginal.)
formula <- Formula(formula)
Y <- if (!missing(subset) && length(subset))
model.frame(formula, data = data, subset = subset, na.action = na.action)
else model.frame(formula, data = data, na.action = na.action)
X <- model.part(formula, data = Y, rhs = 1)
Y <- model.part(formula, data = Y, lhs = 1)
getlab <- function(x, default) {
lab <- attr(x, "label")
if (!length(lab) || lab == "")
default
else lab
}
if (marg) {
xm <- X$.marginal.
X$.marginal. <- NULL
}
else xm <- rep("", nrow(X))
if (length(X)) {
xname <- names(X)
if (length(xname) == 1 && !length(groups))
groups <- xname
if (!length(groups) && length(xname) > 1) {
warnings("Must specify groups when > 1 right hand side variable is present.\ngroups taken as first right hand variable.")
groups <- xname[1]
}
svar <- if (length(xname) == 1)
factor(rep(".ALL.", nrow(X)))
else do.call("interaction", list(X[setdiff(xname, groups)],
sep = " "))
group <- X[[groups]]
glabel <- getlab(group, groups)
}
else {
svar <- factor(rep(".ALL.", nrow(Y)))
group <- rep("", nrow(Y))
groups <- group.freq <- NULL
glabel <- ""
}
quants <- unique(c(quant, 0.025, 0.05, 0.125, 0.25, 0.375,
0.5, 0.625, 0.75, 0.875, 0.95, 0.975))
nv <- ncol(Y)
nameY <- names(Y)
R <- list()
for (strat in levels(svar)) {
instrat <- svar == strat
n <- integer(nv)
type <- n
comp <- dat <- vector("list", nv)
names(comp) <- names(dat) <- nameY
labels <- Units <- vector("character", nv)
if (test) {
testresults <- vector("list", nv)
names(testresults) <- names(comp)
}
gr <- group[instrat]
xms <- xm[instrat]
if (all(xms != ""))
xms <- rep("", length(xms))
group.freq <- table(gr)
group.freq <- group.freq[group.freq > 0]
if (overall)
group.freq <- c(group.freq, Combined = sum(group.freq))
for (i in 1:nv) {
w <- Y[instrat, i]
if (length(attr(w, "label")))
labels[i] <- attr(w, "label")
if (length(attr(w, "units")))
Units[i] <- attr(w, "units")
if (!inherits(w, "mChoice")) {
if (!is.factor(w) && !is.logical(w) && length(unique(w[!is.na(w)])) <
continuous)
w <- as.factor(w)
s <- !is.na(w)
if (na.include && !all(s) && length(levels(w))) {
w <- na.include(w)
levels(w)[is.na(levels(w))] <- "NA"
s <- rep(TRUE, length(s))
}
n[i] <- sum(s & xms == "")
w <- w[s]
g <- gr[s, drop = TRUE]
if (is.factor(w) || is.logical(w)) {
tab <- table(w, g)
if (test) {
if (is.ordered(w))
testresults[[i]] <- ordTest(g, w)
else testresults[[i]] <- catTest(tab)
}
if (nrow(tab) == 1) {
b <- casefold(dimnames(tab)[[1]], upper = TRUE)
pres <- c("1", "Y", "YES", "PRESENT")
abse <- c("0", "N", "NO", "ABSENT")
jj <- match(b, pres, nomatch = 0)
if (jj > 0)
bc <- abse[jj]
else {
jj <- match(b, abse, nomatch = 0)
if (jj > 0)
bc <- pres[jj]
}
if (jj) {
tab <- rbind(tab, rep(0, ncol(tab)))
dimnames(tab)[[1]][2] <- bc
}
}
if (overall)
tab <- cbind(tab, Combined = apply(tab, 1,
sum))
comp[[i]] <- tab
type[i] <- 1
}
else {
sfn <- function(x, quant) {
o <- options(digits = 10)
on.exit(options(o))
c(quantile(x, quant), Mean = mean(x), SD = sqrt(var(x)),
N = sum(!is.na(x)))
}
qu <- tapply(w, g, sfn, simplify = TRUE, quants)
if (test)
testresults[[i]] <- conTest(g, w)
if (overall)
qu$Combined <- sfn(w, quants)
comp[[i]] <- matrix(unlist(qu), ncol = length(quants) +
3, byrow = TRUE, dimnames = list(names(qu),
c(format(quants), "Mean", "SD", "N")))
if (any(group.freq <= nmin))
dat[[i]] <- lapply(split(w, g), nmin = nmin,
function(x, nmin) if (length(x) <= nmin)
x
else NULL)
type[i] <- 2
}
}
else {
w <- as.numeric(w) == 1
n[i] <- sum(!is.na(apply(w, 1, sum)) & xms ==
"")
g <- as.factor(gr)
ncat <- ncol(w)
tab <- matrix(NA, nrow = ncat, ncol = length(levels(g)),
dimnames = list(dimnames(w)[[2]], levels(g)))
if (test) {
pval <- numeric(ncat)
names(pval) <- dimnames(w)[[2]]
d.f. <- stat <- pval
}
for (j in 1:ncat) {
tab[j, ] <- tapply(w[, j], g, sum, simplify = TRUE,
na.rm = TRUE)
if (test) {
tabj <- rbind(table(g) - tab[j, ], tab[j,
])
st <- catTest(tabj)
pval[j] <- st$P
stat[j] <- st$stat
d.f.[j] <- st$df
}
}
if (test)
testresults[[i]] <- list(P = pval, stat = stat,
df = d.f., testname = st$testname, statname = st$statname,
latexstat = st$latexstat, plotmathstat = st$plotmathstat)
if (overall)
tab <- cbind(tab, Combined = apply(tab, 1,
sum))
comp[[i]] <- tab
type[i] <- 3
}
}
labels <- ifelse(nchar(labels), labels, names(comp))
R[[strat]] <- list(stats = comp, type = type, group.freq = group.freq,
labels = labels, units = Units, quant = quant, data = dat,
N = sum(!is.na(gr) & xms == ""), n = n, testresults = if (test) testresults)
}
structure(list(results = R, group.name = groups, group.label = glabel,
call = call, formula = formula), class = "summaryM")
}
After trying to test it on the following data, I get a warning and an error:
library(Hmisc)
set.seed(173)
sex <- factor(sample(c("m","f"), 500, rep=TRUE))
treatment <- factor(sample(c("Drug","Placebo"), 500, rep=TRUE))
> summaryM2(sex ~ treatment, test=TRUE, overall = TRUE)
Error in round(teststat, 2) :
non-numeric argument to mathematical function
I tried stepping through the summaryM2 function line by line, but could not figure out what's causing the problem.
In your catTestfisher function, the output variables stat (test statistic) and df (degrees of freedom) should be numeric variables not empty strings. In the programming stat is coverted to teststat for rounding before being outputted (hence the error message for round("", 2) is non-numeric argument to mathematical function). See lines 1718 to 1721 in the summary.formula code) .
You can set df = NULL but a value is required for stat (not NA or NULL) otherwise no output is returned. You can get around the problem by setting stat = 0 (or any other number), and then only displaying the p value using prtest = "P".
catTestfisher2 <- function (tab)
{
st <- fisher.test(tab)
list(P = st$p.value, stat = 0, df = NULL,
testname = st$method, statname = "", latexstat = "", namefun = "",
plotmathstat = "")
}
output <- summaryM(sex ~ treatment, test=TRUE, overall = TRUE, catTest = catTestfisher2)
print(output, prtest = "P")
Descriptive Statistics (N=500)
+-------+-----------+-----------+-----------+-------+
| |Drug |Placebo |Combined |P-value|
| |(N=257) |(N=243) |(N=500) | |
+-------+-----------+-----------+-----------+-------+
|sex : m|0.52 (133)|0.52 (126)|0.52 (259)| 1 |
+-------+-----------+-----------+-----------+-------+
Note there is no need to define your own summaryM2 function. Just use catTest = to pass in your function.

Loop or batch process through a list of CSV files in R

I would like to loop through a list of CSV files:
Macro <- read.csv("P:/R/R_Input/JWN_Input.csv")
Macro <- read.csv("P:/R/R_Input/BBY_Input.csv")
...
That also output to a respective CSV file:
write.csv(a, "P:/Model_Output/JWN.csv", row.names = F, na="")
write.csv(a, "P:/Model_Output/BBY.csv", row.names = F, na="")
...
The above two items are the only unique input/out. The body of the code is below. I am attempting to essentially batch process the input/output CSV files using the body of code below.
Macro <- read.csv("P:/R/R_Input/JWN_Input.csv")
# train set up
ctrl <- caret::trainControl(method = "timeslice", initialWindow = 8, horizon = 1,
fixedWindow = FALSE, savePredictions = TRUE)
# Loads all variable names from Macro and Macro2
vars_macro = names(Macro)[!names(Macro) %in% c("qtrs", "y", "s1", "s2", "s3")]
vars_macro2 = names(Macro2)[!names(Macro2) %in% c("y", "s1", "s2", "s3")]
vars_macro3 = names(Macro3)[!names(Macro3) %in% c("y", "s1", "s2", "s3")]
vars = c(vars_macro, vars_macro2, vars_macro3)
# run lm
lst = foreach(var = vars) %dopar% {
if (var %in% vars_macro)
foo <- function(start, mod_formula) {
myfit <- caret::train(mod_formula, data = Macro[start:14, ,drop = FALSE],
method = "lm", trControl = ctrl)
c(myfit$pred) ## return; drop dimension as a vector
}
if (var %in% vars_macro2)
foo <- function(start, mod_formula) {
myfit <- caret::train(mod_formula, data = Macro2[start:14, ,drop = FALSE],
method = "lm", trControl = ctrl)
c(myfit$pred) ## return; drop dimension as a vector
}
if (var %in% vars_macro3)
foo <- function(start, mod_formula) {
myfit <- caret::train(mod_formula, data = Macro3[start:14, ,drop = FALSE],
method = "lm", trControl = ctrl)
c(myfit$pred) ## return; drop dimension as a vector
}
f = formula(paste0("y ~ ", var, "+ s1 + s2 + s3"))
Forecast <- sapply(1:6, foo, mod_formula = f)
F9 <- c(Forecast[[1,1]][1])
F10 <- c(Forecast[[1,1]][2], Forecast[[1,2]][1])
F11 <- c(Forecast[[1,1]][3], Forecast[[1,2]][2], Forecast[[1,3]][1])
F12 <- c(Forecast[[1,1]][4], Forecast[[1,2]][3], Forecast[[1,3]][2],
Forecast[[1,4]][1])
F13 <- c(Forecast[[1,1]][5], Forecast[[1,2]][4], Forecast[[1,3]][3],
Forecast[[1,4]][2], Forecast[[1,5]][1])
F14 <- c(Forecast[[1,1]][6], Forecast[[1,2]][5], Forecast[[1,3]][4],
Forecast[[1,4]][3], Forecast[[1,5]][2], Forecast[[1,6]][1])
A <-c((mean(F9)/Macro[9:9,2:2]-1), (mean(F10)/Macro[10:10,2:2]-1),
(mean(F11)/Macro[11:11,2:2]-1), (mean(F12)/Macro[12:12,2:2]-1),
(mean(F13)/Macro[13:13,2:2]-1),(mean(F14)/Macro[14:14,2:2]-1))
Temp <- mean(abs(A[0:5]))
P <-c((mean(F9)/Macro[9:9,2:2]-1), (mean(F10)/Macro[10:10,2:2]-1),
(mean(F11)/Macro[11:11,2:2]-1), (mean(F12)/Macro[12:12,2:2]-1),
(mean(F13)/Macro[13:13,2:2]-1),(mean(F14)/Macro[14:14,2:2]-1),
Temp,(mean(F14)/(1+mean(A[3:5])))/Macro[14:14,2:2]-1)
#E <- scales::percent(P)
C <- c(mean(F9),mean(F10),mean(F11), mean(F12), mean(F13), mean(F14),
"abs error",mean(F14)/(1+mean(P[3:5])))
data.frame(C, P)
}
# Summary
model_error = as.character(sapply(lst, function(elt) elt$P[7]))
forecasts = as.numeric(as.character(sapply(lst, function(elt) elt$C[8])))
delta = as.character(sapply(lst, function(elt) elt$P[8]))
df = data.frame(Card = vars, Model_Avg_Error = model_error,
Forecast = forecasts, Delta = delta)
df$blankVar = NA
df_macro1 = df[df$Card %in% vars_macro,]
df_macro1$blankVar = NA
df_macro2 = df[df$Card %in% vars_macro2,]
df_macro2 = df_macro2[order(df_macro2$Model_Avg_Error),]
df_macro2$blankVar = NA
df_macro3 = df[df$Card %in% vars_macro3,]
df_macro3 = df_macro3[order(df_macro3$Model_Avg_Error),]
df_macro3$blankVar = NA
df_macro4 = df[df$Card %in% names(Macro4),]
df_macro4 = df_macro4[order(df_macro4$Model_Avg_Error),]
df = df[order(df$Model_Avg_Error),]
a = cbind.fill(df_macro1, df_macro2, df_macro3, df, df_macro4)
# save
write.csv(a, "P:/Model_Output/JWN.csv", row.names = F, na="")
Just create a list of the filenames and loop through them like this:
Files = c("JWN", "BBY")
for(f in Files) {
InFile = paste("P:/R/R_Input/", f, "_Input.csv", sep="")
OutFile = paste("P:/R/R_Input/", f, "_Output.csv", sep="")
Macro <- read.csv(InFile)
## All of that other code
write.csv(a, OutFile, row.names = F, na="")
}
Addendum based on comments:
Original Poster got an error:
Error in file(con, "w") : all connections are in use"
This was addressed by adding closeAllConnections() immediately after the write.csv statement.

Resources