Error: Passing a dataframe through quantile function - r

Having trouble. What I have is:
dec_amt <- function(x, k) format(round(x, k), nsmall=k) # Formatting decimal places
example.df <- data.frame(replicate(8,sample(0:100000,1000,rep=TRUE)))
names(example.df) <- c("AF", "CD", "CS", "ED", "LP", "PI", "RR", "TD")
probTab_test2 <- function(x = c(...), y=c(...), z=c(...))
{
m.TABLE <- list()
EXP <- list()
PROB <- c(seq(.10, .90, .10), seq(.91,.99,.01), seq(.995, .999, .001))
PERIOD <- dec_amt(1/(1-PROB), 2)
for (i in 1:(length(z))) {
if (length(z) == 1)
{
break
}
EXP <- quantile(example.df[,z[i]], PROB)
EXP <- formatC(EXP, format='d', big.mark=',')
m.TABLE <- list(data.frame(PERIOD, EXP))
print(m.TABLE)
}
EXP <- quantile(example.df[,z], PROB)
EXP <- formatC(EXP, format='d', big.mark=',')
TABLE <- data.frame(PERIOD, EXP)
return(TABLE)
}
probTab_test2(c("Consumer Products"), c("All Revenues"),c("TD", "LP"))
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
What I want is, if the length of the argument z is > 1 then for every element of z, I want it to create an 'EXP' column that I can bind into a dataframe (defined as m.TABLE) so at the end I would have a list of 'z' number of dataframes.
I feel like the quantile function is not happy about me passing through a dataframe instead of a vector, but not sure how to get around that in this loop. Suggestions would be great, happy to provide further information.
Note - feel free to disregard my x, y arguments - those will be used to call an outside function, but not noteworthy for this issue.

You can make use of the lapply function to loop over the variables listed in z and simplify the code:
probTab_test2 <- function(x = c(...), y=c(...), z=c(...)) {
PROB <- c(seq(.10, .90, .10), seq(.91,.99,.01), seq(.995, .999, .001))
PERIOD <- dec_amt(1/(1-PROB), 2)
m.TABLE <- lapply(as.data.frame(example.df[, z]), function (vector) {
quantiles <- quantile(vector, PROB)
formatted.quantiles <- formatC(EXP, quantiles, format='d', big.mark=',')
return(data.frame(PROB, formatted.quantiles))
})
return(m.TABLE)
}
For each variable listed in z, this function calculates the quantiles and creates a data frame for each variable listed in z. The as.data.frame is necessary so that lapply works even when length(z)==1.

Related

Repeat analysis for several datasets in R

How can I repeat this code for each subject (xxx), so that the results are added to the data.frame (centralities)?
fullDataDetrend_xxx <- subset(fullDataDetrend, subjno == xxx, select=c(subjno,depressed,sad,tired,interest,happy,neg_thoughts,concentration_probl,ruminating,activity,datevar,timestamp,dayno,beepno))
model_xxx <- var1(
fullDataDetrend_xxx)
model_xxx_omega <- getmatrix(model_xxx, "omega_zeta")
centrality_model_xxx_omega <- centrality(model_xxx_omega )
centralities[nrow(centralities) + 1,] <- c("xxx",centrality_model_xxx_omega$InExpectedInfluence)
Did as suggested:
fullDataDetrend_split <- split(fulldataDetrend, fulldataDetrend$subjno)
then, to estimate network, pull centrality estimates, and write to centralities in global environment:
analyze_one <- function(dataframe){
network_model <- var1(
dataframe,
vars = useVars,
contemporaneous = "ggm",
dayvar = "dayno",
beepvar = "beepno",
estimator = "FIML",
verbose = TRUE,
omega_zeta = "full")
model_omega <- getmatrix(network_model, "omega_zeta")
centrality_omega<- centrality(model_omega)
model_beta <- getmatrix(network_model, "beta")
centrality_beta<- centrality(model_beta)
subjno <- as.list(dataframe[1,2])
centralities[nrow(centralities) + 1,] <- c(subjno,centrality_omega$InExpectedInfluence,centrality_beta$InExpectedInfluence,centrality_beta$OutExpectedInfluence)
assign('centralities',centralities, envir=.GlobalEnv)
}
then rerun the code with lapply for all dataframes (with ignoring errors):
lapply_with_error <- function(X,FUN,...){
lapply(X, function(x, ...) tryCatch(FUN(x, ...),
error=function(e) NULL))
}
lapply_with_error(fullDataDetrend_split, FUN = analyze_one)

Making custom functions in R involving summation

I am a novice in R asked to compute for a descriptive statistic called dominance (D; expressed as a percentage). D is basically defined as the mean abundance (MA) value of x divided by the sum of MA values of x to i. MA meanwhile is defined as the sum of all values in a vector over the length of the said vector. Here is how I normally approach things:
#Example data
x <- c(1, 2, 3)
y <- c(4, 5, 6)
z <- c(7, 8, 9)
#Mean abundance function
mean.abundance <- function(x){
N_sum <- sum(x)
N_count <- length(x)
N_sum/N_count
}
#Percent dominance function (workaround)
percent.dominance <- function(x, ...){
MA_a <- (x)
sum_MA_i <- sum(x, ...)
(MA_a/sum_MA_i)*100
}
MA_x <- mean.abundance(x)
MA_y <- mean.abundance(y)
MA_z <- mean.abundance(z)
MA <- c(MA_x, MA_y, MA_z)
MA
D_x <- percent.dominance(MA_x, MA_y, MA_z)
D_y <- percent.dominance(MA_y, MA_x, MA_z)
D_z <- percent.dominance(MA_z, MA_x, MA_y)
D <- c(D_x, D_y, D_z)
D
That approach alone already gives me the %D values I am looking for. My problem is that my (perfectionist) PI is asking me to compute for the %D values directly using vectors x, y, and z (and not stepwise by means of calculating MA values then using vectors MA_x, MA_y, and MA_z to calculate for %D). I am stumped making a custom function for %D that involves vectors containing raw data; here is a failed attempt to revise said custom function, just to give a general idea.
#Percent dominance function (incorrect)
percent.dominance <- function(x, ...){
MA_a <- sum(x)/length(x)
sum_MA_i <- sum(x, ...)/length(x, ...)
(MA_a/sum_MA_i)*100
}
You can capture the optional data passed with list(...) and make the following changes to the function -
percent.dominance <- function(x, ...){
data <- list(...)
MA_a <- sum(x)/length(x)
sum_MA_i <- sum(x, unlist(data))/(length(data) + 1)
(MA_a/sum_MA_i)*100
}
percent.dominance(x, y, z)
#[1] 13.33333
percent.dominance(y, x, z)
#[1] 33.33333
percent.dominance(z, x, y)
#[1] 53.33333

Error in if ((dimension < 1) | (dimension > n)) stop("wrong embedding dimension") : argument is of length zero

my code is like the following:
unemp <- c(1:10)
bsp_li <- list(c(1:10),c(11:20),c(21:30))
var_data_rep <- lapply(bsp_li, function(x) {cbind(as.numeric(x), as.numeric(unemp))} ) # Create VAR data matrices
var_data_rep2 <- lapply(var_data_rep, function(x) {colnames(x) = c("rGDP", "U"); return(x)}) # Name columns
var_data_rep_ts <- lapply(var_data_rep2, function(x) {ts(x, frequency=1, start=c(1977))} ) # Make it ts again
var_data_rep_lag <- lapply(var_data_rep_ts, function(x) {VARselect(x, lag.max = 5, type = "const")} ) # Take lag with lowest SC criteria (VAR.pdf)
VARgdp_rep <- lapply(var_data_rep_ts, function(x) {VAR(x, p = var_data_rep_lag$x$selection[['SC(n)']], type = "const"); return(x)} ) # Lag=lowest SC criteria from var_data_rep_lag
if i run only the last line r always gives me the error:
Error in if ((dimension < 1) | (dimension > n)) stop("wrong embedding dimension") :
argument is of length zero
Called from: embed(y, dimension = p + 1)
But if im running it with Source then it seems to work.. any suggestions?
This seems to work (at least no error is thrown) :
VARgdp_rep <-
lapply(index(var_data_rep_ts),
function(x) {
res <- VAR(var_data_rep_ts[[x]], p =
var_data_rep_lag[[x]]$selection[['SC(n)']], type = "const");
return(res)
}
)
In you code, return(x) is strange because after doing VAR calculations .. you just return the x withc was pass to the function.
And $x seems to have no meaning here.

Multiple loop Syntax Error

I cannot figure out what's going wrong with my loop and it is already too complicated for my current level. I have already tried applybut obviously I do something wrong, so I didn't use it at all.
library('wavelets')
library('benford.analysis')
indeces <- ls() # my initial datasets
wfilters <- array(c("haar","la8","d4","c6")) # filter option in "modwt" function
wfiltname <- array(c("h","l","d","c")) # to rename the new objects
for (i in 1:nrow(as.array(indeces))) {
x <- get(as.matrix(indeces[i]))
x <- x[,2]
# Creates modwt objects equal to the number of filters
for (j in 1:nrow(as.array(wfilters))) {
x <- wavelets::modwt(x, filter = wfilters[j], n.levels = 4,
boundary = "periodic")
# A loop that creates a matrix with benford fun output per modwt n.levels option
for (l in 1:4) {
x <- as.matrix(x#W$W[l]) # n.levels are represented as x#W$W1, x#W$W2,...
x <- benford.analysis::benford(x, number.of.digits = 1,
sign = "both", discrete = T,
round = 3) # accepts matrices
x[,l] <- x$bfd$data.dist # it always has 9 elements
}
assign(paste0("b", wfiltname[j], indeces[i]), x)
}
}
The above loop should be reproducible with any data (where the values are in second column). The error I get is the following:
Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of a vector type, was 'NULL'
Thanks to #Cath and #jogo I made it work after some improvements. Here's the correct code:
temp <- list.files(path = "...")
list2env(
lapply(setNames(temp, make.names(gsub("*.csv$", "", temp))),
read.csv), envir = .GlobalEnv)
rm(temp)
indeces <- ls()
wfilters <- array(c("haar","la8","d4","c6"))
wfiltname <- array(c("h","l","d","c"))
k <- data.frame(matrix(nrow = 9,ncol = 4))
nlvl <- 4
for (i in 1:length(indeces)) {
x <- as.matrix(get(indeces[i]))
for (j in 1:length(wfilters)) {
y <- wavelets::modwt(as.matrix(x), filter = wfilters[j], n.levels = nlvl,
boundary = "periodic")
y <- as.matrix(y#W)
for(m in 1:nlvl) {
z <- as.matrix(y[[m]])
z <- benford.analysis::benford(z, number.of.digits = 1, sign = "both", discrete = TRUE, round = 16)
k[m] <- as.data.frame(z$bfd$data.dist)
colnames(k)[m] <- paste0(wfilters[j], "W", m)
}
assign(paste0(indeces[i], wfiltname[j]), k)
}
}
rm(x,y,z,i,j,m,k)
I would appreciate if there is a way to write it more efficiently. Thank you very much

Autocorrelation plot for only negative values

I would like to do an acf plot in R for only the negative values of a time series. I cannot do this by just subsetting the data for only negative values beforehand, because then the autocorrelation will remove arbitrary number of positive days in between the negative values and be unreasonably high, but rather, I would like to run the autocorrelation on the whole time series and then filter out the results given the first day is negative.
For example, in theory, I could make a data frame with the original series and all of the lagged time series in a data frame, then filter for the negative values in the original series, and then plot the correlations. However, I would like to automate this using the acf plot.
Here is an example of my time series:
> dput(exampleSeries)
c(0, 0, -0.000687, -0.004489, -0.005688, 0.000801, 0.005601,
0.004546, 0.003451, -0.000836, -0.002796, 0.005581, -0.003247,
-0.002416, 0.00122, 0.005337, -0.000195, -0.004255, -0.003097,
0.000751, -0.002037, 0.00837, -0.003965, -0.001786, 0.008497,
0.000693, 0.000824, 0.005681, 0.002274, 0.000773, 0.001141, 0.000652,
0.001559, -0.006201, 0.000479, -0.002041, 0.002757, -0.000736,
-2.1e-05, 0.000904, -0.000319, -0.000227, -0.006589, 0.000998,
0.00171, 0.000271, -0.004121, -0.002788, -9e-04, 0.001639, 0.004245,
-0.00267, -0.004738, 0.001192, 0.002175, 0.004666, 0.006005,
0.001218, -0.003188, -0.004363, 0.000462, -0.002241, -0.004806,
0.000463, 0.000795, -0.005715, 0.004635, -0.004286, -0.008908,
-0.001044, -0.000842, -0.00445, -0.006094, -0.001846, 0.005013,
-0.006599, 0.001914, 0.00221, 6.2e-05, -0.001391, 0.004369, -0.005739,
-0.003467, -0.002103, -0.000882, 0.001483, 0.003074, 0.00165,
-0.00035, -0.000573, -0.00316, -0.00102, -0.00144, 0.003421,
0.005436, 0.001994, 0.00619, 0.005319, 7.3e-05, 0.004513)
I tried to implement your description.
correl <- function(x, lag.max = 10){
library(dplyr)
m <- matrix(ncol = lag.max, nrow = length(x))
for(i in 1:lag.max){
m[,i] <- lag(x, i)
}
m <- m[x<0,]
res <- apply(m, 2, function(y) cor(y, x[x<0], use = "complete.obs"))
barplot(res)
}
correl(exampleSeries)
Maybe just write your own function? Something like:
negativeACF <- function(x, num.lags = 10)
{
n <- length(x)
acfs <- sapply(0:num.lags, function(i) cor(x[-i:-1], x[(-n-1+i):-n]))
names(acfs) <- 0:num.lags
acfs[acfs < 0]
}
results <- negativeACF(exampleSeries, num.lags=20)
barplot(results)
Yea I ended up writing my own functions and just replacing the values in the R acf object with my own values that are just the correlations. So:
genACF <- function(series, my.acf, lag.max = NULL, neg){
x <- na.fail(as.ts(series))
x.freq <- frequency(x)
x <- as.matrix(x)
if (!is.numeric(x))
stop("'x' must be numeric")
sampleT <- as.integer(nrow(x))
nser <- as.integer(ncol(x))
if (is.null(lag.max))
lag.max <- floor(10 * (log10(sampleT) - log10(nser)))
lag.max <- as.integer(min(lag.max, sampleT - 1L))
if (is.na(lag.max) || lag.max < 0)
stop("'lag.max' must be at least 0")
if(neg){
indices <- which(series < 0)
}else{
indices <- which(series > 0)
}
series <- scale(series, scale = FALSE)
series.zoo <- zoo(series)
for(i in 0:lag.max){
lag.series <- lag(series.zoo, k = -i, na.pad = TRUE)
temp.corr <- cor(series.zoo[indices], lag.series[indices], use = 'complete.obs', method = 'pearson')
my.acf[i+1] <- temp.corr
}
my.acf[1] <- 0
return(my.acf)
}
plotMyACF <- function(series, main, type = 'correlation', neg = TRUE){
series.acf <- acf(series, plot = FALSE)
my.acf <- genACF(series, series.acf$acf, neg = neg)
series.acf$acf <- my.acf
plot(series.acf, xlim = c(1, dim(series.acf$acf)[1] - (type == 'correlation')), xaxt = "n", main = main)
if (dim(series.acf$acf)[1] < 25){
axis(1, at = 1:(dim(series.acf$acf)[1] - 1))
}else{
axis(1)
}
}
And I get something like this:

Resources