Time series forecasting using R - r

I have a problem forecasting this non stationery data(https://drive.google.com/file/d/14o5hHe8zxR0onRWq0mZNcYqI101O0Dkw/view?usp=sharing) using Auto Arima.
Please review my code.
# Read Data
r = read.csv('../Amazon/Amazon1.csv', header = TRUE, stringsAsFactors = FALSE)
# Time Series construction
ts = ts(t(r[,1:25]), frequency = 12, start = c(2016,01) )
# plotting Time series
ts.plot(ts[,1:2],type = 'b', xlab = 'Monthly Cycle', ylab = 'Number of Sales', main = "(TIME SERIES) Amazon Sales Cycle of multiple products for 24 months",col=c(rep("black",1),rep("red",2)))
legend("topleft",cex=.65,legend = ts[1,1:2], col = 1:ncol(ts), lty = 1)
set1 = ts[2:20,1]
#set2 = ts[15:20,1]
set1 = as.numeric(set1)
#set2 = as.numeric(set2)
# Building Forecasting models
mf = meanf(set1,h=4,level=c(90,95),fan=FALSE,lambda=NULL)
plot(mf)
mn = naive(set1,h=4,level=c(90,95),fan=FALSE,lambda=NULL)
plot(mn)
md = rwf(set1,h=4,drift=T,level=c(90,95),fan=FALSE,lambda=NULL)
plot(md)
# Checking Accuracy
accuracy(mf)
accuracy(mn)
accuracy(md)
# Identifying Stationarity/Non-Stationarity(unit Root testing)
adf = adf.test(set1)
adf
kpss = kpss.test(set1)
kpss
ndiffs(set1)
diff_data = diff(set1)
adf.test(diff_data) # Rerunning unit test on differenced data
# Identifying Seasonality/Trend
Stl = stl(set1,s.window='periodic')
# ARIMA modelling
ar_set1 = forecast::auto.arima(diff(diff_data), approximation=FALSE,trace=FALSE, stationary = TRUE)
forecast(ar_set1, h = 5)
ar_set2 = forecast::ets(diff(diff_data))
ar_set3 = forecast::nnetar(diff(diff_data), approximation=FALSE,trace=FALSE, stationary = TRUE)
# Prediction
predict(ar_set1, n.ahead = 5,se.fit = TRUE)
predict(ar_set2, n.ahead = 5,se.fit = TRUE)
predict(ar_set3, n.ahead = 5,se.fit = TRUE)
plot(forecast(ar_set1,h=6))
points(1:length(diff(diff_data)),fitted(ar_set1),type="l",col="green")
plot(forecast(ar_set2,h=6))
points(1:length(diff(diff_data)),fitted(ar_set2),type="l",col="green")
plot(forecast(ar_set3,h=6))
points(1:length(diff(diff_data)),fitted(ar_set3),type="l",col="green")
accuracy(ar_set1, test = diff_data)
accuracy(ar_set2, test = diff_data)
accuracy(ar_set3, test = diff_data)
I'm unable to get the forecast values using either of those 3 methods. Where am i going wrong?

Related

A question about the incorporation of clusters in the `rake` function

my question regards an example survey that includes a first-stage cluster and the application of survey::rake.
library(survey)
library(tidyr)
library(dplyr)
set.seed(111)
mood = sample(c("happy","neutral","grumpy"),
size = 1000,
replace=TRUE,
c(0.3,0.3,0.4))
set.seed(222)
sex = sample(c("female","male"),
size=1000,
replace=TRUE,
c(0.6,0.4))
set.seed(333)
age_group = sample(c("young","middle","senior"),
size=1000,
replace=TRUE,
c(0.2,0.6,0.2))
status = data.frame(mood=mood,
sex=sex,
age_group=age_group,
income = trunc(runif(1000,1000,2000)),
dnum = rep(c(441,512,39,99,61),each = 200),
fpc1 = rep(200,1000))
# The source data is :
source = data.frame(mood = rep(c("happy","neutral","grumpy"),6),
sex = rep(rep(c("female","male"),each=3),3),
age_group = rep(c("young","middle","senior"),each=6),
Freq = c(0.0750,0.1250,0.0500,0.0750,0.1250,0.0500,0.0600,
0.1000,0.0400,0.0600,0.1000,0.0400,0.0165,
0.0275,0.0110,0.0135,0.0225,0.0090) )
# We created the total population distribution tables as follows :
targets1 = data.frame(mood = c("happy","neutral","grumpy"),
Freq = nrow(status)*c(sum(source$Freq[source$mood=="happy"]),
sum(source$Freq[source$mood=="neutral"]),
sum(source$Freq[source$mood=="grumpy"])))
targets1
targets2 = data.frame(sex = c("male","female"),
Freq = nrow(status)*c(sum(source$Freq[source$sex=="male"]),
sum(source$Freq[source$sex=="female"])) )
targets2
targets3 = data.frame(age_group = c("young","middle","senior"),
Freq = nrow(status)*c(sum(source$Freq[source$age_group=="young"]),
sum(source$Freq[source$age_group=="middle"]),
sum(source$Freq[source$age_group=="senior"])))
targets3
# and now we set the design unweighted and without the cluster :
unweighted = svydesign(id=~1,data=status)
# we apply the raking method :
status_raked = rake(design = unweighted,
sample.margins = list(~mood,~sex,~age_group),
population.margins = list(targets1,targets2,targets3))
# and we extract the weights (weights1) :
status$weights1 = weights(status_raked)
# now we shall set the design considering the cluster :
unweighted_clustered = svydesign(id=~dnum,
data=status,
fpc=~fpc1)
# we use the raking method :
status_clustered_raked = rake(design = unweighted_clustered,
sample.margins = list(~mood,~sex,~age_group),
population.margins = list(targets1,targets2,targets3))
# and we extract the weights as before (weights2) :
status$weights2 = weights(status_clustered_raked)
so my question is : why the weights1 and weights2 are identical eventhough we incorporated the clusters in it? Am I missing something when applying the raking method for the clustered design ?
Would appreciate your help. Thanks.

uwot is throwing an error running the Monocle3 R package's "find_gene_module()" function, likely as an issue with how my data is formatted

I am trying to run the Monocle3 function find_gene_modules() on a cell_data_set (cds) but am getting a variety of errors in this. I have not had any other issues before this. I am working with an imported Seurat object. My first error came back stating that the number of rows were not the same between my cds and cds#preprocess_aux$gene_loadings values. I took a look and it seems my gene loadings were a list under cds#preprocess_aux#listData$gene_loadings. I then ran the following code to make a dataframe version of the gene loadings:
test <- seurat#assays$RNA#counts#Dimnames[[1]]
test <- as.data.frame(test)
cds#preprocess_aux$gene_loadings <- test
rownames(cds#preprocess_aux$gene_loadings) <- cds#preprocess_aux$gene_loadings[,1]
Which created a cds#preprocess_aux$gene_loadings dataframe with the same number of rows and row names as my cds. This resolved my original error but now led to a new error being thrown from uwot as:
15:34:02 UMAP embedding parameters a = 1.577 b = 0.8951
Error in uwot(X = X, n_neighbors = n_neighbors, n_components = n_components, :
No numeric columns found
Running traceback() produces the following information.
> traceback()
4: stop("No numeric columns found")
3: uwot(X = X, n_neighbors = n_neighbors, n_components = n_components,
metric = metric, n_epochs = n_epochs, alpha = learning_rate,
scale = scale, init = init, init_sdev = init_sdev, spread = spread,
min_dist = min_dist, set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity, bandwidth = bandwidth,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
a = a, b = b, nn_method = nn_method, n_trees = n_trees, search_k = search_k,
method = "umap", approx_pow = approx_pow, n_threads = n_threads,
n_sgd_threads = n_sgd_threads, grain_size = grain_size, y = y,
target_n_neighbors = target_n_neighbors, target_weight = target_weight,
target_metric = target_metric, pca = pca, pca_center = pca_center,
pca_method = pca_method, pcg_rand = pcg_rand, fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra, ret_nn = ret_nn ||
"nn" %in% ret_extra, ret_fgraph = "fgraph" %in% ret_extra,
batch = batch, opt_args = opt_args, epoch_callback = epoch_callback,
tmpdir = tempdir(), verbose = verbose)
2: uwot::umap(as.matrix(preprocess_mat), n_components = max_components,
metric = umap.metric, min_dist = umap.min_dist, n_neighbors = umap.n_neighbors,
fast_sgd = umap.fast_sgd, n_threads = cores, verbose = verbose,
nn_method = umap.nn_method, ...)
1: find_gene_modules(cds[pr_deg_ids, ], reduction_method = "UMAP",
max_components = 2, umap.metric = "cosine", umap.min_dist = 0.1,
umap.n_neighbors = 15L, umap.fast_sgd = FALSE, umap.nn_method = "annoy",
k = 20, leiden_iter = 1, partition_qval = 0.05, weight = FALSE,
resolution = 0.001, random_seed = 0L, cores = 1, verbose = T)
I really have no idea what I am doing wrong or how to proceed from here. Does anyone with experience with uwot know where my error is coming from? Really appreciate the help!

Impose constraints on the coefficient matrix of a var model in R

how can I impose constraints on the coefficient matrix of a var model in r.
Some of my code is followed
library(readxl)
dat_pc_log_d <- read_excel("C:/Users/Desktop/dat_pc_log_d.xlsx")
attach(dat_pc_log_d)
dat_pc_log_d$itcrm = NULL
dat_pc_log_d$...1 = NULL
data = ts(dat_pc_log_d,start = c(2004,1),end = c(2019,1),frequency = 4)
VAR_modelo = VAR(data,p=2)
VAR_modelo_restriccion = restrict(VAR_modelo,method = "ser",thresh = 2.0)
ir_pib = irf(VAR_modelo_restriccion, impulse = "pbipc_log_d", response = c("pbipc_log_d", "expopc_log_d", "pbiagr_log_d"),
boot = TRUE, ci = 0.95)
I need to ensure exogeneity of a variable, for it I have to impose zero in some lags coefficients of the independent variable. How can I do it ?
thanks
library(readxl)
dat_pc_log_d <- read_excel("C:/Users//dat_pc_log_d.xlsx")
attach(dat_pc_log_d)
dat_pc_log_d$...1 = NULL
data = ts(dat_pc_log_d,start = c(2004,1),end = c(2019,1),frequency = 4)
VAR_modelo = VAR(data,p=2)
restriccion = matrix(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1),
nrow=8, ncol=17, byrow = TRUE)
VAR_modelo_restriccion = restrict(VAR_modelo,method = "man", resmat = restriccion)
ir_pib = irf(VAR_modelo_restriccion, impulse = "itcrm", response = c("pbipc_log_d", "expopc_log_d", "inverpc_log_d" , "pbiagr_log_d"),
boot = TRUE, nhead=20 ,ci = 0.68)

Error in `V<-`(`*tmp*`, value = `*vtmp*`) : invalid indexing

I used the bibliometrix function in R, and want to plot some useful graphs.
library(bibliometrix)
??bibliometrix
D<-readFiles("E:\\RE\\savedrecs.txt")
M <- convert2df(D,dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M ,sep = ";" )
S<- summary(object=results,k=10, pause=FALSE)
plot(x=results,k=10,pause=FALSE)
options(width=100)
S <- summary(object = results, k = 10, pause = FALSE)
NetMatrix <- biblioNetwork(M1, analysis = "co-occurrences", network = "author_keywords", sep = ";")
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE)
res <- thematicMap(net, NetMatrix, S)
plot(res$map)
But in the net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE), it shows error
Error in V<-(*tmp*, value = *vtmp*) : invalid indexing
. Also I cannot do the CR, it always shows unlistCR. I cannot use the NetMatrix function neither.
Some help me plsssssssss
The problem is in the data itself not in the code you presented. When I downloaded the data from bibliometrix.com and changed M1 to M (typo?) in biblioNetwork function call everything worked perfectly. Please see the code below:
library(bibliometrix)
# Plot bibliometric analysis results
D <- readFiles("http://www.bibliometrix.org/datasets/savedrecs.txt")
M <- convert2df(D, dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M, sep = ";")
S <- summary(results)
plot(x = results, k = 10, pause = FALSE)
# Plot Bibliographic Network
options(width = 100)
S <- summary(object = results, k = 10, pause = FALSE)
NetMatrix <- biblioNetwork(M, analysis = "co-occurrences", network = "author_keywords", sep = ";")
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network", type = "fruchterman",
labelsize = 0.7, halo = FALSE, cluster = "walktrap",
remove.isolates = FALSE, remove.multiple = FALSE, noloops = TRUE, weighted = TRUE)
# Plot Thematic Map
res <- thematicMap(net, NetMatrix, S)
str(M)
plot(res$map)

R package mlr Multilabel Text Classification: how to classify new data

I found this code in a tutorial about multilabel classification with package mlr.
library("mlr")
yeast = getTaskData(yeast.task)
labels = colnames(yeast)[1:14]
yeast.task = makeMultilabelTask(id = "multi", data = yeast, target = labels)
lrn.br = makeLearner("classif.rpart", predict.type = "prob")
lrn.br = makeMultilabelBinaryRelevanceWrapper(lrn.br)
mod = train(lrn.br, yeast.task, subset = 1:1500, weights = rep(1/1500, 1500))
pred = predict(mod, task = yeast.task, subset = 1:10)
pred = predict(mod, newdata = yeast[1501:1600,])
I understand the structure of the dataset yeast, but I do not understand how to use the code when I have new data which I want to classify because then I wouldn´t have any TRUE or FALSE values for the labels. Actually I would have some training data with the same structure as yeast but for my new data the columns 1:14 would be missing.
Am I missunderstanding something? If not: How can I use the code correctly?
Edit:
Here´s a sample code how I would use the code:
library("tm")
train.data = data.frame("id" = c(1,1,2,3,4,4), "text" = c("Monday is nice weather.", "Monday is nice weather.", "Dogs are cute.", "It is very rainy.", "My teacher is angry.", "My teacher is angry."), "label" = c("label1", "label2", "label3", "label1", "label4", "label5"))
test.data = data.frame("id" = c(5,6), "text" = c("Next Monday I will meet my teacher.", "Dogs do not like rain."))
train.data$text = as.character(train.data$text)
train.data$id = as.character(train.data$id)
train.data$label = as.character(train.data$label)
test.data$text = as.character(test.data$text)
test.data$id = as.character(test.data$id)
### Bring training data into structure
train.data$label = make.names(train.data$label)
labels = unique(train.data$label)
# DocumentTermMatrix for all texts
texts = unique(c(train.data$text, test.data$text))
docs <- Corpus(VectorSource(unique(texts)))
terms <-DocumentTermMatrix(docs)
m <- as.data.frame(as.matrix(terms))
# Logical columns for labels
test = data.frame("id" = train.data$id, "topic"=train.data$label)
test2 = as.data.frame(unclass(table(test)))
test2[,c(1:ncol(test2))] = as.logical(unlist(test2[,c(1:ncol(test2))]))
rownames(test2) = unique(test$id)
# Bind columns from dtm
termsDf = cbind(test2, m[1:nrow(test2),])
names(termsDf) = make.names(names(termsDf))
### Create Multilabel Task
classify.task = makeMultilabelTask(id = "multi", data = termsDf, target = labels)
### Now the model
lrn.br = makeLearner("classif.rpart", predict.type = "prob")
lrn.br = makeMultilabelBinaryRelevanceWrapper(lrn.br)
mod = train(lrn.br, classify.task)
### How can I predict for test.data?
So, the problem is that I do not have any labels for test.data because that is what I would actually like to compute?
Edit2:
When I simply use
names(m) = make.names(names(m))
pred = predict(mod, newdata = m[(nrow(termsDf)+1):(nrow(termsDf)+nrow(test.data)),])
the result is for both texts the same and really not that I would expect.

Resources