I have a problem forecasting this non stationery data(https://drive.google.com/file/d/14o5hHe8zxR0onRWq0mZNcYqI101O0Dkw/view?usp=sharing) using Auto Arima.
Please review my code.
# Read Data
r = read.csv('../Amazon/Amazon1.csv', header = TRUE, stringsAsFactors = FALSE)
# Time Series construction
ts = ts(t(r[,1:25]), frequency = 12, start = c(2016,01) )
# plotting Time series
ts.plot(ts[,1:2],type = 'b', xlab = 'Monthly Cycle', ylab = 'Number of Sales', main = "(TIME SERIES) Amazon Sales Cycle of multiple products for 24 months",col=c(rep("black",1),rep("red",2)))
legend("topleft",cex=.65,legend = ts[1,1:2], col = 1:ncol(ts), lty = 1)
set1 = ts[2:20,1]
#set2 = ts[15:20,1]
set1 = as.numeric(set1)
#set2 = as.numeric(set2)
# Building Forecasting models
mf = meanf(set1,h=4,level=c(90,95),fan=FALSE,lambda=NULL)
plot(mf)
mn = naive(set1,h=4,level=c(90,95),fan=FALSE,lambda=NULL)
plot(mn)
md = rwf(set1,h=4,drift=T,level=c(90,95),fan=FALSE,lambda=NULL)
plot(md)
# Checking Accuracy
accuracy(mf)
accuracy(mn)
accuracy(md)
# Identifying Stationarity/Non-Stationarity(unit Root testing)
adf = adf.test(set1)
adf
kpss = kpss.test(set1)
kpss
ndiffs(set1)
diff_data = diff(set1)
adf.test(diff_data) # Rerunning unit test on differenced data
# Identifying Seasonality/Trend
Stl = stl(set1,s.window='periodic')
# ARIMA modelling
ar_set1 = forecast::auto.arima(diff(diff_data), approximation=FALSE,trace=FALSE, stationary = TRUE)
forecast(ar_set1, h = 5)
ar_set2 = forecast::ets(diff(diff_data))
ar_set3 = forecast::nnetar(diff(diff_data), approximation=FALSE,trace=FALSE, stationary = TRUE)
# Prediction
predict(ar_set1, n.ahead = 5,se.fit = TRUE)
predict(ar_set2, n.ahead = 5,se.fit = TRUE)
predict(ar_set3, n.ahead = 5,se.fit = TRUE)
plot(forecast(ar_set1,h=6))
points(1:length(diff(diff_data)),fitted(ar_set1),type="l",col="green")
plot(forecast(ar_set2,h=6))
points(1:length(diff(diff_data)),fitted(ar_set2),type="l",col="green")
plot(forecast(ar_set3,h=6))
points(1:length(diff(diff_data)),fitted(ar_set3),type="l",col="green")
accuracy(ar_set1, test = diff_data)
accuracy(ar_set2, test = diff_data)
accuracy(ar_set3, test = diff_data)
I'm unable to get the forecast values using either of those 3 methods. Where am i going wrong?
Related
my question regards an example survey that includes a first-stage cluster and the application of survey::rake.
library(survey)
library(tidyr)
library(dplyr)
set.seed(111)
mood = sample(c("happy","neutral","grumpy"),
size = 1000,
replace=TRUE,
c(0.3,0.3,0.4))
set.seed(222)
sex = sample(c("female","male"),
size=1000,
replace=TRUE,
c(0.6,0.4))
set.seed(333)
age_group = sample(c("young","middle","senior"),
size=1000,
replace=TRUE,
c(0.2,0.6,0.2))
status = data.frame(mood=mood,
sex=sex,
age_group=age_group,
income = trunc(runif(1000,1000,2000)),
dnum = rep(c(441,512,39,99,61),each = 200),
fpc1 = rep(200,1000))
# The source data is :
source = data.frame(mood = rep(c("happy","neutral","grumpy"),6),
sex = rep(rep(c("female","male"),each=3),3),
age_group = rep(c("young","middle","senior"),each=6),
Freq = c(0.0750,0.1250,0.0500,0.0750,0.1250,0.0500,0.0600,
0.1000,0.0400,0.0600,0.1000,0.0400,0.0165,
0.0275,0.0110,0.0135,0.0225,0.0090) )
# We created the total population distribution tables as follows :
targets1 = data.frame(mood = c("happy","neutral","grumpy"),
Freq = nrow(status)*c(sum(source$Freq[source$mood=="happy"]),
sum(source$Freq[source$mood=="neutral"]),
sum(source$Freq[source$mood=="grumpy"])))
targets1
targets2 = data.frame(sex = c("male","female"),
Freq = nrow(status)*c(sum(source$Freq[source$sex=="male"]),
sum(source$Freq[source$sex=="female"])) )
targets2
targets3 = data.frame(age_group = c("young","middle","senior"),
Freq = nrow(status)*c(sum(source$Freq[source$age_group=="young"]),
sum(source$Freq[source$age_group=="middle"]),
sum(source$Freq[source$age_group=="senior"])))
targets3
# and now we set the design unweighted and without the cluster :
unweighted = svydesign(id=~1,data=status)
# we apply the raking method :
status_raked = rake(design = unweighted,
sample.margins = list(~mood,~sex,~age_group),
population.margins = list(targets1,targets2,targets3))
# and we extract the weights (weights1) :
status$weights1 = weights(status_raked)
# now we shall set the design considering the cluster :
unweighted_clustered = svydesign(id=~dnum,
data=status,
fpc=~fpc1)
# we use the raking method :
status_clustered_raked = rake(design = unweighted_clustered,
sample.margins = list(~mood,~sex,~age_group),
population.margins = list(targets1,targets2,targets3))
# and we extract the weights as before (weights2) :
status$weights2 = weights(status_clustered_raked)
so my question is : why the weights1 and weights2 are identical eventhough we incorporated the clusters in it? Am I missing something when applying the raking method for the clustered design ?
Would appreciate your help. Thanks.
I am trying to run the Monocle3 function find_gene_modules() on a cell_data_set (cds) but am getting a variety of errors in this. I have not had any other issues before this. I am working with an imported Seurat object. My first error came back stating that the number of rows were not the same between my cds and cds#preprocess_aux$gene_loadings values. I took a look and it seems my gene loadings were a list under cds#preprocess_aux#listData$gene_loadings. I then ran the following code to make a dataframe version of the gene loadings:
test <- seurat#assays$RNA#counts#Dimnames[[1]]
test <- as.data.frame(test)
cds#preprocess_aux$gene_loadings <- test
rownames(cds#preprocess_aux$gene_loadings) <- cds#preprocess_aux$gene_loadings[,1]
Which created a cds#preprocess_aux$gene_loadings dataframe with the same number of rows and row names as my cds. This resolved my original error but now led to a new error being thrown from uwot as:
15:34:02 UMAP embedding parameters a = 1.577 b = 0.8951
Error in uwot(X = X, n_neighbors = n_neighbors, n_components = n_components, :
No numeric columns found
Running traceback() produces the following information.
> traceback()
4: stop("No numeric columns found")
3: uwot(X = X, n_neighbors = n_neighbors, n_components = n_components,
metric = metric, n_epochs = n_epochs, alpha = learning_rate,
scale = scale, init = init, init_sdev = init_sdev, spread = spread,
min_dist = min_dist, set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity, bandwidth = bandwidth,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
a = a, b = b, nn_method = nn_method, n_trees = n_trees, search_k = search_k,
method = "umap", approx_pow = approx_pow, n_threads = n_threads,
n_sgd_threads = n_sgd_threads, grain_size = grain_size, y = y,
target_n_neighbors = target_n_neighbors, target_weight = target_weight,
target_metric = target_metric, pca = pca, pca_center = pca_center,
pca_method = pca_method, pcg_rand = pcg_rand, fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra, ret_nn = ret_nn ||
"nn" %in% ret_extra, ret_fgraph = "fgraph" %in% ret_extra,
batch = batch, opt_args = opt_args, epoch_callback = epoch_callback,
tmpdir = tempdir(), verbose = verbose)
2: uwot::umap(as.matrix(preprocess_mat), n_components = max_components,
metric = umap.metric, min_dist = umap.min_dist, n_neighbors = umap.n_neighbors,
fast_sgd = umap.fast_sgd, n_threads = cores, verbose = verbose,
nn_method = umap.nn_method, ...)
1: find_gene_modules(cds[pr_deg_ids, ], reduction_method = "UMAP",
max_components = 2, umap.metric = "cosine", umap.min_dist = 0.1,
umap.n_neighbors = 15L, umap.fast_sgd = FALSE, umap.nn_method = "annoy",
k = 20, leiden_iter = 1, partition_qval = 0.05, weight = FALSE,
resolution = 0.001, random_seed = 0L, cores = 1, verbose = T)
I really have no idea what I am doing wrong or how to proceed from here. Does anyone with experience with uwot know where my error is coming from? Really appreciate the help!
how can I impose constraints on the coefficient matrix of a var model in r.
Some of my code is followed
library(readxl)
dat_pc_log_d <- read_excel("C:/Users/Desktop/dat_pc_log_d.xlsx")
attach(dat_pc_log_d)
dat_pc_log_d$itcrm = NULL
dat_pc_log_d$...1 = NULL
data = ts(dat_pc_log_d,start = c(2004,1),end = c(2019,1),frequency = 4)
VAR_modelo = VAR(data,p=2)
VAR_modelo_restriccion = restrict(VAR_modelo,method = "ser",thresh = 2.0)
ir_pib = irf(VAR_modelo_restriccion, impulse = "pbipc_log_d", response = c("pbipc_log_d", "expopc_log_d", "pbiagr_log_d"),
boot = TRUE, ci = 0.95)
I need to ensure exogeneity of a variable, for it I have to impose zero in some lags coefficients of the independent variable. How can I do it ?
thanks
library(readxl)
dat_pc_log_d <- read_excel("C:/Users//dat_pc_log_d.xlsx")
attach(dat_pc_log_d)
dat_pc_log_d$...1 = NULL
data = ts(dat_pc_log_d,start = c(2004,1),end = c(2019,1),frequency = 4)
VAR_modelo = VAR(data,p=2)
restriccion = matrix(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1),
nrow=8, ncol=17, byrow = TRUE)
VAR_modelo_restriccion = restrict(VAR_modelo,method = "man", resmat = restriccion)
ir_pib = irf(VAR_modelo_restriccion, impulse = "itcrm", response = c("pbipc_log_d", "expopc_log_d", "inverpc_log_d" , "pbiagr_log_d"),
boot = TRUE, nhead=20 ,ci = 0.68)
I used the bibliometrix function in R, and want to plot some useful graphs.
library(bibliometrix)
??bibliometrix
D<-readFiles("E:\\RE\\savedrecs.txt")
M <- convert2df(D,dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M ,sep = ";" )
S<- summary(object=results,k=10, pause=FALSE)
plot(x=results,k=10,pause=FALSE)
options(width=100)
S <- summary(object = results, k = 10, pause = FALSE)
NetMatrix <- biblioNetwork(M1, analysis = "co-occurrences", network = "author_keywords", sep = ";")
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE)
res <- thematicMap(net, NetMatrix, S)
plot(res$map)
But in the net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE), it shows error
Error in V<-(*tmp*, value = *vtmp*) : invalid indexing
. Also I cannot do the CR, it always shows unlistCR. I cannot use the NetMatrix function neither.
Some help me plsssssssss
The problem is in the data itself not in the code you presented. When I downloaded the data from bibliometrix.com and changed M1 to M (typo?) in biblioNetwork function call everything worked perfectly. Please see the code below:
library(bibliometrix)
# Plot bibliometric analysis results
D <- readFiles("http://www.bibliometrix.org/datasets/savedrecs.txt")
M <- convert2df(D, dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M, sep = ";")
S <- summary(results)
plot(x = results, k = 10, pause = FALSE)
# Plot Bibliographic Network
options(width = 100)
S <- summary(object = results, k = 10, pause = FALSE)
NetMatrix <- biblioNetwork(M, analysis = "co-occurrences", network = "author_keywords", sep = ";")
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network", type = "fruchterman",
labelsize = 0.7, halo = FALSE, cluster = "walktrap",
remove.isolates = FALSE, remove.multiple = FALSE, noloops = TRUE, weighted = TRUE)
# Plot Thematic Map
res <- thematicMap(net, NetMatrix, S)
str(M)
plot(res$map)
I found this code in a tutorial about multilabel classification with package mlr.
library("mlr")
yeast = getTaskData(yeast.task)
labels = colnames(yeast)[1:14]
yeast.task = makeMultilabelTask(id = "multi", data = yeast, target = labels)
lrn.br = makeLearner("classif.rpart", predict.type = "prob")
lrn.br = makeMultilabelBinaryRelevanceWrapper(lrn.br)
mod = train(lrn.br, yeast.task, subset = 1:1500, weights = rep(1/1500, 1500))
pred = predict(mod, task = yeast.task, subset = 1:10)
pred = predict(mod, newdata = yeast[1501:1600,])
I understand the structure of the dataset yeast, but I do not understand how to use the code when I have new data which I want to classify because then I wouldn´t have any TRUE or FALSE values for the labels. Actually I would have some training data with the same structure as yeast but for my new data the columns 1:14 would be missing.
Am I missunderstanding something? If not: How can I use the code correctly?
Edit:
Here´s a sample code how I would use the code:
library("tm")
train.data = data.frame("id" = c(1,1,2,3,4,4), "text" = c("Monday is nice weather.", "Monday is nice weather.", "Dogs are cute.", "It is very rainy.", "My teacher is angry.", "My teacher is angry."), "label" = c("label1", "label2", "label3", "label1", "label4", "label5"))
test.data = data.frame("id" = c(5,6), "text" = c("Next Monday I will meet my teacher.", "Dogs do not like rain."))
train.data$text = as.character(train.data$text)
train.data$id = as.character(train.data$id)
train.data$label = as.character(train.data$label)
test.data$text = as.character(test.data$text)
test.data$id = as.character(test.data$id)
### Bring training data into structure
train.data$label = make.names(train.data$label)
labels = unique(train.data$label)
# DocumentTermMatrix for all texts
texts = unique(c(train.data$text, test.data$text))
docs <- Corpus(VectorSource(unique(texts)))
terms <-DocumentTermMatrix(docs)
m <- as.data.frame(as.matrix(terms))
# Logical columns for labels
test = data.frame("id" = train.data$id, "topic"=train.data$label)
test2 = as.data.frame(unclass(table(test)))
test2[,c(1:ncol(test2))] = as.logical(unlist(test2[,c(1:ncol(test2))]))
rownames(test2) = unique(test$id)
# Bind columns from dtm
termsDf = cbind(test2, m[1:nrow(test2),])
names(termsDf) = make.names(names(termsDf))
### Create Multilabel Task
classify.task = makeMultilabelTask(id = "multi", data = termsDf, target = labels)
### Now the model
lrn.br = makeLearner("classif.rpart", predict.type = "prob")
lrn.br = makeMultilabelBinaryRelevanceWrapper(lrn.br)
mod = train(lrn.br, classify.task)
### How can I predict for test.data?
So, the problem is that I do not have any labels for test.data because that is what I would actually like to compute?
Edit2:
When I simply use
names(m) = make.names(names(m))
pred = predict(mod, newdata = m[(nrow(termsDf)+1):(nrow(termsDf)+nrow(test.data)),])
the result is for both texts the same and really not that I would expect.