I have a problem with my R code about a multiple linear regression.
First, I try to use the gam function but this gives me an error.
Here is the code:
install.packages("nlme")
library("mgcv")
library("ggplot2")
#Import dataset
setwd("/Users/Sarah/Documents/Master T&O/Master 1/Statistics IV/Assignment 2 ")
weight_data = read.csv("WeightLossGroup190.csv", sep = "", dec = ".", header = TRUE)
#Name of used data
weight <- weight_data$weight
date <- weight_data$date
dayNr <- weight_data$dayNumber
time <- weight_data$time
#Check linearity
gam1 <- gam(as.numeric(weight_data$weight) ~ s(as.numeric(weight_data$dayNumber)) + s(as.numeric(weight_data$time)))
summary(gam1)
plot.gam(gam1, se = FALSE, rug = TRUE, all.terms = TRUE)
This gives me the following error:
Error in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots) :
A term has fewer unique covariate combinations than specified maximum degrees of freedom
Does anyone have an idea of what I'm doing wrong?
You might want to try controlling the number of knots with "k = " in the gam.
Related
I keep getting this error message, I am not too sure what went wrong as I am trying to do a linear regression analysis.
Ind_v is the independent variable and dep_v is the dependent variable. I switched the data.frame to [] and it doesn't work as well. Thank you so much, everyone!
I split the training and test data to 70/30.
linear_regression <- function(training_dataset,
test_dataset,
dependent_variables,
independent_variables){
formular_me <- paste(dependent_variables, "~", independent_variables)
linear_model <- lm(formula = formular_me, data = training_dataset)
ind_v_from_test_dataset <- subset(test_dataset,select=independent_variables)
linear_model_analysis <- predict(linear_model,ind_v_from_test_dataset)
dep_v_from_test_dataset <- test_dataset[,dependent_variables]
RMSE_me <- round(Nrmse(actual = dep_v_from_test_dataset, predicted = linear_model_analysis),digits=2)
MAE_me <- round(Nmae(actual = dep_v_from_test_dataset,predicted = linear_model_analysis),digits=2)
R2_me <- round(Nr2(linear_model_analysis),digits=2)
linear_analysis_error <- dep_v_from_test_dataset - linear_model_analysis
linear_results<- data.frame(dep_v_from_test_dataset,ind_v_from_test_dataset,linear_analysis_error)
linear_results<- linear_analysis_error[order(ind_v_from_test_dataset),]
plot(linear_results[,independent_variables],
linear_results$ind_v_from_test_dataset,
pch=4,
ylab="dependent variable",
xlab="independent variables",
main="Linear Regression Errors",
sub=paste("MAE=",mae,"RMSE=",RMSE," R2=",r2))
abline(linear_model,col = "blue", lwd=6)
suppressWarnings(arrows(linear_results[,ind_v_from_test_dataset],
linear_results$dep_v_from_test_dataset,
linear_results[,independent_variables],
linear_results$dep_v_from_test_dataset-linear_results$error,
length=0.05,angle=90,code=3,col="red"))
return(
list(RMSE_me=RMSE,
MAE_me=mae,
R2_me=r2))
}
When I run the following code, I do NOT get this error:
## https://www.dataiku.com/learn/guide/code/r/time_series.html
library(readxl)
library(forecast)
library(dplyr)
library(prophet)
library(rstan)
library(Hmisc)
library(caret)
data<-read_excel("Time Series/Items.xlsx", col_types = c("text", "numeric"))
Nper=0.75
stmodels=c("meanf","naive","snaive","rwf","croston","stlf","ses","holt","hw","splinef","thetaf","ets","auto.arima","tbats","prophet")
gkuniforecast = function(data, Np, Ncolumn, tsfreq, model) {
## Preparation
N = ceiling(Np*nrow(data))
## Models
if (model=="prophet"){
df=data
names(df)=c("ds","y")
df$ds=as.Date(paste(df$ds,"-01",sep=""), "%Y-%b-%d")
train.df = df[1:N,]
na.df=data.frame(ds=rep(NA, N),y=rep(NA, N))
test.df <- rbind(na.df, df[(N+1):nrow(data),])
m <- prophet(train.df)
future <- make_future_dataframe(m, periods = nrow(data)-N, freq = 'month')
pro_forecast <- predict(m, future)
plot(m, pro_forecast)
##prophet_plot_components(m, forecast)
acc=matrix(rep(NA, 16),nrow=2,ncol=8,dimnames=list(c("Training set", "Test set"),c("ME","RMSE","MAE","MPE","MAPE","MASE","ACF1","Theil's U")))
acc["Test set","RMSE"]=sqrt(mean((pro_forecast$yhat - test.df)^2, na.rm = TRUE))
}else{
x=pull(data,Ncolumn)
train.x = ts(x[1:N], frequency=tsfreq)
test.x <- ts(c(rep(NA, N), x[(N+1):NROW(x)]), frequency=tsfreq)
str1=paste0("m_",model," = ",model,"(train.x)")
if (Np==1) {str2=paste0("f_",model," = forecast(m_",model,", h=NROW(x)")
} else {str2=paste0("f_",model," = forecast(m_",model,", h=NROW(x)-N)")}
str3=paste0("plot(f_",model,")")
str4="lines(test.x)"
str5=paste0("acc=accuracy(f_",model,",test.x)")
str=paste0(str1,";",str2,";",str3,";",str4,";",str5)
eval(parse(text=str))
}
return(acc)
}
acc = lapply(stmodels, gkuniforecast, data=data, Np=Nper, Ncolumn=2,tsfreq=12)
But when I run this code, I do:
##Forecast data prep
tsfreq=5
x=pull(data,1)
train.x = ts(x[1:N], frequency=tsfreq)
test.x <- ts(c(rep(NA, N), x[(N+1):NROW(x)]), frequency=tsfreq)
stmodels=c("meanf","naive","snaive","rwf","croston","stlf","ses","holt","hw"##,"splinef"
,"thetaf","ets","auto.arima","tbats")
for (i in 1:length(stmodels)){
str1=paste0("m_",stmodels[i]," = ",stmodels[i],"(train.x)")
str2=paste0("f_",stmodels[i]," = forecast(m_",stmodels[i],", h=NROW(x)-N)")
str3=paste0("plot(f_",stmodels[i],")")
str4="lines(test.x)"
str5=paste0('acc[["',stmodels[i],'"]]=accuracy(f_',stmodels[i],',test.x)')
str=paste0(str1,";",str2,";",str3,";",str4,";",str5)
eval(parse(text=str))
}
There seems to be a problem with 'hw' (splinef is commented out, because it gives me another error), but I do not understand why in the first dataset, I get no errors and I do with the second dataset. What is also different is the frequency.
Again the error is:
Please select a longer horizon when the forecasts are first computed
You are mixing functions that create forecasts directly (like meanf()) with functions that generate models (like ets()). For functions that generate forecasts directly, you need to specify the forecast horizon when you call the function. See https://otexts.org/fpp2/the-forecast-package-in-r.html for a list of functions that produce forecasts directly.
I am planning to use TCGAbiolinks to perform an survival analysis by using the following code:
library(TCGAbiolinks)
library(SummarizedExperiment)
query_RNAseq <- GDCquery(project = "TCGA-PAAD",
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results",
experimental.strategy = "RNA-Seq",
legacy = TRUE)
GDCdownload(query_RNAseq, method = "api",chunks.per.download = 10)
PAADRnaseqSE <- GDCprepare(query_RNAseq)
# Survival Analysis SA
clinical_patient_Cancer <- GDCquery_clinic("TCGA-PAAD","clinical")
dataPAADcomplete <- log2(PAADRnaseqSE)
but it produced an error when I run the log2 function, code and error info are pasted below
> dataPAADcomplete <- log2(PAADRnaseqSE)
Error in log(<S4 object of class "RangedSummarizedExperiment">, 2) :
unused argument (2)
What should I do to solve the problem?
library(nnet)
set.seed(9850)
train1<- sample(1:155,110)
test1 <- setdiff(1:110,train1)
ideal <- class.ind(hepatitis$class)
hepatitisANN = nnet(hepatitis[train1,-20], ideal[train1,], size=10, softmax=TRUE)
j <- predict(hepatitisANN, hepatitis[test1,-20], type="class")
hepatitis[test1,]$class
table(predict(hepatitisANN, hepatitis[test1,-20], type="class"),hepatitis[test1,]$class)
confusionMatrix(hepatitis[test1,]$class, j)
Error:
Error in nnet.default(hepatitis[train1, -20], ideal[train1, ], size = 10, :
NA/NaN/Inf in foreign function call (arg 2)
In addition: Warning message:
In nnet.default(hepatitis[train1, -20], ideal[train1, ], size = 10, :
NAs introduced by coercion
hepatitis variable consists of the hepatitis dataset available on UCI.
This error message is because you have character values in your data.
Try reading the hepatitis dataset with na.strings = "?". This is defined in the description of the dataset on the uci page.
headers <- c("Class","AGE","SEX","STEROID","ANTIVIRALS","FATIGUE","MALAISE","ANOREXIA","LIVER BIG","LIVER FIRM","SPLEEN PALPABLE","SPIDERS","ASCITES","VARICES","BILIRUBIN","ALK PHOSPHATE","SGOT","ALBUMIN","PROTIME","HISTOLOGY")
hepatitis <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data", header = FALSE, na.strings = "?")
names(hepatitis) <- headers
library(nnet)
set.seed(9850)
train1<- sample(1:155,110)
test1 <- setdiff(1:110,train1)
ideal <- class.ind(hepatitis$Class)
# will give error due to missing values
# 1st column of hepatitis dataset is the class variable
hepatitisANN <- nnet(hepatitis[train1,-1], ideal[train1,], size=10, softmax=TRUE)
This code will not give your error, but it will give an error on missing values. You will need to do address those before you can continue.
Also be aware that the class variable is the first variable in the dataset straight from the UCI data repository
Edit based on comments:
The na.action only works if you use the formula notation of nnet.
So in your case:
hepatitisANN <- nnet(class.ind(Class)~., hepatitis[train1,], size=10, softmax=TRUE, na.action = na.omit)
I'm trying to use cor.ci to obtain polychoric correlations with significance tests, but it keeps giving me an error message. Here is the code:
install.packages("Hmisc")
library(Hmisc)
mydata <- spss.get("S-IAT for R.sav", use.value.labels=TRUE)
install.packages('psych')
library(psych)
poly.example <- cor.ci(mydata(nvar = 10,n = 100)$items,n.iter = 10,poly = TRUE)
poly.example
print(corr.test(poly.example$rho), short=FALSE)
Here is the error message it gives:
> library(psych)
> poly.example <- cor.ci(mydata(nvar = 10,n = 100)$items,n.iter = 10,poly = TRUE)
Error in cor.ci(mydata(nvar = 10, n = 100)$items, n.iter = 10, poly = TRUE) :
could not find function "mydata"
> poly.example
Error: object 'poly.example' not found
> print(corr.test(poly.example$rho), short=FALSE)
Error in is.data.frame(x) : object 'poly.example' not found
How can I make it recognize mydata and/or select certain variables from this dataset for the analysis? I got the above code from here:
Polychoric correlation matrix with significance in R
Thanks!
You have several problems.
1) As previously commented upon, you are treating mydata as a function, but you need to treat it as a data.frame. Thus the call should be
poly.example <- cor.ci(mydata,n.iter = 10,poly = TRUE)
If you are trying to just get the first 100 cases and the first 10 variables, then
poly.example <- cor.ci(mydata[1:10,1:100],n.iter = 10,poly = TRUE)
2) Then, you do not want to run corr.test on the resulting correlation matrix. corr.test should be run on the data.
print(corr.test(mydata[1:10,1:100],short=FALSE)
Note that corr.test is testing the Pearson correlation.