I would like to normalize the data this way:
(trainData - mean(trainData)) / sd(trainData)
(testData - mean(trainData)) / sd(trainData)
For the training set I can use the function scale(). How can I do the same for the test set? I tried the lapply() function in several different ways, but I did not succeed.
Many thanks! An example of the code:
# Example data: Train holds 1..100 filled column-wise, Test is a random
# permutation of the same values.
Train <- data.frame(matrix(1:100, 10, 10))
Test <- data.frame(matrix(sample(1:100), 10, 10))

# scale() centers and scales every Train column by its own mean and sd.
scaled.Train <- scale(Train)

ct <- ncol(Test)

# Exactly two statistic rows per column:
# row 1 = mean of the Train column, row 2 = sd of the Train column.
sdmatrix <- data.frame(matrix(NA_real_, 2, ct))
names(sdmatrix) <- names(Test)
for (i in seq_len(ct)) {
  sdmatrix[1, i] <- mean(Train[, i])
  sdmatrix[2, i] <- sd(Train[, i])
}

# Append the statistic rows so that each column carries its own training
# mean (second to last element) and training sd (last element).
Test <- rbind(Test, sdmatrix)

# Normalize one column.
#
# x: numeric vector whose last two elements are the mean and the sd to use;
#    all preceding elements are the observations.
# Returns the observations as (value - mean) / sd, without the two
# trailing statistic elements.
normTest <- function(x) {
  n <- length(x)
  stopifnot(n > 2)
  (x[seq_len(n - 2)] - x[n - 1]) / x[n]
}

# Apply column by column and rebuild a data frame (lapply() alone would
# return a plain list). The statistic rows must still be present here,
# which is why Test is not subset before the call.
Test <- as.data.frame(lapply(Test, normTest))
Related
new here!
I want to find the determinant of the pooled sample covariance matrix of the given data. Can someone give me a clue? (I have searched everywhere.)
I have tried many things, but none of them gives the right solution, for example:
# Determinant of the sample covariance matrix computed from dfdata.
det(cov(dfdata))
mvec <- colMeans(dfdata) # sample mean vector
covM <- cov(dfdata) # sample covariance matrix
corM <- cor(dfdata) # sample correlation matrix
# Covariance matrix of dfdatanum (presumably the numeric columns of dfdata -- TODO confirm).
covMnum <- cov(dfdatanum)
The following code is what i have developed:
## Load the four data files.
data2 <- read.table("file.tsv")
data3 <- read.table("file2.tsv")
data4 <- read.table("file3.tsv")
data5 <- read.table("file4.tsv")

## Have a first look at the data.
## NOTE(review): dfBull is not created anywhere in this snippet; it has to
## exist in the workspace already -- confirm where it comes from.
head(dfBull)
n <- nrow(dfBull) # number of rows
p <- ncol(dfBull) # number of columns
summary(dfBull)

## Remove the first row of every table, as it is not necessary.
a <- data2[-1, ]
b <- data3[-1, ]
c <- data4[-1, ]
d <- data5[-1, ]

## Covariance between columns V1 and V2 within each table.
## NOTE(review): if V1/V2 were read as factors, as.numeric() returns the
## level codes rather than the values -- confirm the column types.
cv1 <- cov(as.numeric(a$V1), as.numeric(a$V2))
cv2 <- cov(as.numeric(b$V1), as.numeric(b$V2))
cv3 <- cov(as.numeric(c$V1), as.numeric(c$V2))
## The fourth covariance must come from the fourth table (d); the earlier
## version reused c here, so cv4 was always equal to cv3.
cv4 <- cov(as.numeric(d$V1), as.numeric(d$V2))

## 4 x 4 matrix with the four covariances on the diagonal and zeros
## elsewhere, and its determinant.
## NOTE(review): this is not a pooled sample covariance matrix; the usual
## pooled estimate is sum((n_i - 1) * S_i) / (sum(n_i) - g) over the g
## groups, with S_i the covariance matrix of group i -- confirm which one
## is wanted.
mat <- diag(c(cv1, cv2, cv3, cv4))
det(mat)
So I have a large data set that I have imported and split up. I've made sure to attach everything and tried to run a code to determine the number of breakpoints using AIC.
# Clear the workspace and load the packages used further down.
rm(list = ls())
library(Matching)
library(segmented)

# Read the input table and put its columns on the search path so that
# they can be referred to by name.
dinosaurs <- read.csv(
  "C:/Users/user/Desktop/NEW PLOTS FOR DINOS/centrum_input_fin.csv"
)
attach(dinosaurs)

# Column names of the input table: stored for the loop below and shown once.
dino_names <- colnames(dinosaurs)
dino_names
# NEED TO EXPORT FILES (EXPORT THE ALL_DATA_PLUS_SORTED OUT)
#
# For every column of `dinosaurs` build two columns:
#   <name>_actual_with_gaps   the values as read, NAs included
#   <name>_ordered_with_gaps  the non-missing values sorted in decreasing
#                             order and written back into the non-missing
#                             positions; NAs stay where they were
# and bind all pairs side by side into the matrix all_data_plus_sorted.
sorted_pairs <- lapply(dino_names, function(column_name) {
  # Take the column from the data frame directly instead of evaluating its
  # name as code.
  with_gaps <- dinosaurs[[column_name]]
  non_gaps <- which(!is.na(with_gaps))

  ordered_with_gaps <- rep(NA, length(with_gaps))
  ordered_with_gaps[non_gaps] <- sort(with_gaps[non_gaps], decreasing = TRUE)

  to_export <- cbind(with_gaps, ordered_with_gaps)
  colnames(to_export) <- paste0(
    column_name, c("_actual_with_gaps", "_ordered_with_gaps")
  )
  to_export
})

# One cbind() over all pairs; yields NULL when there are no columns.
all_data_plus_sorted <- do.call(cbind, sorted_pairs)
all_data_plus_sorted

attach(as.data.frame(all_data_plus_sorted))
print(dinosaurs)
detach(as.data.frame(all_data_plus_sorted))
detach(dinosaurs)
# split species
#
# For each specimen column listed here, create a variable of the same name
# that holds the column's non-missing values. The values are read from the
# `dinosaurs` data frame itself, so this does not depend on the data frame
# being attached.
species_names <- c(
  "Dyoplosaurus_acutosquameus_ROM734",
  "Staurikosaurus_pricei",
  "Opistocoelocaudia_skarzynskii",
  "Stegosaurus_stenops._NHMUKPVR36730",
  "Giraffatitan_brancai",
  "Camptosaurus",
  "Camptosaurus_prestwichii",
  "A_greppini",
  "Astrophocaudia_slaughteri_SMU61732",
  "Tastavinsaurus_sanzi_gen_MPZ999",
  "MOZ_Pv1221",
  "Mamenchisaurus",
  "Bromtosaurus_CMNo3018",
  "Lufengosaurus_Hueni",
  "Mamenchisaurus_hochuanensi",
  "Spinosaurus_FSACKK11888",
  "Buitreraptor_MPCNPV370",
  "Buitreraptor_MPCA245",
  "Huabeisaurus_allocotus_HBV20001",
  "Tethyshadros_insularis_SC57021",
  "Compsognathus_longipes_CNJ79",
  "Archaeopteryx12",
  "Sinosauropteryx_NIGP127586",
  "Sinosauropteryx_NIGP_127587",
  "Tetonosaurus_tilletti_AMNH3040",
  "Bambiraptor_feinbergi_FIP001",
  "Seimosaurus.halli_NMMNH3690",
  "Diluvicursor_pickeringi_NMVP221080",
  "Zhejiungosuurus_lishuiensis_ZMNHM8718",
  "Tianyulong_confuciusi_STMN.263",
  "Lusotitan_atalaiensis",
  "Nemegtonykus_citus_MPCD100203",
  "Elaphrosaurus_bambergi_MBR4960",
  "Nomingia_gobiensis_GIN100119",
  "Nomingia_gobiensis_MPCD100119",
  "Chirostenotes_pergracilis",
  "Seismosaurus_hallorum_NMMNHP3690",
  "Heterodontosaurus_tucki_SAMPKK1332",
  "Jianianhualong_tengi_DLXH1218",
  "Yinlong_downsi_IVPPV18685",
  "Neimongosaurus_yangi_LHV0001",
  "Magnapaulia_laticaudus_LACM17715",
  "Ouranosaurus_nigeriensis",
  "Dreadnoughtus_schrani_MPMPV1156",
  "Pectodens_zhenyuensis_IVPPV18578",
  "Dilophosaurus_wetherilli",
  "Gobihadros_mongoliensis_MPCD100746",
  "Gobihadros_mongoliensis_MPCD100755",
  "Auroraceratops_rugosus_GJ07913",
  "Patagotitan_mayorum_MPEFPV",
  "Eoraptor_lunensi_PVSJ512",
  "Corythosaurus_casuarius",
  "Caihong._Juji_PMoLB00175",
  "Eosinopteryx_brevipenna_YFGPT5197",
  "Rahonavis_ostromi_UA8656",
  "Changyuraptor_yangi_HGB016",
  "Herrerasaurus_ischigualastensis_PVL2566",
  "Herrerasaurus_ischigualastensis_UNSJ53",
  "Ischioceratops_zhuchengensis",
  "Koreaceratops_hwaseongensis"
)

# Fail early with a readable message if the input table lacks a column.
missing_species <- setdiff(species_names, names(dinosaurs))
if (length(missing_species) > 0) {
  stop(
    "Columns not found in 'dinosaurs': ",
    paste(missing_species, collapse = ", "),
    call. = FALSE
  )
}

for (species in species_names) {
  measurements <- dinosaurs[[species]]
  assign(species, measurements[!is.na(measurements)])
}
# CHOOSE SAMPLE TO ANALYSE
# ---------------------------------------------------------------------------
# choose sample
name_to_test <- "Koreaceratops_hwaseongensis"

# Take the "<name>_actual_with_gaps" column straight from
# all_data_plus_sorted instead of evaluating its name as code; the attached
# copy of that table has already been detached at this point.
y_col <- paste0(name_to_test, "_actual_with_gaps")
stopifnot(y_col %in% colnames(all_data_plus_sorted))
y_val <- all_data_plus_sorted[, y_col]

# lm()/segmented() need a numeric response. If any input column was read as
# text, cbind() turns the whole matrix into character, so convert explicitly;
# entries that are not numbers become NA (with a warning).
if (!is.numeric(y_val)) {
  y_val <- as.numeric(as.character(y_val))
}
x_val <- seq_along(y_val)
# USE AIC TO DECIDE HOW MANY BREAKS TO USE
# ---------------------------------------------------------------------------
# Fit one segmented model for each of 1 to 4 breakpoints and record its AIC.
# all_mods gets one row per fit: column 1 = number of breakpoints (K),
# column 2 = the second element returned by extractAIC().
my_max_it <- 10
break_counts <- 1:4
all_mods <- matrix(NA_real_, nrow = length(break_counts), ncol = 2)
for (h in break_counts) {
  mod1 <- segmented(
    lm(y_val ~ x_val),
    seg.Z = ~x_val,
    psi = NA,
    control = seg.control(K = h, quant = TRUE, it.max = my_max_it),
    model = TRUE,
    nboot = 50
  )
  all_mods[h, ] <- c(h, extractAIC(mod1)[2])
}
all_mods

# Number of breakpoints of the model with the smallest AIC (the first one
# in case of ties); which.min() skips missing AIC values.
my_K <- all_mods[which.min(all_mods[, 2]), 1]
When I run the last section of the code I get the error:
Error in crossprod(x, y) : requires numeric/complex matrix/vector arguments
I am not sure why, because I have put the data in a data frame. Is it because I'm importing the file incorrectly? I am not sure how to fix it.
I am using a for loop to generate 100 different train and test sets.
What I want to do now, is to save these 100 different train and test sets in order to be able to have a look at e.g. where iteration was 17.
This code shows my program with the for loop and the division into train and test set:
# Result table: one row per iteration, columns Acc / Sens / Spec.
result_df<-matrix(ncol=3,nrow=100)
colnames(result_df)<-c("Acc","Sens","Spec")
# Repeat the random train/test split and the evaluation 100 times.
for (g in 1:100 )
{
# Divide into Train and test set
# 80% of the rows of mydata1 (rounded down) go into the training set.
smp_size <- floor(0.8 * nrow(mydata1))
# Row numbers drawn at random, without replacement, for the training set.
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
# All remaining rows form the test set.
test <- mydata1[-train_ind, ]
REST OF MY CODE
# Calculate some statistics
# cm is produced by the omitted code above (presumably a caret
# confusionMatrix object -- TODO confirm).
overall <- cm$overall
# format() returns character strings, so the values stored in result_df
# below are text, not numbers.
overall.accuracy <- format(overall['Accuracy'] * 100, nsmall =2, digits = 2)
overall.sensitivity <- format(cm$byClass['Sensitivity']* 100, nsmall =2, digits = 2)
overall.specificity <- format(cm$byClass['Specificity']* 100, nsmall =2, digits = 2)
# Store the three statistics of iteration g in row g.
result_df[g,1] <- overall.accuracy
result_df[g,2] <- overall.sensitivity
result_df[g,3] <- overall.specificity
}
How can I do this?
You could do the following, for example, saving each test and train sets as elements in a list:
# Result table: one row per iteration, columns Acc / Sens / Spec.
result_df <- matrix(ncol = 3, nrow = 100)
colnames(result_df) <- c("Acc", "Sens", "Spec")

# One list slot per iteration, allocated up front instead of growing the
# lists inside the loop.
n_runs <- nrow(result_df)
testlist <- vector("list", n_runs)
trainlist <- vector("list", n_runs)

# The split size and the row numbers do not change between iterations.
all_rows <- seq_len(nrow(mydata1))
smp_size <- floor(0.8 * nrow(mydata1))

for (g in seq_len(n_runs)) {
  # Divide into Train and test set
  train_ind <- sample(all_rows, size = smp_size)
  train <- mydata1[train_ind, ]
  # setdiff() rather than a negative index: mydata1[-integer(0), ] would
  # return no rows at all when the training sample is empty.
  test <- mydata1[setdiff(all_rows, train_ind), ]

  # Keep both sets so that any iteration can be inspected afterwards,
  # e.g. trainlist[[17]].
  trainlist[[g]] <- train
  testlist[[g]] <- test
}
EDIT
To retrieve the 7th element of these lists you could use trainlist[[7]]
You can save those in csv file by using the following method
# Place these two lines inside the loop; g is the loop counter. Naming the
# files by iteration number (train-001.csv, test-001.csv, ...) makes a
# given iteration easy to find, avoids two iterations within the same
# second overwriting each other, and keeps spaces and colons (which a
# Sys.time() stamp contains) out of the file names.
write.csv(train, file = sprintf("train-%03d.csv", g))
write.csv(test, file = sprintf("test-%03d.csv", g))
One option could be to save the row indexes of your partitions, rather than saving all the datasets, and then select the rows indexes for the iteration you're interested in.
The caret package has a function called createDataPartition, which will do this for you:
library(caret)

# Example data: two columns of 100 standard normal draws each.
df <- data.frame(col1 = rnorm(100), col2 = rnorm(100))

# 100 partitions; every list element holds the row numbers of one
# training set (p = 0.8).
train.idxs <- createDataPartition(seq_len(nrow(df)), times = 100, p = 0.8)

for (i in seq_along(train.idxs)) {
  # Rows in the partition form the training set, all others the test set.
  idx <- train.idxs[[i]]
  train.df <- df[idx, ]
  test.df <- df[-idx, ]

  # calculate statistics ...

  result_df[i, 1] <- overall.accuracy
  result_df[i, 2] <- overall.sensitivity
  result_df[i, 3] <- overall.specificity
}

# Rebuild the data sets of the nth partition from the stored row numbers.
# training set
df[train.idxs[[n]], ]
# test set
df[-train.idxs[[n]], ]
Put your code in a function and do a lapply():
# Result table with columns Acc / Sens / Spec (not filled in this snippet).
result_df <- matrix(ncol=3, nrow=100)
colnames(result_df)<-c("Acc", "Sens", "Spec")
# One simulation run.
#   g: run number supplied by lapply(); the visible part of the body does
#      not use it.
# Returns a list with the training set (`train`), the test set (`test`)
# and whatever results are added in place of the `...` placeholder.
SIMg <- function(g) {
# Divide into Train and test set
# 80% of the rows of mydata1 (rounded down) go into the training set.
smp_size <- floor(0.8 * nrow(mydata1))
train_ind <- sample(seq_len(nrow(mydata1)), size = smp_size)
train <- mydata1[train_ind, ]
test <- mydata1[-train_ind, ]
REST OF THE CODE
return(list(train=train, test=test, ...))
}
# Run the simulation 100 times; L has one list element per run.
L <- lapply(1:100, SIMg)
The resulting list L has 100 elements, each element is a list containing the two dataframes and your results for one simulation run.
To get separate lists trainlist and testlist you can do:
# Pull the element named "train" (respectively "test") out of every run's
# result list. The function is lapply(); "lallpy" was a typo.
trainlist <- lapply(L, "[[", "train")
testlist <- lapply(L, "[[", "test")
I noticed that an SVM, when used with decision.values=TRUE (plus a sigmoid to get probabilities), produces non-deterministic results when I permute the rows of the data frame under analysis. Does anyone have any idea why? Please try the code yourself:
# Install e1071 only when it is missing, instead of on every run.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)

# Two Gaussian clouds of 20 points each; the third column is the class
# label (1 for A, 0 for B).
A <- cbind(rnorm(20, 1, 1), rnorm(20, 1, 1), rep(1, 20))
B <- cbind(rnorm(20, 9, 1), rnorm(20, 9, 1), rep(0, 20))
dataframe <- as.data.frame(rbind(A, B))

predc <- rep(0, nrow(dataframe))
K <- ncol(dataframe)

# Shuffle the rows and turn the label column into a factor.
permutator <- sample(nrow(dataframe))
dataframe$V3 <- factor(dataframe$V3)
dataframe <- dataframe[permutator, ]

# The last column is the response, all other columns are predictors. The
# formula is the same for every fold, so build it once.
frm <- as.formula(paste0("V", K, " ~ ."))

# Leave-one-out: fit on all rows except i, then predict row i.
for (i in seq_len(nrow(dataframe))) {
  r <- svm(formula = frm, data = dataframe[-i, ])
  predicted <- predict(r, newdata = dataframe[i, ], decision.values = TRUE)

  # The decision value is reported for one ordered pair of classes, named in
  # the column label ("1/0" or "0/1"). Which of the two a fit reports depends
  # on the order in which the classes occur in its training rows, so the sign
  # flips with the row permutation. Orient every value as "1 versus 0"
  # before applying the sigmoid.
  dec <- attr(predicted, "decision.values")
  orientation <- if (colnames(dec)[1] == "1/0") 1 else -1
  predc[i] <- sigmoid(orientation * dec[1])
}
plot(sort(predc))
[edited: code]
I have a function myF(g,m,alpha,gam,theta,beta), which returns three parameter estimates. I want to repeat this function call 10 times (as in for (i in 1:10)). How can I do this in R?
# Simulate one data set with sim.data() and estimate parameters by
# minimizing a negative log-likelihood with nlm().
#
# Arguments g, m, alpha, gam, theta, beta are passed unchanged to
# sim.data(), which is defined elsewhere.
# Returns a named numeric vector with elements theta, beta1 and alpha.
myF <- function(g,m,alpha,gam,theta,beta){
dat <- sim.data(g,m,alpha,gam,theta,beta)
# Components of the simulated data set used below.
time <- dat$times
delta <- dat$cens
i <- dat$group
X1<-dat$cov #cov~rbinom
# Number of distinct groups.
n <- length(levels(as.factor(i)))
# Sum of delta within each group, and the overall sum.
di <- aggregate(delta,by=list(i),FUN=sum)[,2]
D <- sum(di)
# Objective passed to nlm(): returns -lik for the parameter vector par
# (the body is omitted in this snippet).
loglik <- function(par){
.........................................
return(-lik)
}
# Starting values for the four parameters.
initial=c(0.5,0.5,-0.5,0.5)
# nlm() minimizes, which is why loglik returns -lik.
maxF <- nlm(loglik, initial)
# NOTE(review): theta and alpha are both computed from estimate[2];
# one of the two indices looks like a copy-paste slip -- confirm against
# the parameter order used inside loglik.
return(c(theta=exp(maxF$estimate[2]),beta1=maxF$estimate[3],alpha=exp(maxF$estimate[2])))
}
This can easily be done using replicate:
# Evaluate myF() 10 times; the expression is re-evaluated on each
# repetition, and the length-3 results are combined column-wise.
replicate(10, myF(g,m,alpha,gam,theta,beta))
This will create a 3*10 matrix of the parameter estimates, where each column is the result of a separate iteration.