Related
I want to test for an interaction ( for Cox proportional hazard model) between type of transplant and disease type using main effects and interaction terms on the data bone marrow transplant study at Ohio State University.
Here is the used code for the data:
time_Allo_NHL<- c(28,32,49,84,357,933,1078,1183,1560,2114,2144)
censor_Allo_NHL<- c(rep(1,5), rep(0,6))
df_Allo_NHL <- data.frame(group = "Allo NHL",
time = time_Allo_NHL,
censor = censor_Allo_NHL,
Z1 = c(90,30,40,60,70,90,100,90,80,80,90),
Z2 = c(24,7,8,10,42,9,16,16,20,27,5))
time_Auto_NHL<- c(42,53,57,63,81,140,176,210,252,476,524,1037)
censor_Auto_NHL<- c(rep(1,7), rep(0,1), rep(1,1), rep(0,1), rep(1,1), rep(0,1))
df_Auto_NHL <- data.frame(group = "Auto NHL",
time = time_Auto_NHL,
censor = censor_Auto_NHL,
Z1 = c(80,90,30,60,50,100,80,90,90,90,90,90),
Z2 = c(19,17,9,13,12,11,38,16,21,24,39,84))
time_Allo_HOD<- c(2,4,72,77,79)
censor_Allo_HOD<- c(rep(1,5))
df_Allo_HOD <- data.frame(group = "Allo HOD",
time = time_Allo_HOD,
censor = censor_Allo_HOD,
Z1 = c(20,50,80,60,70),
Z2 = c(34,28,59,102,71))
time_Auto_HOD<- c(30,36,41,52,62,108,132,180,307,406,446,484,748,1290,1345)
censor_Auto_HOD<- c(rep(1,7), rep(0,8))
df_Auto_HOD <- data.frame(group = "Auto HOD",
time = time_Auto_HOD,
censor = censor_Auto_HOD,
Z1 = c(90,80,70,60,90,70,60,100,100,100,100,90,90,90,80),
Z2 = c(73,61,34,18,40,65,17,61,24,48,52,84,171,20,98))
myData <- Reduce(rbind, list(df_Allo_NHL, df_Auto_NHL, df_Allo_HOD, df_Auto_HOD))
Here is the code for interaction, but I'm not sure what it should be written in here (myData$(here?) from the following code to be able to run it.
n<-length(myData$time)
n
for (i in 1:n){
if (myData$(here?)[i]==2)
myData$W1[i] <-1
else myData$W1[i]<-0
}
for (i in 1:n){
if (myData$(here?)[i]==2)
myData$W2[i] <-1
else myData$W2[i]<-0
}
myData
Coxfit.W<-coxph(Surv(time,censor)~W1+W2+W1*W2, data = myData)
summary(Coxfit.W)
An easy way is to separate the four groups variables using the separate function from the tidyr package.
library(tidyr)
myData <- separate(myData, col=group, into=c("disease","transpl"))
head(myData)
disease transpl time censor Z1 Z2
1 Allo NHL 28 1 90 24
2 Allo NHL 32 1 30 7
3 Allo NHL 49 1 40 8
4 Allo NHL 84 1 60 10
5 Allo NHL 357 1 70 42
6 Allo NHL 933 0 90 9
Then you can put these two new variables (disease and transpl) into the Cox model, with interaction term.
Coxfit.W<-coxph(Surv(time,censor)~transpl*disease, data = myData)
summary(Coxfit.W)
Call:
coxph(formula = Surv(time, censor) ~ transpl * disease, data = myData)
n= 43, number of events= 26
coef exp(coef) se(coef) z Pr(>|z|)
transplNHL -1.8212 0.1618 0.6747 -2.699 0.00695 **
diseaseAuto -1.6628 0.1896 0.6188 -2.687 0.00721 **
transplNHL:diseaseAuto 2.3050 10.0244 0.8494 2.714 0.00665 **
exp(coef) exp(-coef) lower .95 upper .95
transplNHL 0.1618 6.17946 0.04312 0.6073
diseaseAuto 0.1896 5.27387 0.05638 0.6377
transplNHL:diseaseAuto 10.0244 0.09976 1.89700 52.9720
I am using the r predict function, and it is returning more values than I expected it too. I created a linear model for the data to predict MDC from PKWH, MDT, and MDT2, then I created new data for input values into the predict function. The original data for utility has 24 values for each column of MDC, PKWH, MDT, and MDT2.
fit2 <- lm(MDC ~ MDT + MDT2 + PKWH*(1 + MDT + MDT2), data =
utility)
predict <- predict(fit2, data = data.frame(PKWH = 9, MDT = 75, MDT2
= 5625))
I expected the predict() function to produce 1 predicted value for the inputs of PKWH = 9 | MDT = 75 | MDT2 = 5625, but it gave me these 24 values.
1 2 3 4 5 6 7
56.67781 51.66653 45.05200 42.12583 38.98647 38.80904 42.60033
8 9 10 11 12 13 14
46.86545 49.51928 54.15163 61.54441 68.00122 49.17722 45.27917
15 16 17 18 19 20 21
42.88154 40.93468 38.39330 37.80963 39.47550 41.58780 42.94447
22 23 24
46.25884 49.27053 53.98732
Also, when I plug the new input values to calculate the predicted value using the coefficients from the linear model, I get 55.42165 which is not found on the list of the 24 values from the predict() function.
first, I wouldn't name your result predict - you want to save that for the function. You need
predicted_data <- predict(fit2, newdata = data.frame(PKWH = 9, MDT = 75, MDT2
= 5625))
It's not throwing an error because predict has a catch-all (...) at the end where input to data is heading, but it's giving you the predictions for the data you fit the model with.
I have data set from a incomplete lattice design study that I have imported into R from excel and would like to conduct a PBIB.test. However, after running the function as shown below, the output shows object Area not found, even after repeated times.
library("agricolae", lib.loc = "~/R/win-library/3.3")
Rdata2 <- PBIB.test("BlockNo", "AccNo", "Rep", Area, k = 9, c("REML"), console = TRUE)
Error in data.frame(v1 = 1, y) : object 'Area' not found
What is the problem?
See below for a sampleĀ application of PBIB.test, based on the agricolae tutorial.
First, create some sample data.
# Construct the alpha design with 30 treatments, 2 repetitions, and block size = 3
Genotype <- c(paste("gen0", 1:9, sep= ""), paste("gen", 10:30, sep= ""));
r <- 2;
k <- 3;
s <- 10;
b <- s * r;
book <- design.alpha(Genotype, k, r,seed = 5);
# Source dataframe
df <- book$book;
Create a vector of response values.
# Response variable
response <- c(
5,2,7,6,4,9,7,6,7,9,6,2,1,1,3,2,4,6,7,9,8,7,6,4,3,2,2,1,1,2,
1,1,2,4,5,6,7,8,6,5,4,3,1,1,2,5,4,2,7,6,6,5,6,4,5,7,6,5,5,4);
Run PBIB.test
model <- with(df, PBIB.test(block, Genotype, replication, response, k = 3, method="REML"))
head(model);
#$ANOVA
#Analysis of Variance Table
#
#Response: yield
# Df Sum Sq Mean Sq F value Pr(>F)
#Genotype 29 72.006 2.4830 1.2396 0.3668
#Residuals 11 22.034 2.0031
#
#$method
#[1] "Residual (restricted) maximum likelihood"
#
#$parameters
# test name.t treatments blockSize blocks r alpha
# PBIB-lsd Genotype 30 3 10 2 0.05
#
#$statistics
# Efficiency Mean CV
# 0.6170213 4.533333 31.22004
#
#$model
#Linear mixed-effects model fit by REML
# Data: NULL
# Log-restricted-likelihood: -73.82968
# Fixed: y ~ trt.adj
# (Intercept) trt.adjgen02 trt.adjgen03 trt.adjgen04 trt.adjgen05 trt.adjgen06
# 6.5047533 -3.6252940 -0.7701618 -2.5264354 -3.1633495 -1.9413054
#trt.adjgen07 trt.adjgen08 trt.adjgen09 trt.adjgen10 trt.adjgen11 trt.adjgen12
# -3.0096514 -4.0648738 -3.5051139 -2.8765561 -1.7111335 -1.6308755
#trt.adjgen13 trt.adjgen14 trt.adjgen15 trt.adjgen16 trt.adjgen17 trt.adjgen18
# -2.2187974 -2.3393290 -2.0807215 -0.3122845 -3.4526453 -1.0320169
#trt.adjgen19 trt.adjgen20 trt.adjgen21 trt.adjgen22 trt.adjgen23 trt.adjgen24
# -3.1257616 0.2101325 -1.7632411 -1.9177848 -1.0500345 -2.5612960
#trt.adjgen25 trt.adjgen26 trt.adjgen27 trt.adjgen28 trt.adjgen29 trt.adjgen30
# -4.3184716 -2.3071359 1.2239927 -1.3643068 -1.4354599 -0.4726870
#
#Random effects:
# Formula: ~1 | replication
# (Intercept)
#StdDev: 8.969587e-05
#
# Formula: ~1 | block.adj %in% replication
# (Intercept) Residual
#StdDev: 1.683459 1.415308
#
#Number of Observations: 60
#Number of Groups:
# replication block.adj %in% replication
# 2 20
#
#$Fstat
# Fit Statistics
#AIC 213.65937
#BIC 259.89888
#-2 Res Log Likelihood -73.82968
I would like to see If SOM algorithm can be used for classification prediction.
I used to code below but I see that the classification results are far from being right. For example, In the test dataset, I get a lot more than just the 3 values that I have in the training target variable. How can I create a prediction model that will be in alignment to the training target variable?
library(kohonen)
library(HDclassif)
data(wine)
set.seed(7)
training <- sample(nrow(wine), 120)
Xtraining <- scale(wine[training, ])
Xtest <- scale(wine[-training, ],
center = attr(Xtraining, "scaled:center"),
scale = attr(Xtraining, "scaled:scale"))
som.wine <- som(Xtraining, grid = somgrid(5, 5, "hexagonal"))
som.prediction$pred <- predict(som.wine, newdata = Xtest,
trainX = Xtraining,
trainY = factor(Xtraining$class))
And the result:
$unit.classif
[1] 7 7 1 7 1 11 6 2 2 7 7 12 11 11 12 2 7 7 7 1 2 7 2 16 20 24 25 16 13 17 23 22
[33] 24 18 8 22 17 16 22 18 22 22 18 23 22 18 18 13 10 14 15 4 4 14 14 15 15 4
This might help:
SOM is an unsupervised classification algorithm, so you shouldn't expect it to be trained on a dataset that contains a classifier label (if you do that it will need this information to work, and will be useless with unlabelled datasets)
The idea is that it will kind of "convert" an input numeric vector to a network unit number (try to run your code again with a 1 per 3 grid and you'll have the output you expected)
You'll then need to convert those network units numbers back into the categories you are looking for (that is the key part missing in your code)
Reproducible example below will output a classical classification error. It includes one implementation option for the "convert back" part missing in your original post.
Though, for this particular dataset, the model overfitts pretty quickly: 3 units give the best results.
#Set and scale a training set (-1 to drop the classes)
data(wine)
set.seed(7)
training <- sample(nrow(wine), 120)
Xtraining <- scale(wine[training, -1])
#Scale a test set (-1 to drop the classes)
Xtest <- scale(wine[-training, -1],
center = attr(Xtraining, "scaled:center"),
scale = attr(Xtraining, "scaled:scale"))
#Set 2D grid resolution
#WARNING: it overfits pretty quickly
#Errors are 36% for 1 unit, 63% for 2, 93% for 3, 89% for 4
som_grid <- somgrid(xdim = 1, ydim=3, topo="hexagonal")
#Create a trained model
som_model <- som(Xtraining, som_grid)
#Make a prediction on test data
som.prediction <- predict(som_model, newdata = Xtest)
#Put together original classes and SOM classifications
error.df <- data.frame(real = wine[-training, 1],
predicted = som.prediction$unit.classif)
#Return the category number that has the strongest association with the unit
#number (0 stands for ambiguous)
switch <- sapply(unique(som_model$unit.classif), function(x, df){
cat <- as.numeric(names(which.max(table(
error.df[error.df$predicted==x,1]))))
if(length(cat)<1){
cat <- 0
}
return(c(x, cat))
}, df = data.frame(real = wine[training, 1], predicted = som_model$unit.classif))
#Translate units numbers into classes
error.df$corrected <- apply(error.df, MARGIN = 1, function(x, switch){
cat <- switch[2, which(switch[1,] == x["predicted"])]
if(length(cat)<1){
cat <- 0
}
return(cat)
}, switch = switch)
#Compute a classification error
sum(error.df$corrected == error.df$real)/length(error.df$real)
After fitting a Tree with party::ctree() I want to create a table to characterise the branches.
I have fitted these variables
> summary(juridicos_segmentar)
actividad_economica
Financieras : 89
Gubernamental : 48
Sector Primario : 34
Sector Secundario:596
Sector Terciario :669
ingresos_cut
(-Inf,1.03e+08] :931
(1.03e+08,4.19e+08]:252
(4.19e+08,1.61e+09]:144
(1.61e+09, Inf] :109
egresos_cut
(-Inf,6e+07] :922
(6e+07,2.67e+08] :256
(2.67e+08,1.03e+09]:132
(1.03e+09, Inf] :126
patrimonio_cut
(-Inf,2.72e+08] :718
(2.72e+08,1.46e+09]:359
(1.46e+09,5.83e+09]:191
(5.83e+09, Inf] :168
op_ingreso_cut
(-Inf,3] :1308
(3,7] : 53
(7,22] : 44
(22, Inf]: 31
The first one is categorical and the others are ordinal and I fitted them to
another factor variable
> summary(as.factor(segmento))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
27 66 30 39 36 33 39 15 84 70 271 247 101 34 100 74 47 25 48 50
I used the following code
library(party)
fit_jur <- ctree(cluster ~ .,
data=data.frame(juridicos_segmentar, cluster=as.factor(segmento)))
to get this tree
> fit_jur
Conditional inference tree with 31 terminal nodes
Response: cluster
Inputs: actividad_economica, ingresos_cut, egresos_cut, patrimonio_cut, op_ingreso_cut
Number of observations: 1436
1) actividad_economica == {Financieras}; criterion = 1, statistic = 4588.487
2) ingresos_cut <= (4.19e+08,1.61e+09]; criterion = 1, statistic = 62.896
3) egresos_cut <= (6e+07,2.67e+08]; criterion = 1, statistic = 22.314
4)* weights = 70
3) egresos_cut > (6e+07,2.67e+08]
5)* weights = 10
2) ingresos_cut > (4.19e+08,1.61e+09]
6)* weights = 9
plot of part of the tree
What I want is a table where every row is a path from the node to a leaf saying the prediction of the variable segmento and every column is the condition on the variable to split. Something alike this:
actividad economica ingresos (rango) egresos (rango) patrimonio (rango) operaciones de ingreso segmento
Sector Primario <=261.000.000 18
Sector Primario >261.000.000 20
The problem is there are several leaves to characterise and some time a variable appears several times in one path so I'd like to intersect the conditions, i.e. intersecting the ranges.
I've thought of data.tree::ToDataFrameTable but I've got no idea of how it works with party.
Thank you very much guys!
library(partykit)
fit_jur <- ctree(cluster ~ .,
data=data.frame(juridicos_segmentar, cluster=as.factor(segmento)))
pathpred <- function(object, ...)
{
## coerce to "party" object if necessary
if(!inherits(object, "party")) object <- as.party(object)
## get standard predictions (response/prob) and collect in data frame
rval <- data.frame(response = predict(object, type = "response", ...))
rval$prob <- predict(object, type = "prob", ...)
## get rules for each node
rls <- partykit:::.list.rules.party(object)
## get predicted node and select corresponding rule
rval$rule <- rls[as.character(predict(object, type = "node", ...))]
return(rval)
}
ct_pred_jur <- unique(pathpred(fit_jur)[c(1,3)])
write.csv2(ct_pred_jur,'parametrizacion_juridicos.csv')
thank you Achim Zeileis for pointing me in this direction, I couldn't intersect the rules in a same variable, i.e. evaluate the '&s'. That problem is still open.
You can convert both party class (from partykit) and BinaryTree (from party) to a data.tree, and use it for conversion to data frame and/or printing. For example like this:
library(party)
airq <- subset(airquality, !is.na(Ozone))
airct <- ctree(Ozone ~ ., data = airq,
controls = ctree_control(maxsurrogate = 3))
tree <- as.Node(airct)
df <- ToDataFrameTable(tree,
"pathString",
"label",
criterion = function(x) round(x$criterion$maxcriterion, 3),
statistic = function(x) round(max(x$criterion$statistic), 3)
)
df
This will print like so:
pathString label criterion statistic
1 1/2/3 weights = 10 0.000 0.000
2 1/2/4/5 weights = 48 0.936 6.141
3 1/2/4/6 weights = 21 0.891 5.182
4 1/7/8 weights = 30 0.675 3.159
5 1/7/9 weights = 7 0.000 0.000
Plotting:
#print subtree
subtree <- Clone(tree$`2`)
SetNodeStyle(subtree,
style = "filled,rounded",
shape = "box",
fillcolor = "GreenYellow",
fontname = "helvetica",
label = function(x) x$label,
tooltip = function(x) round(x$criterion$maxcriterion, 3))
plot(subtree)
And the result will look like this: