Having trouble using ctree in R, sometimes it does not display any levels at all, here is an example below.
Can anyone explain why. thanks
library("party")
df <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(df) <- c("a", "b", "c")
df[nrow(df) + 1,] = c("Y","M","N")
df[nrow(df) + 1,] = c("Y","F","N")
df[nrow(df) + 1,] = c("Y","M","Y")
df[nrow(df) + 1,] = c("Y","F","N")
df[nrow(df) + 1,] = c("N","F","Y")
df[nrow(df) + 1,] = c("N","M","N")
df[nrow(df) + 1,] = c("N","M","Y")
df[nrow(df) + 1,] = c("N","M","N")
df[nrow(df) + 1,] = c("N","F","N")
df[nrow(df) + 1,] = c("N","F","N")
df$a <- as.factor(df$a)
df$b <- as.factor(df$b)
df$c <- as.factor(df$c)
TMDT<-ctree(a ~ b + c,data=df)
plot(TMDT,type="simple")
Related
I have a graph and code like the following:
library(igraph)
g <- make_empty_graph (2) %>%
set_vertex_attr("a", value = 1) %>%
add_vertices(2, color = 2, "a" = 2) %>%
add_vertices(2, color = 4, "a" = 3) %>%
add_edges(c(1,2, 2,1, 1,5, 5,1, 1,4 ,4,1)) %>%
set_vertex_attr("xyz", value = 3)
i <- 1
repeat {
prev_value <- mean(V(g)$a == 1)
print(V(g)$a)
print(mean(V(g)$a == 1))
print(i)
V(g)$xyz = sapply(V(g), function(x) {
NeighborList = neighbors(g, x)
length(NeighborList[NeighborList$a == 2])
})
V(g)$a[V(g)$xyz == 1] = 2
i <- i + 1
aa <- mean(V(g)$a == 1)
if (aa == prev_value) {
break
}
}
df <- data.frame(time=i, prop=aa)
library(ggplot2)
ggplot(data=df, aes(x=time, y=prop, group=1)) +
geom_line() +
geom_point()
Whenever I try to run this however, it doesn't work. Ideally I would like an output where ggplot2 would plot a line graph that depicts the change in the proportion of nodes with an attribute "aa" value of 1 as the time variable "i" changes.
I am not sure what exactly you want. But if you want to depict the difference in aa for each iteration then you can add the ggplot inside the repeat.
repeat {
prev_value <- mean(V(g)$a == 1)
print(V(g)$a)
print(mean(V(g)$a == 1))
print(i)
V(g)$xyz = sapply(V(g), function(x) {
NeighborList = neighbors(g, x)
length(NeighborList[NeighborList$a == 2])
})
V(g)$a[V(g)$xyz == 1] = 2
i <- i + 1
aa <- mean(V(g)$a == 1)
if(i == 1){
df <- data.frame(time=i, prop=aa)
print( ggplot(data=df, aes(x=time, y=prop, group=1)) +
geom_line() +
geom_point() )}else{
df <- rbind(df,data.frame(time=i, prop=aa))
print( ggplot(data=df, aes(x=time, y=prop, group=1)) +
geom_line() +
geom_point() )}
if (aa == prev_value) {
break
}
}
I am breaking my data frames into 3 other data frames. I am iterating through each of them and plot histogram based on it. I am storing these plots into separate ones. However, outside the loop, I am able to print only the plot from the last iteration and not the first two. I am able to print all three plots inside the loop though. Here is my code:
catCust1 <- myData[(myData$meanVal > 0 & myData$meanVal <= 20),]
catCust2 <- myData[(myData$meanVal > 20 & myData$meanVal <= 40),]
catCust3 <- myData[(myData$meanVal > 40 & myData$meanVal <= 60),]
for(i in 1:3) {
if(i == 1) {
catCust <- catCust1
} else if(i == 2) {
catCust <- catCust2
} else if(i == 3) {
catCust <- catCust3
}
catCust <- na.omit(catCust)
numOrdersCatCust <- ddply(catCust, .(ORDERDATE), nrow)
numOrdersCatCust$numDay <- 1:nrow(numOrdersCatCust)
catCust$orderCount <- numOrdersCatCust[match(catCust$ORDERDATE, numOrdersCatCust$ORDERDATE), 2]
catCust$numDay <- numOrdersCatCust[match(catCust$ORDERDATE, numOrdersCatCust$ORDERDATE), 3]
setDT(catCust)[, uniqueCounter := .GRP, by = CUSTOMERID]
apply(catCust, 2, function(x)length(unique(x)))
threshold25catOne <- catCust[(catCust$uniqueCounter == as.integer(0.25 * max(uniqueCounter))), ]
threshold25catOneVal <- threshold25catOne$numDay
catCustPlot <- ggplot(data=catCust, aes(catCust$numDay)) +
geom_histogram(binwidth = 1, col="black", fill="white", alpha=0.1) +
labs(x="Day Number (Since 01-09-2016)", y="Orders") +
ggtitle("GRAPH TITLE") +
theme(plot.title = element_text(hjust = 0.5), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
geom_vline(xintercept = threshold25catOneVal[1], color="purple") +
annotate("text", x = threshold25catOneVal[1]-7, y = max(catCust$orderCount) + 1000, angle = 0, label = threshold25catOneVal[1], vjust = 1.2, parse = TRUE)
# ABLE TO PRINT DIFFERENT PLOTS HERE
if(i == 1) {
catCustPlot1 <- catCustPlot
print(catCustPlot1)
} else if(i == 2) {
catCustPlot2 <- catCustPlot
print(catCustPlot2)
} else if(i == 3) {
catCustPlot3 <- catCustPlot
print(catCustPlot3)
}
}
# PRINTS ONLY catCustPlot3
print(catCustPlot1)
print(catCustPlot2)
print(catCustPlot3)
The other two plots gives me an error:
Error: Aesthetics must be either length 1 or the same as the data
UPDATE: head(myData)
meanVal sumVal countCat
75.98 75.98000 (60,80]
36.37 80.55727 (80,100]
50.96 52.67500 (40,60]
15.33 15.33000 (0,20]
17.48 27.65000 (20,40]
51.35 101.64900 (100,1e+04]
I just simplified you loop and stored ggplot objects to resPlots (no printing out). When loop finishes you can access/print them.
myData <- na.omit(myData)
resPlots <- list()
for(i in 1:3) {
if (i == 1) {
catCust <- myData[(myData$meanVal > 0 & myData$meanVal <= 20), ]
} else if (i == 2) {
catCust <- myData[(myData$meanVal > 20 & myData$meanVal <= 40), ]
} else if (i == 3) {
catCust <- myData[(myData$meanVal > 40 & myData$meanVal <= 60), ]
}
numOrdersCatCust <- ddply(catCust, .(ORDERDATE), nrow)
numOrdersCatCust$numDay <- 1:nrow(numOrdersCatCust)
catCust$orderCount <- numOrdersCatCust[match(catCust$ORDERDATE, numOrdersCatCust$ORDERDATE), 2]
catCust$numDay <- numOrdersCatCust[match(catCust$ORDERDATE, numOrdersCatCust$ORDERDATE), 3]
setDT(catCust)[, uniqueCounter := .GRP, by = CUSTOMERID]
apply(catCust, 2, function(x)length(unique(x)))
threshold25catOne <- catCust[(catCust$uniqueCounter == as.integer(0.25 * max(uniqueCounter))), ]
threshold25catOneVal <- threshold25catOne$numDay
resPlots[[i]] <- ggplot(catCust, aes(catCust$numDay)) +
geom_histogram(binwidth = 1, col = "black", fill = "white", alpha = 0.1) +
labs(x="Day Number (Since 01-09-2016)", y="Orders") +
ggtitle("GRAPH TITLE") +
theme(plot.title = element_text(hjust = 0.5), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
geom_vline(xintercept = threshold25catOneVal[1], color="purple") +
annotate("text", x = threshold25catOneVal[1]-7, y = max(catCust$orderCount) + 1000, angle = 0, label = threshold25catOneVal[1], vjust = 1.2, parse = TRUE)
}
# First plot
resPlots[[1]]
How can I get ggplot to produce something similar like
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/path/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[2]))
}
this is the interesting part about the plot:
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed')
So far my result is: https://www.dropbox.com/s/mysadkruo946oox/changePoint.pdf which means that there is something wrong with my array passed to the geom_vlines.
Could you point me in the right direction why I only get vlines in the first 2 plots?
This is the solution:
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/Users/geoHeil/Dropbox/6.Semester/BachelorThesis/rResearch/data/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[series]))
}
# plot
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed', colour='darkgreen')
I have an R script that is generating a number of plots and well as assessing a few linear models. For some reason when I try to print out the plots and linear model statistics in a loop they don't end up in the file but when I remove the loop they get printed to file.
R/Knitr markup that doesn't work (desired lines to print with # I want this to print):
library('RODBC')
library('ggplot2')
library('dplyr')
library('reshape2')
con <- odbcConnect('yield_model')
sql <- "SELECT DISTINCT [grouping],[group],regionId,class,finalPk,avgArea,layer,runTime,random,name FROM dbo.pk p LEFT OUTER JOIN dred.dbo.yasMap y ON p.class = y.id WHERE random=1"
values <- sqlQuery(con,sql,stringsAsFactors = FALSE)
values$model = "NM"
groupings <- unique(select(values,regionId,class,layer))
groupings <- groupings[order(groupings$layer,groupings$regionId,groupings$class),]
for(i in 1:nrow(groupings)) {
data <- subset(values,regionId == groupings$regionId[i] & class == groupings$class[i] & layer == groupings$layer[i])
layer <- unique(select(data,layer))
region <- unique(select(data,regionId))
defectNo <- unique(select(data,class))
defectName <- unique(select(data,name))
count <- length(unique(data$avgArea))
average <- mean(data$finalPk)
myPlot <- ggplot(data=data,aes(x=avgArea,y=finalPk)) +
geom_point(size=4,color="red") +
ggtitle(paste("Defect=",defectName$name,"(", defectNo$class,"), Region=",region$regionId, ", Layer=", layer$layer,sep="")) +
geom_abline(intercept=average,slop=0,size=1,aes(color="mean"))
if(count > 1) {
myPlot <- myPlot + stat_smooth(method ="lm",formula = y ~ x, se = FALSE,size=1,aes(color="linear"))
}
if (count > 3) {
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ log(x), se = FALSE, size=1, aes(color="log"))
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ poly(x,2), se = FALSE, size=1, aes(color="poly"))
}
if(count > 1 & count < 4) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green"))
} else if(count > 3) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green","log" = "red","poly" = "blue"))
} else if (count == 2 | count == 3) {
}
plot(myPlot)
paste("Average Pk=",average,sep="") # I want this to print
if(count > 1) {
linMod <- lm(data$finalPk ~ data$avgArea)
coef(linMod) # I want this to print
summary(linMod) # I want this to print
}
if (count > 3) {
linModLog <- lm(data$finalPk ~ log(data$avgArea))
coef(linModLog) # I want this to print
summary(linModLog) # I want this to print
linModPoly <- lm(data$finalPk ~ poly(data$avgArea,2))
coef(linModPoly) # I want this to print
summary(linModPoly) # I want this to print
}
}
R/Knitr that does print:
library('RODBC')
library('ggplot2')
library('dplyr')
library('reshape2')
con <- odbcConnect('yield_model')
sql <- "SELECT DISTINCT [grouping],[group],regionId,class,finalPk,avgArea,layer,runTime,random,name FROM dbo.pk p LEFT OUTER JOIN dred.dbo.yasMap y ON p.class = y.id WHERE random=1"
values <- sqlQuery(con,sql,stringsAsFactors = FALSE)
values$model = "NM"
groupings <- unique(select(values,regionId,class,layer))
groupings <- groupings[order(groupings$layer,groupings$regionId,groupings$class),]
#for(i in 1:nrow(groupings)) {
i <- 1
data <- subset(values,regionId == groupings$regionId[i] & class == groupings$class[i] & layer == groupings$layer[i])
layer <- unique(select(data,layer))
region <- unique(select(data,regionId))
defectNo <- unique(select(data,class))
defectName <- unique(select(data,name))
count <- length(unique(data$avgArea))
average <- mean(data$finalPk)
myPlot <- ggplot(data=data,aes(x=avgArea,y=finalPk)) +
geom_point(size=4,color="red") +
ggtitle(paste("Defect=",defectName$name,"(", defectNo$class,"), Region=",region$regionId, ", Layer=", layer$layer,sep="")) +
geom_abline(intercept=average,slop=0,size=1,aes(color="mean"))
if(count > 1) {
myPlot <- myPlot + stat_smooth(method ="lm",formula = y ~ x, se = FALSE,size=1,aes(color="linear"))
}
if (count > 3) {
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ log(x), se = FALSE, size=1, aes(color="log"))
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ poly(x,2), se = FALSE, size=1, aes(color="poly"))
}
if(count > 1 & count < 4) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green"))
} else if(count > 3) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green","log" = "red","poly" = "blue"))
} else if (count == 2 | count == 3) {
}
plot(myPlot)
paste("Average Pk=",average,sep="") # I want this to print
if(count > 1) {
linMod <- lm(data$finalPk ~ data$avgArea)
coef(linMod) # I want this to print
summary(linMod) # I want this to print
}
if (count > 3) {
linModLog <- lm(data$finalPk ~ log(data$avgArea))
coef(linModLog) # I want this to print
summary(linModLog) # I want this to print
linModPoly <- lm(data$finalPk ~ poly(data$avgArea,2))
coef(linModPoly) # I want this to print
summary(linModPoly) # I want this to print
}
#}
My header values (I also tried with no results portion at all):
{r echo=FALSE,results='asis'}
Hopefully somebody has some advice for me. Appreciate the help!
Given that it's everything after the plot(myPlot) statement which is giving you problems in the loop, I'm thinking the answer here might apply to you.
Try adding a plot.new call after the plot(myPlot) statement
I'm trying to make a lot of graphs using ggplot2 script, and add some text (Lm equation and r2 value, using this function) for each graph.
The issue is that my x and y coordinates will be different between each graph.
With 'plot' function, you can convert 'plot' coords to 'figure' coords using cnvr.coord function, but in ggplot2 (grid base package), isn't functionally.
below and example (where "p" is a preexistent ggplot2 object) :
p <- p + geom_text(aes(X, Y, label = lm_eqn(lm(as.numeric(a$value) ~ as.numeric(a$date), a))))
I agree with shujaa. You can simply calculate where the function goes based on the range of your data. Using your link above, I've created an example:
library(ggplot2)
df1 <- data.frame(x = c(1:100))
df1$y <- 2 + 3 * df1$x + rnorm(100, sd = 40)
df1$grp <- rep("Group 1",100)
df2 <- data.frame(x = c(1:100))
df2$y <- 10 -.5 * df2$x + rnorm(100, sd = 100)
df2$grp <- rep("Group 2",100)
df3 <- data.frame(x = c(1:100))
df3$y <- -5 + .2 * df3$x + rnorm(100, sd = 10)
df3$grp <- rep("Group 3",100)
df4 <- data.frame(x = c(1:100))
df4$y <- 2 - 3 * df4$x + rnorm(100, sd = 40)
df4$grp <- rep("Group 4",100)
df <- list(df1,df2,df3,df4)
lm_eqn = function(df) {
m = lm(y ~ x, df);
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
pdf("I:/test.pdf")
for (i in 1:4) {
text.x <- ifelse(lm(df[[i]]$y~1+df[[i]]$x)$coef[2]>0,min(df[[i]]$x),max(df[[i]]$x))
text.y <- max(df[[i]]$y)
text.hjust <- ifelse(lm(df[[i]]$y~1+df[[i]]$x)$coef[2]>0,0,1)
p <- ggplot(data = df[[i]], aes(x = x, y = y)) +
geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
geom_point()
p1 = p + geom_text(aes(x = text.x, y = text.y, label = lm_eqn(df[[i]])), parse = TRUE,hjust=text.hjust)
print(p1)
}
dev.off()