Related
Problem:
I am working with wages data and I want to flag outliers as possible measurement errors. For doing so, I am combining two criteria:
To receive more than twice the value of the 99th percentile of wages within a given year, relative to the whole distribution of wages on my dataset (comparison criteria between persons, within year)
To receive more than twice the value of the second highest wage within a same person, across years. That is an intra-individual criteria (comparison criteria within person, between years).
I managed to code the first criterion, but I am having some trouble coding the second one.
My data is in the wide format. Perhaps the solution to my problem can be easier achieved by reshaping the data to the long format, but as I am working in a multi-author project, if I use this solution I need to reshape it back to the wide format again.
Data example:
Below, I provide some rows of my data with cases that already met the first criteria:
# Example data (dput output): one row per person, average yearly wages
# 2010-2018 plus outlier_YYYY flags already set by the first
# (between-person, within-year) criterion.
# NOTE(review): the `groups` attribute makes this a rowwise_df, so dplyr
# verbs operate row by row on it even before any explicit rowwise() call.
df <- structure(list(
wage_2010 = c(120408.54, 11234.67, 19918.64, NA, 66006.32, 40581.36, 344587.84, 331970.28, NA, 161351.45, NA, 115310.68, 323336.27, 9681.69, NA, 682324.53, 43764.76, 134023.61, 78195.16, 141231.5, 48163.23, 71259.66, 73858.65, 57737.6, NA, 182837.23), wage_2011 = c(413419.86, 24343.04, 36349.02, NA, 99238.53, 18890.34, 129921.58, 108714.29, NA, 169289.89, 36158.73, 129543.51, 130791.99, 13872.76, 4479.58, 222327.52, 826239.14, 48892.78, 78506.06, 111569.8, 653239.41, 813158.54, 72960.17, 80193.15, NA, 209796.19), wage_2012 = c(136750.86, 77386.62, 177528.17, 86512.48, 375958.76, 20302.29, 145373.42, 91071.64, 95612.23, 176866.72, 85244.44, 225698.7, 181093.52, 162585.23, 147918.83, 254057.11, 72845.46, 86001.31, 80958.22, 105629.12, 77723.77, 115217.74, 68959.04, 111843.87, 85180.26, 261942.95 ),
wage_2013 = c(137993.48, 104584.84, 239822.37, 95688.8, 251573.14, 21361.93, 142771.58, 92244.51, 111058.93, 208013.94, 111326.07, 254276.36, 193663.33, 225404.84, 84135.55, 259772.16, 100031.38, 100231.81, 824271.38, 107336.19, 95292.2, 217071.19, 125665.58, 74513.66, 116227.01, 245161.73), wage_2014 = c(134914.8, 527180.87, 284218.4, 112332.41, 189337.74, 23246.46, 144070.09, 92805.77, 114123.3, 251389.07, 235863.98, 285511.12, 192950.23, 205364.45, 292988.3, 318408.56, 86255.91, 497960.18, 85467.13, 152987.99, 145663.31, 242682.93, 184123.01, 107423.03, 132046.43, 248928.89), wage_2015 = c(168812.65, 145961.09, 280556.86, 256268.69, 144549.45, 23997.1, 130253.75, NA, 115522.88, 241031.91, 243697.87, 424135.76, 15927.33, 213203.96, 225118.19, 298042.59, 77749.09, 151336.85, 88596.38, 121741.45, 34054.26, 206284.71, 335127.7, 201891.17, 189409.04, 246440.69),
wage_2016 = c(160742.14, 129892.09, 251333.29, 137192.73, 166127.1, 537611.12, 139350.84, NA, 115395.21, 243154.02, 234685.36, 903334.7, NA, 205664.08, 695079.91, 33771.37, 100938.19, 138864.28, 58658.4, 98576.95, NA, 144613.53, 430393.04, 217989.1, 229369.56, 600079.86), wage_2017 = c(175932.3, 138128.41, 584536.47, 143506.22, 61674.63, 1442.8, 126084.46, NA, 575771.83, 586909.69, 372954.89, 701815.37, NA, 402347.33, 93873.2, NA, 96792.96, 172908.08, 89006.92, 631645.41, NA, 72183.55, 579455.71, 294539.56, 353615.43, 151327.43), wage_2018 = c(146111.42, 149313.9, 627679.77, 850182.4, 72654.62, 9129.35, 41544.24, NA, 248020.12, 334280.68, 611781.99, 597465.2, NA, 535628.5, 63369.44, NA, 93710.71, 146769.63, 100736.71, 108022.87, NA, 79019.43, 772012.47, 549097.81, 504183.59, 99129.6),
outlier_2010 = c(0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2011 = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0), outlier_2012 = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2013 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), outlier_2014 = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2015 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), outlier_2016 = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), outlier_2017 = c(0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), outlier_2018 = c(0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0)),
groups = structure(list(.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), row.names = c(NA, -26L), class = c("tbl_df", "tbl", "data.frame")), row.names = c(NA, -26L), class = c("rowwise_df", "tbl_df", "tbl", "data.frame"))
I have average annual wages from 2010 to 2018, that is, 9 points in time. However, it seems hard to use a solution based on the quantile function, because of possible missing values for some individuals in some years.
What I have tried:
So far I am using a median function within the dplyr approach. I flag a wage as an outlier (possible error) if, in one given year, the individual receives more than twice the median of what he received across the years:
library(dplyr)
# Flag a wage as an outlier when it exceeds twice the person's median
# wage across all available years. NA wages are never flagged (they map
# to 0, matching the original case_when(..., TRUE ~ 0) behavior).
df1 <- df %>%
  rowwise() %>%
  mutate(
    # Median over the yearly wages for this person; NAs are dropped so
    # people with missing years still get a threshold. (Original used
    # na.rm = T; always spell out TRUE.)
    median_wage = median(c_across(starts_with("wage_")), na.rm = TRUE),
    individual_threshold = median_wage * 2,
    # Recompute every outlier_YYYY flag from its wage_YYYY counterpart,
    # replacing nine copy-pasted case_when() calls with one across().
    across(
      starts_with("wage_"),
      ~ if_else(.x > individual_threshold, 1, 0, missing = 0),
      .names = "{gsub('wage', 'outlier', .col)}"
    )
  )
However, when I inspect the data, I see that I am coding as outlier possible legitimate wages. For example, in the third row/person of my data, I am flagging as outliers wages in 2017 and 2018. However, as we can see, there is a pattern of increase in this person's wage. Although he receives more than twice his median wage in these years, probably that is not a mistake, as the increase was recorded in two years in a row.
In the fourth row, however, the 2018 wage is more likely to be wrongly reported, since there is no similar wage for that person in any other year. In 2018, that person's wage grew to more than 4 times its previous maximum (and also became more than twice the 99th percentile of the whole distribution).
Summing up:
I want to write a code to analyse 9 variables for every individual (or rowwise): wage_2010-2018, and compare the highest value to the second highest value. If the highest value is more than twice the size of the second highest value, I flag it as a possible measurement error. Preferably within dplyr.
Here's a way to do this with a helper function.
library(dplyr)
compare_2nd_highest <- function(x) {
  # Flag a possible measurement error: is the highest value more than
  # twice the second highest value?
  #
  # @param x Numeric vector of wages; may contain NAs.
  # @return TRUE if the maximum exceeds twice the runner-up, FALSE
  #   otherwise. With fewer than two non-NA values the criterion cannot
  #   apply, so FALSE is returned (the original indexed past the end and
  #   returned NA, which then propagates silently into the outlier flag).
  #
  # sort() drops NAs by default, so x1 holds only observed wages,
  # in descending order.
  x1 <- sort(x, decreasing = TRUE)
  if (length(x1) < 2) {
    return(FALSE)
  }
  x1[1] > (x1[2] * 2)
}
# Apply the helper once per person: rowwise() makes each row its own
# group, and c_across() gathers that row's wage_* columns into a single
# numeric vector for compare_2nd_highest().
df %>%
rowwise() %>%
mutate(is_outlier = compare_2nd_highest(c_across(starts_with('wage')))) %>%
ungroup
I am working on the following structure and the following plotting code:
# Co-occurrence matrix of modal verbs: rows = first modal, columns =
# second. NA marks structurally impossible cells (the diagonal and the
# *_HAVE TO column), which should be drawn in black.
# Fix: the dput() output was never assigned, so the original call
# reshape2::melt(structure, ...) melted the base function `structure`
# itself. Assign the data to a name and melt that.
freq_mat <- structure(
  c(NA, 11, 9, 9, 21, 7, 2, 5, 3, 0, 0, 1,
    31, NA, 3, 2, 1, 0, 0, 10, 3, 0, 0, 0,
    31, 16, NA, 2, 2, 10, 0, 5, 0, 0, 0, 0,
    59, 65, 1, NA, 2, 4, 0, 4, 0, 0, 0, 0,
    156, 23, 7, 17, NA, 3, 2, 4, 7, 0, 0, 0,
    31, 84, 0, 10, 16, NA, 0, 6, 0, 0, 2, 0,
    129, 0, 2, 1, 0, 0, NA, 0, 0, 0, 0, 0,
    41, 41, 0, 3, 4, 5, 0, NA, 0, 0, 0, 1,
    16, 4, 1, 2, 0, 0, 0, 3, NA, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0,
    1, 12, 2, 0, 0, 6, 0, 0, 0, 0, NA, 0,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
  .Dim = c(12L, 12L),
  .Dimnames = list(
    c("WILL_", "WOULD_", "MAY_", "MIGHT_", "CAN_", "COULD_", "SHALL_",
      "SHOULD_", "MUST_", "OUGHT TO_", "USED TO_", "HAVE TO_"),
    c("_WILL", "_WOULD", "_MAY", "_MIGHT", "_CAN", "_COULD", "_SHALL",
      "_SHOULD", "_MUST", "_OUGHT TO", "_USED TO", "_HAVE TO")
  )
)
# NOTE(review): breaks start at 0 while trans = "log" (log(0) = -Inf);
# confirm whether the first break should be a small positive value.
breaks <- c(0, 1, 5, 10, 50, 100, 500, 100000)
reshape2::melt(freq_mat, value.name = "Freq") %>%
  # Blank labels for NA / zero cells so only real counts are printed.
  mutate(label = ifelse(is.na(Freq) | Freq == 0, "", as.character(Freq))) %>%
  # fct_rev needs forcats; qualify it so the code works even when the
  # package is installed but not attached.
  ggplot(aes(Var2, forcats::fct_rev(Var1))) +
  geom_tile(aes(fill = Freq), color = "black") +
  geom_text(aes(label = label), color = "black") +
  # na.value controls the NA tiles: "black" separates the impossible
  # combinations from the white-to-purple count scale.
  scale_fill_steps(low = "white", high = "purple", breaks = breaks,
                   na.value = "black", trans = "log") +
  scale_x_discrete(NULL, expand = c(0, 0), position = "top") +
  scale_y_discrete(NULL, expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 60, vjust = 0.5, hjust = 0))
I am trying to tweak the code so that original NA values (seen on the plot as the tiles forming a diagonal line from the co-occurrence of WILL WILL to HAVE TO HAVE TO, and the X HAVE TO column) are represented as black tiles separately from the other tiles which I would like to keep as they are.
Looking for tips on how to do this as I think I'm doing something wrong with the representation of values at the beginning of my code.
All the best
Cameron
I know this question might be duplicated, but I was trying some of the solutions posted in this forum with no success, and that's why I am posting it here.
Let's start with my dataset to make it reproducible.
# Example data (dput output): one row per pairwise comparison, with
# binary DC1-DC10 scores; DC1 and DC2 contain NAs. Column 1
# (Comparison) is character, which is why as.matrix() on the whole
# frame yields a character matrix and heatmap() rejects it.
dataset <- structure(list(Comparison = c("SH vs SAP", "SH vs NEA", "SH vs ERE",
"SH vs ERH", "SH vs NAL", "SAP vs NEA", "SAP vs ERE", "SAP vs ERH",
"SAP vs NAL", "NEA vs ERE", "NEA vs ERH", "NEA vs NAL", "ERE vs ERH",
"ERE vs NAL", "ERH vs NAL"), DC1 = c(NA, NA, NA, NA, NA, 1, 1,
1, NA, 1, 1, NA, 1, NA, NA), DC2 = c(NA, NA, NA, NA, NA, 1, 1,
1, NA, 0, 0, NA, 1, NA, NA), DC3 = c(1, 1, 1, 1, 1, 1, 1, 1,
0, 1, 0, 0, 1, 0, 1), DC4 = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 1, 1, 1), DC5 = c(0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1), DC6 = c(0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), DC7 = c(0,
1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1), DC8 = c(0, 1, 0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1), DC9 = c(0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0), DC10 = c(1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0)), .Names = c("Comparison", "DC1", "DC2", "DC3",
"DC4", "DC5", "DC6", "DC7", "DC8", "DC9", "DC10"), class = "data.frame", row.names = c(NA,
15L))
I have tried to change the dataset to a matrix, as this has been suggested in other posts. However, it keeps giving the same error.
heatmap(dataset)
heatmap(as.matrix(dataset))
Error in heatmap(dataset) :
'x' must be a numeric matrix
Error in heatmap(as.matrix(dataset)) :
'x' must be a numeric matrix
I tried to convert the columns to numeric, but the error persists. The same happens when I remove the DC1 and DC2 columns, which contain NA values.
Any help to spot the problem?
dataset[, 1] is character so as.matrix(dataset) is a character matrix. This explains:
'x' must be a numeric matrix
Your probably want
heatmap(as.matrix(dataset[, -1]))
And how can I include the names of the rows on the right?
Set the Comparison variable as the rownames of the matrix:
# Drop the character Comparison column so the remaining DC columns form
# a numeric matrix, then carry the comparison labels over as row names
# so heatmap() prints them on the right margin.
dc_matrix <- as.matrix(dataset[, -1])
rownames(dc_matrix) <- dataset$Comparison
heatmap(dc_matrix)
So your real issue is really Convert the values in a column into row names in an existing data frame in R although the problem is presented with heatmap.
I am trying to construct a bootstrap t confidence interval for 3 parameter estimates but I only able to construct for first parameter. Attached below is my code:
# True model parameters and simulation settings.
beta0 <- -8      # intercept of the Gompertz hazard
beta1 <- 0.03    # covariate effect
gamma <- 0.0105  # Gompertz shape (masks base::gamma inside this script)
alpha <- 0.05    # nominal significance level
n <- 100         # observations per simulated data set
N <- 10          # number of simulation replicates
# Simulation loop: each replicate draws a censored Gompertz survival
# sample, bootstraps the three ML parameter estimates, and builds
# studentized ("bootstrap t") and percentile confidence intervals.
# NOTE(review): ci/ci1/ci2/ci3 are overwritten on every iteration;
# collect them in a list if all N replicates are needed.
for (i in 1:N)
{
  u <- runif(n)
  x <- rnorm(n)
  # Censoring times. Beware: this `c` masks base::c() inside the loop.
  c <- rexp(n, 1 / 1255)
  # Gompertz event times via inverse-CDF sampling.
  t1 <- (1 / gamma) * log(1 - ((gamma / exp(beta0 + beta1 * x)) * log(1 - u)))
  t <- pmin(t1, c)       # observed time: min(event, censoring)
  delta <- 1 * (t1 > c)  # 1 = censored, 0 = observed event
  # Split censored cases: ~75% right-censored (1), ~25% interval-censored (2).
  delta[delta == 1] <- ifelse(rbinom(length(delta[delta == 1]), 1, 0.75), 1, 2)
  deltae <- ifelse(delta == 0, 1, 0)  # event indicator
  deltar <- ifelse(delta == 1, 1, 0)  # right-censoring indicator
  deltai <- ifelse(delta == 2, 1, 0)  # interval-censoring indicator
  dat <- data.frame(t, delta, deltae, deltar, deltai, x)
  # Interval bounds of width 100 for the interval-censored observations.
  dat$interval[delta == 2] <- as.character(cut(dat$t[delta == 2], breaks = seq(0, 600, 100)))
  labs <- cut(dat$t[delta == 2], breaks = seq(0, 600, 100))
  dat$lower[delta == 2] <- as.numeric(sub("\\((.+),.*", "\\1", labs))
  dat$upper[delta == 2] <- as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs))

  # Bootstrap statistic: the three ML estimates followed by their
  # estimated variances (diagonal of the inverse negative Hessian).
  # This 3+3 layout is exactly what boot.ci() needs for studentized
  # intervals via a length-2 `index`.
  beta0hat.boot <- function(data, j)
  {
    dat <- data[j, ]
    data0 <- dat[which(dat$delta == 0), ] # uncensored data
    data1 <- dat[which(dat$delta == 1), ] # right-censored data
    data2 <- dat[which(dat$delta == 2), ] # interval-censored data
    library(maxLik)
    # Log-likelihood combining the three censoring types (no imputation).
    ll <- function(para)
    {
      b0 <- para[1]
      b1 <- para[2]
      g <- para[3]
      e <- sum((b0 + b1 * data0$x) + g * data0$t + (1 / g) * exp(b0 + b1 * data0$x) * (1 - exp(g * data0$t)))
      r <- sum((1 / g) * exp(b0 + b1 * data1$x) * (1 - exp(g * data1$t)))
      i <- sum(log(exp((1 / g) * exp(b0 + b1 * data2$x) * (1 - exp(g * data2$lower))) - exp((1 / g) * exp(b0 + b1 * data2$x) * (1 - exp(g * data2$upper)))))
      e + r + i
    }
    est <- maxLik(logLik = ll, start = c(-8, 0.03, 0.0105))
    observed <- solve(-est$hessian)
    c(est$estimate[1], est$estimate[2], est$estimate[3],
      observed[1, 1], observed[2, 2], observed[3, 3])
  }

  library(boot)
  out <- boot(dat, beta0hat.boot, 100)
  # Kept for backward compatibility: studentized interval for the first
  # statistic with the variances passed explicitly.
  ci <- boot.ci(out, type = c("stud", "perc"), var.t0 = out$t0[4], var.t = out$t[, 4])
  # Fix for the "bootstrap variances needed" warning: when the bootstrap
  # statistic returns several components, pass `index` as a length-2
  # vector c(position of the estimate, position of its variance) instead
  # of a scalar index plus separate var.t0/var.t (which are then ignored).
  ci1 <- boot.ci(out, type = c("stud", "perc"), index = c(1, 4))
  ci2 <- boot.ci(out, type = c("stud", "perc"), index = c(2, 5))
  ci3 <- boot.ci(out, type = c("stud", "perc"), index = c(3, 6))
}
I am only able to construct the confidence interval for the first parameter, and only without using index=1, as follows:
ci<-boot.ci(out,type =c("stud","perc"),var.t0= out$t0[4],var.t=out$t[,4])
when i add in index=1,
ci1<-boot.ci(out,type = c("stud","perc"),var.t0= out$t0[4],var.t=out$t[,4],index=1)
i got warnings:
In boot.ci(out, type = c("stud", "perc"), var.t0 = out$t0[4], var.t = out$t[, :
bootstrap variances needed for studentized intervals.
which caused the confidence interval cannot be built.
I want to add index=1 because I need to construct confidence interval for 2nd and 3rd parameter as well by using index=2 and index=3 but after I add in the index, i cannot get the confidence interval. Any idea how to get bootstrap t confidence interval for more than one statistics? I am able to get the percentile interval only by using the above code but not able to get bootstrap t interval.
dput(head(dat, 30)):
# dput(head(dat, 30)) output: simulated survival data with observed time
# t, censoring type delta (0 = event, 1 = right-, 2 = interval-censored),
# the three indicator columns, covariate x, and interval/lower/upper
# bounds filled in only for the interval-censored rows.
# NOTE(review): this literal is not assigned to a name; assign it
# (e.g. dat <- structure(...)) to reuse it.
structure(list(t = c(143.786081550783, 104.647251273501, 65.5655695306165,
322.260530195967, 307.324160556309, 123.725827237157, 143.083942557736,
426.646805286557, 360.799323547846, 246.295906287976, 315.638222801499,
421.155652813492, 374.160625875751, 123.570819609099, 389.553807438155,
449.110810924173, 162.547725691726, 296.674617375856, 229.680453578098,
343.823664337269, 268.797764971971, 205.704838761594, 14.8630247008987,
91.6607201565057, 260.886289739501, 193.278377859747, 143.269719508224,
27.4780640122481, 267.191708749538, 39.8027657018974), delta = c(1,
0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 0, 1, 0, 1, 0, 1), deltae = c(0, 1, 1, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0
), deltar = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1), deltai = c(0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), x = c(-0.377643157724086, 0.218708630964153,
0.153162542263512, 1.28222373181234, 1.1423312203422, -1.04726335577352,
-0.360028857222088, 0.336098821643731, 0.758860540656158, 0.0533940581013279,
-0.0562132826178519, 0.0798656325287969, -0.748956798800072,
-0.235929730488004, -0.737049993834757, 1.05819046250488, 1.28776064495481,
0.457930197196181, -1.45563588128583, -1.1074384621643, -0.829026816498185,
-1.3824961444269, -1.58951008909026, -0.95052226776903, 0.0145909317659764,
-0.198775419436042, 0.0481467746529365, -0.136098038815233, -0.118319488454131,
-0.498263758291143), interval = c(NA, NA, NA, NA, "(300,400]",
NA, NA, NA, NA, NA, "(300,400]", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), lower = c(NA,
NA, NA, NA, 300, NA, NA, NA, NA, NA, 300, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), upper = c(NA,
NA, NA, NA, 400, NA, NA, NA, NA, NA, 400, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("t",
"delta", "deltae", "deltar", "deltai", "x", "interval", "lower",
"upper"), row.names = c(NA, 30L), class = "data.frame")
I have a dataframe with lat, long and dozens of other columns. Basically, I want to write a loop where X=Long,Y=Lat remains constant, but the color changes in every loop. The color is basically the other columns, one for every plot. How can I do this?
library(maps)
library(ggplot2)
library(RColorBrewer)
library(reshape)
# US state outlines used as the base polygon layer for every map.
usamap <- ggplot2::map_data("state")
# Reversed Spectral palette generator for the point colour gradient.
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
simplefun <- function(colname) {
  # Draw a US map with station points coloured by the column whose name
  # is given in `colname`, then save it to "<colname>.png".
  #
  # @param colname Character scalar: name of a numeric column in `stat`.
  # NOTE(review): relies on the global objects `stat`, `usamap` and
  # `myPalette` being defined before the call.
  p <- ggplot() +
    geom_polygon(data = usamap, aes(x = long, y = lat, group = group),
                 colour = "black", fill = "white") +
    # Fix: aes_string() expects column *names* as strings. The original
    # passed the vectors stat$longitude / stat$latitude, which bypasses
    # the data-masking and only worked by accident.
    geom_point(data = stat,
               aes_string(x = "longitude", y = "latitude", color = colname)) +
    scale_colour_gradientn(name = "name", colours = myPalette(10)) +
    xlab('Longitude') +
    ylab('Latitude') +
    coord_map(projection = "mercator") +
    theme_bw() +
    theme(line = element_blank()) +
    theme(legend.position = c(.93, .20),
          panel.grid.major = element_line(colour = "#808080"))
  # Fix: ggsave() is not a layer, so don't chain it with `+`; save the
  # finished plot explicitly.
  ggsave(paste0(colname, ".png"), plot = p, width = 10, height = 8, dpi = 300)
}
# Columns 4..16 of `stat` hold the metrics to plot; render one map
# (and one PNG) per metric column.
colname<-names(stat[4:16])
lapply(colname,simplefun)
dput(droplevels(stat))
# dput(droplevels(stat)) output: one row per station (siteId) with its
# coordinates (latitude/longitude) and the model-vs-observation metric
# columns (positions 4-16) that simplefun() maps one at a time.
# NOTE(review): not assigned here; assign it (stat <- structure(...))
# before running the plotting code above.
structure(list(siteId = structure(1:16, .Label = c("US1NYAB0001",
"US1NYAB0006", "US1NYAB0010", "US1NYAB0021", "US1NYAB0023", "US1NYAB0028",
"US1NYAB0032", "US1NYAL0002", "US1NYBM0004", "US1NYBM0007", "US1NYBM0011",
"US1NYBM0014", "US1NYBM0021", "US1NYBM0024", "US1NYBM0032", "US1NYBM0034"
), class = "factor"), latitude = c(42.667, 42.7198, 42.5455,
42.6918, 42.6602, 42.7243, 42.5754, 42.2705, 42.0296, 42.0493,
42.0735, 42.3084, 42.0099, 42.1098, 42.1415, 42.0826), longitude = c(-74.0509,
-73.9304, -74.1475, -73.8311, -73.8103, -73.757, -73.7995, -77.9419,
-76.0213, -76.0288, -75.9296, -75.9569, -75.5142, -75.8858, -75.889,
-75.9912), no = c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L), min_obs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), min_mod = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), avg_obs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15,
0, 0, 0, 0, 0, 0), avg_mod = c(3136.8388671875, 2997.28173828125,
3258.61840820312, 2970.74340820312, 2992.9765625, 0, 3075.54443359375,
2701.03662109375, 2974.23413085938, 2967.5029296875, 3004.57861328125,
2965.07470703125, 3260.25463867188, 3028.55590820312, 2981.8876953125,
0), max_obs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0,
0), max_mod = c(6273.677734375, 5994.5634765625, 6517.23681640625,
5941.48681640625, 5985.953125, 0, 6151.0888671875, 5402.0732421875,
5948.46826171875, 5935.005859375, 6009.1572265625, 5930.1494140625,
6520.50927734375, 6057.11181640625, 5963.775390625, 0), mean_bias = c(0,
0, 0, 0, 0, NaN, 0, 0, 0, 5.05475490855863e-05, 0, 0, 0, 0, 0,
NaN), corr_coef = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA), additive_bias = c(-6273.677734375, -5994.5634765625,
-6517.23681640625, -5941.48681640625, -5985.953125, 0, -6151.0888671875,
-5402.0732421875, -5948.46826171875, -5934.705859375, -6009.1572265625,
-5930.1494140625, -6520.50927734375, -6057.11181640625, -5963.775390625,
0), mean_error = c(-3136.8388671875, -2997.28173828125, -3258.61840820312,
-2970.74340820312, -2992.9765625, 0, -3075.54443359375, -2701.03662109375,
-2974.23413085938, -2967.3529296875, -3004.57861328125, -2965.07470703125,
-3260.25463867188, -3028.55590820312, -2981.8876953125, 0), mean_abs_error = c(3136.8388671875,
2997.28173828125, 3258.61840820312, 2970.74340820312, 2992.9765625,
0, 3075.54443359375, 2701.03662109375, 2974.23413085938, 2967.3529296875,
3004.57861328125, 2965.07470703125, 3260.25463867188, 3028.55590820312,
2981.8876953125, 0), rmse = c(4436.16006895562, 4238.79648453055,
4608.38234747949, 4201.26561821133, 4232.7080465523, 0, 4349.47664966936,
3819.84262201718, 4206.20224553428, 4196.4707575116, 4249.11582411849,
4193.24886413303, 4610.69632679956, 4283.02483978603, 4217.02602018439,
0)), .Names = c("siteId", "latitude", "longitude", "no", "min_obs",
"min_mod", "avg_obs", "avg_mod", "max_obs", "max_mod", "mean_bias",
"corr_coef", "additive_bias", "mean_error", "mean_abs_error",
"rmse"), row.names = c(NA, -16L), class = "data.frame")
I had the same problem and I solve it like this:
Let's assume ggplot_data in my code is like your dataframe (with more than two columns) in the second post. (dput(droplevels(stat))?)
# Sketch: melt a wide frame to long and plot one shape/colour per
# original column.
# NOTE(review): `ggplot_data`, `colname` and `mypalette` must already
# exist in the workspace — none are defined in this snippet.
library(reshape) # package for melting data
# Shape count is taken from the *wide* frame, so it includes the X1 id
# column; presumably that is intentional (one spare shape) — verify.
shapes <- 1:ncol(ggplot_data) # number of shapes
ggplot_data <- melt(ggplot_data, id = "X1") # melt data together
p1 <- ggplot(ggplot_data, aes(X1,value))
p1 <- p1 +aes(shape = factor(variable))+ # different shapes
geom_point(aes(colour = factor(variable)))+ # different colors
scale_shape_manual(labels=colname, values = shapes)+ # same for the legend
scale_color_manual(labels=colname, values = mypalette) # same for legend