ggplot2 and first data point in a line - r
I am creating two plots using ggplot2 and then using grid.arrange to merge them together. I should say that both of the plots are also using facet_grid for a visual tweaking.
My problem is that the bottom plot, which is really a data table, ends up being "cut off" on BOTH the left and right sides because of the starting and ending positions of the facets. Is there a way for me to tweak this so that the points are not getting cut off?
Here is the data to reproduce it:
df <- structure(list(SurveyID = c(16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L,
47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L,
56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L,
56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 76L, 76L,
76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L,
76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 83L, 83L, 83L, 83L
), MEPSID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), ServiceID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L), .Label = c("Army", "Navy", "Marines", "Air Force"
), class = "factor"), SurveyReturnedYear = c(2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L), SurveyReturnedMonth = c(10L, 10L,
10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L,
9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L,
12L, 12L, 12L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L), CompletedSurvey = c(23L, 19L, 38L, 16L, 11L,
16L, 38L, 19L, 6L, 14L, 41L, 10L, 6L, 32L, 46L, 18L, 12L, 30L,
35L, 18L, 11L, 32L, 23L, 19L, 8L, 24L, 46L, 19L, 18L, 28L, 30L,
19L, 12L, 27L, 32L, 15L, 20L, 31L, 34L, 26L, 30L, 25L, 26L, 17L,
41L, 16L, 24L, 12L, 43L, 23L, 22L, 15L, 29L, 21L, 22L, 18L, 38L,
10L, 20L, 13L, 46L, 19L, 19L, 9L, 32L, 10L, 17L, 27L, 31L, 21L,
17L, 18L, 30L, 18L, 19L, 20L, 22L, 23L, 17L, 17L, 34L, 21L, 16L,
4L, 34L, 29L, 20L, 18L, 25L, 21L, 24L, 19L, 15L, 16L, 18L, 13L,
28L, 19L, 24L, 0L, 23L, 13L, 13L, 2L, 34L, 13L, 22L, 4L, 17L,
26L, 5L, 17L, 27L, 18L, 30L, 0L, 30L, 11L, 34L, 0L, 27L, 9L,
34L, 0L), TotalSurvey = c(41L, 19L, 47L, 22L, 43L, 21L, 49L,
23L, 39L, 16L, 44L, 11L, 49L, 34L, 56L, 33L, 39L, 33L, 42L, 21L,
50L, 37L, 56L, 23L, 34L, 26L, 53L, 19L, 36L, 32L, 44L, 21L, 38L,
27L, 49L, 18L, 41L, 34L, 58L, 26L, 37L, 25L, 40L, 21L, 44L, 17L,
51L, 16L, 51L, 24L, 32L, 22L, 34L, 21L, 37L, 20L, 44L, 10L, 36L,
18L, 59L, 21L, 35L, 13L, 46L, 12L, 44L, 29L, 49L, 21L, 36L, 18L,
47L, 19L, 41L, 21L, 29L, 23L, 40L, 20L, 39L, 21L, 38L, 4L, 41L,
30L, 54L, 21L, 30L, 22L, 56L, 24L, 19L, 16L, 49L, 25L, 34L, 22L,
54L, 20L, 33L, 14L, 40L, 10L, 37L, 14L, 43L, 23L, 27L, 30L, 40L,
22L, 34L, 19L, 37L, 23L, 32L, 19L, 37L, 26L, 35L, 11L, 37L, 31L
), meps_labels = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Albany", "Albuquerque", "Amarillo",
"Anchorage", "Atlanta", "Baltimore", "Beckley", "Boise", "Boston",
"Buffalo", "Butte", "Charlotte", "Chicago", "Cleveland", "Columbus",
"Dallas", "Denver", "Des Moines", "Detroit", "El Paso", "Fargo",
"Fort Dix", "Fort Jackson", "Fort Lee", "Harrisburg", "Honolulu",
"Houston", "Indianapolis", "Jackson", "Jacksonville", "Kansas City",
"Knoxville", "Lansing", "Little Rock", "Los Angeles", "Louisville",
"Memphis", "Miami", "Milwaukee", "Minneapolis", "Montgomery",
"Nashville", "New Orleans", "New York", "Oklahoma City", "Omaha",
"Phoenix", "Pittsburgh", "Portland, ME", "Portland, OR", "Raleigh",
"Sacramento", "Salt Lake City", "San Antonio", "San Diego", "San Jose",
"San Juan", "Seattle", "Shreveport", "Sioux Falls", "Spokane",
"Springfield", "St. Louis", "Syracuse", "Tampa"), class = "factor"),
RR = c(56, 100, 81, 73, 26, 76, 78, 83, 15, 88, 93, 91, 12,
94, 82, 55, 31, 91, 83, 86, 22, 86, 41, 83, 24, 92, 87, 100,
50, 88, 68, 90, 32, 100, 65, 83, 49, 91, 59, 100, 81, 100,
65, 81, 93, 94, 47, 75, 84, 96, 69, 68, 85, 100, 59, 90,
86, 100, 56, 72, 78, 90, 54, 69, 70, 83, 39, 93, 63, 100,
47, 100, 64, 95, 46, 95, 76, 100, 42, 85, 87, 100, 42, 100,
83, 97, 37, 86, 83, 95, 43, 79, 79, 100, 37, 52, 82, 86,
44, 0, 70, 93, 32, 20, 92, 93, 51, 17, 63, 87, 12, 77, 79,
95, 81, 0, 94, 58, 92, 0, 77, 82, 92, 0), Time = structure(c(15614,
15614, 15614, 15614, 15645, 15645, 15645, 15645, 15675, 15675,
15675, 15675, 15706, 15706, 15706, 15706, 15737, 15737, 15737,
15737, 15765, 15765, 15765, 15765, 15796, 15796, 15796, 15796,
15826, 15826, 15826, 15826, 15857, 15857, 15857, 15857, 15887,
15887, 15887, 15887, 15918, 15918, 15918, 15918, 15949, 15949,
15949, 15949, 15979, 15979, 15979, 15979, 16010, 16010, 16010,
16010, 16040, 16040, 16040, 16040, 16071, 16071, 16071, 16071,
16102, 16102, 16102, 16102, 16130, 16130, 16130, 16130, 16161,
16161, 16161, 16161, 16191, 16191, 16191, 16191, 16222, 16222,
16222, 16222, 16252, 16252, 16252, 16252, 16283, 16283, 16283,
16283, 16314, 16314, 16314, 16314, 16344, 16344, 16344, 16344,
16375, 16375, 16375, 16375, 16405, 16405, 16405, 16405, 16436,
16436, 16436, 16436, 16467, 16467, 16467, 16467, 16495, 16495,
16495, 16495, 16526, 16526, 16526, 16526), class = "Date"),
Year = c("2012", "2012", "2012", "2012", "2012", "2012",
"2012", "2012", "2012", "2012", "2012", "2012", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2015", "2015",
"2015", "2015", "2015", "2015", "2015", "2015", "2015", "2015",
"2015", "2015", "2015", "2015", "2015", "2015")), .Names = c("SurveyID",
"MEPSID", "ServiceID", "SurveyReturnedYear", "SurveyReturnedMonth",
"CompletedSurvey", "TotalSurvey", "meps_labels", "RR", "Time",
"Year"), row.names = c(1L, 2L, 3L, 4L, 261L, 262L, 263L, 264L,
521L, 522L, 523L, 524L, 781L, 782L, 783L, 784L, 1041L, 1042L,
1043L, 1044L, 1301L, 1302L, 1303L, 1304L, 1561L, 1562L, 1563L,
1564L, 1821L, 1822L, 1823L, 1824L, 2081L, 2082L, 2083L, 2084L,
2341L, 2342L, 2343L, 2344L, 2601L, 2602L, 2603L, 2604L, 2861L,
2862L, 2863L, 2864L, 3121L, 3122L, 3123L, 3124L, 3381L, 3382L,
3383L, 3384L, 3641L, 3642L, 3643L, 3644L, 3901L, 3902L, 3903L,
3904L, 4161L, 4162L, 4163L, 4164L, 4421L, 4422L, 4423L, 4424L,
4681L, 4682L, 4683L, 4684L, 4941L, 4942L, 4943L, 4944L, 5201L,
5202L, 5203L, 5204L, 5461L, 5462L, 5463L, 5464L, 5721L, 5722L,
5723L, 5724L, 5981L, 5982L, 5983L, 5984L, 6241L, 6242L, 6243L,
6244L, 6501L, 6502L, 6503L, 6504L, 6761L, 6762L, 6763L, 6764L,
7021L, 7022L, 7023L, 7024L, 7281L, 7282L, 7283L, 7284L, 7541L,
7542L, 7543L, 7544L, 7801L, 7802L, 7803L, 7804L), class = "data.frame")
And the code:
library(ggplot2)
library(grid)
library(scales)
library(gridExtra)
# Line chart of the response rate (RR) over Time for the MEPSID == 1 rows,
# one coloured line per ServiceID and one facet per Year. Scales and space are
# free, so every facet only spans the months it actually contains.
p <- ggplot(data = df[df$MEPSID == 1, ],
            aes(x = Time, y = RR, colour = ServiceID, group = ServiceID,
                label = round(RR))) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(y = "Response Rate") +
  # The limit goes to 110 although the breaks stop at 100, which leaves some
  # head-room above the top grid line.
  coord_cartesian(ylim = c(0, 110)) +
  geom_line(size = .5) +
  geom_point() +
  scale_color_manual(values = c("green4", "blue4", "red4", "dodgerblue")) +
  ggtitle("Counts") +
  theme(plot.title = element_text(size = 18, face = "bold", vjust = 1),
        axis.title = element_text(size = 16),
        axis.text.x = element_text(size = 10, angle = 90),
        axis.line = element_line(colour = "black", size = .2),
        legend.background = element_rect(fill = "transparent"),
        # legend.position used to be passed twice ("top" and "none"). A
        # duplicated argument is rejected by theme() in current ggplot2, so
        # only one may remain; "top" is kept because the legend is styled
        # explicitly by the surrounding legend.* settings.
        legend.position = "top",
        legend.title = element_blank(),
        legend.margin = unit(-0.6, "cm"),
        legend.text = element_text(size = 14),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.background = element_blank(),
        panel.grid.major.y = element_line(colour = "gray", linetype = "solid",
                                          size = .2)) +
  scale_x_date(labels = date_format("%b"), breaks = date_breaks("month")) +
  facet_grid(~Year, scales = "free", space = "free")
# Text "table" shown under the chart: Time on x, one row per ServiceID, and the
# rounded RR printed as a label at each position. Faceted by Year, with the
# y axis listing the ServiceID levels in reverse order.
p2 <- ggplot(
  df[df$MEPSID == 1, ],
  aes(
    x = Time,
    y = ServiceID,
    label = format(round(RR), nsmall = 0),
    colour = ServiceID
  )
) +
  geom_text(size = 3.5) +
  theme(
    panel.grid.major = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.x = element_text(),
    axis.ticks = element_blank(),
    plot.margin = unit(c(-0.5, 1, 0, 0.5), "lines")
  ) +
  labs(x = NULL, y = NULL) +
  scale_x_date(
    labels = NULL,
    breaks = date_breaks("month"),
    expand = c(0.05, 0.05)
  ) +
  facet_grid(~Year, scales = "free", space = "free_x") +
  scale_y_discrete(limits = rev(levels(df$ServiceID))) +
  scale_color_manual(values = c("green4", "blue4", "red4", "dodgerblue"))
# Stack the chart (p) on top of the text table (p2) with a 5:1 height ratio.
grid.arrange(
  arrangeGrob(
    p, p2,
    nrow = 2,
    heights = c(5, 1)
  )
)
You can use geom_blank to fine-tune facetted scales.
# Padding rows for the first (2012) and last (2015) facets. geom_blank() draws
# nothing, but its data still takes part in scale training, so these dates
# stretch the x range of those two facets in both plots.
pad_dates <- as.Date(c("2012-09-20", "2012-12-15",
                       "2014-12-20", "2015-04-10"))
pad_years <- c(2012, 2012, 2015, 2015)
# Reset the aesthetics inherited from the plots that the padding data lacks.
pad_aes <- aes(colour = NULL, group = NULL, label = NULL)

grid.arrange(
  p + geom_blank(
    data = data.frame(Time = pad_dates, RR = 1:4, Year = pad_years),
    mapping = pad_aes
  ),
  p2 + geom_blank(
    data = data.frame(Time = pad_dates, ServiceID = 1:4, Year = pad_years),
    mapping = pad_aes
  ),
  nrow = 2, heights = c(5, 1)
)
Another option is to adjust the text using hjust as an aesthetic. First add it to the data as its own column, then pass that column to the ggplot call:
library(data.table)
# Take the MEPSID == 1 rows of df and convert that subset to a data.table.
DX <- setDT(df[df$MEPSID==1,])
# Within each Year: 0.1 where Time is the group minimum, 0.8 where it is the
# group maximum, 0.4 for every other row.
DX[,hjust:=ifelse(Time==min(Time),0.1,ifelse(Time==max(Time),0.8,0.4)),Year] # adds a new column called hjust
# Same plot start as the original p2, with the new hjust column mapped to the
# hjust aesthetic.
p2<-ggplot(DX,
aes(x = Time, y = ServiceID, label=format(round(RR), nsmall=0),
colour = ServiceID,hjust=hjust)) +
## ... followed by the rest of the layers of plot 2 (not repeated here)
Some explanation:
Here you are plotting a text using (Time versus ServiceID) by year.
Since we want to shift our text horizontally, we will do it according to the value of Time (the x-coordinate). More precisely, we will just shift the leftmost points to the right and the rightmost points to the left. This is done by setting a different hjust value for each group of values (left vs. right).
So for each year (each facet), I horizontally adjust the points corresponding to the min of Time (the extreme left points of the facet) and the max of Time (the extreme right points of the facet). There is no need to adjust the other points, even though I do it here.
# Per facet (Year), choose the horizontal justification of each label from its
# position in time within that facet.
DX[, hjust := {
  first_time <- min(Time)
  last_time <- max(Time)
  ifelse(
    Time == first_time, 0.1,            # extreme left point of the facet
    ifelse(Time == last_time, 0.8,      # extreme right point of the facet
           0.4)                         # every other point
  )
}, by = Year]
You can do the transformation in base R using ave:
# Base R version of the per-Year hjust computation, using ave() to apply the
# function within each Year group.
# `xx` was not defined anywhere in the original snippet. It is set here to the
# same MEPSID == 1 subset of df used for the plots, and the result is stored
# as a column rather than being computed and thrown away.
xx <- df[df$MEPSID == 1, ]
xx$hjust <- ave(
  as.numeric(xx$Time), xx$Year,
  FUN = function(x) {
    # 0.1 for the group minimum, 0.8 for the group maximum, 0.4 otherwise
    ifelse(x == min(x), 0.1, ifelse(x == max(x), 0.8, 0.4))
  }
)
Related
ANOVA error: why is each row of output *not* identified by a unique combination of keys?
I have a two-way ANOVA test (w/repeated measures) that I'm using with four almost identical datasets: > res.aov <- anova_test( + data = LST_Weather_dataset_N, dv = LST, wid = Month, + within = c(Buffer, TimePeriod), + effect.size = "ges", + detailed = TRUE, + ) Where: LST = surface temperature deviation in C Month = 1-12 Buffer = a value 100-1900 - one of 19 areas outward from the boundary of a solar power plant (each 100m wide) TimePeriod = a factor with a value of 1 or 2 corresponding to pre-/post-construction of a solar power plant. For one dataset I get the error: Error: Each row of output must be identified by a unique combination of keys. Keys are shared for 38 rows: * 10, 11 * 217, 218 * 240, 241 * 263, 264 * 286, 287 * 309, 310 * 332, 333 ... As far as I can tell I have unique combinations. dplyr::count(LST_Weather_dataset_N, LST, Month, Buffer, TimePeriod, sort = TRUE) returns LST Month Buffer TimePeriod n 1 -6.309045316 12 100 2 1 2 -5.655279925 9 1000 2 1 3 -5.224196295 12 200 2 1 4 -5.194473224 9 1100 2 1 5 -5.025429891 12 400 2 1 6 -4.987575966 9 700 2 1 7 -4.979453868 12 600 2 1 8 -4.825298768 12 300 2 1 9 -4.668994574 12 500 2 1 10 -4.652282192 12 700 2 1 ... 'n' is always 1. I can't work out why this is happening. 
Extract of datafram below: > dput(LST_Weather_dataset_N[sample(1:nrow(LST_Weather_dataset_N), 50),]) structure(list(Buffer = c(1400L, 700L, 300L, 1400L, 100L, 200L, 1700L, 100L, 800L, 1900L, 1100L, 100L, 700L, 800L, 1400L, 400L, 1300L, 200L, 1200L, 500L, 1200L, 1300L, 400L, 1000L, 1300L, 1100L, 100L, 300L, 300L, 600L, 1100L, 1400L, 1500L, 1600L, 1700L, 1800L, 1700L, 1300L, 1200L, 300L, 1100L, 1900L, 1700L, 700L, 1400L, 1200L, 1600L, 1700L, 1900L, 1300L), Date = c("02/05/2014", "18/01/2017", "19/06/2014", "25/12/2013", "15/09/2017", "08/04/2017", "22/08/2014", "21/07/2014", "13/07/2017", "25/12/2013", "22/10/2013", "02/05/2014", "07/03/2017", "15/03/2014", "13/07/2017", "19/06/2014", "25/12/2013", "17/10/2017", "16/04/2014", "06/10/2013", "15/09/2017", "18/01/2017", "10/01/2014", "17/12/2016", "13/07/2017", "19/06/2014", "07/03/2017", "15/03/2014", "11/02/2014", "22/10/2013", "06/10/2013", "15/09/2017", "16/04/2014", "18/01/2017", "15/03/2014", "21/07/2014", "17/10/2017", "15/09/2017", "10/01/2014", "23/09/2014", "16/04/2014", "22/10/2013", "11/06/2017", "26/05/2017", "19/06/2014", "14/08/2017", "11/02/2014", "26/02/2017", "26/02/2017", "11/02/2014"), LST = c(1.255502397, 4.33385966, 3.327025603, -0.388631166, -0.865430798, 4.386292648, -0.243018665, 3.276865987, 0.957036835, -0.065821795, 0.69731779, 4.846851651, -1.437700684, 1.003808572, 0.572460421, 2.995902374, -0.334633662, -1.231447567, 0.644520741, 0.808262029, -3.392959991, 2.324569449, 2.346707612, -3.124354627, 0.58719862, 1.904859254, 1.701580958, 2.792443253, 1.638270039, 1.460743317, 0.699767335, -3.015643366, 0.930527864, 1.309519336, 0.477789664, 0.147584938, -0.498188865, -3.506795723, -1.007487965, 1.149604087, 1.192366386, 0.197471474, 0.999391224, -0.190613618, 1.27324015, 2.686622796, 0.573109026, 0.97847983, 0.395005095, -0.40855426), Month = c(5L, 1L, 6L, 12L, 9L, 4L, 8L, 7L, 7L, 12L, 10L, 5L, 3L, 3L, 7L, 6L, 12L, 10L, 4L, 10L, 9L, 1L, 1L, 12L, 7L, 6L, 3L, 3L, 2L, 10L, 10L, 9L, 4L, 1L, 3L, 7L, 
10L, 9L, 1L, 9L, 4L, 10L, 6L, 5L, 6L, 8L, 2L, 2L, 2L, 2L), Year = c(2014L, 2017L, 2014L, 2013L, 2017L, 2017L, 2014L, 2014L, 2017L, 2013L, 2013L, 2014L, 2017L, 2014L, 2017L, 2014L, 2013L, 2017L, 2014L, 2013L, 2017L, 2017L, 2014L, 2016L, 2017L, 2014L, 2017L, 2014L, 2014L, 2013L, 2013L, 2017L, 2014L, 2017L, 2014L, 2014L, 2017L, 2017L, 2014L, 2014L, 2014L, 2013L, 2017L, 2017L, 2014L, 2017L, 2014L, 2017L, 2017L, 2014L ), JulianDay = c(122L, 18L, 170L, 359L, 258L, 98L, 234L, 202L, 194L, 359L, 295L, 122L, 66L, 74L, 194L, 170L, 359L, 290L, 106L, 279L, 258L, 18L, 10L, 352L, 194L, 170L, 66L, 74L, 42L, 295L, 279L, 258L, 106L, 18L, 74L, 202L, 290L, 258L, 10L, 266L, 106L, 295L, 162L, 146L, 170L, 226L, 42L, 57L, 57L, 42L), TimePeriod = c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L), Temperature = c(28L, 9L, 31L, 12L, 27L, 21L, 29L, 36L, 38L, 12L, 23L, 28L, 12L, 21L, 38L, 31L, 12L, 23L, 25L, 22L, 27L, 9L, 11L, 7L, 38L, 31L, 12L, 21L, 14L, 23L, 22L, 27L, 25L, 9L, 21L, 36L, 23L, 27L, 11L, 31L, 25L, 23L, 29L, 27L, 31L, 34L, 14L, 16L, 16L, 14L), Humidity = c(6L, 34L, 7L, 31L, 29L, 22L, 34L, 15L, 19L, 31L, 16L, 6L, 14L, 14L, 19L, 7L, 31L, 12L, 9L, 12L, 29L, 34L, 33L, 18L, 19L, 7L, 14L, 14L, 31L, 16L, 12L, 29L, 9L, 34L, 14L, 15L, 12L, 29L, 33L, 18L, 9L, 16L, 8L, 13L, 7L, 13L, 31L, 31L, 31L, 31L), Wind_speed = c(6L, 0L, 6L, 7L, 13L, 33L, 6L, 20L, 9L, 7L, 0L, 6L, 0L, 6L, 9L, 6L, 7L, 6L, 0L, 7L, 13L, 0L, 0L, 35L, 9L, 6L, 0L, 6L, 6L, 0L, 7L, 13L, 0L, 0L, 6L, 20L, 6L, 13L, 0L, 0L, 0L, 0L, 24L, 11L, 6L, 24L, 6L, 26L, 26L, 6L), Wind_gust = c(0L, 0L, 0L, 0L, 0L, 54L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 39L, 0L, 41L, 41L, 0L), Wind_trend = c(1L, 0L, 1L, 1L, 2L, 2L, 0L, 1L, 2L, 1L, 0L, 1L, 0L, 1L, 2L, 1L, 1L, 0L, 0L, 2L, 
2L, 0L, 1L, 1L, 2L, 1L, 0L, 1L, 1L, 0L, 2L, 2L, 0L, 0L, 1L, 1L, 0L, 2L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Wind_direction = c(0, 0, 0, 337.5, 360, 22.5, 0, 22.5, 0, 337.5, 0, 0, 0, 0, 0, 0, 337.5, 180, 0, 247.5, 360, 0, 0, 180, 0, 0, 0, 0, 337.5, 0, 247.5, 360, 0, 0, 0, 22.5, 180, 360, 0, 0, 0, 0, 360, 22.5, 0, 360, 337.5, 360, 360, 337.5), Pressure = c(940.2, 943.64, 937.69, 951.37, 932.69, 933.94, 937.07, 938.01, 937.69, 951.37, 939.72, 940.2, 948.33, 947.71, 937.69, 937.69, 951.37, 943.32, 932.69, 944.71, 932.69, 943.64, 942.31, 943.01, 937.69, 937.69, 948.33, 947.71, 941.94, 939.72, 944.71, 932.69, 932.69, 943.64, 947.71, 938.01, 943.32, 932.69, 942.31, 938.94, 932.69, 939.72, 928.31, 931.12, 937.69, 932.37, 941.94, 936.13, 936.13, 941.94), Pressure_trend = c(1L, 2L, 0L, 2L, 0L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 0L, 2L, 1L, 2L, 1L, 0L, 2L, 2L, 2L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 2L, 2L, 1L, 1L, 1L, 0L, 2L, 1L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 2L, 2L, 1L)), row.names = c(179L, 14L, 195L, 426L, 306L, 118L, 299L, 229L, 244L, 436L, 374L, 153L, 90L, 91L, 256L, 197L, 424L, 348L, 137L, 355L, 328L, 26L, 7L, 419L, 254L, 211L, 78L, 81L, 43L, 359L, 373L, 332L, 143L, 32L, 109L, 263L, 393L, 330L, 23L, 309L, 135L, 398L, 224L, 166L, 217L, 290L, 69L, 72L, 76L, 63L), class = "data.frame")
Well, this is a bit embarrassing. The error arose as there were not, in fact, paired months of the data. Rather than there being 38 data (19x2) for each month, due to an error in determining the month value one month had 57 data (19x3). Correcting this, and checking that each month had the same number of paired data for the ANOVA allowed the test to run sucessfully. > res.aov <- anova_test( + data = LST_Weather_dataset_N, dv = LST, wid = Month, + within = c(Buffer, TimePeriod), + effect.size = "ges", + detailed = TRUE, + ) > get_anova_table(res.aov, correction = "auto") ANOVA Table (type III tests) Effect DFn DFd SSn SSd F p p<.05 ges 1 (Intercept) 1 11 600.135 974.584 6.774 2.50e-02 * 0.189 2 Buffer 18 198 332.217 331.750 11.015 2.05e-21 * 0.115 3 TimePeriod 1 11 29.561 977.945 0.333 5.76e-01 0.011 4 Buffer:TimePeriod 18 198 13.055 283.797 0.506 9.53e-01 0.005 I still don't understand how the error message was telling me this, though.
Identify function is not accurate in R
Here is the problem: When I use cook's distance to check influential points in SLR, I used two methods. First one: plot(mortality.model, which = 4) This one gives me the correct answer. Second one: plot(cooks.distance(mortality.model), type = 'p') identify(cooks.distance(mortality.model)) This one gives me the wrong answer, but very close to the correct answer. Read the data set: df.mortality <- read.csv("mortality.csv", header = TRUE) Build the model: mortality.model <- lm(log(infant) ~ log(income)) By the way, the dataset has NA values. If you would like to see the dataset, I could email it to you. The dput result: structure(list(X = structure(c(4L, 5L, 7L, 15L, 23L, 29L, 30L, 101L, 41L,43L, 46L, 61L, 62L, 66L, 73L, 79L, 86L, 87L, 10L, 97L, 2L, 25L, 38L, 39L, 40L, 52L, 65L, 75L, 100L, 3L, 9L, 18L, 19L, 21L, 24L, 32L, 33L, 42L, 45L, 50L, 55L, 58L, 63L, 68L, 71L, 77L, 83L, 89L, 93L, 94L, 99L, 103L, 105L, 8L, 14L, 20L, 26L, 27L, 31L, 36L, 44L, 47L, 80L, 51L, 59L, 69L, 70L, 72L, 88L, 91L, 95L, 81L, 1L, 6L,11L, 12L, 13L, 16L, 17L, 22L, 28L, 34L, 35L, 37L, 48L, 49L, 53L, 54L, 56L, 57L, 60L, 64L, 67L, 74L, 76L, 78L, 84L, 85L, 90L, 92L, 96L, 98L, 82L, 102L, 104L), .Label = c("Afganistan", "Algeria", "Argentina", "Australia", "Austria", "Bangladesh","Belgium", "Bolivia", "Brazil", "Britain", "Burma","Burundi","Cambodia","Cameroon", "Canada", "Central.African.Republic", "Chad","Chile", "Colombia","Congo", "Costa.Rica", "Dahomey", "Denmark", "Dominican.Republic", "Ecuador", "Egypt", "El.Salvador", "Ethiopia", "Finland", "France", "Ghana", "Greece", "Guatemala", "Guinea", "Haiti", "Honduras", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory.Coast", "Jamaica", "Japan", "Jordan", "Kenya", "Laos", "Lebanon", "Liberia", "Libya", "Madagascar", "Malawi", "Malaysia", "Mali", "Mauritania", "Mexico", "Moroco", "Nepal", "Netherlands", "New.Zealand", "Nicaragua", "Niger", "Nigeria", "Norway", "Pakistan", "Panama", "Papua.New.Guinea", "Paraguay", "Peru", 
"Philippines", "Portugal", "Rwanda", "Saudi.Arabia", "Sierra.Leone", "Singapore", "Somalia", "South.Africa", "South.Korea", "South.Vietnam", "Southern.Yemen", "Spain", "Sri.Lanka", "Sudan", "Sweden", "Switzerland", "Syria", "Taiwan", "Tanzania", "Thailand", "Togo", "Trinidad.and.Tobago", "Tunisia", "Turkey", "Uganda", "United.States", "Upper.Volta", "Uruguay", "Venezuela", "West.Germany", "Yemen", "Yugoslavia", "Zaire", "Zambia"), class = "factor"), income = c(3426L, 3350L, 3346L, 4751L, 5029L, 3312L, 3403L, 5040L, 2009L, 2298L, 3292L, 4103L, 3723L, 4102L, 956L, 1000L, 5596L, 2963L, 2503L, 5523L, 400L, 250L, 110L, 1280L, 560L, 3010L, 220L, 1530L, 1240L, 1191L, 425L, 590L, 426L, 725L, 406L, 1760L, 302L, 2526L, 727L, 631L, 295L, 684L, 507L, 754L, 335L, 1268L, 1256L, 261L, 732L, 434L, 799L, 406L, 310L, 200L, 100L, 281L, 210L, 319L, 217L, 284L, 387L, 334L, 344L, 197L, 279L, 477L, 347L, 230L, 334L, 210L, 435L, 130L, 75L, 100L, 73L, 68L, 123L, 122L, 70L, 81L, 79L, 79L, 100L, 93L, 169L, 71L, 120L, 130L, 50L, 174L, 90L, 70L, 102L, 61L, 148L, 85L, 162L, 125L, 120L, 160L, 134L, 82L, 96L, 77L, 118L), infant = c(26.7, 23.7, 17, 16.8, 13.5, 10.1, 12.9, 20.4, 17.8, 25.7, 11.7, 11.6, 16.2, 11.3, 44.8, 71.5, 9.6, 12.8, 17.5, 17.6, 86.3, 78.5, 125, NA, 28.1, 300, 58, 650, 51.7, 59.6, 170, 78, 62.8, 54.4, 48.8, 27.8, 79.1, 22.1, 26.2, 13.6, 32, 60.9, 46, 34.1, 65.1, 20.4, 15.1, 19.1, 26.2, 76.3, 40.4, 43.3, 259, 60.4, 137, 180, 114, 58.2, 63.7, 39.3, 138, 21.3, 58, 159.2, 149, 10.2, 38.6, 67.9, 21.7, 27, 153, 100, 400, 124.3, 200, 150, 100, 190, 160, 109.6, 84.2, 216, NA, 60.6, 55, NA, 102, 148.3, 120, 187, NA, 200, 124.3, 132.9, 170, 158, 45.1, 129.4, 162.5, 127, 160, 180, 80, 50, 104), region = structure(c(3L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 1L, 4L, 4L, 4L, 2L, 1L, 2L, 3L, 3L, 3L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 4L, 3L, 2L, 1L, 2L, 4L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L), .Label = c("Africa", "Americas", "Asia", "Europe"), class = "factor"), oil = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor")), class = "data.frame", row.names = c(NA, -105L)) Thanks! Here are results:The correct answer The wrong answer Could anyone explain why it happened?
Plot values in ggplot geom_lines
I have a dataframe like this one: > dput(df) structure(list(OBBLIGATORIO = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no", "yes"), class = "factor"), COUNTRY = structure(c(16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), .Label = c("Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta", "Norway", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden", "United Kingdom of Great Britain and Northern Ireland" ), class = "factor"), YEAR = c(2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 
1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L), AGE = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Total", class = "factor"), `CAUSE OF DEATH` = c("Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Measles", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Viral hepatitis", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", 
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough" ), VALUE = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 6L, 7L, 7L, 1L, 2L, 3L, 2L, 5L, 12L, 9L, 13L, 9L, 13L, 8L, 17L, 14L, 16L, 18L, 15L, 19L, 11L, 10L, 25L, 24L, 21L, 22L, 23L, 20L, 34L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 41L, 42L, 43L, 45L, 46L, 47L, 33L, 35L, 36L, 37L, 38L, 39L, 40L, 44L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L), .Label = c("0", "1", "2", "3", "6", "7", "9", "17", "18", "19", "21", "22", "27", "28", "30", "31", "37", "41", "42", "301", "329", "333", "344", "350", "396", "413", "415", "460", "517", "558", "597", "609", "622", "647", "681", "1087", "1349", "1413", "1448", "1499", "1576", "1654", "1725", "1948", "2531", "2665", "2757" ), class = "factor"), ID = 1:98), .Names = c("OBBLIGATORIO", "COUNTRY", "YEAR", "AGE", "CAUSE OF DEATH", "VALUE", "ID"), row.names = c(NA, -98L), class = "data.frame") I want to obtain a chart that: on x axis there are values from YEAR column on y axis there are values from VALUE column data are divided by CAUSE OF DEATH column So something like: I try: x11() ggplot(df, aes(x = df$`YEAR`, y = df$`VALUE`, fill = df$`CAUSE OF DEATH`, colour = df$`CAUSE OF DEATH`)) + geom_density(alpha = 0.1) + xlim(1995, 2010) But the result is completely different from the one I want. Thanks
I'm not sure what your actual question is, but one problem with your dataframe is that the VALUE column is currently defined as a factor, not as as a numeric. I think that remedying this will go a long way to solving your problem. I do this post-facto below (i.e. after the dataframe is already created), but if you are getting the data into R via a read.table() or similar command, you can specify the class of your columns at data frame creation time, which is probably a better approach. In my code below I use the dplyr package for manipulating dataframes. It's quite powerful, but for this particular example it isn't doing anything that base R couldn't do. require(ggplot2) require(dplyr) require(magrittr) df <- ### YOUR dput output goes here ### # fix the problem with the `VALUE` column df %<>% mutate(VALUE = VALUE %>% as.character %>% as.numeric) # equivalent in base R: # df$VALUE <- as.numeric(as.character(df$VALUE)) # make a graph (is it the one you want?) df %>% group_by(YEAR, `CAUSE OF DEATH`) %>% summarize(value = sum(VALUE)) %>% ggplot(aes(x = YEAR, y = value, color = `CAUSE OF DEATH`)) + geom_line() + theme_bw() + geom_point() # save graph for uploading to SO ggsave('SO37230266.png') The result is this graph:
Looping subsets in plm
I'm trying to program something quite simple (I think) in R, but I can't seem to get it right. I have a dataset of 50 countries (1 to 50) for 15 years each and about 20 variables per country. For now I am only testing one variable (OS) on my dependent variable (SMD). I would like to do this with a loop country by country so I would get the output for each country instead of the overall output. I thought it would be wise to create a subset first (to be able to look at country 1 first, after which my loop should increase the number for country and test country 2). I believe my regression at the bottom of the page should give me the output for country 1 instead of the overall score for the entire dataset. However I keep getting these errors: > pdata <- plm.data(newdata, index=c("Country","Date")) series are constants and have been removed > pooling <- plm(Y ~ X, data=pdata, model= "pooling") series Country, xRegion are constants and have been removed Error in model.matrix.pFormula(formula, data, rhs = 1, model = model, : NA in the individual index variable > summary(pooling) Error in summary(pooling) : object 'pooling' not found I might be looking at this all wrong, but I believe that without getting this to work, there is no point in going further with programming the loop itself. Any advice on solving my errors, or other ways of programming a loop are really appreciated. 
My code: rm(list = ls()) mydata <- read.table(file = file.choose(), header = TRUE, dec = ",") names(mydata) attach(mydata) Y <- cbind(SMD) X <- cbind(OS) newdata <- subset(mydata, Country %in% c(1)) newdata pdata <- plm.data(newdata, index=c("Country","Date")) pooling <- plm(Y ~ X, data=pdata, model= "pooling") summary(pooling) Edit: data sample of first 2 countries which causes same error dput(mydata) structure(list(Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NAF", "SAME"), class = "factor"), Country = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Date = c(1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L ), OS = structure(c(19L, 25L, 27L, 15L, 22L, 20L, 23L, 9L, 7L, 5L, 2L, 1L, 4L, 3L, 6L, 10L, 11L, 13L, 11L, 8L, 26L, 25L, 31L, 29L, 28L, 21L, 30L, 24L, 24L, 16L, 11L, 14L, 12L, 17L, 18L, 29L, 32L, 32L, 33L, 34L), .Label = c("51.5", "52.2", "55.6", "56.4", "56.7", "57.7", "57.8", "58.3", "59", "59.2", "59.6", "59.9", "60.2", "60.4", "61.1", "61.2", "62.2", "62.3", "62.8", "63.2", "63.3", "63.8", "63.9", "64.2", "64.3", "64.5", "64.7", "65.3", "65.5", "65.6", "66.4", "68", "69.6", "70.7"), class = "factor"), SMD = structure(c(7L, 12L, 20L, 21L, 17L, 15L, 13L, 10L, 14L, 22L, 23L, 33L, 1L, 32L, 29L, 34L, 28L, 25L, NA, NA, 9L, 6L, 8L, 4L, 2L, 35L, 3L, 36L, 5L, 11L, 16L, 18L, 24L, 19L, 26L, 31L, 27L, 30L, NA, NA), .Label = c("100.3565662", "13.44788845", "13.45858747", "13.56815534", "15.05892471", "17.63789658", "18.04088718", "18.3101351", "19.34226196", "21.25530884", 
"21.54423145", "23.75898948", "24.08770926", "26.39817342", "29.44079001", "31.40605191", "34.46667996", "34.52913657", "35.66070947", "36.4419931", "39.16875621", "44.0126137", "45.72949566", "49.13062679", "54.83730247", "56.87886311", "59.80971583", "60.5658962", "69.20148901", "70.91362874", "72.64845214", "73.97139238", "75.20140919", "76.18378138", "9.570435019", "9.867635305"), class = "factor")), .Names = c("Region", "Country", "Date", "OS", "SMD"), class = "data.frame", row.names = c(NA, -40L))
Are you sure you need to use plm?? This produces a list of summaries by country. # convert factors to numeric mydata$SMD <- as.numeric(mydata$SMD) mydata$OS <- as.numeric(mydata$OS) # Using lapply(...) smry <- lapply(unique(mydata$Country), function(cntry) summary(lm(SMD~OS,data=mydata[mydata$Country==cntry,]))) # Same thing, using for loop smry <- list() for (cntry in unique(mydata$Country)) { smry <- list(smry, summary(lm(SMD~OS,data=mydata[mydata$Country==cntry,]))) } In your dataset, SMD and OS are factors, which need to be converted to numeric first. Note that calling as.numeric() directly on a factor returns its internal integer level codes rather than the values shown in the labels; to recover the actual numbers, convert via character first, e.g. as.numeric(as.character(mydata$SMD)).
R ggplot geom_bar facet dodge
I'm having some trouble producing a faceted bar_plot in ggplot2. Perhaps it is something very obvious, but I can't figure it out:( I've the following dataset: structure(list(COUNTRY = structure(c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 2L, 5L, 8L, 11L, 14L, 17L, 20L, 3L, 6L, 9L, 12L, 15L, 18L, 2L, 5L, 8L, 11L, 14L, 17L, 20L, 3L, 6L, 9L, 12L, 15L, 18L, 1L, 4L, 7L, 10L, 13L, 16L, 19L, 3L, 6L, 9L, 12L, 15L, 18L, 1L, 4L, 7L, 10L, 13L, 16L, 19L, 2L, 5L, 8L, 11L, 14L, 17L, 20L), .Label = c("Angola", "Botswana", "Burundi", "Comoros", "Eritrea", "Ethiopia", "Kenya", "Lesotho", "Madagascar", "Malawi", "Mozambique", "Namibia", "Rwanda", "Somalia", "South Africa", "Swaziland", "Tanzania", "Uganda", "Zambia", "Zimbabwe"), class = "factor"), Year = structure(c(2L, 2L, 14L, 16L, 16L, 11L, 12L, 2L, 4L, 15L, 5L, 10L, 16L, 16L, 2L, 17L, 14L, 11L, 12L, 10L, 2L, 4L, 15L, 5L, 10L, 16L, 16L, 2L, 17L, 14L, 11L, 12L, 10L, 2L, 2L, 14L, 16L, 16L, 11L, 12L, 2L, 17L, 14L, 11L, 12L, 10L, 2L, 2L, 14L, 16L, 16L, 11L, 12L, 2L, 4L, 15L, 5L, 10L, 16L, 16L), .Label = c("1998", "2000", "2001/2", "2002", "2003", "2003/4", "2004", "2005", "2005/6", "2006", "2006/7", "2007", "2007/8", "2008/9", "2009", "2010", "2011"), class = "factor"), sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("m", "f", "b"), class = "factor"), location = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Urban", "Rural", "Total", "Capital.City", "Other.Cities.towns", "Urban.Non.slum", "Urban.Slum" ), class = "factor"), percent = c(60.4, 42.3, 85.4919452426806, 96.3, 90.2847535659154, 87.7347421555771, 
87.7323067592087, 80.4, 80.6, 93.8186266493188, 75.0109418832216, 36.8, 87.1059275774722, 90.1216932603937, 66.8, 83.6279398931798, 89.690685909038, 88.8207941092749, 94.6139558774441, 88.0251085200726, 70.4, 54.7, 86.1919805548309, 56.9792710715853, 13.1, 75.6355555697382, 86.8196674671991, 42.5, 61.9452522893308, 77.597285694676, 88.3453320625631, 94.5192341778471, 80.6271302923487, 44.1, 29, 77.8542469357068, 90, 86.7073851186482, 83.8921034867784, 76.4094871587916, 49.3, 63.952805392032, 77.004884485532, 88.6723566877386, 93.9560433940531, 82.3095948307742, 56.1, 31.1, 80.0235653889704, 91.5, 88.3809682134183, 85.5656196766576, 80.0539027063387, 77, 61.2, 89.2538966046165, 59.6756344409838, 23, 79.6749544074645, 86.9507859695728)), .Names = c("COUNTRY", "Year", "sex", "location", "percent"), row.names = c(1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L, 28L, 31L, 34L, 37L, 40L, 43L, 46L, 49L, 52L, 55L, 58L, 62L, 65L, 68L, 71L, 74L, 77L, 80L, 83L, 86L, 89L, 92L, 95L, 98L, 101L, 104L, 107L, 110L, 113L, 116L, 119L, 123L, 126L, 129L, 132L, 135L, 138L, 141L, 144L, 147L, 150L, 153L, 156L, 159L, 162L, 165L, 168L, 171L, 174L, 177L, 180L), class = "data.frame") I am trying to make a bar_plot which shows the percentage of people living in rural, urban areas (and the average) for a number of countries, and wish to show this split by gender. I can plot one of these categories on a simple bar plot by using a subset call within the ggplot function as follows: ggplot(edu_melt[c(edu_melt$sex!="b" & edu_melt$location==c("Urban")), ], aes(x=COUNTRY, y=percent, fill=sex)) + geom_bar(position="dodge", width=0.5) + facet_grid(~location) + labs(x="Country") + theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) I would however like to compare the data across the location (e.g. urban, rural, and both). 
I thought this would be a simple case of introducing a facet_wrap call, however I get some odd behaviour where the data is plotted across the three facets - I would expect 20 pairs of bars on each facet, however this code produces 20 pairs of bars spread over the three facets?! ggplot(edu_melt_over[c(edu_melt_over$sex!="b"),], aes(x=COUNTRY, y=percent, fill=sex)) + geom_bar(position="dodge", width=0.5, space=1) + facet_wrap(~location, nrow=3) + labs(x="Country", title="Proportion Net Primary School Enrolement in ESA") + theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) I'm not sure why this is happening, but have searched for hints and tips and tried a number of approaches, but get the same result. Anybody have any idea how I could produce this plot? Thanks Marty
Your data looks odd as you don't seem to have any combinations of male and female in the same strata (e.g. Angola has a male urban percent but no female). This is the data not the plotting. ggplot(edu_melt[edu_melt$sex!="b", ], aes(x=COUNTRY, y=percent, fill=sex)) + geom_bar(position="dodge", width=0.25) + facet_grid(location~.) + labs(x="Country") + theme(axis.text.x = element_text(angle=30))