R plotting data in a wrong way - r

I am having an issue with ggplot. The spike going down and up between 9 and 12 is not in the data.
structure(list(model = structure(c(46L, 46L, 46L, 46L, 46L, 46L,
46L, 46L, 46L), .Label = c("11111", "11112", "11121", "11122",
"11131", "11132", "11211", "11212", "11221", "11222", "11231",
"11232", "12111", "12112", "12121", "12122", "12131", "12132",
"12211", "12212", "12221", "12222", "12231", "12232", "21111",
"21112", "21121", "21122", "21131", "21132", "21211", "21212",
"21221", "21222", "21231", "21232", "22111", "22112", "22121",
"22122", "22131", "22132", "22211", "22212", "22221", "22222",
"22231", "22232"), class = "factor"), sens = c(0.8, 0.8, 0.8,
0.8, 0.8, 0.8, 0.7, 0.7, 0.7), one_min_spec = c(0.448717948717949,
0.423076923076923, 0.397435897435897, 0.358974358974359, 0.358974358974359,
0.346153846153846, 0.346153846153846, 0.346153846153846, 0.333333333333333
), cut_point = 6:14), .Names = c("model", "sens", "one_min_spec",
"cut_point"), row.names = c(279L, 327L, 375L, 423L, 471L, 519L,
567L, 615L, 663L), class = "data.frame")
Plotting function:
# Attach ggplot2 (the package is called "ggplot2", not "ggplot"). library()
# stops with an error if the package is missing, whereas require() would only
# return FALSE and let the ggplot() call below fail later.
library(ggplot2)
ggplot(df)+
geom_line(data= df, aes(x = one_min_spec, y = sens), colour = "blue")+
geom_text(aes(x = one_min_spec, y = sens,label=cut_point),hjust=1, vjust=-1)
Thanks in advance.

If you want your points to be connected by observation (row) order instead of x-value, you need to use geom_path instead of geom_line. Tidying a bit and substituting ggrepel::geom_text_repel for geom_text to avoid overlapping labels,
library(ggplot2)
df <- structure(list(model = structure(c(46L, 46L, 46L, 46L, 46L, 46L,
46L, 46L, 46L), .Label = c("11111", "11112", "11121", "11122",
"11131", "11132", "11211", "11212", "11221", "11222", "11231",
"11232", "12111", "12112", "12121", "12122", "12131", "12132",
"12211", "12212", "12221", "12222", "12231", "12232", "21111",
"21112", "21121", "21122", "21131", "21132", "21211", "21212",
"21221", "21222", "21231", "21232", "22111", "22112", "22121",
"22122", "22131", "22132", "22211", "22212", "22221", "22222",
"22231", "22232"), class = "factor"), sens = c(0.8, 0.8, 0.8,
0.8, 0.8, 0.8, 0.7, 0.7, 0.7), one_min_spec = c(0.448717948717949,
0.423076923076923, 0.397435897435897, 0.358974358974359, 0.358974358974359,
0.346153846153846, 0.346153846153846, 0.346153846153846, 0.333333333333333
), cut_point = 6:14), .Names = c("model", "sens", "one_min_spec",
"cut_point"), row.names = c(279L, 327L, 375L, 423L, 471L, 519L,
567L, 615L, 663L), class = "data.frame")
ggplot(df, aes(one_min_spec, sens, label = cut_point)) +
geom_path() + # make line, connecting consecutive observations
geom_point() + # for better visibility on straight section
ggrepel::geom_text_repel() # drop-in replacement for geom_text that avoids overlaps

You can just reorder the dataset row-wise
# Sort the rows by sens before handing the data to ggplot.
# NOTE(review): geom_line() still connects points by x value, so this
# presumably only changes the order among tied one_min_spec values — confirm.
ggplot(df[order(df$sens),]) +
geom_line(aes(x = one_min_spec, y = sens), colour = "blue")+
geom_text(aes(x = one_min_spec, y = sens, label = cut_point), hjust = 1, vjust = -1)

Related

Plot choropleth from data.frame containing coordinates/zip code and id

I'm analysing real-estate sales for some N. American cities and am using k-means clustering on the data. I have seven clusters and for each observation in the cluster I have the latitude, longitude, zipcode, and cluster_id. I'd like to plot this on a map to better visualize the clusters - I'm not sure what such a plot is called - Choropleth? Polygon?
Most of the examples are using geoJSON files but I only have a data.frame object from my k-means clustering.
Actual data:
https://www.kaggle.com/threnjen/portland-housing-prices-sales-jul-2020-jul-2021
Sample data:
> dput(dt[runif(n = 10,min = 1,max = 25000)])
structure(list(id = c(23126L, 15434L, 5035L, 19573L, NA, 24486L,
NA, 14507L, 3533L, 20192L), zipcode = c(97224L, 97211L, 97221L,
97027L, NA, 97078L, NA, 97215L, 97124L, 97045L), latitude = c(45.40525436,
45.55965805, 45.4983139, 45.39398956, NA, 45.47454071, NA, 45.50736618,
45.52812958, 45.34381485), longitude = c(-122.7599182, -122.6500015,
-122.7288742, -122.591217, NA, -122.8898392, NA, -122.6084061,
-122.91745, -122.5948334), lastSoldPrice = c(469900L, 599000L,
2280000L, 555000L, NA, 370000L, NA, 605000L, 474900L, 300000L
), lotSize = c(5227L, 4791L, 64904L, 9147L, NA, 2178L, NA, 4356L,
2613L, 6969L), livingArea = c(1832L, 2935L, 5785L, 2812L, NA,
1667L, NA, 2862L, 1844L, 742L), cluster_id = c(7, 7, 2, 7, NA,
4, NA, 7, 7, 4)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7faa8000fee0>)
I've followed the example on https://gist.github.com/josecarlosgonz/8565908 to try and create a geoJSON file to be able to plot this data but without success.
I'm not using markers because I have ~25,000 observations - it would be difficult to plot them all and the file would take forever to load.
EDIT:
observations by zipcode:
> dput(dat[, .N, by = .(`address/zipcode`)][(order(`address/zipcode`))])
structure(list(`address/zipcode` = c(7123L, 97003L, 97004L, 97005L,
97006L, 97007L, 97008L, 97009L, 97015L, 97019L, 97023L, 97024L,
97027L, 97030L, 97034L, 97035L, 97038L, 97045L, 97056L, 97060L,
97062L, 97068L, 97070L, 97078L, 97080L, 97086L, 97089L, 97113L,
97123L, 97124L, 97132L, 97140L, 97201L, 97202L, 97203L, 97204L,
97205L, 97206L, 97209L, 97210L, 97211L, 97212L, 97213L, 97214L,
97215L, 97216L, 97217L, 97218L, 97219L, 97220L, 97221L, 97222L,
97223L, 97224L, 97225L, 97227L, 97229L, 97230L, 97231L, 97232L,
97233L, 97236L, 97239L, 97266L, 97267L), N = c(1L, 352L, 9L,
252L, 421L, 1077L, 357L, 1L, 31L, 2L, 4L, 159L, 239L, 525L, 640L,
548L, 1L, 1064L, 5L, 353L, 471L, 736L, 6L, 403L, 866L, 913L,
8L, 5L, 1113L, 776L, 3L, 543L, 219L, 684L, 463L, 1L, 57L, 809L,
189L, 216L, 688L, 510L, 504L, 330L, 318L, 177L, 734L, 195L, 832L,
305L, 276L, 589L, 688L, 716L, 286L, 83L, 1307L, 475L, 77L, 150L,
382L, 444L, 290L, 423L, 430L)), row.names = c(NA, -65L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7f904781a6e0>)
I used the kaggle data on a simple laptop (i3 8th gen) to generate a ggplot2 object, with cluster IDs randomly sampled and transform this via the ggplotly() function ... the resulting plotly object seems OK to work with for analysis but I do not know your performance requirements:
library(dplyr)
library(ggplot2)
library(plotly)
library(rnaturalearth) # here we get the basic map data from
# read in data from zip, select minimal number of columns and sample cluster_id
df <- readr::read_csv(unzip("path_to_zip/portland_housing.csv.zip"))%>%
dplyr::select(az = `address/zipcode`, latitude, longitude) %>%
dplyr::mutate(cluster_id = sample(1:7, n(), replace = TRUE))
# get the map data
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")
# build the ggplot2 object (note that I use rings as shapes and the alpha parameter to reduce overplotting)
plt <- ggplot2::ggplot(data = world) +
ggplot2::geom_sf() +
ggplot2::geom_point(data = df, aes(x = longitude, y = latitude, color = factor(cluster_id)), size = 1, shape = 21, alpha = .7) +
ggplot2::coord_sf(xlim = c(-124.5, -122), ylim = c(45, 46), expand = FALSE)
# plot it:
plt
# plotly auto transform from ggplot2 object
plotly::ggplotly(plt)
EDIT
To include a map you can use for example the ggmap package instead of the map data from rnaturalearth... I will only display the plotly result:
library(ggmap)
# https://stackoverflow.com/questions/23130604/plot-coordinates-on-map
sbbox <- ggmap::make_bbox(lon = c(-124.5, -122), lat = c(45, 46), f = .1)
myarea <- ggmap::get_map(location=sbbox, zoom=10, maptype="terrain")
myarea <- ggmap::ggmap(myarea)
plt2 <- myarea +
ggplot2::geom_point(data = df, mapping = aes(x = longitude, y = latitude, color = factor(cluster_id)), shape = 21, alpha = .7)
plotly::ggplotly(plt2)
There are many other approaches concerning the map data, like using the mapbox-api

Combine two faceted plots on one plot

Sorry if this is a duplicate question but I cannot seem to find the answer to my question anywhere. I have two plots and I would like to overlay plot two on plot one so that they form one plot. Is this possible? I will attach how both plots look separately. They are both facetted by the same variable which is by location and are on the same x and y-axis scale so theoretically should be possible.
Thank you.
## Plot one
Proxy<-read.csv("ALLRSL.csv",header=T)
p1<-ggplot()+
geom_ribbon(data=Proxy,aes(x=YEAR,ymin=LOWER,ymax=UPPER,fill=SITE),alpha=.5)+
geom_line(data=Proxy,aes(x=YEAR,y=RSL,col=SITE))+
facet_wrap(~ SITE,ncol= 1)+
scale_fill_manual(values=c("#4E193D","#342955","#4E617E","#97B4CB"))+
scale_color_manual(values=c("#4E193D","#342955","#4E617E","#97B4CB"))+
theme_classic()+
xlim(1900, 2020)+
theme(panel.grid.major.x = element_blank())+
theme(panel.grid.minor.x = element_blank())+
theme(panel.grid.minor.y = element_blank())+
theme(panel.grid.major.y = element_blank())+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)+
theme(legend.position="none")
p1
## Plot two
tgsm<-read.csv("tgsm.csv",header=T)
tgsm<-na.omit(tgsm)
tglonger<-pivot_longer(tgsm, cols=c(-Year),names_to="Site", values_to = "value")
p2<-ggplot()+
geom_point(data=tglonger,aes(x=Year,y=value,col=Site),alpha=.7,size=1)+
facet_wrap(~Site,ncol=1)+
theme_classic()+
xlim(1900,2020)+
scale_color_manual(values=c("#4E193D","#342955","#4E617E","#97B4CB"))+
theme(panel.grid.major.x = element_blank())+
theme(panel.grid.minor.x = element_blank())+
theme(panel.grid.minor.y = element_blank())+
theme(panel.grid.major.y = element_blank())+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
theme(
strip.background = element_blank(),
strip.text.x = element_blank()
)+
theme(legend.position="none")
p2
Data
Proxy <- structure(list(RSL = c(-0.305251214, -0.306414006, -0.307194187,
-0.308202139, -0.309150572, -0.309679123), UPPER = c(-0.182716456,
-0.186724068, -0.189331305, -0.193118273, -0.197069799, -0.20118809
), LOWER = c(-0.416725663, -0.413606073, -0.411131729, -0.408930899,
-0.406531588, -0.404478981), YEAR = 1820:1825, SITE = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Little Swanport", "Lutregala",
"Tarra", "Wapengo"), class = "factor")), row.names = c(NA, 6L
), class = "data.frame")
tgsm <- structure(list(Year = 1993:1998, Lg2002 = c(-0.001164223, -0.002229453,
-0.002734792, -0.002977787, -0.002786098, -0.002026994), Wap2002 = c(-0.002531348,
-0.002051993, -0.001468704, -0.001182162, -0.001027132, -0.00020881
), Tar2002 = c(-0.029020612, -0.024330561, -0.019927593, -0.015682528,
-0.012907219, -0.009784772), LSP2002 = c(-0.034514531, -0.030171621,
-0.026095138, -0.021952898, -0.018480702, -0.014531318)), na.action = structure(c(`1` = 1L,
`2` = 2L, `3` = 3L, `4` = 4L, `5` = 5L, `6` = 6L, `7` = 7L, `8` = 8L,
`9` = 9L, `10` = 10L, `11` = 11L, `12` = 12L, `13` = 13L, `14` = 14L,
`15` = 15L, `16` = 16L, `17` = 17L, `18` = 18L, `19` = 19L, `20` = 20L,
`21` = 21L, `22` = 22L, `23` = 23L, `24` = 24L, `25` = 25L, `26` = 26L,
`27` = 27L, `28` = 28L, `29` = 29L, `30` = 30L, `31` = 31L, `32` = 32L,
`33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L, `37` = 37L, `38` = 38L,
`39` = 39L, `40` = 40L, `41` = 41L, `42` = 42L, `43` = 43L, `44` = 44L,
`45` = 45L, `46` = 46L, `47` = 47L, `48` = 48L, `49` = 49L, `50` = 50L,
`51` = 51L, `52` = 52L, `53` = 53L, `54` = 54L, `55` = 55L, `56` = 56L,
`57` = 57L, `58` = 58L, `59` = 59L, `60` = 60L, `61` = 61L, `62` = 62L,
`63` = 63L, `64` = 64L, `65` = 65L, `66` = 66L, `67` = 67L, `68` = 68L,
`69` = 69L, `70` = 70L, `71` = 71L, `72` = 72L, `73` = 73L, `74` = 74L,
`75` = 75L, `76` = 76L, `77` = 77L, `78` = 78L, `79` = 79L, `80` = 80L,
`81` = 81L, `82` = 82L, `83` = 83L, `84` = 84L, `85` = 85L, `86` = 86L,
`87` = 87L, `88` = 88L, `89` = 89L, `90` = 90L, `91` = 91L, `92` = 92L,
`93` = 93L, `94` = 94L, `95` = 95L, `96` = 96L, `97` = 97L, `98` = 98L,
`99` = 99L, `100` = 100L, `101` = 101L, `102` = 102L, `103` = 103L,
`104` = 104L, `105` = 105L, `106` = 106L, `107` = 107L, `108` = 108L,
`109` = 109L, `110` = 110L, `111` = 111L, `112` = 112L, `113` = 113L,
`114` = 114L, `115` = 115L, `116` = 116L, `117` = 117L, `118` = 118L,
`119` = 119L, `120` = 120L, `121` = 121L, `122` = 122L, `123` = 123L,
`124` = 124L, `125` = 125L, `126` = 126L, `127` = 127L, `128` = 128L,
`129` = 129L, `130` = 130L, `131` = 131L, `132` = 132L, `133` = 133L,
`134` = 134L, `135` = 135L, `136` = 136L, `137` = 137L, `138` = 138L,
`139` = 139L, `140` = 140L, `141` = 141L, `142` = 142L, `143` = 143L,
`144` = 144L, `145` = 145L, `146` = 146L, `147` = 147L, `148` = 148L,
`149` = 149L, `150` = 150L, `151` = 151L, `152` = 152L, `153` = 153L,
`154` = 154L, `155` = 155L, `156` = 156L, `157` = 157L, `183` = 183L
), class = "omit"), row.names = 158:163, class = "data.frame")
See "Plot one" below for how you can do that with patchwork.
However, conceptually, I am guessing you want to add a sort of prediction to some historic values. I personally would put everything in one data frame and plot that. If the gap between the two time periods is too large, you can facet by time period (as in my suggestion).
The plots look a bit different than your plot because you only provided data for one Site in Proxy (so I filtered the other for what I thought is the equivalent, it will work nonetheless, because the faceting remains) - and I removed all those theme elements that are not relevant to the problem.
Plot one - combining plots.
library(tidyverse)
library(patchwork)
tgsm<-na.omit(tgsm)
tglonger <-
pivot_longer(tgsm, cols=c(-Year), names_to="SITE", values_to = "RSL") %>%
filter(SITE == "LSP2002") %>%
rename(YEAR = Year)
p1 <- ggplot() +
geom_ribbon(data = Proxy, aes(x = YEAR, ymin = LOWER, ymax = UPPER, fill = SITE), alpha = .5) +
geom_line(data = Proxy, aes(x = YEAR, y = RSL, col = SITE)) +
facet_wrap(~SITE) +
coord_cartesian(xlim = c(1800, 1830), ylim = c(-1, 0)) +
theme_classic() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none"
)
p2 <- ggplot() +
geom_point(data = tglonger, aes(x = YEAR, y = RSL, col = SITE), alpha = .7, size = 1) +
facet_wrap(~SITE) +
coord_cartesian(xlim = c(1990, 2000), ylim = c(-1, 0)) +
theme_classic() +
## only one call to theme!!
theme(
## this is where the theme call is different to above
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.line.y = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none",
)
p1 + p2
Suggestion for an alternative visualisation
df_new <-
bind_rows(time1 = Proxy, time2 = tglonger, .id = "timevar") %>%
mutate(SITE = "LSP2002")
ggplot(df_new)+
geom_point(aes(x=YEAR,y=RSL))+
facet_grid(SITE~timevar, scales = "free_x")+
theme(legend.position="none") +
theme(panel.spacing = unit(.5, "lines"))
You can also use this data frame to create a list of plots, and then stitch them together with patchwork. This approach doesn't allow you to change individual plots, though.
ls_p <-
df_new %>%
split(., .$timevar) %>%
map(~{ggplot(.x)+
geom_point(aes(x=YEAR,y=RSL))+
coord_cartesian(ylim = c(-0.4,0))+
facet_grid(~SITE, scales = "free_x")+
theme(legend.position="none") +
theme(panel.spacing = unit(.5, "lines"))})
library(patchwork)
wrap_plots(ls_p)

Plot multiple regression lines on one plot in ggplot2

Sorry if this is a repeat question but I haven't managed to find an answer yet since my data frame has to be split. I am trying to plot two regression lines on one plot, with a regression line for data in period 1 (1815-1899)and a regression line for data in period 2 (1900-2013). I have used dplyr to split the data to run the two separate regressions but can't work out how to get them on the same graph as you seem to need the data frame in the ggplot() command for it to plot the line. Can anyone help?
Thanks.
library(tidyverse)
brest<-read.csv("brest.csv",header=TRUE) ## read in csv
brest<- na.omit(brest) ## get rid of NAs
brestp1<- select(filter(brest, period == 1),c(year,slr,period)) ## Divide into periods
brestp2<- select(filter(brest, period == 2),c(year,slr,period))
fit1 <- lm(slr ~ year, data = brestp1) ## Run lms
summary(fit1)
fit2<- lm(slr ~ year, data = brestp2)
summary(fit2)
## plot graph
ggplot(brestp1, aes(x = year, y = slr)) + ### Need not only brestp1 but also brestp2
geom_point() +
stat_smooth(method = "lm",se=FALSE)+
theme_classic()
## Data
## Brest period 1
structure(list(year = 1815:1820, slr = c(6926L, 6959L, 6945L,
6965L, 6941L, 6909L), period = c(1L, 1L, 1L, 1L, 1L, 1L)), na.action = structure(c(`30` = 30L,
`31` = 31L, `32` = 32L, `33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L,
`37` = 37L, `38` = 38L, `39` = 39L, `51` = 51L, `52` = 52L, `53` = 53L,
`54` = 54L, `138` = 138L, `139` = 139L, `140` = 140L, `141` = 141L,
`142` = 142L, `143` = 143L, `144` = 144L, `145` = 145L, `146` = 146L
), class = "omit"), row.names = c(NA, 6L), class = "data.frame")
##Brest period 2
structure(list(year = 1900:1905, slr = c(6936L, 6916L, 6923L,
6976L, 6931L, 6913L), period = c(2L, 2L, 2L, 2L, 2L, 2L)), na.action = structure(c(`30` = 30L,
`31` = 31L, `32` = 32L, `33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L,
`37` = 37L, `38` = 38L, `39` = 39L, `51` = 51L, `52` = 52L, `53` = 53L,
`54` = 54L, `138` = 138L, `139` = 139L, `140` = 140L, `141` = 141L,
`142` = 142L, `143` = 143L, `144` = 144L, `145` = 145L, `146` = 146L
), class = "omit"), row.names = c(NA, 6L), class = "data.frame")
Use geom_smooth with separate data:
# Draw one linear fit (and its points) per period by giving every layer its
# own data frame. brestp1 / brestp2 are the per-period subsets built in the
# question with select(filter(brest, period == ...), ...).
ggplot() +
  geom_smooth(aes(x = year, y = slr), data = brestp1,
              method = "lm", se = FALSE, color = "red") +
  geom_smooth(aes(x = year, y = slr), data = brestp2,
              method = "lm", se = FALSE, color = "blue") +
  geom_point(aes(x = year, y = slr), data = brestp1, color = "red") +
  geom_point(aes(x = year, y = slr), data = brestp2, color = "blue")

Add horizontal indicator lines for confidence intervals on a geom_linerange() in ggplot2

I am trying to produce a chart of Observed / Expected hospital infection rates with error bars showing upper and lower 95% and 99.7% confidence intervals.
Here are the data:
orthssi <- structure(list(Hospital = structure(1:18, .Label = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R"), class = "factor"), SSIs = c(80L, 38L, 24L, 35L,
39L, 30L, 128L, 27L, 70L, 30L, 30L, 2L, 6L, 38L, 3L, 9L, 52L,
13L), Procedures = c(865L, 1069L, 796L, 891L, 997L, 550L, 2598L,
373L, 1079L, 714L, 477L, 227L, 125L, 589L, 292L, 149L, 1984L,
351L), Expected = c(44.89, 51.149, 35.15, 42.495, 46.987, 26.999,
105.032, 18.304, 57.402, 31.409, 23.497, 10.898, 5.945, 29.614,
13.295, 6.403, 88.449, 16.083), OE = c(1.782, 0.743, 0.683, 0.824,
0.83, 1.111, 1.219, 1.475, 1.219, 0.955, 1.277, 0.184, 1.009,
1.283, 0.226, 1.406, 0.588, 0.808), Probability = c(0.092, 0.036,
0.03, 0.039, 0.039, 0.055, 0.049, 0.072, 0.065, 0.042, 0.063,
0.009, 0.048, 0.065, 0.01, 0.06, 0.026, 0.037), Lower95CI = c(1.42623345874945,
0.528256888855857, 0.439593399216354, 0.576826930846085, 0.593304300509315,
0.755779204034742, 1.02072916367972, 0.983076938281617, 0.957133142026683,
0.648442622896373, 0.86951670130161, 0.0222623281070364, 0.374607285544628,
0.916025825389466, 0.0466447169655834, 0.651315412316165, 0.440436907019126,
0.433548700419708), Upper95CI = c(2.19254259114914, 1.01294537379558,
1.00844478624614, 1.13663544526487, 1.12647704263608, 1.56764294951526,
1.44220199258348, 2.11060633474044, 1.52742490796631, 1.35122425878491,
1.7979937836084, 0.655351540870752, 2.13516991355386, 1.73993010418865,
0.652957058228699, 2.59615214768047, 0.76780774565962, 1.36405674299719
), Lower997CI = c(1.26545076052984, 0.438514548720716, 0.344268212303677,
0.474612391243614, 0.493987129703437, 0.611131699675563, 0.929047103197505,
0.785094671227173, 0.840231592610583, 0.523894662155343, 0.703500240342274,
0.00513071434039922, 0.20498124083422, 0.761788735802099, 0.0165744691173032,
0.414157407130073, 0.376765181801129, 0.303424472695538), Upper997CI = c(2.41408837016438,
1.16380466027892, 1.19392754588927, 1.31181443961792, 1.29178515824501,
1.82316431734875, 1.56260724140783, 2.46519277997175, 1.69514346690893,
1.57391071859294, 2.08889292293282, 0.966984581231215, 2.80639425224956,
1.99287051089929, 0.928351140481638, 3.28891183173877, 0.867510832843342,
1.68791001193251)), .Names = c("Hospital", "SSIs", "Procedures",
"Expected", "OE", "Probability", "Lower95CI", "Upper95CI", "Lower997CI",
"Upper997CI"), row.names = c(NA, -18L), class = "data.frame")
> ssi0106 <- dput(orthssi)
structure(list(Hospital = structure(1:18, .Label = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R"), class = "factor"), SSIs = c(80L, 38L, 24L, 35L,
39L, 30L, 128L, 27L, 70L, 30L, 30L, 2L, 6L, 38L, 3L, 9L, 52L,
13L), Procedures = c(865L, 1069L, 796L, 891L, 997L, 550L, 2598L,
373L, 1079L, 714L, 477L, 227L, 125L, 589L, 292L, 149L, 1984L,
351L), Expected = c(44.89, 51.149, 35.15, 42.495, 46.987, 26.999,
105.032, 18.304, 57.402, 31.409, 23.497, 10.898, 5.945, 29.614,
13.295, 6.403, 88.449, 16.083), OE = c(1.782, 0.743, 0.683, 0.824,
0.83, 1.111, 1.219, 1.475, 1.219, 0.955, 1.277, 0.184, 1.009,
1.283, 0.226, 1.406, 0.588, 0.808), Probability = c(0.092, 0.036,
0.03, 0.039, 0.039, 0.055, 0.049, 0.072, 0.065, 0.042, 0.063,
0.009, 0.048, 0.065, 0.01, 0.06, 0.026, 0.037), Lower95CI = c(1.42623345874945,
0.528256888855857, 0.439593399216354, 0.576826930846085, 0.593304300509315,
0.755779204034742, 1.02072916367972, 0.983076938281617, 0.957133142026683,
0.648442622896373, 0.86951670130161, 0.0222623281070364, 0.374607285544628,
0.916025825389466, 0.0466447169655834, 0.651315412316165, 0.440436907019126,
0.433548700419708), Upper95CI = c(2.19254259114914, 1.01294537379558,
1.00844478624614, 1.13663544526487, 1.12647704263608, 1.56764294951526,
1.44220199258348, 2.11060633474044, 1.52742490796631, 1.35122425878491,
1.7979937836084, 0.655351540870752, 2.13516991355386, 1.73993010418865,
0.652957058228699, 2.59615214768047, 0.76780774565962, 1.36405674299719
), Lower997CI = c(1.26545076052984, 0.438514548720716, 0.344268212303677,
0.474612391243614, 0.493987129703437, 0.611131699675563, 0.929047103197505,
0.785094671227173, 0.840231592610583, 0.523894662155343, 0.703500240342274,
0.00513071434039922, 0.20498124083422, 0.761788735802099, 0.0165744691173032,
0.414157407130073, 0.376765181801129, 0.303424472695538), Upper997CI = c(2.41408837016438,
1.16380466027892, 1.19392754588927, 1.31181443961792, 1.29178515824501,
1.82316431734875, 1.56260724140783, 2.46519277997175, 1.69514346690893,
1.57391071859294, 2.08889292293282, 0.966984581231215, 2.80639425224956,
1.99287051089929, 0.928351140481638, 3.28891183173877, 0.867510832843342,
1.68791001193251)), .Names = c("Hospital", "SSIs", "Procedures",
"Expected", "OE", "Probability", "Lower95CI", "Upper95CI", "Lower997CI",
"Upper997CI"), row.names = c(NA, -18L), class = "data.frame")
Here is my ggplot code to produce the plot:
ggplot(data=orthssi, mapping=aes(x=Hospital, y=OE, ymin = Lower997CI, ymax = Upper997CI)) +
geom_hline(yintercept=1, colour='gray') +
geom_point(colour='blue', size=3) +
scale_y_continuous(limits=c(0,ceiling(max(orthssi$Upper997CI)))) +
geom_linerange(mapping=aes(x=Hospital)) +
theme_bw() + theme(panel.grid.major.x = element_blank())
It produces the following chart:
What I would like to do, and cannot manage, is to place small horizontal ticks on the geom_linerange() indicating the position of the upper and lower 95% and 99.7% confidence intervals.
EDIT
If you are a ggplot ninja I would also accept different line colours indicating where the 99.7% CIs extend past the 95% CIs.
Any help offered on how to add these ticks/colours is greatly appreciated.
geom_errorbar will create a line range with horizontal bars at the ends. As such, the following will work:
ggplot(data=orthssi, mapping=aes(x=Hospital, y=OE, ymin = Lower997CI, ymax = Upper997CI)) +
geom_hline(yintercept = 1, colour = 'gray') +
geom_point(colour = 'blue', size = 3) +
scale_y_continuous(limits=c(0, ceiling(max(orthssi$Upper997CI)))) +
theme_bw() + theme(panel.grid.major.x = element_blank()) +
geom_errorbar(aes(ymin = Lower95CI,ymax = Upper95CI),width = 0.2,colour = 'red') +
geom_errorbar(width = 0.2)
Note that the geom_linerange(mapping = aes(x = Hospital)) component has been removed as it is drawn as part of geom_errorbar(width = 0.2)

Colours for geom_histogram

I have a dataset that I want to plot with ggplot using geom_histogram (data below). I am having issues with trying to get a gradient colour scheme.
structure(list(UserID = c(39120L, 39536L, 39550L, 39627L, 39632L,
39709L, 39971L, 39988L, 39990L, 40062L, 40065L, 40065L, 40066L,
40142L, 40142L, 40143L, 40161L, 40193L, 40364L, 40437L, 40439L,
40440L, 40451L, 40453L, 40665L, 40665L, 40668L, 40751L, 40843L,
40843L, 40843L, 40846L, 40846L, 40847L, 40847L, 40850L, 40850L,
40884L, 40884L, 40884L, 40896L, 40900L, 40902L, 40903L, 40905L,
40963L, 40966L, 40966L, 40967L, 40967L, 40969L, 40971L, 40971L,
40985L, 40985L, 41010L, 41079L, 41080L, 41080L, 41081L, 41093L,
41108L, 41110L, 41111L, 41113L, 41114L, 41133L, 41137L, 41138L,
41140L, 41161L, 41162L, 41182L, 41186L, 41260L), sales = c(0.0119,
0.0032, 0.0091, 0.0098, 0.0086, 0.0101, 0.0107, 0.0111, 0.0085,
0.0178, 0.0069, 0.0055, 0.0133, 0.0112, 0.0084, 0.0141, 0.0159,
0.01, 0.0054, 0.0129, 0.011, 0.0116, 0.0099, 0.0134, 0.0046,
0.004, 0.0076, 0.005, 0.0027, 0.0037, 3e-04, 0.022, 0.012, 0.0082,
0.0108, 0.0092, 0.0101, 0.0016, 0.0082, 0.0035, 0.007, 0.0098,
0.0146, 0.0074, 0.005, 0.0152, 0.0046, 0.0032, 0.0028, 0, 0.0179,
0.0185, 0.0095, 0.0401, 0.0163, 0.0085, 0.0099, 0.0064, 0.0067,
0.0052, 0.0191, 0.0118, 0.0054, 0.0111, 0.0065, 0.0124, 0.0047,
0.0111, 0.0063, 0.0072, 0.0062, 0.0091, 0.0066, 0.0169, 0.0071
), salesfromtarget = c(0.214285714285714, -0.673469387755102,
-0.0714285714285714, 0, -0.122448979591837, 0.0306122448979592,
0.0918367346938777, 0.13265306122449, -0.13265306122449, 0.816326530612245,
-0.295918367346939, -0.438775510204082, 0.357142857142857, 0.142857142857143,
-0.142857142857143, 0.438775510204082, 0.622448979591837, 0.0204081632653062,
-0.448979591836735, 0.316326530612245, 0.122448979591837, 0.183673469387755,
0.010204081632653, 0.36734693877551, -0.530612244897959, -0.591836734693878,
-0.224489795918367, -0.489795918367347, -0.724489795918367, -0.622448979591837,
-0.969387755102041, 1.24489795918367, 0.224489795918367, -0.163265306122449,
0.102040816326531, -0.0612244897959184, 0.0306122448979592, -0.836734693877551,
-0.163265306122449, -0.642857142857143, -0.285714285714286, 0,
0.489795918367347, -0.244897959183673, -0.489795918367347, 0.551020408163265,
-0.530612244897959, -0.673469387755102, -0.714285714285714, -1,
0.826530612244898, 0.887755102040817, -0.0306122448979592, 3.09183673469388,
0.663265306122449, -0.13265306122449, 0.010204081632653, -0.346938775510204,
-0.316326530612245, -0.469387755102041, 0.948979591836735, 0.204081632653061,
-0.448979591836735, 0.13265306122449, -0.336734693877551, 0.26530612244898,
-0.520408163265306, 0.13265306122449, -0.357142857142857, -0.26530612244898,
-0.36734693877551, -0.0714285714285714, -0.326530612244898, 0.724489795918367,
-0.275510204081633)), .Names = c("UserID", "sales", "salesfromtarget"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 15L, 16L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
28L, 29L, 31L, 32L, 35L, 36L, 37L, 38L, 39L, 41L, 42L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 56L, 57L, 58L, 59L,
60L, 61L, 62L, 63L, 64L, 65L, 67L, 69L, 70L, 72L, 77L, 78L, 79L,
80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 93L), class = "data.frame")
Using this code I am trying to use scale_fill_brewer to get a gradient scheme (but it doesn't have to be this specific one, it was just an example of something not working). All of the bars are grey and ideally they would not be grey. Best case scenario I wanted to use something along the lines of scale_fill_gradient(low = "green", high = "blue") (again this doesn't work for me). I feel like I'm missing something fundamental with ggplot2.
# Attach the packages the plot below relies on. library() stops with an error
# if a package is missing, whereas require() only returns FALSE and lets the
# plotting code fail later.
library(ggplot2)
library(scales)    # provides `percent`, used for the x-axis labels
library(ggthemes)  # provides theme_solarized()
ggplot(repex, aes(x = salesfromtarget)) +
geom_histogram(binwidth = .1, alpha = 0.5, colour = "white") +
scale_fill_brewer(palette = "Spectral") +
guides(fill = FALSE) +
theme_solarized() +
ggtitle("Standard Distribution of Sales") + xlab("Sales") + ylab("Frequency") +
theme(plot.title = element_text(size = 13, colour = "black", face = "bold", vjust = 1)) +
theme(axis.title.x = element_text(size = 12, colour = "black", vjust = -.005),
axis.title.y = element_text(size = 12, colour = "black", vjust = 0.2),
axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_x_continuous(breaks = seq(-1.5, 1.5, by = .25),
limits = c(-1.5, 1.5), labels = percent)
You need to define the "fill" variable in the aes() section:
# Map fill to the computed bin position so scale_fill_gradient() has a
# continuous variable to colour by. Each `+` has to END a line: a line that
# starts with `+` is parsed as a separate expression, so the layer is never
# added to the plot.
# (`..x..` is the older notation; ggplot2 >= 3.3.0 also accepts after_stat(x).)
ggplot(repex, aes(x = salesfromtarget, fill = ..x..)) +
  geom_histogram(binwidth = .1) +
  scale_fill_gradient("Legend", low = "green", high = "blue")
Since the histogram bars are the count of each x-axis value, if you want to use the original x value you should use "..x..". You can fill with the histogram count using "..count..":
# Colour each bar by its height (the computed bin count) instead of its
# position. Each `+` has to END a line: a line that starts with `+` is parsed
# as a separate expression, so the layer is never added to the plot.
# (`..count..` is the older notation; ggplot2 >= 3.3.0 also accepts
# after_stat(count).)
ggplot(repex, aes(x = salesfromtarget, fill = ..count..)) +
  geom_histogram(binwidth = .1) +
  scale_fill_gradient("Legend", low = "green", high = "blue")

Resources