How is geom_point removing rows containing missing values? - r

I'm unsure why none of my data points show up on the map.
Store_ID visits CRIND_CC ISCC EBITDAR top_bottom Latitude Longitude
(int) (int) (int) (int) (dbl) (chr) (fctr) (fctr)
1 92 348 14819 39013 76449.15 top 41.731373 -93.58184
2 2035 289 15584 35961 72454.42 top 41.589428 -93.80785
3 50 266 14117 27262 49775.02 top 41.559017 -93.77287
4 156 266 7797 25095 28645.95 top 41.6143 -93.834404
5 66 234 8314 18718 46325.12 top 41.6002 -93.779236
6 207 18 2159 17999 20097.99 bottom 41.636208 -93.531876
7 59 23 10547 28806 52168.07 bottom 41.56153 -93.88083
8 101 23 1469 11611 7325.45 bottom 41.20982 -93.84298
9 130 26 2670 13561 14348.98 bottom 41.614517 -93.65789
10 130 26 2670 13561 14348.98 bottom 41.6145172 -93.65789
11 24 27 17916 41721 69991.10 bottom 41.597134 -93.49263
> dput(droplevels(top_bottom))
structure(list(Store_ID = c(92L, 2035L, 50L, 156L, 66L, 207L,
59L, 101L, 130L, 130L, 24L), visits = c(348L, 289L, 266L, 266L,
234L, 18L, 23L, 23L, 26L, 26L, 27L), CRIND_CC = c(14819L, 15584L,
14117L, 7797L, 8314L, 2159L, 10547L, 1469L, 2670L, 2670L, 17916L
), ISCC = c(39013L, 35961L, 27262L, 25095L, 18718L, 17999L, 28806L,
11611L, 13561L, 13561L, 41721L), EBITDAR = c(76449.15, 72454.42,
49775.02, 28645.95, 46325.12, 20097.99, 52168.07, 7325.45, 14348.98,
14348.98, 69991.1), top_bottom = c("top", "top", "top", "top",
"top", "bottom", "bottom", "bottom", "bottom", "bottom", "bottom"
), Latitude = structure(c(11L, 4L, 2L, 7L, 6L, 10L, 3L, 1L, 8L,
9L, 5L), .Label = c("41.20982", "41.559017", "41.56153", "41.589428",
"41.597134", "41.6002", "41.6143", "41.614517", "41.6145172",
"41.636208", "41.731373"), class = "factor"), Longitude = structure(c(3L,
7L, 5L, 8L, 6L, 2L, 10L, 9L, 4L, 4L, 1L), .Label = c("-93.49263",
"-93.531876", "-93.58184", "-93.65789", "-93.77287", "-93.779236",
"-93.80785", "-93.834404", "-93.84298", "-93.88083"), class = "factor")), row.names = c(NA,
-11L), .Names = c("Store_ID", "visits", "CRIND_CC", "ISCC", "EBITDAR",
"top_bottom", "Latitude", "Longitude"), class = c("tbl_df", "tbl",
"data.frame"))
Creating the plot:
map <- qmap('Des Moines') +
geom_point(data = top_bottom, aes(x = as.numeric(Longitude),
y = as.numeric(Latitude)), colour = top_bottom, size = 3)
I get the warning message:
Removed 11 rows containing missing values (geom_point).
However, this works without the use of ggmap():
ggplot(top_bottom) +
geom_point(aes(x = as.numeric(Longitude), y = as.numeric(Latitude)),
colour = top_bottom, size = 3)
How do I get the points to overlay on ggmap??

You are using as.numeric() with a factor. As seen here that gives you a level number for the factor (not the number represented). Unsurprisingly, all those levels are points not on the canvas displayed for "Des Moines".
Use as.numeric(as.character(Latitude)) and as.numeric(as.character(Longitude)), as ugly as it seems.

Seeing the sample data, it seems that there is one data point which does not stay in the map area.
library(dplyr)
library(ggplot2)
library(ggmap)
### You can find lon/lat for bbox using your ggmap object.
### For instance, des1 <- ggmap(mymap1)
### str(des1)
### You could use bb2bbox() in the ggmap package to find lon/lat.
filter(top_bottom,
between(Latitude, 41.27057, 41.92782),
between(Longitude, -94.04787, -93.16897)) -> inside
setdiff(top_bottom, inside)
# Store_ID visits CRIND_CC ISCC EBITDAR top_bottom Latitude Longitude
#1 101 23 1469 11611 7325.45 bottom 41.20982 -93.84298
Since you used qmap() without specifying zoom, I do not know what zoom level you had. Let's play around a bit. In the first case, there is one data point missing; Removed 1 rows containing missing values (geom_point).
mymap1 <- get_map('Des Moines', zoom = 10)
ggmap(mymap1) +
geom_point(data = top_bottom, aes(x = as.numeric(Longitude),
y = as.numeric(Latitude)), colour = top_bottom, size = 3)
mymap2 <- get_map('Des Moines', zoom = 9)
ggmap(mymap2) +
geom_point(data = top_bottom, aes(x = as.numeric(Longitude),
y = as.numeric(Latitude)), colour = top_bottom, size = 3)
So the key thing, I think, is that you want to make sure you choose the right zoom level for your data set. For that, you may want to specify zoom in qmap(). I hope this will help you.
DATA
top_bottom <- structure(list(Store_ID = c(92L, 2035L, 50L, 156L, 66L, 207L,
59L, 101L, 130L, 130L, 24L), visits = c(348L, 289L, 266L, 266L,
234L, 18L, 23L, 23L, 26L, 26L, 27L), CRIND_CC = c(14819L, 15584L,
14117L, 7797L, 8314L, 2159L, 10547L, 1469L, 2670L, 2670L, 17916L
), ISCC = c(39013L, 35961L, 27262L, 25095L, 18718L, 17999L, 28806L,
11611L, 13561L, 13561L, 41721L), EBITDAR = c(76449.15, 72454.42,
49775.02, 28645.95, 46325.12, 20097.99, 52168.07, 7325.45, 14348.98,
14348.98, 69991.1), top_bottom = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("bottom", "top"), class = "factor"),
Latitude = c(41.731373, 41.589428, 41.559017, 41.6143, 41.6002,
41.636208, 41.56153, 41.20982, 41.614517, 41.6145172, 41.597134
), Longitude = c(-93.58184, -93.80785, -93.77287, -93.834404,
-93.779236, -93.531876, -93.88083, -93.84298, -93.65789,
-93.65789, -93.49263)), .Names = c("Store_ID", "visits",
"CRIND_CC", "ISCC", "EBITDAR", "top_bottom", "Latitude", "Longitude"
), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10", "11"))

Related

Adding a gap in geom_segment in ggplot2

I have some data with id, time_to_event, event, therapy start and therapy stop variables.
Here is a reproducible example:
data <- structure(list(id = structure(c(1L, 3L, 9L, 2L, 5L, 10L, 7L,
8L, 4L, 6L), .Label = c("1", "4", "2", "9", "5", "10", "7", "8",
"3", "6"), class = "factor"), event = c("Death", "Death", "Death",
"Y", "Death", "Y", "Y", "Y", "Death", "X"), time_to_event = c(89L,
83L, 74L, 88L, 78L, 72L, 77L, 76L, 79L, 78L), start = c(18L,
3L, 13L, 16L, 10L, 6L, 20L, 11L, 14L, 9L), stop = c(89L, 83L,
74L, 88L, 78L, 72L, 77L, 76L, 79L, 78L)), row.names = c(NA, -10L
), class = "data.frame")
I computed a plot, which shows the type of event as geom_point and the time of the therapy as geom_segment with start and stop. Here is the code:
colors <- c("X" = "dodgerblue3",
"Y" = "red2",
"Death" = "black") # Defining the colors for the event type
event_symbols <- c("X" = "\u25CF",
"Y" = "\u25CF",
"Death" = "\u2020") # <-- Problem: When I use the shape 84 (T) the code is plotted. When I use "/u2020", I get the above-mentioned error message
five_day_lines <- seq(0, 90, 5)
day_lines <- 1:90
day_lines <- day_lines[!day_lines %in% five_day_lines]
data$id <- factor(data$id, levels = data$id[order(data$time_to_event, decreasing = T)])
ggplot(data = data) +
geom_hline(yintercept = five_day_lines, color = "grey", alpha = .35) +
geom_hline(yintercept = day_lines, color = "grey", alpha = .25) +
geom_segment(aes(x = id, xend = id, y = start, yend = stop), color = "dimgray", size = 1) +
geom_point(aes(x = id, y = time_to_event, shape = event, color = event), size = 3) +
scale_y_continuous(limits = c(0,90), breaks = c(seq(0, 90, 5)), name = "Days after study inclusion") +
scale_x_discrete(name = "ID") +
coord_flip() +
scale_color_manual(values = colors, breaks = c()) +
scale_shape_manual(values = event_symbols, breaks = c("X", "Y", "Death"),
guide = guide_legend(override.aes = list(color = c("dodgerblue3", "red2", "black")))) +
theme(legend.position = "bottom",
legend.title = element_text(size=12, face="bold"),
panel.background = element_blank(),
legend.background = element_blank()) +
labs(color="Medication", shape="Event type")
The plot looks like the following:
My question is, whether it is possible to add a gap in the geom_segment at a certain point, let's say at half of the length of the segments. Ideally, the break should look like a 'typical' gap in a plot axis like the following from https://rstudio-pubs-static.s3.amazonaws.com/235467_5abd31ab564a43c9ae0f18cdd07eebe7.html:
Here, we can see a kind of double slash. I had the idea of taking the double slash unicode character (Link) but this, I think, would simply generate a new layer and would disregard the desired gap or 'interruption' of the geom_segment. Is there a smart way to compute such a gap, where the geom_segment ends at the one slash and starts again at the 2nd slash? Ideally, it works with the double slash but if there is another way with another appropriate shape or something else, I would be fine with that.
I would be grateful, if there is a smart solution for this problem.
Thanks a lot.
I don't think there exists specialised layers in ggplot2 or extensions that deal with gaps specifically. One can of course abuse the arrow argument in geom_segment() to draw a stumpy arrow at the extremes of a segment, but that probably doesn't alleviate the necessity to use a second layer. Below you'll find a simplified example of that.
library(ggplot2)
data <- structure(list(
id = structure(c(1L, 3L, 9L, 2L, 5L, 10L, 7L, 8L, 4L, 6L),
.Label = c("1", "4", "2", "9", "5", "10", "7", "8", "3", "6"),
class = "factor"),
event = c("Death", "Death", "Death", "Y", "Death", "Y", "Y", "Y", "Death", "X"),
time_to_event = c(89L, 83L, 74L, 88L, 78L, 72L, 77L, 76L, 79L, 78L),
start = c(18L, 3L, 13L, 16L, 10L, 6L, 20L, 11L, 14L, 9L),
stop = c(89L, 83L, 74L, 88L, 78L, 72L, 77L, 76L, 79L, 78L)),
row.names = c(NA, -10L),
class = "data.frame"
)
gap_start <- 30
gap_stop <- 50
ggplot(data, aes(x = start, xend = stop, y = id, yend = id)) +
geom_segment(aes(xend = gap_start), arrow = arrow(90, unit(1, "mm"))) +
geom_segment(aes(x = gap_stop), arrow = arrow(90, unit(1, "mm"), ends = "first"))
Created on 2021-07-14 by the reprex package (v1.0.0)

Labeling a single point with ggrepel

I am trying to use geom_label_repel to add labels to a couple of data points on a plot. In this case, they happen to be outliers on box plots. I've got most of the code working, I can label the outlier, but for some reason I am getting multiple labels (equal to my sample size for the entire data set) mapped to that point. I'd like just one label for this outlier.
Example:
Here is my data:
dput(sus_dev_data)
structure(list(time_point = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("3", "8", "12"), class = "factor"),
days_to_pupation = c(135L, 142L, 143L, 155L, 149L, 159L,
153L, 171L, 9L, 67L, 53L, 49L, 72L, 67L, 55L, 64L, 60L, 122L,
53L, 51L, 49L, 53L, 50L, 56L, 44L, 47L, 60L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L), class = "data.frame")
and my code...
####################################################################################################
# Time to pupation statistical analysis
####################################################################################################
## linear model
pupation_Model=lm(sus_dev_data$days_to_pupation~sus_dev_data$time_point)
pupationANOVA=aov(pupation_Model)
summary(pupationANOVA)
# Tukey test to study each pair of treatment :
pupationTUKEY <- TukeyHSD(x=pupationANOVA, which = 'sus_dev_data$time_point',
conf.level=0.95)
## Function to generate significance labels on box plot
generate_label_df <- function(pupationTUKEY, variable){
# Extract labels and factor levels from Tukey post-hoc
Tukey.levels <- pupationTUKEY[[variable]][,4]
Tukey.labels <- data.frame(multcompLetters(Tukey.levels, reversed = TRUE)['Letters'])
#I need to put the labels in the same order as in the boxplot :
Tukey.labels$treatment=rownames(Tukey.labels)
Tukey.labels=Tukey.labels[order(Tukey.labels$treatment) , ]
return(Tukey.labels)
}
#generate labels using function
labels<-generate_label_df(pupationTUKEY , "sus_dev_data$time_point")
#rename columns for merging
names(labels)<-c('Letters','time_point')
# obtain letter position for y axis using means
pupationyvalue<-aggregate(.~time_point, data=sus_dev_data, max)
#merge dataframes
pupationfinal<-merge(labels,pupationyvalue)
####################################################################################################
# Time to pupation plot
####################################################################################################
# Plot of data
(pupation_plot <- ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(aes(x = 1, y = 9),
label = '1')
)
Here's a shorter example to demonstrate what is going on. Essentially, your labels are beng recycled to be the same length as the data.
df = data.frame(x=1:5, y=1:5)
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(aes(x = 1, y = 1), label = '1')
You can override this by providing new data for the ggrepel
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(data = data.frame(x=1, y=1), label = '1')
Based on your data, you have 3 outliers (one in each group), you can manually identify them by applying the classic definition of outliers by John Tukey (Upper: Q3+1.5*IQR and Lower: Q1-1.5*IQR) (but you are free to set your own rules to define an outlier). You can use the function quantile and IQR to get those points.
Here, I incorporated them in a sequence of pipe using dplyr package:
library(tidyverse)
Outliers <- sus_dev_data %>% group_by(time_point) %>%
mutate(Out_up = ifelse(days_to_pupation > quantile(days_to_pupation,0.75)+1.5*IQR(days_to_pupation), "Out","In"))%>%
mutate(Out_Down = ifelse(days_to_pupation < quantile(days_to_pupation,0.25)-1.5*IQR(days_to_pupation), "Out","In")) %>%
filter(Out_up == "Out" | Out_Down == "Out")
# A tibble: 3 x 4
# Groups: time_point [3]
time_point days_to_pupation Out_up Out_Down
<fct> <int> <chr> <chr>
1 3 9 In Out
2 8 122 Out In
3 12 60 Out In
As mentioned by #dww, you need to pass a new dataframe to geom_label_repel if you want your outliers to be single labeled. So, here we use the dataframe Outliers to feed the geom_label_repel function:
library(ggplot2)
library(ggrepel)
ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
#Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(inherit.aes = FALSE,
data = Outliers,
aes(x = time_point, y = days_to_pupation, label = "Out"))
And you get the following graph:
I hope it helps you to figure it how to label all your outliers.

Drawing SE in xyplot with errorbars

I am trying to construct a simple XY-Graph with the milk production (called FCM) of two different groups of cows (from the output I got from the mixed model, using the lsmeans and SE).
I was able to construct the plot displaying the lsmeans using the xyplot function in lattice:
library(lattice)
xyplot(lsmean~Time, type="b", group=Group, data=lsmeans2[order(lsmeans2$Time),],
pch=16, ylim=c(10,35), col=c("darkorange","darkgreen"),
ylab="FCM (kg/day)", xlab="Week", lwd=2,
key=list(space="top",
lines=list(col=c("darkorange","darkgreen"),lty=c(1,1),lwd=2),
text=list(c("Confinement Group","Pasture Group"), cex=0.8)))
I now want to add the error bars. I tried some things with the panel.arrow function, just copying and pasting from other examples but didnĀ“t get any further.
I would really appreciate some help!
My lsmeans2 dataset:
Group Time lsmean SE df lower.CL upper.CL
Stall wk1 26.23299 0.6460481 59 24.19243 28.27356
Weide wk1 25.12652 0.6701080 58 23.00834 27.24471
Stall wk10 21.89950 0.6460589 59 19.85890 23.94010
Weide wk10 18.45845 0.6679617 58 16.34705 20.56986
Stall wk2 25.38004 0.6460168 59 23.33957 27.42050
Weide wk2 22.90409 0.6679617 58 20.79269 25.01549
Stall wk3 25.02474 0.6459262 59 22.98455 27.06492
Weide wk3 24.05886 0.6679436 58 21.94751 26.17020
Stall wk4 23.91630 0.6456643 59 21.87694 25.95565
Weide wk4 22.23608 0.6678912 58 20.12490 24.34726
Stall wk5 23.97382 0.6493483 59 21.92283 26.02481
Weide wk5 18.14550 0.6677398 58 16.03480 20.25620
Stall wk6 24.48899 0.6456643 59 22.44963 26.52834
Weide wk6 19.40022 0.6697394 58 17.28319 21.51724
Stall wk7 24.98107 0.6459262 59 22.94089 27.02126
Weide wk7 19.71200 0.6677398 58 17.60129 21.82270
Stall wk8 22.65167 0.6460168 59 20.61120 24.69214
Weide wk8 19.35759 0.6678912 58 17.24641 21.46877
Stall wk9 22.64381 0.6460481 59 20.60324 24.68438
Weide wk9 19.26869 0.6679436 58 17.15735 21.38004
For completeness, here is a solution using xyplot:
# Reproducible data
lsmeans2 = structure(list(Group = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Stall",
"Weide"), class = "factor"), Time = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L), .Label = c("wk1", "wk10", "wk2", "wk3", "wk4", "wk5", "wk6",
"wk7", "wk8", "wk9"), class = "factor"), lsmean = c(26.23299,
25.12652, 21.8995, 18.45845, 25.38004, 22.90409, 25.02474, 24.05886,
23.9163, 22.23608, 23.97382, 18.1455, 24.48899, 19.40022, 24.98107,
19.712, 22.65167, 19.35759, 22.64381, 19.26869), SE = c(0.6460481,
0.670108, 0.6460589, 0.6679617, 0.6460168, 0.6679617, 0.6459262,
0.6679436, 0.6456643, 0.6678912, 0.6493483, 0.6677398, 0.6456643,
0.6697394, 0.6459262, 0.6677398, 0.6460168, 0.6678912, 0.6460481,
0.6679436), df = c(59L, 58L, 59L, 58L, 59L, 58L, 59L, 58L, 59L,
58L, 59L, 58L, 59L, 58L, 59L, 58L, 59L, 58L, 59L, 58L), lower.CL = c(24.19243,
23.00834, 19.8589, 16.34705, 23.33957, 20.79269, 22.98455, 21.94751,
21.87694, 20.1249, 21.92283, 16.0348, 22.44963, 17.28319, 22.94089,
17.60129, 20.6112, 17.24641, 20.60324, 17.15735), upper.CL = c(28.27356,
27.24471, 23.9401, 20.56986, 27.4205, 25.01549, 27.06492, 26.1702,
25.95565, 24.34726, 26.02481, 20.2562, 26.52834, 21.51724, 27.02126,
21.8227, 24.69214, 21.46877, 24.68438, 21.38004)), .Names = c("Group",
"Time", "lsmean", "SE", "df", "lower.CL", "upper.CL"), class = "data.frame", row.names = c(NA,
-20L))
xyplot(lsmean~Time, type="b", group=Group, data=lsmeans2[order(lsmeans2$Time),],
panel = function(x, y, ...){
panel.arrows(x, y, x, lsmeans2$upper.CL, length = 0.15,
angle = 90, col=c("darkorange","darkgreen"))
panel.arrows(x, y, x, lsmeans2$lower.CL, length = 0.15,
angle = 90, col=c("darkorange","darkgreen"))
panel.xyplot(x,y, ...)
},
pch=16, ylim=c(10,35), col=c("darkorange","darkgreen"),
ylab="FCM (kg/day)", xlab="Week", lwd=2,
key=list(space="top",
lines=list(col=c("darkorange","darkgreen"),lty=c(1,1),lwd=2),
text=list(c("Confinement Group","Pasture Group"), cex=0.8)))
The length argument in panel.arrows changes the width of the error heads. You can fiddle around with this parameter to get a width you like.
Notice that even though you had lsmeans2[order(lsmeans2$Time),] when specifying the data =, the ordering of Time is still wrong. This is because Time is a factor, and R doesn't know you want it to order by the numerical suffix of wk. This means, that it will sort wk10 before wk2, because 1 is smaller than 2. You can use this little trick below to order it correctly:
# Order first by the character lenght, then by Time
Timelevels = levels(lsmeans2$Time)
Timelevels = Timelevels[order(nchar(Timelevels), Timelevels)]
# Reorder the levels
lsmeans2$Time = factor(lsmeans2$Time, levels = Timelevels)
# Create Subset
lsmeansSub = lsmeans2[order(lsmeans2$Time),]
xyplot(lsmean~Time, type="b", group=Group, data=lsmeansSub,
panel = function(x, y, yu, yl, ...){
panel.arrows(x, y, x, lsmeansSub$upper.CL, length = 0.15,
angle = 90, col=c("darkorange","darkgreen"))
panel.arrows(x, y, x, lsmeansSub$lower.CL, length = 0.15,
angle = 90, col=c("darkorange","darkgreen"))
panel.xyplot(x, y, ...)
},
pch=16, ylim=c(10,35), col=c("darkorange","darkgreen"),
ylab="FCM (kg/day)", xlab="Week", lwd=2,
key=list(space="top",
lines=list(col=c("darkorange","darkgreen"),lty=c(1,1),lwd=2),
text=list(c("Confinement Group","Pasture Group"), cex=0.8)))
Note that even after reordering the the levels of "Time", I still need to use the sorted data for the data = argument. This is because xyplot plots the points in the order that appears in the dataset, not the order of the factor levels.
Is there a particular reason you want to use xplot? ggplot2 is much easier to work with and prettier. Here's an example of what I think you want.
#load ggplot2
library(ggplot2)
#load data
d = structure(list(Group = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Stall",
"Weide"), class = "factor"), Time = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L), .Label = c("wk1", "wk10", "wk2", "wk3", "wk4", "wk5", "wk6",
"wk7", "wk8", "wk9"), class = "factor"), lsmean = c(26.23299,
25.12652, 21.8995, 18.45845, 25.38004, 22.90409, 25.02474, 24.05886,
23.9163, 22.23608, 23.97382, 18.1455, 24.48899, 19.40022, 24.98107,
19.712, 22.65167, 19.35759, 22.64381, 19.26869), SE = c(0.6460481,
0.670108, 0.6460589, 0.6679617, 0.6460168, 0.6679617, 0.6459262,
0.6679436, 0.6456643, 0.6678912, 0.6493483, 0.6677398, 0.6456643,
0.6697394, 0.6459262, 0.6677398, 0.6460168, 0.6678912, 0.6460481,
0.6679436), df = c(59L, 58L, 59L, 58L, 59L, 58L, 59L, 58L, 59L,
58L, 59L, 58L, 59L, 58L, 59L, 58L, 59L, 58L, 59L, 58L), lower.CL = c(24.19243,
23.00834, 19.8589, 16.34705, 23.33957, 20.79269, 22.98455, 21.94751,
21.87694, 20.1249, 21.92283, 16.0348, 22.44963, 17.28319, 22.94089,
17.60129, 20.6112, 17.24641, 20.60324, 17.15735), upper.CL = c(28.27356,
27.24471, 23.9401, 20.56986, 27.4205, 25.01549, 27.06492, 26.1702,
25.95565, 24.34726, 26.02481, 20.2562, 26.52834, 21.51724, 27.02126,
21.8227, 24.69214, 21.46877, 24.68438, 21.38004)), .Names = c("Group",
"Time", "lsmean", "SE", "df", "lower.CL", "upper.CL"), class = "data.frame", row.names = c(NA,
-20L))
#fix week
library(stringr)
library(magrittr)
d$Time %<>% as.character() %>% str_replace(pattern = "wk", replacement = "") %>% as.numeric()
#plot
ggplot(d, aes(Time, lsmean, color = Group, group = Group)) +
geom_point() +
geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = .2) +
geom_line() +
ylim(10, 35) +
scale_x_continuous(name = "Week", breaks = 1:10) +
ylab("FCM (kg/day)") +
scale_color_discrete(label = c("Confinement Group","Pasture Group"))

R Program Vector, record Column Percent

This is my vector
head(sep)
I must find percent of all SEP 11 in each row.
For instance, in first row, percent of SEP 11 is
100 * ((63 + 124)/ (63 + 124 + 0 + 0))
And would like this stored in newly created 8th column
Thanks
dput
> dput(head(sep))
structure(list(Site = structure(1:6, .Label = c("31R001", "31R002",
"31R003", "31R004", "31R005", "31R006", "31R007", "31R008", "31R011",
"31R013", "31R014", "31R016", "31R018", "31R019", "31R020", "31R021",
"31R022", "31R023", "31R024", "31R025", "31R026", "31R027", "31R029",
"31R030", "31R031", "31R032", "31R034", "31R035", "31R036", "31R038",
"31R039", "31R040", "31R041", "31R042", "31R043", "31R044", "31R045",
"31R046", "31R048", "31R049", "31R050", "31R051", "31R052", "31R053",
"31R054", "31R055", "31R056", "31R057", "31R058", "31R059", "31R060",
"31R061", "31R069", "31R071", "31R072", "31R075", "31R435", "31R440",
"31R445", "31R450", "31R455", "31R460", "31R470", "31R600", "31R722",
"31R801", "31R825", "31R826", "31R829", "31R840", "31R843", "31R861",
"31R880"), class = "factor"), Latitude = c(33.808874, 33.877256,
33.820825, 33.852373, 33.829697, 33.810274), Longitude = c(-117.844048,
-117.700135, -117.811845, -117.795516, -117.787532, -117.830429
), Windows.SEP.11 = c(63L, 174L, 11L, 85L, 163L, 71L), Mac.SEP.11 = c(0L,
1L, 4L, 0L, 0L, 50L), Windows.SEP.12 = c(124L, 185L, 9L, 75L,
23L, 5L), Mac.SEP.12 = c(0L, 1L, 32L, 1L, 0L, 50L)), .Names = c("Site",
"Latitude", "Longitude", "Windows.SEP.11", "Mac.SEP.11", "Windows.SEP.12",
"Mac.SEP.12"), row.names = c(NA, 6L), class = "data.frame")
Assuming that you want to get the rowSums of columns that have 'Windows' as column names, we subset the dataset ("sep1") using grep. Then get the rowSums(Sub1), divide by the rowSums of all the numeric columns (sep1[4:7]), multiply by 100, and assign the results to a new column ("newCol")
Sub1 <- sep1[grep("Windows", names(sep1))]
sep1$newCol <- 100*rowSums(Sub1)/rowSums(sep1[4:7])

scale_fill_manual based on another factor in ggplot2

I am trying to color-code my legend based on a broader categorization of the factor used to "fill" my geom_bar in ggplot2. My plot looks like this: which I got using this R code:
ggplot(df, aes(year, TOTALshark, fill=fishery)) + geom_bar(width=.5,stat="identity", position="dodge")+ facet_wrap(~div)
Here is a dput sample of my dataset:
> dput(smpl)
df <- structure(list(X1 = structure(c(6L, 11L, 22L, 27L, 10L, 10L,
6L, 11L, 6L, 10L, 8L, 6L, 6L, 4L, 22L, 18L, 10L, 10L, 11L, 6L
), .Label = c("AMERICAN PLAICE", "BIGEYE TUNA", "BIVALVE", "BLUEFIN TUNA",
"CAPELIN", "COD(ATL)", "CRAB(SNOW,QUEEN)", "HADDOCK", "HAGFISH(ATL)",
"HALIBUT(ATL)", "HALIBUT(GREENLAND)", "HERRING(ATL)", "JONAH CRAB (CANC.BOR.)",
"LOBSTER", "LONGHORN SCULPIN", "LUMPFISH", "MACKEREL(ATL)", "MONKFISH",
"PAND.BOR.", "PAND.MON.", "POLLOCK", "REDFISH", "SCALLOP", "SEA URCHINS",
"SEACU", "SILVER HAKE", "SWORDFISH", "WHELK", "WHITE HAKE", "WINTER FLOUNDER",
"WITCH FLOUNDER", "YELLOWFIN TUNA", "YELLOWTAIL FLOUNDER"), class = "factor"),
X2 = structure(c(2L, 2L, 8L, 5L, 5L, 5L, 5L, 8L, 5L, 5L,
5L, 2L, 5L, 5L, 8L, 2L, 5L, 5L, 2L, 2L), .Label = c("Dredge",
"Gillnet", "Hook", "Jigger", "Line", "Seine", "Trap", "Trawlb",
"Trawlm"), class = "factor"), fishery = structure(c(12L,
25L, 43L, 50L, 24L, 24L, 15L, 27L, 15L, 24L, 21L, 12L, 15L,
9L, 43L, 36L, 24L, 24L, 25L, 12L), .Label = c("AMERICAN PLAICE-Gillnet",
"AMERICAN PLAICE-Line", "AMERICAN PLAICE-Trawlb", "BIGEYE TUNA-Jigger",
"BIGEYE TUNA-Line", "BIVALVE-Dredge", "BLUEFIN TUNA-Hook",
"BLUEFIN TUNA-Jigger", "BLUEFIN TUNA-Line", "CAPELIN-Seine",
"CAPELIN-Trap", "COD(ATL)-Gillnet", "COD(ATL)-Hook", "COD(ATL)-Jigger",
"COD(ATL)-Line", "COD(ATL)-Trap", "COD(ATL)-Trawlb", "CRAB(SNOW,QUEEN)-Trap",
"CUSK-Line", "HADDOCK-Gillnet", "HADDOCK-Line", "HADDOCK-Trawlb",
"HAGFISH(ATL)-Trap", "HALIBUT(ATL)-Line", "HALIBUT(GREENLAND)-Gillnet",
"HALIBUT(GREENLAND)-Line", "HALIBUT(GREENLAND)-Trawlb", "HERRING(ATL)-Seine",
"HERRING(ATL)-Trawlm", "JONAH CRAB (CANC.BOR.)-Trap", "LOBSTER-Trap",
"LONGHORN SCULPIN-Trawlb", "LUMPFISH-Gillnet", "MACKEREL(ATL)-Seine",
"MACKEREL(ATL)-Trawlm", "MONKFISH-Gillnet", "MONKFISH-Trawlb",
"PAND.BOR.-Trawlb", "PAND.MON.-Trawlb", "POLLOCK-Gillnet",
"POLLOCK-Trawlb", "REDFISH-Gillnet", "REDFISH-Trawlb", "REDFISH-Trawlm",
"SCALLOP-Dredge", "SEA URCHINS-Dredge", "SEACU-Dredge", "SILVER HAKE-Trawlb",
"SWORDFISH-Jigger", "SWORDFISH-Line", "SWORDFISH-unk", "WHELK-Trap",
"WHITE HAKE-Gillnet", "WHITE HAKE-Line", "WINTER FLOUNDER-Gillnet",
"WINTER FLOUNDER-Trawlb", "WITCH FLOUNDER-Trawlb", "YELLOWFIN TUNA-Line",
"YELLOWTAIL FLOUNDER-Trawlb"), class = "factor"), year = c(2008L,
2008L, 2009L, 2009L, 2008L, 2009L, 2009L, 2008L, 2006L, 2007L,
2007L, 2007L, 2007L, 2007L, 2008L, 2008L, 2009L, 2009L, 2009L,
2009L), div = structure(c(6L, 19L, 2L, 4L, 5L, 10L, 3L, 19L,
9L, 10L, 3L, 9L, 6L, 4L, 3L, 9L, 6L, 11L, 7L, 9L), .Label = c("5Z",
"5Y", "4X", "4W", "4V", "4T", "4S", "4R", "3P", "3O", "3N",
"3M", "3L", "3K", "2J", "2H", "2G", "1F", "0B", "1B", "0A"
), class = "factor"), TOTALshark = c(3369.72, 12243.2, 6080.06,
316646.05, 18786.8, 6565.91, 1339771.2, 45841.03, 41329.64,
6411.86, 204980.36, 67608.78, 2617.05, 61547.64, 447349.44,
13226.4, 1362.55, 6012.23, 13152.51, 1067.92), cat = structure(c(1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("groundfish", "largepelagic",
"bivalve", "smallpelagic", "crabs/lobsters", "shrimps", "others"
), class = "factor")), .Names = c("X1", "X2", "fishery",
"year", "div", "TOTALshark", "cat"), class = "data.frame", row.names = c(70L,
278L, 500L, 554L, 242L, 245L, 131L, 315L, 106L, 224L, 194L, 60L,
115L, 37L, 489L, 385L, 249L, 244L, 284L, 75L))
I wish to have the same legend, but with a few colors based on which category "cat" variable (i.e.,, pelagic, groundfish) the fishery falls in.
Is this what you want?
library(ggplot2)
library(plyr)
library(gridExtra)
# create data that links colour per 'cat' with 'fishery'
# the 'cat' colours will be used as manually set fill colours.
# get 'cat' colours
# alt. 1: grab 'cat' colours from plot object
# create a plot with fill = fishery *and* colour = cat
g1 <- ggplot(df, aes(x = year, y = TOTALshark, fill = fishery, colour = cat)) +
geom_bar(width = 0.5, stat = "identity", position = "dodge") +
facet_wrap(~ div)
g1
# grab 'cat' colours for each 'fishery' from plot object
# to be used in manual fill
cat_cols <- unique(ggplot_build(g1)[["data"]][[1]]$colour)
# unique 'cat'
cat <- unique(df$cat)
# create data frame with one colour per 'cat'
df2 <- data.frame(cat = cat, cat_cols)
df2
# alt 2: create your own 'cat' colours
# number of unique 'cat'
n <- length(cats)
# select one colour per 'cat', from e.g. brewer_pal or other palette tools
cat_cols <- brewer_pal(type = "qual")(n)
# cat_cols <- rainbow(n)
# create data frame with one colour per 'cat'
df2 <- data.frame(cat, cat_cols)
df2
# select unique 'fishery' and 'cat' combinations
# in the order they show up in the legend, i.e. ordered ('arranged') by fishery
df3 <- unique(arrange(df[, c("fishery", "cat")], fishery))
df3
# add 'cat' colours to 'fishery'
# use 'join' to keep order
df3 <- join(df3, df2)
df3
# plot with fill by 'fishery' creates a fill scale by fishery,
# but colours are set manually using scale_fill_manual and the 'cat' colours from above
g2 <- ggplot(df, aes(x = year, y = TOTALshark, fill = fishery)) +
geom_bar(width = 0.5, stat = "identity", position = "dodge") +
facet_wrap(~ div, nrow = 5) +
scale_fill_manual(values = as.character(df3$cat_cols))
g2
# create plot with both 'fishery' and 'cat' legend.
# extract 'fisheries' legend
tmp <- ggplot_gtable(ggplot_build(g2))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend_fish <- tmp$grobs[[leg]]
# create a non-sense plot just to get a 'fill = cat' legend
g3 <- ggplot(df, aes(x = year, y = TOTALshark, fill = cat)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = as.character(df3$cat_cols))
# extract 'cat' legend
tmp <- ggplot_gtable(ggplot_build(g3))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend_cat <- tmp$grobs[[leg]]
# arrange plot and legends
library(gridExtra)
# quick and dirty with grid.arrange
# in the first column, put the plot (g2) without legend (removed using the 'theme' code)
# put the two legends in the second column
grid.arrange(g2 + theme(legend.position = "none"),
arrangeGrob(legend_fish, legend_cat), ncol = 2)
# arrange with viewports
# define plotting regions (viewports)
grid.newpage()
vp_plot <- viewport(x = 0.25, y = 0.5,
width = 0.5, height = 1)
vp_legend <- viewport(x = 0.75, y = 0.7,
width = 0.5, height = 0.5)
vp_sublegend <- viewport(x = 0.7, y = 0.25,
width = 0.5, height = 0.3)
print(g2 + theme(legend.position = "none"), vp = vp_plot)
upViewport(0)
pushViewport(vp_legend)
grid.draw(legend_fish)
upViewport(0)
pushViewport(vp_sublegend)
grid.draw(legend_cat)
See also #mnel's answer here for replacing values in the plot object. It might be worth trying here as well. You may also check gtable methods for arranging grobs.

Resources