Error in ggplot - r

I am trying to make a ggplot. When I had shape in aesthetics, the code was working just fine. However, I need to put shape in geom_point() because I'm trying to reproduce a figure. And when I added shape to geom_point() it gave me the following error:
Aesthetics must be either length 1 or the same as the data (6): shape
I've looked for other answers here, but nothing seems to be working for me. Above I've provided an image of what my data looks like. There are 17000 entries.
Below is my code:
summarised_data <-ddply(mammals,c('mammals$chr','mammals$Species','mammals$chrMark'),
function (x) c(median_rpkm = median(x$RPKM), median = median(x$dNdS)))
ggplot(summarised_data,aes(x = summarised_data$median_rpkm, y = summarised_data$median,
color = summarised_data$`mammals$Species`)) + geom_smooth(se = FALSE, method = "lm") +
geom_point(shape = summarised_data$`mammals$chrMark`) + xlab("median RPKM") + ylab("dNdS")
"ENSG00000213221", "ENSG00000213341", "ENSG00000213380", "ENSG00000213424",
"ENSG00000213533", "ENSG00000213551", "ENSG00000213619", "ENSG00000213626",
"ENSG00000213699", "ENSG00000213782", "ENSG00000213949", "ENSG00000214013",
"ENSG00000214338", "ENSG00000214357", "ENSG00000214367", "ENSG00000214517",
"ENSG00000214814", "ENSG00000215203", "ENSG00000215305", "ENSG00000215367",
"ENSG00000215440", "ENSG00000215897", "ENSG00000221947", "ENSG00000222011",
"ENSG00000224051", "ENSG00000225830", "ENSG00000225921", "ENSG00000239305",
"ENSG00000239474", "ENSG00000239900", "ENSG00000241058", "ENSG00000242247",
"ENSG00000242612", "ENSG00000243646", "ENSG00000244038", "ENSG00000244045"),
class = "factor"), Species = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Chimp", "Gori", "Human", "Maca",
"Mouse", "Oran"), class = "factor"), labs = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Chimp-A", "Chimp-X",
"Gori-A", "Gori-X", "Human-A", "Human-X", "Maca-A", "Maca-X",
"Mouse-A", "Mouse-X", "Oran-A", "Oran-X"), class = "factor"),
chrMark = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("A", "X"), class = "factor"), chr = structure(c(27L,
27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"2", "20", "21", "22", "2a", "2A", "2b", "2B", "3", "4",
"5", "6", "7", "8", "9", "X"), class = "factor"), dN = c(3.00669,
3.27182, 7.02044, 1.01784, 3.0363, 2.32786, 4.92959, 3.03753,
3.0776, 1.02147), dS = c(3.15631, 5.87147, 3.13716, 2.05438,
4.10205, 5.24764, 4.2014, 3.18086, 5.4942, 3.02169), dNdS = c(0.9525965447,
0.5572403504, 2.2378329444, 0.4954487485, 0.7401908802, 0.4436013141,
1.1733207978, 0.954939859, 0.5601543446, 0.3380459279), RPKM = c(31.6,
13.9, 26.3, 9.02, 11.3, 137, 242, 1.05, 59.4, 10.1), Tau = c(0.7113820598,
0.8391023102, 0.3185943152, 0.6887167806, 0.9120531859, 0.6254200542,
0.7165302682, 0.7257435312, 0.2586613298, 0.6493567251),
GC3 = c(0.615502, 0.622543, 0.393064, 0.490141, 0.461592,
0.626407, 0.490305, 0.482853, 0.346424, 0.466484)), .Names = c("gene",
"Species", "labs", "chrMark", "chr", "dN", "dS", "dNdS", "RPKM",
"Tau", "GC3"), row.names = c(NA, 10L), class = "data.frame")

There are a few things wrong with your code and with how it uses ggplot's non-standard evaluation; I'd recommend reading a ggplot tutorial or the docs. Having columns within summarised_data called 'mammals$Species' and 'mammals$chrMark' is going to cause lots of problems.
If we change these to something more sensible...
# The grouping columns created by ddply() are literally named
# "mammals$Species" and "mammals$chrMark" (capital "S" in "Species").
# Name matching is case-sensitive, so the lookup strings must match exactly;
# otherwise the rename silently does nothing.
names(summarised_data)[names(summarised_data) == "mammals$Species"] <- "mammals_species"
names(summarised_data)[names(summarised_data) == "mammals$chrMark"] <- "mammals_chrMark"
We can make the ggplot code more friendly. Note that shape has to be within aes, as you're mapping it to your data.
ggplot(summarised_data, aes(x = median_rpkm, y = median)) +
geom_smooth(se = FALSE, method = "lm") +
geom_point(aes(shape = mammals_chrMark,
color = mammals_species)) +
xlab("median RPKM") + ylab("dNdS")
Hopefully this should work, or at least get you somewhere closer to an answer.

Related

Make output of two rows into columns R

I am currently working with behavioural data in R from video analyses in BORIS. Every observation is 15 seconds and during this observation I noted the subject, its behaviour but also some background information such as the date, time of day, temperature, etc. However, the program has put this background information under the column "Behaviour" (so one of the behaviours is now "date") and its output under the column "Modifier" (which now says "15-10-2020" for example).
What I want is make more columns of date, time etc (from the column "Behaviour") and put its output (from the column "Modifier") in these columns, so that every behaviour has a subject, date, time, temperature, and so forth. I have however no idea how to do this.
I thought about using the function aggregate, but this gives me lots of extra rows with mainly NA's. I also looked into the package "tibble" but can't really make that work either.
Any suggestions would be greatly appreciated!
Some example rows (from dput()):
structure(list(Subject = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 7L), .Label = c("fallow deer female", "fallow deer female + calf",
"red deer female + calf", "roe deer male", "wild boar + young",
"wild boar male", "wild boar unknown sex"), class = "factor"),
Behavior = structure(c(1L, 2L, 8L, 7L, 12L, 3L, 5L, 10L,
6L, 4L), .Label = c("auditory vigilant", "date", "day/night",
"foraging", "nr. of individuals", "running", "temperature",
"time of day", "unknown behaviour", "walking", "walking while vigilant",
"weather"), class = "factor"), Behavioral.category = structure(c(4L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 4L, 3L), .Label = c("", "Background information",
"Non-vigilant", "Vigilant"), class = "factor"), Modifiers = structure(c(1L,
4L, 21L, 27L, 35L, 36L, 32L, 1L, 1L, 1L), .Label = c("",
"0346", "0347", "07172020", "07182020", "07212020", "07242020",
"07262020", "07272020", "08032020", "08052020", "1", "12",
"1307", "1327", "1342", "1343", "1430", "1528", "16", "1604",
"17", "1744", "21", "2119", "2120", "22", "23", "25", "26",
"3", "4", "7", "Clear", "Cloudy", "Day", "Night"), class = "factor")), row.names = c(NA,
10L), class = "data.frame")
The output that I'd like to have would give as column names: Subject; Behavior; Date; Time of Day; Temperature. The modifier output would be the values of the columns "Date", "Time of Day", "Temperature". When this works, I could delete the column Modifiers (since all its values are already in assigned columns).
Split up the dataframe in actual behaviours and background information. Perform this code on the background information:
tidyr::pivot_wider(your_data, names_from = Behavior, values_from = Modifiers)
Merge the dataframes!

combine multiple elements in a list with different indexes in r

I have a list and I need to add together elements with different indexes. I'm struggling because I want to create a loop at different indexes.
data(aSAH)
rocobj <- roc(aSAH$outcome, aSAH$s100b)
dat<-coords(rocobj, "all", ret=c("threshold","sensitivity", "specificity"), as.list=TRUE)
I want to create a function where I can look at all the sensitivity/1-specificity combos at all thresholds in a new data frame. I know threshold is found in dat[1,], sensitivity is found in dat[2,] and specificity is found in dat[3,]. So I tried:
for (i in length(dat)) {
print(dat[1,i]
print(dat[2,i]/(1-dat[3,i]))
}
Where I should end up with a dataframe that has threshold and sensitivity/1-specificity.
DATA
dput(head(aSAH))
structure(list(gos6 = structure(c(5L, 5L, 5L, 5L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5"), class = c("ordered", "factor")), outcome = structure(c(1L,
1L, 1L, 1L, 2L, 2L), .Label = c("Good", "Poor"), class = "factor"),
gender = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("Male",
"Female"), class = "factor"), age = c(42L, 37L, 42L, 27L,
42L, 48L), wfns = structure(c(1L, 1L, 1L, 1L, 3L, 2L), .Label = c("1",
"2", "3", "4", "5"), class = c("ordered", "factor")), s100b = c(0.13,
0.14, 0.1, 0.04, 0.13, 0.1), ndka = c(3.01, 8.54, 8.09, 10.42,
17.4, 12.75)), .Names = c("gos6", "outcome", "gender", "age",
"wfns", "s100b", "ndka"), row.names = 29:34, class = "data.frame")
EDIT
One answer:
dat_transform <- as.data.frame(t(dat))
dat_transform <- dat_transform %>% mutate(new=sensitivity/(1-specificity))
You can use :
transform(t, res = sensitivity/(1-specificity))[c(1, 4)]
Or with dplyr :
library(dplyr)
t %>%
mutate(res = sensitivity/(1-specificity)) %>%
select(threshold, res)
Also note that t is a built-in R function that transposes a matrix or data frame, so it is better to use some other variable name for the data frame.

Issues with displaying data points on every frame of facet_wrap/facet_grid object

I'm trying to produce a plot with either facet_wrap or facet_grid (no preference at this time), but display a selection of data points on every frame within the facet_wrap/facet_grid object.
I read that you can simply remove the facetting variable from the data set you want included on every plot, but for whatever reason this doesn't seem to be working for me.
This is on Rstudio Version 1.1.453.
I found this code sample:
ggplot(mpg, aes(displ, hwy)) +
geom_point(data = transform(mpg, class = NULL), colour = "grey85") +
geom_point() +
facet_wrap(~class)
And pretty much copied it for my code below. The above code works fine, but for whatever reason in my implementation it returns an error message. Note I've tried setting both geom features to geom_point also with no luck.
ggplot(data = Total, aes(Total$Time, Total$Killing)) +
geom_jitter(data = transform(Total, Run = NULL), colour = "grey85") +
geom_point() +
facet_wrap(~Run)
Error: Aesthetics must be either length 1 or the same as the data (2700): x, y
This is the error message I've been encountering on attempting to run this code.
Ultimately my goal is to run the below code, but I simplified it a bit for the purposes of the question above.
ggplot(data = filter(Total, Cell_Line != "stDev"), aes(x= Time, y=Killing)) +
geom_line(data = filter(select(Total, -Run), Cell_Line == "Wild_Type"), aes(x = Time, y = filter(Total, Cell_Line == "Wild_Type")[,3])) +
geom_errorbar(aes(x = filter(Total, Cell_Line == "Wild_Type")[,2], ymax = filter(Total, Cell_Line == "Wild_Type")[,3] + filter(Total, Cell_Line == "stDev")[,3], ymin = filter(Total, Cell_Line == "Wild_Type")[,3] - filter(Total, Cell_Line == "stDev")[,3])) +
geom_point() +
facet_wrap(~Run)
And here's the result of dput(Total) trimmed down to the first 30 rows:
structure(list(Cell_Line = structure(c(5L, 12L, 13L, 1L, 2L,
3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 15L, 14L, 5L, 12L, 13L, 1L,
2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 15L, 14L), .Label = c("17",
"19", "20", "29", "3", "33", "38", "47", "49", "53", "55", "7",
"8", "stDev", "Wild_Type"), class = "factor"), Time = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("00",
"02", "04", "08", "12", "18", "24", "32", "40", "48", "56", "64",
"72", "80"), class = "factor"), Killing = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0704388, 0.2881066, -0.0132908,
0.04700991, 0.03049371, -0.02243472, 0.1513817, 0.129636, 0.09328508,
0.05876777, 0.1063291, 0.0357473, 0.1974026, 0.07732854, 0.07383331
)), row.names = c(NA, 30L), class = "data.frame")
Your call to transform has an error: you don't have a column named Run.
set.seed(1)
Total$Run <- sample(1:100, 30)
# this is your own code:
ggplot(data = Total, aes(Total$Time, Total$Killing)) +
geom_jitter(data = transform(Total, Run = NULL), colour = "grey85") +
geom_point() +
facet_wrap(~Run)
Which produces this plot:

How can I add error bars to a ggplot object

I have two data sets like below
df1<- structure(list(time = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L), .Label = c("24", "48", "72"), class = "factor"), place = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("B,C", "D,E", "F,G"
), class = "factor"), key = c("boy1", "boy2", "boy3", "boy1",
"boy2", "boy3", "boy1", "boy2", "boy3"), value = c(177.72258835,
0, 74.438539625, 134.3410045, 48915.1, 38.302204425, 97.32286187,
25865.25, 28.67291878), x = c("1", "2", "3", "1", "2", "3", "1",
"2", "3"), y = c(177.72258835, 0, 74.438539625, 134.3410045,
48915.1, 38.302204425, 97.32286187, 25865.25, 28.67291878)), .Names = c("time",
"place", "key", "value", "x", "y"), row.names = c(NA, -9L), class = "data.frame")
df2<- structure(list(time = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L), .Label = c("24", "48", "72"), class = "factor"), place = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("B,C", "D,E", "F,G"
), class = "factor"), key = c("boy1", "boy2", "boy3", "boy1",
"boy2", "boy3", "boy1", "boy2", "boy3"), value = c(58.852340736,
0, 21.291893740908, 42.92051958201, 72521.52726, 16.309811239722,
32.403556124268, 38347.81965, 10.342042262244), x = c("1", "2",
"3", "1", "2", "3", "1", "2", "3"), y = c(58.852340736, 0, 21.291893740908,
42.92051958201, 72521.52726, 16.309811239722, 32.403556124268,
38347.81965, 10.342042262244)), .Names = c("time", "place", "key",
"value", "x", "y"), row.names = c(NA, -9L), class = "data.frame")
I want to plot them together with df2 as the standard deviation for df1
when I plot df1, I do the following
library(ggplot2)
ggplot(df1, aes(x, y, col = key)) +
geom_point() +
scale_x_discrete(labels=c("first", "second", "third"), limits = c(1, 2,3)) +
facet_grid(time ~ .)
but now I want to have the second df as the standard deviation (i.e., the first y-value in df1 is 177.72259, so it's standard deviation is the corresponding y-value in df2, which is 58.85234).
If I understand your question correctly, it sounds like you want to include error bars in your plot. This can be accomplished using only a single data frame, if you just add the standard deviation as an additional variable like so:
df <- structure(list(time = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L),
.Label = c("24", "48", "72"), class = "factor"), place = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L), .Label = c("B,C", "D,E", "F,G"), class = "factor"),
key = c("boy1", "boy2", "boy3", "boy1", "boy2", "boy3", "boy1", "boy2", "boy3"),
value = c(58.852340736, 0, 21.291893740908, 42.92051958201, 72521.52726,
16.309811239722, 32.403556124268, 38347.81965, 10.342042262244),
x = c("1", "2", "3", "1", "2", "3", "1", "2", "3"), y = c(177.72258835, 0,
74.438539625, 134.3410045, 48915.1, 38.302204425, 97.32286187, 25865.25, 28.67291878),
sd = c(58.852340736, 0, 21.291893740908, 42.92051958201, 72521.52726, 16.309811239722,
32.403556124268,38347.81965, 10.342042262244)), .Names = c("time", "place", "key",
"value", "x", "y", "sd"), row.names = c(NA, -9L), class = "data.frame")
Then you can add error bars to the plot using geom_errorbar(), as follows (I am borrowing the "free_y" scale trick from @jazzurro's answer):
# Plot every observation as a point with an error bar spanning y - sd to
# y + sd, with one facet row per `time`.
ggplot(df, aes(x, y, col = key)) +
  geom_point() +
  geom_errorbar(aes(ymin = y - sd, ymax = y + sd)) +
  # `x` is stored as character ("1", "2", "3"), so the limits of the discrete
  # scale must be strings as well; numeric limits are rejected by current
  # ggplot2 releases.
  scale_x_discrete(
    labels = c("first", "second", "third"),
    limits = c("1", "2", "3")
  ) +
  # A plot has a single facet specification (a later one replaces an earlier
  # one), so only the free-y version is kept. The argument is spelled out as
  # `scales` instead of relying on partial matching of `scale`.
  facet_grid(time ~ ., scales = "free_y")
Unfortunately your data is a little skewed, in that some measurements are way larger in magnitude than others (especially at time=48 and time=72); you may want to consider a log transformation so that the error bars for the smaller observations do not appear so negligible.
Here is one way for you. I changed the shape of the sd in the second geom_point(). Since the y-scale has a wide range for two of the plots, you see points overlapping.
# Draw the df1 values as regular points and overlay the df2 values as
# translucent squares (shape 22), with one facet row per `time`.
ggplot() +
  geom_point(data = df1, aes(x, y, col = key)) +
  geom_point(data = df2, aes(x, y, col = key), shape = 22, alpha = 0.3) +
  # `x` is character in both data frames, so the discrete limits are given
  # as strings; numeric limits are rejected by current ggplot2 releases.
  scale_x_discrete(
    labels = c("first", "second", "third"),
    limits = c("1", "2", "3")
  ) +
  # `scales` is spelled out instead of relying on partial matching of `scale`.
  facet_grid(time ~ ., scales = "free_y")

Stacked Area Graph Using R and ggplot2 Has Holes

I'm trying to create a stacked area graph with r and ggplot2. I'd like it to look
like this, but instead the areas overlap and have holes. I'm trying to ensure that the areas are stacked so that the area with the largest value in the most recent month (2016-05 in this case) is on the bottom.
Related posts like this one seem to have holes in the data, which doesn't seem to be the issue here.
Here's sample code to recreate the issue:
sample.data <- structure(
list(
rank = structure(
c(34L, 34L, 34L, 35L, 35L, 35L, 34L, 34L, 34L, 34L, 35L, 35L, 35L, 35L, 35L, 34L),
.Label = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35"),
class = "factor"),
vendor = structure(
c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L),
.Label = c("34", "35"),
class = "factor"),
year.month = c("2015-12", "2016-01", "2015-11", "2015-12", "2016-01", "2015-10", "2016-03", "2016-02", "2015-10", "2016-04", "2015-11", "2016-05", "2016-04", "2016-03", "2016-02", "2016-05"),
value = c(431616L, 272224L, 229288L, 195284L, 155168L, 154194L, 149784L, 137302L, 126612L, 117408L, 94141L, 56161L, 54606L, 53173L, 49898L, 45348L)),
.Names = c("rank", "vendor", "year.month", "value"),
row.names = c(6L, 8L, 4L, 5L, 7L, 1L, 12L, 10L, 2L, 14L, 3L, 15L, 13L, 11L, 9L, 16L),
class = "data.frame"
)
ggplot(data = sample.data, aes(x = year.month, y = value, group = vendor, color = vendor, reorder(-value), fill=vendor)) +
geom_area()
Thanks in advance for your help.
Try: + geom_area(position="dodge",stat="identity")
The following works:
ggplot(data = sample.data[order(sample.data$vendor),],
aes(x = year.month, y = value, group = vendor, color = vendor,
reorder(-value), fill=vendor)) + geom_area()
You just had to order your data: sample.data[order(sample.data$vendor),].
If you want to change the order of the graph, you have to "relevel" the vendor variable which is stored as a factor:
sample.data$vendor <- relevel(sample.data$vendor, ref="35")
Here is some code to figure out what vendor to set as the base level according to your criterion:
with(sample.data, sample.data[year.month=="2016-05",
"vendor"][which.max(sample.data[year.month=="2016-05", "value"])])

Resources