I am trying to produce a geom_violin() plot overlaid with a geom_point() plot, in which the points of the geom_point() layer have different colors based on which subset I have categorized the data into.
I have an error saying "Error in eval(expr, envir, enclos) : object 'ind' not found" when attempting to load the subset dataframe when I do it within the geom_point() function, but I don't understand what I am doing wrong from poking around or googling the error.
(Without that row, the code runs and generates this output, which is what I want other than the color coding of the points: PDF output when the second geom_point is commented out)
Here is the nonsense dataset I used to try and make this work (gene1,2,3 are rownames). I will transpose it in the code below:
,cell_1,cell_2,cell_3,cell_4,cell_5,cell_6,cell_7,cell_8,cell_9,cell_10,cell_11,cell_12,cell_13,cell_14,cell_15,cell_16,cell_17,cell_18,cell_19,cell_20,cell_21,cell_22,cell_23,cell_24,cell_25,cell_26,cell_27,cell_28,cell_29,cell_30,cell_31,cell_32,cell_33,cell_34,cell_35,cell_36,cell_37,cell_38,cell_39,cell_40,cell_41,cell_42,cell_43,cell_44,cell_45,cell_46,cell_47,cell_48,cell_49,cell_50
gene1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19230,0.0,0.0,0.0,0.19230,0.0,0.0,0.0,69.3915,0.0,0.0,74.123,0,0,0,0,0,13.01,0.0,0.0,0.0,0.0,0.0,0.9231,73.023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gene2,0.279204,23.456,13.1,10.5,0.0,14.2,151,2,50.3201,0.0,0.0,128.0,0.0,0.0,0.0,9.74082,20.9432,0.0,0.0,300.023,20.0234,0.0,0.0,300.024,123,201.345,164.681,301.421,173.023,216.537,201.234,302.102,199.234,20.234,40.234,180.0234,0.0,23.234,190.134,170.023,0.0,8.023,40.234,180.0234,0.0,23.234,190.134,170.023,21.24,8.023
gene3,25.9954,77.3398,45.3092,107.508,0.266139,70.4924,114.17,291.324,198.525,190.353,185.381,0.14223,90.323,20.4332,29.012,500.391,2.51459,300.021,60.001,192.023,60.0234,300.022,60.002,192.024,34,500.392,2.51460,300.022,60.002,192.024,60.0235,300.023,60.003,192.025,60.002,192.024,34,500.392,2.51460,300.022,60.002,192.024,60.0235,300.023,60.003,192.025,35,194.231,94.13,32.124
gene4,46.1717,194.241,0.776565,3.0325,0.762981,2.3123,14.507,13.0234,0.538315,0.0,1.5234,11.2341,0.0,1.34819,6.0142,3.2341,4.4444,150.324,0.0,20.9432,134.023,150.325,0.0,20.9433,3.2341,4.4444,150.324,0.0,20.9432,134.023,170.13408,0.0,3.2341,4.4444,150.324,0.0,3.2341,6.7023,150.324,0.0,3.2341,4.4444,170.341,0.0,20.9432,134.023,150.325,0.0,50.234,3.123
gene5,94.2341,301.234,0.0,0.0,123.371,0.0,0.0,155.234,0.0,0.664744,0.0,402.616,222.148,0.0,0.0,0.0,169.234,0.0,10.234,0.0,0.0,0.0,0.99234,0.0,0.99234,0.0,0.0,0.0,0.99234,0.0,0.99234,0.0,0.0,0.0,0.99234,0.0,10.324,0.0,0.0,15.0234,43.1243,0.0,320.023,0.0,0.0,0.0,1.234,0.0,12.123,0.0
Here's the code I wrote:
# Script: draw one violin per gene from log-transformed expression values,
# overlay jittered points, and attempt to overlay the gene1-positive cells in red.
#Load dataset
df_raw <- read.table("pretend_dataset.csv",
sep=",",
header=TRUE)
#Make gene names into rownames
# NOTE(review): the sample CSV's header row starts with an empty field, so the
# gene-name column may not actually be called `Name` after reading -- confirm.
rownames(df_raw) <- df_raw$Name
#Remove "Name" column
df_raw$Name <- NULL
#TRANSPOSE DATASET (t() returns a matrix: cells become rows, genes become columns)
matrix_transp <- t(df_raw)
#Make matrix_transp matrix into dataframe
df <- as.data.frame(as.matrix(matrix_transp))
#Subset gene1-positive cells (rows where gene1 > 0)
df.positive <- subset(df, gene1 > 0)
#Convert data in data frames to log scale (log(x + 1), so zeros stay at 0)
df.log <- log(df+1)
df.positive.log <- log(df.positive+1)
#Violin plot for each gene with all cells (positive and negative with color coded scatter)
# stack() reshapes the wide data frame into two columns, `values` and `ind`,
# which are the names the aes() mappings below refer to.
plot <- ggplot(stack(df.log), aes(x = ind, y = values, fill=ind)) +
geom_violin() +
geom_point(position = position_jitterdodge(jitter.width=4)) +
# NOTE(review): df.positive.log is still in wide format (one column per gene)
# and was never passed through stack(), so it has no `ind`/`values` columns;
# this layer is presumably the source of "object 'ind' not found".
geom_point(data=df.positive.log, aes(x = ind, y = values, fill=ind), position = position_jitterdodge(jitter.width=4), color="red") +
xlab("Gene") + ylab("Expression level (TPM log)") +
theme_classic(base_size = 14, base_family = "Helvetica") +
theme(axis.text.y=element_text(size=14)) +
theme(axis.title.y=element_text(size=14, face="bold")) +
theme(axis.text.x=element_text(size=14)) +
theme(axis.title.x=element_text(size=14, face="bold")) +
scale_fill_brewer(palette="Pastel1")
# Draw the plot, zooming the y axis to [0, 8] without dropping data.
plot + coord_cartesian(ylim = c(0, 8))
Update:
This question was asked due to a fundamental misunderstanding regarding how data needs to be formatted to efficiently plot it in R.
The data needs to be reformatted into a long instead of a wide format, which can be done e.g. with gather as suggested below, but also with other methods listed in this question: Reshaping multiple sets of measurement columns (wide format) into single columns (long format)
The below answer overlays a coloured violin plot with a jittered set of points that are coloured by positive or negative.
library(dplyr); library(ggplot2); library(tidyr)
# Read in the data. `df` is expected to hold the CSV text from the question as
# a character string (not a data frame). Passing it via `text =` lets
# read.csv() create and close the text connection itself, instead of leaving
# an open textConnection() behind. Skip this step if the data frame already
# exists.
df2 <- read.csv(text = df, header = TRUE, row.names = 1)
# Copy the gene names from the row names into a column, reshape to long format
# (one row per gene/cell pair), then add the columns used for plotting:
#   positive - TRUE where the value is > 0
#   absolute - absolute value
#   logabs   - log(absolute + 1)
df3 <- df2 %>%
  mutate(Gene = rownames(.)) %>%
  gather(key = "cell", value = "value", -Gene) %>%
  mutate(
    positive = value > 0,
    absolute = abs(value),
    logabs = log(absolute + 1)
  )
# One violin per gene (filled by gene), overlaid with jittered points coloured
# by whether the value is positive.
df3 %>%
  ggplot(aes(x = Gene, y = logabs, fill = Gene)) +
  geom_violin() +
  geom_jitter(aes(colour = positive))
Is this what you were looking for?
EDIT: The "read in data" line takes the data you presented above, pasted into a text string, and converts that text string to a data frame. If you already have the data frame, it isn't necessary. It is only used because there was no dput() object available to use.
EDIT 2:
This extended answer results from comments to the previous answer. The solution uses a transposed matrix of the data shown in the question. The resulting plot has violin plots, coloured by gene overlaid with points coloured by whether that observation is negative in gene1.
The exact data set is shown below and is the result of calling the dput() command on the matrix.
# Example data as produced by dput(): a 50 x 4 numeric matrix with one row per
# cell (cell_1 .. cell_50) and one column per gene (gene1 .. gene4). The values
# are listed column by column, i.e. all of gene1 first, then gene2, and so on.
df <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.1923, 0, 0, 0, 0.1923, 0, 0, 0, 69.3915, 0, 0, 74.123, 0, 0,
0, 0, 0, 13.01, 0, 0, 0, 0, 0, 0.9231, 73.023, 0, 0, 0, 0, 0,
0, 0, 0, 0.279204, 23.456, 13.1, 10.5, 0, 14.2, 151, 2, 50.3201,
0, 0, 128, 0, 0, 0, 9.74082, 20.9432, 0, 0, 300.023, 20.0234,
0, 0, 300.024, 123, 201.345, 164.681, 301.421, 173.023, 216.537,
201.234, 302.102, 199.234, 20.234, 40.234, 180.0234, 0, 23.234,
190.134, 170.023, 0, 8.023, 40.234, 180.0234, 0, 23.234, 190.134,
170.023, 21.24, 8.023, 25.9954, 77.3398, 45.3092, 107.508, 0.266139,
70.4924, 114.17, 291.324, 198.525, 190.353, 185.381, 0.14223,
90.323, 20.4332, 29.012, 500.391, 2.51459, 300.021, 60.001, 192.023,
60.0234, 300.022, 60.002, 192.024, 34, 500.392, 2.5146, 300.022,
60.002, 192.024, 60.0235, 300.023, 60.003, 192.025, 60.002, 192.024,
34, 500.392, 2.5146, 300.022, 60.002, 192.024, 60.0235, 300.023,
60.003, 192.025, 35, 194.231, 94.13, 32.124, 46.1717, 194.241,
0.776565, 3.0325, 0.762981, 2.3123, 14.507, 13.0234, 0.538315,
0, 1.5234, 11.2341, 0, 1.34819, 6.0142, 3.2341, 4.4444, 150.324,
0, 20.9432, 134.023, 150.325, 0, 20.9433, 3.2341, 4.4444, 150.324,
0, 20.9432, 134.023, 170.13408, 0, 3.2341, 4.4444, 150.324, 0,
3.2341, 6.7023, 150.324, 0, 3.2341, 4.4444, 170.341, 0, 20.9432,
134.023, 150.325, 0, 50.234, 3.123), .Dim = c(50L, 4L), .Dimnames = list(
c("cell_1", "cell_2", "cell_3", "cell_4", "cell_5", "cell_6",
"cell_7", "cell_8", "cell_9", "cell_10", "cell_11", "cell_12",
"cell_13", "cell_14", "cell_15", "cell_16", "cell_17", "cell_18",
"cell_19", "cell_20", "cell_21", "cell_22", "cell_23", "cell_24",
"cell_25", "cell_26", "cell_27", "cell_28", "cell_29", "cell_30",
"cell_31", "cell_32", "cell_33", "cell_34", "cell_35", "cell_36",
"cell_37", "cell_38", "cell_39", "cell_40", "cell_41", "cell_42",
"cell_43", "cell_44", "cell_45", "cell_46", "cell_47", "cell_48",
"cell_49", "cell_50"), c("gene1", "gene2", "gene3", "gene4"
)))
The code required to turn the above data set into the plot requested is shown below.
# Convert the cell-by-gene matrix to a data frame, record each cell's name and
# whether its gene1 value is positive, reshape to long format (one row per
# cell/gene pair), then add the absolute value and its log(x + 1) transform.
df2 <- as.data.frame(df) %>%
  mutate(Cell = rownames(.), positive = gene1 > 0) %>%
  gather(key = "Gene", value = "value", -Cell, -positive) %>%
  mutate(absolute = abs(value), logabs = log(absolute + 1))
# One violin per gene (filled by gene), overlaid with jittered points coloured
# by the cell's gene1 status.
ggplot(df2, aes(x = Gene, y = logabs, fill = Gene)) +
  geom_violin() +
  geom_jitter(aes(colour = positive))
As the plot might be difficult to interpret, here are two additional methods of displaying the status relative to gene1.
# Alternative 1: box plots per gene, filled by the cell's gene1 status.
ggplot(df2, aes(x = Gene, y = logabs, fill = positive)) +
  geom_boxplot()
# Alternative 2: violins per gene, filled by the cell's gene1 status.
ggplot(df2, aes(x = Gene, y = logabs, fill = positive)) +
  geom_violin()
Sample data:
# Example data as produced by dput(): 33 rows and three numeric columns --
# has.di.rec.pp (all zeros in this sample), m.dist.km2 and treated (0/1).
# The object carries the classes "data.table" and "data.frame".
pp.inc <- structure(list(has.di.rec.pp = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), m.dist.km2 = c(-34.4150009155273, 6.80600023269653, -6.55499982833862,
-61.7700004577637, 15.6840000152588, -11.2869997024536, -26.9729995727539,
0, 81.9940032958984, -35.1459999084473, -12.5179996490479, 0,
21.5919990539551, 81.9940032958984, -20.7770004272461, 85.9469985961914,
-15.2959995269775, -75.5879974365234, 81.9940032958984, 3.04999995231628,
-17.1490001678467, -25.806999206543, -16.0060005187988, -14.91100025177,
-12.9020004272461, -16.0060005187988, 5.44000005722046, -34.4150009155273,
81.9940032958984, 3.61400008201599, 13.7379999160767, 2.71300005912781,
4.31300020217896), treated = c(0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
1, 1)), .Names = c("has.di.rec.pp", "m.dist.km2", "treated"), row.names = c(NA,
-33L), class = c("data.table", "data.frame"))
Code:
library(data.table)
library(ggplot2)
# rddplot: bin the running variable within [-span, span], average the outcome
# per bin, and plot the bin averages together with a smoother fitted to the
# unbinned rows.
#
# Arguments:
#   data      - table holding the columns referred to below
#   outcome   - outcome column, passed as an unquoted name by the call below
#   runvar    - running-variable column, passed as an unquoted name
#   treatment - treatment-indicator column (defaults to the symbol `treated`)
#   span      - half-width of the window kept around zero
#   bw        - bin width
#   ...       - forwarded to stat_smooth()
#
# NOTE(review): `outcome`, `runvar` and `treatment` are ordinary function
# arguments here, so the unquoted column names supplied by the caller are
# looked up as variables rather than as columns of `data`; this is presumably
# what triggers the "object ... not found" error described in the question.
rddplot <- function(data, outcome, runvar, treatment = treated, span, bw, ...){
data <- data.table(data)
# Keep only rows whose running variable lies within +/- span.
data.span <- data[abs(runvar) <= span, ]
# Assign each row to a left-closed bin of width bw.
data.span <- data.span[ , bins := cut(runvar,
seq(-span, span, by = bw),
include.lowest = TRUE, right = FALSE)]
# One summary row per bin: mean outcome, mean running variable, maximum of the
# treatment indicator, and number of rows in the bin.
data.span.plot <- data.span[ , list(avg.outcome = mean(outcome),
avg.runvar = mean(runvar),
treated = max(treatment),
n.iid = length(outcome)), keyby = bins]
# Store the left edge of each bin as `runvar`.
# NOTE(review): this assumes the summary has exactly one row per break
# interval (i.e. no empty bins) -- confirm.
data.span.plot <- data.span.plot[ , runvar := head(seq(-span, span, by = bw), -1)]
bp <- ggplot(data = data.span.plot, aes(x = runvar, y = avg.outcome))
bp <- bp + geom_point(aes(colour = n.iid))
# The smoother is fitted on the unbinned rows, separately per treatment group.
bp <- bp + stat_smooth(data = data.span, aes(x = runvar, y = outcome,
group = factor(treatment)), ...)
# A bare expression inside a function body is not auto-printed; the plot is
# handed back to the caller by return() below.
bp
return(bp)
}
# Example call with unquoted column names, span = 50 and bin width = 5.
rddplot(pp.inc, has.di.rec.pp, m.dist.km2, treated, 50, 5)
This code runs perfectly if I do not wrap it in a function. I am a novice in R, only using it very infrequently. What am I doing wrong? Am I missing something obvious or is it to do with data.table or ggplot2? I thought it might be something with ggplot, as other questions mention there is an issue and aes_string should be used. I can rewrite the data.table parts to use base functions. But I think the error already occurs before that, on the second line. How do I make this work?
EDIT:
[Original title:
R function returns Error in eval(expr, envir, enclos) : object 'name' not found]
I had some time to look at this again and have worked out a solution, hence I also modified the title a bit. Using eval() didn't really work out for me, so I went the [['columname']] selection route. I've ditched data.table (and plyr as well), so that this only uses base functions except for ggplot2. I am happy for any comments on how to improve it. Please let me know if there are some essential flaws. If not I will add an answer with my solution later.
I have changed the bin calculation so that there is always a breakpoint at zero, which is necessary. Default binwidth is determined by the Silverman rule. I am thinking of calculating model fit separately and returning it, as the model choice within ggplot is limited, however I can't think of a nice way to incorporate this for a variety of diverse models such as lm or loess, and it's not strictly necessary. I actually wanted to overlay a thin bar plot displaying the number of observations in each bin, but found out this is impossible in ggplot (I know this generally is a bad idea, but there are several well-published papers which use similar graphs). I don't find the size aesthetic too appealing here, but these are really minor gripes.
Thanks for getting me on the right path.
My solution:
#' Binned regression-discontinuity plot (base R + ggplot2)
#'
#' Bins the running variable symmetrically around zero (zero is always a
#' breakpoint), summarises the outcome per bin, prints the per-bin summary
#' table, and draws the bin averages (point size = number of rows in the bin)
#' together with a smoother fitted to the unbinned rows, grouped by treatment.
#'
#' @param data A data frame containing the three columns named below.
#' @param outcome Name of the outcome column, as a character string.
#' @param runvar Name of the running-variable column, as a character string.
#' @param treatment Name of the treatment-indicator column, as a character
#'   string. Defaults to "treated". (The previous default was the bare symbol
#'   `treated`, which cannot be used with `data[[treatment]]`.)
#' @param span Half-width of the window kept around zero.
#' @param bw Bin width. Defaults to `bw.nrd0()` of the running variable.
#' @param ... Further arguments forwarded to `stat_smooth()`
#'   (e.g. `method`, `formula`).
#' @return The ggplot object, invisibly; the plot is also printed.
rddplot <- function(data, outcome, runvar, treatment = "treated",
                    span, bw = bw.nrd0(data[[runvar]]), ...) {
  cols <- c(outcome, runvar, treatment)
  if (!is.character(cols) || length(cols) != 3L) {
    stop("`outcome`, `runvar` and `treatment` must each be one column name ",
         "given as a character string", call. = FALSE)
  }
  absent <- setdiff(cols, names(data))
  if (length(absent) > 0L) {
    stop("column(s) not found in `data`: ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # Mirror the positive bin edges so that zero is always a breakpoint.
  edges <- seq(0, span, by = bw)
  breaks <- c(sort(-edges[-1]), edges)
  # With fewer than three breaks there is no bin on each side of zero, and
  # cut() would interpret a single number as a count of intervals.
  if (length(breaks) < 3L) {
    stop("`span` must be at least one bin width (`bw`)", call. = FALSE)
  }

  # which() drops rows whose running variable is NA instead of turning them
  # into all-NA rows.
  keep <- which(abs(data[[runvar]]) <= max(breaks))
  data.span <- data[keep, ]
  data.span$bins <- cut(data.span[[runvar]], breaks,
                        include.lowest = TRUE, right = FALSE)

  # Apply `fun` to one column within each bin; empty bins yield NA.
  per_bin <- function(column, fun) {
    as.vector(tapply(data.span[[column]], data.span$bins, fun))
  }
  data.span.plot <- data.frame(
    avg.outcome = per_bin(outcome, mean),
    avg.runvar = per_bin(runvar, mean),
    treated = per_bin(treatment, max),
    n.iid = per_bin(outcome, length),
    n.rec = per_bin(outcome, sum),
    runvar = head(breaks, -1), # left edge of each bin
    row.names = levels(data.span$bins)
  )
  print(data.span.plot)

  bp <- ggplot(data = data.span.plot, aes(x = runvar, y = avg.outcome))
  bp <- bp + geom_point(aes(size = n.iid))
  # aes_string() maps columns by their character names; the names must be
  # syntactically valid R names because they are parsed as expressions.
  bp <- bp + stat_smooth(data = data.span,
                         aes_string(x = runvar, y = outcome,
                                    group = treatment), ...)
  print(bp)
  invisible(bp)
}
Call:
# Plot has.di.rec.pp against m.dist.km2 within +/- 50, using the default bin
# width; `method` and `formula` are passed through to the smoother, which
# fits a raw quartic polynomial with lm.
rddplot(
  data = pp.inc,
  outcome = "has.di.rec.pp",
  runvar = "m.dist.km2",
  treatment = "treated",
  span = 50,
  method = lm,
  formula = y ~ poly(x, 4, raw = TRUE)
)
I have an approach using data.table and some deparse(substitute()) and setnames trickery....
#' Binned regression-discontinuity plot (data.table + ggplot2)
#'
#' Column arguments are given as unquoted names. They are captured with
#' deparse(substitute()) and the matching columns are renamed to fixed internal
#' names, so the data.table and ggplot2 expressions below can refer to them
#' directly.
#'
#' @param data A data.frame or data.table.
#' @param outcome Unquoted name of the outcome column.
#' @param runvar Unquoted name of the running-variable column.
#' @param treatment Unquoted name of the treatment-indicator column
#'   (defaults to `treated`).
#' @param span Half-width of the window kept around zero.
#' @param bw Bin width.
#' @param ... Further arguments forwarded to `stat_smooth()`.
#' @return The ggplot object.
rddplot <- function(data, outcome, runvar, treatment = treated, span, bw, ...){
  # get the column names as defined in the call to rddplot
  outname <- deparse(substitute(outcome))
  runname <- deparse(substitute(runvar))
  treatname <- deparse(substitute(treatment))
  wanted <- c(outname, runname, treatname)

  # convert to data.table; the renaming below then works on this new table
  # rather than on the caller's object
  data <- data.table(data)
  absent <- setdiff(wanted, names(data))
  if (length(absent) > 0L) {
    stop("column(s) not found in `data`: ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  # keep only the three columns that are used, so that renaming cannot collide
  # with an unrelated column already called 'outcome', 'runvar' or 'treatment'
  data <- data[, wanted, with = FALSE]
  # rename these columns with the argument names
  setnames(data, old = wanted, new = c('outcome', 'runvar', 'treatment'))

  # breaks mirrored around zero, as defined in the second example
  edges <- seq(0, span, by = bw)
  breaks <- c(sort(-edges[-1]), edges)
  # with fewer than three breaks there is no bin on each side of zero, and
  # cut() would interpret a single number as a count of intervals
  if (length(breaks) < 3L) {
    stop("`span` must be at least one bin width (`bw`)", call. = FALSE)
  }

  # keep rows inside the outermost breaks; filtering on `span` itself would
  # leave rows beyond the last break with an NA bin whenever `span` is not a
  # multiple of `bw`
  data.span <- data[abs(runvar) <= max(breaks), ]
  data.span <- data.span[ , bins := cut(runvar,
                                        breaks,
                                        include.lowest = TRUE, right = FALSE)]
  # one summary row per non-empty bin
  data.span.plot <- data.span[ , list(avg.outcome = mean(outcome),
                                      avg.runvar = mean(runvar),
                                      treated = max(treatment),
                                      n.iid = length(outcome)), keyby = bins]
  # note I've removed trying to add `runvar` column to data.span.plot....)
  bp <- ggplot(data = data.span.plot, aes(x = avg.runvar, y = avg.outcome))
  bp <- bp + geom_point(aes(colour = n.iid))
  # smoother fitted on the unbinned rows, separately per treatment group
  bp <- bp + stat_smooth(data = data.span, aes(x = runvar, y = outcome,
                                               group = treatment), ...)
  bp
}
rddplot(pp.inc, has.di.rec.pp, m.dist.km2, treated, 50, 5)
Note that if you didn't convert to data.table within the function, and assumed the data argument was a data.table, then you could use on.exit() to revert the names changed by reference.