Passing a String Via Index in R - r

I'm trying to use a for loop to pull subsets of data out of a dataframe with R.
I have a little vector to hold all the possible occurences of the names in that column
meter_class<-c("one_s_120","nine_s_120", "nine_s_480","fortyfive_s_120", "fortyfive_s_480")
Whenever I try to address it by index reference, it fails. Either nothing in the data subset survives (NULLs everywhere), or R complains about not passing the right argument by using meter_class[1]
attach(meter_class[1])
Error in attach(meter_class[1]) : file 'one_s_120' not found
subset(cal, cal$Form==as.character(meter_class[1]))
[1] Test Amps Type Accuracy Voltage Form
<0 rows> (or 0-length row.names)
Also, here's the output of dput on the datafram cal:
structure(list(Test = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Burst", "ESD", "Inspection",
"Surge"), class = "factor"), Amps = c(15, 15, 1.5, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 15, 15,
1.5, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 15, 15, 1.5, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 15, 15, 1.5, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25), Type = structure(c(2L,
1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L,
3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L,
1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L), .Label = c("Lag - 0.5",
"Unity - Full", "Unity - Light"), class = "factor"), Accuracy = c(-0.011,
0.012, 0.027, 0.033, 0.076, 0.006, 0.052, 0.046, -0.016, 0.021,
0.008, 0.023, 0.034, 0.036, 0.038, 0.002, 0.012, 0.097, 0.055,
0.093, 0.033, 0.068, 0.048, -0.016, 0.042, 0.03, 0.035, 0.041,
0.024, 0.027, 0.004, -0.012, 0.002, 0.038, 0.084, 0.015, 0.049,
0.045, -0.009, 0.025, 0.002, 0.029, 0.03, 0.032, 0.064, 0.011,
0.024, 0.033, 0.054, 0.085, 0.027, 0.071, 0.059, 0.01, 0.051,
0.012, 0.051, 0.048, 0.04, 0.051), Voltage = c(120, 120, 120,
120, 120, 120, 480, 480, 480, 120, 120, 120, 480, 480, 480, 120,
120, 120, 120, 120, 120, 480, 480, 480, 120, 120, 120, 480, 480,
480, 120, 120, 120, 120, 120, 120, 480, 480, 480, 120, 120, 120,
480, 480, 480, 120, 120, 120, 120, 120, 120, 480, 480, 480, 120,
120, 120, 480, 480, 480), Form = structure(c(3L, 3L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("45S", "9S", "i210plus"
), class = "factor")), .Names = c("Test", "Amps", "Type", "Accuracy",
"Voltage", "Form"), class = "data.frame", row.names = c(NA, -60L
))
I know this is a simple thing to do if you know how to do it...Can anyone light the way?
Thanks!

It seems that none of the values of "meter_class" are represented in "Form" in your data frame.
unique(df$Form)
# [1] i210plus 9S 45S
meter_class %in% unique(df$Form)
# [1] FALSE FALSE FALSE FALSE FALSE
Just try two forms of subsetting, using values of "Form" actually present in the data:
subset(df, Form == "9S")
df[df$Form == "9S", ]
I also note that you wish to "pull subsets of data out of a dataframe". Not knowing the full story and your objectives of doing so, but please note that there are loads of functions that allow you to perform calculations, plotting, or whatever, on subsets of your data.
Update following comment
You can subset a data frame by combining logical conditions with logical operators (see e.g. ?Extract, ?&)
meter_class <- c("i210plus", "9S", "45S")
df[df$Form == "9S" & df$Voltage == 120, ]
# or
subset(df, Form == "9S" & Voltage == 120)

Related

Too many geom_points after facetting in ggplot2

Running the following script I was hoping to have one datapoint for each of the six terms with different colors depending on the dataset, facetted by adjustment. However, I get three and four point for each term in each facet. Any idea how this can happen when I only have 24 rows in the dataset?
library(ggplot2)
tb5 <- structure(list(term = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
coef = c(-1.444, 0.035, -0.034, 0.005, 0.001, 2.43, -1.032,
0.032, -0.024, 0.025, 0.003, 1.758, -1.148, 0.02, 0.003,
0.027, 0.003, 12.713, -1.494, 0.028, -0.021, 0.007, 0.004,
13.499), ci.lb = c(-1.826, 0.025, -0.087, -0.011, -0.004,
0.3, -1.293, 0.026, -0.061, 0.016, -0.001, -0.273, -1.48,
0.011, -0.045, 0.014, -0.003, 11.858, -1.931, 0.015, -0.08,
-0.014, -0.002, 12.624), ci.ub = c(-1.071, 0.045, 0.019,
0.022, 0.007, 7.305, -0.775, 0.038, 0.012, 0.035, 0.007,
6.613, -0.816, 0.029, 0.051, 0.039, 0.008, 13.569, -1.056,
0.04, 0.038, 0.027, 0.01, 14.375), Adjusted = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Adjusted", "Unadjusted"
), class = "factor"), Dataset = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor")), .Names = c("term",
"coef", "ci.lb", "ci.ub", "Adjusted", "Dataset"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -24L))
ggplot(data = tb5,aes(x=term,y=coef,color=Dataset))+geom_point()+
facet_grid(facets = ~Adjusted)+
geom_jitter(height = .8)

ezPlot overlapping error bar and data

I am using ezPlot from the ez package in R to plot results of a mixed within and between-ss design. The data point from the two groups I have overlap so that I would like to jitter both the data point and associated error bar.
data<-structure(list(Sub = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("WW", "XX", "YY",
"ZZ"), class = "factor"), DepVar = c(0.67, 0.35, 0.09, 0.2, 0.19,
0.13, 0.45, 0.23, 0.08, 0.32, 0.17, 0.18, 0.67, 0.36, 0.55, 0.4,
0.37, 0.05, 0.26, 0.11, 0.08, 0.46, 0.29, 0.18, 0.16, 0, 0.38,
0.22, 0.08, 0.1, 0.54, 0.17, 0.07, 0.38, 0.75, 0.87, 0.27, 0.57,
0.31, 0.28, 0.07, 0.12, 0.75, 0.33, 0.23, 0.33, 0.26, 0.18),
Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
Con = structure(c(1L, 3L, 3L, 3L, 4L, 5L, 2L, 3L, 4L, 1L,
2L, 3L, 1L, 3L, 3L, 3L, 4L, 5L, 2L, 3L, 4L, 1L, 2L, 3L, 1L,
3L, 3L, 3L, 4L, 5L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 3L, 3L, 3L,
4L, 5L, 2L, 3L, 4L, 1L, 2L, 3L), .Label = c("C", "D", "E",
"F", "G"), class = "factor")), .Names = c("Sub", "DepVar",
"Group", "Con"), class = "data.frame", row.names = c(NA, -48L))
ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con))
A non elegant trick is so set scale-color manual white so that the underlying data points disappear and then using geom-point position dodge(0.4))
ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con))+
scale_color_manual(values=c("white", "white"))+
geom_point(aes(fill=Group), color="black", pch= 21, size= 3, position=position_dodge(0.4))+
geom_line(aes(group = Group), lty = 3, lwd = 1.3, color='black')
however, I would like to have the error bar plotted and I don't know how to achieve this or if other workarounds are possible. I would like to stick to ezplot. Thanks!
One way is to use set print_code = TRUE, to produce data to be plotted, as well as the ggplot code:
library(ggplot2)
stats <- ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con),
print_code = TRUE)
Then, manually modify the code to add position = position_dodge(0.4) to each geom, then run the ggplot code.
A more efficient way to do the same thing would be to capture.output the code as a character vector, use gsub to add position = position_dodge(0.4), then eval(parse(text = ...)) the modified code:
gg_code <- capture.output(stats <- ezPlot( data,
dv = .(DepVar),
wid = .(Sub), # subject
within = .(Con),
between=.(Group),
split=.(Group),
do_bars=TRUE,
type = 2,
x = .(Con),
print_code = TRUE))
gg_code <- gsub("alpha", "position = position_dodge(0.4), alpha", gg_code)
eval(parse(text = paste(gg_code, collapse = "")))
Output:

How to get geom_smooth() ignore my colour grouping

I'm trying to make a plot with fitted lines for two levels of my factor(grouped by color). I used shapes to group another variant but when I try to fit smoother, I end up with 4 lines while I only need one two lines in total (1 per color)
Here is the data and code I use:
data <- structure(list(K = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2s", "4s"), class = "factor"),
q = c(0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04,
0.03, 0.02, 0.01, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06,
0.05, 0.04, 0.03, 0.02, 0.01, 0.12, 0.11, 0.1, 0.09, 0.08,
0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0.12, 0.11, 0.1,
0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01), rarity = c(0.907,
0.9206, 0.9359, 0.9321, 0.9405, 0.9344, 0.9449, 0.9106, 0.8844,
0.8829, 0.8989, 0.798, 0.7464, 0.8225, 0.877, 0.8521, 0.9127,
0.9317, 0.9245, 0.9595, 0.9628, 0.9573, 0.9423, 0.9428, 0.5802,
0.6414, 0.5123, 0.57, 0.587, 0.5655, 0.5231, 0.517, 0.4694,
0.5459, 0.3745, 0.3274, 0.7936, 0.7821, 0.7297, 0.7227, 0.6814,
0.6608, 0.6721, 0.6202, 0.5924, 0.5659, 0.5448, 0.6138),
metric = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("APD", "ED"
), class = "factor")), .Names = c("K", "q", "rarity", "metric"
), class = "data.frame", row.names = c(NA, -48L))
library(ggplot2)
ggplot(data=data, aes(x=q, y=rarity, colour=metric, shape=K))+
ggtitle("Relationship")+
xlab("rate of character change")+
ylab("Correlation coefficient to average rarity")+
geom_point()+
geom_smooth(method=lm,se=FALSE)
Any advice on that?
You're getting two lines for each group becase it's being split by both metric and K. You really want the shape aesthetic to only apply to the point layer, not the smooth later. It's better just to move the aes() for that property there.
ggplot(data=data, aes(x=q, y=rarity, colour=metric))+
ggtitle("Relationship")+
xlab("rate of character change")+
ylab("Correlation coefficient to average rarity")+
geom_point(aes(shape=K))+
geom_smooth(method=lm,se=FALSE)

Boxplot with multiple x variables

I'm new to R and having a few issues with using ggplot2.
This is an example of my data (subset of larger data set) :
df <-
structure(list(logpvalue = c(22.36, 6.93, 16.78, 1.78, 17.75,
20.99, 21.03, 9.19, 15.01, 22.25, 13.4, 6.47, 1.34, 13.4, 3.21,
0.37, 0.5, 0.12, 1.8, 0.71, 1.15, 6.73, 0.12, 6.97, 0.64, 9.85,
1.45, 1.67, 2.6, 1.8, 1.35, 4.69, 0.37, 1.91, 0.31, 0, 2.45,
1.68, 2.31, 1.35, 6.48, 4.68), SNP = structure(c(1L, 7L, 6L,
5L, 11L, 1L, 9L, 5L, 8L, 11L, 7L, 5L, 8L, 11L, 1L, 7L, 1L, 4L,
2L, 3L, 10L, 7L, 1L, 4L, 2L, 3L, 10L, 4L, 2L, 3L, 10L, 4L, 2L,
3L, 10L, 4L, 2L, 3L, 7L, 9L, 5L, 1L), .Label = c("rs10244", "rs10891244",
"rs10891245", "rs11213821", "rs12296076", "rs138567267", "rs45615536",
"rs6589218", "rs7103178", "rs7127721", "rs7944895"), class = "factor"),
X173 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("het", "hom"), class = "factor")), .Names = c("logpvalue",
"SNP", "X173"), class = "data.frame", row.names = c(NA, -42L))
I want to plot a boxplot of logpvalue on y axis, with SNP on the x-axis but with each SNP also categorized by whether the patient is het or hom for X173. So from this data I'd imagine 4 boxes on my boxplot.
If possible I'd also like to incorporate the individual data points (dotplot-boxplot overlay) with jitter.
This is the usual code I'd use for a boxplot of logpavlue vs SNP:
qplot(logpvalue, SNP, data = mydata, geom="boxplot")
+ geom_jitter(position=position_jitter(w=0.1, h=0.1)) + theme_bw()
How do I add the extra x variable into this code?
Try this:
boxplot(df$logpvalue~paste(df$SNP,df$X173))
Or using ggolot2 :
library(ggplot2)
ggplot(data=df,aes(SNP,logpvalue,colour=SNP)) +
geom_boxplot() +
geom_jitter() +
facet_grid(.~X173)

How to display date format (Sep-12) in bar chart and set y axis limit?

I have two questions on building a bar plot by using ggplot().
How to display data format (Sep-12)?
I would like to display the date in the format of Sep-12. My data is a quarterly summary. I would like to show Mar, Jun, Sep and Dec quarters. However, I used the as.Date(YearQuarter) within the ggplot() function. It shows a different sequence of Apr, July, Oct, Jan.
How to increase y axis limit?
The y axis is set at 70%, one of value label is out of the picutre. I have added ylim(0,1) to increase the y limit to 1. However, I lost the percentage format as the y axis is not displaying the percentage anymore.
x4.can.t.m <- structure(list(NR_CAT = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), .Label = c("0%", "1 to 84%", "85% +"
), class = "factor"), TYPE = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("PM BUSINESS", "PM CONSUMER",
"PREPAY"), class = "factor"), YearQuarter = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("2011-09-01",
"2011-12-01", "2012-03-01", "2012-06-01", "2012-09-01"), class = "factor"),
value = c(0.5, 0, 0.5, 0.35, 0, 0.65, 0.28, 0.02, 0.7, 0.4,
0, 0.6, 0.38, 0, 0.62, 0.43, 0.01, 0.56, 0.57, 0, 0.43, 0.35,
0, 0.65, 0.39, 0.01, 0.6, 0.55, 0, 0.45, 0.4, 0.02, 0.58,
0.35, 0.02, 0.63, 0.35, 0, 0.65, 0.55, 0.01, 0.44, 0.47,
0, 0.53)), .Names = c("NR_CAT", "TYPE", "YearQuarter", "value"
), row.names = c(NA, -45L), class = "data.frame")
This is my plot code:
x4.can.t.m$YearQuarter <- as.Date(x4.can.t.m$YearQuarter)
x4.can.t.d.bar <- ggplot(data=x4.can.t.m, aes(x=YearQuarter, y=value,fill=NR_CAT)) +
geom_bar(stat="identity",position = "dodge",ymax=NR_CAT+0.2) +
facet_wrap(~TYPE,ncol=1) +
geom_text(aes(label =paste(round(value*100,0),"%",sep="")),
position=position_dodge(width=0.9),
vjust=-0.25,size=3) +
scale_y_continuous(formatter='percent',ylim=1) +
labs(y="Percentage",x="Year Quarter") +
ylim(0,100%)
x4.can.t.d.bar +scale_fill_manual("Canopy Indicators",values=tourism.cols(c(6,9,8)))+
opts(title="Canopy Indicator: All Customers portout for Network
Issues",size=4)
It looks like you have an older version of ggplot; the following is for ggplot 0.2.9.1. I had to fix several things to make your plot work. Starting from your original definition of x4.can.t.m:
x4.can.t.m$YearQuarter <- format(as.Date(x4.can.t.m$YearQuarter),"%b-%y")
library("scales")
ggplot(data=x4.can.t.m, aes(x=YearQuarter, y=value, fill=NR_CAT)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = paste(round(value*100,0),"%",sep=""), group=NR_CAT),
position=position_dodge(width=0.9),
vjust=-0.25, size=3) +
scale_y_continuous("Percentage", labels=percent, limits=c(0,1)) +
labs(x="Year Quarter") +
scale_fill_discrete("Canopy Indicators") +
facet_wrap(~TYPE,ncol=1) +
ggtitle("Canopy Indicator: All Customers portout for Network Issues") +
theme(plot.title = element_text(size=rel(1.2)))
The first part of the question is just achieved by formatting YearQuarter into the format you wanted, leaving it as a string.
The second part specifies the limits in scale_y_continuous and uses the labels argument to specify the formatting function. Note that library("scales") is needed for this part to work.

Resources