Forest plot with table ggplot coding - r

I am trying to get a table side by side with my forest plot but I am having a lot of trouble doing so.
I am able to make a forest plot with the following code:
###dataframe
###dataframe
library(ggplot2)
library(tidyr)
library(grid)
library(gridExtra)
library(forcats)
forestdf <- structure(list(labels = structure(1:36, .Label = c("Age*", "Sex – male vs. female",
"Body-mass index*,1 ", "Systolic blood pressure*", "Race - vs. white",
"Asian", "Black", "Townsend deprivation index", "Social habit",
"Smoking - vs. never", "Previous", "Current", "Alcohol use - vs. never",
"Once or twice a week", "Three or four times a week", "Daily or almost daily",
"Comorbidity", "Cancer", "Diabetes", "Chronic obstructive pulmonary disease2",
"Asthma", "Ischemic heart disease3", "Hypothyroidism", "Hypercholesterolemia",
"Allergic rhinitis", "Depression", "Serology", "White blood cell count",
"Red blood cell count", "Hemoglobin concentration", "Mean corpuscular volume",
"Mean corpuscular hemoglobin concentration", "Platelet count",
"Lymphocyte count", "Monocyte count", "Neutrophil count"), class = "factor"),
rr = c(1.18, 1.45, 1.76, 0.98, NA, 2.16, 2.65, 1.09, NA,
NA, 1.35, 1.15, NA, 0.73, 0.63, 0.63, NA, 1.23, 1.34, 1.51,
1.12, 1.46, 0.96, 1.1, 1.18, 1.38, NA, 1.03, 0.87, 0.93,
1, 0.94, 1, 1.03, 1.17, 1.06), rrhigh = c(1.08, 1.28, 1.57,
0.95, NA, 1.63, 2.03, 1.07, NA, NA, 1.18, 0.94, NA, 0.58,
0.49, 0.5, NA, 0.99, 1.08, 1.09, 0.93, 1.15, 0.71, 0.92,
0.91, 1.1, NA, 1.02, 0.73, 0.87, 0.99, 0.88, 1, 1.01, 1.03,
1.01), rrlow = c(1.28, 1.64, 1.97, 1.02, NA, 2.86, 3.44,
1.11, NA, NA, 1.55, 1.42, NA, 0.9, 0.79, 0.81, NA, 1.53,
1.66, 2.09, 1.34, 1.85, 1.3, 1.31, 1.52, 1.74, NA, 1.04,
1.03, 0.98, 1.01, 1.01, 1, 1.05, 1.32, 1.1)), class = "data.frame", row.names = c(NA,
-36L))
# Re-level `labels` so the factor levels follow the row order of forestdf;
# the plot's y axis is later built from this order.
forestdf$labels <- factor(forestdf$labels, levels = forestdf$labels)
# Print the levels to check the ordering.
levels(forestdf$labels)
# Forest plot: one point range per variable on a log10 relative-risk axis.
# Note: in forestdf the column named `rrlow` holds the larger bound and
# `rrhigh` the smaller one; both are passed through unchanged.
p <- ggplot(forestdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) +
  geom_pointrange(shape = 22, fill = "black") +
  # Dotted reference line at a relative risk of 1.
  geom_vline(xintercept = 1, linetype = 3) +
  # rr is mapped to x, so the risk title belongs on the x axis and the
  # variable names on the y axis.
  xlab("Adjusted Relative Risk with 95% Confidence Interval") +
  ylab("Variable") +
  theme_classic() +
  # Scale arguments are not evaluated inside the plot data, so the factor has
  # to be referenced explicitly; reversing puts the first variable at the top.
  scale_y_discrete(limits = rev(levels(forestdf$labels))) +
  scale_x_log10(
    limits = c(0.25, 4),
    breaks = c(0.25, 0.5, 1, 2, 4),
    labels = c("0.25", "0.5", "1", "2", "4"),
    expand = c(0, 0)
  )
p
However, I cannot get the left panel with labels to work:
#dataframe for table
fplottable <- structure(list(labels = structure(c(1L, 30L, 7L, 33L, 27L, 4L,
6L, 35L, 32L, 31L, 26L, 11L, 2L, 24L, 34L, 12L, 10L, 8L, 14L,
9L, 5L, 18L, 17L, 16L, 3L, 13L, 29L, 36L, 28L, 15L, 21L, 20L,
25L, 19L, 22L, 23L), .Label = c("Age*", "Alcohol use - vs. never",
"Allergic rhinitis", "Asian", "Asthma", "Black", "Body-mass index*,1 ",
"Cancer", "Chronic obstructive pulmonary disease2", "Comorbidity",
"Current", "Daily or almost daily", "Depression", "Diabetes",
"Hemoglobin concentration", "Hypercholesterolemia", "Hypothyroidism",
"Ischemic heart disease3", "Lymphocyte count", "Mean corpuscular hemoglobin concentration",
"Mean corpuscular volume", "Monocyte count", "Neutrophil count",
"Once or twice a week", "Platelet count", "Previous", "Race - vs. white",
"Red blood cell count", "Serology", "Sex – male vs. female",
"Smoking - vs. never", "Social habit", "Systolic blood pressure*",
"Three or four times a week", "Townsend deprivation index", "White blood cell count"
), class = "factor"), No..of.Events = c(1073L, 581L, 1061L, 1031L,
NA, 57L, 68L, 1072L, NA, NA, 442L, 117L, NA, 262L, 191L, 172L,
NA, 96L, 107L, 41L, 146L, 86L, 52L, 170L, 66L, 84L, NA, 1009L,
1009L, 1009L, 1009L, 1009L, 1009L, 1005L, 1005L, 1005L), ARR..95..CI. = c("1.18 (1.08-1.28)",
"1.45 (1.28-1.64)", "1.76 (1.57-1.97)", "0.98 (0.95-1.02)", "",
"2.16 (1.63-2.86)", "2.65 (2.03-3.44)", "1.09 (1.07-1.11)", "",
"", "1.35 (1.18-1.55)", "1.15 (0.94-1.42)", "", "0.73 (0.58-0.90)",
"0.63 (0.49-0.79)", "0.63 (0.50-0.81)", "", "1.23 (0.99-1.53)",
"1.34 (1.08-1.66)", "1.51 (1.09-2.09)", "1.12 (0.93-1.34)", "1.46 (1.15-1.85)",
"0.96 (0.71-1.30)", "1.10 (0.92-1.31)", "1.18 (0.91-1.52)", "1.38 (1.10-1.74)",
"", "1.03 (1.02-1.04)", "0.87 (0.73-1.03)", "0.93 (0.87-0.98)",
"1.00 (0.99-1.01)", "0.94 (0.88-1.01)", "1.00 (1.00-1.00)", "1.03 (1.01-1.05)",
"1.17 (1.03-1.32)", "1.06 (1.01-1.10)")), class = "data.frame", row.names = c(NA,
-36L))
###NOT WORKING CODE THAT TRIES TO MAKE TABLE LEFT OF FOREST PLOT
# Problems visible in this snippet:
#  - the chain starts with geom_text() instead of ggplot(), so layers are
#    added to a layer rather than to a plot object;
#  - `eventnum` and `arr` are used outside aes() and are not defined anywhere
#    above; at this point fplottable's columns are still named
#    `No..of.Events` and `ARR..95..CI.`;
#  - no x position is supplied for any of the text layers.
data_table <- geom_text(data=fplottable,aes(y=labels)) +
geom_text(label=eventnum) +
geom_text(label=arr)
data_table
grid.arrange(data_table,p, ncol=2)
I am drawing inspiration from:
Reproduce table and plot from journal and trying to get something similar to what is shown in the forest plot with the pink boxes

There were a few issues, as @efz pointed out. In addition, you need to re-level the labels factor in your second data frame so that its order matches the one in your first. It's probably going to look messy with the y axis labels and title alongside the table, so these could be removed too.
That leaves you something like:
# Alternating row stripes: recycle the two colours over however many rows
# the data frame has instead of assuming exactly 36.
forestdf$colour <- rep_len(c("white", "gray95"), nrow(forestdf))
p <- ggplot(forestdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) +
  # Thick horizontal lines act as the striped row background
  # (on ggplot2 >= 3.4.0 the preferred argument name is `linewidth`).
  geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
  geom_pointrange(shape = 22, fill = "black") +
  geom_vline(xintercept = 1, linetype = 3) +
  # rr is on the x axis, so that is where the risk title goes; the y title
  # is blanked below, so no ylab() is needed.
  xlab("Adjusted Relative Risk with 95% Confidence Interval") +
  theme_classic() +
  # Use the stripe colours exactly as stored in the `colour` column.
  scale_colour_identity() +
  # First variable at the top, in row order.
  scale_y_discrete(limits = rev(as.character(forestdf$labels))) +
  scale_x_log10(
    limits = c(0.25, 4),
    breaks = c(0.25, 0.5, 1, 2, 4),
    labels = c("0.25", "0.5", "1", "2", "4"),
    expand = c(0, 0)
  ) +
  # The table panel shows the variable names, so hide them here.
  theme(axis.text.y = element_blank(), axis.title.y = element_blank())
# Short column names used in the aes() calls below.
names(fplottable) <- c("labels", "eventnum", "arr")
# Reversed forest-plot levels: a discrete y axis draws the first level at the
# bottom, so this puts the first variable at the top, matching `p`.
fplottable$labels <- factor(fplottable$labels, rev(levels(forestdf$labels)))
# Same alternating stripes as the forest plot, sized from the data.
fplottable$colour <- rep_len(c("white", "gray95"), nrow(fplottable))
data_table <- ggplot(data = fplottable, aes(y = labels)) +
  geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
  # Three text columns at fixed x positions: label (left-aligned),
  # number of events (centred), ARR with CI (right-aligned).
  geom_text(aes(x = 0, label = labels), hjust = 0) +
  geom_text(aes(x = 5, label = eventnum)) +
  geom_text(aes(x = 7, label = arr), hjust = 1) +
  scale_colour_identity() +
  theme_void() +
  # Presumably the bottom margin leaves room for the forest plot's x axis so
  # the rows line up; adjust if the device size changes.
  theme(plot.margin = margin(5, 0, 35, 0))
# Table on the left, forest plot on the right.
grid.arrange(data_table, p, ncol = 2)

You can simplify further by merging the two dataframes as fdf <- full_join(forestdf, fplottable, by = "labels") and running your p on fdf. Then p + geom_text(aes(x=22, label=paste(" ", arr," ",eventum, sep=' '))) will give the following output: output
Obviously, limits need to be expanded to 100 to include the table, and the full code is below:
# Forest plot with the table text drawn inside the same panel.
# Requires `fdf`, the join of forestdf and fplottable on "labels", with
# fplottable's columns already renamed to labels / eventum / arr.
p <- ggplot(fdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) +
  geom_pointrange(shape = 22, fill = "black") +
  geom_vline(xintercept = 1, linetype = 3) +
  # rr is mapped to x, so the risk title goes on the x axis.
  xlab("Adjusted Relative Risk with 95% Confidence Interval") +
  ylab("Variable") +
  theme_bw() +
  #scale_y_discrete(limits = rev(labels))+
  # The x range is stretched to 100 to make room for the text column; the
  # break at 100 gets an empty label so only the real risk values are shown.
  scale_x_log10(
    limits = c(0.25, 100),
    breaks = c(0.25, 0.5, 1, 2, 4, 100),
    labels = c("0.25", "0.5", "1", "2", "4", ""),
    expand = c(0, 0)
  ) +
  # ARR (95% CI) and event count as one text column at x = 22.
  geom_text(aes(x = 22, label = paste(" ", arr, " ", eventum, sep = " ")))
p

supposing
names(fplottable)<-c('labels','eventum','arr')
then there are a few issues with the code for data_table. If I understood correctly you meant something like:
data_table <- ggplot(data=fplottable)+geom_text(aes(x= 1, y=labels, label=arr))+geom_text(aes(x= 1.5, y=labels, label=eventum)).
You can play with the value of x, or use a single geom_text where label=paste(arr, eventum, sep=' ').
In this case the command grid.arrange(data_table,p, ncol=2) seems to work fine. You can set the relative width of each panel with the widths argument of grid.arrange, e.g. widths = c(2, 3).

Related

Draw histogram between cluster via R

This is dataset with my variables for analysis.
clys<-structure(list(session_price = c(18824.7664, 35584.4106, 21084.4035,
9907.5856, 30806.5486, 15788.1279, 10147.7593, 11977.5904, 11734.3553,
53484.8698, 27788.9949, 11072.0588, 29241.0885, 5676.2372, 14007.0981,
34964.85, 14668.6735, 9425.9294, 16577.845, 153147.2272), flight_type = c(1.2462,
1.1691, 1.0601, 1.2909, 1.5488, 1.1279, 1.166, 1.3862, 1.2936,
1.0195, 1.0451, 1.2904, 1.6684, 1.2786, 1.1358, 1.2958, 1.05,
1.1522, 1.0561, 1.6795), adults_count = c(1.1793, 1.0821, 1.1156,
1.2565, 1.2742, 1.2283, 1.3237, 1.1494, 1.2904, 1.3525, 1.0814,
1.3644, 1.5781, 1.1816, 1.2604, 1.1732, 1.4088, 1.3959, 1.0959,
1.4726), children_count = c(0.2432, 0.0338, 0.1573, 0.0517, 0.0769,
0.0365, 0.1494, 0.0408, 0.1177, 0.128, 0.0579, 0.2749, 0.4045,
0.0823, 0.0943, 0.0677, 0.2088, 0.3009, 0.0817, 0.2353), infants_count = c(0.0152,
0.0048, 0.0731, 0.0259, 0.0129, 0.0046, 0.0954, 0.014, 0.0141,
0.0152, 0.0121, 0.0667, 0.0365, 0.0174, 0.0679, 0.0111, 0.0441,
0.0818, 0.0313, 0.0446), meta_flight_type = c(0.2918, 0.4686,
0.1425, 0.43, 0.3924, 0.6575, 0.6349, 0.0583, 0.2747, 0.167,
0.6179, 0.22, 0.5573, 0.2165, 0.3623, 0.6272, 0.3853, 0.1468,
0.255, 0.4604), flight_kind = c(0.4528, 0.1379, 3.6497, 0.2331,
0.3969, 0.1519, 0.098, NA, 0.6111, NA, 0.1086, 0.1061, NA, NA,
0.8571, 1.3472, NA, 0.0243, 3.3273, 1.1279), service_class_id = c(2,
1.9952, 2, 2, 1.9986, 2, 2, 1.9977, 2, 1.9913, 1.9968, 1.9985,
1.9983, 2, 2, 1.9994, 2, 1.9979, 1.9986, 1.9939), UI_profit = c(249.9766,
210.7159, 121.1932, 46.7757, 202.5403, 58.3467, 35.375, 0, 63.4536,
0, 116.4613, 41.2356, 0, 0, 72.0427, 131.8692, 0, 24.3831, 75.1906,
53), leg_price = c(9807.4805, 23253.6651, 15805.3328, 6148.6305,
15574.0215, 11339.653, 5964.4419, 7846.2151, 6910.2812, 35607.4389,
23953.2572, 5411.9416, 9544.5809, 3568.1491, 9463.4491, 23276.3196,
8357.9574, 4977.6056, 13331.1196, 54673.0944), flight_duration_min = c(307.2136,
269.9225, 439.2894, 143.2841, 197.8477, 110.2875, 114.3542, NA,
173.47, NA, 236.4197, 160.9437, NA, NA, 216.9208, 624.4288, NA,
162.4991, 190.5408, 776.6839), trip_duration_min = c(504.257,
531.7625, 967.9167, 261.4497, 265.9794, 138.0625, 163.9792, NA,
325.6778, NA, 459.6784, 166.7464, NA, NA, 462.5097, 949.2419,
NA, 162.6241, 478.7249, 1346.6982), price_duration_min = c(27.8457,
78.404, 35.9824, 38.95, 56.0142, 102.8833, 49.24, NA, 33.841,
NA, 96.4814, 29.3607, NA, NA, 43.4476, 33.1768, NA, 28.893, 76.8556,
45.4329), days_to_flight = c(27.8068, 23.0823, 23.7821, 12.4188,
26.8415, 19.6586, 24.6713, 16.6704, 13.9125, 10.1796, 13.2141,
18.1858, 119.3786, 12.5782, 20.3807, 31.856, 37.4516, 6.9034,
21.6605, 43.7275), days_RT = c(12.8218, 8.904, 23.2507, 4.585,
8.5987, 13.0174, 7.6805, 6.4065, 4.219, 19.984, 11.874, 8.8732,
14.4032, 4.9503, 11.9996, 12.5172, 4.9677, 8.0309, 12.8996, 15.5516
), mobile_share = c(0.538, 0.5845, 0.7576, 0.5409, 0.5279, 0.6119,
0.6017, 0.5344, 0.5133, 0.7007, 0.7336, 0.7531, 0.5156, 0.6429,
0.7208, 0.6033, 0.7118, 0.8446, 0.6328, 0.6268), desktop_share = c(0.4559,
0.4155, 0.2424, 0.3556, 0.4687, 0.3881, 0.3983, 0.4656, 0.4757,
0.2993, 0.2626, 0.2382, 0.4844, 0.3571, 0.2792, 0.3924, 0.2882,
0.1519, 0.3643, 0.3732), iphone_share = c(0.2128, 0.2947, 0.1443,
0.3103, 0.3459, 0.3379, 0.1618, 0.2882, 0.2308, 0.4707, 0.2606,
0.4327, 0.1892, 0.277, 0.2453, 0.2805, 0.1853, 0.478, 0.1882,
0.3834), android_share = c(0.307, 0.2657, 0.6087, 0.2274, 0.1697,
0.2694, 0.4274, 0.2322, 0.2779, 0.2115, 0.4685, 0.3165, 0.3177,
0.361, 0.4755, 0.3196, 0.5176, 0.3668, 0.4432, 0.2414), multi_share = c(0.2888,
0.0676, 0.8825, 0.1078, 0.0807, 0.0365, 0.1411, 0.0292, 0.2229,
0.0412, 0.1354, 0.1619, 0.0972, 0.1538, 0.1585, 0.3809, 0.1324,
0.0902, 0.473, 0.211), CR_session_to_popup = c(0.1185, 0.1159,
0.0879, 0.2295, 0.1276, 0.2374, 0.1162, 0.1097, 0.1695, 0.1605,
0.1062, 0.2189, 0.0226, 0.2356, 0.166, 0.1383, 0.1118, 0.2994,
0.0874, 0.0467), CR_session_to_booking = c(0.1155, 0.1063, 0.0703,
0.2392, 0.1208, 0.2237, 0.1079, 0.1995, 0.1648, 0.1844, 0.082,
0.1826, 0.0313, 0.2339, 0.1472, 0.1141, 0.1118, 0.2515, 0.0739,
0.0548), corr_winter = c(0.2635, 0.1983, 0.2513, 0.1867, 0.106,
0.4188, 0.0534, 0.1589, 0.2498, 0.4775, 0.4858, 0.3605, 0.0688,
0.318, 0.3394, 0.223, 0.3281, 0.3985, 0.173, 0.112), corr_spring = c(0.3036,
0.2772, 0.2602, 0.2209, 0.3627, 0.1332, 0.4484, 0.2793, 0.2526,
0.506, 0.0814, 0.2088, 0.6824, 0.2407, 0.1407, 0.326, 0.3228,
0.0654, 0.0897, 0.3196), corr_summer = c(0.2673, 0.1791, 0.258,
0.2894, 0.2856, 0.099, 0.2358, 0.2793, 0.276, 0.0165, 0.2525,
0.2087, 0.2488, 0.4413, 0.2477, 0.2744, 0.3491, 0.2917, 0.5926,
0.0861), corr_autumn = c(0.1656, 0.3454, 0.2304, 0.3029, 0.2458,
0.349, 0.2625, 0.2826, 0.2216, 0, 0.1803, 0.222, 0, 0, 0.2722,
0.1766, 0, 0.2444, 0.1447, 0.4823), corr_BL = c(0.4759, 0.5444,
0.4952, 0.4392, 0.4586, 0.4146, 0.4011, 0.4722, 0.4244, 0.4542,
0.4742, 0.4467, 0.4652, 0.4293, 0.4412, 0.4423, 0.4811, 0.4583,
0.496, 0.4882), corr_UP = c(0.5241, 0.4556, 0.5048, 0.5608, 0.5414,
0.5854, 0.5989, 0.5278, 0.5756, 0.5458, 0.5258, 0.5533, 0.5348,
0.5707, 0.5588, 0.5577, 0.5189, 0.5417, 0.504, 0.5118), pam_german.clustering = c(1L,
2L, 2L, 3L, 4L, 4L, 5L, 6L, 1L, 7L, 7L, 8L, 6L, 9L, 8L, 9L, 9L,
8L, 2L, 2L)), class = "data.frame", row.names = c(NA, -20L))
pam_german.clustering is the number of the cluster to which each observation (row) belongs.
For every variable from session_price to corr_UP, how can I draw a histogram of its distribution across the clusters?
I am only just learning ggplot2, so I can't do it myself. To explain the result I need, I have sketched it in Paint.
For session price histogram between cluster
session price
then for flight_type histogram between clusters
flight type
and so on for each variable in dataset.
How using ggplot2 get histogram, as I need?
Your desired plots don't make sense. A histogram always has count or density on the y axis, but you have price. If you want one histogram per cluster, you need 5 different panels side-by side, but price would still be on the x axis of each. There aren't really enough data points in your example data to create this many histograms (one cluster only has a single point).
Data in this structure is normally best shown with a boxplot:
# Box plot of session price per cluster, one box (and fill colour) per
# cluster id; the fill legend is dropped because it repeats the x axis.
ggplot(
  data = clys,
  mapping = aes(x = factor(pam_german.clustering), y = session_price)
) +
  geom_boxplot(mapping = aes(fill = factor(pam_german.clustering))) +
  scale_fill_viridis_d() +
  labs(x = 'Cluster', y = 'Session Price') +
  guides(fill = guide_none()) +
  theme_light(base_size = 16)
Or perhaps you are wanting columns of averages per cluster with an error bar representing the range?
library(tidyverse)
# Column chart of the mean session price per cluster, with an error bar
# running from the cluster's minimum to its maximum price.
clys %>%
  group_by(pam_german.clustering) %>%
  # The mean gets its own column name so min/max never depend on the order
  # in which the summaries are written.
  summarize(
    min = min(session_price),
    max = max(session_price),
    mean_price = mean(session_price)
  ) %>%
  mutate(cluster = factor(pam_german.clustering)) %>%
  ggplot(aes(x = cluster, y = mean_price, fill = mean_price)) +
  geom_col() +
  geom_errorbar(aes(ymin = min, ymax = max), width = 0.5, size = 0.2) +
  # `option` is documented as a palette letter/name; "D" is viridis.
  scale_fill_viridis_c(option = "D") +
  theme_light(base_size = 16) +
  labs(y = 'Session Price') +
  guides(fill = guide_none())
Certainly, a set of histograms is possible, but really doesn't work very well with this data set due to the lack of data points, and trying to fit too many facets across a single dimension of the plot:
# One histogram panel of session price per cluster, laid out in a single
# row with an independent x range for each panel.
ggplot(data = clys, mapping = aes(x = session_price)) +
  geom_histogram() +
  facet_grid(cols = vars(pam_german.clustering), scales = 'free_x') +
  theme_light(base_size = 16)

Change font of specific rows to bold in forestplot

I wrote a script using the "forestplot" package. I want to group the variables into certain categories, which I would like to show in bold in order to accentuate them. How can I adjust my script so that only certain rows, i.e. Risk factor / OR (95% CI), Patient characteristics, Medication history, Comorbidities, Surgical history and Other, are shown in bold? I have two columns and 18 rows. Can someone help me? I would be very grateful!
My script is as below:
tabletext <- cbind(
c("Risk factor" ,"Patient characteristics","Sex, male*", "Bmi (5 points)",
"Alcohol (5 units)", "Smoking*","Medication history",
"Steroid use", "Anticoagulant use*","Comorbidities",
"COPD GOLD 1/2", "COPD GOLD 3/4", "Other pulmonary disease",
"Surgical history",
"Previous colorectal surgery*",
"Previous abdominal surgery (other)","Other", "HIPEC*"),
c("OR (95% CI)",NA, "1.78 (1.20-2.68)", "1.15 (0.95-1.38)", "1.04 (0.94-1.14)",
"1.78 (1.11-2.80)", NA," 1.40 (0.68-2.67)", "1.55 (1.02-2.32)",NA,
"1.40 (0.70-2.61)", "1.56 (0.42-4.67)", "1.78 (0.63-4.28)",NA,
"1.61 (1.03-2.49)", "0.80 (0.47-1.32)",NA, "4.14 (2.14-7.73)"))
# Open the help page for fpTxtGp. Note this runs before the package is loaded.
?fpTxtGp
require(forestplot)
# Forest plot of the odds ratios next to the two text columns of tabletext.
# `df_c` (the numeric estimates) is not defined anywhere in this snippet.
forestplot(tabletext,
# Two gpar() entries are supplied, presumably one per label column: the first
# is bold Times, the second uses the default family in black.
txt_gp = fpTxtGp(label = list(gpar(fontfamily = "Times",
fontface="bold"),
gpar(fontfamily = "",
col = "black"))),
df_c,new_page = TRUE,
boxsize = 0.2,
# 32 FALSE values are passed although tabletext has 18 rows, and no row is
# flagged as a summary row.
is.summary = c(rep(FALSE,32)),
clip = c(0,17),
xlab = 'Odds ratio with 95% confidence interval
* indicates significance',
xlog = FALSE,
zero = 1,
plotwidth=unit(12, "cm"),
colgap=unit(2, "mm"),
col = fpColors(box = "royalblue",
line = "darkblue",
summary = "royalblue"))
It's not clear what df_c is, so I just created it based on your tabletext matrix:
# Point estimates and confidence limits, one row per row of tabletext.
# Heading rows (no estimate) are all-NA. Written row-wise so each line can be
# checked against the matching "OR (95% CI)" string.
df_c <- as.data.frame(matrix(
  c(
    NA,   NA,   NA,    # Risk factor (column header)
    NA,   NA,   NA,    # Patient characteristics
    1.78, 1.2,  2.68,
    1.15, 0.95, 1.38,
    1.04, 0.94, 1.14,
    1.78, 1.11, 2.8,
    NA,   NA,   NA,    # Medication history
    1.4,  0.68, 2.67,
    1.55, 1.02, 2.32,
    NA,   NA,   NA,    # Comorbidities
    1.4,  0.7,  2.61,
    1.56, 0.42, 4.67,
    1.78, 0.63, 4.28,
    NA,   NA,   NA,    # Surgical history
    1.61, 1.03, 2.49,
    0.8,  0.47, 1.32,
    NA,   NA,   NA,    # Other
    4.14, 2.14, 7.73
  ),
  ncol = 3,
  byrow = TRUE,
  dimnames = list(NULL, c("mean", "lower", "upper"))
))
From there, it's just a matter of adjusting the values passed to is.summary:
# Forest plot with the category headings rendered as summary (bold) rows.
# tabletext supplies the two text columns; df_c supplies mean/lower/upper.
forestplot(tabletext,
           df_c,
           new_page = TRUE,
           txt_gp = fpTxtGp(label = list(gpar(fontfamily = "Times"),
                                         gpar(fontfamily = "",
                                              col = "black"))),
           boxsize = 0.2,
           # One flag per row of tabletext (18 in total); TRUE marks a
           # heading row. Row 17 ("Other") is a heading too.
           is.summary = c(TRUE, TRUE, rep(FALSE, 4),  # header, Patient characteristics
                          TRUE, rep(FALSE, 2),        # Medication history
                          TRUE, rep(FALSE, 3),        # Comorbidities
                          TRUE, rep(FALSE, 2),        # Surgical history
                          TRUE, FALSE),               # Other
           clip = c(0, 17),
           xlab = "Odds ratio with 95% confidence interval\n* indicates significance",
           xlog = FALSE,
           # Reference line at an odds ratio of 1.
           zero = 1,
           plotwidth = unit(12, "cm"),
           colgap = unit(2, "mm"),
           col = fpColors(box = "royalblue",
                          line = "darkblue",
                          summary = "royalblue"))
Which generates the following figure:

Select columns from multiple rda files R and save the selected output

I would like to select (by names or position) specified columns in multiple .rda files stored in a directory and then save them only with the selected columns (preferably using purrr and dplyr).
The .rda files have all the same structure: 38 columns, first 28 columns have the same names, the last 10 columns have different names with the same prefix ("SC*").
This is what I have tried so far:
library(tidyverse)
file_names <- as.list(dir(path=".", pattern="ASJC*"))
I load the files selected
files <- map(file_names,load,environment())
Then I try to select specified columns by names:
files_selected <- map(files,dplyr::select(SOURCERECORD_ID,starts_with("SC*")))
and I got the following error message:
Error in dplyr::select(SOURCERECORD_ID, starts_with("SC*")) :
object 'SOURCERECORD_ID' not found
So i tried the following code:
files_selected <- map(files,dplyr::select(1,29:38)
with another error message:
Error in UseMethod("select_") :
no applicable method for 'select_' applied to an object of class "c('double', 'numeric')"
This is the structure of .rda files:
df1 <- structure(list(SOURCERECORD_ID = c("18659", "13951", "5400152705",
"16500154707", "20300195074", "19472"), TITOLO_FONTE = c("ANAIS DA ACADEMIA BRASILEIRA DE CIENCIAS",
"ARABIAN JOURNAL FOR SCIENCE AND ENGINEERING", "ARCHIVES DES SCIENCES",
"ASIAN JOURNAL OF SCIENTIFIC RESEARCH", "ASM SCIENCE JOURNAL",
"BEIJING DAXUE XUEBAO (ZIRAN KEXUE BAN)/ACTA SCIENTIARUM NATURALIUM UNIVERSITATIS PEKINENSIS"
), ISSN_P = c("0001-3765", "1319-8025", "1661-464X", "1992-1454",
"1823-6782", "0479-8023"), ISSN_E = c("1678-2690", NA, NA, NA,
NA, NA), STATUS = c("Active", "Active", "Active", "Active", "Active",
"Active"), COPERTURA = c("1994-ongoing, 1970-1992, 1949", "2003-ongoing, 1981",
"2017-ongoing, 2004-2014", "2009-ongoing", "2011-ongoing", "2001-ongoing"
), LINGUA = c("ENG", "ENG", "ENG, FRE", "ENG", "ENG", "CHI"),
CS2014 = c(0.95, 1.19, 0.64, 0.55, 0.1, 0.24), CS2015 = c(0.89,
0.81, 0.57, 0.36, 0.06, 0.17), CS2016 = c(1.05, 1.02, NA,
0.64, 0.11, 0.35), SJR2014 = c(0.42, 0.332, 0.285, 0.394,
0.107, 0.13), SJR2015 = c(0.332, 0.335, 0.201, 0.163, 0.122,
0.123), SJR2016 = c(0.386, 0.29, 0.149, 0.195, 0.101, 0.157
), SNIP2014 = c(0.756, 1.149, 0.236, 1.021, 0.408, 0.338),
SNIP2015 = c(0.67, 0.51, 0.362, 0.472, 0.082, 0.164), SNIP2016 = c(0.713,
0.657, 0.275, 0.549, 0.595, 0.265), TIPO_FONTE = c("Journal",
"Journal", "Journal", "Journal", "Journal", "Journal"), STORIA_TITOLO = c(NA,
NA, "Formerly known as", NA, NA, NA), TITOLI_COLLEGATI = c(NA,
NA, "Archives des Sciences et Compte Rendu Seances de la Societe",
NA, NA, NA), EDITORE = c("Academia Brasileira de Ciencias",
"Springer Verlag", "Societe de physique et d'histoire naturelle",
"Asian Network for Scientific Information", "Akademi Sains Malaysia",
"Beijing University Press"), PAESE_EDITORE = c("Brazil",
"Germany", "Switzerland", "Pakistan", "Malaysia", "China"
), ASJC = c("1000;", "1000;", "1000;", "1000;", "1000;",
"1000;"), AVG_CS = c(0.963, 1.007, 0.605, 0.517, 0.09, 0.253
), AVG_SJR = c(0.379, 0.319, 0.212, 0.251, 0.11, 0.137),
AVG_SNIP = c(0.713, 0.772, 0.291, 0.681, 0.362, 0.256), ELEGGIBILE = c("Eleggibile",
"Eleggibile", "Eleggibile", "Eleggibile", "Eleggibile", "Eleggibile"
), Percentili_SJR = c(15L, 21L, 47L, 35L, 89L, 77L), Percentili_SNIP = c(25L,
19L, 74L, 28L, 61L, 78L), SC13A_1000_SJR = c("Bottom", "Bottom",
"Bottom", "Bottom", "Bottom", "Bottom"), SC13B_1000_SJR = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13C_1000_SJR = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13D_1000_SJR = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13D4_1000_SJR = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13A_1000_SNIP = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13B_1000_SNIP = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13C_1000_SNIP = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13D_1000_SNIP = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom"), SC13D4_1000_SNIP = c("Bottom",
"Bottom", "Bottom", "Bottom", "Bottom", "Bottom")), .Names = c("SOURCERECORD_ID",
"TITOLO_FONTE", "ISSN_P", "ISSN_E", "STATUS", "COPERTURA", "LINGUA",
"CS2014", "CS2015", "CS2016", "SJR2014", "SJR2015", "SJR2016",
"SNIP2014", "SNIP2015", "SNIP2016", "TIPO_FONTE", "STORIA_TITOLO",
"TITOLI_COLLEGATI", "EDITORE", "PAESE_EDITORE", "ASJC", "AVG_CS",
"AVG_SJR", "AVG_SNIP", "ELEGGIBILE", "Percentili_SJR", "Percentili_SNIP",
"SC13A_1000_SJR", "SC13B_1000_SJR", "SC13C_1000_SJR", "SC13D_1000_SJR",
"SC13D4_1000_SJR", "SC13A_1000_SNIP", "SC13B_1000_SNIP", "SC13C_1000_SNIP",
"SC13D_1000_SNIP", "SC13D4_1000_SNIP"), row.names = c(NA, -6L
), class = c("tbl_df", "tbl", "data.frame"))
Are you sure that files <- map(file_names,load,environment()) holds a list of data frames? (load() returns the names of the objects it loads, not the objects themselves.)
If yes, then try this
# Keep the id column plus every column whose name begins with "SC".
# starts_with() matches a literal prefix, not a wildcard pattern, so the
# prefix is "SC" (with "SC*" no column would match).
map(files, ~ dplyr::select(., SOURCERECORD_ID, starts_with("SC")))
I think what's missing is the ~ before the select call, and the . inside it, since select's signature is select(.data, ...). The `.` marks where each data set goes.

r - ggplot2 - secondary duplicate axis log transform is incorrect

I am using ggplot 2.2.0 to create a secondary duplicated axis using the log transform.
# install.packages("install.load") # install to use the load_package function
install.load::load_package("ggplot2", "data.table")
sand <- structure(list(`Sieve #` = c("3/8”", "4", "8", "16", "30", "50",
"Pan"), `Size (mm)` = c(9.525, 4.75, 2.36, 1.18, 0.6, 0.3, NA
), `Mass Sieve (kg)` = c(0.642, 0.508, 0.474, 0.408, 0.38, 0.348,
0.376), `Mass Retained + Sieve (kg)` = c(0.642, 0.524, 0.58,
0.526, 0.598, 0.899, 0.463), `Mass Retained (kg)` = c(0, 0.016,
0.106, 0.118, 0.218, 0.551, NA), `Cumulative Mass Retained (kg)` = c(0,
0.016, 0.122, 0.24, 0.458, 1.009, NA), `Cumulative % Retained` = c(0,
1, 11, 22, 42, 92, NA), `% Passing` = c(100, 99, 89, 78, 58,
8, NA)), .Names = c("Sieve #", "Size (mm)", "Mass Sieve (kg)",
"Mass Retained + Sieve (kg)", "Mass Retained (kg)", "Cumulative Mass Retained (kg)",
"Cumulative % Retained", "% Passing"), row.names = c(NA, -7L), class = c("data.table",
"data.frame"))
# Sieve openings in mm, smallest to largest; used as the x-axis breaks.
x1 <- c(
  0.075, 0.15, 0.3, 0.6, 1.18, 2.36, 4.75,
  9.5, 12.5, 19, 25, 37.5, 50
)
# Text shown for each opening on the bottom (mm) axis.
x1_label <- c(
  "0.075", "0.150", "0.300", "0.600", "1.180", "2.36", "4.75",
  "9.5", "12.5", "19.0", "25.0", "37.5", "50.0"
)
# Sieve designation for each opening, shown on the top axis.
x2 <- c(
  "No. 200", "No. 100", "No. 50", "No. 30", "No. 16", "No. 8", "No. 4",
  "3/8 in.", "1/2 in.", "3/4 in.", "1 in.", "1 1/2 in.", "2 in."
)
# Gradation curve: % passing against sieve opening on a natural-log x axis,
# with a duplicated top axis relabelled with the sieve designations in x2.
ggplot(sand, aes(x = `Size (mm)`, y = `% Passing`)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(
    name = "Sieve size (mm)",
    trans = "log",
    limits = c(0.075, 50),
    expand = c(0.001, 0),
    breaks = x1,
    labels = x1_label,
    minor_breaks = NULL,
    position = "bottom",
    sec.axis = dup_axis(name = "Sieve size", breaks = x1, labels = x2)
  ) +
  scale_y_continuous(
    name = "% Passing",
    limits = c(0, 100),
    expand = c(0.01, 0),
    breaks = seq(0, 100, by = 10),
    minor_breaks = seq(0, 100, by = 5)
  ) +
  labs(title = "Group 1 Sand Gradation Results (ASTM C136)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.1),
    plot.margin = margin(0.5, 0.5, 0.5, 0.5, "pt")
  )
The bottom x-axis and the top x-axis do not match in the image shown below.
Is it possible to have a duplicated secondary axis using the log transform?
If so, then how should the existing code be modified to get the desired result?
If not, then do you have any suggestions.
Thank you.

barchart with multiple overlaying errorbars

I am trying to create a barplot with multiple errorbars. Something like this: http://flyordie.sin.khk.be/r/histogram%20error%20bars.PNG
I have the following dataset:
http://flyordie.sin.khk.be/r/output.csv
I have tried using ggplot2 and lattice graphics, but haven't found anything that suits my needs.
My current code for showing the barchart is this:
data <- read.csv("c:/output.csv")
data
par(las=3)
barplot(data$PlateId,
height=data$HC.Maximum,
names.arg=data$PlateId,
col="lightblue")
And to show the highest errorbars i use this code
library(ggplot2)
limits <- aes(ymax = qc$HC.Maximum, ymin = qc$HC.Minimum)
p <- ggplot(qc, aes(colour=HC.Median,x=PlateId))
p + geom_bar(position="dodge")+ geom_errorbar(limits,position="dodge")
But I have no clue on how to put them on the same graphic (like in my example)
The data:
qc <- structure(list(row = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Row", class = "factor"),
ID = 1:14, PlateId = c(35276L, 35279L, 35280L, 35281L, 35282L,
35290L, 35291L, 35292L, 35293L, 35294L, 35295L, 35296L, 35297L,
35298L), LC.Median = c(439688.495, 509376.055, 475218.99,
497368.215, 481801.9, 468603.43, 494713.175, 459047.385,
482819.47, 495162.31, 449592.51, 460564.95, 478715.915, 452293.465
), LC.Stdev = c(52290.12229, 49648.49436, 55743.10306, 62002.53552,
46908.66149, 52489.615, 48016.94019, 52082.23899, 47934.37133,
58977.84845, 45827.62648, 53514.21095, 49638.98286, 139686.144
), LC.Minimum = c(279610.16, 423651.45, 356422.31, 411639.77,
397362.84, 345178.07, 406073.72, 352834.86, 339035.77, 369554.11,
348688.39, 357341.56, 370463.11, 210367.91), LC.Maximum = c(498195.9,
630648.53, 614625.78, 686737.35, 621372.36, 576491.41, 579708.95,
580633.28, 580125.9, 622108.73, 530234.87, 563616.65, 614936.33,
730272.63), HC.Median = c(507356.465, 553226.525, 447067.77,
452223.76, 453439.37, 422491.755, 447438.8, 435034.635, 446148.105,
438089.69, 466748.63, 440005.81, 454927.74, 483599.71), HC.Stdev = c(65355.46121,
72762.07338, 80118.37641, 43653.99318, 73389.12355, 62590.47601,
46421.36678, 62822.88532, 61175.4241, 64418.56174, 63101.2232,
68166.51814, 61256.74139, 87354.9441), HC.Minimum = c(381552.05,
391124.94, 280614.72, 395454.12, 291433.84, 252579.15, 331661.03,
296223.64, 240262.37, 299431.98, 375224.27, 278780.87, 310275.66,
213170.04), HC.Maximum = c(626483.6, 635111.41, 555357.3,
528822.8, 534172.42, 514927.42, 538385.26, 533024.74, 524973.99,
544335.94, 564954.87, 572206.98, 547489.1, 565338.09), zPrime = c(-3.96,
-23.73, -7.88, -5.81, -5.32, -5.54, -4.48, -7.98, -6.99,
-5.63, -22.54, -33.83, -11.92, -17.44), Sb = c(1.17, 1.03,
0.91, 0.91, 0.89, 0.89, 0.9, 0.92, 0.92, 0.89, 1.04, 0.98,
0.95, 1.09), Sn = c(1.37, 0.3, -0.83, -0.76, -1.22, -1.01,
-1.08, -0.74, -0.86, -0.95, 0.31, -0.2, -0.52, 0.27)), .Names = c("row",
"ID", "PlateId", "LC.Median", "LC.Stdev", "LC.Minimum", "LC.Maximum",
"HC.Median", "HC.Stdev", "HC.Minimum", "HC.Maximum", "zPrime",
"Sb", "Sn"), class = "data.frame", row.names = c(NA, -14L))
When creating plots with multiple layers, I approach it as follows:
Define common aesthetics in the initial call to ggplot
Define additional aesthetics in each additional layer
Note that I have modified your code, since I couldn't get your example to work:
Provide an explicit binwidth=1 to geom_bar to remove the warnings
Remove the position=dodge since this is the default and redundant
Supply an explicit stat=identity to geom_bar
The code:
# Shared aesthetic (x) goes in ggplot(); each layer adds its own.
ggplot(data = qc, mapping = aes(x = PlateId)) +
  # Bars show the HC median for each plate.
  geom_bar(
    mapping = aes(y = HC.Median),
    stat = "identity",
    binwidth = 1,
    fill = "cyan"
  ) +
  # Red bar spans HC.Minimum to LC.Minimum.
  geom_errorbar(
    mapping = aes(ymin = HC.Minimum, ymax = LC.Minimum),
    colour = "red"
  ) +
  # Purple bar spans LC.Maximum to HC.Maximum.
  geom_errorbar(
    mapping = aes(ymin = LC.Maximum, ymax = HC.Maximum),
    colour = "purple"
  )
Just add another geom_errorbar.
# Rename limits to limits_hi. Columns are referenced by bare name so they are
# looked up in the data passed to ggplot() rather than pulled from qc directly.
limits_hi <- aes(ymax = HC.Maximum, ymin = HC.Minimum)
# Define the other error bar
limits_lo <- aes(ymax = LC.Maximum, ymin = LC.Minimum)
# I'm not quite sure what you want in the bars; see if this looks right
p <- ggplot(qc, aes(factor(PlateId), HC.Median))
p +
  # A y value is mapped, so the bars must plot it as-is instead of counting rows.
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(limits_hi, position = "dodge", colour = "red") +
  geom_errorbar(limits_lo, position = "dodge", colour = "blue") +
  # theme()/element_text() replace the removed opts()/theme_text().
  theme(axis.text.x = element_text(angle = 30))

Resources