Draw histogram between cluster via R - r

This is dataset with my variables for analysis.
clys<-structure(list(session_price = c(18824.7664, 35584.4106, 21084.4035,
9907.5856, 30806.5486, 15788.1279, 10147.7593, 11977.5904, 11734.3553,
53484.8698, 27788.9949, 11072.0588, 29241.0885, 5676.2372, 14007.0981,
34964.85, 14668.6735, 9425.9294, 16577.845, 153147.2272), flight_type = c(1.2462,
1.1691, 1.0601, 1.2909, 1.5488, 1.1279, 1.166, 1.3862, 1.2936,
1.0195, 1.0451, 1.2904, 1.6684, 1.2786, 1.1358, 1.2958, 1.05,
1.1522, 1.0561, 1.6795), adults_count = c(1.1793, 1.0821, 1.1156,
1.2565, 1.2742, 1.2283, 1.3237, 1.1494, 1.2904, 1.3525, 1.0814,
1.3644, 1.5781, 1.1816, 1.2604, 1.1732, 1.4088, 1.3959, 1.0959,
1.4726), children_count = c(0.2432, 0.0338, 0.1573, 0.0517, 0.0769,
0.0365, 0.1494, 0.0408, 0.1177, 0.128, 0.0579, 0.2749, 0.4045,
0.0823, 0.0943, 0.0677, 0.2088, 0.3009, 0.0817, 0.2353), infants_count = c(0.0152,
0.0048, 0.0731, 0.0259, 0.0129, 0.0046, 0.0954, 0.014, 0.0141,
0.0152, 0.0121, 0.0667, 0.0365, 0.0174, 0.0679, 0.0111, 0.0441,
0.0818, 0.0313, 0.0446), meta_flight_type = c(0.2918, 0.4686,
0.1425, 0.43, 0.3924, 0.6575, 0.6349, 0.0583, 0.2747, 0.167,
0.6179, 0.22, 0.5573, 0.2165, 0.3623, 0.6272, 0.3853, 0.1468,
0.255, 0.4604), flight_kind = c(0.4528, 0.1379, 3.6497, 0.2331,
0.3969, 0.1519, 0.098, NA, 0.6111, NA, 0.1086, 0.1061, NA, NA,
0.8571, 1.3472, NA, 0.0243, 3.3273, 1.1279), service_class_id = c(2,
1.9952, 2, 2, 1.9986, 2, 2, 1.9977, 2, 1.9913, 1.9968, 1.9985,
1.9983, 2, 2, 1.9994, 2, 1.9979, 1.9986, 1.9939), UI_profit = c(249.9766,
210.7159, 121.1932, 46.7757, 202.5403, 58.3467, 35.375, 0, 63.4536,
0, 116.4613, 41.2356, 0, 0, 72.0427, 131.8692, 0, 24.3831, 75.1906,
53), leg_price = c(9807.4805, 23253.6651, 15805.3328, 6148.6305,
15574.0215, 11339.653, 5964.4419, 7846.2151, 6910.2812, 35607.4389,
23953.2572, 5411.9416, 9544.5809, 3568.1491, 9463.4491, 23276.3196,
8357.9574, 4977.6056, 13331.1196, 54673.0944), flight_duration_min = c(307.2136,
269.9225, 439.2894, 143.2841, 197.8477, 110.2875, 114.3542, NA,
173.47, NA, 236.4197, 160.9437, NA, NA, 216.9208, 624.4288, NA,
162.4991, 190.5408, 776.6839), trip_duration_min = c(504.257,
531.7625, 967.9167, 261.4497, 265.9794, 138.0625, 163.9792, NA,
325.6778, NA, 459.6784, 166.7464, NA, NA, 462.5097, 949.2419,
NA, 162.6241, 478.7249, 1346.6982), price_duration_min = c(27.8457,
78.404, 35.9824, 38.95, 56.0142, 102.8833, 49.24, NA, 33.841,
NA, 96.4814, 29.3607, NA, NA, 43.4476, 33.1768, NA, 28.893, 76.8556,
45.4329), days_to_flight = c(27.8068, 23.0823, 23.7821, 12.4188,
26.8415, 19.6586, 24.6713, 16.6704, 13.9125, 10.1796, 13.2141,
18.1858, 119.3786, 12.5782, 20.3807, 31.856, 37.4516, 6.9034,
21.6605, 43.7275), days_RT = c(12.8218, 8.904, 23.2507, 4.585,
8.5987, 13.0174, 7.6805, 6.4065, 4.219, 19.984, 11.874, 8.8732,
14.4032, 4.9503, 11.9996, 12.5172, 4.9677, 8.0309, 12.8996, 15.5516
), mobile_share = c(0.538, 0.5845, 0.7576, 0.5409, 0.5279, 0.6119,
0.6017, 0.5344, 0.5133, 0.7007, 0.7336, 0.7531, 0.5156, 0.6429,
0.7208, 0.6033, 0.7118, 0.8446, 0.6328, 0.6268), desktop_share = c(0.4559,
0.4155, 0.2424, 0.3556, 0.4687, 0.3881, 0.3983, 0.4656, 0.4757,
0.2993, 0.2626, 0.2382, 0.4844, 0.3571, 0.2792, 0.3924, 0.2882,
0.1519, 0.3643, 0.3732), iphone_share = c(0.2128, 0.2947, 0.1443,
0.3103, 0.3459, 0.3379, 0.1618, 0.2882, 0.2308, 0.4707, 0.2606,
0.4327, 0.1892, 0.277, 0.2453, 0.2805, 0.1853, 0.478, 0.1882,
0.3834), android_share = c(0.307, 0.2657, 0.6087, 0.2274, 0.1697,
0.2694, 0.4274, 0.2322, 0.2779, 0.2115, 0.4685, 0.3165, 0.3177,
0.361, 0.4755, 0.3196, 0.5176, 0.3668, 0.4432, 0.2414), multi_share = c(0.2888,
0.0676, 0.8825, 0.1078, 0.0807, 0.0365, 0.1411, 0.0292, 0.2229,
0.0412, 0.1354, 0.1619, 0.0972, 0.1538, 0.1585, 0.3809, 0.1324,
0.0902, 0.473, 0.211), CR_session_to_popup = c(0.1185, 0.1159,
0.0879, 0.2295, 0.1276, 0.2374, 0.1162, 0.1097, 0.1695, 0.1605,
0.1062, 0.2189, 0.0226, 0.2356, 0.166, 0.1383, 0.1118, 0.2994,
0.0874, 0.0467), CR_session_to_booking = c(0.1155, 0.1063, 0.0703,
0.2392, 0.1208, 0.2237, 0.1079, 0.1995, 0.1648, 0.1844, 0.082,
0.1826, 0.0313, 0.2339, 0.1472, 0.1141, 0.1118, 0.2515, 0.0739,
0.0548), corr_winter = c(0.2635, 0.1983, 0.2513, 0.1867, 0.106,
0.4188, 0.0534, 0.1589, 0.2498, 0.4775, 0.4858, 0.3605, 0.0688,
0.318, 0.3394, 0.223, 0.3281, 0.3985, 0.173, 0.112), corr_spring = c(0.3036,
0.2772, 0.2602, 0.2209, 0.3627, 0.1332, 0.4484, 0.2793, 0.2526,
0.506, 0.0814, 0.2088, 0.6824, 0.2407, 0.1407, 0.326, 0.3228,
0.0654, 0.0897, 0.3196), corr_summer = c(0.2673, 0.1791, 0.258,
0.2894, 0.2856, 0.099, 0.2358, 0.2793, 0.276, 0.0165, 0.2525,
0.2087, 0.2488, 0.4413, 0.2477, 0.2744, 0.3491, 0.2917, 0.5926,
0.0861), corr_autumn = c(0.1656, 0.3454, 0.2304, 0.3029, 0.2458,
0.349, 0.2625, 0.2826, 0.2216, 0, 0.1803, 0.222, 0, 0, 0.2722,
0.1766, 0, 0.2444, 0.1447, 0.4823), corr_BL = c(0.4759, 0.5444,
0.4952, 0.4392, 0.4586, 0.4146, 0.4011, 0.4722, 0.4244, 0.4542,
0.4742, 0.4467, 0.4652, 0.4293, 0.4412, 0.4423, 0.4811, 0.4583,
0.496, 0.4882), corr_UP = c(0.5241, 0.4556, 0.5048, 0.5608, 0.5414,
0.5854, 0.5989, 0.5278, 0.5756, 0.5458, 0.5258, 0.5533, 0.5348,
0.5707, 0.5588, 0.5577, 0.5189, 0.5417, 0.504, 0.5118), pam_german.clustering = c(1L,
2L, 2L, 3L, 4L, 4L, 5L, 6L, 1L, 7L, 7L, 8L, 6L, 9L, 8L, 9L, 9L,
8L, 2L, 2L)), class = "data.frame", row.names = c(NA, -20L))
pam_german.clustering is the number of the cluster in which the observation is belong (row)
How for all variable from session_price to corr_UP between all clusters to draw a histogram of the distribution of variables?
I only learn ggplot2, so can't do it self. But to explain what result i need , i can draw using paint.
For session price histogram between cluster
session price
then for flight_type histogram between clusters
flight type
and so on for each variable in dataset.
How using ggplot2 get histogram, as I need?

Your desired plots don't make sense. A histogram always has count or density on the y axis, but you have price. If you want one histogram per cluster, you need 5 different panels side-by side, but price would still be on the x axis of each. There aren't really enough data points in your example data to create this many histograms (one cluster only has a single point).
Data in this structure is normally best shown with a boxplot:
clys %>%
ggplot(aes(x = factor(pam_german.clustering), y = session_price)) +
geom_boxplot(aes(fill = factor(pam_german.clustering))) +
scale_fill_viridis_d() +
theme_light(base_size = 16) +
labs(x = 'Cluster', y = 'Session Price') +
guides(fill = guide_none())
Or perhaps you are wanting columns of averages per cluster with an error bar representing the range?
library(tidyverse)
clys %>%
group_by(pam_german.clustering) %>%
summarize(max = max(session_price),
min = min(session_price),
session_price = mean(session_price),
cluster = factor(mean(pam_german.clustering))) %>%
ggplot(aes(x = cluster, y = session_price, fill = session_price)) +
geom_col() +
geom_errorbar(aes(ymin = min, ymax = max), width = 0.5, size = 0.2) +
scale_fill_viridis_c(option = 7) +
theme_light(base_size = 16) +
labs(y = 'Session Price') +
guides(fill = guide_none())
Certainly, a set of histograms is possible, but really doesn't work very well with this data set due to the lack of data points, and trying to fit too many facets across a single dimension of the plot:
clys %>%
ggplot(aes(x = session_price)) +
geom_histogram() +
facet_grid(.~pam_german.clustering, scales = 'free_x') +
theme_light(base_size = 16)

Related

R: mapped_discrete` objects can only be created from numeric vectors

I have the following data in R:
df <- structure(list(t0 = c(3.82, -4.88, NA, -3.83, -3.99, NA, NA,
NA, 6.35, 2.47, 0.28, 0.3, NA, 8.31, NA, NA, NA, 2.76, NA, 1.38
), t1 = c(NA, NA, NA, NA, NA, NA, -1.23, 2.19, 4.13, 3.49, -0.42,
NA, 3.78, 2.7, 1.17, NA, NA, NA, NA, NA), t2 = c(-1.85, NA, 1.46,
0.17, NA, NA, -2.81, 1.75, NA, 2.32, -3.08, -1.39, NA, 7.53,
1.77, NA, 0.1, NA, NA, -2.61), t3 = c(-2.05, 3.73, -2.04, -0.22,
-4.29, NA, NA, -0.11, 0.43, NA, -0.78, 3.24, NA, NA, -1.13, 1.09,
NA, NA, 2.7, NA), t4 = c(1.01, -2.77, NA, -3.05, -2.33, 3.78,
NA, NA, NA, NA, -2.04, -4.01, -2.32, 4, -0.28, NA, NA, 9.04,
NA, -4.12), t5 = c(1.56, NA, 4.89, NA, NA, NA, NA, NA, 0.88,
3.15, NA, NA, 2.59, NA, 2.04, NA, NA, NA, -0.26, NA), t6 = c(0.34,
-0.99, NA, 1.93, NA, NA, NA, NA, 0.35, NA, -6.46, NA, NA, NA,
2.57, NA, NA, 4.89, NA, -5.63), t7 = c(0.52, NA, 0.5, 1.85, -6.23,
NA, NA, 1.59, 7.82, 0.82, NA, NA, -1.77, NA, NA, NA, 2.01, NA,
0.7, -1.55), t8 = c(NA, NA, 4.9, -3.93, -8.13, 3.14, 0.03, 1.67,
3.55, NA, -1.55, 2.57, -0.87, NA, 0.71, -0.1, NA, NA, 2.04, NA
), t9 = c(-1.09, NA, -0.52, NA, NA, NA, NA, NA, NA, 2.05, -5.21,
-0.89, -0.03, NA, 0.66, 3.72, -1.96, NA, NA, NA)), row.names = c(NA,
20L), class = "data.frame")
Using the following tutorial (https://jenslaufer.com/data/analysis/visualize_missing_values_with_ggplot.html), I am trying to make a visualization that shows the percentage of missing data:
library(dplyr)
library(ggplot2)
library(tidyverse)
row.plot <- df %>%
mutate(id = row_number()) %>%
gather(-id, key = "key", value = "val") %>%
mutate(isna = is.na(val)) %>%
ggplot(aes(key, id, fill = isna)) +
geom_raster(alpha=0.8) +
scale_fill_manual(name = "",
values = c('steelblue', 'tomato3'),
labels = c("Present", "Missing")) +
scale_x_discrete(limits = levels) +
labs(x = "Variable",
y = "Row Number", title = "Missing values in rows") +
coord_flip()
When I try to see the results, this is the error that I get:
row.plot
Error in `new_mapped_discrete()`:
! `mapped_discrete` objects can only be created from numeric vectors
Run `rlang::last_error()` to see where the error occurred.
Warning messages:
1: In structure(in_domain, pos = match(in_domain, breaks)) :
Calling 'structure(NULL, *)' is deprecated, as NULL cannot have attributes.
Consider 'structure(list(), *)' instead.
2: In structure(in_domain, pos = match(in_domain, breaks)) :
Calling 'structure(NULL, *)' is deprecated, as NULL cannot have attributes.
Consider 'structure(list(), *)' instead.
3: Removed 200 rows containing missing values (geom_raster).
My Question: Can someone please show me what I am doing wrong and how can I fix this error? In the end, I would like to get this kind of picture:
The error is caused by scale_x_discrete.
You don't need it because in your example id is numeric and doesn't have levels as a factor would:
df %>%
mutate(id = row_number()) %>%
gather(-id, key = "key", value = "val") %>%
mutate(isna = is.na(val)) %>%
ggplot(aes(key, id, fill = isna)) +
geom_raster(alpha=0.8) +
scale_fill_manual(name = "",
values = c('steelblue', 'tomato3'),
labels = c("Present", "Missing")) +
#scale_x_discrete(limits = levels)
labs(x = "Variable",
y = "Row Number", title = "Missing values in rows") +
coord_flip()
It looks like you were wanting to produce this plot for missing data on each row rather than for each variable (though I've provided both here). The main issue is that levels is not provided, so we can create that here, then provide as a factor to scale_x_discrete.
library(tidyverse)
output <- df %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = "key", values_to = "val") %>%
select(-key) %>%
group_by(id) %>%
mutate(isna = is.na(val),
total = n()) %>%
group_by(id, total, isna) %>%
summarise(num.isna = n()) %>%
mutate(pct = num.isna / total * 100)
levels <- output %>% filter(isna == T) %>% arrange(desc(pct)) %>% pull(id)
row.plot <- output %>%
ggplot() +
geom_bar(aes(
x = reorder(id, desc(pct)),
y = pct,
fill = isna
),
stat = 'identity',
alpha = 0.8) +
scale_x_discrete(limits = factor(levels)) +
scale_fill_manual(
name = "",
values = c('steelblue', 'tomato3'),
labels = c("Present", "Missing")
) +
coord_flip() +
labs(title = "Percentage of missing values", x =
'Row Number', y = "% of missing values")
Output
Or if you want to do it by variable, then:
output <- df %>%
pivot_longer(everything(), names_to = "key", values_to = "val") %>%
group_by(key) %>%
mutate(isna = is.na(val),
total = n()) %>%
group_by(key, total, isna) %>%
summarise(num.isna = n()) %>%
mutate(pct = num.isna / total * 100)
levels <- output %>% filter(isna == T) %>% arrange(desc(pct)) %>% pull(key)
row.plot <- output %>%
ggplot() +
geom_bar(aes(
x = reorder(key, desc(pct)),
y = pct,
fill = isna
),
stat = 'identity',
alpha = 0.8) +
scale_x_discrete(limits = levels) +
scale_fill_manual(
name = "",
values = c('steelblue', 'tomato3'),
labels = c("Present", "Missing")
) +
coord_flip() +
labs(title = "Percentage of missing values", x =
'Variable', y = "% of missing values")
Output
When I run the code from your tutorial with your data, there is no error. Maybe you want something like this:
library(tidyverse)
missing.values <- df %>%
gather(key = "key", value = "val") %>%
mutate(isna = is.na(val)) %>%
group_by(key) %>%
mutate(total = n()) %>%
group_by(key, total, isna) %>%
summarise(num.isna = n()) %>%
mutate(pct = num.isna / total * 100)
levels <- (missing.values %>% filter(isna == T) %>% arrange(desc(pct)))$key
percentage.plot <- missing.values %>%
ggplot() +
geom_bar(aes(x = reorder(key, desc(pct)), y = pct, fill=isna), stat = 'identity', alpha=0.8, width = 1) +
scale_x_discrete(limits = levels) +
scale_fill_manual(name = "", values = c('goldenrod3', 'firebrick3'), labels = c("Present", "Missing")) +
coord_flip() +
labs(title = "Percentage of missing values", x = 'Variable', y = "% of missing values") +
theme_bw() +
theme(panel.grid = element_blank(),
panel.border = element_blank())
Output:

Ordering matrix plot using ggplot2

I am trying to plot a matrix plot using ggplot2. I am using the following code
library(tidyverse)
library(RColorBrewer)
df %>%
mutate(Models = factor(Models, labels = c("NDVI","SR","WBI","NWI-1","NWI-2","NWI-3","NWI-4","1650/2220 nm ratio"))) %>%
pivot_longer(-Models) %>%
mutate(p.value = cut(value, c(max(value, na.rm = T), 0.05, 0.01, min(value, na.rm = T)),
labels = c("NS","< 0.05","< 0.01"))) %>%
ggplot(aes(x=Models,y=name, fill=p.value)) +
theme_bw() +
geom_tile() +
xlab("Parameters") + ylab(" ") +
theme(text=element_text(size=18, family="serif"))+
scale_colour_manual(values = c("#DAA520", "#F5DEB3", "#FFF8DC","#DCDCDC"),
aesthetics = c("colour", "fill")) +
geom_text(aes(label=format(round(value, 2), nsmall = 2)), color="black", size=2)
which returns me the following plot
As you can see from the plot the x-axis labels are ordered according to my order. But I am unable to order y-axis. So, my questions are
How can I order y-axis? and
How to remove the NAs?
Only colour values < 0.05 and < 0.01 and > 0.05, not all.
Data
df = structure(list(Models = c("NDVI", "SR", "WBI", "NWI-1", "NWI-2",
"NWI-3", "NWI-4", "1650/2220 nm ratio"), NDVI = c(NA, 0.008,
0.017, 0.58, 0.02, 0.035, 0.067, 0.027), SR = c(NA, NA, 0.203,
0.542, 0.618, 0.825, 0.007, 0.015), WBI = c(NA, NA, NA, 0.506,
0.438, 0.086, 0.035, 0.067), `NWI-1` = c(NA, NA, NA, NA, 0.912,
0.698, 0.868, 0.319), `NWI-2` = c(NA, NA, NA, NA, NA, 0.782,
0.956, 0.268), `NWI-3` = c(NA, NA, NA, NA, NA, NA, 0.825, 0.166
), `NWI-4` = c(NA, NA, NA, NA, NA, NA, NA, 0.052), `1650/2220.nm.ratio` = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
)), row.names = c(NA, 8L), class = "data.frame")
The output should look like the following
Try the following :
Get the data in long format first and then change both Models and column names to factor in different order.
library(tidyverse)
fac_levels <- c("NDVI","SR","WBI","NWI-1","NWI-2","NWI-3","NWI-4","1650/2220 nm ratio")
df %>%
pivot_longer(-Models, values_drop_na = TRUE) %>%
mutate(Models = factor(Models, levels = fac_levels),
name = factor(name, levels = rev(fac_levels)),
p.value = cut(value, c(max(value, na.rm = T), 0.05, 0.01, min(value, na.rm = T)),
labels = c("NS","< 0.05","< 0.01"))) %>%
ggplot(aes(x=Models,y=name, fill=p.value)) +
theme_bw() +
geom_tile() +
xlab("Parameters") + ylab(" ") +
theme(text=element_text(size=18, family="serif"))+
scale_colour_manual(values = c("#DAA520", "#F5DEB3", "#FFF8DC","#DCDCDC"),
aesthetics = c("colour", "fill")) +
geom_text(aes(label=format(round(value, 2), nsmall = 2)), color="black", size=2) +
scale_x_discrete(drop=FALSE) +
scale_y_discrete(drop=FALSE)

Forest plot with table ggplot coding

I am trying to get a table side by side with my forest plot but I am having a lot of trouble doing so.
I am able to make a forest plot with the following code:
###dataframe
###dataframe
library(ggplot2)
library(tidyr)
library(grid)
library(gridExtra)
library(forcats)
forestdf <- structure(list(labels = structure(1:36, .Label = c("Age*", "Sex – male vs. female",
"Body-mass index*,1 ", "Systolic blood pressure*", "Race - vs. white",
"Asian", "Black", "Townsend deprivation index", "Social habit",
"Smoking - vs. never", "Previous", "Current", "Alcohol use - vs. never",
"Once or twice a week", "Three or four times a week", "Daily or almost daily",
"Comorbidity", "Cancer", "Diabetes", "Chronic obstructive pulmonary disease2",
"Asthma", "Ischemic heart disease3", "Hypothyroidism", "Hypercholesterolemia",
"Allergic rhinitis", "Depression", "Serology", "White blood cell count",
"Red blood cell count", "Hemoglobin concentration", "Mean corpuscular volume",
"Mean corpuscular hemoglobin concentration", "Platelet count",
"Lymphocyte count", "Monocyte count", "Neutrophil count"), class = "factor"),
rr = c(1.18, 1.45, 1.76, 0.98, NA, 2.16, 2.65, 1.09, NA,
NA, 1.35, 1.15, NA, 0.73, 0.63, 0.63, NA, 1.23, 1.34, 1.51,
1.12, 1.46, 0.96, 1.1, 1.18, 1.38, NA, 1.03, 0.87, 0.93,
1, 0.94, 1, 1.03, 1.17, 1.06), rrhigh = c(1.08, 1.28, 1.57,
0.95, NA, 1.63, 2.03, 1.07, NA, NA, 1.18, 0.94, NA, 0.58,
0.49, 0.5, NA, 0.99, 1.08, 1.09, 0.93, 1.15, 0.71, 0.92,
0.91, 1.1, NA, 1.02, 0.73, 0.87, 0.99, 0.88, 1, 1.01, 1.03,
1.01), rrlow = c(1.28, 1.64, 1.97, 1.02, NA, 2.86, 3.44,
1.11, NA, NA, 1.55, 1.42, NA, 0.9, 0.79, 0.81, NA, 1.53,
1.66, 2.09, 1.34, 1.85, 1.3, 1.31, 1.52, 1.74, NA, 1.04,
1.03, 0.98, 1.01, 1.01, 1, 1.05, 1.32, 1.1)), class = "data.frame", row.names = c(NA,
-36L))
forestdf$labels <- factor(forestdf$labels,levels = forestdf$labels)
levels(forestdf$labels) 1.52, 1.74, NA, 1.04, 1.03, 0.98, 1.01, 1.01, 1, 1.05, 1.32,
#forestplot
p <- ggplot(forestdf, aes(x=rr, y=labels, xmin=rrlow, xmax=rrhigh))+
geom_pointrange(shape=22, fill="black")+
geom_vline(xintercept = 1, linetype=3)+
xlab("Variable")+ylab("Adjusted Relative Risk with 95% Confidence Interval")+theme_classic()+scale_y_discrete(limits = rev(labels))+
scale_x_log10(limits = c(0.25, 4), breaks = c(0.25, 0.5, 1, 2, 4), labels=c("0.25", "0.5", "1", "2", "4"), expand = c(0,0))
p
However, I cannot get the left panel with labels to work:
#dataframe for table
fplottable <- structure(list(labels = structure(c(1L, 30L, 7L, 33L, 27L, 4L,
6L, 35L, 32L, 31L, 26L, 11L, 2L, 24L, 34L, 12L, 10L, 8L, 14L,
9L, 5L, 18L, 17L, 16L, 3L, 13L, 29L, 36L, 28L, 15L, 21L, 20L,
25L, 19L, 22L, 23L), .Label = c("Age*", "Alcohol use - vs. never",
"Allergic rhinitis", "Asian", "Asthma", "Black", "Body-mass index*,1 ",
"Cancer", "Chronic obstructive pulmonary disease2", "Comorbidity",
"Current", "Daily or almost daily", "Depression", "Diabetes",
"Hemoglobin concentration", "Hypercholesterolemia", "Hypothyroidism",
"Ischemic heart disease3", "Lymphocyte count", "Mean corpuscular hemoglobin concentration",
"Mean corpuscular volume", "Monocyte count", "Neutrophil count",
"Once or twice a week", "Platelet count", "Previous", "Race - vs. white",
"Red blood cell count", "Serology", "Sex – male vs. female",
"Smoking - vs. never", "Social habit", "Systolic blood pressure*",
"Three or four times a week", "Townsend deprivation index", "White blood cell count"
), class = "factor"), No..of.Events = c(1073L, 581L, 1061L, 1031L,
NA, 57L, 68L, 1072L, NA, NA, 442L, 117L, NA, 262L, 191L, 172L,
NA, 96L, 107L, 41L, 146L, 86L, 52L, 170L, 66L, 84L, NA, 1009L,
1009L, 1009L, 1009L, 1009L, 1009L, 1005L, 1005L, 1005L), ARR..95..CI. = c("1.18 (1.08-1.28)",
"1.45 (1.28-1.64)", "1.76 (1.57-1.97)", "0.98 (0.95-1.02)", "",
"2.16 (1.63-2.86)", "2.65 (2.03-3.44)", "1.09 (1.07-1.11)", "",
"", "1.35 (1.18-1.55)", "1.15 (0.94-1.42)", "", "0.73 (0.58-0.90)",
"0.63 (0.49-0.79)", "0.63 (0.50-0.81)", "", "1.23 (0.99-1.53)",
"1.34 (1.08-1.66)", "1.51 (1.09-2.09)", "1.12 (0.93-1.34)", "1.46 (1.15-1.85)",
"0.96 (0.71-1.30)", "1.10 (0.92-1.31)", "1.18 (0.91-1.52)", "1.38 (1.10-1.74)",
"", "1.03 (1.02-1.04)", "0.87 (0.73-1.03)", "0.93 (0.87-0.98)",
"1.00 (0.99-1.01)", "0.94 (0.88-1.01)", "1.00 (1.00-1.00)", "1.03 (1.01-1.05)",
"1.17 (1.03-1.32)", "1.06 (1.01-1.10)")), class = "data.frame", row.names = c(NA,
-36L))
###NOT WORKING CODE THAT TRIES TO MAKE TABLE LEFT OF FOREST PLOT
data_table <- geom_text(data=fplottable,aes(y=labels)) +
geom_text(label=eventnum) +
geom_text(label=arr)
data_table
grid.arrange(data_table,p, ncol=2)
I am drawing inspiration from:
Reproduce table and plot from journal and trying to get something similar to what is shown in the forest plot with the pink boxes
There were a few issues as #efz pointed out. In addition, you need to refactor the labels in your second column to allow them to match up with those in your first. It's probably going to look messy with the y axis labels and title alongside the table, so these could be removed too.
That leaves you something like:
forestdf$colour <- rep(c("white", "gray95"), 18)
p <- ggplot(forestdf, aes(x = rr, y = labels, xmin = rrlow, xmax = rrhigh)) +
geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
geom_pointrange(shape = 22, fill = "black") +
geom_vline(xintercept = 1, linetype = 3) +
xlab("Variable") +
ylab("Adjusted Relative Risk with 95% Confidence Interval") +
theme_classic() +
scale_colour_identity() +
scale_y_discrete(limits = rev(forestdf$labels)) +
scale_x_log10(limits = c(0.25, 4),
breaks = c(0.25, 0.5, 1, 2, 4),
labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
theme(axis.text.y = element_blank(), axis.title.y = element_blank())
names(fplottable) <- c("labels", "eventnum", "arr")
fplottable$labels <- factor(fplottable$labels, rev(levels(forestdf$labels)))
fplottable$colour <- rep(c("white", "gray95"), 18)
data_table <- ggplot(data = fplottable, aes(y = labels)) +
geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
geom_text(aes(x = 0, label = labels), hjust = 0) +
geom_text(aes(x = 5, label = eventnum)) +
geom_text(aes(x = 7, label = arr), hjust = 1) +
scale_colour_identity() +
theme_void() +
theme(plot.margin = margin(5, 0, 35, 0))
grid.arrange(data_table,p, ncol = 2)
You can simplify further by merging the two dataframes as fdf <- full_join(forestdf, fplottable, by = "labels") and running your p on fdf. Then p + geom_text(aes(x=22, label=paste(" ", arr," ",eventum, sep=' '))) will give the following output: output
Obviously, limits need to be expanded to 100 to include the table, and the full code is below:
p <- ggplot(fdf, aes(x=rr, y=labels, xmin=rrlow, xmax=rrhigh))+
geom_pointrange(shape=22, fill="black") +
geom_vline(xintercept = 1, linetype=3) +
xlab("Variable")+ylab("Adjusted Relative Risk with 95% Confidence Interval") +
theme_bw() +
#scale_y_discrete(limits = rev(labels))+
scale_x_log10(limits = c(0.25, 100),
breaks = c(0.25, 0.5, 1, 2, 4, 100),
labels=c("0.25", "0.5", "1", "2", "4", ""),
expand = c(0,0)
)+
geom_text(aes(x=22, label=paste(" ", arr," ",eventum, sep=' ')))
p
supposing
names(fplottable)<-c('labels','eventum','arr')
then there are a few issues with the code for data_table. If I understood correctly you meant something like:
data_table <- ggplot(data=fplottable)+geom_text(aes(x= 1, y=labels, label=arr))+geom_text(aes(x= 1.5, y=labels, label=eventum)).
You can play with the value of x and have only one geom_text where label=paste(arr, eventum, sep=' ')
in this case the command grid.arrange(data_table,p, ncol=2) seems to work fine. You can define the space of each panel with width.

Ylim max to change dynamically with a variable, while min is set to 0 in R

I would like my graphs to start at y= 0, but I would like the maximum to change with a multiple of the data, or somehow otherwise zoom out dynamically. I have 34 charts in this set with various ymax.
I have tried scale_y_continuous and coord_cartesian but when I try to put in the expand = expand_scale(mult = 2) that works for getting my maximum to change dynamically, but then the graphs start to start at negative numbers, and I want them to start at 0.
title<- c(
"Carangidae",
"Atlantic cutlassfish",
"Lizardfish",
"Sharks",
"Mackerel")
#DATA#
biomass<- structure(list(timestep = structure(c(10957, 10988, 11017, 11048,
11078, 11109, 11139, 11170, 11201, 11231, 11262, 11292), class = "Date"),
bio_pre_Carangidae = c(0.01105, 0.0199, 0.017,
0.01018, 0.0119, 0.0101, 0.009874, 0.009507,
0.009019, 0.00843, 0.00841, 0.00805), bio_obs_Carangidae = c(NA,
NA, NA, NA, NA, 0.00239, NA, NA, NA, NA, NA, NA), bio_pre_Atl_cutlassfish = c(0.078,
0.069, 0.067, 0.06872, 0.0729, 0.0769,
0.0775, 0.075, 0.0743, 0.072, 0.071,
0.069), bio_obs_Atl_cutlassfish = c(NA, NA, NA, NA, NA,
0.0325, NA, NA, NA, NA, NA, NA), bio_pre_lizardfish = c(0.0635,
0.062, 0.057, 0.0536, 0.0505, 0.0604,
0.0627, 0.068, 0.0695, 0.066, 0.0623,
0.0598), bio_obs_lizardfish = c(NA, NA, NA, NA, NA, 0.037,
NA, NA, NA, NA, NA, NA), bio_pre_sharks = c(0.025, 0.0155,
0.0148, 0.0135, 0.01379, 0.01398, 0.014,
0.0139, 0.0136, 0.0132, 0.0126, 0.011),
bio_obs_sharks = c(NA, NA, NA, NA, NA, 0.003, NA, NA,
NA, NA, NA, NA), bio_pre_mackerel = c(0.0567, 0.0459,
0.0384, 0.03, 0.0328, 0.0336, 0.0299,
0.0296, 0.02343, 0.02713, 0.0239, 0.019
), bio_obs_mackerel = c(NA, NA, NA, NA, NA, 0.055, NA,
NA, NA, NA, NA, NA)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
This is my function:
function (biomass, .var1, .var2, .var3) {
p <- ggplot(biomass, aes(x = timestep)) +
geom_line(aes(y = .data[[.var1]], linetype = "Predicted")) + geom_point(size = 3, aes(y = .data[[.var2]], shape = "Observed")) +
ggtitle(paste0(.var3)) +
ylab(expression("biomass" ~ (t/km^2))) +
theme_classic() +
scale_y_continuous(limits = c(0, NA), expand = expand_scale(mult = 2))+
###This is the portion where I cannot figure out how to set ymin = 0 and then ymax to 2* the maximum value of a dataset.##
theme(legend.position = "right") +
theme(axis.ticks = element_line(size = 1), axis.ticks.length = unit(0.25, "cm"))
return(p)
}
## create two separate name vectors
var1_names <- colnames(biomass)[grepl("^bio_pre", colnames(biomass))]
var2_names <- colnames(biomass)[grepl("^bio_obs", colnames(biomass))]
var3_names <- title
## loop through two vectors simultaneously and save result in a list
# ..1 = var1_names, ..2 = var2_names
my_plot_b <- pmap(list(var1_names, var2_names, var3_names), ~ my_bio_plot(biomass, ..1, ..2, ..3))
## merge plots together
# https://cran.r-project.org/web/packages/cowplot/
# install.packages("cowplot", dependencies = TRUE)
dev.new(title = "Model Fit Biomass",
width = 12,
height = 6,
noRStudioGD = TRUE
)
print(my_plot_b)
I can manage to get EITHER a set ymin=0 (a) OR a dynamic ymax (b) but cannot manage to get both.
a
b
How about this? Seems to work on your data.
Define the max for each chart at the top of your function:
my_bio_plot <- function (biomass, .var1, .var2, .var3) {
max_y = 2.0 * max(biomass[[.var1]])
...
scale_y_continuous(limits = c(0, max_y)) +
...
This seems to create the requested output, with min y = 0 and max y = 2 * max y in data.
Updated to add a substantially different approach from yours:
biomass %>%
gather(species, bio, -timestep) %>%
mutate(type = ifelse(stringr::str_detect(species, 'pre'), 'predicted', 'observed'),
species = gsub(".*_", "", species)) %>%
group_by(species) %>%
mutate(ul = max(bio, na.rm = TRUE) * 2) %>%
filter(species == "sharks") -> df
df %>%
ggplot(aes(timestep, bio, group = type)) +
geom_point(aes(shape = type)) +
geom_line(aes(linetype = type)) +
# facet_wrap(~species) +
scale_linetype_manual(name = "",
values = c("blank", 'solid')) +
scale_shape_manual(name = "",
values = c(19, NA))+
scale_y_continuous(limits = c(0, max(df$ul)))
You could remove the filter(species == "sharks") and uncomment thefacet_wrap(~species) and you will get all the species plotted at the same time.

Remove margin inside plot pf ggplot2

this is my script and the associated plot:
library(ggplot2)
library(reshape)
df <- structure(list(ID = structure(1:19, .Label = c("2818/22/0834",
"2818/22/0851", "2818/22/0853", "2818/22/0886", "B0F", "B12T",
"B1T", "B21T", "B22F", "B26T", "B33F", "B4F", "P1", "P21", "P24",
"P25", "P27", "P28", "P29"), class = "factor"), K = c(0.089,
0.094, 0.096, 0.274, 0.09, 0.312, 0.33, 0.178, 0.05, 0.154, 0.083,
0.098, 0.035, 0.084, 0.053, 0.061, 0.043, 0.094, 0.101), Na = c(2.606,
3.822, 4.977, 2.522, 15.835, 83.108, 52.041, 41.448, 11.849,
40.531, 5.854, 10.151, 3.52, 8.445, 5.273, 7.246, 6.177, 14.813,
15.569), Cl = c(3.546, 6.181, 8.422, 3.733, 14.685, 96.911, 65.518,
79.01, 10.349, 53.361, 6.12, 10.832, 2.313, 10.312, 5.641, 8.708,
6.138, 12.302, 20.078), Mg = c(1.487, 1.773, 1.992, 1.143, 2.991,
1.678, 2.23, 3.288, 1.148, 2.428, 3.428, 2.729, 0.777, 2.554,
2.374, 4.075, 1.993, 1.881, 3.034), Ca = c(5.529, 6.205, 6.59,
4.099, 10.631, 4.564, 6.652, 13.374, 4.332, 10.542, 11.194, 10.053,
2.969, 7.73, 8.163, 11.539, 6.166, 5.968, 9.299), SO4 = c(0.663,
0.831, 0.607, 0.882, 9.013, 0.896, 0.652, 0.021, 1.446, 0.012,
8.832, 6.665, 1.003, 2.575, 3.685, 7.121, 3.64, 5.648, 2.397),
HCO3 = c(7.522, 5.498, 6.15, 5.242, 8.582, 4.067, 5.65, 9.364,
5.435, 8.068, 9.054, 8.326, 4.805, 7.235, 7.488, 9.234, 6.352,
6.98, 8.34)), .Names = c("ID", "K", "Na", "Cl", "Mg", "Ca",
"SO4", "HCO3"), class = "data.frame", row.names = c(NA, -19L))
df_melted<-melt(df, na.rm=T)
ggplot(df_melted, aes(variable, value, group=ID, color=ID))+
geom_point(size=2) +
geom_line() +
theme(legend.position="none") +
scale_y_log10(breaks=seq(0, 100, 10))
Is there a way to remove the spaces at the beginning and at the end of the plot? I tried with xlim but the problem is that the x variable is not a numerical variable, so, something like xlim(c("K", "HCO3")) doesn't work.
This is a discrete scale, but you can still used the expand argument as follows. Whether the output looks acceptable or not is another matter. Play with the c(0,0) values until you find something that suits. Using 0.1 for the second value gives a slightly better plot, in my view...
ggplot(df_melted, aes(variable, value, group=ID, color=ID))+
geom_point(size=2) +
geom_line() +
theme(legend.position="none") +
scale_y_log10(breaks=seq(0, 100, 10)) +
scale_x_discrete(expand = c(0,0))

Resources