Combine data into one long column using melt - r

Sorry if this is a duplicate question, but I couldn't find an answer to it.
I have a data frame which is currently:
Depth Year Counts Depth.1 Year.1 Counts.1 Depth.2 Year.2 Counts.2
etc
I want to create a data frame that is instead just Depth Year and Count in 3 separate columns.
I thought that the following would work, but it doesn't. Does anyone know where I am going wrong?
library(data.table)
A <- melt(df, id.vars="Depth","Year","Counts")
structure(list(Depth = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5), Year = c(2001.539328,
2001.57432, 2001.609313, 2001.644306, 2001.679298, 2001.714291
), Counts = c(2.87e-06, 3.56e-06, 4.38e-06, 5.36e-06, 6.52e-06,
7.94e-06), Depth.1 = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5), Year.1 = c(1984.293532,
1984.364339, 1984.435146, 1984.505954, 1984.576761, 1984.647568
), Counts.1 = c(1.46e-06, 1.81e-06, 2.22e-06, 2.73e-06, 3.34e-06,
4.07e-06), Depth.2 = c(2.5, 2.5, 2.5, 2.5, 2.5, 2.5), Year.2 = c(1981.470822,
1981.544366, 1981.61791, 1981.691454, 1981.764998, 1981.838542
), Counts.2 = c(1.59e-06, 1.98e-06, 2.45e-06, 3.01e-06, 3.7e-06,
4.53e-06), Depth.3 = c(3.5, 3.5, 3.5, 3.5, 3.5, 3.5), Year.3 = c(1977.871611,
1977.945227, 1978.018842, 1978.092458, 1978.166074, 1978.239689
), Counts.3 = c(1.52e-06, 1.9e-06, 2.34e-06, 2.89e-06, 3.56e-06,
4.35e-06), Depth.4 = c(4.5, 4.5, 4.5, 4.5, 4.5, 4.5), Year.4 = c(1973.91751,
1973.996106, 1974.074703, 1974.1533, 1974.231896, 1974.310493
), Counts.4 = c(2.68e-06, 3.35e-06, 4.17e-06, 5.2e-06, 6.44e-06,
7.93e-06), Depth.5 = c(5.5, 5.5, 5.5, 5.5, 5.5, 5.5), Year.5 = c(1968.953544,
1969.036898, 1969.120252, 1969.203607, 1969.286961, 1969.370316
), Counts.5 = c(1.17e-06, 1.47e-06, 1.84e-06, 2.28e-06, 2.81e-06,
3.47e-06), Depth.6 = c(6.5, 6.5, 6.5, 6.5, 6.5, 6.5), Year.6 = c(1962.314375,
1962.408079, 1962.501784, 1962.595488, 1962.689193, 1962.782897
), Counts.6 = c(1.5e-06, 1.95e-06, 2.5e-06, 3.18e-06, 4.02e-06,
5.09e-06), Depth.7 = c(7.5, 7.5, 7.5, 7.5, 7.5, 7.5), Year.7 = c(1958.713624,
1958.805065, 1958.896505, 1958.987945, 1959.079385, 1959.170826
), Counts.7 = c(1.12e-06, 1.43e-06, 1.8e-06, 2.25e-06, 2.8e-06,
3.49e-06), Depth.8 = c(8.5, 8.5, 8.5, 8.5, 8.5, 8.5), Year.8 = c(1951.664415,
1951.763029, 1951.861644, 1951.960258, 1952.058872, 1952.157487
), Counts.8 = c(1.03e-06, 1.31e-06, 1.66e-06, 2.09e-06, 2.61e-06,
3.23e-06), Depth.9 = c(9.5, 9.5, 9.5, 9.5, 9.5, 9.5), Year.9 = c(1945.090332,
1945.195825, 1945.301319, 1945.406813, 1945.512306, 1945.6178
), Counts.9 = c(9.86e-07, 1.26e-06, 1.6e-06, 2.01e-06, 2.51e-06,
3.13e-06), Depth.10 = c(10.5, 10.5, 10.5, 10.5, 10.5, 10.5),
Year.10 = c(1935.501068, 1935.619252, 1935.737435, 1935.855618,
1935.973801, 1936.091984), Counts.10 = c(1.65e-06, 2.1e-06,
2.65e-06, 3.32e-06, 4.15e-06, 5.17e-06), Depth.11 = c(11.5,
11.5, 11.5, 11.5, 11.5, 11.5), Year.11 = c(1925.293378, 1925.407495,
1925.521611, 1925.635728, 1925.749844, 1925.863961), Counts.11 = c(9.04e-07,
1.13e-06, 1.4e-06, 1.72e-06, 2.1e-06, 2.58e-06), Depth.12 = c(12.5,
12.5, 12.5, 12.5, 12.5, 12.5), Year.12 = c(1915.470281, 1915.590233,
1915.710185, 1915.830138, 1915.95009, 1916.070042), Counts.12 = c(1.18e-06,
1.45e-06, 1.78e-06, 2.17e-06, 2.65e-06, 3.23e-06), Depth.13 = c(13.5,
13.5, 13.5, 13.5, 13.5, 13.5), Year.13 = c(1907.029774, 1907.144991,
1907.260209, 1907.375426, 1907.490644, 1907.605861), Counts.13 = c(1.33e-06,
1.68e-06, 2.11e-06, 2.62e-06, 3.24e-06, 4.02e-06), Depth.14 = c(14.5,
14.5, 14.5, 14.5, 14.5, 14.5), Year.14 = c(1896.291234, 1896.410534,
1896.529835, 1896.649135, 1896.768436, 1896.887736), Counts.14 = c(8.64e-07,
1.1e-06, 1.39e-06, 1.75e-06, 2.2e-06, 2.74e-06), Depth.15 = c(15.5,
15.5, 15.5, 15.5, 15.5, 15.5), Year.15 = c(1889.864627, 1889.969479,
1890.074332, 1890.179184, 1890.284037, 1890.388889), Counts.15 = c(1.05e-06,
1.36e-06, 1.75e-06, 2.22e-06, 2.81e-06, 3.55e-06), Depth.16 = c(16.5,
16.5, 16.5, 16.5, 16.5, 16.5), Year.16 = c(1886.325239, 1886.425704,
1886.526169, 1886.626634, 1886.727099, 1886.827564), Counts.16 = c(1.27e-06,
1.68e-06, 2.21e-06, 2.89e-06, 3.73e-06, 4.77e-06), Depth.17 = c(17.5,
17.5, 17.5, 17.5, 17.5, 17.5), Year.17 = c(1882.108412, 1882.108412,
1882.108412, 1882.108412, 1882.108412, 1882.108412), Counts.17 = c(1.38e-06,
1.86e-06, 2.47e-06, 3.24e-06, 4.22e-06, 5.45e-06), Depth.18 = c(18.5,
18.5, 18.5, 18.5, 18.5, 18.5), Year.18 = c(1864.178957, 1864.300768,
1864.422579, 1864.544389, 1864.6662, 1864.788011), Counts.18 = c(1.1e-06,
1.52e-06, 2.08e-06, 2.81e-06, 3.73e-06, 4.9e-06), Depth.19 = c(19.5,
19.5, 19.5, 19.5, 19.5, 19.5), Year.19 = c(1844.224683, 1844.373854,
1844.523025, 1844.672196, 1844.821367, 1844.970538), Counts.19 = c(1.05e-06,
1.52e-06, 2.18e-06, 3.07e-06, 4.25e-06, 5.78e-06), Depth.20 = c(20.5,
20.5, 20.5, 20.5, 20.5, 20.5), Year.20 = c(1826.063901, 1826.239023,
1826.414145, 1826.589268, 1826.76439, 1826.939512), Counts.20 = c(7.96e-07,
1.1e-06, 1.5e-06, 2.02e-06, 2.68e-06, 3.53e-06), Depth.21 = c(21.5,
21.5, 21.5, 21.5, 21.5, 21.5), Year.21 = c(1794.869238, 1795.097941,
1795.326644, 1795.555348, 1795.784051, 1796.012754), Counts.21 = c(5.72e-07,
7.74e-07, 1.03e-06, 1.37e-06, 1.81e-06, 2.36e-06), Depth.22 = c(22.5,
22.5, 22.5, 22.5, 22.5, 22.5), Year.22 = c(1776.381101, 1776.645157,
1776.909213, 1777.173268, 1777.437324, 1777.70138), Counts.22 = c(4.57e-07,
6.08e-07, 7.98e-07, 1.04e-06, 1.34e-06, 1.72e-06), Depth.23 = c(23.5,
23.5, 23.5, 23.5, 23.5, 23.5), Year.23 = c(1766.787508, 1767.066229,
1767.344949, 1767.62367, 1767.90239, 1768.181111), Counts.23 = c(4.28e-07,
5.65e-07, 7.39e-07, 9.57e-07, 1.23e-06, 1.57e-06), Depth.24 = c(24.5,
24.5, 24.5, 24.5, 24.5, 24.5), Year.24 = c(1724.904818, 1725.249971,
1725.595125, 1725.940278, 1726.285432, 1726.630586), Counts.24 = c(3.85e-07,
5.28e-07, 7.13e-07, 9.51e-07, 1.25e-06, 1.63e-06), Depth.25 = c(25.5,
25.5, 25.5, 25.5, 25.5, 25.5), Year.25 = c(1666.304304, 1666.747587,
1667.19087, 1667.634153, 1668.077436, 1668.520719), Counts.25 = c(3.14e-07,
4.35e-07, 5.93e-07, 7.99e-07, 1.07e-06, 1.42e-06), Depth.26 = c(26.5,
26.5, 26.5, 26.5, 26.5, 26.5), Year.26 = c(1646.315863, 1646.752634,
1647.189405, 1647.626176, 1648.062946, 1648.499717), Counts.26 = c(2.97e-07,
4.01e-07, 5.43e-07, 7.26e-07, 9.58e-07, 1.25e-06), Depth.27 = c(27.5,
27.5, 27.5, 27.5, 27.5, 27.5), Year.27 = c(1631.425358, 1631.862129,
1632.298899, 1632.73567, 1633.172441, 1633.609212), Counts.27 = c(3.01e-07,
4.02e-07, 5.39e-07, 7.13e-07, 9.33e-07, 1.21e-06), Depth.28 = c(28.5,
28.5, 28.5, 28.5, 28.5, 28.5), Year.28 = c(1623.821174, 1624.214018,
1624.606862, 1624.999706, 1625.39255, 1625.785393), Counts.28 = c(3.08e-07,
4.09e-07, 5.38e-07, 7.01e-07, 9.11e-07, 1.17e-06), Depth.29 = c(29.5,
29.5, 29.5, 29.5, 29.5, 29.5), Year.29 = c(1612.475829, 1612.864893,
1613.253957, 1613.643021, 1614.032085, 1614.421149), Counts.29 = c(3.4e-07,
4.66e-07, 6.3e-07, 8.4e-07, 1.11e-06, 1.44e-06), Depth.30 = c(30.5,
30.5, 30.5, 30.5, 30.5, 30.5), Year.30 = c(1600.26273, 1600.609876,
1600.957023, 1601.304169, 1601.651316, 1601.998462), Counts.30 = c(4.18e-07,
5.85e-07, 8.07e-07, 1.1e-06, 1.49e-06, 1.99e-06), Depth.31 = c(31.5,
31.5, 31.5, 31.5, 31.5, 31.5), Year.31 = c(1549.137398, 1549.553381,
1549.969364, 1550.385346, 1550.801329, 1551.217311), Counts.31 = c(3.27e-07,
4.48e-07, 6.06e-07, 8.1e-07, 1.07e-06, 1.41e-06), Depth.32 = c(32.5,
32.5, 32.5, 32.5, 32.5, 32.5), Year.32 = c(1379.9456, 1380.656236,
1381.366871, 1382.077507, 1382.788142, 1383.498778), Counts.32 = c(3.71e-07,
5.02e-07, 6.71e-07, 8.88e-07, 1.16e-06, 1.5e-06), Depth.33 = c(33.5,
33.5, 33.5, 33.5, 33.5, 33.5), Year.33 = c(1176.400716, 1177.495517,
1178.590318, 1179.685119, 1180.77992, 1181.874721), Counts.33 = c(1.21e-07,
1.66e-07, 2.24e-07, 2.99e-07, 3.94e-07, 5.13e-07), Depth.34 = c(34.5,
34.5, 34.5, 34.5, 34.5, 34.5), Year.34 = c(984.8733315, 986.2808571,
987.6883826, 989.0959082, 990.5034338, 991.9109593), Counts.34 = c(9.87e-08,
1.37e-07, 1.86e-07, 2.51e-07, 3.34e-07, 4.43e-07), Depth.35 = c(35.5,
35.5, 35.5, 35.5, 35.5, 35.5), Year.35 = c(931.6673674, 933.0776679,
934.4879684, 935.8982688, 937.3085693, 938.7188698), Counts.35 = c(8.67e-08,
1.15e-07, 1.53e-07, 2.01e-07, 2.61e-07, 3.34e-07), Depth.36 = c(36.5,
36.5, 36.5, 36.5, 36.5, 36.5), Year.36 = c(894.7139257, 896.1463371,
897.5787485, 899.01116, 900.4435714, 901.8759828), Counts.36 = c(7.95e-08,
1.04e-07, 1.34e-07, 1.71e-07, 2.19e-07, 2.78e-07), Depth.37 = c(37.5,
37.5, 37.5, 37.5, 37.5, 37.5), Year.37 = c(867.2347826, 868.6591119,
870.0834411, 871.5077704, 872.9320996, 874.3564289), Counts.37 = c(7.45e-08,
9.58e-08, 1.22e-07, 1.54e-07, 1.95e-07, 2.44e-07), Depth.38 = c(38.5,
38.5, 38.5, 38.5, 38.5, 38.5), Year.38 = c(822.8193907, 824.2840456,
825.7487006, 827.2133555, 828.6780105, 830.1426654), Counts.38 = c(7.25e-08,
9.32e-08, 1.19e-07, 1.5e-07, 1.89e-07, 2.37e-07), Depth.39 = c(39.5,
39.5, 39.5, 39.5, 39.5, 39.5), Year.39 = c(780.7261404, 782.1666312,
783.6071219, 785.0476127, 786.4881034, 787.9285942), Counts.39 = c(7.24e-08,
9.24e-08, 1.17e-07, 1.48e-07, 1.86e-07, 2.31e-07), Depth.40 = c(40.5,
40.5, 40.5, 40.5, 40.5, 40.5), Year.40 = c(743.4256597, 744.8411919,
746.2567241, 747.6722563, 749.0877885, 750.5033208), Counts.40 = c(1.09e-07,
1.41e-07, 1.8e-07, 2.29e-07, 2.89e-07, 3.64e-07), Depth.41 = c(41.5,
41.5, 41.5, 41.5, 41.5, 41.5), Year.41 = c(673.4489487, 674.8279538,
676.2069588, 677.5859639, 678.964969, 680.3439741), Counts.41 = c(1.06e-07,
1.36e-07, 1.73e-07, 2.19e-07, 2.77e-07, 3.48e-07), Depth.42 = c(42.5,
42.5, 42.5, 42.5, 42.5, 42.5), Year.42 = c(624.182451, 625.532222,
626.881993, 628.231764, 629.581535, 630.931306), Counts.42 = c(8e-08,
1.03e-07, 1.32e-07, 1.67e-07, 2.1e-07, 2.63e-07), Depth.43 = c(43.5,
43.5, 43.5, 43.5, 43.5, 43.5), Year.43 = c(566.5185196, 567.8721804,
569.2258412, 570.579502, 571.9331628, 573.2868236), Counts.43 = c(8.43e-08,
1.1e-07, 1.42e-07, 1.83e-07, 2.34e-07, 2.97e-07), Depth.44 = c(44.5,
44.5, 44.5, 44.5, 44.5, 44.5), Year.44 = c(518.6933347, 520.014935,
521.3365354, 522.6581358, 523.9797362, 525.3013366), Counts.44 = c(9.23e-08,
1.24e-07, 1.65e-07, 2.16e-07, 2.8e-07, 3.6e-07), Depth.45 = c(45.5,
45.5, 45.5, 45.5, 45.5, 45.5), Year.45 = c(443.0346844, 444.2413453,
445.4480063, 446.6546672, 447.8613282, 449.0679891), Counts.45 = c(1.1e-07,
1.51e-07, 2.04e-07, 2.72e-07, 3.59e-07, 4.67e-07), Depth.46 = c(46.5,
46.5, 46.5, 46.5, 46.5, 46.5), Year.46 = c(368.5762277, 369.8150017,
371.0537756, 372.2925496, 373.5313236, 374.7700976), Counts.46 = c(1.54e-07,
2.24e-07, 3.19e-07, 4.49e-07, 6.21e-07, 8.47e-07)), class = c("data.table",
"data.frame"), row.names = c(NA, -6L))

You can try the base R code (don't need data.table) below, using stack + unstack
# Stack every column into (values, ind) pairs, strip the ".N" suffix from the
# column label, then unstack so each base name becomes one long column.
stacked <- stack(df)
stacked$ind <- sub("\\..*", "", stacked$ind)
res <- unstack(stacked)
which gives
> head(res, 20)
Counts Depth Year
1 2.87e-06 0.5 2001.539
2 3.56e-06 0.5 2001.574
3 4.38e-06 0.5 2001.609
4 5.36e-06 0.5 2001.644
5 6.52e-06 0.5 2001.679
6 7.94e-06 0.5 2001.714
7 1.46e-06 1.5 1984.294
8 1.81e-06 1.5 1984.364
9 2.22e-06 1.5 1984.435
10 2.73e-06 1.5 1984.506
11 3.34e-06 1.5 1984.577
12 4.07e-06 1.5 1984.648
13 1.59e-06 2.5 1981.471
14 1.98e-06 2.5 1981.544
15 2.45e-06 2.5 1981.618
16 3.01e-06 2.5 1981.691
17 3.70e-06 2.5 1981.765
18 4.53e-06 2.5 1981.839
19 1.52e-06 3.5 1977.872
20 1.90e-06 3.5 1977.945

This could be done by:
# Make sure df is a data.table, then melt each column family (matched by
# regex) into its own value column.
setDT(df)
melt(df, measure.vars = patterns("Counts", "Depth", "Year"))
If you want to be more sophisticated, just do:
# The same vector supplies the regex for each column family and the names of
# the resulting value columns.
nms <- c("Count", "Depth", "Year")
melt(
  df,
  measure.vars = patterns(nms),
  value.name = nms
)
variable Count Depth Year
1: 1 2.87e-06 0.5 2001.539
2: 1 3.56e-06 0.5 2001.574
3: 1 4.38e-06 0.5 2001.609
4: 1 5.36e-06 0.5 2001.644
5: 1 6.52e-06 0.5 2001.679
---
206: 35 NA 34.5 NA
207: 35 NA 34.0 NA
208: 35 NA 34.5 NA
209: 35 NA 34.5 NA
210: 35 NA 34.0 NA

Using pivot_longer from tidyr :
# ".value" makes the captured part of each column name the output column;
# "(\\w+)" captures the name up to the first ".", so Depth, Depth.1,
# Depth.2, ... all end up in a single Depth column.
tidyr::pivot_longer(
  df,
  cols = everything(),
  names_to = ".value",
  names_pattern = "(\\w+)"
)
# A tibble: 210 x 3
# Depth Year Counts
# <dbl> <dbl> <dbl>
# 1 0.5 2002. 0.00000287
# 2 1.5 1984. 0.00000146
# 3 2.5 1981. 0.00000159
# 4 3.5 1978. 0.00000152
# 5 4.5 1974. 0.00000268
# 6 5.5 1969. 0.00000117
# 7 6.5 1962. 0.0000015
# 8 7.5 1959. 0.00000112
# 9 8.5 1952. 0.00000103
#10 9.5 1945. 0.000000986
# … with 200 more rows

Related

How to connect points according to grouping instead of connecting all points in ggplot?

As the title stated, I want to connect points in every group instead of all points.
Here is the original data:
df<-structure(list(TN = c(13.6, 18, 18.5, 17, 16.9, 13.6, 17.6, 14.8,
14, 11, 12.6, 18.6, 18.8, 18.3, 19.4, 18.5, 18.9, 22, 22.3),
TX = c(29.9, 26.9, 30.5, 26.6, 25.4, 29.7, 24.1, 21.1, 23.8,
29.3, 34.4, 31.1, 32, 35.9, 36.7, 37.5, 39.2, 34.8, 33.6),
TM = c(22.5, 21.4, 23.3, 21.4, 20.2, 21.4, 19.9, 17.8, 18.9,
20.9, 24.5, 24.5, 25.1, 27.3, 28.2, 28.5, 29.2, 28.2, 26.8
), Date = c("01/06/2022", "02/06/2022", "03/06/2022", "04/06/2022",
"05/06/2022", "06/06/2022", "07/06/2022", "08/06/2022", "09/06/2022",
"10/06/2022", "11/06/2022", "12/06/2022", "13/06/2022", "14/06/2022",
"15/06/2022", "16/06/2022", "17/06/2022", "18/06/2022", "19/06/2022"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-19L))
Here is my code:
library(ggplot2)
library(tidyr)
library(dplyr)
df %>% select(Date, TN, TX) %>%
pivot_longer(cols = c(TN,TX), names_to = "Tcombine", values_to = "Value") %>%
ggplot(aes(Date, Value,group = 1,shape=Tcombine,color=Tcombine)) +
geom_point()+
geom_line()+
theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x=element_blank())
I want the points of the two groups (two colors) to be connected separately as the date changes, but I don't know why all the points are connected?
Here is final graph I got:
Any suggestions are welcome! Thank you in advance!
Add group = Tcombine to the aesthetics of geom_line(), so that each series is connected separately.
# Reshape to long form first, then plot. The line layer sets its own group,
# which takes precedence over the plot-wide group = 1, so TN and TX are each
# connected separately.
temps_long <- df %>%
  select(Date, TN, TX) %>%
  pivot_longer(cols = c(TN, TX), names_to = "Tcombine", values_to = "Value")

ggplot(
  temps_long,
  aes(Date, Value, group = 1, shape = Tcombine, color = Tcombine)
) +
  geom_point() +
  geom_line(aes(group = Tcombine)) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  )
Incidentally, while the ordering of your x-axis works here, the moment you get another month it will break. I suggest you convert your Date column to a proper Date-class and add scale_x_date.
# Parse the day/month/year strings into Date values before reshaping, so the
# x axis is a real date scale rather than alphabetically ordered text.
temps_long <- df %>%
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  select(Date, TN, TX) %>%
  pivot_longer(cols = c(TN, TX), names_to = "Tcombine", values_to = "Value")

ggplot(
  temps_long,
  aes(Date, Value, group = 1, shape = Tcombine, color = Tcombine)
) +
  geom_point() +
  # The layer-level group overrides group = 1: one line per Tcombine series.
  geom_line(aes(group = Tcombine)) +
  scale_x_date(date_breaks = "1 day") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  )
While this looks very similar, you have much better control over breaks (e.g., date_breaks = "3 days") and formatting (e.g., date_labels ="%d/%m/%Y" if you really want that formatting of the dates).

How to get rid of annotations on faceted graph?

Problem
I am trying to label the left facet side of my graph while leaving out the annotations on the right side.
Data
Here are my libraries and data:
#### Libraries ####
library(tidyverse)
library(ggpubr)
library(plotly)
#### Dput ####
emlit <- structure(list(X = 1:20, Ethnicity = c("Asian (other than Chinese)",
"Filipino", "Indonesian", "Thai", "Japanese", "Korean", "South Asian",
"Indian", "Nepalese", "Pakistani", "Other South Asian", "Other Asian",
"White", "Mixed", "With Chinese parent", "Other mixed", "Others",
"All ethnic minorities", "All ethnic minorities, excluding\n foreign domestic helpers",
"Whole population"), Age_5.14 = c(65.8, 72.2, 69.4, 83.1, 26.6,
52.4, 67.4, 60.4, 69.5, 71.5, 92.5, 92, 34.8, 76.6, 84.2, 45.3,
51.3, 64.3, 64.3, 94.8), Age_15.24 = c(28.1, 29.2, 4.4, 72.9,
34.8, 50.3, 38.7, 41.4, 22.2, 54.3, 41.9, 64.7, 24.4, 82.9, 90.7,
37.4, 53.2, 40.6, 52.9, 96.9), Age_25.34 = c(4.5, 1.8, 4.6, 20,
17.2, 26.8, 6.6, 4.2, 6.4, 11.9, 12, 33.9, 15, 60.5, 82, 6.7,
11.2, 7.8, 21.8, 84.9), Age_35.44 = c(6.3, 2, 6.1, 35.7, 36.5,
25.5, 9.4, 6.2, 10.5, 10.1, 22.4, 35.7, 8.6, 63, 83.2, 4.5, 12.2,
9.5, 23.4, 84.6), Age_45.54 = c(8.1, 2.3, 8, 23.2, 43.4, 59.6,
7.5, 6.3, 3.9, 13.5, 28.3, 47.5, 13.1, 72.1, 84, 4.4, 22.4, 14.2,
27.7, 92.5), Age_55.64 = c(15.9, 4.4, 44, 27, 41.7, 52.8, 11.8,
7.4, 9.5, 2, 54.2, 39.6, 12.7, 75.3, 80.1, 2.6, 20.6, 25, 32.4,
94.8), Age_65. = c(31.1, 11.9, 82.6, 39, 46.4, 57, 9.5, 3.9,
NA, 11.4, 66.5, 74.5, 14.5, 80.5, 81, 57.5, 13.6, 42.7, 44, 82.3
), Age_Overall = c(10.1, 3.5, 6.4, 31.4, 35.1, 39.8, 20.4, 15.3,
16.4, 33.8, 30.4, 46.3, 15.4, 72.7, 83.9, 19.4, 19.8, 16.9, 35.2,
89.4)), class = "data.frame", row.names = c(NA, -20L))
I have also pivoted the data for my graph:
#### Pivot Data ####
emlitpivot <- emlit %>%
pivot_longer(cols = contains("Age"),
names_to = "Age_Range",
values_to = "Percent")
Plot
Here is my plot so far, a faceted graph that breaks down literacy by age with some notes on some important points on the left:
#### EM vs all ####
# Order
order <- c("5-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65+", "Overall",
"5-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65+", "Overall")
# Plot
plot <- emlitpivot %>%
filter(Ethnicity %in% c("All ethnic minorities",
"Whole population")) %>%
ggbarplot(x="Age_Range",
y="Percent",
fill = "Ethnicity",
label = T,
palette = "jco",
facet.by = "Ethnicity",
title = "EM x Native Chinese Literacy by Age",
xlab = "Age Range",
ylab = "Literacy in Chinese (By Percent)",
caption = "*Data obtained from Census and Statistics Department Hong Kong SAR, 2016.")+
theme_cleveland()+
theme(axis.text.x = element_text(angle = 45,
hjust = .5,
vjust = .5),
legend.position = "none",
plot.caption = element_text(face = "italic"))+
scale_x_discrete(labels=order)+
geom_segment(aes(x = 3, y = 15, xend = 3, yend = 48))+
geom_segment(aes(x = 1, y = 71, xend = 1, yend = 80))+
geom_segment(aes(x = 7, y = 50, xend = 7, yend = 65))+
annotate("text",
x=4,
y=53,
label = "Post-college workers can't read.")+
annotate("text",
x=3.5,
y=85,
label = "School age supports seem to boost initial literacy.")+
annotate("text",
x=6,
y=70,
label = "Increase due to generational literacy?")
# Print plot:
plot
However, you can probably guess what the problem is:
How do I get rid of the annotations on the right? I'm not sure if there is a simple way of getting rid of them, but it would be helpful to only have text on the left side.
In this case, I'll use geom_text instead of annotate, since it allows you to have subset of your data.
library(tidyverse)
library(ggpubr)
# Rows of the facet that should carry the annotations. Layers given only
# these rows are drawn in the "All ethnic minorities" panel alone.
em_only <- subset(emlitpivot, Ethnicity == "All ethnic minorities")

emlitpivot %>%
  filter(Ethnicity %in% c("All ethnic minorities", "Whole population")) %>%
  ggbarplot(
    x = "Age_Range",
    y = "Percent",
    fill = "Ethnicity",
    label = TRUE,
    palette = "jco",
    facet.by = "Ethnicity",
    title = "EM x Native Chinese Literacy by Age",
    xlab = "Age Range",
    ylab = "Literacy in Chinese (By Percent)",
    caption = "*Data obtained from Census and Statistics Department Hong Kong SAR, 2016."
  ) +
  theme_cleveland() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = .5, vjust = .5),
    legend.position = "none",
    plot.caption = element_text(face = "italic")
  ) +
  scale_x_discrete(labels = order) +
  # Vertical marker lines, restricted to the annotated facet.
  geom_segment(data = em_only, aes(x = 3, y = 15, xend = 3, yend = 48)) +
  geom_segment(data = em_only, aes(x = 1, y = 71, xend = 1, yend = 80)) +
  geom_segment(data = em_only, aes(x = 7, y = 50, xend = 7, yend = 65)) +
  # em_only has one row per age range, so each label is generated several
  # times at the same spot; check_overlap keeps only one copy visible.
  geom_text(
    data = em_only,
    aes(4, 53),
    label = "Post-college workers can't read.",
    check_overlap = TRUE
  ) +
  geom_text(
    data = em_only,
    aes(3.5, 85),
    label = "School age supports seem to boost initial literacy.",
    check_overlap = TRUE
  ) +
  geom_text(
    data = em_only,
    aes(6, 70),
    label = "Increase due to generational literacy?",
    check_overlap = TRUE
  )
Update remove lines in second facet:
Create a dataframe with your text labels and position and add it to the plot,
to remove the lines do the same procedure:
df for text = ann_text
df for lines = segm
# One row per label; the Ethnicity column ties every row to a single facet.
ann_text <- data.frame(
  x = c(4, 3.5, 6),
  y = c(53, 85, 70),
  lab = c(
    "Post-college workers can't read.",
    "School age supports seem to boost initial literacy.",
    "Increase due to generational literacy?"
  ),
  Ethnicity = "All ethnic minorities"
)

# One row per marker line, tied to the same facet.
segm <- data.frame(
  x = c(3, 1, 7),
  y = c(15, 71, 50),
  xend = c(3, 1, 7),
  yend = c(48, 80, 65),
  Ethnicity = "All ethnic minorities"
)

# Add the labels, then the lines, each from its own data frame.
plot1 <- plot +
  geom_text(data = ann_text, mapping = aes(x = x, y = y, label = lab))

plot1 +
  geom_segment(
    data = segm,
    mapping = aes(x = x, y = y, xend = xend, yend = yend)
  )
remove the following from your code:
annotate("text",
x=4,
y=53,
label = "Post-college workers can't read.")+
annotate("text",
x=3.5,
y=85,
label = "School age supports seem to boost initial literacy.")+
annotate("text",
x=6,
y=70,
label = "Increase due to generational literacy?")

geom_col renders an invisible plot

While trying to generate an error plot, I found the following undesirable behavior:
# sample data (please excuse the length, you'll see it's important!)
a <- structure(list(valor = c(22.35, 23.9, 32, 36.2, 35.2, 24.3, 42,
36.4, 16.65, 40.95, 21, 33.2, 32, 33, 28.9, 28, 40.9, 28.4, 32.5,
24.9, 28.35, 36.4, 31.05, 28.4, 37.9, 35.9, 24, 27.6, 28.6, 37.4,
31.6, 31.9, 28.6, 33.9, 31.2, 27, 25.6, 31.2, 32.5, 26.4, 40,
32.9, 32.9, 31.5, 24.9, 21.9, 33.4, 31.8, 31.1, 29.6, 31.5, 29.8,
32.9, 26.6, 24.6, 35.9), error = c(-18.7573531872138, 1.31066637545209,
NA, 0.277829536700779, -2.64925385673394, -11.8996081065239,
-2.60692704590275, -1.33715023334453, NA, -7.61175343400322,
2.55982080155896, 4.4863429357563, 4.16085789426681, -3.90087313834282,
-1.8721045665811, 0.369086865146173, 12.2927002385953, -0.848796857979458,
4.13045179906004, 4.28348313246773, 3.05347592474616, -5.33715023334453,
-1.68395695575215, 5.15120314202054, -3.45360182568537, 0.700053120316895,
4.50817359293553, 1.58628172614129, 7.54200618644399, 7.58601073994592,
-6.61548902751109, -1.03317248113754, 3.54200618644399, 1.84047336001635,
3.60755820405548, 1.41075911687027, 0.661540377840424, 6.60755820405548,
-15.86954820094, 4.2336254711588, -15.4283737200925, 1.90546464068269,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-56L), class = "data.frame")
This plot works:
p32 <- ggplot(a[1:32,], aes(x = valor, y = error))+geom_col(position = "jitter")
p32
This doesn't work:
p33 <- ggplot(a[1:33,], aes(x = valor, y = error))+geom_col(position = "jitter")
p33
My research
I understand now that this is caused by a zero-width of the bars (see this closed issue), as can be seen here:
# Notice xmin == xmax:
head(ggplot_build(p33)$data[[1]], 3)
x y PANEL group ymin ymax xmin xmax colour
1 22.35 -18.752212 1 -1 -1.875920e+01 0.002793391 22.35 22.35 NA
2 23.90 1.315615 1 -1 -3.452852e-05 1.304455085 23.90 23.90 NA
3 32.00 NA 1 -1 NA NA 32.00 32.00 NA
fill size linetype alpha
1 grey35 0.5 1 NA
2 grey35 0.5 1 NA
3 grey35 0.5 1 NA
What I tried:
I know I can set the width manually:
ggplot(a[1:33,], aes(x = valor, y = error))+geom_col(position = "jitter", width = 0.1)
but the problem is I'm using the ggplot call within a function that takes the data.frame (a in my example) as argument. Obviously, it can have any length and the data may be essentially different, so fixing a width manually creates some plots with very thin bars and other with very thick ones:
ggplot(a, aes(x = valor, y = error))+geom_col(position = "jitter", width = .1)
ggplot(b, aes(x = valor, y = error))+geom_col(position = "jitter", width = .1)
# with b=
b <- structure(list(valor = c(1.03, 0.43, 1.25, 1.2, 0.74, 2.33, 1.49,
1.5, 0.3, 0.96, 0.81, 1.13, 0.83, 0.68, 2.22, 0.68, 0.9, 1.03,
0.39, 0.84, 1.4, 0.85, 0.7, 1.55, 1.08, 0.37, 0.66, 0.67, 1.36,
0.97, 1.03, 0.64, 1, 0.78, 0.62, 0.5, 0.94, 0.56, 2.09, 1.01),
error = c(2.23998224289866, 0.224579421022632, -0.637159523178084,
-2.74850423807165, -2.69675570480791, 4.59843342442166, 2.34260767883423,
-12.4611961378406, 1.02484359455743, 2.46750883802447, 0.376157081501411,
-1.354215218894, 0.947671489701406, 0.426378012316505, 19.9389705823977,
-1.5736219876835, -22.1173385165668, 5.23998224289866, -0.540189922794198,
7.23019854807831, -3.46146029781903, -2.48937236945532, 5.06312219297025,
-1.49229963183367, -3.53967661036512, 0.534698732147042,
1.77779803536164, 6.10360322576836, 6.71339758402689, -5.27443362843563,
1.23998224289866, 1.11679330753741, -0.510113509024535, 0.502074779997471,
1.44272604499763, -0.91952618750328, -17.0537006712522, 3.33491106257746,
-8.09000221353266, 1.7414648468139)), row.names = c(NA, -40L
), class = "data.frame")
Per this post I tried adding +scale_x_continuous(oob = scales::rescale_none) AND / OR +scale_y_continuous(oob = scales::rescale_none), but none of them worked (neither to thicken the thin bars nor to thin the thick ones).
How can I address this and have a call that can handle a and b and produce an output that looks similar? (regarding the appearance of the bars)
A barplot conceptually can't work on a continuous x scale. However, you can use a different geom:
+ geom_segment(aes(xend = valor, yend = 0))

Data labels for mean and percentiles in a distribution chart

I'm creating a custom chart to visualize a variable's distribution using geom_density. I added 3 vertical lines for a custom value, the 5th percentile and the 95th percentile.
How do I add labels for those lines?
I tried using geom_text, but I don't know how to set the x and y values.
library(ggplot2)
ggplot(dataset, aes(x = dataset$`Estimated percent body fat`)) +
geom_density() +
geom_vline(aes(xintercept = dataset$`Estimated percent body fat`[12]),
color = "red", size = 1) +
geom_vline(aes(xintercept = quantile(dataset$`Estimated percent body fat`,
0.05, na.rm = TRUE)),
color = "grey", size = 0.5) +
geom_vline(aes(xintercept = quantile(dataset$`Estimated percent body fat`,
0.95, na.rm = TRUE)),
color="grey", size=0.5) +
geom_text(aes(x = dataset$`Estimated percent body fat`[12],
label = "Custom", y = 0),
colour = "red", angle = 0)
I'd like to obtain the following:
for the custom value, I'd like to add the label at the top of the chart, just to the right of the line
for the percentiles label, I'd like to add them in the middle of the chart; at the left of the line for the 5th percentile and right of the line for 95th percentile
Here is what I was able to obtain https://i.imgur.com/thSQwyg.png
And these are the first 50 lines of my dataset:
structure(list(`Respondent sequence number` = c(21029L, 21034L,
21043L, 21056L, 21067L, 21085L, 21087L, 21105L, 21107L, 21109L,
21110L, 21125L, 21129L, 21138L, 21141L, 21154L, 21193L, 21195L,
21206L, 21215L, 21219L, 21221L, 21232L, 21239L, 21242L, 21247L,
21256L, 21258L, 21287L, 21310L, 21325L, 21367L, 21380L, 21385L,
21413L, 21418L, 21420L, 21423L, 21427L, 21432L, 21437L, 21441L,
21444L, 21453L, 21466L, 21467L, 21477L, 21491L, 21494L, 21495L
), `Estimated percent body fat` = c(NA, 7.2, NA, NA, 24.1, 25.1,
30.2, 23.6, 24.3, 31.4, NA, 14.1, 20.5, NA, 23.1, 30.6, 21, 20.9,
NA, 24, 26.7, 16.6, NA, 26.9, 16.9, 21.3, 15.9, 27.4, 13.9, NA,
20, NA, 12.8, NA, 33.8, 18.1, NA, NA, 28.4, 10.9, 38.1, 33, 39.3,
15.9, 32.7, NA, 20.4, 16.8, NA, 29)), row.names = c(NA, 50L), class =
"data.frame")
First I recommend clean column names.
# Copy the data with cleaned column names: each whitespace character becomes
# "." and the result is lower-cased, e.g. `Estimated percent body fat`
# becomes estimated.percent.body.fat.
dat <- setNames(dataset, tolower(gsub("\\s", "\\.", names(dataset))))
With base R plots you could do the following. The trick is that you can store the quantiles and the custom position in variables and reuse them as coordinates later, which gives you dynamic positioning. I'm not sure if/how this is possible with ggplot.
# Reference values are stored first so the lines and their labels are
# positioned from the same numbers.
bf <- dat$estimated.percent.body.fat
c1 <- bf[12]
q1 <- quantile(bf, .05, na.rm=TRUE)
q2 <- quantile(bf, .95, na.rm=TRUE)

plot(density(bf, na.rm=TRUE), ylim=c(0, .05), main="Density curve")

# Custom value in red, the two percentiles in grey.
abline(v=c(c1, q1, q2), col=c("red", "grey", "grey"))

# Labels are offset horizontally from their line; the plotmath arrows point
# back at it.
text(c1 + 4, .05, expression("" %<-% "custom"), cex=.8)
text(q1 - 5.5, .025, expression("5% percentile" %->% ""), cex=.8)
text(q2 + 5.5, .025, expression("" %<-% "95% percentile"), cex=.8)
Note: If you don't like the arrows, just pass a plain string such as "5% percentile" instead of the expression("5% percentile" %->% "") label.
Or in ggplot you could use annotate.
library(ggplot2)
# Compute the reference values once so the lines and their labels share the
# same coordinates and follow the data instead of hard-coded x positions.
body_fat <- dataset$`Estimated percent body fat`
custom <- body_fat[12]
pct <- quantile(body_fat, c(0.05, 0.95), na.rm = TRUE)

# The column is mapped by name inside aes() (no `dataset$`); constants are
# passed as plain layer parameters rather than through aes().
ggplot(dataset, aes(x = `Estimated percent body fat`)) +
  geom_density() +
  geom_vline(xintercept = custom, color = "red", size = 1) +
  geom_vline(xintercept = pct, color = "grey", size = 0.5) +
  # hjust below 0 places a label to the right of its x position, hjust above
  # 1 to the left; y values are still chosen by eye for this density range.
  annotate("text", x = custom, y = .05, label = "custom", hjust = -0.1) +
  annotate(
    "text",
    x = pct[[1]], y = .025, label = "5% percentile", hjust = 1.1
  ) +
  annotate(
    "text",
    x = pct[[2]], y = .025, label = "95% percentile", hjust = -0.1
  )
Note, that in either solution the result (i.e. exact label positions) depends on your export size. To learn how to control this, take e.g. a look into How to save a plot as image on the disk?.
Data
dataset <- structure(list(`Respondent sequence number` = c(21029L, 21034L,
21043L, 21056L, 21067L, 21085L, 21087L, 21105L, 21107L, 21109L,
21110L, 21125L, 21129L, 21138L, 21141L, 21154L, 21193L, 21195L,
21206L, 21215L, 21219L, 21221L, 21232L, 21239L, 21242L, 21247L,
21256L, 21258L, 21287L, 21310L, 21325L, 21367L, 21380L, 21385L,
21413L, 21418L, 21420L, 21423L, 21427L, 21432L, 21437L, 21441L,
21444L, 21453L, 21466L, 21467L, 21477L, 21491L, 21494L, 21495L
), `Estimated percent body fat` = c(NA, 7.2, NA, NA, 24.1, 25.1,
30.2, 23.6, 24.3, 31.4, NA, 14.1, 20.5, NA, 23.1, 30.6, 21, 20.9,
NA, 24, 26.7, 16.6, NA, 26.9, 16.9, 21.3, 15.9, 27.4, 13.9, NA,
20, NA, 12.8, NA, 33.8, 18.1, NA, NA, 28.4, 10.9, 38.1, 33, 39.3,
15.9, 32.7, NA, 20.4, 16.8, NA, 29)), row.names = c(NA, 50L), class =
"data.frame")

2d density plot for categories

I'm trying to make a 2d density plot where the density is displayed for each category. For example, in the image below, we have a density plot for each day, and all the daily densities are combined into the coloured plots. These types of plots are common in the scientific literature on atmospheric sciences and aerosol pollution studies.
So far I've got this
ggplot(dat, aes(y = `dN/dlogDp`, x = date)) +
stat_density2d(geom="tile", aes(fill = ..density..), contour = FALSE) +
scale_fill_gradient(low="blue", high="red") +
geom_point(alpha = 0.1) +
theme_bw()
But I want to facet it by day, and I'm not sure where to start.
Here are the example data:
structure(list(date = structure(c(1359244800, 1359245400, 1359246000,
1359246600, 1359247200, 1359247800, 1359248400, 1359249000, 1359249600,
1359250200, 1359250800, 1359251400, 1359252000, 1359252600, 1359253200,
1359253800, 1359254400, 1359255000, 1359255600, 1359256200, 1359256800,
1359257400, 1359258000, 1359258600, 1359259200, 1359259800, 1359260400,
1359261000, 1359261600, 1359262200, 1359262800, 1359263400, 1359264000,
1359264600, 1359265200, 1359265800, 1359266400, 1359267000, 1359267600,
1359268200, 1359268800, 1359269400, 1359270000, 1359270600, 1359271200,
1359271800, 1359272400, 1359273000, 1359273600, 1359274200, 1359274800,
1359275400, 1359276000, 1359276600, 1359277200, 1359277800, 1359278400,
1359279000, 1359279600, 1359280200, 1359280800, 1359281400, 1359282000,
1359282600, 1359283200, 1359283800, 1359284400, 1359285000, 1359285600,
1359286200, 1359286800, 1359287400, 1359288000, 1359288600, 1359289200,
1359289800, 1359290400, 1359291000, 1359291600, 1359292200, 1359292800,
1359293400, 1359294000, 1359294600, 1359295200, 1359295800, 1359296400,
1359297000, 1359297600, 1359298200, 1359298800, 1359299400, 1359300000,
1359300600, 1359301200, 1359301800, 1359302400, 1359303000, 1359303600,
1359304200), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`dN/dlogDp` = c(49.8, 49.275, 47.4, 47.975, 48.625, 51.725,
50.7, 47.55, 45.975, 45.35, 45.4, 47.75, 49.625, 48.225,
47.65, 47.3, 48.75, 50.075, 34.725, 42.025, 48.825, 52.25,
54.05, 49.15, 34.6, 34.375, 42.85, 30.325, 43.15, 36.875,
32.85, 36.85, 35.725, 39.8, 38.65, 40.1, 42.675, 38.5, 37.2,
34.425, 25.2, 14.725, 22.675, 14.875, 37.45, 46.025, 49.275,
35.425, 30, 38.9, 28.6, 41.675, 46.05, 48.6, 62.425, 62.65,
61.7, 49.5, 70.05, 71.875, 59.4, 38.525, 36.85, 25.625, 14.675,
14.7, 14.6, 14.725, 15.6, 15, 14.6, 14.75, 15.05, 14.975,
15.425, 15.1, 15.95, 14.95, 15, 14.6, 14.725, 14.85, 15.175,
28.95, 14.975, 14.725, 16.6, 18.925, 53.225, 60.2, 56.425,
54.55, 41.4, 19.025, 19.825, 31.875, 14.85, 16.375, 16.65,
34.325), Diameter = c(14.6, 15.1, 15.7, 16.3, 16.8, 17.5,
18.1, 18.8, 19.5, 20.2, 20.9, 21.7, 22.5, 23.3, 24.1, 25,
25.9, 26.9, 27.9, 28.9, 30, 31.1, 32.2, 33.4, 34.6, 35.9,
37.2, 38.5, 40, 41.4, 42.9, 44.5, 46.1, 47.8, 49.6, 51.4,
53.3, 55.2, 57.3, 59.4, 61.5, 63.8, 66.1, 68.5, 71, 73.7,
76.4, 79.1, 82, 85.1, 88.2, 91.4, 94.7, 98.2, 101.8, 105.5,
109.4, 113.4, 117.6, 121.9, 126.3, 131, 135.8, 140.7, 145.9,
151.2, 156.8, 162.5, 168.5, 174.7, 181.1, 187.7, 194.6, 201.7,
209.1, 216.7, 224.7, 232.9, 241.4, 250.3, 259.5, 269, 278.8,
289, 299.6, 310.6, 322, 333.8, 346, 358.7, 371.8, 385.4,
399.5, 414.2, 429.4, 445.1, 461.4, 478.3, 495.8, 514)), .Names = c("date",
"dN/dlogDp", "Diameter"), row.names = c(NA, 100L), class = c("tbl_df",
"tbl", "data.frame"))
UPDATE This question is misguided and I now think that using categories isn't relevant to recreating this plot. These other questions are more closely related to the task of recreating this plot:
geom_raster interpolation with log scale
Use R to recreate contour plot made in Igor
And after I asked this question I have been keeping an updated gist of R code that combines details from the answers to these questions, and successfully replicates these plots (example output included in the gist). That gist is here: https://gist.github.com/benmarwick/9a54cbd325149a8ff405
The key steps are to strip away much of the decoration in the panels, and use scale_*_continuous(expand = c(0,0)) to make the density plot fill the entire panel. Here's an example of how to put it together:
# Derive the day and the hour of day from the timestamp; the hour is used
# below as the facet variable.
dat$day <- as.Date(dat$date)
dat$hour <- as.numeric(format(dat$date, "%H"))
library(ggplot2)
library(viridis)

# Theme that strips titles, ticks, strips, borders and grid lines so the
# panels can be squeezed together.
squeeze_grid_theme <- theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    strip.text = element_blank(),
    strip.background = element_blank(),
    # panel.spacing.* replaces the deprecated panel.margin.* elements.
    panel.spacing.y = unit(0, "lines"),
    panel.spacing.x = unit(-1, "lines"),
    panel.border = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text(margin = margin(0, 0, 0, 0, "pt")),
    axis.text.y = element_text(margin = margin(0, 0, 0, 0, "pt"))
  )

# One density tile layer per hour facet; expand = c(0, 0) removes the scale
# padding so the tiles fill each panel.
p <- ggplot(dat, aes(z = Diameter, y = `dN/dlogDp`, x = date)) +
  # after_stat(density) replaces the deprecated ..density.. notation.
  stat_density2d(
    geom = "tile",
    aes(fill = after_stat(density)),
    contour = FALSE
  ) +
  scale_fill_viridis() +
  geom_point(alpha = 0.1) +
  facet_grid(~hour) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_datetime(expand = c(0, 0)) +
  squeeze_grid_theme
p
Then we get a separate density plot for each hour, tightly squeezed together like the example plot in the question.

Resources