# Example data: (A_w, E_w) form one series of x/y pairs, (A_e, E_e) the other.
data <- data.frame(
  A_w = c(0, 0.69, 1.41, 2.89, 6.42, 13.3, 25.5, 36.7, 44.3, 46.4),
  E_w = c(1.2, 1.2, 1.5, 1.6, 1.9, 2.3, 3.4, 4.4, 10.6, 16.5),
  A_e = c(0, 0.18, 0.37, 0.79, 1.93, 4.82, 11.4, 21.6, 31.1, 36.2),
  E_e = c(99.4, 99.3, 98.9, 98.4, 97.1, 93.3, 84.7, 71.5, 58.1, 48.7)
)
data
#> A_w E_w A_e E_e
#> 1 0.00 1.2 0.00 99.4
#> 2 0.69 1.2 0.18 99.3
#> 3 1.41 1.5 0.37 98.9
#> 4 2.89 1.6 0.79 98.4
#> 5 6.42 1.9 1.93 97.1
#> 6 13.30 2.3 4.82 93.3
#> 7 25.50 3.4 11.40 84.7
#> 8 36.70 4.4 21.60 71.5
#> 9 44.30 10.6 31.10 58.1
#> 10 46.40 16.5 36.20 48.7
Created on 2021-05-31 by the reprex package (v2.0.0)
I am trying to plot this data with all A values as X and Es as Y. How can I put either a) both of these columns plotted on a ggplot2, or b) rearrange this dataframe to combine the A columns and E columns into a final dataframe with only two columns with 2x as many rows as pictured?
Thanks for any help, I am a beginner (obviously)
Edit for Clarity: It's important that the A_e & E_e values remain as pairs, similar to how the A_w and E_w values remain as pairs. The end result plot should resemble the ORANGE and BLUE lines of this image, but I am trying to replicate this while learning R.
Currently I am capable of plotting each separately when dividing into two dataframes of 2x10
A_w E_w
1 0.00 1.2
2 0.69 1.2
3 1.41 1.5
4 2.89 1.6
5 6.42 1.9
6 13.30 2.3
7 25.50 3.4
8 36.70 4.4
9 44.30 10.6
10 46.40 16.5
and the second plot
# A tibble: 10 x 2
A_e E_e
<dbl> <dbl>
1 0 99.4
2 0.18 99.3
3 0.37 98.9
4 0.79 98.4
5 1.93 97.1
6 4.82 93.3
7 11.4 84.7
8 21.6 71.5
9 31.1 58.1
10 36.2 48.7
But my end goal is to have them both on the same plot, like in the Excel graph (orange + blue graph) above.
Here is a try
library(dplyr)
library(ggplot2)
# Segment tables: every row keeps its own point and gains the following row's
# point as the segment end (xend, yend). The last row has no successor, so
# lead() returns NA there and the row is filtered out.
line_1_data <- data %>%
  select(A_w, E_w) %>%
  mutate(
    xend = lead(A_w),
    yend = lead(E_w)
  ) %>%
  filter(!is.na(xend))

line_2_data <- data %>%
  select(A_e, E_e) %>%
  mutate(
    xend = lead(A_e),
    yend = lead(E_e)
  ) %>%
  filter(!is.na(xend))

# Several column pairs drawn with different geoms on one plot
ggplot(data = data) +
  # Blue series: points plus slightly curved connectors between neighbours
  geom_point(mapping = aes(x = A_w, y = E_w), color = "blue") +
  geom_curve(
    data = line_1_data,
    mapping = aes(x = A_w, y = E_w, xend = xend, yend = yend),
    color = "blue",
    curvature = 0.02
  ) +
  # Orange series: same construction, curved the other way
  geom_point(mapping = aes(x = A_e, y = E_e), color = "orange") +
  geom_curve(
    data = line_2_data,
    mapping = aes(x = A_e, y = E_e, xend = xend, yend = yend),
    color = "orange",
    curvature = -0.02
  ) +
  # Red connector joining the final point of each series
  geom_curve(
    data = tail(data, 1),
    mapping = aes(x = A_w, y = E_w, xend = A_e, yend = E_e),
    curvature = 0.1,
    color = "red"
  ) +
  # Black straight lines (curvature 0) linking each row's two points
  geom_curve(
    mapping = aes(x = A_w, y = E_w, xend = A_e, yend = E_e),
    curvature = 0,
    color = "black"
  )
Created on 2021-05-31 by the reprex package (v2.0.0)
You may try from this
# Same data as in the question, written out column by column.
# (A_w, E_w) and (A_e, E_e) are the two x/y series to plot.
data <- data.frame(
  A_w = c(0, 0.69, 1.41, 2.89, 6.42,
          13.3, 25.5, 36.7, 44.3, 46.4),
  E_w = c(1.2, 1.2, 1.5, 1.6, 1.9, 2.3, 3.4, 4.4, 10.6, 16.5),
  A_e = c(0, 0.18, 0.37, 0.79, 1.93,
          4.82, 11.4, 21.6, 31.1, 36.2),
  # 8th value is 71.5 in the question's data (was mistyped as 71.4)
  E_e = c(99.4, 99.3, 98.9, 98.4,
          97.1, 93.3, 84.7, 71.5, 58.1, 48.7)
)
library(tidyverse)
# Column names are split at "_": the prefix (A / E) is kept as a value column
# through the ".value" sentinel, the suffix (w / e) is stored in `type`.
# Each output row therefore still holds one matching (A, E) pair.
data %>%
  pivot_longer(
    cols = everything(),
    names_to = c(".value", "type"),
    names_sep = "_"
  ) %>%
  ggplot(mapping = aes(x = A, y = E, color = type)) +
  geom_point() +
  geom_line()
Created on 2021-05-31 by the reprex package (v2.0.0)
Doing it "by hand":
# Dummy data:
df <- data.frame(
  A_w = rnorm(10), E_w = rnorm(10),
  A_e = rnorm(10), E_e = rnorm(10)
)
# Stack the "w" pairs on top of the "e" pairs. A and E are concatenated in
# the same order so that every A value stays next to its own E value
# (the second half of E must come from E_e, not A_e).
df2 <- data.frame(
  A = c(df$A_w, df$A_e),
  E = c(df$E_w, df$E_e)
)
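Output (from a run in which the second half of E was taken from A_e, which is why A and E coincide in rows 11-20; pairing A_e with E_e gives distinct values there):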
> df2
A E
1 1.25522468 -0.2441768
2 -0.50585191 -0.1383637
3 0.42374270 -0.9664189
4 -0.39858532 -0.3442157
5 -1.05665363 -1.3574362
6 0.79191788 -0.8202841
7 -1.31349592 0.7280619
8 -0.05609851 0.6365495
9 1.01068811 2.0222241
10 -1.15572972 -0.2190794
11 0.15579931 0.1557993
12 1.58834329 1.5883433
13 1.24933622 1.2493362
14 -0.28197439 -0.2819744
15 0.30593184 0.3059318
16 0.75486103 0.7548610
17 1.19394302 1.1939430
18 -1.79955846 -1.7995585
19 0.59688655 0.5968865
20 0.71519048 0.7151905
And for the plot: ggplot(df2, aes(x=A, y=E)) + geom_point()
Output:
There are ways to do this without having to join the columns by listing their names (for example with the tidyr package), but I think this solution is easier to understand from a beginner's point of view.
Related
Based on the data below, how can I remove the rows with duplicate X and Y coordinates? In the example below, you will notice that one of the X coordinates, -1.52, appears twice, but it is not a duplicate since its corresponding Y coordinates are different.
I don't know if it matters, but please note that the original dataset has more than 2 decimal places for the X and Y values.
Sample data:
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), X = c(-1.01,
-1.11, -1.11, -2.13, -2.13, -1.52, -1.52, -1.98, -3.69, -4.79),
Y = c(2.11, 3.33, 3.33, 6.66, 6.66, 7.77, 8.88, 9.99, 1.11,
6.68)), class = "data.frame", row.names = c(NA, -10L))
Desired data:
id X Y
1 -1.01 2.11
2 -1.11 3.33
4 -2.13 6.66
6 -1.52 7.77
7 -1.52 8.88
8 -1.98 9.99
9 -3.69 1.11
10 -4.79 6.68
Use duplicated
# Keep the first row of each distinct combination of all columns except the
# first one (the id), i.e. the duplicate check ignores column 1.
df1[!duplicated(df1[-1]), , drop = FALSE]
-output
id X Y
1 1 -1.01 2.11
2 2 -1.11 3.33
4 4 -2.13 6.66
6 6 -1.52 7.77
7 7 -1.52 8.88
8 8 -1.98 9.99
9 9 -3.69 1.11
10 10 -4.79 6.68
Or with distinct
library(dplyr)
# One row per distinct (X, Y) pair; .keep_all = TRUE retains the other columns
distinct(df1, X, Y, .keep_all = TRUE)
I'm trying to split a data frame from long to wide format by converting selected rows to columns. Here is the current general long-format structure:
# Long-format example: one row per id x desig x indicator combination.
# The six m / sd values are repeated once per indicator.
data_long <- data.frame(
  id = rep(rep(c("kelp", "fish", "beach"), each = 2), times = 2),
  desig = rep(c("mpa", "reference"), times = 6),
  indicator = rep(c("density", "biomass"), each = 6),
  n = rep(1118, times = 12),
  m = rep(c(0.35, 4.28, 1.16, 106.35, 13.44, 0.63), times = 2),
  sd = rep(c(1.19, 8.48, 4.25, 118, 31.77, 2.79), times = 2)
)
data_long
I want to keep id and indicator, split by "desig",and move "n", "m", and "sd" into new columns. The final data frame structure I'm trying to obtain is:
# Target wide layout: one row per id x indicator, with the n / m / sd values
# of the "mpa" rows and of the "reference" rows of data_long side by side.
# The values below are the ones data_long actually holds for each group.
data_wide <- data.frame(
  id = c("kelp", "fish", "beach", "kelp", "fish", "beach"),
  indicator = c("density", "density", "density", "biomass", "biomass", "biomass"),
  mpa.n = c(1118, 1118, 1118, 1118, 1118, 1118),
  mpa.m = c(0.35, 1.16, 13.44, 0.35, 1.16, 13.44),
  mpa.sd = c(1.19, 4.25, 31.77, 1.19, 4.25, 31.77),
  reference.n = c(1118, 1118, 1118, 1118, 1118, 1118),
  reference.m = c(4.28, 106.35, 0.63, 4.28, 106.35, 0.63),
  reference.sd = c(8.48, 118, 2.79, 8.48, 118, 2.79)
)
data_wide
I can't seem to get this right using reshape2. Any suggestions?
We may use pivot_wider
library(tidyr)
library(dplyr)
# Spread n, m and sd into one column per `desig` level; names_glue builds the
# "<desig>.<value>" column names. select() then puts the mpa columns before
# the reference columns.
data_long %>%
  pivot_wider(
    names_from = desig,
    values_from = c(n, m, sd),
    names_glue = "{desig}.{.value}"
  ) %>%
  select(id, indicator, starts_with("mpa"), starts_with("reference"))
-output
# A tibble: 6 × 8
id indicator mpa.n mpa.m mpa.sd reference.n reference.m reference.sd
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 kelp density 1118 0.35 1.19 1118 4.28 8.48
2 fish density 1118 1.16 4.25 1118 106. 118
3 beach density 1118 13.4 31.8 1118 0.63 2.79
4 kelp biomass 1118 0.35 1.19 1118 4.28 8.48
5 fish biomass 1118 1.16 4.25 1118 106. 118
6 beach biomass 1118 13.4 31.8 1118 0.63 2.79
I have the following type of table :
# Long-format compositions: one row per Sample x Method, one column per element.
# TRUE / FALSE are spelled out because T and F are ordinary, reassignable
# variables.
df0 <- read.table(text = 'Sample Method Mg Al Ca Ti
Sa A 5.5 2.2 33 0.2
Sb A 4.2 1.2 44 0.1
Sc A 1.1 0.5 25 0.3
Sd A 3.3 1.3 31 0.5
Se A 6.2 0.2 55 0.6
Sa B 5.2 2 35 0.25
Sb B 4.6 1.3 48 0.1
Sc B 1.6 0.8 22 0.32
Sd B 3.1 1.6 29 0.4
Se B 6.8 0.3 51 0.7
Sa C 5.6 2.5 30 0.2
Sb C 4.1 1.2 41 0.15
Sc C 1 0.6 22 0.4
Sd C 3.2 1.5 30 0.5
Se C 6.8 0.1 51 0.65', header = TRUE, stringsAsFactors = FALSE)
Which include chemical compositions. I would like to use the Method A as a reference (X-axis) and to make automated scatter plots with the data from Method B, C in Y (with linear trend). With a reference line of 1:1 which would correspond to a perfect match.
In other words, I would like to produce plots like that :
I think a solution could start from transforming the data frame into:
# Wide-format version: one row per Sample, one column per element x method
# (<element>_<method>). TRUE / FALSE are spelled out because T and F are
# ordinary, reassignable variables.
df <- read.table(text = 'Sample Mg_A Al_A Ca_A Ti_A Mg_B Al_B Ca_B Ti_B Mg_C Al_C Ca_C Ti_C
Sa 5.5 2.2 33 0.2 5.2 2 35 0.25 5.6 2.5 30 0.2
Sb 4.2 1.2 44 0.1 4.6 1.3 48 0.1 4.1 1.2 41 0.15
Sc 1.1 0.5 25 0.3 1.6 0.8 22 0.32 1 0.6 22 0.4
Sd 3.3 1.3 31 0.5 3.1 1.6 29 0.4 3.2 1.5 30 0.5
Se 6.2 0.2 55 0.6 6.8 0.3 51 0.7 6.8 0.1 51 0.65
', header = TRUE, stringsAsFactors = FALSE)
But I don't know how to go further.
Any help would be appreciated.
Best, Anne-Christine
You can use the following code
library(tidyverse)
# Reshape to one column per method (A, B, C) and one row per measurement, then
# plot methods B and C against the reference method A, one panel per element.
df0 %>%
  pivot_wider(
    names_from = Method,
    values_from = c(Mg, Al, Ca, Ti)
  ) %>%
  # Wide to long: one row per Sample and "<element>_<method>" column
  pivot_longer(cols = -Sample) %>%
  separate(name, into = c("key", "number"), sep = "_") %>%
  # Running index within each method, used as a unique row ID so the values
  # of the three methods line up when they are spread again
  group_by(number) %>%
  mutate(row = row_number()) %>%
  pivot_wider(
    names_from = number,
    values_from = value
  ) %>%
  ggplot() +
  geom_point(aes(x = A, y = B, color = "A vs B")) +
  geom_point(aes(x = A, y = C, color = "A vs C")) +
  # 1:1 reference line (perfect agreement with method A)
  geom_abline(slope = 1, intercept = 0) +
  # Linear trend per comparison
  geom_smooth(
    aes(x = A, y = B, color = "A vs B"),
    method = lm, se = FALSE, fullrange = TRUE
  ) +
  geom_smooth(
    aes(x = A, y = C, color = "A vs C"),
    method = lm, se = FALSE, fullrange = TRUE
  ) +
  facet_wrap(key ~ ., scales = "free") +
  theme_bw() +
  ylab("B or C") +
  xlab("A")
Data
df0 = structure(list(Sample = c("Sa", "Sb", "Sc", "Sd", "Se", "Sa",
"Sb", "Sc", "Sd", "Se", "Sa", "Sb", "Sc", "Sd", "Se"), Method = c("A",
"A", "A", "A", "A", "B", "B", "B", "B", "B", "C", "C", "C", "C",
"C"), Mg = c(5.5, 4.2, 1.1, 3.3, 6.2, 5.2, 4.6, 1.6, 3.1, 6.8,
5.6, 4.1, 1, 3.2, 6.8), Al = c(2.2, 1.2, 0.5, 1.3, 0.2, 2, 1.3,
0.8, 1.6, 0.3, 2.5, 1.2, 0.6, 1.5, 0.1), Ca = c(33L, 44L, 25L,
31L, 55L, 35L, 48L, 22L, 29L, 51L, 30L, 41L, 22L, 30L, 51L),
Ti = c(0.2, 0.1, 0.3, 0.5, 0.6, 0.25, 0.1, 0.32, 0.4, 0.7,
0.2, 0.15, 0.4, 0.5, 0.65)), class = "data.frame", row.names = c(NA,
-15L))
I am working on a dumbbell plot in R inspired by this post, and have two problems:
Ordering the dumbbell plot (I've tried a strategy provided in this post)
Present value labels in the plot in an aesthetically pleasing way.
My data set is formatted as a wide data set with 18 units with the following structure:
> head(ADHD_med_2010_2018_wide, 18)
# A tibble: 18 x 9
age gender county adhd_pr_1000_2010 adhd_pr_1000_2018 county_label adhd_2010 adhd_2018 diff
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
1 2 [10-14] 1 [Both genders] 1 [Østfold] 32.1 24.3 Østfold 32.1 24.3 -7.80
2 2 [10-14] 1 [Both genders] 2 [Akershus] 20.6 23.0 Akershus 20.6 23 2.40
3 2 [10-14] 1 [Both genders] 3 [Oslo] 17.2 33.9 Oslo 17.2 33.9 16.7
4 2 [10-14] 1 [Both genders] 4 [Hedmark] 41.7 30.9 Hedmark 41.7 30.9 -10.8
5 2 [10-14] 1 [Both genders] 5 [Oppland] 24.9 39.0 Oppland 24.9 39 14.1
6 2 [10-14] 1 [Both genders] 6 [Buskerud] 26.7 36.8 Buskerud 26.7 36.8 10.1
7 2 [10-14] 1 [Both genders] 7 [Vestfold] 28.1 27.1 Vestfold 28.1 27 -1.10
8 2 [10-14] 1 [Both genders] 8 [Telemark] 29.2 24.7 Telemark 29.2 24.7 -4.5
9 2 [10-14] 1 [Both genders] 9 [Aust-Agder] 34.9 39.2 Aust-Agder 34.9 39.2 4.30
10 2 [10-14] 1 [Both genders] 10 [Vest-Agder] 17.4 23.8 Vest-Agder 17.4 23.8 6.40
11 2 [10-14] 1 [Both genders] 11 [Rogaland] 29.5 13.8 Rogaland 29.5 13.8 -15.7
12 2 [10-14] 1 [Both genders] 12 [Hordaland] 21.3 14.4 Hordaland 21.3 14.4 -6.90
13 2 [10-14] 1 [Both genders] 14 [Sogn og Fjordane] 21.3 39.7 Sogn og Fjordane 21.3 39.7 18.4
14 2 [10-14] 1 [Both genders] 15 [Møre og Romsdal] 27.0 18.6 Møre og Romsdal 27 18.6 -8.40
15 2 [10-14] 1 [Both genders] 18 [Nordland] 40.1 30.0 Nordland 40.1 30 -10.1
16 2 [10-14] 1 [Both genders] 19 [Troms] 25.8 33.2 Troms 25.8 33.2 7.40
17 2 [10-14] 1 [Both genders] 20 [Finnmark] 19.1 21.3 Finnmark 19.1 21.3 2.20
18 2 [10-14] 1 [Both genders] 50 [Trøndelag] 25.0 36.9 Trøndelag 25 37 12
I've tried two strategies for problem 1:
library("tidyverse")
library("ggalt")
# Dumbbell chart: one row per county, running from the 2010 value (x) to the
# 2018 value (xend). The plot is stored in fig2 and printed at the end.
fig2 <- ggplot(ADHD_med_2010_2018_wide, aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
# Draw a thick grey line between x and xend instead of using the default
# line provided by geom_dumbbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
# Note: the point-colour arguments of geom_dumbbell only exist in the UK
# spelling ('colour_x'), with no US 'color_x' alias, unlike standard ggplot geoms
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
# Value label for the 2010 end of each dumbbell
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
# Value label for the 2018 end of each dumbbell
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
fig2
Which gives a plot without ordering or values presented in a good way:
To correct ordering, I tried following the strategy provided in the post linked above:
library(dplyr)
# Second attempt: order the counties by the absolute 2010 -> 2018 change.
ADHD_med_2010_2018_wide%>%
mutate(difference = abs(adhd_2018-adhd_2010)) %>% # absolute difference between the two years
top_n(18, wt = difference) %>% # keep the 18 rows with the largest difference
ggplot() +
aes(x=adhd_2010, xend=adhd_2018, y=reorder(county_label, difference),
group=county_label) + # order the county labels by their difference value
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
# Note: the point-colour arguments of geom_dumbbell only exist in the UK
# spelling ('colour_x'), with no US 'color_x' alias, unlike standard ggplot geoms
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
# Value label for the 2010 end of each dumbbell
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
# Value label for the 2018 end of each dumbbell
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
This still does not give a plot with a nice ordering, although it seems to order by the difference (and there is still the issue with the value labels):
Hopefully some of you may have input on these issues.
Data to copy:
> dput(head(ADHD_med_2010_2018_wide, 18))
structure(list(age = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Age groups", labels = c(`5-9` = 1,
`10-14` = 2, `15-19` = 3, `20-24` = 4, `25-29` = 5, `30-34` = 6,
`All ages` = 7), class = "haven_labelled"), gender = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Gender", labels = c(`Both genders` = 1,
Female = 2, Male = 3), class = "haven_labelled"), county = structure(c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 50), labels = c(Østfold = 1,
Akershus = 2, Oslo = 3, Hedmark = 4, Oppland = 5, Buskerud = 6,
Vestfold = 7, Telemark = 8, `Aust-Agder` = 9, `Vest-Agder` = 10,
Rogaland = 11, Hordaland = 12, `Sogn og Fjordane` = 14, `Møre og Romsdal` = 15,
Nordland = 18, Troms = 19, Finnmark = 20, Trøndelag = 50, `Hele landet` = 99
), class = "haven_labelled"), adhd_pr_1000_2010 = c(32.1488990783691,
20.5894756317139, 17.2119483947754, 41.6982574462891, 24.8543014526367,
26.7194156646729, 28.1328239440918, 29.2480430603027, 34.8775291442871,
17.3759765625, 29.4698066711426, 21.340311050415, 21.3308296203613,
27.0334072113037, 40.1140670776367, 25.7862873077393, 19.1311283111572,
25.0325565338135), adhd_pr_1000_2018 = c(24.2834396362305, 23.0037822723389,
33.9068183898926, 30.8641967773438, 39.0195579528809, 36.7909698486328,
27.0642204284668, 24.6901988983154, 39.1978950500488, 23.8095245361328,
13.8218154907227, 14.4400091171265, 39.7175636291504, 18.5994052886963,
29.9642810821533, 33.1638412475586, 21.2596340179443, 36.9249382019043
), county_label = structure(18:1, .Label = c("Trøndelag", "Finnmark",
"Troms", "Nordland", "Møre og Romsdal", "Sogn og Fjordane", "Hordaland",
"Rogaland", "Vest-Agder", "Aust-Agder", "Telemark", "Vestfold",
"Buskerud", "Oppland", "Hedmark", "Oslo", "Akershus", "Østfold"
), class = "factor"), adhd_2010 = c(32.0999984741211, 20.6000003814697,
17.2000007629395, 41.7000007629395, 24.8999996185303, 26.7000007629395,
28.1000003814697, 29.2000007629395, 34.9000015258789, 17.3999996185303,
29.5, 21.2999992370605, 21.2999992370605, 27, 40.0999984741211,
25.7999992370605, 19.1000003814697, 25), adhd_2018 = c(24.2999992370605,
23, 33.9000015258789, 30.8999996185303, 39, 36.7999992370605,
27, 24.7000007629395, 39.2000007629395, 23.7999992370605, 13.8000001907349,
14.3999996185303, 39.7000007629395, 18.6000003814697, 30, 33.2000007629395,
21.2999992370605, 37), diff = c(-7.79999923706055, 2.39999961853027,
16.7000007629395, -10.8000011444092, 14.1000003814697, 10.0999984741211,
-1.10000038146973, -4.5, 4.29999923706055, 6.39999961853027,
-15.6999998092651, -6.89999961853027, 18.4000015258789, -8.39999961853027,
-10.0999984741211, 7.40000152587891, 2.19999885559082, 12)), row.names = c(NA,
-18L), class = c("tbl_df", "tbl", "data.frame"))
Here, an easy way to order your plot is to use the arrange function from dplyr to sort your data frame by the column(s) of your choice, and then to convert the grouping variable (county_label) to a factor whose levels follow that sorted order:
library(dplyr)
library(ggplot2)
# Sort the rows by the 2010 value, then rebuild county_label as a factor whose
# levels follow that sorted order, so the y axis is drawn in the same order.
DF %>% arrange(adhd_2010) %>% mutate(county_label = factor(county_label, unique(county_label))) %>%
ggplot(aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
# Draw a thick grey line between x and xend instead of using the default
# line provided by geom_dumbbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
# Note: the point-colour arguments of geom_dumbbell only exist in the UK
# spelling ('colour_x'), with no US 'color_x' alias, unlike standard ggplot geoms
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
# Value label for the 2010 end of each dumbbell
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
# Value label for the 2018 end of each dumbbell
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
It is not perfect, but at least your values are mostly ordered. Afterwards you can change the column by which you wish to order your plot (here I order by adhd_2010).
# Reformat data
# DF2 is only used for its row order: counties sorted by decreasing 2010 value.
DF2 <- DF %>% arrange(desc(adhd_2010))
# DF3 is the plotting table; `key` is a constant column passed to dumbbell().
DF3 <- DF %>% mutate("key" = "Change in Prescription Rate")
# Factor levels follow DF2's order so the counties are drawn in that order.
DF3$county_label <- factor(DF3$county_label, DF2$county_label)
# Round to 3 significant digits.
DF3$adhd_2018 <- signif(DF3$adhd_2018, digits = 3)
DF3$adhd_2010 <- signif(DF3$adhd_2010, digits = 3)
# Plot
# (the stray empty argument between pt_val and pointsize has been removed)
dumbbell::dumbbell(
  DF3,
  id = "county_label", key = "key",
  column1 = "adhd_2010", column2 = "adhd_2018",
  lab1 = "2010", lab2 = "2018",
  delt = 1, textsize = 2, pt_val = 1, pointsize = 2
) +
  xlim(13, 43) +
  labs(
    x = NULL, y = NULL,
    title = "Dumbbell Chart",
    subtitle = "Change in prescription rate: 2010 vs 2018"
  )
I added in a few bells and whistles, just toggle the options to remove
I hope someone finds it useful
Enjoy!
My data frame is simple (and probably is not strictly a dataframe):
date MAE_f0 MAE_f1
1 20140101 0.2 0.2
2 20140102 1.9 0.1
3 20140103 0.1 0.3
4 20140104 7.8 15.9
5 20140105 1.9 4.6
6 20140106 0.8 0.8
7 20140107 0.5 0.6
8 20140108 0.2 0.2
9 20140109 0.2 0.2
10 20140110 0.8 1.1
11 20140111 0.2 0.2
12 20140112 0.4 0.4
13 20140113 2.8 0.9
14 20140114 5.4 5.8
15 20140115 0.2 0.3
16 20140116 4.9 3.1
17 20140117 3.7 6.0
18 20140118 1.4 2.1
19 20140119 0.9 3.0
20 20140120 0.2 3.6
21 20140121 0.3 0.3
22 20140122 0.4 0.4
23 20140123 0.6 1.7
24 20140124 6.1 4.7
25 20140125 0.1 0.0
26 20140126 7.4 4.9
27 20140127 0.8 0.9
28 20140128 0.3 0.3
29 20140129 3.0 4.2
30 20140130 9.9 17.3
On every day I've 2 variables: MAE for f0, and MAE for f1.
I can calculate frequency for my 2 variables on the whole time period using "cut" with the same intervals for both:
# Bin both variables with the same breaks. include.lowest = TRUE closes the
# first interval at 0 ([0,2]); without it a value of exactly 0 lies outside
# (0,2] and is returned as NA.
cut(mae.df$MAE_f0, breaks = c(0, 2, 5, 10, 50), include.lowest = TRUE)
cut(mae.df$MAE_f1, breaks = c(0, 2, 5, 10, 50), include.lowest = TRUE)
Well. Now I can use boxplot to plot variable versus it's frequency distribution:
# One boxplot per variable, grouped by that variable's own interval.
# include.lowest = TRUE keeps values of exactly 0 in the first interval
# instead of turning them into NA (which boxplot would silently drop).
boxplot(
  mae.df$MAE_f0 ~ cut(mae.df$MAE_f0, breaks = c(0, 2, 5, 10, 50), include.lowest = TRUE)
)
boxplot(
  mae.df$MAE_f1 ~ cut(mae.df$MAE_f1, breaks = c(0, 2, 5, 10, 50), include.lowest = TRUE)
)
The two resulting boxplots are very simple (I can't show them because I have no "reputation"): the x axis shows the frequency intervals (0-2, 2-5, 5-10, 10-50) and the y axis shows the boxplot of the variable MAE_f0 for each interval.
Well, the question is very trivial: I'd like to have only one box plot, with both variables MAE_f0 and MAE_f1 and it's frequency distribution: I'd like to have is a plot with 2 boxplot for each frequency interval (I mean: 2 for 0-2, 2 for 2-5 and so on).
I know that my knowledge of R, data frames and so on is very poor and that I'm missing something important about these topics, especially about data frames and reshaping. Sorry in advance for that! I've seen some nice examples on Stack Overflow about grouped boxplots, but all of them without a time variable, and I'm not able to figure out how to adjust my data frame to do that.
I hope my question is not misplaced: sorry again for that.
Umbe
Here is how I would do this. I think it makes sense to melt your data first. A quick tutorial on melting your data is available here.
# First, make this reproducible by using dput for the data frame
df <- structure(list(date = 20140101:20140130, MAE_f0 = c(0.2, 1.9, 0.1, 7.8, 1.9, 0.8, 0.5, 0.2, 0.2, 0.8, 0.2, 0.4, 2.8, 5.4, 0.2, 4.9, 3.7, 1.4, 0.9, 0.2, 0.3, 0.4, 0.6, 6.1, 0.1, 7.4, 0.8, 0.3, 3, 9.9), MAE_f1 = c(0.2, 0.1, 0.3, 15.9, 4.6, 0.8, 0.6, 0.2, 0.2, 1.1, 0.2, 0.4, 0.9, 5.8, 0.3, 3.1, 6, 2.1, 3, 3.6, 0.3, 0.4, 1.7, 4.7, 0, 4.9, 0.9, 0.3, 4.2, 17.3)), .Names = c("date", "MAE_f0", "MAE_f1"), row.names = c(NA, -30L), class = "data.frame")
# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later.
library(ggplot2)
library(reshape2)
# Melt the original data frame: MAE_f0 / MAE_f1 are stacked into the
# `variable` / `value` columns, `date` is kept as the id column
df2 <- melt(df, measure.vars = c("MAE_f0", "MAE_f1"))
head(df2)
# date variable value
# 1 20140101 MAE_f0 0.2
# 2 20140102 MAE_f0 1.9
# 3 20140103 MAE_f0 0.1
# 4 20140104 MAE_f0 7.8
# 5 20140105 MAE_f0 1.9
# 6 20140106 MAE_f0 0.8
# Create a "cuts" variable with the correct breaks
# (open-ended at both sides, so every value falls into one of the four bins)
df2$cuts <- cut(
  df2$value,
  breaks = c(-Inf, 2, 5, 10, +Inf),
  labels = c("first cut", "second cut", "third cut", "fourth cut")
)
head(df2)
# date variable value cuts
# 1 20140101 MAE_f0 0.2 first cut
# 2 20140102 MAE_f0 1.9 first cut
# 3 20140103 MAE_f0 0.1 first cut
# 4 20140104 MAE_f0 7.8 third cut
# 5 20140105 MAE_f0 1.9 first cut
# 6 20140106 MAE_f0 0.8 first cut
# Plotting: one panel per bin, one box per variable inside each panel
ggplot(df2, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  facet_wrap(~ cuts, nrow = 1)
Result:
Here is one way. You reshape your data. Then, you want to add a fake data point in this case. I noticed that there is no data point for MAE_f0 for (10,50](frequency 10-50). Combine your reshaped data and the fake data. When you draw a figure, use coord_cartesian with the range of y values in the original data set. Hope this gives you an ideal graphic. Here, your data is called mydf
library(dplyr)
library(tidyr)
library(ggplot2)
mydf <- structure(list(V1 = 1:30, V2 = 20140101:20140130, V3 = c(0.2,
1.9, 0.1, 7.8, 1.9, 0.8, 0.5, 0.2, 0.2, 0.8, 0.2, 0.4, 2.8, 5.4,
0.2, 4.9, 3.7, 1.4, 0.9, 0.2, 0.3, 0.4, 0.6, 6.1, 0.1, 7.4, 0.8,
0.3, 3, 9.9), V4 = c(0.2, 0.1, 0.3, 15.9, 4.6, 0.8, 0.6, 0.2,
0.2, 1.1, 0.2, 0.4, 0.9, 5.8, 0.3, 3.1, 6, 2.1, 3, 3.6, 0.3,
0.4, 1.7, 4.7, 0, 4.9, 0.9, 0.3, 4.2, 17.3)), .Names = c("V1",
"V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -30L
))
# Reshape: drop the row counter V1, give the remaining columns their real
# names, stack MAE_f0 / MAE_f1 into variable / value, and bin each value.
ana <- mydf %>%
  select(-V1) %>%
  rename(date = V2, MAE_f0 = V3, MAE_f1 = V4) %>%
  gather(key = variable, value = value, -date) %>%
  mutate(frequency = cut(value, breaks = c(-Inf, 2, 5, 10, 50)))

# Fake observation: a single MAE_f0 row placed in the "(10,50]" bin with a
# value of 60.
extra <- data.frame(
  date = 20140101,
  variable = "MAE_f0",
  value = 60,
  frequency = "(10,50]"
)

new <- rbind(ana, extra)

# The y axis is zoomed to the range of the real values (plus a 0.25 margin on
# each side), so the fake value of 60 lies outside the visible area.
ggplot(data = new, mapping = aes(x = frequency, y = value, fill = variable)) +
  geom_boxplot(position = "dodge") +
  coord_cartesian(ylim = range(ana$value) + c(-0.25, 0.25))