Related
Using a dataframe with missing values:
structure(list(id = c("id1", "test", "rew", "ewt"), total_frq_1 = c(54, 87, 10, 36), total_frq_2 = c(45, 24, 202, 43), total_frq_3 = c(24, NA, 25, 8), total_frq_4 = c(36, NA, 104, NA)), row.names = c(NA, 4L), class = "data.frame")
How is it possible to create a bar plot with the mean of every column (excluding the id column) without filling the missing values with 0, but instead leaving the missing values out of the calculation? For example, for total_frq_3: (24 + 25 + 8) / 3 = 57 / 3 = 19.
You can use colMeans function and pass it the appropriate argument to ignore NA.
library(ggplot2)
# Example data: an id column followed by frequency columns that contain NAs.
xy <- structure(
  list(
    id = c("id1", "test", "rew", "ewt"),
    total_frq_1 = c(54, 87, 10, 36),
    total_frq_2 = c(45, 24, 202, 43),
    total_frq_3 = c(24, NA, 25, 8),
    total_frq_4 = c(36, NA, 104, NA)
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)

# Select the value columns by name rather than by position, and keep the
# selection a data frame (drop = FALSE) so colMeans() still works when only
# one value column is left.
value_cols <- setdiff(names(xy), "id")

# na.rm = TRUE averages over the non-missing entries only,
# e.g. total_frq_3: (24 + 25 + 8) / 3 = 19.
xy.means <- colMeans(x = xy[, value_cols, drop = FALSE], na.rm = TRUE)

# One row per column: the mean in `xy.means`, the column name in `total`.
xy.means <- data.frame(
  xy.means = xy.means,
  total = names(xy.means),
  stringsAsFactors = FALSE
)
# Bar chart with one bar per value column; bar height is the column mean.
ggplot(data = xy.means, mapping = aes(x = total, y = xy.means)) +
  geom_col() +
  theme_bw()
Or just use base R graphics:
# Base-graphics bar plot of the per-column means, ignoring NAs. The id column
# is excluded by name, and drop = FALSE keeps the selection a data frame even
# if only one value column remains (colMeans() needs two dimensions).
barplot(
  height = colMeans(
    x = xy[, setdiff(names(xy), "id"), drop = FALSE],
    na.rm = TRUE
  )
)
I have this data, which shows how the absence rate affects students' grades in three different years. I'm trying to draw a one-dimensional scatter plot with the three years (G1, G2 and G3) on the y-axis and absences on the x-axis. The plot should show the occurrences of grade = 0 at each absences value, something like the picture provided below.
desired output:
my data:
structure(list(absences = c("6", "4", "10", "2", "4", "10", "0",
"6", "o", "0", "0", "4", "2", "2", "0", "4", "6", "4", "16",
"4"), G1 = c(5, 5, 7, 15, 6, 15, 12, 6, 16, 14, 10, 10, 14, 10,
14, 14, 13, 8, 6, 8), G2 = c(6, 5, 8, 14, 10, 15, 12, 5, 18,
15, 8, 12, 14, 10, 16, 14, 14, 10, 5, 10), G3 = c(6, 6, 10, 15,
10, 15, 11, 6, 19, 15, 9, 12, 14, 11, 16, 14, 14, 10, 5, 10)), row.names = c(NA,
-20L), spec = structure(list(cols = list(absences = structure(list(), class = c("collector_character",
"collector")), G1 = structure(list(), class = c("collector_double",
"collector")), G2 = structure(list(), class = c("collector_double",
"collector")), G3 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = "\t"), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Something like this?
library(dplyr)
library(tidyr)
library(ggplot2)
# Simulated stand-in for the question's data: 20 students, each with an
# absence count and one grade per year (G1-G3), all drawn from 0..16.
set.seed(2022)
tibble(
  absences = sample(0:16, 20, replace = TRUE),
  G1 = sample(0:16, 20, replace = TRUE),
  G2 = sample(0:16, 20, replace = TRUE),
  G3 = sample(0:16, 20, replace = TRUE)
) %>%
  # Long format: `key` holds the year (G1/G2/G3), `value` holds the grade.
  pivot_longer(
    cols = -absences,
    names_to = "key",
    values_to = "value"
  ) %>%
  # Keep only the zero grades. The filter has to test the grade (`value`);
  # testing the year label (`key`) against 0 never removes a row.
  filter(value == 0) %>%
  # One point per zero grade: absences on x, year on y.
  ggplot(aes(absences, key)) +
  # geom_jitter(color = "red", height = 0.1)
  geom_point(color = "red")
I didn't know whether your grade/absence combinations are distinct. If they are not, you can use the geom_jitter() line I commented out instead of geom_point().
Output:
Here is a way how you could start to achieve your task:
library(tidyverse)
# Plot grades per year against absences, with point size showing the grade.
# Assumes `df` is the question's data (absences, G1, G2, G3) - it is not
# defined in this snippet.
df %>%
# Reshape G1/G2/G3 into the default `name` (year) / `value` (grade) columns.
pivot_longer(
-absences
) %>%
# The data contains the typo "o" for zero; fix it before converting to numeric.
mutate(absences = as.numeric(replace(absences, absences == "o", "0"))) %>%
group_by(absences, name, value) %>%
# NOTE(review): `absences` is both a grouping column and the column being
# summarised here - confirm this runs (and does what is intended) on the
# installed dplyr version.
summarise(absences = sum(absences, na.rm = TRUE)) %>%
# Year on x and absences (as discrete levels) on y; the axes are swapped by
# coord_flip() at the end.
ggplot(aes(x=name, y=factor(absences)))+
geom_point(aes(size = value), color="red")+
theme_minimal() +
labs(title = "Your title", y ="Absences", x = "Year") +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5)) +
# NOTE(review): no colour aesthetic is mapped above, so only the size guide
# appears to have an effect - confirm.
guides(color= guide_legend(), size=guide_legend())+
coord_flip()
This question already has answers here:
Wrap long axis labels via labeller=label_wrap in ggplot2
(4 answers)
Closed 1 year ago.
I have some data for which the variable names are too long. When I don't put them at an angle, they overlap. When I put them at an angle, they look like the example below.
What I would like to do is simply have the possibility to write the problematic variable as:
This is a very long
name specifically
for the example
But I cannot figure out how to do this in ggplot2.
library(ggplot2)
# Example data: counts and percentages per ECOST level and group. The last
# ECOST label is the long one that causes the axis-label problem.
counts <- structure(
  list(
    ECOST = c(
      "0.52", "0.52", "0.39", "0.39", "0.26", "0.26", "0.13", "0.13", "0.00",
      "This is a very long name specifically for the example"
    ),
    # Rows alternate control / treatment.
    group = rep(c("control", "treatment"), times = 5),
    count = c(18, 31, 30, 35, 47, 46, 66, 68, 86, 86),
    percentage = c(
      16.3636363636364, 31.9587628865979, 27.2727272727273, 36.0824742268041,
      42.7272727272727, 47.4226804123711, 60, 70.1030927835051,
      78.1818181818182, 88.659793814433
    ),
    total = rep(c(110, 97), times = 5),
    negative_count = c(92, 66, 80, 62, 63, 51, 44, 29, 24, 11),
    # Each consecutive control/treatment pair shares one p-value.
    p_value = rep(
      c(
        0.00843644912924255, 0.172947686684261, 0.497952719783453,
        0.128982570547408, 0.0447500820026408
      ),
      each = 2
    )
  ),
  row.names = c(NA, -10L),
  class = c("data.table", "data.frame")
)
ECOST group count percentage total negative_count p_value
1: 0.52 control 18 16 110 92 0.0084
2: 0.52 treatment 31 32 97 66 0.0084
3: 0.39 control 30 27 110 80 0.1729
4: 0.39 treatment 35 36 97 62 0.1729
5: 0.26 control 47 43 110 63 0.4980
6: 0.26 treatment 46 47 97 51 0.4980
7: 0.13 control 66 60 110 44 0.1290
8: 0.13 treatment 68 70 97 29 0.1290
9: 0.00 control 86 78 110 24 0.0448
10: This is a very long name specifically for the example treatment 86 89 97 11 0.0448
# Dodged bar chart of percentage per ECOST level and group, with every bar
# labelled by its percentage.
# The data frame is passed to ggplot() directly, so the snippet runs with only
# library(ggplot2) attached (ggplot2 does not provide the %>% pipe).
ggplot(
  counts,
  aes(
    x = ECOST, y = percentage, fill = group,
    label = sprintf("%.02f %%", round(percentage, digits = 1))
  )
) +
  geom_col(position = "dodge") +
  geom_text(
    position = position_dodge(width = .9), # move to center of bars
    vjust = -0.5, # nudge above top of bar
    size = 4
  ) +
  scale_fill_grey(start = 0.8, end = 0.5) +
  theme_bw(base_size = 15) +
  # Rotate the x labels; applied after theme_bw() so it is not overwritten.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The simplest solution is to use str_wrap from stringr package to set the new lines automatically and make your plot code reproducible in other scenarios. The scales package also provides label_wrap and wrap_format which can be convenient in some cases (for example here you can also use scale_x_discrete(labels = scales::wrap_format(20))).
library(tidyverse)
library(ggplot2)
# Example data: counts and percentages per ECOST level and group. The last
# ECOST label is long enough to need wrapping on the x-axis.
counts <- structure(
  list(
    ECOST = c(
      "0.52", "0.52", "0.39", "0.39", "0.26", "0.26", "0.13", "0.13", "0.00",
      "This is a very long name specifically for the example"
    ),
    # Rows alternate control / treatment.
    group = rep(c("control", "treatment"), times = 5),
    count = c(18, 31, 30, 35, 47, 46, 66, 68, 86, 86),
    percentage = c(
      16.3636363636364, 31.9587628865979, 27.2727272727273, 36.0824742268041,
      42.7272727272727, 47.4226804123711, 60, 70.1030927835051,
      78.1818181818182, 88.659793814433
    ),
    total = rep(c(110, 97), times = 5),
    negative_count = c(92, 66, 80, 62, 63, 51, 44, 29, 24, 11),
    # Each consecutive control/treatment pair shares one p-value.
    p_value = rep(
      c(
        0.00843644912924255, 0.172947686684261, 0.497952719783453,
        0.128982570547408, 0.0447500820026408
      ),
      each = 2
    )
  ),
  row.names = c(NA, -10L),
  class = c("data.table", "data.frame")
)
# Same dodged bar chart as in the question, plus automatic wrapping of the
# x-axis labels at roughly 20 characters per line.
ggplot(
  data = counts,
  mapping = aes(
    x = ECOST,
    y = percentage,
    fill = group,
    label = sprintf("%.02f %%", round(percentage, digits = 1))
  )
) +
  geom_col(position = "dodge") +
  geom_text(
    size = 4,
    vjust = -0.5, # nudge above top of bar
    position = position_dodge(width = .9) # move to center of bars
  ) +
  # str_wrap() inserts the line breaks, so the data itself stays untouched.
  scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 20)) +
  scale_fill_grey(start = 0.8, end = 0.5) +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Created on 2021-02-22 by the reprex package (v0.3.0)
You can break lines using \n.
Code:
library(ggplot2)
# Example data: counts and percentages per ECOST level and group. The long
# ECOST label carries explicit "\n" line breaks so it is drawn on four lines.
counts <- structure(
  list(
    ECOST = c(
      "0.52", "0.52", "0.39", "0.39", "0.26", "0.26", "0.13", "0.13", "0.00",
      "This is a \nvery long name \nspecifically for the \nexample"
    ),
    # Rows alternate control / treatment.
    group = rep(c("control", "treatment"), times = 5),
    count = c(18, 31, 30, 35, 47, 46, 66, 68, 86, 86),
    percentage = c(
      16.3636363636364, 31.9587628865979, 27.2727272727273, 36.0824742268041,
      42.7272727272727, 47.4226804123711, 60, 70.1030927835051,
      78.1818181818182, 88.659793814433
    ),
    total = rep(c(110, 97), times = 5),
    negative_count = c(92, 66, 80, 62, 63, 51, 44, 29, 24, 11),
    # Each consecutive control/treatment pair shares one p-value.
    p_value = rep(
      c(
        0.00843644912924255, 0.172947686684261, 0.497952719783453,
        0.128982570547408, 0.0447500820026408
      ),
      each = 2
    )
  ),
  row.names = c(NA, -10L),
  class = c("data.table", "data.frame")
)
library(dplyr)
# Dodged bar chart of percentage per ECOST level and group. Label wrapping
# comes from the "\n" characters stored in the ECOST values, so no label
# function is needed here.
ggplot(
  data = counts,
  mapping = aes(
    x = ECOST,
    y = percentage,
    fill = group,
    label = sprintf("%.02f %%", round(percentage, digits = 1))
  )
) +
  geom_col(position = "dodge") +
  geom_text(
    size = 4,
    vjust = -0.5, # nudge above top of bar
    position = position_dodge(width = .9) # move to center of bars
  ) +
  scale_fill_grey(start = 0.8, end = 0.5) +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-output
I am using the following data frame.
# Stack every column whose name contains "Attendance" into key/value pairs:
# the column name goes to `Hospital`, the cell value to `Attendance`.
df2 <- gather(
  final.data,
  key = Hospital,
  value = Attendance,
  contains("Attendance")
)
# Spread the pairs back out to one column per hospital (printed, not stored).
spread(df2, key = Hospital, value = Attendance)
> dput(final.data[0:2,])
structure(list(RoyalPerth.Attendance = c(235, 209), RoyalPerth.Admissions = c(99,
97), RoyalPerth.Tri1 = c("8", "N/A"), RoyalPerth.Tri2 = c(33,
41), RoyalPerth.Tri3 = c(89, 73), RoyalPerth.Tri4 = c(85, 80),
RoyalPerth.Tri5 = c("20", "14"), Fremantle.Attendance = c(155,
145), Fremantle.Admissions = c(70, 56), Fremantle.Tri1 = c("N/A",
"N/A"), Fremantle.Tri2 = c(25, 22), Fremantle.Tri3 = c(67,
51), Fremantle.Tri4 = c(54, 47), Fremantle.Tri5 = c(9, 24
), PrincessMargaret.Attendance = c(252, 219), PrincessMargaret.Admissions = c(59,
47), PrincessMargaret.Tri1 = c("N/A", "N/A"), PrincessMargaret.Tri2 = c("13",
"14"), PrincessMargaret.Tri3 = c(75, 61), PrincessMargaret.Tri4 = c(159,
139), PrincessMargaret.Tri5 = c("4", "4"), KingEdward.Attendance = c(52,
43), KingEdward.Admissions = c("6", "7"), KingEdward.Tri1 = c("N/A",
"N/A"), KingEdward.Tri2 = c("N/A", "N/A"), KingEdward.Tri3 = c("7",
"N/A"), KingEdward.Tri4 = c(20, 25), KingEdward.Tri5 = c("25",
"17"), SirCharles.Attendance = c(209, 184), SirCharles.Admissions = c(109,
112), SirCharles.Tri1 = c("N/A", "N/A"), SirCharles.Tri2 = c(42,
43), SirCharles.Tri3 = c(108, 73), SirCharles.Tri4 = c(47,
61), SirCharles.Tri5 = c("11", "5"), Armadale.Attendance = c(166,
175), Armadale.Admissions = c(19, 25), Armadale.Tri1 = c("N/A",
"N/A"), Armadale.Tri2 = c(16, 26), Armadale.Tri3 = c(62,
73), Armadale.Tri4 = c(79, 55), Armadale.Tri5 = c("9", "19"
), Swan.Attendance = c(133, 129), Swan.Admissions = c(17,
25), Swan.Tri1 = c("N/A", "N/A"), Swan.Tri2 = c(29, 25),
Swan.Tri3 = c(59, 57), Swan.Tri4 = c(42, 43), Swan.Tri5 = c("N/A",
"4"), Rockingham.Attendance = c(155, 145), Rockingham.Admissions = c("10",
"24"), Rockingham.Tri1 = c("N/A", "N/A"), Rockingham.Tri2 = c(12,
26), Rockingham.Tri3 = c(51, 45), Rockingham.Tri4 = c(81,
65), Rockingham.Tri5 = c("11", "8"), Joondalup.Attendance = c(267,
241), Joondalup.Admissions = c(73, 81), Joondalup.Tri1 = c("N/A",
"N/A"), Joondalup.Tri2 = c(27, 23), Joondalup.Tri3 = c(75,
78), Joondalup.Tri4 = c(151, 133), Joondalup.Tri5 = c("12",
"7")), row.names = 1:2, class = "data.frame")
Error:
Warning message:
attributes are not identical across measure variables;
they will be dropped
I have tried the following:
hospital.dataset<-gather(hospital,triage,sum,Tri1:Tri5) to gather Triage
after using cbind on the data set.
I want to convert it into a long data set using gather.
dput(hospital.dataset[1:2,])
structure(list(Date = structure(c(-714598, -714597), class = "Date"), # [enter image description here][1]
Attendance = c(235, 209), Admissions = c(99, 97), Hospital = structure(c(1L,
1L), .Label = c("RoyalPerth Hospital", "Fremantle Hospital",
"Princess Margaret Hospital", "KingEdward Hospital", "SirCharles Hospital",
"Armadale Hospital", "Swan Hospital", "Rockingham Hospital",
"Joondalup Hospital"), class = "factor"), triage = c("Tri1",
"Tri1"), sum = c(8, 0)), row.names = 1:2, class = "data.frame")
Like this.
Thanks in advance.
Expected Dataframe
Note: This solution feels like a lot of effort. So please consider there may be more elegant approaches available.
One issue with this data is that the values you want "wide" (Attendance, Admissions) are mixed in with the values you want "long" (Tri1, Tri2, etc).
This solution uses pivot_longer on the entire data frame (note: pivot_longer is the new gather syntax) , and then separate to pull out the hospital name from the specific data field.
Then it splits into two data frames, applies pivot_wider to the Attendance/Admissions columns, and rejoins after that.
library(tidyverse)
# Step 1: one row per (source row, hospital, field).
# All columns are cast to character first because the raw columns mix numeric
# and character vectors, and pivot_longer() needs a single value type.
final_data_long <- final.data.raw %>%
  mutate(across(everything(), as.character)) %>%
  mutate(row_n = row_number()) %>%
  pivot_longer(-row_n, names_to = "field", values_to = "value") %>%
  # Column names look like "<hospital>.<field>"; split on the literal dot.
  separate(field, into = c("hospital", "category"), sep = "\\.")

# Step 2: split into the fields that should end up wide (Attendance,
# Admissions) and the ones that stay long (everything else, i.e. Tri1-Tri5).
wide_fields <- "Attendance|Admissions"

attend_admit <- final_data_long %>%
  filter(str_detect(category, wide_fields))

triage <- final_data_long %>%
  filter(!str_detect(category, wide_fields))

# Step 3: one Attendance and one Admissions column per (row_n, hospital).
attend_admit_long <- attend_admit %>%
  pivot_wider(
    id_cols = c(row_n, hospital),
    names_from = category,
    values_from = value
  )

# Step 4: attach the wide columns to every triage row of the same source row
# and hospital, then drop the helper row index.
triage %>%
  inner_join(attend_admit_long, by = c("row_n", "hospital")) %>%
  arrange(hospital) %>%
  select(-row_n)
Output
# A tibble: 90 x 5
hospital category value Attendance Admissions
<chr> <chr> <chr> <chr> <chr>
1 Armadale Tri1 N/A 166 19
2 Armadale Tri2 16 166 19
3 Armadale Tri3 62 166 19
4 Armadale Tri4 79 166 19
5 Armadale Tri5 9 166 19
6 Armadale Tri1 N/A 175 25
7 Armadale Tri2 26 175 25
8 Armadale Tri3 73 175 25
9 Armadale Tri4 55 175 25
10 Armadale Tri5 19 175 25
# … with 80 more rows
Data
*I couldn't get OP's dput to work, here's a version that can be copy/pasted:
# Copy-pasteable version of the question's data: two rows (row.names = 1:2),
# one column per "<Hospital>.<Field>" combination, where Field is Attendance,
# Admissions or Tri1-Tri5.
# Some columns are numeric and others are character; the character columns
# use the string "N/A" where a value is missing.
final.data.raw <- structure(
list(RoyalPerth.Attendance = c(235, 209), RoyalPerth.Admissions = c(99, 97), RoyalPerth.Tri1 = c("8", "N/A"),
RoyalPerth.Tri2 = c(33, 41), RoyalPerth.Tri3 = c(89, 73), RoyalPerth.Tri4 = c(85, 80),
RoyalPerth.Tri5 = c("20", "14"), Fremantle.Attendance = c(155, 145), Fremantle.Admissions = c(70, 56),
Fremantle.Tri1 = c("N/A", "N/A"), Fremantle.Tri2 = c(25, 22), Fremantle.Tri3 = c(67, 51),
Fremantle.Tri4 = c(54, 47), Fremantle.Tri5 = c(9, 24), PrincessMargaret.Attendance = c(252, 219),
PrincessMargaret.Admissions = c(59,47), PrincessMargaret.Tri1 = c("N/A", "N/A"), PrincessMargaret.Tri2 = c("13", "14"),
PrincessMargaret.Tri3 = c(75, 61), PrincessMargaret.Tri4 = c(159, 139), PrincessMargaret.Tri5 = c("4", "4"),
KingEdward.Attendance = c(52, 43), KingEdward.Admissions = c("6", "7"), KingEdward.Tri1 = c("N/A", "N/A"),
KingEdward.Tri2 = c("N/A", "N/A"), KingEdward.Tri3 = c("7", "N/A"), KingEdward.Tri4 = c(20, 25),
KingEdward.Tri5 = c("25", "17"), SirCharles.Attendance = c(209, 184), SirCharles.Admissions = c(109, 112),
SirCharles.Tri1 = c("N/A", "N/A"), SirCharles.Tri2 = c(42, 43), SirCharles.Tri3 = c(108, 73),
SirCharles.Tri4 = c(47, 61), SirCharles.Tri5 = c("11", "5"), Armadale.Attendance = c(166, 175),
Armadale.Admissions = c(19, 25), Armadale.Tri1 = c("N/A", "N/A"), Armadale.Tri2 = c(16, 26),
Armadale.Tri3 = c(62, 73), Armadale.Tri4 = c(79, 55), Armadale.Tri5 = c("9", "19"),
Swan.Attendance = c(133, 129), Swan.Admissions = c(17, 25), Swan.Tri1 = c("N/A", "N/A"),
Swan.Tri2 = c(29, 25), Swan.Tri3 = c(59, 57), Swan.Tri4 = c(42, 43),
Swan.Tri5 = c("N/A", "4"), Rockingham.Attendance = c(155, 145), Rockingham.Admissions = c("10", "24"),
Rockingham.Tri1 = c("N/A", "N/A"), Rockingham.Tri2 = c(12, 26), Rockingham.Tri3 = c(51, 45),
Rockingham.Tri4 = c(81, 65), Rockingham.Tri5 = c("11", "8"), Joondalup.Attendance = c(267, 241),
Joondalup.Admissions = c(73, 81), Joondalup.Tri1 = c("N/A", "N/A"), Joondalup.Tri2 = c(27, 23),
Joondalup.Tri3 = c(75, 78), Joondalup.Tri4 = c(151, 133), Joondalup.Tri5 = c("12", "7")),
row.names = 1:2, class = "data.frame")
I have a panel (cross-sectional time series) dataset. For each group (defined by (NAICS2, occ_type) in time ym) I have many variables. For each variable I would like to subtract each group's first (dplyr::first) value from every value of that group.
Ultimately I am trying to compute the Euclidean distance between each row's vector and the vector of its group's first entry, i.e. sqrt(c_1^2 + ... + c_k^2), where c_j is the difference between the row and the first entry in variable j.
I was able to create a column equal to the first entry of each group:
# Keep one row per (ym, NAICS2, occ_type) and add, for every *_scf column, a
# companion column "<name>_first" holding the first value within the group.
# NOTE(review): ym is one of the grouping columns and distinct() keeps one row
# per (ym, NAICS2, occ_type), so every group has exactly one row and "first"
# equals the row's own value - confirm whether grouping by NAICS2 and
# occ_type only was intended.
df2 <- df %>%
  group_by(ym, NAICS2, occ_type) %>%
  distinct(ym, NAICS2, occ_type, .keep_all = TRUE) %>%
  arrange(occ_type, NAICS2, ym) %>%
  select(group_cols(), ends_with("_scf")) %>%
  # After the select() only grouping columns and *_scf columns remain, and
  # across() never touches grouping columns, so this covers the same columns
  # as before; the named list yields the "<name>_first" suffix.
  mutate(across(ends_with("_scf"), list(first = dplyr::first)))
I then tried to include variations of f.diff = . - dplyr::first(.) in the list, but none of those worked. I googled the dot notation for a while as well as first and lag in dplyr timeseries but have not been able to resolve this yet.
Ideally, I unite all variables into a vector for each row first and then take the difference.
# Sketch from the question (not runnable as written): collapse all *_scf
# columns of each row into one comma-separated string column `vector`, then
# derive a distance column from it.
df2 <- df %>%
group_by(ym, NAICS2, occ_type) %>%
distinct(ym, NAICS2, occ_type, .keep_all = T) %>%
arrange(occ_type, NAICS2, ym) %>%
select(group_cols(), ends_with("_scf")) %>%
# unite() pastes the selected columns together with sep = ',', so `vector`
# is a character column rather than a numeric vector.
unite(vector, c(-group_cols(), ends_with("_scf")), sep = ',') %>%
# TODO: DISTANCE_BETWEEN_ENTRY_AND_FIRST
# `???` is a placeholder for the missing distance computation; the pipeline
# does not parse until it is replaced.
mutate(vector.diff = ???)
I expect the output to be a numeric column that contains a distance measure of how different each group's row vector is from its initial row vector.
Here is a sample of the data:
structure(list(ym = c("2007-01-01", "2007-02-01"), NAICS2 = c(0L,
0L), occ_type = c("is_middle_manager", "is_middle_manager"),
Administration_scf = c(344, 250), Agriculture..Horticulture..and.the.Outdoors_scf = c(11,
17), Analysis_scf = c(50, 36), Architecture.and.Construction_scf = c(57,
51), Business_scf = c(872, 585), Customer.and.Client.Support_scf = c(302,
163), Design_scf = c(22, 17), Economics..Policy..and.Social.Studies_scf = c(7,
7), Education.and.Training_scf = c(77, 49), Energy.and.Utilities_scf = c(25,
28), Engineering_scf = c(90, 64), Environment_scf = c(19,
19), Finance_scf = c(455, 313), Health.Care_scf = c(105,
71), Human.Resources_scf = c(163, 124), Industry.Knowledge_scf = c(265,
174), Information.Technology_scf = c(467, 402), Legal_scf = c(21,
17), Maintenance..Repair..and.Installation_scf = c(194, 222
), Manufacturing.and.Production_scf = c(176, 174), Marketing.and.Public.Relations_scf = c(139,
109), Media.and.Writing_scf = c(18, 20), Personal.Care.and.Services_scf = c(31,
16), Public.Safety.and.National.Security_scf = c(14, 7),
Religion_scf = c(0, 0), Sales_scf = c(785, 463), Science.and.Research_scf = c(52,
24), Supply.Chain.and.Logistics_scf = c(838, 455), total_scf = c(5599,
3877)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(ym = c("2007-01-01",
"2007-02-01"), NAICS2 = c(0L, 0L), occ_type = c("is_middle_manager",
"is_middle_manager"), .rows = list(1L, 2L)), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))