use of pivot_wider to plot the evolution of variables in R - r

I would like to plot the evolution of the number of workers per category ("A", "D", "F", "I") from 2017 to 2021 as a stacked bar chart, with one bar per year and a label in the middle of each segment of the bar (one per category). My dataset isn't in the right shape for this, though. From what I have seen here, I think I need to use pivot_wider() or pivot_longer(), but I don't really know how to use these functions. Could anyone help?
Here is the structure of my dataset, for reproducibility :
structure(list(A = c("10", "7", "8", "8", "9", "Total"), D = c(23,
14, 29, 35, 16, 117), F = c(8, 7, 11, 6, 6, 38), I = c(449, 498,
415, 470, 531, 2363), annee = c("2017", "2018", "2019", "2020",
"2021", NA)), core = structure(list(A = c("10", "7", "8", "8",
"9"), D = c(23, 14, 29, 35, 16), F = c(8, 7, 11, 6, 6), I = c(449,
498, 415, 470, 531)), class = "data.frame", row.names = c(NA,
-5L)), tabyl_type = "two_way", totals = "row", row.names = c(NA,
6L), class = c("tabyl", "data.frame"))

library(tidyverse)
library(ggrepel)
df <- structure(list(A = c("10", "7", "8", "8", "9", "Total"), D = c(
23,
14, 29, 35, 16, 117
), F = c(8, 7, 11, 6, 6, 38), I = c(
449, 498,
415, 470, 531, 2363
), annee = c(
"2017", "2018", "2019", "2020",
"2021", NA
)), core = structure(list(A = c(
"10", "7", "8", "8",
"9"
), D = c(23, 14, 29, 35, 16), F = c(8, 7, 11, 6, 6), I = c(
449,
498, 415, 470, 531
)), class = "data.frame", row.names = c(
NA,
-5L
)), tabyl_type = "two_way", totals = "row", row.names = c(
NA,
6L
), class = c("tabyl", "data.frame"))
# Stacked bar chart of the number of workers per category, one bar per year,
# with each count labelled inside its own segment.
df |>
  # The "Total" row is the only row whose `annee` is NA; drop it so it is not
  # drawn as an extra bar.
  filter(!is.na(annee)) |>
  # `A` is character because it also held the "Total" label; with that row
  # gone only numeric strings remain.
  mutate(A = as.double(A)) |>
  pivot_longer(-annee, names_to = "category") |>
  ggplot(aes(annee, value, fill = category, label = value)) +
  geom_col() +
  # vjust = 0.5 anchors every label at the middle of its segment; the default
  # (vjust = 1) anchors it at the segment's top edge.
  geom_label_repel(position = position_stack(vjust = 0.5), max.overlaps = 20)
Created on 2022-08-08 by the reprex package (v2.0.1)

Once you remove the total row and ensure that A through I are numeric, you can pivot_longer and pass the result to ggplot() like this:
# Drop the totals row, coerce the count columns to numeric, reshape to long
# format, then draw one stacked bar per year.
data %>%
  filter(A != "Total") %>%
  mutate(across(A:I, as.numeric)) %>%
  pivot_longer(cols = -annee, names_to = "group", values_to = "ct") %>%
  ggplot(aes(x = annee, y = ct, fill = group)) +
  geom_col()
I did not add the category labels, since group I dominates each year; you might want to reconsider that visualization

Related

How to plot multiple one-dimensional scatter plots in R based on occurrences of a specific value at a specific point on the x-axis

I have data showing how the absence rate affects students' grades over three different years. I'm trying to draw a one-dimensional scatter plot with the three years (G1, G2 and G3) on the y-axis and absences on the x-axis. The plot should show the occurrences of grade = 0 at each absences value, something like the picture provided below.
desired output:
my data:
structure(list(absences = c("6", "4", "10", "2", "4", "10", "0",
"6", "o", "0", "0", "4", "2", "2", "0", "4", "6", "4", "16",
"4"), G1 = c(5, 5, 7, 15, 6, 15, 12, 6, 16, 14, 10, 10, 14, 10,
14, 14, 13, 8, 6, 8), G2 = c(6, 5, 8, 14, 10, 15, 12, 5, 18,
15, 8, 12, 14, 10, 16, 14, 14, 10, 5, 10), G3 = c(6, 6, 10, 15,
10, 15, 11, 6, 19, 15, 9, 12, 14, 11, 16, 14, 14, 10, 5, 10)), row.names = c(NA,
-20L), spec = structure(list(cols = list(absences = structure(list(), class = c("collector_character",
"collector")), G1 = structure(list(), class = c("collector_double",
"collector")), G2 = structure(list(), class = c("collector_double",
"collector")), G3 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = "\t"), class = "col_spec"), problems = <pointer: 0x55e465b58110>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Something like this?
library(dplyr)
library(tidyr)
library(ggplot2)
set.seed(2022)
# Simulated stand-in for the real data: 20 rows of integers drawn from 0-16.
tibble(
  absences = sample(0:16, 20, replace = TRUE),
  G1 = sample(0:16, 20, replace = TRUE),
  G2 = sample(0:16, 20, replace = TRUE),
  G3 = sample(0:16, 20, replace = TRUE)
) %>%
  pivot_longer(
    cols = -absences,
    names_to = "key",
    values_to = "value"
  ) %>%
  # Keep only the grades equal to 0. The grade lives in `value`; `key` holds
  # the column name ("G1", "G2", "G3"), so a filter on `key` removes nothing.
  filter(value == 0) %>%
  ggplot(aes(absences, key)) +
  # geom_jitter(color = "red", height = 0.1)  # alternative if points overlap
  geom_point(color = "red")
I didn't know whether your grades and absences are distinct. If not, you can use the geom_jitter() call I commented out.
Output:
Here is one way you could start on your task:
library(tidyverse)
# Reshape to long format: every column except `absences` is stacked into the
# `name`/`value` pair that the later steps refer to.
df %>%
pivot_longer(
-absences
) %>%
# `absences` is a character column and one entry is the letter "o"; swap it
# for "0" before converting to numeric.
mutate(absences = as.numeric(replace(absences, absences == "o", "0"))) %>%
group_by(absences, name, value) %>%
# NOTE(review): `absences` is both a grouping column and the column being
# summarised here -- confirm this behaves as intended on your dplyr version.
summarise(absences = sum(absences, na.rm = TRUE)) %>%
ggplot(aes(x=name, y=factor(absences)))+
# Point size encodes `value`.
geom_point(aes(size = value), color="red")+
theme_minimal() +
labs(title = "Your title", y ="Absences", x = "Year") +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5)) +
guides(color= guide_legend(), size=guide_legend())+
# Swap the axes so the absences end up on the horizontal axis.
coord_flip()

Error: Join columns must be present in data error, but columns are separated, tidy, and still having problem

I'm having issues joining a set of columns with a simple inner_join even though all of my data is tidy. Below is the error that I receive and below that I will paste simple samples of my data.
library(tidyverse)
library(janitor)
# Keep only the lookup columns and make sure the region code is numeric.
regions_name = regions %>% select(region, name)
regions_name$region = as.numeric(regions_name$region)
# Standardise the column names of the postcode table with janitor.
postcode_clean = postcode %>% clean_names()
#postcode_clean$pr = as.double(postcode_clean$pr)
# NOTE(review): an unnamed `by = c("pr", "region")` asks for BOTH columns to
# exist in BOTH tables. According to the dput() output further down,
# postcode_clean has pc/pr/cs_duid and regions_name has region/name, which is
# why the join fails on `region`. To match `pr` against `region`, pass a named
# vector instead: by = c("pr" = "region").
postcode_province = postcode_clean %>% left_join(y = regions_name, by = c("pr", "region"))
#> Error: Join columns must be present in data.
#> x Problem with `region`.
> dput(head(postcode_clean, 10))
structure(list(pc = structure(c("A0A1A0", "A0A1B0", "A0A1C0",
"A0A1C0", "A0A1C0", "A0A1C0", "A0A1C0", "A0A1C0", "A0A1E0", "A0A1G0"
), label = "Postal code", format.spss = "A6"), pr = structure(c(10,
10, 10, 10, 10, 10, 10, 10, 10, 10), label = "Province or territory code", format.spss = "F2.0", display_width = 4L, labels = c(Newfoundland = 10,
`Prince Edward Island` = 11, `Nova Scotia` = 12, `New Brunswick` = 13,
Quebec = 24, Ontario = 35, Manitoba = 46, Saskatchewan = 47,
Alberta = 48, `British Columbia` = 59, Yukon = 60, `Northwest Territories` = 61,
Nunavut = 62), class = c("haven_labelled", "vctrs_vctr", "double"
)), cs_duid = structure(c(1001144, 1001464, 1001557, 1001557,
1001557, 1001557, 1001557, 1001557, 1001347, 1001409), label = "Census subdivision unique identifier", format.spss = "F7.0", display_width = 9L)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
> dput(head(regions_name, 10))
structure(list(region = c(1, 35, 24, 59, 48, 46, 47, 12, 13,
10), name = c("Canada", "Ontario", "Quebec", "British Columbia",
"Alberta", "Manitoba", "Saskatchewan", "Nova Scotia", "New Brunswick",
"Newfoundland and Labrador")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"), last_updated = structure(1648783776.07826, class = c("POSIXct",
"POSIXt")))
I don't understand it. I'm not doing anything complicated, yet I am being thrown this error. Any suggestions?

Error "Attribute not identical" in using gather function for the multi column in R

I am using the following data frame.
# Stack every column whose name contains "Attendance" into key/value pairs:
# the old column name goes to `Hospital`, its value to `Attendance`.
df2<-final.data%>% gather(Hospital,Attendance,contains("Attendance"))
# Spread the pairs back out into one column per `Hospital` value.
df2 %>% spread(Hospital, Attendance)
> dput(final.data[0:2,])
structure(list(RoyalPerth.Attendance = c(235, 209), RoyalPerth.Admissions = c(99,
97), RoyalPerth.Tri1 = c("8", "N/A"), RoyalPerth.Tri2 = c(33,
41), RoyalPerth.Tri3 = c(89, 73), RoyalPerth.Tri4 = c(85, 80),
RoyalPert
h.Tri5 = c("20", "14"), Fremantle.Attendance = c(155,
145), Fremantle.Admissions = c(70, 56), Fremantle.Tri1 = c("N/A",
"N/A"), Fremantle.Tri2 = c(25, 22), Fremantle.Tri3 = c(67,
51), Fremantle.Tri4 = c(54, 47), Fremantle.Tri5 = c(9, 24
), PrincessMargaret.Attendance = c(252, 219), PrincessMargaret.Admissions = c(59,
47), PrincessMargaret.Tri1 = c("N/A", "N/A"), PrincessMargaret.Tri2 = c("13",
"14"), PrincessMargaret.Tri3 = c(75, 61), PrincessMargaret.Tri4 = c(159,
139), PrincessMargaret.Tri5 = c("4", "4"), KingEdward.Attendance = c(52,
43), KingEdward.Admissions = c("6", "7"), KingEdward.Tri1 = c("N/A",
"N/A"), KingEdward.Tri2 = c("N/A", "N/A"), KingEdward.Tri3 = c("7",
"N/A"), KingEdward.Tri4 = c(20, 25), KingEdward.Tri5 = c("25",
"17"), SirCharles.Attendance = c(209, 184), SirCharles.Admissions = c(109,
112), SirCharles.Tri1 = c("N/A", "N/A"), SirCharles.Tri2 = c(42,
43), SirCharles.Tri3 = c(108, 73), SirCharles.Tri4 = c(47,
61), SirCharles.Tri5 = c("11", "5"), Armadale.Attendance = c(166,
175), Armadale.Admissions = c(19, 25), Armadale.Tri1 = c("N/A",
"N/A"), Armadale.Tri2 = c(16, 26), Armadale.Tri3 = c(62,
73), Armadale.Tri4 = c(79, 55), Armadale.Tri5 = c("9", "19"
), Swan.Attendance = c(133, 129), Swan.Admissions = c(17,
25), Swan.Tri1 = c("N/A", "N/A"), Swan.Tri2 = c(29, 25),
Swan.Tri3 = c(59, 57), Swan.Tri4 = c(42, 43), Swan.Tri5 = c("N/A",
"4"), Rockingham.Attendance = c(155, 145), Rockingham.Admissions = c("10",
"24"), Rockingham.Tri1 = c("N/A", "N/A"), Rockingham.Tri2 = c(12,
26), Rockingham.Tri3 = c(51, 45), Rockingham.Tri4 = c(81,
65), Rockingham.Tri5 = c("11", "8"), Joondalup.Attendance = c(267,
241), Joondalup.Admissions = c(73, 81), Joondalup.Tri1 = c("N/A",
"N/A"), Joondalup.Tri2 = c(27, 23), Joondalup.Tri3 = c(75,
78), Joondalup.Tri4 = c(151, 133), Joondalup.Tri5 = c("12",
"7")), row.names = 1:2, class = "data.frame")
Error:
Warning message:
attributes are not identical across measure variables;
they will be dropped
I have tried below things:
hospital.dataset<-gather(hospital,triage,sum,Tri1:Tri5) to gather Triage
after using cbind on the data set.
I want to convert it into a long dataset using gather.
dput(hospital.dataset[1:2,])
structure(list(Date = structure(c(-714598, -714597), class = "Date"), [enter image description here][1]
Attendance = c(235, 209), Admissions = c(99, 97), Hospital = structure(c(1L,
1L), .Label = c("RoyalPerth Hospital", "Fremantle Hospital",
"Princess Margaret Hospital", "KingEdward Hospital", "SirCharles Hospital",
"Armadale Hospital", "Swan Hospital", "Rockingham Hospital",
"Joondalup Hospital"), class = "factor"), triage = c("Tri1",
"Tri1"), sum = c(8, 0)), row.names = 1:2, class = "data.frame")
Like this.
Thanks in advance.
Expected Dataframe
Note: This solution feels like a lot of effort. So please consider there may be more elegant approaches available.
One issue with this data is that the values you want "wide" (Attendance, Admissions) are mixed in with the values you want "long" (Tri1, Tri2, etc).
This solution uses pivot_longer on the entire data frame (note: pivot_longer is the new gather syntax) , and then separate to pull out the hospital name from the specific data field.
Then it splits into two data frames, applies pivot_wider to the Attendance/Admissions columns, and rejoins after that.
library(tidyverse)
# Reshape to one row per (source row, hospital, field). Every column is cast
# to character first because the raw columns mix numbers with strings such as
# "N/A", and pivot_longer() needs a single value type.
final_data_long <- final.data.raw %>%
  mutate(across(everything(), as.character)) %>%
  mutate(row_n = row_number()) %>%
  pivot_longer(-row_n, names_to = "field", values_to = "value") %>%
  # Column names look like "RoyalPerth.Attendance"; separate()'s default
  # separator splits them on the dot.
  separate(field, into = c("hospital", "category"))

# The fields that should end up as their own columns.
attend_admit <- final_data_long %>%
  filter(str_detect(category, "Attendance|Admissions"))

# Everything else (the Tri* fields) stays in long form. The keys are spelled
# out so the join does not fall back to "all shared columns" with a message.
triage <- final_data_long %>%
  anti_join(attend_admit, by = c("row_n", "hospital", "category"))

# One row per (row_n, hospital) with Attendance and Admissions side by side.
# Despite the name, this table is the wide one; the name is kept unchanged.
attend_admit_long <- attend_admit %>%
  pivot_wider(
    id_cols = c(row_n, hospital),
    names_from = category,
    values_from = value
  )

# Attach the per-hospital Attendance/Admissions to every triage row.
triage %>%
  inner_join(attend_admit_long, by = c("row_n", "hospital")) %>%
  arrange(hospital) %>%
  select(-row_n)
Output
# A tibble: 90 x 5
hospital category value Attendance Admissions
<chr> <chr> <chr> <chr> <chr>
1 Armadale Tri1 N/A 166 19
2 Armadale Tri2 16 166 19
3 Armadale Tri3 62 166 19
4 Armadale Tri4 79 166 19
5 Armadale Tri5 9 166 19
6 Armadale Tri1 N/A 175 25
7 Armadale Tri2 26 175 25
8 Armadale Tri3 73 175 25
9 Armadale Tri4 55 175 25
10 Armadale Tri5 19 175 25
# … with 80 more rows
Data
*I couldn't get OP's dput to work, here's a version that can be copy/pasted:
final.data.raw <- structure(
list(RoyalPerth.Attendance = c(235, 209), RoyalPerth.Admissions = c(99, 97), RoyalPerth.Tri1 = c("8", "N/A"),
RoyalPerth.Tri2 = c(33, 41), RoyalPerth.Tri3 = c(89, 73), RoyalPerth.Tri4 = c(85, 80),
RoyalPerth.Tri5 = c("20", "14"), Fremantle.Attendance = c(155, 145), Fremantle.Admissions = c(70, 56),
Fremantle.Tri1 = c("N/A", "N/A"), Fremantle.Tri2 = c(25, 22), Fremantle.Tri3 = c(67, 51),
Fremantle.Tri4 = c(54, 47), Fremantle.Tri5 = c(9, 24), PrincessMargaret.Attendance = c(252, 219),
PrincessMargaret.Admissions = c(59,47), PrincessMargaret.Tri1 = c("N/A", "N/A"), PrincessMargaret.Tri2 = c("13", "14"),
PrincessMargaret.Tri3 = c(75, 61), PrincessMargaret.Tri4 = c(159, 139), PrincessMargaret.Tri5 = c("4", "4"),
KingEdward.Attendance = c(52, 43), KingEdward.Admissions = c("6", "7"), KingEdward.Tri1 = c("N/A", "N/A"),
KingEdward.Tri2 = c("N/A", "N/A"), KingEdward.Tri3 = c("7", "N/A"), KingEdward.Tri4 = c(20, 25),
KingEdward.Tri5 = c("25", "17"), SirCharles.Attendance = c(209, 184), SirCharles.Admissions = c(109, 112),
SirCharles.Tri1 = c("N/A", "N/A"), SirCharles.Tri2 = c(42, 43), SirCharles.Tri3 = c(108, 73),
SirCharles.Tri4 = c(47, 61), SirCharles.Tri5 = c("11", "5"), Armadale.Attendance = c(166, 175),
Armadale.Admissions = c(19, 25), Armadale.Tri1 = c("N/A", "N/A"), Armadale.Tri2 = c(16, 26),
Armadale.Tri3 = c(62, 73), Armadale.Tri4 = c(79, 55), Armadale.Tri5 = c("9", "19"),
Swan.Attendance = c(133, 129), Swan.Admissions = c(17, 25), Swan.Tri1 = c("N/A", "N/A"),
Swan.Tri2 = c(29, 25), Swan.Tri3 = c(59, 57), Swan.Tri4 = c(42, 43),
Swan.Tri5 = c("N/A", "4"), Rockingham.Attendance = c(155, 145), Rockingham.Admissions = c("10", "24"),
Rockingham.Tri1 = c("N/A", "N/A"), Rockingham.Tri2 = c(12, 26), Rockingham.Tri3 = c(51, 45),
Rockingham.Tri4 = c(81, 65), Rockingham.Tri5 = c("11", "8"), Joondalup.Attendance = c(267, 241),
Joondalup.Admissions = c(73, 81), Joondalup.Tri1 = c("N/A", "N/A"), Joondalup.Tri2 = c(27, 23),
Joondalup.Tri3 = c(75, 78), Joondalup.Tri4 = c(151, 133), Joondalup.Tri5 = c("12", "7")),
row.names = 1:2, class = "data.frame")

Descriptives for a specified subset of rows in r

I have a long format dataset with each row being another measurement (as indicated by my "timeline.compressed" variable, which has 8 possible values; see dput below).
However, now I want to check the descriptive statistics of some of my variables (i.e., x1-x3) for each of the timepoints separately. I've tried using the if function, but that gives me the warning that the condition has length > 1.
Does anyone know what code I should use to get summary statistics for each of the timepoints separately?
dput for table with possible timeline values:
structure(c(7518L, 6178L, 6393L, 5886L, 6121L, 5977L, 7440L,
5886L), .Dim = 8L, .Dimnames = structure(list(c("5", "16", "28",
"40", "52", "64", "79", "95")), .Names = ""), class = "table")
dput for example dataset
structure(list(nomem_encr = c(800009L, 800009L, 800012L, 800015L,
800015L, 800015L), timeline.compressed = c(79, 95, 79, 28, 40,
52), sel = c(4.9, NA, NA, 6.9, 6.7, NA), close_num = c(1, 0.2,
1, 0.8, 1, 0.8), gener_sat = c(7, 7, 8, 7, 7, 5)), .Names = c("ID",
"timeline.compressed", "x1", "x2", "x3"), row.names = c(NA,
6L), class = "data.frame")
Using dplyr you can do, e.g. with timeline_values being your frequency table and df your data
# Mean of every non-grouping column at each timepoint, ignoring NAs.
# The left join starts from the names of the frequency table, so each
# timepoint listed there gets an output row.
data.frame(timeline.compressed = as.numeric(names(timeline_values))) %>%
  # Explicit key: same join as before, without the "Joining, by = ..." message.
  left_join(df, by = "timeline.compressed") %>%
  group_by(timeline.compressed) %>%
  # across(everything(), ...) replaces the superseded summarize_all().
  summarise(across(everything(), \(x) mean(x, na.rm = TRUE)))

Why hover in plotly barchart does not work?

I've got data like this ...
# rok miesiac ile kwartal miesiac2 kwartal2 miesiac3 limit serwis typ ile2 ile_proc lp
# (dbl) (dbl) (dbl) (dbl) (chr) (fctr) (chr) (dbl) (chr) (chr) (dbl) (dbl) (dbl)
# 1 2017 1 31.5 1 1 Q1 2017 Styczeń 0 Sport wizyty 32.5 97 1
# 2 2017 2 1.0 1 2 Q1 2017 Luty 0 Sport wizyty 32.5 3 1
... and I try to draw this plot from plotly library ...
# Bar chart of `ile` against `lp`, coloured by `miesiac2`; the hover label is
# restricted to the `miesiac3` text.
plot_ly(
  data = tab,
  x = ~lp, y = ~ile,
  color = ~miesiac2,
  text = ~miesiac3,
  hoverinfo = "text",
  type = "bar"
)
... and everything is OK except the hover: it does not work, and I have no idea why. Curiously, when I have data in the same format but a bit 'longer', everything works.
I have no idea where the problem is. I hope you do!
Simple data:
structure(list(rok = c(2017, 2017), miesiac = c(1, 2), ile = c(31.5,
1), kwartal = c(1, 1), miesiac2 = c("1", "2"), kwartal2 = structure(c(1L,
1L), .Label = "Q1 2017", class = "factor"), miesiac3 = c("Styczeń",
"Luty"), limit = c(97, 97), serwis = c("Sport", "Sport"), typ = c("wizyty",
"wizyty"), ile2 = c(32.5, 32.5), ile_proc = c(97, 3), lp = c(1,
1)), class = "data.frame", .Names = c("rok", "miesiac", "ile",
"kwartal", "miesiac2", "kwartal2", "miesiac3", "limit", "serwis",
"typ", "ile2", "ile_proc", "lp"), row.names = c(NA, -2L))
'Longer' data:
structure(list(rok = c(2016, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2017, 2017), miesiac = c(4, 5, 6, 7, 8, 9, 10, 11,
12, 1, 2), ile = c(80.1, 87.5, 159, 104, 125.3, 74.2, 84.9, 74.4,
75.3, 81.8, 2.4), kwartal = c(2, 2, 2, 3, 3, 3, 4, 4, 4, 1, 1
), miesiac2 = c("1", "2", "3", "1", "2", "3", "1", "2", "3",
"1", "2"), kwartal2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L), .Label = c("Q2 2016", "Q3 2016", "Q4 2016",
"Q1 2017"), class = "factor"), miesiac3 = c("Kwiecień", "Maj",
"Czerwiec", "Lipiec", "Sierpień", "Wrzesień", "Październik",
"Listopad", "Grudzień", "Styczeń", "Luty"), limit = c(308, 308,
308, 300, 300, 300, 245, 245, 245, 244, 244), serwis = c("Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport"), typ = c("odslony", "odslony", "odslony",
"odslony", "odslony", "odslony", "odslony", "odslony", "odslony",
"odslony", "odslony"), ile2 = c(326.6, 326.6, 326.6, 303.5, 303.5,
303.5, 234.6, 234.6, 234.6, 84.2, 84.2), ile_proc = c(25, 27,
49, 34, 41, 24, 36, 32, 32, 97, 3), lp = c(1, 1, 1, 2, 2, 2,
3, 3, 3, 4, 4)), class = "data.frame", .Names = c("rok", "miesiac",
"ile", "kwartal", "miesiac2", "kwartal2", "miesiac3", "limit",
"serwis", "typ", "ile2", "ile_proc", "lp"), row.names = c(NA,
-11L))
The plotting works for me, although I do get a warning
Warning message:
In RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
This is just a warning and can in this case be ignored. For the curious, it originates from RColorBrewer and can be avoided by manually specifying the colors.
library(RColorBrewer)
# display.brewer.all() # see all the palettes
# brewer.pal() warns whenever n < 3, so request the minimum of three colours
# and keep the first two. These are the same two colours that n = 2 falls
# back to, just without the warning.
cols <- brewer.pal(n = 3, name = "Set2")[1:2]
plot_ly(
  xyshort,
  x = ~lp,
  y = ~ile,
  color = ~miesiac2,
  colors = cols, # explicitly name colors
  type = "bar",
  text = ~miesiac3,
  hoverinfo = "text"
)
R 3.3.2 on Windows 7 and plotly_4.5.6.

Resources