I have a list of 12 elements, each of which is a data frame. Each data frame contains three columns: two that are common to all the elements and one that differs.
The two common columns are:
coche_OEM
dia_hora_OEM
The other column, which is different in every element, can be collapsed into a single column when converting the list into a data frame. For instance, column U0073 in one of the elements contains one value with the same name as the column, whereas column B1182 in another element contains a value that is likewise the same as its column name.
The issue is that I would like to convert this list into a data frame with three columns (variables):
coche_OEM
dia_hora_OEM
DTC: a single column holding all the values (codes) found in the element-specific columns.
The list is this one:
listdf <- list(structure(list(B1182 = structure(1L, .Label = c("B1182",
"NULL"), class = "factor"), coche_OEM = structure(3L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1577774413, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(B1182 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B1182",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), 1L, integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("B1182",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("B1182", "coche_OEM",
"dia_hora_OEM")), structure(list(B124D = structure(1L, .Label = c("B124D",
"NULL"), class = "factor"), coche_OEM = structure(3L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1577774413, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(B124D = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B124D",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), 1L, integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("B124D",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("B124D", "coche_OEM",
"dia_hora_OEM")), structure(list(P2000 = structure(1L, .Label = c("c(\"P2000\", \"P2000\", \"P2000\")",
"NULL"), class = "factor"), coche_OEM = structure(5L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1577793330, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P2000 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("c(\"P2000\", \"P2000\", \"P2000\")",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), 1L, integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P2000",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P2000", "coche_OEM",
"dia_hora_OEM")), structure(list(U3003 = structure(c(2L, 2L), .Label = c("NULL",
"U3003"), class = "factor"), coche_OEM = structure(c(5L, 1L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(c(1577793330,
1582648789), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-2L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
U3003 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("NULL", "U3003"), class = "factor"),
coche_OEM = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L,
4L, 5L, 6L), .Label = c("356232050832996", "356232050836666",
"356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
2L, integer(0), integer(0), integer(0), 1L, integer(0))), .Names = c("U3003",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("U3003", "coche_OEM",
"dia_hora_OEM")), structure(list(B1D01 = structure(c(1L, 1L,
2L), .Label = c("B1D01", "c(\"B1D01\", \"B1D01\")", "NULL"), class = "factor"),
coche_OEM = structure(c(2L, 1L, 1L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736",
"356232050899078", "356232050905933"), class = "factor"),
dia_hora_OEM = structure(c(1581690876, 1582648789, 1582651926
), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-3L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
B1D01 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("B1D01", "c(\"B1D01\", \"B1D01\")",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L), .Label = c("356232050832996", "356232050836666", "356232050880755",
"356232050882736", "356232050899078", "356232050905933"), class = "factor"),
.rows = list(2L, 1L, integer(0), integer(0), integer(0),
integer(0), 3L, integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0))), .Names = c("B1D01", "coche_OEM",
".rows"), row.names = c(NA, -18L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), .Names = c("B1D01", "coche_OEM",
"dia_hora_OEM")), structure(list(U0155 = structure(2L, .Label = c("NULL",
"U0155"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(U0155 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"U0155"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("U0155",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("U0155", "coche_OEM",
"dia_hora_OEM")), structure(list(C1B00 = structure(1L, .Label = c("C1B00",
"NULL"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(C1B00 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("C1B00",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(1L, integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("C1B00",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("C1B00", "coche_OEM",
"dia_hora_OEM")), structure(list(P037D = structure(2L, .Label = c("NULL",
"P037D"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P037D = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"P037D"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P037D",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P037D", "coche_OEM",
"dia_hora_OEM")), structure(list(P0616 = structure(2L, .Label = c("NULL",
"P0616"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P0616 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"P0616"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P0616",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P0616", "coche_OEM",
"dia_hora_OEM")), structure(list(P0562 = structure(2L, .Label = c("NULL",
"P0562"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P0562 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"P0562"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P0562",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P0562", "coche_OEM",
"dia_hora_OEM")), structure(list(U0073 = structure(2L, .Label = c("NULL",
"U0073"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(U0073 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"U0073"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("U0073",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("U0073", "coche_OEM",
"dia_hora_OEM")), structure(list(P0138 = structure(1L, .Label = c("c(\"P0138\", \"P0138\", \"P0138\")",
"NULL"), class = "factor"), coche_OEM = structure(5L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1583391111, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P0138 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("c(\"P0138\", \"P0138\", \"P0138\")",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), 1L, integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P0138",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P0138", "coche_OEM",
"dia_hora_OEM")))
So, how could I convert this list into a data frame with my requirements?
We can ungroup each element, rename the one column that is not 'coche_OEM' or 'dia_hora_OEM' to a predefined string ('DTC' here), and row-bind the results:
library(dplyr)
library(purrr)

map_df(
  listdf,
  # Each element is a grouped_df; drop the grouping first so the combined
  # result is a plain tibble instead of one carrying leftover group metadata.
  ~ .x %>%
    ungroup() %>%
    # Whatever column remains besides the two shared ones holds the codes.
    rename_at(vars(-c('coche_OEM', 'dia_hora_OEM')), ~ 'DTC') %>%
    # The code columns are factors with different levels in every element;
    # convert explicitly rather than relying on coercion while binding.
    mutate(DTC = as.character(DTC))
)
# A tibble: 15 x 3
DTC coche_OEM dia_hora_OEM
<chr> <fct> <dttm>
1 "B1182" 356232050880755 2019-12-31 06:40:13
2 "B124D" 356232050880755 2019-12-31 06:40:13
3 "c(\"P2000\", \"P2000\", \"P2000\")" 356232050899078 2019-12-31 11:55:30
4 "U3003" 356232050899078 2019-12-31 11:55:30
5 "U3003" 356232050832996 2020-02-25 16:39:49
6 "B1D01" 356232050836666 2020-02-14 14:34:36
7 "B1D01" 356232050832996 2020-02-25 16:39:49
8 "c(\"B1D01\", \"B1D01\")" 356232050832996 2020-02-25 17:32:06
9 "U0155" 356232050832996 2020-02-25 16:39:49
10 "C1B00" 356232050832996 2020-02-25 16:39:49
11 "P037D" 356232050832996 2020-02-25 16:39:49
12 "P0616" 356232050832996 2020-02-25 16:39:49
13 "P0562" 356232050832996 2020-02-25 16:39:49
14 "U0073" 356232050832996 2020-02-25 16:39:49
15 "c(\"P0138\", \"P0138\", \"P0138\")" 356232050899078 2020-03-05 06:51:51
I would appreciate any advice with my plot - I am a ggplot novice!
I am trying to create a Cleveland dot plot faceted by cluster, which has 3 levels. I have 3 issues that I am struggling with:
Within each cluster, I want the dots to be ordered by my continuous x-var. The code below isn't ordering correctly.
Is it possible to change the dot type based on whether the y-var ends in a 0 (does not have a characteristic) or 1 (does have the characteristic)?
I have a variable in my data set (Population) which shows the population % of a characteristic. I would like to see if a cluster characteristic is over/under-represented compared with the population. I would like to add a dot on the same line of each y-var.
Here is my code :
# Cleveland dot plot of the within-cluster percentage for each feature,
# with one facet row per cluster.
# Note: reorder() sorts the levels of Var over the whole data set, not
# separately inside each facet.
ggplot(cl1, aes(x = Cluster_prop, y = reorder(Var, Cluster_prop))) +
  # Grey guide line running from x = 0 to each dot.
  geom_segment(aes(yend = Var), xend = 0, colour = "grey50") +
  geom_point(aes(colour = Cluster), size = 3) +
  # Free y scale and panel height per facet.
  facet_grid(Cluster ~ ., scales = "free_y", space = "free_y") +
  labs(title = "Top 10 Cluster Characteristics: % Children Within Cluster With
Feature")
Here is my data:
> dput(cl1)
structure(list(Var = structure(c(2L, 3L, 5L, 7L, 14L, 16L, 18L,
19L, 20L, 22L, 15L, 9L, 7L, 6L, 21L, 13L, 17L, 12L, 4L, 11L,
15L, 17L, 21L, 1L, 13L, 4L, 10L, 12L, 6L, 8L), .Label = c("asthdoc_1",
"AttacksOnExer_1_0", "AttacksTTT_1_0", "AttacksTTT_1_1", "Breath0rmal_1_0",
"Breath0rmal_1_1", "CAsthmaMed_1_0", "CAsthmaMed_1_1", "CCurrentAsthma_1_0",
"CCurrentAsthma_1_1", "CongColds_1_1", "CoughNight_1_1",
"CoughWithColds_1_1",
"EverWheeze_1_0", "EverWheeze_1_1", "Wheeze6M_1_0", "Wheeze6M_1_1",
"WheezeMostDays_1_0", "WheezeOcc_1_0", "WheezeWithColds_1_0",
"WheezeWithColds_1_1", "WheezeWithShort_1_0"), class = "factor"),
Cluster_prop = c(100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 99.4219653, 98.8439306, 95.3757225, 94.7976879,
83.2369942, 79.1907514, 53.7572254, 50.867052, 50.867052,
100, 100, 100, 93.103448, 89.655172, 86.206897, 86.206897,
82.758621, 79.310345, 79.310345), Population = c(96.131528,
78.143133, 63.636364, 95.16441, 60.928433, 67.891683, 97.485493,
89.555126, 62.669246, 90.32882, 39.071567, 94.584139, 95.16441,
36.363636, 37.330754, 68.665377, 32.108317, 43.520309, 21.856867,
42.166344, 39.071567, 32.108317, 37.330754, 9.864603, 68.665377,
21.856867, 5.415861, 43.520309, 36.363636, 4.83559), Cluster =
structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor")), .Names = c("Var", "Cluster_prop",
"Population", "Cluster"), row.names = c(NA, -30L), vars = "Cluster", drop =
TRUE, indices = list(
0:9, 10:19, 20:29), group_sizes = c(10L, 10L, 10L), biggest_group_size =
10L, labels = structure(list(
Cluster = 1:3), row.names = c(NA, -3L), class = "data.frame", vars =
"Cluster", drop = TRUE, .Names = "Cluster"), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Many thanks for any advice!
For your second (EDIT: and third) issues:
library(tidyverse)
library(stringr)

# The last character of each Var label ("0" = does not have the
# characteristic, "1" = has it) is used to pick the point shape.
# Var is a factor, so convert it to character before slicing.
cl2 <- cl1 %>%
  mutate(Shape = str_sub(as.character(Var), start = -1, end = -1))

ggplot(cl2, aes(x = Cluster_prop, y = reorder(Var, Cluster_prop))) +
  geom_segment(aes(yend = Var), xend = 0, colour = "grey50") +
  # Cluster percentage: coloured by cluster, shaped by the 0/1 suffix.
  geom_point(size = 3, aes(colour = Cluster, shape = Shape)) +
  # Population percentage on the same row, so over- or under-representation
  # of a characteristic within the cluster can be read off directly.
  geom_point(aes(x = Population), size = 2, color = "black") +
  facet_grid(Cluster ~ ., scales = "free_y", space = "free_y") +
  ggtitle("Top 10 Cluster Characteristics: % Children Within Cluster With
Feature")