NA values in R when dividing two variables - r

I have the following dataset:
structure(list(decils_renda = structure(c(1L, 3L, 5L, 3L, 2L,
10L, 3L, 7L, 2L, 8L, 4L, 7L, 6L, 2L, 5L, 1L, 1L, 9L, 4L, 2L), .Label = c("1r",
"2n", "3r", "4t", "5è", "6è", "7è", "8è", "9è", "10è"), class = "factor"),
nombre_families_decils = c(2107410.879995, 1919694.803749,
1871204.79901, 1919694.803749, 2000467.089601, 1756059.188985,
1919694.803749, 1865871.935523, 2000467.089601, 1832456.399842,
1929142.572451, 1865871.935523, 1857086.601994, 2000467.089601,
1871204.79901, 2107410.879995, 2107410.879995, 1726965.615762,
1929142.572451, 2000467.089601), despesatotal = structure(c(3692812.45,
9798007.97, 11479590.32, 7022441.93, 32068770.61, 43498810.27,
14197075.72, 30361832.13, 12884341.18, 86317384.39, 17834496.58,
7124896.58, 31555170.18, 6652264.05, 5166912.67, 22087897.14,
28243177.88, 13478665.67, 7722015.78, 11334536.72), format.stata = "%12.0g"),
despesamonetaria = structure(c(1750165.37, 5424793.37, 8354996.5,
5009218.41, 20577773.88, 38507968.12, 10922966.92, 30361832.13,
7139635.72, 80050637.69, 14429261.22, 5429467.01, 25528438.99,
3315187.59, 5166912.67, 14379160.67, 20813842.46, 9559187.02,
5939555.08, 9223340.12), format.stata = "%12.0g")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L), groups = structure(list(
decils_renda = structure(1:10, .Label = c("1r", "2n", "3r",
"4t", "5è", "6è", "7è", "8è", "9è", "10è"), class = "factor"),
.rows = structure(list(c(1L, 16L, 17L), c(5L, 9L, 14L, 20L
), c(2L, 4L, 7L), c(11L, 19L), c(3L, 15L), 13L, c(8L, 12L
), 10L, 18L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), .drop = TRUE))
I want to divide despesatotal and despesamonetaria by nombre_families_decils. However, when decils_renda is 1r, I only get NA values, and the result should not be NA.
I am using the following code:
# na.rm = TRUE has to be an argument of sum(). Passed to mutate() it only
# creates a constant column called `na.rm`, and a single NA inside a group
# still turns that group's sum (and every ratio built from it) into NA.
Llar_2021_Red <- Llar_2021_Red %>%
  group_by(decils_renda) %>%
  mutate(
    despesa_total_decils =
      sum(despesatotal, na.rm = TRUE) / nombre_families_decils,
    despesa_monetaria_decils =
      sum(despesamonetaria, na.rm = TRUE) / nombre_families_decils
  )

Related

subsample random rows of tibble

Suppose I have two data objects, df.A and df.B.
df.A <- structure(list(Species = structure(c(7L, 7L, 1L, 1L, 1L, 1L,
4L, 6L, 5L, 5L), .Label = c("Carcharhinus leucas", "Carcharhinus limbatus",
"Carcharhinus perezi", "Galeocerdo cuvier", "Ginglymostoma cirratum",
"Hypanus americanus", "Negaprion brevirostris", "Sphyrna mokarran"
), class = "factor"), Sex = structure(c(1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L), .Label = c("f", "m"), class = "factor")), row.names = c(NA,
10L), class = "data.frame")
> class(df.A)
[1] "data.frame"
df.B <- structure(list(Diel.phase = structure(c(2L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 1L, 1L), .Label = c("Day", "Night"), class = "factor"),
Season = structure(c(2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L), .Label = c("Summer", "Winter"), class = "factor")), row.names = c(NA,
-10L), groups = structure(list(.rows = structure(list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
> class(df.B)
[1] "rowwise_df" "tbl_df" "tbl" "data.frame"
Let's say I want to subsample 2 rows from each object. The code below works for df.A but not for df.B. Instead, all rows for df.B are returned.
# Attempt to draw 2 random rows from the rowwise tibble
df.B %>%
  slice_sample(n = 2)
Can someone explain this result? And how can I apply slice_sample to an object of class(df.B) without converting it back to a plain data.frame first?
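The grouping influences how the tibble is treated: df.B is a rowwise_df, so every row is its own group, and slice_sample() samples within each group. Asking for 2 rows from a one-row group returns that single row, which is why all rows come back.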
You can do this:
# Drop the row-wise grouping first so the 2 rows are drawn from the whole table
df.B %>%
  ungroup() %>%
  slice_sample(n = 2)

Convert a list into a data frame, collapsing one column and keeping the others unaltered, in R

I have a list formed by 12 elements, each being a data frame. Each df contain three columns, two common columns across all the elements and one different.
The two common columns are:
coche_OEM
dia_hora_OEM
The other column, which is different in every element, can be collapsed into a single column when converting the list into a data frame. For instance, column U0073 in one of the elements contains one value with the same name as the column, and column B1182 likewise contains a value with the same name as the variable.
The issue is that I would like to convert this list into a data frame with three columns (variables):
coche_OEM
dia_hora_OEM
DTC: this column with all the values present in each column with their codes.
The list is this one:
listdf <- list(structure(list(B1182 = structure(1L, .Label = c("B1182",
"NULL"), class = "factor"), coche_OEM = structure(3L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1577774413, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(B1182 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B1182",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), 1L, integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("B1182",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("B1182", "coche_OEM",
"dia_hora_OEM")), structure(list(B124D = structure(1L, .Label = c("B124D",
"NULL"), class = "factor"), coche_OEM = structure(3L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1577774413, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(B124D = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B124D",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), 1L, integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("B124D",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("B124D", "coche_OEM",
"dia_hora_OEM")), structure(list(P2000 = structure(1L, .Label = c("c(\"P2000\", \"P2000\", \"P2000\")",
"NULL"), class = "factor"), coche_OEM = structure(5L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1577793330, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P2000 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("c(\"P2000\", \"P2000\", \"P2000\")",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), 1L, integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P2000",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P2000", "coche_OEM",
"dia_hora_OEM")), structure(list(U3003 = structure(c(2L, 2L), .Label = c("NULL",
"U3003"), class = "factor"), coche_OEM = structure(c(5L, 1L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(c(1577793330,
1582648789), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-2L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
U3003 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("NULL", "U3003"), class = "factor"),
coche_OEM = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L,
4L, 5L, 6L), .Label = c("356232050832996", "356232050836666",
"356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
2L, integer(0), integer(0), integer(0), 1L, integer(0))), .Names = c("U3003",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("U3003", "coche_OEM",
"dia_hora_OEM")), structure(list(B1D01 = structure(c(1L, 1L,
2L), .Label = c("B1D01", "c(\"B1D01\", \"B1D01\")", "NULL"), class = "factor"),
coche_OEM = structure(c(2L, 1L, 1L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736",
"356232050899078", "356232050905933"), class = "factor"),
dia_hora_OEM = structure(c(1581690876, 1582648789, 1582651926
), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-3L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
B1D01 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("B1D01", "c(\"B1D01\", \"B1D01\")",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L), .Label = c("356232050832996", "356232050836666", "356232050880755",
"356232050882736", "356232050899078", "356232050905933"), class = "factor"),
.rows = list(2L, 1L, integer(0), integer(0), integer(0),
integer(0), 3L, integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0))), .Names = c("B1D01", "coche_OEM",
".rows"), row.names = c(NA, -18L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), .Names = c("B1D01", "coche_OEM",
"dia_hora_OEM")), structure(list(U0155 = structure(2L, .Label = c("NULL",
"U0155"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(U0155 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"U0155"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("U0155",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("U0155", "coche_OEM",
"dia_hora_OEM")), structure(list(C1B00 = structure(1L, .Label = c("C1B00",
"NULL"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(C1B00 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("C1B00",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(1L, integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("C1B00",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("C1B00", "coche_OEM",
"dia_hora_OEM")), structure(list(P037D = structure(2L, .Label = c("NULL",
"P037D"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P037D = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"P037D"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P037D",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P037D", "coche_OEM",
"dia_hora_OEM")), structure(list(P0616 = structure(2L, .Label = c("NULL",
"P0616"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P0616 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"P0616"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P0616",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P0616", "coche_OEM",
"dia_hora_OEM")), structure(list(P0562 = structure(2L, .Label = c("NULL",
"P0562"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P0562 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"P0562"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P0562",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P0562", "coche_OEM",
"dia_hora_OEM")), structure(list(U0073 = structure(2L, .Label = c("NULL",
"U0073"), class = "factor"), coche_OEM = structure(1L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1582648789, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(U0073 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NULL",
"U0073"), class = "factor"), coche_OEM = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0),
1L, integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("U0073",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("U0073", "coche_OEM",
"dia_hora_OEM")), structure(list(P0138 = structure(1L, .Label = c("c(\"P0138\", \"P0138\", \"P0138\")",
"NULL"), class = "factor"), coche_OEM = structure(5L, .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), dia_hora_OEM = structure(1583391111, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), groups = structure(list(P0138 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("c(\"P0138\", \"P0138\", \"P0138\")",
"NULL"), class = "factor"), coche_OEM = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("356232050832996",
"356232050836666", "356232050880755", "356232050882736", "356232050899078",
"356232050905933"), class = "factor"), .rows = list(integer(0),
integer(0), integer(0), integer(0), 1L, integer(0), integer(0),
integer(0), integer(0), integer(0), integer(0), integer(0))), .Names = c("P0138",
"coche_OEM", ".rows"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), .Names = c("P0138", "coche_OEM",
"dia_hora_OEM")))
So, how could I convert this list into a data frame with my requirements?
We can rename all the columns that are not 'coche_OEM' or 'dia_hora_OEM' to a predefined string ('id' here):
# In every element, give the one column that is neither 'coche_OEM' nor
# 'dia_hora_OEM' the common name 'id', then row-bind all elements.
map_df(
  listdf,
  function(d) {
    rename_at(d, vars(-c('coche_OEM', 'dia_hora_OEM')), function(nm) 'id')
  }
)
# A tibble: 15 x 3
# Groups: id, coche_OEM [78]
id coche_OEM dia_hora_OEM
<chr> <fct> <dttm>
1 "B1182" 356232050880755 2019-12-31 06:40:13
2 "B124D" 356232050880755 2019-12-31 06:40:13
3 "c(\"P2000\", \"P2000\", \"P2000\")" 356232050899078 2019-12-31 11:55:30
4 "U3003" 356232050899078 2019-12-31 11:55:30
5 "U3003" 356232050832996 2020-02-25 16:39:49
6 "B1D01" 356232050836666 2020-02-14 14:34:36
7 "B1D01" 356232050832996 2020-02-25 16:39:49
8 "c(\"B1D01\", \"B1D01\")" 356232050832996 2020-02-25 17:32:06
9 "U0155" 356232050832996 2020-02-25 16:39:49
10 "C1B00" 356232050832996 2020-02-25 16:39:49
11 "P037D" 356232050832996 2020-02-25 16:39:49
12 "P0616" 356232050832996 2020-02-25 16:39:49
13 "P0562" 356232050832996 2020-02-25 16:39:49
14 "U0073" 356232050832996 2020-02-25 16:39:49
15 "c(\"P0138\", \"P0138\", \"P0138\")" 356232050899078 2020-03-05 06:51:51

Reorder geom_col by grouping variable (Error: Column `` can't be modified because it's a grouping variable)

I have this df,
df <- structure(list(Gender = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("", "Female", "Male",
"Q6 - OBS: Sex of Respondent"), class = "factor"), Incident = c("Death",
"Detention", "Extortion", "Kidnapping", "Physical_abuse", "Robbery",
"Sexual_assault", "Death", "Detention", "Extortion", "Kidnapping",
"Physical_abuse", "Robbery", "Sexual_assault"), Victim = structure(c(5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("",
"No", "Q54 - Did you witness any migrant deaths during your journey?",
"Refused", "Yes", "Q69 - Did you experience any physical abuse or harassment (of a non-sexual nature) during your journey?",
"Q62 - Did you witness or experience any sexual assault or harassment during your journey?",
"Q75 - Have you been kidnapped or otherwise held against your will during your journey?",
"Q96 - Have you been detained by the police, military, militia or immigration officials during your journey?",
"Q84 - Have you ever been robbed during your journey?", "Q90 - Did you have to give government officials gifts, services or bribes during your journey?"
), class = "factor"), n = c(253L, 300L, 1978L, 73L, 740L, 646L,
553L, 436L, 816L, 4052L, 194L, 1196L, 1059L, 259L), Percent = c(8,
10, 65, 2, 24, 21, 18, 6, 12, 59, 3, 17, 15, 4)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -14L), groups = structure(list(
Gender = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("", "Female", "Male", "Q6 - OBS: Sex of Respondent"
), class = "factor"), Incident = c("Death", "Detention",
"Extortion", "Kidnapping", "Physical_abuse", "Robbery", "Sexual_assault",
"Death", "Detention", "Extortion", "Kidnapping", "Physical_abuse",
"Robbery", "Sexual_assault"), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L)), row.names = c(NA,
-14L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
which I plotted like this:
# Dodged bar chart: Percent per Incident, one bar per Gender
ggplot(data = df, mapping = aes(x = Incident, y = Percent, fill = Gender)) +
  geom_col(position = "dodge", width = 0.72)
Now I need to sort Incident from higher total percentage to lower total percentage, so that Extortion comes first on the left, followed by Physical abuse, etc. I have tried:
# df is grouped by Gender and Incident, and mutate() refuses to modify a
# grouping column, so the grouping is dropped first. FUN = sum orders the
# incidents by their total Percent over both genders; the minus sign puts the
# largest total first (leftmost).
df %>%
  ungroup() %>%
  mutate(Incident = reorder(Incident, -Percent, FUN = sum)) %>%
  ggplot(aes(x = Incident, y = Percent, fill = Gender)) +
  geom_col(position = "dodge", width = 0.72)
But I get the error:
Error: Column `Incident` can't be modified because it's a grouping variable
I have then tried ungroup, or fct_rev, but I cannot make it work! The only thing that works is to export the df as csv, to then import it again, and then it works. But of course that is not very efficient... Anybody please help!

Problems with ordering for geom_segment chart

I would appreciate any advice with my plot - I am a ggplot novice!
I am trying to create a cleveland dot plot faceted by cluster, which has 3 levels. I have 3 issues that I am struggling with:
Within each cluster, I want the dots to be ordered by my continuous x-var. The code below isn't ordering correctly.
Is it possible to change the dot type based on whether the y-var ends in a 0 (does not have a characteristic) or 1 (does have the characteristic)?
I have a variable in my data set (Population) which shows the population % of a characteristic. I would like to see if a cluster characteristic is over/under-represented compared with the population. I would like to add a dot on the same line of each y-var.
Here is my code :
# Cleveland dot plot of Cluster_prop for each Var, one facet row per Cluster
ggplot(
  data = cl1,
  mapping = aes(x = Cluster_prop, y = reorder(Var, Cluster_prop))
) +
  geom_segment(mapping = aes(yend = Var), xend = 0, colour = "grey50") +
  geom_point(mapping = aes(colour = Cluster), size = 3) +
  facet_grid(Cluster ~ ., scales = "free_y", space = "free_y") +
  ggtitle("Top 10 Cluster Characteristics: % Children Within Cluster With
Feature")
Here is my data:
> dput(cl1)
structure(list(Var = structure(c(2L, 3L, 5L, 7L, 14L, 16L, 18L,
19L, 20L, 22L, 15L, 9L, 7L, 6L, 21L, 13L, 17L, 12L, 4L, 11L,
15L, 17L, 21L, 1L, 13L, 4L, 10L, 12L, 6L, 8L), .Label = c("asthdoc_1",
"AttacksOnExer_1_0", "AttacksTTT_1_0", "AttacksTTT_1_1", "Breath0rmal_1_0",
"Breath0rmal_1_1", "CAsthmaMed_1_0", "CAsthmaMed_1_1", "CCurrentAsthma_1_0",
"CCurrentAsthma_1_1", "CongColds_1_1", "CoughNight_1_1",
"CoughWithColds_1_1",
"EverWheeze_1_0", "EverWheeze_1_1", "Wheeze6M_1_0", "Wheeze6M_1_1",
"WheezeMostDays_1_0", "WheezeOcc_1_0", "WheezeWithColds_1_0",
"WheezeWithColds_1_1", "WheezeWithShort_1_0"), class = "factor"),
Cluster_prop = c(100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 99.4219653, 98.8439306, 95.3757225, 94.7976879,
83.2369942, 79.1907514, 53.7572254, 50.867052, 50.867052,
100, 100, 100, 93.103448, 89.655172, 86.206897, 86.206897,
82.758621, 79.310345, 79.310345), Population = c(96.131528,
78.143133, 63.636364, 95.16441, 60.928433, 67.891683, 97.485493,
89.555126, 62.669246, 90.32882, 39.071567, 94.584139, 95.16441,
36.363636, 37.330754, 68.665377, 32.108317, 43.520309, 21.856867,
42.166344, 39.071567, 32.108317, 37.330754, 9.864603, 68.665377,
21.856867, 5.415861, 43.520309, 36.363636, 4.83559), Cluster =
structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor")), .Names = c("Var", "Cluster_prop",
"Population", "Cluster"), row.names = c(NA, -30L), vars = "Cluster", drop =
TRUE, indices = list(
0:9, 10:19, 20:29), group_sizes = c(10L, 10L, 10L), biggest_group_size =
10L, labels = structure(list(
Cluster = 1:3), row.names = c(NA, -3L), class = "data.frame", vars =
"Cluster", drop = TRUE, .Names = "Cluster"), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Many thanks for any advice!
For your second (EDIT and third) issue(s):
library(tidyverse)
library(stringr)
# str_sub(x, start = -1, end = -1) extracts the last character of a string;
# here that is the trailing "0"/"1" flag of Var, which is mapped to the point
# shape below.
cl2 <- cl1 %>%
  mutate(Shape = str_sub(Var, start = -1, end = -1))
ggplot(cl2, aes(x = Cluster_prop, y = reorder(Var, Cluster_prop))) +
  geom_segment(aes(yend = Var), xend = 0, colour = "grey50") +
  geom_point(size = 3, aes(colour = Cluster, shape = Shape)) +
  # Second dot on the same row: the population-wide percentage
  geom_point(aes(x = Population), size = 2, color = "black") +
  facet_grid(Cluster ~ ., scales = "free_y", space = "free_y") +
  ggtitle("Top 10 Cluster Characteristics: % Children Within Cluster With
Feature")

R - predict() error: invalid type (builtin) for variable 'class'

I'm trying to use multinom() from nnet to apply regression on my data.
Here is what I've done:
# ---- Multinomial regression ----
# Fit: Duration modelled on every other column of train
glm.fit <- multinom(Duration ~ ., data = train)
summary(glm.fit)
# Predict class probabilities for columns 2 to 11 of the validation set
predsval <- predict(glm.fit, newdata = validation[, 2:11], type = "probs")
The predict() operation throws this error:
Error in model.frame.default(Terms, newdata, na.action = na.omit, xlev = object$xlevels) :
invalid type (builtin) for variable 'class'
The predict() line worked for numerous other models, such as decision trees and neural networks, but the same line throws this error for the multinom regression model.
any ideas?
EDIT:
> dput(train[1:5,])
structure(list(Duration = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), ActionAVG = c(1079.818182,
8519.15, 4938.211538, 633.9230769, 487.1341463), ActionCount = c(33L,
20L, 52L, 13L, 82L), ActionsSTD = c(1325.668286, 14333.15299,
5746.947505, 1558.555553, 1187.325397), EventCount = c(53L, 1L,
36L, 9L, 20L), GestureAVG = c(712.001548, 2645.481675, 1724.010753,
2113.457711, 2757.006369), GestureCount = c(646L, 191L, 93L,
201L, 157L), gesturesstd = c(1446.855062, 4864.355753, 1967.416169,
1733.255691, 2572.892938), screencount = c(50L, 12L, 32L, 15L,
78L), stddiff = c(1356.033565, 6373.766188, 3497.559543, 1770.347893,
2679.068084), ScreenCountDist = c(13L, 6L, 5L, 7L, 8L), class = structure(c(1L,
1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4", "5", "6"), class = "factor")), .Names = c("Duration",
"ActionAVG", "ActionCount", "ActionsSTD", "EventCount", "GestureAVG",
"GestureCount", "gesturesstd", "screencount", "stddiff", "ScreenCountDist",
"class"), row.names = c(NA, 5L), class = "data.frame")
EDIT_2:
> dput(validation[1:5,])
structure(list(Duration = c(5, 2, 3, 3, 3), ActionAVG = c(68.2,
909.875, 4135, 192.5, 535.75), ActionCount = c(5L, 8L, 1L, 8L,
4L), ActionsSTD = c(29.32064119, 1362.292022, 0, 293.8877337,
522.1917751), EventCount = c(13L, 6L, 1L, 3L, 1L), GestureAVG = c(1573.473684,
2964.966667, 1973.352941, 1072.733333, 560.2692308), GestureCount = c(57L,
60L, 34L, 15L, 26L), gesturesstd = c(3052.29873, 3258.204122,
2452.19659, 1439.818365, 454.8399769), screencount = c(8L, 14L,
3L, 6L, 6L), stddiff = c(2862.564254, 5449.960621, 2345.319105,
2220.919405, 909.2036427), ScreenCountDist = c(4L, 8L, 3L, 5L,
4L)), .Names = c("Duration", "ActionAVG", "ActionCount", "ActionsSTD",
"EventCount", "GestureAVG", "GestureCount", "gesturesstd", "screencount",
"stddiff", "ScreenCountDist"), row.names = c(2L, 4L, 5L, 7L,
15L), class = "data.frame")
Exclude from the model the variable that is not present in the validation set (here the class column of train):
# Drop the predictor by name rather than by position, so the call keeps
# working if the column order of train changes.
glm.fit <- multinom(Duration ~ ., data = train[, names(train) != "class"])

Resources