Recode a factor variable, dropping N/A - r

I have a factor variable with 14 levels, which I'm trying to into collapse into only 3 levels. It contains two N/A which I also wanna remove.
My code looks like this:
job <- fct_collapse(E$occupation, other = c("7","9", "10", "13" "14"), 1 = c("1", "2", "3", "12"), 2 = c("4", "5", "6", "8", "11"))
However it just gives me tons of error. Can anyone help here me here?

We could also this with a named list
library(forcats)
lst1 <- setNames(list(as.character(c(7, 9, 10, 13, 14)),
as.character(c(1, 2, 3, 12)), as.character(c(4, 5, 6, 8, 11))), c('other', 1, 2))
fct_collapse(df$occupation, !!!lst1)
data
df <- structure(list(occupation = c("1", "3", "5", "7", "9", "10",
"12", "14", "13", "4", "7", "6", "5")), class = "data.frame", row.names = c(NA,
-13L))

For numbers try using backquotes in fct_collapse.
job <- forcats::fct_collapse(df$occupation,
other = c("7","9", "10", "13", "14"),
`1` = c("1", "2", "3", "12"),
`2` = c("4", "5", "6", "8", "11"))

Related

Create (many) columns conditional on similarly named columns

I want to create a new column that take the value of one of two similarly named columns, depending on a third column. There are many such columns to create. Here's my data.
dt <- structure(list(malvol_left_1_w1 = c("1", "1", "4", "3", "4",
"4", "1", "4", "4", "3", "1", "4", "4", "3", "4", "4", "5", "2",
"4", "2"), malvol_left_2_w1 = c("1", "1", "4", "3", "4", "4",
"1", "3", "4", "2", "2", "2", "4", "1", "5", "4", "5", "2", "4",
"2"), malvol_right_1_w1 = c("1", "1", "4", "3", "4", "4", "1",
"3", "4", "2", "1", "4", "4", "5", "5", "4", "2", "6", "4", "1"
), malvol_right_2_w1 = c("1", "1", "4", "3", "4", "4", "1", "3",
"4", "2", "1", "2", "4", "5", "5", "4", "5", "5", "4", "5"),
malvol_left_1_w2 = c("1", "1", "3", "3", "4", "4", "1", "5",
"4", "4", "4", "2", "1", "4", "5", "4", "3", "2", "4", "4"
), malvol_left_2_w2 = c("1", "1", "3", "3", "4", "4", "7",
"5", "4", "2", "3", "1", "1", "4", "4", "4", "3", "4", "4",
"4"), malvol_right_1_w2 = c("1", "3", "3", "3", "4", "4",
"1", "4", "4", "3", "2", "2", "4", "1", "4", "4", "5", "5",
"4", "4"), malvol_right_2_w2 = c("1", "2", "3", "3", "4",
"4", "1", "2", "4", "2", "3", "2", "4", "1", "4", "4", "5",
"4", "4", "3"), leftright_w1 = c("right", "right", "left",
"right", "right", "right", "left", "right", "right", "left",
"left", "left", "left", "right", "left", "left", "right",
"right", "right", "left"), leftright_w2 = c("right", "right",
"left", "left", "right", "left", "left", "right", "right",
"left", "left", "left", "left", "right", "left", "left",
"right", "right", "left", "left")), class = "data.frame", row.names = c("12",
"15", "69", "77", "95", "96", "112", "122", "150", "163", "184",
"216", "221", "226", "240", "298", "305", "354", "370", "379"
))
Now I can do this in dplyr like:
dt <- dt %>%
mutate(
malvol_1_w1 = case_when(
leftright_w1 == "left" ~ malvol_right_1_w1,
leftright_w1 == "right" ~ malvol_left_1_w1),
malvol_2_w1 = case_when(
leftright_w1 == "left" ~ malvol_right_2_w1,
leftright_w1 == "right" ~ malvol_left_2_w1),
malvol_1_w2 = case_when(
leftright_w2 == "left" ~ malvol_right_1_w2,
leftright_w2 == "right" ~ malvol_left_1_w2),
malvol_2_w2 = case_when(
leftright_w2 == "left" ~ malvol_right_2_w2,
leftright_w2 == "right" ~ malvol_left_2_w2))
However, it's not really a feasible solution, because there will be more of both numbers defining a variable (e.g. both malvol_3_w1 and malvol_1_w3 will need to be created).
One solution is to this with a loop:
for (wave in 1:2) {
for (var in 1:2) {
dt[, paste0("malvol_", var, "_w", wave)] <- dt[, paste0("malvol_right_", var, "_w", wave)]
dt[dt[[paste0("leftright_w", wave)]] == "right", paste0("malvol_", var, "_w", wave)] <-
dt[dt[[paste0("leftright_w", wave)]] == "right", paste0("malvol_left_", var, "_w", wave)]
}
}
However, what is a tidyverse solution?
UPDATE:
I came up with a tidyverse solution myself, however, not every elegant. Still looking for more canonical solutions.
dt <- dt %>%
mutate(
malvol_1_w1 = NA, malvol_2_w1 = NA,
malvol_1_w2 = NA, malvol_2_w2 = NA) %>%
mutate(
across(matches("malvol_\\d"),
~ case_when(
eval(parse(text = paste0("leftright_", str_extract(cur_column(), "w.")))) == "left" ~
eval(parse(text = paste0(str_split(cur_column(), "_\\d", simplify = T)[1],
"_right", str_split(cur_column(), "malvol", simplify = T)[2]))),
eval(parse(text = paste0("leftright_", str_extract(cur_column(), "w.")))) == "right" ~
eval(parse(text = paste0(str_split(cur_column(), "_\\d", simplify = T)[1],
"_left", str_split(cur_column(), "malvol", simplify = T)[2]))))))
What makes your problem difficult is that a lot of information is hidden in variable names rather than data cells. Hence, you need some steps to transform your data into "tidy" format. In the code below, the crucial part is (1) to turn the variables [malvol]_[lr]_[num]_[w] into four separate columns malvol, lr, num, w (all prefixed with m_), and (2) from the variables leftright_[w] extract variable w (prefixed with l_) using the functions pivot_longer and than separate.
# Just adding a row_id to your data, for later joining
dt <- dt %>% mutate(id = row_number())
df <- dt %>%
# Tidy the column "malvol"
pivot_longer(cols = starts_with('malvol'), names_to = "m_var", values_to = "m_val") %>%
separate(m_var, into = c("m_malvol", "m_lr", "m_num", "m_w")) %>%
# They the column "leftright"
pivot_longer(cols = starts_with('leftright'), names_to = 'l_var', values_to = 'l_lr') %>%
separate(l_var, into = c(NA, "l_w")) %>%
# Implement the logic
filter(l_w == m_w) %>%
filter(l_lr != m_lr) %>%
# Pivot into original wide format
select(-c(l_w, l_lr, m_lr)) %>%
pivot_wider(names_from = c(m_malvol, m_num, m_w), values_from = m_val)
# Merging back results to original data
dt <- dt %>% mutate(id = row_number()) %>% inner_join(df, by="id")
Although I pivoted the data back into your desired format in the end (to check whether results are in line with your desired results), I would suggest you leave the data in the long format, which is "tidy" and more easy to work with, compared to your "wide" format. So maybe skip the last pivot_wider operation.

Problem with Piping for revalue in R Studio

I would like to revalue 13 different variables. They all have character as levels right now and are supposed to be changed to values.
Individually it would work to use
x$eins <- revalue(x$eins, c("Nie Thema" = "1",
"Selten Thema" = "2",
"Manchmal Thema" = "3",
"Häufig Thema" = "4",
"Sehr häufig Thema" = "5",
"Fast immer Thema" = "6"))
With the piping, I guess it would look something like this
x %>%
dplyr::select(., eins:dreizehn) %>%
revalue(., c("Nie Thema" = "1",
"Selten Thema" = "2",
"Manchmal Thema" = "3",
"Häufig Thema" = "4",
"Sehr häufig Thema" = "5",
"Fast immer Thema" = "6"))
With this, I get the warning message from revalue, that x is not a factor or a character vector.
What am I doing wrong?
Thanks in advance.
Use across to apply a function for multiple columns.
library(dplyr)
x <- x %>%
dplyr::mutate(across(eins:dreizehn, ~revalue(., c("Nie Thema" = "1",
"Selten Thema" = "2",
"Manchmal Thema" = "3",
"Häufig Thema" = "4",
"Sehr häufig Thema" = "5",
"Fast immer Thema" = "6"))))

How to specify number of digits when writing a table?

I have a matrix with 12 rows and 12 columns
AirAnalysis2019 <- matrix(c(...), nrow = 12, ncol = 12, byrow = TRUE,
dimnames = list(c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"),
c(...)))
Now, I'm trying to write an Excel file with this using the xlsx package:
write.xlsx2(AirAnalysis2019, file="xxx.xlsx", sheetName="AirAnalysis2019",
col.names=TRUE, row.names=TRUE, append=FALSE, showNA = FALSE)
Is it possible to tell the software that I wish to get the values of the columns 1, 5 and 9 in a "scientific" format while all the other values appear with 2 digits only ?
Also, is it possible with this package to color specifically the name of the rows ?
Thank for you help

applying if statement for list within list in r

I am trying to run over a list of lists. Each line has 29 lists, and each list has 6 numbers stored as strings. An example looks like the following
dput(M[6000])
list(list(c("0", "1", "19", "785", "-3150", "0.90"), c("4", "2", "-1", "5550", "4400", "0.00"),
c("1", "3", "6", "3319", "-2558", "1.49"), c("1", "4", "1", "4573", "-435", "1.24"),
c("0", "5", "6", "1137", "-2828", "2.28"), c("0", "6", "24", "1668", "-1143", "2.76"),
c("1", "7", "2", "2859", "-720", "1.40"), c("1", "8", "23", "420", "-3346", "1.57"),
c("1", "9", "26", "2290", "752", "1.23"), c("1", "10", "8", "1208", "-2842", "2.14"),
c("0", "11", "11", "-219", "-374", "1.26"), c("0", "12", "3", "-69", "-2403", "2.24"),
c("0", "13", "1", "-3488", "-830", "0.17"), c("1", "14", "7", "2102", "-1404", "1.24"),
c("1", "15", "3", "1746", "-3481", "1.59"), c("3", "16", "0", "720", "-1425", "0.47"),
c("1", "17", "9", "170", "-2257", "3.14"), c("0", "18", "5", "-351", "-1564", "1.08"),
c("4", "19", "-1", "5550", "4400", "0.00"), c("3", "20", "1", "3304", "-3448", "1.78"),
c("1", "21", "4", "2289", "-1873", "3.13"), c("0", "22", "2", "175", "-3080", "1.28"),
c("1", "23", "12", "877", "140", "1.52"), c("0", "24", "8", "871", "-1933", "4.11"),
c("0", "25", "9", "3185", "-2548", "1.50"), c("4", "26", "-1", "5550", "4400", "0.00"),
c("3", "27", "2", "-290", "3415", "0.56"), c("4", "28", "-1", "5550", "4400", "0.00"),
c("0", "29", "32", "2176", "-2145", "1.58")))
For each line, I am trying to run over the 29 lists and save only the lists that has the 3rd element equal to 4. For one line it would be:
if(as.numeric(M[[6000]][[1]][3]) == 4) M[[6000]][[1]]
I have tried something down the line of
MP4 <- lapply(M, function(x) if(as.numeric(x[[1]][3]) == 4) x[[1]])
without luck.
The purrr package is very good at those kinds of problems:
library(purrr)
M %>%
map(.f = keep, .p = ~ .x[[3]] == "4")
# [[1]]
# [[1]][[1]]
# [1] "1" "21" "4" "2289" "-1873" "3.13"
Edit per your comment:
Let's make another list, M_2, to illustrate the issue:
M_2 <- c(M, list(list()))
M_2 %>%
map(.f = keep, .p = ~ .x[[3]] == "4")
# [[1]]
# [[1]][[1]]
# [1] "1" "21" "4" "2289" "-1873" "3.13"
#
#
# [[2]]
# list()
Then simply discard lists that are equal to list():
M_2 %>%
map(.f = keep, .p = ~ .x[[3]] == "4") %>%
discard(identical, list())
# [[1]]
# [[1]][[1]]
# [1] "1" "21" "4" "2289" "-1873" "3.13"
To use base R, you can use Filter:
lapply(M, Filter, f = function(x){x[[3]] == '4'})
## [[1]]
## [[1]][[1]]
## [1] "1" "21" "4" "2289" "-1873" "3.13"
To filter out empty elements of a larger list, Filter twice:
# using #apom's data from above
Filter(function(x){length(x) != 0},
lapply(M_2, Filter, f = function(x){x[[3]] == '4'}))
## [[1]]
## [[1]][[1]]
## [1] "1" "21" "4" "2289" "-1873" "3.13"
Here is one way to loop through the list and create a new one which saves elements according to your criteria of third element equal to 4.
new_dl <- list()
j <- 1L
for (l in 1L:length(dl)) {
new_dl[[l]] <- list()
for (i in 1L:length(dl[[1]]))
if (dl[[l]][[i]][3] == 4) {
new_dl[[l]][[j]] <- dl[[l]][[i]]
j <- j + 1L
}
j <- 1L
}

invalid color name background in qgraph

I have been trying to use qgraph to generate the network graph. The code is as following
Gw <- qgraph(edgeList, diag = TRUE, labels = TRUE,legend.cex = 0.3, vsize = 1,edge.color=colorLabels,legend=TRUE,asize=1)
The figure can be generated, but the R command line gives the following error message. I do not know what does the invalid color name 'background' mean.
The dput result is shown as follows,
dput(edgeList)
structure(c("1", "2", "2", "3", "4", "5", "6", "7", "8", "1",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "16",
"4", "5", "7", "1", "9", "10", "19", "20", "2", "16", "21", "3",
"22", "5", "23", "8", "1", "20", "2", "13", "14", "17", "14",
"1", "19", "14", "2", "21", "14", "24", "1", ":499.3", "nk Transfe",
"de of tran", "up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ",
"up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ",
"ine:4121", "ine:3257.4", "ine:75.2", "ine:75.2", "ine:11615.",
"ine:10603", "ine:334.2", "ine:7256.8", "ine:7256.8", "ine:996.8",
"ine:884.6", "ine:364.9", "ine:6360", "ine:5640.9", "ine:2729.7",
"ine:5482.6", "ine:85", "ine:1474.9", "ine:700.8", "ine:2754.6",
"ine:3257.4", "ine:3257.4", "ine:7307.8", "ine:18560.", "ine:85.1",
"ine:364.8", ":700.1", ":5317", "l:4258.9", "l:4258.9", "l:1637.6",
"l:1637.6", "l:46.4", "l:3938.5", "l:3938.5", "l:2800.4", "l:2715.1",
"l:2715.1", "l:12708.2", "l:1042", ":499.3", "nk Transfe", "de of tran",
"up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ",
"up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ", "ine:4121",
"ine:3257.4", "ine:75.2", "ine:75.2", "ine:11615.", "ine:10603",
"ine:334.2", "ine:7256.8", "ine:7256.8", "ine:996.8", "ine:884.6",
"ine:364.9", "ine:6360", "ine:5640.9", "ine:2729.7", "ine:5482.6",
"ine:85", "ine:1474.9", "ine:700.8", "ine:2754.6", "ine:3257.4",
"ine:3257.4", "ine:7307.8", "ine:18560.", "ine:85.1", "ine:364.8",
":700.1", ":5317", "l:4258.9", "l:4258.9", "l:1637.6", "l:1637.6",
"l:46.4", "l:3938.5", "l:3938.5", "l:2800.4", "l:2715.1", "l:2715.1",
"l:12708.2", "l:1042", "25", "1", "1", "26", "27", "28", "29",
"30", "31", "25", "32", "33", "4", "4", "3", "3", "5", "5", "7",
"6", "6", "27", "28", "30", "25", "32", "33", "9", "8", "1",
"1", "10", "12", "12", "16", "16", "16", "16", "8", "1", "3",
"3", "7", "7", "25", "9", "9", "1", "10", "10", "14", "14"), .Dim = c(104L,
2L), .Dimnames = list(NULL, c("newsendId", "newtoId")))
The generated figure is as follows. I used the following command to generate it
Gw <- qgraph(edgeList, layout = "spring", diag = FALSE, labels = TRUE, cut = NULL, edge.color = "red",legend.cex = 0.5, vsize = 8)
Which nodes are problems? With your data and code you can modify label.cex. There are other variations of the arguments for the label and legend sizes. Here is one version, with the color blue.
library(qgraph)
Gw <- qgraph(edgeList, layout = "spring", diag = FALSE, labels = TRUE, cut = NULL, edge.color = "red", legend.cex = 0.3, vsize = 4, label.cex = 0.3, label.color = "blue")
Gw

Resources