dplyr's recode function: multiple-to-1 recoding in R - r

I would like an easier way to recode vectors. Specifically I'm wondering if there is a way to pass vectors to a function like dplyr's recode. I understand the basics of quasiquotation but don't quite get how to incorporate the =.
library(tidyverse)
vec1 <- rep(LETTERS[1:7],7)
#standard way
vec2 <- recode(vec1,
"A" = "Value1",
"B" = "Value2",
"C" = "Value3",
"D" = "Value4",
"E" = "Value5",
"F" = "Value6",
"G" = "Value7"
)
vec3 <- recode(vec1,
"A" = "Value1",
"B" = "Value1",
"C" = "Value2",
"D" = "Value2",
.default = "Value other"
)
I'd like to do the following
vec3 <- some.function(vec1,
c("A", "B") = "Value1",
c("C", "D") = "Value2",
.default = "Value other"
)
I have a solution but can't figure out how to incorporate a function with ... and =
do.call(dplyr::recode,
c(list(vec1),
setNames(rep("Value1",length(val1)), val1),
setNames(rep("Value2",length(val2)), val2)))
I also have figured out a way to pass two vectors and rename all the variables.
# Recode the values of `x` using two parallel vectors.
#
# Arguments:
#   x             vector to recode; passed to dplyr::recode() as its first argument.
#   current.names values currently present in `x` that should be replaced.
#   new.names     replacement values; new.names[i] replaces current.names[i].
#
# Returns whatever dplyr::recode() returns for the generated `old = new` pairs.
# Stops with an error when `current.names` and `new.names` differ in length.
recode.by.vectors <- function(x, current.names, new.names){
  # setNames() either errors obscurely or pads with NA names when the lengths
  # differ, so fail early with a clear message instead.
  if (length(current.names) != length(new.names)) {
    stop("`current.names` and `new.names` must have the same length.",
         call. = FALSE)
  }
  # Named vector of the form c(old = "new"), spliced in as named arguments.
  replacements <- setNames(new.names, current.names)
  do.call(dplyr::recode, c(list(x), replacements))
}
Lastly, I'm aware of a base solution.
# Base R recode: start from a copy of vec1, then overwrite each group of old
# values in place using logical indexing.
vec3 <- vec1
val1 <- c("A", "B")
val2 <- c("C", "D")
vec3[vec1 %in% val1] <- "Value1"
vec3[vec1 %in% val2] <- "Value2"
# Everything outside BOTH groups gets the default label (val1 and val2, not
# val1 twice, otherwise "C"/"D" would be overwritten again).
vec3[!vec1 %in% c(val1, val2)] <- "Value other"
but I am unsure how to incorporate the assignment carried out here into a function.

We can use case_when from the dplyr package.
library(dplyr)
vec1 <- rep(LETTERS[1:7],7)
vec2 <- case_when(
vec1 %in% c("A", "B") ~ "Value1",
vec1 %in% c("C", "D") ~ "Value2",
TRUE ~ "Value other"
)
head(vec2)
# [1] "Value1" "Value1" "Value2" "Value2" "Value other" "Value other"

Using the forcats package (also included in the tidyverse package)
library(forcats)
vec1 <- rep(LETTERS[1:7], 7)
fct_collapse(vec1,
Value1 = c("A", "B"),
Value2 = c("C", "D"),
`Value other` = c("E", "F", "G"))
This is a little cumbersome if you have lots of categories to put into Value other, but with a secondary step, you can simplify it a little bit
fct_collapse(vec1,
Value1 = c("A", "B"),
Value2 = c("C", "D")) %>%
fct_other(keep = c("Value1", "Value2"),
other_level = "Value other")

Related

Merging two dataframes based on conditions in multiple columns

I am trying to create a new df, call it df3, out of two other datasets:
df1 = data.frame("String" = c("a", "b", "c"), "Title" = c("A", "B", "C"), "Date" = c("2020-01-01", "2020-01-02", "2020-01-03"))
and:
df2 = data.frame("String" = c("a", "x", "y"), "Title" = c("ABCDEF", "XYZ", "YZ"), "Date" = c("2020-01-03", "2020-01-20", "2020-01-30"))
The conditions for the observations that should be matched, and form a new dataset, are:
# Conditions a row of df1 and a row of df2 must satisfy to be matched:
# the String occurs in df2, the df1 Title is contained in the df2 Title,
# and the df1 Date is earlier than the df2 Date.
df1$String %in% df2$String
grepl(df1$Title, df2$Title) == TRUE
df1$Date < df2$Date
What is the best way to do this kind of merging? I have tried to create an indicator along the lines of :
df1$indicator = ifelse(df1$String %in% df2$String & grepl(df1$Title, df2$Title) & df1$Date < df$Date, 1, 0)
or
df1$indicator = ifelse(df1$String %in% df2$String & grepl(df1$Title, df2$Title[df1$String %in% df2$String) & df1$Date < df2$Date[df1$String %in% df2$String, 1, 0)
to then use for merging, but I've been getting "longer object length is not a multiple of shorter object length" and "argument 'pattern' has length > 1 and only the first element will be used" warnings.
One way: Use a crossjoin then filter the result.
Note that grepl is not vectorized over both arguments, so I use mapply.
df1 = data.frame("String" = c("a", "b", "c"), "Title" = c("A", "B", "C"), "Date" = c("2020-01-01", "2020-01-02", "2020-01-03"))
df2 = data.frame("String" = c("a", "x", "y"), "Title" = c("ABCDEF", "XYZ", "YZ"), "Date" = c("2020-01-03", "2020-01-20", "2020-01-30"))
merge(df1,df2, by=NULL, suffixes = c(".x", ".y")) |>
subset(String.x %in% String.y
& mapply(grepl, Title.x, Title.y)
& Date.x < Date.y )
#> String.x Title.x Date.x String.y Title.y Date.y
#> 1 a A 2020-01-01 a ABCDEF 2020-01-03

Append text to a field based on another field's value

I want to append a text based on another field's value. For example:-
This is the current df:
field_x <- c("A", "A", "C", "B", "B", "C")
field_y <- c("Axl", "Slash", "Duff", "Steven", "Izzy", "Dizzy")
df <- cbind(field_x, field_y)
I need to change the field_y based on field_x values so that it looks like this:
field_x <- c("A", "A", "C", "B", "B", "C")
field_y <- c("Axl (Apple)", "Slash (Apple)", "Duff (Cat)", "Steven (Ball)", "Izzy (Ball)", "Dizzy (Cat)")
So, basically if field_x has "A" then "(Apple)" is to be appended to field_y and so forth. Thanks in advance!
First note that your df is actually a matrix: when you cbind vectors, you get a matrix. So first thing to do is convert to dataframe.
Then it depends on whether or not you are using dplyr.
field_x <- c("A", "A", "C", "B", "B", "C")
field_y <- c("Axl", "Slash", "Duff", "Steven", "Izzy", "Dizzy")
df <- cbind(field_x, field_y)
df <- as.data.frame(df)
Without dplyr:
# Map each field_x code to its word and append it to field_y as " (word)".
df <- within(df, {
  # Temporary per-row suffix; removed below so it does not become a column.
  s <- ifelse(field_x == "A", "Apple", ifelse(field_x == "B", "Ball", "Cat"))
  # Leading space before "(" so the result reads "Axl (Apple)".
  field_y <- paste0(field_y, " (", s, ")")
  rm(s)
})
With dplyr:
library(dplyr)
library(stringr)
library(magrittr)
# Look up the word for each field_x code with recode(), append it to field_y
# as " (word)", then drop the helper column.
df %<>%
  mutate(
    s = recode(field_x, "A" = "Apple", "B" = "Ball", "C" = "Cat"),
    # Space before "(" so the result reads "Axl (Apple)".
    field_y = str_glue("{field_y} ({s})")) %>%
  select(-s)
Another way, with case_when instead of recode:
# Same transformation with case_when(): one condition per field_x code, then
# append the matched word to field_y as " (word)" and drop the helper column.
df %<>%
  mutate(
    s = case_when(
      field_x == "A" ~ "Apple",
      field_x == "B" ~ "Ball",
      field_x == "C" ~ "Cat"
    ),
    # Space before "(" so the result reads "Axl (Apple)".
    field_y = str_glue("{field_y} ({s})")) %>%
  select(-s)
Note that I create an auxiliary variable s: it's not really necessary, but it makes the code more readable.
Here is another approach:
We could create a look-up table to address the concerns of @Tim Biegeleisen in the comment section:
look_up <- data.frame(x = c("A", "B" ,"C"),
y = c("Apple", "Ball", "Cat"))
library(dplyr)
df %>%
as.data.frame() %>%
rowwise() %>%
mutate(field_y = paste0(field_y, ' (', look_up$y[look_up$x==field_x], ')'))
field_x field_y
<chr> <chr>
1 A Axl (Apple)
2 A Slash (Apple)
3 C Duff (Cat)
4 B Steven (Ball)
5 B Izzy (Ball)
6 C Dizzy (Cat)

Error in `colnames<-`(`*tmp*`, value = `*vtmp*`) : attempt to set 'colnames' on an object with less than two dimensions

I am trying to rename specific column names for a df.
To perform this task I am using a function to perform replacement by using a for loop.
Here's my code:
## data ##
df <- structure(list(
A = c(1,2,3),
B = c("Yes", "Yes", "No"),
C = c("John", "James", "Maria"),
D = c(45, 34, 23),
E = c(712, 777, 888)),
class = "data.frame",
row.names = c(NA, -3L))
## setting atomic vectors for original/new names ##
original_df_names <- c("A", "B", "C")
new_df_names <- c("Order", "Answer", "Name")
## function to rename columns ##
# Intended to rename the columns of `df` listed in `original_names` to the
# corresponding entries of `new_names`.
# NOTE(review): as written it does not do that; see the inline notes.
rename_fun <- function(df, original_names, new_names){
# `seq_along()` already returns the index vector; `1:seq_along(v)` uses at most
# its first element, so each loop effectively runs over 1:1.
for(i in 1:seq_along(original_names)){
for(j in 1:seq_along(new_names)){
# `i` and `j` are integer indices, not names: column names are compared with
# the number `i`, and the number `j` is what would be assigned as a name.
colnames(df)[which(names(df) == i)] <- j
}
}
# The last expression is the `for` loop, so the modified `df` is not returned.
}
## applying function ##
df <- mapply(df = df,
original_names = original_df_names,
new_names = new_df_names,
rename_fun)
Console output:
Error in `colnames<-`(`*tmp*`, value = `*vtmp*`) :
attempt to set 'colnames' on an object with less than two dimensions
Expected Output
names(df)
[1] "Order" "Answer" "Name" "D" "E"
Is there any other way to accomplish this?
We can make use of rename_at
library(dplyr)
# Rename through an explicit lookup of the form c(new_name = "old_name"), so
# each old name is paired with its new name by construction rather than by
# selection order. all_of() errors if any old name is missing from df.
df <- df %>%
  rename(all_of(setNames(original_df_names, new_df_names)))

Creating objects from a column and assigning values from another in R

I want to read a data like -
Name, Value
A,20
B,23
C, Stupid
D, IDIOT
And then I want to create several objects from col Name with values from col Value to the effect that -
A <- 20
C <- Stupid
and so on.
Any help is appreciated!
If you really want to do this (though it would make more sense to keep the objects in a list), you can do
list2env(setNames(as.list(data$Value), data$Name), globalenv())
#> <environment: R_GlobalEnv>
ls()
#> [1] "A" "B" "C" "D" "data"
Data
data <- structure(list(Name = c("A", "B", "C", "D"), Value = c("20",
"23", "Stupid", "IDIOT")), class = "data.frame", row.names = c(NA,
-4L))
Name = list("A", "B", "C", "D")
Value = list(20, 23, "enthusiastic", "beginner")
mapply(assign, Name, Value, MoreArgs=list(envir=parent.frame()))
We can use assign with a for loop
for(i in seq_len(nrow(data))) assign(data$Name[i], data$Value[i])
Or with zeallot
library(zeallot)
c(A, B, C, D) %<-% data$Value
data
data <- structure(list(Name = c("A", "B", "C", "D"), Value = c("20",
"23", "Stupid", "IDIOT")), class = "data.frame", row.names = c(NA,
-4L))
Try the code below
list2env(type.convert(setNames(as.list(df$Value), df$Name), as.is = TRUE),.GlobalEnv)

Exchange data.table columns with most prevalent value of columns

I have data
test = data.table(
a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6)
)
I wish to take the unique values of column a, store it in another data.table, and afterwards fill in the remaining columns with the most prevalent values of those remaining columns, such that my resulting data.table would be:
test2 = data.table(a = c(1,3,4,5,6), b = "a", c = 1)
Column b has equal amounts of "a" and "c", but it doesn't matter which is chosen in those cases.
Attempt so far:
test2 = unique(test, by = "a")
test2[, c("b", "c") := lapply(.SD, FUN = function(x){test2[, .N, by = x][order(-N)][1,1]}), .SDcols = c("b", "c")]
EDIT: I would preferably like a generic solution that is compatible with a function where I specify the column to be "uniqued", and the rest of the columns are filled with the single most prevalent value. Hence my use of lapply and .SD =)
EDIT2: as @MichaelChirico points out, how do we keep the class? With the following data.table some of the solutions do not work, although the solution of @chinsoon12 does work:
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
Another option:
# Return a most frequent value of `x`.
# rowid(x) numbers the occurrences of each distinct value (1 for its first
# appearance, 2 for its second, ...), so the position holding the largest
# number belongs to a value with the highest count. which.max() picks the
# first such position, which settles ties.
dtmode <- function(x) {
  occurrence <- rowid(x)
  x[which.max(occurrence)]
}
test[, .(A=unique(A), B=dtmode(B), C=dtmode(C))]
data:
test = data.table(
A = c(1,1,3,4,5,6),
B = c("a", "be", "a", "c", "d", "c"),
C = rep(1, 6)
)
Not a clean way to do this but it works.
test = data.frame(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
a = unique(test$a)
b = tail(names(sort(table(test$b))), 1)
c = tail(names(sort(table(test$c))), 1)
test2 = cbind(a,b,c)
Output is like this:
> test2
a b c
[1,] "1" "c" "1"
[2,] "3" "c" "1"
[3,] "4" "c" "1"
[4,] "5" "c" "1"
[5,] "6" "c" "1"
>
@EmreKiratli's answer is very close to what I would do:
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) as(tail(names(sort(table(x))), 1L), class(x)))
), .SDcols = !'a']
The as(., class(x)) part is because names in R are always character, so we have to convert back to the original class of x.
You might like this better in magrittr form since it's many nested functions:
library(magrittr)
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) {
table(x) %>% sort %>% names %>% tail(1L) %>% as(class(x))
})
), .SDcols = !'a']
I was able to make an OK solution, but if somebody can do it more elegantly, for example not going through the step of storing a list in refLevel below, please let me know! I'm very interested in learning data.table properly!
#solution:
test = data.table(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
test2 = unique(test, by="a")
# Return the most prevalent value of the vector `x` as a plain length-one
# vector: count rows per distinct value, sort by descending count, and take
# the value in the first row.
funPrev <- function(x) {
  counts <- as.data.table(x)[, .N, by = x][order(-N)]
  # FALSE instead of F: F is an ordinary variable that can be reassigned.
  unlist(counts[1, 1], use.names = FALSE)
}
refLevel = lapply(test[, c("b", "c")], funPrev)
test2[, c("b", "c") := refLevel]
...and using a function (if anybody sees any unnecessary step, please let me know):
# Keep one row per value of `var_unique` and overwrite the columns in
# `vars_prev` with their single most prevalent value.
#
# Arguments:
#   dt         a data.table; it is copied, so the caller's table is not
#              modified by the `:=` below.
#   var_unique name of the column(s) passed to unique(..., by = ).
#   vars_prev  character vector of column names to overwrite.
#
# Returns the de-duplicated copy with the `vars_prev` columns replaced.
genData <- function(dt, var_unique, vars_prev) {
  # Most prevalent value of a vector: count per distinct value, sort by
  # descending count, take the value in the first row.
  funPrev <- function(x) {
    unlist(as.data.table(x)[, .N, by = x][order(-N)][1, 1], use.names = FALSE)
  }
  # Prevalence is computed on the full input, before de-duplication.
  refLevel <- lapply(dt[, .SD, .SDcols = vars_prev], funPrev)
  data <- unique(copy(dt), by = var_unique)
  data[, (vars_prev) := refLevel]
  data
}
test2 = genData(test, "a", c("b", "c"))
Here's another variant which one might find less sophisticated, yet more readable. It's essentially chinsoon12's rowid approach generalized for any number of columns. Also the classes are kept.
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
test2 = unique(test, by = "a")
for (col in setdiff(names(test2), "a")) test2[[col]] = test2[[col]][which.max(rowid(test2[[col]]))]

Resources