generate labels for variables in R - r

I'm searching for a better/faster way than this one to generate labels for a variable :
df <- data.frame(a=c(0,7,1,10,2,4,3,5,10,1,7,8,3,2))
pick <- c(0,1,2,3,10)
df[sapply(df$a,function(x) !(x %in% pick)),"a"] <- "a"
df[sapply(df$a,function(x) x==0),"a"] <- "b"
df[sapply(df$a,function(x) x==1 | x==2 | x==3),"a"] <- "c"
df[sapply(df$a,function(x) x==10),"a"] <- "d"
df$a
[1] "b" "a" "c" "d" "c" "a" "c" "a" "d" "c" "a" "a" "c" "c"
For simplicity, I just have one variable in this example, of course there are more variables in my dataset but I just want to change a specific one.

You don't need sapply:
df$a[!df$a %in% pick] <- "a"
df$a[df$a==0] <- "b"
df$a[df$a %in% 1:3] <- "c"
df$a[df$a==10] <- "d"
You could also produce the same result with factors:
df <- data.frame(a=c(0,7,1,10,2,4,3,5,10,1,7,8,3,2))
# the above method
a <- df$a
a[!df$a %in% pick] <- "a"
a[df$a==0] <- "b"
a[df$a %in% 1:3] <- "c"
a[df$a==10] <- "d"
# one way that gives a warning
b1 <- factor(df$a, levels=0:10, labels=c("b",rep("c",3),rep("a",6),"d"))
# another way that won't give a warning
b2 <- factor(df$a)
levels(b2) <- c("b",rep("c",3),rep("a",4),"d")
b2 <- as.character(b2)
# a third strategy using `library(car)`
b3 <- car::recode(df$a,"0='b';1:3='c';10='d';else='a'")
# check that all strategies are the same
all.equal(a,as.character(b1))
# [1] TRUE
all.equal(as.character(b1),as.character(b2))
# [1] TRUE
all.equal(as.character(b1),as.character(b3))
# [1] TRUE

You might also consider mapvalues or revalue in plyr, particularly if you're dealing with more labels:
df$a <- mapvalues(df$a, c(0, 1, 2, 3, 10), c("b", "c", "c", "c", "d"))
df$a[! df$a %in% c("b", "c", "d")] <- "a" # The !pick values

Here is another fairly straightforward solution:
names(pick) <- c("b", "c", "c", "c", "d")
x <- names(pick[match(df$a, pick)])
x[is.na(x)] <- "a"
x
# [1] "b" "a" "c" "d" "c" "a" "c" "a" "d" "c" "a" "a" "c" "c"
It is even more straightforward if you include an NA in your "pick" object.
pick <- c(NA, 0, 1, 2, 3, 10)
names(pick) <- c("a", "b", "c", "c", "c", "d")
names(pick[match(df$a, pick, nomatch = 1)])
# [1] "b" "a" "c" "d" "c" "a" "c" "a" "d" "c" "a" "a" "c" "c"
If you use this second alternative, note that nomatch takes an integer value giving the position of what you're matching against. Here, nomatch maps to "NA", which is in the first position in your "pick" vector. If the "NA" were in the last position, you would enter it as nomatch = 6 instead.

You can also use ifelse function.
with(df,ifelse(a==0,"b",ifelse(a %in% c(1,2,3),"c",ifelse(a==10,"d","a"))))
[1] "b" "a" "c" "d" "c" "a" "c" "a" "d" "c" "a" "a" "c" "c"

Related

Generate all combinations (and their sum) of a vector of characters in R

Suppose that I have a vector of length n and I need to generate all possible combinations and their sums. For example:
If n=3, we have:
myVec <- c("a", "b", "c")
Output =
"a"
"b"
"c"
"a+b"
"a+c"
"b+c"
"a+b+c"
Note that we consider that a+b = b+a, so only need to keep one.
Another example if n=4,
myVec <- c("a", "b", "c", "d")
Output:
"a"
"b"
"c"
"d"
"a+b"
"a+c"
"a+d"
"b+c"
"b+d"
"c+d"
"a+b+c"
"a+b+d"
"a+c+d"
"b+c+d"
"a+b+c+d"
We can use sapply with varying length in combn and use paste as function to apply.
sapply(seq_along(myVec), function(n) combn(myVec, n, paste, collapse = "+"))
#[[1]]
#[1] "a" "b" "c"
#[[2]]
#[1] "a+b" "a+c" "b+c"
#[[3]]
#[1] "a+b+c"
myVec <- c("a", "b", "c", "d")
sapply(seq_along(myVec), function(n) combn(myVec, n, paste, collapse = "+"))
#[[1]]
#[1] "a" "b" "c" "d"
#[[2]]
#[1] "a+b" "a+c" "a+d" "b+c" "b+d" "c+d"
#[[3]]
#[1] "a+b+c" "a+b+d" "a+c+d" "b+c+d"
#[[4]]
#[1] "a+b+c+d"
We can unlist if we need output as single vector.

Trouble evaluating combinations from combn using purrr

I am trying to use combn to divide a group of n = 20 different units into 3 groups of unequal size -- 4, 6 and 10. Then I am trying to validate for values that must be together within a group -- if one element from the pair exists in the group then the other should also be in the group. If one is not in the group then neither should be in the group. In this fashion, I'd like to evaluate the groups in order to find all possible valid solutions where the rules are true.
x <- letters[1:20]
same_group <- list(
c("a", "c"),
c("d", "f"),
c("b", "k", "r")
)
combinations_list <- combn(x, 4, simplify = F)
# A combination is valid when "a" and "c" are either both present in it or
# both absent from it.
validate_combinations <- function(x) {
  pair_present <- c("a", "c") %in% x
  all(pair_present) || !any(pair_present)
}
valid_combinations <- keep(combinations_list, validate_combinations)
In this way I'd like to combine -> reduce each group until I have a list of all valid combinations. I'm not sure how to combine combinations_list, validate_combinations, and the same_group to check all same_group "rules" against the combinations in the table. The furthest I can get is to check against one combination c("a", "c"), which when run against keep(combinations_list, validate_combinations) is indeed giving me the output I want.
I think once I can do this, I can then use the unpicked values in another combn function for the group of 6 and the group of 10.
We can change the function to accept variable group
# A combination `x` satisfies a rule when the members of `group` are either
# all contained in `x` or all missing from it.
validate_combinations <- function(x, group) {
  in_x <- group %in% x
  all(in_x) || !any(in_x)
}
then for each group subset the combinations_list which satisfy validate_combinations
lapply(same_group, function(x) combinations_list[
sapply(combinations_list, function(y) validate_combinations(y, x))])
#[[1]]
#[[1]][[1]]
#[1] "a" "b" "c" "d"
#[[1]][[2]]
#[1] "a" "b" "c" "e"
#[[1]][[3]]
#[1] "a" "b" "c" "f"
#[[1]][[4]]
#[1] "a" "b" "c" "g"
#[[1]][[5]]
#[1] "a" "b" "c" "h"
#[[1]][[6]]
#[1] "a" "b" "c" "i"
#[[1]][[7]]
#[1] "a" "b" "c" "j"
#[[1]][[8]]
#[1] "a" "b" "c" "k"
#......

R: Find unique vectors in list of vectors

I have a list of vectors
list_of_vectors <- list(c("a", "b", "c"), c("a", "c", "b"), c("b", "c", "a"), c("b", "b", "c"), c("c", "c", "b"), c("b", "c", "b"), c("b", "b", "c", "d"), NULL)
For this list I would like to know which vectors are unique in terms of their elements. That is, I would like the following output
[[1]]
[1] "a" "b" "c"
[[2]]
[1] "b" "b" "c"
[[3]]
[1] "c" "c" "b"
[[4]]
[1] "b" "b" "c" "d"
[[5]]
[1] NULL
Is there a function in R for performing this check? Or do I need to do a lot of workarounds by writing functions?
My current not so elegant solution:
# Function for turning vectors into strings ordered by alphabet
stringer <- function(vector) {
  # NULL has nothing to order or paste, so it is passed through unchanged.
  if (is.null(vector)) {
    return(NULL)
  }
  # Order the elements, then glue them into a single string.
  paste(vector[order(vector)], collapse = "")
}
# Identifying unique strings
vector_strings_unique <- unique(lapply(list_of_vectors, function(vector)
stringer(vector)))
vector_strings_unique
[[1]]
[1] "abc"
[[2]]
[1] "bbc"
[[3]]
[1] "bcc"
[[4]]
[1] "bbcd"
[[5]]
NULL
# Function for splitting the strings back into vectors
splitter <- function(string) {
  # NULL stays NULL; there is no string to break apart.
  if (is.null(string)) {
    return(NULL)
  }
  # Split into single characters and flatten the list strsplit() returns.
  unlist(strsplit(string, split = ""))
}
# Applying function
lapply(vector_strings_unique, function(string) splitter(string))
[[1]]
[1] "a" "b" "c"
[[2]]
[1] "b" "b" "c"
[[3]]
[1] "b" "c" "c"
[[4]]
[1] "b" "b" "c" "d"
[[5]]
[1] NULL
It does the trick and could be rewritten as a single function, but there must be a more elegant solution.
We can sort the list elements, apply duplicated to get a logical index of unique elements and subset the list based on that
list_of_vectors[!duplicated(lapply(list_of_vectors, sort))]
#[[1]]
#[1] "a" "b" "c"
#[[2]]
#[1] "b" "b" "c"
#[[3]]
#[1] "c" "c" "b"
#[[4]]
#[1] "b" "b" "c" "d"
#[[5]]
#NULL

collapse / compress vector of repeated elements to max k-repeated

Is there a more efficient way than the rle-based function below to compress/collapse a vector of, let's say, strings so that each run of repeated elements is kept at most k times? Example input and desired outputs are given below.
Input
foov <- rep(c("a", "b", "a"), c(5, 3, 2))
For k = 2, desired output would be:
"a" "a" "b" "b" "a" "a"
And for k = 3, desired output would be:
"a" "a" "a" "b" "b" "b" "a" "a"
At the moment I am using rle as follows to achieve this:
# Cap every run of identical consecutive values in `v` at `k` repeats.
# The shortened vector is returned invisibly.
collapseRLE <- function(v, k) {
  runs <- rle(v)
  # Runs longer than k are cut down to k; shorter runs keep their length.
  capped <- replace(runs$lengths, runs$lengths > k, k)
  invisible(rep(runs$values, capped))
}
foov <- rep(c("a", "b", "a"), c(5, 3, 2))
print(collapseRLE(foov, 2))
We can use rleid from data.table. Grouping by rleid(vec), which gives each run of identical consecutive values its own id, we keep at most the first 'k' elements of every run (seq_len(pmin(k, .N))) and extract the resulting column as a vector ($V1).
library(data.table)
# Keep at most `k` elements from each run of identical consecutive values.
# Rows are grouped by rleid(vec); within a group, seq_len(pmin(k, .N)) indexes
# the first k elements (or the whole group when it has fewer than k rows).
# The unnamed j result ends up in column V1, which is extracted as a vector.
f1 <- function(k, vec) data.table(vec)[, vec[seq_len(pmin(k, .N))], rleid(vec)]$V1
f1(2, foov)
#[1] "a" "a" "b" "b" "a" "a"
f1(3, foov)
#[1] "a" "a" "a" "b" "b" "b" "a" "a"

Extract data between rows r

I have the following row:
rep(c("foo",rep(c('A','B'),2),"bar",rep(c("C","D"),2)),2)
[1] "foo" "A" "B" "A" "B" "bar" "C" "D" "C" "D" "foo" "A"
[13] "B" "A" "B" "bar" "C" "D" "C" "D"
I would like to extract the data between 'foo' and 'bar' to get
[1] "A" "B" "A" "B" "A" "B" "A" "B"
How would you perform this task in R?
I used this approach which I think is the easiest to understand and a more R idiomatic way of doing it.
s <- rep(c("foo", rep(c("A", "B"), 2), "bar", rep(c("C", "D"), 2)), 2) # Your vector
# Index ranges strictly between each "foo" and the "bar" paired with it.
# Map() always returns a list, so unlist() yields a flat index vector even when
# the segments differ in length; mapply() only simplifies equal-length ranges
# and would otherwise leave a list, which cannot be used as a subscript.
get <- unlist(Map(seq, which(s == "foo") + 1, which(s == "bar") - 1))
s[get]
#[1] "A" "B" "A" "B" "A" "B" "A" "B"
Using similar methodology to this answer
temp <- paste(rep(c("foo",rep(c('A','B'),2),"bar",rep(c("C","D"),2)),2), collapse = "")
unlist(strsplit(regmatches(temp, gregexpr('(?<=foo).*?(?=bar)', temp, perl=T))[[1]], ""))
##[1] "A" "B" "A" "B" "A" "B" "A" "B"
Maybe this helps:
If vec is the vector
vec[mapply(`:`, grep("foo", vec), grep("bar", vec))[-c(1,6),]]
#[1] "A" "B" "A" "B" "A" "B" "A" "B"
or
vec1 <- vec[mapply(`:`, grep("foo", vec), grep("bar", vec))]
vec1[!grepl(paste(c("foo","bar"),collapse="|"), vec1)]
#[1] "A" "B" "A" "B" "A" "B" "A" "B"
Update
For a vector like below:
vec1 <- c("foo", "A", "B", "bar", "C", "D", "bar", "foo", "A", "B", "A",
"bar", "C", "D", "D", "zoo", "A", "B", "foo", "A", "bar", "B", "A", "zoo",
"A", "foo", "A", "B")
you could use:
# Extract the elements lying strictly between each `first` marker and the
# next `second` marker that follows it.
#
# vec:    vector to search.
# first:  value that opens a segment (e.g. "foo").
# second: value that closes a segment (e.g. "bar").
#
# Returns the extracted elements as one unnamed vector, or NULL when no
# segment has anything between its markers.
fun1 <- function(vec, first, second) {
  # Every occurrence of `first` starts a new chunk; anything that precedes the
  # first occurrence ends up in a leading chunk of its own.
  lst <- split(vec, cumsum(vec == first))
  unlist(lapply(lst, function(x) {
    # A chunk that does not begin with `first` is that leading run, so nothing
    # in it lies between an opening and a closing marker.
    if (!isTRUE(x[1] == first)) {
      return(NULL)
    }
    # Position of the last element before the closing marker (NA if absent).
    indx <- match(second, x) - 1
    # `&&` keeps the condition scalar and short-circuits when indx is NA.
    if (!is.na(indx) && indx > 1) {
      x[2:indx]
    }
  }), use.names = FALSE)
}
fun1(vec1, "foo", "bar")
#[1] "A" "B" "A" "B" "A" "A"
fun1(vec, "foo", "bar")
#[1] "A" "B" "A" "B" "A" "B" "A" "B"
BTW, @David Arenburg's method works for both cases.
Assuming that foo is first among the two and that they alternate:
vec <- rep(c("foo",rep(c('A','B'),2),"bar",rep(c("C","D"),2)),2)
idx <- vec %in% c("foo", "bar")
vec[cumsum(idx) %% 2 == 1 & !idx]
# [1] "A" "B" "A" "B" "A" "B" "A" "B"

Resources