Show adjacent members in a list - r

I want to inspect adjacent elements in a list based on a match. For example, in a list of randomly ordered letters, I want to know what the neighboring letters of m are. My current solution is:
library(stringr)
ltrs <- sample(letters)
ltrs[(str_which(ltrs,'m')-2):(str_which(ltrs,'m')+2)]
[1] "j" "f" "m" "q" "a"
To me, the repetition of str_which() feels unnecessary. Is there a simpler way to achieve the same result?

First, I regenerate random data with a seed for reproducibility:
set.seed(42)
ltrs <- sample(letters)
ltrs
# [1] "q" "e" "a" "j" "d" "r" "z" "o" "g" "v" "i" "y" "n" "t" "w" "b" "c" "p" "x"
# [20] "l" "m" "s" "u" "h" "f" "k"
Add -2:2 to the index of the match and then, as a precaution, remove any indices below 1 or above the length of the vector:
# Candidate positions: the match itself plus two neighbours on either side.
ind <- -2:2 + which(ltrs == "m")
# Keep only positions that exist in the vector. The upper bound is inclusive
# (`<=`) so the last element is kept when the match is near the end.
ind <- ind[0 < ind & ind <= length(ltrs)]
ltrs[ind]
# [1] "x" "l" "m" "s" "u"
If your target is more than one (not just "m"), then we can use a different approach.
ind <- which(ltrs %in% c("m", "f"))
ind <- lapply(ind, function(z) { z <- z + -2:2; z[0 < z & z <= length(ltrs)]; })
ind
# [[1]]
# [1] 19 20 21 22 23
# [[2]]
# [1] 23 24 25 26
lapply(ind, function(z) ltrs[z])
# [[1]]
# [1] "x" "l" "m" "s" "u"
# [[2]]
# [1] "u" "h" "f" "k"
Or, if you don't care about keeping them grouped, we can try this:
ind <- which(ltrs %in% c("m", "f"))
ind <- unique(sort(outer(-2:2, ind, `+`)))
ind <- ind[0 < ind & ind <= length(ltrs)]
ltrs[ind]
# [1] "x" "l" "m" "s" "u" "h" "f" "k"

If you don't have duplicates, you can try the code like below
ltrs[seq_along(ltrs)%in% (which(ltrs=="m")+(-2:2))]
otherwise
ltrs[seq_along(ltrs) %in% c(outer(which(ltrs == "m"), -2:2, `+`))]

You can also use the slider::slide function (using the data provided by @r2evans):
slider::slide(ltrs, ~ .x, .before = 2, .after = 2)[[which(ltrs == "m")]]
# [1] "x" "l" "m" "s" "u"
slider::slide(ltrs, ~ .x, .before = 2, .after = 2)[which(ltrs %in% c("m","f"))]
# [[1]]
# [1] "x" "l" "m" "s" "u"
#
# [[2]]
# [1] "u" "h" "f" "k"

Related

How can I get a vector of unique elements that satisfy two conditions?

I'm trying to get a list of unique elements based on conditions on two columns in R.
For example, I have 4 groups and I want to get the unique list of names of participants who are in group 1.
This requires to specify the two conditions in the code:
Unique(df$participants XXX_group_XXX).
How do I code this condition so that the output vector satisfies both conditions?
A simple solution using only base R:
set.seed(7*11*13)
name <- sample(LETTERS[1:10], 100, replace=TRUE)
G <- sample(1:5, 100, replace=TRUE)
U <- tapply(name, G, unique)
> U
$`1`
[1] "G" "F" "D" "B" "J" "A" "E" "H" "C"
$`2`
[1] "C" "J" "D" "B" "F" "G"
$`3`
[1] "C" "G" "H" "D" "F" "E" "I" "B" "J"
$`4`
[1] "F" "B" "G" "E" "I" "C" "H" "D" "J"
$`5`
[1] "G" "D" "A" "H" "F" "E" "B" "J" "C"
Would this work for you? I need to create a data frame first. Then I filter for the group you wish to see and get the unique values per group.
library(dplyr)
# Seed the RNG so the sampled data below is reproducible. Assigning the value
# to a variable alone does not affect sample(); set.seed() must be called.
seed <- 123
set.seed(seed)
# create some data: 100 sampled names, each assigned to one of four groups
data <- data.frame(
  name = sample(LETTERS, size = 100, replace = TRUE),
  group = sample(c(1, 2, 3, 4), size = 100, replace = TRUE)
)
# base R
unique(data[data$group == 1, 1])
# or:
unique(data[data$group == 1, "name"])
# tidyverse
data %>%
filter(group == 1) %>%
distinct(name) %>%
pull() # if you want a vector to be returned

Pass function through specific columns with lapply or for loop

I have created a function that reorganizes a data frame into a list. I want to pass the function through all of the columns in the data frame (excluding the first 2 columns) however, the lapply function is returning strange results.
Here is a reproducible example:
names <- c("A", "B", "C", "D")
titles <- c("P", "S", "S", "P")
day1 <- c(1,0,1,0)
day2 <- c(0,0,1,1)
day3 <- c(1,1,0,0)
df <- data.frame(names, titles, day1, day2, day3)
ids <-df[,1:2]
obs <- df[,3:5]
I create the function which searches each "day column" for a 1 or a 0 and reports the "name" and "title" of each row with a 1, i.e. a value greater than 0 (it also removes duplicated values).
# Collect the distinct names and titles of the rows whose value in one
# observation column is greater than 0.
#
# x: column selector passed to obs[, x] (e.g. a single column index).
# Reads the data frames `ids` (columns `names` and `titles`) and `obs` from
# the enclosing environment.
# Returns a character vector: the matching names first, then the matching
# titles, with duplicated values removed.
group_maker1 <- function(x) {
  is_positive <- obs[, x] > 0
  labels <- c(ids$names[is_positive], ids$titles[is_positive])
  # paste() is kept (rather than as.character()) so that a missing value is
  # rendered as the string "NA", exactly as before.
  paste(unique(labels))
}
#test group_maker
> group_maker1(3)
[1] "A" "B" "P" "S"
In the actual data frame, there are many (>300) columns of "days". I want to pass this group_maker function through each column of "days" to the nth day.
I've tried running it through a for loop, but the output doesn't seem to be stored anywhere
# NOTE(review): this loop runs once per row of df (1:nrow(df)), not once per
# day column, and its body only re-defines group_maker1 on each iteration.
# The function is never called with i, so nothing is computed or stored.
for(i in 1:nrow(df)) { # iterates over the row count; i is unused in the body
# Defines group_maker1: names and titles of the rows whose value in obs[, x]
# is greater than 0, with duplicated values removed.
group_maker1 <- function(x){
g1 <- ids$names[obs[,x]> 0]
g2 <- ids$titles[obs[,x]> 0]
temp <- c(g1,g2)
temp <- temp[!duplicated(temp)]
paste(temp)
}
}
Alternatively, I tried lapply, which seems more promising as it gives an output; however, "NA"s are present, and it's not reporting any of the "B" names
lapply(obs[,1:3], group_maker1)
$day1
[1] "A" "C" "NA" "P" "S"
$day2
[1] "A" "C" "NA" "P" "S"
$day3
[1] "A" "C" "NA" "P" "S"
This is the desired output, however the values within it are incorrect. I want it to return the output as seen above in the group_maker1(3) line but with the correct values for each column of days (i.e. no "NA's" and all of the values in that column)
Essentially, I want the loop/apply to pass the function through each column of "days" and provide an output of all the "names" and "titles" for each day in the form of a list.
Using your test data, we have
> group_maker1(1)
[1] "A" "C" "P" "S"
> group_maker1(2)
[1] "C" "D" "S" "P"
> group_maker1(3)
[1] "A" "B" "P" "S"
So, we can replicate using a for loop with
> for(i in 1:3) print(group_maker1(i))
[1] "A" "C" "P" "S"
[1] "C" "D" "S" "P"
[1] "A" "B" "P" "S"
or using lapply with
> lapply(1:3, group_maker1)
[[1]]
[1] "A" "C" "P" "S"
[[2]]
[1] "C" "D" "S" "P"
[[3]]
[1] "A" "B" "P" "S"
In both cases, your attempt failed because of a simple typo.
Or, taking a completely different approach to avoid the explicit use of loops altogether
library(tidyverse)
df %>%
pivot_longer(
starts_with("day"),
names_to="col",
values_to="val"
) %>%
group_by(col) %>%
group_map(
function(.x, .y) {
z <- .x %>% filter(val > 0)
c(z %>% pull(names) %>% unique(), z %>% pull(titles) %>% unique())
}
)
[[1]]
[1] "A" "C" "P" "S"
[[2]]
[1] "C" "D" "S" "P"
[[3]]
[1] "A" "B" "P" "S"
This final option could be shorter if there were no need to deal with awkward input and output formats.

multiple data.table columns to one column of vectors

I have a data.table like this:
tab = data.table(V1 = c('a', 'b', 'c'),
V2 = c('d', 'e', 'f'),
V3 = c('g', 'h', 'i'),
id = c(1,2,3))
From the columns V1,V2,V3 of this table, I'd like to get for row i a vector of c(V1[i],V2[i], V3[i])
I can get a list of the desired vectors like this:
lapply(1:tab[, .N], function(x) tab[x, c(V1, V2, V3)])
Which returns:
[[1]]
[1] "a" "d" "g"
[[2]]
[1] "b" "e" "h"
[[3]]
[1] "c" "f" "i"
But I think this is probably slow and not very data.table-like.
Also, I'd like to generalize it, so that I don't have to explicitly type V1, V2, V3, but can instead pass a vector of column names to be processed this way.
Try this?
> asplit(unname(tab[, V1:V3]), 1)
[[1]]
"a" "d" "g"
[[2]]
"b" "e" "h"
[[3]]
"c" "f" "i"
Using split
split(as.matrix(tab[, V1:V3]), tab$id)
$`1`
[1] "a" "d" "g"
$`2`
[1] "b" "e" "h"
$`3`
[1] "c" "f" "i"
as.list(transpose(tab[, .(V1, V2, V3)]))
Or as a function
# Turn the selected columns of a data.table into a list of per-row vectors.
#
# DT:   a data.table.
# cols: column selection forwarded to .SDcols (e.g. a character vector of
#       column names).
# Returns a list with one element per row of DT, each holding that row's
# values across `cols`.
tdt <- function(DT, cols) {
  selected <- DT[, .SD, .SDcols = cols]
  row_vectors <- transpose(selected)
  as.list(row_vectors)
}
tdt(tab, c("V1", "V2", "V3"))
# $V1
# [1] "a" "d" "g"
#
# $V2
# [1] "b" "e" "h"
#
# $V3
# [1] "c" "f" "i"
tab[, 1:3] |> transpose() |> as.list()
$V1
[1] "a" "d" "g"
$V2
[1] "b" "e" "h"
$V3
[1] "c" "f" "i"

Group a DNA sequence in codons

I have generated a random DNA sequence
base <- c("A","G","U")
seq <- sample(base, 15, replace = T)
[1] "A" "G" "A" "U" "A" "G" "U" "A" "U" "A" "G" "U" "G" "U" "G"
How can I group the resulting sequence to codons (set of three nucleotides) in order to look for the stop codons?
I need something like these:
new_seq <- c("AGA","UAG", "UAU", "AGU", "GUG")
Convert to 3 column matrix, then paste:
# Alphabet to sample nucleotides from.
base <- c("A", "G", "U")
# Fix the seed so the example sequence is reproducible; spell out TRUE rather
# than relying on the reassignable shorthand T.
set.seed(1)
x <- sample(base, 15, replace = TRUE)
x
# [1] "A" "U" "A" "G" "A" "U" "U" "G" "G" "U" "U" "A" "A" "A" "G"
# Lay the sequence out row-wise in 3 columns (one codon per row), then paste
# the columns together to get one string per codon.
do.call(paste0, as.data.frame(matrix(x, ncol = 3, byrow = TRUE)))
# [1] "AUA" "GAU" "UGG" "UUA" "AAG"
We can use gl to create the group, and using tapply do a group by paste
unname(tapply(seq, as.integer(gl(length(seq), 3,
length(seq))), FUN = paste, collapse=""))
#[1] "GAU" "UUG" "AAG" "GGU" "AGA"
NOTE: This would also work when the length is not a multiple of 3
Or another option is to split after pasting into a single string
strsplit(paste(seq, collapse=""), "(?<=...)", perl = TRUE)[[1]]
#[1] "GAU" "UUG" "AAG" "GGU" "AGA"

Finding Elements of Lists in R

Right now I'm working with a character vector in R, which I split word by word using strsplit. I'm wondering if there's a function that I can use to check the whole list, and see if a specific word is in the list, and (if possible) say which elements of the list it is in.
ex.
a = c("a","b","c")
b= c("b","d","e")
c = c("a","e","f")
If z=list(a,b,c), then f("a",z) would optimally yield [1] 1 3, and f("b",z) would optimally yield [1] 1 2
Any assistance would be wonderful.
As alexwhan says, grep is the function to use. However, be careful about using it with a list. It isn't doing what you might think it's doing. For example:
grep("c", z)
[1] 1 2 3 # ?
grep(",", z)
[1] 1 2 3 # ???
What's happening behind the scenes is that grep coerces its 2nd argument to character, using as.character. When applied to a list, what as.character returns is the character representation of that list as obtained by deparsing it. (Modulo an unlist.)
as.character(z)
[1] "c(\"a\", \"b\", \"c\")" "c(\"b\", \"d\", \"e\")" "c(\"a\", \"e\", \"f\")"
cat(as.character(z))
c("a", "b", "c") c("b", "d", "e") c("a", "e", "f")
This is what grep is working on.
If you want to run grep on a list, a safer method is to use lapply. This returns another list, which you can operate on to extract what you're interested in.
res <- lapply(z, function(ch) grep("a", ch))
res
[[1]]
[1] 1
[[2]]
integer(0)
[[3]]
[1] 1
# which vectors contain a search term
sapply(res, function(x) length(x) > 0)
[1] TRUE FALSE TRUE
Much faster than grep is:
# x is the list of character vectors to search and "A" is the search term;
# the result is one TRUE/FALSE per list element.
sapply(x, function(y) "A" %in% y)
and if you want the index of course just use which():
# Positions of the elements of the list x that contain the search term "A".
which(sapply(x, function(y) "A" %in% y))
Evidence!
# Benchmark fixture: a list of 26 random character vectors (10 letters each,
# sampled with replacement), named "A" to "Z".
x <- setNames(
  replicate(26, sample(LETTERS, 10, replace = TRUE), simplify = FALSE),
  LETTERS
)
head(x)
$A
[1] "A" "M" "B" "X" "B" "J" "P" "L" "M" "L"
$B
[1] "H" "G" "F" "R" "B" "E" "D" "I" "L" "R"
$C
[1] "P" "R" "C" "N" "K" "E" "R" "S" "N" "P"
$D
[1] "F" "B" "B" "Z" "E" "Y" "J" "R" "H" "P"
$E
[1] "O" "P" "E" "X" "S" "Q" "S" "A" "H" "B"
$F
[1] "Y" "P" "T" "T" "P" "N" "K" "P" "G" "P"
system.time(replicate(1000, grep("A", x)))
user system elapsed
0.11 0.00 0.11
system.time(replicate(1000, sapply(x, function(y) "A" %in% y)))
user system elapsed
0.05 0.00 0.05
You're looking for grep():
grep("a", z)
#[1] 1 3
grep("b", z)
#[1] 1 2

Resources