replace characters in string based on positions from another variable R - r

I have the below dataframe xo. For each row, I want to find and replace the positions listed in positions_of_Ns_to_remove in sequence. The results new variable in the example should be sequence with all R's removed. I cannot search based on the character itself in this situation - it must be based on the position of the character.
p <- data.frame(locus = c("1","2","3"), positions_of_Ns_to_remove = c("12,17,43,100","30,60,61,62",NA))
x <- data.frame(locus = c("1","1","2","3"), sequence = c("xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx","xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"))
xo <- merge(x, p, by = c("locus"), all.x = T)
> xo
locus sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxR 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxRRRxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>
This works if there is only 1 row in xo, but not when there are multiple rows. I would like to use tidyverse functions / piping and avoid for loops if possible.
xo %>% dplyr::mutate(new_sequence = paste(
replace( unlist(strsplit(sequence, "")), as.integer(unlist(strsplit(positions_of_Ns_to_remove,","))), "" ),
collapse = "")
)
What I want:
locus new_sequence positions_of_Ns_to_remove
1 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
2 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 12,17,43,100
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 30,60,61,62
4 3 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx <NA>

You could build a custom function and apply it to your data:
library(stringr)
# cuts the n-th character out of the string
remove_pos <- function(string, n) {
n <- as.integer(n)
n <- n[order(n, decreasing = TRUE)]
len <- nchar(string)
output <- string
for (i in n) {
output <- paste0(
str_sub(output, start = 1L, end = i - 1L),
str_sub(output, start = i + 1, end = len)
)
}
return(output)
}
xo %>%
mutate(positions = str_split(positions_of_Ns_to_remove, ",")) %>%
group_by(locus, n=row_number()) %>%
mutate(
new_seq = ifelse(!is.na(positions_of_Ns_to_remove),
remove_pos(sequence, unlist(positions)),
sequence)
) %>%
select(-positions) %>%
ungroup()
which returns
# A tibble: 5 x 4
locus sequence positions_of_Ns_to~ new_seq
<chr> <chr> <chr> <chr>
1 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
2 1 xxxxxxxxxxxRxxxxRxxxxxxxxxxxxxxxxxxxxxxxxx~ 12,17,43,100 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
3 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxRxxxxxxxxxxxx~ 30,60,61,62 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
4 3 Rxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~
5 4 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~ NA xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx~

Related

Returning a tibble: how to vectorize with case_when?

I have a function which returns a tibble. It runs OK, but I want to vectorize it.
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
if(xx >= 4){
tibble(x = NA, y = NA)
} else if(xx == 3){
tibble(x = as.integer(), y = as.integer())
} else if (xx == 2){
tibble(x = xx^2 - 1, y = yy^2 -1)
} else {
tibble(x = xx^2, y = yy^2)
}
}
It runs OK in a mutate when I call it with map2, giving me the result I wanted:
tibTest %>%
mutate(sq = map2(argX, argY, square_it)) %>%
unnest()
## A tibble: 3 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 2 6 3 35
# 3 4 4 NA NA
My first attempt to vectorize it failed, and I can see why - I can't return a vector of tibbles.
square_it2 <- function(xx, yy){
case_when(
x >= 4 ~ tibble(x = NA, y = NA),
x == 3 ~ tibble(x = as.integer(), y = as.integer()),
x == 2 ~ tibble(x = xx^2 - 1, y = yy^2 -1),
TRUE ~ tibble(x = xx^2, y = yy^2)
)
}
# square_it2(4, 2) # FAILS
My next attempt runs OK on a simple input. I can return a list of tibbles, and that's what I want for the unnest
square_it3 <- function(xx, yy){
case_when(
xx >= 4 ~ list(tibble(x = NA, y = NA)),
xx == 3 ~ list(tibble(x = as.integer(), y = as.integer())),
xx == 2 ~ list(tibble(x = xx^2 - 1, y = yy^2 -1)),
TRUE ~ list(tibble(x = xx^2, y = yy^2))
)
}
square_it3(4, 2)
# [[1]]
# # A tibble: 1 x 2
# x y
# <lgl> <lgl>
# 1 NA NA
But when I call it in a mutate, it doesn't give me the result I had with square_it. I can sort of see what's
wrong. In the xx == 2 clause, xx acts as an atomic value of 2. But in
building the tibble, xx is a length-4 vector.
tibTest %>%
mutate(sq = square_it3(argX, argY)) %>%
unnest()
# # A tibble: 9 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 1 7 4 36
# 3 1 7 9 25
# 4 1 7 16 16
# 5 2 6 0 48
# 6 2 6 3 35
# 7 2 6 8 24
# 8 2 6 15 15
# 9 4 4 NA NA
How do I get the same result as I did with square_it, but from a vectorized function using case_when ?
We define row_case_when which has a similar formula interface as case_when except it has a first argument of .data, acts by row and expects that the value of each leg to be a data frame. It returns a data.frame/tibble. Wrapping in a list, rowwise and unnest are not needed.
case_when2 <- function (.data, ...) {
fs <- dplyr:::compact_null(rlang:::list2(...))
n <- length(fs)
if (n == 0) {
abort("No cases provided")
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- rlang:::caller_env()
quos_pairs <- purrr::map2(fs, seq_along(fs), dplyr:::validate_formula,
rlang:::default_env, rlang:::current_env())
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- rlang::eval_tidy(pair$lhs, data = .data, env = default_env)
value[[i]] <- rlang::eval_tidy(pair$rhs, data = .data, env = default_env)
if (!is.logical(query[[i]])) {
abort_case_when_logical(pair$lhs, i, query[[i]])
}
if (query[[i]]) return(value[[i]])
}
}
row_case_when <- function(.data, ...) {
.data %>%
group_by(.group = 1:n(), !!!.data) %>%
do(case_when2(., ...)) %>%
mutate %>%
ungroup %>%
select(-.group)
}
Test run
It is used like this:
library(dplyr)
tibTest <- tibble(argX = 1:4, argY = 7:4) # test data from question
tibTest %>%
row_case_when(argX >= 4 ~ tibble(x = NA, y = NA),
argX == 3 ~ tibble(x = as.integer(), y = as.integer()),
argX == 2 ~ tibble(x = argX^2 - 1, y = argY^2 -1),
TRUE ~ tibble(x = argX^2, y = argY^2)
)
giving:
# A tibble: 3 x 4
argX argY x y
<int> <int> <dbl> <dbl>
1 1 7 1 49
2 2 6 3 35
3 4 4 NA NA
mutate_cond and mutate_when
These are not quite the same as row_case_when since they don't run through conditions taking the first true one but by using mutually exclusive conditions they can be used for certain aspects of this problem. They do not handle changing the number of rows in the result but we can use dplyr::filter to remove rows for a particular condition.
mutate_cond defined in dplyr mutate/replace several columns on a subset of rows is like mutate except the second argument is a condition and the subsequent arguments are applied only to rows for which that condition is TRUE.
mutate_when defined in
dplyr mutate/replace several columns on a subset of rows is similar to case_when except it applies to rows, the replacement values are provided in a list and alternate arguments are conditions and lists. Also all legs are always run applying the replacement values to the rows satisfying the conditions (as opposed to, for each row, performing the replacement on just the first true leg). To get a similar effect to row_case_when be sure that the conditions are mutually exclusive.
# mutate_cond example
tibTest %>%
filter(argX != 3) %>%
mutate(x = NA_integer_, y = NA_integer_) %>%
mutate_cond(argX == 2, x = argX^2 - 1L, y = argY^2 - 1L) %>%
mutate_cond(argX < 2, x = argX^2, y = argY^2)
# mutate_when example
tibTest %>%
filter(argX != 3) %>%
mutate_when(TRUE, list(x = NA_integer_, y = NA_integer_),
argX == 2, list(x = argX^2 - 1L, y = argY^2 - 1L),
argX < 2, list(x = argX^2, y = argY^2))
You need to ensure you are creating a 1-row tibble with each call of the function, then vectorize that.
This works whether you have rowwise groups or not.
You can do this with switch wrapped in a map2:
Here's a reprex:
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
map2(xx, yy, function(x, y){
switch(which(c(x >= 4,
x == 3,
x == 2,
x < 4 & x != 3 & x != 2)),
tibble(x = NA, y = NA),
tibble(x = as.integer(), y = as.integer()),
tibble(x = x^2 - 1, y = y^2 -1),
tibble(x = x^2, y = y^2))})
}
tibTest %>% mutate(sq = square_it(argX, argY)) %>% unnest(cols = sq)
#> # A tibble: 3 x 4
#> argX argY x y
#> <int> <int> <dbl> <dbl>
#> 1 1 7 1 49
#> 2 2 6 3 35
#> 3 4 4 NA NA
Created on 2020-05-16 by the reprex package (v0.3.0)

Long format separation issue

From this dataframe:
dftest <- data.frame(id = c(1), text = c("java-ee?jsf?omnifaces?jpa"), stringsAsFactors = F)
I would like to produce a dataframe like this
data.frame(id = c(1), java-ee = c(1), jsf = c(1), onifaces = c(1), jpa = c(1))
I use this commands to make it:
s2 <- strsplit(dftest$text, split = "?")
dftest2 <- data.frame(id = rep(dftest2$id, sapply(s2, length)), text = unlist(s2))
dflike_final <- reshape(dftest2, idvar = "id", timevar = "text", direction = "wide")
Howver the results from the first two line is this:
id text
1 1 j
2 1 a
3 1 v
4 1 a
5 1 -
6 1 e
7 1 e
8 1 ?
9 1 j
10 1 s
11 1 f
12 1 ?
13 1 o
14 1 m
15 1 n
16 1 i
17 1 f
18 1 a
19 1 c
20 1 e
21 1 s
22 1 ?
23 1 j
24 1 p
25 1 a
How can I fix it to have the whole string?
We can bring the text in separate rows, create a dummy column (n) and get the data in wide format using pivot_wider.
library(dplyr)
library(tidyr)
dftest %>%
separate_rows(text, sep = "\\?") %>%
mutate(n = 1) %>%
pivot_wider(values_from = n, names_from = text)
# A tibble: 1 x 5
# id `java-ee` jsf omnifaces jpa
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 1 1
As mentioned by #Roland ? is a special character in regex we need to escape it. Also you need to include a dummy column in creating the new dataframe. You can then use your attempt as
s2 <- strsplit(dftest$text, split = "\\?")
dftest2 <- data.frame(id = rep(dftest$id, lengths(s2)), text = unlist(s2), n = 1)
dflike_final <- reshape(dftest2, idvar = "id", timevar = "text", direction = "wide")

Using rep inside sapply to strech a vector according to another vector

I want to generate a data.frame of edges. Problems arise when many edges end on one node. Edges are defined in vectors from and to.
# Data
vertices <- data.frame(id = 1:3, label = c("a", "b", "c"), stringsAsFactors = FALSE)
to <- c("a", "b", "c")
from1 <- c("c", "a", "b")
from2 <- c("c", "a", "a,b,c")
What I tried:
# Attempt 1
create_edges_1 <- function(from, to) {
to <- sapply(to, function(x){vertices$id[vertices$label == x]})
from <- sapply(from, function(x){vertices$id[vertices$label == x]})
data.frame(from = from, to = to, stringsAsFactors = FALSE)
}
This works for example create_edges_1(from1, to), the output is:
from to
c 3 1
a 1 2
b 2 3
However for example from2 this attempt fails.
So I tried the following:
# Attempt 2
create_edges_2 <- function(from, to) {
to <- sapply(unlist(sapply(strsplit(to, ","), function(x){vertices$id[vertices$label == x]})), function(x){rep(x, sapply(strsplit(from2, ","), length))})
from <- unlist(sapply(strsplit(from2, ","), function(x){vertices$id[vertices$label == x]}))
data.frame(from = from, to = to, stringsAsFactors = FALSE)
}
The idea was to "stretch" to for every node where more than one edge ends. However create_edges_2(from1, to) and create_edges_2(from2, to) both throw an error
Error in rep(x, sapply(strsplit(from2, ","), length)) :
invalid 'times' argument
What am I doing wrong in my sapply statements?
The expected output for create_edges_2(from2, to) is:
from to
3 1
1 2
1 3
2 3
3 3
You could use joins or match for this
f2 <- strsplit(from2, ',')
df <- data.frame(from = unlist(f2)
, to = rep(to, lengths(f2))
, stringsAsFactors = FALSE)
With match
library(tidyverse)
map_dfc(df, ~ with(vertices, id[match(.x, label)]))
# # A tibble: 5 x 2
# from to
# <int> <int>
# 1 3 1
# 2 1 2
# 3 1 3
# 4 2 3
# 5 3 3
With Joins
library(dplyr)
df %>%
inner_join(vertices, by = c(from = 'label')) %>%
inner_join(vertices, by = c(to = 'label')) %>%
select_at(vars(matches('.x|.y')))
# id.x id.y
# 1 3 1
# 2 1 2
# 3 1 3
# 4 2 3
# 5 3 3
Here is a way:
# Attempt 3
library(dplyr)
to <- sapply(to, function(x){vertices$id[vertices$label == x]})
from0 <- sapply(from2, function(x) strsplit(x, ",")) %>% unlist() %>% as.character()
lengths0 <- lapply(sapply(from2, function(x) strsplit(x, ",")), length) %>% unlist()
to0 <- c()
for( i in 1:length(lengths0)) to0 <- c(to0, rep(to[i], lengths0[i]))
from <- sapply(from0, function(x){vertices$id[vertices$label == x]})
edges <- data.frame(from = from, to = to0, stringsAsFactors = FALSE)
edges
Giving this result as requested:
from to
1 3 1
2 1 2
3 1 3
4 2 3
5 3 3
The idea is to split from with comma separators, and to store the size of each element in order to "stretch" every node. Here done with a for loop

Loop by variable names

I want to create a for loop by variable names.
Each time, I calculte the max between each two variables, and define a new one in data df. New variables look like this:var1_1, var1_2... Here is my code:
df=data.frame(matrix(c(1:6), nrow = 2))
colnames(df) = c("x", "y", "z")
for(i in length(names(df))-1){
df = df %>% mutate(paste0("var", i, "_", i+1) = max(names(df)[i], names(df)[i+1]))
}
But there gives error.
Expected output:
>df
x y z var1_2 var1_3 var2_3
1 3 5 3 5 5
2 4 6 4 6 6
One way via base R,
m1 <- sapply(combn(names(df),2, simplify = FALSE), function(i) do.call(pmax, df[i]))
nms <- combn(ncol(m1), 2, function(i) paste0('Var', i[1], '_', i[2]))
cbind(df, setNames(data.frame(m1), nms))
# x y z Var1_2 Var1_3 Var2_3
#1 1 3 5 3 5 5
#2 2 4 6 4 6 6
If you really want to use a Loop you can try:
ind<-combn(3,2)
for(i in 1:dim(df)[2]){
i <- ind[,i]
name <- paste0("var", i[1], "_", i[2])
val <- names(df)[i[ifelse(sum(df[,i[1]]) > sum(df[,i[2]]),1,2)]]
df <- mutate_(df, .dots= setNames(list(val),name))
}

Concatenating all rows within a group using dplyr

Suppose I have a dataframe like this:
hand_id card_id card_name card_class
A 1 p alpha
A 2 q beta
A 3 r theta
B 2 q beta
B 3 r theta
B 4 s gamma
C 1 p alpha
C 2 q beta
I would like to concatenate the card_id, card_name, and card_class into one single row per hand level A, B, C. So the result would look something like this:
hand_id combo_1 combo_2 combo_3
A 1-2-3 p-q-r alpha-beta-theta
B 2-3-4 q-r-s beta-theta-gamma
....
I attempted to do this using group_by and mutate, but I can't seem to get it to work
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
mutate(combo_1 = paste(card_id),
combo_2 = paste(card_name),
combo_3 = paste(card_class))
Thank you for your help.
You were kind of close!
library(tidyr)
library(dplyr)
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
summarise(combo_1 = paste(card_id, collapse = "-"),
combo_2 = paste(card_name, collapse = "-"),
combo_3 = paste(card_class, collapse = "-"))
or using summarise_each:
byHand <- group_by(data, hand_id) %>%
summarise_each(funs(paste(., collapse = "-")))
Here is another option using data.table
library(data.table)
setDT(data)[, lapply(.SD, paste, collapse="-") , by = hand_id]
# hand_id card_id card_name card_class
#1: A 1-2-3 p-q-r alpha-beta-theta
#2: B 2-3-4 q-r-s beta-theta-gamma
#3: C 1-2 p-q alpha-beta
Not very familiar with dplyr... so here's my attempt without dplyr
df <- read_csv('data.csv')
res <- lapply(split(df, df$hand_id),function(x){
sL <- apply(x[,-1], 2, function(y) paste(y, collapse = "-"))
d <- data.frame(x$hand_id[1], rbind(sL))
names(d) <- c("hand_id", "combo_1", "combo_2", "combo_3")
return(d)
})
res <- do.call("rbind",res)
rownames(res) <- NULL
Here's the output:
## hand_id combo_1 combo_2 combo_3
## 1 A 1-2-3 p-q-r alpha-beta-theta
## 2 B 2-3-4 q-r-s beta-theta-gamma
## 3 C 1-2 p-q alpha-beta
If you have NAs in your data, you can use na.omit() inline with str_c(). unique() will also work if you only want the distinct.
data:
hand_id card_id card_name card_class
<chr> <dbl> <chr> <chr>
1 A 1 p alpha
2 A 2 q beta
3 A 3 r theta
4 A NA NA NA
5 B 2 q beta
6 B 3 r theta
7 B 4 s gamma
8 C 1 p alpha
9 C 2 q beta
code:
data %>%
group_by(hand_id) %>%
summarize(card_id = str_c(na.omit(card_id), collapse = "-"),
card_name = str_c(na.omit(card_name), collapse = "-"),
card_class = str_c(na.omit(card_class), collapse = "-"))
output:
hand_id card_id card_name card_class
* <chr> <chr> <chr> <chr>
1 A 1-2-3 p-q-r alpha-beta-the…
2 B 2-3-4 q-r-s beta-theta-gam…
3 C 1-2 p-q alpha-beta

Resources