Creating new columns with combinations of string patterns in R

I have a data frame with a column containing a lengthy string whose elements are separated by _. I am interested in counting the occurrences of certain patterns and of several possible combinations within that string. In the use case below, I would like to count occurrences of the events A and B, but not of anything else.
If A and B occur together as A_B or B_A, alone or repeated n times, I want to count those occurrences, as well as the occurrences of the longer combinations of them.
Example data frame:
participant <- c("A", "B", "C")
trial <- c(1,1,2)
string_pattern <- c("A_B_A_C_A_B", "B_A_B_A_C_D_A_B", "A_B_C_A_B")
df <- data.frame(participant, trial, string_pattern)
Expected output:
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1. A 1 A_B_A_C_A_B 2 1 1 0 0
2. B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3. C 2 A_B_C_A_B 2 0 0 0 0
My code:
revised_df <- df %>%
  dplyr::mutate(A_B = stringr::str_count(string_pattern, "A_B"),
                B_A = stringr::str_count(string_pattern, "B_A"),
                B_A_B = stringr::str_count(string_pattern, "B_A_B"))
My approach becomes unwieldy as the number of combinations increases; hence I am looking for a better solution.

You could write a function to solve this:
m <- function(s){
  # all (start, stop) index pairs for contiguous substrings of length >= 2
  a <- seq(nchar(s) - 1)
  start <- rep(a, rev(a))
  stop <- ave(start, start, FUN = \(x) seq_along(x) + x)
  # extract every such substring, then re-insert the underscores
  b <- substring(s, start, stop)
  gsub('(?<=\\B)|(?=\\B)', '_', b, perl = TRUE)
}
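To see what the helper produces, call m() on a short string with the underscores already stripped (as n() does before calling it); it returns every contiguous sub-pattern of length two or more:
m("ABA")
# [1] "A_B"   "A_B_A" "B_A"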
n <- function(x){
  names(x) <- x
  # collapse runs of non-A/B events to a separator, then drop the underscores
  a <- strsplit(gsub("_", '', gsub("_[^AB]+_", ':', x)), ':')
  # enumerate the sub-patterns of each A/B run and tabulate them per string
  b <- t(table(stack(lapply(a, \(y) unlist(sapply(y, m))))))
  data.frame(pattern = x, as.data.frame.matrix(b), row.names = NULL)
}
n(string_pattern)
pattern A_B A_B_A B_A B_A_B B_A_B_A
1 A_B_A_C_A_B 2 1 1 0 0
2 B_A_B_A_C_D_A_B 2 1 2 1 1
3 A_B_C_A_B 2 0 0 0 0

Try this: it counts, in each row of string_pattern, the occurrences of the current column's name:
library(dplyr)
library(stringr)
df |>
  mutate(A_B = 0, B_A = 0, A_B_A = 0, B_A_B = 0, B_A_B_A = 0) |>
  mutate(across(A_B:B_A_B_A, ~ str_count(string_pattern, cur_column())))
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1 A 1 A_B_A_C_A_B 2 1 1 0 0
2 B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3 C 2 A_B_C_A_B 2 0 0 0 0
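If the set of combinations keeps growing, the columns don't need to be typed out at all: keep the patterns in a vector and compute all counts in one go. A minimal base-R sketch of that idea (the pats vector here is illustrative, not from the question):
pats <- c("A_B", "B_A", "A_B_A", "B_A_B", "B_A_B_A")
cbind(df, sapply(pats, function(p) stringr::str_count(df$string_pattern, p)))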

Related

Adjacency Matrix from a dataframe

I am trying to convert an edgelist to an adjacency matrix.
Below is the sample data
# Sample Data
User <- c("1", "1", "2", "3", "4")
v1 <- c("b", "b", "a", "d", "c")
v2 <- c("c", "d", "c", "a", "a")
v3 <- c(0, 0, "d", 0, "b")
v4 <- c(0, 0, 0, 0, 0)
v5 <- c(0, 0, 0, 0, 0)
my_data <- data.frame(User, v1, v2, v3, v4, v5)
my_data
If you run this code, you will get the output below:
User v1 v2 v3 v4 v5
1 b c 0 0 0
1 b d 0 0 0
2 a c d 0 0
3 d a 0 0 0
4 c a b 0 0
Using the data, I want to create an adjacency matrix that looks as follows:
a b c d
a 0 0 2 2
b 0 0 1 1
c 2 1 0 1
d 2 1 1 0
Basically, the desired output displays how many times each pair appeared across columns v1–v5 in the sample data frame.
I have tried the AdjacencyFromEdgelist function from the dils library, and also tried to create a matrix shell of NAs and fill it by looping through the data frame.
However, I could not get either approach to work.
I think this may be close to what you have in mind. In the rows where there are more than two vertices, I considered every existing pair:
library(igraph)
do.call(rbind, my_data[-1] |>
  apply(1, \(x) x[x != 0]) |>
  lapply(\(x) t(combn(x, m = 2)))) |>
  graph_from_edgelist(directed = FALSE) |>
  as_adjacency_matrix()
4 x 4 sparse Matrix of class "dgCMatrix"
b c d a
b . 2 1 1
c 2 . 1 2
d 1 1 . 2
a 1 2 2 .
Or, without the pipe operator, in base R:
tmp <- apply(my_data[-1], 1, function(x) x[x != 0])
tmp <- do.call(rbind, lapply(tmp, function(x) t(combn(x, m = 2))))
my_graph <- graph_from_edgelist(tmp, directed = FALSE)
adj_mat <- as_adjacency_matrix(my_graph)
adj_mat
Another attempt, avoiding the need to calculate all the combinations with combn: table(dat) below is a row-by-value incidence table, so crossprod() counts how often each pair of values shares a row, and zeroing the diagonal leaves the pairwise co-occurrence counts:
sel <- my_data[-1] != 0
dat <- data.frame(row=row(my_data[-1])[sel], value = my_data[-1][sel])
out <- crossprod(table(dat))
diag(out) <- 0
out
# value
#value a b c d
# a 0 1 2 2
# b 1 0 2 1
# c 2 2 0 1
# d 2 1 1 0
Matches the result from #AnoushiravanR:
adj_mat[c("a","b","c","d"), c("a","b","c","d")]
#4 x 4 sparse Matrix of class "dgCMatrix"
# a b c d
#a . 1 2 2
#b 1 . 2 1
#c 2 2 . 1
#d 2 1 1 .
Another igraph option
do.call(
rbind,
combn(df, 2, setNames, nm = c("from", "to"), simplify = FALSE)
) %>%
filter(from > 0 & to > 0) %>%
arrange(from) %>%
graph_from_data_frame(directed = FALSE) %>%
get.adjacency(sparse = FALSE)
gives
a b c d
a 0 1 2 2
b 1 0 2 1
c 2 2 0 1
d 2 1 1 0

Dispatch values in list column to separate columns

I have a data.table with a list column "c":
df <- data.table(a = 1:3, c = list(1L, 1:2, 1:3))
df
a c
1: 1 1
2: 2 1,2
3: 3 1,2,3
I want to create separate columns for the values in "c".
I create a set of new columns F_1, F_2, F_3:
mmax <- max(df$a)
flux <- paste("F", 1:mmax, sep = "_")
df[, (flux) := 0]
df
a c F_1 F_2 F_3
1: 1 1 0 0 0
2: 2 1,2 0 0 0
3: 3 1,2,3 0 0 0
I want to dispatch values in "c" to columns F_1, F_2, F_3 like this:
df
a c F_1 F_2 F_3
1: 1 1 1 0 0
2: 2 1,2 1 2 0
3: 3 1,2,3 1 2 3
What I have tried:
comp_vect <- function(vec, mmax){
  vec <- vec %>% unlist()
  n <- length(vec)
  answr <- c(vec, rep(0, l = mmax - n))
}
df[ , ..flux := mapply(comp_vect, c, mmax)]
The expected data.table is:
> df
a c F_1 F_2 F_3
1: 1 1 1 0 0
2: 2 1,2 1 2 0
3: 3 1,2,3 1 2 3
I followed a radically different approach: I rbinded the list column and then dcasted it, obtaining the desired result. The last part is to set the names.
library(data.table)
df <- data.table(a = 1:3, d = list(1L, c(1L, 2L), c(1L, 2L, 3L)))
df2 <- df[, rbind(d), by = a][, dcast(.SD, a ~ V1, fill = 0)]
setnames(df2, 2:4, flux)[]
a F_1 F_2 F_3
1: 1 1 0 0
2: 2 1 2 0
3: 3 1 2 3
where flux is the vector of names that you defined in your question.
Please notice that I avoided using the column name c, as it may be confused with the function c().
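If the original columns are needed alongside the new ones, a data.table join can bring them back, e.g. (a small sketch using the objects defined above):
df[df2, on = "a"]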
Solution:
for(idx in seq(max(sapply(df$c, length)))){ # maximum number of values across the elements of the list
  set(x = df,
      i = NULL,
      j = paste0("F_", idx), # column's name
      value = sapply(df$c, function(x){
        if(is.na(x[idx])){
          return(0) # 0 instead of NA
        } else {
          return(x[idx])
        }
      })
  )
}
Explanation:
We can extract the values from a list like this:
sapply(df$c, function(ll) return(ll[1])) # first value
[1] 1 1 1
sapply(df$c, function(ll) return(ll[2])) # second value
[1] NA 2 2
sapply(df$c, function(ll) return(ll[3])) # third value
[1] NA NA 3
We see that if there is no value, we have a NA.
We need an iterator to extract all values at the position idx. For that, we'll find the number of values in each element of df$c (the list) and keep the maximum.
max(sapply(df$c, length))
[1] 3
If we want zeros instead of NAs, we need a function inside the sapply to convert them:
vec <- c(NA, 5, 1, NA)
sapply(vec, function(x) if(is.na(x)) return(0) else return(x))
[1] 0 5 1 0
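As a compact alternative to the loop, I believe data.table's transpose() pads ragged lists with a fill value, so the whole dispatch can be written as one assignment (a sketch, assuming the OP's df with list column c and the flux names from the question; check ?transpose in your version):
df[, (flux) := transpose(c, fill = 0)]
# F_1, F_2, F_3 now hold the padded values, as in the expected output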

capture column pattern frequency

I have a dataset like this below
Id A B C
10 1 0 1
11 1 0 1
12 1 1 0
13 1 0 0
14 0 1 1
I am trying to count the column patterns, as shown below.
Pattern Count
A, C 2
A, B 1
A 1
B, C 1
Not sure where to start, any help or advice is much appreciated.
If you don't have to group per Id, then simply:
table(apply(df[-1], 1, function(i) paste(names(i[i == 1]), collapse = ',')))
# A A,B A,C B,C
# 1 1 2 1
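To get the counts as the two-column Pattern/Count data frame from the question, the table can be reshaped (a small sketch building on the line above):
tab <- table(apply(df[-1], 1, function(i) paste(names(i[i == 1]), collapse = ',')))
data.frame(Pattern = names(tab), Count = as.vector(tab))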
Starting by "reversing" the tabulation of the data into two separate vectors of row and column indices:
w = which(dat[-1] == 1L, TRUE)
we could use
table(tapply(names(dat)[-1][w[, "col"]], w[, "row"], paste, collapse = ", "))
#
# A A, B A, C B, C
# 1 1 2 1
If the result is needed for more than just display, an alternative (among many) that avoids the unnecessary paste/strsplit is:
pats = split(names(dat)[-1][w[, "col"]], w[, "row"])
upats = unique(pats)
data.frame(pat = upats, n = tabulate(match(pats, upats)))
# pat n
#1 A, C 2
#3 A, B 1
#4 A 1
#5 B, C 1
We can try the following: multiplying by col() turns each 1 into its column index, chartr() maps those digits to the column letters (with 0 mapped to a placeholder N), and the gsub() strips the placeholders:
table(gsub(",*N|N,*", "", chartr('0123', 'NABC',
    do.call(paste, c(df1[-1] * col(df1[-1]), sep=",")))))
# A A,B A,C B,C
# 1 1 2 1
As #DavidArenburg mentioned, the old/new in chartr can be made automatic with
cols <- paste(c("N", names(df1[-1])), collapse = "")
indx <- paste(seq(nchar(cols)) - 1, collapse = "")
table(gsub(",*N|N,*", "", chartr(indx, cols,
do.call(paste, c(df1[-1] * col(df1[-1]), sep=",")))))
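For completeness, a tidyverse version of the same idea might look like this (a sketch, assuming the data frame df and columns A:C from the question; c_across needs dplyr >= 1.0):
library(dplyr)
df |>
  rowwise() |>
  mutate(Pattern = paste(names(df)[-1][c_across(A:C) == 1], collapse = ", ")) |>
  ungroup() |>
  count(Pattern, name = "Count")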

How to compare with values adjacent in a sequence in the same group

Let's say I have something like this:
set.seed(0)
the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
Within each x, starting from n==2 (going from small to large), I want to set val to 0 if the previous val (in terms of n) is 0; otherwise, leave it as is.
For example, in the subset x=="b", I first ignore the two rows where n < 2. Now, in Row 7, because the previous val is 0 (the.df$val[the.df$x=="b" & the.df$n==1]), I set val to 0 (the.df$val[the.df$x=="b" & the.df$n==2] <- 0). Then on Row 8, now that val for the previous n is 0 (we just set it), I also want to set val here to 0 (the.df$val[the.df$x=="b" & the.df$n==3] <- 0).
Imagine that the data.frame is not sorted. Therefore procedures that depend on the order would require a sort. I also can't assume that adjacent rows exist (e.g., the row the.df[the.df$x=="a" & the.df$n==1, ] might be missing).
The trickiest part seems to be evaluating val in sequence. I can do this using a loop but I imagine that it would be inefficient (I have millions of rows). Is there a way I can do this more efficiently?
EDIT: wanted output
the.df
x n val wanted
1 a 0 1 1
2 a 1 0 0
3 a 2 0 0
4 a 3 1 0
5 b 0 1 1
6 b 1 0 0
7 b 2 1 0
8 b 3 1 0
9 c 0 1 1
10 c 1 1 1
11 c 2 0 0
12 c 3 0 0
Also, I don't mind making new columns (e.g., putting the wanted values there).
Using data.table I would try the following
library(data.table)
setDT(the.df)[order(n),
              val := if(length(indx <- which(val[2:.N] == 0L)))
                c(val[1:(indx[1L] + 1L)], rep(0L, .N - (indx[1L] + 1L))),
              by = x]
the.df
# x n val
# 1: a 0 1
# 2: a 1 0
# 3: a 2 0
# 4: a 3 0
# 5: b 0 1
# 6: b 1 0
# 7: b 2 0
# 8: b 3 0
# 9: c 0 1
# 10: c 1 1
# 11: c 2 0
# 12: c 3 0
This simultaneously orders the data by n (as you said it isn't ordered in real life) and recreates val by condition (meaning that if the condition is not satisfied, val is left untouched).
Hopefully in the near future this will be implemented in data.table, and then the code could potentially be
setDT(the.df)[order(n), val[n > 2] := if(val[2L] == 0) 0L, by = x]
which could be a great improvement, both performance- and syntax-wise.
A base R approach might be
df <- the.df[order(the.df$x, the.df$n),]
df$val <- ave(df$val, df$x, FUN=fun)
As for fun, #DavidArenburg's answer in plain R and written a bit more poetically might be
fun0 <- function(v) {
  idx <- which.max(v[2:length(v)] == 0L) + 1L
  if (length(idx))
    v[idx:length(v)] <- 0L
  v
}
It seems like a good idea to formulate the solution as an independent function first, because then it is easy to test. fun0 fails for some edge cases, e.g.,
> fun0(0)
[1] 0 0 0
> fun0(1)
[1] 0 0 0
> fun0(c(1, 1))
[1] 1 0
A better version is
fun1 <- function(v) {
  tst <- tail(v, -1) == 0L
  if (any(tst)) {
    idx <- which.max(tst) + 1L
    v[idx:length(v)] <- 0L
  }
  v
}
And even better, following #Arun
fun <- function(v)
if (length(v) > 2) c(v[1], cummin(v[-1])) else v
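As a quick sanity check, feeding fun the per-group val vectors of the example data (ordered by n) reproduces the wanted column:
fun(c(1, 0, 0, 1))  # group a -> 1 0 0 0
fun(c(1, 0, 1, 1))  # group b -> 1 0 0 0
fun(c(1, 1, 0, 0))  # group c -> 1 1 0 0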
This is competitive (same order of magnitude) with the data.table solution, with the ordering and the return occurring in less than 1 s for the ~10M-row data.frame of #m-dz's timings. At about a second for millions of rows, it doesn't seem worthwhile to pursue further optimization.
Nonetheless, when there are a very large number of small groups (e.g., 2M each of size 5) an improvement is to avoid the tapply() function call by using group identity to offset the minimum. For instance,
df <- df[order(df$x, df$n),]
grp <- match(df$x, unique(df$x))   # strictly sequential group ids
keep <- duplicated(grp)            # ignore the first of each group
# subtracting the group id keeps cummin() from leaking across group boundaries
df$val[keep] <- cummin(df$val[keep] - grp[keep]) + grp[keep]
Hmmm, should be pretty efficient if you switch to data.table...
library(data.table)
# Define the.df as a data.table (or use data.table::setDT() function)
set.seed(0)
the.df <- data.table(
x = rep(letters[1:3], each = 4),
n = rep(0:3, 3),
val = round(runif(12))
)
m_dz <- function() {
  setorder(the.df, x, n)
  repeat{
    # Get IDs of rows to change
    # ids <- which(the.df[, (n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0)])
    ids <- the.df[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
    # If no IDs, break
    if(length(ids) == 0){
      break
    }
    # Set val to 0
    # for (i in ids) set(the.df, i = i, j = "val", value = 0)
    set(the.df, i = ids, j = "val", value = 0)
  }
  return(the.df)
}
Edit: The function above is slightly modified thanks to #jangorecki, i.e. it uses which = TRUE and set(the.df, i = ids, j = "val", value = 0), which made the timings much more stable (no very high max timings).
Edit: timing comparison with #DavidArenburg's answer on a slightly bigger table, m_dz() updated (#FoldedChromatin's answer skipped because of different results).
My function is slightly faster in terms of the median and upper quantile, but there is quite a big spread in the timings (see the max...); I cannot figure out why. Hopefully the timing methodology is correct (returning the result to a different object etc.).
Anything bigger will kill my PC :(
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e7/size1)
df1 <- data.table(
  x = rep(groups_ids, each = size2),
  n = rep(0:(size2-1), size1),
  val = round(runif(size1*size2))
)
df2 <- copy(df1)
# m-dz
m_dz <- function() {
  setorder(df1, x, n)
  repeat{
    ids <- df1[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
    if(length(ids) == 0){
      break
    }
    set(df1, i = ids, j = "val", value = 0)
  }
  return(df1)
}
# David Arenburg
DavidArenburg <- function() {
  setorder(df2, x, n)
  df2[, val := if(length(indx <- which.max(val[2:.N] == 0) + 1L)) c(val[1:indx], rep(0L, .N - indx)), by = x]
  return(df2)
}
library(microbenchmark)
microbenchmark(
res1 <- m_dz(),
res2 <- DavidArenburg(),
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 247.4136 268.5005 363.0117 288.4216 312.7307 7071.0960 100 a
# res2 <- DavidArenburg() 270.6074 281.3935 314.7864 303.5229 328.1210 525.8095 100 a
identical(res1, res2)
# [1] TRUE
Edit: (old) results for an even bigger table:
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e8/size1)
# Unit: seconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 5.599855 5.800264 8.773817 5.923721 6.021132 289.85107 100 a
# res2 <- m_dz2() 5.571911 5.836191 9.047958 5.970952 6.123419 310.65280 100 a
# res3 <- DavidArenburg() 9.183145 9.519756 9.714105 9.723325 9.918377 10.28965 100 a
Why not just use by?
> set.seed(0)
> the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
> the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
> Mod.df <- by(the.df, INDICES = the.df$x, function(x){
    x$val[x$n == 2] = 0
    Which = which(x$n == 2 & x$val == 0) + 1
    x$val[Which] = 0
    x})
> do.call(rbind, Mod.df)
x n val
a.1 a 0 1
a.2 a 1 0
a.3 a 2 0
a.4 a 3 0
b.5 b 0 1
b.6 b 1 0
b.7 b 2 0
b.8 b 3 0
c.9 c 0 1
c.10 c 1 1
c.11 c 2 0
c.12 c 3 0

Take certain value in a data frame

I have a data.frame and would like to fill a new column with the value from one of two columns, depending on the value in a third column.
I tried the apply function.
n <- c(2, 3, 0 ,1)
s <- c(0, 1, 1, 2)
b <- c("THIS", "FALSE", "NOT", "THIS")
df <- data.frame(n, s, b)
df <- sapply(df$Vals, FUN=function(x){ if(b[x]=="THIS") ? n[x] : s[x] } )
My logic is:
if(b at position x is equal to "This") {
add n[x] to the column df$Vals
} else {
add s[x] to the column df$Vals
}
where x is a single row.
Any recommendation on what I am doing wrong?
I appreciate your reply!
R has no ?: ternary operator; the vectorized way to choose between two columns is ifelse(). Like this:
df$Vals = with(df, ifelse(b=="THIS", n, s))
Or, returning the resulting data.frame directly:
transform(df, Vals=with(df, ifelse(b=="THIS", n, s)))
# n s b Vals
#1 2 0 THIS 2
#2 3 1 FALSE 1
#3 0 1 NOT 1
#4 1 2 THIS 1
With your additional conditions:
func <- Vectorize(function(b, s, n){ if(b == 'THIS') return(n); if(b == FALSE) return(n + s); s })
df$Vals <- with(df, func(b, s, n))
Or you could use row/column matrix indexing:
df$Vals <- df[1:2][cbind(1:nrow(df),(df$b!='THIS')+1)]
df
# n s b Vals
#1 2 0 THIS 2
#2 3 1 FALSE 1
#3 0 1 NOT 1
#4 1 2 THIS 1
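For reference, the same dispatch could be sketched in dplyr with case_when, which also accommodates the additional conditions above (.default needs dplyr >= 1.1; older versions spell the fallback TRUE ~ s):
library(dplyr)
df |>
  mutate(Vals = case_when(
    b == "THIS"  ~ n,
    b == "FALSE" ~ n + s,
    .default = s
  ))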
