Find most repeated sequence of values - r

Suppose I have a data frame that looks like this
ITEM
1 X
2 A
3 B
4 C
5 A
6 F
7 U
8 A
9 B
10 C
11 F
12 U
How can I obtain the most common sequence of values in the 'ITEM' column?. In this case the most frequent sequence would be A, B, C since it appears in row 2 to 4 and 8 to 10.
I have already tried the function rle, as well as some of the solutions found here, and I haven't been lucky. Can I have a suggestion, hint, or package recommendation?

I guess you want the longest non-overlapping sub-string. There's some good explanation about the dynamic programming solution here.
x = c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U")
n = length(x)
m1 = sapply(x, function(i) sapply(x, function(j) as.integer(i == j)))
diag(m1) = 0
m1[lower.tri(m1)] = 0
m1
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 1 0 0 0 0
# B 0 0 0 0 0 0 0 0 1 0 0 0
# C 0 0 0 0 0 0 0 0 0 1 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 1 0
# U 0 0 0 0 0 0 0 0 0 0 0 1
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0
m2 = m1
for (i in 2:nrow(m1)){
for (j in 2:nrow(m1)){
if (m1[i-1, j-1] == 1 & m1[i, j] == 1){
if (j - i > m2[i - 1, j - 1]){
m2[i, j] = m2[i - 1, j - 1] + m2[i, j]
m2[i - 1, j - 1] = 0
} else {
m2[i, j] = 0
}
}
}
}
m2
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 3 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 2
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0
ans_len = max(m2)
inds = c(which(m2 == ans_len, arr.ind = TRUE)[,2])
lapply(inds, function(ind) x[(ind - ans_len + 1):ind])
# [[1]]
# [1] "A" "B" "C"

A tidyverse solution mixed with nested apply functions. The solution is generalized and will report the most frequent non-trivial consecutive sequence that appears at least twice--tie goes to the longer sequence.
library(tidyverse)
# Data
x <- data.frame(ITEM = c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U"), stringsAsFactors = F)
# convert x to vector
y <- x$ITEM
# Create list to check for sequence of each length 2 through n/2
l <- lapply(2:floor(length(y)/2), function(a) sapply(1:a, function(x) y[(0 + x):(length(y) - a + x)])) %>%
lapply(as.data.frame) %>%
setNames(sapply(2:(length(.) + 1), function(a) paste0("Consecutive", a)))
# Show most frequent sequence(s), choosing the longest
lapply(1:length(l), function(x) (as.data.frame(table(do.call(paste, l[[x]])), stringsAsFactors = F) %>%
dplyr::mutate(length = nchar(Var1)) %>%
dplyr::filter(length == max(length) & Freq == max(Freq) & Freq > 1)) ) %>%
.[which(sapply(., nrow) > 0)] %>%
dplyr::bind_rows() %>%
dplyr::filter(Freq == max(Freq)) %>%
dplyr::filter(length == max(length)) %>%
dplyr::rename(Sequence = Var1) %>%
dplyr::select(-length)
# Sequence Freq
#1 A B C 2

Related

Iteration in R using tidyverse

I am trying to avoid using a for loop and instead use tidyverse for iteration. Specifically, I have a vector of values that I want to loop through a single variable from a data frame to create new variables with a prefix. I've tried using dplyr::across but am unsuccessful when the vector length is >1
Sample code:
library(tidyverse)
library(glue)
data <- data.frame(id = 1:10,
y = letters[1:10],
z = LETTERS[1:10])
letter_list <- letters[1:10]
var_naming <- function(dat, list){
dat %>%
mutate(!!glue("hx_{list}") := ifelse(y == {list}, 1, 0))
}
Code I've tried:
**the correct dimensions of the data frame should be 13 variables and 10 observations**
# data_b outputs the correct number of observations but has 40 variables
data_b <- map(letter_list,
~var_naming(data, .x)) %>%
as.data.frame()
# data_c gives me the correct number of variables but has 100 observations
data_c <- map_df(letter_list,
~var_naming(data, .x))
# error message from data_d when using dplyr::across:
>> Error in `mutate()`:
>> ! Problem while computing `..1 =
>> across(...)`.
>> Caused by error in `across()`:
>> ! All unnamed arguments must be length 1
>> Run `rlang::last_error()` to see where the error occurred.
data_d <- data %>%
mutate(
across(
.cols = y,
.fns = ~ifelse(y == {letter_list}, 1, 0),
.names = glue("hx_{letter_list}")
))
Desired output:
id y z hx_a hx_b hx_c hx_d hx_e hx_f hx_g hx_h hx_i hx_j
1 a A 1 0 0 0 0 0 0 0 0 0
2 b B 0 1 0 0 0 0 0 0 0 0
3 c C 0 0 1 0 0 0 0 0 0 0
4 d D 0 0 0 1 0 0 0 0 0 0
5 e E 0 0 0 0 1 0 0 0 0 0
6 f F 0 0 0 0 0 1 0 0 0 0
7 g G 0 0 0 0 0 0 1 0 0 0
8 h H 0 0 0 0 0 0 0 1 0 0
9 i I 0 0 0 0 0 0 0 0 1 0
10 j J 0 0 0 0 0 0 0 0 0 1
You were close with the mutate call, but what you ultimately want is a list of functions (one for each letter in letter_list) to pass to .fns. Since they are anonymous functions, name them the same as letter_list to help with across naming the columns
myFxs <- map(letter_list, ~function(y) ifelse(y == .x, 1, 0)) %>%
setNames(letter_list)
For whatever reason, .names seemed to be having a problem with the glue character vector (for me anyway). Since the functions are named for the letters they are matching against you can use the .fn pronoun to instead to pass a template to across
data %>%
mutate(
across(
.cols = y,
.fns = myFxs,
.names = "hx_{.fn}"
)
)
The code can be modified
Remove the {} around the list on the rhs of :=
It may be better to use transmute instead of mutate as mutate returns the whole data by default.
Once we get the column binded (_dfc) data from map, bind the original data with bind_cols
library(dplyr)
library(purrr)
var_naming <- function(dat, list){
dat %>%
transmute(!!glue::glue('hx_{list}') := ifelse(y == list, 1, 0))
}
NOTE: list is a base R function to construct list data structure. It may be better to create functions with argument names different than the reserved words or function names already existing.
-testing
map_dfc(letter_list, var_naming, dat = data) %>%
bind_cols(data, .)
-output
id y z hx_a hx_b hx_c hx_d hx_e hx_f hx_g hx_h hx_i hx_j
1 1 a A 1 0 0 0 0 0 0 0 0 0
2 2 b B 0 1 0 0 0 0 0 0 0 0
3 3 c C 0 0 1 0 0 0 0 0 0 0
4 4 d D 0 0 0 1 0 0 0 0 0 0
5 5 e E 0 0 0 0 1 0 0 0 0 0
6 6 f F 0 0 0 0 0 1 0 0 0 0
7 7 g G 0 0 0 0 0 0 1 0 0 0
8 8 h H 0 0 0 0 0 0 0 1 0 0
9 9 i I 0 0 0 0 0 0 0 0 1 0
10 10 j J 0 0 0 0 0 0 0 0 0 1
Anotehr way to get the same results:
data %>%
cbind(model.matrix(~y + 0, .)) %>%
rename_with(~str_replace(., 'y\\B', 'hx_'))
id y z hx_a hx_b hx_c hx_d hx_e hx_f hx_g hx_h hx_i hx_j
1 1 a A 1 0 0 0 0 0 0 0 0 0
2 2 b B 0 1 0 0 0 0 0 0 0 0
3 3 c C 0 0 1 0 0 0 0 0 0 0
4 4 d D 0 0 0 1 0 0 0 0 0 0
5 5 e E 0 0 0 0 1 0 0 0 0 0
6 6 f F 0 0 0 0 0 1 0 0 0 0
7 7 g G 0 0 0 0 0 0 1 0 0 0
8 8 h H 0 0 0 0 0 0 0 1 0 0
9 9 i I 0 0 0 0 0 0 0 0 1 0
10 10 j J 0 0 0 0 0 0 0 0 0 1
If you only consider those in letters_list:
data %>%
mutate( y =factor(y, letter_list)) %>%
cbind(model.matrix(~y + 0, .) %>%
as_tibble() %>%
select(paste0('y', letter_list)) %>%
rename_with(~str_replace(., 'y', 'hx_')))

Turning data.frame into adjacency matrix lists does not show values (in R)

I have a data.frame that has 4 columns (sender, receiver, year and value). I want to create a list which has for each year an adjacency matrix of the sender and the receiver containing the value.
A MVE would be
df = data.frame(sender = c("a","a","b","c","d","d","d","b","e","e"),
receiver = c("b","d","a","a","b","a","c","e","c","a"),
value = 1:10,
year= c(2000,2000,2001,2002,2002,2002,2003,2003,2003,2004))
Also I have a country list I need it to match with
country_list = data.frame(country = c("a","b","c","d","e")
What I have tried looks like this. My problem is that not the correct values are shown in the subsequent adjacency matrix.
transfer_list <- list()
for (t in 2000:2004){
matrix<-matrix(0,5,5)
rownames(matrix)<-country_list[1:5,1]
colnames(matrix)<-country_list[1:5,1]
year=which(df[,4]==t)
dyad=df[year,c(1,2)]
for (i in 1:dim(dyad)[1]){
partner1<-which(country_list[,1]==dyad[i,1])
partner2<-which(country_list[,1]==dyad[i,2])
matrix[partner1, partner2]<-df[i,3]
}
transfer_list[[t-1999]]=matrix
}
The result for 2004 should be a 10 for the transfer from e to a but is:
> transfer_list[[5]]
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 1 0 0 0 0
What is my error?
One approach might be the igraph package.
We can use igraph::graph_from_data_frame to create a graph for each year. We can include all of the possible vertices using the vertices = argument. Otherwise, only vertices with edges will be included.
Once we have created the weighted graph, we can use as_adj with attr = "value" to create the adjacency matrix with the values in the matrix. sparse = FALSE gets you 0s in the other positions.
library(igraph)
result <- lapply(unique(df$year), function(x) {
g <- graph_from_data_frame(df[df$year == x,-4],
vertices = unique(country_list))
as_adj(g, attr = "value", sparse = FALSE)})
names(result) <- unique(df$year)
result$`2004`
# a b c d e
#a 0 0 0 0 0
#b 0 0 0 0 0
#c 0 0 0 0 0
#d 0 0 0 0 0
#e 10 0 0 0 0
You can try the code below
library(igraph)
library(dplyr)
lapply(
split(df[setdiff(names(df), "year")], df$year),
function(x) {
x %>%
graph_from_data_frame(vertices = country_list$country) %>%
as_adj(attr = "value", sparse = FALSE)
}
)
which gives
$`2000`
a b c d e
a 0 1 0 2 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 0 0 0 0 0
$`2001`
a b c d e
a 0 0 0 0 0
b 3 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 0 0 0 0 0
$`2002`
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 4 0 0 0 0
d 6 5 0 0 0
e 0 0 0 0 0
$`2003`
a b c d e
a 0 0 0 0 0
b 0 0 0 0 8
c 0 0 0 0 0
d 0 0 7 0 0
e 0 0 9 0 0
$`2004`
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 10 0 0 0 0
Change this section:
dyad=df[year,c(1,2,3)]
for (i in 1:dim(dyad)[1]){
partner1<-which(country_list[,1]==dyad[i,1])
partner2<-which(country_list[,1]==dyad[i,2])
matrix[partner1, partner2]<- dyad[i,3]
}

Data frame into a symmetric matrix while keeping all row and column

The question R DataFrame into square (symmetric) matrix is about transforming a data frame into a symmetric matrix given the following example:
# data
x <- structure(c(1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
.Dim = c(3L,5L),
.Dimnames = list(c("X", "Y", "Z"), c("A", "B", "C", "D", "E")))
x
# A B C D E
#X 1 1 0 0 0
#Y 1 1 0 0 0
#Z 0 0 0 1 0
# transformation
x <- x %*% t(x)
diag(x) <- 1
x
# X Y Z
#X 1 2 0
#Y 2 1 0
#Z 0 0 1
Now, I'm trying to keep the columns c("A", "B", "C", "D", "E") in the matrix, like this:
# X Y Z A B C D E
#X 1 0 0 1 1 0 0 0
#Y 0 1 0 1 1 0 0 0
#Z 0 0 1 0 0 0 1 0
#A 1 1 0 1 0 0 0 0
#B 1 1 0 0 1 0 0 0
#C 0 0 0 0 0 1 0 0
#D 0 0 1 0 0 0 1 0
#E 0 0 0 0 0 0 0 1
I'm pretty sure there's an easy way to do it similar to the first transformation including each case. Can anyone suggest a solution?
Thank you in advance!
This is just an augmentation, isn't it?
X <- as.matrix(x)
rbind(cbind(diag(nrow(X)), X), cbind(t(X), diag(ncol(X))))
build it step by step: thanks to #李哲源 for pointing out m <- diag(sum(dim(x))).
m <- diag(sum(dim(x)))
colnames(m) = rownames(m) = c(rownames(x),colnames(x))
m[dimnames(x)[[1]],dimnames(x)[[2]]] <- x
m[dimnames(x)[[2]],dimnames(x)[[1]]] <- t(x)
# X Y Z A B C D E
#X 1 0 0 1 1 0 0 0
#Y 0 1 0 1 1 0 0 0
#Z 0 0 1 0 0 0 1 0
#A 1 1 0 1 0 0 0 0
#B 1 1 0 0 1 0 0 0
#C 0 0 0 0 0 1 0 0
#D 0 0 1 0 0 0 1 0
#E 0 0 0 0 0 0 0 1
You can wrap it into a function and then call it:
makeSymMat <-function(x) {
x <- as.matrix(x)
m <- diag(sum(dim(x)))
colnames(m) = rownames(m) = c(rownames(x),colnames(x))
m[dimnames(x)[[1]],dimnames(x)[[2]]] <- x
m[dimnames(x)[[2]],dimnames(x)[[1]]] <- t(x)
return(m)
}
makeSymMat(x)

Adding columns in data frame as a-z (26) new columns

Each column will read the Name from the Name column and add the frequency of the alphabet in the corresponding column.
I want to add 26 new columns to my data frame. I am using
mydata3$a = str_count(mydata3$Name, "a")
mydata3$b = str_count(mydata3$Name, "b")
mydata3$c = str_count(mydata3$Name, "c")
mydata3$d = str_count(mydata3$Name, "d")
mydata3$e = str_count(mydata3$Name, "e")
mydata3$f = str_count(mydata3$Name, "f")
mydata3$g = str_count(mydata3$Name, "g")
mydata3$h = str_count(mydata3$Name, "h")
mydata3$i = str_count(mydata3$Name, "i")
mydata3$j = str_count(mydata3$Name, "j")
mydata3$k = str_count(mydata3$Name, "k")
mydata3$l = str_count(mydata3$Name, "l")
mydata3$m = str_count(mydata3$Name, "m")
mydata3$n = str_count(mydata3$Name, "n")
mydata3$o = str_count(mydata3$Name, "o")
mydata3$p = str_count(mydata3$Name, "p")
mydata3$q = str_count(mydata3$Name, "q")
mydata3$r = str_count(mydata3$Name, "r")
mydata3$s = str_count(mydata3$Name, "s")
mydata3$t = str_count(mydata3$Name, "t")
mydata3$u = str_count(mydata3$Name, "u")
mydata3$v = str_count(mydata3$Name, "v")
mydata3$w = str_count(mydata3$Name, "w")
mydata3$x = str_count(mydata3$Name, "x")
mydata3$y = str_count(mydata3$Name, "y")
mydata3$z = str_count(mydata3$Name, "z")
I want to make a function for this in R. Please help
output:
Name Gender a b c d e f g h i j k l m n o p q r s t u v w x y z
1 emma F 1 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
2 olivia F 1 0 0 0 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0
3 ava F 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
4 sophia F 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0
5 isabella F 2 1 0 0 1 0 0 0 1 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
6 mia F 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
The letters will provide the lower case alphabets, using that we can easily create columns by looping through it using lapply
library(stringr)
mydata3[letters] <- lapply(letters, function(x) str_count(mydata3$Name, x))
Another option would be do split the 'Name' column, get the frequency with table
cbind(mydata3, as.data.frame.matrix(table(transform(stack(setNames(strsplit(mydata3$Name,
""), seq_len(nrow(mydata3))))[2:1] , values = factor(values, levels = letters)))))
# Name Gender a b c d e f g h i j k l m n o p q r s t u v w x y z
#1 emma F 1 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
#2 olivia F 1 0 0 0 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0
#3 ava F 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
#4 sophia F 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0
#5 isabella F 2 1 0 0 1 0 0 0 1 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
#6 mia F 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
data
mydata3 <- structure(list(Name = c("emma", "olivia", "ava", "sophia", "isabella",
"mia"), Gender = c("F", "F", "F", "F", "F", "F")), .Names = c("Name",
"Gender"), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")

Keep sorting of data.frame when using acast

When I use acastin R, the sorting of my data frame gets messed up. Imagine my data.frame looks like this
V1 V2 V3
1 D Y 0
2 E X 0
3 C N 0
4 B M 0
5 A S 0
doing acast(dd, V1 ~ V2, value.var="V3", fill = 0) will result in an ordered matrix, e.g.
M N S X Y
A 0 0 0 0 0
B 0 0 0 0 0
C 0 0 0 0 0
D 0 0 0 0 0
E 0 0 0 0 0
How do I keep the original sorting of the data frame?
Make V1 and V2 into factors, and when you do so, make their levels the order you want. The default ordering when making factors is to sort them, which is why you got the order you did the first time.
d <- data.frame(V1=c("D", "E", "C", "B", "A"), V2=c("Y","X","N","M","S"), V3=0)
d$V1 <- factor(d$V1, levels=unique(d$V1))
d$V2 <- factor(d$V2, levels=unique(d$V2))
> acast(d, V1~V2, value.var="V3", fun.aggregate=sum)
Y X N M S
D 0 0 0 0 0
E 0 0 0 0 0
C 0 0 0 0 0
B 0 0 0 0 0
A 0 0 0 0 0
You can do something like this :
m <- acast(dd, V1 ~ V2, value.var="V3", fill = 0)
m[dd$V1,dd$V2]
Which gives :
Y X N M S
D 0 0 0 0 0
E 0 0 0 0 0
C 0 0 0 0 0
B 0 0 0 0 0
A 0 0 0 0 0

Resources