Adding columns in data frame as a-z (26) new columns - r

Each column will read the Name from the Name column and add the frequency of the alphabet in the corresponding column.
I want to add 26 new columns to my data frame. I am using
mydata3$a = str_count(mydata3$Name, "a")
mydata3$b = str_count(mydata3$Name, "b")
mydata3$c = str_count(mydata3$Name, "c")
mydata3$d = str_count(mydata3$Name, "d")
mydata3$e = str_count(mydata3$Name, "e")
mydata3$f = str_count(mydata3$Name, "f")
mydata3$g = str_count(mydata3$Name, "g")
mydata3$h = str_count(mydata3$Name, "h")
mydata3$i = str_count(mydata3$Name, "i")
mydata3$j = str_count(mydata3$Name, "j")
mydata3$k = str_count(mydata3$Name, "k")
mydata3$l = str_count(mydata3$Name, "l")
mydata3$m = str_count(mydata3$Name, "m")
mydata3$n = str_count(mydata3$Name, "n")
mydata3$o = str_count(mydata3$Name, "o")
mydata3$p = str_count(mydata3$Name, "p")
mydata3$q = str_count(mydata3$Name, "q")
mydata3$r = str_count(mydata3$Name, "r")
mydata3$s = str_count(mydata3$Name, "s")
mydata3$t = str_count(mydata3$Name, "t")
mydata3$u = str_count(mydata3$Name, "u")
mydata3$v = str_count(mydata3$Name, "v")
mydata3$w = str_count(mydata3$Name, "w")
mydata3$x = str_count(mydata3$Name, "x")
mydata3$y = str_count(mydata3$Name, "y")
mydata3$z = str_count(mydata3$Name, "z")
I want to make a function for this in R. Please help
output:
Name Gender a b c d e f g h i j k l m n o p q r s t u v w x y z
1 emma F 1 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
2 olivia F 1 0 0 0 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0
3 ava F 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
4 sophia F 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0
5 isabella F 2 1 0 0 1 0 0 0 1 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
6 mia F 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0

The letters will provide the lower case alphabets, using that we can easily create columns by looping through it using lapply
library(stringr)
mydata3[letters] <- lapply(letters, function(x) str_count(mydata3$Name, x))
Another option would be do split the 'Name' column, get the frequency with table
cbind(mydata3, as.data.frame.matrix(table(transform(stack(setNames(strsplit(mydata3$Name,
""), seq_len(nrow(mydata3))))[2:1] , values = factor(values, levels = letters)))))
# Name Gender a b c d e f g h i j k l m n o p q r s t u v w x y z
#1 emma F 1 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
#2 olivia F 1 0 0 0 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0
#3 ava F 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
#4 sophia F 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0
#5 isabella F 2 1 0 0 1 0 0 0 1 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
#6 mia F 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
data
mydata3 <- structure(list(Name = c("emma", "olivia", "ava", "sophia", "isabella",
"mia"), Gender = c("F", "F", "F", "F", "F", "F")), .Names = c("Name",
"Gender"), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")

Related

Turning data.frame into adjacency matrix lists does not show values (in R)

I have a data.frame that has 4 columns (sender, receiver, year and value). I want to create a list which has for each year an adjacency matrix of the sender and the receiver containing the value.
A MVE would be
df = data.frame(sender = c("a","a","b","c","d","d","d","b","e","e"),
receiver = c("b","d","a","a","b","a","c","e","c","a"),
value = 1:10,
year= c(2000,2000,2001,2002,2002,2002,2003,2003,2003,2004))
Also I have a country list I need it to match with
country_list = data.frame(country = c("a","b","c","d","e")
What I have tried looks like this. My problem is that not the correct values are shown in the subsequent adjacency matrix.
transfer_list <- list()
for (t in 2000:2004){
matrix<-matrix(0,5,5)
rownames(matrix)<-country_list[1:5,1]
colnames(matrix)<-country_list[1:5,1]
year=which(df[,4]==t)
dyad=df[year,c(1,2)]
for (i in 1:dim(dyad)[1]){
partner1<-which(country_list[,1]==dyad[i,1])
partner2<-which(country_list[,1]==dyad[i,2])
matrix[partner1, partner2]<-df[i,3]
}
transfer_list[[t-1999]]=matrix
}
The result for 2004 should be a 10 for the transfer from e to a but is:
> transfer_list[[5]]
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 1 0 0 0 0
What is my error?
One approach might be the igraph package.
We can use igraph::graph_from_data_frame to create a graph for each year. We can include all of the possible vertices using the vertices = argument. Otherwise, only vertices with edges will be included.
Once we have created the weighted graph, we can use as_adj with attr = "value" to create the adjacency matrix with the values in the matrix. sparse = FALSE gets you 0s in the other positions.
library(igraph)
result <- lapply(unique(df$year), function(x) {
g <- graph_from_data_frame(df[df$year == x,-4],
vertices = unique(country_list))
as_adj(g, attr = "value", sparse = FALSE)})
names(result) <- unique(df$year)
result$`2004`
# a b c d e
#a 0 0 0 0 0
#b 0 0 0 0 0
#c 0 0 0 0 0
#d 0 0 0 0 0
#e 10 0 0 0 0
You can try the code below
library(igraph)
library(dplyr)
lapply(
split(df[setdiff(names(df), "year")], df$year),
function(x) {
x %>%
graph_from_data_frame(vertices = country_list$country) %>%
as_adj(attr = "value", sparse = FALSE)
}
)
which gives
$`2000`
a b c d e
a 0 1 0 2 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 0 0 0 0 0
$`2001`
a b c d e
a 0 0 0 0 0
b 3 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 0 0 0 0 0
$`2002`
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 4 0 0 0 0
d 6 5 0 0 0
e 0 0 0 0 0
$`2003`
a b c d e
a 0 0 0 0 0
b 0 0 0 0 8
c 0 0 0 0 0
d 0 0 7 0 0
e 0 0 9 0 0
$`2004`
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 10 0 0 0 0
Change this section:
dyad=df[year,c(1,2,3)]
for (i in 1:dim(dyad)[1]){
partner1<-which(country_list[,1]==dyad[i,1])
partner2<-which(country_list[,1]==dyad[i,2])
matrix[partner1, partner2]<- dyad[i,3]
}

Find most repeated sequence of values

Suppose I have a data frame that looks like this
ITEM
1 X
2 A
3 B
4 C
5 A
6 F
7 U
8 A
9 B
10 C
11 F
12 U
How can I obtain the most common sequence of values in the 'ITEM' column?. In this case the most frequent sequence would be A, B, C since it appears in row 2 to 4 and 8 to 10.
I have already tried the function rle, as well as some of the solutions found here, and I haven't been lucky. Can I have a suggestion, hint, or package recommendation?
I guess you want the longest non-overlapping sub-string. There's some good explanation about the dynamic programming solution here.
x = c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U")
n = length(x)
m1 = sapply(x, function(i) sapply(x, function(j) as.integer(i == j)))
diag(m1) = 0
m1[lower.tri(m1)] = 0
m1
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 1 0 0 0 0
# B 0 0 0 0 0 0 0 0 1 0 0 0
# C 0 0 0 0 0 0 0 0 0 1 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 1 0
# U 0 0 0 0 0 0 0 0 0 0 0 1
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0
m2 = m1
for (i in 2:nrow(m1)){
for (j in 2:nrow(m1)){
if (m1[i-1, j-1] == 1 & m1[i, j] == 1){
if (j - i > m2[i - 1, j - 1]){
m2[i, j] = m2[i - 1, j - 1] + m2[i, j]
m2[i - 1, j - 1] = 0
} else {
m2[i, j] = 0
}
}
}
}
m2
# X A B C A F U A B C F U
# X 0 0 0 0 0 0 0 0 0 0 0 0
# A 0 0 0 0 1 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 3 0 0
# A 0 0 0 0 0 0 0 1 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 2
# A 0 0 0 0 0 0 0 0 0 0 0 0
# B 0 0 0 0 0 0 0 0 0 0 0 0
# C 0 0 0 0 0 0 0 0 0 0 0 0
# F 0 0 0 0 0 0 0 0 0 0 0 0
# U 0 0 0 0 0 0 0 0 0 0 0 0
ans_len = max(m2)
inds = c(which(m2 == ans_len, arr.ind = TRUE)[,2])
lapply(inds, function(ind) x[(ind - ans_len + 1):ind])
# [[1]]
# [1] "A" "B" "C"
A tidyverse solution mixed with nested apply functions. The solution is generalized and will report the most frequent non-trivial consecutive sequence that appears at least twice--tie goes to the longer sequence.
library(tidyverse)
# Data
x <- data.frame(ITEM = c("X", "A", "B", "C", "A", "F", "U", "A", "B", "C", "F", "U"), stringsAsFactors = F)
# convert x to vector
y <- x$ITEM
# Create list to check for sequence of each length 2 through n/2
l <- lapply(2:floor(length(y)/2), function(a) sapply(1:a, function(x) y[(0 + x):(length(y) - a + x)])) %>%
lapply(as.data.frame) %>%
setNames(sapply(2:(length(.) + 1), function(a) paste0("Consecutive", a)))
# Show most frequent sequence(s), choosing the longest
lapply(1:length(l), function(x) (as.data.frame(table(do.call(paste, l[[x]])), stringsAsFactors = F) %>%
dplyr::mutate(length = nchar(Var1)) %>%
dplyr::filter(length == max(length) & Freq == max(Freq) & Freq > 1)) ) %>%
.[which(sapply(., nrow) > 0)] %>%
dplyr::bind_rows() %>%
dplyr::filter(Freq == max(Freq)) %>%
dplyr::filter(length == max(length)) %>%
dplyr::rename(Sequence = Var1) %>%
dplyr::select(-length)
# Sequence Freq
#1 A B C 2

Change values of matrix where row names equal column names

I am trying to change the values of a matrix so that, for each element where the row name equals the column name, the resulting matrix will have a value of one.
> z<-matrix(0, nrow=10, ncol=8)
> colnames(z)<-letters[1:8]
> rownames(z)<-c("f", "c", "a", "f", "a", "b", "f", "b", "h", "c")
> z
a b c d e f g h
f 0 0 0 0 0 0 0 0
c 0 0 0 0 0 0 0 0
a 0 0 0 0 0 0 0 0
f 0 0 0 0 0 0 0 0
a 0 0 0 0 0 0 0 0
b 0 0 0 0 0 0 0 0
f 0 0 0 0 0 0 0 0
b 0 0 0 0 0 0 0 0
h 0 0 0 0 0 0 0 0
c 0 0 0 0 0 0 0 0
z should be:
a b c d e f g h
f 0 0 0 0 0 1 0 0
c 0 0 1 0 0 0 0 0
a 1 0 0 0 0 0 0 0
f 0 0 0 0 0 1 0 0
a 1 0 0 0 0 0 0 0
b 0 1 0 0 0 0 0 0
f 0 0 0 0 0 1 0 0
b 0 1 0 0 0 0 0 0
h 0 0 0 0 0 0 0 1
c 0 0 1 0 0 0 0 0
I tried:
> z[unique(rownames(z)), unique(rownames(z))]<-1
> z
a b c d e f g h
f 1 1 1 0 0 1 0 1
c 1 1 1 0 0 1 0 1
a 1 1 1 0 0 1 0 1
f 0 0 0 0 0 0 0 0
a 0 0 0 0 0 0 0 0
b 1 1 1 0 0 1 0 1
f 0 0 0 0 0 0 0 0
b 0 0 0 0 0 0 0 0
h 1 1 1 0 0 1 0 1
c 0 0 0 0 0 0 0 0
and:
> z["a", "a"]<-1
> z
a b c d e f g h
f 0 0 0 0 0 0 0 0
c 0 0 0 0 0 0 0 0
a 1 0 0 0 0 0 0 0
f 0 0 0 0 0 0 0 0
a 0 0 0 0 0 0 0 0
b 0 0 0 0 0 0 0 0
f 0 0 0 0 0 0 0 0
b 0 0 0 0 0 0 0 0
h 0 0 0 0 0 0 0 0
c 0 0 0 0 0 0 0 0
but that only changed the first 'a' in the 'a' column.
You can also do this with base R using outer.
z[outer(rownames(z), colnames(z), "==")] <- 1
z
a b c d e f g h
f 0 0 0 0 0 1 0 0
c 0 0 1 0 0 0 0 0
a 1 0 0 0 0 0 0 0
f 0 0 0 0 0 1 0 0
a 1 0 0 0 0 0 0 0
b 0 1 0 0 0 0 0 0
f 0 0 0 0 0 1 0 0
b 0 1 0 0 0 0 0 0
h 0 0 0 0 0 0 0 1
c 0 0 1 0 0 0 0 0
Another option is (which is a modification of #akrun's 2nd option):
z[sapply(colnames(z), `==`, rownames(z))] <- 1
which also gives the correct answer:
> z
a b c d e f g h
f 0 0 0 0 0 1 0 0
c 0 0 1 0 0 0 0 0
a 1 0 0 0 0 0 0 0
f 0 0 0 0 0 1 0 0
a 1 0 0 0 0 0 0 0
b 0 1 0 0 0 0 0 0
f 0 0 0 0 0 1 0 0
b 0 1 0 0 0 0 0 0
h 0 0 0 0 0 0 0 1
c 0 0 1 0 0 0 0 0
The difference with #akrun's 'dimnames' solution is that in the above approach only the necessary spots are converted to 1 which is an advantage when the original matrix doesn't just contain zero's. This is also achieved by the 'outer'-option from #lmo and the 'cbind'-option of #akrun.
We can use row/column indexing to change the elements to 1
z[cbind(1:nrow(z), match( rownames(z), colnames(z)))] <- 1
z
# a b c d e f g h
#f 0 0 0 0 0 1 0 0
#c 0 0 1 0 0 0 0 0
#a 1 0 0 0 0 0 0 0
#f 0 0 0 0 0 1 0 0
#a 1 0 0 0 0 0 0 0
#b 0 1 0 0 0 0 0 0
#f 0 0 0 0 0 1 0 0
#b 0 1 0 0 0 0 0 0
#h 0 0 0 0 0 0 0 1
#c 0 0 1 0 0 0 0 0
Or another option is (should be slower for big datasets)
`dimnames<-`(+(sapply(colnames(z), `==`, rownames(z))), dimnames(z))
# a b c d e f g h
#f 0 0 0 0 0 1 0 0
#c 0 0 1 0 0 0 0 0
#a 1 0 0 0 0 0 0 0
#f 0 0 0 0 0 1 0 0
#a 1 0 0 0 0 0 0 0
#b 0 1 0 0 0 0 0 0
#f 0 0 0 0 0 1 0 0
#b 0 1 0 0 0 0 0 0
#h 0 0 0 0 0 0 0 1
#c 0 0 1 0 0 0 0 0
NOTE: BTW, both the solutions are base R only solutions and not came from some external packages.
Benchmarks
z1 <- matrix(0, 5000, 5000)
colnames(z1) <- 1:5000
set.seed(24)
row.names(z1) <- sample(1:5000, 5000, replace=TRUE)
z2 <- z1
z3 <- z1
z4 <- z1
system.time(z1[cbind(1:nrow(z1), match( rownames(z1), colnames(z1)))] <- 1)
# user system elapsed
# 0.03 0.08 0.11
system.time(z2[outer(rownames(z2), colnames(z2), "==")] <- 1)
# user system elapsed
# 0.67 0.16 0.83
identical(z1, z2)
#[1] TRUE
system.time( `dimnames<-`(+(sapply(colnames(z3), `==`, rownames(z3))), dimnames(z3)))
# user system elapsed
# 31.70 0.39 32.28
system.time(z3[vapply(colnames(z3), function(x) x== rownames(z3),
logical(nrow(z3)))] <- 1)
# user system elapsed
# 0.22 0.00 0.21
Testing with #Procrastinatus Maximus modification
system.time(z4[sapply(colnames(z4), `==`, rownames(z4))] <- 1)
# user system elapsed
# 28.42 0.36 28.85
By testing it on a 10000 x 10000 matrix, the timings are
system.time(z1[cbind(1:nrow(z1), match( rownames(z1), colnames(z1)))] <- 1)
# user system elapsed
# 0.12 0.32 0.44
system.time(z2[outer(rownames(z2), colnames(z2), "==")] <- 1)
# user system elapsed
# 2.72 0.86 3.58
and on 20000 X 20000 matrix
system.time(z1[cbind(1:nrow(z1), match( rownames(z1), colnames(z1)))] <- 1)
# user system elapsed
# 0.95 1.00 1.95
system.time(z2[outer(rownames(z2), colnames(z2), "==")] <- 1)
# user system elapsed
# 15.47 5.87 21.39

In an NxN matrix give value "True" to pairs from data frame

I have created a 5x5 matrix where rows and columns have the same names and a data frame with name pairs:
N <- 5
Names <- letters[1:N]
mat <- matrix(rep(0, N*N), nrow = N, ncol = N, dimnames = list(Names, Names))
a b c d e
a 0 0 0 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
e 0 0 0 0 0
The data frame then consist of different pairs:
col1 col2
1 a c
2 c b
3 d b
4 d e
How can I match these in so that col1 only refers to rows in my matrix and col2 only to columns? The above should compute to the following result:
a b c d e
a 0 0 1 0 0
b 0 0 0 0 0
c 0 1 0 0 0
d 0 1 0 0 1
e 0 0 0 0 0
You can use match to create a "key" of which combinations need to be replaced with 1, like this:
key <- vapply(seq_along(mydf),
function(x) match(mydf[[x]],
dimnames(mat)[[x]]),
numeric(nrow(mydf)))
Then, use matrix indexing to replace the relevant values.
mat[key] <- 1
mat
a b c d e
a 0 0 1 0 0
b 0 0 0 0 0
c 0 1 0 0 0
d 0 1 0 0 1
e 0 0 0 0 0
You could also do:
mat[as.matrix(d1)] <- 1
mat
# a b c d e
#a 0 0 1 0 0
#b 0 0 0 0 0
#c 0 1 0 0 0
#d 0 1 0 0 1
#e 0 0 0 0 0
data
d1 <- structure(list(col1 = c("a", "c", "d", "d"), col2 = c("c", "b",
"b", "e")), .Names = c("col1", "col2"), class = "data.frame",
row.names = c("1", "2", "3", "4"))

Keep sorting of data.frame when using acast

When I use acastin R, the sorting of my data frame gets messed up. Imagine my data.frame looks like this
V1 V2 V3
1 D Y 0
2 E X 0
3 C N 0
4 B M 0
5 A S 0
doing acast(dd, V1 ~ V2, value.var="V3", fill = 0) will result in an ordered matrix, e.g.
M N S X Y
A 0 0 0 0 0
B 0 0 0 0 0
C 0 0 0 0 0
D 0 0 0 0 0
E 0 0 0 0 0
How do I keep the original sorting of the data frame?
Make V1 and V2 into factors, and when you do so, make their levels the order you want. The default ordering when making factors is to sort them, which is why you got the order you did the first time.
d <- data.frame(V1=c("D", "E", "C", "B", "A"), V2=c("Y","X","N","M","S"), V3=0)
d$V1 <- factor(d$V1, levels=unique(d$V1))
d$V2 <- factor(d$V2, levels=unique(d$V2))
> acast(d, V1~V2, value.var="V3", fun.aggregate=sum)
Y X N M S
D 0 0 0 0 0
E 0 0 0 0 0
C 0 0 0 0 0
B 0 0 0 0 0
A 0 0 0 0 0
You can do something like this :
m <- acast(dd, V1 ~ V2, value.var="V3", fill = 0)
m[dd$V1,dd$V2]
Which gives :
Y X N M S
D 0 0 0 0 0
E 0 0 0 0 0
C 0 0 0 0 0
B 0 0 0 0 0
A 0 0 0 0 0

Resources