Separate data in R [duplicate] - r

I have a character vector like this:
a <- c("a,b,c", "a,b", "a,b,c,d")
What I would like to do is create a data frame where the individual letters in each string are represented by dummy columns:
a b c d
1] 1 1 1 0
2] 1 1 0 0
3] 1 1 1 1
I have a feeling that I need to be using some combination of read.table and reshape but am really struggling. Any and help appreciated.

You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table along with stack and strsplit:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

Another convoluted base-R solution:
x <- strsplit(a,",")
xl <- unique(unlist(x))
t(sapply(x,function(z)table(factor(z,levels=xl))))
which gives
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1

A base R - but longer solution:
el = unique(unlist(strsplit(a, ',')))
do.call(rbind, lapply(a, function(u) setNames(el %in% strsplit(u,',')[[1]]+0L, el))
# a b c d
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1

Another option is tstrsplit() from data.table:
library(data.table)
vapply(tstrsplit(a, ",", fixed = TRUE, fill = 0), ">", integer(length(a)), 0L)
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 1 1 0 0
# [3,] 1 1 1 1

After I wrote this I noticed that Colonel Beauvel's solution is quite similar but perhaps this is sufficiently distinct to be a separate solution. No packages are used.
First we split the character strings into a list of vectors, L and then we compute the union of them all, u. Finally we determine a binary vector for each list element and rbind them together, convert the result from logical to numeric using + 0 and set the column names.
L <- strsplit(a, ",")
u <- Reduce(union, L)
m <- do.call(rbind, lapply(L, `%in%`, x = u)) + 0
colnames(m) <- u
giving:
> m
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
Added The last two lines of code could be replaced by either of these:
do.call(rbind, lapply(lapply(L, factor, levels = u), table))
do.call(rbind, Map(function(x) sapply(u, `%in%`, x), L)) + 0

Unfortunately, base R does not offer a vectorized string matching function but the stringi package does.
library(stringi)
a=c("a,b,c", "a,b", "a,b,c,d")
1*outer(a,unique(unlist(strsplit(a,","))),stri_detect_regex)
# [,1] [,2] [,3] [,4]
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1

I've had a lot of success with dummy_cols within fastDummies which can take care of this fairly simply and can be specified by variable.
library(fastDummies)
a <- c("a,b,c", "a,b", "a,b,c,d")
a <- dummy_cols(a, split = ",")
outputs
# .data .data_a .data_b .data_c .data_d
# 1 a,b,c 1 1 1 0
# 2 a,b 1 1 0 0
# 3 a,b,c,d 1 1 1 1

Related

Count occurrence of a value within a data frame within the rows above it

I'm trying to find a way to create a matrix which counts values from each row of a data frame. I'd like it to recognise the values in each row of the data frame, and count how many times that value has occurred in all rows above the row the value occurs in (not the whole data frame).
The same value will never occur more than once in a single row of the data frame.
For example:
# df:
a b c
1 2 3
3 4 5
3 2 6
7 8 9
8 3 6
matrix result:
0 0 0 (none of the df values have occurred as there are no rows above)
1 0 0 (3 has occurred once above, the others have not occurred)
2 1 0 (3 has occurred twice above, 2 has occurred once above, 6 has not occurred)
0 0 0 (none of the df values have occurred in rows above)
1 3 1 (8 has occurred once, 3 has occurred 3 times, 6 has occurred once)
Here's one way:
# convert to a vector
x = as.vector(t(as.matrix(df)))
# get counts of each unique element (in the right place)
# and add them up
res = rowSums(sapply(unique(x), function(z) {
r = integer(length(x))
r[x == z] = 0:(sum(x == z) - 1)
return(r)
}))
# convert to matrix
res = matrix(res, ncol = ncol(df), byrow = T)
res
# [,1] [,2] [,3]
# [1,] 0 0 0
# [2,] 1 0 0
# [3,] 2 1 0
# [4,] 0 0 0
# [5,] 1 3 1
Using this data:
df = read.table(text = "
a b c
1 2 3
3 4 5
3 2 6
7 8 9
8 3 6", header = T)
Another...for fun
out<-matrix(1,nrow = nrow(df),ncol = ncol(df))
for(i in 1:nrow(df)){
out[i,]<-sapply(1:ncol(df),function(z) sum(unlist(df[0:(i-1),]) %in% df[i,z]))
}
out
[,1] [,2] [,3]
[1,] 0 0 0
[2,] 1 0 0
[3,] 2 1 0
[4,] 0 0 0
[5,] 1 3 1
Three other approaches:
1) with base R:
temp <- stack(df)[c(outer(c(0,5,10), 1:5, '+')),]
temp$val2 <- with(temp, ave(values, values, FUN = seq_along)) - 1
df2 <- unstack(temp, val2 ~ ind)
which gives:
> df2
a b c
1 0 0 0
2 1 0 0
3 2 1 0
4 0 0 0
5 1 3 1
2) with data.table:
library(data.table)
melt(setDT(df)[, r := .I],
id = 'r')[order(r), val2 := rowid(value) - 1
][, dcast(.SD, rowid(variable) ~ variable, value.var = 'val2')
][, variable := NULL][]
which gives the same result.
3) with the tidyverse:
library(dplyr)
library(tidyr)
df %>%
mutate(r = row_number()) %>%
gather(k, v, -4) %>%
arrange(r) %>%
group_by(v) %>%
mutate(v2 = row_number() - 1) %>%
ungroup() %>%
select(r, k, v2) %>%
spread(k, v2)
which, off course, also gives the same result.
Here is another solution:
df = read.table(text = "a b c
1 2 3
3 4 5
3 2 6
7 8 9
8 3 6", header = T)
elements = sort(unique(unlist(df)))
frequency = sapply(elements, # for each element
function(element) {apply(df == element, 1, sum)}) # Sum the number of occurances per row
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
# [1,] 1 1 1 0 0 0 0 0 0
# [2,] 0 0 1 1 1 0 0 0 0
# [3,] 0 1 1 0 0 1 0 0 0
# [4,] 0 0 0 0 0 0 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0
results = df
for(i in 1:nrow(df)){
for(j in 1:ncol(df))
results[i,j] = sum(frequency[1:i-1, # Sum the prevoius rows occurances
which(df[i,j] == elements)]) # Of the same element
}
# a b c
# 1 0 0 0
# 2 1 0 0
# 3 2 1 0
# 4 0 0 0
# 5 1 3 1
I know we're not supposed to comment with "thanks", but thank you to all. I've marked Brian's response as the most useful because I'm pretty new to R and his was the example I could follow all the way through without needing to look anything up. I'll have fun finding out about all the other ways and new (to me) functions / approaches you've kindly shared though.

How to Fill in Empty Matrix in R with Loop [duplicate]

This question already has answers here:
Reshape dataframe and create similarity matrix
(2 answers)
Closed 7 years ago.
I have a blank matrix called Trial that is 5000 X 5000, but i'll put a small snippet.
a b c d e f
a
b
c
d
e
f
and I want to fill the Matrix, with a Data Table I have.
Name Value
-----------
Cat A
Cat B
Cat E
Dog D
Dog C
Dog F
So basically in the end, I want the matrix to be filled like this:
a b c d e f
a 1 1 0 0 1 0
b 1 1 0 0 1 0
c 0 0 1 1 0 1
d 0 0 1 1 0 1
e 1 1 0 0 1 0
f 0 0 1 1 0 1
So all the values relating to the Name will be 1, and if they don't relate they will be 0. For example, A and F don't relate because they are different names (cat and dog), thus they will get a 0.
Here is a way with loops
dd <- read.table(header = TRUE, text="Name Value
Cat A
Cat B
Cat E
Dog D
Dog C
Dog F")
o <- order(dd$Value)
sapply(1:nrow(dd), function(x) dd$Name %in% dd[x, 'Name'] + 0L)[o, o]
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 1 1 0 0 1 0
# [2,] 1 1 0 0 1 0
# [3,] 0 0 1 1 0 1
# [4,] 0 0 1 1 0 1
# [5,] 1 1 0 0 1 0
# [6,] 0 0 1 1 0 1
or with an explicit for loop
mm <- matrix(nrow = nrow(dd), ncol = nrow(dd))
for (ii in 1:nrow(mm))
mm[ii, ] <- dd$Name %in% dd[ii, 'Name'] + 0L
mm[o, o]
For 5000 x 5000, takes less than 2 seconds on my crummy laptop
dd <- data.frame(Name = sample(LETTERS, 5000, replace = TRUE), Value = 1:5000)
o <- order(dd$Value)
system.time({
oo <- sapply(1:nrow(dd), function(x) dd$Name %in% dd[x, 'Name'] + 0L)[o, o]
})
# user system elapsed
# 1.680 0.188 1.874
system.time({
mm <- matrix(nrow = nrow(dd), ncol = nrow(dd))
for (ii in 1:nrow(mm))
mm[ii, ] <- dd$Name %in% dd[ii, 'Name'] + 0L
mm[o, o]
})
# user system elapsed
# 1.918 0.152 2.073

Split character column into several binary (0/1) columns

I have a character vector like this:
a <- c("a,b,c", "a,b", "a,b,c,d")
What I would like to do is create a data frame where the individual letters in each string are represented by dummy columns:
a b c d
1] 1 1 1 0
2] 1 1 0 0
3] 1 1 1 1
I have a feeling that I need to be using some combination of read.table and reshape but am really struggling. Any and help appreciated.
You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table along with stack and strsplit:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
Another convoluted base-R solution:
x <- strsplit(a,",")
xl <- unique(unlist(x))
t(sapply(x,function(z)table(factor(z,levels=xl))))
which gives
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
A base R - but longer solution:
el = unique(unlist(strsplit(a, ',')))
do.call(rbind, lapply(a, function(u) setNames(el %in% strsplit(u,',')[[1]]+0L, el))
# a b c d
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
Another option is tstrsplit() from data.table:
library(data.table)
vapply(tstrsplit(a, ",", fixed = TRUE, fill = 0), ">", integer(length(a)), 0L)
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 1 1 0 0
# [3,] 1 1 1 1
After I wrote this I noticed that Colonel Beauvel's solution is quite similar but perhaps this is sufficiently distinct to be a separate solution. No packages are used.
First we split the character strings into a list of vectors, L and then we compute the union of them all, u. Finally we determine a binary vector for each list element and rbind them together, convert the result from logical to numeric using + 0 and set the column names.
L <- strsplit(a, ",")
u <- Reduce(union, L)
m <- do.call(rbind, lapply(L, `%in%`, x = u)) + 0
colnames(m) <- u
giving:
> m
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
Added The last two lines of code could be replaced by either of these:
do.call(rbind, lapply(lapply(L, factor, levels = u), table))
do.call(rbind, Map(function(x) sapply(u, `%in%`, x), L)) + 0
Unfortunately, base R does not offer a vectorized string matching function but the stringi package does.
library(stringi)
a=c("a,b,c", "a,b", "a,b,c,d")
1*outer(a,unique(unlist(strsplit(a,","))),stri_detect_regex)
# [,1] [,2] [,3] [,4]
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
I've had a lot of success with dummy_cols within fastDummies which can take care of this fairly simply and can be specified by variable.
library(fastDummies)
a <- c("a,b,c", "a,b", "a,b,c,d")
a <- dummy_cols(a, split = ",")
outputs
# .data .data_a .data_b .data_c .data_d
# 1 a,b,c 1 1 1 0
# 2 a,b 1 1 0 0
# 3 a,b,c,d 1 1 1 1

combine tables into a data frame

How do I turn a list of tables into a data frame?
I have:
> (tabs <- list(table(c('a','a','b')),table(c('c','c','b')),table(c()),table(c('b','b'))))
[[1]]
a b
2 1
[[2]]
b c
1 2
[[3]]
< table of extent 0 >
[[4]]
b
2
I want:
> data.frame(a=c(2,0,0),b=c(1,1,2),c=c(0,2,0))
a b c
1 2 1 0
2 0 1 2
3 0 0 0
4 0 2 0
PS. Please do not assume that the tables were created by table calls! They were not!
c_names <- unique(unlist(sapply(tabs, names)))
df <- do.call(rbind, lapply(tabs, `[`, c_names))
colnames(df) <- c_names
df[is.na(df)] <- 0
This assumes the tables are one dimensional.
all.names <- unique(unlist(lapply(tabs, names)))
df <- as.data.frame(do.call(rbind,
lapply(
tabs, function(x) as.list(replace(c(x)[all.names], is.na(c(x)[all.names]), 0))
) ) )
names(df) <- all.names
df
There is probably a cleaner way to do this.
# a b c
# 1 2 1 0
# 2 0 1 2
# 3 0 0 0
# 4 0 2 0
tabs <- list(table(c('a','a','b')),table(c('c','c','b')),table(c()),table(c('b','b')))
dat.names <- unique(unlist(sapply(tabs, names)))
dat <- matrix(0, nrow = length(tabs), ncol = length(dat.names))
colnames(dat) <- dat.names
for (ii in 1:length(tabs)) {
dat[ii, ] <- tabs[[ii]][match(colnames(dat), names(tabs[[ii]]) )]
}
dat[is.na(dat)] <- 0
> dat
a b c
[1,] 2 1 0
[2,] 0 1 2
[3,] 0 0 0
[4,] 0 2 0
Here is a pretty clean approach:
library(reshape2)
newTabs <- melt(tabs)
newTabs
# Var1 value L1
# 1 a 2 1
# 2 b 1 1
# 3 b 1 2
# 4 c 2 2
# 5 b 2 4
newTabs$L1 <- factor(newTabs$L1, seq_along(tabs))
dcast(newTabs, L1 ~ Var1, fill = 0, drop = FALSE)
# L1 a b c
# 1 1 2 1 0
# 2 2 0 1 2
# 3 3 0 0 0
# 4 4 0 2 0
This makes use of the fact that there is a melt method for lists (see reshape2:::melt.list) which automatically adds in a variable (L1 for an unnested list) that identifies the index of the list element. Since your list has some items which are empty, they won't show up in your melted list, so you need to factor the "L1" column, specifying the levels you want. dcast takes care of restructuring your output and allows you to specify the desired fill value.

Optimizing a "set in a string list" to a "set as a matrix" operation

I have a set of strings which contain space-separated elements. I want to build a matrix which will tell me which elements were part of which strings. For example:
""
"A B C"
"D"
"B D"
Should give something like:
A B C D
1
2 1 1 1
3 1
4 1 1
Now I've got a solution, but it runs slow as molasse, and I've run out of ideas on how to make it faster:
reverseIn <- function(vector, value) {
return(value %in% vector)
}
buildCategoryMatrix <- function(valueVector) {
allClasses <- c()
for(classVec in unique(valueVector)) {
allClasses <- unique(c(allClasses,
strsplit(classVec, " ", fixed=TRUE)[[1]]))
}
resMatrix <- matrix(ncol=0, nrow=length(valueVector))
splitValues <- strsplit(valueVector, " ", fixed=TRUE)
for(cat in allClasses) {
if(cat=="") {
catIsPart <- (valueVector == "")
} else {
catIsPart <- sapply(splitValues, reverseIn, cat)
}
resMatrix <- cbind(resMatrix, catIsPart)
}
colnames(resMatrix) <- allClasses
return(resMatrix)
}
Profiling the function gives me this:
$by.self
self.time self.pct total.time total.pct
"match" 31.20 34.74 31.24 34.79
"FUN" 30.26 33.70 74.30 82.74
"lapply" 13.56 15.10 87.86 97.84
"%in%" 12.92 14.39 44.10 49.11
So my actual questions would be:
- Where are the 33% spent in "FUN" coming from?
- Would there be any way to speed up the %in% call?
I tried turning the strings into factors prior to going into the loop so that I'd be matching numbers instead of strings, but that actually makes R crash. I've also tried going for partial matrix assignment (IE, resMatrix[i,x] <- 1) where i is the number of the string and x is the vector of factors. No dice there either, as it seems to keep on running infinitely.
In the development version of my "splitstackshape" package, there's a helper function called charBinaryMat that can be used for something like this:
Here's the function (since the version of the package on CRAN doesn't have it yet):
charBinaryMat <- function(listOfValues, fill = NA) {
lev <- sort(unique(unlist(listOfValues, use.names = FALSE)))
m <- matrix(fill, nrow = length(listOfValues), ncol = length(lev))
colnames(m) <- lev
for (i in 1:nrow(m)) {
m[i, listOfValues[[i]]] <- 1
}
m
}
The input is expected to be the output of strsplit:
And here it is in use:
str <- c("" , "A B C" , "D" , "B D" )
## Fill is `NA` by default
charBinaryMat(strsplit(str, " ", fixed=TRUE))
# A B C D
# [1,] NA NA NA NA
# [2,] 1 1 1 NA
# [3,] NA NA NA 1
# [4,] NA 1 NA 1
## Can easily be set to another value
charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
# A B C D
# [1,] 0 0 0 0
# [2,] 1 1 1 0
# [3,] 0 0 0 1
# [4,] 0 1 0 1
Benchmarking
Since your question is about a faster approach, let's benchmark.
The functions for benchmarking:
CBM <- function() {
charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
}
BCM <- function() {
buildCategoryMatrix(str)*1L
}
Sapply <- function() {
y <- unique( unlist( strsplit( str , " " ) ) )
out <- t(sapply(str, function(x) y %in% unlist(strsplit(x , " " )),
USE.NAMES = FALSE )) * 1L
colnames(out) <- y
out
}
Some sample data:
set.seed(1)
A = sample(10, 100000, replace = TRUE)
str <- sapply(seq_along(A), function(x)
paste(sample(LETTERS[1:10], A[x]), collapse = " "))
head(str)
# [1] "H G C" "F H J G" "H D J A I B"
# [4] "A C F H J B E G D I" "F C H" "I C G B J D F A E"
Some sample output:
## Automatically sorted
head(CBM())
# A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1
## Sorting just for comparison
head(BCM())[, LETTERS[1:10]]
# A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1
## Sorting just for comparison
head(Sapply())[, LETTERS[1:10]]
# A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1
Benchmarking:
library(microbenchmark)
microbenchmark(CBM(), BCM(), Sapply(), times=20)
# Unit: milliseconds
# expr min lq median uq max neval
# CBM() 675.0929 718.3454 777.2423 805.3872 858.6609 20
# BCM() 11059.6305 11267.9888 11367.3283 11595.1758 11792.5950 20
# Sapply() 3536.7755 3687.0308 3759.7388 3813.4233 3968.3192 20
This is pretty easy to do with vapply:
x <- c("" , "A B C" , "D" , "B D" )
lines <- strsplit(x, " ", fixed = TRUE)
all <- sort(unique(unlist(lines)))
t(vapply(lines, function(x) all %in% x, numeric(length(all))))
This is a little slower than #Ananda's approach: https://gist.github.com/hadley/7169138
Here's one way of doing this. There is a lot going on in the line where out is assigned. Basically, we loop over each element of your input vector. We split each element into individual characters, then we look to see which of these is present in a vector of all the unique values in your dataset. This returns either TRUE or FALSE. We use * 1L at the end to turn logical values into integer but you could just wrap the whole thing in as.integer instead. sapply returns the results column-wise but you want them row-wise so we use the transpose function t() to achieve this.
The final line converts to a data.frame and applies column names.
# Data
str <- c("" , "A B C" , "D" , "B D" )
# Unique column headers (excluding empty strings as in example)
y <- unique( unlist( strsplit( str , " " ) ) )
# Results
out <- t( sapply( str , function(x) y %in% unlist( strsplit( x , " " ) ) , USE.NAMES = FALSE ) ) * 1L
# Combine to a data.frame
setNames( data.frame( out ) , y )
# A B C D
#1 0 0 0 0
#2 1 1 1 0
#3 0 0 0 1
#4 0 1 0 1

Resources