create matrix based on another matrix contain index of element - r

I have a matrix that contains indices of column ides to the element of matrix I want create it
> index
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 1 2 5
[3,] 1 3 4
[4,] 1 3 5
[5,] 1 4 5
[6,] 2 3 5
[7,] 3 4 5
example first row have the column id 1 ,2 , 3 that set to it value 1
second row have the column id 1 , 2 , 5 that set to it value 1
now I want to create the following matrix:
a1 a2 a3 a4 a5
[1,] 1 1 1 0 0
[2,] 1 1 0 0 1
[3,] 1 0 1 1 0
[4,] 1 0 1 0 1
[5,] 1 0 0 1 1
[6,] 0 1 1 0 1
[7,] 0 0 1 1 1
Data
index <- rbind(c(1,2,3), c(1,2,5), c(1,3,4), c(1,3,5), c(1,4,5), c(2,3,5), c(3,4,5))

Here is an extremely fast and efficient base R method mentioned in the comments using matrix indexing.
# construct 0 matrix with correct dimensions
newMat <- matrix(0L, nrow(myMat), max(myMat))
# fill in matrix using matrix indexing
newMat[cbind(c(row(myMat)), c(myMat))] <- 1L
This returns
newMat
[,1] [,2] [,3] [,4] [,5]
[1,] 1 1 1 0 0
[2,] 1 1 0 0 1
[3,] 1 0 1 1 0
[4,] 1 0 1 0 1
[5,] 1 0 0 1 1
[6,] 0 1 1 0 1
[7,] 0 0 1 1 1
data
myMat <-
structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 3L, 3L, 4L, 3L,
4L, 3L, 5L, 4L, 5L, 5L, 5L, 5L), .Dim = c(7L, 3L))

Here is a solution in base R:
# Your sample matrix
m.idx <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), ncol = 3);
# Construct empty matrix of the right dims
m.val <- matrix(0, nrow = nrow(m.idx), ncol = max(m.idx));
for (i in 1:nrow(m.idx)) m.val[i, m.idx[i, ]] <- 1;
m.val;
# [,1] [,2] [,3] [,4] [,5]
#[1,] 1 1 1 0 0
#[2,] 1 1 0 0 1
#[3,] 1 0 1 1 0
#[4,] 1 0 1 0 1
#[5,] 1 0 0 1 1
#[6,] 0 1 1 0 1
#[7,] 0 0 1 1 1
Update
Please see below for benchmarking results of all methods presented here. I've wrapped all methods inside functions
# The original matrix with indices
m.idx <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), ncol = 3);
# For loop method
method.for_loop <- function(m) {
m.val <- matrix(0, nrow = nrow(m), ncol = max(m));
for (i in 1:nrow(m.idx)) m.val[i, m.idx[i, ]] <- 1;
return(m.val);
}
# lapply method (#Headpoint)
method.lapply <- function(m) {
m.val <- as.data.frame(matrix(0, nrow = nrow(m), ncol = max(m)));
invisible(lapply(1:nrow(m),
function(x) m.val[x,][m[x,]] <<- 1));
return(m.val);
}
# Direct indexing method (#lmo)
method.indexing <- function(m) {
m.val <- matrix(0L, nrow(m.idx), max(m.idx));
m.val[cbind(c(row(m.idx)), c(m.idx))] <- 1L;
return(m.val);
}
# tidyr/dplyr method (#CPak)
method.dplyr_tidyr <- function(m) {
as.data.frame(m) %>%
gather() %>% # wide-to-long format
group_by(key) %>%
mutate(rn = row_number()) %>% # add unique row_id per `key` group
mutate(newval = 1) %>% # fill in `existing` with this value
ungroup() %>% # ungroup and unselect `key` group
select(-key) %>%
spread(value, newval, fill=0) %>% # long-to-wide format
# fill in `non-existing` with `0`
select(-rn) %>% # unselect row_id column
rename_all(funs(paste0("a", .))) # rename columns
}
The results from microbenchmark are given below.
library(microbenchmark);
library(tidyr);
library(dplyr);
library(magrittr);
res <- microbenchmark(
for_loop = method.for_loop(m.idx),
lapply = method.lapply(m.idx),
indexing = method.indexing(m.idx),
dplyr_tidyr = method.dplyr_tidyr(m.idx),
times = 1000L
)
print(res);
# Unit: microseconds
# expr min lq mean median uq max
# for_loop 6.796 9.5405 16.89643 13.497 20.445 96.537
# lapply 1315.765 1441.5990 1696.74392 1518.256 1675.027 66181.880
# indexing 5.695 8.1450 20.49116 14.918 20.094 3139.946
# dplyr_tidyr 18777.669 20525.8095 22225.51936 21647.120 23215.714 84791.858
Conclusion: The methods using a for loop or direct indexing are tied and the fastest. lapply is second, the tidyr/dplyr method the slowest (but note the large increases in runtime).

No for loop, but didn't really check if it is faster.
index <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5),
ncol = 3)
df <- as.data.frame(matrix(0, nrow = 7, ncol = 5))
invisible(lapply(1:nrow(index),
function(x) df[x,][index[x,]] <<- 1))
df
# V1 V2 V3 V4 V5
# 1 1 1 1 0 0
# 2 1 1 0 0 1
# 3 1 0 1 1 0
# 4 1 0 1 0 1
# 5 1 0 0 1 1
# 6 0 1 1 0 1
# 7 0 0 1 1 1

You can do this with a combination of dplyr and tidyr
Your data
df <- read.table(text="1 2 3
1 2 5
1 3 4
1 3 5
1 4 5
2 3 5
3 4 5", header=FALSE)
Solution: some of these steps are to clean up the output
df %>%
gather() %>% # wide-to-long format
group_by(key) %>%
mutate(rn = row_number()) %>% # add unique row_id per `key` group
mutate(newval = 1) %>% # fill in `existing` with this value
ungroup() %>% # ungroup and unselect `key` group
select(-key) %>%
spread(value, newval, fill=0) %>% # long-to-wide format
# fill in `non-existing` with `0`
select(-rn) %>% # unselect row_id column
rename_all(funs(paste0("a", .))) # rename columns
Output
# A tibble: 7 x 5
a1 a2 a3 a4 a5
* <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 0
2 1 1 0 0 1
3 1 0 1 1 0
4 1 0 1 0 1
5 1 0 0 1 1
6 0 1 1 0 1
7 0 0 1 1 1

Related

Separate data in R [duplicate]

I have a character vector like this:
a <- c("a,b,c", "a,b", "a,b,c,d")
What I would like to do is create a data frame where the individual letters in each string are represented by dummy columns:
a b c d
1] 1 1 1 0
2] 1 1 0 0
3] 1 1 1 1
I have a feeling that I need to be using some combination of read.table and reshape but am really struggling. Any and help appreciated.
You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table along with stack and strsplit:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
Another convoluted base-R solution:
x <- strsplit(a,",")
xl <- unique(unlist(x))
t(sapply(x,function(z)table(factor(z,levels=xl))))
which gives
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
A base R - but longer solution:
el = unique(unlist(strsplit(a, ',')))
do.call(rbind, lapply(a, function(u) setNames(el %in% strsplit(u,',')[[1]]+0L, el))
# a b c d
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
Another option is tstrsplit() from data.table:
library(data.table)
vapply(tstrsplit(a, ",", fixed = TRUE, fill = 0), ">", integer(length(a)), 0L)
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 1 1 0 0
# [3,] 1 1 1 1
After I wrote this I noticed that Colonel Beauvel's solution is quite similar but perhaps this is sufficiently distinct to be a separate solution. No packages are used.
First we split the character strings into a list of vectors, L and then we compute the union of them all, u. Finally we determine a binary vector for each list element and rbind them together, convert the result from logical to numeric using + 0 and set the column names.
L <- strsplit(a, ",")
u <- Reduce(union, L)
m <- do.call(rbind, lapply(L, `%in%`, x = u)) + 0
colnames(m) <- u
giving:
> m
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
Added The last two lines of code could be replaced by either of these:
do.call(rbind, lapply(lapply(L, factor, levels = u), table))
do.call(rbind, Map(function(x) sapply(u, `%in%`, x), L)) + 0
Unfortunately, base R does not offer a vectorized string matching function but the stringi package does.
library(stringi)
a=c("a,b,c", "a,b", "a,b,c,d")
1*outer(a,unique(unlist(strsplit(a,","))),stri_detect_regex)
# [,1] [,2] [,3] [,4]
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
I've had a lot of success with dummy_cols within fastDummies which can take care of this fairly simply and can be specified by variable.
library(fastDummies)
a <- c("a,b,c", "a,b", "a,b,c,d")
a <- dummy_cols(a, split = ",")
outputs
# .data .data_a .data_b .data_c .data_d
# 1 a,b,c 1 1 1 0
# 2 a,b 1 1 0 0
# 3 a,b,c,d 1 1 1 1

Rearrange and Sort

I have the following data
ID v1 v2 v3 v4 v5
1 1 3 6 4
2 4 2
3 3 1 8 5
4 2 5 3 1
Can I rearrange the data so that it will automatically create new columns and assign binary value (1 or 0) according to the value in each variable (v1 to v5)?
E.g. In first row, I have values of 1,3,4 and 6. Can R automatically create 6 dummy variables to have assign the value to the respective column as below:
ID dummy1 dummy2 dummy3 dummy4 dummy5 dummy6
1 1 0 1 1 0 1
To have something like this:
ID c1 c2 c3 c4 c5 c6 c7 c8
1 1 0 1 1 0 1 0 0
2 0 1 0 1 0 0 0 0
3 1 0 1 0 1 0 0 1
4 1 1 1 0 1 0 0 0
Thanks.
We can use base R to do this. Loop through the rows of the dataset except the first column, get the sequence of max value in the row, check how many of these are in the row and convert it to integer with as.integer, append NAs at the end to make the lengths same in the list output and cbind with the first column
lst <- apply(df[-1], 1, function(x) as.integer(seq_len(max(x, na.rm = TRUE)) %in% x))
res <- cbind(df[1], do.call(rbind, lapply(lst, `length<-`, max(lengths(lst)))))
res[is.na(res)] <- 0
colnames(res)[-1] <- paste0('c', 1:8)
res
# ID c1 c2 c3 c4 c5 c6 c7 c8
#1 1 1 0 1 1 0 1 0 0
#2 2 0 1 0 1 0 0 0 0
#3 3 1 0 1 0 1 0 0 1
#4 4 1 1 1 0 1 0 0 0
In base R, you can use:
table(transform(cbind(mydf[1], stack(mydf[-1]))[1:2], values = factor(values, 1:8)))
## values
## ID 1 2 3 4 5 6 7 8
## 1 1 0 1 1 0 1 0 0
## 2 0 1 0 1 0 0 0 0
## 3 1 0 1 0 1 0 0 1
## 4 1 1 1 0 1 0 0 0
Note that you need to convert the stacked values to factor if you want the "7" to be included in the output. This applies to the "data.table" and "tidyverse" approaches as well.
Alternatively, you can try the following with "data.table":
library(data.table)
melt(as.data.table(mydf), "ID", na.rm = TRUE)[
, dcast(.SD, ID ~ factor(value, 1:8), fun = length, drop = FALSE)]
Or the following with the "tidyverse":
library(tidyverse)
mydf %>%
gather(var, val, -ID, na.rm = TRUE) %>%
select(-var) %>%
mutate(var = 1, val = factor(val, 1:8)) %>%
spread(val, var, fill = 0, drop = FALSE)
Sample data:
mydf <- structure(list(ID = 1:4, v1 = c(1L, 4L, 3L, 2L), v2 = c(3L, 2L,
1L, 5L), v3 = c(6L, NA, 8L, 3L), v4 = c(4L, NA, 5L, 1L), v5 = c(NA,
NA, NA, NA)), .Names = c("ID", "v1", "v2", "v3", "v4", "v5"), row.names = c(NA,
4L), class = "data.frame")
If automation is important, you can also use syntax like factor(value, sequence(max(value)) in the "data.table" approach or val = factor(val, sequence(max(val)))) in the "tidyverse" approach.
Another base R answer with some similarities to akrun's is
# create matrix of values
myMat <- as.matrix(dat[-1])
# create result matrix of desired shape, filled with 0s
res <- matrix(0L, nrow(dat), ncol=max(myMat, na.rm=TRUE))
# use matrix indexing to fill in 1s
res[cbind(dat$ID, as.vector(myMat))] <- 1L
# convert to data.frame, add ID column, and provide variable names
setNames(data.frame(cbind(dat$ID, res)), c("ID", paste0("c", 1:8)))
which returns
ID c1 c2 c3 c4 c5 c6 c7 c8
1 1 1 0 1 1 0 1 0 0
2 2 0 1 0 1 0 0 0 0
3 3 1 0 1 0 1 0 0 1
4 4 1 1 1 0 1 0 0 0

Count occurrence of a value within a data frame within the rows above it

I'm trying to find a way to create a matrix which counts values from each row of a data frame. I'd like it to recognise the values in each row of the data frame, and count how many times that value has occurred in all rows above the row the value occurs in (not the whole data frame).
The same value will never occur more than once in a single row of the data frame.
For example:
# df:
a b c
1 2 3
3 4 5
3 2 6
7 8 9
8 3 6
matrix result:
0 0 0 (none of the df values have occurred as there are no rows above)
1 0 0 (3 has occurred once above, the others have not occurred)
2 1 0 (3 has occurred twice above, 2 has occurred once above, 6 has not occurred)
0 0 0 (none of the df values have occurred in rows above)
1 3 1 (8 has occurred once, 3 has occurred 3 times, 6 has occurred once)
Here's one way:
# convert to a vector
x = as.vector(t(as.matrix(df)))
# get counts of each unique element (in the right place)
# and add them up
res = rowSums(sapply(unique(x), function(z) {
r = integer(length(x))
r[x == z] = 0:(sum(x == z) - 1)
return(r)
}))
# convert to matrix
res = matrix(res, ncol = ncol(df), byrow = T)
res
# [,1] [,2] [,3]
# [1,] 0 0 0
# [2,] 1 0 0
# [3,] 2 1 0
# [4,] 0 0 0
# [5,] 1 3 1
Using this data:
df = read.table(text = "
a b c
1 2 3
3 4 5
3 2 6
7 8 9
8 3 6", header = T)
Another...for fun
out<-matrix(1,nrow = nrow(df),ncol = ncol(df))
for(i in 1:nrow(df)){
out[i,]<-sapply(1:ncol(df),function(z) sum(unlist(df[0:(i-1),]) %in% df[i,z]))
}
out
[,1] [,2] [,3]
[1,] 0 0 0
[2,] 1 0 0
[3,] 2 1 0
[4,] 0 0 0
[5,] 1 3 1
Three other approaches:
1) with base R:
temp <- stack(df)[c(outer(c(0,5,10), 1:5, '+')),]
temp$val2 <- with(temp, ave(values, values, FUN = seq_along)) - 1
df2 <- unstack(temp, val2 ~ ind)
which gives:
> df2
a b c
1 0 0 0
2 1 0 0
3 2 1 0
4 0 0 0
5 1 3 1
2) with data.table:
library(data.table)
melt(setDT(df)[, r := .I],
id = 'r')[order(r), val2 := rowid(value) - 1
][, dcast(.SD, rowid(variable) ~ variable, value.var = 'val2')
][, variable := NULL][]
which gives the same result.
3) with the tidyverse:
library(dplyr)
library(tidyr)
df %>%
mutate(r = row_number()) %>%
gather(k, v, -4) %>%
arrange(r) %>%
group_by(v) %>%
mutate(v2 = row_number() - 1) %>%
ungroup() %>%
select(r, k, v2) %>%
spread(k, v2)
which, off course, also gives the same result.
Here is another solution:
df = read.table(text = "a b c
1 2 3
3 4 5
3 2 6
7 8 9
8 3 6", header = T)
elements = sort(unique(unlist(df)))
frequency = sapply(elements, # for each element
function(element) {apply(df == element, 1, sum)}) # Sum the number of occurances per row
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
# [1,] 1 1 1 0 0 0 0 0 0
# [2,] 0 0 1 1 1 0 0 0 0
# [3,] 0 1 1 0 0 1 0 0 0
# [4,] 0 0 0 0 0 0 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0
results = df
for(i in 1:nrow(df)){
for(j in 1:ncol(df))
results[i,j] = sum(frequency[1:i-1, # Sum the prevoius rows occurances
which(df[i,j] == elements)]) # Of the same element
}
# a b c
# 1 0 0 0
# 2 1 0 0
# 3 2 1 0
# 4 0 0 0
# 5 1 3 1
I know we're not supposed to comment with "thanks", but thank you to all. I've marked Brian's response as the most useful because I'm pretty new to R and his was the example I could follow all the way through without needing to look anything up. I'll have fun finding out about all the other ways and new (to me) functions / approaches you've kindly shared though.

Split character column into several binary (0/1) columns

I have a character vector like this:
a <- c("a,b,c", "a,b", "a,b,c,d")
What I would like to do is create a data frame where the individual letters in each string are represented by dummy columns:
a b c d
1] 1 1 1 0
2] 1 1 0 0
3] 1 1 1 1
I have a feeling that I need to be using some combination of read.table and reshape but am really struggling. Any and help appreciated.
You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table along with stack and strsplit:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
Another convoluted base-R solution:
x <- strsplit(a,",")
xl <- unique(unlist(x))
t(sapply(x,function(z)table(factor(z,levels=xl))))
which gives
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
A base R - but longer solution:
el = unique(unlist(strsplit(a, ',')))
do.call(rbind, lapply(a, function(u) setNames(el %in% strsplit(u,',')[[1]]+0L, el))
# a b c d
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
Another option is tstrsplit() from data.table:
library(data.table)
vapply(tstrsplit(a, ",", fixed = TRUE, fill = 0), ">", integer(length(a)), 0L)
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 1 1 0 0
# [3,] 1 1 1 1
After I wrote this I noticed that Colonel Beauvel's solution is quite similar but perhaps this is sufficiently distinct to be a separate solution. No packages are used.
First we split the character strings into a list of vectors, L and then we compute the union of them all, u. Finally we determine a binary vector for each list element and rbind them together, convert the result from logical to numeric using + 0 and set the column names.
L <- strsplit(a, ",")
u <- Reduce(union, L)
m <- do.call(rbind, lapply(L, `%in%`, x = u)) + 0
colnames(m) <- u
giving:
> m
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
Added The last two lines of code could be replaced by either of these:
do.call(rbind, lapply(lapply(L, factor, levels = u), table))
do.call(rbind, Map(function(x) sapply(u, `%in%`, x), L)) + 0
Unfortunately, base R does not offer a vectorized string matching function but the stringi package does.
library(stringi)
a=c("a,b,c", "a,b", "a,b,c,d")
1*outer(a,unique(unlist(strsplit(a,","))),stri_detect_regex)
# [,1] [,2] [,3] [,4]
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
I've had a lot of success with dummy_cols within fastDummies which can take care of this fairly simply and can be specified by variable.
library(fastDummies)
a <- c("a,b,c", "a,b", "a,b,c,d")
a <- dummy_cols(a, split = ",")
outputs
# .data .data_a .data_b .data_c .data_d
# 1 a,b,c 1 1 1 0
# 2 a,b 1 1 0 0
# 3 a,b,c,d 1 1 1 1

Dummy variables from a string variable

I would like to create dummy variables form this dataset:
DF<-structure(list(A = c(1, 2, 3, 4, 5), B = c("1,3,2", "2,1,3,6",
"3,2,5,1,7", "3,7,4,2,6,5", "4,10,7,3,5,6")), .Names = c("A", "B"),
row.names = c(NA, 5L), class = "data.frame")
> DF
A B
1 1 1,3,2
2 2 2,1,3,6
3 3 3,2,5,1,7
4 4 3,7,4,2,6,5
5 5 4,10,7,3,5,6
Desired output shoud look like this:
A 1 2 3 4 5 6 7 8 9 10
1 1 1 1 0 0 0 0 0 0 0
2 1 1 1 0 0 1 0 0 0 0
3 1 1 1 0 1 0 1 0 0 0
4 0 1 1 1 1 1 1 0 0 0
5 0 0 1 1 1 1 1 0 0 1
Is there a efficient way to do such thing? I can use strsplit or ifelse. Original dataset is very large with many rows (>10k) and values in column B (>15k). Function dummy from package dummies don't work as I want to.
I also found simmilar case: Splitting one column into multiple columns. But the anwsers from the link above work really slow in my case (up to 15 minutes on my Dell i7-2630QM, 8Gb, Win7 64 bit, R 2.15.3 64bit).
Thank you in advance for your anwsers.
UPDATE
The function mentioned here has now been moved to a package available on CRAN called "splitstackshape". The version on CRAN is considerably faster than this original version. The speeds should be similar to what you would get with the direct for loop solution at the end of this answer. See #Ricardo's answer for detailed benchmarks.
Install it, and use concat.split.expanded to get the desired result:
library(splitstackshape)
concat.split.expanded(DF, "B", fill = 0, drop = TRUE)
# A B_01 B_02 B_03 B_04 B_05 B_06 B_07 B_08 B_09 B_10
# 1 1 1 1 1 0 0 0 0 0 0 0
# 2 2 1 1 1 0 0 1 0 0 0 0
# 3 3 1 1 1 0 1 0 1 0 0 0
# 4 4 0 1 1 1 1 1 1 0 0 0
# 5 5 0 0 1 1 1 1 1 0 0 1
Original post
A while ago, I had written a function to do not just this sort of splitting, but others. The function, named concat.split(), can be found here.
The usage, for your example data, would be:
## Keeping the original column
concat.split(DF, "B", structure="expanded")
# A B B_1 B_2 B_3 B_4 B_5 B_6 B_7 B_8 B_9 B_10
# 1 1 1,3,2 1 1 1 NA NA NA NA NA NA NA
# 2 2 2,1,3,6 1 1 1 NA NA 1 NA NA NA NA
# 3 3 3,2,5,1,7 1 1 1 NA 1 NA 1 NA NA NA
# 4 4 3,7,4,2,6,5 NA 1 1 1 1 1 1 NA NA NA
# 5 5 4,10,7,3,5,6 NA NA 1 1 1 1 1 NA NA 1
## Dropping the original column
concat.split(DF, "B", structure="expanded", drop.col=TRUE)
# A B_1 B_2 B_3 B_4 B_5 B_6 B_7 B_8 B_9 B_10
# 1 1 1 1 1 NA NA NA NA NA NA NA
# 2 2 1 1 1 NA NA 1 NA NA NA NA
# 3 3 1 1 1 NA 1 NA 1 NA NA NA
# 4 4 NA 1 1 1 1 1 1 NA NA NA
# 5 5 NA NA 1 1 1 1 1 NA NA 1
Recoding NA to 0 has to be done manually--perhaps I'll update the function to add an option to do so, and at the same time, implement one of these faster solutions :)
temp <- concat.split(DF, "B", structure="expanded", drop.col=TRUE)
temp[is.na(temp)] <- 0
temp
# A B_1 B_2 B_3 B_4 B_5 B_6 B_7 B_8 B_9 B_10
# 1 1 1 1 1 0 0 0 0 0 0 0
# 2 2 1 1 1 0 0 1 0 0 0 0
# 3 3 1 1 1 0 1 0 1 0 0 0
# 4 4 0 1 1 1 1 1 1 0 0 0
# 5 5 0 0 1 1 1 1 1 0 0 1
Update
Most of the overhead in the concat.split function probably comes in things like converting from a matrix to a data.frame, renaming the columns, and so on. The actual code used to do the splitting is a GASP for loop, but test it out, and you'll find that it performs pretty well:
b = strsplit(DF$B, ",")
ncol = max(as.numeric(unlist(b)))
temp = lapply(b, as.numeric)
## Set up an empty matrix
m = matrix(0, nrow = nrow(DF), ncol = ncol)
## Fill it in
for (i in 1:nrow(DF)) {
m[i, temp[[i]]] = 1
}
## View your result
m
Update:
Added benchmarks below
Update2: added bechmarks for #Anada's solution. WOW it's fast!!
Added benchmarks for an evern larger data set and #Anada's solution speeds ahead by a larger margin. '
Original Answer:
As you can see below, KnownMax and UnknownMax are outperforming even the data.table solution. Although, I suspect that if there were 10e6+ rows, then the data.table solution would be fastest. (feel free to benchmark it by simply modifying the parameters at the very bottom of this post)
Solution 1: KnownMax
If you know the maximum value in B, then you have a nice, two-liner:
maximum <- 10
results <- t(sapply(strsplit(DF$B, ","), `%in%`, x=1:maximum)) + 0
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,] 1 1 1 0 0 0 0 0 0 0
# [2,] 1 1 1 0 0 1 0 0 0 0
# [3,] 1 1 1 0 1 0 1 0 0 0
# [4,] 0 1 1 1 1 1 1 0 0 0
# [5,] 0 0 1 1 1 1 1 0 0 1
Three lines, if you want to name the columns and rows:
dimnames(results) <- list(seq(nrow(results)), seq(ncol(results)))
Solution 2: UnknownMax
# if you do not know the maximum ahead of time:
splat <- strsplit(DF$B, ",")
maximum <- max(as.numeric(unlist(splat)))
t(sapply(splat, `%in%`, x=1:maximum)) + 0
Solution 3: DT
As per #dickoa's request, here is an option with data.table. '
DT <- data.table(DF)
DT.long <- DT[, list(vals=as.numeric(unlist(strsplit(B, ",")))), by=A]
cols <- DT.long[, max(vals)]
rows <- DT.long[, max(A)]
matrix(as.numeric(DT.long[, (1:cols) %in% vals, by=A]$V1), ncol=cols,
byrow=TRUE, dimnames=list(seq(rows), seq(cols)))
# 1 2 3 4 5 6 7 8 9 10
# 1 1 1 1 0 0 0 0 0 0 0
# 2 1 1 1 0 0 1 0 0 0 0
# 3 1 1 1 0 1 0 1 0 0 0
# 4 0 1 1 1 1 1 1 0 0 0
# 5 0 0 1 1 1 1 1 0 0 1
Similar setup can be done in base R as well
===
Here are some benchmarks with slightly larger data:
microbenchmark(KnownMax = eval(KnownMax), UnknownMax = eval(UnknownMax),
DT.withAssign = eval(DT.withAssign),
DT.withOutAssign = eval(DT.withOutAssign),
lapply.Dickoa = eval(lapply.Dickoa), apply.SimonO101 = eval(apply.SimonO101),
forLoop.Ananda = eval(forLoop.Ananda), times=50L)
Using the OP data.frame, where the result is 5 x 10
Unit: microseconds
expr min lq median uq max neval
KnownMax 106.556 114.692 122.4915 129.406 6427.521 50
UnknownMax 114.470 122.561 128.9780 136.384 158.346 50
DT.withAssign 3000.777 3099.729 3198.8175 3291.284 10415.315 50
DT.withOutAssign 2637.023 2739.930 2814.0585 2903.904 9376.747 50
lapply.Dickoa 7031.791 7315.781 7438.6835 7634.647 14314.687 50
apply.SimonO101 430.350 465.074 487.9505 522.938 7568.442 50
forLoop.Ananda 81.415 91.027 99.7530 104.588 265.394 50
Using the slightly larger data.frame (below) where the results is 1000 x 100
removing lapply.Dickoa as my edit might have slowed it down and as it stood it crashed.
Unit: milliseconds
expr min lq median uq max neval
KnownMax 34.83210 35.59068 36.13330 38.15960 52.27746 50
UnknownMax 36.41766 37.17553 38.03075 47.71438 55.57009 50
DT.withAssign 31.95005 32.65798 33.73578 43.71493 50.05831 50
DT.withOutAssign 31.36063 32.08138 32.80728 35.32660 51.00037 50
apply.SimonO101 78.61677 91.72505 95.53592 103.36052 163.14346 50
forLoop.Ananda 13.61827 14.02197 14.18899 14.58777 26.42266 50
Even larger set where the results is 10,000 x 600
Unit: milliseconds
expr min lq median uq max neval
KnownMax 1583.5902 1631.6214 1658.6168 1724.9557 1902.3923 50
UnknownMax 1597.1215 1655.9634 1690.7550 1735.5913 1804.2156 50
DT.withAssign 586.4675 641.7206 660.7330 716.0100 1193.4806 50
DT.withOutAssign 587.0492 628.3731 666.3148 717.5575 776.2671 50
apply.SimonO101 1916.6589 1995.2851 2044.9553 2079.6754 2385.1028 50
forLoop.Ananda 163.4549 172.5627 182.6207 211.9153 315.0706 50
Using the following:
library(microbmenchmark)
library(data.table)
KnownMax <- quote(t(sapply(strsplit(DF$B, ","), `%in%`, x=1:maximum)) + 0)
UnknownMax <- quote({ splat <- strsplit(DF$B, ","); maximum <- max(as.numeric(unlist(splat))); t(sapply(splat, `%in%`, x=1:maximum)) + 0})
DT.withAssign <- quote({DT <- data.table(DF); DT.long <- DT[, list(vals=as.numeric(unlist(strsplit(B, ",")))), by=A]; cols <- DT.long[, max(vals)]; rows <- DT.long[, max(A)] ; matrix(as.numeric(DT.long[, (1:cols) %in% vals, by=A]$V1), ncol=cols, byrow=TRUE, dimnames=list(seq(rows), seq(cols)))})
DT.withOutAssign <- quote({DT.long <- DT[, list(vals=as.numeric(unlist(strsplit(B, ",")))), by=A]; cols <- DT.long[, max(vals)]; rows <- DT.long[, max(A)] ; matrix(as.numeric(DT.long[, (1:cols) %in% vals, by=A]$V1), ncol=cols, byrow=TRUE, dimnames=list(seq(rows), seq(cols)))})
lapply.Dickoa <- quote({ tmp <- strsplit(DF$B, ","); label <- 1:max(as.numeric(unlist(tmp))); tmp <- lapply(tmp, function(x) as.data.frame(lapply(label, function(y) (x == y)))); unname(t(sapply(tmp, colSums))) })
apply.SimonO101 <- quote({cols <- 1:max( as.numeric( unlist(strsplit(DF$B,",")))); t(apply(DF["B"] , 1 , function(x) ifelse( cols %in% as.numeric( unlist( strsplit(x , ",") ) ) , 1 , 0 ) ) ) })
forLoop.Ananda <- quote({b = strsplit(DF$B, ","); ncol = max(as.numeric(unlist(b))); temp = lapply(b, as.numeric); m = matrix(0, nrow = nrow(DF), ncol = ncol)      ; for (i in 1:nrow(DF)) {  m[i, temp[[i]]] = 1 }; m })
# slightly modified #Dickoa's alogrithm to allow for instances were B is only a single number.
# Instead of using `sapply(.)`, I used `as.data.frame(lapply(.))` which hopefully the simplification process in sapply is analogous in time to `as.data.frame`
identical(eval(lapply.Dickoa), eval(UnknownMax))
identical(eval(lapply.Dickoa), unname(eval(apply.SimonO101)))
identical(eval(lapply.Dickoa), eval(KnownMax))
identical(unname(as.matrix(eval(DT.withAssign))), eval(KnownMax))
# ALL TRUE
this is what was used to create the sample data:
# larger data created as follows
set.seed(1)
maximum <- 600
rows <- 10000
DF <- data.frame(A=seq(rows), B=as.character(c(maximum, replicate(rows-1, paste(sample(maximum, sample(20), FALSE), collapse=",")))), stringsAsFactors=FALSE)
DT <- data.table(DF);
DT
One way you could do this with ifelse and strsplit (unless I misunderstood and you don't want to use them?) is like this....
cols <- 1:max( as.numeric( unlist(strsplit(DF$B,","))))
df <- t(apply(DF["B"] , 1 , function(x) ifelse( cols %in% as.numeric( unlist( strsplit(x , ",") ) ) , 1 , 0 ) ) )
colnames(df) <- cols
df
# 1 2 3 4 5 6 7 8 9 10
#1 1 1 1 0 0 0 0 0 0 0
#2 1 1 1 0 0 1 0 0 0 0
#3 1 1 1 0 1 0 1 0 0 0
#4 0 1 1 1 1 1 1 0 0 0
#5 0 0 1 1 1 1 1 0 0 1
The idea is that we get a vector of the unique values in your desired column, find the max value and create a vector 1:max(value) then apply across each row to find out which values for that row are in the vector of all values. We use ifelse to put a 1 if it's there and 0 if it's not. The vector we match in is a sequence so its output is ready sorted.
A little late to the game, but a different strategy uses the fact that a matrix can be indexed by another two-column matrix specifying row and and column indexes for update. So
f2 <- function(DF) {
b <- strsplit(DF$B, ",", fixed=TRUE)
len <- vapply(b, length, integer(1)) # 'geometry'
b <- as.integer(unlist(b))
midx <- matrix(c(rep(seq_len(nrow(DF)), len), b), ncol=2)
m <- matrix(0L, nrow(DF), max(b))
m[midx] <- 1L
m
}
This uses strsplit(..., fixed=TRUE) and vapply for efficiency and type safety, and as.integer and 0L, 1L because we really want integer and not numeric return values.
For comparison, here's the original implementation from #AnandaMahto
f0 <- function(DF) {
b = strsplit(DF$B, ",")
ncol = max(as.numeric(unlist(b)))
temp = lapply(b, as.numeric)
m = matrix(0, nrow = nrow(DF), ncol = ncol)
for (i in 1:nrow(DF)) {
m[i, temp[[i]]] = 1
}
m
}
This can be improved for efficiency by using fixed=TRUE and avoiding the double coercion of b, and made more robust by coercing to integer and using seq_len(nrow(DF)) to avoid the corner case of 0-row DF
f1 <- function(DF) {
b = lapply(strsplit(DF$B, ",", fixed=TRUE), as.integer)
ncol = max(unlist(b))
m = matrix(0L, nrow = nrow(DF), ncol = ncol)
for (i in seq_len(nrow(DF)))
m[i, b[[i]]] = 1L
m
}
The for loop is a good candidate for compilation, so
library(compiler)
f1c <- cmpfun(f1)
and then for comparison on the 10,000 x 600 data from #RicardoSaporta
> library(microbenchmark)
> microbenchmark(f0(DF), f1(DF), f1c(DF), f2(DF))
Unit: milliseconds
expr min lq median uq max neval
f0(DF) 170.51388 180.25997 182.45772 188.23811 717.7511 100
f1(DF) 91.53578 97.14909 97.97195 100.24236 447.5900 100
f1c(DF) 79.39194 84.45712 85.71022 87.85763 411.8340 100
f2(DF) 76.45496 81.70307 82.50752 110.83620 398.6093 100
Both the 2-fold increase from f0 to f1 and relative efficiency of the for loop were relatively surprising to me. #AnandaMahto's solution is more memory efficient, made more so without too much performance cost with
ncol = max(vapply(b, max, integer(1)))
I know there's already a good and quite efficient answer but we can use another approach too to get the same results.
tmp <- strsplit(DF$B, ",")
label <- 1:max(as.numeric(unlist(tmp)))
tmp <- lapply(tmp, function(x)
sapply(label, function(y) (x == y)))
t(sapply(tmp, colSums))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 1 1 0 0 0 0 0 0 0
## [2,] 1 1 1 0 0 1 0 0 0 0
## [3,] 1 1 1 0 1 0 1 0 0 0
## [4,] 0 1 1 1 1 1 1 0 0 0
## [5,] 0 0 1 1 1 1 1 0 0 1
We can benchmark it now to compare with #SimonO101 solution (fun2)
require(rbenchmark)
fun1 <- function(DF) {
tmp <- strsplit(DF$B, ",")
label <- 1:max(as.numeric(unlist(tmp)))
tmp <- lapply(tmp, function(x)
sapply(label, function(y) (x == y)))
t(sapply(tmp, colSums))
}
fun2 <- function(DF) {
cols <- 1:max( as.numeric( unlist(strsplit(DF$B,","))))
df <- t(apply(DF["B"] , 1 , function(x) ifelse( cols %in% as.numeric( unlist( strsplit(x , ",") ) ) , 1 , 0 ) ) )
colnames(df) <- cols
df
}
all.equal(fun1(DF),
fun2(DF),
check.attributes = FALSE)
## [1] TRUE
benchmark(fun1(DF),
fun2(DF),
order = "elapsed",
columns = c("test", "elapsed", "relative"),
replications = 5000)
## test elapsed relative
## 1 fun1(DF) 1.870 1.000
## 2 fun2(DF) 2.018 1.079
As we can see there's not a big difference.
Suggested edit (RS):
# from:
tmp <- lapply(tmp, function(x)
sapply(label, function(y) (x == y)))
# to:
tmp <- lapply(tmp, function(x)
as.data.frame(lapply(label, function(y) (x == y))))
Ok, this has been bugging me for a while, but I thought it would be a good use of Rcpp. So I wrote a little function too see if I can get something faster than #Ananda's amazing for loop solution. This solution seems to run approximately twice as fast (using the larger sample dataset posted by #RicardoSaporta).
Note: I was attempting this more to teach myself how to use Rcpp and C++ than to provide a useful solution, but all the same...
Our .cpp file...
#include <Rcpp.h>
#include <string>
#include <sstream>
using namespace Rcpp;
//[[Rcpp::export]]
NumericMatrix expandR(CharacterVector x) {
int n = x.size();
std::vector< std::vector<int> > out; // list to hold numeric vectors
int tmax = 0;
for(int i = 0; i < n; ++i) {
std::vector<int> vect; // vector to hold split strings
std::string str = as<std::string>(x[i]);
std::stringstream ss(str);
int j = 0;
while (ss >> j) {
vect.push_back(j); // add integer to result vector
if (ss.peek() == ',') //split by ',' delim
ss.ignore();
}
int it = *std::max_element(vect.begin(), vect.end());
if( it > tmax )
tmax = it; //current max value
out.push_back(vect);
}
// Now we construct the matrix. tmax gives us number of columns, n is number of rows;
NumericMatrix mat(n,tmax);
for( int i = 0; i < n; ++i) {
NumericMatrix::Row zzrow = mat( i , _ );
std::vector<int> vec = out[i];
for( int j = 0; j < vec.size(); ++j ) {
zzrow[ (vec[j]-1) ] = 1; //don't forget R vs. C++ indexing
}
}
return mat;
}
Using the nominal example from the OP we can then just do...
require(Rcpp)
## source the function so it is available to use in R
sourceCpp("C:/path/to/file.cpp")
# Call it like any other R function
expandR(DF$B)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 1 1 0 0 0 0 0 0 0
[2,] 1 1 1 0 0 1 0 0 0 0
[3,] 1 1 1 0 1 0 1 0 0 0
[4,] 0 1 1 1 1 1 1 0 0 0
[5,] 0 0 1 1 1 1 1 0 0 1
And using the larger dataset provided by #Ricardo) and comparing with #Ananda's solution)....
require(Rcpp)
require(data.table)
set.seed(1)
maximum <- 600
rows <- 10000
DF <- data.frame(A=seq(rows), B=as.character(c(maximum, replicate(rows-1, paste(sample(maximum, sample(20), FALSE), collapse=",")))), stringsAsFactors=FALSE)
DT <- data.table(DF);
DT
## source in our c code
sourceCpp("C:/Users/sohanlon/Desktop/expandR2.cpp")
forLoop.Ananda <- quote({b = strsplit(DF$B, ","); ncol = max(as.numeric(unlist(b))); temp = lapply(b, as.numeric); m = matrix(0, nrow = nrow(DF), ncol = ncol) ; for (i in 1:nrow(DF)) { m[i, temp[[i]]] = 1 }; m })
rcpp.Simon <- quote({mm = expandR( DT$B )})
require(microbenchmark)
microbenchmark( eval(forLoop.Ananda) , eval(rcpp.Simon) , times = 5L )
Unit: milliseconds
expr min lq median uq max neval
eval(forLoop.Ananda) 173.3024 178.6445 181.5881 218.9619 227.9490 5
eval(rcpp.Simon) 115.8309 116.3876 116.8125 119.1971 125.6504 5
Not a particularly fast solution, however, it could be useful for those preferring tidyverse possibilities:
DF %>%
mutate(B = str_split(B, fixed(","))) %>%
unnest() %>%
transmute(A,
var = as.numeric(B),
val = 1) %>%
complete(var = seq(min(var), max(var), 1), nesting(A)) %>%
spread(var, val, fill = 0)
A `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 0 0 0 0 0 0 0
2 2 1 1 1 0 0 1 0 0 0 0
3 3 1 1 1 0 1 0 1 0 0 0
4 4 0 1 1 1 1 1 1 0 0 0
5 5 0 0 1 1 1 1 1 0 0 1
To have more compact column names:
DF %>%
mutate(B = str_split(B, fixed(","))) %>%
unnest() %>%
transmute(A,
var = as.numeric(B),
val = 1) %>%
complete(var = seq(min(var), max(var), 1), nesting(A)) %>%
spread(var, val, fill = 0) %>%
rename_at(2:length(.), ~ paste0("Col", 1:length(.)))
A Col1 Col2 Col3 Col4 Col5 Col6 Col7 Col8 Col9 Col10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 0 0 0 0 0 0 0
2 2 1 1 1 0 0 1 0 0 0 0
3 3 1 1 1 0 1 0 1 0 0 0
4 4 0 1 1 1 1 1 1 0 0 0
5 5 0 0 1 1 1 1 1 0 0 1

Resources