Split column string elements within a row inside a dataframe

Split column string elements within a row inside a dataframe - r

I have a matrix (1000 x 2830) like this:
9178 3574 3547
160 B_B B_B A_A
301 B_B A_B A_B
303 B_B B_B A_A
311 A_B A_B A_A
312 B_B A_B A_A
314 B_B A_B A_A
and I want to obtain the following (duplicating colnames and splitting each element of each column):
9178 9178 3574 3574 3547 3547
160 B B B B A A
301 B B A B A B
303 B B B B A A
311 A B A B A A
312 B B A B A A
314 B B A B A A
I tried using strsplit but I got error messages because this is a matrix, not a string. Could you please provide some ideas for resolving this?

Here's an option using dplyr (for bind_cols) and tidyr (for separate_) together with lapply from base R. It assumes that your data is a data.frame (i.e. you might need to convert it to data.frame first):
library(dplyr)
library(tidyr)
lapply(names(df), function(x) separate_(df[x], x, paste0(x,"_",1:2), sep = "_" )) %>%
bind_cols
# X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
#1 B B B B A A
#2 B B A B A B
#3 B B B B A A
#4 A B A B A A
#5 B B A B A A
#6 B B A B A A

I'm biased, but I would recommend using cSplit from my "splitstackshape" package. Since it appears that you have rownames in your input, use as.data.table(., keep.rownames = TRUE):
library(splitstackshape)
cSplit(as.data.table(mydf, keep.rownames = TRUE), names(mydf), "_")
# rn X9178_1 X9178_2 X3574_1 X3574_2 X3547_1 X3547_2
# 1: 160 B B B B A A
# 2: 301 B B A B A B
# 3: 303 B B B B A A
# 4: 311 A B A B A A
# 5: 312 B B A B A A
# 6: 314 B B A B A A
Less legible than cSplit (but presently likely to be faster) would be to use stri_split_fixed from "stringi", like this:
library(stringi)
`dimnames<-`(do.call(cbind,
lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
list(rownames(mydf), rep(colnames(mydf), each = 2)))
# X9178 X9178 X3574 X3574 X3547 X3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
If speed is of the essence, I would suggest checking out the "iotools" package, particularly the mstrsplit function. The approach would be similar to the "stringi" approach:
library(iotools)
`dimnames<-`(do.call(cbind,
lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
list(rownames(mydf), rep(colnames(mydf), each = 2)))
You may need to add an lapply(mydf, as.character) in there if you forgot to use stringsAsFactors = FALSE when converting from a matrix to a data.frame, but it should still beat even the stri_split approach.

Something you can do, although it seems a bit "twisted" (yourmat being your matrix)...:
# Split every cell of yourmat on "_" into exactly two pieces; each cell
# becomes one row of `inter` (columns X1 and X2), in column-major order.
inter <- data.frame(t(vapply(as.vector(yourmat), function(x) {
  strsplit(x, "_")[[1]]
}, character(2))),
row.names = paste0(rep(colnames(yourmat), each = nrow(yourmat)),
                   seq_len(nrow(yourmat))),
stringsAsFactors = FALSE)
# Group the rows of `inter` by the column they came from. The grouping
# factor is built from the column names themselves rather than from a
# fixed-width prefix of the row names, so names of any length work.
origin <- factor(rep(colnames(yourmat), each = nrow(yourmat)),
                 levels = colnames(yourmat))
# One two-column data.frame per original column, bound side by side.
res <- do.call("cbind", split(inter, origin))
res
# 9178.X1 9178.X2 3574.X1 3574.X2 3547.X1 3547.X2
# 91781 B B B B A A
# 91782 B B A B A B
# 91783 B B B B A A
# 91784 A B A B A A
# 91785 B B A B A A
# 91786 B B A B A A
Edit
If you want the row.names of res to be the same as in yourmat, you can do:
row.names(res)<-row.names(yourmat)
NB: If yourmat is a data.frame instead of a matrix the as.vector function in the first line needs to be changed to unlist.

base R solution without using data frames:
# split every cell on "_"; z alternates first piece, second piece
z <- unlist(strsplit(m, "_"))
# first pieces fill the left half of M, second pieces the right half
M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
# properly order columns: interleave the halves so that both pieces of an
# original column end up next to each other
i <- seq_len(ncol(M))
# drop = FALSE keeps M a matrix even when m has a single row
M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)
M
# 9178 9178 3574 3574 3547 3547
# 160 "B" "B" "B" "B" "A" "A"
# 301 "B" "B" "A" "B" "A" "B"
# 303 "B" "B" "B" "B" "A" "A"
# 311 "A" "B" "A" "B" "A" "A"
# 312 "B" "B" "A" "B" "A" "A"
# 314 "B" "B" "A" "B" "A" "A"
[Update]
Here is a small benchmarking study of the proposed solutions (I didn't include the cSplit solution because it was too slow):
Setup:
m <- matrix('A_B',nrow=1000,ncol=2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)
#####
# Split every "x_y" cell of a character matrix into two adjacent columns.
#
# m: character matrix whose cells each contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns;
# row names are copied from m and each column name of m appears twice.
f.mtrx <- function(m) {
  # z alternates first piece, second piece, in column-major cell order
  z <- unlist(strsplit(m, "_"))
  # first pieces fill the left half, second pieces the right half
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # properly order columns: interleave the two halves
  i <- seq_len(ncol(M))
  # drop = FALSE keeps the result a matrix when m has a single row
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
library(stringi)
# Same as f.mtrx, but the splitting is done with stringi's
# stri_split_fixed() instead of base strsplit().
#
# m: character matrix whose cells each contain exactly one "_".
# Returns a character matrix with nrow(m) rows and 2 * ncol(m) columns;
# row names are copied from m and each column name of m appears twice.
f.mtrx2 <- function(m) {
  # z alternates first piece, second piece, in column-major cell order
  z <- unlist(stri_split_fixed(m, "_"))
  # first pieces fill the left half, second pieces the right half
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # properly order columns: interleave the two halves
  i <- seq_len(ncol(M))
  # drop = FALSE keeps the result a matrix when m has a single row
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}
#####
library(splitstackshape)
# Benchmark wrapper: convert mydf to a data.table (row names kept as a
# column) and split every original column on "_" with cSplit().
f.cSplit <- function(mydf) {
  as_dt <- as.data.table(mydf, keep.rownames = TRUE)
  cSplit(as_dt, names(mydf), "_")
}
#####
library(stringi)
# Benchmark wrapper: split each column of mydf on "_" with
# stri_split_fixed(simplify = TRUE), bind the pieces side by side, and
# label the result with mydf's row names and doubled column names.
f.stringi <- function(mydf) {
  pieces <- lapply(mydf, stri_split_fixed, "_", simplify = TRUE)
  combined <- do.call(cbind, pieces)
  dimnames(combined) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  combined
}
#####
library(dplyr)
library(tidyr)
# Benchmark wrapper: for every column of df, separate it on "_" into two
# columns named <col>_1 and <col>_2, then bind all results column-wise.
f.dplyr <- function(df) {
  split_one <- function(x) {
    separate_(df[x], x, paste0(x, "_", 1:2), sep = "_")
  }
  bind_cols(lapply(names(df), split_one))
}
#####
library(iotools)
# Benchmark wrapper: split each column of mydf on "_" with iotools'
# mstrsplit() (two character columns per input column), bind the pieces
# side by side, and label the result with mydf's row names and doubled
# column names.
f.mstrsplit <- function(mydf) {
  pieces <- lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")
  combined <- do.call(cbind, pieces)
  dimnames(combined) <- list(rownames(mydf), rep(colnames(mydf), each = 2))
  combined
}
#####
library(rbenchmark)
benchmark(f.mtrx(m), f.mtrx2(m), f.dplyr(d), f.stringi(d), f.mstrsplit(d), replications = 10)
Results:
test replications elapsed relative user.self sys.self user.child sys.child
3 f.dplyr(d) 10 27.722 10.162 27.360 0.269 0 0
5 f.mstrsplit(d) 10 2.728 1.000 2.607 0.098 0 0
1 f.mtrx(m) 10 37.943 13.909 34.885 0.799 0 0
2 f.mtrx2(m) 10 15.176 5.563 13.936 0.802 0 0
4 f.stringi(d) 10 8.107 2.972 7.815 0.247 0 0
In the updated benchmark, the winner is f.mstrsplit.

Related

How to randomize the order of all sublists simultaneously

I am looking to randomize the order of the sublists, but retaining the structure. To illustrate, I can do this with a data frame:
df1 <- data.frame("X1" = LETTERS[1:5], "X2" = letters[1:5])
df1
df1R <- df1[sample(df1[,1]),]
df1R
> df1
X1 X2
1 A a
2 B b
3 C c
4 D d
5 E e
>
> df1R <- df1[sample(df1[,1]),]
> df1R
X1 X2
2 B b
5 E e
1 A a
3 C c
4 D d
You can see here that the overall order is randomised, but rows remain together, this is what I mean by retaining the structure - A stays with a, B stays with b...
I'd like to implement this for a list:
m1 <- list(LETTERS[1:5], letters[1:5])
But I'm stuck on the how, I've had a good look round but not found a solution. Any advice?
The result would look like:
> m1R
[[1]]
[1] "B" "C" "E" "A" "D"
[[2]]
[1] "b" "c" "e" "a" "d"

You could do this to reorder all elements:
# Draw a single random permutation of positions, sized from the first
# sublist instead of a hard-coded 5, and apply that same permutation to
# every sublist so corresponding elements stay aligned.
neworder <- sample.int(length(m1[[1]]))
lapply(m1, function(x) x[neworder])

splitting vector every two indices

Given vector of N elements:
LETTERS[1:10]
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J"
How can one get a data.table/frame (df) as follows?
>df
one two
A B
C D
E F
G H
I J
EDIT
Generalizing I would like to know given a vector to split as follows:
[A B C],[D E],[F G H I J]
and obtaining:
V1 V2 V3 V4 V5
A B C NA NA
D E NA NA NA
F G H I J

One option is the matrix way
as.data.frame(matrix(LETTERS[1:10], ncol=2,byrow=TRUE,
dimnames = list(NULL, c('one', 'two'))), stringsAsFactors=FALSE)
# one two
#1 A B
#2 C D
#3 E F
#4 G H
#5 I J
If we need to create an index, we can use gl to split the vector and rbind
do.call(rbind, split(v1, as.integer(gl(length(v1), 2, length(v1)))))
where
v1 <- LETTERS[1:10]
Update
Based on the update in OP's post
lst <- split(v1, rep(1:3, c(3, 2, 5)))
do.call(rbind, lapply(lst, `length<-`, max(lengths(lst))))
# [,1] [,2] [,3] [,4] [,5]
#1 "A" "B" "C" NA NA
#2 "D" "E" NA NA NA
#3 "F" "G" "H" "I" "J"
Or otherwise
library(stringi)
stri_list2matrix(lst, byrow = TRUE)
Update2
If we are using a 'splitVec'
lst <- split(v1, cumsum(seq_along(v1) %in% splitVec))
and then proceed as above

"lapply" in R does not work for each element

test.data <- data.frame(a=seq(10),b=rep(seq(5),times=2),c=rep(seq(5),each=2))
test.data <- data.frame(lapply(test.data, as.character), stringsAsFactors = F)
test.ref <- data.frame(original=seq(10),name=letters[1:10])
test.ref <- data.frame(lapply(test.ref, as.character), stringsAsFactors = F)
# Return the entries of test.ref$name at the positions where
# test.ref$original == x holds. The comparison is element-wise (x is
# recycled against test.ref$original), not a per-value lookup.
test.match <- function (x) {
  hits <- which(test.ref$original == x)
  test.ref$name[hits]
}
> data.frame(lapply(test.data, test.match))
a b c
1 a a a
2 b b a
3 c c a
4 d d a
5 e e a
6 f a a
7 g b a
8 h c a
9 i d a
10 j e a
> lapply(test.data, test.match)
$a
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
$b
[1] "a" "b" "c" "d" "e"
$c
[1] "a"
Hi all,
I am learning to use the apply family in R. However, I am stuck in a rather simple exercise. Above is my code. I am trying to use the "test.match" function to replace all the elements in "test.data" by the reference rule in "test.ref". However, the last column does not work if I turn the final result into data frame. It is even worse if I keep the result as a list.
Many thanks for your help,
Kevin

As mentioned in the comments, you probably want match:
# Translate every column of df through a lookup table.
#
# df:     data.frame whose values are looked up in ref_df$original.
# ref_df: lookup table with columns `original` and `name`
#         (defaults to the global test.ref).
# Returns a copy of df in which each value is replaced by the matching
# ref_df$name; values without a match become NA.
do.test.match.df <- function(df, ref_df = test.ref) {
  lookup <- function(column) {
    positions <- match(column, ref_df$original)
    ref_df$name[positions]
  }
  translated <- df
  # assigning into translated[] keeps the data.frame structure and names
  translated[] <- lapply(df, lookup)
  translated
}
do.test.match.df(test.data)
which gives
a b c
1 a a a
2 b b a
3 c c b
4 d d b
5 e e c
6 f a c
7 g b d
8 h c d
9 i d e
10 j e e
This is the idiomatic way. lapply will always return a vanilla list. A data.frame is a special kind of list (a list of column vectors). With res[] <- lapply(df, myfun), we're assigning to columns of res.
Since all your columns are the same class, I'd suggest using a matrix instead of a data.frame.
test.mat <- as.matrix(test.data)
# Matrix counterpart of the data.frame lookup.
#
# mat:    matrix whose values are looked up in ref_df$original.
# ref_df: lookup table with columns `original` and `name`
#         (defaults to the global test.ref).
# Returns a matrix with the same dimensions as mat (dimnames are not
# carried over) holding the matching ref_df$name values; values without a
# match become NA.
do.test.match <- function(mat, ref_df = test.ref) {
  translated <- matrix(NA, nrow = nrow(mat), ncol = ncol(mat))
  positions <- match(c(mat), ref_df$original)
  # fill in place so the pre-set dimensions are kept
  translated[] <- ref_df$name[positions]
  translated
}
do.test.match(test.mat)

How to add a list to a data frame in R?

I have 2 tables as below:
a = read.table(text=' a b
1 c
1 d
2 c
2 a
2 b
3 a
', head=T)
b = read.table(text=' a c
1 x i
2 y j
3 z k
', head=T)
And I want result to be like this:
1 x i c d
2 y j c a b
3 z k a
Originally I thought to use tapply to transform them to lists (eg. aa = tapply(a[,2], a[,1], function(x) paste(x,collapse=","))), then append it back to table b, but I got stuck...
Any suggestion to do this?
Thanks a million.

One way to do it:
mapply(FUN = c,
lapply(split(b, row.names(b)), function(x) as.character(unlist(x, use.names = FALSE))),
split(as.character(a$b), a$a),
SIMPLIFY = FALSE)
# $`1`
# [1] "x" "i" "c" "d"
#
# $`2`
# [1] "y" "j" "c" "a" "b"
#
# $`3`
# [1] "z" "k" "a"

Column Split without repeat

I have a dataframe with one column that I would like to split into several columns, but the number of splits is dynamic throughout the rows.
Var1
====
A/B
A/B/C
C/B
A/C/D/E
I have tried using colsplit(df$Var1,split="/",names=c("Var1","Var2","Var3","Var4")), but rows with less than 4 variables will repeat.
From Hansi, the desired output would be:
Var1 Var2 Var3 Var4
[1,] "A" "B" NA NA
[2,] "A" "B" "C" NA
[3,] "C" "B" NA NA
[4,] "A" "C" "D" "E"

> read.table(text=as.character(df$Var1), sep="/", fill=TRUE)
V1 V2 V3 V4
1 A B
2 A B C
3 C B
4 A C D E
Leading zeros in digit only fields can be preserved with colClasses="character"
a <- data.frame(Var1=c("01/B","04/B/C","0098/B","8708/C/D/E"))
read.table(text=as.character(a$Var1), sep="/", fill=TRUE, colClasses="character")
V1 V2 V3 V4
1 01 B
2 04 B C
3 0098 B
4 8708 C D E

If I understood your objective correctly here is one possible solution, I'm sure there is a better way of doing it but this was the first that came to mind:
a <- data.frame(Var1 = c("A/B", "A/B/C", "C/B", "A/C/D/E"))
splitNames <- c("Var1", "Var2", "Var3", "Var4")
# R> a
# Var1
# 1 A/B
# 2 A/B/C
# 3 C/B
# 4 A/C/D/E
# For each row: split on "/" and right-pad with NA up to the number of
# target names; apply() returns the rows as columns, hence the t().
b <- t(apply(a, 1, function(row) {
  pieces <- unlist(strsplit(row, "/"))
  n_missing <- max(0, length(splitNames) - length(pieces))
  c(pieces, rep(NA, n_missing))
}))
colnames(b) <- splitNames
# R> b
# Var1 Var2 Var3 Var4
# [1,] "A" "B" NA NA
# [2,] "A" "B" "C" NA
# [3,] "C" "B" NA NA
# [4,] "A" "C" "D" "E"

I do not know of a function that solves your problem directly, but you can achieve it easily with standard R commands:
# Here are your data
df <- data.frame(Var1 = c("A/B", "A/B/C", "C/B", "A/C/D/E"), stringsAsFactors = FALSE)
# Split each string on "/" into a list of character vectors
rows <- strsplit(df$Var1, split = "/")
# Maximum amount of columns
columnCount <- max(lengths(rows))
# Fill with NA: `length<-` pads every vector up to columnCount
rows <- lapply(rows, `length<-`, columnCount)
# Stack the padded vectors as the rows of a character matrix. Binding them
# directly avoids the data.frame round trip, which would attach
# auto-generated names to the result.
out <- do.call(rbind, rows)
As it relies on strsplit, the result is always character, so you may need to do some type conversion afterwards. See type.convert.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Split column string elements within a row inside a dataframe - r

Related

How to randomize the order of all sublists simultaneously

splitting vector every two indices

"lapply" in R does not work for each element

How to add a list to a data frame in R?

Column Split without repeat

Categories

Resources