Split concatenated column to corresponding column positions - r

I have a data frame where a column may contain concatenated characters separated by |:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C'))
# df
# FOO
# 1 A|B|C
# 2 A|B
# 3 B|C
# 4 A
# 5 C
I want to split the string and put the individual values into different columns:
df
# X1 X2 X3
# 1 A B C
# 2 A B
# 3 B C
# 4 A
# 5 C
So far I tried with this example: [https://stackoverflow.com/questions/7069076/split-column-at-delimiter-in-data-frame][1] but it is not splitting the columns without repeating values, what I get there is:
df <- data.frame(do.call('rbind', strsplit(as.character(df$FOO),'|',fixed=TRUE)))
> df
X1 X2 X3
1 A B C
2 A B A
3 B C B
4 A A A
5 C C C
And I also get this warning:
Warning message: In rbind(c("A", "B", "C"), c("A", "B"), c("B", "C"),
"A", "C") : number of columns of result is not a multiple of vector
length (arg 2)
What can I do in those cases? Preferably with base R.
[1]: https://stackoverflow.com/questions/7069076/split-column-at-delimiter-in-data-frame

Simply do:
# Split every entry of FOO on the "|" delimiter (escaped, because strsplit
# treats the pattern as a regular expression).
splt <- strsplit(as.character(df$FOO), "\\|")
# Sorted set of all distinct values; each one gets its own output column.
all_val <- sort(unique(unlist(splt)))
# Build one row per input entry: keep a value in its own column when the
# entry contains it, otherwise put NA there. Rows are stacked with rbind so
# the result is always an n x k matrix, even when there is only one distinct
# value (t(sapply(...)) would return a 1 x n matrix in that case).
do.call(rbind, lapply(splt, function(parts) {
  row <- all_val
  row[!(all_val %in% parts)] <- NA
  row
}))
# [,1] [,2] [,3]
#[1,] "A" "B" "C"
#[2,] "A" "B" NA
#[3,] NA "B" "C"
#[4,] "A" NA NA
#[5,] NA NA "C"
data:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C'))
Please note:
My version is base:: (no libraries needed) and general:
It would also work with:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F'))

Overlooked that OP asked for a base R solution. Please try @AndreElrico's, @r.user.05apr's or @milan's solutions.
This can be done with cSplit_e from the splitstackshape package:
library(splitstackshape)
cSplit_e(
data = df,
split.col = "FOO",
sep = "|",
mode = "value",
type = "character",
fill = " ",
drop = TRUE
)
# FOO_A FOO_B FOO_C
#1 A B C
#2 A B
#3 B C
#4 A
#5 C
Does also work in case of the following df (see OP's comment above).
(df1 <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F')))
# FOO
#1 A|B|C
#2 A|B
#3 B|C
#4 A
#5 C
#6 B|D|F
cSplit_e(df1, "FOO", "|", "value", "character", TRUE, fill = " ")
# FOO_A FOO_B FOO_C FOO_D FOO_F
#1 A B C
#2 A B
#3 B C
#4 A
#5 C
#6 B D F

In base R:
# Example data: each entry holds one or more values separated by "|".
df <- data.frame(FOO = c("A|B|C", "A|B", "B|C", "A", "C"))
# "[|]" matches a literal pipe; the result has one character vector per row.
dummy <- strsplit(as.character(df$FOO), "[|]")
# Sorted distinct values; each one becomes its own output column.
values <- sort(unique(unlist(dummy)))
# For every row, keep a value in its column when present and fill with " "
# otherwise. Building the rows directly (instead of reshaping a long table)
# needs no extra package and keeps rows whose entry is empty.
want <- as.data.frame(
  do.call(rbind, lapply(dummy, function(x) ifelse(values %in% x, values, " "))),
  stringsAsFactors = FALSE
)
names(want) <- paste0("X", seq_along(values))
want
# X1 X2 X3
#1 A B C
#2 A B
#3 B C
#4 A
#5 C

Use unique and strsplit to find all unique values (A, B and C in this case). Use grep to search for the unique values, and return the values when there's a match or character(0) otherwise. cbind the resulting characters. Use apply and ifelse to replace character(0) with NA.
# Work on a plain character copy of the column (it may be a factor).
foo <- as.character(df$FOO)
# All distinct values that occur anywhere in the column.
vals <- unique(unlist(strsplit(foo, "|", fixed = TRUE)))
# One column per distinct value. Each entry of FOO (e.g. "A|B") is used as a
# regular expression, where "|" means alternation, and is matched against the
# current value; grep(value = TRUE) returns the value on a match and
# character(0) otherwise. as.character() on the resulting list turns the
# empty results into the literal string "character(0)".
out <- do.call(cbind, lapply(vals, function(i) {
  as.character(lapply(foo, function(x) grep(x, i, value = TRUE)))
}))
# Replace the "character(0)" placeholders with NA.
apply(out, 2, function(x) ifelse(x == "character(0)", NA, x))
[,1] [,2] [,3]
[1,] "A" "B" "C"
[2,] "A" "B" NA
[3,] NA "B" "C"
[4,] "A" NA NA
[5,] NA NA "C"

You can try a tidyverse as well
library(tidyverse)
# Spread the "|"-separated values of FOO into one column per distinct value.
df %>%
# carry the row names along as a column so the split pieces can be regrouped
rownames_to_column() %>%
# one output row per piece; "[|]" matches a literal pipe
separate_rows(FOO, sep="[|]") %>%
# L maps each distinct value of FOO to a label X1, X2, ... (one per level)
mutate(L=factor(FOO, labels = paste0("X",1:length(unique(FOO))))) %>%
# back to wide form: one column per label, holding the value itself
spread(L, FOO) %>%
# drop the first column (presumably the row-name helper column -- confirm)
select(-1)
X1 X2 X3
1 A B C
2 A B <NA>
3 <NA> B C
4 A <NA> <NA>
5 <NA> <NA> C
It is also generally working e.g. df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F')). In addition you can set the levels e.g. B>C>A by yourself using levels = c("B", "C", "A") in the factor function in the mutate step.

Related

Conditional mutate and vector

I have the following data frame:
df <- data.frame(
x = rep(letters[1:3], 2)
)
And the following vector:
vec <- c(1.5, 3.2)
This vector belongs to each b in df. How do I mutate vec if it matches b and return NA values if not?
Expected outcome:
1 a NA
2 b 1.5
3 c NA
4 a NA
5 b 3.2
6 c NA
Simplest way would be to get indexes of "b" and replace them with vec.
df$output[df$x == "b"] <- vec
df
# x output
#1 a NA
#2 b 1.5
#3 c NA
#4 a NA
#5 b 3.2
#6 c NA
Another option is with replace
df$output <- replace(df$output, df$x == "b", vec)
Forcefully, fitting this into tidyverse
library(dplyr)
df$output <- NA
df %>%
mutate(output = replace(output, x == "b", vec))

Group columns with the same name in R

If I have a data frame as below, with the first row the column names (row names not included here)
A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9
How would I be able create a new data frame such that:
a b c
1 2 3
4 6 7
5 NA 8
NA NA 9
Notice the NA. For empty values.
UPDATE
If d.frame is the dataframe in question:
new.df <- data.frame();
firstrow <- d.frame[,1]
names <- unique(firstrow)
for (n in names) {
#cbind.fill is part of a package plyr
new.df <- cbind.fill(new.df, frame[3,which(firstrow == n)])
}
colnames(new.df) <- names;
I think that works well. But it isn't efficient and relies on a third party package. Any suggestions?
Here is another solution, based on function cbind.fill from cbind a df with an empty df (cbind.fill?)
# Column-bind objects of different heights, padding the shorter ones with NA.
#
# ...: any number of objects that as.matrix() can convert (vectors become
#      single-column matrices).
# Returns a matrix whose row count is the largest row count among the inputs;
# rows missing from shorter inputs are filled with NA. Returns NULL when
# called with no arguments, like cbind().
cbind.fill <- function(...) {
  pieces <- lapply(list(...), as.matrix)
  if (length(pieces) == 0L) {
    return(NULL)
  }
  # Target height: the tallest input decides how much padding the rest need.
  n <- max(vapply(pieces, nrow, integer(1)))
  padded <- lapply(pieces, function(x) {
    # Append an all-NA block of the missing rows (zero rows if already full).
    rbind(x, matrix(NA, n - nrow(x), ncol(x)))
  })
  do.call(cbind, padded)
}
df <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T, as.is=T)
df <- as.matrix(df)
do.call(cbind.fill, split(df[2,], df[1,]))
And one more solution
df <- as.matrix(df)
lst <- split(df[2,], df[1,])
m <- max(sapply(lst, length))
result <- sapply(lst, function(x) {length(x) <- m; x})
Couldn't find a simple solution for this, so here's one option using base R as you requested in comments. This solution will work no matter how many columns you have in the original data
# Read the two-row example; keep everything as character so the result does
# not depend on factor levels or their integer codes.
temp <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = TRUE, stringsAsFactors = FALSE) # your data
# Transpose: column 1 now holds the group label, column 2 the value.
temp <- data.frame(t(temp), stringsAsFactors = FALSE)
# How many values each group has, and the size of the largest group.
counts <- table(temp[, 1])
maxval <- max(counts)
# One output column per group: the label, its values, then NA padding up to
# the size of the largest group.
result <- data.frame(do.call(cbind, lapply(names(counts), function(x) {
  c(x, temp[temp[, 1] == x, 2], rep(NA, maxval - counts[[x]]))
})))
result
## X1 X2 X3
## 1 a b c
## 2 1 2 3
## 3 4 6 7
## 4 5 <NA> 8
## 5 <NA> <NA> 9
I would transpose the original two-row data.frame, create a "time" variable, use reshape to reorganize the data, and transpose the result.
Like this:
x <- t(mydf)
y <- data.frame(cbind(x, ave(x[, 1], x[, 1], FUN = seq_along)))
t(reshape(y, direction = "wide", idvar = "X1", timevar = "X3"))
# A B C
# X1 "a" "b" "c"
# X2.1 "1" "2" "3"
# X2.2 "4" "6" "7"
# X2.3 "5" NA "8"
# X2.4 NA NA "9"

na.strings applied to a dataframe

I currently have a dataframe in which there are several rows I would like converted to "NA". When I first imported this dataframe from a .csv, I could use na.strings=c("A", "B", "C") and so on to remove the values I didn't want.
I want to do the same thing again, but this time using a dataframe already, not importing another .csv
To import the data, I used:
data<-read.csv("code.csv", header=T, strip.white=TRUE, stringsAsFactors=FALSE, na.strings=c("", "A", "B", "C"))
Now, with "data", I would like to subset it while removing even more specific values in the rows. I tried something like:
data2<-data.frame(data, na.strings=c("D", "E", "F"))
Of course this doesn't work because I think na.strings only works with the "read" package.. not other functions. Is there any equivalent to simply convert certain values into NA so I can na.omit(data2) fairly easily?
Thanks for your help.
Here's a way to replace values in multiple columns:
# A small demo data frame with three text columns
dat <- data.frame(
  x = c("D", "E", "F", "G"),
  y = c("A", "B", "C", "D"),
  z = c("X", "Y", "Z", "A")
)
# x y z
# 1 D A X
# 2 E B Y
# 3 F C Z
# 4 G D A
# the strings that should be treated as missing
na.strings <- c("D", "E", "F")
# logical mask, one column per column of dat: TRUE where the cell holds one
# of the strings to blank out
idx <- do.call(cbind, lapply(dat, function(column) column %in% na.strings))
# overwrite the flagged cells with NA
dat[idx] <- NA
dat
# x y z
# 1 <NA> A X
# 2 <NA> B Y
# 3 <NA> C Z
# 4 G <NA> A
Just assign the NA values directly.
e.g.:
x <- data.frame(a=1:5, b=letters[1:5])
# > x
# a b
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
# 5 5 e
# convert the 'b' and 'd' in column b to NA
x$b[x$b %in% c('b', 'd')] <- NA
# > x
# a b
# 1 1 a
# 2 2 <NA>
# 3 3 c
# 4 4 <NA>
# 5 5 e
data[ data == "D" ] = NA
Note that if you were trying to replace NA with "D", the reverse (df[ df == NA ] = "D") will not work; you would need to use df[is.na(df)] <- "D"
Since we don't have your data I will use mtcars. Suppose we want to set values anywhere in mtcars that are equal to 4 or 19.2 to NA
# Row/column positions of every cell equal to either target value (4 or 19.2).
ind <- which(mtcars == 4 | mtcars == 19.2, arr.ind = TRUE)
# A two-column index matrix addresses individual cells of the data frame.
mtcars[ind] <- NA
In your setting you would replace this number by "D" or "E"

order while splitting (eg. TA should be split to two column "A" in first "T" second) in r

I have following issue, I could solve:
set.seed (1234)
mydf <- data.frame (var1a = sample (c("TA", "AA", "TT"), 5, replace = TRUE),
varb2 = sample (c("GA", "AA", "GG"), 5, replace = TRUE),
varAB = sample (c("AC", "AA", "CC"), 5, replace = TRUE)
)
mydf
var1a varb2 varAB
1 TA AA CC
2 AA GA AA
3 AA GA AC
4 AA AA CC
5 TT AA AC
I want to split two letter into different column, and then order alphabetically.
Edit: Ordering can be done before split, for example var1a value "TA" var1a should be "AT" or after split so that var1aa should be "A", and var1ab be "T" (instead of "T", "A").
so sorting is within each cell.
# Split one column of `data` into two new columns.
#
# .col: name (a string) of the column to split.
# data: data frame containing that column.
# Returns the result of reshape::colsplit(), with columns named "<.col>a"
# and "<.col>b". The call is namespaced because reshape2 exports a colsplit()
# with a different argument order.
split_col <- function(.col, data) {
  reshape::colsplit(data[[.col]], names = paste0(.col, letters[1:2]))
}
split each column and combine
require(reshape)
splitdf <- do.call(cbind, lapply(names(mydf), split_col, data = mydf))
var1aa var1ab varb2a varb2b varABa varABb
1 T A A A C C
2 A A G A A A
3 A A G A A C
4 A A A A C C
5 T T A A A C
But the unsolved part is I want to order the pair of columns such that columnname"a" and columname"b" are ordered, alphabetically. Thus expected output:
var1aa var1ab varb2a varb2b varABa varABb
1 A T A A C C
2 A A A G A A
3 A A A G A C
4 A A A A C C
5 T T A A A C
How can I order (sort within) each pair of variables?
mylist <-as.list(mydf)
splits <- lapply(mylist, reshape::colsplit, names=c("a", "b"))
rowsort <- lapply(splits, function(x) t(apply(x, 1, sort)))
comb <- do.call(data.frame, rowsort)
comb
var1a.1 var1a.2 varb2.1 varb2.2 varAB.a varAB.b
1 A T A A C C
2 A A A G A A
3 A A A G A C
4 A A A A C C
5 T T A A A C
EDIT:
If names are important, you can replace them:
# Rename paired columns such as "var1a.1", "var1a.2" to "var1aa", "var1ab".
#
# x: character vector of column names; it must have even length because the
#    names are expected to come in consecutive pairs.
# Returns a character vector of the same length in which everything from the
# last "." onward is removed and "a" / "b" are appended alternately.
replaceNums <- function(x) {
  # Validate before doing any work: an odd count cannot be made of pairs.
  stopifnot(length(x) %% 2 == 0) #checkstep
  if (length(x) == 0L) {
    return(character(0))
  }
  # Strip the final ".suffix"; names without a dot are left untouched.
  stems <- sub("\\.[^.]*$", "", x)
  paste0(stems, c("a", "b"))
}
names(comb) <- replaceNums(names(comb))
comb
var1aa var1ab varb2a varb2b varABa varABb
1 A T A A C C
2 A A A G A A
3 A A A G A C
4 A A A A C C
5 T T A A A C

Create a vector from repetitons of items from a matrix

I have a data frame m
A 2
B 3
C 4
and I want to create a data frame like
A 1
A 2
B 1
B 2
B 3
C 1
C 2
C 3
C 4
Any help? Thanks a lot in advance
Your original question can be answered by:
text <- LETTERS[1:3]
n <- 2:4
rep(text, times=n)
[1] "A" "A" "B" "B" "B" "C" "C" "C" "C"
Your new question is quite different:
# Input: one label per row plus how many times it should be repeated.
# Columns are named with `=`; using `<-` here would create stray global
# variables and leave the columns with mangled names.
df <- data.frame(
  text = LETTERS[1:3],
  n = 2:4
)
# Repeat each label n times and number the repeats 1..n within each label.
data.frame(
  text = rep(df$text, times = df$n),
  seq = sequence(df$n)
)
text seq
1 A 1
2 A 2
3 B 1
4 B 2
5 B 3
6 C 1
7 C 2
8 C 3
9 C 4
rep accepts vectors. Try this:
dat <- data.frame(V1 = letters[1:3], V2 = 2:4)
rep(dat[, 1], dat[, 2])
> rep(dat[, 1], dat[, 2])
[1] a a b b b c c c c
Assuming m is a data frame:
m <- data.frame(V1 = LETTERS[1:3], V2 = 2:4, stringsAsFactors = FALSE)
This will do what you want:
with(m, rep(V1, times = V2))
e.g.
> with(m, rep(V1, times = V2))
[1] "A" "A" "B" "B" "B" "C" "C" "C" "C"
Edit: To address the edit made by the OP, try the following:
with(m, data.frame(X1 = rep(V1, times = V2),
X2 = unlist(lapply(V2, seq_len))))
Which produces:
> with(m, data.frame(X1 = rep(V1, times = V2),
+ X2 = unlist(lapply(V2, seq_len))))
X1 X2
1 A 1
2 A 2
3 B 1
4 B 2
5 B 3
6 C 1
7 C 2
8 C 3
9 C 4
Or more succinctly via sequence() — as per @Andrie's answer (which I also keep forgetting about):
with(m, data.frame(X1 = rep(V1, times = V2), X2 = sequence(V2)))
@Andrie's answer is the only one so far that answers your new question. There may be a better way to do this but:
m <- data.frame(V1 = LETTERS[1:3], V2 = 2:4, stringsAsFactors = FALSE)
library(plyr)
ddply(m,"V1",function(x) data.frame(V2=seq(x[,2])))

Resources