Create a vector from repetitons of items from a matrix - r

I have a data frame m
A 2
B 3
C 4
and I want to create a data frame like
A 1
A 2
B 1
B 2
B 3
C 1
C 2
C 3
C 4
Any help? Thanks a lot in advance

Your original question can be answered by:
text <- LETTERS[1:3]
n <- 2:4
rep(text, times=n)
[1] "A" "A" "B" "B" "B" "C" "C" "C" "C"
Your new question is quite different:
df <- data.frame(
text <- LETTERS[1:3],
n <- 2:4
)
data.frame(
text = rep(df$text, times=df$n),
seq = sequence(df$n)
)
text seq
1 A 1
2 A 2
3 B 1
4 B 2
5 B 3
6 C 1
7 C 2
8 C 3
9 C 4

rep accepts vectors. Try this:
dat <- data.frame(V1 = letters[1:3], V2 = 2:4)
rep(dat[, 1], dat[, 2])
> rep(dat[, 1], dat[, 2])
[1] a a b b b c c c c

Assuming m is a data frame:
m <- data.frame(V1 = LETTERS[1:3], V2 = 2:4, stringsAsFactors = FALSE)
This will do what you want:
with(m, rep(V1, times = V2))
e.g.
> with(m, rep(V1, times = V2))
[1] "A" "A" "B" "B" "B" "C" "C" "C" "C"
Edit: To address the edit made by the OP, try the following:
with(m, data.frame(X1 = rep(V1, times = V2),
X2 = unlist(lapply(V2, seq_len))))
Which produces:
> with(m, data.frame(X1 = rep(V1, times = V2),
+ X2 = unlist(lapply(V2, seq_len))))
X1 X2
1 A 1
2 A 2
3 B 1
4 B 2
5 B 3
6 C 1
7 C 2
8 C 3
9 C 4
Or more succinctly via sequence() — as per #Andrie's Answer (which I also keep forgetting about):
with(m, data.frame(X1 = rep(V1, times = V2), X2 = sequence(V2)))

#Andrie's answer is the only one so far that answers your new question. There may be a better way to do this but:
m <- data.frame(V1 = LETTERS[1:3], V2 = 2:4, stringsAsFactors = FALSE)
library(plyr)
ddply(m,"V1",function(x) data.frame(V2=seq(x[,2])))

Related

Split concatenated column to corresponding column positions

I have a data frame where a column may contain concatenated characters separated by |:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C'))
# df
# FOO
# 1 A|B|C
# 2 A|B
# 3 B|C
# 4 A
# 5 C
I want to split the string and put the individual values into different columns:
df
# X1 X2 X3
# 1 A B C
# 2 A B
# 3 B C
# 4 A
# 5 C
So far I tried with this example: [https://stackoverflow.com/questions/7069076/split-column-at-delimiter-in-data-frame][1] but it is not splitting the columns without repeating values, what I get there is:
df <- data.frame(do.call('rbind', strsplit(as.character(df$FOO),'|',fixed=TRUE)))
> df
X1 X2 X3
1 A B C
2 A B A
3 B C B
4 A A A
5 C C C
And I also get this warning:
Warning message: In rbind(c("A", "B", "C"), c("A", "B"), c("B", "C"),
"A", "C") : number of columns of result is not a multiple of vector
length (arg 2)
What can I do in those cases? Preferably with base R.
[1]: Split column at delimiter in data frame
Simply do:
splt <- strsplit(as.character(df$FOO),"\\|")
all_val <- sort(unique(unlist(splt)))
t(sapply(splt,function(x){all_val[!(all_val %in% x)]<-NA;all_val}))
# [,1] [,2] [,3]
#[1,] "A" "B" "C"
#[2,] "A" "B" NA
#[3,] NA "B" "C"
#[4,] "A" NA NA
#[5,] NA NA "C"
data:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C'))
Please note:
My version is base:: (no libraries needed) and general:
It would also work with:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F'))
Overlooked that OP asked for a base R solution. Please try #AndreElrico's, #r.user.05apr's or #milan's solutions.
This can be done with cSplit_e from the splitstackshape package:
library(splitstackshape)
cSplit_e(
data = df,
split.col = "FOO",
sep = "|",
mode = "value",
type = "character",
fill = " ",
drop = TRUE
)
# FOO_A FOO_B FOO_C
#1 A B C
#2 A B
#3 B C
#4 A
#5 C
Does also work in case of the following df (see OP's comment above).
(df1 <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F')))
# FOO
#1 A|B|C
#2 A|B
#3 B|C
#4 A
#5 C
#6 B|D|F
cSplit_e(df1, "FOO", "|", "value", "character", TRUE, fill = " ")
# FOO_A FOO_B FOO_C FOO_D FOO_F
#1 A B C
#2 A B
#3 B C
#4 A
#5 C
#6 B D F
In base R:
df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C'))
dummy <- strsplit(as.character(df$FOO), "[|]")
want <- data.frame(values = unlist(dummy),
ids = rep(1:length(dummy), unlist(lapply(dummy, length))),
stringsAsFactors = FALSE)
library(reshape2)
want <- dcast(want, ids ~ values, value.var = "values", fill = " ")[, -1] # first col removed
names(want) <- paste0("X", seq_along(unique(unlist(dummy))))
want
# X1 X2 X3
#1 A B C
#2 A B
#3 B C
#4 A
#5 C
Use unique and strsplit to find all unique values (A, B and C in this case). Use grep to search for the unique values, and return the values when there's a match or character(0) otherwise. cbind the resulting characters. Use apply and ifelse to replace character(0) with NA.
vals <- unique(unlist(sapply(a1, function(x) strsplit(x, '|', fixed = T))))
out <- NULL
for(i in vals){
out <- cbind(out, as.character((lapply(df$FOO, function(x) grep(x, i, value=T)))))
}
apply(out, 2, function(x) ifelse(x=="character(0)", NA, x))
[,1] [,2] [,3]
[1,] "A" "B" "C"
[2,] "A" "B" NA
[3,] NA "B" "C"
[4,] "A" NA NA
[5,] NA NA "C"
You can try a tidyverse as well
library(tidyverse)
df %>%
rownames_to_column() %>%
separate_rows(FOO, sep="[|]") %>%
mutate(L=factor(FOO, labels = paste0("X",1:length(unique(FOO))))) %>%
spread(L, FOO) %>%
select(-1)
X1 X2 X3
1 A B C
2 A B <NA>
3 <NA> B C
4 A <NA> <NA>
5 <NA> <NA> C
It is also generally working e.g. df <- data.frame(FOO = c('A|B|C', 'A|B', 'B|C', 'A', 'C', 'B|D|F')). In addition you can set the levels e.g. B>C>A by yourself using levels = c("B", "C", "A") in the factor function in the mutate step.

'Proper' way to do row-wise replacement

I have a data frame which looks something like:
dataDemo <- data.frame(POS = 1:4 , REF = c("A" , "T" , "G" , "C") ,
ind1 = c("A" , "." , "G" , "C") , ind2 = c("A" , "C" , "C" , "."),
stringsAsFactors=FALSE)
dataDemo
POS REF ind1 ind2
1 1 A A A
2 2 T . C
3 3 G G C
4 4 C C .
and I'd like to replace all the "."s with the REF value for that row. Here is how I did it:
for(i in seq_along(dataDemo$REF)){
dataDemo[i , ][dataDemo[i , ] == '.'] <- dataDemo$REF[i]
}
I'd like to know if there's a more 'proper' or idiomatic way of doing this in R. I generally try to use *apply whenever possible and this seems like something that could easily be adapted to that approach and made more readable (and run faster), but despite throwing a good bit of time at it I haven't made much progress.
In dplyr,
library(dplyr)
dataDemo %>% mutate_each(funs(ifelse(. == '.', REF, as.character(.))), -POS)
# POS REF ind1 ind2
# 1 1 A A A
# 2 2 T T C
# 3 3 G G C
# 4 4 C C C
Here's another base R alternative, where we use the row numbers of the "." occurrences to replace them by the appropriate REF values.
# Get row numbers
rownrs <- which(dataDemo==".", arr.ind = TRUE)[,1]
# Replace values
dataDemo[dataDemo=="."] <- dataDemo$REF[rownrs]
# Result
dataDemo
# POS REF ind1 ind2
#1 1 A A A
#2 2 T T C
#3 3 G G C
#4 4 C C C
Here is an option using set from data.table, which should be fast.
library(data.table)
setDT(dataDemo)
nm1 <- paste0("ind", 1:2)
for(j in nm1){
i1 <- dataDemo[[j]]=="."
set(dataDemo, i = which(i1), j=j, value = dataDemo$REF[i1])
}
dataDemo
# POS REF ind1 ind2
#1: 1 A A A
#2: 2 T T C
#3: 3 G G C
#4: 4 C C C
EDIT: Based on #alexis_laz's comments
Or using dplyr
library(dplyr)
dataDemo %>%
mutate_each(funs(ifelse(.==".", REF,.)), ind1:ind2)
# POS REF ind1 ind2
#1 1 A A A
#2 2 T T C
#3 3 G G C
#4 4 C C C
Or we can use base R methods to do this in a single line.
dataDemo[nm1] <- lapply(dataDemo[nm1], function(x) ifelse(x==".", dataDemo$REF, x))

R: reshape data frame when one column has unequal number of entries

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it when I loop row by row. But is it possible to do without looping?
Thank you!
Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f
> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f
there you go, should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]

Group columns with the same name in R

If I have a data frame as below, with the first row the column names (row names not included here)
A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9
How would I be able create a new data frame such that:
a b c
1 2 3
4 6 7
5 NA 8
NA NA 9
Notice the NA. For empty values.
UPDATE
If d.frame is the dataframe in question:
new.df <- data.frame();
firstrow <- d.frame[,1]
names <- unique(firstrow)
for (n in names) {
#cbind.fill is part of a package plyr
new.df <- cbind.fill(new.df, frame[3,which(firstrow == n)])
}
colnames(new.df) <- names;
I think that works well. But it isn't efficient and relies on a third party package. Any suggestions?
Here is another solution, based on function cbind.fill from cbind a df with an empty df (cbind.fill?)
cbind.fill<-function(...){
nm <- list(...)
nm<-lapply(nm, as.matrix)
n <- max(sapply(nm, nrow))
do.call(cbind, lapply(nm, function (x)
rbind(x, matrix(, n-nrow(x), ncol(x)))))
}
df <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T, as.is=T)
df <- as.matrix(df)
do.call(cbind.fill, split(df[2,], df[1,]))
And another one solution
df <- as.matrix(df)
lst <- split(df[2,], df[1,])
m <- max(sapply(lst, length))
result <- sapply(lst, function(x) {length(x) <- m; x})
Couldn't find a simple solution for this, so here's one option using base R as you requested in comments. This solution will work no matter how many columns you have in the original data
temp <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T) # your data
temp <- data.frame(t(temp))
lengths <- table(temp[, 1])
maxval <- max(lengths)
data.frame(do.call(cbind, lapply(levels(temp[, 1]), function(x) c(x, temp[temp[, 1] == x, 2], rep(NA, maxval - lengths[x])))))
## X1 X2 X3
## 1 a b c
## 2 1 2 3
## 3 4 6 7
## 4 5 <NA> 8
## 5 <NA> <NA> 9
I would transpose the original two-row data.frame, create a "time" variable, use reshape to reorganize the data, and transpose the result.
Like this:
x <- t(mydf)
y <- data.frame(cbind(x, ave(x[, 1], x[, 1], FUN = seq_along)))
t(reshape(y, direction = "wide", idvar = "X1", timevar = "X3"))
# A B C
# X1 "a" "b" "c"
# X2.1 "1" "2" "3"
# X2.2 "4" "6" "7"
# X2.3 "5" NA "8"
# X2.4 NA NA "9"

How to reorder data.table columns (without copying)

I'd like to reorder columns in my data.table x, given a character vector of column names, neworder:
library(data.table)
x <- data.table(a = 1:3, b = 3:1, c = runif(3))
neworder <- c("c", "b", "a")
Obviously I could do:
x[ , neworder, with = FALSE]
# or
x[ , ..neworder]
# c b a
# 1: 0.8476623 3 1
# 2: 0.4787768 2 2
# 3: 0.3570803 1 3
but that would require copying the entire dataset again. Is there another way to do this?
Use setcolorder():
library(data.table)
x <- data.table(a = 1:3, b = 3:1, c = runif(3))
x
# a b c
# [1,] 1 3 0.2880365
# [2,] 2 2 0.7785115
# [3,] 3 1 0.3297416
setcolorder(x, c("c", "b", "a"))
x
# c b a
# [1,] 0.2880365 3 1
# [2,] 0.7785115 2 2
# [3,] 0.3297416 1 3
From ?setcolorder:
In data.table parlance, all set* functions change their input by reference. That is, no copy is made at all, other than temporary working memory, which is as large as one column.
so should be pretty efficient. See ?setcolorder for details.
One may find it easier to use the above solution, but instead sort by column number. For example:
library(data.table)
> x <- data.table(a = 1:3, b = 3:1, c = runif(3))
> x
a b c
[1,] 1 3 0.2880365
[2,] 2 2 0.7785115
[3,] 3 1 0.3297416
> setcolorder(x, c(3,2,1))
> x
c b a
[1,] 0.2880365 3 1
[2,] 0.7785115 2 2
[3,] 0.3297416 1 3

Resources