R: reshape data frame when one column has unequal number of entries

R: reshape data frame when one column has unequal number of entries - r

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it when I loop row by row. But is it possible to do without looping?
Thank you!

Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f

> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f

there you go, should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]

Related

How to remove duplicated concatenated string in R

I have the following dataset
path value
1 b,b,a,c 3
2 c,b 2
3 a 10
4 b,c,a,b 0
5 e,f 0
6 a,f 1
df
df <- data.frame (path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"), value = c(3,2,10,0,0,1))
and I wish to remove duplicated in column path. when I use this code the format of data changes:
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) unique(x))
and it gives me data like a dataframe
path value
1 c("b", "a", "c") 3
2 c( "c", "b ") 2
...
However, I wish to have data like that:
path value
1 b, a, c 3
2 c, b 2
3 a 10
4 b, c, a 0
5 e, f 0
6 a, f 1

replace unique(x) with paste(unique(x), collapse = ', '), or toString(unique(x)) as Frank suggested.
df <- data.frame (
path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"),
value = c(3,2,10,0,0,1))
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) paste(unique(x), collapse = ', '))
# or
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) toString(unique(x)))
df
# path value
# 1 b, a, c 3
# 2 c, b 2
# 3 a 10
# 4 b, c, a 0
# 5 e, f 0
# 6 a, f 1

'Proper' way to do row-wise replacement

I have a data frame which looks something like:
dataDemo <- data.frame(POS = 1:4 , REF = c("A" , "T" , "G" , "C") ,
ind1 = c("A" , "." , "G" , "C") , ind2 = c("A" , "C" , "C" , "."),
stringsAsFactors=FALSE)
dataDemo
POS REF ind1 ind2
1 1 A A A
2 2 T . C
3 3 G G C
4 4 C C .
and I'd like to replace all the "."s with the REF value for that row. Here is how I did it:
for(i in seq_along(dataDemo$REF)){
dataDemo[i , ][dataDemo[i , ] == '.'] <- dataDemo$REF[i]
}
I'd like to know if there's a more 'proper' or idiomatic way of doing this in R. I generally try to use *apply whenever possible and this seems like something that could easily be adapted to that approach and made more readable (and run faster), but despite throwing a good bit of time at it I haven't made much progress.

In dplyr,
library(dplyr)
dataDemo %>% mutate_each(funs(ifelse(. == '.', REF, as.character(.))), -POS)
# POS REF ind1 ind2
# 1 1 A A A
# 2 2 T T C
# 3 3 G G C
# 4 4 C C C

Here's another base R alternative, where we use the row numbers of the "." occurrences to replace them by the appropriate REF values.
# Get row numbers
rownrs <- which(dataDemo==".", arr.ind = TRUE)[,1]
# Replace values
dataDemo[dataDemo=="."] <- dataDemo$REF[rownrs]
# Result
dataDemo
# POS REF ind1 ind2
#1 1 A A A
#2 2 T T C
#3 3 G G C
#4 4 C C C

Here is an option using set from data.table, which should be fast.
library(data.table)
setDT(dataDemo)
nm1 <- paste0("ind", 1:2)
for(j in nm1){
i1 <- dataDemo[[j]]=="."
set(dataDemo, i = which(i1), j=j, value = dataDemo$REF[i1])
}
dataDemo
# POS REF ind1 ind2
#1: 1 A A A
#2: 2 T T C
#3: 3 G G C
#4: 4 C C C
EDIT: Based on #alexis_laz's comments
Or using dplyr
library(dplyr)
dataDemo %>%
mutate_each(funs(ifelse(.==".", REF,.)), ind1:ind2)
# POS REF ind1 ind2
#1 1 A A A
#2 2 T T C
#3 3 G G C
#4 4 C C C
Or we can use base R methods to do this in a single line.
dataDemo[nm1] <- lapply(dataDemo[nm1], function(x) ifelse(x==".", dataDemo$REF, x))

build word co-occurence edge list in R

I have a chunk of sentences and I want to build the undirected edge list of word co-occurrence and see the frequency of every edge. I took a look at the tm package but didn't find similar functions. Is there some package/script I can use? Thanks a lot!
Note: A word doesn't co-occur with itself. A word which appears twice or more co-occurs with other words for only once in the same sentence.
DF:
sentence_id text
1 a b c d e
2 a b b e
3 b c d
4 a e
5 a
6 a a a
OUTPUT
word1 word2 freq
a b 2
a c 1
a d 1
a e 3
b c 2
b d 2
b e 2
c d 2
c e 1
d e 1

It's convoluted so there's got to be a better approach:
dat <- read.csv(text="sentence_id, text
1, a b c d e
2, a b b e
3, b c d
4, a e", header=TRUE)
library(qdapTools); library(tidyr)
x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)
out <- x %*% t(x)
out[upper.tri(out, diag=TRUE)] <- NA
out2 <- matrix2df(out, "word1") %>%
gather(word2, freq, -word1) %>%
na.omit()
rownames(out2) <- NULL
out2
## word1 word2 freq
## 1 b a 2
## 2 c a 1
## 3 d a 1
## 4 e a 3
## 5 c b 2
## 6 d b 2
## 7 e b 2
## 8 d c 2
## 9 e c 1
## 10 e d 1
Base only solution
out <- lapply(with(dat, split(text, sentence_id)), function(x) {
strsplit(gsub("^\\s+|\\s+$", "", as.character(x)), "\\s+")[[1]]
})
nms <- sort(unique(unlist(out)))
out2 <- lapply(out, function(x) {
as.data.frame(table(x), stringsAsFactors = FALSE)
})
dat2 <- data.frame(x = nms)
for(i in seq_along(out2)) {
m <- merge(dat2, out2[[i]], all.x = TRUE)
names(m)[i + 1] <- dat[["sentence_id"]][i]
dat2 <- m
}
dat2[is.na(dat2)] <- 0
x <- as.matrix(dat2[, -1]) > 0
out3 <- x %*% t(x)
out3[upper.tri(out3, diag=TRUE)] <- NA
dimnames(out3) <- list(dat2[[1]], dat2[[1]])
out4 <- na.omit(data.frame(
word1 = rep(rownames(out3), ncol(out3)),
word2 = rep(colnames(out3), each = nrow(out3)),
freq = c(unlist(out3)),
stringsAsFactors = FALSE)
)
row.names(out4) <- NULL
out4

This is very closely related to #TylerRinker's answer, but using different tools.
library(splitstackshape)
library(reshape2)
temp <- crossprod(
as.matrix(
cSplit_e(d, "text", " ", type = "character",
fill = 0, drop = TRUE)[-1]))
temp[upper.tri(temp, diag = TRUE)] <- NA
melt(temp, na.rm = TRUE)
# Var1 Var2 value
# 2 text_b text_a 2
# 3 text_c text_a 1
# 4 text_d text_a 1
# 5 text_e text_a 3
# 8 text_c text_b 2
# 9 text_d text_b 2
# 10 text_e text_b 2
# 14 text_d text_c 2
# 15 text_e text_c 1
# 20 text_e text_d 1
The "text_" parts of "Var1" and "Var2" can be stripped easily with sub or gsub.

Here's a base R way:
d <- read.table(text='sentence_id text
1 "a b c d e"
2 "a b b e"
3 "b c d"
4 "a e"', header=TRUE, as.is=TRUE)
result.vec <- table(unlist(lapply(d$text, function(text) {
pairs <- combn(unique(scan(text=text, what='', sep=' ')), m=2)
interaction(pairs[1,], pairs[2,])
})))
# a.b b.b c.b d.b a.c b.c c.c d.c a.d b.d c.d d.d a.e b.e c.e d.e
# 2 0 0 0 1 2 0 0 1 2 2 0 3 2 1 1
result <- subset(data.frame(do.call(rbind, strsplit(names(result.vec), '\\.')), freq=as.vector(result.vec)), freq > 0)
with(result, result[order(X1, X2),])
# X1 X2 freq
# 1 a b 2
# 5 a c 1
# 9 a d 1
# 13 a e 3
# 6 b c 2
# 10 b d 2
# 14 b e 2
# 11 c d 2
# 15 c e 1
# 16 d e 1

Group columns with the same name in R

If I have a data frame as below, with the first row the column names (row names not included here)
A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9
How would I be able create a new data frame such that:
a b c
1 2 3
4 6 7
5 NA 8
NA NA 9
Notice the NA. For empty values.
UPDATE
If d.frame is the dataframe in question:
new.df <- data.frame();
firstrow <- d.frame[,1]
names <- unique(firstrow)
for (n in names) {
#cbind.fill is part of a package plyr
new.df <- cbind.fill(new.df, frame[3,which(firstrow == n)])
}
colnames(new.df) <- names;
I think that works well. But it isn't efficient and relies on a third party package. Any suggestions?

Here is another solution, based on function cbind.fill from cbind a df with an empty df (cbind.fill?)
cbind.fill<-function(...){
nm <- list(...)
nm<-lapply(nm, as.matrix)
n <- max(sapply(nm, nrow))
do.call(cbind, lapply(nm, function (x)
rbind(x, matrix(, n-nrow(x), ncol(x)))))
}
df <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T, as.is=T)
df <- as.matrix(df)
do.call(cbind.fill, split(df[2,], df[1,]))
And another one solution
df <- as.matrix(df)
lst <- split(df[2,], df[1,])
m <- max(sapply(lst, length))
result <- sapply(lst, function(x) {length(x) <- m; x})

Couldn't find a simple solution for this, so here's one option using base R as you requested in comments. This solution will work no matter how many columns you have in the original data
temp <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T) # your data
temp <- data.frame(t(temp))
lengths <- table(temp[, 1])
maxval <- max(lengths)
data.frame(do.call(cbind, lapply(levels(temp[, 1]), function(x) c(x, temp[temp[, 1] == x, 2], rep(NA, maxval - lengths[x])))))
## X1 X2 X3
## 1 a b c
## 2 1 2 3
## 3 4 6 7
## 4 5 <NA> 8
## 5 <NA> <NA> 9

I would transpose the original two-row data.frame, create a "time" variable, use reshape to reorganize the data, and transpose the result.
Like this:
x <- t(mydf)
y <- data.frame(cbind(x, ave(x[, 1], x[, 1], FUN = seq_along)))
t(reshape(y, direction = "wide", idvar = "X1", timevar = "X3"))
# A B C
# X1 "a" "b" "c"
# X2.1 "1" "2" "3"
# X2.2 "4" "6" "7"
# X2.3 "5" NA "8"
# X2.4 NA NA "9"

How can I build an inverted index from a data frame in R?

Say I have a data frame in R : data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
x y
1 1 a b c
2 2 b
3 3 a c
4 4 c
Now I want to build a new data frame, an inverted index which is quite common in IR or recommendation systems, from it:
y x
a 1 3
b 1 2
c 1 3 4
How can I do this in an efficient way?

conv <- function(x) {
l <- function(z) {
paste(x$x[grep(z, x$y)], collapse=' ')
}
lv <- Vectorize(l)
alphabet <- unique(unlist(strsplit(as.character(x$y), ' '))) # hard-coding this might be preferred for some uses.
y <- lv(alphabet)
data.frame(y=names(y), x=y)
}
x <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
> conv(x)
## y x
## a a 1 3
## b b 1 2
## c c 1 3 4

An attempt, after converting y to characters:
test <- data.frame(x=1:4,y=c("a b c","b","a c","c"),stringsAsFactors=FALSE)
result <- strsplit(test$y," ")
result2 <- sapply(unique(unlist(result)),function(y) sapply(result,function(x) y %in% x))
result3 <- apply(result2,2,function(x) test$x[which(x)])
final <- data.frame(x=names(result3),y=sapply(result3,paste,collapse=" "))
> final
x y
a a 1 3
b b 1 2
c c 1 3 4

quick and dirty
original.df <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
original.df$y <- as.character(original.df$y)
y.split <- strsplit(original.df$y, " ")
y.unlisted <- unique(unlist(y.split))
new.df <-
sapply(y.unlisted, function(element)
paste(which(sapply(y.split, function(y.row) element %in% y.row)), collapse=" " ))
as.data.frame(new.df)
> new.df
a 1 3
b 1 2
c 1 3 4

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

R: reshape data frame when one column has unequal number of entries - r

Have you checked out require(splitstackshape)? > cSplit(x, "b", ",", direction = "long") a b 1: 1 a 2: 1 b 3: 1 c 4: 2 d 5: 2 e 6: 3 f

> s <- strsplit(as.character(x$b), ',') > data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s)) value b 1 1 a 2 1 b 3 1 c 4 2 d 5 2 e 6 3 f

there you go, should be very fast: library(data.table) x <- data.table(x) x[ ,strsplit(b, ","), by = a]

Related

How to remove duplicated concatenated string in R

'Proper' way to do row-wise replacement

build word co-occurence edge list in R

Group columns with the same name in R

How can I build an inverted index from a data frame in R?

Categories

Resources