split and table

split and table - r

I have a data frame like this:
GN SN
a b
a b
a c
d e
d f
d e
I would like the following output:
GN: a SN: 2 b 1 c
GN d SN: 2 e 1 f
In other words I would like to have a sort of table() of the data.frame on the column S.N. First of all I splitted the data.frame according to $GN, so I have blocks. At this point I' m not able to have the counting of the elements on column SN according to the split I've done. Is the "apply" function a way to do this? And how can i save a general output belonging from split function?
Thanks in advance

With your data:
df <- data.frame(GN = rep(c("a","b"), each = 3),
SN = c(rep("b", 2), "c", "e", "f", "e"))
We could do:
> lapply(with(df, split(SN, GN)), table)
$a
b c e f
2 1 0 0
$b
b c e f
0 0 2 1
But if you don't want all the levels (the 0 entries) then we need to drop the empty levels:
> lapply(with(df, split(SN, GN)), function(x) table(droplevels(x)))
$a
b c
2 1
$b
e f
2 1
Writing out the individual tables to a file
This isn't perfect but at least you can work with it
## save tables
tmp <- lapply(with(df, split(SN, GN)), function(x) table(droplevels(x)))
## function to write output to file `fname`
foo <- function(x, fname) {
cat(paste(names(x), collapse = " "), "\n", file = fname, append = TRUE)
cat(paste(x, collapse = " "), "\n", file = fname, append = TRUE)
invisible()
}
fname <- "foo.txt"
file.create(fname) # create file fname
lapply(tmp, foo, fname = fname) # run our function to write to fname
That gives:
R> readLines(fname)
[1] "b c " "2 1 " "e f " "2 1 "
or from the OS:
$ cat foo.txt
b c
2 1
e f
2 1

Related

How to remove duplicated concatenated string in R

I have the following dataset
path value
1 b,b,a,c 3
2 c,b 2
3 a 10
4 b,c,a,b 0
5 e,f 0
6 a,f 1
df
df <- data.frame (path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"), value = c(3,2,10,0,0,1))
and I wish to remove duplicated in column path. when I use this code the format of data changes:
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) unique(x))
and it gives me data like a dataframe
path value
1 c("b", "a", "c") 3
2 c( "c", "b ") 2
...
However, I wish to have data like that:
path value
1 b, a, c 3
2 c, b 2
3 a 10
4 b, c, a 0
5 e, f 0
6 a, f 1

replace unique(x) with paste(unique(x), collapse = ', '), or toString(unique(x)) as Frank suggested.
df <- data.frame (
path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"),
value = c(3,2,10,0,0,1))
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) paste(unique(x), collapse = ', '))
# or
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) toString(unique(x)))
df
# path value
# 1 b, a, c 3
# 2 c, b 2
# 3 a 10
# 4 b, c, a 0
# 5 e, f 0
# 6 a, f 1

Generate list of second-degree neighbors using lapply and igraph

I got some excellent advice here on how to lookup neighbors for a list of network nodes. See: lapply function to look up neighbors in igraph (when not all nodes are found)
Now I need to do the same thing with second-degree neighbors. However, substituting either ego or neighborhood function into this loop produces an error.
edgelist <- read.table(text = "
A B
B C
C D
D E
C F
F G")
testlist <- read.table(text = "
A
H
C
D
J")
testlist2 <- read.table(text = "
A
C
B
D
E")
library(igraph)
graph <- graph.data.frame(edgelist)
str(graph)
get_neighbors <- function(graph, n) {
do.call(rbind, lapply(n, function(x) {
if (x %in% V(graph)$name) {
nb <- neighborhood(graph,2, x) ##HERE##
if (length(nb) > 0) {
data.frame(lookupnode=x,
neighbor=nb$name, # h/t #MrFlick for this shortcut
stringsAsFactors=FALSE)
} else {
data.frame(lookupnode=x, neighbor=NA, stringsAsFactors=FALSE)
}
} else {
data.frame(lookupnode=x, neighbor=NA, stringsAsFactors=FALSE)
}
}))
}
A=get_neighbors(graph, as.character(testlist$V1))
Error in data.frame(lookupnode = x, neighbor = nb$name, stringsAsFactors = FALSE) : arguments imply differing number of rows: 1, 0
I gather the issue is that ego and neighborhood can't be directly coerced into a data frame. I can use unlist and then put in a data frame, but the values I want end up as row.names not values that I can put into my output.
How can I create an output of second-degree neighbors?

Changed
neighbor=nb$name, # h/t #MrFlick for this shortcut
to
neighbor=names(unlist(nb)), # h/t #MrFlick for this shortcut
and it is working for me now.
> A
lookupnode neighbor
1 A A
2 A B
3 A C
4 H <NA>
5 C C
6 C B
7 C D
8 C F
9 C A
10 C E
11 C G
12 D D
13 D C
14 D E
15 D B
16 D F
17 J <NA>
>

R: reshape data frame when one column has unequal number of entries

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it when I loop row by row. But is it possible to do without looping?
Thank you!

Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f

> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f

there you go, should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]

How can I build an inverted index from a data frame in R?

Say I have a data frame in R : data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
x y
1 1 a b c
2 2 b
3 3 a c
4 4 c
Now I want to build a new data frame, an inverted index which is quite common in IR or recommendation systems, from it:
y x
a 1 3
b 1 2
c 1 3 4
How can I do this in an efficient way?

conv <- function(x) {
l <- function(z) {
paste(x$x[grep(z, x$y)], collapse=' ')
}
lv <- Vectorize(l)
alphabet <- unique(unlist(strsplit(as.character(x$y), ' '))) # hard-coding this might be preferred for some uses.
y <- lv(alphabet)
data.frame(y=names(y), x=y)
}
x <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
> conv(x)
## y x
## a a 1 3
## b b 1 2
## c c 1 3 4

An attempt, after converting y to characters:
test <- data.frame(x=1:4,y=c("a b c","b","a c","c"),stringsAsFactors=FALSE)
result <- strsplit(test$y," ")
result2 <- sapply(unique(unlist(result)),function(y) sapply(result,function(x) y %in% x))
result3 <- apply(result2,2,function(x) test$x[which(x)])
final <- data.frame(x=names(result3),y=sapply(result3,paste,collapse=" "))
> final
x y
a a 1 3
b b 1 2
c c 1 3 4

quick and dirty
original.df <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
original.df$y <- as.character(original.df$y)
y.split <- strsplit(original.df$y, " ")
y.unlisted <- unique(unlist(y.split))
new.df <-
sapply(y.unlisted, function(element)
paste(which(sapply(y.split, function(y.row) element %in% y.row)), collapse=" " ))
as.data.frame(new.df)
> new.df
a 1 3
b 1 2
c 1 3 4

order while splitting (eg. TA should be split to two column "A" in first "T" second) in r

I have following issue, I could solve:
set.seed (1234)
mydf <- data.frame (var1a = sample (c("TA", "AA", "TT"), 5, replace = TRUE),
varb2 = sample (c("GA", "AA", "GG"), 5, replace = TRUE),
varAB = sample (c("AC", "AA", "CC"), 5, replace = TRUE)
)
mydf
var1a varb2 varAB
1 TA AA CC
2 AA GA AA
3 AA GA AC
4 AA AA CC
5 TT AA AC
I want to split two letter into different column, and then order alphabetically.
Edit: Ordering can be done before split, for example var1a value "TA" var1a should be "AT" or after split so that var1aa should be "A", and var1ab be "T" (instead of "T", "A").
so sorting is within each cell.
split_col <- function(.col, data){
.x <- colsplit( data[[.col]], names = paste0(.col, letters[1:2]))
}
split each column and combine
require(reshape)
splitdf <- do.call(cbind, lapply(names(mydf), split_col, data = mydf))
var1aa var1ab varb2a varb2b varABa varABb
1 T A A A C C
2 A A G A A A
3 A A G A A C
4 A A A A C C
5 T T A A A C
But the unsolved part is I want to order the pair of columns such that columnname"a" and columname"b" are ordered, alphabetically. Thus expected output:
var1aa var1ab varb2a varb2b varABa varABb
1 A T A A C C
2 A A A G A A
3 A A A G A C
4 A A A A C C
5 T T A A A C
Can how can order (short with each pair of variable) ?

mylist <-as.list(mydf)
splits <- lapply(mylist, reshape::colsplit, names=c("a", "b"))
rowsort <- lapply(splits, function(x) t(apply(x, 1, sort)))
comb <- do.call(data.frame, rowsort)
comb
var1a.1 var1a.2 varb2.1 varb2.2 varAB.a varAB.b
1 A T A A C C
2 A A A G A A
3 A A A G A C
4 A A A A C C
5 T T A A A C
EDIT:
If names are important, you can replace them:
replaceNums <- function(x){
.which <- regmatches(x, regexpr("[[:alnum:]]*(?=.)", x, perl=TRUE))
stopifnot(length(x) %% 2 == 0) #checkstep
paste0(.which, c("a", "b"))
}
names(comb) <- replaceNums(names(comb))
comb
var1aa var1ab varb2a varb2b varABa varABb
1 A T A A C C
2 A A A G A A
3 A A A G A C
4 A A A A C C
5 T T A A A C

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

split and table - r

Related

How to remove duplicated concatenated string in R

Generate list of second-degree neighbors using lapply and igraph

R: reshape data frame when one column has unequal number of entries

How can I build an inverted index from a data frame in R?

order while splitting (eg. TA should be split to two column "A" in first "T" second) in r

Categories

Resources