How to remove duplicated concatenated string in R - r

I have the following dataset
path value
1 b,b,a,c 3
2 c,b 2
3 a 10
4 b,c,a,b 0
5 e,f 0
6 a,f 1
df
df <- data.frame (path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"), value = c(3,2,10,0,0,1))
and I wish to remove duplicated in column path. when I use this code the format of data changes:
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) unique(x))
and it gives me data like a dataframe
path value
1 c("b", "a", "c") 3
2 c( "c", "b ") 2
...
However, I wish to have data like that:
path value
1 b, a, c 3
2 c, b 2
3 a 10
4 b, c, a 0
5 e, f 0
6 a, f 1

replace unique(x) with paste(unique(x), collapse = ', '), or toString(unique(x)) as Frank suggested.
df <- data.frame (
path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"),
value = c(3,2,10,0,0,1))
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) paste(unique(x), collapse = ', '))
# or
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) toString(unique(x)))
df
# path value
# 1 b, a, c 3
# 2 c, b 2
# 3 a 10
# 4 b, c, a 0
# 5 e, f 0
# 6 a, f 1

Related

Find immediate neighbors by group using data table or igraph

I have a data.table:
groups <- data.table(group = c("A", "B", "C", "D", "E", "F", "G"),
code_1 = c(2,2,2,7,8,NA,5),
code_2 = c(NA,3,NA,3,NA,NA,2),
code_3 = c(4,1,1,4,4,1,8))
group code_1 code_2 code_3
A 2 NA 4
B 2 3 1
C 2 NA 1
D 7 3 4
E 8 NA 4
F NA NA 1
G 5 2 8
What I would like to achieve, is for each group to find the immediate neighbors based on the available codes. For example: Group A has immediate neighbors groups B, C due to code_1 (code_1 is equal to 2 in all groups) and has immediate neighbor groups D,E due to code_3 (code_3 is equal to 4 in all those groups).
What I tried is for each code, subsetting the first column (group) based on the matches as follows:
groups$code_1_match = list()
for (row in 1:nrow(groups)){
set(groups, i=row, j="code_1_match", list(groups$group[groups$code_1[row] == groups$code_1]))
}
group code_1 code_2 code_3 code_1_match
A 2 NA 4 A,B,C,NA
B 2 3 1 A,B,C,NA
C 2 NA 1 A,B,C,NA
D 7 3 4 D,NA
E 8 NA 4 E,NA
F NA NA 1 NA,NA,NA,NA,NA,NA,...
G 5 2 8 NA,G
This "kinda" works but I would assume there is a more data table kind of way of doing this. I tried
groups[, code_1_match_2 := list(group[code_1 == groups$code_1])]
But this doesn't work.
Am I missing some obvious data table trick to deal with it?
My ideal case result would look like this (which currently would require using my method for all 3 columns and then concatenating the results):
group code_1 code_2 code_3 Immediate neighbors
A 2 NA 4 B,C,D,E
B 2 3 1 A,C,D,F
C 2 NA 1 A,B,F
D 7 3 4 B,A
E 8 NA 4 A,D
F NA NA 1 B,C
G 5 2 8
Using igraph, get 2nd degree neighbours, drop numeric nodes, paste remaining nodes.
library(data.table)
library(igraph)
# reshape wide-to-long
x <- melt(groups, id.vars = "group")[!is.na(value)]
# convert to graph
g <- graph_from_data_frame(x[, .(from = group, to = paste0(variable, "_", value))])
# get 2nd degree neighbours
x1 <- ego(g, 2, nodes = groups$group)
# prettify the result
groups$res <- sapply(seq_along(x1), function(i) toString(intersect(names(x1[[ i ]]),
groups$group[ -i ])))
# group code_1 code_2 code_3 res
# 1: A 2 NA 4 B, C, D, E
# 2: B 2 3 1 A, C, D, F
# 3: C 2 NA 1 A, B, F
# 4: D 7 3 4 B, A, E
# 5: E 8 NA 4 A, D
# 6: F NA NA 1 B, C
# 7: G 5 2 8
More info
This is how our data looks like before converting to igraph object. We want to ensure code1 with value 2 is different from code2 with value 2, etc.
x[, .(from = group, to = paste0(variable, "_", value))]
# from to
# 1: A code_1_2
# 2: B code_1_2
# 3: C code_1_2
# 4: D code_1_7
# 5: E code_1_8
# 6: G code_1_5
# 7: B code_2_3
# 8: D code_2_3
# 9: G code_2_2
# 10: A code_3_4
# 11: B code_3_1
# 12: C code_3_1
# 13: D code_3_4
# 14: E code_3_4
# 15: F code_3_1
# 16: G code_3_8
Here is how our network looks like:
Note that A..G nodes are always connected through code_x_y.
So we need to get the 2nd degree, ego(..., order = 2) gives us neighbours up to including 2nd degree neighbours, and returns a list object.
To get the names:
lapply(x1, names)
# [[1]]
# [1] "A" "code_1_2" "code_3_4" "B" "C" "D" "E"
#
# [[2]]
# [1] "B" "code_1_2" "code_2_3" "code_3_1" "A" "C" "D" "F"
#
# [[3]]
# [1] "C" "code_1_2" "code_3_1" "A" "B" "F"
#
# [[4]]
# [1] "D" "code_1_7" "code_2_3" "code_3_4" "B" "A" "E"
#
# [[5]]
# [1] "E" "code_1_8" "code_3_4" "A" "D"
#
# [[6]]
# [1] "F" "code_3_1" "B" "C"
#
# [[7]]
# [1] "G" "code_1_5" "code_2_2" "code_3_8"
To prettify the result, we need to remove code_x_y nodes and the origin node (1st node)
sapply(seq_along(x1), function(i) toString(intersect(names(x1[[ i ]]), groups$group[ -i ])))
#[1] "B, C, D, E" "A, C, D, F" "A, B, F" "B, A, E" "A, D" "B, C" ""
There is probably some more practical way of achieving this but you could do something like this, using melts and joins:
mgrp <- melt(groups, id.vars = "group")[!is.na(value)]
setkey(mgrp, variable, value)
for (i in seq_along(groups$group)) {
let = groups$group[i]
set(
groups,
i = i,
j = "inei",
value = list(mgrp[mgrp[group == let], setdiff(unique(group), let)])
)
}
groups
# group code_1 code_2 code_3 inei
# 1: A 2 NA 4 B,C,D,E
# 2: B 2 3 1 A,C,D,F
# 3: C 2 NA 1 A,B,F
# 4: D 7 3 4 B,A,E
# 5: E 8 NA 4 A,D
# 6: F NA NA 1 B,C
# 7: G 5 2 8
As mentioned by zx8754, using data.table::melt with combn and then igraph::as_adjacency_matrix
library(data.table)
df <- melt(groups, id.vars="group", na.rm=TRUE)[,
if (.N > 1L) transpose(combn(group, 2L, simplify=FALSE)), value][, (1) := NULL]
library(igraph)
as_adjacency_matrix(graph_from_data_frame(df, FALSE))
output:
7 x 7 sparse Matrix of class "dgCMatrix"
A B C E D G F
A . 1 1 1 1 1 .
B 1 . 2 . 1 1 1
C 1 2 . . . 1 1
E 1 . . . 1 1 .
D 1 1 . 1 . . .
G 1 1 1 1 . . .
F . 1 1 . . . .
or without using igraph
x <- df[, unique(c(V1, V2))]
df <- rbindlist(list(df, data.table(x, x)))
tab <- table(df) #or xtabs(~ V1 + V2, data=df)
ans <- t(tab) + tab
diag(ans) <- 0L
ans
output:
V1
V2 A B C D E F G
A 0 1 1 1 1 0 1
B 1 0 2 1 0 1 1
C 1 2 0 0 0 1 1
D 1 1 0 0 1 0 0
E 1 0 0 1 0 0 1
F 0 1 1 0 0 0 0
G 1 1 1 0 1 0 0
This is inspired by #sindri_baldur's melt. This solution:
Melts the groups
Performs a cartesian self-join.
Pastes together all the groups that matches.
Joins back to the original DT
library(data.table)
#> Warning: package 'data.table' was built under R version 3.6.2
groups <- data.table(group = c("A", "B", "C", "D", "E", "F", "G"), code_1 = c(2,2,2,7,8,NA,5), code_2 = c(NA,3,NA,3,NA,NA,2), code_3=c(4,1,1,4,4,1,8))
molten_grps = melt(groups, measure.vars = patterns("code"), na.rm = TRUE)
inei_dt = molten_grps[molten_grps,
on = .(variable, value),
allow.cartesian = TRUE
][,
.(inei = paste0(setdiff(i.group, .BY[[1L]]), collapse = ", ")),
by = group]
groups[inei_dt, on = .(group), inei := inei]
groups
#> group code_1 code_2 code_3 inei
#> <char> <num> <num> <num> <char>
#> 1: A 2 NA 4 B, C, D, E
#> 2: B 2 3 1 A, C, D, F
#> 3: C 2 NA 1 A, B, F
#> 4: D 7 3 4 B, A, E
#> 5: E 8 NA 4 A, D
#> 6: F NA NA 1 B, C
#> 7: G 5 2 8

R-Converting Incidence matrix(csv file) to edge list format

I am studying social network analysis and will be using Ucinet to draw network graphs. For this, I have to convert the csv file to an edge list format. Converting the adjacency matrix to the edge list was successful. However, it is difficult to convert an incidence matrix to the edge list format.
The csv file('some.csv') I have, with a incidence matrix like this:
A B C D
a 1 0 3 1
b 0 0 0 2
c 3 2 0 1
The code that converted the adjacency matrix to the edge list was as follows:
x<-read.csv("C:/.../something.csv", header=T, row.names=1)
net<-as.network(x, matrix.type='adjacency', ignore.eval=FALSE, names.eval='dd', loops=FALSE)
el<-edgelist(net, attrname='dd')
write.csv(el, file='C:/.../result.csv')
Now It only succeedded in loading the file. I tried to follow the above method, but I get an error.
y<-read.csv("C:/.../some.csv", header=T, row.names=1)
net2<-network(y, matrix.type='incidence', ignore.eval=FALSE, names.eval='co', loops=FALSE)
Error in network.incidence(x, g, ignore.eval, names.eval, na.rm, edge.check) :
Supplied incidence matrix has empty head/tail lists. (Did you get the directedness right?)
I want to see the result in this way:
a A 1
a C 3
a D 1
b D 2
c A 3
c B 2
c D 1
I tried to put the values as the error said, but I could not get the result i wanted.
Thank you for any assistance with this.
Here's your data:
inc_mat <- matrix(
c(1, 0, 3, 1,
0, 0, 0, 2,
3, 2, 0, 1),
nrow = 3, ncol = 4, byrow = TRUE
)
rownames(inc_mat) <- letters[1:3]
colnames(inc_mat) <- LETTERS[1:4]
inc_mat
#> A B C D
#> a 1 0 3 1
#> b 0 0 0 2
#> c 3 2 0 1
Here's a generalized function that does the trick:
as_edgelist.weighted_incidence_matrix <- function(x, drop_rownames = TRUE) {
melted <- do.call(cbind, lapply(list(row(x), col(x), x), as.vector)) # 3 col matrix of row index, col index, and `x`'s values
filtered <- melted[melted[, 3] != 0, ] # drop rows where column 3 is 0
# data frame where first 2 columns are...
df <- data.frame(mode1 = rownames(x)[filtered[, 1]], # `x`'s rownames, indexed by first column in `filtered``
mode2 = colnames(x)[filtered[, 2]], # `x`'s colnames, indexed by the second column in `filtered`
weight = filtered[, 3], # the third column in `filtered`
stringsAsFactors = FALSE)
out <- df[order(df$mode1), ] # sort by first column
if (!drop_rownames) {
return(out)
}
`rownames<-`(out, NULL)
}
Take it for a spin:
el <- as_edgelist.weighted_incidence_matrix(inc_mat)
el
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Here are the results you wanted:
control_df <- data.frame(
mode1 = c("a", "a", "a", "b", "c", "c", "c"),
mode2 = c("A", "C", "D", "D", "A", "B", "D"),
weight = c(1, 3, 1, 2, 3, 2, 1),
stringsAsFactors = FALSE
)
control_df
#> mode1 mode2 weight
#> 1 a A 1
#> 2 a C 3
#> 3 a D 1
#> 4 b D 2
#> 5 c A 3
#> 6 c B 2
#> 7 c D 1
Do they match?
identical(control_df, el)
#> [1] TRUE
This might not be the most efficient way, but it produces expected result:
y <- matrix( c(1,0,3,0,0,2,3,0,0,1,2,1), nrow=3)
colnames(y) <- c("e.A","e.B","e.C","e.D")
dt <- data.frame(rnames=c("a","b","c"))
dt <- cbind(dt, y)
# rnames e.A e.B e.C e.D
#1 a 1 0 3 1
#2 b 0 0 0 2
#3 c 3 2 0 1
# use reshape () function to convert dataframe into the long format
M <- reshape(dt, direction="long", idvar = "rnames", varying = c("e.A","e.B","e.C","e.D"))
M <- M[M$e >0,]
M
# rnames time e
# a.A a A 1
# c.A c A 3
# c.B c B 2
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.D c D 1
# If M needs to be sorted by the column rnames:
M[order(M$rnames), ]
# rnames time e
# a.A a A 1
# a.C a C 3
# a.D a D 1
# b.D b D 2
# c.A c A 3
# c.B c B 2
# c.D c D 1

In R: Split a character vector to find specific characters and return a data frame

I want to be able to extract specific characters from a character vector in a data frame and return a new data frame. The information I want to extract is auditors remark on a specific company's income and balance sheet. My problem is that the auditors remarks are stored in vectors containing the different remarks. For instance:
vec = c("A C G H D E"). Since "A" %in% vec won't return TRUE, I have to use strsplit to break up each character vector in the data frame, hence "A" %in% unlist(strsplit(dat[i, 2], " "). This returns TRUE.
Here is a MWE:
dat <- data.frame(orgnr = c(1, 2, 3, 4), rat = as.character(c("A B C")))
dat$rat <- as.character(dat$rat)
dat[2, 2] <- as.character(c("A F H L H"))
dat[3, 2] <- as.character(c("H X L O"))
dat[4, 2] <- as.character(c("X Y Z A B C"))
Now, to extract information about every single letter in the rat coloumn, I've tried several approaches, following similar problems such as Roland's answer to a similar question (How to split a character vector into data frame?)
DF <- data.frame(do.call(rbind, strsplit(dat$rat, " ", fixed = TRUE)))
DF
X1 X2 X3 X4 X5 X6
1 A B C A B C
2 A F H L H A
3 H X L O H X
4 X Y Z A B C
This returnsthe following error message: Warning message:
In (function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 2)
It would be a desirable approach since it's fast, but I can't use DF since it recycles.
Is there a way to insert NA instead of the recycling because of the different length of the vectors?
So far I've found a solution to the problem by using for-loops in combination with ifelse-statements. However, with 3 mill obs. this approach takes years!
dat$A <- 0
for(i in seq(1, nrow(dat), 1)) {
print(i)
dat[i, 3] <- ifelse("A" %in% unlist(strsplit(dat[i, 2], " ")), 1, 0)
}
dat$B <- 0
for(i in seq(1, nrow(dat), 1)) {
print(i)
dat[i, 4] <- ifelse("B" %in% unlist(strsplit(dat[i, 2], " ")), 1, 0)
}
This gives the results I want:
dat
orgnr rat A B
1 1 A B C 1 1
2 2 A F H L H 1 0
3 3 H X L O 0 0
4 4 X Y Z A B C 1 1
I've searched through most of the relevant questions I could find here on StackOverflow. This one is really close to my problem: How to convert a list consisting of vector of different lengths to a usable data frame in R?, but I don't know how to implement strsplit with that approach.
We can use for-loop with grepl to achieve this task. + 0 is to convert the column form TRUE or FALSE to 1 or 0
for (col in c("A", "B")){
dat[[col]] <- grepl(col, dat$rat) + 0
}
dat
# orgnr rat A B
# 1 1 A B C 1 1
# 2 2 A F H L H 1 0
# 3 3 H X L O 0 0
# 4 4 X Y Z A B C 1 1
If performance is an issue, try this data.table approach.
library(data.table)
# Convert to data.table
setDT(dat)
# Create a helper function
dummy_fun <- function(col, vec){
grepl(col, vec) + 0
}
# Apply the function to A and B
dat[, c("A", "B") := lapply(c("A", "B"), dummy_fun, vec = rat)]
dat
# orgnr rat A B
# 1: 1 A B C 1 1
# 2: 2 A F H L H 1 0
# 3: 3 H X L O 0 0
# 4: 4 X Y Z A B C 1 1
using Base R:
a=strsplit(dat$rat," ")
b=data.frame(x=rep(dat$orgnr,lengths(a)),y=unlist(a),z=1)
cbind(dat,as.data.frame.matrix(xtabs(z~x+y,b)))
orgnr rat A B C F H L O X Y Z
1 1 A B C 1 1 1 0 0 0 0 0 0 0
2 2 A F H L H 1 0 0 1 2 1 0 0 0 0
3 3 H X L O 0 0 0 0 1 1 1 1 0 0
4 4 X Y Z A B C 1 1 1 0 0 0 0 1 1 1
From here you can Just call those columns that you want:
d=as.data.frame.matrix(xtabs(z~x+y,b))
cbind(dat,d[c("A","B")])
orgnr rat A B
1 1 A B C 1 1
2 2 A F H L H 1 0
3 3 H X L O 0 0
4 4 X Y Z A B C 1 1

R: reshape data frame when one column has unequal number of entries

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it when I loop row by row. But is it possible to do without looping?
Thank you!
Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f
> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f
there you go, should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]

How can I build an inverted index from a data frame in R?

Say I have a data frame in R : data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
x y
1 1 a b c
2 2 b
3 3 a c
4 4 c
Now I want to build a new data frame, an inverted index which is quite common in IR or recommendation systems, from it:
y x
a 1 3
b 1 2
c 1 3 4
How can I do this in an efficient way?
conv <- function(x) {
l <- function(z) {
paste(x$x[grep(z, x$y)], collapse=' ')
}
lv <- Vectorize(l)
alphabet <- unique(unlist(strsplit(as.character(x$y), ' '))) # hard-coding this might be preferred for some uses.
y <- lv(alphabet)
data.frame(y=names(y), x=y)
}
x <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
> conv(x)
## y x
## a a 1 3
## b b 1 2
## c c 1 3 4
An attempt, after converting y to characters:
test <- data.frame(x=1:4,y=c("a b c","b","a c","c"),stringsAsFactors=FALSE)
result <- strsplit(test$y," ")
result2 <- sapply(unique(unlist(result)),function(y) sapply(result,function(x) y %in% x))
result3 <- apply(result2,2,function(x) test$x[which(x)])
final <- data.frame(x=names(result3),y=sapply(result3,paste,collapse=" "))
> final
x y
a a 1 3
b b 1 2
c c 1 3 4
quick and dirty
original.df <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
original.df$y <- as.character(original.df$y)
y.split <- strsplit(original.df$y, " ")
y.unlisted <- unique(unlist(y.split))
new.df <-
sapply(y.unlisted, function(element)
paste(which(sapply(y.split, function(y.row) element %in% y.row)), collapse=" " ))
as.data.frame(new.df)
> new.df
a 1 3
b 1 2
c 1 3 4

Resources