I have a data.table:
groups <- data.table(group = c("A", "B", "C", "D", "E", "F", "G"),
code_1 = c(2,2,2,7,8,NA,5),
code_2 = c(NA,3,NA,3,NA,NA,2),
code_3 = c(4,1,1,4,4,1,8))
group code_1 code_2 code_3
A 2 NA 4
B 2 3 1
C 2 NA 1
D 7 3 4
E 8 NA 4
F NA NA 1
G 5 2 8
What I would like to achieve, is for each group to find the immediate neighbors based on the available codes. For example: Group A has immediate neighbors groups B, C due to code_1 (code_1 is equal to 2 in all groups) and has immediate neighbor groups D,E due to code_3 (code_3 is equal to 4 in all those groups).
What I tried is for each code, subsetting the first column (group) based on the matches as follows:
groups$code_1_match = list()
for (row in 1:nrow(groups)){
set(groups, i=row, j="code_1_match", list(groups$group[groups$code_1[row] == groups$code_1]))
}
group code_1 code_2 code_3 code_1_match
A 2 NA 4 A,B,C,NA
B 2 3 1 A,B,C,NA
C 2 NA 1 A,B,C,NA
D 7 3 4 D,NA
E 8 NA 4 E,NA
F NA NA 1 NA,NA,NA,NA,NA,NA,...
G 5 2 8 NA,G
This "kinda" works but I would assume there is a more data table kind of way of doing this. I tried
groups[, code_1_match_2 := list(group[code_1 == groups$code_1])]
But this doesn't work.
Am I missing some obvious data table trick to deal with it?
My ideal case result would look like this (which currently would require using my method for all 3 columns and then concatenating the results):
group code_1 code_2 code_3 Immediate neighbors
A 2 NA 4 B,C,D,E
B 2 3 1 A,C,D,F
C 2 NA 1 A,B,F
D 7 3 4 B,A
E 8 NA 4 A,D
F NA NA 1 B,C
G 5 2 8
Using igraph, get 2nd degree neighbours, drop numeric nodes, paste remaining nodes.
library(data.table)
library(igraph)
# reshape wide-to-long
x <- melt(groups, id.vars = "group")[!is.na(value)]
# convert to graph
g <- graph_from_data_frame(x[, .(from = group, to = paste0(variable, "_", value))])
# get 2nd degree neighbours
x1 <- ego(g, 2, nodes = groups$group)
# prettify the result
groups$res <- sapply(seq_along(x1), function(i) toString(intersect(names(x1[[ i ]]),
groups$group[ -i ])))
# group code_1 code_2 code_3 res
# 1: A 2 NA 4 B, C, D, E
# 2: B 2 3 1 A, C, D, F
# 3: C 2 NA 1 A, B, F
# 4: D 7 3 4 B, A, E
# 5: E 8 NA 4 A, D
# 6: F NA NA 1 B, C
# 7: G 5 2 8
More info
This is how our data looks like before converting to igraph object. We want to ensure code1 with value 2 is different from code2 with value 2, etc.
x[, .(from = group, to = paste0(variable, "_", value))]
# from to
# 1: A code_1_2
# 2: B code_1_2
# 3: C code_1_2
# 4: D code_1_7
# 5: E code_1_8
# 6: G code_1_5
# 7: B code_2_3
# 8: D code_2_3
# 9: G code_2_2
# 10: A code_3_4
# 11: B code_3_1
# 12: C code_3_1
# 13: D code_3_4
# 14: E code_3_4
# 15: F code_3_1
# 16: G code_3_8
Here is how our network looks like:
Note that A..G nodes are always connected through code_x_y.
So we need to get the 2nd degree, ego(..., order = 2) gives us neighbours up to including 2nd degree neighbours, and returns a list object.
To get the names:
lapply(x1, names)
# [[1]]
# [1] "A" "code_1_2" "code_3_4" "B" "C" "D" "E"
#
# [[2]]
# [1] "B" "code_1_2" "code_2_3" "code_3_1" "A" "C" "D" "F"
#
# [[3]]
# [1] "C" "code_1_2" "code_3_1" "A" "B" "F"
#
# [[4]]
# [1] "D" "code_1_7" "code_2_3" "code_3_4" "B" "A" "E"
#
# [[5]]
# [1] "E" "code_1_8" "code_3_4" "A" "D"
#
# [[6]]
# [1] "F" "code_3_1" "B" "C"
#
# [[7]]
# [1] "G" "code_1_5" "code_2_2" "code_3_8"
To prettify the result, we need to remove code_x_y nodes and the origin node (1st node)
sapply(seq_along(x1), function(i) toString(intersect(names(x1[[ i ]]), groups$group[ -i ])))
#[1] "B, C, D, E" "A, C, D, F" "A, B, F" "B, A, E" "A, D" "B, C" ""
There is probably some more practical way of achieving this but you could do something like this, using melts and joins:
mgrp <- melt(groups, id.vars = "group")[!is.na(value)]
setkey(mgrp, variable, value)
for (i in seq_along(groups$group)) {
let = groups$group[i]
set(
groups,
i = i,
j = "inei",
value = list(mgrp[mgrp[group == let], setdiff(unique(group), let)])
)
}
groups
# group code_1 code_2 code_3 inei
# 1: A 2 NA 4 B,C,D,E
# 2: B 2 3 1 A,C,D,F
# 3: C 2 NA 1 A,B,F
# 4: D 7 3 4 B,A,E
# 5: E 8 NA 4 A,D
# 6: F NA NA 1 B,C
# 7: G 5 2 8
As mentioned by zx8754, using data.table::melt with combn and then igraph::as_adjacency_matrix
library(data.table)
df <- melt(groups, id.vars="group", na.rm=TRUE)[,
if (.N > 1L) transpose(combn(group, 2L, simplify=FALSE)), value][, (1) := NULL]
library(igraph)
as_adjacency_matrix(graph_from_data_frame(df, FALSE))
output:
7 x 7 sparse Matrix of class "dgCMatrix"
A B C E D G F
A . 1 1 1 1 1 .
B 1 . 2 . 1 1 1
C 1 2 . . . 1 1
E 1 . . . 1 1 .
D 1 1 . 1 . . .
G 1 1 1 1 . . .
F . 1 1 . . . .
or without using igraph
x <- df[, unique(c(V1, V2))]
df <- rbindlist(list(df, data.table(x, x)))
tab <- table(df) #or xtabs(~ V1 + V2, data=df)
ans <- t(tab) + tab
diag(ans) <- 0L
ans
output:
V1
V2 A B C D E F G
A 0 1 1 1 1 0 1
B 1 0 2 1 0 1 1
C 1 2 0 0 0 1 1
D 1 1 0 0 1 0 0
E 1 0 0 1 0 0 1
F 0 1 1 0 0 0 0
G 1 1 1 0 1 0 0
This is inspired by #sindri_baldur's melt. This solution:
Melts the groups
Performs a cartesian self-join.
Pastes together all the groups that matches.
Joins back to the original DT
library(data.table)
#> Warning: package 'data.table' was built under R version 3.6.2
groups <- data.table(group = c("A", "B", "C", "D", "E", "F", "G"), code_1 = c(2,2,2,7,8,NA,5), code_2 = c(NA,3,NA,3,NA,NA,2), code_3=c(4,1,1,4,4,1,8))
molten_grps = melt(groups, measure.vars = patterns("code"), na.rm = TRUE)
inei_dt = molten_grps[molten_grps,
on = .(variable, value),
allow.cartesian = TRUE
][,
.(inei = paste0(setdiff(i.group, .BY[[1L]]), collapse = ", ")),
by = group]
groups[inei_dt, on = .(group), inei := inei]
groups
#> group code_1 code_2 code_3 inei
#> <char> <num> <num> <num> <char>
#> 1: A 2 NA 4 B, C, D, E
#> 2: B 2 3 1 A, C, D, F
#> 3: C 2 NA 1 A, B, F
#> 4: D 7 3 4 B, A, E
#> 5: E 8 NA 4 A, D
#> 6: F NA NA 1 B, C
#> 7: G 5 2 8
Related
R coders! I have a data frame, plan, with two columns. One column has group labels, lab, and the other, tr has only two distinct values in it.
lab <- rep(letters[1:2], each = 4)
tr <- c(1, 2, 2, 1, 1, 2, 1, 2)
plan <- data.frame(lab = lab, tr = tr)
> plan
lab tr
1 a 1
2 a 2
3 a 2
4 a 1
5 b 1
6 b 2
7 b 1
8 b 2
I have another vector, order_new, which is a reordered version of lab.
order_new <- lab[sample(1:8)]
> order_new
[1] "b" "b" "a" "a" "b" "a" "b" "a"
I want to reorder the data frame above so the tr values are sorted in the order given by order_new but with the order within the original lab groups preserved. The result I want is:
plan_new <- data.frame(order_new = order_new, tr = c(1, 2, 1, 2, 1, 2, 2, 1))
> plan_new
order_new tr
1 b 1
2 b 2
3 a 1
4 a 2
5 b 1
6 a 2
7 b 2
8 a 1
The first row in the new data frame is a "b" value and so takes the first "b" value in the original data frame. Row 2, also a "b", takes the second "b" value in the original. The third row, an "a", takes the first "a" value in the original etc.
I can't find anything close enough in past answers to work this out and am really looking forward to someone helping me out with this!
If you don't mind a loop
order_new=c("b", "b", "a", "a", "b", "a", "b", "a")
tmp=split(plan$tr,plan$lab)
res=list()
for (x in 1:length(order_new)) {
res[[x]]=tmp[[order_new[x]]][1]
tmp[[order_new[x]]]=tail(tmp[[order_new[x]]],-1)
}
data.frame(
"lab"=order_new,
"tr"=unlist(res)
)
lab tr
1 b 1
2 b 2
3 a 1
4 a 2
5 b 1
6 a 2
7 b 2
8 a 1
Here is a data.table approach of things.. can easily be tinkerd into a dplyr or baseR solution, followint the same logic..
I included all intermediate results to show you the results of each line..
lab <- rep(letters[1:2], each = 4)
tr <- c(1, 2, 2, 1, 1, 2, 1, 2)
plan <- data.frame(lab = lab, tr = tr)
#hard coded, since sample is not reproducible without set.seed()
order_new <- c("b", "b", "a", "a", "b", "a", "b", "a")
library( data.table )
#make plan a data.table
setDT(plan)
#set row_id's by grope (lab)
plan[, row_id := rowid( lab ) ]
# lab tr row_id
# 1: a 1 1
# 2: a 2 2
# 3: a 2 3
# 4: a 1 4
# 5: b 1 1
# 6: b 2 2
# 7: b 1 3
# 8: b 2 4
#make a new data.table for the new ordering
plan_new <- data.table( order_new = order_new )
#also add rownumbers by group
plan_new[, row_id := rowid( order_new ) ][]
# order_new row_id
# 1: b 1
# 2: b 2
# 3: a 1
# 4: a 2
# 5: b 3
# 6: a 3
# 7: b 4
# 8: a 4
#now join the tr-value from data.table 'plan' to 'plkan2', based on the rowid
plan_new[ plan, tr := i.tr, on = .(order_new = lab, row_id) ]
# order_new row_id tr
# 1: b 1 1
# 2: b 2 2
# 3: a 1 1
# 4: a 2 2
# 5: b 3 1
# 6: a 3 2
# 7: b 4 2
# 8: a 4 1
#drop the row_id column if needed
plan_new[, row_id := NULL ][]
# order_new tr
# 1: b 1
# 2: b 2
# 3: a 1
# 4: a 2
# 5: b 1
# 6: a 2
# 7: b 2
# 8: a 1
I have the following dataset
path value
1 b,b,a,c 3
2 c,b 2
3 a 10
4 b,c,a,b 0
5 e,f 0
6 a,f 1
df
df <- data.frame (path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"), value = c(3,2,10,0,0,1))
and I wish to remove duplicated in column path. when I use this code the format of data changes:
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) unique(x))
and it gives me data like a dataframe
path value
1 c("b", "a", "c") 3
2 c( "c", "b ") 2
...
However, I wish to have data like that:
path value
1 b, a, c 3
2 c, b 2
3 a 10
4 b, c, a 0
5 e, f 0
6 a, f 1
replace unique(x) with paste(unique(x), collapse = ', '), or toString(unique(x)) as Frank suggested.
df <- data.frame (
path= c("b,b,a,c", "c,b", "a", "b,c,a,b" ,"e,f" ,"a,f"),
value = c(3,2,10,0,0,1))
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) paste(unique(x), collapse = ', '))
# or
df$path <- sapply(strsplit(as.character(df$path), split=","),
function(x) toString(unique(x)))
df
# path value
# 1 b, a, c 3
# 2 c, b 2
# 3 a 10
# 4 b, c, a 0
# 5 e, f 0
# 6 a, f 1
I have data set
ID <- c(1,1,2,2,2,2,3,3,3,3,3,4,4,4)
Eval <- c("A","A","B","B","A","A","A","A","B","B","A","A","A","B")
med <- c("c","d","k","k","h","h","c","d","h","h","h","c","h","k")
df <- data.frame(ID,Eval,med)
> df
ID Eval med
1 1 A c
2 1 A d
3 2 B k
4 2 B k
5 2 A h
6 2 A h
7 3 A c
8 3 A d
9 3 B h
10 3 B h
11 3 A h
12 4 A c
13 4 A h
14 4 B k
I try to create variable x and y, group by ID and Eval. For each ID, if Eval = A, and med = "h" or "k", I set x = 1, other wise x = 0, if Eval = B and med = "h" or "k", I set y = 1, other wise y = 0. I use the way I don't like it, I got answer but it seem like not that great
df <- data.table(df)
setDT(df)[, count := uniqueN(med) , by = .(ID,Eval)]
setDT(df)[Eval == "A", x:= ifelse(count == 1 & med %in% c("k","h"),1,0), by=ID]
setDT(df)[Eval == "B", y:= ifelse(count == 1 & med %in% c("k","h"),1,0), by=ID]
ID Eval med count x y
1: 1 A c 2 0 NA
2: 1 A d 2 0 NA
3: 2 B k 1 NA 1
4: 2 B k 1 NA 1
5: 2 A h 1 1 NA
6: 2 A h 1 1 NA
7: 3 A c 3 0 NA
8: 3 A d 3 0 NA
9: 3 B h 1 NA 1
10: 3 B h 1 NA 1
11: 3 A h 3 0 NA
12: 4 A c 2 0 NA
13: 4 A h 2 0 NA
14: 4 B k 1 NA 1
Then I need to collapse the row to get unique ID, I don't know how to collapse rows, any idea?
The output
ID x y
1 0 0
2 1 1
3 0 1
4 0 1
We create the 'x' and 'y' variables grouped by 'ID' without the NA elements directly coercing the logical vector to binary (as.integer)
df[, x := as.integer(Eval == "A" & count ==1 & med %in% c("h", "k")) , by = ID]
and similarly for 'y'
df[, y := as.integer(Eval == "B" & count ==1 & med %in% c("h", "k")) , by = ID]
and summarise it, using any after grouping by "ID"
df[, lapply(.SD, function(x) as.integer(any(x))) , ID, .SDcols = x:y]
# ID x y
#1: 1 0 0
#2: 2 1 1
#3: 3 0 1
#4: 4 0 1
If we need a compact approach, instead of assinging (:=), we summarise the output grouped by "ID", "Eval" based on the conditions and then grouped by 'ID', we check if there is any TRUE values in 'x' and 'y' by looping over the columns described in the .SDcols.
setDT(df)[, if(any(uniqueN(med)==1 & med %in% c("h", "k"))) {
.(x= Eval=="A", y= Eval == "B") } else .(x=FALSE, y=FALSE),
by = .(ID, Eval)][, lapply(.SD, any) , by = ID, .SDcols = x:y]
# ID x y
#1: 1 FALSE FALSE
#2: 2 TRUE TRUE
#3: 3 FALSE TRUE
#4: 4 FALSE TRUE
If needed, we can convert to binary similar to the approach showed in the first solution.
The OP's goal...
"I try to create variable x and y, group by ID and Eval. For each ID, if Eval = A, and med = "h" or "k", I set x = 1, other wise x = 0, if Eval = B and med = "h" or "k", I set y = 1, other wise y = 0. [...] Then I need to collapse the row to get unique ID"
can be simplified to...
For each ID and Eval, flag if all med values are h or all med values are k.
setDT(df) # only do this once
df[, all(med=="k") | all(med=="h"), by=.(ID,Eval)][, dcast(.SD, ID ~ Eval, fun=any)]
ID A B
1: 1 FALSE FALSE
2: 2 TRUE TRUE
3: 3 FALSE TRUE
4: 4 FALSE TRUE
To see what dcast is doing, read ?dcast and try running just the first part on its own, df[, all(med=="k") | all(med=="h"), by=.(ID,Eval)].
The change to use x and y instead of A and B is straightforward but ill-advised (since unnecessary renaming can be confusing and lead to extra work when there are new Eval values); and ditto the change for 1/0 instead of TRUE/FALSE (since the values captured are actually boolean).
Here is my dplyr solution since I find it more readable than data.table.
library(dplyr)
df %>%
group_by(ID, Eval) %>%
mutate(
count = length(unique(med)),
x = ifelse(Eval == "A" &
count == 1 & med %in% c("h", "k"), 1, 0),
y = ifelse(Eval == "B" &
count == 1 & med %in% c("h", "k"), 1, 0)
) %>%
group_by(ID) %>%
summarise(x1 = max(unique(x)),
y1 = max(unique(y)))
A one liner solution for collapsing the rows of your result :
df[,lapply(.SD,function(i) {ifelse(1 %in% i,ifelse(!0 %in% i,1,0),0)}),.SDcols=x:y,by=ID]
ID x y
1: 1 0 0
2: 2 1 1
3: 3 0 1
4: 4 0 1
in poLCA package in R I use poLCA.table for calculates predicted cell frequencies based on an estimated latent class model.
I need use poLCA.table for all combinations of variables in a latent class model
data(carcinoma)
f <- cbind(A, B, C, D, E, F, G) ~ 1
lca2 <- poLCA(f, carcinoma, nclass = 2)
poLCA.table(formula = A ~ B, condition = list(), lc = lca2)
For example, I need all combinations to A from G.
How can I make it automatically?
This loops creates the 2-way combinations of the letters cast into separate formula objects and then passes each of them to the function you specified:
lapply( apply(combn(LETTERS[1:5],2), 2,
function(col) as.formula( paste(col[1], "~", col[2]) ) ),
function(x) poLCA.table(formula =x, condition = list(), lc = lca2)
)
#------------------
[[1]]
B 1 B 2
A 1 33.572932 18.42707
A 2 5.427068 60.57293
[[2]]
C 1 C 2
A 1 52 2.551164e-10
A 2 21 4.500000e+01
[[3]]
D 1 D 2
A 1 52 1.814161e-10
A 2 34 3.200000e+01
[[4]]
E 1 E 2
A 1 40.408128 11.59187
A 2 6.591872 59.40813
[[5]]
C 1 C 2
B 1 38.23913 0.7608669
B 2 34.76087 44.2391331
[[6]]
D 1 D 2
B 1 38.45894 0.5410609
B 2 47.54106 31.4589391
[[7]]
E 1 E 2
B 1 29.55038 9.44962
B 2 17.44962 61.55038
[[8]]
D 1 D 2
C 1 65.34774 7.652258
C 2 20.65226 24.347742
[[9]]
E 1 E 2
C 1 46.0386552 26.96134
C 2 0.9613448 44.03866
[[10]]
E 1 E 2
D 1 46.316377 39.68362
D 2 0.683623 31.31638
I have a chunk of sentences and I want to build the undirected edge list of word co-occurrence and see the frequency of every edge. I took a look at the tm package but didn't find similar functions. Is there some package/script I can use? Thanks a lot!
Note: A word doesn't co-occur with itself. A word which appears twice or more co-occurs with other words for only once in the same sentence.
DF:
sentence_id text
1 a b c d e
2 a b b e
3 b c d
4 a e
5 a
6 a a a
OUTPUT
word1 word2 freq
a b 2
a c 1
a d 1
a e 3
b c 2
b d 2
b e 2
c d 2
c e 1
d e 1
It's convoluted so there's got to be a better approach:
dat <- read.csv(text="sentence_id, text
1, a b c d e
2, a b b e
3, b c d
4, a e", header=TRUE)
library(qdapTools); library(tidyr)
x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)
out <- x %*% t(x)
out[upper.tri(out, diag=TRUE)] <- NA
out2 <- matrix2df(out, "word1") %>%
gather(word2, freq, -word1) %>%
na.omit()
rownames(out2) <- NULL
out2
## word1 word2 freq
## 1 b a 2
## 2 c a 1
## 3 d a 1
## 4 e a 3
## 5 c b 2
## 6 d b 2
## 7 e b 2
## 8 d c 2
## 9 e c 1
## 10 e d 1
Base only solution
out <- lapply(with(dat, split(text, sentence_id)), function(x) {
strsplit(gsub("^\\s+|\\s+$", "", as.character(x)), "\\s+")[[1]]
})
nms <- sort(unique(unlist(out)))
out2 <- lapply(out, function(x) {
as.data.frame(table(x), stringsAsFactors = FALSE)
})
dat2 <- data.frame(x = nms)
for(i in seq_along(out2)) {
m <- merge(dat2, out2[[i]], all.x = TRUE)
names(m)[i + 1] <- dat[["sentence_id"]][i]
dat2 <- m
}
dat2[is.na(dat2)] <- 0
x <- as.matrix(dat2[, -1]) > 0
out3 <- x %*% t(x)
out3[upper.tri(out3, diag=TRUE)] <- NA
dimnames(out3) <- list(dat2[[1]], dat2[[1]])
out4 <- na.omit(data.frame(
word1 = rep(rownames(out3), ncol(out3)),
word2 = rep(colnames(out3), each = nrow(out3)),
freq = c(unlist(out3)),
stringsAsFactors = FALSE)
)
row.names(out4) <- NULL
out4
This is very closely related to #TylerRinker's answer, but using different tools.
library(splitstackshape)
library(reshape2)
temp <- crossprod(
as.matrix(
cSplit_e(d, "text", " ", type = "character",
fill = 0, drop = TRUE)[-1]))
temp[upper.tri(temp, diag = TRUE)] <- NA
melt(temp, na.rm = TRUE)
# Var1 Var2 value
# 2 text_b text_a 2
# 3 text_c text_a 1
# 4 text_d text_a 1
# 5 text_e text_a 3
# 8 text_c text_b 2
# 9 text_d text_b 2
# 10 text_e text_b 2
# 14 text_d text_c 2
# 15 text_e text_c 1
# 20 text_e text_d 1
The "text_" parts of "Var1" and "Var2" can be stripped easily with sub or gsub.
Here's a base R way:
d <- read.table(text='sentence_id text
1 "a b c d e"
2 "a b b e"
3 "b c d"
4 "a e"', header=TRUE, as.is=TRUE)
result.vec <- table(unlist(lapply(d$text, function(text) {
pairs <- combn(unique(scan(text=text, what='', sep=' ')), m=2)
interaction(pairs[1,], pairs[2,])
})))
# a.b b.b c.b d.b a.c b.c c.c d.c a.d b.d c.d d.d a.e b.e c.e d.e
# 2 0 0 0 1 2 0 0 1 2 2 0 3 2 1 1
result <- subset(data.frame(do.call(rbind, strsplit(names(result.vec), '\\.')), freq=as.vector(result.vec)), freq > 0)
with(result, result[order(X1, X2),])
# X1 X2 freq
# 1 a b 2
# 5 a c 1
# 9 a d 1
# 13 a e 3
# 6 b c 2
# 10 b d 2
# 14 b e 2
# 11 c d 2
# 15 c e 1
# 16 d e 1