I have data like this:
# Example input: one row per (ID, item) membership.
ID <- c(rep("ID1", 3), rep("ID2", 2), "ID3", rep("ID4", 2))
item <- c("a", "b", "c", "a", "c", "a", "b", "a")
# Store the frame under the name `df`, which the answers operate on,
# then print it.
df <- data.frame(ID, item)
df
ID1 a
ID1 b
ID1 c
ID2 a
ID2 c
ID3 a
ID4 b
ID4 a
and I would need it as a list of edges like this:
a;b
b;c
a;c
a;c
b;a
the first three edges coming from ID1, fourth from ID2, ID3 has no edges so nothing from that and fifth from ID4. Any ideas on how to accomplish this? melt/cast?
I'd guess there should be a simple igraph solution for this, but here's a simple solution using the data.table package
library(data.table)
# For every ID with more than one item, emit each item pair as "x;y".
setDT(df)[
  ,
  if (.N > 1) combn(as.character(item), 2, FUN = paste, collapse = ";"),
  by = ID
]
# ID V1
# 1: ID1 a;b
# 2: ID1 a;c
# 3: ID1 b;c
# 4: ID2 a;c
# 5: ID4 b;a
Try
# One two-column matrix of item pairs per ID; IDs with fewer than two items
# yield NULL and therefore add no rows when the pieces are stacked.
pairs_by_id <- lapply(split(df$item, df$ID), function(items) {
  if (length(items) >= 2) t(combn(items, 2))
})
res <- do.call(rbind, pairs_by_id)
paste(res[, 1], res[, 2], sep = ";")
#[1] "a;b" "a;c" "b;c" "a;c" "b;a"
Here is a more scalable solution that uses the same core logic as the other solutions:
library(plyr)
library(dplyr)
ID <- c(rep("ID1", 3), rep("ID2", 2), "ID3", rep("ID4", 2))
item <- c("a", "b", "c", "a", "c", "a", "b", "a")
dfPaths <- data.frame(ID, item)
# Count the items per ID and keep only IDs that can form at least one pair.
dfPaths2 <- dfPaths %>%
  group_by(ID) %>%
  mutate(numitems = n(), item = as.character(item)) %>%
  filter(numitems > 1)
# One row per item pair within each ID.
ddply(dfPaths2, .(ID), function(grp) t(combn(grp$item, 2)))
Related
I have some difficulties with my code, and I hope some of you could help.
The dataset looks something like this:
# Membership records: which id belongs to which group at which time.
df <- data.frame(
  group = c("A", "A", "A", "A_1", "A_1", "B", "B", "B_1"),
  id = c("id1", "id2", "id3", "id2", "id3", "id5", "id1", "id1"),
  time = c(1, 1, 1, 3, 3, 2, 2, 5),
  Val = rep(c(10, 12), times = c(5, 3))
)
"group" indicates the group the individual "id" is in. "A_1" indicates that a subject has left the group.
For instance, one subject "id1" leaves the "group A" that becomes group "A_1", where only "id2" and "id3" are members. Similarly "id5" leaves group B that becomes "B_1" with only id1 as a member.
What I would like to have in the final dataset is an opposite type of groups identification, that should look something like this:
# Desired result: suffixed groups list the id that left, not the ids that stayed.
final <- data.frame(
  group = c("A", "A", "A", "A_1", "B", "B", "B_1"),
  id = c("id1", "id2", "id3", "id1", "id5", "id1", "id5"),
  time = c(1, 1, 1, 3, 2, 2, 5),
  Val = rep(c(10, 12), times = c(4, 3)),
  groupid = rep(c("A", "B"), times = c(4, 3))
)
Whereby "A_1" and "B_1" only indicate the subjects, "id1" and "id5" respectively, that have left the original group, rather than identifying remaining subjects.
Does anyone have suggestions on how I could systematically do this?
I thank you in advance for your help.
Follow up:
My data is a little more complex than in the above example, as there are multiple "exits" from treatments; moreover, group identifiers can be of different character lengths (here, for instance, AAA and B). The data looks more like the following:
# Follow-up data: several successive exits and multi-character group names.
df2 <- data.frame(
  group = rep(c("AAA", "AAA_1", "AAA_2", "B", "B_1"), times = c(4, 3, 2, 2, 1)),
  id = c(
    "id1", "id2", "id3", "id4", "id2", "id3", "id4", "id2", "id3",
    "id5", "id1", "id1"
  ),
  time = c(1, 1, 1, 1, 3, 3, 3, 6, 6, 2, 2, 5),
  Val = rep(c(10, 12), times = c(9, 3))
)
Where at time 3 id1 leaves group AAA, which becomes group AAA_1, while at time 6 id4 also leaves group AAA, which becomes group AAA_2. As discussed previously, I would like groups with "_" to identify the ids that left the group rather than the ones remaining. Hence the final dataset should look something like this:
# Desired result for the follow-up data.
final2 <- data.frame(
  group = c(rep("A", 4), "A_1", "A_2", "B", "B", "B_1"),
  id = c("id1", "id2", "id3", "id4", "id1", "id4", "id5", "id1", "id5"),
  time = c(1, 1, 1, 1, 3, 6, 2, 2, 5),
  Val = rep(c(10, 12), times = c(6, 3))
)
thanks for helping me with this
Ok you can try with dplyr in this way: maybe it's not elegant, but you get the result. The idea behind is to first fetch the ones that are in group ... but not in the relative ..._1 and change their group, fetch the others, and rbind them together:
library(dplyr)
# first you could find the one that are missing in the ..._1 groups
# and change their group to ..._1
# Build the rows for the ids that left: ids seen only once under a given
# group letter are relabelled to "<group>_1" and receive the time/Val
# recorded for that "_1" group in df.
dups <-
df %>%
# groupid is the first character of the group label ("A" for "A" and "A_1")
group_by(id, groupid = substr(group,1,1)) %>%
# an id with a single row under a letter is absent from the "_1" group
filter(n() == 1)%>%
mutate(group = paste0(group,'_1')) %>%
# look up the (time, Val) stored for the relabelled group
left_join(df %>%
select(group, time, Val) %>%
distinct(), by ='group') %>%
# keep the joined values (.y suffix) rather than the original ones
select(group, id, time = time.y, Val = Val.y) %>%
ungroup()
dups
# A tibble: 2 x 5
groupid group id time Val
<chr> <chr> <fct> <dbl> <dbl>
1 A A_1 id1 3 10
2 B B_1 id5 5 12
# now you can select the ones that are in both groups:
# Rows of the original groups: add the group letter, then keep only the
# labels that are a single character long.
dups2 <- df %>%
  mutate(groupid = substr(group, 1, 1)) %>%
  filter(nchar(as.character(group)) == 1)
dups2
group id time Val groupid
1 A id1 1 10 A
2 A id2 1 10 A
3 A id3 1 10 A
4 B id5 2 12 B
5 B id1 2 12 B
Last, rbind() them, arrange() them and order() the columns:
rbind(dups, dups2) %>%
arrange(group) %>%
select(group, id, time, Val, groupid)
# A tibble: 7 x 5
group id time Val groupid
<chr> <fct> <dbl> <dbl> <chr>
1 A id1 1 10 A
2 A id2 1 10 A
3 A id3 1 10 A
4 A_1 id1 3 10 A
5 B id5 2 12 B
6 B id1 2 12 B
7 B_1 id5 5 12 B
Hope it helps!
EDIT:
You can generalize it with some work, here my attempt, hope it helps:
library(dplyr)
df3 <- df2
# you have to set a couple of fields you need:
# every label becomes "<first character>_<step>", where step is the last
# character of the original label when that is a digit, and 0 otherwise
grp_label <- as.character(df2$group)
grp_len <- nchar(grp_label)
last_char <- substr(grp_label, grp_len, grp_len)
first_char <- substr(grp_label, 1, 1)
df3$group <- ifelse(
  last_char %in% c(0:9),
  paste0(first_char, "_", last_char),
  paste0(first_char, "_0")
)
# util is the step plus one, read from the third character of the new label
df3$util <- as.numeric(substr(df3$group, 3, 3)) + 1
# two empty lists to populate with a nested loop:
changed <- list()
final_changed <- list()
Now we first find who changes, then the others; the idea is the same as in the previous part:
for (j in c("A", "B")) {
  # rows whose label starts with the current group letter
  df3_ <- df3[substr(df3$group, 1, 1) == j, ]
  # every util value except the last one is compared with its successor
  steps <- unique(df3_$util)
  for (i in head(steps, -1)) {
    current <- df3_[df3_$util == i, ]
    following <- df3_[df3_$util == i + 1, ]
    # ids present at this step but missing from the next one
    leavers <- current[!current$id %in% following$id, ]
    leavers$group <- paste0(j, "_", i)
    # take time and Val from the rows of the next step carrying that label
    changed[[i]] <- leavers %>%
      left_join(following, by = "group") %>%
      select(group, id = id.x, time = time.y, Val = Val.y)
  }
  final_changed[[j]] <- changed
}
# rbind the per-group lists position by position, stack the pieces into one
# frame, and drop repeated rows
stacked_steps <- do.call(Map, c(f = rbind, final_changed))
change <- do.call(rbind, stacked_steps) %>% distinct()
change
group id time Val
1 A_1 id1 3 10
2 B_1 id5 5 12
3 A_2 id4 6 10
Then the remains, and put together:
# rows of the original groups: strip the "_0" marker, keep the labels that
# are one character long, and drop the helper column
remain <- df3 %>%
  mutate(group = gsub("_0", "", group)) %>%
  filter(nchar(as.character(group)) == 1) %>%
  select(-util)
# combine with the leavers, add the group letter, and sort by label
rbind(change, remain) %>%
  mutate(groupid = substr(group, 1, 1)) %>%
  arrange(group) %>%
  select(group, id, time, Val, groupid)
group id time Val groupid
1 A id1 1 10 A
2 A id2 1 10 A
3 A id3 1 10 A
4 A id4 1 10 A
5 A_1 id1 3 10 A
6 A_2 id4 6 10 A
7 B id5 2 12 B
8 B id1 2 12 B
9 B_1 id5 5 12 B
I have a very large dataset which has 3 columns of interest: id, house, & people. Each id can have multiple houses and each house can have multiple people. I want to create an edge list using what #David Arenburg has shared here: Creating edge list with additional variables in R
However, the issue I have is the edges given are 'a;b' and 'b;a'. I would like to have them only once. As large set of a and b could produce thousands of a;b, b;a combinations.
I would like to have them only once as I would like to count how many times the people share a house.
Given the dataset
# Example data: each id owns one or more houses, each house holds people.
id <- rep(c("ID1", "ID2", "ID3", "ID4"), times = c(3, 6, 1, 5))
house <- rep(paste0("house", 1:8), times = c(2, 1, 2, 4, 1, 3, 1, 1))
people <- c(
  "a", "b", "c", "d", "e", "d", "e", "d", "e", "f", "g", "h", "h", "h", "h"
)
df1 <- data.frame(id, house, people)
The following code by #David Arenburg gives us the edge-list
# Replace df1 by its edge list: for each (id, house) with more than one
# person, every pair of people becomes one row with columns V1 and V2.
df1 <- setDT(df1)[
  ,
  if (.N > 1) {
    pairs <- combn(as.character(people), 2, FUN = paste, collapse = ";")
    tstrsplit(pairs, ";")
  },
  by = .(id, house)
]
The results
id house V1 V2
1: ID1 house1 a b
2: ID2 house3 d e
3: ID2 house4 d e
4: ID2 house4 d d
5: ID2 house4 d e
6: ID2 house4 e d
7: ID2 house4 e e
8: ID2 house4 d e
9: ID4 house6 g h
10: ID4 house6 g h
11: ID4 house6 h h
As you can see, within the same house V1 & V2 contain both 'd;e' and 'e;d', which I would like to avoid. For a large amount of data those combinations could number in the 1000s.
Thanks for your help
I'm sure there's a more concise base R way, but here's one dplyr approach, where we sort the two values to make it easier to eliminate repeats.
library(dplyr)
# df1 is the edge list (columns id, house, V1, V2) built in the question.
# Put the two endpoints of every edge in alphabetical order, so that "e;d"
# and "d;e" become the same pair, then keep one row per (id, house, pair).
df1 %>%
  mutate(V1s = pmin(V1, V2),
         V2s = pmax(V1, V2)) %>%
  distinct(id, house, V1s, V2s)
There's a possibility following from the excellent answer that #David Arenburg provided.
The overall strategy:
Create a new variable with the ordered edge (that is, convert "e -> d" to "d -> e")
Get the unique values of each combination of id, house and the new variable.
Drop the variable
.
library(data.table)
# keep Aremburg's solution and chain a couple of additional commands:
# Step 1: all pairs of people within each (id, house), as columns V1 and V2.
# Step 2: `edge` is an order-independent key for the pair, computed on whole
#         columns with pmin()/pmax() instead of a per-row apply().
# Step 3: keep the first row of every (id, house, edge) group.
# Step 4: drop the helper column and print the result.
setDT(df1)[
  ,
  if (.N > 1) tstrsplit(combn(as.character(people), 2, paste, collapse = ";"), ";"),
  by = .(id, house)
][
  , edge := paste(pmin(V1, V2), pmax(V1, V2), sep = ",")
][
  , .SD[1L], by = .(id, house, edge)
][
  , edge := NULL
][]
id house V1 V2
1: ID1 house1 a b
2: ID2 house3 d e
3: ID2 house4 d e
4: ID2 house4 d d
5: ID2 house4 e e
6: ID4 house6 g h
7: ID4 house6 h h
Notice that you could drop the rows in which V1 == V2 too, as those are irrelevant edges. That could be accomplished with [V1 != V2, ] at the end of the previous chain.
Applying dplyr::distinct, in order to keep only selected columns instead of all (.keep_all = TRUE), I am currently selecting post hoc using select:
library(dplyr)
foo_df <- data.frame(
  id1 = c(1, 1, 3),
  id2 = c(1, 1, 4),
  val1 = letters[1:3],
  val2 = letters[3:5]
)
# first row of every (id1, id2) combination, then only the wanted columns
foo_df %>%
  distinct(id1, id2, .keep_all = TRUE) %>%
  select(id1, id2, val1)
# I want to keep "val1" and the identifiers for unique combinations
#> id1 id2 val1
#> 1 1 1 a
#> 2 3 4 c
#> packageVersion('dplyr')
#> [1] ‘0.7.7’
Created on 2018-12-19 by the reprex package (v0.2.1)
But is there a more succinct way? Happy to be pointed to another function too.
Shame on me if this is a dupe.
Maybe the data.table syntax is more to your liking. It is more succinct than dplyr.
library(data.table)
DT <- data.table(foo_df)
# ?data.table::unique
unique(DT[, .(id1, id2, val1)], by = c("id1", "id2"))
id1 id2 val1
1: 1 1 a
2: 3 4 c
My two dataframes:
df1
Col1
A
B
C
df2
Col1
A
D
E
F
I would like to add a 2nd column, Col2, to df1 where each value in the column is 1 if its respective value in Col1 is also in Col1 of df2. Thus df1 would look like this:
df1
Col1 Col2
A 1
B 0
C 0
Thanks!
Add the col2 to df2
df2$Col2 <- 1
Perform a left-join merge:
# all.x = TRUE keeps every row of df1; T is avoided because it is an
# ordinary variable that can be reassigned
df3 <- merge(df1, df2, all.x = TRUE, by = "Col1")
Replace the NAs with zeros
# rows of df1 without a match in df2 come back as NA; set those to 0
df3$Col2[is.na(df3$Col2)] <- 0
df3 is now
Col1 Col2
1 A 1
2 B 0
3 C 0
Edit: #ycw has done it more concisely using as.numeric and %in%. I like his answer, but I thought I'd edit mine to include a version of his work that doesn't use dplyr:
It's as simple as df1$Col2 <- as.numeric(df1$Col1 %in% df2$Col1). Much better than mine!
df3 is the final output.
library(dplyr)
# tibble() replaces the deprecated data_frame() constructor
df1 <- tibble(Col1 = c("A", "B", "C"))
df2 <- tibble(Col1 = c("A", "D", "E", "F"))
# Col2 is 1 where the value of Col1 also occurs in df2$Col1, otherwise 0
df3 <- df1 %>% mutate(Col2 = as.numeric(Col1 %in% df2$Col1))
Or the following approach is similar to HarlandMason's method but using dplyr and tidyr.
library(dplyr)
library(tidyr)
df3 <- df2 %>%
mutate(Col2 = 1) %>%
right_join(df1, by = "Col1") %>%
replace_na(list(Col2 = 0))
Two options using data.table
First one uses %chin% operator :
library(data.table)
x <- data.table(v = c("A", "B", "C"))
y <- data.table(v = c("A", "D", "E", "F"))
# add a logical column telling whether each v of x also occurs in y
x[, found := v %chin% y$v]
x
#> v found
#> 1: A TRUE
#> 2: B FALSE
#> 3: C FALSE
The second one is built on merging behaviour:
library(data.table)
x = data.table(v = LETTERS[1:3])
y = data.table(v = c("A","D","E","F"))
# mark every row of y, so that a join can tell matched from unmatched rows
y[, found := TRUE]
# join y onto the rows of x by v: rows of x without a partner in y get
# found = NA, which the ifelse() turns into FALSE
x[, found:= y[.SD, .(ifelse(is.na(found), FALSE, TRUE)), on = .(v)]]
x
#> v found
#> 1: A TRUE
#> 2: B FALSE
#> 3: C FALSE
EDIT: Based on #frank comment, you could simplify with no ifelse - it is the same
x[, found:= y[.SD, !is.na(found), on = .(v)]]
x
#> v found
#> 1: A TRUE
#> 2: B FALSE
#> 3: C FALSE
For understanding what happens, here is the inside code I built on:
x[, found := NULL]
y[x, on = .(v)]
#> v found
#> 1: A TRUE
#> 2: B NA
#> 3: C NA
I have two dataframes:
df_1 <- data.frame(c("a_b", "a_c", "a_d"))
df_2 <- data.frame(matrix(ncol = 2))
And I would like to loop over df_1 in order to fill df_2:
# Attempt from the question (kept as posted): write each split value into
# two rows of df_2.
for (i in (1:(length(df_1[,1])))){
for (j in (1:2)) {
# NOTE: i*j is not a unique row index. For three input rows it takes the
# values 1, 2, 2, 4, 3, 6, so row 2 is written twice and row 5 never.
df_2[i*j,] <-str_split_fixed(df_1[i,1], "_", 2)
}
}
I would like df_2 to look like:
col1 col2
a b
a b
a c
a c
a d
a d
But instead I get:
col1 col2
a b
a c
a d
a c
NA NA
a d
I must be doing something wrong, but cannot figure it out.
I would also like to use apply (or something like it), but am pretty new to R and not firm with the apply family.
Thanks for your help!
Another way would be
df_1 <- data.frame(col1 = c("a_b", "a_c", "a_d"))
# split every value at the literal "_" and stack the pieces into two columns
parts <- strsplit(as.character(df_1$col1), split = "_", fixed = TRUE)
df_2 <- as.data.frame(do.call(rbind, parts))
# repeat each row twice; seq_len() stays correct for a zero-row frame,
# where 1:nrow() would yield c(1, 0)
df_2[rep(seq_len(nrow(df_2)), each = 2), ]
V1 V2
1 a b
1.1 a b
2 a c
2.1 a c
3 a d
3.1 a d
We can use cSplit with data.table approach
library(splitstackshape)
cSplit(df_1, 'col1', '_')[rep(seq_len(.N), each =2)]
# col1_1 col1_2
#1: a b
#2: a b
#3: a c
#4: a c
#5: a d
#6: a d
Or another option is tidyverse
library(tidyverse)
separate(df_1, col1, into=c("col_1", "col_2")) %>%
map_df(~rep(., each = 2))
# A tibble: 6 × 2
# col_1 col_2
# <chr> <chr>
#1 a b
#2 a b
#3 a c
#4 a c
#5 a d
#6 a d
NOTE: Both the answers are one-liners.
data
df_1 <- data.frame(col1 = c("a_b", "a_c", "a_d"))
This would be a combination of two answers. With cSplit we split the column by _ and then repeat each row twice. Assuming your column name as V1.
library(splitstackshape)
df_2 <- cSplit(df_1, "V1", "_")
df_2[rep(seq_len(nrow(df_2)),each = 2), ]
# V1_1 V1_2
#1: a b
#2: a b
#3: a c
#4: a c
#5: a d
#6: a d
Or as #Sotos mentioned in the comments we can use expandRows to accommodate everything into one line.
expandRows(cSplit(df_1, "V1", "_"), 2, count.is.col = FALSE)
# V1_1 V1_2
#1: a b
#2: a b
#3: a c
#4: a c
#5: a d
#6: a d
data
df_1 <- data.frame(V1 = c("a_b", "a_c", "a_d"))
OK, I started learning R this week, but if you want presented result you can use your code with this fix:
# seq_len() avoids iterating over c(1, 0) when df_1 has no rows
for (i in seq_len(nrow(df_1))) {
  # the split depends only on i, so compute it once per input row
  pieces <- str_split_fixed(df_1[i, 1], "_", 2)
  for (j in 1:2) {
    # rows 2i-1 and 2i of df_2 hold the two copies of input row i
    df_2[(i - 1) * 2 + j, ] <- pieces
  }
}
I changed index of df_2.
I guess that there is better way than two for loops, but that all I can do for the moment.
I was trying to post a solution I found right after posting but it was misunderstood and was deleted:
"sometimes posting a question helps:
I was asking for the right position in df_1, but I was saving the result in the wrong cell.
the answer to my original question should be something like this:
n <- 1
for (i in (1:(length(df_1[,1])))){
for (j in (1:2)) {
df_2[n,] <-str_split_fixed(df_1[i,1], "_", 2)
n <- n+1
}
}"