Get edge list that includes alter's alters - r

I need a data frame that includes three columns: i, j (alter), and k (j's alter). I have an adjacency matrix (sample below). From there I can get a graph object and extract the edge list. How can I manipulate the data to get an output like the WANT data frame below?
HAVE (matrix & edgelist):
1 2 3 4 5
1 0 0 0 1 0
2 0 0 1 1 1
3 0 0 0 0 0
4 1 1 0 0 1
5 1 1 0 1 0
g <- graph_from_adjacency_matrix(mat)
get.edgelist(g)
i j
1 4
2 3
2 4
2 5
4 1
4 2
4 5
5 1
5 2
5 4
WANT (ijk edge list):
i j k
1 4 2
1 4 5
2 4 1
2 4 5
4 2 3
4 5 1
4 5 2
5 1 4
5 2 3
5 2 4
5 4 1
5 4 2
the ijk edge list should show all possible triples with ij, excluding self loops (ex: 1 4 1)

Data:
as.matrix(read.table(text = "0 0 0 1 0
0 0 1 1 1
0 0 0 0 0
1 1 0 0 1
1 1 0 1 0",
header = F, stringsAsFactors = F)) -> m1
dimnames(m1) <- list(1:5, 1:5)
Libraries:
library(igraph)
library(dplyr)
library(tidyr)
library(magrittr)
Solution:
g1 <- graph_from_adjacency_matrix(m1)
e1 <- get.edgelist(g1) %>% as.data.frame %>% mutate_if(is.factor, as.character)
e1 %>%
group_by(V1) %>%
nest(V2) %>%
right_join(e1,.,by = c("V2"="V1")) %>%
unnest %>%
filter(V1 != V21) %>%
set_colnames(c("i", "j", "k"))
Output:
#> i j k
#> 1 1 4 2
#> 2 1 4 5
#> 3 2 4 1
#> 4 2 4 5
#> 5 2 5 1
#> 6 2 5 4
#> 7 4 2 3
#> 8 4 2 5
#> 9 4 5 1
#> 10 4 5 2
#> 11 5 1 4
#> 12 5 2 3
#> 13 5 2 4
#> 14 5 4 1
#> 15 5 4 2

I was actually able to get a way to do it using igraph and dplyr:
# make graph of matrix
g <- graph_from_adjacency_matrix(mat)
# put edgelist into two objects, one where columns are "i, j" and the other "j, k"
df1 <- get.edgelist(g) %>%
as.data.frame() %>%
select(i = V1, j = V2)
df2 <- get.edgelist(g) %>%
as.data.frame() %>%
select(j = V1, k = V2)
# combine the dataframes, filter out rows where i and k are the same observation
df_combn <- inner_join(df1, df2, by = c("j" = "j")) %>%
mutate_all(as.character) %>%
filter(., !(i == k))

Related

Transformation of dataframe applied on 3 variables

I wrote several commands to transform a dataframe but I would like to simplify the code, which I wrote in four parts. Parts 1, 2 and 3 compute counts for columns 1, 2 and 3 (count the number of times each value is repeated in each column, and fill in the missing values between 0 and the maximum value across the three columns). The fourth part joins the previous outputs.
I would like to simplify it so that the transformation of the 3 columns happens in one block of code instead of 4. Is it possible to do this without writing a function?
Thank you in advance.
set.seed(1234)
# Data
A=sample(0:10, 20, replace = TRUE)
B=sample(0:10, 20, replace = TRUE)
C=sample(0:10, 20, replace = TRUE)
df=data.frame(A,B,C)
A B C
1 9 2 0
2 5 3 5
3 4 9 7
4 8 4 2
5 4 1 5
6 5 7 0
7 3 10 0
8 1 3 8
9 6 2 7
10 5 6 9
11 9 8 0
12 5 2 10
13 3 5 7
14 7 3 9
15 3 7 5
16 3 9 2
17 4 10 8
18 7 1 2
19 3 4 5
20 7 5 8
# Count for A
df2=data.frame(A=0:max(max(df$A),max(df$B),max(df$C)))
df3_A= df %>%
select(A) %>%
group_by(A) %>%
mutate(A_number= n()) %>%
distinct(A_number, .keep_all = TRUE) %>%
ungroup() %>%
complete (df2)
df3_A$A_number[is.na(df3_A$A_number)]=0
# Count for B
df2=data.frame(B=0:max(max(df$A),max(df$B),max(df$C)))
df3_B= df %>%
select(B) %>%
group_by(B) %>%
mutate(B_number= n()) %>%
distinct(B_number, .keep_all = TRUE) %>%
ungroup() %>%
complete (df2)
df3_B$B_number[is.na(df3_B$B_number)]=0
# Count for C
df2=data.frame(C=0:max(max(df$A),max(df$B),max(df$C)))
df3_C= df %>%
select(C) %>%
group_by(C) %>%
mutate(C_number= n()) %>%
distinct(C_number, .keep_all = TRUE) %>%
ungroup() %>%
complete (df2)
df3_C$C_number[is.na(df3_C$C_number)]=0
# Join
df3= df3_A %>%
left_join(df3_B, by=c("A"="B")) %>%
left_join(df3_C, by=c("A"="C"))
A A_number B_number C_number
<int> <dbl> <dbl> <dbl>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
Using base: stack and table:
as.data.frame.matrix(table(stack(df)))
# A B C
# 0 0 0 4
# 1 1 2 0
# 2 0 3 3
# 3 5 3 0
# 4 3 2 0
# 5 4 2 4
# 6 1 1 0
# 7 3 2 3
# 8 1 1 3
# 9 2 2 2
# 10 0 2 1
You can reshape to long, count the values by variables, then reshape back to wide filling missings with zero:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything()) %>%
count(name, value) %>%
pivot_wider(values_from = n, values_fill = 0) %>%
arrange(value)
# A tibble: 11 × 4
value A B C
<int> <int> <int> <int>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
You can use vctrs::vec_count over the columns and then merge the data.frames together:
library(dplyr)
library(purrr)
df %>%
mutate(across(A:C, factor, levels = 0:10, ordered = TRUE)) %>%
map(vctrs::vec_count) %>%
imap(~ {name <- paste0("count", .y)
rename_with(.x, ~ name, count)}) %>%
reduce(full_join, by = "key") %>%
replace(is.na(.), 0) %>%
arrange(key)
output
key countA countB countC
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1

Create a new dataframe with 1's and 0's from summarized data?

I have the below dataset that I am working with in R:
df <- data.frame(day=seq(1,3,1), tot.infected=c(1,2,4), tot.ind=5)
df
And I would like to transform the tot.infected column into a binomial variable with 1's and 0's, such as the following dataframe:
df2 <- data.frame(year = c(rep(1,5), rep(2,5), rep(3,5)), infected = c(rep(1,1), rep(0,4), rep(1,2), rep(0,3), rep(1,4), rep(0,1)))
Is there a more elegant way to do this in R?
Thank you for your help!
I tried hard-coding a dataframe using rep(), but this is extremely time-consuming for large datasets and I was looking for a more elegant way to achieve this.
base R
tmp <- do.call(Map, c(list(f = function(y, inf, ind) data.frame(year = y, infected = replace(integer(ind), seq(ind) <= inf, 1L))), unname(df)))
do.call(rbind, tmp)
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
dplyr
library(dplyr)
df %>%
rowwise() %>%
summarize(tibble(year = day, infected = replace(integer(tot.ind), seq(tot.ind) <= tot.infected, 1L)))
# # A tibble: 15 x 2
# year infected
# <dbl> <int>
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
We can do it this way:
library(dplyr)
df %>%
group_by(day) %>%
summarise(cur_data()[seq(unique(tot.ind)),]) %>%
#mutate(x = row_number())
mutate(tot.infected = ifelse(row_number() <= first(tot.infected),
first(tot.infected)/first(tot.infected), 0), .keep="used")
day tot.infected
<dbl> <dbl>
1 1 1
2 1 0
3 1 0
4 1 0
5 1 0
6 2 1
7 2 1
8 2 0
9 2 0
10 2 0
11 3 1
12 3 1
13 3 1
14 3 1
15 3 0
Using rep.int and replace, basically.
with(df, data.frame(
year=do.call(rep.int, unname(df[c(1, 3)])),
infected=unlist(Map(replace, Map(rep.int, 0, tot.ind), lapply(tot.infected, seq), 1))
))
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
Data:
df <- structure(list(day = c(1, 2, 3), tot.infected = c(1, 2, 4), tot.ind = c(5,
5, 5)), class = "data.frame", row.names = c(NA, -3L))

R joining on counts of elements in a vector to matching index in a dataframe

I have a dataframe df that looks like this:
indx
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
df<-structure(list(indx = 1:10), row.names = c(NA, 10L), class = "data.frame")
And a vector vec that looks like this:
vec<-c(3,5,9,9,5,4,3,3)
I would like to find the counts of each element in vec e.g. by doing this:
vec_counts<-table(vec)
3 4 5 9
3 1 2 2
And then join these counts into the matching indx in df. The final result should be:
indx vec_counts
1 1 0
2 2 0
3 3 3
4 4 1
5 5 2
6 6 0
7 7 0
8 8 0
9 9 2
10 10 0
You can use merge.
x <- merge(df, as.data.frame(table(vec)), by.x = "indx", by.y = "vec", all.x = TRUE)
names(x) <- c("indx", "vec_counts")
x$vec_counts[is.na(x$vec_counts)] <- 0
x
# indx vec_counts
# 1 1 0
# 2 2 0
# 3 3 3
# 4 4 1
# 5 5 2
# 6 6 0
# 7 7 0
# 8 8 0
# 9 9 2
# 10 10 0
Also you can use outer() to calculate it in one quick step without table().
df$vec_counts <- rowSums(outer(df$indx, vec, `==`))
# or with dplyr
library(dplyr)
df %>%
mutate(vec_counts = rowSums(outer(indx, vec, `==`)))
A possible solution, based on tidyverse:
library(tidyverse)
df %>%
mutate(count = map_dbl(indx, ~ sum(.x == vec)))
#> indx count
#> 1 1 0
#> 2 2 0
#> 3 3 3
#> 4 4 1
#> 5 5 2
#> 6 6 0
#> 7 7 0
#> 8 8 0
#> 9 9 2
#> 10 10 0
Or in base R:
df$count <- apply(df, 1, \(x) sum(x[1] == vec))
Or even, still in base R, using sapply:
df$count <- sapply(df$indx, \(x) sum(x == vec))

Subsetting data based on a value within ids in r

I'm trying to subset a dataset based on two criteria. Here is a snapshot of my data:
ids <- c(1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3)
seq <- c(1,2,3,4,5,6, 1,2,3,4,5,6, 1,2,3,4,5,6)
type <- c(1,1,5,1,1,1, 1,1,1,8,1,1, 1,1,1,1,1,1)
data <- data.frame(ids, seq, type)
ids seq type
1 1 1 1
2 1 2 1
3 1 3 5
4 1 4 1
5 1 5 1
6 1 6 1
7 2 1 1
8 2 2 1
9 2 3 1
10 2 4 8
11 2 5 1
12 2 6 1
13 3 1 1
14 3 2 1
15 3 3 1
16 3 4 1
17 3 5 1
18 3 6 1
ids is the student id, seq is the sequence of the questions (items) students take. type refers to the type of the question: 1 is a simple item, while 5 or 8 are complicated items. What I would like to do is to generate a first variable (complex) indicating whether or not the student has a complicated item (type = 5 | 8). Then I would like to get:
> data
ids seq type complex
1 1 1 1 1
2 1 2 1 1
3 1 3 5 1
4 1 4 1 1
5 1 5 1 1
6 1 6 1 1
7 2 1 1 1
8 2 2 1 1
9 2 3 1 1
10 2 4 8 1
11 2 5 1 1
12 2 6 1 1
13 3 1 1 0
14 3 2 1 0
15 3 3 1 0
16 3 4 1 0
17 3 5 1 0
18 3 6 1 0
The second step is to split data within students.
(a) For the student who has non-complex items (complex=0), I would like to split the dataset at the halfway point and get this below:
>simple.split.1
ids seq type complex
13 3 1 1 0
14 3 2 1 0
15 3 3 1 0
>simple.split.2
ids seq type complex
16 3 4 1 0
17 3 5 1 0
18 3 6 1 0
(b) for the students who have complex items (complex=1), I would like to set the complex item as a cutting point and split the data from there. So the data should look like this (excluding complex item):
>complex.split.1
ids seq type complex
1 1 1 1 1
2 1 2 1 1
7 2 1 1 1
8 2 2 1 1
9 2 3 1 1
>complex.split.2
ids seq type complex
4 1 4 1 1
5 1 5 1 1
6 1 6 1 1
11 2 5 1 1
12 2 6 1 1
Any thoughts?
Thanks
Here's a way to do it using data.table, zoo packages and split function:
library(data.table)
library(zoo)
## set data to data.table & add a flag 1 where type is 5 or 8,
## carry the complex flag forward and backward within each id,
## then replace remaining NA values in the complex column with 0
setDT(data)[, complex := ifelse(type == 5 | type == 8, 1, NA_integer_), by = ids][
  , complex := na.locf(na.locf(complex, na.rm=FALSE), na.rm=FALSE, fromLast=TRUE), by = ids][
  , complex := ifelse(is.na(complex), 0, complex)]
data <- data[!(type == 5 | type == 8), ] ## removing rows where type equals 5 or 8
complex <- split(data, data$complex) ## split data based on complex flag
complex_0 <- as.data.frame(complex$`0`) ## saving as data frame based on complex flag
complex_1 <- as.data.frame(complex$`1`)
split(complex_0, cut(complex_0$seq, 2)) ## split into equal parts
split(complex_1, cut(complex_1$seq, 2))
#$`(0.995,3.5]`
# ids seq type complex
#1 3 1 1 0
#2 3 2 1 0
#3 3 3 1 0
#$`(3.5,6]`
# ids seq type complex
#4 3 4 1 0
#5 3 5 1 0
#6 3 6 1 0
#$`(0.995,3.5]`
# ids seq type complex
#1 1 1 1 1
#2 1 2 1 1
#6 2 1 1 1
#7 2 2 1 1
#8 2 3 1 1
#$`(3.5,6]`
# ids seq type complex
#3 1 4 1 1
#4 1 5 1 1
#5 1 6 1 1
#9 2 5 1 1
#10 2 6 1 1
If you prefer using the tidyverse, here's an approach:
ids <- c(1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3)
seq <- c(1,2,3,4,5,6, 1,2,3,4,5,6, 1,2,3,4,5,6)
type <- c(1,1,5,1,1,1, 1,1,1,8,1,1, 1,1,1,1,1,1)
data <- data.frame(ids, seq, type)
step1.data <- data %>%
group_by(ids) %>%
mutate(complex = ifelse(any(type %in% c(5,8)), 1, 0)) %>%
ungroup()
simple.split.1 <- step1.data %>%
filter(complex == 0) %>%
group_by(ids) %>%
filter(seq <= mean(seq)) %>% #if you happen to have more than 6 questions in seq, this gives the midpoint
ungroup()
simple.split.2 <- step1.data %>%
filter(complex == 0) %>%
group_by(ids) %>%
filter(seq > mean(seq)) %>%
ungroup()
complex.split.1 <- step1.data %>%
filter(complex == 1) %>%
arrange(ids, seq) %>%
group_by(ids) %>%
filter(seq < min(seq[type %in% c(5,8)])) %>%
ungroup()
complex.split.2 <- step1.data %>%
filter(complex == 1) %>%
arrange(ids, seq) %>%
group_by(ids) %>%
filter(seq > min(seq[type %in% c(5,8)])) %>%
ungroup()

How to unlist a list of data frames into a single data frame in R

Suppose I have a list of data frames:
lst<-list(data.frame(x=1:3,y=2:4, row.names=letters[1:3]),
data.frame(z=1:4,w=2:5, row.names=letters[3:6]),
data.frame(r=2:4,s=3:5, row.names=letters[2:4]))
lst
[[1]]
x y
a 1 2
b 2 3
c 3 4
[[2]]
z w
c 1 2
d 2 3
e 3 4
f 4 5
[[3]]
r s
b 2 3
c 3 4
d 4 5
My question is how to unlist it to a single data frame merged by rownames and replace NAs to 0 such as:
x y z w r s
a 1 2 0 0 0 0
b 2 3 0 0 2 3
c 3 4 1 2 3 4
d 0 0 2 3 4 5
e 0 0 3 4 0 0
f 0 0 4 5 0 0
In base R:
lst2 <- lapply(lst,function(x) cbind(rowname=rownames(x),x))
df1 <- Reduce(function(x,y) merge(x,y,all=T),lst2)
rownames(df1) <- df1[[1]]
df1 <- df1[-1]
df1[is.na(df1)] <- 0
df1
# x y z w r s
# a 1 2 0 0 0 0
# b 2 3 0 0 2 3
# c 3 4 1 2 3 4
# d 0 0 2 3 4 5
# e 0 0 3 4 0 0
# f 0 0 4 5 0 0
tidyverse can make things more compact/readable:
library(tidyverse)
lst %>%
map(rownames_to_column) %>%
reduce(full_join) %>%
`[<-`(is.na(.),value=0) %>%
column_to_rownames
# x y z w r s
# a 1 2 0 0 0 0
# b 2 3 0 0 2 3
# c 3 4 1 2 3 4
# d 0 0 2 3 4 5
# e 0 0 3 4 0 0
# f 0 0 4 5 0 0
merging directly by row names
merge supports merging by row names if you set the argument by to "row.names" or to 0, but strangely it returns a dataframe with a column Row.names and no actual row names. This makes the Reduce call much less smooth than it could have been, so in the end it's not much better, and possibly worse, than my original base solution:
df1 <- Reduce(function(x,y) {
z <- merge(x,y,all=T,by=0)
rownames(z) <- z[[1]]
z[-1]},
lst)
df1[is.na(df1)] <- 0
An alternative tidyverse solution that in my microbenchmark is very marginally faster than the other solution (~0.5 ms on a 5ms operation), but I think is a bit more hacky and certainly less neat. This approach basically avoids calling full_join so many times by simply row binding the data frames and then collapsing by rowname, trusting mean, na.rm = TRUE to just drop all except the one nonmissing value.
library(tidyverse)
lst <- list(
data.frame(x = 1:3, y = 2:4, row.names = letters[1:3]),
data.frame(z = 1:4, w = 2:5, row.names = letters[3:6]),
data.frame(r = 2:4, s = 3:5, row.names = letters[2:4])
)
lst %>%
map(rownames_to_column) %>%
bind_rows() %>%
group_by(rowname) %>%
summarise_all(mean, na.rm = TRUE) %>%
map_dfc(replace_na, 0)
#> # A tibble: 6 x 7
#> rowname x y z w r s
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 a 1 2 0 0 0 0
#> 2 b 2 3 0 0 2 3
#> 3 c 3 4 1 2 3 4
#> 4 d 0 0 2 3 4 5
#> 5 e 0 0 3 4 0 0
#> 6 f 0 0 4 5 0 0
Created on 2018-06-22 by the reprex package (v0.2.0).

Resources