Find biggest independent subset of a connectivity matrix - r

I have two groups linked by a connectivity matrix like the following:
#
# X1 X2 X3 X4 X5 X6
# 1 0 0 0 0 0 V1
# 1 1 1 0 0 0 V2
# 0 1 0 0 0 0 V3
# 0 0 1 0 0 0 V4
# 0 0 0 1 0 0 V5
# 0 0 0 1 0 0 V6
# 0 0 0 0 1 0 V7
# 0 0 0 0 1 1 V8
# 0 0 0 0 1 0 V9
# 0 0 0 0 0 1 V10
#
So X1 is linked to V1 and V2 while V2 is linked to X1, X2 and X3 and so on. I need to find a way (algorithm or command) for getting all the biggest independent subsets of the matrix. So, in this case:
# X1 X2 X3
# 1 0 0 V1
# 1 1 1 V2
# 0 1 0 V3
# 0 0 1 V4
and:
# X4
# 1 V5
# 1 V6
and:
# X5 X6
# 1 0 V7
# 1 1 V8
# 1 0 V9
# 0 1 V10
Do you have any hint? I guess there's already some library or function to use either from graph analysis or linear algebra.

As you hinted we can do this with igraph:
# dummy data
df1 <- read.table(text = " X1 X2 X3 X4 X5 X6
V1 1 0 0 0 0 0
V2 1 1 1 0 0 0
V3 0 1 0 0 0 0
V4 0 0 1 0 0 0
V5 0 0 0 1 0 0
V6 0 0 0 1 0 0
V7 0 0 0 0 1 0
V8 0 0 0 0 1 1
V9 0 0 0 0 1 0
V10 0 0 0 0 0 1
")
library(dplyr)
library(tidyr)
library(igraph)
# make graph object
# Reshape the 0/1 matrix to long format (one row per cell), keep only the
# cells holding a 1, and let igraph use the first two columns (V, X) as the
# endpoints of each edge; the remaining `value` column becomes an edge attribute.
gg <-
  df1 %>%
  tibble::rownames_to_column(var = "V") %>%
  pivot_longer(-V, names_to = "X", values_to = "value") %>%
  filter(value == 1) %>%
  graph_from_data_frame()
# split based on clusters of graph
# Compute the component membership once. Splitting the vertex names directly
# always yields a list of character vectors; sapply(..., names) would collapse
# to a matrix whenever all clusters happen to have the same size.
cl_membership <- components(gg)$membership
lapply(
  split(names(cl_membership), cl_membership),
  function(i)
    # keep only the rows / columns whose names belong to this cluster
    df1[intersect(rownames(df1), i),
        intersect(colnames(df1), i),
        drop = FALSE])
# $`1`
# X1 X2 X3
# V1 1 0 0
# V2 1 1 1
# V3 0 1 0
# V4 0 0 1
#
# $`2`
# X4
# V5 1
# V6 1
#
# $`3`
# X5 X6
# V7 1 0
# V8 1 1
# V9 1 0
# V10 0 1

Related

Create subset of the sample by different variables simultaneously

I have a data frame as the following. Variables a and b are continuous, and variables v1-v7 are binary.
> df <- data.frame(a= c(1,1,2,3,5),
+ b = c(3, 6,8, 2, 4),
+ v1 = c(0,0,0,0,0),
+ v2 = c(1,0,0,0,0),
+ v3 = c(0,1,1,1,1),
+ v4 = c(0,1,1,1,1),
+ v5 = c(0,0,0,0,1),
+ v6 = c(0,0,0,0,0),
+ v7 = c(0,0,0,0,0))
> df
a b v1 v2 v3 v4 v5 v6 v7
1 1 3 0 1 0 0 0 0 0
2 1 6 0 0 1 1 0 0 0
3 2 8 0 0 1 1 0 0 0
4 3 2 0 0 1 1 0 0 0
5 5 4 0 0 1 1 1 0 0
>
I want to create seven subsamples based on the data frame I showed above. Specifically, I want to make seven subsamples that only include variables a and b and when each v1-v7 equals 1. For example,
> df1 <- df %>% filter(v1==1)
> df1
[1] a b v1 v2 v3 v4 v5 v6 v7
<0 rows> (or 0-length row.names)
> df2 <- df %>% filter(v2==1)
> df2
a b v1 v2 v3 v4 v5 v6 v7
1 1 3 0 1 0 0 0 0 0
> df3 <- df %>% filter(v3==1)
> df3
a b v1 v2 v3 v4 v5 v6 v7
1 1 6 0 0 1 1 0 0 0
2 2 8 0 0 1 1 0 0 0
3 3 2 0 0 1 1 0 0 0
4 5 4 0 0 1 1 1 0 0
I want to know how can I do these simultaneously in R? Thanks.
Here's a way with lapply(). You are better off keeping your results in a list. Subsample for v1 would be subsamples[[1]] and so on. -
# For each indicator column (positions 3 to 9 of df), keep the rows where
# that column equals 1; the results are collected in an unnamed list.
subsamples <- lapply(seq(3, 9), function(col_idx) {
  keep <- df[[col_idx]] == 1
  df[keep, ]
})
subsamples
[[1]]
[1] a b v1 v2 v3 v4 v5 v6 v7
<0 rows> (or 0-length row.names)
[[2]]
a b v1 v2 v3 v4 v5 v6 v7
1 1 3 0 1 0 0 0 0 0
[[3]]
a b v1 v2 v3 v4 v5 v6 v7
2 1 6 0 0 1 1 0 0 0
3 2 8 0 0 1 1 0 0 0
4 3 2 0 0 1 1 0 0 0
5 5 4 0 0 1 1 1 0 0
[[4]]
a b v1 v2 v3 v4 v5 v6 v7
2 1 6 0 0 1 1 0 0 0
3 2 8 0 0 1 1 0 0 0
4 3 2 0 0 1 1 0 0 0
5 5 4 0 0 1 1 1 0 0
[[5]]
a b v1 v2 v3 v4 v5 v6 v7
5 5 4 0 0 1 1 1 0 0
[[6]]
[1] a b v1 v2 v3 v4 v5 v6 v7
<0 rows> (or 0-length row.names)
[[7]]
[1] a b v1 v2 v3 v4 v5 v6 v7
<0 rows> (or 0-length row.names)
in dplyr you can specify a variable name as character string with the pronoun .data (see data masking)
# One filtered data frame per indicator column v1..v7, stored by position.
df_samples <- vector("list", 7)
for (i in seq_len(7)) {
  # .data[[name]] lets filter() look a column up by its name as a string
  df_samples[[i]] <- filter(df, .data[[paste0("v", i)]] == 1)
}
Just loop over the columns 'v1' to 'v7' and do the filter and return in a list
library(dplyr)
library(stringr)
library(purrr)
# Select the column names that start with "v" followed by digits, then build
# one filtered data frame per such column: the rows where that column equals 1.
# The inner `~ .x == 1` is the predicate if_all() applies to the selected column.
lst1 <- str_subset(names(df), "^v\\d+") %>%
map(~ df %>%
filter(if_all(all_of(.x), ~ .x == 1)))
# Name the list elements df1, df2, ... in column order.
names(lst1) <- str_c('df', seq_along(lst1))
It is better to keep it in a list. If we need objects created in the global env (not recommended), use list2env on the named list
list2env(lst1, .GlobalEnv)

How to find percentage of people belonging to atleast 2 groups in r

I just started using R. And I have a stata dataset which I opened in R. In the questionnaire there is a question “Please look carefully at the following list of political groups and say which, if any, do you belong to?” . Variable v1 to v10 represents the different groups and each have values of 1 or 0 which is ‘yes’ or ‘no’.
My question is: How do I find the percentage of people who are members of at least 2 groups?
I think I’m supposed to use dplyr but I am not sure.
One of the idea that I've got was to use filter and mutate.
Does this work:
> library(dplyr)
> stat <- data.frame(v1 = sample(c(0,1), 10, T),
+ v2 = sample(c(0,1), 10, T),
+ v3 = sample(c(0,1), 10, T),
+ v4 = sample(c(0,1), 10, T),
+ v5 = sample(c(0,1), 10, T),
+ v6 = sample(c(0,1), 10, T),
+ v7 = sample(c(0,1), 10, T),
+ v8 = sample(c(0,1), 10, T),
+ v9 = sample(c(0,1), 10, T),
+ v10 = sample(c(0,1), 10, T), stringsAsFactors = F)
> stat
v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
1 0 1 1 1 1 1 1 0 0 1
2 0 1 1 0 0 1 1 1 0 1
3 0 1 1 0 1 0 0 1 1 0
4 0 0 1 1 0 1 0 1 0 0
5 0 0 1 1 0 1 0 1 1 0
6 0 1 0 1 1 1 1 1 1 0
7 0 0 1 0 0 0 0 1 0 1
8 0 0 1 1 1 1 0 0 0 1
9 0 1 0 0 0 1 0 0 0 1
10 0 1 1 0 0 0 0 0 1 1
> stat %>% mutate(groups_member = rowSums(.)) %>% mutate(atleast_two_groups = case_when(groups_member >= 2 ~ 'Yes', TRUE ~ 'No')) %>% select(-groups_member)
v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 atleast_two_groups
1 0 1 1 1 1 1 1 0 0 1 Yes
2 0 1 1 0 0 1 1 1 0 1 Yes
3 0 1 1 0 1 0 0 1 1 0 Yes
4 0 0 1 1 0 1 0 1 0 0 Yes
5 0 0 1 1 0 1 0 1 1 0 Yes
6 0 1 0 1 1 1 1 1 1 0 Yes
7 0 0 1 0 0 0 0 1 0 1 Yes
8 0 0 1 1 1 1 0 0 0 1 Yes
9 0 1 0 0 0 1 0 0 0 1 Yes
10 0 1 1 0 0 0 0 0 1 1 Yes
>
So the data frame is like a matrix with 10 variables, each holding either 0 or 1. Creating a new column with the row sums and checking whether that total is at least 2 tells you, for each person, whether they belong to at least two groups. The percentage you asked for is then the share of rows flagged 'Yes', e.g. mean(rowSums(stat) >= 2) * 100.
You can create a new column that adds up the 1s and 0s in each row, then count the rows whose total is below 2 and the rows whose total is at least 2.
set.seed(1234)
# 10 x 10 matrix of 0/1 indicators: a cell is 1 when its uniform draw is < 0.1.
dat <- matrix(ifelse(runif(100) >= 0.1, 0, 1), 10, 10) %>%
  as_tibble(.name_repair = "unique")
# Row totals give the number of groups per person; report the share of rows
# below 2 and the share with 2 or more (the latter is named `more_than_two`).
dat %>%
  mutate(rsum = rowSums(.)) %>%
  summarise(fewer_than_two = 100 * sum(rsum < 2) / n(),
            more_than_two = 100 * sum(rsum >= 2) / n())
# A tibble: 1 x 2
fewer_than_two more_than_two
<dbl> <dbl>
1 80 20
Start creating some fake data
library(dplyr)
df <- tibble(id = 1:5,
v1 = c(1, 1, 0, 0, 0),
v2 = c(1, 1, 0, 0, 0),
v3 = rep(0, 5),
v4 = rep(0, 5),
v5 = rep(0, 5),
v6 = rep(0, 5),
v7 = rep(0, 5),
v8 = rep(0, 5),
v9 = rep(0, 5),
v10 = rep(0, 5))
This is our table. Note that out of 5 observations we have 2 people (40%) who are members of at least 2 groups
> df
# A tibble: 5 x 11
id v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 0 0 0 0 0 0 0
2 2 1 1 0 0 0 0 0 0 0 0
3 3 0 0 0 0 0 0 0 0 0 0
4 4 0 0 0 0 0 0 0 0 0 0
5 5 0 0 0 0 0 0 0 0 0 0
First, I calculate the sum of the variables 1 to 10, creating a variable that gets true if greater than or equal to 2 and false otherwise. Then we group by this new variable and calculate the percentages
# Flag each person who belongs to two or more groups, then report what
# percentage of all rows falls in each of the two categories.
result <- df %>%
  rowwise() %>%
  mutate(two_or_more = sum(c_across(v1:v10)) >= 2) %>%
  group_by(two_or_more) %>%
  # n() is already the group size, so no further sum() is needed
  summarise(percentage = n() / nrow(df) * 100)
The result should look like this
> result
# A tibble: 2 x 2
two_or_more percentage
<lgl> <dbl>
1 FALSE 60
2 TRUE 40

Using R to remove all columns that sum to 0

I have a very large CSV file containing counts of unique DNA sequences, and there is a column for each unique sequence. I started with hundreds of samples and cut it down to only 15 that I care about but now I have THOUSANDS of columns that contain nothing but Zeroes and it is messing up my data processing. How do I go about completely removing any column that sums to zero? I’ve seen some similar questions on here but none of those suggestions have worked for me.
I have 6653 columns and 16 rows in my data frame.
If it matters my columns all have super crazy names, some several hundred characters long ( AATCGGCTAA..., etc) and the row names are the sample IDs which are also not entirely numeric. Any tips greatly appreciated. I am still new to R so please let me know where I will need to change things in code examples if you can! Thanks!
You can use colSums
set.seed(10)
df <- as.data.frame(matrix(sample(0:1, 50, replace = TRUE, prob = c(.8, .2)),
5, 10))
df
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
# 1 0 0 0 0 1 0 0 0 0 0
# 2 0 0 0 0 0 1 0 1 0 0
# 3 0 0 0 0 0 0 0 1 0 0
# 4 0 0 0 0 0 0 1 0 0 0
# 5 0 0 0 1 0 0 0 0 0 1
df[colSums(df) != 0]
# V4 V5 V6 V7 V8 V10
# 1 0 1 0 0 0 0
# 2 0 0 1 0 1 0
# 3 0 0 0 0 1 0
# 4 0 0 0 1 0 0
# 5 1 0 0 0 0 1
But you might not want to remove all columns which sum to 0, because that could be true even if not all elements are 0. Take V4 in the data frame below as an example.
df$V4[1] <- -1
df
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
# 1 0 0 0 -1 1 0 0 0 0 0
# 2 0 0 0 0 0 1 0 1 0 0
# 3 0 0 0 0 0 0 0 1 0 0
# 4 0 0 0 0 0 0 1 0 0 0
# 5 0 0 0 1 0 0 0 0 0 1
So if you want to only remove columns where all elements are 0, you can do
df[colSums(df == 0) < nrow(df)]
# V4 V5 V6 V7 V8 V10
# 1 -1 1 0 0 0 0
# 2 0 0 1 0 1 0
# 3 0 0 0 0 1 0
# 4 0 0 0 1 0 0
# 5 1 0 0 0 0 1
welcome to SO here is a tidyverse approach
library(tidyverse)
# Keep only the numeric columns whose total is non-zero. Testing `!= 0`
# (rather than `> 0`) avoids dropping columns with a negative total, and
# na.rm = TRUE keeps a missing value from turning the test into NA.
mtcars %>%
  select(where(~ is.numeric(.x) && sum(.x, na.rm = TRUE) != 0))

Transforming Binary data

I have a dataframe that only consists of 0 and 1. So for each individual instead of having one column with a factoral value (ex. low price, 4 rooms) I have
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0
2 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1
3 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0
4 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0
How can I transform the dataset in R, so that I create new columns (#number of rooms) and give the position of the 1 (in the 4th column) a vhigh value?
I have multiple explanatory variables I need to do this for. The 21 columns represent 6 variables for 1000+ observations. The result should be something like this:
PurchaseP. NumberofRooms ...
1. vhigh. 4
2. low. 4
3. vhigh. 1
4. vhigh. 2
I just did it for the first 2 explanatory variables here, but essentially it repeats like this, with each explanatory variable having 3-4 possible factor values.
V1:V4 = purchase price, V5:V8 = number of rooms,V9:V11 = floors, and so on
In my head something like this could work
Create an if statement to give each 1 a value depending on its column position, e.g. if the value in V4 is 1 then name it "vhigh", and do this for each Vx.
Then combine each column V1:V4, V5:V8, V9:V11 (depending on if it has 3-4 possible factoral/integer values) while ignoring 0 values.
Would this work, or is there a simpler approach? How would one code this in R?
Here is an approach that should work for you. I wrote a function, which will take as arguments your data.frame, the columns representing one of your variables of interest (e.g. purchase price is stored in columns 1 to 4), and the names of the levels you would like as a result. The function will then return the result you requested. You'll need to write this out for the 6 variables you are interested in.
I'll simulate some data and illustrate the approach.
# Simulated one-hot data: 4 rows x 8 columns, filled row by row.
# Columns 1-4 encode one variable and columns 5-8 another.
df <- data.frame(matrix(rep(c(0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1), 2),
                        nrow = 4, byrow = TRUE))
df
#> X1 X2 X3 X4 X5 X6 X7 X8
#> 1 0 0 0 1 1 0 0 0
#> 2 1 0 0 0 0 0 0 1
#> 3 0 0 0 1 1 0 0 0
#> 4 1 0 0 0 0 0 0 1
We'll say that the first four columns are the purchase price in v.low to v.high, and the second four are the number of rooms (1:4). We'll write a function that takes this information as arguments and returns the result:
# Collapse a set of one-hot (0/1) columns into a single vector of level names.
#
# For every row, finds the one column within `range` that holds a 1 and returns
# the matching entry of `lev.names`. The result always has one element per row:
# rows with no 1, or with more than one 1, yield NA instead of shifting the
# remaining values out of alignment.
rangeToCol <- function(df, # Your data.frame
                       range, # the columns that encode the category of interest
                       lev.names # The names of the category levels
) {
  tdf <- df[range]
  hits <- as.matrix(tdf) == 1
  idx <- vapply(seq_len(nrow(hits)), function(i) {
    pos <- which(hits[i, ]) # which() ignores NA cells
    if (length(pos) == 1L) pos else NA_integer_
  }, integer(1), USE.NAMES = FALSE)
  lev.names[idx]
}
# Decode both one-hot groups: columns 1-4 into price labels and
# columns 5-8 into a room count.
new.df <- data.frame(
  PurchaseP = rangeToCol(df, range = 1:4,
                         lev.names = c('vlow', 'low', 'high', 'vhigh')),
  NumberofRooms = rangeToCol(df, range = 5:8, lev.names = 1:4)
)
new.df
#> PurchaseP NumberofRooms
#> 1 vhigh 1
#> 2 vlow 4
#> 3 vhigh 1
#> 4 vlow 4

read in matrix into r without delimination

A file I want to read into R looks like this:
0010101010101010101
1110101010101010101
1111110101010111000
0001010101000010100
When I read that in, the problem is that R thinks every row is a number and just shows
V1
1 Inf
2 Inf
3 Inf
4 Inf
5 Inf
6 Inf
how can I read it in as a matrix with 0 and the other element?
One option is
as.matrix(read.fwf('triub.txt', widths=rep(1,19)))
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#[1,] 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
#[2,] 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
#[3,] 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0
#[4,] 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0
Or
as.matrix(read.table(text=gsub("", ' ', readLines('triub.txt'))))
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#[1,] 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
#[2,] 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
#[3,] 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0
#[4,] 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0
Or you can pipe with sed or awk (in linux)
as.matrix(read.table(pipe("sed 's/./& /g' triub.txt"), header=FALSE))
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#[1,] 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
#[2,] 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
#[3,] 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0
#[4,] 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0
as.matrix(read.table(pipe("awk 'BEGIN{FS=\"\"; OFS=\" \"}{$1=$1}1' triub.txt"),
header=FALSE))
You could treat it like a fixed-width data file and use the new readr library.
library(readr)
read_fwf("0010101010101010101
1110101010101010101
1111110101010111000
0001010101000010100", fwf_widths(rep(1,19)))
this returns a data.frame which you can convert to matrix with as.matrix
Or you could read the lines and split (useful if you don't know the number of columns ahead of time)
tx <- textConnection("0010101010101010101
1110101010101010101
1111110101010111000
0001010101000010100")
# Split every line into single characters, convert each to a number, and
# stack the resulting vectors as the rows of a matrix.
do.call(rbind, lapply(strsplit(readLines(tx), split = ""), as.numeric))
close(tx)
Note I only use textConnection() here to make a reproducible example. You can use readLines("filename.txt") for your real data file.

Resources