Given a matrix m how can I create a TRUE/ FALSE or 1 / 0 matrix where the columns and rows match some "key" in a data frame?
My goal is to assign a 1 or 0 to the location in the matrix where the columns match the cols and the rows match the rows in the colsrows_df. Then essentially just extract the observations where this is true or paste them into the colsrows_df next to the correct columns.
The below forloop just creates diagonally 1's and 0's
m <- matrix(runif(30), nrow = 20, ncol = 20)
dimnames(m) <- list(c(paste0("ID", 1:5, "_2000"), paste0("ID", 1:5, "_2001"), paste0("ID", 1:5, "_2002"), paste0("ID", 1:5, "_2003")),
c(paste0("ID", 1:5, "_2000"), paste0("ID", 1:5, "_2001"), paste0("ID", 1:5, "_2002"), paste0("ID", 1:5, "_2003")))
cols <- colnames(m)
rows <- rownames(m)
library(tidyr)
library(dplyr)
colsrows <- cbind(cols, rows)
# Here I just separate the rows/cols and then add an extra year and paste them back together
colsrows_df <- colsrows %>%
data.frame %>%
separate(cols, c("id_col", "year_col"), "_", remove = FALSE) %>%
separate(rows, c("id_row", "year_row"), "_", remove = FALSE) %>%
mutate(year_row_plus_1 = as.numeric(year_row) + 1,
rows = paste0(id_row,"_", year_row_plus_1)) %>%
select(cols, rows)
colsrows_df
for(i in 1:nrow(colsrows)){
m[i, ] <- colnames(m) == colsrows_df$cols
m[, i] <- rownames(m) == colsrows_df$rows
}
m
EDIT:
This seems to "solve" the problem however I am not sure how robust it is.
ids <- colsrows_df[colsrows_df$cols %in% colnames(m) &
colsrows_df$rows %in% rownames(m), ]
res <- melt(m[as.matrix(colsrows_df[colsrows_df$cols %in% colnames(m) &
colsrows_df$rows %in% rownames(m), ][2:1])])
cbind(ids, res)
I think can you first filter colsrows_df with rownames and colnames which are actually present in m then change the order of columns, convert to matrix , use it to subset m and change those values to 1.
m[as.matrix(colsrows_df[colsrows_df$cols %in% colnames(m) &
colsrows_df$rows %in% rownames(m), ][2:1])] <- 1
Then convert remaining ones to 0
m[m != 1] <- 0
Related
I have created two data frames that I then turn into lists (e.g., list1 and list2). I removed one element from list2 to better represent my example data set.
library(dplyr)
intervals <- rep_len(c("01-01-2022", "01-11-2022", "01-31-2022"), 100)
ID <- rep(c("A","B", "C"), 100)
df <- data.frame(ID = as.factor(ID),
intervals = as.factor(intervals))
list1 <- df %>%
group_by(ID, intervals) %>%
group_split()
intervals <- rep_len(c("01-01-2022", "01-11-2022", "01-31-2022"), 25)
ID <- rep(c("A","B"), 25)
df2 <- data.frame(ID = as.factor(ID),
intervals = as.factor(intervals))
list2 <- df2 %>%
group_by(ID, intervals) %>%
group_split()
list2 <- list2[-6]
For each of these list I have added an attribute, and I have included a function to check the added attribute more readily (check).
# Convenience function to grab the attributes for you
check <- function(list, attribute_name) {
return(attr(list, attribute_name))
}
# Add an attribute to hold the attributes of each list element
attr(list1, "match") <- data.frame(id = sapply(list1, function(x) paste(x$ID[1])),
interval_start_date = sapply(list1, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(list1, "match")
# Add an attribute "tab" to hold the attributes of each list element
attr(list2, "match") <- data.frame(id = sapply(list2, function(x) paste(x$ID[1])),
interval_start_date = sapply(list2, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(list2, "match")
I have created an index for the two list, and the objective here is to remove any list components that don't have the same ID and the same intervals. The goal is to have only the matching IDs with the same intervals.
# Creates an index for the two list based on the attributes,
dat2 <- check(list1, "match")
dat1 <- check(list2, "match")
# Removes rows where the id isn't present in both data frames, and creates a
# index where both the interval and id are the same.
if (!length(unique(dat2$id)) == length(unique(dat1$id))){
dat3 <- dat2[dat2$id %in% dat1$id, ]
dat4 <- dat1[dat1$id %in% dat2$id, ]
i1 <- paste(dat3[["id"]], format(as.Date(dat3[["interval_"]]),
"%Y-%d")) %in%
paste(dat4[["id"]], format(as.Date(dat4[["interval_"]]),
"%Y-%d"))
}
Now here is where I begin to get an error:
# Error occurs because the lengths of `i1` is not the same as `list2`
out <- list1[i1]
I know that this is occuring because list1 does not have the same length as i1. I'm wondering if there is a way to appending logical values to i1 to get it the same length as list1, but in a way that it doesn't remove values from list1 that we actually do want to keep. Any thoughts?
Here is my expected output for list1, where I hope it ends up with only the same IDs and intervals as list2.
# Expected output
expected_list1 <- list(list1[1], list1[2],list1[3], list1[4], list1[5])
This answer is close to what I would like, but it has an additional element. I think ultimately the attribute table should be similiar to that of dat4.
test <- list1[dat2$id %in% dat1$id][i1]
# Add an attribute "tab" to hold the attributes of each list element
attr(test, "match") <- data.frame(id = sapply(test, function(x) paste(x$ID[1])),
interval_start_date = sapply(test, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(test, "match")
There was a mismatch in the column name i.e. it is not interval_, but interval_start_date in dat1 and dat2. [[ will look for exact match whereas $ can match partial names as well
if (!length(unique(dat2$id)) == length(unique(dat1$id))){
ids_common <- intersect(dat2$id, dat1$id)
inds1 <- dat2$id %in% ids_common
inds2 <- dat1$id %in% ids_common
i1 <- paste(dat2[["id"]], format(as.Date(dat2[["interval_start_date"]]),
"%Y-%d")) %in%
paste(dat1[["id"]], format(as.Date(dat1[["interval_start_date"]]),
"%Y-%d"))
out <- list1[i1 & inds1]
}
-checking
> length(out)
[1] 5
> i1
[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
I would like to remove the rows of this dataframe in which, if the pattern ,2) exists, it just exist in one of the columns.
As an example: in this dataframe, each column is a character class (representing a vector in each position):
A c(0,1) c(1,1)
B c(0,2) c(0,1)
C c(1,1) c(0,1)
D c(1,2) c(0,2)
I would like to subset it, removing row B, as the pattern is present in one of the columns but not in the other.
I tried to use grep, but I don't know how to specify the conditional statement.
How can I achieve this?
For a single column we would do this (calling your data d)
d[!grepl(",2)", d$column_name, fixed = TRUE), ]
But we need to check all the columns and find rows that have exactly one match. For this, we'll convert to matrix and use rowSums to count the matches by row:
n_occurrences = rowSums(matrix(grepl(",2)", as.matrix(d), fixed = TRUE), nrow = nrow(d)))
d[n_occurrences != 1, ]
# V1 V2 V3
# 1 A c(0,1) c(1,1)
# 3 C c(1,1) c(0,1)
# 4 D c(1,2) c(0,2)
Using this sample data:
d = read.table(text = 'A c(0,1) c(1,1)
B c(0,2) c(0,1)
C c(1,1) c(0,1)
D c(1,2) c(0,2)')
Not as elegant as the selected answer above, but you can also split into two variables at the blank space and then create separate indices.
library(dplyr)
df = data.frame(v1=c('c(0,1) c(1,1)','c(0,2) c(0,1)',
'c(1,1) c(0,1)','c(1,2) c(0,2)'))
empty_omit <- function(vec) vec[vec!='']
get_even <- function(vec) vec[seq_along(vec) %% 2 == 0]
get_odd <- function(vec) vec[seq_along(vec) %% 2 ==1]
df$v2 = strsplit(df$v1, ' ') %>% unlist() %>% empty_omit %>% get_odd()
df$v3 = strsplit(df$v1, ' ') %>% unlist() %>% empty_omit %>% get_even()
idx_v2 = grepl(",2)", df$v2)
idx_v3 = grepl(",2)", df$v3)
df[!idx_v2 | idx_v3, ]
I have a list of ~8000 vectors, and I would like to know how many duplicates there are of these 8000 vectors, but the order of the elements in each could be different.
for example:
list <- c()
list[[1]] <- c(1,2,3)
list[[2]] <- c(2,1,3)
list[[3]] <- c(3,2,1)
list[[4]] <- c(4,5)
list[[5]] <- c(5,4)
list[[6]] <- c(1,2,3,5)
should give me a count of 3 for c(1,2,3) and 2 for c(4,5) and 1 for c(1,2,3,5)
I'd like the count of each of the duplicates, not just how many are duplicated.
library(tidyverse)
library(gtools)
get_perm <- function(v) {
m <- permutations(n = length(v), r = length(v), v = v, set = F)
m[order(c(m))]
}
all <- map(list, get_perm)
unique <- map(list, get_perm) %>% unique()
res_vec <- c()
element <- c()
for(i in seq_along(unique)) {
element[[i]] <- unique[[i]] %>% unique() %>% paste(collapse = ",")
res_vec[[i]] <- all %in% unique[i] %>% sum()
}
tibble(
elements = unlist(element),
numbers = res_vec
)
Result
# A tibble: 3 x 2
elements numbers
<chr> <int>
1 1,2,3 3
2 4,5 2
3 1,2,3,5 1
elements contains all the individual elements of the vectors for each group and numbers are the numbers of vectors you have in each group.
We create a function to take vector as an argument ('val'), then loop through the list with sapply, check if all the 'valare%in%the 'x', andsumthe logicalvector`
f1 <- function(lst, val) sum(sapply(lst, function(x) all(val %in% x)))
f1(list, c(1, 2, 3))
[#1] 3
f1(list, c(4, 5))
#[1] 2
Sooo
I’ve got two lists
list1 <- rep(c("john","steve","lisa","sara","anna"), c(50,0,15,25,10))
list2 <- rep(c("john","steve","lisa","sara","anna"), c(15,25,0,10,50))
I need to put them into a dataframe.
df <- as.data.frame(matrix(1, nrow = 100, ncol = 2))
df$v1 <- list1
Now the problem.
I need to put list2 into df$v2
with out any row in df containing the same values.
It does not matter what values are in each row.
I use this for testing it, if each rows contains the same value:
all(apply(ballots, 1, function(x) length(unique(x)) == 2) == TRUE)
to clarify:
I need each value in the columns, which row doesn't matter.
I need a way to randomize or change the order of the second column (or the first) in such a way that the same value is never in column one or two
The output:
V1 V2
John Steve
John Lisa
Sara John
John Lisa
Steve Anna
Currently, when I join the columns in the dataframe, there are many rows in both column one and two containing the same value.
Alright... finally found the answer after many trials and errors.
If anyone has a cleaner method to do this I would love to see one.
The following code takes list A and puts it in column A
takes list B, randomizes and puts in column C, Column B is NA
If A and C is not the same, switch column B and C.
If it fails to finish all the rows, it starts over, randomizing column C
library(taRifx)
failed.counter <- 0
while (failed.counter <= 1) {
list1 <- rep(c("A","B","C"), c(3,1,2))
list2 <- sample(rep(c("A","B","C"), c(2,3,1)))
df <- as.data.frame(matrix(NA, nrow = length(list1), ncol = 3))
df[,1] <- list1
df[,3] <- list2
iteration.counter <- 0
while (anyNA(df$V2) == TRUE && failed.counter == 0 ) {
iteration.counter <- iteration.counter + 1
df.sub <- df[is.na(df[,2]) & df[,1] != df[,3] & !is.na(df[,3]),]
df.sub <- df.sub[,c("V1", "V3", "V2")]
colnames(df.sub) <- c("V1", "V2", "V3")
r.names <- rownames(df.sub)
df[r.names,] <- df.sub
df[,3] <- shift(df[,3], 1, Wrap=TRUE)
if(iteration.counter >= nrow(df)+1) {failed.counter <- 1}
}
if(anyNA(df$V2) == FALSE) {failed.counter <- 2}
}
I have a multi-step problem. First step: match text in one string (df1) from one column to another range of columns (df2). There is no order of which columns match and the match could occur anywhere within the range. Once the match is found, copy the df2 row match into df1. Finally, repeat for the entire column.
df1= structure(list(Assay = c("ATG_AR_trans_up","NVS_PXR_cis","BSK_VCAM1_up"), p.value = c(0.01,0.05,0.0001)), .Names = c("Assay", "p.value"),row.names = c(NA, 3L), class = "data.frame")
df1
Assay p.value
ATG_AR_trans_up 0.01
NVS_hPXR 0.065
BSK_VCAM1_up 0.001
df2=structure(list(GeneID = c("AR", "VACM1", "TR", "ER", "PXR"), Assay1= c("ATG_ARE_cis", "BSK_hEDG_VCAM1", "NVS_TR_tran", "ATG_ER_UP", "NVS_PXRE_UP"), Assay2= c("ATG_AR_trans_up", "BSK_BE3K_VCAM1", "NA", "ATG_ERE_cis", "ATG_PXRE_cis"), Assay3= c("NVS_AR_trans", "BSK_VCAM1_UP", "NA", "NVS_ERa_CIS", "NVS_PXR_cis"), Assay4= c("Tox21_AR_ARE","NA", "NA", "Tox21_ERaERb_lig", "NA")), .Names = c("GeneID", "Assay1", "Assay2", "Assay3", "Assay4"),row.names = c(NA, 5L), class = "data.frame")
df2
GeneID Assay1 Assay 2 Assay3
AR ATG_ARE_cis NVS_hAR ATG_AR_trans_up
VACM1 BSK_hEGF_CAM1 BSK_VCAM1_up BSK_VCAM1_down
TR NVS_TR_tran NA NA
ER ATG_ER_UP ATG_ERE_cis NVS_ERa_CIS
PXR ATG_PXR_down ATG_PXRE_cis NVS_hPXR
Essentially becomes
df
Assay p.value GeneID Assay1 Assay2 Assay3
ATG_AR_trans_up 0.01 AR ATG_ARE_cis NVS_hAR ATG_AR_trans_up
NVS_hPXR 0.065 PXR ATG_PXR_down ATG_PXRE_cis NVS_hPXR
BSK_VCAM1_up 0.001 VCAM1 BSK_hEGF_CAM1 BSK_VCAM1_up BSK_VCAM1_down
For brevity I shortened the df substantially, but it is around 88 Assays and 4,000 some rows to go through for just one match (there are about 30). So a my initial instinct is to loop, but I was told grep might be a helpful package (even though it is not for R 3.2.2). Any help would be appreciated though.
Since OP was interested in a grep solution, another way to do it would be,
asDF2 <- apply(df2, 1, function(r) do.call(paste, as.list(r)))
do.call(rbind, lapply(1:nrow(df1),
function(i){
matchIX <- grepl(df1$Assay[i], asDF2, ignore.case=T)
if(any(matchIX))
cbind(df1[i, ], df2[matchIX, ])
}))
The first line creates a character vector with concatenated row assay names of df2. The second line loops through df1 and finds match in asDF2 using grepl
Or equivalently,
do.call(rbind, lapply(1:nrow(df1),
function(i){
matchIX <- grepl(df1$Assay[i],
data.frame(t(df2), stringsAsFactors=F),
ignore.case=T)
if(any(matchIX))
cbind(df1[i, ], df2[matchIX, ])
} ))
Note that above variants, can match multiple rows in df2 to df1.
NOTE
To test I added new rows to original data frames as
df1 <- rbind(df1, data.frame(Assay="NoMatch", p.value=.2))
df2 <- rbind(df2,
data.frame(GeneID="My", Assay1="NVS_PXR_cis", Assay2="NA", Assay3="NA", Assay4="NA"))
This can be easily done with reshaping. I put all the assays into all caps because that was messing up the matching.
library(dplyr)
library(tidyr)
library(stringi)
df2_ID = df %>% mutate(new_ID = 1:n() )
result =
df2_ID %>%
select(new_ID, Assay1:Assay85) %>%
gather(assay_number, Assay, Assay1:Assay85) %>%
mutate(Assay =
Assay %>%
iconv(to = "ASCII") %>%
stri_trans_toupper) %>%
inner_join(df1 %>%
mutate(Assay =
Assay %>%
iconv(to = "ASCII") %>%
stri_trans_toupper)) %>%
inner_join(df2_ID)
Since you're new to R, I think you are right that the most intuitive way to do this is with a for-loop. This is not the most concise or most efficient way to do this, but it should be clear what's going on.
# Creating example data
df1 <- as.data.frame(matrix(data=c("aa", "bb", "ee", .9, .5, .7), nrow=3))
names(df1) <- c("assay", "p")
df2 <- as.data.frame(matrix(data=c("G1", "G2", "aa", "dd", "bb", "ee", "cc", "ff"), nrow=2))
names(df2) <- c("GeneID", "assay1", "assay2", "assay3")
# Building a dataframe to store output
df3 <- as.data.frame(matrix(data=NA, nrow=dim(df1)[1], ncol=dim(df2)[2]))
names(df3) <- names(df2)
# Populating dataframe with output
for(i in 1:dim(df1)[1]){
index <- which(df2==as.character(df1$assay[i]), arr.ind = TRUE)[1]
for(j in 1:dim(df3)[2]){
df3[i,j] <- as.character(df2[index,j])
}
}
df <- cbind(df1, df3)
Edit after clarification from user:
I just created a triple for loop to check your values. Basically what it does is it looks for a match. It does this by looping through all columns and all the values from that column.
However my code is not perfect yet (also a beginner in R) and I just wanted to post it so that maybe we can work something out together :).
So I first convert your data to a data.frame. After that I create an empty output which I later fill per match found.
The improvements in this method would be that with this solution the function append will also append the column names which will result in multiple useless column names.
df3 <- as.data.frame(df1)
df4 <- as.data.frame(df2)
output <- data.frame()
for(j in 1:nrow(df3)) {
match <- FALSE
for(i in 2:(ncol(df4))) {
for(p in 1:nrow(df4)) {
if((df3[j, 1] == df4[p, i]) && (match == FALSE)) {
output <- append(output, c(df3[j, ], df4[j, ]))
match <- TRUE
}
}
}
}
Assuming, you don't have any repeated entry corresponding to the entry in df1. Following is the solution for your problem:
assay <-as.matrix(df1[,1])
m1 <- as.numeric(sapply(assay, function(x){grep(x,df2[,2], ignore.case = T)}, simplify = FALSE))
m2 <- as.numeric(sapply(assay, function(x){grep(x,df2[,3], ignore.case = T)}, simplify = FALSE))
m3 <- as.numeric(sapply(assay, function(x){grep(x,df2[,4], ignore.case = T)}, simplify = FALSE))
m4 <- as.numeric(sapply(assay, function(x){grep(x,df2[,5], ignore.case = T)}, simplify = FALSE))
m1[is.na(m1)] <- 0
m2[is.na(m2)] <- 0
m3[is.na(m3)] <- 0
m4[is.na(m4)] <- 0
m0 <- (m1+m2+m3+m4)
df <- NULL
for(i in 1:nrow(df1){
df3 = cbind(df1[i,],df2[m0[i],])
df = rbind(df,df3)
}
Edit: Generalization
Since you have more than 80 rows, you can generalize it as under:
assay <-as.matrix(df1[,1])
# Storing Assay column in a list
m <- vector('list',ncol(df2[, 2:ncol(df2)]))
for(i in 1:length(m)){
m[[i]] <- as.numeric(sapply(assay, function(x){grep(x,df2[,(i+1)], ignore.case = T)}, simplify = FALSE))
}
# Getting row subscript for df2
m1 <- as.data.frame(m)
m1[is.na(m1)] <- 0
m2 <- rowSums(m1)
df <- NULL
for(i in 1:nrow(df1)){
df3 = cbind(df1[i,],df2[m2[i],])
df = rbind(df,df3)
}