R - Linear regression with variables in different dataframes - r

I have 4 large matrices of the same size: A, B, C and D. Each matrix has n samples (columns) and n observations (rows).
A <- structure(list(S1 = c(0L, 0L, 1L, 1L), S2 = c(0L, 1L, 0L, 0L), S3 = c(0L, 0L, 0L, 1L)), class = "data.frame", row.names = c("Ob1", "Ob2", "Ob3", "Ob4"))
# S1 S2 S3
# Ob1 0 0 0
# Ob2 0 1 0
# Ob3 1 0 0
# Ob4 1 0 1
B <- structure(list(S1 = c(0L, 1L, 1L, 1L), S2 = c(0L, 8L, 0L, 0L), S3 = c(0L, 0L, 0L, 1L)), class = "data.frame", row.names = c("Ob1", "Ob2", "Ob3", "Ob4"))
# S1 S2 S3
# Ob1 0 0 0
# Ob2 1 8 0
# Ob3 1 0 0
# Ob4 1 0 1
C <- structure(list(S1 = c(0L, 0L, 4L, 1L), S2 = c(2L, 1L, 0L, 2L), S3 = c(0L, 0L, 0L, 1L)), class = "data.frame", row.names = c("Ob1", "Ob2", "Ob3", "Ob4"))
# S1 S2 S3
# Ob1 0 2 0
# Ob2 0 1 0
# Ob3 4 0 0
# Ob4 1 2 1
D <- structure(list(S1 = c(0L, 0L, 4L, 1L), S2 = c(8L, 1L, 5L, 0L), S3 = c(0L, 0L, 0L, 1L)), class = "data.frame", row.names = c("Ob1", "Ob2", "Ob3", "Ob4"))
# S1 S2 S3
# Ob1 0 8 0
# Ob2 0 1 0
# Ob3 4 5 0
# Ob4 1 0 1
Each matrix contains a different variable. I want to perform a linear regression of the 4 variables for each sample and observation of the matrices. I don't want a linear regression between every combination of samples and observations, just pairwise regressions: column 1 and row 1 in matrix A is fitted with column 1 and row 1 in matrices B, C and D; column 2 and row 2 with column 2 and row 2, and so on.
lm model:
lm(A ~ B * C + D)
I want:
lm(A$S1_Obs1 ~ B$S1_Obs1 * C$S1_Obs1 + D$S1_Obs1)
lm(A$S1_Obs2 ~ B$S1_Obs2 * C$S1_Obs2 + D$S1_Obs2)
lm(A$S1_Obs3 ~ B$S1_Obs3 * C$S1_Obs3 + D$S1_Obs3)
lm(A$S2_Obs1 ~ B$S2_Obs1 * C$S2_Obs1 + D$S2_Obs1)
lm(A$S2_Obs2 ~ B$S2_Obs2 * C$S2_Obs2 + D$S2_Obs2)
lm(A$S2_Obs3 ~ B$S2_Obs3 * C$S2_Obs3 + D$S2_Obs3)
...
Any help appreciated.

We may use asplit to split by row and then construct the linear model by looping each of the split elements in Map
# Fit one model per observation: the rows of A, B, C and D are split off in
# parallel and each set of matching rows is passed to lm() as a, b, c and d.
# The result is a list with one fitted model per row.
out <- mapply(
  function(a, b, c, d) lm(a ~ b * c + d),
  asplit(A, 1), asplit(B, 1), asplit(C, 1), asplit(D, 1),
  SIMPLIFY = FALSE
)

Here is an approach using the purrr package that assigns names as well:
library(purrr)
# Fit one model per index of A (using the same index into B, C and D), then
# name each fit with the array position that arrayInd() derives from the index,
# pasted together with "_".
# NOTE(review): this presumably expects A, B, C and D to be matrices, so that
# seq_along(A) runs over individual cells; for a data frame seq_along() runs
# over the columns instead -- confirm against the actual input type.
seq_along(A) %>%
map(~ lm(A[.] ~ B[.] * C[.] + D[.])) %>%
set_names(map(seq_along(.),
~ arrayInd(.x, dim(A)) %>%
paste(collapse = "_")))

Related

subset rows in dataframe using combinations of conditions

I have a data frame:
table = structure(list(Plot = 1:10, Sp1 = c(0L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 1L, 0L), Sp2 = c(1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
Sp3 = c(1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L), Sp4 = c(0L,
1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-10L))
0 represents a species (Sp) being absent from a plot. 1 represents a species being present.
First, I want to subset my data frame so that only plots with Sp1 or Sp3 or Sp4 remain. This can be done easily with filter from dplyr:
# Keep the plots in which at least one of Sp1, Sp3 or Sp4 is present.
# Columns are referenced by bare name (data masking) instead of table$Sp1:
# the `$` form reads the full, original column and therefore stops matching
# the rows being filtered as soon as the pipeline is grouped or pre-filtered.
reduced_table <- table %>%
  filter(Sp1 == 1 | Sp3 == 1 | Sp4 == 1)
But what if I want to reduce the table so that only plots containing any combination of two of these species remain? For example, plots with Sp1 and Sp3, or Sp1 and Sp4, or Sp3 and Sp4 would remain.
Can this be done elegantly, e.g. using filter? My real situation has many more species and therefore many more combinations, so explicitly writing out the combinations is not ideal.
We can use if_any with filter
library(dplyr)
# Keep every plot in which any of Sp1, Sp3 or Sp4 equals 1.
filter(
  table,
  if_any(c(Sp1, Sp3, Sp4), function(col) col == 1)
)
-output
# Plot Sp1 Sp2 Sp3 Sp4
#1 1 0 1 1 0
#2 2 0 0 1 1
#3 3 1 1 1 1
#4 4 1 0 1 0
#5 5 0 0 1 1
#6 6 1 0 0 0
#7 7 0 1 1 0
#8 8 0 0 1 1
#9 9 1 0 0 1
Or using a combination of columns
library(purrr)
# For every pair of species, keep the plots in which both are present, stack
# the per-pair results and drop plots that matched more than one pair.
combn(c("Sp1", "Sp3", "Sp4"), 2, simplify = FALSE) %>%
  map_dfr(function(pair) {
    # all_of() tells tidyselect that `pair` is a character vector of column
    # names; passing the bare external vector is ambiguous and deprecated.
    filter(table, if_all(all_of(pair), ~ .x == 1))
  }) %>%
  distinct()
If the intention is to do filtering on pairwise column checks, use combn from base R
# Keep the rows in which, for at least one pair of the listed species columns,
# both columns equal 1. Each pair yields one logical vector (one value per
# row); the vectors are OR-ed together and used as the row filter.
subset(
  table,
  Reduce(
    `|`,
    lapply(
      combn(c("Sp1", "Sp3", "Sp4"), 2, simplify = FALSE),
      function(pair) rowSums(table[pair] == 1) == 2
    )
  )
)

Create categorical variable from mutually exclusive dummy variables [duplicate]

This question already has answers here:
Reconstruct a categorical variable from dummies in R [duplicate]
(3 answers)
Closed 3 years ago.
How can I create a categorical variable from mutually exclusive dummy variables (taking values 0/1)?
Basically I am looking for the exact opposite of this solution: (https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781787124479/1/01lvl1sec22/creating-dummies-for-categorical-variables).
Would appreciate a base R solution.
For example, I have the following data:
dummy.df <- structure(c(1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L),
.Dim = c(10L, 4L),
.Dimnames = list(NULL, c("State.NJ", "State.NY", "State.TX", "State.VA")))
State.NJ State.NY State.TX State.VA
[1,] 1 0 0 0
[2,] 0 1 0 0
[3,] 1 0 0 0
[4,] 0 0 0 1
[5,] 0 1 0 0
[6,] 0 0 1 0
[7,] 1 0 0 0
[8,] 0 0 0 1
[9,] 0 0 1 0
[10,] 0 0 0 1
I would like to get the following results
state
1 NJ
2 NY
3 NJ
4 VA
5 NY
6 TX
7 NJ
8 VA
9 TX
10 VA
cat.var <- structure(list(state = structure(c(1L, 2L, 1L, 4L, 2L, 3L, 1L,
4L, 3L, 4L), .Label = c("NJ", "NY", "TX", "VA"), class = "factor")),
class = "data.frame", row.names = c(NA, -10L))
# Toy data: three mutually exclusive 0/1 indicator columns.
df <- data.frame(
  a = c(1, 0, 0, 0, 0),
  b = c(0, 1, 0, 1, 0),
  c = c(0, 0, 1, 0, 1)
)
# For each row, store the name of the column whose value is 1.
df[["cat"]] <- apply(as.matrix(df), MARGIN = 1, FUN = function(row) {
  names(df)[which(row == 1)]
})
Result:
> df
a b c cat
1 1 0 0 a
2 0 1 0 b
3 0 0 1 c
4 0 1 0 b
5 0 0 1 c
To generalize, you'll need to play with the df and names(df) part, but you get the drift. One option would be to make a function, e.g.,
# Build one categorical column from mutually exclusive 0/1 dummy columns.
#
# Args:
#   data:     data frame holding the dummy columns.
#   varnames: character vector naming the dummy columns; for each row the name
#             of the column that equals 1 becomes the category value.
#   catname:  name of the column to create (or overwrite) with the category.
#
# Returns:
#   `data` with the additional character column `catname`. Rows in which no
#   dummy equals 1 get NA.
#
# Raises:
#   An error if some row has more than one dummy equal to 1, because the
#   category would then be ambiguous.
catmaker <- function(data, varnames, catname) {
  # drop = FALSE keeps the selection two-dimensional even when `varnames`
  # names a single column.
  is_set <- as.matrix(data[, varnames, drop = FALSE]) == 1
  is_set[is.na(is_set)] <- FALSE
  n_set <- rowSums(is_set)
  if (any(n_set > 1)) {
    stop(
      "dummy columns are not mutually exclusive in row(s): ",
      paste(which(n_set > 1), collapse = ", "),
      call. = FALSE
    )
  }
  category <- rep(NA_character_, nrow(data))
  found <- n_set == 1
  if (any(found)) {
    # Exactly one TRUE per remaining row, so max.col() returns its position.
    category[found] <- varnames[
      max.col(is_set[found, , drop = FALSE] * 1L, ties.method = "first")
    ]
  }
  data[[catname]] <- category
  data
}
newdf <- catmaker(data = df, varnames = c("a", "b", "c"), catname = "newcat")
One nice aspect of the functional approach is that it is robust to variations in the order of names in the vector of column names you feed into it. I.e., varnames = c("c", "a", "b") produces the same result as varnames = c("a", "b", "c").
P.S. You added some example data after I posted this. The function works on your example, as long as you convert dummy.df to a data frame first, e.g., catmaker(data = as.data.frame(dummy.df), varnames = colnames(dummy.df), "State") does the job.
You can use tidyr::pivot_longer:
library(dplyr)
library(tidyr)
# Reshape the dummy matrix to long form (one row per original row and state
# column), split "State.XX" column names at the dot, and keep for each
# original row the state whose dummy equals 1.
as_tibble(dummy.df) %>%
  # row_number() also works on zero-row input, where 1:n() would be c(1, 0).
  mutate(id = row_number()) %>%
  pivot_longer(
    -id,
    names_to = c("txt", "State"),
    names_sep = "\\.",
    values_to = "Value"
  ) %>%
  filter(Value == 1) %>%
  select(State)
#> # A tibble: 10 x 1
#> State
#> <chr>
#> 1 NJ
#> 2 NY
#> 3 NJ
#> 4 VA
#> 5 NY
#> 6 TX
#> 7 NJ
#> 8 VA
#> 9 TX
#> 10 VA
You can do:
states <- names(dummy.df)[max.col(dummy.df)]
Or if as in your example it's a matrix you'd need to use colnames():
colnames(dummy.df)[max.col(dummy.df)]
Then just clean it up with sub():
sub(".*\\.", "", states)
"NJ" "NY" "NJ" "VA" "NY" "TX" "NJ" "VA" "TX" "VA"
EDIT : with your data
One way with model.matrix for dummy creation and matrix multiplication :
dummy.df<-structure(c(1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L), .Dim = c(10L, 4L
), .Dimnames = list(NULL, c("State.NJ", "State.NY", "State.TX",
"State.VA")))
level_names <- colnames(dummy.df)
# Multiply every 0/1 row by the column positions 1..k: for a row with a single
# 1 the product is the position of that column.
res <- dummy.df %*% seq_len(ncol(dummy.df))
# Drop the n x 1 matrix shape.
res <- as.numeric(res)
# Declare every column position as a level so the labels stay aligned with the
# columns even when some column is never selected; without `levels`, factor()
# would pair the labels with whichever values happen to occur in `res`.
factor(res, levels = seq_along(level_names), labels = level_names)
#> [1] State.NJ State.NY State.NJ State.VA State.NY State.TX State.NJ
#> [8] State.VA State.TX State.VA
#> Levels: State.NJ State.NY State.TX State.VA
General reprex :
# create factor and dummy target y
dfr <- data.frame(vec = gl(n = 3, k = 3, labels = letters[1:3]),
y = 1:9)
dfr
#> vec y
#> 1 a 1
#> 2 a 2
#> 3 a 3
#> 4 b 4
#> 5 b 5
#> 6 b 6
#> 7 c 7
#> 8 c 8
#> 9 c 9
# Create one 0/1 dummy column per level of `vec` (no intercept).
dfr_dummy <- model.matrix(y ~ 0 + vec, data = dfr)
# Multiply every row by the column positions 1..k instead of a hard-coded
# c(1, 2, 3), so the decoding follows the number of dummy columns.
res <- dfr_dummy %*% seq_len(ncol(dfr_dummy))
# Drop the n x 1 matrix shape.
res <- as.numeric(res)
# Explicit levels keep the labels aligned with the dummy columns even when a
# level does not occur in the data.
factor(res, levels = seq_len(ncol(dfr_dummy)), labels = levels(dfr$vec))
#> [1] a a a b b b c c c
#> Levels: a b c

Concatenate dichotome columns to semicolon-separated column

I have data frame containing the results of a multiple choice question. Each item has either 0 (not mentioned) or 1 (mentioned). The columns are named like this:
F1.2_1, F1.2_2, F1.2_3, F1.2_4, F1.2_5, F1.2_99
etc.
I would like to concatenate these values like this: The new column should be a semicolon-separated string of the selected items. So if a row has a 1 in F1.2_1, F1.2_4 and F1.2_5 it should be: 1;4;5
The last digit(s) of the dichotome columns are the item codes to be used in the string.
Any idea how this could be achieved with R (and data.table)? Thanks for any help!
edit:
Here is a example DF with the desired result:
structure(list(F1.2_1 = c(0L, 1L, 0L, 1L), F1.2_2 = c(1L, 0L,
0L, 1L), F1.2_3 = c(0L, 1L, 0L, 1L), F1.2_4 = c(0L, 1L, 0L, 0L
), F1.2_5 = c(0L, 0L, 0L, 0L), F1.2_99 = c(0L, 0L, 1L, 0L), desired_result = structure(c(3L,
2L, 4L, 1L), .Label = c("1;2;3", "1;3;4", "2", "99"), class = "factor")), .Names = c("F1.2_1",
"F1.2_2", "F1.2_3", "F1.2_4", "F1.2_5", "F1.2_99", "desired_result"
), class = "data.frame", row.names = c(NA, -4L))
F1.2_1 F1.2_2 F1.2_3 F1.2_4 F1.2_5 F1.2_99 desired_result
1 0 1 0 0 0 0 2
2 1 0 1 1 0 0 1;3;4
3 0 0 0 0 0 1 99
4 1 1 1 0 0 0 1;2;3
In his comment, the OP asked how to deal with more multiple choice questions.
The approach below will be able to handle an arbitrary number of questions and choices for each question. It uses melt() and dcast() from the data.table package.
Sample input data
Let's assume the input data.frame DT for the extended case contains two questions, one with 6 choices and the other with 4 choices:
DT
# F1.2_1 F1.2_2 F1.2_3 F1.2_4 F1.2_5 F1.2_99 F2.7_1 F2.7_2 F2.7_3 F2.7_11
#1: 0 1 0 0 0 0 0 1 1 0
#2: 1 0 1 1 0 0 1 1 1 1
#3: 0 0 0 0 0 1 1 0 1 0
#4: 1 1 1 0 0 0 1 0 1 1
Code
library(data.table)
# Collapse the 0/1 choice columns of every question into one ";"-separated
# string of selected choice ids per row and question.
# coerce to data.table and add row number for later join
# (setDT() and := both modify DT by reference)
setDT(DT)[, rn := .I]
# reshape from wide to long format
molten <- melt(DT, id.vars = "rn")
# alternatively, the measure cols can be specified (in case of other id vars)
# molten <- melt(DT, measure.vars = patterns("^F"))
# split question id and choice id
# (column names look like "F1.2_99": the part before "_" is the question,
# the part after it is the choice)
molten[, c("question_id", "choice_id") := tstrsplit(variable, "_")]
# reshape only selected choices from long to wide format,
# thereby pasting together the ids of the selected choices for each question
result <- dcast(molten[value == 1], rn ~ question_id, paste, collapse = ";",
fill = NA, value.var = "choice_id")
# final join for demonstration only, remove row number as no longer needed
# NOTE(review): the join is driven by `result`, which only has rows with at
# least one selected choice, so input rows without any selection presumably do
# not appear in the output -- confirm if such rows can occur.
DT[result, on = "rn"][, rn := NULL][]
# F1.2_1 F1.2_2 F1.2_3 F1.2_4 F1.2_5 F1.2_99 F2.7_1 F2.7_2 F2.7_3 F2.7_11 F1.2 F2.7
#1: 0 1 0 0 0 0 0 1 1 0 2 2;3
#2: 1 0 1 1 0 0 1 1 1 1 1;3;4 1;2;3;11
#3: 0 0 0 0 0 1 1 0 1 0 99 1;3
#4: 1 1 1 0 0 0 1 0 1 1 1;2;3 1;3;11
For each question, the final result shows which choices were selected in each row.
Reproducible data
The sample data can be created with
DT <- structure(list(F1.2_1 = c(0L, 1L, 0L, 1L), F1.2_2 = c(1L, 0L,
0L, 1L), F1.2_3 = c(0L, 1L, 0L, 1L), F1.2_4 = c(0L, 1L, 0L, 0L
), F1.2_5 = c(0L, 0L, 0L, 0L), F1.2_99 = c(0L, 0L, 1L, 0L), F2.7_1 = c(0L,
1L, 1L, 1L), F2.7_2 = c(1L, 1L, 0L, 0L), F2.7_3 = c(1L, 1L, 1L,
1L), F2.7_11 = c(0L, 1L, 0L, 1L)), .Names = c("F1.2_1", "F1.2_2",
"F1.2_3", "F1.2_4", "F1.2_5", "F1.2_99", "F2.7_1", "F2.7_2",
"F2.7_3", "F2.7_11"), row.names = c(NA, -4L), class = "data.frame")
We can try
# Build, for every row, a ";"-separated string of the item codes whose 0/1
# column is set. Column 7 is excluded throughout (it is `desired_result` in
# the example data).
# Step 1: take the digits after the last "_" of each column name as the item
# code, repeat the codes cell by cell via col(), and multiply by the 0/1
# values so every cell holds its item code when selected and 0 otherwise; the
# columns are then pasted row-wise with ";".
# Step 2: replace the 0 placeholders by ";" and strip leading/trailing ";".
# NOTE(review): the zero-stripping pattern looks like it would also hit item
# codes that themselves contain a 0 (e.g. 10) -- confirm before reusing with
# such codes.
j1 <- do.call(paste, c(as.integer(sub(".*_", "",
names(DF)[-7]))[col(DF[-7])]*DF[-7], sep=";"))
DF$newCol <- gsub("^;+|;+$", "", gsub(";*0;|0$|^0", ";", j1))
DF$newCol
#[1] "2" "1;3;4" "99" "1;2;3"

How to programmatically compare an entire row in R?

I have the following dataframe in R:
data=
Time X1 X2 X3
1 1 0 0
2 1 1 1
3 0 0 1
4 1 1 1
5 0 0 0
6 0 1 1
7 1 1 1
8 0 0 0
9 1 1 1
10 0 0 0
Is there a way to programmatically select those rows that are equal to (0,1,1)? I know it can be done by doing data[data$X1 == 0 & data$X2 == 1 & data$X3 == 1,] but, in my scenario, (0,1,1) is a list in a variable. My ultimate goal here is to determine the number of rows that are equal to (0,1,1), or any other combination that list variable can hold.
Thanks!
Mariano.
Here's a couple of options using a merge:
merge(list(X1=0,X2=1,X3=1), dat)
#or
merge(setNames(list(0,1,1),c("X1","X2","X3")), dat)
Or even using positional indexes based on what columns you want matched up:
L <- list(0,1,1)
merge(L, dat, by.x=seq_along(L), by.y=2:4)
All of which return:
# X1 X2 X3 Time
#1 0 1 1 6
If your matching variables are all of the same type, you could also safely do it via matrix comparison like:
dat[colSums(t(dat[c("X1","X2","X3")]) == c(0,1,1)) == 3,]
# Returns one TRUE/FALSE per row: TRUE where (X1, X2, X3) equals c(0, 1, 1).
# Only the X1..X3 columns are compared; with the Time column included each row
# would have four values and be recycled against the three-value target.
apply(data[c("X1", "X2", "X3")], 1, function(x) all(x == c(0, 1, 1)))
This will go down each row of the frame and return TRUE for each row where the row is equal to c(0,1,1).
this is your data
mydf <- structure(list(Time = 1:10, X1 = c(1L, 1L, 0L, 1L, 0L, 0L, 1L,
0L, 1L, 0L), X2 = c(0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L),
X3 = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L)), .Names = c("Time",
"X1", "X2", "X3"), class = "data.frame", row.names = c(NA, -10L
))
Using subset
subset(mydf, X1 == 0 & X2==1 & X3==1)
# Time X1 X2 X3
#6 6 0 1 1
another way
mydf[mydf$X1 ==0 & mydf$X2 ==1 & mydf$X3 ==1, ]
# Time X1 X2 X3
#6 6 0 1 1
or like this
# Every column is compared with its target value explicitly: a bare mydf$X2
# would be coerced to logical, so any non-zero value would count as a match.
mydf[mydf$X1 == 0 & mydf$X2 == 1 & mydf$X3 %in% 1, ]
# Time X1 X2 X3
#6 6 0 1 1
you can also do that by
library(dplyr)
filter(mydf, X1==0 & X2==1 & X3==1)
# Time X1 X2 X3
#1 6 0 1 1

How to extract value of a column based on multiple other columns

I have a dataframe which looks like this:
>head(df)
chrom pos strand ref alt A_pos A_neg C_pos C_neg G_pos G_neg T_pos T_neg
chr1 2283161 - G A 3 1 2 0 0 0 0 0
chr1 2283161 - G A 3 1 2 0 0 0 0 0
chr1 2283313 - G C 0 0 0 0 0 0 0 0
chr1 2283313 - G C 0 0 0 0 0 0 0 0
chr1 2283896 - G A 0 0 0 0 0 0 0 0
chr1 2283896 + G A 0 0 0 0 0 0 0 0
I want to extract the value from columns 6:13 (A_pos...T_neg) based on the value of the columns 'strand', 'ref' and 'alt'. For instance, in row1: strand = '-', ref = 'G' and alt = 'A', so I should extract the values from G_neg and A_neg. Again, in row6: strand = '+', ref = 'G' and alt = 'A', so I should get the values from G_pos and A_pos. I basically intend to do a chi-square test after extracting these values (These are my observed values, I have another set of expected values) but that is another story.
So the logic is somewhat like:
if(df$strand=="+")
do
print:paste(df$ref,"pos",sep="_") #extract value in column df$ref_pos
print:paste(df$alt,"pos",sep="_") #extract value in column df$alt_pos
else if(df$strand=="-")
do
print:paste(df$ref,"neg",sep="_") #extract value in column df$ref_neg
print:paste(df$alt,"neg",sep="_") #extract value in column df$alt_neg
Here, I am trying to use paste on the values in 'ref' and 'alt' to get the desired column names. For instance, if strand ='+' and ref = 'G', it will fetch value from column G_pos.
The data frame is actually large and so I ruled out using for-loops. I am not sure how else can I do this to make the code as efficient as possible. Any help/suggestions would be appreciated.
Thanks!
Another alternative that looks valid, at least with the sample data:
# Per row, the suffix of the count column to read: "neg" for strand "-",
# "pos" otherwise.
tmp = ifelse(as.character(DF$strand) == "-", "neg", "pos")
# For each of the ref and alt columns: build the target column name
# "<base>_<suffix>" per row, find its position in DF with match(), and pull the
# (row, column) cells in one go by indexing DF with a two-column matrix.
# as.integer() converts the extracted cells back to integers.
# The result has one column per looked-up field (ref, alt).
sapply(DF[c("ref", "alt")],
function(x) as.integer(DF[cbind(seq_len(nrow(DF)),
match(paste(x, tmp, sep = "_"), names(DF)))]))
# ref alt
#[1,] 0 1
#[2,] 0 1
#[3,] 0 0
#[4,] 0 0
#[5,] 0 0
#[6,] 0 0
Where DF:
DF = structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"),
pos = c(2283161L, 2283161L, 2283313L, 2283313L, 2283896L,
2283896L), strand = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("-",
"+"), class = "factor"), ref = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "G", class = "factor"), alt = structure(c(1L,
1L, 2L, 2L, 1L, 1L), .Label = c("A", "C"), class = "factor"),
A_pos = c(3L, 3L, 0L, 0L, 0L, 0L), A_neg = c(1L, 1L, 0L,
0L, 0L, 0L), C_pos = c(2L, 2L, 0L, 0L, 0L, 0L), C_neg = c(0L,
0L, 0L, 0L, 0L, 0L), G_pos = c(0L, 0L, 0L, 0L, 0L, 0L), G_neg = c(0L,
0L, 0L, 0L, 0L, 0L), T_pos = c(0L, 0L, 0L, 0L, 0L, 0L), T_neg = c(0L,
0L, 0L, 0L, 0L, 0L)), .Names = c("chrom", "pos", "strand",
"ref", "alt", "A_pos", "A_neg", "C_pos", "C_neg", "G_pos", "G_neg",
"T_pos", "T_neg"), class = "data.frame", row.names = c(NA, -6L
))
Not very elegant, but does the job:
# Suffix of the count columns that belong to each strand symbol.
strand.map <- c("-" = "_neg", "+" = "_pos")
# For every row, read the count columns named <ref><suffix> and <alt><suffix>
# and bind the two values next to the first five columns of df.
cbind(
  df[1:5],
  do.call(
    rbind,
    lapply(
      # One single-row data frame per row: strand, ref, alt, then the counts.
      split(df[-(1:2)], seq_len(nrow(df))),
      function(x) {
        # as.character() forces a lookup by name; a factor would index
        # strand.map by its integer code, which depends on the level order.
        suffix <- strand.map[as.character(x[[1]])]
        counts <- x[-(1:2)]
        c(
          ref = counts[, paste0(x[[2]], suffix)],
          alt = counts[, paste0(x[[3]], suffix)]
        )
      }
    )
  )
)
We cycle through each row in your data frame and apply a function that pulls the value based on strand, ref, and alt. This produces:
chrom pos strand ref alt ref alt
1 chr1 2283161 - G A 0 1
2 chr1 2283161 - G A 0 1
3 chr1 2283313 - G C 0 0
4 chr1 2283313 - G C 0 0
5 chr1 2283896 - G A 0 0
6 chr1 2283896 + G A 0 0
An alternate approach is to use melt, but the format of your data makes it rather annoying because we need two melts in a row, and we need to create a unique id column so we can reconstitute the data frame once we're done computing.
# Unique row id so the rows can be put back together after reshaping.
df$id <- seq_len(nrow(df))
# First melt: one row per (original row, count column), giving the columns
# `variable` (count column name) and `value` (count).
# Second melt: stack ref and alt, giving `alt_or_ref` and `base`.
df.mlt <-
  melt(
    melt(df, id.vars = c("id", "chrom", "pos", "strand", "ref", "alt")),
    measure.vars = c("ref", "alt"), value.name = "base",
    variable.name = "alt_or_ref"
  )
# Keep only the rows whose count column is the one named <base><suffix>, then
# spread ref / alt back into columns.
# as.character() forces strand.map to be indexed by name; a factor would be
# used as integer codes, which depends on the level order.
dcast(
  subset(df.mlt, paste0(base, strand.map[as.character(strand)]) == variable),
  id + chrom + pos + strand ~ alt_or_ref,
  value.var = "value"
)
Which produces:
id chrom pos strand ref alt
1 1 chr1 2283161 - 0 1
2 2 chr1 2283161 - 0 1
3 3 chr1 2283313 - 0 0
4 4 chr1 2283313 - 0 0
5 5 chr1 2283896 - 0 0
6 6 chr1 2283896 + 0 0
Another way
# Look up the ref and alt counts for one row.
#
# Args:
#   x: one row as a named vector with the entries "strand", "ref" and "alt"
#      plus count entries named "<base>_pos" / "<base>_neg".
#
# Returns:
#   A 1 x 2 numeric matrix: the count for the ref base, then the count for the
#   alt base, both read from the "_neg" entries when strand is "-" and from
#   the "_pos" entries otherwise.
testFunc <- function(x) {
  suffix <- if (x["strand"] == "-") "neg" else "pos"
  count_for <- function(field) {
    as.numeric(x[paste0(x[field], "_", suffix)])
  }
  cbind(count_for("ref"), count_for("alt"))
}
temp <- t(apply(df, 1, testFunc))
colnames(temp) <- c("ref", "alt")
using the [very] fast data.table library:
library(data.table)
# Read the table from disk.
df <- fread("df.txt")
# Replace ref / alt by the name of the count column to read for that row,
# e.g. "G_neg" on the "-" strand and "G_pos" otherwise.
df[, ref := ifelse(strand == "-", paste0(ref, "_neg"), paste0(ref, "_pos"))]
df[, alt := ifelse(strand == "-", paste0(alt, "_neg"), paste0(alt, "_pos"))]
# The strand is now encoded in ref / alt, so the column can go.
df[, strand := NULL]
# Long format: one row per (input row, count column).
dfm <- melt(
  df,
  id.vars = c("chrom", "pos", "ref", "alt"),
  variable.name = "mycol",
  value.name = "value"
)
# Keep the rows whose count column is the one named by ref or by alt.
dfm[mycol == ref | mycol == alt]

Resources