Subsetting a data frame under a specific condition - r

How can I get the rows of a data frame that have, in at least one of their elements, a value that also occurs in another data frame?
I have written this, but it didn't work.
# Example of two data frames, each with columns V1 and V2.
df1 <- data.frame(V1 = c("a", "g", "h", "l", "n", "e"), V2 = c("b", "n", "i", "m", "i", "f"), stringsAsFactors = F)
df2 <- data.frame(V1 = c("a", "c", "f","h"), V2 = c("b", "d", "e","z"), stringsAsFactors = F)
# Values shared between pairs of columns of the two data frames.
res1<-intersect(df1$V1,df2$V1)
res2<-intersect(df1$V2,df2$V2)
res3<-intersect(df1$V1,df2$V2)
# NOTE(review): res4 is computed with the same expression as res3;
# presumably intersect(df1$V2, df2$V1) was intended -- confirm.
res4<-intersect(df1$V1,df2$V2)
# Intended: get the rows of df1 that have a shared value in at least one element.
# apply() hands each row to the function as a vector x; all(x == resN) compares
# that row element-wise (with recycling) against the whole resN vector, so a
# row is kept only when every one of those comparisons is TRUE.
ress1<-df1[apply(df1, MARGIN = 1, function(x) all(x== res1)), ]
ress2<-df1[apply(df1, MARGIN = 1, function(x) all(x== res2)), ]
ress3<-df1[apply(df1, MARGIN = 1, function(x) all(x== res3)), ]
ress4<-df1[apply(df1, MARGIN = 1, function(x) all(x== res4)), ]
# Intended: the same selection for the rows of df2 (same all(x == resN) test).
resss1<-df2[apply(df2, MARGIN = 1, function(x) all(x== res1)), ]
resss2<-df2[apply(df2, MARGIN = 1, function(x) all(x== res2)), ]
resss3<-df2[apply(df2, MARGIN = 1, function(x) all(x== res3)), ]
resss4<-df2[apply(df2, MARGIN = 1, function(x) all(x== res4)), ]
# Stack the eight partial results into one data frame.
final.res<-rbind(ress1,ress2,ress3,ress4,resss1,resss2,resss3,resss4)
My desired result is:
a b
h z
h i
f e
e f

This should work
# Import data
df1 <- data.frame(
  V1 = c("a", "g", "h", "l", "n", "e"),
  V2 = c("b", "n", "i", "m", "i", "f"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  V1 = c("a", "c", "f", "h"),
  V2 = c("b", "d", "e", "z"),
  stringsAsFactors = FALSE
)
# Values that occur, in either column, in both data frames
vals <- intersect(c(df1$V1, df1$V2), c(df2$V1, df2$V2))
# From each data frame take the rows whose V1 or V2 is a shared value and
# stack the four subsets (%in% never yields NA, so plain indexing is safe)
full <- rbind(
  df1[df1$V1 %in% vals, ],
  df1[df1$V2 %in% vals, ],
  df2[df2$V1 %in% vals, ],
  df2[df2$V2 %in% vals, ]
)
# Remove duplicates: a row matched through both columns was collected twice
full <- full[!duplicated(full), ]

Related

Minimize a function of characters inputs in R

I have the following function that I want to find the minimum:
# Example call whose result is to be minimised (Create() is not defined in
# this snippet). Each parameter is passed as a list of two character values.
model <- Create(parameter1 = list(model = "a" , "b"),
parameter2 = list(distribution = "x" , "y"))
The four inputs of this function are characters, and have as possible values:
parameter1: "a", "b", "c", "d", "e"
parameter2: "x", "y", "z", "w", "t", "v"
I've tried the optim function a few times without success.
Any help is appreciated.
Evaluate the function at every possible set of input values and take the least.
# Candidate values for each of the two parameters
p1 <- c("a", "b", "c", "d", "e")
p2 <- c("x", "y", "z", "w", "t", "v")
# Stand-in objective: adds up the positions of the chosen values in p1 and p2
Create <- function(parameter1, parameter2) {
  pos1 <- match(unlist(parameter1), p1)
  pos2 <- match(unlist(parameter2), p2)
  sum(pos1, pos2)
}
# Every combination of two p1 values and two p2 values, one per row
g <- expand.grid(p1, p1, p2, p2, stringsAsFactors = FALSE)
# Objective on one row of g: first two entries -> parameter1, last two -> parameter2
obj <- function(x) Create(x[1:2], x[3:4])
# Evaluate all rows and keep the index of the smallest value
scores <- apply(g, 1, obj)
ix <- which.min(scores)
g[ix, ]
## Var1 Var2 Var3 Var4
## 1 a a x x
obj(g[ix, ])
## [1] 4

Avoid the use of nested loop in multiple column comparison

I have a dataframe like this:
# Example input: three patients, each listed once per gene (two genes)
df <- data.frame(
  Patient.ID = rep(paste0("Pat", 1:3), times = 2),
  Gene = rep(c("Gene1", "Gene2"), each = 3),
  Ref = c("A", "C", "G", "T", "A", "T"),
  Tum1 = c("A", "A", "T", "T", "A", "T"),
  Tum2 = c("A", "C", "G", "G", "C", "C")
)
What I would like to do is determine the change that occurs between the Ref column and either of the Tum columns. In other words, if Tum1 is different from Tum2, take the value that differs from the Ref column and store the change in a separate column, so the dataframe above would become:
# Desired output: the same data plus a BaseChange column ("Ref.Tum", or
# "NoCh" when nothing changed)
df <- data.frame(
  Patient.ID = rep(paste0("Pat", 1:3), times = 2),
  Gene = rep(c("Gene1", "Gene2"), each = 3),
  Ref = c("A", "C", "G", "T", "A", "T"),
  Tum1 = c("A", "A", "T", "T", "A", "T"),
  Tum2 = c("A", "C", "G", "G", "C", "C"),
  BaseChange = c("NoCh", "C.A", "G.T", "T.G", "A.C", "T.C")
)
I'm aware I could use a nested ifelse() statement like below (but extended) to solve this, but my actual dataframe has many more combinations and I figure there has to be a "safer" method of doing so.
# Sketch of the nested ifelse() approach: each level tests one Ref/Tum
# combination (against Tum1 or Tum2) and assigns its label. The trailing
# "..." is a placeholder for the remaining combinations, so this snippet is
# illustrative only and not runnable as written.
df$BaseChange <- as.factor(ifelse(df$Ref == "C" & df$Tum1 == "A" | df$Ref== "C" & df$Tum2 == "A", "C.A",
ifelse((df$Ref == "G" & df$Tum1 == "T" | df$Ref == "G" & df$Tum2 == "T"), "G.T",...)))
Any help would be greatly appreciated.
It's not pretty, but it works:
# Label each row: "NoCh" when Ref equals both Tum columns, otherwise
# "Ref.<Tum>" using whichever Tum column is taken to differ from Ref
# (Tum2 when Ref matches Tum1, else Tum1).
df <- df %>%
  mutate(
    BaseChange2 = ifelse(
      as.character(Ref) == as.character(Tum1),
      ifelse(
        as.character(Ref) == as.character(Tum2),
        "NoCh",
        paste(Ref, Tum2, sep = ".")
      ),
      paste(Ref, Tum1, sep = ".")
    )
  )
It seems that you need to paste the unique values of Ref and the Tum columns together, i.e.
# For every row, join the distinct values found in columns 3 to 5 with "."
apply(
  df[3:5], 1,
  function(row_vals) paste(unique(row_vals), collapse = ".")
)
#[1] "A" "C.A" "G.T" "T.G" "A.C" "T.C"
To replace the first A,
# Per-row string of the distinct values in columns 3 to 5, joined with "."
v2 <- apply(
  df[3:5], 1,
  function(row_vals) paste(unique(row_vals), collapse = ".")
)
# Results that are a single character long are relabelled
replace(v2, nchar(v2) == 1, "NoChange")
#[1] "NoChange" "C.A" "G.T" "T.G" "A.C" "T.C"

Converting for-loop to lapply function on two data frames with if condition

How can I convert these loops to lapply function or another fast function to speed up?
Example:
df1 <- data.frame(
  V1 = c("a", "g", "h", "l", "n", "e"),
  V2 = c("b", "n", "i", "m", "i", "f"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  V1 = c("a", "c", "b"),
  V2 = c("b", "d", "a"),
  stringsAsFactors = FALSE
)
# Compare every row of df1 with every row of df2. Two rows match when they
# hold the same pair of values, in the same or in swapped column order.
# seq_len() keeps the loops from running when a data frame has no rows
# (1:nrow() would iterate over c(1, 0)).
for (i in seq_len(nrow(df1))) {
  row1 <- df1[i, ]
  for (j in seq_len(nrow(df2))) {
    row2 <- df2[j, ]
    # Scalar comparisons, so use the scalar operators && and ||
    same_order <- row1$V1 == row2$V1 && row1$V2 == row2$V2
    swapped <- row1$V1 == row2$V2 && row1$V2 == row2$V1
    if (same_order || swapped) {
      # As in the original loop, res is overwritten on every match, so only
      # the last matching pair of rows survives the loops
      res1 <- row1
      res2 <- row2
      res <- rbind(res1, res2)
    }
  }
}
If you only have two columns, you could also use pmin and pmax, and then combine them with merge in order to find the common rows:
# Canonical (min, max) form of every df2 pair plus its original row number,
# so that ("a", "b") and ("b", "a") look the same. seq_len() also works when
# df2 has no rows, where 1:nrow(df2) would produce c(1, 0).
lookup <- setNames(data.frame(do.call(pmin, df2),
                              do.call(pmax, df2),
                              seq_len(nrow(df2))),
                   c(names(df2), "indx"))
# Put df1 into the same canonical form; otherwise a df1 pair stored in
# descending order (e.g. "n", "i") could never match its swapped counterpart.
df1_pairs <- setNames(data.frame(do.call(pmin, df1),
                                 do.call(pmax, df1)),
                      names(df2))
# Inner join on the canonical pairs, then pull the matching df2 rows
df2[merge(lookup, df1_pairs)$indx, ]
# V1 V2
# 1 a b
# 3 b a
Or using data.table for more efficiency
library(data.table)
# Canonical (min, max) form of every df2 pair, so that ("a", "b") and
# ("b", "a") look the same
lookup <- setnames(data.table(do.call(pmin, df2),
                              do.call(pmax, df2)),
                   names(df2))
# Put df1 into the same canonical form; otherwise a df1 pair stored in
# descending order (e.g. "n", "i") could never match its swapped counterpart
df1_pairs <- setnames(data.table(do.call(pmin, df1),
                                 do.call(pmax, df1)),
                      names(df2))
# Row numbers of lookup (i.e. of df2) that have a matching pair in df1
indx <- lookup[df1_pairs, on = names(df2), which = TRUE, nomatch = 0L]
df2[indx, ]
# V1 V2
# 1 a b
# 3 b a
We can try
# Order-independent key for each row: the row's values sorted and joined.
# Both data frames are keyed the same way, so a pair matches regardless of
# column order on either side. The separator keeps multi-character values
# apart, so ("ab", "c") does not collide with ("a", "bc").
row_key <- function(d) {
  apply(d, 1, function(r) paste(sort(r), collapse = "\x1f"))
}
df2[row_key(df2) %in% row_key(df1), ]
# V1 V2
#1 a b
#3 b a

Subscribing between two data frames

Hi,
I have two data frames that are like this for example
df1
V1 V2
a b
m n
h i
l m
n i
e f
and
df2
V1 V2
a b
c d
e f
b a
and I want to get rows that are the same in both data frames in a new one
like this
res2
V1 V2
a b
e f
b a
I tried
# Attempt: keep the rows of df1 whose V1 occurs in df2$V1, then, of those,
# the rows whose V2 occurs in df2$V2 (the two columns are tested separately).
# NOTE(review): the first line indexes df1$v1 (lower-case "v") while the
# column is shown as V1 -- confirm whether this is a typo.
res1<-df1[df1$v1%in%df2$V1, ]
res2<-res1[res1$V2%in%df2$V2, ]
but I was unsuccessful. Any better idea?
You need to merge your two data frames on V1 and V2 with an inner join:
df1 <- data.frame(
  V1 = c("a", "m", "h", "l", "n", "e"),
  V2 = c("b", "n", "i", "m", "i", "f"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  V1 = c("a", "c", "e"),
  V2 = c("b", "d", "f"),
  stringsAsFactors = FALSE
)
# Inner join on both columns: keeps only the (V1, V2) pairs found in both
merge(df1, df2, by = c("V1", "V2"))
The result will contain the pairs of V1 and V2 that are present in both df1 and df2.
If you also want to keep the rows of df1 or df2 that have no match in the other data frame, you can use the options all.x = TRUE or all.y = TRUE.

ggsave() Error: Unknown input R

I am having trouble with ggsave() from the ggplot2 library. I wrote a function that I pass arguments to, and that is supposed to produce and then save the results with ggsave().
Here is some example data and code to reproduce the error:
# Example data 1: the integers 1..100 arranged as a 20 x 5 data frame, with
# columns named var1..var5 and rows named A..T.
example.df.1 <- data.frame(matrix(1:100, nrow = 20, ncol = 5))
colnames(example.df.1) <- c("var1", "var2", "var3", "var4", "var5")
rownames(example.df.1) <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T")
# Example data 2: 24 rows; column 1 holds the labels A..X and column 2 holds
# random normal draws (mean 10, sd 2).
example.df.2 <- data.frame(matrix(ncol = 2, nrow = 24))
example.df.2[,1] <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X")
example.df.2[,2] <- rnorm(24, 10, 2)
# Builds a scatter plot from data1 and data2, annotates it with their
# correlation and saves it as a PNG file via ggsave().
#   data1: data frame; its first column supplies the x values and its row
#          names are matched against data2[, 1]
#   col:   point colour passed to geom_point()
#   title: plot title
#   var:   x-axis label, also inserted into the output file name
#   data2: data frame; column 1 holds the labels matched to data1's row
#          names, column 2 the values used for the correlation and y limits
problematic_func <- function(data1, col, title, var, data2) {
# only include rows without missing values
loc1 <- subset(data1, rowSums(is.na(data1)) == 0)
# append a rank column computed from the negated first column
# (ties resolved in order of first appearance)
loc1 <- cbind(loc1, rank(-as.data.frame(loc1[,1]), ties.method = "first"))
# reduce data2 to only those rows that correspond to rows in data1
loc2 <- data2[data2[,1] %in% rownames(loc1),]
# order loc2
# (by its first column; presumably so its rows line up with loc1 -- this
# relies on loc1 being in the same order, TODO confirm)
loc2.ordered <- loc2[order(loc2[,1]),]
# correlation between loc1 and loc2.ordered
corr <- cor(loc1[,1], loc2.ordered[,2])
# creating the plot
# NOTE: y is given the whole loc2.ordered data frame here, not a single column
i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered))
i <- i + geom_point(colour = col, size = 4)
i <- i + ggtitle(title)
i <- i + xlab(var)
i <- i + ylab("y-axis")
# both axes start at 0; the y axis extends 20% beyond the largest value
i <- i + coord_cartesian(xlim = c(0, max(loc1[,1])),
ylim = c(0, max(loc2.ordered[,2])*1.2))
# correlation label placed at half the x maximum, at y = 1
i <- i + annotate("text", x = max(loc1[,1])*.5, y = 1,
label = paste("Correlation coef: ", as.character(corr)), size = 3)
# saving the plot - this is where the error occurs according
# to the debugger
# (file name is "my_example_plot_<var>.png"; limitsize = FALSE is passed
# together with width = height = 625)
ggsave(filename = paste("my_example_plot_", var, ".png", sep = ""),
plot = i, device = png, width = 625, height = 625, limitsize = FALSE)
}
# Call the function once per column of example.df.1: each column is wrapped in
# a one-column data frame that keeps the original row names, and the column
# name is used both as the plot title and as the var argument.
for (i in 1:ncol(example.df.1)) {
sv <- as.data.frame(example.df.1[,i])
rownames(sv) <- rownames(example.df.1)
problematic_func(sv, "orange", colnames(example.df.1[i]),
colnames(example.df.1[i]), data2 = example.df.2)
}
Edit: Sorry, I forgot to add the error I get:
Error in FUN(X[[2L]], ...) : Unknown input:data.frame
I found the mistake myself. The error was not in the ggsave() function, but in the ggplot() function with which I created the plot in the first place. The correct code must be:
# y is mapped to the second column of loc2.ordered, not the whole data frame
i <- ggplot(
  loc1,
  aes_q(x = loc1[, 1], y = loc2.ordered[, 2])
)
The difference is that the y aesthetic needs to be passed a column of loc2.ordered, not the entire dataframe.

Resources