merging and counting similar strings - r

I have a data frame with three columns, like this:
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data. For example,
the first row of df1 is Q92793,Q09472,Q9Y6Q9,Q92831.
I then look at df2 and df3 to see whether any of these members appear there; for this example, I build the following row:
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1 is the number of comma-separated strings in that row of df1, which is 4
df2 is 0 because no string anywhere in df2 was similar
df3 is Q92793,Q09472, i.e. the strings that were similar, pasted together
The desired output looks like this:
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The function below does not work in every case; for example, use this data as input:
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))

This works for your example:
# First convert factors to strings, then split every cell into a character
# vector of IDs, so that each column of Inputdf becomes a list column.
# NOTE: start from the original Inputdf (factor/character columns); the
# columns are overwritten in place.
Inputdf[] <- lapply(Inputdf, as.character)
# strsplit() always yields a list with one element per cell, whereas sapply()
# would simplify to a matrix whenever every cell holds the same number of IDs.
# The pattern also accepts spaces around the commas ("a, b").
Inputdf[] <- lapply(
  Inputdf,
  function(col) strsplit(trimws(col), "\\s*,\\s*")
)
not.empty <- function(x) length(x) > 0

# Preallocate the result instead of growing a data frame row by row;
# seq_len() keeps the loop safe when Inputdf has no rows.
n.rows <- nrow(Inputdf)
out <- data.frame(
  df1 = seq_len(n.rows),
  df2 = integer(n.rows),
  df3 = integer(n.rows),
  Numberdf1 = integer(n.rows),
  df2.1 = character(n.rows),
  df3.1 = character(n.rows),
  stringsAsFactors = FALSE
)
for (r in seq_len(n.rows)) {
  ids <- Inputdf$df1[[r]]
  # IDs shared between row r of df1 and every row of df2 / df3.
  df2.intersect <- lapply(Inputdf$df2, intersect, ids)
  df3.intersect <- lapply(Inputdf$df3, intersect, ids)
  # Position()/Find() report the FIRST matching row only; 0 means "no match".
  out$df2[r] <- Position(not.empty, df2.intersect, nomatch = 0L)
  out$df3[r] <- Position(not.empty, df3.intersect, nomatch = 0L)
  out$Numberdf1[r] <- length(ids)
  out$df2.1[r] <- paste(
    Find(not.empty, df2.intersect, nomatch = "0"),
    collapse = ","
  )
  out$df3.1[r] <- paste(
    Find(not.empty, df3.intersect, nomatch = "0"),
    collapse = ","
  )
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
# Variant that reports every matching row rather than only the first one.
# NOTE: start from the original Inputdf (factor/character columns); the
# columns are overwritten in place.
Inputdf[] <- lapply(Inputdf, as.character)
# strsplit() always yields a list with one element per cell, whereas sapply()
# would simplify to a matrix whenever every cell holds the same number of IDs.
Inputdf[] <- lapply(
  Inputdf,
  function(col) strsplit(trimws(col), "\\s*,\\s*")
)
not.empty <- function(x) length(x) > 0

# Preallocate the result; df2/df3 are character because they may hold
# several comma-separated row numbers.
n.rows <- nrow(Inputdf)
out <- data.frame(
  df1 = seq_len(n.rows),
  df2 = character(n.rows),
  df3 = character(n.rows),
  Numberdf1 = integer(n.rows),
  df2.1 = character(n.rows),
  df3.1 = character(n.rows),
  stringsAsFactors = FALSE
)
for (r in seq_len(n.rows)) {
  ids <- Inputdf$df1[[r]]
  # IDs shared between row r of df1 and every row of df2 / df3.
  df2.intersect <- lapply(Inputdf$df2, intersect, ids)
  df3.intersect <- lapply(Inputdf$df3, intersect, ids)
  # vapply() guarantees a logical vector even for an empty list.
  df2.rows <- which(vapply(df2.intersect, not.empty, logical(1)))
  df3.rows <- which(vapply(df3.intersect, not.empty, logical(1)))
  out$df2[r] <- paste(df2.rows, collapse = ",")
  out$df3[r] <- paste(df3.rows, collapse = ",")
  out$Numberdf1[r] <- length(ids)
  out$df2.1[r] <- paste(unique(unlist(df2.intersect)), collapse = ",")
  out$df3.1[r] <- paste(unique(unlist(df3.intersect)), collapse = ",")
}
# Empty strings mean "no match"; show them as "0" like the desired output.
out[out == ""] <- "0"

Related

overlapping unique dataframes in R

My two dataframes are:
df1<-structure(list(header1 = structure(1:4, .Label = c("a", "b",
"c", "d"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
and
df2<-structure(list(sample_x = structure(c(1L, 1L, 2L, 3L), .Label = c("0",
"a", "c"), class = "factor"), sample_y = structure(c(1L, 3L,
2L, 4L), .Label = c("0", "a", "m", "t"), class = "factor"), sample_z = structure(c(3L,
2L, 1L, 1L), .Label = c("0", "a", "c"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
0s in df2 means no values.
Now I want to overlap df1 and df2 to make an output dataframe(df3):
df3<-structure(list(sample_x = c(2L, 2L, 0L), sample_y = c(1L, 3L,
2L), sample_z = c(2L, 2L, 0L)), class = "data.frame", row.names = c("overlap_df1_df2",
"unique_df1", "unique_df2"))
I tried the datatable function foverlaps:
setkeyv(df1, names(df1))
setkeyv(df2, names(df2))
df3<-foverlaps(df1,df2)
But it seems that I need some common column names in these two data frames, which is obviously not the case here.
Thank you!
Loop through columns, and use set operations:
# For every column of df2, count the values shared with df1$header1 (o), the
# values found only in df1 (u_df1) and the values found only in df2 (u_df2).
# Returns a 3 x ncol(df2) integer matrix.
header <- as.character(df1$header1)
vapply(df2, function(i) {
  x <- as.character(i)
  # "0" is the question's placeholder for "no value", so drop it with the NAs;
  # otherwise it would be counted as a value unique to df2.
  x <- x[!is.na(x) & x != "0"]
  o <- intersect(header, x)
  u_df1 <- setdiff(header, o)
  u_df2 <- setdiff(x, o)
  c(
    o = length(o),
    u_df1 = length(u_df1),
    u_df2 = length(u_df2)
  )
}, c(o = 0L, u_df1 = 0L, u_df2 = 0L))
# sample_x sample_y sample_z
# o 2 1 2
# u_df1 2 3 2
# u_df2 0 2 0
A solution using map:
library(purrr)
# Compare as plain strings so factor levels cannot interfere.
header <- as.character(df1$header1)
# "0" (and NA) mean "no value" in df2, so remove them before comparing;
# setdiff() also de-duplicates each column.
samples <- map(df2, ~ setdiff(as.character(.x), c("0", NA)))
# Each row is computed from the data itself: arguments of rbind() cannot refer
# to one another, and "unique to df2" is a set difference, not a subtraction.
rbind(
  overlap = map_dbl(samples, ~ length(intersect(header, .x))),
  unique_df1 = map_dbl(samples, ~ length(setdiff(header, .x))),
  unique_df2 = map_dbl(samples, ~ length(setdiff(.x, header)))
)
sample_x sample_y sample_z
overlap 2 1 2
unique_df1 2 3 2
unique_df2 0 2 0

splitting a data into several parts

My data is below; I want to split it into several parts based on the IDs.
df1<- structure(list(Ids1 = 1:7, string1 = structure(c(3L, 2L, 4L,
1L, 1L, 1L, 1L), .Label = c("gdyijq,udyhfs,gqdtr", "hdydg", "hishsgd,gugddf",
"ydis"), class = "factor"), Ids2 = c(1L, 3L, 4L, 9L, 10L, NA,
NA), string2 = structure(c(4L, 6L, 2L, 3L, 5L, 1L, 1L), .Label = c("",
"gdyijq,udyhfs", "gqdtr", "hishsgd,gugddf", "nlrshf", "ydis"), class = "factor")), .Names = c("Ids1",
"string1", "Ids2", "string2"), class = "data.frame", row.names = c(NA,
-7L))
First, I want to make df.1, keeping only the rows whose Ids1 also occurs in Ids2, and counting how many of the items in string1 also appear in string2 (the items are separated by commas).
Ids1 string1 ids2 string2 Similar
1 hishsgd,gugddf 1 hishsgd,gugddf 2
3 ydis 3 ydis 1
4 gdyijq,udyhfs,gqdtr 4 gdyijq,udyhfs 2
I do this
df.1 <- df1[which(df1$Ids1 == df1$Ids2), ]
which only gives me the first row and nothing else
Then I want the rows whose IDs occur only in Ids1 and do not exist in Ids2:
Ids1 string1
2 hdydg
5 gdyijq,udyhfs,gqdtr
6 gdyijq,udyhfs,gqdtr
7 gdyijq,udyhfs,gqdtr
I do this but also does not work
df.2<- df1[which(df1$Ids1 != df1$Ids2), ]
Lastly, I want to keep those that are only in Ids2 and not in Ids1:
Ids1 string1
9 gqdtr
10 nlrshf
which I do this but also does not work
df.3<- df1[which(df1$Ids2 != df1$Ids1), ]
Here is one solution I could come up with based on joins using dplyr package:
library(dplyr)
# Treat the two (id, string) pairs of df1 as separate lookup tables.
ids1.tbl <- select(df1, Ids1, string1)
ids2.tbl <- select(df1, Ids2, string2)

# df.1: ids present on both sides; columns are Ids1, string1, string2.
df.1 <- inner_join(ids1.tbl, ids2.tbl, by = c("Ids1" = "Ids2"))
# Similar: how many comma-separated items of string1 also occur in string2.
# Columns are addressed by name (not by position) and vapply() keeps the
# result an integer vector even when df.1 has no rows.
items1 <- strsplit(as.character(df.1$string1), ",", fixed = TRUE)
items2 <- strsplit(as.character(df.1$string2), ",", fixed = TRUE)
df.1$Similar <- vapply(
  seq_along(items1),
  function(k) sum(items1[[k]] %in% items2[[k]]),
  integer(1)
)

# df.2: ids that occur only in Ids1.
df.2 <- anti_join(ids1.tbl, ids2.tbl, by = c("Ids1" = "Ids2"))
# df.3: ids that occur only in Ids2; rows with a missing value are dropped.
df.3 <- anti_join(ids2.tbl, ids1.tbl, by = c("Ids2" = "Ids1"))
df.3 <- df.3[complete.cases(df.3), ]
You can also do something different for df.2 and df.3 as follows:
# Rows whose Ids1 never appears among Ids2.
only.in.ids1 <- !(df1$Ids1 %in% df1$Ids2)
df.2 <- df1[only.in.ids1, c('Ids1', 'string1')]
# Rows whose Ids2 never appears among Ids1, minus rows with missing values.
only.in.ids2 <- !(df1$Ids2 %in% df1$Ids1)
df.3 <- df1[only.in.ids2, c('Ids2', 'string2')]
df.3 <- df.3[complete.cases(df.3), ]

R program, ?count, rename "freq" to something else

I am studying this webpage, and cannot figure out how to rename freq to something else, say number of times imbibed
Here is dput
structure(list(name = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("Bill", "Llib"), class = "factor"), drink = structure(c(2L,
3L, 1L, 4L, 2L, 3L, 1L, 4L), .Label = c("cocoa", "coffee", "tea",
"water"), class = "factor"), cost = 1:8), .Names = c("name",
"drink", "cost"), row.names = c(NA, -8L), class = "data.frame")
And this is working code with output. Again, I'd like to rename the freq column. Thanks!
library(plyr)
bevs$cost <- as.integer(bevs$cost)
count(bevs, "name")
Output
name freq
1 Bill 4
2 Llib 4
Are you trying to do this?
# Count rows per name and relabel both columns of the result in one step.
counts <- setNames(
  count(bevs, "name"),
  c("name", "number of times imbibed")
)
counts
The count() function returns a data.frame. Just rename it like any other data.frame:
counts <- count(bevs, "name")
# Relabel only the column called "freq"; all other names stay as they are.
is.freq <- names(counts) == "freq"
names(counts)[is.freq] <- "number of times imbibed"
print(counts)
# name number of times imbibed
# 1 Bill 4
# 2 Llib 4

Preserve NA in output of ifelse statement using paste

I have data that is organized like below M1 - M4, and I use the code from here to generate M_NEW:
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
NA NA 1 2 NA
It looks for a specified number of occurrences of each number across the four columns and reports those numbers in M_NEW. Now, I would like to append the numbers 0 and 21 to each of the observations, unless that observation is NA. However, so far I have been unable to paste 0 and 21 onto the observations without also pasting them onto the NA values. The desired output is included in df below as M_NEW1. How can this be accomplished? It appears that I am missing something with paste here.
# sample data
df <- structure(list(M1 = structure(c(3L, 4L, 2L, 2L, 1L, 5L, NA, 6L
), .Label = c("0", "1", "1,2", "1,2,3,4", "1,2,3,4,5", "3,4,5,6,7"
), class = "factor"), M2 = structure(c(3L, NA, 2L, 2L, 1L, 4L,
NA, 5L), .Label = c("0", "1,2", "1,2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"
), class = "factor"), M3 = structure(c(3L, NA, 1L, 1L, 1L, 2L,
NA, 4L), .Label = c("0", "1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7,8"
), class = "factor"), M4 = structure(c(3L, NA, 1L, 2L, 1L, 5L,
NA, 4L), .Label = c("0", "1", "1,2,3,4,5,6", "1,2,3,4,5,6,7,8,9,10,11,12",
"4,5"), class = "factor"), M_NEW1 = structure(c(3L, NA, 1L, 2L,
1L, 5L, NA, 4L), .Label = c("0,21", "1,0,21", "1,2,3,4,5,0,21",
"3,4,5,6,7,8,0,21", "4,5,0,21"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW1"), class = "data.frame", row.names = c(NA,
-8L))
# function slightly modified from https://stackoverflow.com/a/23203159/1670053
# Collapse one row of comma-separated cells and report the values that occur
# at least `n` times, with ",0,21" appended.
#   x: the cell values of one row (comma-separated strings, possibly NA)
#   n: minimum number of occurrences for a value to be reported (default 3)
f <- function(x, n=3) {
# Count every comma-separated token across the whole row. paste() turns an NA
# cell into the literal token "NA", so missing cells are counted as well.
tab <- table(strsplit(paste(x, collapse=","), ","))
# Keep the tokens seen at least n times and join them into a single string.
res <- paste(names(tab[which(tab >= n)]), collapse=",")
# NOTE(review): res comes out of paste(), i.e. it is a character string, so
# the is.na(res) branch is not expected to fire; a row of NAs arrives here as
# the string "NA" and gets ",0,21" appended like any other value.
return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
# Commented-out variants of the return statement, kept for reference:
#return(ifelse(is.na(res), ifelse(res == 0, "0,21", NA), paste(res,",0,21",sep=""))) #https://stackoverflow.com/a/17554670/1670053
#return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(na.omit(res),",0,21",sep=""))))
#return(ifelse(is.na(res), as.character(NA), ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
}
# Apply f to every row of the four M columns (the stray closing parenthesis
# that made this line a syntax error has been removed).
df$M_NEW2 <- apply(df[, 1:4], 1, f)
You can add another if else statement - rather inelegant but gets you there.
# Report the values that occur at least `n` times in one row of
# comma-separated cells, always followed by "0,21".
#   x: the cell values of one row (comma-separated strings, possibly NA)
#   n: minimum number of occurrences for a value to be reported (default 3)
# Returns a single string. A row in which "NA" is the only frequent token
# comes back as the string "NA" (not a real NA), without "0,21" appended.
f2 <- function(x, n=3) {
  tokens <- strsplit(paste(x, collapse=","), ",")
  counts <- table(tokens)
  frequent <- names(counts[which(counts >= n)])
  res <- paste(frequent, collapse=",")
  # Nothing frequent, or only "0": the answer is just the fixed suffix.
  if (res %in% c("0", "")) {
    res <- "0,21"
  }
  # "NA" and the bare suffix are returned untouched.
  if (res %in% c("NA", "0,21")) {
    return(res)
  }
  paste(res, "0,21", sep=",")
}
apply(df[1:4], 1, f2)
# "1,2,3,4,5,0,21" "NA" "0,21" "1,0,21" "0,21" "4,5,0,21" "NA"
# "3,4,5,6,7,8,0,21"

Compare first element of a list with another list

I am using R and need a hint to solve my problem:
I have two lists and I want to compare the values in the first column of list "a" with the values in the first column of list "b". If an element exists in both, I want to write the corresponding value from the second column of list "b" into the second column of list "a".
So, here is list "a":
X.WORD FREQ
abase 0
abased 0
abasing 0
abashs 0
here list "b"
V1 V2
arthur 11
abased 29
turtle 9
abash 2
The result should be
X.WORD FREQ
abase 0
abased 29
abasing 0
abashs 0
Thanks for your answers
That's just a task for simple merge in base R
# Left-join b onto the X.WORD column of a (FREQ is dropped up front); words
# that are missing from b end up with V2 = NA.
Res <- merge(a["X.WORD"], b, by.x = "X.WORD", by.y = "V1", all.x = TRUE)
# Unmatched words get a frequency of 0.
Res$V2 <- replace(Res$V2, is.na(Res$V2), 0)
Res
# X.WORD V2
# 1 abase 0
# 2 abased 29
# 3 abashs 0
# 4 abasing 0
Data
a <- structure(list(X.WORD = structure(c(1L, 2L, 4L, 3L), .Label = c("abase",
"abased", "abashs", "abasing"), class = "factor"), FREQ = c(0L,
0L, 0L, 0L)), .Names = c("X.WORD", "FREQ"), class = "data.frame", row.names = c(NA,
-4L))
b <- structure(list(V1 = structure(c(3L, 1L, 4L, 2L), .Label = c("abased",
"abash", "arthur", "turtle"), class = "factor"), V2 = c(11L,
29L, 9L, 2L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-4L))
Here is one approach.
library(dplyr)
# Look up every X.WORD of foo in foo2$V1, keeping all rows of foo.
ana <- left_join(foo, foo2, by = c("X.WORD" = "V1"))
# Replace the old FREQ column with the looked-up V2 values.
ana <- select(ana, -FREQ)
ana <- rename(ana, FREQ = V2)
# Words without a match get a frequency of 0.
ana$FREQ <- replace(ana$FREQ, is.na(ana$FREQ), 0)
# X.WORD FREQ
#1 abase 0
#2 abased 29
#3 abasing 0
#4 abashs 0
Data
foo <- structure(list(X.WORD = structure(c(1L, 2L, 4L, 3L), .Label = c("abase",
"abased", "abashs", "abasing"), class = "factor"), FREQ = c(0L,
0L, 0L, 0L)), .Names = c("X.WORD", "FREQ"), class = "data.frame", row.names = c(NA,
-4L))
foo2 <- structure(list(V1 = structure(c(3L, 1L, 4L, 2L), .Label = c("abased",
"abash", "arthur", "turtle"), class = "factor"), V2 = c(11L,
29L, 9L, 2L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-4L))

Resources