Concatenate expressions to subset a dataframe - r

I am attempting to create a function that will calculate the mean of a column in a subsetted dataframe. The trick here is that I always to want to have a couple subsetting conditions and then have the option to pass more conditions to the functions to further subset the dataframe.
Suppose my data look like this:
dat <- data.frame(var1 = rep(letters, 26), var2 = rep(letters, each = 26), var3 = runif(26^2))
head(dat)
var1 var2 var3
1 a a 0.7506109
2 b a 0.7763748
3 c a 0.6014976
4 d a 0.6229010
5 e a 0.5648263
6 f a 0.5184999
I want to be able to do the subset shown below, using the first condition in all function calls, and the second be something that can change with each function call. Additionally, the second subsetting condition could be on other variables (I'm using a single variable, var2, for parsimony, but the condition could involve multiple variables).
subset(dat, var1 %in% c('a', 'b', 'c') & var2 %in% c('a', 'b'))
var1 var2 var3
1 a a 0.7506109
2 b a 0.7763748
3 c a 0.6014976
27 a b 0.7322357
28 b b 0.4593551
29 c b 0.2951004
My example function and function call would look something like:
getMean <- function(expr) {
return(with(subset(dat, var1 %in% c('a', 'b', 'c') eval(expr)), mean(var3)))
}
getMean(expression(& var2 %in% c('a', 'b')))
An alternative call could look like:
getMean(expression(& var4 < 6 & var5 > 10))
Any help is much appreciated.
EDIT: With Wojciech Sobala's help, I came up with the following function, which gives me the option of passing in 0 or more conditions.
getMean <- function(expr = NULL) {
sub <- if(is.null(expr)) { expression(var1 %in% c('a', 'b', 'c'))
} else expression(var1 %in% c('a', 'b', 'c') & eval(expr))
return(with(subset(dat, eval(sub)), mean(var3)))
}
getMean()
getMean(expression(var2 %in% c('a', 'b')))

It can be simplified with defalut expr=TRUE.
getMean <- function(expr = TRUE) {
return(with(subset(dat, var1 %in% c('a', 'b', 'c') & eval(expr)), mean(var3)))
}

This is how I would approach it. The function getMean makes use of the R's handy default parameter settings:
getMean <- function(x, subset_var1, subset_var2=unique(x$var2)){
xs <- subset(x, x$var1 %in% subset_var1 & x$var2 %in% subset_var2)
mean(xs$var3)
}
getMean(dat, c('a', 'b', 'c'))
[1] 0.4762141
getMean(dat, c('a', 'b', 'c'), c('a', 'b'))
[1] 0.3814149

Related

Extracting a single unique character from a pattern in R

I have a data frame of unique character vectors that are all very similar to a distinct pattern, but with small deviations in each. I'm hoping to find a way to identify what the deviation is in each string. Here is what I have tried:
library(stringr)
#The strings are concatenated in my code, I separated them for easier use
KeyPattern <- c('abcd'
uniqchars <- function(x) unique(strsplit(x, "")[[1]])
KayPattern <- uniqchars(KeyPattern)
> KeyPattern
[1] "a" "b" "c" "d"
SampleString <- c('a', 'b', 'z', 'c', 'd')
str_detect(SampleString, KeyPattern)
[1] TRUE TRUE FALSE FALSE FALSE
As you can see, it recognizes the 'z' character, and correctly returns FALSE, and from there the pattern is completely off. I also considered trying:
word(string, start = 1L, end = start, sep = fixed(" "))
but this requires a pre-existing knowledge of where the deviations are (start = ..., end = ...) and it will be different in every row of the data frame.
Ultimately I want to have a data frame with one column of unique string, a column of distinct deviations (mismatches in the pattern), and it's location in the string.
Goal Sample Table:
String
Deviation from Key
Deviation start location
'a' 'b' 'c' 'z' 'd'
z
4
'a' 'b' 'a' 'c' 'd'
a
3
Current concatenated data frame:
1 ASGGGGSAASHLIALQLRLIGDAFDGGGGSGGGGSG
2 ASLTVDVGNVTYHFNNPITVLVFAILVALELGGTVHVHGNRIHVEG
3 ASLTVHVGDLTYHFENPQLVKLVAEIWARALNLTIEIRGNEIHVEG
4 ASNELVELVVEILYRMCVDPDQIKKILKRRGVSDEEVKRAIDKAIG
5 ASNMNMLEALQQRLQFYFGVVSRAALENNSGKARRFGRIVKQYEDAIKLYKAGKPVPYDELPVPPGFGG
6 ASNTIMLEALQQRLQFYFGVVSRAALENNSGKARRFGRIVKQYEDAIKLYKAGKPVPYDELPVPPGFGG
#CurrentKey
[1] "ASSTNMLEALQQRLQFYFGVVSRALENNSGKARRFGRIVKQYEDAIKLYKAGKPVPYDELPVPPGFGG"
Any suggestions?
see if this what you want?
df <- structure(list(STRINGS = list(c("a", "b", "c", "z", "d"), c("a",
"b", "a", "c", "d"))), class = "data.frame", row.names = c(NA,
-2L))
df
#> STRINGS
#> 1 a, b, c, z, d
#> 2 a, b, a, c, d
pattern <- c('a', 'b', 'c', 'd')
library(tidyverse)
df %>%
mutate(deviation = map_chr(STRINGS, ~ {x <- cumsum(.x[seq_along(pattern)] != pattern); .x[which(x >0)[1]]}),
deviation_start_loc = map_int(STRINGS, ~ {x <- cumsum(.x[seq_along(pattern)] != pattern); which(x > 0)[1]}))
#> STRINGS deviation deviation_start_loc
#> 1 a, b, c, z, d z 4
#> 2 a, b, a, c, d a 3
Created on 2021-06-21 by the reprex package (v2.0.0)
Here is my approach:
First, define a recursive function:
find_deviation <- function(string, key, position = 1) {
stopifnot(is.character(string), is.character(key))
if (min(length(key), length(string)) == 0)
return(c(deviation = NA, position = NA))
if (string[1] != key[1])
return(c(deviation = string[1], position = position))
find_deviation(string[-1], key[-1], position + 1)
}
Then, use it to generate the desired result:
dplyr::bind_cols(
purrr::map_dfr(SampleString, ~ c(String = paste(.x, collapse = ","))),
purrr::map_dfr(SampleString, ~ find_deviation(.x, KeyPattern))
)
Result:
# A tibble: 2 x 3
String deviation position
<chr> <chr> <chr>
1 a,b,z,c,d z 3
2 a,b,a,c,d a 3
Data used:
KeyPattern <- c('a', 'b', 'c', 'd')
SampleString <- list(c('a', 'b', 'z', 'c', 'd'), c('a', 'b', 'a', 'c', 'd'))
Using aphid library and sequence alignment, the character vectors are combined into a list, the first element being the key pattern vector.
library(aphid)
KeyPattern <- c('a', 'b', 'c', 'd')
SampleString1 <- c('a', 'b', 'z', 'c', 'd')
SampleString2 <- c('a', 'b', 'c', 'z', 'd')
SampleString3 <- c('a', 'b', 'a', 'c', 'd')
sequences=list(KeyPattern,SampleString1,SampleString2,SampleString3)
do.call(rbind,
sapply(2:length(sequences),function(x){
glo=align(sequences[c(1,x)],type="global",k=1)
tmp=glo[1,]!=glo[2,]
data.frame(
"String"=paste0(sequences[[x]],collapse=" "),
"Deviation from Key"=glo[2,tmp],
"Deviation start location"=which(tmp)
)
},simplify=F)
)
String Deviation.from.Key Deviation.start.location
1 a b z c d z 3
2 a b c z d z 4
3 a b a c d a 3

How to paste columns together only for rows with specific string in R

I have a dataframe (df1):
df1 <- data.frame(var1 = c('a', 'b', 'c', 'd', 'e'),
var2 = c('uniqA', 'uniqB', 'unknown', 'uniqC', 'unknown'))
I want to paste var1 and var2 columns only where var2 is "unknown" so that I get the resulting dataframe:
df1 <- data.frame(var1 = c('a', 'b', 'c', 'd', 'e'),
var2 = c('uniqA', 'uniqB', 'unknown', 'uniqC', 'unknown'),
var3 = c('uniqA', 'uniqB', 'c_unknown', 'uniqC', 'e_unknown'))
I only know how to paste columns together with all patterns:
df1$var3 = paste(df1$var1,"_",df1$var2)
I attempted this which gave me "TRUE" or "FALSE" and still pasted all patterns together:
df1$var3=paste(df1$var1,"_",grepl("unknown", df1$var2)
Thanks in advance!
You were close, use ifelse to do a vectorized conditional.
paste0(ifelse(grepl("unknown", df1$var2), paste0(df1$var1, "_"), ""), df1$var2)
# [1] "uniqA" "uniqB" "c_unknown" "uniqC" "e_unknown"
You can also use ifelse()
df1$var3 <- ifelse(df1$var2 == "unknown",
paste0(df1$var1,"_",df1$var2),
df1$var2)
The result can be obtained in two steps (even if I'm sure that someone will realize it a more immediate manner).
The first step is to assign var2 to var3
df1$var3 <- df1$var2
The sedond step is to modify only rows that match the condition on var2
df1$var3[df1$var2 == "unknown"] <- paste(df1$var1[df1$var2 == "unknown"],
df1$var2[df1$var2 == "unknown"], sep = "_")
The result is what expected:
df1
var1 var2 var3
1 a uniqA uniqA
2 b uniqB uniqB
3 c unknown c_unknown
4 d uniqC uniqC
5 e unknown e_unknown
Hope it can help!

applying a function to multiple lists (R)

I have two lists:
source <- list(c(5,10,20,30))
source.val <- list(c('A', 'B', 'C', 'D'))
Each corresponding element in source has a corresponding value in source.val. I want to create dataframe from the above two files that look like below
source.val_5 source.val_10 source.val_20 source.val_30
A B C D
I did this
tempList <- list()
for(i in 1:lengths(source)){
tempList[[i]] <- data.frame(variable = paste0('source.val_',source[[1]][[i]]),
value = source.val[[1]][[i]])
}
temp.dat <- do.call('rbind', tempList)
temp.dat_wider <- tidyr::pivot_wider(finalList, id_cols = value, names_from = variable)
Now I want to do this across a bigger list
source <- list(c(5,10,20,30),
c(5,10,20,30),
c(5,10,20,30),
c(5,10,20,30))
source.val <- list(c('A', 'B', 'C', 'D'),
c('B', 'B', 'D', 'D'),
c('C', 'B', 'A', 'D'),
c('D', 'B', 'B', 'D'))
The resulting table will have 4 rows looking like this:
A tibble: 1 x 4
source.val_5 source.val_10 source.val_20 source.val_30
A B C D
B B D D
C B A D
D B B D
What is the best way to use function like mapply to achieve my desired result?
For the example shared, where all the elements of source have the same order you can do :
cols <- paste0('source.val_', sort(unique(unlist(source))))
setNames(do.call(rbind.data.frame, source.val), cols)
# source.val_5 source.val_10 source.val_20 source.val_30
#1 A B C D
#2 B B D D
#3 C B A D
#4 D B B D
However, for a general case where every value in source do not follow the same order you can reorder source.val based on source :
source.val <- Map(function(x, y) y[order(x)], source, source.val)
and then use the above code.

Detect discrepancies between two sequences

I have two time series vectors: complete_data and incomplete_data. the data in the vector consists of 6 possible events which occur randomly throughout the vector. In principle the two should be the same because with every event in complete_data, that same event was then added on to incomplete_data. however in reality there were some anomalies in the system and not all of the events in complete_data were sent to incomplete_data. Thus complete_data is longer than incomplete_data. I need to find the differences in the pattern between the two and mark them. I made an attempt but it assumes that the discrepancy between the two vectors occurs in a single chunk, whereas in reality, there are various "missing events" scattered in incomplete_data.
Here is my attempt:
complete_data <- c('a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c')
dfcomplete <- as.data.frame(complete_data)
incomplete_data <- c('a', 'b', 'c', 'a','c', 'a', 'b', 'a', 'b', 'c')
dfincomplete <- as.data.frame(incomplete_data)
findMatch <- function(complete_data, incomplete_data){
matching_inorder <- NULL
matching_reverseorder <- NULL
for (i in 1:length(complete_data)){
matching_inorder[i] <- complete_data[i] == incomplete_data[i]
matching_reverseorder[i] <- rev(complete_data)[i] == rev(incomplete_data)[i]
}
is_match <- ifelse(matching_inorder == FALSE &
rev(matching_reverseorder) == FALSE, 'non_match', 'match')
is_match
}
dfcomplete$is_match_incorrect <- findMatch(dfcomplete$complete_data,
dfincomplete$incomplete_data)
And here is what I would like to get:
dfcomplete$expected_output <- c('match', 'match', 'match', 'match', 'non-match', 'match',
'match', 'match', 'non_match', 'match', 'match', 'match')
In reality my data is much larger than these examples with many different discrepancies scattered throughout the vector. Though there aren't necessarily too many discrepancies to make the task meaningless, for example, in one case the complete vector has 320 datapoints whilst the incomplete vector has 309.
Any help that can be offered would be much appreciated.
There are various ways to do this, but here's a recursive one, where x is assumed to be a complete sequence and y incomplete.
compare <- function(x, y) {
if (length(x) > 0) {
if (x[1] == y[1]) {
x[1] <- "match"
c(x[1], compare(x[-1], y[-1]))
} else {
x[1] <- "no match"
c(x[1], compare(x[-1], y))
}
}
}
compare(complete_data, incomplete_data)
# [1] "match" "match" "match" "match" "no match" "match"
# [7] "match" "match" "no match" "match" "match" "match"
Another one that perhaps is more readable and uses a simple loop would be
out <- rep(NA, length(incomplete_data))
gap <- 0
for(i in seq_along(complete_data)) {
if (complete_data[i] == incomplete_data[i - gap]) {
out[i] <- "match"
} else {
out[i] <- "no match"
gap <- gap + 1
}
}
out
# [1] "match" "match" "match" "match" "no match" "match"
# [7] "match" "match" "no match" "match" "match" "match"
If you can afford having event names only one letter long, here is a solution using string matching. The trick is to transform the incomplete data to a pattern including places to insert new characters.
complete_data <- c('a', 'b', 'c', 'a', 'B', 'c', 'a', 'b', 'C', 'a', 'b', 'c')
dfcomplete <- as.data.frame(complete_data,stringsAsFactors=FALSE)
incomplete_data <- c('a', 'b', 'c', 'a','c', 'a', 'b', 'a', 'b', 'c')
y <- paste0('^(.*)',paste(incomplete_data,collapse='(.*)'),'(.*)$')
x <- paste(complete_data,collapse="")
z <- str_length(str_match(x,y)[-1])
data.frame(incomplete_data=c("",incomplete_data),stringsAsFactors=FALSE) %>%
mutate(n=ifelse(incomplete_data=="",z,z+1)) %>%
filter(n>0) %>%
uncount(n) %>%
mutate(incomplete_data=ifelse(str_detect(rownames(.),"\\."),"",incomplete_data)) %>%
bind_cols(dfcomplete) %>%
mutate(match=complete_data==incomplete_data)
# incomplete_data complete_data match
#1 a a TRUE
#2 b b TRUE
#3 c c TRUE
#4 a a TRUE
#5 B FALSE
#6 c c TRUE
#7 a a TRUE
#8 b b TRUE
#9 C FALSE
#10 a a TRUE
#11 b b TRUE
#12 c c TRUE

create a scoring matrix from two dataframes

I am trying to compare sets of variables(X) that are stored in two dataframes (foo, bar). Each X is a unique independent variable that has up to 10 values of Y associated with it. I would like to compare every foo.X with every bar.X by comparing the number of Y values they have in common - so the output could be a matrix with axes of foo.x by bar.x in length.
this simple example of foo and bar would want to return a 2x2 matrix comparing a,b with c,d:
foo <- data.frame(x= c('a', 'a', 'a', 'b', 'b', 'b'), y=c('ab', 'ac', 'ad', 'ae', 'fx', 'fy'))
bar <- data.frame(x= c('c', 'c', 'c', 'd', 'd', 'd'), y=c('ab', 'xy', 'xz', 'xy', 'fx', 'xz'))
EDIT:
I've left the following code for other newbies to learn from (for loops are effectvie but probably very suboptimal), but the two solutions below are effective. In particular Ramnath's use of data.table is very effective when dealing with very large dataframes.
store the dataframes as lists where the values of y are stored using the stack function
foo.list <- dlply(foo, .(x), function(x) stack(x, select = y))
bar.list <- dlply(bar, .(x),function(x) stack(x, select = y))
write a function for comparing membership in the two stacked lists
comparelists <- function(list1, list2) {
for (i in list1){
for (j in list2){
count <- 0
if (i[[1]] %in% j[[1]]) count <- count + 1
}
}
return count
}
write an output matrix
output.matrix <- matrix(1:length(foo.list), 1:length(bar.list))
for (i in foo.list){
for (j in bar.list){
output.matrix[i,j] <- comparelists(i,j)
}
}
There must be a hundred ways to do this; here is one that feels relatively straightforward to me:
library(reshape2)
foo <- data.frame(x = c('a', 'a', 'a', 'b', 'b', 'b'),
y = c('ab', 'ac', 'ad', 'ae', 'fx', 'fy'))
bar <- data.frame(x = c('c', 'c', 'c', 'd', 'd', 'd'),
y = c('ab', 'xy', 'xz', 'xy', 'fx', 'xz'))
# Create a function that counts the number of common elements in two groups
nShared <- function(A, B) {
length(intersect(with(foo, y[x==A]), with(bar, y[x==B])))
}
# Enumerate all combinations of groups in foo and bar
(combos <- expand.grid(foo.x=unique(foo$x), bar.x=unique(bar$x)))
# foo.x bar.x
# 1 a c
# 2 b c
# 3 a d
# 4 b d
# Find number of elements in common among all pairs of groups
combos$n <- mapply(nShared, A=combos$foo.x, B=combos$bar.x)
# Reshape results into matrix form
dcast(combos, foo.x ~ bar.x)
# foo.x c d
# 1 a 1 0
# 2 b 0 1
Here is a simpler approach using merge
library(reshape2)
df1 <- merge(foo, bar, by = 'y')
dcast(df1, x.x ~ x.y, length)
x.x c d
1 a 1 0
2 b 0 1
EDIT. The merge can be faster using data.table. Here is the code
foo_dt <- data.table(foo, key = 'y')
bar_dt <- data.table(bar, key = 'y')
df1 <- bar_dt[foo_dt, nomatch = 0]

Resources