create_c <- function(df, line_number = NA, prior_trt, line_name, biomarker, ...) {
if (!"data.frame" %in% class(df)) {
stop("First input must be dataframe")
}
# handle extra arguments
args <- enquos(...)
names(args) <- tolower(names(args))
# check for unknown argument - cols that do not exist in df
check_args_exist(df, args)
# argument to expression
ex_args <- unname(imap(args, function(expr, name) quo(!!sym(name) == !!expr)))
# special case arguments
if (!missing(line_number)) {
df <- df %>% filter(line_number %in% (!!line_number))
if (!missing(prior_trt)) {
df <- filter_arg(df. = df, arg = prior_trt, col = "prior_trt_", val = "y")
}
}
if (!missing(biomarker)) {
df <- filter_arg(df. = df, arg = biomarker, col = "has_", val = "positive")
}
if (!missing(line_name)) {
ln <- list()
if (!!str_detect(line_name[1], "or")) {
line_name <- str_split(line_name, " or ", simplify = TRUE)
}
for (i in 1:length(line_name)) {
ln[[i]] <- paste(tolower(sort(strsplit(line_name[i], "\\+")[[1]])), collapse = ",")
}
df <- df %>% filter(line_name %in% (ln))
}
df <- df %>%
group_by(patient_id) %>%
slice(which.min(line_number)) %>%
ungroup()
df <- df %>% filter(!!!ex_args)
invisible(df)
}
I have this function where I am basically filtering various columns based on parameters users pass. I want the users to be able to pass logical operators like >,<, != for some of the parameters. Right now my function is not able to handle any other operators besides '='. Is there a way to accomplish this?
create_c(df = bsl_all_nsclc,
line_number > 2)
create_c(df, biomarker != "positive)
Error in tolower(arg) : object 'biomarker' not found
Certainly there is a way: operators are regular functions in R, you can pass them around like any other function.
The only complication is that the operators are non-syntactic names so you can’t just pass them “as is”, this would confuse the parser. Instead, you need to wrap them in backticks, to make their use syntactically valid where a name would be expected:
filter_something = function (value, op) {
op(value, 13)
}
filter_something(cars$speed, `>`)
filter_something(cars$speed, `<`)
filter_something(cars$speed, `==`)
And since R also supports non-standard evaluation of function arguments, you can also pass unevaluated expressions — this gets slightly more complicated, since you’d want to evaluate them in the correct context. ‘rlang’/‘dplyr’ uses data masking for this.
How exactly you need to apply this depends entirely on the context in which the expression is to be used. In many cases, you can simply dispatch them to the corresponding ‘dplyr’ functions, e.g.
filter_something2 = function (.data, expr) {
.data %>%
filter({{expr}})
}
filter_something2(cars, speed < 13)
The “secret sauce” here is the {{…}} syntax. This works because filter from ‘dplyr’ accepts unevaluated arguments and handles {{expr}} specially by transforming it into (effectively) !! enexpr(expr). That is: expr is first “defused”: it is explicitly marked as unevaluated, and the name expr is replaced by the unevaluated expression it binds to (speed < 13 in the above). Next, this unevaluated expression is unquoted. That is, the wrapper is “peeled off” from the expression, and that unevaluated expression itself is handled inside filter as if it were passed as filter(.data, speed < 13). In other words: the name expr is substituted with the speed < 13 in the call expression.
For a more thorough explanation, please refer to the Programming with dplyr vignette.
This is my function. Basically I want to include a if else statement inside it but controlling by the length of the arguments that I use in ...:
This is what Ive tried so far, and it is wrong:
soma_mtcars<-function(data,...){
if(length(...) < 2){
sum_df<- data %>% group_by() %>% summarise(total = sum(disp))
}
else(
sum_df<- data %>% group_by() %>% summarise(total = sum(disp))
)
}
Of course the problem is in length(...) < 2. How can I deal with it?
And I would like to have, for example, outputs for: soma_mtcars(mtcars,cyl) and soma_mtcars(mtcars, cyl, disp)
You can use nargs(), which gives you the total number of arguments (i.e. including your data argument):
soma_mtcars <- function (data, ...) {
if (nargs() < 3L) { …
}
… or you can pass ... list, and get the length of its result:
soma_mtcars <- function (data, ...) {
if (length(list(...)) < 2L) { …
}
Either of these will return the length of dot dot dot. The first one does it without evaluating dot dot dot.
len_noeval <- function(...) ...length()
len_eval <- function(...) length(list(...))
# test
len_noeval(11, print(12), 13)
## [1] 3
len_eval(11, print(12), 13)
## [1] 12
## [1] 3
I'm writing a function (NextWordPrediction) in R to predict the next word given some words. The basic structure is as follows:
If input exists in dat such that nrow(dat) != 0 return input and answer
If input doesn't exist such that nrow(dat) == 0 call to recursion and atempt input-1 (eg. if input is "hello great world" try "great world" so on and so forth until nrow nrow(dat) != 0
If after step 2 nrow(dat) == 0 return string "Word not in dictionary. We added this to our database!" and add original input to dataset
Here is the full code:
NextWordPrediction <- function(input) {
dat <- training %>%
filter(., N_gram == str_count(input, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
assign("training",
training %>%
mutate(Frequency = ifelse(Word == input &
N_gram == str_count(input, "\\S+"),
Frequency + 1,
Frequency)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else if (nrow(dat) == 0 & word(input, 1) != "NA") {
input_1 <- Reduce(paste, word(input, 2:str_count(input,"\\S+")))
return(NextWordPrediction(input_1))
} else if (nrow(dat) == 0 & word(input, 1) == "NA") {
assign("training",
training %>%
add_row(., Word = tolower(input), Frequency = 1, N_gram = str_count(input, "\\S+")),
envir = .GlobalEnv)
ans <- paste("Word not in dictionary. We added this to our database!")
return(ans)
}
}
The issue I'm having happens somewhere between step 2 and 3. If input is not found after the recursion call, the added input to the database is input-1 ("great world") where I'd like the original input ("hello great world"). This is my first attempt to implement recursion and would like to understand the mistake in my code.
Thanks :)
Update to be Reproducible:
library(dplyr); library(stringr)
training <- data.frame(Word = c("hello", "she was great", "this is", "long time ago in"), Frequency = c(4, 3, 10, 1),
N_gram = c(1, 3, 2, 4), Prop = c(4/18, 3/18, 10/18, 1/18), Word_to_Predict = c(NA, "great", "is", "in"))
NextWordPrediction("she was") ## returns "she was" & "great"
NextWordPrediction("hours ago") ## returns "hours ago" & "in"
NextWordPrediction("words not in data") ## returns "Word not in dictionary. We added this to our database!" after trying "not in data", "in data" and adds "words not in data" to dataset
Here is an imperfect and overly-complicated demonstration of a recursive function operating on strings. Ideally there are some more safeguards that could be put into place, and there are of course much faster, more efficient, smarter ways of doing this one task, but ... perhaps you'll get the point.
I'm going to change all es to as, one word at a time.
e_to_a <- function(strings) {
# unnecessarily complex
message("# Called : ", sQuote(strings))
if (!nzchar(strings)) return(strings)
word1 <- sub("^([^[:space:]]*)[[:space:]]?.*", "\\1", strings)
others <- sub("^[^[:space:]]*[[:space:]]?", "", strings)
message("# - word1 : ", sQuote(word1))
message("# - others: ", sQuote(others))
# operate on the first word
word1 <- gsub("e", "a", word1)
if (nzchar(others)) {
others <- e_to_a(others)
return(paste(word1, others))
} else {
return(word1)
}
}
In action:
e_to_a("hello great world")
# # Called : 'hello great world'
# # - word1 : 'hello'
# # - others: 'great world'
# # Called : 'great world'
# # - word1 : 'great'
# # - others: 'world'
# # Called : 'world'
# # - word1 : 'world'
# # - others: ''
# [1] "hallo graat world"
The key is that when you make the recursive call, what you're currently doing
return(NextWordPrediction(input_1))
is going to return just the recursive part, dismissing the first word. That would be analogous to me doing
if (nzchar(others)) {
others <- e_to_a(others)
# return(paste(word1, others))
return(others)
} else {
return(word1)
}
I hope you can apply this to your function.
Bottom line, since your question is not reproducible, I'll guess that your fix is something like:
} else if (nrow(dat) == 0 & word(input, 1) != "NA") {
input_vec <- str_split(input, "\\s+")
input_firstword <- input_vec[1]
input_otherwords <- paste(input_vec[-1], collapse = " ")
return(paste(input_firstword, NextWordPrediction(input_otherwords)))
} else if (nrow(dat) == 0 & word(input, 1) == "NA") {
Stream-of-consciousness answer. It doesn't solve anything, but it highlights some areas where code can or must be changed. Up front: == NA fails; you're always discarding the first word in recursion; NA (the object meaning "could be anything") is being coerced into "NA", the literal string.
Starting with a fresh training, I'll debug(NextWordPrediction) and trace line-by-line. It gets to input_1 <- ..., the first thing I notice is:
first time, input_1 is "great world";
next time, it is "world";
next time, it is "na world", fail.
This is a classic fail on two counts:
the code assumes that there are multiple words, even though str_count(input,"\\S+") returns 1 here; and
it is a common mistake to assume that 2:... is always increasing and will not go over a certain count, but unfortunately 2:1 returns c(2L, 1L) ... perhaps you should check the length of your vectors before arbitrarily counting past them.
I think you're trying to guard against this with your previous test of word(input,1) != NA (which is also a mistake), but the only time that's going to happen is when input is 0-length vector (character(0)), not empty-string "". You won't get that with the current code, and I think your intent is for it to reduce to "".
I'm going to change your word(input, 2:str_count(...)) to
input_1 <- sub("^\\S*\\s?", "", input)
You have word(input, 1) != "NA" (and ==), that is either mistaking R's native object for a string, or you think you should be checking for a literal string "NA"; granted, english doesn't use that much as a real word, some languages do. I'm not certain if you intend that to be the NA literal or if for some reason your function will convert NA to "NA" and you want to guard against that.
That last assumption is fixing a symptom, not a problem. Never allow your function to return "NA" (this happens here in a couple of places), you need to guard against it. To me, it is perfectly reasonable to see a word "NA" and differentiate it from the R native NA. Data missingness is important to differentiate.
Assuming you meant != NA instead ... word(input, 1) != NA will never work. Let's run through some examples:
word("hello", 1)
# [1] "hello"
word("", 1)
# [1] ""
word(c(), 1)
# Warning in rep(string, length.out = n) :
# 'x' is NULL so the result will be NULL
# Error in mapply(function(word, loc) word[loc, "start"], words, start) :
# zero-length inputs cannot be mixed with those of non-zero length
word(character(0), 1)
# [1] NA
Okay, so it can return an NA, when the input vector is a 0-length character vector, but ...
word(character(0), 1) == NA
# [1] NA
word(character(0), 1) == NA_character_
# [1] NA
That's right, you cannot check for NA-ness that way. (Did you know that there are over six kinds of NA? They are not the same, identical(NA, NA_real_).)
Use is.na(.):
is.na(word(character(0), 1))
# [1] TRUE
(That's assuming we can see it in normal operation.)
I'm going to change that if condition to:
} else if (nrow(dat) == 0 && nzchar(input) && !is.na(word(input, 1))) {
We're getting closer. Now I can get into the third call of the function, where input is finally "" and we go into the first conditional block, assigning the new content to training. Unfortunately, dat$Word_to_Predict[1] is NA, so your ans is " NA", which just doesn't seem logical. Granted, your default training dataset has this explicitly, and while I don't know what you mean to happen here, I suggest stringifying an R object of NA into " NA" seems wrong.
I don't have a fundamental fix to this flow, though: you want to concatenate the val found with the previous input string, but ... if Word_to_Predict is NA (not a normal string), then ... what do you do? For the sake of moving forward, I'll dismiss concatenating "NA" onto a string ... though it's producing results that are "wrong" from a linguistic standpoint, I believe. (I'll just interpret "NA" as "(I don't have a great value for this spot)" or similar :-)
You are always pasteing a squished input with val, but ... if input is "", then paste still adds a space between them, which seems unnecessary. You can always "patch" this later by repeatedly squishing the strings, but ... symptom/problem again. I suggest instead using
ans <- str_squish(paste(input, val))
And my original point ...
When you start with "she was", it will find something on the first invocation, and we paste the input with the val to get the answer. However, when you have to go into recursion, you call the function again with the rest of the sentence and perfect discard the first word. For instance:
NextWordPrediction("hello great world")
#1> `input` is "hello great world", second `if` block, `input_1` is "great world"
#2> `input` is "great world", second `if` block, `input_1` is "world"
#3> `input` is "world", second `if` block, `input_1` is `""`
#4> `input` is "", first `if` block, `val` is `NA`, and `ans` is "NA"
#3> blindly returns list("NA", head(dat)) (discarding "world")
#2> blindly returns list("NA", head(dat)) (discarding "great")
#1> blindly returns list("NA", head(dat)) (discarding "hello")
Do you see the problem now? Instead of return(NextWordPrediction(input_rest)), you need to capture the result, prepend the word you stripped from input, and continue passing the updated return value up the chain. I suggest
input_1 <- gsub("\\s\\S*", "", input)
input_rest <- sub("^\\S*\\s?", "", input)
out <- NextWordPrediction(input_rest)
out[[1]] <- str_squish(paste(input_1, out[[1]]))
return(out)
After all of that, I now see
NextWordPrediction("hello great world")
# [[1]]
# [1] "hello great world NA"
# [[2]]
# Word Frequency N_gram Prop Word_to_Predict
# 1 hello 4 1 1 <NA>
which, according to your initial training, is correct.
Unfortunately, this breaks something else.
"words not in data" always eventually matches something (as will anything not in training), since it reduces to an empty string "", and your first logic of grepl(paste("^", tolower(str_squish(input)), sep = ""), Word) will always match something with input of "".
We can fix this with a simple additional condition in your first filtering:
filter(nzchar(input) & grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
And finally, when you get to the final if block when you need to add data to training, if this is the first/outer call of the function, then input truly reflects the entire sentence, which is what you want. However, if you've done one or more calls of recursion, then input is merely one word in the chain, not the entire thing. And due to some of the assumptions above, at this stage input is "", so ... any addition would be useless.
There are two strategies for dealing with this:
Keep track of whether this is the outer (first) call or some inner call. When you recursively call, check the return value ... if empty and this is an inner call, return empty; if empty and this is the first/outer call, then append to training; or
Always pass the entire string along with the current input. This would reverse my recommendation in bullet 6 above, so your second if block would just call NextWordPrediction(input_rest, input_1) (using my variables) and not str_squish after it. The squishing/pasting would be handled in the first if block, where you would need to prepend the value (if any) of preceding).
NextWordPrediction <- function(input, preceding = "") {
Side notes, not wrong per se but still not good.
& (single) in an if condition works but is bad practice: & does vector logic, which means it can return vectors of length other than 1; if conditions must be length exactly 1, not 0 or 2 or more. Use && here.
Reduce(paste, ...) is just unnecessary. Use paste(...).
After understanding the implications of recursion in my function thanks to #r2evans I realized that a solution by means of recursion would be too complicated and as a result the following code meets all my conditions and works as expected:
NextWordPrediction <- function(input) {
dat <- training %>%
filter(., N_gram == str_count(input, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
assign("training",
training %>%
mutate(Frequency = ifelse(Word == input &
N_gram == str_count(input, "\\S+"),
Frequency + 1,
Frequency)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else {
for (i in 2:str_count(input, "\\S+")) {
input_1 <- word(input, start = i, end = str_count(input,"\\S+"))
dat <- training %>%
filter(., N_gram == str_count(input_1, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input_1)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else if (nrow(dat) == 0 & i == str_count(input, "\\S+")) {
assign("training",
training %>%
add_row(., Word = tolower(input), Frequency = + 1, N_gram = str_count(input, "\\S+"),
Word_to_Predict = word(input, -1)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
ans <- paste("Word not in dictionary. We added this to our database!")
return(ans)
}
}
}
}
It loops through input-1 until a value is found in the dataframe and when this happens an answer is returned, otherwise we add the original input to the dataframe.