Keeping the the name of the vector element in the output - r

My foo function below works great, however it omits the name associated with its vector output.
For the example below, I expect foo to return:
scale
[1] 2
But it simply returns:
[1] 2
Is there fix for this?
library(tidyverse)
library(rlang)
txt = "
study scale
1 1
1 1
2 2
3 2
3 2
3 2
4 1
"
h <- read.table(text = txt,h=T)
foo <- function(data, ...){
dot_cols <- rlang::ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)
min(sapply(str_cols, function(i) length(unique(data[[i]]))))
}
foo(h, study, scale)

We may use which.min to get the index and then use it to subset the element. Also, loop over a named vector
foo <- function(data, ...){
dot_cols <- rlang::ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)
v1 <- sapply(setNames(str_cols, str_cols),
function(i) length(unique(data[[i]])))
v1[which.min(v1)]
}
-testing
> foo(h, study, scale)
scale
2

You can skip the rlang stuff by using summarise and passing ... to across
library(dplyr)
foo <- function(data, ...){
data %>%
summarise(across(c(...), n_distinct)) %>%
unlist() %>%
.[which.min(.)]
}
foo(h, study, scale)
#> scale
#> 2

Related

Use input of a function as variable name in R

I have a simple function in R to modify a dataframe
monthly_fun <- function(x){
x %>%
mutate(obstime = convert_dates(obstime)) %>%
select(obstime, x = obsvalue)
}
When applying the function to dataframe df, i.e. monthly_fun(df), I would like df to be the name of obsvalue. In my current code, the name is obviously "x", how can I modify the part in select to get the name of the supplied dataframe as the variable name instead?
Thanks a lot
EDIT: I want to apply this function to several dataframes using
result <- list( df1, df2, df3) %>%
lapply( monthly_fun )
You could extract the name of input by deparse(substitute(x)), and use !!y := obsvalue in mutate().
monthly_fun <- function(x) {
y <- deparse(substitute(x))
x %>%
mutate(obstime = convert_dates(obstime),
!!y := obsvalue) %>%
select(obstime, y)
}
A simplified example:
fun <- function(x) {
y <- deparse(substitute(x))
x %>%
mutate(!!y := 1) %>%
select(y)
}
fun(df)
# df
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
Update
If you want to apply it to several data frames stored in a list, you should design a 2-argument function, one argument for data and the other for new column names. Then use Map() to apply this function over each pair of data and names.
fun <- function(x, y) {
x %>%
mutate(!!y := 1) %>%
select(y)
}
Map(fun, list(df1, df2), c("name1", "name2"))
# [[1]]
# name1
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
#
# [[2]]
# name2
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
If you're familiar with purrr, The use of Map can be replaced with map2() or imap(). (Notice the difference of inputs to the both functions)
library(purrr)
# (1) map2(): Input data and names separately
map2(list(df1, df2), c("name1", "name2"), fun)
# (2) imap(): Input a named list
imap(list(name1 = df1, name2 = df2), fun)
Using the suggestion by Julien and creating a variable using deparse(substitute(df)) and rename using that.
monthly_fun <- function(x) {
y = deparse(substitute(x))
x <- x %>%
mutate(obstime = obstime*5) %>%
select(obstime, obsvalue)
names(x)[names(x) == "obsvalue"] <- y
return(x)
}
see this site for more naming methods.

Apply codon translation function to all elements of data.frame

I have a data.frame that looks like
df <- data.frame(P1 = c("ATG","GTA","GGG","GGG"), P2 = c("TGG","GAT","GGG","GCG"))
I want to convert each DNA codon to an amino-acid using the below function (but any translate option is viable), and output an identical data.frame but with single letter amino-acids rather than codons:
library(Biostrings)
library(seqinr)
translate_R <- function(x)
{
translate(s2c(as.character(x)))
}
It works for individual elements of the data.frame
> translate_R(df[1,1])
[1] "M"
But trying to apply this to the whole data.frame isn't working. What am I missing? I don't understand why there is an error, as googling how to do this suggests it should work. Missing something fundamental I guess.
> df[] <- lapply(df, translate_R)
Error in seq.default(from = frame + 1, to = frame + l, by = 3) :
wrong sign in 'by' argument
In addition: Warning message:
In s2c(as.character(x)) :
Error in seq.default(from = frame + 1, to = frame + l, by = 3) :
wrong sign in 'by' argument
Your translate_R function is expecting a single value, but it's getting a vector. You can fix this by passing in individual values.
In other words, iterate over columns of df with an outer apply and then over values in each column with an inner apply.
Here's how to do it with base R:
data.frame(lapply(df, function(x) sapply(x, translate_R)))
And here's a tidyverse version with map:
library(tidyverse)
df %>% mutate(across(everything(), ~map(., translate_R)))
In both cases, the output is:
P1 P2
1 M W
2 V D
3 G G
4 G A
Another potential tidyverse solution is to use the "rowwise" tidyverse function:
library(tidyverse)
library(Biostrings)
library(seqinr)
translate_R <- function(x) {
translate(s2c(as.character(x)))
}
df <- data.frame(P1 = c("ATG","GTA","GGG","GGG"), P2 = c("TGG","GAT","GGG","GCG"))
df %>%
rowwise() %>%
mutate(across(everything(), ~ translate_R(.x)))
#> # A tibble: 4 x 2
#> # Rowwise:
#> P1 P2
#> <chr> <chr>
#> 1 M W
#> 2 V D
#> 3 G G
#> 4 G A
Created on 2021-07-21 by the reprex package (v2.0.0)

Dplyr to calculate mean, SD, and graph multiple variables

I have a table with columns
[Time, var1, var2, var3, var4...varN]
I need to calculate mean/SE per Time for each var1, var2...var n , and I want to do this programmatically for all variables, rather than 1 at a time which would involve a lot of copy-pasting.
Section 8.2.3 here https://tidyeval.tidyverse.org/dplyr.html is close to what I want but my below code:
x <- as.data.frame(matrix(nrow = 2, ncol = 3))
x[1,1] = 1
x[1,2] = 2
x[1,3] = 3
x[2,1] =4
x[2,2] = 5
x[2,3] = 6
names(x)[1] <- "time"
names(x)[2] <- "var1"
names(x)[3] <- "var2"
grouped_mean3 <- function(.data, ...) {
print(.data)
summary_vars <- enquos(...)
print(summary_vars)
summary_vars <- purrr::map(summary_vars, function(var) {
expr(mean(!!var, na.rm = TRUE))
})
print(summary_vars)
.data %>%
group_by(time)
summarise(!!!summary_vars) # Unquote-splice the list
}
grouped_mean3(x, var("var1"), var("var2"))
Yields
Error in !summary_vars : invalid argument type
And the original cause is "Must group by variables found in .data." and it finds a column that isn't in the dummy "x" that I generated for the purposes of testing. I have no idea what's happening, sadly.
How do I actually extract the mean from the new summary_vars and add it to the .data table? summary_vars becomes something like
[[1]]
mean(~var1, na.rm = TRUE)
[[2]]
mean(~var2, na.rm = TRUE)
Which seems close, but needs evaluation. How do I evaluate this? !!! wasn't working.
For what it's worth, I tried plugging the example in dplyr into this R engine https://rdrr.io/cran/dplyr/man/starwars.html and it didn't work either.
Help?
End goal would be a table along the lines of
[Time, var1mean, var2mean, var3mean, var4mean...]
Try this :
library(dplyr)
grouped_mean3 <- function(.data, ...) {
vars <- c(...)
.data %>%
group_by(time) %>%
summarise(across(all_of(vars), mean))
}
grouped_mean3(x, 'var1')
# time var1mean
# <dbl> <dbl>
#1 1 2
#2 4 5
grouped_mean3(x, 'var1', 'var2')
# time var1mean var2mean
# <dbl> <dbl> <dbl>
#1 1 2 3
#2 4 5 6
Perhaps this is what you are looking for?
x %>%
group_by(time) %>%
summarise_at(vars(starts_with('var')), ~mean(.,na.rm=T)) %>%
rename_at(vars(starts_with('var')),funs(paste(.,"mean"))) %>%
merge(x)
With your data (from your question) following is the output:
time var1mean var2mean var1 var2
1 1 2 3 2 3
2 4 5 6 5 6

Exporting values passed to enquos as string of format name1,name2, nameN,

In this example, I have a simple function taking variable names passed via ... and making use of the enquos function in order to pass them to group_by operator in dplyr.
Basic function
# Libraries
library(dplyr)
library(rlang)
sample_function <- function(x, ...) {
group_vars <- enquos(...)
x %>%
group_by(!!!group_vars) %>%
summarise(num_obs = n())
}
Results
mtcars %>% sample_function(cyl, am)
# A tibble: 6 x 3
# Groups: cyl [3]
cyl am num_obs
<dbl> <dbl> <int>
1 4 0 3
2 4 1 8
3 6 0 4
4 6 1 3
5 8 0 12
6 8 1 2
Problem
I would like to expand the function above and in addition to the produced results create a new scalar character that would reflect names of variables passed to enquos in a format: "var1, var2, ...".
Attempt
library(dplyr)
library(rlang)
sample_function <- function(x, ...) {
group_vars <- enquos(...)
# Problem:
# Create test object of quoted variables
assign(x = "used_group_variables",
value = quo_text(group_vars),
envir = globalenv())
# Summary
x %>%
group_by(!!!group_vars) %>%
summarise(num_obs = n())
}
Results
Produced string does not match the desired format.
used_group_variables
# [1] "structure(list(~cyl, ~am), .Names = c(\"\", \"\"), class = c(\"quosures\", \n\"list\"))"
Desired results
Only names of all variables initially passed via enquos are returned and pasted together with "`" as a separator.
used_group_variables
# "cyl, am"
Notes
Admittedly, assigning values to the global environment from the inside of a function is not a good practice. This is only done for illustrative purposes. In effect, the key goal is to coerce whatever sits within enquos to string of format "name1, name2, ...".
You could use sapply for that and collapse with toString:
sample_function <- function(x, ...) {
group_vars <- enquos(...)
assign(x = "used_group_variables",
value = toString(sapply(group_vars, quo_name)),
envir = globalenv())
x %>%
group_by(!!!group_vars) %>%
summarise(num_obs = n())
}
Output:
mtcars %>% sample_function(am, cyl)
used_group_variables
# [1] "am, cyl"
Edit: As suggested by #LionelHenry in the comment, you may want to use as_label instead of quo_name as the latter is misleading and will likely be deprecated.

How to identify repeated subsequences in a dataset

I have a dataset of numerical values, each represent a zone.
eg.
x <- c(1,6,1,2,3,4,5,8,5,9,10,1,2,3,10,7,5,9,4,1,2,3)
I need to identify whether there are repeated subsequences within the data, i.e whether the subject repeatedly travelled from zone 1 to 2 to 3. In the above example 1,2,3 would give a value of 3. I don't know the subsequences already, I need R to provide this given the data.
Following that I need to calculate how many times this subsequence appears in the data.
Very basic knowledge or R so forgive me for my ignorance if this is a simple task!
Here's a way to find which sequences of length n repeat, and how many times
For n = 3
library(tidyverse) # not necessary, see base version below
n <- 3
lapply(seq(0, length(x) - n), `+`, seq(n)) %>% # get index of all subsequences
map_chr(~ paste(x[.], collapse = ',')) %>% # paste together as character
table %>% # get number of times each occurs
`[`(. > 1) # select sequences occurring > 1 time
# 1,2,3
# 3
For n = 2
n <- 2
lapply(seq(0, length(x) - n), `+`, seq(n)) %>%
map_chr(~ paste(x[.], collapse = ',')) %>%
table %>%
`[`(. > 1)
# 1,2 2,3 5,9
# 3 3 2
Without Tidyverse
seqs <- lapply(seq(0, length(x) - n), `+`, seq(n))
seqs.char <- sapply(seqs, function(i) paste(x[i], collapse = ','))
tbl <- table(seqs.char)
tbl[tbl > 1]
I'll add my own question: Does anyone know how to do this without converting to character first? e.g. fun where fun(list(1:2, 1:2, 2:3)) tells you 1:2 occurs twice and 2:3 occurs once?
An alternative tidyverse approach that creates a big dataframe of results based on how many values you want your subsequences to have:
library(tidyverse)
# example vector
x <- c(1,6,1,2,3,4,5,8,5,9,10,1,2,3,10,7,5,9,4,1,2,3)
# function that gets as input number of consequtive elements in a subsequence
# and returns an ordered dataframe by counts of occurence
f = function(n) {
data.frame(value = x) %>% # get the vector x
slice(1:(nrow(.)-n+1)) %>% # remove values not needed from the end
mutate(position = row_number()) %>% # add position of each value
rowwise() %>% # for each value/row
mutate(vec = paste0(x[position:(position+n-1)], collapse = ",")) %>% # create subsequences as a string
ungroup() %>% # forget the grouping
count(vec, sort = T) } # order by counts descending
2:5 %>% # specify how many values in your subsequences you want to investigate (let's say from 2 to 5)
map_df(~ data.frame(NumElements = ., f(.))) %>% # apply your function and keep the number values
arrange(desc(n)) %>% # order by counts descending
tbl_df() # (only for visualisation purposes)
# # A tibble: 88 x 3
# NumElements vec n
# <dbl> <chr> <int>
# 1 2 1,2 3
# 2 2 2,3 3
# 3 3 1,2,3 3
# 4 2 5,9 2
# 5 2 1,6 1
# 6 2 10,1 1
# 7 2 10,7 1
# 8 2 3,10 1
# 9 2 3,4 1
# 10 2 4,1 1
# # ... with 78 more rows
The approach below finds sequences of any length (k): the input vector is converted into a matrix with k rows; this is done k times with adding 0:(k-1) NA's to the beginning. Finally, all rows in these k matrices are counted (paste'ing the elements together):
frs <- function(x, k=2){
padit <- function(.) c(.,rep(NA, k-length(.)%%k))
xx <- lapply(1:k, function(iii) padit(c(rep(NA,iii-1), x)))
xx <- do.call(rbind, lapply(xx, function(.) matrix(., ncol=k, byrow=TRUE)))
xx <- sapply(split(xx, 1:NROW(xx)), paste, collapse=",")
(function(x) x[x>1])(table(xx))
}
Output:
> frs(x,2)
xx
1,2 2,3 5,9
3 3 2
> frs(x,3)
1,2,3
3
> frs(x,4)
named integer(0)

Resources