I have two tibbles with different number of columns. I want to filter df1 using a value from column b and I also want to filter df2 using a value from column b and also column c. Is it possible to do this using the same function?
I followed the list(...) procedure, but of course, I got an error since, in the first case there is no x[[2]].
library(dplyr)
df1 <- tibble(a = c(4,2,3,4),
b = c(8,6,7,8))
df2 <- tibble(a = c(1,2,3,4),
b = c(5,6,7,8),
c = c(1,5,3,7))
df1
#> # A tibble: 4 × 2
#> a b
#> <dbl> <dbl>
#> 1 4 8
#> 2 2 6
#> 3 3 7
#> 4 4 8
df2
#> # A tibble: 4 × 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 5 1
#> 2 2 6 5
#> 3 3 7 3
#> 4 4 8 7
createTable <- function(df, ...) {
x <- list(...)
tabl <- df %>%
filter(b < x[[1]], c < x[[2]])
return(tabl)
}
tabl1 <- createTable(df1, 8)
#> Error in `filter()`:
#> ! Problem while computing `..2 = c < x[[2]]`.
#> Caused by error in `x[[2]]`:
#> ! subscript out of bounds
tabl2 <- createTable(df2, 7, 5)
Created on 2022-07-27 by the reprex package (v2.0.1)
Related
I am trying to assign the vector output (i.e. greater than length 1) of a function to multiple columns in a single operation (or at least as concisely as possible).
Take the range() function for example which returns as output a numeric vector of length 2 denoting the minimum and maximum, respectively. Let's say I want to compute the range() per group and assign the output to two columns min and max.
My current approach is combining summarize followed by manually adding a key and then re-shaping to wide format:
library(magrittr)
# create data
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
df
#> # A tibble: 9 x 2
#> group x
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 a 8
#> 4 b 9
#> 5 b 14
#> 6 b 9
#> 7 c 11
#> 8 c 6
#> 9 c 12
# summarize gives two lines per group
range_df <- df %>%
dplyr::group_by(group) %>%
dplyr::summarize(range = range(x)) %>%
dplyr::ungroup()
range_df
#> # A tibble: 6 x 2
#> group range
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 b 9
#> 4 b 14
#> 5 c 6
#> 6 c 12
# add key and reshape
range_df %>%
dplyr::mutate(key = rep(c("min", "max"), 3)) %>%
tidyr::pivot_wider(names_from = key, values_from = range)
#> # A tibble: 3 x 3
#> group min max
#> <chr> <int> <int>
#> 1 a 8 12
#> 2 b 9 14
#> 3 c 6 12
Is there a more elegant / concise alternative to this?
Edit:
Ideally the alternative solution could handle an arbitrary number of outputs (e.g. if the function returns an output with length 3 then 3 variables should be created).
# Writw a small function that does the job:
library(tidyverse)
f <- function(x){
setNames(data.frame(t(range(x))), c('min', 'max'))
}
df %>%
summarise(across(x, f, .unpack = TRUE), .by=group)
#> # A tibble: 3 × 3
#> group x_min x_max
#> <chr> <int> <int>
#> 1 a 10 13
#> 2 b 7 10
#> 3 c 10 12
If you are using older version of dplyr
df %>%
group_by(group)%>%
summarise(across(x, f))%>%
unpack(x)
#> # A tibble: 3 × 3
#> group min max
#> <chr> <int> <int>
#> 1 a 6 9
#> 2 b 7 12
#> 3 c 6 10
Based on onyambu's answer, I build a small generic function for this. There probably will be some edge cases, where this will not work.
out2col <- function(x, fun, out_names = c(), add_args = list()) {
tmp <- do.call(what = fun, args = c(list(x), add_args))
out <- data.frame(t(tmp))
if (length(out_names) != 0) {
if (length(tmp) != length(out_names)) {
stop("provided names did not match the number of outputs")
}
out <- setNames(object = out, nm = out_names)
}
return(out)
}
Examples without any additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = range),
.by=group)
Output:
# A tibble: 3 × 3
group x_X1 x_X2
<chr> <int> <int>
1 a 7 10
2 b 11 14
3 c 9 14
Examples with additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = quantile,
out_names = c("min", "max", "Q25"),
add_args = list(probs = c(0, 1, 0.25))
),
.by=group)
Output:
# A tibble: 3 × 4
group x_min x_max x_Q25
<chr> <dbl> <dbl> <dbl>
1 a 7 10 7.5
2 b 11 14 11.5
3 c 9 14 10
set.seed(1)
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
function
g <- function(x){
data.frame(min = min(x), max = max(x))
}
calling g:
df %>%
group_by(group) %>%
summarise(across(x, g, .unpack = TRUE))
i have two data frames in R say data1 and data2:
a = c(1,2,NA,4,5)
b = c(3,4,5,6,7)
data1 = tibble(a,b);data1
a = c(4,2,4,4,9)
b = c(3,4,4,6,7)
d = c(5,9,3,4,2)
data2 = tibble(a,b,d);data2
i want to calculate the correlation of these two data frames matched columns.Keep in mind that i might have NA in some column vectors and also some columns might not exist in the initial data frame 1 which ideally i want to report NA.How i can do that in R using dplyr ?
library(tibble)
library(purrr)
a = c(1,2,NA,4,5)
b = c(3,4,5,6,7)
data1 = tibble(a,b)
a = c(4,2,4,4,9)
b = c(3,4,4,6,7)
d = c(5,9,3,4,2)
data2 = tibble(a,b,d)
matched <- intersect(colnames(data1), colnames(data2))
names(matched) <- matched
map_dbl(matched, ~ cor(data1[[.x]], data2[[.x]], use = "complete.obs")) %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column()
#> rowname V1
#> 1 a 0.7337014
#> 2 b 0.9622504
Created on 2022-07-11 by the reprex package (v2.0.1)
Since column a in data1 contains 1 NA, the output should be NA for a. You may do this
library(tidyverse)
a = c(1,2,NA,4,5)
b = c(3,4,5,6,7)
data1 = tibble(a,b);
data1
#> # A tibble: 5 × 2
#> a b
#> <dbl> <dbl>
#> 1 1 3
#> 2 2 4
#> 3 NA 5
#> 4 4 6
#> 5 5 7
a = c(4,2,4,4,9)
b = c(3,4,4,6,7)
d = c(5,9,3,4,2)
data2 = tibble(a,b,d);data2
#> # A tibble: 5 × 3
#> a b d
#> <dbl> <dbl> <dbl>
#> 1 4 3 5
#> 2 2 4 9
#> 3 4 4 3
#> 4 4 6 4
#> 5 9 7 2
names(data2) %>%
map_dbl(~ {col <- if(is.null(data1[[.x]])){
rep(NA, dim(data1)[1])
} else {
data1[[.x]]
}
cor(col, data2[[.x]])
}) %>% set_names(names(data2))
#> a b d
#> NA 0.9622504 NA
Created on 2022-07-11 by the reprex package (v2.0.1)
OR usingb stack() will give you a dataframe
names(data2) %>%
map_dbl(~ {col <- if(is.null(data1[[.x]])){
rep(NA, dim(data1)[1])
} else {
data1[[.x]]
}
cor(col, data2[[.x]])
}) %>% set_names(names(data2)) %>%
stack()
#> values ind
#> 1 NA a
#> 2 0.9622504 b
#> 3 NA d
Created on 2022-07-11 by the reprex package (v2.0.1)
I have a number of large data frames that have the following basic format, where the final two rows are a mean (d) and standard deviation (e) - although these are calculated elsewhere.
a b c
a 4 3 4
b 3 2 6
c 2 1 8
d 3 2 6
e 1 1 2
I would like to create an iterative function that converts each raw data point into a z-score via the mean and sd value in d and e per column. The formula I would like to apply is ((x-mean)/SD).
The result would be the following:
a b c
a 1 1 1
b 0 0 0
c -1 -1 -1
I don't mind if this is added to the end, created as a new dataframe or the data is converted.
Thanks!
Here is one approach, note that I do not use the mean/sd provided in the data but re-calculate it on the fly.
Also note that usually the data should be in a tidy data representation, which in your case would mean that a, b, c would be in columns and then mean/sd would be either calculated on the fly or be in a separate column (note that this would reshaping the data, not shown here).
# your input data
raw_data <- data.frame(
a = c(4, 3, 2, 3, 1),
b = c(3, 2, 1, 2, 1),
c = c(4, 6, 8, 6, 2),
row.names = c("a", "b", "c", "d", "e")
)
raw_data
#> a b c
#> a 4 3 4
#> b 3 2 6
#> c 2 1 8
#> d 3 2 6
#> e 1 1 2
# remove the mean/sd values
data <- raw_data[!rownames(raw_data) %in% c("d", "e"), ]
data
#> a b c
#> a 4 3 4
#> b 3 2 6
#> c 2 1 8
# quick way to recalculate the values
means <- apply(data, 2, mean)
means
#> a b c
#> 3 2 6
sds <- apply(data, 2, sd)
sds
#> a b c
#> 1 1 2
z_scores <- apply(data, 2, function(x) (x - mean(x)) / sd(x))
z_scores
#> a b c
#> a 1 1 -1
#> b 0 0 0
#> c -1 -1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
Edit / Full Code
The following code is a bit longer but most of it is spent on getting the data into the right (long/tidy) format.
If you have any questions, feel free to use the comments.
Note that the tidyverse is really helpful, but might need some time to get used to. The code used here is mostly dplyr (included in the tidyverse).
If you understand the functions: %>% (pipe), group_by(), mutate(), summarise(), and pivot_longer/wider() you got everything.
library(tidyverse)
# use your original dataset again
raw_data <- data.frame(
a = c(4, 3, 2, 3, 1),
b = c(3, 2, 1, 2, 1),
c = c(4, 6, 8, 6, 2),
row.names = c("a", "b", "c", "d", "e")
)
### 1) Turn the data into a nicer format
# match-table how to rename the variables
var_match <- c(d = "mean", e = "sd")
# convert the raw data into a nicer format, first we do some minor changes
# (variable names, etc)
data_mixed <- raw_data %>%
# have the rownames as explicit variable
rownames_to_column("metric") %>%
# nicer printing etc
as_tibble() %>%
# replace variable names with mean/sd
mutate(metric = ifelse(metric %in% c("d", "e"),
var_match[metric], metric))
data_mixed
#> # A tibble: 5 x 4
#> metric a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 4 3 4
#> 2 b 3 2 6
#> 3 c 2 1 8
#> 4 mean 3 2 6
#> 5 sd 1 1 2
# separate the dataset into two:
# data holds the values
# data_vars holds the metrics mean and sd
data <- data_mixed %>% filter(!metric %in% var_match) %>% select(-metric)
data_vars <- data_mixed %>% filter(metric %in% var_match)
data
#> # A tibble: 3 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 4 3 4
#> 2 3 2 6
#> 3 2 1 8
data_vars
#> # A tibble: 2 x 4
#> metric a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 mean 3 2 6
#> 2 sd 1 1 2
# turn the value dataset into its longer form, makes it easier to work with it later
data_long <- data %>%
pivot_longer(everything(), names_to = "var", values_to = "val")
data_long
#> # A tibble: 9 x 2
#> var val
#> <chr> <dbl>
#> 1 a 4
#> 2 b 3
#> 3 c 4
#> 4 a 3
#> 5 b 2
#> 6 c 6
#> 7 a 2
#> 8 b 1
#> 9 c 8
# turn the metric dataset into another long form, allowing easy combination in the next step
data_vars2 <- data_vars %>%
pivot_longer(-metric, names_to = "var", values_to = "val") %>%
pivot_wider(var, names_from = metric, values_from = val)
data_vars2
#> # A tibble: 3 x 3
#> var mean sd
#> <chr> <dbl> <dbl>
#> 1 a 3 1
#> 2 b 2 1
#> 3 c 6 2
# combine the datasets
data_all <- left_join(data_long, data_vars2, by = "var")
data_all
#> # A tibble: 9 x 4
#> var val mean sd
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 4 3 1
#> 2 b 3 2 1
#> 3 c 4 6 2
#> 4 a 3 3 1
#> 5 b 2 2 1
#> 6 c 6 6 2
#> 7 a 2 3 1
#> 8 b 1 2 1
#> 9 c 8 6 2
## 2) calculate the z-score
# now comes the actual number crunchin!
# per variable var (a, b, c) compute the variable val_z as the z-score
data_res <- data_all %>%
group_by(var) %>%
mutate(val_z = (val - mean) / sd)
data_res
#> # A tibble: 9 x 5
#> # Groups: var [3]
#> var val mean sd val_z
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 4 3 1 1
#> 2 b 3 2 1 1
#> 3 c 4 6 2 -1
#> 4 a 3 3 1 0
#> 5 b 2 2 1 0
#> 6 c 6 6 2 0
#> 7 a 2 3 1 -1
#> 8 b 1 2 1 -1
#> 9 c 8 6 2 1
## 3) make the results more readable
# lastly pivot the results to its original form
data_res_wide <- data_res %>%
select(var, val_z) %>%
group_by(var) %>%
mutate(id = 1:n()) %>% # needed for easier identification of values
pivot_wider(id, names_from = var, values_from = val_z)
data_res_wide
#> # A tibble: 3 x 4
#> id a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 1 1 -1
#> 2 2 0 0 0
#> 3 3 -1 -1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
If I want to make overscoping explicit, I can use the .data pronoun like this
library(dplyr)
cyl <- 3
transmute(as_tibble(mtcars), cyl_plus_one = .data$cyl + 1)
#> # A tibble: 32 x 1
#> cyl_plus_one
#> <dbl>
#> 1 7
#> 2 7
#> 3 5
#> 4 7
#> 5 9
#> 6 7
#> 7 9
#> 8 5
#> 9 5
#> 10 7
#> # ... with 22 more rows
However, what about the opposite, i.e. if I want to avoid overscoping explicitly? In the example below, I would like to add a new column that contains the value b (supplied via the function call, not the b in the data) plus 1, which does obviously not work the way it's stated now (because of overscoping).
library(dplyr)
add_one <- function(data, b) {
data %>%
mutate(a = b + 1)
}
data <- data_frame(
b = 999
)
add_one(data, 3)
#> # A tibble: 1 x 2
#> b a
#> <dbl> <dbl>
#> 1 999 1000
I also tried to create the new value outside the mutate() call, but then I still need to rely on new_val being not in the data.
library(dplyr)
add_one <- function(data, b) {
new_val <- b + 1
data %>%
mutate(a = new_val)
}
Just unquote with !! to look for a variable with that name above the data frame scope:
library(tidyverse)
add_one <- function(data, b) {
data %>% mutate(a = !!b + 1)
}
data <- data_frame(b = 999)
add_one(data, 3)
#> # A tibble: 1 x 2
#> b a
#> <dbl> <dbl>
#> 1 999 4.00
This question uses a data.frame which contains list-columns (nested). It had me wondering why/if there's an advantage to working this way. I assumed you would want to minimize the amount of memory each table uses...But when I checked I was surprised:
Compare table sizes for nested vs. tidy format:
1. Generate nested/tidy versions of a 2-col and 5-col data.frame:
library(pryr)
library(dplyr)
library(tidyr)
library(ggvis)
n <- 1:1E6
df <- data_frame(id = n, vars = lapply(n, function(x) x <- sample(letters,sample(1:26,1))))
dfu <- df %>% unnest(vars)
df_morecols <- data_frame(id = n, other1 = n, other2 = n, other3 = n,
vars = lapply(n, function(x) x <- sample(letters,sample(1:26,1))))
dfu_morecols <- df_morecols %>% unnest(vars)
they look like:
head(df)
#> Source: local data frame [6 x 2]
#> id vars
#> 1 1 <chr[16]>
#> 2 2 <chr[4]>
#> 3 3 <chr[26]>
#> 4 4 <chr[9]>
#> 5 5 <chr[11]>
#> 6 6 <chr[18]>
head(dfu)
#> Source: local data frame [6 x 2]
#> id vars
#> 1 1 k
#> 2 1 d
#> 3 1 s
#> 4 1 j
#> 5 1 m
#> 6 1 t
head(df_morecols)
#> Source: local data frame [6 x 5]
#> id other1 other2 other3 vars
#> 1 1 1 1 1 <chr[4]>
#> 2 2 2 2 2 <chr[22]>
#> 3 3 3 3 3 <chr[24]>
#> 4 4 4 4 4 <chr[6]>
#> 5 5 5 5 5 <chr[15]>
#> 6 6 6 6 6 <chr[11]>
head(dfu_morecols)
#> Source: local data frame [6 x 5]
#> id other1 other2 other3 vars
#> 1 1 1 1 1 r
#> 2 1 1 1 1 p
#> 3 1 1 1 1 s
#> 4 1 1 1 1 w
#> 5 2 2 2 2 l
#> 6 2 2 2 2 j
2. Calculate object sizes and col sizes
from: lapply(list(df,dfu,df_morecols,dfu_morecols),object_size)
170 MB vs. 162 MB for nested vs. tidy 2-col df
170 MB vs. 324 MB for nested vs. tidy 5-col df
col_sizes <- sapply(c(df,dfu,df_morecols,dfu_morecols),object_size)
col_names <- names(col_sizes)
parent_obj <- c(rep(c('df','dfu'),each = 2),
rep(c('df_morecols','dfu_morecols'),each = 5))
res <- data_frame(parent_obj,col_names,col_sizes) %>%
unite(elementof, parent_obj,col_names, remove = F)
3. Plot columns sizes coloured by parent object:
res %>%
ggvis(y = ~elementof, x = ~0, x2 = ~col_sizes, fill = ~parent_obj) %>%
layer_rects(height = band())
Questions:
What explains the smaller footprint of the tidy 2-col df compared to the nested one?
Why doesn't this effect change for to the 5-col df?