I'm looking for a tidy way to anonymize selected columns of a data frame.
The best I could come up with is to define a mapping table and then using plyr::mapvalues(), but I can't wrap my head around generalizing this to make it work with in conjunction with dplyr::mutate_at() (see pseudo code below).
Or would this best be done via purrr::map2()?
library(magrittr)
df <- tibble::tribble(
~name, ~surname, ~value,
"John", "Doe", 10,
"Jane", "Doe", 20
)
seed <- 2093
cols_to_anon <- c("name", "surname")
recode_table <- cols_to_anon %>%
dplyr::syms() %>%
purrr::map(function(.x) {
uniques <- df %>%
dplyr::distinct(!!.x) %>%
dplyr::pull()
n <- length(uniques)
set.seed(seed)
original <- uniques[sample(1:n)]
set.seed(seed)
anon_1 <- sample(LETTERS, n, replace = TRUE)
set.seed(seed)
anon_2 <- sample(1:1000, n, replace = TRUE)
anon <- stringr::str_glue("{anon_1}{anon_2}")
tibble::tibble(original, anon)
}) %>%
purrr::set_names(cols_to_anon)
recode_table
#> $name
#> # A tibble: 2 x 2
#> original anon
#> <chr> <S3: glue>
#> 1 Jane W875
#> 2 John D149
#>
#> $surname
#> # A tibble: 1 x 2
#> original anon
#> <chr> <S3: glue>
#> 1 Doe W875
df_anon <- df %>%
dplyr::mutate(
name = plyr::mapvalues(name,
recode_table$name$original,
recode_table$name$anon
),
surname = plyr::mapvalues(surname,
recode_table$surname$original,
recode_table$surname$anon
)
)
df_anon
#> # A tibble: 2 x 3
#> name surname value
#> <chr> <chr> <dbl>
#> 1 D149 W875 10
#> 2 W875 W875 20
Created on 2019-05-16 by the reprex package (v0.2.1.9000)
PSEUDO CODE OF "DESIRED" SOLUTION
df_anon <- df %>%
dplyr::mutate_at(
dplyr::vars(one_of(cols_to_anon)),
~plyr::mapvalues(<col_name_i>,
mtable_list[[<col_name_i>]]$original,
mtable_list[[<col_name_i>]]$anon
)
)
with `<col_name_i>` being the name of the respective column that is to be anonymized
One approach would be:
library(rlang)
library(stringr)
library(tidyverse)
df <- tibble::tribble(
~name, ~surname, ~value,
"John", "Doe", 10,
"Jane", "Doe", 20
)
df
my_selection <- exprs(name, surname)
map(df %>%
select(!!!my_selection),
~enframe(unique(.), name = NULL, value = "original") %>%
mutate(anon = str_c(sample(LETTERS, n(), replace = TRUE),
sample(1:1000, n(), replace = TRUE),
sep = ""))) -> recode_table
recode_table
# $name
# # A tibble: 2 x 2
# original anon
# <chr> <chr>
# 1 John F330
# 2 Jane O445
#
# $surname
# # A tibble: 1 x 2
# original anon
# <chr> <chr>
# 1 Doe N710
imap_dfc(recode_table,
~df %>%
select(..2) %>%
`colnames<-`("original") %>%
left_join(recode_table[[..2]], by = "original") %>%
select(-original) %>%
`colnames<-`(..2)) %>%
cbind(
df %>%
select(-c(!!!my_selection))) -> df_anon
df_anon
# name surname value
# 1 F330 N710 10
# 2 O445 N710 20
Related
I am trying to map a function to each row in a tibble. Please see code below. My desired workflow is as follows -
Convert a list with sub lists to a tibble
Map each row the tibble to a function
My desired output should be a list with a tibble as output for each row mapped to the function. See full code below -
# Packages
library(tidyverse)
library(purrr)
# Function i want to map
sample_func <- function(tib){
a <- tib$name
b <- tib$qty
c <- tib$price
d <- tib$add
e <- b+c+d
t <- tibble(e = c(e), stock = c(a))
return(t)
}
# Define the list with multiple sublists
lst <- list(c( "CHR1", 15, 222.14, 6), c( "CHR2", 10, 119.20, 10))
# Convert each sublist to a tibble and bind the rows
tib <- bind_rows(lapply(lst, function(x) {
tibble(name = x[1], qty = x[2] %>% as.numeric(), price = x[3] %>% as.numeric(),
add = x[4] %>% as.numeric())
}))
# Apply the function to each row in the tibble using map()
result <- tib %>%
rowwise() %>%
mutate(temp = map(list(name, qty, price, add), sample_func)) %>%
unnest(temp)
My desired output should be -
[[1]]
# A tibble: 1 × 2
e name
<dbl> <chr>
1 243. CHR1
[[2]]
# A tibble: 1 × 2
e name
<dbl> <chr>
1 139. CHR2
However when the final rowwise mapping, I get the following error -
Error in `mutate()`:
! Problem while computing `temp = map(list(name, qty, price, add), sample_func)`.
ℹ The error occurred in row 1.
Caused by error in `map()`:
ℹ In index: 1.
Caused by error in `tib$name`:
! $ operator is invalid for atomic vectors
What am I doing wrong here?
An alternative approach is to change the inputs of the sample_func function to be the names of the columns instead of the tibble, then you can do this with pmap():
# Function i want to map
sample_func <- function(name, qty, price, add){
a <- name
b <- qty
c <- price
d <- add
e <- b+c+d
t <- tibble(e = c(e), stock = c(a))
return(t)
}
# Define the list with multiple sublists
lst <- list(c( "CHR1", 15, 222.14, 6), c( "CHR2", 10, 119.20, 10))
# Convert each sublist to a tibble and bind the rows
tib <- bind_rows(lapply(lst, function(x) {
tibble(name = x[1], qty = x[2] %>% as.numeric(), price = x[3] %>% as.numeric(),
add = x[4] %>% as.numeric())
}))
# Apply the function to each row in the tibble using map()
pmap(tib, sample_func)
Instead of passing a tibble to the function you may pass columns of the tibble as vector.
library(dplyr)
library(purrr)
sample_func <- function(name, qty, price, add){
res <- tibble(e = qty + price + add, stock = name)
return(res)
}
You may then use pmap -
out <- tib %>%
mutate(res = pmap(list(name, qty, price, add), sample_func))
out
# A tibble: 2 × 5
# name qty price add res
# <chr> <dbl> <dbl> <dbl> <list>
#1 CHR1 15 222. 6 <tibble [1 × 2]>
#2 CHR2 10 119. 10 <tibble [1 × 2]>
out$res
#[[1]]
# A tibble: 1 × 2
# e stock
# <dbl> <chr>
#1 243. CHR1
#[[2]]
# A tibble: 1 × 2
# e stock
# <dbl> <chr>
#1 139. CHR2
You may use unnest to get separate columns.
out %>% unnest(res)
# name qty price add e stock
# <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#1 CHR1 15 222. 6 243. CHR1
#2 CHR2 10 119. 10 139. CHR2
We could just apply the sample_func on the picked dataset and unnest
library(dplyr)
library(tidyr)
tib %>%
transmute(temp = sample_func(pick(everything()))) %>%
unnest(where(is_tibble))
-output
# A tibble: 2 × 2
e stock
<dbl> <chr>
1 243. CHR1
2 139. CHR2
If we want it as a list of tibbles
tib %>%
rowwise %>%
reframe(temp = list(sample_func(pick(everything())))) %>%
pull(temp)
-output
[[1]]
# A tibble: 1 × 2
e stock
<dbl> <chr>
1 243. CHR1
[[2]]
# A tibble: 1 × 2
e stock
<dbl> <chr>
1 139. CHR2
To get your desired output and without changing your function or tibble we can use dplyr::rowwise() and dplyr::group_map().
With rowwise we tell 'dplyr' to treat each row as a group. With group_map we apply a function to each group (in our case row) and the function takes the data.frame of each group as input .x which fits your sample_func() perfectly.
library(dplyr)
tib %>%
rowwise() %>%
group_map(~ sample_func(.x))
#> [[1]]
#> # A tibble: 1 × 2
#> e stock
#> <dbl> <chr>
#> 1 243. CHR1
#>
#> [[2]]
#> # A tibble: 1 × 2
#> e stock
#> <dbl> <chr>
#> 1 139. CHR2
Data from OP
library(tidyverse)
# Function i want to map
sample_func <- function(tib){
a <- tib$name
b <- tib$qty
c <- tib$price
d <- tib$add
e <- b+c+d
t <- tibble(e = c(e), stock = c(a))
return(t)
}
# Define the list with multiple sublists
lst <- list(c( "CHR1", 15, 222.14, 6), c( "CHR2", 10, 119.20, 10))
# Convert each sublist to a tibble and bind the rows
tib <- bind_rows(lapply(lst, function(x) {
tibble(name = x[1], qty = x[2] %>% as.numeric(), price = x[3] %>% as.numeric(),
add = x[4] %>% as.numeric())
}))
Created on 2023-02-12 with reprex v2.0.2
I have a list of dataframes that all contain a matching ID column.
For example...
dat1 = tribble(
~id, ~response,
"id_1", 10,
"id_2", 15
)
dat2 = tribble(
~id, ~response,
"id_3", 20,
"id_4", 25
)
example_list <- list(dat1, dat2)
> list(dat1, dat2)
[[1]]
# A tibble: 2 × 2
id response
<chr> <dbl>
1 id_1 10
2 id_2 15
[[2]]
# A tibble: 2 × 2
id response
<chr> <dbl>
1 id_3 20
2 id_4 25
How can I map across the dataframes to remove the "id_" prefix for each row on the id column using str_remove()?
With purrr::map, then str_remove (or gsub or readr::parse_number).
library(tidyverse)
example_list %>%
map(~ mutate(.x, id = str_remove(id, "id_")))
#map(~ .x %>% mutate(id = gsub("id_", "", id)))
#map(~ mutate(.x, id = parse_number(id)))
output
[[1]]
# A tibble: 2 × 2
id response
<chr> <dbl>
1 1 10
2 2 15
[[2]]
# A tibble: 2 × 2
id response
<chr> <dbl>
1 3 20
2 4 25
You can nest a modify_at() for greater speed. Also, substring should be faster than some text match, since you know the length of your prefix already.
Of course, you may want an as.integer() to convert this back to to a number, but that is solution independent.
library(purrr)
example_list %>%
map(modify_at, "id", substring, 4)
# [[1]]
# # A tibble: 2 x 2
# id response
# <chr> <dbl>
# 1 1 10
# 2 2 15
#
# [[2]]
# # A tibble: 2 x 2
# id response
# <chr> <dbl>
# 1 3 20
# 2 4 25
# to convert to integer
example_list %>%
map(modify_at, "id", ~ as.integer(substring(.x, 4)))
Running a few options as a benchmark:
library(purrr)
library(dplyr)
library(stringr)
microbenchmark::microbenchmark(
modify_substring = example_list %>%
map(modify_at, "id", substring, 4),
mutate_substring = example_list %>%
map(~ mutate(.x, id = substring(id, 4))),
mutate_str_remove = example_list %>%
map(~ mutate(.x, id = str_remove(id, "id_")))
)
You can see that this approach runs substantially quicker.
Unit: microseconds
expr min lq mean median uq max neval
modify_substring 302.301 359.9005 442.340 419.6505 459.901 1597.401 100
mutate_substring 3019.502 3308.6015 4916.405 3540.5505 3847.801 116220.501 100
mutate_str_remove 4064.801 4568.4010 5355.351 4839.1010 5232.452 10521.701 100
I have a working custom function but not sure how to allow it to loop with a list of inputs. Looks like I need to understand apply() and the such but I'm not quite there with my current setup. The function uses rollapply() to find the largest metric for a given time frame.
library(zoo)
library(dplyr)
# Data
set.seed(1)
df <- tibble(player = rep(LETTERS[1:2], each = 10),
minute = rep(1:10, times = 2),
tdc = sample(100:200,size = 20),
sumad = sample(1:10, size = 20, replace = TRUE))
# Custom function
x_min_roll <- function(df, metric, n_minutes, fun){
metric <- ensym(metric)
newname <- glue::glue("{rlang::as_string(metric)}_x{as.character(n_minutes)}")
df %>%
# dynamically create new column name based on input
mutate("{newname}" := rollapply(!!metric, n_minutes, fun, align='left', fill=NA)) %>%
group_by(player) %>%
slice_max(.data[[newname]]) %>%
select(player, .data[[newname]])
}
# This works
df %>%
x_min_roll(metric = tdc, n_minutes = 2, fun = sum)
# A tibble: 2 x 2
# Groups: player [2]
player tdc_x2
<chr> <int>
1 A 339
2 B 380
I would like to be able to do this:
metric_list <- c('tdc', 'sumad')
minutes_list <- c(2,5)
df %>%
x_min_roll(metric = metric_list, n_minutes = minutes_list, fun = sum) %>%
# maybe a few more steps here.... to get this
# A tibble: 2 x 5
player tdc_x2 tdc_x5 sumad_x2 sumad_x5
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 339 793 20 36
2 B 380 866 19 41
We can use map2 to loop over the corresponding elements of both vectors
library(purrr)
library(dplyr)
map2(metric_list, minutes_list,
~ df %>%
x_min_roll(metric = !!.x, n_minutes = .y, fun = sum))
-output
[[1]]
# A tibble: 2 × 2
# Groups: player [2]
player tdc_x2
<chr> <int>
1 A 339
2 B 380
[[2]]
# A tibble: 3 × 2
# Groups: player [2]
player sumad_x5
<chr> <int>
1 A 36
2 B 41
3 B 41
EDIT: Based on #Onyambu's comments
If we want for each combination, then use crossing to create the combination
library(tidyr)
crossing(metric_list, minutes_list) %>%
pmap(~ df %>%
x_min_roll(metric = !!.x, n_minutes = .y, fun = sum))
Based on the comments from the OP, if we want to combine the datasets
crossing(metric_list, minutes_list) %>%
pmap(~ df %>% x_min_roll(metric = !!.x, n_minutes = .y, fun = sum)) %>%
reduce(inner_join, by = 'player')
first time for me here, I'll try to explain you my problem as clearly as possible.
I'm working on erosion data contained in farms in the form of pixels (e.g. 1 farm = 10 pixels so 10 lines in my df), for this I have 4 df in a list, and I would like to calculate for each farm the mean of erosion. I thought about a loop on the name of erosion field but my problem is that my df don't have the exact name (either ERO13 or ERO17). I don't want to work the position of the field because it could change between the df, only with the name which is variable.
Here's a example :
df1 <- data.frame(ID = c(1,1,2), ERO13 = c(2,4,6))
df2 <- data.frame(ID = c(4,4,6), ERO17 = c(4,5,12))
lst_df <- list(df1,df2)
for (df in lst_df){
cur_df <- df
cur_df <- cur_df %>%
group_by(ID) %>%
summarise(current_name_of_erosion_field = mean(current_name_of_erosion_field))
}
I tried with
for (df in lst_df){
cur_df <- df
cur_camp <- names(cur_df)[2]
cur_df <- cur_df %>%
group_by(ID) %>%
summarise(cur_camp = mean(cur_camp))
}
but first doesn't work because it's a string character and not a variable containing the string character and it works with the position.
How can I build the current_name_of_erosion_field here ?
We may convert it to symbol and evaluate (!!) or may pass the string across. Also, as we are using a for loop, make sure to create a list to store the output. Also, to assign from an object created, use := with !!
out <- vector('list', length(lst_df))
for (i in seq_along(lst_df)){
cur_df <- lst_df[[i]]
cur_camp <- names(cur_df)[2]
cur_df <- cur_df %>%
group_by(ID) %>%
summarise(!!cur_camp := mean(!! sym(cur_camp)))
out[[i]] <- cur_df
}
-output
> out
[[1]]
# A tibble: 2 × 2
ID ERO13
<dbl> <dbl>
1 1 3
2 2 6
[[2]]
# A tibble: 2 × 2
ID ERO17
<dbl> <dbl>
1 4 4.5
2 6 12
Or may use across
out <- vector('list', length(lst_df))
for (i in seq_along(lst_df)){
cur_df <- lst_df[[i]]
cur_camp <- names(cur_df)[2]
cur_df <- cur_df %>%
group_by(ID) %>%
summarise(across(all_of(cur_camp), mean))
out[[i]] <- cur_df
}
-output
> out
[[1]]
# A tibble: 2 × 2
ID ERO13
<dbl> <dbl>
1 1 3
2 2 6
[[2]]
# A tibble: 2 × 2
ID ERO17
<dbl> <dbl>
1 4 4.5
2 6 12
A slightly different approach would be to bind the dataframes and use pivot_longer to separate the erosion name from the erosion value. Then you can take the mean of the values without having to specify the name.
library(tidyverse)
df1 <- data.frame(ID = c(1,1,2), ERO13 = c(2,4,6))
df2 <- data.frame(ID = c(4,4,6), ERO17 = c(4,5,12))
bind_rows(df1, df2) %>%
pivot_longer(starts_with('ERO'),
names_to = 'ERO',
values_drop_na = TRUE) %>%
group_by(ID, ERO) %>%
summarize(value = mean(value))
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups` argument.
#> # A tibble: 4 x 3
#> # Groups: ID [4]
#> ID ERO value
#> <dbl> <chr> <dbl>
#> 1 1 ERO13 3
#> 2 2 ERO13 6
#> 3 4 ERO17 4.5
#> 4 6 ERO17 12
Created on 2022-01-14 by the reprex package (v2.0.0)
As an example, I have the following data frame:
df <- data.frame(a1=1,a2=2,a3=3,b1=1,b2=2,b3=3)
I have a function:
fn <- function(x,y,z) x^y+(z-x)^(y-x)
I want the following:
df <- df %>% mutate(a=fn(a1,a2,a3),b=fn(b1,b2,b3))
The problem is, I have tons of triplets in my dataset, so it is not ideal to write them out one by one.
Here are base R options using:
split.default + lapply + do.call
cbind(
df,
lapply(
split.default(df, gsub("\\d+", "", names(df))),
function(x) do.call(fn, unname(x))
)
)
reshape + lapply + do.call
cbind(
df,
lapply(
subset(
reshape(
setNames(df, gsub("(\\d+)$", "\\.\\1", names(df))),
direction = "long",
varying = 1:length(df)
),
select = -c(time, id)
),
function(x) do.call(fn, as.list(x))
)
)
Output
a1 a2 a3 b1 b2 b3 a b
1 1 2 3 1 2 3 3 3
I would convert df to long format then use lag to create 3 columns then apply fn() on them
library(tidyverse)
df_long <- df %>%
pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)")
df_longer <- df_long %>%
pivot_longer(-c(set),
names_to = "key",
values_to = "val") %>%
arrange(key)
df_longer
#> # A tibble: 6 x 3
#> set key val
#> <chr> <chr> <dbl>
#> 1 1 a 1
#> 2 2 a 2
#> 3 3 a 3
#> 4 1 b 1
#> 5 2 b 2
#> 6 3 b 3
lag then apply fn(), keep only non-NA val_fn
df_longer <- df_longer %>%
group_by(key) %>%
mutate(val_lag1 = lag(val, n = 1),
val_lag2 = lag(val, n = 2)) %>%
mutate(val_fn = fn(val_lag2, val_lag1, val)) %>%
filter(!is.na(val_fn))
df_longer
#> # A tibble: 2 x 6
#> # Groups: key [2]
#> set key val val_lag1 val_lag2 val_fn
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 3 a 3 2 1 3
#> 2 3 b 3 2 1 3
Created on 2020-12-03 by the reprex package (v0.3.0)
I think it would be easier/shorter to combine columns into their separate group and apply the function to each column.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything(),
names_to = '.value',
names_pattern = '([a-z]+)') %>%
summarise(across(.fns = ~do.call(fn, as.list(.)))) -> result
result
# a b
# <dbl> <dbl>
#1 3 3
You can bind the result to your original dataset if needed.
bind_cols(df, result)
# a1 a2 a3 b1 b2 b3 a b
#1 1 2 3 1 2 3 3 3