Related
I have the following data structure:
country age sex
x 10 m
y 20 f
x 12 m
y 40 m
I want to group my data according to the country and get a calculation of the percentage of my sex variable according to it, resulting in some table like this:
country mean_age percent_m percent_f
x 11 100% 0%
y 20 50% 50%
Thank you in advance!
A possible solution:
library(tidyverse)
df %>%
mutate(age = as.character(age)) %>%
pivot_longer(-country) %>%
pivot_wider(country, values_fn = ~ {if (any(str_detect(.x, "\\d")))
mean(as.numeric(.x)) else proportions(table(.x))["m"]}) %>%
mutate(mean_age = age, percent_m = sex, percent_f = 1- percent_m, age = NULL,
sex = NULL)
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5
Another possible solution:
library(tidyverse)
inner_join(
df %>%
mutate(name = "mean_age") %>%
pivot_wider(country, values_from = age, values_fn = mean),
df %>%
mutate(name = "percent_m") %>%
pivot_wider(country, values_from = sex,
values_fn = ~ proportions(table(.x))["m"])) %>%
mutate(percent_f = 1 - percent_m)
#> Joining, by = "country"
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5
library(tidyverse)
df <- data.frame(country = c("x", "x", "x", "x", "x", "y", "y", "y"),
age = c(10, 20, 12, 40, 23, 17, 21, 19),
sex = c("m", "f", "f", "m", "f", "m", "f", "f"))
df1 <- df %>%
pivot_wider(names_from = sex, values_from = sex) %>%
group_by(country) %>%
summarise(age_mean = mean(age),
m = length(na.omit(m)),
f = length(na.omit(f))) %>%
mutate(m_perc = (m / (m + f)) * 100,
f_perc = (f / (m + f)) * 100)
> df1
# A tibble: 2 x 6
country age_mean m f m_perc f_perc
<chr> <dbl> <int> <int> <dbl> <dbl>
1 x 21 2 3 40 60
2 y 19 1 2 33.3 66.7
I would like to reassign a given records to a single group if the records are duplicated. In the below dataset I would like to to have 12-4 all being assigned to group A or B but not both. Is there a way to go abou it?
library(tidyverse)
dat <- tibble(
group = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
assigned = c("12-1", "12-2", "12-3", "12-4", "12-4", "12-5", "12-6",
"12-7", "12-8")
)
# Attempts to tease out records for each group
dat %>% pivot_wider(names_from = group, values_from = assigned)
You can group by record and reassign all to the same group, chosen at random from the available groups:
dat %>%
group_by(assigned) %>%
mutate(group = nth(group, sample(n())[1])) %>%
ungroup()
#> # A tibble: 9 x 2
#> group assigned
#> <chr> <chr>
#> 1 A 12-1
#> 2 A 12-2
#> 3 A 12-3
#> 4 A 12-4
#> 5 A 12-4
#> 6 B 12-5
#> 7 B 12-6
#> 8 B 12-7
#> 9 B 12-8
library(tidyverse)
dat <- tibble(
group = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
assigned = c(
"12-1", "12-2", "12-3", "12-4", "12-4", "12-5", "12-6",
"12-7", "12-8"
)
)
dat %>%
select(-group) %>%
left_join(
dat %>%
left_join(dat %>% count(group)) %>%
# reassign to the smallest group
arrange(n) %>%
select(-n) %>%
distinct(assigned, .keep_all = TRUE)
)
#> Joining, by = "group"
#> Joining, by = "assigned"
#> # A tibble: 9 × 2
#> assigned group
#> <chr> <chr>
#> 1 12-1 A
#> 2 12-2 A
#> 3 12-3 A
#> 4 12-4 A
#> 5 12-4 A
#> 6 12-5 B
#> 7 12-6 B
#> 8 12-7 B
#> 9 12-8 B
Created on 2022-04-04 by the reprex package (v2.0.0)
I have the following dataset:
df <- structure(list(var = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b"),
beta_2 = c(-0.0441739987111475, -0.237256549142376, -0.167105040977351,
-0.140660549127359, -0.0623609020878716, -0.279740636040755,
-0.0211523654970921, 0.135368375550385, -0.0612770247281429,
-0.13183964102725, 0.363736380163624, -0.0134490092107583,
-0.0179957210095045, -0.00897746346470879, -0.0588242539401108,
-0.0571976057977875, -0.0290052449275881, 0.263181562031473,
0.00398338217426211, 0.0945495450635497), beta_3 = c(8.54560737016843e-05,
-0.0375859675101865, -0.0334219898732454, 0.0332275634691021,
6.41499442849741e-05, -0.0200724300602369, 8.046644459034e-05,
0.0626880671346749, 0.066218613897726, 0.0101268565262127,
0.44671567722757, 0.180543425234781, 0.526177616390516, 0.281245231195401,
-0.0362628519010746, 0.0609803646123324, 0.104137160504616,
0.804375133555955, 0.211218123083386, 0.824756942938928),
beta_4 = c(-8.50289708803184e-06, 0.0376601781861706, 0.104418586040791,
-0.0949557776511923, 2.11896613386966e-05, 0.0969765824620132,
4.95280289930771e-06, -0.0967836292162074, -0.132623370126544,
0.0579395551175153, -0.140392004360494, 0.00950912868877355,
-0.388317615535003, -0.0282634228070272, 0.0547116932731301,
0.0119441792873249, -0.0413015877795695, -0.720387490330028,
-0.0321860166581817, -0.627489324697221)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L))
df
# # A tibble: 20 × 4
# var beta_2 beta_3 beta_4
# <chr> <dbl> <dbl> <dbl>
# 1 a -0.0442 0.0000855 -0.00000850
# 2 a -0.237 -0.0376 0.0377
# 3 a -0.167 -0.0334 0.104
# 4 a -0.141 0.0332 -0.0950
# 5 a -0.0624 0.0000641 0.0000212
# ...
I would like to summarise each beta_ column grouped by var so that I have the mean of beta_2, beta_3 and beta_4 in a single cell.
I can do it with the following code:
df %>%
pivot_longer(!var) %>%
group_by(var, name) %>%
summarise(mean_beta = mean(value) %>% round(2), .groups = "drop") %>%
aggregate(mean_beta ~ var, ., function(x) paste0(x, collapse = ", ")) %>%
as_tibble()
# # A tibble: 2 × 2
# var mean_beta
# <chr> <chr>
# 1 a -0.1, 0.01, 0
# 2 b 0.05, 0.34, -0.19
I'm looking for a more straightforward, tidyverse-only solution. I have tried using map inside summarise but couldn't get what I wanted. Any idea?
You may do the following -
library(dplyr)
df %>%
group_by(var) %>%
summarise(mean_beta = cur_data() %>%
summarise(across(.fns =
~.x %>% mean(na.rm = TRUE) %>% round(2))) %>%
unlist() %>% toString())
# var mean_beta
# <chr> <chr>
#1 a -0.1, 0.01, 0
#2 b 0.05, 0.34, -0.19
cur_data() provides the sub-data within each group as dataframe that can be summarised for each column and concatenated together.
Another possible solution:
library(tidyverse)
df %>%
group_by(var) %>%
summarise(across(everything(), mean)) %>%
{bind_cols(var=.$var, mean_betas=apply(., 1, \(x) str_c(x[-1], collapse = ", ")))}
#> # A tibble: 2 × 2
#> var mean_betas
#> <chr> <chr>
#> 1 a "-0.10101983, 0.008141079, -0.002735024"
#> 2 b " 0.05400016, 0.340388682, -0.190217246"
I'm taking the mean, 3 by 3, by grouping. For that, I'm using the summarise function. In this context I would like to select the last date from the four that make up the average.
I tried to select the maximum, but this way I'm just selecting the highest date for the whole group.
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
measure = c(10, 20, 5, 2, 62 ,2, 5, 4, 6, 7, 25),
time= c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
"11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
# > test
# my_groups measure time
# 1 A 10 20-09-2020
# 2 A 20 25-09-2020
# 3 A 5 19-09-2020
# 4 B 2 20-05-2020
# 5 B 62 20-06-2021
# 6 C 2 11-01-2021
# 7 C 5 13-01-2021
# 8 C 4 13-01-2021
# 9 A 6 15-01-2021
# 10 A 7 15-01-2021
# 11 A 25 19-01-2021
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
final_data = max(time))
# my_groups mean_3 final_data
# <chr> <dbl> <chr>
# 1 A 12.7 25-09-2020
# 2 A 11.7 25-09-2020
# 3 C 3.67 13-01-2021
In the second line I wish the date was 19-01-2021, and not the global maximum of group A, (25-09-2020).
Any hint on how I could do that?
I have 2 dplyr ways for you. Not happy with it because when the rollapply with max and dates doesn't find anything it in group B it uses a double by default which doesn't match the characters from group A and C.
Mutate:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
mutate(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
filter(!is.na(final)) %>%
select(my_groups, final, mean_3) %>%
arrange(my_groups)
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
Summarise that doesn't summarise, but is a bit cleaner in code:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
filter(!is.na(final))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
Edit:
Added isa's solution from comment. Partial = TRUE does the trick:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
final_data = rollapply(time, 3, max, by = 3, align = "left", partial = T))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups mean_3 final_data
<chr> <dbl> <chr>
1 A 12.7 19-01-2021
2 A 11.7 25-09-2020
3 C 3.67 13-01-2021
Another possible solution:
library(tidyverse)
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
measure = c(10, 20, 5, 2, 62 ,2, 5, 4, 6, 7, 25),
time= c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
"11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
test %>%
group_by(data.table::rleid(my_groups)) %>%
filter(n() == 3) %>%
summarise(
groups = unique(my_groups),
mean_3 = mean(measure), final_data = max(time), .groups = "drop") %>%
select(-1)
#> # A tibble: 3 × 3
#> groups mean_3 final_data
#> <chr> <dbl> <chr>
#> 1 A 11.7 25-09-2020
#> 2 C 3.67 13-01-2021
#> 3 A 12.7 19-01-2021
EDIT
To allow for calculation of mean of 2 values, as asked for in a comment below by the OP, I revised my code, using data.table::frollmean and data.table::frollapply:
library(tidyverse)
library(lubridate)
library(data.table)
n <- 2 # choose the number with which to calculate the mean
test %>%
group_by(rleid(my_groups)) %>%
summarise(
groups = unique(my_groups),
mean_n = frollmean(measure, n), final_data = frollapply(dmy(time), n, max) %>%
as_date(origin = lubridate::origin), .groups = "drop") %>%
drop_na(mean_n) %>% select(-1)
#> # A tibble: 7 × 3
#> groups mean_n final_data
#> <chr> <dbl> <date>
#> 1 A 15 2020-09-25
#> 2 A 12.5 2020-09-25
#> 3 B 32 2021-06-20
#> 4 C 3.5 2021-01-13
#> 5 C 4.5 2021-01-13
#> 6 A 6.5 2021-01-15
#> 7 A 16 2021-01-19
I have written a function that is working fine when I enter the input variables as data frames. But then when I want to use pmap to enter the inputs as a list of data frames I get the following error:
Error in UseMethod("filter_") : no applicable method for 'filter_' applied to an object of class "character"
Here is the data and the first part of the function that causes the error, I use y and a parameters in parts of the function that is not shown here:
x <- tibble::tibble(x1 = sample(0:1, 8, replace = TRUE),
x2 = sample(0:25, 8, replace = FALSE),
x3 = sample(1:3, 8, replace = TRUE),
strata =c("a", "b", "c", "d", "a", "b", "c", "d"))
y <- tibble::tibble(rate = sample(0:1, 8, replace = TRUE),
strata =c("a", "b", "c", "d", "a", "b", "c", "d") )
a <- tibble::tibble(sample(10:80, 4, replace = FALSE))
example <- function(x, y, a , d){
CR <- x %>% filter(x1, x2>0) %>%
group_by(x3) %>%
summarise(avg_revenue = mean(x2), revenue = sum(x2))
return(CR)
}
example(x,y,a, d = 0.1)
but when I call the pmap on this function:
df <- tibble::tibble(x = x %>% group_by(strata) %>% nest(),
y = y %>% group_by(strata) %>% nest(),
a = a)
pmap(df, example, d= 0.1)
I get the error mentioned above.
I do not believe that df is creating the df you want it to be creating. I believe this does what you want...if I am understanding the question correctly. However y is not used anywhere in your function, so I am unclear what its purpose is. I believe there is also a better way to do this using map and nest but again I'm not sure what you are trying to do.
library(tidyverse)
x <- tibble::tibble(x1 = sample(0:1, 8, replace = TRUE),
x2 = sample(0:25, 8, replace = FALSE),
x3 = sample(1:3, 8, replace = TRUE),
strata =c("a", "b", "c", "d", "a", "b", "c", "d"))
y <- tibble::tibble(rate = sample(0:1, 8, replace = TRUE),
strata =c("a", "b", "c", "d", "a", "b", "c", "d") )
a <- tibble::tibble(a = sample(10:80, 4, replace = FALSE))
example <- function(x, y, a , d){
CR <- x %>% filter(x1, x2>0) %>%
group_by(x3) %>%
summarise(avg_revenue = mean(x2), revenue = sum(x2))
return(CR)
}
example(x,y,a, d = 0.1)
#> # A tibble: 1 x 3
#> x3 avg_revenue revenue
#> <int> <dbl> <int>
#> 1 1 5 10
df <- bind_cols(x, select(y, rate)) %>%
group_by(strata) %>%
nest(x = c(x1, x2, x3),
y = c(rate)) %>%
bind_cols(a) %>% ungroup()
pmap(select(df, -strata), example)
#> [[1]]
#> # A tibble: 0 x 3
#> # … with 3 variables: x3 <int>, avg_revenue <dbl>, revenue <int>
#>
#> [[2]]
#> # A tibble: 0 x 3
#> # … with 3 variables: x3 <int>, avg_revenue <dbl>, revenue <int>
#>
#> [[3]]
#> # A tibble: 1 x 3
#> x3 avg_revenue revenue
#> <int> <dbl> <int>
#> 1 1 4 4
#>
#> [[4]]
#> # A tibble: 1 x 3
#> x3 avg_revenue revenue
#> <int> <dbl> <int>
#> 1 1 6 6
pmap_dfr(select(df, -strata), example, d = 0.1, .id = 'strata')
#> # A tibble: 2 x 4
#> strata x3 avg_revenue revenue
#> <chr> <int> <dbl> <int>
#> 1 3 1 4 4
#> 2 4 1 6 6
Created on 2019-12-17 by the reprex package (v0.3.0)
This error arises, as CLedbetter has also mentioned in their helpful answer, when the input data-frame to pmap,df, is not in the correct format. pmap expects df to only have columns that is known to the function it is operating on.
To that end I edited the df with an inner_join and then still we have the column strata that is not known to the function example().
As it is mentioned in the help of pmap function in R in order to make pmap function ignore the column that is not used by function example(),
I used "..." in the definition of example() so that pmap can skip the first column of the data-frame, strata, that is not used in the function.
So the updated code will be:
x <- tibble::tibble(x1 = sample(0:1, 8, replace = TRUE),
x2 = sample(0:25, 8, replace = FALSE),
x3 = sample(1:3, 8, replace = TRUE),
strata =c("a", "b", "c", "d", "a", "b", "c", "d"))
y <- tibble::tibble(rate = sample(0:1, 8, replace = TRUE),
strata =c("a", "b", "c", "d", "a", "b", "c", "d") )
a <- tibble::tibble(sample(10:80, 4, replace = FALSE))
# Note the addition of the "..." to the function input definition
example <- function(x, y, a , d, ...){
CR <- x %>% filter(x1, x2>0) %>%
group_by(x3) %>%
summarise(avg_revenue = mean(x2), revenue = sum(x2))
return(CR)
}
example(x,y,a, d = 0.1)
# Note the change in the reformatting of df with an inner_join
df <- inner_join(x %>% group_by(strata) %>% nest(),
y %>% group_by(strata) %>% nest(),
by = "strata") %>% rename(x = data.x, y = data.y )
# with these changes pmap produces the output
pmap(df, example, d= 0.1)