I have the following dataset:
# Example data: three observations for each of the years 2000-2002.
# The value columns repeat the same three numbers within every year.
df_x <- data.frame(
  year = rep(c(2000, 2001, 2002), each = 3),
  a = rep(c(7, 3, 5), times = 3),
  b = rep(c(5, 8, 1), times = 3),
  c = rep(c(8, 4, 3), times = 3)
)
and this vector:
# Names of the columns to summarise.
v <- letters[1:3]
Now I want to create a new dataset and summarise a, b, and c by creating new variables (y_a, y_b, and y_c) that calculate the mean of each variable grouped by year.
The code for doing this is the following:
# Per-year mean of each value column, with every output column written out
# by hand. Missing values are ignored inside each mean.
y <- df_x %>%
  group_by(year) %>%
  dplyr::summarise(
    y_a = mean(a, na.rm = TRUE),
    y_b = mean(b, na.rm = TRUE),
    y_c = mean(c, na.rm = TRUE)
  )
However, I want to use the vector v to read the respective variable from it and paste it into the summarise function:
# NOTE: this attempt does not parse. An argument name must be a plain name,
# so a call such as as.name(...) cannot appear on the left-hand side of `=`
# (see the error message quoted below).
# NOTE(review): all three right-hand sides read v[1]; presumably v[2] and
# v[3] were intended on the second and third lines.
y <- df_x %>% group_by(year) %>% dplyr::summarise(as.name(paste0("y_", v[1])) = mean(as.name(v[1]), na.rm = TRUE),
as.name(paste0("y_", v[2])) = mean(as.name(v[1]), na.rm = TRUE),
as.name(paste0("y_", v[3])) = mean(as.name(v[1]), na.rm = TRUE))
Doing so, I receive the following error message:
Error: unexpected '=' in "y <- df_x %>% group_by(year) %>% dplyr::summarise(as.name(paste0("y_", v[1])) ="
How can I paste the value of a vector in this summarise function so that it works?
To define a new variable on the left hand side, you need := instead of =. Because you create its name with paste0, you need !! to inject the expression and make sure that it is correctly evaluated. To access existing columns in dplyr with a string stored in a variable, using .data is the easiest way.
library(dplyr)
# Example data; the length-3 value columns are recycled to fill all nine rows.
df_x <- data.frame(
  year = c(2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002),
  a = c(7, 3, 5),
  b = c(5, 8, 1),
  c = c(8, 4, 3)
)
v <- c("a", "b", "c")

# `!!paste0(...) :=` injects the computed output name, and `.data[[...]]`
# looks up the input column by its string name. Each output column must read
# its own element of `v`; reading v[1] everywhere would report the mean of
# `a` three times.
df_x %>%
  group_by(year) %>%
  dplyr::summarise(
    !!paste0("y_", v[1]) := mean(.data[[v[1]]], na.rm = TRUE),
    !!paste0("y_", v[2]) := mean(.data[[v[2]]], na.rm = TRUE),
    !!paste0("y_", v[3]) := mean(.data[[v[3]]], na.rm = TRUE)
  )
#> # A tibble: 3 × 4
#>    year   y_a   y_b   y_c
#>   <dbl> <dbl> <dbl> <dbl>
#> 1  2000     5  4.67     5
#> 2  2001     5  4.67     5
#> 3  2002     5  4.67     5
Created on 2022-12-21 by the reprex package (v1.0.0)
Here is a one-liner via base R,
# Base R: per-year mean of the columns named in `v`, ignoring NAs inside
# each mean. The output columns keep their original names.
aggregate(
  . ~ year,
  data = cbind.data.frame(year = df_x$year, df_x[v]),
  FUN = function(x) mean(x, na.rm = TRUE)
)
year a b c
1 2000 5 4.666667 5
2 2001 5 4.666667 5
3 2002 5 4.666667 5
It would be easier with across and modifying the names with .names
library(dplyr)
# Summarise every column listed in `v` in one call; `.names` builds the
# output names by prefixing each input column name with "y_".
df_x %>%
  group_by(year) %>%
  summarise(
    across(
      .cols = all_of(v),
      .fns = function(x) mean(x, na.rm = TRUE),
      .names = "y_{.col}"
    )
  )
-output
# A tibble: 3 × 4
year y_a y_b y_c
<dbl> <dbl> <dbl> <dbl>
1 2000 5 4.67 5
2 2001 5 4.67 5
3 2002 5 4.67 5
Related
I have a data frame. Each row is a separate person. I need to create a data frame that only shows the latest "date" and "salary" per row. Below is an example of the data frame I'm starting with:
# Example input: one row per person with up to three dated salary records;
# NA marks a record that does not exist.
# NOTE(review): the first salary column is spelled `salaary1` (double "a"),
# unlike `salary2` and `salary3`; code referring to it must use that spelling.
example_df <- tribble(
~person_id, ~date1, ~date2, ~date3, ~salaary1, ~salary2, ~salary3,
1, 2010, 2013, 2015, 100, 200, 300,
2, 1998, NA, NA, 50, NA, NA,
3, 2000, 2001, NA, 100, 200, NA,
4, 1987, 1989, 2005, 50, 300, 500
)
This is what I need the data frame to look like after processing:
# Desired output: one row per person holding a single date and salary.
# NOTE(review): the salary column is spelled `salaary` here (double "a").
example_clean_df <- tribble(
~person_id, ~date, ~salaary,
1, 2015,300,
2, 1998, 50,
3, 2001, 200,
4, 2005, 500
)
Any ideas would be super helpful. Thank you!
Does this work:
library(dplyr)
# For every person, take the largest non-missing date and the largest
# non-missing salary across the numbered columns.
example_df %>%
  rowwise() %>%
  mutate(
    date = max(date1, date2, date3, na.rm = TRUE),
    # `salaary1` (double "a") is the column name as spelled in example_df.
    salary = max(salaary1, salary2, salary3, na.rm = TRUE)
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup() %>%
  select(person_id, date, salary)
#> # A tibble: 4 × 3
#>   person_id  date salary
#>       <dbl> <dbl>  <dbl>
#> 1         1  2015    300
#> 2         2  1998     50
#> 3         3  2001    200
#> 4         4  2005    500
Use pivot_longer and slice_max:
library(dplyr)
library(tidyr)
# Reshape to one row per (person, record number), then keep one row per person.
example_df %>%
  # example_df spells its first salary column `salaary1`; normalise it so the
  # name pattern below matches all three salary columns. This is a no-op when
  # the column is already spelled `salary1`.
  rename_with(~ sub("^salaary", "salary", .x)) %>%
  pivot_longer(
    -person_id,
    names_pattern = "(date|salary)(\\d)",
    names_to = c(".value", "number")
  ) %>%
  group_by(person_id) %>%
  # NOTE(review): this keeps the row with the highest salary, which coincides
  # with the latest date in the example data. If salaries can decrease over
  # time, slice_max(date) is presumably what is wanted - confirm.
  slice_max(salary) %>%
  select(-number)
output
# A tibble: 4 × 3
# Groups: person_id [4]
person_id date salary
<dbl> <dbl> <dbl>
1 1 2015 300
2 2 1998 50
3 3 2001 200
4 4 2005 500
The primitive base r / for loop version:
# Base R loop: build one single-row result per person, then stack the rows.
# The list is preallocated, and seq_len() yields an empty sequence for a
# zero-row input instead of the c(1, 0) that seq() would produce.
result.df <- vector("list", nrow(example_df))
for (i in seq_len(nrow(example_df))) {
  result.df[[i]] <- cbind(
    example_df[1][i, ],
    max(example_df[, 2:4][i, ], na.rm = TRUE),  # largest non-missing date
    max(example_df[, 5:7][i, ], na.rm = TRUE)   # largest non-missing salary
  )
}
result.df <- setNames(
  do.call(rbind, result.df),
  c("person_id", "date", "salary")
)
person_id date salary
1 1 2015 300
2 2 1998 50
3 3 2001 200
4 4 2005 500
PS:
microbenchmark suggests using @Karthik S's method (rowwise mutate), as it is the fastest of the three.
# test1: loop base R
# test2: rowwise mutate
# test3: pivot_longer
min lq mean median uq max neval
9.9957 10.41815 11.858530 10.86845 11.97645 21.8334 100
7.6594 7.96195 9.457389 8.29315 9.49365 25.9524 100
12.0949 12.49685 14.080567 12.83050 13.85300 26.6272 100
PPS:
using lapply speeds up the process:
# Same per-row computation as a for loop would do, expressed with lapply().
# seq_len() keeps the iteration empty for a zero-row input, and TRUE is
# spelled out because `T` can be reassigned.
result.df <- lapply(
  seq_len(nrow(example_df)),
  \(x) cbind(
    example_df[1][x, ],
    max(example_df[, 2:4][x, ], na.rm = TRUE),
    max(example_df[, 5:7][x, ], na.rm = TRUE)
  )
) %>%
  do.call(rbind, .) %>%
  setNames(c("person_id", "date", "salary"))
min lq mean median uq max neval
3.6828 3.89075 5.754244 4.41195 7.14130 14.0325 100
I want to calculate the proportion of a variable in subgroups compared to the proportion of the whole dataset. The subgroups are based on binary columns. I want to filter the dataframe for each column, count the grouping variable and calculate the proportions. To compare the proportions, I calculate an index value which is 100*prop_subgroup/prop_overall.
I tried and failed to do this with map. Below is a for-loop and a lot of detours to achieve this, and I'm looking for some help to clean up this code and solve this "the tidyverse way". Thank you!
# Simulated data: a grouping column plus five variables with missing values.
# No seed is set, so every run draws different values.
data <- data.frame(
  group = sample(c(LETTERS[1:6], NA), size = 1000, replace = TRUE),
  v1 = sample(c(0, 1, NA), size = 1000, replace = TRUE),
  v2 = sample(c(0, 1, 2, 3, 4, NA), size = 1000, replace = TRUE),
  v3 = sample(c(0, 1, NA), size = 1000, replace = TRUE,
              prob = c(0.05, 0.05, 0.9)),
  v4 = sample(c(0, 1, NA), size = 1000, replace = TRUE,
              prob = c(0.8, 0.1, 0.1)),
  v5 = sample(c("a", 1, NA), size = 1000, replace = TRUE,
              prob = c(0.8, 0.1, 0.1))
)
Calculate the prop.table
# Overall share of each non-missing group level.
result <- data %>%
  count(group) %>%
  na.omit() %>%
  transmute(group, prop = n / sum(n))
Select binary columns
# Keep the numeric columns whose largest non-missing value is 1, and put the
# grouping column back in front of them.
data_binary <- data %>%
  select(where(is.numeric)) %>%
  select(where(function(col) max(col, na.rm = TRUE) == 1)) %>%
  bind_cols(select(data, group), .)
Very ugly piece of code to calculate the frequencies for each group. A left join is used because some subgroups do not contain all grouping variables. The key piece I failed to do with map is the filtering based on one column and the count of another column, applied to all binary columns.
# For every binary column, keep the rows where it equals 1, compute the share
# of each group within that subgroup, and attach it to `result`.
# Iterating over the column names (instead of 2:ncol()) is safe when there is
# no binary column at all.
for (name in names(data_binary)[-1]) {
  subgroup_prop <- data_binary %>%
    filter(.data[[name]] == 1) %>%
    count(group) %>%
    na.omit() %>%
    # "{name}" interpolates the string held in `name`, giving e.g. v1_index.
    # "{{name}}" would embed the quotes, giving `"v1"_index`.
    transmute(group, "{name}_index" := n / sum(n))
  # Left join because a subgroup may not contain every group level.
  result <- left_join(result, subgroup_prop, by = "group")
}
Calculate index based on the frequencies
# Express every subgroup share as an index relative to the overall share
# (100 = same share as in the whole data set).
index <- result %>%
  select(-c("prop", "group")) %>%
  mutate(across(everything(), function(x) 100 * x / result$prop)) %>%
  bind_cols(select(result, group), .)
Result
group "v1"_index "v3"_index "v4"_index
1 A 79.90019 16.21418 60.54443
2 B 91.31450 97.28507 87.45307
3 C 114.26996 122.50712 95.30142
4 D 96.63614 175.24198 109.06017
5 E 100.08550 116.05938 126.39978
6 F 116.70123 62.55683 116.79493
I think you can accomplish this with a group_by, summarize to get counts and group_by, mutate to calculate fractions. However, I don't produce the same result so perhaps I don't understand exactly how you want to calculate the fractions (sum only the ones?)
# Simulated data: a grouping column plus five variables with missing values.
# No seed is set, so every run draws different values.
data <- data.frame(
  group = sample(c(LETTERS[1:6], NA), size = 1000, replace = TRUE),
  v1 = sample(c(0, 1, NA), size = 1000, replace = TRUE),
  v2 = sample(c(0, 1, 2, 3, 4, NA), size = 1000, replace = TRUE),
  v3 = sample(c(0, 1, NA), size = 1000, replace = TRUE,
              prob = c(0.05, 0.05, 0.9)),
  v4 = sample(c(0, 1, NA), size = 1000, replace = TRUE,
              prob = c(0.8, 0.1, 0.1)),
  v5 = sample(c("a", 1, NA), size = 1000, replace = TRUE,
              prob = c(0.8, 0.1, 0.1))
)
library(tidyverse)
# counts and fractions for each combination of group and variable
# Counts and fractions for each combination of group and binary variable.
data_long <- data %>%
  as_tibble() %>%
  # Keep `group` plus every column whose largest non-missing value is 1.
  select(group, where(function(col) max(col, na.rm = TRUE) == 1)) %>%
  # One row per (group, variable, value); missing values are dropped.
  pivot_longer(cols = -group) %>%
  drop_na(value) %>%
  # Number of ones per group and variable ...
  group_by(group, name) %>%
  summarize(count = sum(value), .groups = "drop") %>%
  # ... and each variable's share of the ones within its group.
  group_by(group) %>%
  mutate(fraction = count / sum(count))
print(data_long)
#> # A tibble: 21 x 4
#> # Groups: group [7]
#> group name count fraction
#> <chr> <chr> <dbl> <dbl>
#> 1 A v1 61 0.693
#> 2 A v3 7 0.0795
#> 3 A v4 20 0.227
#> 4 B v1 54 0.659
#> 5 B v3 10 0.122
#> 6 B v4 18 0.220
#> 7 C v1 45 0.75
#> 8 C v3 4 0.0667
#> 9 C v4 11 0.183
#> 10 D v1 48 0.716
#> # ... with 11 more rows
# pivot wider on fractions to get output in desired form
# One row per group and one column per variable, filled with the fractions.
data_wide <- pivot_wider(
  data_long,
  id_cols = group,
  names_from = name,
  values_from = fraction
)
print(data_wide)
#> # A tibble: 7 x 4
#> # Groups: group [7]
#> group v1 v3 v4
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.693 0.0795 0.227
#> 2 B 0.659 0.122 0.220
#> 3 C 0.75 0.0667 0.183
#> 4 D 0.716 0.0896 0.194
#> 5 E 0.707 0.0690 0.224
#> 6 F 0.677 0.154 0.169
#> 7 <NA> 0.725 0.0980 0.176
Created on 2022-03-31 by the reprex package (v2.0.1)
I want to use dplyr programming syntax (combine !! and :=) to evaluate a function in .fn argument but failed.
The code like this:
library(zoo)
library(glue)
# Example tibble: four regions, each with three "*_mean" columns set to 5.
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# For each window size i, try to add lagged and rolling-mean versions of every
# "*_mean" column, naming the functions dynamically (lag1/lag01, lag2/lag02, ...).
# NOTE(review): `!!name :=` presumably only works in functions that support
# dynamic dots, which base list() does not - confirm.
for (i in 1:3) {
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa = aa %>% group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = list(!!fun_name_1 := ~lag(., i), # ERROR OCCURS HERE
!!fun_name_2 := ~ rollmeanr(., i)),
.names = '{.col}_{.fn}'))
aa
}
I don't know how to solve it.
Any help will be highly appreciated!
======UPDATE========
My new code and new ERROR:
library(zoo)
library(glue)
# Example tibble: four regions, each with three "*_mean" columns set to 5.
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# Second attempt: the functions are passed as a list named via setNames().
# The pipeline result is not assigned, and a bare `aa` inside a for loop is
# not auto-printed, so the loop is run only for the error it raises (below).
for (i in 1:3) {
# i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
aa
}
# Error: Problem with `mutate()` input `..1`.
# x 'names' attribute [6] must be the same length as the vector [5]
# i Input `..1` is `across(...)`.
# i The error occurred in group 1: region = 1.
# Run `rlang::last_error()` to see where the error occurred.
It would work as a named list. It makes perfect sense to pass a group by first (assuming that the OP's original example data have multiple rows per group)
# Build the function names for window size i, then hand across() a named
# list so that "{.fn}" in `.names` expands to those names.
i <- 1
fun_name_1 <- glue("lag{i}")
fun_name_2 <- glue("lag0{i}")
aa %>%
  group_by(region) %>%
  mutate(
    across(
      .cols = contains("mean"),
      .fns = setNames(
        list(~ lag(.x, i), ~ rollmeanr(.x, i)),
        c(fun_name_1, fun_name_2)
      ),
      .names = "{.col}_{.fn}"
    )
  )
-output
# A tibble: 4 x 10
# Groups: region [4]
# region co_mean o3_mean pm2.5_mean co_mean_lag1 co_mean_lag01 o3_mean_lag1 o3_mean_lag01 pm2.5_mean_lag1 pm2.5_mean_lag01
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 5 5 5 NA 5 NA 5 NA 5
#2 2 5 5 5 NA 5 NA 5 NA 5
#3 3 5 5 5 NA 5 NA 5 NA 5
#4 4 5 5 5 NA 5 NA 5 NA 5
Could specify the fill = TRUE in rollmean
# Pad the rolling mean so it has the same length as its input; without
# padding, a window wider than one value returns a shorter vector.
# `fill = NA` marks the padded positions as missing. `fill = TRUE` would pad
# with TRUE, i.e. the number 1, which would look like real data.
aa %>%
  group_by(region) %>%
  mutate(
    across(
      .cols = contains("mean"),
      .fns = setNames(
        list(~ lag(.x, i), ~ rollmeanr(.x, i, fill = NA)),
        c(fun_name_1, fun_name_2)
      ),
      .names = "{.col}_{.fn}"
    )
  )
First I don't think your data should be grouped, at least for the data shared it doesn't make sense to have only 1 row in the group and then calculate lag value and rolling mean on it.
You can have appropriate column names using .names in across and use map_dfc to combine everything into one dataframe.
library(dplyr)
library(purrr)
library(zoo)
# For each window size, compute the lagged and rolling-mean versions of every
# "*_mean" column and bind the three sets of new columns side by side.
# sprintf() appends the window size to the glue template used by `.names`.
map_dfc(1:3, function(x) {
  aa %>%
    transmute(
      across(
        .cols = contains("mean"),
        .fns = list(
          lag = ~ lag(.x, x),
          lag0 = ~ rollmeanr(.x, x, fill = NA)
        ),
        .names = sprintf("{.fn}_{.col}_%d", x)
      )
    )
})
You can add group_by(region) if you are trying it on another dataset.
I am trying to filter all records that contain systematic NA across a subset of numerical columns.
Here is a toy example.
library(tidyverse)
# Toy data: one row per (year, category, other) observation. `rank` is NA for
# every category "c" row and for the final row, whose `category` is also NA.
df <- tibble(
year = c(2001, 2002, 2003, 2001, 2002, 2003, 2001),
rank = c(12, 25, 65, NA, NA, NA, NA),
category = c("a", "a", "b", "c", "c", "c", NA),
other = c("x", "y", "x", "y", "x", "y", "x")
)
# NOTE: this attempt fails. filter() treats `.cols =` and `.fns =` as named
# filtering expressions, not as a column selection plus a function (see the
# error message quoted below).
df %>%
pivot_wider(names_from = year, values_from = rank) %>%
filter(.cols = c(2001:2003),
.fns = ~ !is.na(.x))
This code doesn't work, it cannot recognize the columns 2001:2003 - What is the correct way to basically remove the third record by filtering all NA for columns 2001, 2002, 2003.
The columns range needs to be specified as a range similarly as I do 2001:2003.
The record "c" should be filtered out, but not the record where all columns are NA (the last one in the toy example)
This is the error I am getting :
Error: Problem with filter() input ..1. x Input ..1 is named. ℹ
This usually means that you've used = instead of ==. ℹ Did you
mean .cols == c(2001:2003)?
You can filter by columns. Use across
# Keep the rows that have at least one non-missing value in the year columns.
df %>%
  pivot_wider(names_from = year, values_from = rank) %>%
  filter(if_any(`2001`:`2003`, function(x) !is.na(x)))
Output
# A tibble: 3 x 5
category other `2001` `2002` `2003`
<chr> <chr> <dbl> <dbl> <dbl>
1 a x 12 NA NA
2 a y NA 25 NA
3 b x NA NA 65
To my knowledge, you cannot filter by columns. You can only select by columns. Here is what I would do.
# Go back to long form over the year columns and drop every record
# (category/other combination) whose year values are all missing.
df %>%
  pivot_wider(names_from = year, values_from = rank) %>%
  # Select the year columns by name: positions 2:4 would pick up the
  # character column `other` and miss `2003`.
  pivot_longer(`2001`:`2003`) %>%
  # Group by record, not by year column: grouping by `name` and requiring no
  # NA at all would remove every year column that has any missing value.
  group_by(category, other) %>%
  filter(!all(is.na(value))) %>%
  ungroup()
It may be better to filter before doing the pivot_wider
library(dplyr)
library(tidyr)
# Drop the records whose rank is missing in every year while the data are
# still long, then reshape to one column per year.
df %>%
  group_by(category, other) %>%
  filter(!all(is.na(rank))) %>%
  ungroup() %>%
  pivot_wider(names_from = year, values_from = rank)
-output
# A tibble: 3 x 5
# category other `2001` `2002` `2003`
# <chr> <chr> <dbl> <dbl> <dbl>
#1 a x 12 NA NA
#2 a y NA 25 NA
#3 b x NA NA 65
I have a data frame with duplicated IDs. An ID stands for a specific entity. The IDs are duplicated because the dataset refers to a process that every entity can go through multiple times.
Here is a small example dat:
library(dplyr)
glimpse(dat)
Observations: 6
Variables: 3
$ ID <dbl> 1, 1, 1, 2, 2, 2
$ Amount <dbl> 10, 70, 80, 50, 10, 10
$ Product <fct> A, B, C, B, E, A
ID stands for the entity, Amount stands for the amount of money the entity has spent and Product stands for the good the entity bought.
The issue is that I have to "condense" this data. So, every ID / entity may occur only once. For the continuous variable, this is not an issue because I can simply calculate the mean per ID.
library(tidyr)
# One row per ID ...
dat_con_ID <- unique(select(dat, ID))
# ... the mean Amount per ID ...
dat_con_Amount <- summarise(group_by(dat, ID), Amount = mean(Amount))
# ... joined back together on ID.
dat_con <- inner_join(dat_con_ID, dat_con_Amount, by = "ID")
glimpse(dat_con)
Observations: 2
Variables: 2
$ ID <dbl> 1, 2
$ Amount <dbl> 53.33333, 23.33333
The problem is that I can't calculate the mean of Product because it's a categorical variable. An option would be to make a dummy variable out of this factor and calculate the mean. But since the original data frame is really huge, this is not a good solution. Any idea how to handle this problem?
Maybe you are trying to do this:
I am using data.table library. I also modified your data by adding one extra row for ID = 1, so that you can see the difference in the output.
Data:
library('data.table')
# Example data: the question's six rows plus one extra purchase for ID 1.
# Numeric literals are already doubles, so no explicit conversion is needed.
dat <- data.table(
  ID = c(1, 1, 1, 2, 2, 2, 1),
  Amount = c(10, 70, 80, 50, 10, 10, 20),
  Product = factor(c("A", "B", "C", "B", "E", "A", "A"))
)
Code:
# mean Amount for every ID
dat[, .(avg_amt = mean(Amount)), by = "ID"]
# ID avg_amt
# 1: 1 45.00000
# 2: 2 23.33333
# share of each Product among the rows of an ID: count the rows per
# (ID, Product), then divide by the total count within the ID
dat[, .N, by = .(ID, Product)][, .(avg_pdt = N / sum(N), Product), by = "ID"]
# ID avg_pdt Product
# 1: 1 0.5000000 A
# 2: 1 0.2500000 B
# 3: 1 0.2500000 C
# 4: 2 0.3333333 B
# 5: 2 0.3333333 E
# 6: 2 0.3333333 A
# combining average amount and average product per id
# Within each ID: count the rows per Product, then return each Product with
# its share of that ID's rows and the mean Amount.
# NOTE(review): `Amount` is not a column of the per-Product count table, so
# mean(Amount) is presumably resolved from the enclosing per-ID group - confirm.
# The output below shows the same per-ID mean on every Product row.
dat[, .SD[, .N, by = Product ][, .( Product,
avg_pdt = N/sum(N),
avg_amt = mean(Amount))],
by = .(ID) ]
# ID Product avg_pdt avg_amt
# 1: 1 A 0.5000000 45.00000
# 2: 1 B 0.2500000 45.00000
# 3: 1 C 0.2500000 45.00000
# 4: 2 B 0.3333333 23.33333
# 5: 2 E 0.3333333 23.33333
# 6: 2 A 0.3333333 23.33333
edit
Another idea would be to count 'Product' as per 'ID', calculating the mean of 'Amount' and the relative frequencies for each product. spread the data by 'Product' to end up with the data in wide format. So, every ID / entity may occur only once.
# One row per ID: the mean Amount plus one column per Product holding that
# product's share of the ID's rows.
dat %>%
  # n = number of rows for this (Product, ID) combination
  add_count(Product, ID) %>%
  group_by(ID) %>%
  mutate(
    Amount = mean(Amount),
    n = n / n()  # turn the count into a share of the ID's rows
  ) %>%
  ungroup() %>%
  # Rows of the same (ID, Product) are now identical; keep one of each.
  distinct() %>%
  # pivot_wider() replaces the superseded spread(). The prefix reproduces the
  # Product_A, Product_B, ... names, and names_sort keeps them in factor-level
  # order.
  pivot_wider(
    names_from = Product,
    values_from = n,
    names_prefix = "Product_",
    names_sort = TRUE
  )
# A tibble: 2 x 6
# ID Amount Product_A Product_B Product_C Product_E
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1. 45.0 0.500 0.250 0.250 NA
#2 2. 23.3 0.333 0.333 NA 0.333
My first attempt, not what OP was looking for but in case someone is interested:
As suggested by @steveb in the comments, you could summarise Product as a string.
library(dplyr)
# One row per ID: the mean Amount and the sorted, de-duplicated products
# collapsed into a single comma-separated string.
dat %>%
  group_by(ID) %>%
  summarise(
    Amount = mean(Amount),
    Product = paste(sort(unique(Product)), collapse = ", ")
  )
# A tibble: 2 x 3
# ID Amount Product
# <dbl> <dbl> <chr>
#1 1. 45.0 A, B, C
#2 2. 23.3 A, B, E
data
# Example data as a tibble-classed data frame (seven rows).
# The `.internal.selfref = <pointer: ...>` entry from the original dput()
# output is a data.table artefact that R cannot parse, so it is left out.
dat <- structure(
  list(
    ID = c(1, 1, 1, 2, 2, 2, 1),
    Amount = c(10, 70, 80, 50, 10, 10, 20),
    Product = factor(
      c("A", "B", "C", "B", "E", "A", "A"),
      levels = c("A", "B", "C", "E")
    )
  ),
  row.names = c(NA, -7L),
  class = c("tbl_df", "tbl", "data.frame")
)