I want to calculate the proportion of a variable in subgroups compared to the proportion of the whole dataset. The subgroups are based on binary columns. I want to filter the dataframe for each column, count the grouping variable and calculate the proportions. To compare the proportions, I calculate an index value which is 100*prop_subgroup/prop_overall.
I tried and failed to do this with map. Below is a for-loop and a lot of detours to achieve this, and I'm looking for some help to clean up this code and solve it "the tidyverse way". Thank you!
data <- data.frame(group = sample(c(LETTERS[1:6], NA), 1000, T),
                   v1 = sample(c(0, 1, NA), 1000, T),
                   v2 = sample(c(0, 1, 2, 3, 4, NA), 1000, T),
                   v3 = sample(c(0, 1, NA), 1000, T, prob = c(0.05, 0.05, 0.9)),
                   v4 = sample(c(0, 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)),
                   v5 = sample(c("a", 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)))
Calculate the prop.table
result <- data %>% count(group) %>% na.omit() %>% transmute(group = group, prop = n/sum(n))
Select binary columns
data_binary <- data %>%
  select(where(is.numeric)) %>%
  select(where(function(x) {max(x, na.rm = T) == 1})) %>%
  bind_cols(data %>% select(group), .)
Here is a very ugly piece of code to calculate the frequencies for each group. I use left_join because some subgroups do not contain all grouping variables. The key piece I failed to do with map is the filtering based on one column combined with a count of another column, applied to all binary columns.
for(i in 2:ncol(data_binary)){
  name <- names(data_binary)[i]
  result <- left_join(result, data_binary %>% filter(.[[i]] == 1) %>% count(group) %>%
    na.omit() %>% transmute(group = group, "{{name}}_index" := n/sum(n)))
}
Calculate index based on the frequencies
index <- bind_cols(result %>% select(group),
                   result %>% transmute_at(vars(-c("prop", "group")),
                                           function(x) {100 * x / result$prop}))
Result
group "v1"_index "v3"_index "v4"_index
1 A 79.90019 16.21418 60.54443
2 B 91.31450 97.28507 87.45307
3 C 114.26996 122.50712 95.30142
4 D 96.63614 175.24198 109.06017
5 E 100.08550 116.05938 126.39978
6 F 116.70123 62.55683 116.79493
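For reference, a purrr sketch of the same loop (hedged, reusing the data and result objects defined above; the glue spec "{v}_index" is an assumption about the intended column names and avoids the literal quotes that "{{name}}" produced):
library(tidyverse)
# keep only the numeric 0/1 columns
binary_vars <- data %>%
  select(where(~ is.numeric(.) && max(., na.rm = TRUE) == 1)) %>%
  names()
# for each binary column: filter on it, count group, take proportions,
# then join everything onto `result` and scale by the overall proportions
index <- binary_vars %>%
  map(\(v) data %>%
        filter(.data[[v]] == 1) %>%
        count(group) %>%
        drop_na() %>%
        transmute(group, "{v}_index" := n / sum(n))) %>%
  reduce(left_join, by = "group", .init = result) %>%
  mutate(across(ends_with("_index"), ~ 100 * .x / prop)) %>%
  select(-prop)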
I think you can accomplish this with a group_by + summarize to get counts, followed by a group_by + mutate to calculate fractions. However, I don't get the same result, so perhaps I don't understand exactly how you want to calculate the fractions (sum only the ones?).
data <- data.frame(group = sample(c(LETTERS[1:6], NA), 1000, T),
                   v1 = sample(c(0, 1, NA), 1000, T),
                   v2 = sample(c(0, 1, 2, 3, 4, NA), 1000, T),
                   v3 = sample(c(0, 1, NA), 1000, T, prob = c(0.05, 0.05, 0.9)),
                   v4 = sample(c(0, 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)),
                   v5 = sample(c("a", 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)))
library(tidyverse)
# counts and fractions for each combination of group and variable
data_long <- data %>%
  as_tibble() %>%
  # select only binary
  select(group, where(~max(., na.rm = TRUE) == 1)) %>%
  # pivot and calculate sums and fractions
  pivot_longer(-group) %>%
  drop_na(value) %>%
  group_by(group, name) %>%
  summarize(count = sum(value), .groups = "drop") %>%
  group_by(group) %>%
  mutate(fraction = count / sum(count))
print(data_long)
#> # A tibble: 21 x 4
#> # Groups: group [7]
#> group name count fraction
#> <chr> <chr> <dbl> <dbl>
#> 1 A v1 61 0.693
#> 2 A v3 7 0.0795
#> 3 A v4 20 0.227
#> 4 B v1 54 0.659
#> 5 B v3 10 0.122
#> 6 B v4 18 0.220
#> 7 C v1 45 0.75
#> 8 C v3 4 0.0667
#> 9 C v4 11 0.183
#> 10 D v1 48 0.716
#> # ... with 11 more rows
# pivot wider on fractions to get output in desired form
data_wide <- data_long %>%
pivot_wider(id_cols = group, values_from = fraction)
print(data_wide)
#> # A tibble: 7 x 4
#> # Groups: group [7]
#> group v1 v3 v4
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.693 0.0795 0.227
#> 2 B 0.659 0.122 0.220
#> 3 C 0.75 0.0667 0.183
#> 4 D 0.716 0.0896 0.194
#> 5 E 0.707 0.0690 0.224
#> 6 F 0.677 0.154 0.169
#> 7 <NA> 0.725 0.0980 0.176
Created on 2022-03-31 by the reprex package (v2.0.1)
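If the index from the question is wanted on top of these fractions, one possible final step (a sketch; it assumes prop_overall means the same fraction computed with all groups pooled, which may not be exactly what the OP intended):
overall <- data_long %>%
  group_by(name) %>%
  summarize(prop_overall = sum(count), .groups = "drop") %>%
  mutate(prop_overall = prop_overall / sum(prop_overall))
index_wide <- data_long %>%
  left_join(overall, by = "name") %>%
  mutate(index = 100 * fraction / prop_overall) %>%
  pivot_wider(id_cols = group, names_from = name, values_from = index)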
Related
I want to compute correlations for each unique combination of variables, grouped by another variable. My solution works for a very small dataset, but imagine more columns: it gets very tedious.
set.seed(13)
df <- data.frame(group = rep(LETTERS[1:3], 3),
                 var1 = rnorm(9, 1),
                 var2 = rnorm(9, 2),
                 var3 = rnorm(9, 1))
df %>%
  group_by(group) %>%
  summarise(var1_var2 = cor(var1, var2),
            var1_var3 = cor(var1, var3),
            var2_var3 = cor(var2, var3))
I also tried this one, but it doesn't work.
df %>%
  group_by(group) %>%
  summarise(cor = cor(df[, 2:ncol(df)]))
Here is an option: map out all the combinations, then run cor() for each group and each pair of variables, and pivot wider at the end:
library(tidyverse)
map_dfr(unique(df$group), \(x){
  data.frame(t(combn(c("var1", "var2", "var3"), 2))) |>
    mutate(group = x)
}) |>
  mutate(cor = pmap_dbl(list(X1, X2, group),
                        ~ cor(df[df$group == ..3, ..1],
                              df[df$group == ..3, ..2]))) |>
  unite(test, X1, X2) |>
  pivot_wider(names_from = test, values_from = cor)
#> # A tibble: 3 x 4
#> group var1_var2 var1_var3 var2_var3
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.318 -0.476 -0.985
#> 2 B -0.373 -0.487 -0.628
#> 3 C 0.535 0.991 0.645
Another solution. This works for any number of variables.
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
set.seed(13)
df <- data.frame(group = rep(LETTERS[1:3], 3),
                 var1 = rnorm(9, 1),
                 var2 = rnorm(9, 2),
                 var3 = rnorm(9, 2))
df %>%
  select(-group) %>%
  split(df$group) %>%
  imap_dfr(
    ~ {
      expand.grid(
        first = names(.x),
        second = names(.x),
        stringsAsFactors = FALSE
      ) %>%
        filter(first < second) %>%
        rowwise() %>%
        transmute(
          group = .y,
          pair = paste(first, second, sep = "_"),
          cor = cor(.x[[first]], .x[[second]])
        )
    }
  ) %>%
  pivot_wider(
    names_from = "pair",
    values_from = "cor"
  )
# # A tibble: 3 × 4
# group var1_var2 var1_var3 var2_var3
# <chr> <dbl> <dbl> <dbl>
# 1 A 0.318 -0.476 -0.985
# 2 B -0.373 -0.487 -0.628
# 3 C 0.535 0.991 0.645
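A further variant (a sketch, assuming dplyr >= 1.0.0 for group_modify()): compute the full correlation matrix per group and reshape it, which also scales to any number of variables:
library(dplyr)
library(tidyr)
df %>%
  group_by(group) %>%
  group_modify(~ {
    cm <- cor(.x)                    # correlation matrix of the non-grouping columns
    as.data.frame(as.table(cm)) %>%  # long form: Var1, Var2, Freq
      filter(as.character(Var1) < as.character(Var2)) %>%  # keep each pair once
      unite(pair, Var1, Var2) %>%
      pivot_wider(names_from = pair, values_from = Freq)
  }) %>%
  ungroup()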
I have a dataframe that is set up as follows:
set.seed(1234)
id <- rep(1:50, each = 3)
stimuli <- rep(c("a", "b", "c"), each = 1, times = 50)
dv_1 <- rnorm(150, mean = 2, sd = 0.7)
dv_2 <- rnorm(150, mean = 4, sd = 1.5)
dv_3 <- rnorm(150, mean = 7.5, sd = 1)
simdat <- data.frame(id, stimuli, dv_1, dv_2, dv_3)
simranks <- t(apply(simdat[,-1], 1, function(x) rank(x, ties.method = "min") ))
colnames(simranks) <- paste(colnames(simranks), "_rank", sep="")
simdat <- data.frame(simdat, simranks)
I have then split the dataframe according to the different types of stimuli, e.g.:
dat_a <- simdat %>%
  dplyr::filter(stimuli == "a") %>%
  select(id, dv_1_rank, dv_2_rank, dv_3_rank)
Then I would like to perform a bunch of different analyses on the subsetted data:
a_melt <- reshape2::melt(dat_a, id.vars = c("id"),
                         measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
a_perc <- a_melt %>%
  group_by(variable, value) %>%
  summarise(count = n()) %>%
  mutate(perc = count/sum(count))
ggplot(a_perc, aes(x = variable, y = perc, fill = value)) +
  geom_col(position = "stack") +
  scale_y_continuous(labels = scales::percent)
How can I write the code so that, rather than copying and pasting the code chunks for stimuli b and stimuli c, it loops over all of them (the "stimuli" column in the original dataset)?
Here is how I usually deal with this: I split my dataset into chunks gathered in a list, then I use lapply or the purrr::map function to apply a function that does the analysis for one chunk to each chunk.
Here you could do something like:
analyses <- function(.df){
  require(dplyr)
  require(ggplot2)
  df_melt <- data.table::melt(.df, id.vars = c("id"),
                              measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
  df_perc <- df_melt %>%
    group_by(variable, value) %>%
    summarise(count = n()) %>%
    mutate(perc = count/sum(count))
  ggplot(df_perc, aes(x = variable, y = perc, fill = value)) +
    geom_col(position = "stack") +
    scale_y_continuous(labels = scales::percent)
}
lapply(split(simdat, ~ stimuli), analyses)
Note: this is just a proof of concept; the formula interface to split() used above requires R >= 4.1.0.
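For example, to pull a single plot out of the returned list (a small usage sketch):
plots <- lapply(split(simdat, ~ stimuli), analyses)
plots[["b"]]  # the plot for stimuli "b"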
Does this produce your desired output?
library(tidyverse)
simdat <- expand_grid(stimuli = c("a", "b", "c"), id = 1:20) %>%
  mutate(
    dv_1_rank = floor(runif(nrow(.), 1, 5)),
    dv_2_rank = floor(runif(nrow(.), 1, 5)),
    dv_3_rank = floor(runif(nrow(.), 1, 5))
  )
a_perc <- simdat %>%
  pivot_longer(dv_1_rank:dv_3_rank) %>%
  group_by(stimuli, name, value) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(stimuli, name) %>%
  mutate(perc = count/sum(count)) %>%
  ungroup()
print(a_perc)
#> # A tibble: 36 x 5
#> stimuli name value count perc
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 a dv_1_rank 1 4 0.2
#> 2 a dv_1_rank 2 8 0.4
#> 3 a dv_1_rank 3 5 0.25
#> 4 a dv_1_rank 4 3 0.15
#> 5 a dv_2_rank 1 5 0.25
#> 6 a dv_2_rank 2 6 0.3
#> 7 a dv_2_rank 3 4 0.2
#> 8 a dv_2_rank 4 5 0.25
#> 9 a dv_3_rank 1 3 0.15
#> 10 a dv_3_rank 2 5 0.25
#> # ... with 26 more rows
Created on 2022-03-14 by the reprex package (v2.0.1)
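To get all three stimuli into one figure instead of looping, a sketch building on a_perc above (factor(value) is used so the fill scale stays discrete):
ggplot(a_perc, aes(x = name, y = perc, fill = factor(value))) +
  geom_col(position = "stack") +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~ stimuli)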
I'm taking the mean of the measurements 3 by 3 within each group. For that, I'm using the summarise function. In this context, I would like to select the last date of the three that make up each average.
I tried to select the maximum, but that way I'm just selecting the latest date for the whole group.
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
                  measure = c(10, 20, 5, 2, 62, 2, 5, 4, 6, 7, 25),
                  time = c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
                           "11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
# > test
# my_groups measure time
# 1 A 10 20-09-2020
# 2 A 20 25-09-2020
# 3 A 5 19-09-2020
# 4 B 2 20-05-2020
# 5 B 62 20-06-2021
# 6 C 2 11-01-2021
# 7 C 5 13-01-2021
# 8 C 4 13-01-2021
# 9 A 6 15-01-2021
# 10 A 7 15-01-2021
# 11 A 25 19-01-2021
library(dplyr)
library(zoo)

test %>%
  arrange(time) %>%
  group_by(my_groups) %>%
  summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
            final_data = max(time))
# my_groups mean_3 final_data
# <chr> <dbl> <chr>
# 1 A 12.7 25-09-2020
# 2 A 11.7 25-09-2020
# 3 C 3.67 13-01-2021
In the second line I wish the date were 19-01-2021, and not the global maximum of group A (25-09-2020).
Any hint on how I could do that?
I have 2 dplyr ways for you. I'm not happy with them, because when rollapply with max over the dates doesn't find anything in group B, it fills with a double by default, which doesn't match the character dates from groups A and C.
Mutate:
test %>%
  arrange(time) %>%
  group_by(my_groups) %>%
  mutate(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
         mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
  filter(!is.na(final)) %>%
  select(my_groups, final, mean_3) %>%
  arrange(my_groups)
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
A summarise that doesn't actually summarise, but is a bit cleaner in code:
test %>%
  arrange(time) %>%
  group_by(my_groups) %>%
  summarise(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
            mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
  filter(!is.na(final))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
Edit:
Added isa's solution from the comments. partial = TRUE does the trick:
test %>%
  arrange(time) %>%
  group_by(my_groups) %>%
  summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
            final_data = rollapply(time, 3, max, by = 3, align = "left", partial = T))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups mean_3 final_data
<chr> <dbl> <chr>
1 A 12.7 19-01-2021
2 A 11.7 25-09-2020
3 C 3.67 13-01-2021
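A side note: time is a character column, so arrange(time) and max(time) compare strings, not dates. A sketch without rollapply, assuming the blocks of 3 should be formed in true chronological order (parsing the dates with lubridate::dmy() first):
library(dplyr)
library(lubridate)
test %>%
  mutate(time = dmy(time)) %>%                  # parse "dd-mm-yyyy" strings to Dates
  arrange(my_groups, time) %>%
  group_by(my_groups) %>%
  mutate(block = (row_number() - 1) %/% 3) %>%  # consecutive blocks of 3 per group
  group_by(my_groups, block) %>%
  filter(n() == 3) %>%                          # drop incomplete blocks (like partial = F)
  summarise(mean_3 = mean(measure), final_data = max(time), .groups = "drop") %>%
  select(-block)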
Another possible solution:
library(tidyverse)
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
                  measure = c(10, 20, 5, 2, 62, 2, 5, 4, 6, 7, 25),
                  time = c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
                           "11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
test %>%
  group_by(data.table::rleid(my_groups)) %>%
  filter(n() == 3) %>%
  summarise(
    groups = unique(my_groups),
    mean_3 = mean(measure), final_data = max(time), .groups = "drop") %>%
  select(-1)
#> # A tibble: 3 × 3
#> groups mean_3 final_data
#> <chr> <dbl> <chr>
#> 1 A 11.7 25-09-2020
#> 2 C 3.67 13-01-2021
#> 3 A 12.7 19-01-2021
EDIT
To allow for calculating the mean of 2 values, as asked for in a comment below by the OP, I revised my code, using data.table::frollmean and data.table::frollapply:
library(tidyverse)
library(lubridate)
library(data.table)
n <- 2 # choose the number with which to calculate the mean
test %>%
  group_by(rleid(my_groups)) %>%
  summarise(
    groups = unique(my_groups),
    mean_n = frollmean(measure, n),
    final_data = frollapply(dmy(time), n, max) %>% as_date(origin = lubridate::origin),
    .groups = "drop") %>%
  drop_na(mean_n) %>%
  select(-1)
#> # A tibble: 7 × 3
#> groups mean_n final_data
#> <chr> <dbl> <date>
#> 1 A 15 2020-09-25
#> 2 A 12.5 2020-09-25
#> 3 B 32 2021-06-20
#> 4 C 3.5 2021-01-13
#> 5 C 4.5 2021-01-13
#> 6 A 6.5 2021-01-15
#> 7 A 16 2021-01-19
I'm a beginner in R and I am stuck with the following.
df <- tibble(
  id = c(01, 02),
  a = c(0.44, 0.42),
  b = c(1, 0.42),
  c = c(NaN, 0.71),
  d = c(0.75, 0),
  e = c(0.66, 0.75),
  f = c(0.5, 0.22),
  g = c(1, NaN),
  h = c(0.8, NaN)
)
I wonder how I can mutate columns that count, for each row, the number of cells > 0, separately for the columns a:d and e:h.
I have been thinking of something like this:
df1 <- df %>%
rowwise() %>%
mutate(casesatod = length(which(., > 0), na.rm = TRUE),
casesetoh = length(which(., > 0), na.rm = TRUE))
Of course, this code is not complete, but it should give you an idea of what I was thinking of.
I'm really looking forward to receiving your help!
Thanks in advance!
An option would be rowSums after selecting a subset of columns from the dataset. It would be more efficient than rowwise as it is vectorized.
library(dplyr)
df %>%
  mutate(casesatod = rowSums(.[2:5] > 0, na.rm = TRUE),
         casesetoh = rowSums(.[6:9] > 0, na.rm = TRUE))
If we need to use column names for selecting, use select
df %>%
  mutate(casesatod = rowSums(select(., a:d) > 0, na.rm = TRUE),
         casesetoh = rowSums(select(., e:h) > 0, na.rm = TRUE))
# A tibble: 2 x 11
# id a b c d e f g h casesatod casesetoh
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 0.44 1 NaN 0.75 0.66 0.5 1 0.8 3 4
#2 2 0.42 0.42 0.71 0 0.75 0.22 NaN NaN 3 2
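A rowwise alternative with c_across() (a sketch, assuming dplyr >= 1.0.0); closer in spirit to the original attempt, though slower than the vectorized rowSums above:
df %>%
  rowwise() %>%
  mutate(casesatod = sum(c_across(a:d) > 0, na.rm = TRUE),
         casesetoh = sum(c_across(e:h) > 0, na.rm = TRUE)) %>%
  ungroup()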
I have a data frame with duplicated IDs. An ID stands for a specific entity. The IDs are duplicated because the dataset refers to a process that every entity can go through multiple times.
Here is a small example dat:
library(dplyr)
glimpse(dat)
Observations: 6
Variables: 3
$ ID <dbl> 1, 1, 1, 2, 2, 2
$ Amount <dbl> 10, 70, 80, 50, 10, 10
$ Product <fct> A, B, C, B, E, A
ID stands for the entity, Amount stands for the amount of money the entity has spend and Product stands for the good the entity bought.
The issue is that I have to "condense" this data, so that every ID / entity occurs only once. For the continuous variable this is no problem, because I can simply calculate the mean per ID.
library(tidyr)
dat_con_ID <- dat %>%
  select(ID) %>%
  unique()
dat_con_Amount <- dat %>%
  group_by(ID) %>%
  summarise(Amount = mean(Amount))
dat_con <- inner_join(dat_con_ID, dat_con_Amount, by = "ID")
glimpse(dat_con)
Observations: 2
Variables: 2
$ ID <dbl> 1, 2
$ Amount <dbl> 53.33333, 23.33333
The problem is that I can't calculate the mean of Product because it's a categorical variable. An option would be to make a dummy variable out of this factor and calculate the mean, but since the original data frame is really huge this is not a good solution. Any idea how to handle this problem?
Maybe you are trying to do this:
I am using the data.table library. I also modified your data by adding one extra row for ID = 1, so that you can see the difference in the output.
Data:
library('data.table')
dat <- data.table(ID = as.double(c(1, 1, 1, 2, 2, 2, 1)),
                  Amount = as.double(c(10, 70, 80, 50, 10, 10, 20)),
                  Product = factor(c('A', 'B', 'C', 'B', 'E', 'A', 'A')))
Code:
# average amount per id
dat[, .(avg_amt = mean(Amount)), by = .(ID) ]
# ID avg_amt
# 1: 1 45.00000
# 2: 2 23.33333
# average product per id
dat[, .SD[, .N, by = Product ][, .( avg_pdt = N/sum(N), Product)], by = .(ID) ]
# ID avg_pdt Product
# 1: 1 0.5000000 A
# 2: 1 0.2500000 B
# 3: 1 0.2500000 C
# 4: 2 0.3333333 B
# 5: 2 0.3333333 E
# 6: 2 0.3333333 A
# combining average amount and average product per id
dat[, .SD[, .N, by = Product][, .(Product,
                                  avg_pdt = N/sum(N),
                                  avg_amt = mean(Amount))],
    by = .(ID)]
# ID Product avg_pdt avg_amt
# 1: 1 A 0.5000000 45.00000
# 2: 1 B 0.2500000 45.00000
# 3: 1 C 0.2500000 45.00000
# 4: 2 B 0.3333333 23.33333
# 5: 2 E 0.3333333 23.33333
# 6: 2 A 0.3333333 23.33333
edit
Another idea would be to count 'Product' per 'ID', calculating the mean of 'Amount' and the relative frequencies for each product, then spread the data by 'Product' to end up with the data in wide format. This way, every ID / entity occurs only once.
dat %>%
  add_count(Product, ID) %>%
  group_by(ID) %>%
  mutate(Amount = mean(Amount),
         n = n / n()) %>%
  unique() %>%
  spread(Product, n, sep = "_") %>%
  ungroup()
# A tibble: 2 x 6
# ID Amount Product_A Product_B Product_C Product_E
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1. 45.0 0.500 0.250 0.250 NA
#2 2. 23.3 0.333 0.333 NA 0.333
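The same idea with pivot_wider(), which has superseded spread() since tidyr 1.0.0 (a sketch under that assumption):
dat %>%
  group_by(ID) %>%
  mutate(Amount = mean(Amount)) %>%
  count(Amount, Product) %>%        # count() keeps the grouping by ID
  mutate(n = n / sum(n)) %>%
  pivot_wider(names_from = Product, values_from = n, names_prefix = "Product_") %>%
  ungroup()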
My first attempt, not what the OP was looking for, but in case someone is interested:
As suggested by @steveb in the comments, you could summarise Product as a string.
library(dplyr)
dat %>%
  group_by(ID) %>%
  summarise(Amount = mean(Amount),
            Product = toString(sort(unique(Product))))
# A tibble: 2 x 3
# ID Amount Product
# <dbl> <dbl> <chr>
#1 1. 45.0 A, B, C
#2 2. 23.3 A, B, E
data
dat <- structure(list(ID = c(1, 1, 1, 2, 2, 2, 1),
                      Amount = c(10, 70, 80, 50, 10, 10, 20),
                      Product = structure(c(1L, 2L, 3L, 2L, 4L, 1L, 1L),
                                          .Label = c("A", "B", "C", "E"), class = "factor")),
                 .Names = c("ID", "Amount", "Product"), row.names = c(NA, -7L),
                 class = c("tbl_df", "tbl", "data.frame"))