I have a dataframe that is set up as follows:
set.seed(1234)
id <- rep(1:50, each = 3)
stimuli <- rep(c("a", "b", "c"), each = 1, times = 50)
dv_1 <- rnorm(150, mean = 2, sd = 0.7)
dv_2 <- rnorm(150, mean = 4, sd = 1.5)
dv_3 <- rnorm(150, mean = 7.5, sd = 1)
simdat <- data.frame(id, stimuli, dv_1, dv_2, dv_3)
simranks <- t(apply(simdat[, -(1:2)], 1, function(x) rank(x, ties.method = "min"))) # drop id and stimuli before ranking
colnames(simranks) <- paste(colnames(simranks), "_rank", sep="")
simdat <- data.frame(simdat, simranks)
I then split the dataframe according to the different types of stimuli, e.g.,
dat_a <- simdat %>%
dplyr::filter(stimuli == "a") %>%
select(id, dv_1_rank, dv_2_rank, dv_3_rank)
Then I would like to perform a bunch of different analyses on the subsetted data:
a_melt <- melt(dat_a, id.vars = c("id"), measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
a_perc <- a_melt %>%
group_by(variable, value) %>%
summarise(count = n()) %>%
mutate(perc = count/sum(count))
ggplot(a_perc, aes(x = variable, y = perc, fill = value)) +
geom_col(position = "stack") +
scale_y_continuous(labels=scales::percent)
How can I write the code so that, rather than copying and pasting the code chunks for stimuli b and c, it loops over all of them (the "stimuli" column in the original dataset)?
Here is how I usually deal with this:
I split my dataset into chunks gathered in a list, then
I use lapply or purrr::map to apply a function, which does the analysis for one chunk, to each chunk.
Here you could do something like:
analyses <- function(.df){
require(dplyr)
require(ggplot2)
df_melt <- data.table::melt(.df, id.vars = c("id"), measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
df_perc <- df_melt %>%
group_by(variable, value) %>%
summarise(count = n()) %>%
mutate(perc = count/sum(count))
ggplot(df_perc, aes(x = variable, y = perc, fill = value)) +
geom_col(position = "stack") +
scale_y_continuous(labels=scales::percent)
}
lapply(split(simdat, ~ stimuli), analyses)
Note: this is just a proof of concept.
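If you also want to collect or save the plots, here is a small extension of the same idea (a hedged sketch; purrr::imap, ggsave, and the "plots/" directory are my additions, not part of the original answer):
library(purrr)
library(ggplot2)
# split() with a formula needs R >= 4.1; each list element is one stimulus
plots <- lapply(split(simdat, ~ stimuli), analyses)
# Save one file per stimulus; "plots/" is a hypothetical output directory.
imap(plots, ~ ggsave(paste0("plots/ranks_", .y, ".png"),
                     plot = .x, width = 6, height = 4))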
Does this produce your desired output?
library(tidyverse)
simdat <- expand_grid(stimuli = c("a", "b", "c"), id = 1:20) %>%
mutate(
dv_1_rank = floor(runif(nrow(.), 1, 5)),
dv_2_rank = floor(runif(nrow(.), 1, 5)),
dv_3_rank = floor(runif(nrow(.), 1, 5)),
)
a_perc <- simdat %>%
pivot_longer(dv_1_rank:dv_3_rank) %>%
group_by(stimuli, name, value) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(stimuli, name) %>%
mutate(perc = count/sum(count)) %>%
ungroup()
print(a_perc)
#> # A tibble: 36 x 5
#> stimuli name value count perc
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 a dv_1_rank 1 4 0.2
#> 2 a dv_1_rank 2 8 0.4
#> 3 a dv_1_rank 3 5 0.25
#> 4 a dv_1_rank 4 3 0.15
#> 5 a dv_2_rank 1 5 0.25
#> 6 a dv_2_rank 2 6 0.3
#> 7 a dv_2_rank 3 4 0.2
#> 8 a dv_2_rank 4 5 0.25
#> 9 a dv_3_rank 1 3 0.15
#> 10 a dv_3_rank 2 5 0.25
#> # ... with 26 more rows
Created on 2022-03-14 by the reprex package (v2.0.1)
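From here a single faceted plot can replace the per-stimulus loop entirely. A sketch building on a_perc above (the facet_wrap() call and the factor(value) fill are my additions):
ggplot(a_perc, aes(x = name, y = perc, fill = factor(value))) +
  geom_col(position = "stack") +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~ stimuli)   # one panel per stimulus instead of one plot each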
I want to compute correlations for each unique pair of variables, grouped by another variable. My solution works for a very small dataset, but with more columns it gets very tedious.
set.seed(13)
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 1))
df %>%
group_by(group) %>%
summarise(var1_var2 = cor(var1, var2),
var1_var3 = cor(var1, var3),
var2_var3 = cor(var2, var3))
I also tried this one, but it doesn't work.
df %>%
group_by(group) %>%
summarise(cor = cor(df[,2:ncol(df)]))
Here is an option. Map out all the combos, then run a correlation for each group and each pair of variables, and pivot wider at the end:
library(tidyverse)
map_dfr(unique(df$group), \(x){
data.frame(t(combn(c("var1", "var2", "var3"), 2))) |>
mutate(group = x)
}) |>
mutate(cor = pmap_dbl(list(X1, X2, group),
~cor(df[df$group == ..3, ..1],
df[df$group == ..3, ..2]))) |>
unite(test, X1, X2) |>
pivot_wider(names_from = test, values_from = cor)
#> # A tibble: 3 x 4
#> group var1_var2 var1_var3 var2_var3
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.318 -0.476 -0.985
#> 2 B -0.373 -0.487 -0.628
#> 3 C 0.535 0.991 0.645
Another solution. This works for any number of variables.
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
set.seed(13)
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 2))
df %>%
select(-group) %>%
split(df$group) %>%
imap_dfr(
~ {
expand.grid(
first = names(.x),
second = names(.x),
stringsAsFactors = FALSE
) %>%
filter(first < second) %>%
rowwise() %>%
transmute(
group = .y,
pair = paste(first, second, sep = "_"),
cor = cor(.x[[first]], .x[[second]])
)
}
) %>%
pivot_wider(
names_from = "pair",
values_from = "cor"
)
# # A tibble: 3 × 4
# group var1_var2 var1_var3 var2_var3
# <chr> <dbl> <dbl> <dbl>
# 1 A 0.318 -0.476 -0.985
# 2 B -0.373 -0.487 -0.628
# 3 C 0.535 0.991 0.645
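For completeness, a base-cor() sketch under the same assumptions (the same df as above): compute the full correlation matrix per group and keep only the upper triangle, which also scales to any number of variables.
library(dplyr)
library(purrr)
library(tidyr)
df %>%
  split(.$group) %>%
  imap_dfr(function(d, g) {
    m <- cor(d[setdiff(names(d), "group")])   # full correlation matrix
    as.data.frame(as.table(m)) %>%            # long form: Var1, Var2, Freq
      filter(as.character(Var1) < as.character(Var2)) %>%  # upper triangle only
      transmute(group = g, pair = paste(Var1, Var2, sep = "_"), cor = Freq)
  }) %>%
  pivot_wider(names_from = pair, values_from = cor)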
I have the following data structure:
country age sex
x 10 m
y 20 f
x 12 m
y 40 m
I want to group my data by country and, within each group, compute the mean age and the percentage of each sex, resulting in a table like this:
country mean_age percent_m percent_f
x 11 100% 0%
y 30 50% 50%
Thank you in advance!
A possible solution:
library(tidyverse)
df %>%
mutate(age = as.character(age)) %>%
pivot_longer(-country) %>%
pivot_wider(country, values_fn = ~ {if (any(str_detect(.x, "\\d")))
mean(as.numeric(.x)) else proportions(table(.x))["m"]}) %>%
mutate(mean_age = age, percent_m = sex, percent_f = 1- percent_m, age = NULL,
sex = NULL)
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5
Another possible solution:
library(tidyverse)
inner_join(
df %>%
mutate(name = "mean_age") %>%
pivot_wider(country, values_from = age, values_fn = mean),
df %>%
mutate(name = "percent_m") %>%
pivot_wider(country, values_from = sex,
values_fn = ~ proportions(table(.x))["m"])) %>%
mutate(percent_f = 1 - percent_m)
#> Joining, by = "country"
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5
library(tidyverse)
df <- data.frame(country = c("x", "x", "x", "x", "x", "y", "y", "y"),
age = c(10, 20, 12, 40, 23, 17, 21, 19),
sex = c("m", "f", "f", "m", "f", "m", "f", "f"))
df1 <- df %>%
pivot_wider(names_from = sex, values_from = sex) %>%
group_by(country) %>%
summarise(age_mean = mean(age),
m = length(na.omit(m)),
f = length(na.omit(f))) %>%
mutate(m_perc = (m / (m + f)) * 100,
f_perc = (f / (m + f)) * 100)
> df1
# A tibble: 2 x 6
country age_mean m f m_perc f_perc
<chr> <dbl> <int> <int> <dbl> <dbl>
1 x 21 2 3 40 60
2 y 19 1 2 33.3 66.7
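Since sex has only two levels, there is also a much shorter route: the mean of a logical vector is the proportion of TRUEs. A minimal sketch, assuming the df from the answer above:
library(dplyr)
# assumes df with country, age, sex as in the answer above
df %>%
  group_by(country) %>%
  summarise(mean_age  = mean(age),
            percent_m = mean(sex == "m"),   # proportion of "m" rows
            percent_f = mean(sex == "f"))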
I want to calculate the proportion of a variable in subgroups compared to the proportion of the whole dataset. The subgroups are based on binary columns. I want to filter the dataframe for each column, count the grouping variable and calculate the proportions. To compare the proportions, I calculate an index value which is 100*prop_subgroup/prop_overall.
I tried and failed to do this with map. Below are a for-loop and a lot of detours that achieve it, and I'm looking for some help to clean up this code and solve this "the tidyverse way". Thank you!
data <- data.frame(group = sample(c(LETTERS[1:6], NA), 1000, T),
v1 = sample(c(0, 1, NA), 1000, T),
v2 = sample(c(0, 1, 2, 3, 4, NA), 1000, T),
v3 = sample(c(0, 1, NA), 1000, T, prob = c(0.05, 0.05, 0.9)),
v4 = sample(c(0, 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)),
v5 = sample(c("a", 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)))
Calculate the prop.table
result <- data %>% count(group) %>% na.omit() %>% transmute(group = group, prop = n/sum(n))
Select binary columns
data_binary <- data %>% select(where(is.numeric)) %>%
select(where(function(x) {max(x, na.rm = T) == 1})) %>%
bind_cols(data %>% select(group), .)
A very ugly piece of code to calculate the frequencies for each group. Left join because some subgroups do not contain all grouping variables. The key piece I failed to do with map is the filtering based on one column and the count of another column, applied to all binary columns.
for(i in 2:ncol(data_binary)){
name <- names(data_binary)[i]
result <- left_join(result, data_binary %>% filter(.[[i]] == 1) %>% count(group) %>%
na.omit() %>% transmute(group = group, "{{name}}_index" := n/sum(n)))
}
Calculate index based on the frequencies
index <- bind_cols(result %>% select(group),
result %>% transmute_at(vars(-c("prop", "group")), function(x) {100 * x / result$prop}))
Result
group "v1"_index "v3"_index "v4"_index
1 A 79.90019 16.21418 60.54443
2 B 91.31450 97.28507 87.45307
3 C 114.26996 122.50712 95.30142
4 D 96.63614 175.24198 109.06017
5 E 100.08550 116.05938 126.39978
6 F 116.70123 62.55683 116.79493
I think you can accomplish this with a group_by/summarize to get counts and a group_by/mutate to calculate fractions. However, I don't produce the same result, so perhaps I don't understand exactly how you want to calculate the fractions (sum only the 1s?).
data <- data.frame(group = sample(c(LETTERS[1:6], NA), 1000, T),
v1 = sample(c(0, 1, NA), 1000, T),
v2 = sample(c(0, 1, 2, 3, 4, NA), 1000, T),
v3 = sample(c(0, 1, NA), 1000, T, prob = c(0.05, 0.05, 0.9)),
v4 = sample(c(0, 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)),
v5 = sample(c("a", 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)))
library(tidyverse)
# counts and fractions for each combination of group and variable
data_long <- data %>%
as_tibble() %>%
# select only binary
select(group, where(~max(., na.rm = TRUE) == 1)) %>%
# pivot and calculate sums and fractions
pivot_longer(-group) %>%
drop_na(value) %>%
group_by(group, name) %>% summarize(count = sum(value), .groups = "drop") %>%
group_by(group) %>% mutate(fraction = count / sum(count))
print(data_long)
#> # A tibble: 21 x 4
#> # Groups: group [7]
#> group name count fraction
#> <chr> <chr> <dbl> <dbl>
#> 1 A v1 61 0.693
#> 2 A v3 7 0.0795
#> 3 A v4 20 0.227
#> 4 B v1 54 0.659
#> 5 B v3 10 0.122
#> 6 B v4 18 0.220
#> 7 C v1 45 0.75
#> 8 C v3 4 0.0667
#> 9 C v4 11 0.183
#> 10 D v1 48 0.716
#> # ... with 11 more rows
# pivot wider on fractions to get output in desired form
data_wide <- data_long %>%
pivot_wider(id_cols = group, values_from = fraction)
print(data_wide)
#> # A tibble: 7 x 4
#> # Groups: group [7]
#> group v1 v3 v4
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.693 0.0795 0.227
#> 2 B 0.659 0.122 0.220
#> 3 C 0.75 0.0667 0.183
#> 4 D 0.716 0.0896 0.194
#> 5 E 0.707 0.0690 0.224
#> 6 F 0.677 0.154 0.169
#> 7 <NA> 0.725 0.0980 0.176
Created on 2022-03-31 by the reprex package (v2.0.1)
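To go all the way to the OP's index (100 * a group's share within the "== 1" subset of each binary column, divided by that group's overall share), here is a map-based sketch assuming the same data as above; overall and binary_cols are my own names:
library(tidyverse)
# overall share of each group in the full dataset
overall <- data %>%
  count(group) %>%
  drop_na(group) %>%
  transmute(group, prop = n / sum(n))
# names of the binary (0/1) columns
binary_cols <- data %>%
  select(where(is.numeric)) %>%
  select(where(~ max(.x, na.rm = TRUE) == 1)) %>%
  names()
# for each binary column: share of each group among rows equal to 1,
# then divide by the overall share and scale by 100
map(binary_cols, function(col) {
  data %>%
    filter(.data[[col]] == 1) %>%
    count(group) %>%
    drop_na(group) %>%
    transmute(group, "{col}_index" := n / sum(n))
}) %>%
  reduce(left_join, by = "group") %>%
  left_join(overall, by = "group") %>%
  mutate(across(ends_with("_index"), ~ 100 * .x / prop)) %>%
  select(-prop)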
My data looks like this:
df <- structure(
list(
ID = c(1, 2, 3, 4, 5, 6),
Compagny = c("x", "x", "x", "y", "y", "y"),
Variable = c("size", "lenght", "diameter", "size", "lenght", "diameter"),
Score = c(12, 15, 8, 20, 4, 7)
),
row.names = c(NA, -6L),
class = "data.frame"
)
ID  Compagny  Variable  Score
1   x         size      12
2   x         lenght    15
3   x         diameter  8
4   y         size      20
5   y         lenght    4
6   y         diameter  7
I want to pivot wider so that the variables become columns:
ID  Compagny  size  lenght  diameter
1   x         12    15      8
2   y         20    4       7
I've followed this tutorial because I had the same problem:
pivot_wider issue "Values in `values_from` are not uniquely identified; output will contain list-cols"
I copy/pasted these lines of code from the answer there:
d %>%
group_by(name) %>%
mutate(row = row_number()) %>%
tidyr::pivot_wider(names_from = name, values_from = val) %>%
select(-row)
That became:
PivoTable <- LongTable %>%
group_by(score) %>%
mutate(row = row_number()) %>%
tidyr::pivot_wider(names_from = score, values_from = mean) %>%
select(-row)
I also have a unique identifier for each row. It still doesn't work: instead of a proper table, I get a matrix with NA values (cf. picture).
You can do this:
df %>%
pivot_wider(id_cols = -ID, names_from = Variable, values_from = Score) %>%
mutate(ID = row_number(), .before = Compagny)
# A tibble: 2 x 5
ID Compagny size lenght diameter
<int> <chr> <int> <int> <int>
1 1 x 12 15 8
2 2 y 20 4 7
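For context, the reason the earlier attempts produced list-cols or an NA-filled matrix is that a unique ID makes every row its own group, so nothing collapses. A sketch of the failure mode, assuming the same df:
df %>%
  pivot_wider(names_from = Variable, values_from = Score)
# ID stays in the id columns, so each of the six rows keeps only its own
# Score and every other cell becomes NA -- the "matrix with NA values".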
ESGTable <- select(EU_ESG_PILLARS,compagny, variable_name, mean_value)
ESGTable <- tibble::rowid_to_column(ESGTable, "ID")
# Petittable <- tibble::rowid_to_column(Petittable, "ID")
StackTry <- ESGTable %>%
pivot_wider(id_cols = -ID, names_from = variable_name, values_from = mean_value) %>%
mutate(ID = row_number(), .before = compagny)
## --> worked
What worked: adding an index column (1 --> n rows), deleting one redundant column, and using pivot_wider(id_cols = ...) so each row is identified uniquely.
Thank you very much for such quick help!
I have a dataframe I've created in the form
FREQ CNT
0 5
1 20
2 1000
3 3
4 3
I want to further group my results to be in the following form:
CUT CNT
0+1 25
2+3 1003
4+5 ...
.....
I've tried using the between and cut functions in dplyr, but they just add a new interval column to my dataframe. Can anyone give me a good indication as to where to go to achieve this?
Here is a way to do it in dplyr:
library(dplyr)
df <- df %>%
mutate(id = 1:n()) %>%
mutate(new_freq = ifelse(id %% 2 != 0, paste0(FREQ, "+", lead(FREQ, 1)), paste0(lag(FREQ, 1), "+", FREQ)))
df <- df %>%
group_by(new_freq) %>%
mutate(new_cnt = sum(CNT))
unique(df[, 4:5])
# A tibble: 2 x 2
# Groups: new_freq [2]
# new_freq new_cnt
# <chr> <int>
#1 0+1 25
#2 2+3 1003
data
df <- structure(list(FREQ = 0:3, CNT = c(5L, 20L, 1000L, 3L)), class = "data.frame", row.names = c(NA, -4L))
A non-elegant solution using dplyr; there is probably a better way to do this.
dat <- data.frame(FREQ = c(0,1,2,3,4), CNT = c(5,20,1000, 3, 3))
dat2 <- dat %>%
mutate(index = 0:(nrow(dat)-1)%/%2) %>%
group_by(index)
dat2 %>%
summarise(new_CNT = sum(CNT)) %>%
left_join(dat2 %>%
mutate(CUT = paste0(FREQ[1], "+", FREQ[2])) %>%
distinct(index, CUT),
by = "index") %>%
select(-index)
# A tibble: 3 x 2
new_CNT CUT
<dbl> <chr>
1 25 0+1
2 1003 2+3
3 3 4+NA
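A more compact sketch of the same pairing, assuming FREQ holds consecutive integers starting at 0 (same dat as above): integer division by 2 puts each adjacent pair into one bin.
library(dplyr)
dat %>%
  group_by(bin = FREQ %/% 2) %>%            # 0,1 -> 0; 2,3 -> 1; 4 -> 2
  summarise(CUT = paste(FREQ, collapse = "+"),
            CNT = sum(CNT)) %>%
  select(CUT, CNT)
#> # A tibble: 3 x 2
#>   CUT     CNT
#>   <chr> <dbl>
#> 1 0+1      25
#> 2 2+3    1003
#> 3 4         3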