I'm taking the mean, 3 by 3, by grouping. For that, I'm using the summarise function. In this context I would like to select the last date from the four that make up the average.
I tried to select the maximum, but this way I'm just selecting the highest date for the whole group.
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
measure = c(10, 20, 5, 2, 62 ,2, 5, 4, 6, 7, 25),
time= c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
"11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
# > test
# my_groups measure time
# 1 A 10 20-09-2020
# 2 A 20 25-09-2020
# 3 A 5 19-09-2020
# 4 B 2 20-05-2020
# 5 B 62 20-06-2021
# 6 C 2 11-01-2021
# 7 C 5 13-01-2021
# 8 C 4 13-01-2021
# 9 A 6 15-01-2021
# 10 A 7 15-01-2021
# 11 A 25 19-01-2021
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
final_data = max(time))
# my_groups mean_3 final_data
# <chr> <dbl> <chr>
# 1 A 12.7 25-09-2020
# 2 A 11.7 25-09-2020
# 3 C 3.67 13-01-2021
In the second line I wish the date was 19-01-2021, and not the global maximum of group A, (25-09-2020).
Any hint on how I could do that?
I have 2 dplyr ways for you. Not happy with it because when the rollapply with max and dates doesn't find anything it in group B it uses a double by default which doesn't match the characters from group A and C.
Mutate:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
mutate(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
filter(!is.na(final)) %>%
select(my_groups, final, mean_3) %>%
arrange(my_groups)
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
Summarise that doesn't summarise, but is a bit cleaner in code:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
filter(!is.na(final))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
Edit:
Added isa's solution from comment. Partial = TRUE does the trick:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
final_data = rollapply(time, 3, max, by = 3, align = "left", partial = T))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups mean_3 final_data
<chr> <dbl> <chr>
1 A 12.7 19-01-2021
2 A 11.7 25-09-2020
3 C 3.67 13-01-2021
Another possible solution:
library(tidyverse)
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
measure = c(10, 20, 5, 2, 62 ,2, 5, 4, 6, 7, 25),
time= c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
"11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
test %>%
group_by(data.table::rleid(my_groups)) %>%
filter(n() == 3) %>%
summarise(
groups = unique(my_groups),
mean_3 = mean(measure), final_data = max(time), .groups = "drop") %>%
select(-1)
#> # A tibble: 3 × 3
#> groups mean_3 final_data
#> <chr> <dbl> <chr>
#> 1 A 11.7 25-09-2020
#> 2 C 3.67 13-01-2021
#> 3 A 12.7 19-01-2021
EDIT
To allow for calculation of mean of 2 values, as asked for in a comment below by the OP, I revised my code, using data.table::frollmean and data.table::frollapply:
library(tidyverse)
library(lubridate)
library(data.table)
n <- 2 # choose the number with which to calculate the mean
test %>%
group_by(rleid(my_groups)) %>%
summarise(
groups = unique(my_groups),
mean_n = frollmean(measure, n), final_data = frollapply(dmy(time), n, max) %>%
as_date(origin = lubridate::origin), .groups = "drop") %>%
drop_na(mean_n) %>% select(-1)
#> # A tibble: 7 × 3
#> groups mean_n final_data
#> <chr> <dbl> <date>
#> 1 A 15 2020-09-25
#> 2 A 12.5 2020-09-25
#> 3 B 32 2021-06-20
#> 4 C 3.5 2021-01-13
#> 5 C 4.5 2021-01-13
#> 6 A 6.5 2021-01-15
#> 7 A 16 2021-01-19
Related
I would like to reassign a given records to a single group if the records are duplicated. In the below dataset I would like to to have 12-4 all being assigned to group A or B but not both. Is there a way to go abou it?
library(tidyverse)
dat <- tibble(
group = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
assigned = c("12-1", "12-2", "12-3", "12-4", "12-4", "12-5", "12-6",
"12-7", "12-8")
)
# Attempts to tease out records for each group
dat %>% pivot_wider(names_from = group, values_from = assigned)
You can group by record and reassign all to the same group, chosen at random from the available groups:
dat %>%
group_by(assigned) %>%
mutate(group = nth(group, sample(n())[1])) %>%
ungroup()
#> # A tibble: 9 x 2
#> group assigned
#> <chr> <chr>
#> 1 A 12-1
#> 2 A 12-2
#> 3 A 12-3
#> 4 A 12-4
#> 5 A 12-4
#> 6 B 12-5
#> 7 B 12-6
#> 8 B 12-7
#> 9 B 12-8
library(tidyverse)
dat <- tibble(
group = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
assigned = c(
"12-1", "12-2", "12-3", "12-4", "12-4", "12-5", "12-6",
"12-7", "12-8"
)
)
dat %>%
select(-group) %>%
left_join(
dat %>%
left_join(dat %>% count(group)) %>%
# reassign to the smallest group
arrange(n) %>%
select(-n) %>%
distinct(assigned, .keep_all = TRUE)
)
#> Joining, by = "group"
#> Joining, by = "assigned"
#> # A tibble: 9 × 2
#> assigned group
#> <chr> <chr>
#> 1 12-1 A
#> 2 12-2 A
#> 3 12-3 A
#> 4 12-4 A
#> 5 12-4 A
#> 6 12-5 B
#> 7 12-6 B
#> 8 12-7 B
#> 9 12-8 B
Created on 2022-04-04 by the reprex package (v2.0.0)
I want to calculate the proportion of a variable in subgroups compared to the proportion of the whole dataset. The subgroups are based on binary columns. I want to filter the dataframe for each column, count the grouping variable and calculate the proportions. To compare the proportions, I calculate an index value which is 100*prop_subgroup/prop_overall.
I tried and failed to do this with map. Below is a for-loop and a lot of detours to achieve this, and I´m looking for some help to clean up this code and solve this "the tidyverse way". Thank you!
data <- data.frame(group = sample(c(LETTERS[1:6], NA), 1000, T),
v1 = sample(c(0, 1, NA), 1000, T),
v2 = sample(c(0, 1, 2, 3, 4, NA), 1000, T),
v3 = sample(c(0, 1, NA), 1000, T, prob = c(0.05, 0.05, 0.9)),
v4 = sample(c(0, 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)),
v5 = sample(c("a", 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)))
Calculate the prop.table
result <- data %>% count(group) %>% na.omit() %>% transmute(group = group, prop = n/sum(n))
Select binary columns
data_binary <- data %>% select(where(is.numeric)) %>%
select(where(function(x) {max(x, na.rm = T) == 1})) %>%
bind_cols(data %>% select(group), .)
Very ugly peace of code to calculate the frequencies for each group. Left join because some subgroups do not contain all grouping variables. The key peace I failed to do with map is the filtering based on one column and count of another column applied to all binary columns.
for(i in 2:ncol(data_binary)){
name <- names(data_binary)[i]
result <- left_join(result, data_binary %>% filter(.[[i]] == 1) %>% count(group) %>%
na.omit() %>% transmute(group = group, "{{name}}_index" := n/sum(n)))
}
Calculate index based on the frequencies
index <- bind_cols(result %>% select(group),
result %>% transmute_at(vars(-c("prop", "group")), function(x) {100 * x / result$prop}))
Result
group "v1"_index "v3"_index "v4"_index
1 A 79.90019 16.21418 60.54443
2 B 91.31450 97.28507 87.45307
3 C 114.26996 122.50712 95.30142
4 D 96.63614 175.24198 109.06017
5 E 100.08550 116.05938 126.39978
6 F 116.70123 62.55683 116.79493
I think you can accomplish this with a group_by, summarize to get counts and group_by, mutate to calculate fractions. However, I don't produce the same result so perhaps I don't understand exactly how you want to calculate the fractions (sum only the ones?)
data <- data.frame(group = sample(c(LETTERS[1:6], NA), 1000, T),
v1 = sample(c(0, 1, NA), 1000, T),
v2 = sample(c(0, 1, 2, 3, 4, NA), 1000, T),
v3 = sample(c(0, 1, NA), 1000, T, prob = c(0.05, 0.05, 0.9)),
v4 = sample(c(0, 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)),
v5 = sample(c("a", 1, NA), 1000, T, prob = c(0.8, 0.1, 0.1)))
library(tidyverse)
# counts and fractions for each combination of group and variable
data_long <- data %>%
as_tibble() %>%
# select only binary
select(group, where(~max(., na.rm = TRUE) == 1)) %>%
# pivot and calculate sums and fractions
pivot_longer(-group) %>%
drop_na(value) %>%
group_by(group, name) %>% summarize(count = sum(value), .groups = "drop") %>%
group_by(group) %>% mutate(fraction = count / sum(count))
print(data_long)
#> # A tibble: 21 x 4
#> # Groups: group [7]
#> group name count fraction
#> <chr> <chr> <dbl> <dbl>
#> 1 A v1 61 0.693
#> 2 A v3 7 0.0795
#> 3 A v4 20 0.227
#> 4 B v1 54 0.659
#> 5 B v3 10 0.122
#> 6 B v4 18 0.220
#> 7 C v1 45 0.75
#> 8 C v3 4 0.0667
#> 9 C v4 11 0.183
#> 10 D v1 48 0.716
#> # ... with 11 more rows
# pivot wider on fractions to get output in desired form
data_wide <- data_long %>%
pivot_wider(id_cols = group, values_from = fraction)
print(data_wide)
#> # A tibble: 7 x 4
#> # Groups: group [7]
#> group v1 v3 v4
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.693 0.0795 0.227
#> 2 B 0.659 0.122 0.220
#> 3 C 0.75 0.0667 0.183
#> 4 D 0.716 0.0896 0.194
#> 5 E 0.707 0.0690 0.224
#> 6 F 0.677 0.154 0.169
#> 7 <NA> 0.725 0.0980 0.176
Created on 2022-03-31 by the reprex package (v2.0.1)
How can I convert this matrix:
> matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
A
X 1
Y 2
Z 3
into this tibble:
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
Thank you
as.tibble can convert the matrix's rownames to a column, and then you can use gather() to create the group2 column:
library(tidyverse)
m <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
newtib <- m %>%
as.tibble(rownames = "group1") %>%
gather('A', key = "group2", value = "value")
> newtib
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <int>
1 X A 1
2 Y A 2
3 Z A 3
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
Easier with base R, if we convert to table and coerce with as.data.frame (if we need to convert to tibble - use as_tibble as wrapper over the as.data.frame
as.data.frame(as.table(m1))
Var1 Var2 Freq
1 X A 1
2 Y A 2
3 Z A 3
data
m1 <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
Transform your matrix into a dataframe
bring your rownames to column group1
mutate group2
data.frame(matrix) %>%
rownames_to_column("group1") %>%
mutate(group2 = colnames(matrix)) %>%
dplyr::select(group1, group2, value=A)
group1 group2 value
1 X A 1
2 Y A 2
3 Z A 3
You can use -
library(tidyverse)
mat <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
mat %>%
as.data.frame() %>%
rownames_to_column(var = 'group1') %>%
pivot_longer(cols = -group1, names_to = 'group2')
# group1 group2 value
# <chr> <chr> <dbl>
#1 X A 1
#2 Y A 2
#3 Z A 3
The code below creates a simplified version of the dataframe and illustrates my desired end result (df_wider) based on the unnested version. My question is: How can I achieve the same end result (df_wider) from the nested version (nested_df), using purrr?
library(tidyverse)
df <- tibble(id_01 = c(rep("01", 3), rep("02", 3)),
a = (c("a", "a", "b", "c", "c", "d")),
b = letters[7:12],
id_02 = rep(c(1, 2, 1), 2)
)
df_wider <- pivot_wider(df,
id_cols = c(id_01, a),
names_from = id_02,
values_from = b,
names_sep = "_"
)
nested_df <- nest(df, data = -id_01)
To be clear, I am trying to pivot while the dataframes are nested (i.e., before unnesting).
We can use purrr::map() within dplyr::mutate():
library(tidyverse)
df <- tibble(
id_01 = c(rep("01", 3), rep("02", 3)),
a = (c("a", "a", "b", "c", "c", "d")),
b = letters[7:12],
id_02 = rep(c(1, 2, 1), 2)
)
nested_df <- df %>%
nest(data = -id_01) %>%
mutate(data = map(data, ~ .x %>%
pivot_wider(
id_cols = a,
names_from = id_02,
values_from = b
)))
nested_df
#> # A tibble: 2 x 2
#> id_01 data
#> <chr> <list>
#> 1 01 <tibble [2 x 3]>
#> 2 02 <tibble [2 x 3]>
nested_df %>%
unnest(data)
#> # A tibble: 4 x 4
#> id_01 a `1` `2`
#> <chr> <chr> <chr> <chr>
#> 1 01 a g h
#> 2 01 b i <NA>
#> 3 02 c j k
#> 4 02 d l <NA>
Created on 2021-03-26 by the reprex package (v1.0.0)
My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = A:G) %>%
group_by(id) %>%
filter(name == ref|name == var) %>%
mutate(nm1 = c('ref_count', 'var_count')) %>%
ungroup %>%
select(id, value, nm1) %>%
pivot_wider(names_from = nm1, values_from = value) %>%
left_join(df1, .)
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
df1$refcount <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$ref, names(df1)[2:5]))]
df1$var_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$var, names(df1)[2:5]))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
df1 %>%
nest(data = -id) %>%
mutate(
data = map(
data,
~mutate(., refcount = .[[ref]], var_count = .[[var]])
)
) %>%
unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
nest(data = -c(ref, var)) %>%
mutate(
data = pmap(
list(data, ref, var),
function(df, ref, var) {
mutate(df, refcount = df[[ref]], var_count = df[[var]])
}
)
) %>%
unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))