Group several demographic variables with dplyr - r

I have the following data structure:
country age sex
x 10 m
y 20 f
x 12 m
y 40 m
I want to group my data according to the country and get a calculation of the percentage of my sex variable according to it, resulting in some table like this:
country mean_age percent_m percent_f
x 11 100% 0%
y 30 50% 50%
Thank you in advance!

A possible solution:
library(tidyverse)
# Stack age and sex into one long column, then summarise each
# (country, variable) cell while widening again: ages are averaged and sex is
# reduced to the share of "m".
df %>%
  mutate(age = as.character(age)) %>% # age and sex must share a type to be stacked
  pivot_longer(-country) %>%
  pivot_wider(
    # id_cols is named because passing it by position is deprecated in tidyr 1.3.0
    id_cols = country,
    values_fn = ~ {
      if (any(str_detect(.x, "\\d"))) {
        mean(as.numeric(.x)) # values containing digits are the ages
      } else {
        # The mean of the logical is 0 for a country without any "m", where
        # proportions(table(.x))["m"] would give NA; na.rm mirrors table()
        # ignoring missing values.
        mean(.x == "m", na.rm = TRUE)
      }
    }
  ) %>%
  rename(mean_age = age, percent_m = sex) %>%
  mutate(percent_f = 1 - percent_m)
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5
Another possible solution:
library(tidyverse)
# Build the two summaries separately (one row per country each) and join them.
# The constant `name` column supplies the output column name to pivot_wider().
inner_join(
  df %>%
    mutate(name = "mean_age") %>%
    # id_cols is named because passing it by position is deprecated in tidyr 1.3.0
    pivot_wider(id_cols = country, values_from = age, values_fn = mean),
  df %>%
    mutate(name = "percent_m") %>%
    pivot_wider(
      id_cols = country,
      values_from = sex,
      # The mean of the logical is 0 for a country without any "m", where
      # proportions(table(.x))["m"] would give NA; na.rm mirrors table()
      # ignoring missing values.
      values_fn = ~ mean(.x == "m", na.rm = TRUE)
    )
) %>%
  mutate(percent_f = 1 - percent_m)
#> Joining, by = "country"
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5

library(tidyverse)
df <- data.frame(country = c("x", "x", "x", "x", "x", "y", "y", "y"),
age = c(10, 20, 12, 40, 23, 17, 21, 19),
sex = c("m", "f", "f", "m", "f", "m", "f", "f"))
# Per-country mean age plus the count and percentage of each sex.
# The counts are taken straight from `sex`, so every input row stays one
# person. Pivoting `sex` wider first would merge people who share a country and
# an age into a single row, which distorts both the mean age and the counts.
df1 <- df %>%
  group_by(country) %>%
  summarise(age_mean = mean(age),
            m = sum(sex == "m"),
            f = sum(sex == "f")) %>%
  mutate(m_perc = (m / (m + f)) * 100,
         f_perc = (f / (m + f)) * 100)
> df1
# A tibble: 2 x 6
country age_mean m f m_perc f_perc
<chr> <dbl> <int> <int> <dbl> <dbl>
1 x 21 2 3 40 60
2 y 19 1 2 33.3 66.7

Related

How to duplicate rows with a certain condition and create a new variable at the same time

I have a df like the one below and I would like to transform it into something like the table on the right. How can I duplicate the rows with Type == "N" and add a new variable Grade?
Basically, if Type==N, then Grade can be S or W, that is why we need to duplicate the rows.
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
Using some functions from tidyverse, you can use crossing to duplicate rows and add the "Grade" column at the same time, then filter to match your stated rules.
library(tidyverse)
# Pair every row of df with both candidate grades, then keep a pairing when the
# grade equals the row's type or the row is of type "N" (either grade allowed).
result <- filter(
  crossing(df, data.frame(Grade = c('S', 'W'))),
  Grade == Type | Type == 'N'
)
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
I think this approach is extensible to many more conditions assuming yours is the minimal example and you have a larger more complicated dataset.
library(dplyr)
df <- structure(
  list(
    Type = c("N", "N", "S", "W"),
    Result = c(8, 9, 7, 6)
  ),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Lookup table: each "N" row matches both of its admissible grades.
df2 <- data.frame(Type2 = c("N", "N"), Grade = c("S", "W"))
# The join duplicates the "N" rows (one per grade) and leaves Grade missing for
# the other types; case_when() then fills those in from the type itself.
mutate(
  left_join(select(df, Type, Result), df2, by = c("Type" = "Type2")),
  Grade = case_when(
    Type == "S" ~ "S",
    Type == "W" ~ "W",
    TRUE ~ Grade
  )
)
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Another option is to use if_else() (or case_when() if there are more complex conditions) to return a list column of multiple values and unnest:
library(dplyr)
library(tidyr)
df %>%
mutate(Grade = if_else(Type == "N", list(c("S", "W")), as.list(Type))) %>%
unnest(Grade)
# A tibble: 6 x 3
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Or:
df %>%
mutate(Grade = case_when(Type == "N" ~ list(c("S", "W")),
TRUE ~ as.list(Type))) %>%
unnest(Grade)
A dplyr way:
We could use bind_rows after using slice.
library(dplyr)
df %>%
  # Pick the rows to duplicate by condition rather than by hard-coded position.
  slice(which(Type == "N")) %>%
  bind_rows(df) %>%
  group_by(Type) %>%
  arrange(Result, .by_group = TRUE) %>%
  # Inside the "N" group, sorting puts each row next to its copy, so the pair
  # is labelled S then W; every other type keeps its own letter as the grade.
  # NOTE(review): the pairing relies on Result telling the "N" rows apart.
  mutate(Grade = if_else(Type == "N",
                         rep(c("S", "W"), length.out = n()),
                         Type),
         .before = 2) %>%
  ungroup()
Type Grade Result
<chr> <chr> <dbl>
1 N S 8
2 N W 8
3 N S 9
4 N W 9
5 S S 7
6 W W 6
Here is a possible data.table option:
library(data.table)
dt <- as.data.table(df)
# For each Result, pair the row's Type (V1) with both grades (V2) and keep the
# pairs where the type is "N" or the grade equals the type.
output <- dt[, CJ(.SD$Type, c('S', 'W')), .(Result)][V1 == 'N' | V1 == V2]
# Rename by old name: the columns arrive as Result, V1, V2, so assigning
# c(names(dt), "Grade") by position would swap the Type and Result labels.
setnames(output, c("V1", "V2"), c("Type", "Grade"))
setcolorder(output, c("Type", "Grade", "Result"))
Output
Type Grade Result
1: N S 8
2: N W 8
3: N S 9
4: N W 9
5: S S 7
6: W W 6

Performing pivot_longer() over multiple sets of columns

I am stuck in performing pivot_longer() over multiple sets of columns. Here is the sample dataset
df <- data.frame(
id = c(1, 2),
uid = c("m1", "m2"),
germ_kg = c(23, 24),
mineral_kg = c(12, 17),
perc_germ = c(45, 34),
perc_mineral = c(78, 10))
I need the output dataframe to look like this
# Desired result: one row per id and crop, with that crop's kg and perc values.
# Assigned to `out` only, so the sample `df` defined above is not overwritten.
out <- data.frame(
  id = c(1, 1, 2, 2),
  uid = c("m1", "m1", "m2", "m2"),
  crop = c("germ", "mineral", "germ", "mineral"),
  kg = c(23, 12, 24, 17),
  perc = c(45, 78, 34, 10))
# Move the measure to the front of the *_kg names (germ_kg -> kg_germ) so every
# measured column reads <measure>_<crop>. Splitting the names on "_" then sends
# the first piece to the output column name (.value) and the second to `crop`.
df %>%
  rename_with(function(nm) str_replace(nm, '(.*)_kg', 'kg_\\1')) %>%
  pivot_longer(
    cols = -c(id, uid),
    names_sep = '_',
    names_to = c('.value', 'crop')
  )
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
If you were to use data.table:
library(data.table)
# Crop names in column order, taken from the *_kg columns.
crops <- sub('_kg$', '', grep('_kg$', names(df), value = TRUE))
# as.data.table() works on a copy, so the sample df is not converted in place.
long <- melt(as.data.table(df), id.vars = c('id', 'uid'),
             measure.vars = patterns(kg = 'kg', perc = 'perc'),
             variable.name = 'crop')
# melt() only numbers the matched columns (1, 2, ...); map that index back to
# the crop names. NOTE(review): assumes the kg and perc columns list the crops
# in the same order, as they do in the sample data.
long[, crop := crops[crop]][]
id uid crop kg perc
1: 1 m1 germ 23 45
2: 2 m2 germ 24 34
3: 1 m1 mineral 12 78
4: 2 m2 mineral 17 10
I suspect there might be a simpler way using pivot_longer_spec(), but one tricky thing here is that your column names don't have a consistent ordering of their semantic components. @Onyambu's answer deals with this nicely by fixing it upstream.
library(tidyverse)
df %>%
pivot_longer(-c(id, uid)) %>%
separate(name, c("col1", "col2")) %>% # only needed
mutate(crop = if_else(col2 == "kg", col1, col2), # because name
meas = if_else(col2 == "kg", col2, col1)) %>% # structure
select(id, uid, crop, meas, value) %>% # is
pivot_wider(names_from = meas, values_from = value) # inconsistent
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10

Converting a matrix into a tibble in R

How can I convert this matrix:
> matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
A
X 1
Y 2
Z 3
into this tibble:
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
Thank you
as.tibble can convert the matrix's rownames to a column, and then you can use gather() to create the group2 column:
library(tidyverse)
m <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
# as_tibble() is the current name of the deprecated as.tibble(), and
# pivot_longer() replaces the superseded gather(); the result is the same.
newtib <- m %>%
  as_tibble(rownames = "group1") %>%
  pivot_longer(cols = -group1, names_to = "group2", values_to = "value")
> newtib
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <int>
1 X A 1
2 Y A 2
3 Z A 3
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
This is easier with base R: convert the matrix to a table and coerce it with as.data.frame() (if a tibble is needed, wrap the as.data.frame() call in as_tibble()).
as.data.frame(as.table(m1))
Var1 Var2 Freq
1 X A 1
2 Y A 2
3 Z A 3
data
m1 <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
Transform your matrix into a dataframe
bring your rownames to column group1
mutate group2
library(tidyverse)
# The input is named `m`: `matrix` is the base constructor function and was
# never assigned a value in this snippet.
m <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
data.frame(m) %>%
  rownames_to_column("group1") %>%
  # single-column matrix: its one column name is recycled onto every row
  mutate(group2 = colnames(m)) %>%
  dplyr::select(group1, group2, value = A)
group1 group2 value
1 X A 1
2 Y A 2
3 Z A 3
You can use -
library(tidyverse)
mat <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
mat %>%
as.data.frame() %>%
rownames_to_column(var = 'group1') %>%
pivot_longer(cols = -group1, names_to = 'group2')
# group1 group2 value
# <chr> <chr> <dbl>
#1 X A 1
#2 Y A 2
#3 Z A 3

How can I use purrr to pivot a nested dataframe?

The code below creates a simplified version of the dataframe and illustrates my desired end result (df_wider) based on the unnested version. My question is: How can I achieve the same end result (df_wider) from the nested version (nested_df), using purrr?
library(tidyverse)
df <- tibble(id_01 = c(rep("01", 3), rep("02", 3)),
a = (c("a", "a", "b", "c", "c", "d")),
b = letters[7:12],
id_02 = rep(c(1, 2, 1), 2)
)
df_wider <- pivot_wider(df,
id_cols = c(id_01, a),
names_from = id_02,
values_from = b,
names_sep = "_"
)
nested_df <- nest(df, data = -id_01)
To be clear, I am trying to pivot while the dataframes are nested (i.e., before unnesting).
We can use purrr::map() within dplyr::mutate():
library(tidyverse)
df <- tibble(
id_01 = c(rep("01", 3), rep("02", 3)),
a = (c("a", "a", "b", "c", "c", "d")),
b = letters[7:12],
id_02 = rep(c(1, 2, 1), 2)
)
nested_df <- df %>%
nest(data = -id_01) %>%
mutate(data = map(data, ~ .x %>%
pivot_wider(
id_cols = a,
names_from = id_02,
values_from = b
)))
nested_df
#> # A tibble: 2 x 2
#> id_01 data
#> <chr> <list>
#> 1 01 <tibble [2 x 3]>
#> 2 02 <tibble [2 x 3]>
nested_df %>%
unnest(data)
#> # A tibble: 4 x 4
#> id_01 a `1` `2`
#> <chr> <chr> <chr> <chr>
#> 1 01 a g h
#> 2 01 b i <NA>
#> 3 02 c j k
#> 4 02 d l <NA>
Created on 2021-03-26 by the reprex package (v1.0.0)

How do I select column based on value in another column with dplyr?

My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
  # long format: one row per id and base column (name/value)
  pivot_longer(cols = A:G) %>%
  # also stack ref and var, so each row states which role (nm1) asks for
  # which base
  pivot_longer(cols = c(ref, var), names_to = "nm1", values_to = "base") %>%
  # Keep the count belonging to the requested base. The label comes from the
  # role itself rather than from column order, so it stays correct when the var
  # column precedes the ref column, and ref == var yields both counts.
  filter(name == base) %>%
  mutate(nm1 = paste0(nm1, "_count")) %>%
  select(id, nm1, value) %>%
  pivot_wider(names_from = nm1, values_from = value) %>%
  select(id, ref_count, var_count) %>%
  left_join(df1, ., by = "id")
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
# Look the counts up with a two-column (row, column) index matrix: row i is
# paired with the position of its ref (or var) letter among the base columns.
base_cols <- c("A", "T", "C", "G")
counts <- as.matrix(df1[base_cols])
rows <- seq_len(nrow(df1))
# named ref_count (not refcount) to match var_count and the dplyr result above
df1$ref_count <- counts[cbind(rows, match(df1$ref, base_cols))]
df1$var_count <- counts[cbind(rows, match(df1$var, base_cols))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
# Nest every column except id into a list-column `data`, one tibble per id
# (presumably a single row each, if id is row-specific — confirm).
df1 %>%
nest(data = -id) %>%
mutate(
data = map(
data,
# Inside the inner mutate(), ref and var are the nested tibble's own columns;
# their values are used as names to pull the matching column out of the
# tibble itself (`.`).
~mutate(., refcount = .[[ref]], var_count = .[[var]])
)
) %>%
# Flatten back to one table, now carrying the two new columns.
unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
nest(data = -c(ref, var)) %>%
mutate(
data = pmap(
list(data, ref, var),
function(df, ref, var) {
mutate(df, refcount = df[[ref]], var_count = df[[var]])
}
)
) %>%
unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

Resources