I have a 1-by-4 table that contains summary statistics of two variables. For example,
df <- data.frame(
x_min=1,
x_max=2,
y_min=3,
y_max=4)
df
x_min x_max y_min y_max
1 1 2 3 4
I'd like to shape it into a 2-by-2 format:
x y
min 1 3
max 2 4
I'm able to get the result by the following code:
df %>%
pivot_longer(everything(),names_to = 'stat',values_to = 'val') %>%
separate(stat,into = c('var','stat'),sep = '_') %>%
pivot_wider(names_from = var, values_from = val)
However, I feel that this is a bit too circuitous because it first converts df into a table that's way too "long", and then "widens" it back to the appropriate size.
Is there a way to use pivot_longer() to get directly to the final result (that is, without involving pivot_wider())?
You could do:
df <- data.frame(
x_min=1,
x_max=2,
y_min=3,
y_max=4)
tidyr::pivot_longer(df, everything(), names_to = c(".value", "name"), names_sep = "_")
#> # A tibble: 2 × 3
#> name x y
#> <chr> <dbl> <dbl>
#> 1 min 1 3
#> 2 max 2 4
library(tidyverse)
df %>%
pivot_longer(everything(), names_to = c('.value', 'rowname'), names_sep = '_')%>%
column_to_rownames()
x y
min 1 3
max 2 4
Related
i have a dataframe which contains a text string like below that shows the ingredients and the proportion of each ingredient. What i would like to achive is to extract the proportion of each ingredient as a separate variable:
What i have:
given <- tibble(
ingredients =c("1.5BZ+1FZ+2HT","2FZ","0.5HT+2BZ")
)
What i want to achive:
to_achieve <- tibble(
ingredients =c("1.5BZ+1FZ+2HT","2FZ","0.5HT+2BZ"),
proportion_bz = c(1.5,0,2),
proportion_fz = c(1,2,0),
proportion_ht=c(2,2,0.5)
)
Please note there might be more than a dozen different ingredients and tidyverse methods are preferred.
Thanks in advance,
Felix
Making heavy use of tidyr you could first split your strings into rows per ingredient using separate_rows, afterwards extract the numeric proportion and the type of ingredient and finally use pivot_wider to reshape into your desired format:
library(dplyr)
library(tidyr)
given %>%
mutate(ingredients_split = ingredients) |>
tidyr::separate_rows(ingredients_split, sep = "\\+") |>
tidyr::extract(
ingredients_split,
into = c("proportion", "ingredient"),
regex = "^([\\d+\\.]+)(.*)$"
) |>
mutate(
proportion = as.numeric(proportion),
ingredient = tolower(ingredient)
) |>
pivot_wider(
names_from = ingredient,
names_prefix = "proportion_",
values_from = proportion,
values_fill = 0
)
#> # A tibble: 3 × 4
#> ingredients proportion_bz proportion_fz proportion_ht
#> <chr> <dbl> <dbl> <dbl>
#> 1 1.5BZ+1FZ+2HT 1.5 1 2
#> 2 2FZ 0 2 0
#> 3 0.5HT+2BZ 2 0 0.5
library(tidyr)
library(readr)
library(stringr)
library(janitor)
# SOLUTION -----
given %>%
separate(ingredients, into = c("a", "b", "c"), sep = "\\+", remove = F) %>%
pivot_longer(a:c) %>%
select(-name) %>%
mutate(name = str_remove_all(value, "[0-9]|\\."),
value = parse_number(value)) %>%
na.omit() %>%
pivot_wider(names_prefix = "proportion_", values_fill = 0) %>%
clean_names()
# OUTPUT ----
#># A tibble: 3 × 4
#> ingredients proportion_bz proportion_fz proportion_ht
#> <chr> <dbl> <dbl> <dbl>
#>1 1.5BZ+1FZ+2HT 1.5 1 2
#>2 2FZ 0 2 0
#>3 0.5HT+2BZ 2 0 0.5
I have a grouped data frame which I want to summarise into "count of values less than x, y, z by group". I can manually generate the wide dataframe I want using code similar to this below
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
manual <- df %>%
group_by(group) %>%
summarise(less_than_50 = sum(num < 50),
less_than_100 = sum(num < 100),
less_than_150 = sum(num < 150))
However, I'd like to be able to define a list of "less thans" and generate these columns by referring to a list. I've done something similar in the past, though using enframe(quantile()) to generate a long list of quantiles before pivoting
pc <- c(0.1, 0.5, 0.9)
quantiles <- df %>%
group_by(group) %>%
summarise(enframe(quantile(num, pc))) %>%
pivot_wider(
id_cols = group,
names_from = name,
values_from = value
)
But I don't know / understand the way to define a custom function within the enframe(). Ideally I'd like to apply this in something like the code below (though this obviously doesn't work), with or without the pivot step, in order to get back to the same output as "manual"
levels <- c(50, 100, 150)
programmatic <- df %>%
group_by(group) %>%
summarise(cols = ("less_than", x), num < levels) %>%
pivot...
Any help greatly appreciated
One way you could do it:
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
less_than <- function(x) {
df %>%
group_by(group) %>%
summarise(less_than_ = sum(num < x)) %>%
rename_with(~ str_c(., x), .cols = -group)
}
levels <- c(50, 100, 150)
map_dfr(levels, less_than) |>
group_by(group) |>
summarise(across(everything(), mean, na.rm = TRUE))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <dbl> <dbl> <dbl>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
# Manual result for comparison
df %>%
group_by(group) %>%
summarise(less_than_50 = sum(num < 50),
less_than_100 = sum(num < 100),
less_than_150 = sum(num < 150))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <int> <int> <int>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
Created on 2022-06-06 by the reprex package (v2.0.1)
I got a data frame with a lot of columns and want to summarise them with multiple functions.
test_df <- data.frame(Group = sample(c("A", "B", "C"), 10, T), var1 = sample(1:5, 10, T), var2 = sample(3:7, 10, T))
test_df %>%
group_by(Group) %>%
summarise_all(c(Mean = mean, Sum = sum))
# A tibble: 3 x 5
Group var1_Mean var2_Mean var1_Sum var2_Sum
<chr> <dbl> <dbl> <int> <int>
1 A 3.14 5.14 22 36
2 B 4.5 4.5 9 9
3 C 4 6 4 6
This results in a tibble with the first row Group and column names with a combination of the previous column name and the function name.
The desired result is a table with the previous column names as first row and the groups and functions in the column names.
I can achive this with
test_longer <- test_df %>% pivot_longer(cols = starts_with("var"), names_to = "var", values_to = "val")
# Add row number because spread needs unique identifiers for rows
test_longer <- test_longer %>%
group_by(Group) %>%
mutate(grouped_id = row_number())
spread(test_longer, Group, val) %>%
select(-grouped_id) %>%
group_by(var) %>%
summarise_all(c(Mean = mean, Sum = sum), na.rm = T)
# A tibble: 2 x 7
var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
<chr> <dbl> <dbl> <dbl> <int> <int> <int>
1 var1 3.14 4.5 4 22 9 4
2 var2 5.14 4.5 6 36 9 6
But this seems to be a rather long detour... There probably is a better way, but I could not find it. Any suggestions? Thank you
There's lots of ways to go about it, but I would simplify it by pivoting to a longer data frame initially, and then grouping by var and group. Then you can just pivot wider to get the final result you want. Note that I used summarize(across()) which replaces the deprecated summarize_all(), even though with a single column could've just manually specified Mean = ... and Sum = ....
set.seed(123)
test_df %>%
pivot_longer(
var1:var2,
names_to = "var"
) %>%
group_by(Group, var) %>%
summarize(
across(
everything(),
list(Mean = mean, Sum = sum),
.names = "{.fn}"
),
.groups = "drop"
) %>%
pivot_wider(
names_from = "Group",
values_from = c(Mean, Sum),
names_glue = "{Group}_{.value}"
)
#> # A tibble: 2 × 7
#> var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
#> <chr> <dbl> <dbl> <dbl> <int> <int> <int>
#> 1 var1 1 2.5 3.2 1 10 16
#> 2 var2 5 4.5 4.4 5 18 22
In the example below, I would like to add column 'value' based on the values of column 'variable' (i.e., 1 and 20).
toy_data <-
tibble::tribble(
~x, ~y, ~variable,
1, 2, "x",
10, 20, "y"
)
Like this:
x
y
variable
value
1
2
x
1
10
20
y
20
However, none of the below works:
toy_data %>%
dplyr::mutate(
value = get(variable)
)
toy_data %>%
dplyr::mutate(
value = mget(variable)
)
toy_data %>%
dplyr::mutate(
value = mget(variable, inherits = TRUE)
)
toy_data %>%
dplyr::mutate(
value = !!variable
)
How can I do this?
If you know which variables you have in the dataframe in advance: use simple logic like ifelse() or dplyr::case_when() to choose between them.
If not: use functional programming. Under is an example:
library(dplyr)
f <- function(data, variable_col) {
data[[variable_col]] %>%
purrr::imap_dbl(~ data[[.y, .x]])
}
toy_data$value <- f(toy_data, "variable")
Here are a few options that should scale well.
First is a base option that works along both the variable column and its index. (I made a copy of the data frame just so I had the original intact for more programming.)
library(dplyr)
toy2 <- toy_data
toy2$value <- mapply(function(v, i) toy_data[[v]][i], toy_data$variable, seq_along(toy_data$variable))
toy2
#> # A tibble: 2 × 4
#> x y variable value
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 2 x 1
#> 2 10 20 y 20
Second uses purrr::imap_dbl to iterate along the variable and its index and return a double.
toy_data %>%
mutate(value = purrr::imap_dbl(variable, function(v, i) toy_data[[v]][i]))
#> # A tibble: 2 × 4
#> x y variable value
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 2 x 1
#> 2 10 20 y 20
Third is least straightforward, but what I'd most likely use personally, maybe just because it's a process that fits many of my workflows. Pivotting makes a long version of the data, letting you see both values of variable and corresponding values of x and y, which you can then filter for where those 2 columns match. Then self-join back to the data frame.
inner_join(
toy_data,
toy_data %>%
tidyr::pivot_longer(cols = -variable, values_to = "value") %>%
filter(variable == name),
by = "variable"
) %>%
select(-name)
#> # A tibble: 2 × 4
#> x y variable value
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 2 x 1
#> 2 10 20 y 20
Edit: #jpiversen rightly points out that the self-join won't work if variable has duplicates—in that case, add a row number to the data and use that as an additional joining column. Here I first add an additional observation to illustrate.
toy3 <- toy_data %>%
add_row(x = 5, y = 4, variable = "x") %>%
tibble::rowid_to_column()
inner_join(
toy3,
toy3 %>%
pivot_longer(cols = c(-rowid, -variable), values_to = "value") %>%
filter(variable == name),
by = c("rowid", "variable")
) %>%
select(-name, -rowid)
here is data.frame data as below , how to transfer it to wished_data Thanks!
library(tidyverse)
data <- data.frame(category=c('a','b','a','b','a'),
values=c(1,'A','2','4','B'))
#below code can't work
data %>% group_by(category ) %>%
summarize(sum=if_else(is.numeric(values)>0,sum(is.numeric(values)),paste0(values)))
#below is the wished result
wished_data <- data.frame(category=c('a','a','b','b'),
values=c('3','B','A','4'))
Mixing numeric and character variables in a column is not tidy. Consider giving each type their own column, for example:
data %>%
mutate(letters = str_extract(values, "[A-Z]"),
numbers = as.numeric(str_extract(values, "\\d"))) %>%
group_by(category) %>%
summarise(values = sum(numbers, na.rm = T),
letters = na.omit(letters))
category values letters
<chr> <dbl> <chr>
1 a 3 B
2 b 4 A
In R string math does not make sense, "1+1" is not "2", and is.numeric("1") gives FALSE. A workaround is converting to list object, or to give each their own columns.
I'd create a separate column to group numeric values in a category separately from characters.
data %>%
mutate(num_check = grepl("[0-9]", values)) %>%
group_by(category, num_check) %>%
summarize(sum = ifelse(
unique(num_check),
as.character(sum(as.numeric(values))),
unique(values)
), .groups = "drop")
#> # A tibble: 4 × 3
#> category num_check sum
#> <chr> <lgl> <chr>
#> 1 a FALSE B
#> 2 a TRUE 3
#> 3 b FALSE A
#> 4 b TRUE 4
Here is a bit of a messy answer,
library(dplyr)
bind_rows(data %>%
filter(is.na(as.numeric(values))),
data %>%
mutate(values = as.numeric(values)) %>%
group_by(category) %>%
summarise(values = as.character(sum(values, na.rm = TRUE)))) %>%
arrange(category)
category values
#1 a B
#2 a 3
#3 b A
#4 b 4