I have a data frame that contains a text string like the one below, showing the ingredients and the proportion of each ingredient. What I would like to achieve is to extract the proportion of each ingredient as a separate variable:
What i have:
# Example input: one recipe string per row; ingredients are joined by "+" and
# each ingredient is written as "<proportion><code>".
given <- tibble(
  ingredients = c("1.5BZ+1FZ+2HT", "2FZ", "0.5HT+2BZ")
)
What I want to achieve:
# Desired output: one numeric proportion_<ingredient> column per ingredient,
# with 0 where a recipe does not contain that ingredient.
to_achieve <- tibble(
  ingredients = c("1.5BZ+1FZ+2HT", "2FZ", "0.5HT+2BZ"),
  proportion_bz = c(1.5, 0, 2),
  proportion_fz = c(1, 2, 0),
  # "2FZ" contains no HT, so its HT proportion is 0.
  proportion_ht = c(2, 0, 0.5)
)
Please note there might be more than a dozen different ingredients and tidyverse methods are preferred.
Thanks in advance,
Felix
Making heavy use of tidyr you could first split your strings into rows per ingredient using separate_rows, afterwards extract the numeric proportion and the type of ingredient and finally use pivot_wider to reshape into your desired format:
library(dplyr)
library(tidyr)
# Split each recipe into one row per ingredient, parse "<number><code>" into a
# numeric proportion and a lower-case ingredient code, then spread the codes
# into proportion_<code> columns (0 where an ingredient is absent).
given |>
  # Work on a copy so the original `ingredients` string is kept as the row key.
  mutate(ingredients_split = ingredients) |>
  tidyr::separate_rows(ingredients_split, sep = "\\+") |>
  tidyr::extract(
    ingredients_split,
    into = c("proportion", "ingredient"),
    # Leading digits/dots form the proportion; the remainder is the code.
    regex = "^([\\d.]+)(.*)$"
  ) |>
  mutate(
    proportion = as.numeric(proportion),
    ingredient = tolower(ingredient)
  ) |>
  pivot_wider(
    names_from = ingredient,
    names_prefix = "proportion_",
    values_from = proportion,
    values_fill = 0
  )
#> # A tibble: 3 × 4
#> ingredients proportion_bz proportion_fz proportion_ht
#> <chr> <dbl> <dbl> <dbl>
#> 1 1.5BZ+1FZ+2HT 1.5 1 2
#> 2 2FZ 0 2 0
#> 3 0.5HT+2BZ 2 0 0.5
library(tidyr)
library(readr)
library(stringr)
library(janitor)
# SOLUTION -----
# Split each recipe into one row per ingredient (any number of ingredients),
# derive the ingredient code and its numeric proportion, then spread the codes
# into proportion_<code> columns.
given %>%
  # Split a copy so the original `ingredients` string stays as the row key.
  mutate(value = ingredients) %>%
  separate_rows(value, sep = "\\+") %>%
  mutate(
    # Ingredient code = the piece with digits and dots stripped.
    name = str_remove_all(value, "[0-9]|\\."),
    value = parse_number(value)
  ) %>%
  # pivot_wider() defaults to names_from = name, values_from = value.
  pivot_wider(names_prefix = "proportion_", values_fill = 0) %>%
  clean_names()
# OUTPUT ----
#># A tibble: 3 × 4
#> ingredients proportion_bz proportion_fz proportion_ht
#> <chr> <dbl> <dbl> <dbl>
#>1 1.5BZ+1FZ+2HT 1.5 1 2
#>2 2FZ 0 2 0
#>3 0.5HT+2BZ 2 0 0.5
Related
I have a query that has multiple group-by - summarise statements. When I ungroup the data between everything works fine, but if I don't one of the columns is replaced by another.
I would expect the columns to not be changed. For example in the examples below, the variable gender should be F, or M and not Group X
library(dplyr)
library(arrow)
# Create sample dataset: N rows with two grouping columns (code_group, gender),
# a year, and a random numeric value. The seed makes the sample reproducible.
N <- 1000
set.seed(123)
orig_data <- tibble(
code_group = sample(paste("Group", 1:2), N, replace = TRUE),
year = sample(2015:2016, N, replace = TRUE),
gender = sample(c("F", "M"), N, replace = TRUE),
value = runif(N, 0, 10)
)
# Write the tibble to disk under the "example" directory as an Arrow dataset.
write_dataset(orig_data, "example")
# Query and replicate the error
# The outer parentheses print the dataset after assigning it to `ds`.
(ds <- open_dataset("example/"))
#> FileSystemDataset with 1 Parquet file
#> code_group: string
#> year: int32
#> gender: string
#> value: double
# Reproduces the problem: two group_by()/summarise() stages with no ungroup()
# in between. Stage 1 sums `value` per (year, code_group, gender); stage 2
# regroups by (code_group, gender) and takes the max sum and the row count.
# collect() executes the query and returns the result as a tibble.
ds |>
group_by(year, code_group, gender) |>
summarise(value = sum(value)) |>
group_by(code_group, gender) |>
summarise(value = max(value), NN = n()) |>
collect()
#> # A tibble: 2 × 4
#> # Groups: code_group [2]
#> code_group gender value NN
#> <chr> <chr> <dbl> <int>
#> 1 Group 1 Group 1 724. 4
#> 2 Group 2 Group 2 661. 4
ERROR: the gender variable is replaced by the values of the code_group variable.
# Same two-stage aggregation, but with an explicit ungroup() between the
# stages; with it, `gender` keeps its own values in the collected result.
ds |>
group_by(year, code_group, gender) |>
summarise(value = sum(value)) |>
ungroup() |> #< Added this line...
group_by(code_group, gender) |>
summarise(value = max(value), NN = n()) |>
collect()
#> # A tibble: 4 × 4
#> # Groups: code_group [2]
#> code_group gender value NN
#> <chr> <chr> <dbl> <int>
#> 1 Group 1 F 724. 2
#> 2 Group 2 M 627. 2
#> 3 Group 1 M 658. 2
#> 4 Group 2 F 661. 2
Note now after inserting the ungroup() between the group-by - summarise calls, gender is not replaced
Quick look at the query (note Node 4 where "gender": code_group)
# Same failing pipeline (no ungroup() between the stages), but ending in
# show_query() to print the Arrow execution plan instead of collecting rows.
ds |>
group_by(year, code_group, gender) |>
summarise(value = sum(value)) |>
group_by(code_group, gender) |>
summarise(value = max(value), NN = n()) |>
show_query()
#> ExecPlan with 8 nodes:
#> 7:SinkNode{}
#> 6:ProjectNode{projection=[code_group, gender, value, NN]}
#> 5:GroupByNode{keys=["code_group", "gender"], aggregates=[
#> hash_max(value, {skip_nulls=false, min_count=0}),
#> hash_sum(NN, {skip_nulls=true, min_count=1}),
#> ]}
#> 4:ProjectNode{projection=[value, "NN": 1, code_group, "gender": code_group]}
#> 3:ProjectNode{projection=[year, code_group, gender, value]}
#> 2:GroupByNode{keys=["year", "code_group", "gender"], aggregates=[
#> hash_sum(value, {skip_nulls=false, min_count=0}),
#> ]}
#> 1:ProjectNode{projection=[value, year, code_group, gender]}
#> 0:SourceNode{}
Created on 2022-12-07 by the reprex package (v2.0.1)
Do I have a wrong understanding of arrow/dplyr or is this a bug (if so is that in arrow or dplyr/dbplyr)?
Note that this was indeed a bug and had been closed with PR 14905. It should work with the development version of arrow on GitHub.
I got a data frame with a lot of columns and want to summarise them with multiple functions.
# Example data: a grouping column plus two integer columns to summarise.
# `replace = TRUE` is spelled out (not `T`, which can be reassigned) because
# ten draws are taken from fewer than ten distinct values.
test_df <- data.frame(
  Group = sample(c("A", "B", "C"), 10, replace = TRUE),
  var1 = sample(1:5, 10, replace = TRUE),
  var2 = sample(3:7, 10, replace = TRUE)
)
# Mean and sum of every non-grouping column, one row per Group.
test_df %>%
  group_by(Group) %>%
  summarise_all(c(Mean = mean, Sum = sum))
# A tibble: 3 x 5
Group var1_Mean var2_Mean var1_Sum var2_Sum
<chr> <dbl> <dbl> <int> <int>
1 A 3.14 5.14 22 36
2 B 4.5 4.5 9 9
3 C 4 6 4 6
This results in a tibble with the first row Group and column names with a combination of the previous column name and the function name.
The desired result is a table with the previous column names as first row and the groups and functions in the column names.
I can achieve this with
# Long format: one row per original cell, with the source column name in `var`
# and its value in `val`.
test_longer <- test_df %>%
  pivot_longer(cols = starts_with("var"), names_to = "var", values_to = "val")
# Add row number because spread needs unique identifiers for rows
test_longer <- test_longer %>%
  group_by(Group) %>%
  mutate(grouped_id = row_number())
# One column per Group, then the mean and sum of each Group column per `var`.
# Groups of different sizes leave NA padding, hence na.rm = TRUE.
spread(test_longer, Group, val) %>%
  select(-grouped_id) %>%
  group_by(var) %>%
  summarise_all(c(Mean = mean, Sum = sum), na.rm = TRUE)
# A tibble: 2 x 7
var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
<chr> <dbl> <dbl> <dbl> <int> <int> <int>
1 var1 3.14 4.5 4 22 9 4
2 var2 5.14 4.5 6 36 9 6
But this seems to be a rather long detour... There probably is a better way, but I could not find it. Any suggestions? Thank you
There are lots of ways to go about it, but I would simplify it by pivoting to a longer data frame initially, and then grouping by var and Group. Then you can just pivot wider to get the final result you want. Note that I used summarize(across()), which replaces the superseded summarize_all(), even though with a single column I could have just manually specified Mean = ... and Sum = ....
set.seed(123)
# Go long first, aggregate once per (Group, var), then widen so that every
# Group/statistic combination becomes its own column.
test_df |>
  pivot_longer(var1:var2, names_to = "var") |>
  group_by(Group, var) |>
  summarize(
    # `value` is the only non-grouping column left, so the outputs are simply
    # named after the functions: Mean and Sum.
    across(everything(), list(Mean = mean, Sum = sum), .names = "{.fn}"),
    .groups = "drop"
  ) |>
  pivot_wider(
    names_from = Group,
    values_from = c(Mean, Sum),
    names_glue = "{Group}_{.value}"
  )
#> # A tibble: 2 × 7
#> var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
#> <chr> <dbl> <dbl> <dbl> <int> <int> <int>
#> 1 var1 1 2.5 3.2 1 10 16
#> 2 var2 5 4.5 4.4 5 18 22
In the example below, I would like to add column 'value' based on the values of column 'variable' (i.e., 1 and 20).
# Example data: `variable` names, per row, which of the columns x / y holds the
# value of interest.
toy_data <- tibble::tibble(
  x = c(1, 10),
  y = c(2, 20),
  variable = c("x", "y")
)
Like this:
| x  | y  | variable | value |
|----|----|----------|-------|
| 1  | 2  | x        | 1     |
| 10 | 20 | y        | 20    |
However, none of the below works:
# The four attempts below are the ones reported as not working; each treats
# the whole `variable` column as a single name instead of doing a per-row
# lookup. They are kept unchanged for reference.

# Attempt 1: get() on the `variable` column.
toy_data %>%
dplyr::mutate(
value = get(variable)
)
# Attempt 2: mget() on the `variable` column.
toy_data %>%
dplyr::mutate(
value = mget(variable)
)
# Attempt 3: mget() that also searches enclosing environments.
toy_data %>%
dplyr::mutate(
value = mget(variable, inherits = TRUE)
)
# Attempt 4: unquoting `variable` with !!.
toy_data %>%
dplyr::mutate(
value = !!variable
)
How can I do this?
If you know which variables you have in the dataframe in advance: use simple logic like ifelse() or dplyr::case_when() to choose between them.
If not: use functional programming. Below is an example:
library(dplyr)
# Row-wise lookup: for each row i, return the value found in row i of the
# column whose name is stored in `data[[variable_col]][i]`.
#
# data          A data frame (or tibble).
# variable_col  Name (string) of the column holding the target column names.
#
# Returns a numeric vector with one element per row of `data`. It errors if a
# looked-up cell is not a single number.
f <- function(data, variable_col) {
  target_cols <- data[[variable_col]]
  # Always index rows by position, even if the name column carries names.
  vapply(
    seq_along(target_cols),
    function(i) data[[i, target_cols[[i]]]],
    numeric(1)
  )
}
# Store the per-row lookup result in a new `value` column.
toy_data$value <- f(toy_data, "variable")
Here are a few options that should scale well.
First is a base option that works along both the variable column and its index. (I made a copy of the data frame just so I had the original intact for more programming.)
library(dplyr)
# Base R variant: walk the column names and their row positions in parallel
# and pick the matching cell from the untouched original.
toy2 <- toy_data
toy2$value <- mapply(
  function(col_name, row_idx) toy_data[[col_name]][row_idx],
  toy_data$variable,
  seq_along(toy_data$variable)
)
toy2
#> # A tibble: 2 × 4
#> x y variable value
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 2 x 1
#> 2 10 20 y 20
Second uses purrr::imap_dbl to iterate along the variable and its index and return a double.
# purrr variant: imap_dbl() hands each column name and its position to the
# lookup and returns a double vector.
toy_data %>%
  mutate(
    value = purrr::imap_dbl(
      variable,
      function(col_name, row_idx) toy_data[[col_name]][row_idx]
    )
  )
#> # A tibble: 2 × 4
#> x y variable value
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 2 x 1
#> 2 10 20 y 20
Third is the least straightforward, but it is what I'd most likely use personally, maybe just because it's a process that fits many of my workflows. Pivoting makes a long version of the data, letting you see both the values of variable and the corresponding values of x and y, which you can then filter for rows where those two columns match. Then self-join back to the data frame.
# Build the long lookup table first (one row per original cell), keep only the
# cells whose column name equals `variable`, then join back onto the original.
toy_data %>%
  tidyr::pivot_longer(cols = -variable, values_to = "value") %>%
  filter(variable == name) %>%
  # `.` is the filtered long table; toy_data stays on the left of the join.
  inner_join(toy_data, ., by = "variable") %>%
  select(-name)
#> # A tibble: 2 × 4
#> x y variable value
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 2 x 1
#> 2 10 20 y 20
Edit: @jpiversen rightly points out that the self-join won't work if variable has duplicates. In that case, add a row number to the data and use that as an additional joining column. Here I first add an additional observation to illustrate.
# Add a duplicate `variable` value plus a row id, so the join can be keyed on
# (rowid, variable) instead of `variable` alone.
toy3 <- toy_data %>%
  add_row(x = 5, y = 4, variable = "x") %>%
  tibble::rowid_to_column()
inner_join(
  toy3,
  toy3 %>%
    # Namespace-qualified because only dplyr is attached in this answer.
    tidyr::pivot_longer(cols = c(-rowid, -variable), values_to = "value") %>%
    filter(variable == name),
  by = c("rowid", "variable")
) %>%
  select(-name, -rowid)
Here is a data.frame, `data`, shown below. How can I transform it into `wished_data`? Thanks!
library(tidyverse)
# Example input. `values` mixes a number with strings, so c() coerces every
# element to character.
data <- data.frame(category=c('a','b','a','b','a'),
values=c(1,'A','2','4','B'))
# The attempt below does not work: is.numeric(values) tests the type of the
# whole column (character here), not each element.
data %>% group_by(category ) %>%
summarize(sum=if_else(is.numeric(values)>0,sum(is.numeric(values)),paste0(values)))
# Desired result: per category, the numeric entries summed and the letter
# entries kept as they are.
wished_data <- data.frame(category=c('a','a','b','b'),
values=c('3','B','A','4'))
Mixing numeric and character variables in a column is not tidy. Consider giving each type their own column, for example:
# Give letters and numbers their own columns, then sum the numbers and keep
# the (non-missing) letter per category.
data %>%
  mutate(
    letters = str_extract(values, "[A-Z]"),
    # "\\d+" keeps multi-digit numbers whole; "\\d" would turn "12" into 1.
    numbers = as.numeric(str_extract(values, "\\d+"))
  ) %>%
  group_by(category) %>%
  summarise(
    values = sum(numbers, na.rm = TRUE),
    letters = na.omit(letters)
  )
category values letters
<chr> <dbl> <chr>
1 a 3 B
2 b 4 A
In R, string math does not make sense: "1+1" is not "2", and is.numeric("1") gives FALSE. A workaround is to convert to a list object, or to give each type its own column.
I'd create a separate column to group numeric values in a category separately from characters.
# Flag entries that contain a digit, then aggregate numeric and non-numeric
# entries separately within each category.
data %>%
  mutate(num_check = grepl("[0-9]", values)) %>%
  group_by(category, num_check) %>%
  summarize(
    # num_check is constant within a group, so a plain scalar if/else is
    # enough. As before, only the first distinct non-numeric value is kept.
    sum = if (num_check[[1]]) {
      as.character(sum(as.numeric(values)))
    } else {
      unique(values)[[1]]
    },
    .groups = "drop"
  )
#> # A tibble: 4 × 3
#> category num_check sum
#> <chr> <lgl> <chr>
#> 1 a FALSE B
#> 2 a TRUE 3
#> 3 b FALSE A
#> 4 b TRUE 4
Here is a bit of a messy answer,
library(dplyr)
# Sum the entries that parse as numbers per category, then put the rows that
# do not parse as numbers in front of those sums and sort by category.
data %>%
  mutate(values = as.numeric(values)) %>%
  group_by(category) %>%
  summarise(values = as.character(sum(values, na.rm = TRUE))) %>%
  # `.` is the per-category sum table; the non-numeric rows go first.
  bind_rows(filter(data, is.na(as.numeric(values))), .) %>%
  arrange(category)
category values
#1 a B
#2 a 3
#3 b A
#4 b 4
Using tidyr/dplyr, I have some factor columns which I'd like to Z-score, and then mutate an average Z-score, whilst retaining the original data for reference.
I'd like to avoid using a for loop in tidyr/dplyr, thus I'm gathering my data and performing my calculation (Z-score) on a single column. However, I'm struggling with restoring the wide format.
Here is a MWE:
library(dplyr)
library(tidyr)
# Original Data
# Original data: one row per person with two numeric factors, A and B.
# as_tibble() replaces the deprecated tbl_df().
dfData <- data.frame(
  Name = c("Steve", "Jwan", "Ashley"),
  A = c(10, 20, 12),
  B = c(0.2, 0.3, 0.5)
) %>%
  as_tibble()
# Gather to Z-score
# Long format: one row per (Name, Factor). FactorZ is the name of the future
# Z-score column; ValueZ is the Z-score of Value within its Factor.
dfLong <- dfData %>%
  gather(Factor, Value, A:B) %>%
  mutate(FactorZ = paste0("Z_", Factor)) %>%
  group_by(Factor) %>%
  mutate(
    ValueZ = (Value - mean(Value, na.rm = TRUE)) / sd(Value, na.rm = TRUE)
  )
# Now go wide to do some mutations (e.g. Z_Avg = (Z_A + Z_B)/2)
# This does not work. NOTE(review): the two spreads appear to run
# independently, so each Name ends up on two rows (one with A/Z_A filled, one
# with B/Z_B filled) and Z_Avg comes out NA - confirm by printing dfWide.
dfWide <- dfLong %>%
spread(Factor,Value) %>%
spread(FactorZ,ValueZ)%>%
mutate(Z_Avg = (Z_A+Z_B)/2)
# This is the desired result: the original columns plus a Z-score per factor
# and the average of the two Z-scores.
dfDesired <- dfData %>%
  mutate(
    Z_A = (A - mean(A, na.rm = TRUE)) / sd(A, na.rm = TRUE),
    Z_B = (B - mean(B, na.rm = TRUE)) / sd(B, na.rm = TRUE),
    Z_Avg = (Z_A + Z_B) / 2
  )
Thanks for any help/input!
Another approach using dplyr (version 0.5.0)
library(dplyr)
# Z-score every column except Name into <col>_Z, then average the two scores.
# across() (dplyr >= 1.0.0) replaces the deprecated mutate_each()/funs().
dfData %>%
  mutate(
    across(
      -Name,
      # scale() returns a one-column matrix; as.numeric() flattens it so the
      # new columns are ordinary numeric vectors.
      \(x) as.numeric(scale(x)),
      .names = "{.col}_Z"
    )
  ) %>%
  mutate(Z_Avg = (A_Z + B_Z) / 2)
# Mean that ignores missing values.
means <- function(x) mean(x, na.rm = TRUE)
# Collapse the half-filled rows of dfWide to one row per Name (the NA-ignoring
# mean keeps the single non-missing value), then recompute the average score.
# across() replaces the deprecated summarise_each()/funs().
dfWide %>%
  group_by(Name) %>%
  summarise(across(everything(), means)) %>%
  mutate(Z_Avg = (Z_A + Z_B) / 2)
# A tibble: 3 x 6
Name A B Z_A Z_B Z_Avg
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
3 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
Here is one approach with long and wide format. For z-transformation, you can use the base function scale. Furthermore, this approach includes a join to combine the original data frame and the one including the new values.
# Long format with a Z-score per Factor. scale() returns a one-column matrix,
# so it is flattened to keep ValueZ a plain numeric column.
dfLong <- dfData %>%
  gather(Factor, Value, A:B) %>%
  group_by(Factor) %>%
  mutate(ValueZ = as.numeric(scale(Value)))
# Name Factor Value ValueZ
# <fctr> <chr> <dbl> <dbl>
# 1 Steve A 10.0 -0.7559289
# 2 Jwan A 20.0 1.1338934
# 3 Ashley A 12.0 -0.3779645
# 4 Steve B 0.2 -0.8728716
# 5 Jwan B 0.3 -0.2182179
# 6 Ashley B 0.5 1.0910895
# Spread the Z-scores into Z_<Factor> columns, add their average, and join the
# result back onto the original data by Name.
dfWide <- dfData %>%
  inner_join(
    dfLong %>%
      ungroup() %>%
      select(-Value) %>%
      mutate(Factor = paste0("Z_", Factor)) %>%
      spread(Factor, ValueZ) %>%
      mutate(Z_Avg = (Z_A + Z_B) / 2),
    # Explicit key: Name is the only column the two tables share.
    by = "Name"
  )
# Name A B Z_A Z_B Z_Avg
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
# 2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
# 3 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
I would just do it all in wide format. No need to keep switching between the long and wide formats.
# All in wide format: Z-score A and B in place, then average the two scores.
# Columns are referenced directly, so the calculation always uses the data
# flowing through the pipe rather than the global dfData.
dfData %>%
  mutate(
    Z_A = (A - mean(A)) / sd(A),
    Z_B = (B - mean(B)) / sd(B)
  ) %>%
  mutate(Z_AVG = (Z_A + Z_B) / 2)