dplyr concatenate strings by group - row by row - r

I need to concatenate strings by group using dplyr, but each value in the resulting column should include only the current and preceding rows of its group, not the following ones.
I want my data to look like this:
ID
message
messages_used
1
53
53
1
54
53,54
1
55
53,54,55
2
53
53
2
58
53,58
Is it achievable using dplyr only?

You can use Reduce(..., accumulate = TRUE) from base:
library(dplyr)
# Build a running, comma-separated list of `message` values within each ID.
# Reduce(accumulate = TRUE) keeps every intermediate result, so row i holds
# messages 1..i of its group. sep = "," matches the requested "53,54" format.
df %>%
  group_by(ID) %>%
  mutate(
    messages_used = Reduce(
      \(x, y) paste(x, y, sep = ","),
      message,
      accumulate = TRUE
    )
  ) %>%
  ungroup()
# # A tibble: 5 x 3
# ID message messages_used
# <int> <int> <chr>
# 1 1 53 53
# 2 1 54 53,54
# 3 1 55 53,54,55
# 4 2 53 53
# 5 2 58 53,58

We can use dplyr::group_by() and purrr::accumulate():
dat <- data.frame(ID = c(1,1,1,2,2), message = c(53,54,55,53,58))
library(dplyr)
library(purrr)
# Running concatenation of `message` within each ID. accumulate() returns all
# intermediate results, so each row lists the messages seen so far in its
# group. The column is named `messages_used` as requested in the question, and
# the result is ungrouped so later verbs do not silently operate per ID.
dat %>%
  group_by(ID) %>%
  mutate(
    messages_used = accumulate(message, \(x, y) paste(x, y, sep = ","))
  ) %>%
  ungroup()
#> # A tibble: 5 x 3
#> ID message messages_used
#> <dbl> <dbl> <chr>
#> 1 1 53 53
#> 2 1 54 53,54
#> 3 1 55 53,54,55
#> 4 2 53 53
#> 5 2 58 53,58
Created on 2022-05-11 by the reprex package (v2.0.1)

Related

How do I create new columns based on the values of a different column and count the percentage value of another numerical column in R?

The sample data frame:
no <- rep(1:5, each=2)
type <- rep(LETTERS[1:2], times=5)
set.seed(4)
value <- round(runif(10, 10, 30))
df <- data.frame(no, type, value)
df
no type value
1 1 A 22
2 1 B 10
3 2 A 16
4 2 B 16
5 3 A 26
6 3 B 15
7 4 A 24
8 4 B 28
9 5 A 29
10 5 B 11
Now what I want is to calculate, within each no, the percentage of value contributed by each type (A or B) and create a separate column for each type. Desired output is something like this:
no pct_A pct_B total_value
1 1 68.75000 31.25000 32
2 2 50.00000 50.00000 32
3 3 63.41463 36.58537 41
4 4 46.15385 53.84615 52
5 5 72.50000 27.50000 40
What I have tried so far (This gives the right output but the process seems very sub-optimal):
df %>%
group_by(no) %>%
mutate(total_value= sum(value))-> df
df %>%
mutate(pct_A=ifelse(type=='A', (value/total_value) *100, 0),
pct_B=ifelse(type=='B', (value/total_value) *100, 0)) %>%
group_by(no) %>%
summarise(pct_A=sum(pct_A),
pct_B=sum(pct_B)) %>%
ungroup() %>%
merge(df) %>%
distinct(no, .keep_all = T) %>%
select(-type, -value)
Is there any better way to do that? Especially using dplyr?
I looked for other answers too, but no help. This one came closer:
R Create new column of values based on the factor levels of another column
You could do it in base using aggregate.
# For each `no`, return the percentage share of every `value` followed by the
# group total. aggregate() stores these as a matrix column, which
# do.call(data.frame, ...) expands into ordinary columns.
# Multiplying by 100 yields percentages, as the pct_* names and the requested
# output require, rather than fractions.
# Assumes the rows within each `no` are ordered A, B, as in the sample data.
do.call(
  data.frame,
  aggregate(value ~ no, df, \(x) c(proportions(x) * 100, sum(x)))
) |>
  setNames(c("no", "pct_A", "pct_B", "total_value"))
# no pct_A pct_B total_value
# 1 1 68.75000 31.25000 32
# 2 2 50.00000 50.00000 32
# 3 3 63.41463 36.58537 41
# 4 4 46.15385 53.84615 52
# 5 5 72.50000 27.50000 40
For each no we can calculate sum and ratio then get the data in wide format.
library(dplyr)
library(tidyr)
# Per `no`: record the group total first (while `value` is still the raw
# number), then overwrite `value` with its percentage share and spread the
# `type` levels into pct_* columns.
df %>%
  group_by(no) %>%
  mutate(
    total_value = sum(value),
    value = prop.table(value) * 100
  ) %>%
  ungroup() %>%
  pivot_wider(
    names_from = type,
    values_from = value,
    names_prefix = "pct_"
  )
# no total_value pct_A pct_B
# <int> <dbl> <dbl> <dbl>
#1 1 32 68.8 31.2
#2 2 32 50 50
#3 3 41 63.4 36.6
#4 4 52 46.2 53.8
#5 5 40 72.5 27.5
Here are two more ways to do this.
We could use purrr::map_dfc. However, setting up the correct column names is kind of cumbersome:
library(dplyr)
library(purrr)
df %>%
group_by(no) %>%
summarise(total_value = sum(value),
map_dfc(unique(type) %>% set_names(., paste0("pct_",.)),
~ sum((type == .x) * value) / total_value * 100)
)
#> # A tibble: 5 x 4
#> no total_value pct_A pct_B
#> <int> <dbl> <dbl> <dbl>
#> 1 1 32 68.8 31.2
#> 2 2 32 50 50
#> 3 3 41 63.4 36.6
#> 4 4 52 46.2 53.8
#> 5 5 40 72.5 27.5
Alternatively, we can use dplyover::over (disclaimer: I'm the maintainer), which allows us to create names on the fly in an across-like way:
library(dplyover) # https://github.com/TimTeaFan/dplyover
df %>%
group_by(no) %>%
summarise(total_value = sum(value),
over(dist_values(type), # alternatively `unique(type)`
~ sum((type == .x) * value) / total_value * 100,
.names = "pct_{x}")
)
#> # A tibble: 5 x 4
#> no total_value pct_A pct_B
#> <int> <dbl> <dbl> <dbl>
#> 1 1 32 68.8 31.2
#> 2 2 32 50 50
#> 3 3 41 63.4 36.6
#> 4 4 52 46.2 53.8
#> 5 5 40 72.5 27.5
Created on 2021-09-17 by the reprex package (v2.0.1)
Performance-wise both approaches should be faster compared to data-rectangling approaches such as pivot_wider (but I haven't tested this specific scenario).

Separate observations not joining in data frame

So this data frame has a lot of separate observations that I need to be added together.
Use dget():
I've tried a lot of different solutions like:
df %>%
group_by(product, price) %>%
summarise(
quantity = sum(quantity),
total = sum(total)
)
And:
df %>%
gather(key = variable, value = value, c(Quantity,Price,Total)) %>%
group_by(Product, variable) %>%
summarize(sum = sum(value)) %>%
spread(variable, sum)
And:
df %>%
group_by(Product) %>%
summarise(Quantity = sum(Quantity),
AveragePrice = sum(Total)/sum(Quantity),
Total = sum(Total))
but I just get:
> df
quantity total
1 61 1685
Expected output is something like this but obviously with all the products on the product column:
#> # Groups: product [2]
#> product price quantity total
#> <chr> <dbl> <dbl> <dbl>
#> 1 small cucumber 10 1 10
#> 2 tomatoes 1kg 16 2 32
I have asked this before but I wasn't nearly specific enough.
Thanks.
You're calling plyr::summarise instead of dplyr::summarise. You can call it explicitly with
# One row per Product. summarise() is namespace-qualified so that dplyr's
# version is used even when plyr has been attached afterwards and masks it.
df %>%
  group_by(Product) %>%
  dplyr::summarise(
    # Expressions are evaluated in order: from here on `Quantity` is the
    # per-product sum, so sum(Quantity) below is that same single value.
    Quantity = sum(Quantity),
    AveragePrice = sum(Total) / sum(Quantity),
    # Total is overwritten last so AveragePrice still sees the raw column.
    Total = sum(Total)
  )
#> # A tibble: 27 x 4
#> Product Quantity AveragePrice Total
#> <chr> <dbl> <dbl> <dbl>
#> 1 asparagus 200g 4 45 180
#> 2 back bacon 200g 1 30 30
#> 3 beef fillet strips 500g 1 90 90
#> 4 beetroot 1kg 1 15 15
#> 5 broccoli head 1 25 25
#> 6 butter 500g 1 57 57
#> 7 butternut cubes 4 14 56
#> 8 calistos jalape=c3=b1o salsa 1 40 40
#> 9 carrot 1 kg 4 14 56
#> 10 cauliflower whole head 2 25 50
#> # … with 17 more rows

How to define the baseline with select negative number

I have a data set that looks like this:
Sample data can be created with:
ID <-c("1", "1", "1","1","2", "2")
Test_date <-c(-15, -8,7, 12,-3,2)
Test_Result<-c(100, 98, 78,99, 65,89)
Sample.data <- data.frame(ID, Test_date, Test_Result)
For each ID, I need to use the Test_Result at the largest negative Test_date (the one closest to zero) as the baseline. Progress is then calculated as Test_Result divided by the baseline Test_Result. What should I do?
The final result should look something like this:
Many Thanks.
try it this way
library(tidyverse)
df %>%
group_by(ID) %>%
filter(Test_date > 0 | Test_date == max(Test_date[Test_date < 0])) %>%
mutate(progress = ifelse(Test_date > 0,
Test_Result / Test_Result[which.min(Test_date)],
NA_real_)) %>%
right_join(df) %>%
arrange(ID, Test_date) %>%
ungroup(ID)
Joining, by = c("ID", "Test_date", "Test_Result")
# A tibble: 6 x 4
ID Test_date Test_Result progress
<chr> <dbl> <dbl> <dbl>
1 1 -15 100 NA
2 1 -8 98 NA
3 1 7 78 0.796
4 1 12 99 1.01
5 2 -3 65 NA
6 2 2 89 1.37
In my view, this kind of "by group" operation is most easily accomplished with the dplyr or data.table packages:
ID <-c("1", "1", "1","2", "2")
Test_date <-c(-15, -8,7, -3,2)
Test_Result<-c(100, 98, 78,65,89)
Sample.data <- data.frame(ID, Test_date, Test_Result)
#' Index of the largest negative value in `x`
#'
#' @param x A numeric vector (here: test dates relative to a reference day).
#' @return A single integer: the position of the negative element closest to
#'   zero. If several elements tie, the first one is returned. If `x` has no
#'   negative values, `NA_integer_` is returned, so indexing with the result
#'   yields `NA` rather than a zero-length vector.
big_neg <- function(x) {
  # which() drops NA, so missing dates are never picked as the baseline.
  neg <- which(x < 0)
  if (length(neg) == 0L) {
    return(NA_integer_)
  }
  # which.max() returns exactly one position, even when values tie.
  neg[which.max(x[neg])]
}
library(dplyr)
Sample.data %>%
group_by(ID) %>%
mutate(Progress = Test_Result / Test_Result[big_neg(Test_date)])
#> # A tibble: 5 x 4
#> # Groups: ID [2]
#> ID Test_date Test_Result Progress
#> <chr> <dbl> <dbl> <dbl>
#> 1 1 -15 100 1.02
#> 2 1 -8 98 1
#> 3 1 7 78 0.796
#> 4 2 -3 65 1
#> 5 2 2 89 1.37
library(data.table)
dat <- data.table(Sample.data)
dat[, Progress := Test_Result / Test_Result[big_neg(Test_date)], by=ID][]
#> ID Test_date Test_Result Progress
#> 1: 1 -15 100 1.0204082
#> 2: 1 -8 98 1.0000000
#> 3: 1 7 78 0.7959184
#> 4: 2 -3 65 1.0000000
#> 5: 2 2 89 1.3692308

Calculate total elapsed time

I have the following data:
library(dplyr, warn.conflicts = FALSE)
df <- tibble(
x = c(30, 60, 90, 30, 60, 90),
phase = c(rep(c("phase 1", "phase 2"), each = 3))
)
df
#> # A tibble: 6 x 2
#> x phase
#> <dbl> <chr>
#> 1 30 phase 1
#> 2 60 phase 1
#> 3 90 phase 1
#> 4 30 phase 2
#> 5 60 phase 2
#> 6 90 phase 2
Created on 2020-08-11 by the reprex package (v0.3.0)
Where x is the elapsed time (in seconds) within each phase. Since phase is something that happens continuously, I am interested in calculating the total elapsed time.
Desired output:
#> # A tibble: 6 x 3
#> x phase elapsed_time
#> <dbl> <chr> <dbl>
#> 1 30 phase 1 30
#> 2 60 phase 1 60
#> 3 90 phase 1 90
#> 4 30 phase 2 120
#> 5 60 phase 2 150
#> 6 90 phase 2 180
Any ideas? Please, note that my real example has much more phases.
I believe the following post has the answer you're looking for:
How to add a cumulative column to an R dataframe using dplyr?
It shows how to create a cumulative column using group_by and mutate. It appears you want the elapsed time to sum across both phases, so simply do not include the group_by call in your code.
Here is an idea via dplyr. First we need to group by the phase and get the time differences for each. We then ungroup() and take the cumsum() as a total, i.e.
library(dplyr)
# Turn the within-phase elapsed time into per-row increments (the first row of
# each phase is compared against 0), then accumulate those increments over the
# whole table to get the total elapsed time.
df %>%
  group_by(phase) %>%
  mutate(diffs = x - lag(x, default = 0)) %>%
  ungroup() %>%
  mutate(res = cumsum(diffs)) %>%
  select(-diffs)
# A tibble: 6 x 3
# x phase res
# <dbl> <chr> <dbl>
#1 30 phase 1 30
#2 60 phase 1 60
#3 90 phase 1 90
#4 30 phase 2 120
#5 60 phase 2 150
#6 90 phase 2 180
Here is another dplyr solution. It finds the start of each phase, and adds this to x
library(tidyverse)
df <- tibble(
x = c(30, 60, 90, 30, 60, 90),
phase = c(rep(c("phase 1", "phase 2"), each = 3))
)
df %>% group_by(phase) %>%
nest() %>%
mutate(start = map_dbl(data, max)) %>%
ungroup() %>%
mutate(start = lag(start, default = 0)) %>%
unnest(data) %>%
mutate(elapsed_time = start + x) %>%
select(-start)
#> # A tibble: 6 x 3
#> phase x elapsed_time
#> <chr> <dbl> <dbl>
#> 1 phase 1 30 30
#> 2 phase 1 60 60
#> 3 phase 1 90 90
#> 4 phase 2 30 120
#> 5 phase 2 60 150
#> 6 phase 2 90 180
Created on 2020-08-11 by the reprex package (v0.3.0)

Hashing every row of a tibble

I am using the newly minted dplyr 1.0.0 and the digest package to generate a hash of every row in a tibble.
I am aware of
adding hash to each row using dplyr and digest in R
but I would like to use the revamped rowwise() in dplyr 1.0.0.
See the example below. Does anyone have any idea why it fails? I should be allowed to digest a row whose entries are of different types.
library(dplyr)
library(digest)
df <- tibble(
student_id = letters[1:4],
student_id2 = letters[9:12],
test1 = 10:13,
test2 = 20:23,
test3 = 30:33,
test4 = 40:43
)
df
#> # A tibble: 4 x 6
#> student_id student_id2 test1 test2 test3 test4
#> <chr> <chr> <int> <int> <int> <int>
#> 1 a i 10 20 30 40
#> 2 b j 11 21 31 41
#> 3 c k 12 22 32 42
#> 4 d l 13 23 33 43
dd <- df %>%
rowwise(student_id) %>%
mutate(hash = digest(c_across(everything()))) %>%
ungroup
#> Error: Problem with `mutate()` input `hash`.
#> ✖ Can't combine `student_id2` <character> and `test1` <integer>.
#> ℹ Input `hash` is `digest(c_across(everything()))`.
#> ℹ The error occured in row 1.
### but digest should not care too much about the type of the input
Created on 2020-06-04 by the reprex package (v0.3.0)
It seems that the different column types have an issue. One option is to first change the column types to a single one and then do the rowwise
library(dplyr)
library(digest)
# Coerce every column to character first so that c_across() can combine the
# values of a row into one vector, then hash that vector row by row.
df %>%
  mutate(across(everything(), as.character)) %>%
  rowwise() %>%
  mutate(hash = digest(c_across(everything())))
# A tibble: 4 x 7
# Rowwise:
# student_id student_id2 test1 test2 test3 test4 hash
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a i 10 20 30 40 2638067de6dcfb3d58b83a83e0cd3089
#2 b j 11 21 31 41 21162fc0c528a6550b53c87ca0c2805e
#3 c k 12 22 32 42 8d7539eacff61efbd567b6100227523b
#4 d l 13 23 33 43 9739997605aa39620ce50e96f1ff4f70
Or another option is to unite the columns to a single one and then do the digest on that column
library(tidyr)
df %>%
unite(new, everything(), remove = FALSE) %>%
rowwise %>%
mutate(hash = digest(new)) %>%
select(-new)
# A tibble: 4 x 7
# Rowwise:
# student_id student_id2 test1 test2 test3 test4 hash
# <chr> <chr> <int> <int> <int> <int> <chr>
#1 a i 10 20 30 40 a9e4cafdfbc88f17b7593dfd684eb2a1
#2 b j 11 21 31 41 a67a5df8186972285bd7be59e6fdab38
#3 c k 12 22 32 42 9c20bd87a50642631278b3e6d28ecf68
#4 d l 13 23 33 43 3f4f373d1969dcf0c8f542023a258225
Or another option is pmap, where we concatenate the elements of each row into a single vector; this converts the integers to character, because a vector can hold only a single class.
library(purrr)
df %>%
mutate(hash = pmap_chr(., ~ digest(c(...))))
# A tibble: 4 x 7
# student_id student_id2 test1 test2 test3 test4 hash
# <chr> <chr> <int> <int> <int> <int> <chr>
#1 a i 10 20 30 40 f0fb4100907570ef9bda073b78dc44a6
#2 b j 11 21 31 41 754b09e8d4d854aa5e40aa88d1edfc66
#3 c k 12 22 32 42 5f3a699caff833e900fd956232cf61dd
#4 d l 13 23 33 43 4d31c65284e5db36c37461126a9eb63c
The advantage here is that we are not changing the column types

Resources