Transpose and sum distinct values in R

Transpose and sum distinct values in R - r

Is there a way to transpose and sum distinct values in R? For example:
df
Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB
Some rows have the same values for Order and Quantity, but they still need to be summed.
Expected Output (Transpose with respect to Quantity)
Cola Order Quantity LocA_Quantity LocB_Quantity
ABC 2 8 4 4
CSD 4 6 6
CDS 3 2 2

Create the dataset:
library(tibble)
# Example data from the question, built column by column; each row is one
# (Cola, Loc) observation.
df <- tibble(
  Cola = c("ABC", "ABC", "CSD", "CDS"),
  Order = c(1, 1, 4, 3),
  Quantity = c(4, 4, 6, 2),
  Loc = c("LocA", "LocB", "LocA", "LocB")
)
Create the summaries:
library(dplyr)
# One row per Cola: total Order, the Quantity attributable to each location,
# and the overall Quantity.
df %>%
  group_by(Cola) %>%
  summarise(
    Order = sum(Order),
    # A 0/1 indicator zeroes out the quantities of the other location.
    LocA_Quantity = sum(Quantity * as.numeric(Loc == "LocA")),
    LocB_Quantity = sum(Quantity * as.numeric(Loc == "LocB")),
    # Must come last: once assigned, `Quantity` refers to the summed value.
    Quantity = sum(Quantity)
  )

You can do it for both Quantity and Order and drop the columns you don't want at the end, i.e.
library(tidyverse)
# Add per-Cola totals (Order_new, Quantity_new) next to the original values,
# then spread the original Order and Quantity into one column per location.
# Columns are selected by name rather than by position (2:3) so the code keeps
# working if the column order changes; across() replaces the superseded
# mutate_at().
df %>%
  group_by(Cola) %>%
  mutate(across(c(Order, Quantity), sum, .names = "{.col}_new")) %>%
  pivot_wider(names_from = Loc, values_from = c(Order, Quantity))
## A tibble: 3 x 7
## Groups: Cola [3]
# Cola Order_new Quantity_new Order_LocA Order_LocB Quantity_LocA Quantity_LocB
# <fct> <int> <int> <int> <int> <int> <int>
#1 ABC 2 8 1 1 4 4
#2 CSD 4 6 4 NA 6 NA
#3 CDS 3 2 NA 3 NA 2

1) dplyr/tidyr Using the data shown reproducibly in the Note at the end, sum the orders and quantity and create a Quantity_ column equal to Quantity by Cola. Then reshape the Quantity_ column to wide form.
library(dplyr)
library(tidyr)
df %>%
  group_by(Cola) %>%
  # Keep each row's own quantity in a helper column before Order and Quantity
  # are overwritten with their per-Cola totals.
  mutate(
    Quantity_ = Quantity,
    Order = sum(Order),
    Quantity = sum(Quantity)
  ) %>%
  ungroup() %>%
  # Spread the helper column into one Quantity_<Loc> column per location;
  # Cola/Loc combinations that do not occur are filled with 0.
  pivot_wider(
    names_from = "Loc",
    values_from = "Quantity_",
    names_prefix = "Quantity_",
    values_fill = list(Quantity_ = 0)
  )
giving:
# A tibble: 3 x 5
Cola Order Quantity Quantity_LocA Quantity_LocB
<chr> <int> <int> <int> <int>
1 ABC 2 8 4 4
2 CSD 4 6 6 0
3 CDS 3 2 0 2
2) Base R We can do much the same in base R using transform/ave and reshape like this:
# Keep each row's own quantity in Quantity_, then replace Quantity and Order
# with their per-Cola totals. transform() evaluates every expression against
# the original df, so Quantity_ still holds the un-summed values.
df2 <- transform(df,
  Quantity_ = Quantity,
  Quantity = ave(Quantity, Cola, FUN = sum),
  Order = ave(Order, Cola, FUN = sum)
)
# Spread Quantity_ into one column per Loc. With sep = "" the new column names
# are "Quantity_" followed by the Loc value; Cola/Loc pairs that do not occur
# are NA. The argument is spelled out as `direction` instead of relying on
# partial matching of `dir`.
wide <- reshape(df2,
  direction = "wide", idvar = c("Cola", "Quantity", "Order"),
  timevar = "Loc", sep = ""
)
wide
## Cola Order Quantity Quantity_LocA Quantity_LocB
## 1 ABC 2 8 4 4
## 3 CSD 4 6 6 NA
## 4 CDS 3 2 NA 2
Note
# Whitespace-delimited copy of the example data, header row first.
Lines <- paste(
  "Cola Order Quantity Loc",
  "ABC 1 4 LocA",
  "ABC 1 4 LocB",
  "CSD 4 6 LocA",
  "CDS 3 2 LocB",
  sep = "\n"
)
# as.is = TRUE keeps Cola and Loc as character vectors rather than factors.
df <- read.table(header = TRUE, as.is = TRUE, text = Lines)

Related

Cumulative Sum of String

I have a table which looks like:
Order
Col A
Col B
1
a
2,3,4,5
2
a
3,5,6,7,8
3
a
1,2,4,9
4
a
3,5,7,11,12
I want to aggregate this table by Col A. The output should look like the following:
Order
Col A
Col B
Col C
1
a
2,3,4,5
2,3,4,5
2
a
3,5,6,7,8
2,3,4,5,6,7,8
3
a
1,2,4,9
1,2,3,4,5,6,7,8,9
4
a
3,5,7,11,12
1,2,3,4,5,6,7,8,9,11,12
Please guide me on how to get the desired output in R.

This ought to do it:
library(dplyr)

# Running union of comma-separated number lists.
# Element i of the result is the sorted union of x[1], ..., x[i], collapsed
# back into a comma-separated string. vapply() guarantees a character vector
# of the same length as x, including for empty input, where sapply() would
# return a list.
cumulative_union <- function(x) {
  values <- lapply(strsplit(x, split = ","), as.numeric)
  running <- Reduce(f = union, x = values, accumulate = TRUE)
  vapply(
    running,
    function(v) paste(sort(v), collapse = ","),
    character(1)
  )
}

df %>%
  group_by(ColA) %>%
  mutate(result = cumulative_union(Colb)) %>%
  ungroup()
# # A tibble: 4 × 4
# Order ColA Colb result
# <int> <chr> <chr> <chr>
# 1 1 a 2,3,4,5 2,3,4,5
# 2 2 a 3,5,6,7,8 2,3,4,5,6,7,8
# 3 3 a 1,2,4,9 1,2,3,4,5,6,7,8,9
# 4 4 a 3,5,7,11,12 1,2,3,4,5,6,7,8,9,11,12
Using this data:
# Example input: Colb holds comma-separated lists stored as strings.
# stringsAsFactors = FALSE keeps Colb a character vector on R < 4.0 as well,
# which strsplit() requires; TRUE is spelled out because T can be reassigned.
df <- read.table(text = "Order ColA Colb
1 a '2,3,4,5'
2 a '3,5,6,7,8'
3 a '1,2,4,9'
4 a '3,5,7,11,12' ", header = TRUE, stringsAsFactors = FALSE)

# Running union of the values in ColB within each ColA group. The values are
# sorted numerically before being pasted back together so that ColC follows
# the ordering requested in the question (1,2,3,...) rather than the order of
# first appearance.
df %>%
  group_by(ColA) %>%
  mutate(ColC = map_chr(
    accumulate(strsplit(ColB, ","), union),
    function(x) paste(sort(as.numeric(x)), collapse = ",")
  ))
# A tibble: 4 × 4
# Groups: ColA [1]
#   Order ColA  ColB        ColC
#   <int> <chr> <chr>       <chr>
# 1     1 a     2,3,4,5     2,3,4,5
# 2     2 a     3,5,6,7,8   2,3,4,5,6,7,8
# 3     3 a     1,2,4,9     1,2,3,4,5,6,7,8,9
# 4     4 a     3,5,7,11,12 1,2,3,4,5,6,7,8,9,11,12

Merge where some ids are concatenated in single column

I have a dataframe with a column of ids, but for some rows there are multiple ids concatenated together. I want to merge this onto another dataframe using the id, and when the ids are concatenated it handles that and reflects it by having the values in the new columns added also concatenated.
For example I have dataframes
data <- data.frame(
id = c(1, 4, 3, "2,3", "1,4"),
value = c(1:5)
)
> data
id value
1 1 1
2 4 2
3 3 3
4 2,3 4
5 1,4 5
mapping <- data.frame(
id = 1:4,
name = c("one", "two", "three", "four")
)
> mapping
id name
1 1 one
2 2 two
3 3 three
4 4 four
I would like to end up with
id value name
1 1 1 one
2 4 2 four
3 3 3 three
4 2,3 4 two,three
5 1,4 5 one,four

I don't think there's a good way to do this other than to separate, join, and re-concatenate:
library(dplyr)
library(tidyr)
data %>%
mutate(true_id = row_number()) %>%
separate_rows(id, convert = TRUE) %>%
left_join(mapping, by = "id") %>%
group_by(true_id, value) %>%
summarize(id = toString(id), name = toString(name), .groups = "drop")
# # A tibble: 5 × 4
# true_id value id name
# <int> <int> <chr> <chr>
# 1 1 1 1 one
# 2 2 2 4 four
# 3 3 3 3 three
# 4 4 4 2, 3 two, three
# 5 5 5 1, 4 one, four
I wasn't sure if your value column would actually be unique, so I added a true_id just in case.

What about something like this? I could think of a few ways. One is longer but much easier to follow; the other is short but kind of a mess.
library(tidyverse)
#long and readable
data |>
mutate(tmp = row_number()) |>
mutate(id = str_split(id, ",")) |>
unnest_longer(id) |>
left_join(mapping |>
mutate(id = as.character(id)), by = "id") |>
group_by(tmp) |>
summarise(id = paste(id, collapse = ","),
value = value[1],
name = paste(name, collapse = ","))
#> # A tibble: 5 x 4
#> tmp id value name
#> <int> <chr> <int> <chr>
#> 1 1 1 1 one
#> 2 2 4 2 four
#> 3 3 3 3 three
#> 4 4 2,3 4 two,three
#> 5 5 1,4 5 one,four
#short and ugly
# For each id string, split it into its individual ids and look their names up
# in `mapping`. match() returns positions in the order the ids are written, so
# "3,2" yields "three,two"; the previous %in% test always followed the row
# order of `mapping` instead. Ids missing from `mapping` are dropped.
data |>
  mutate(name = map_chr(id, \(x) {
    hits <- match(str_split(x, ",")[[1]], as.character(mapping$id))
    paste(mapping$name[hits[!is.na(hits)]], collapse = ",")
  }))
#> id value name
#> 1 1 1 one
#> 2 4 2 four
#> 3 3 3 three
#> 4 2,3 4 two,three
#> 5 1,4 5 one,four

greping the data$ids out of the mapping$ids.
# Turn each id string into an anchored alternation, e.g. "1,4" -> "^(1|4)$",
# so that only whole ids match. A bare character class such as "[14]" would
# also match multi-digit ids like 11 or 41.
mapply(
  \(x, y) {
    pattern <- sprintf("^(%s)$", gsub("\\D+", "|", x))
    toString(mapping$name[grep(pattern, y)])
  },
  data$id, list(mapping$id)
)
# 1 4 3 2,3 1,4
# "one" "four" "three" "two, three" "one, four"
In order not to have a space after the comma, use paste(., collapse=',') instead of toString.

reshape grouped data in R

I have the following data:
id <- c(1,1,1,1,2,2,2,2,2,2)
date <-as.Date(c("2007-06-22", "2007-06-22", "2007-07-13","2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22","2007-06-22"))
value <-c(0,3,2,4,0,1,4,2,6,8)
mydata_1 <- data.frame(id, date, value)
mydata_1
id date value
1 2007-06-22 0
1 2007-06-22 3
1 2007-07-13 2
1 2007-07-13 4
2 2019-10-05 0
2 2019-10-05 1
2 2019-11-07 4
2 2019-11-07 2
2 2007-06-22 6
2 2007-06-22 8
I would like the data to look like this:
id <- c(1,1,2,2,2)
date <-as.Date(c("2007-06-22", "2007-07-13", "2019-10-05", "2019-11-07","2007-06-22"))
value.1 = c(0,2,0,4,6)
value.2 = c(3,4,1,2,8)
mydata_2 <- data.frame(id, date, value.1, value.2)
mydata_2
id date value.1 value.2
1 2007-06-22 0 3
1 2007-07-13 2 4
2 2019-10-05 0 1
2 2019-11-07 4 2
2 2007-06-22 6 8
I have tried below from (Reshaping data matrix in R) but since some of the dates are the same in the two different id's it is not working as intended
dateno <- with(mydata_1, ave(id, date, FUN = seq_along))
test2 <- transform(mydata_1, dateno = dateno)
reshape(test2, dir = "wide", idvar = c("id","date"), timevar = "dateno")

I think I have come up with an answer following this guide How to transpose a data frame by group using reshape2 library?
library(dplyr)
library(tidyr)
# Number the observations within each (id, date) pair so that each one gets
# its own column label ("V1", "V2", ...) after reshaping.
mydata_1 <- mydata_1 %>%
  group_by(id, date) %>%
  mutate(id_2 = paste0("V", row_number()))
# Spread the labelled observations into one column per label. The data
# argument must be mydata_1; the earlier version passed the undefined object
# `my`, which fails with "object 'my' not found".
mydata_2 <- spread(data = mydata_1, key = id_2, value = value)
mydata_2
id date V1 V2
<dbl> <date> <dbl> <dbl>
1 1 2007-06-22 0 3
2 1 2007-07-13 2 4
3 2 2007-06-22 6 8
4 2 2019-10-05 0 1
5 2 2019-11-07 4 2

Maybe something like this:
library(tidyverse)
id <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2)
date <- as.Date(c(
"2007-06-22", "2007-06-22", "2007-07-13", "2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22", "2007-06-22"
))
value <- c(0, 3, 2, 4, 0, 1, 4, 2, 6, 8)
mydata_1 <- data.frame(id, date, value)
mydata_1
# Number the observations within each (id, date) pair, then spread them into
# value.1, value.2, ... columns.
# The former complete(id, date, ...) step is removed: id and date are the
# grouping columns, so it could not add rows within a group, and current tidyr
# refuses to complete on grouping columns. The intended zero fill is done by
# pivot_wider() instead, for pairs with fewer observations than others.
mydata_1 %>%
  group_by(id, date) %>%
  mutate(visit = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = visit,
    values_from = value,
    names_prefix = "value.",
    values_fill = 0
  )
Created on 2021-11-25 by the reprex package (v2.0.1)

Another possible solution:
library(tidyverse)
id <- c(1,1,1,1,2,2,2,2,2,2)
date <-as.Date(c("2007-06-22", "2007-06-22", "2007-07-13","2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22","2007-06-22"))
value <-c(0,3,2,4,0,1,4,2,6,8)
mydata_1 <- data.frame(id, date, value)
# Collapse the values of each (id, date) pair into one comma-separated string,
# then split that string back into two columns. convert = TRUE turns the
# split pieces back into numbers (TRUE is spelled out because T is an
# ordinary variable that can be reassigned).
# NOTE(review): into = c("value1", "value2") assumes exactly two values per
# (id, date) pair, as in the example data.
mydata_1 %>%
  group_by(id, date) %>%
  summarise(value = str_c(value, collapse = ","), .groups = "drop") %>%
  separate(value, into = c("value1", "value2"), sep = ",", convert = TRUE)
#> # A tibble: 5 × 4
#> id date value1 value2
#> <dbl> <date> <int> <int>
#> 1 1 2007-06-22 0 3
#> 2 1 2007-07-13 2 4
#> 3 2 2007-06-22 6 8
#> 4 2 2019-10-05 0 1
#> 5 2 2019-11-07 4 2

R dplyr::c_across() strange behaviour in rowSums

I'm trying to see how to apply rowSums() to specific columns only.
here is a reprex:
df <- tibble(
"ride" = c("bicycle", "motorcycle", "car", "other"),
"A" = c(1, NA, 1, NA),
"B" = c(NA, 2, NA, 2)
)
I can get the desired result, by index[2:3]
df %>%
mutate(total = rowSums(.[2:3], na.rm = TRUE))
# A tibble: 4 × 4
ride A B total
<chr> <dbl> <dbl> <dbl>
1 bicycle 1 NA 1
2 motorcycle NA 2 2
3 car 1 NA 1
4 other NA 2 2
however, if I try specifying columns by name, strange results occur
df %>%
mutate(total = sum(c_across(c("A":"B")), na.rm = TRUE))
# A tibble: 4 × 4
ride A B total
<chr> <dbl> <dbl> <dbl>
1 bicycle 1 NA 6
2 motorcycle NA 2 6
3 car 1 NA 6
4 other NA 2 6
What am I doing wrong?
I can achieve what I want, by something like this:
df %>%
mutate_all(~replace(., is.na(.), 0)) %>%
mutate(total = A + B)
but I'd like to specify column names by passing a vector, so I can change to different combination of column names in future.
Something like this is what I'd like to achieve:
cols_to_sum <- c("A","B")
df %>%
mutate(total = sum(across(cols_to_sum), na.rm = TRUE))

You may use select to specify the columns you want to sum.
library(dplyr)
cols_to_sum <- c("A","B")
df %>%
mutate(total = rowSums(select(., all_of(cols_to_sum)), na.rm = TRUE))
# ride A B total
# <chr> <dbl> <dbl> <dbl>
#1 bicycle 1 NA 1
#2 motorcycle NA 2 2
#3 car 1 NA 1
#4 other NA 2 2
c_across works with rowwise -
df %>%
rowwise() %>%
mutate(total = sum(c_across(all_of(cols_to_sum)), na.rm = TRUE)) %>%
ungroup

How to combine multiple summary tables at once

Consider the following data frame:
set.seed(123)
dat <- data.frame(Region = rep(c("a","b"), each=100),
State =rep(c("NY","MA","FL","GA"), each = 50),
Loc = rep(letters[1:20], each = 5),
ID = 1:200,
count1 = sample(4, 200, replace=T),
count2 = sample(4, 200, replace=T))
Region, State, and Loc are grouping variables for individual measurements, each of which has a unique ID number. For each grouping variable, I want to summarize the number of observations in each level of count1 and count2. Normally I would do one of the following for each pair:
#example for count1 and region:
library(tidyverse)
dat%>%
dplyr::select(Region,count1)%>%
group_by(count1,Region)%>%
count()
##or
with(dat, table(Region, count1))
How can I do this for all combinations and wrap them into a single table (or at least a few tables that are grouped by equivalent lengths, since they will differ depending on which grouping variable is being used)?

Try something like this:
Region1 <- dat %>% group_by(Region, count1) %>%
summarise(TotalRegion1 = n())
State1 <- dat %>% group_by(State, count1) %>%
summarise(TotalState1 = n())
Loc1 <- dat %>% group_by(Loc, count1) %>%
summarise(TotalLoc1 = n())

You can try to get "all at once" (for count1) with
out <- dat %>%
select(-ID, -count2) %>%
pivot_longer(Region:Loc, names_to = "k", values_to = "v") %>%
group_by(k, v, count1) %>%
tally() %>%
ungroup()
out %>%
filter(k == "Region")
# # A tibble: 8 x 4
# k v count1 n
# <chr> <fct> <int> <int>
# 1 Region a 1 26
# 2 Region a 2 27
# 3 Region a 3 20
# 4 Region a 4 27
# 5 Region b 1 20
# 6 Region b 2 30
# 7 Region b 3 30
# 8 Region b 4 20
out
# # A tibble: 101 x 4
# k v count1 n
# <chr> <fct> <int> <int>
# 1 Loc a 2 5
# 2 Loc a 3 1
# 3 Loc a 4 4
# 4 Loc b 1 2
# 5 Loc b 2 2
# 6 Loc b 3 3
# 7 Loc b 4 3
# 8 Loc c 1 2
# 9 Loc c 2 2
# 10 Loc c 3 3
# # ... with 91 more rows

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Transpose and sum distinct values in R - r

Related

Cumulative Sum of String

Merge where some ids are concatenated in single column

reshape grouped data in R

R dplyr::c_across() strange behaviour in rowSums

How to combine multiple summary tables at once

Categories

Resources