R group by column, count the combinations observed - r

I imagine this is already solved in many places, but I lack the right wordage to use to search for a solution. In R I have example data in long format like this:
A = tibble( c(1,2,3,1,2,4,5,5), c('a','b','c','a','f','-','b', 'f'))
and what I want returned is sort of a grouped result (something like a spread?) where I first collect the set of letters that match each number to get something like this.
1: 'a', 'a'
2: 'b', 'f'
3: 'c'
4: '-'
5: 'b', 'f'
and the actual final result I am looking for is the count of how many times each letter combination is observed:
'a','a': 1
'b','f': 2
'c': 1
'-': 1
I can do the last step with group_by() but I mention it here in case there is some magic sauce that does the whole thing.

We can group by 'a', paste the values of 'b' together within each group, and then count how many groups share each pasted combination
library(dplyr)
library(stringr)
# Collapse the letters observed for each number into one comma-separated
# string, then count how many numbers produced each string.
# Note: n_distinct(b) would give the number of distinct letters *inside* a
# group, which is not the number of times a combination occurs, so the
# combinations are tallied with count() instead.
A %>%
  group_by(a) %>%
  summarise(out = str_c(b, collapse = ",")) %>%
  count(out)
# A tibble: 4 x 2
#   out       n
#   <chr> <int>
# 1 -         1
# 2 a,a       1
# 3 b,f       2
# 4 c         1
data
A <- structure(list(a = c(1, 2, 3, 1, 2, 4, 5, 5), b = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))

This is close to what you are looking for:
library(tidyverse)
#Data
A <- structure(list(v1 = c(1, 2, 3, 1, 2, 4, 5, 5), v2 = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
#Code
A %>% group_by(v1) %>% summarise(chain=paste0(v2,collapse = ',')) %>% ungroup() %>%
group_by(chain) %>% summarise(N=n())
# A tibble: 4 x 2
chain N
<chr> <int>
1 - 1
2 a,a 1
3 b,f 2
4 c 1

Here is a base R option using nested aggregate
aggregate(.~y,aggregate(y~.,A,toString),length)
which gives
> aggregate(.~y,aggregate(y~.,A,toString),length)
y x
1 - 1
2 a, a 1
3 b, f 2
4 c 1
Data
A = tibble(x = c(1,2,3,1,2,4,5,5), y = c('a','b','c','a','f','-','b', 'f'))

Maybe you want to cast the data in wide format and then count the combinations. Try :
library(dplyr)
library(tidyr)
A %>%
group_by(v1) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = row, values_from = v2, names_prefix = 'col_') %>%
ungroup %>%
count(col_1, col_2)
# col_1 col_2 n
# <chr> <chr> <int>
#1 - NA 1
#2 a a 1
#3 b f 2
#4 c NA 1

Related

Cumulative sum equivalent for string

I have a table which looks like:
Col A
Col b
a
2,3,4,5
a
3,5,6,7,8
b
1,2,4
b
3,5,7
I want to aggregate this table by Col A. The output should look like the following:
Col A
Col b
a
2,3,4,5,6,7,8
b
1,2,3,4,5,7
Could you please guide me on how I can get the desired output in R?
Assuming the Col.b is a string column as in
quux <- structure(list(Col.A = c("a", "a", "b", "b"), Col.b = c("2,3,4,5", "3,5,6,7,8", "1,2,4", "3,5,7")), class = "data.frame", row.names = c(NA, -4L))
then we can do
library(dplyr)
quux %>%
group_by(Col.A) %>%
summarize(Col.b = paste(unique(unlist(strsplit(Col.b, ","))), collapse = ","))
# # A tibble: 2 × 2
# Col.A Col.b
# <chr> <chr>
# 1 a 2,3,4,5,6,7,8
# 2 b 1,2,4,3,5,7
If it is instead a list-column, as in
quux <- structure(list(Col.A = c("a", "a", "b", "b"), Col.b = list(c("2", "3", "4", "5"), c("3", "5", "6", "7", "8"), c("1", "2", "4"), c("3", "5", "7"))), row.names = c(NA, -4L), class = "data.frame")
quux
# Col.A Col.b
# 1 a 2, 3, 4, 5
# 2 a 3, 5, 6, 7, 8
# 3 b 1, 2, 4
# 4 b 3, 5, 7
then we can do
library(dplyr)
quux %>%
group_by(Col.A) %>%
summarize(Col.b = list(unique(unlist(Col.b)))) %>%
as.data.frame()
# Col.A Col.b
# 1 a 2, 3, 4, 5, 6, 7, 8
# 2 b 1, 2, 4, 3, 5, 7
The trailing %>% as.data.frame() is not at all required, it is provided solely to demonstrate what Col.b now contains. Without it, it looks like this, which is value-wise equivalent:
quux %>%
group_by(Col.A) %>%
summarize(Col.b = list(unique(unlist(Col.b))))
# # A tibble: 2 × 2
# Col.A Col.b
# <chr> <list>
# 1 a <chr [7]>
# 2 b <chr [6]>
One way is to use separate_rows to split and aggregate back toString on the unique sorted values, i.e.
library(dplyr)
library(tidyr)
df %>%
separate_rows(Colb, sep = ',') %>%
group_by(ColA) %>%
summarise(Colb = toString(sort(unique(Colb))))
# A tibble: 2 × 2
ColA Colb
<chr> <chr>
1 a 2, 3, 4, 5, 6, 7, 8
2 b 1, 2, 3, 4, 5, 7
DATA
dput(df)
structure(list(ColA = c("a", "a", "b", "b"), Colb = c("2,3,4,5",
"3,5,6,7,8", "1,2,4", "3,5,7")), class = "data.frame", row.names = c(NA,
-4L))
Using base, split on ColA, then split strings by comma, then get unique values, finally paste it back and convert it to dataframe:
stack(lapply(split(df$Colb, df$ColA), function(i){
paste(unique(unlist(strsplit(i, ","))), collapse = ",")
}))
# values ind
# 1 2,3,4,5,6,7,8 a
# 2 1,2,4,3,5,7 b
> dat <- data.table::data.table(a = c('a', 'a', 'b', 'b'), b = c('2,3,4,5', '3,5,6,7,8', '1,2,4', '3,5,7'))
> dat <- dat[, .(bc = paste0(b, collapse = ',')), by = .(a)]
> dat$bs <- mapply(paste0, mapply(unique, (mapply(strsplit, dat$bc, split = ','))), collapse = ',')
> dat
a bc bs
1: a 2,3,4,5,3,5,6,7,8 2,3,4,5,6,7,8
2: b 1,2,4,3,5,7 1,2,4,3,5,7
> dat$bs
[1] "2,3,4,5,6,7,8" "1,2,4,3,5,7"
Alternatively please check the separate_rows and paste0 functions
data <- tribble(
~a, ~b,
'a', '2,3,4,5',
'a', '3,5,6,7,8',
'b', '1,2,4',
'b', '3,5,7'
) %>%
group_by(a) %>% separate_rows(b, sep = ',') %>% distinct(a,b) %>%
mutate(b=paste0(b,collapse = ',')) %>% slice_head(n=1)
Created on 2023-02-01 with reprex v2.0.2
# A tibble: 2 × 2
# Groups: a [2]
a b
<chr> <chr>
1 a 2,3,4,5,6,7,8
2 b 1,2,4,3,5,7

How to duplicate rows with a certain condition and create a new variable at the same time

I have a df like the one below and I would like to transform it into something like the table on the right. How can I duplicate the rows with Type=="N" and add a new variable Grade?
Basically, if Type==N, then Grade can be S or W, that is why we need to duplicate the rows.
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
Using some functions from tidyverse, you can use crossing to duplicate rows and add the "Grade" column at the same time, then filter to match your stated rules.
library(tidyverse)
result <- df %>%
crossing(data.frame(Grade = c('S', 'W'))) %>%
filter(Type == 'N' | Type == Grade)
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
I think this approach is extensible to many more conditions assuming yours is the minimal example and you have a larger more complicated dataset.
library(dplyr)
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
df2 <- data.frame(Type2 = c("N", "N"), Grade = c("S", "W"))
df %>%
select(Type, Result) %>%
left_join(df2, by = c("Type" = "Type2")) %>%
mutate(Grade = case_when(Type == "S" ~ "S", Type == "W" ~ "W", TRUE ~ Grade))
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Another option is to use if_else() (or case_when() if there are more complex conditions) to return a list column of multiple values and unnest:
library(dplyr)
library(tidyr)
df %>%
mutate(Grade = if_else(Type == "N", list(c("S", "W")), as.list(Type))) %>%
unnest(Grade)
# A tibble: 6 x 3
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Or:
df %>%
mutate(Grade = case_when(Type == "N" ~ list(c("S", "W")),
TRUE ~ as.list(Type))) %>%
unnest(Grade)
A dplyr way:
We could use bind_rows after using slice.
library(dplyr)
df %>%
slice(1:2) %>%
bind_rows(df) %>%
group_by(Type) %>%
arrange(Result, .by_group = TRUE) %>%
ungroup() %>%
mutate(Grade = rep(c("S","W"),length.out = n()), .before=2)
Type Grade Result
<chr> <chr> <dbl>
1 N S 8
2 N W 8
3 N S 9
4 N W 9
5 S S 7
6 W W 6
Here is a possible data.table option:
library(data.table)
dt <- as.data.table(df)
# Within each Result, cross the Type with both candidate grades. CJ() names
# its unnamed inputs V1 (the Type) and V2 (the candidate Grade). Keep every
# grade for Type "N" and only the matching grade for the other types.
output <- dt[, CJ(.SD$Type, c("S", "W")), by = .(Result)][V1 == "N" | V1 == V2]
# Rename by old name instead of by position: the columns arrive in the order
# Result, V1, V2, so a positional rename with c(names(dt), "Grade") would
# swap the Type and Result labels.
setnames(output, c("V1", "V2"), c("Type", "Grade"))
setcolorder(output, c("Type", "Grade", "Result"))
# Output
#    Type Grade Result
# 1:    N     S      8
# 2:    N     W      8
# 3:    N     S      9
# 4:    N     W      9
# 5:    S     S      7
# 6:    W     W      6

How to iterate over a variable in rep()

this question has been asked a couple of times but I have yet to find a satisfactory answer that works.
I have a dataframe:
grouping1 <- rep(c('a','b'),times=47350)
grouping2 <- rep(c('A','B', 'C', 'D', 'E'), times=18940)
observations <- rep(c(14, 16, 12, 11, 15, 15,15,18,20,34,12), times=9470)
my_data <- as.data.frame(cbind(grouping1,grouping2,observations))
I would like to group over my grouping variables to pass a different value to 'times' in rep() for each group:
new_data <- my_data %>%
group_by(grouping1,grouping2,grouping3) %>%
mutate(sim_count = rep(1:100, times=observations, each=1))
But the 'times' argument is invalid, no matter whether I pipe in a list of values from 'observations', iterate over 'observations' from the dataframe, iterate through observations in a for loop, etc. I think there must be an easy fix but I'm not seeing it. Thank you in advance.
EDIT: Thanks to everyone for their patience; they helped me better envision the data structure and how I could better explain the problem. Here's the solution I came up with:
# Keep one row per (grouping1, grouping2) pair, build the repeated 1:100
# sequence for that row's `observations` value as a list-column, then expand
# the list-column into one row per simulated count.
new_data <- my_data %>%
  distinct(grouping1, grouping2, .keep_all = TRUE) %>%
  rowwise() %>%
  mutate(sim_count = list(rep(1:100, times = observations, each = 1))) %>%
  # Drop the row-wise grouping so later verbs work on the whole table.
  ungroup() %>%
  unnest_longer(sim_count) %>%
  arrange(sim_count)
We can make a list-column and then tidyr::unnest it:
my_data %>%
group_by(grouping1, grouping2, grouping3) %>%
mutate(sim_count = lapply(observations, function(obs) rep(1:100, times = obs, each = 1))) %>%
ungroup() %>%
tidyr::unnest(sim_count)
# # A tibble: 8,300 x 5
# grouping1 grouping2 grouping3 observations sim_count
# <chr> <chr> <chr> <dbl> <int>
# 1 a A 1 14 1
# 2 a A 1 14 2
# 3 a A 1 14 3
# 4 a A 1 14 4
# 5 a A 1 14 5
# 6 a A 1 14 6
# 7 a A 1 14 7
# 8 a A 1 14 8
# 9 a A 1 14 9
# 10 a A 1 14 10
# # ... with 8,290 more rows
Data
my_data <- structure(list(grouping1 = c("a", "a", "a", "b", "b", "b"), grouping2 = c("A", "A", "B", "B", "C", "C"), grouping3 = c("1", "2", "3", "4", "5", "6"), observations = c(14, 16, 12, 11, 15, 15)), class = "data.frame", row.names = c(NA, -6L))
Maybe we can try the following data.table option
setDT(my_data)[
,
.(observations,
sim_count = rep(1:100, times = observations, each = 1)
), grouping1:grouping3
]

Arranging rows and columns in R

I have an output data frame as below. But I would like to rearrange to achieve the result in df2. Is there a way for me to arrange or group it?
df>
a_test1 b_test1 c_test1 a_test2 b_test2 c_test2
Test Test1 Test1 Test1 Test2 Test2 Test2
Result 10 9 4 4 3 1
df2>
a b c
Test1 10 9 4
Test2 4 3 1
dat <- data.frame(a_test1 = 10,
b_test1 = 9,
c_test1 = 4,
a_test2 = 4,
b_test2 = 3,
c_test2 = 1)
You can achieve this with this code:
library(tidyverse)
dat %>%
pivot_longer(cols = everything(),
names_sep = "_",
names_to = c("prefix", "suffix")) %>%
pivot_wider(names_from = prefix)
which gives:
# A tibble: 2 x 4
suffix a b c
<chr> <dbl> <dbl> <dbl>
1 test1 10 9 4
2 test2 4 3 1
UPDATE:
The OP asked whether this would still work with different column names that contain several underscores as separators:
dat2 <- data.frame(a_test1_10 = 10,
b_test1_10 = 9,
c_test1_10 = 4,
a_test2_10 = 4,
b_test2_10 = 3,
c_test2_10 = 1)
pivot_spec <- data.frame(.name = colnames(dat2),
.value = c("a", "b", "c", "a", "b", "c"),
test_group = c("test1", "test1", "test1", "test2", "test2", "test2"))
This pivot_spec looks like:
.name .value test_group
1 a_test1_10 a test1
2 b_test1_10 b test1
3 c_test1_10 c test1
4 a_test2_10 a test2
5 b_test2_10 b test2
6 c_test2_10 c test2
and then you can just continue pivoting. Actually, the whole pivoting now looks much cleaner and you don't need to combine a pivot_longer with a pivot_wider.
dat2 %>%
pivot_longer_spec(pivot_spec)
which gives:
# A tibble: 2 x 4
test_group a b c
<chr> <dbl> <dbl> <dbl>
1 test1 10 9 4
2 test2 4 3 1
As you can see, creating this pivot_spec template makes the whole thing extremely flexible. The .name column contains all your required data columns, while the .value column contains the new column names and maps the old column names to the new ones. The test_group column (you can choose whatever name you like) determines the rows that will be created and which original column should appear in which row.
You can reshape the data, filter the rows, and turn a column into row names.
tidyr::pivot_longer(df,
cols = everything(),
names_to = c('.value', 'col'),
names_sep = '_') %>%
dplyr::filter(!grepl('Test', a)) %>%
type.convert(as.is = TRUE) %>%
tibble::column_to_rownames('col')
# a b c
#test1 10 9 4
#test2 4 3 1
data
df <- structure(list(a_test1 = c("Test1", "10"), b_test1 = c("Test1",
"9"), c_test1 = c("Test1", "4"), a_test2 = c("Test2", "4"), b_test2 = c("Test2",
"3"), c_test2 = c("Test2", "1")), class = "data.frame", row.names = c("Test",
"Result"))

How do I select column based on value in another column with dplyr?

My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
# Reshape the base-count columns to long form, keep only the rows whose base
# matches the row's `ref` or `var`, label each kept row by which of the two
# it matched, widen back to one row per id, and join onto the original data.
df1 %>%
  pivot_longer(cols = A:G) %>%
  filter(name == ref | name == var) %>%
  # Label by match rather than by row position: the long rows follow the
  # A, T, C, G column order, which is not necessarily "ref first, var second".
  mutate(nm1 = if_else(name == ref, "ref_count", "var_count")) %>%
  select(id, value, nm1) %>%
  pivot_wider(names_from = nm1, values_from = value) %>%
  # Explicit key avoids relying on (and being messaged about) the natural join.
  left_join(df1, ., by = "id")
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
# For every row, pick the count column named by `ref` / `var` using matrix
# indexing with a two-column (row index, column index) matrix.
counts <- as.matrix(df1[2:5])  # the count columns (A, T, C, G in this data)
row_idx <- seq_len(nrow(df1))
df1$ref_count <- counts[cbind(row_idx, match(df1$ref, colnames(counts)))]
df1$var_count <- counts[cbind(row_idx, match(df1$var, colnames(counts)))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
# Nest everything except `id` so each list element holds the data for one id,
# then add the two count columns inside each nested tibble.
# NOTE(review): assumes `id` is unique per row, so each nested tibble has one
# row and `ref` / `var` each resolve to a single column name.
df1 %>%
  nest(data = -id) %>%
  mutate(
    data = map(
      data,
      # `.` is the nested tibble; `ref` and `var` are looked up in it and name
      # the column whose value is copied.
      ~ mutate(., ref_count = .[[ref]], var_count = .[[var]])
    )
  ) %>%
  unnest(data)
#> # A tibble: 3 × 9
#>      id     A     T     C     G ref   var   ref_count var_count
#>   <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>     <dbl>     <dbl>
#> 1     1    10    15     7     0 A     C            10         7
#> 2     2    11     9     2     3 A     G            11         3
#> 3     3     2    31     1    12 T     C            31         1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
# Nest by the unique (ref, var) combinations, so every nested tibble shares
# one `ref` and one `var`; then copy the columns they name into new columns.
df1 %>%
  nest(data = -c(ref, var)) %>%
  mutate(
    data = pmap(
      list(data, ref, var),
      # `df` is the nested tibble for one (ref, var) pair; `ref` and `var` are
      # the single column names that apply to all of its rows.
      function(df, ref, var) {
        mutate(df, ref_count = df[[ref]], var_count = df[[var]])
      }
    )
  ) %>%
  unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

Resources