Related
I have a table which looks like:
Col A
Col b
a
2,3,4,5
a
3,5,6,7,8
b
1,2,4
b
3,5,7
I want to aggregate this table by Col A. The output should look like the following:
Col A
Col b
a
2,3,4,5,6,7,8
b
1,2,3,4,5,7
Could you please advise how I can get the desired output in R?
Assuming the Col.b is a string column as in
# Example input: Col.b holds comma-separated values as plain strings.
quux <- data.frame(
  Col.A = c("a", "a", "b", "b"),
  Col.b = c("2,3,4,5", "3,5,6,7,8", "1,2,4", "3,5,7"),
  stringsAsFactors = FALSE
)
then we can do
library(dplyr)
# For each Col.A group: split every string on commas, pool the pieces,
# drop duplicates and paste them back together. The values are sorted
# numerically so the result matches the requested output (1,2,3,4,5,7
# rather than first-seen order 1,2,4,3,5,7).
# NOTE: as.numeric() assumes every piece is a number; non-numeric pieces
# would become NA with a warning.
quux %>%
  group_by(Col.A) %>%
  summarize(
    Col.b = paste(
      sort(unique(as.numeric(unlist(strsplit(Col.b, ","))))),
      collapse = ","
    )
  )
# # A tibble: 2 × 2
#   Col.A Col.b
#   <chr> <chr>
# 1 a     2,3,4,5,6,7,8
# 2 b     1,2,3,4,5,7
If it is instead a list-column, as in
# Example input: Col.b is a list-column, one character vector per row.
quux <- data.frame(Col.A = c("a", "a", "b", "b"), stringsAsFactors = FALSE)
quux$Col.b <- list(
  c("2", "3", "4", "5"),
  c("3", "5", "6", "7", "8"),
  c("1", "2", "4"),
  c("3", "5", "7")
)
quux
# Col.A Col.b
# 1 a 2, 3, 4, 5
# 2 a 3, 5, 6, 7, 8
# 3 b 1, 2, 4
# 4 b 3, 5, 7
then we can do
library(dplyr)
# Pool the list elements of every Col.A group and keep each value once;
# the result is again a list-column.
quux_by_key <- group_by(quux, Col.A)
quux_merged <- summarize(quux_by_key, Col.b = list(unique(unlist(Col.b))))
# as.data.frame() only changes how the list-column is printed.
as.data.frame(quux_merged)
# Col.A Col.b
# 1 a 2, 3, 4, 5, 6, 7, 8
# 2 b 1, 2, 4, 3, 5, 7
The trailing %>% as.data.frame() is not at all required, it is provided solely to demonstrate what Col.b now contains. Without it, it looks like this, which is value-wise equivalent:
# Same aggregation, printed as a tibble: the list-column shows only the
# type and length of each element.
summarize(
  group_by(quux, Col.A),
  Col.b = list(unique(unlist(Col.b)))
)
# # A tibble: 2 × 2
# Col.A Col.b
# <chr> <list>
# 1 a <chr [7]>
# 2 b <chr [6]>
One way is to use separate_rows to split and aggregate back toString on the unique sorted values, i.e.
library(dplyr)
library(tidyr)
# One row per value, then collapse the unique sorted values per group.
# convert = TRUE turns the split pieces into numbers, so sort() orders
# them numerically; sorting the raw strings would put "10" before "2".
df %>%
  separate_rows(Colb, sep = ",", convert = TRUE) %>%
  group_by(ColA) %>%
  summarise(Colb = toString(sort(unique(Colb))))
# A tibble: 2 × 2
ColA Colb
<chr> <chr>
1 a 2, 3, 4, 5, 6, 7, 8
2 b 1, 2, 3, 4, 5, 7
DATA
# Example data (equivalent to the dput() output), assigned to `df` so the
# snippet can be run as-is.
df <- data.frame(
  ColA = c("a", "a", "b", "b"),
  Colb = c("2,3,4,5", "3,5,6,7,8", "1,2,4", "3,5,7"),
  stringsAsFactors = FALSE
)
Using base, split on ColA, then split strings by comma, then get unique values, finally paste it back and convert it to dataframe:
# Split Colb by ColA; within each group split the strings on commas, keep
# the unique values, order them numerically (to match the requested
# output) and paste them back. stack() turns the named list into a
# two-column data frame.
# NOTE: ordering via as.numeric() assumes every piece is a number.
stack(lapply(split(df$Colb, df$ColA), function(grp) {
  vals <- unique(unlist(strsplit(grp, ",", fixed = TRUE)))
  paste(vals[order(as.numeric(vals))], collapse = ",")
}))
# values ind
# 1 2,3,4,5,6,7,8 a
# 2 1,2,3,4,5,7 b
# data.table variant, written as a runnable script (no console prompts).
dat <- data.table::data.table(
  a = c("a", "a", "b", "b"),
  b = c("2,3,4,5", "3,5,6,7,8", "1,2,4", "3,5,7")
)
# bc: all strings of a group concatenated into one comma-separated string.
dat <- dat[, .(bc = paste0(b, collapse = ",")), by = .(a)]
# bs: split bc again, drop duplicates and re-join. vapply() guarantees one
# string per row instead of relying on mapply()'s implicit simplification.
dat$bs <- vapply(
  strsplit(dat$bc, ",", fixed = TRUE),
  function(parts) paste0(unique(parts), collapse = ","),
  character(1)
)
dat
#    a                bc            bs
# 1: a 2,3,4,5,3,5,6,7,8 2,3,4,5,6,7,8
# 2: b       1,2,4,3,5,7   1,2,4,3,5,7
dat$bs
# [1] "2,3,4,5,6,7,8" "1,2,4,3,5,7"
Alternatively please check the separate_rows and paste0 functions
# Build the example rows, split b into one value per row, drop duplicate
# (a, b) pairs, then paste the values of each group together and keep a
# single row per group. The result stays grouped by a.
raw_rows <- tribble(
  ~a, ~b,
  "a", "2,3,4,5",
  "a", "3,5,6,7,8",
  "b", "1,2,4",
  "b", "3,5,7"
)
data <- raw_rows %>%
  group_by(a) %>%
  separate_rows(b, sep = ",") %>%
  distinct(a, b) %>%
  mutate(b = paste0(b, collapse = ",")) %>%
  slice_head(n = 1)
Created on 2023-02-01 with reprex v2.0.2
# A tibble: 2 × 2
# Groups: a [2]
a b
<chr> <chr>
1 a 2,3,4,5,6,7,8
2 b 1,2,4,3,5,7
I have a df like the one below and I would like to transform it into something like the table on the right. How can I duplicate the rows with Type=="N" and add a new variable Grade?
Basically, if Type==N, then Grade can be S or W, that is why we need to duplicate the rows.
# Example input: a tibble with a character column Type and a numeric
# column Result (four rows).
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
Using some functions from tidyverse, you can use crossing to duplicate rows and add the "Grade" column at the same time, then filter to match your stated rules.
library(tidyverse)
# Pair every row of df with both grades, then keep the pairs allowed by
# the rule: "N" rows take either grade, other rows only their own type.
grade_levels <- tibble(Grade = c("S", "W"))
all_pairs <- crossing(df, grade_levels)
result <- filter(all_pairs, Type == Grade | Type == "N")
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
I think this approach is extensible to many more conditions assuming yours is the minimal example and you have a larger more complicated dataset.
library(dplyr)
# Example input.
df <- structure(
  list(Type = c("N", "N", "S", "W"), Result = c(8, 9, 7, 6)),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Lookup table: the grades that a Type of "N" expands to.
df2 <- data.frame(Type2 = rep("N", 2), Grade = c("S", "W"))
# The join duplicates the "N" rows (one per grade); the remaining rows get
# NA, which case_when() fills from their own Type.
joined <- left_join(select(df, Type, Result), df2, by = c("Type" = "Type2"))
mutate(
  joined,
  Grade = case_when(
    Type == "S" ~ "S",
    Type == "W" ~ "W",
    TRUE ~ Grade
  )
)
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Another option is to use if_else() (or case_when() if there are more complex conditions) to return a list column of multiple values and unnest:
library(dplyr)
library(tidyr)
# Grade starts as a list-column: both grades for "N" rows, the row's own
# Type otherwise. unnest() then expands it to one row per grade.
with_grades <- mutate(
  df,
  Grade = if_else(Type == "N", list(c("S", "W")), as.list(Type))
)
unnest(with_grades, Grade)
# A tibble: 6 x 3
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Or:
# Same idea with case_when(): build a list-column of grades, then unnest.
graded <- mutate(
  df,
  Grade = case_when(
    Type == "N" ~ list(c("S", "W")),
    TRUE ~ as.list(Type)
  )
)
unnest(graded, Grade)
A dplyr way:
We could use bind_rows after using slice.
library(dplyr)
# Select the "N" rows by value instead of by position (slice(1:2) only
# works while they happen to be the first two rows), and attach Grade
# before binding, so it no longer depends on the recycling order of
# c("S", "W") across the whole table.
n_rows <- filter(df, Type == "N")
bind_rows(
  mutate(n_rows, Grade = "S"),
  mutate(n_rows, Grade = "W"),
  mutate(filter(df, Type != "N"), Grade = Type)
) %>%
  arrange(Type, Result, Grade) %>%
  relocate(Grade, .after = Type)
Type Grade Result
<chr> <chr> <dbl>
1 N S 8
2 N W 8
3 N S 9
4 N W 9
5 S S 7
6 W W 6
Here is a possible data.table option:
library(data.table)
dt <- as.data.table(df)
# For every Result, cross-join its Type with both grades (CJ() names the
# unnamed columns V1 and V2), then keep the allowed pairs: "N" rows take
# either grade, other rows only their own type.
output <- dt[, CJ(.SD$Type, c("S", "W")), by = .(Result)][V1 == "N" | V1 == V2]
# Rename by old name. The intermediate column order is Result, V1, V2, so
# renaming positionally with c(names(dt), "Grade") swapped the labels of
# Type and Result.
setnames(output, c("V1", "V2"), c("Type", "Grade"))
setcolorder(output, c("Type", "Grade", "Result"))
# Output
#    Type Grade Result
# 1:    N     S      8
# 2:    N     W      8
# 3:    N     S      9
# 4:    N     W      9
# 5:    S     S      7
# 6:    W     W      6
This question has been asked a couple of times, but I have yet to find a satisfactory answer that works.
I have a dataframe:
# Example data from the question: two grouping vectors and one vector of
# observation counts, combined into a data frame.
# NOTE(review): the vectors have lengths 94700, 94700 and 104170, so cbind()
# has to recycle the shorter ones — confirm this is intended.
# NOTE(review): cbind() of character and numeric vectors yields a character
# matrix, so `observations` ends up stored as text in my_data.
grouping1 <- rep(c('a','b'),times=47350)
grouping2 <- rep(c('A','B', 'C', 'D', 'E'), times=18940)
observations <- rep(c(14, 16, 12, 11, 15, 15,15,18,20,34,12), times=9470)
my_data <- as.data.frame(cbind(grouping1,grouping2,observations))
I would like to group over my grouping variables to pass a different value to 'times' in rep() for each group:
# Attempt from the question (reported to fail with an invalid 'times'
# argument): repeat 1:100 per group, using `observations` as `times`.
# NOTE(review): grouping3 is not among the columns created for my_data in
# the question (grouping1, grouping2, observations).
new_data <- my_data %>%
group_by(grouping1,grouping2,grouping3) %>%
mutate(sim_count = rep(1:100, times=observations, each=1))
But the 'times' argument is invalid, no matter whether I pipe in a list of values from 'observations', iterate over 'observations' from the dataframe, iterate through observations in a for loop, etc. I think there must be an easy fix but I'm not seeing it. Thank you in advance.
EDIT: Thanks to everyone for their patience; they helped me better envision the data structure and how I could better explain the problem. Here's the solution I came up with:
# Keep one row per (grouping1, grouping2) pair, build that row's repeated
# 1:100 sequence as a list-column, then expand it to one row per value.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
new_data <- my_data %>%
  distinct(grouping1, grouping2, .keep_all = TRUE) %>%
  rowwise() %>%
  mutate(sim_count = list(rep(1:100, times = observations, each = 1))) %>%
  unnest_longer(sim_count) %>%
  arrange(sim_count)
We can make a list-column and then tidyr::unnest it:
# Helper: the 1:100 sequence repeated `obs` times.
repeat_sequence <- function(obs) rep(1:100, times = obs, each = 1)
# One sequence per row, stored in a list-column, then expanded to long form.
my_data %>%
  group_by(grouping1, grouping2, grouping3) %>%
  mutate(sim_count = lapply(observations, repeat_sequence)) %>%
  ungroup() %>%
  tidyr::unnest(sim_count)
# # A tibble: 8,300 x 5
# grouping1 grouping2 grouping3 observations sim_count
# <chr> <chr> <chr> <dbl> <int>
# 1 a A 1 14 1
# 2 a A 1 14 2
# 3 a A 1 14 3
# 4 a A 1 14 4
# 5 a A 1 14 5
# 6 a A 1 14 6
# 7 a A 1 14 7
# 8 a A 1 14 8
# 9 a A 1 14 9
# 10 a A 1 14 10
# # ... with 8,290 more rows
Data
# Small example: six rows, three character grouping columns and a numeric
# observations column.
my_data <- data.frame(
  grouping1 = c("a", "a", "a", "b", "b", "b"),
  grouping2 = c("A", "A", "B", "B", "C", "C"),
  grouping3 = c("1", "2", "3", "4", "5", "6"),
  observations = c(14, 16, 12, 11, 15, 15),
  stringsAsFactors = FALSE
)
Maybe we can try the following data.table option
# Convert my_data to a data.table by reference, then build the repeated
# 1:100 sequence for every grouping1..grouping3 combination.
setDT(my_data)
my_data[
  ,
  .(observations, sim_count = rep(1:100, times = observations, each = 1)),
  by = grouping1:grouping3
]
I imagine this is already solved in many places, but I lack the right terminology to search for a solution. In R I have example data in long format like this:
# Long-format example: a number column and a letter column (both unnamed).
A <- tibble(
  c(1, 2, 3, 1, 2, 4, 5, 5),
  c("a", "b", "c", "a", "f", "-", "b", "f")
)
and what I want returned is sort of a grouped result (something like a spread?) where I first collect the set of letters that match each number to get something like this.
1: 'a', 'a'
2: 'b', 'f'
3: 'c', 'c'
4: '-'
5: 'b', 'f'
and the actual final result I am looking for is the count of how many times each letter combination is observed:
'a','a': 1
'b','f': 2
'c','c': 1
'-': 1
I can do the last step with group_by() but I mention it here in case there is some magic sauce that does the whole thing.
We can do a group by 'a', then paste the second column while taking the number of distinct elements in 'b' and get the distinct rows
library(dplyr)
library(stringr)
# Step 1: paste the letters of every number into one string.
# Step 2: count how many numbers share each string. n_distinct(b) counted
# the distinct letters inside a group, which matches the wanted counts for
# this sample only by coincidence.
A %>%
  group_by(a) %>%
  summarise(out = str_c(b, collapse = ",")) %>%
  count(out, name = "n")
# # A tibble: 4 x 2
#   out       n
#   <chr> <int>
# 1 -         1
# 2 a,a       1
# 3 b,f       2
# 4 c         1
data
# Example data: tibble with a numeric column a and a character column b.
A <- structure(list(a = c(1, 2, 3, 1, 2, 4, 5, 5), b = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
This is close to what you are looking for:
library(tidyverse)
# Data
A <- structure(
  list(
    v1 = c(1, 2, 3, 1, 2, 4, 5, 5),
    v2 = c("a", "b", "c", "a", "f", "-", "b", "f")
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Code: paste the letters of every v1 into one chain, then count how many
# v1 values produce each chain.
chains <- A %>%
  group_by(v1) %>%
  summarise(chain = paste0(v2, collapse = ","))
count(chains, chain, name = "N")
# A tibble: 4 x 2
chain N
<chr> <int>
1 - 1
2 a,a 1
3 b,f 2
4 c 1
Here is a base R option using nested aggregate
# Inner step: collapse the y values of every x into one string.
# Outer step: count how many x values share each collapsed string.
collapsed <- aggregate(y ~ ., data = A, FUN = toString)
aggregate(. ~ y, data = collapsed, FUN = length)
which gives
> aggregate(.~y,aggregate(y~.,A,toString),length)
y x
1 - 1
2 a, a 1
3 b, f 2
4 c 1
Data
# Example data with named columns x (numbers) and y (letters).
A <- tibble(
  x = c(1, 2, 3, 1, 2, 4, 5, 5),
  y = c("a", "b", "c", "a", "f", "-", "b", "f")
)
Maybe you want to cast the data in wide format and then count the combinations. Try :
library(dplyr)
library(tidyr)
# Number the letters within every v1, spread them into col_1, col_2, ...
# and count how often each combination of columns occurs.
wide <- A %>%
  group_by(v1) %>%
  mutate(row = row_number()) %>%
  pivot_wider(names_from = row, values_from = v2, names_prefix = "col_") %>%
  ungroup()
count(wide, col_1, col_2)
# col_1 col_2 n
# <chr> <chr> <int>
#1 - NA 1
#2 a a 1
#3 b f 2
#4 c NA 1
My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
# Reshape A:G to long format, keep the row matching `ref` and the row
# matching `var`, and label each by WHICH column it matched. Labelling by
# position (c('ref_count', 'var_count')) is wrong whenever the var column
# comes before the ref column in A:G, and fails when ref == var.
long <- pivot_longer(df1, cols = A:G)
bind_rows(
  long %>% filter(name == ref) %>% mutate(nm1 = "ref_count"),
  long %>% filter(name == var) %>% mutate(nm1 = "var_count")
) %>%
  select(id, value, nm1) %>%
  pivot_wider(names_from = nm1, values_from = value) %>%
  left_join(df1, ., by = "id")
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
# Vectorised (row, column) lookup: for every row pick the count column
# named in `ref` / `var`. The matrix is built once and reused; the new
# columns are named ref_count / var_count as requested in the question.
base_cols <- names(df1)[2:5]
count_mat <- as.matrix(df1[base_cols])
row_idx <- seq_len(nrow(df1))
df1$ref_count <- count_mat[cbind(row_idx, match(df1$ref, base_cols))]
df1$var_count <- count_mat[cbind(row_idx, match(df1$var, base_cols))]
data
# Example data: an integer id, numeric count columns A, T, C, G, and
# character columns ref / var that name one of the count columns per row.
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
# Nest every id into its own one-row tibble, then look up the columns
# named by that row's ref / var. The result columns are named
# ref_count / var_count, as requested in the question.
df1 %>%
  nest(data = -id) %>%
  mutate(
    data = map(
      data,
      function(nested_row) {
        mutate(
          nested_row,
          ref_count = nested_row[[ref]],
          var_count = nested_row[[var]]
        )
      }
    )
  ) %>%
  unnest(data)
#> # A tibble: 3 × 9
#>      id     A     T     C     G ref   var   ref_count var_count
#>   <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>     <dbl>     <dbl>
#> 1     1    10    15     7     0 A     C            10         7
#> 2     2    11     9     2     3 A     G            11         3
#> 3     3     2    31     1    12 T     C            31         1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
# Nest by the unique (ref, var) combinations; each nested tibble then
# shares one ref / var pair, which is passed to the helper alongside it.
# The result columns are named ref_count / var_count, as requested in the
# question.
df1 %>%
  nest(data = -c(ref, var)) %>%
  mutate(
    data = pmap(
      list(data, ref, var),
      function(df, ref, var) {
        mutate(df, ref_count = df[[ref]], var_count = df[[var]])
      }
    )
  ) %>%
  unnest(data)
The data were specified by akrun:
# Example data: an integer id, numeric count columns A, T, C, G, and
# character columns ref / var that name one of the count columns per row.
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))