Cumulative sum equivalent for string - r

I have a table which looks like:
Col A
Col b
a
2,3,4,5
a
3,5,6,7,8
b
1,2,4
b
3,5,7
I want to aggregate this table by Col A. The output should look like the following:
Col A
Col b
a
2,3,4,5,6,7,8
b
1,2,3,4,5,7
Please guide me on how to get the desired output in R.

Assuming the Col.b is a string column as in
quux <- structure(list(Col.A = c("a", "a", "b", "b"), Col.b = c("2,3,4,5", "3,5,6,7,8", "1,2,4", "3,5,7")), class = "data.frame", row.names = c(NA, -4L))
then we can do
library(dplyr)
quux %>%
  group_by(Col.A) %>%
  summarize(Col.b = {
    # Pool every comma-separated piece in the group and keep each value once.
    vals <- unique(unlist(strsplit(Col.b, ",", fixed = TRUE)))
    # Order by numeric value (as text, "10" would sort before "2") so the
    # result matches the requested "1,2,3,4,5,7".
    paste(vals[order(as.numeric(vals))], collapse = ",")
  })
# # A tibble: 2 × 2
# Col.A Col.b
# <chr> <chr>
# 1 a 2,3,4,5,6,7,8
# 2 b 1,2,3,4,5,7
If it is instead a list-column, as in
quux <- structure(list(Col.A = c("a", "a", "b", "b"), Col.b = list(c("2", "3", "4", "5"), c("3", "5", "6", "7", "8"), c("1", "2", "4"), c("3", "5", "7"))), row.names = c(NA, -4L), class = "data.frame")
quux
# Col.A Col.b
# 1 a 2, 3, 4, 5
# 2 a 3, 5, 6, 7, 8
# 3 b 1, 2, 4
# 4 b 3, 5, 7
then we can do
library(dplyr)
quux %>%
group_by(Col.A) %>%
summarize(Col.b = list(unique(unlist(Col.b)))) %>%
as.data.frame()
# Col.A Col.b
# 1 a 2, 3, 4, 5, 6, 7, 8
# 2 b 1, 2, 4, 3, 5, 7
The trailing %>% as.data.frame() is not at all required, it is provided solely to demonstrate what Col.b now contains. Without it, it looks like this, which is value-wise equivalent:
quux %>%
group_by(Col.A) %>%
summarize(Col.b = list(unique(unlist(Col.b))))
# # A tibble: 2 × 2
# Col.A Col.b
# <chr> <list>
# 1 a <chr [7]>
# 2 b <chr [6]>

One way is to use separate_rows to split and aggregate back toString on the unique sorted values, i.e.
library(dplyr)
library(tidyr)
df %>%
separate_rows(Colb, sep = ',') %>%
group_by(ColA) %>%
summarise(Colb = toString(sort(unique(Colb))))
# A tibble: 2 × 2
ColA Colb
<chr> <chr>
1 a 2, 3, 4, 5, 6, 7, 8
2 b 1, 2, 3, 4, 5, 7
DATA
dput(df)
structure(list(ColA = c("a", "a", "b", "b"), Colb = c("2,3,4,5",
"3,5,6,7,8", "1,2,4", "3,5,7")), class = "data.frame", row.names = c(NA,
-4L))

Using base, split on ColA, then split strings by comma, then get unique values, finally paste it back and convert it to dataframe:
# For every ColA group: split its strings on commas, drop repeated values
# (keeping first-seen order) and re-join them; stack() then turns the named
# list into a two-column data frame (values, ind).
stack(
  lapply(
    split(df$Colb, df$ColA),
    function(grp) {
      pieces <- unlist(strsplit(grp, ","))
      paste(pieces[!duplicated(pieces)], collapse = ",")
    }
  )
)
# values ind
# 1 2,3,4,5,6,7,8 a
# 2 1,2,4,3,5,7 b

> dat <- data.table::data.table(a = c('a', 'a', 'b', 'b'), b = c('2,3,4,5', '3,5,6,7,8', '1,2,4', '3,5,7'))
> dat <- dat[, .(bc = paste0(b, collapse = ',')), by = .(a)]
> # vapply() always yields exactly one string per row; the nested mapply() calls simplify to a matrix when every group has the same number of unique values
> dat$bs <- vapply(strsplit(dat$bc, ',', fixed = TRUE), function(v) paste0(unique(v), collapse = ','), character(1))
> dat
a bc bs
1: a 2,3,4,5,3,5,6,7,8 2,3,4,5,6,7,8
2: b 1,2,4,3,5,7 1,2,4,3,5,7
> dat$bs
[1] "2,3,4,5,6,7,8" "1,2,4,3,5,7"

Alternatively please check the separate_rows and paste0 functions
data <- tribble(
  ~a, ~b,
  'a', '2,3,4,5',
  'a', '3,5,6,7,8',
  'b', '1,2,4',
  'b', '3,5,7'
) %>%
  # one row per individual value, then drop values repeated within a group
  separate_rows(b, sep = ',') %>%
  distinct(a, b) %>%
  # summarise() collapses each group to a single row directly and returns an
  # ungrouped tibble, so no mutate() + slice_head() workaround is needed
  group_by(a) %>%
  summarise(b = paste0(b, collapse = ','))
Created on 2023-02-01 with reprex v2.0.2
# A tibble: 2 × 2
a b
<chr> <chr>
1 a 2,3,4,5,6,7,8
2 b 1,2,4,3,5,7

Related

How to combine two rows of a dataframe into one row

I have a dataframe which looks like this.
Name info.1 info.2
ab a 1
123 a 1
de c 4
456 c 4
fg d 5
789 d 5
The two rows that need to be combined are identical aside from the name column and are together in the dataframe. I want the new dataframe to look like this:
Name ID info.1 info.2
ab 123 a 1
de 456 c 4
fg 789 d 5
I have no clue how to do this and google search hasn't been helpful so far
In base R you could do:
# Each record spans two consecutive rows: the name on the odd row and its ID
# on the even row that follows. Take names from odd rows, IDs from even rows,
# and the (identical) info columns once, from the odd rows.
data.frame(Name = df[seq_len(nrow(df)) %% 2 == 1, 1],
           ID = df[seq_len(nrow(df)) %% 2 == 0, 1],
           df[seq_len(nrow(df)) %% 2 == 1, 2:3])
#> Name ID info.1 info.2
#> 1 ab 123 a 1
#> 3 de 456 c 4
#> 5 fg 789 d 5
Created on 2022-07-20 by the reprex package (v2.0.1)
A possible solution:
library(tidyverse)
df %>%
  group_by(info.1) %>%
  # glue the two Name entries of each pair together, e.g. "ab_123"
  summarise(Name = str_c(Name, collapse = "_"), info.2 = first(info.2)) %>%
  # split on the same "_" used above (the default separator would also split
  # on any other punctuation inside a name); TRUE is spelled out because T
  # is an ordinary variable that can be reassigned
  separate(Name, into = c("Name", "ID"), sep = "_", convert = TRUE) %>%
  relocate(info.1, .before = info.2)
#> # A tibble: 3 × 4
#> Name ID info.1 info.2
#> <chr> <int> <chr> <int>
#> 1 ab 123 a 1
#> 2 de 456 c 4
#> 3 fg 789 d 5
Assuming the Name column is consistently ordered Name-ID-Name-ID then:
library(tidyverse)
data <- tibble(Name = c('ab', 123, 'de', 456, 'fg', 789),
info.1 = c('a', 'a', 'c', 'c', 'd', 'd'),
info.2 = c(1, 1, 4, 4, 5, 5))
# remove the troublesome column and make a tibble
# with the unique combos of info1 and 2
data_2 <- data %>% select(info.1, info.2) %>% distinct()
# add columns for name and ID by skipping every other row in the
# original tibble
data_2$Name <- data$Name[seq(from = 1, to = nrow(data), by = 2)]
data_2$ID <- data$Name[seq(from = 2, to = nrow(data), by = 2)]
We could also use summarise and extract first as name and last as id:
# Within each (info.1, info.2) pair the first Name entry is the name and the
# last one is the ID; ungroup() drops the grouping left over by summarise().
ungroup(
  summarise(
    group_by(data, info.1, info.2),
    name = first(Name),
    ID = last(Name)
  )
)
# optionally follow with relocate(3:4, 1:2) to move name/ID to the front
Output:
# A tibble: 3 × 4
info.1 info.2 name ID
<chr> <dbl> <chr> <chr>
1 a 1 ab 123
2 c 4 de 456
3 d 5 fg 789
We could also use
library(dplyr)
library(stringr)
data %>%
group_by(across(starts_with('info'))) %>%
mutate(ID = str_subset(Name, "^\\d+$"), .before = 2) %>%
ungroup %>%
filter(str_detect(Name, '^\\d+$', negate = TRUE))
-output
# A tibble: 3 × 4
Name ID info.1 info.2
<chr> <chr> <chr> <dbl>
1 ab 123 a 1
2 de 456 c 4
3 fg 789 d 5
data
data <- structure(list(Name = c("ab", "123", "de", "456", "fg", "789"
), info.1 = c("a", "a", "c", "c", "d", "d"), info.2 = c(1, 1,
4, 4, 5, 5)), row.names = c(NA, -6L), class = "data.frame")

How to iterate over a variable in rep()

this question has been asked a couple of times but I have yet to find a satisfactory answer that works.
I have a dataframe:
grouping1 <- rep(c('a','b'),times=47350)
grouping2 <- rep(c('A','B', 'C', 'D', 'E'), times=18940)
observations <- rep(c(14, 16, 12, 11, 15, 15,15,18,20,34,12), times=9470)
my_data <- as.data.frame(cbind(grouping1,grouping2,observations))
I would like to group over my grouping variables to pass a different value to 'times' in rep() for each group:
new_data <- my_data %>%
group_by(grouping1,grouping2,grouping3) %>%
mutate(sim_count = rep(1:100, times=observations, each=1))
But the 'times' argument is invalid, no matter if I pipe in a list of values from 'observations' iterate over 'observations' from the dataframe, iterate through observations in a for loop, etc. I think there must be an easy fix but I'm not seeing it. Thank you in advance.
EDIT: Thanks to everyone for their patience; they helped me better envision the data structure and how I could better explain the problem. Here's the solution I came up with:
new_data <- my_data %>%
distinct(grouping1,grouping2,.keep_all=T) %>%
rowwise() %>%
mutate(sim_count = list(rep(1:100,times=observations,each=1))) %>%
unnest_longer(sim_count) %>%
arrange(sim_count)
We can make a list-column and then tidyr::unnest it:
my_data %>%
group_by(grouping1, grouping2, grouping3) %>%
mutate(sim_count = lapply(observations, function(obs) rep(1:100, times = obs, each = 1))) %>%
ungroup() %>%
tidyr::unnest(sim_count)
# # A tibble: 8,300 x 5
# grouping1 grouping2 grouping3 observations sim_count
# <chr> <chr> <chr> <dbl> <int>
# 1 a A 1 14 1
# 2 a A 1 14 2
# 3 a A 1 14 3
# 4 a A 1 14 4
# 5 a A 1 14 5
# 6 a A 1 14 6
# 7 a A 1 14 7
# 8 a A 1 14 8
# 9 a A 1 14 9
# 10 a A 1 14 10
# # ... with 8,290 more rows
Data
my_data <- structure(list(grouping1 = c("a", "a", "a", "b", "b", "b"), grouping2 = c("A", "A", "B", "B", "C", "C"), grouping3 = c("1", "2", "3", "4", "5", "6"), observations = c(14, 16, 12, 11, 15, 15)), class = "data.frame", row.names = c(NA, -6L))
Maybe we can try the following data.table option
setDT(my_data)[
,
.(observations,
sim_count = rep(1:100, times = observations, each = 1)
), grouping1:grouping3
]

Arranging rows and columns in R

I have an output data frame as below. But I would like to rearrange to achieve the result in df2. Is there a way for me to arrange or group it?
df>
a_test1 b_test1 c_test1 a_test2 b_test2 c_test2
Test Test1 Test1 Test1 Test2 Test2 Test2
Result 10 9 4 4 3 1
df2>
a b c
Test1 10 9 4
Test2 4 3 1
dat <- data.frame(a_test1 = 10,
b_test1 = 9,
c_test1 = 4,
a_test2 = 4,
b_test2 = 3,
c_test2 = 1)
You can achieve this with this code:
library(tidyverse)
dat %>%
pivot_longer(cols = everything(),
names_sep = "_",
names_to = c("prefix", "suffix")) %>%
pivot_wider(names_from = prefix)
which gives:
# A tibble: 2 x 4
suffix a b c
<chr> <dbl> <dbl> <dbl>
1 test1 10 9 4
2 test2 4 3 1
UPDATE:
TO asked if it would still work with different column names that contain several underscores as separator:
dat2 <- data.frame(a_test1_10 = 10,
b_test1_10 = 9,
c_test1_10 = 4,
a_test2_10 = 4,
b_test2_10 = 3,
c_test2_10 = 1)
pivot_spec <- data.frame(.name = colnames(dat2),
.value = c("a", "b", "c", "a", "b", "c"),
test_group = c("test1", "test1", "test1", "test2", "test2", "test2"))
This pivot_spec looks like:
.name .value test_group
1 a_test1_10 a test1
2 b_test1_10 b test1
3 c_test1_10 c test1
4 a_test2_10 a test2
5 b_test2_10 b test2
6 c_test2_10 c test2
and then you can just continue pivoting. Actually, the whole pivoting now looks much cleaner and you don't need to combine a pivot_longer with a pivot_wider.
dat2 %>%
pivot_longer_spec(pivot_spec)
which gives:
# A tibble: 2 x 4
test_group a b c
<chr> <dbl> <dbl> <dbl>
1 test1 10 9 4
2 test2 4 3 1
As you can see, creating this pivot_spec template makes the whole thing extremely flexible. The .name column contains all your required data columns, and the .value column contains the new column names and maps the old column names to the new ones. The test_group column (you can choose whatever name you like) determines the rows that will be created and which original column ends up in which row.
You can reshape the data, filter rows and turn a column into rownames.
tidyr::pivot_longer(df,
cols = everything(),
names_to = c('.value', 'col'),
names_sep = '_') %>%
dplyr::filter(!grepl('Test', a)) %>%
type.convert(as.is = TRUE) %>%
tibble::column_to_rownames('col')
# a b c
#test1 10 9 4
#test2 4 3 1
data
df <- structure(list(a_test1 = c("Test1", "10"), b_test1 = c("Test1",
"9"), c_test1 = c("Test1", "4"), a_test2 = c("Test2", "4"), b_test2 = c("Test2",
"3"), c_test2 = c("Test2", "1")), class = "data.frame", row.names = c("Test",
"Result"))

R group by column, count the combinations observed

I imagine this is already solved in many places, but I lack the right wordage to use to search for a solution. In R I have example data in long format like this:
A = tibble( c(1,2,3,1,2,4,5,5), c('a','b','c','a','f','-','b', 'f'))
and what I want returned is sort of a grouped result (something like a spread?) where I first collect the set of letters that match each number to get something like this.
1: 'a', 'a'
2: 'b', 'f'
3: 'c', 'c'
4: '-'
5: 'b', 'f'
and the actual final result I am looking for is the count of how many times each letter combination is observed:
'a','a': 1
'b','f': 2
'c','c': 1
'-': 1
I can do the last step with group_by() but I mention it here in case there is some magic sauce that does the whole thing.
We can do a group by 'a', then paste the second column while taking the number of distinct elements in 'b' and get the distinct rows
library(dplyr)
library(stringr)
A %>%
  group_by(a) %>%
  # one row per number: its letters glued into a single combination string
  summarise(out = str_c(b, collapse = ",")) %>%
  # count how many numbers share each combination; n_distinct(b) would only
  # count the distinct letters inside one group, which is a different quantity
  count(out)
# A tibble: 4 x 2
# out n
# <chr> <int>
#1 - 1
#2 a,a 1
#3 b,f 2
#4 c 1
data
A <- structure(list(a = c(1, 2, 3, 1, 2, 4, 5, 5), b = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
This is close to what you are looking for:
library(tidyverse)
#Data
A <- structure(list(v1 = c(1, 2, 3, 1, 2, 4, 5, 5), v2 = c("a", "b",
"c", "a", "f", "-", "b", "f")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
#Code
# Step 1: collapse the letters of every v1 value into one comma-joined chain.
# Step 2: regroup by that chain and count how many v1 values produced it.
A %>%
  group_by(v1) %>%
  summarise(chain = paste(v2, collapse = ',')) %>%
  ungroup() %>%
  group_by(chain) %>%
  summarise(N = n())
# A tibble: 4 x 2
chain N
<chr> <int>
1 - 1
2 a,a 1
3 b,f 2
4 c 1
Here is a base R option using nested aggregate
aggregate(.~y,aggregate(y~.,A,toString),length)
which gives
> aggregate(.~y,aggregate(y~.,A,toString),length)
y x
1 - 1
2 a, a 1
3 b, f 2
4 c 1
Data
A = tibble(x = c(1,2,3,1,2,4,5,5), y = c('a','b','c','a','f','-','b', 'f'))
Maybe you want to cast the data in wide format and then count the combinations. Try :
library(dplyr)
library(tidyr)
A %>%
group_by(v1) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = row, values_from = v2, names_prefix = 'col_') %>%
ungroup %>%
count(col_1, col_2)
# col_1 col_2 n
# <chr> <chr> <int>
#1 - NA 1
#2 a a 1
#3 b f 2
#4 c NA 1

Subset R data.frame by index and name in one line

Sample data.frame:
structure(list(a = c(1, 2, 3), b = c(4, 5, 6), c = c(7, 8, 9)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
Output:
df
# a b c
# 1 1 4 7
# 2 2 5 8
# 3 3 6 9
I'd like to get the first and third columns, but I want to subset by name and also by column index.
df[, "a"]
# [1] 1 2 3
df[, 3]
# [1] 7 8 9
df[, c("a", 3)]
# Error in `[.data.frame`(df, , c("a", 3)) : undefined columns selected
df[, c(match("a", names(df)), 3)]
# a c
# 1 1 7
# 2 2 8
# 3 3 9
Are there functions or packages that allow for clean/simple syntax, as in the third example, while also achieving the result of the fourth example?
Maybe use dplyr?
For interactive use - i.e., if you know ahead of time the name of the column you want to select
library(dplyr)
df %>% select(a, 3)
If you do not know the name of the column in advance, and want to pass it as a variable,
x <- names(df)[1]
x
[1] "a"
# select_() is deprecated; all_of() selects the column whose name is stored
# in the character variable x, and can be mixed with a positional index
df %>% select(all_of(x), 3)
Either way the output is
# a c
#1 1 7
#2 2 8
#3 3 9
In base R you can combine subset with select.
df <- structure(list(a = c(1, 2, 3),
b = c(4, 5, 6), c = c(7, 8, 9)),
.Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
df <- subset(df, select = c(a, 3))
You can index names(df) without using dplyr:
df <- structure(list(a = c(1, 2, 3), b = c(4, 5, 6), c = c(7, 8, 9)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
df[,c("a",names(df)[3]) ]
Output:
a c
1 1 7
2 2 8
3 3 9

Resources