Set collapse for paste function in aggregate - r

I want to aggregate a data.frame with two columns: in one column I have "num", which is an identifier number and in the other I have text. It is important that the aggregated text has a space between the individual parts. My code is this:
data_aggr <- aggregate(
x = data_aggr,
FUN = paste,
by = list(data_aggr$num)
)
I have tried the obvious with FUN = paste(collapse = " ") and
FUN = paste,
collapse = " ",
but that doesn't work. How do I need to do this?

Aggregate can be used to paste together the rows with the same value of num as follows:
data_aggr <- data.frame(num=c(1,1,1,2,2), letters=letters[1:5])
aggregate(data_aggr$letters, list(data_aggr$num), FUN=paste, collapse= " ")
# Group.1 x
# 1 1 a b c
# 2 2 d e

A dplyr solution, the idea is to create a new column with row number to be able to conduct the operation on each row.
> library(dplyr)
> df.ask <- data.frame('Num' = 1:10,
+ 'Text' = letters[1:10])
>
> df.ask %>%
+ mutate(row_num = row_number()) %>%
+ group_by(row_num) %>%
+ mutate(together = paste(Num, Text, collapse = ' ')) %>%
+ ungroup() %>%
+ select(-row_num)
# A tibble: 10 x 3
Num Text together
<int> <fct> <chr>
1 1 a 1 a
2 2 b 2 b
3 3 c 3 c
4 4 d 4 d
5 5 e 5 e
6 6 f 6 f
7 7 g 7 g
8 8 h 8 h
9 9 i 9 i
10 10 j 10 j

Related

How to replicate a String in a dataframe individually N times [duplicate]

This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Collapse / concatenate / aggregate a column to a single comma separated string within each group
(6 answers)
Closed 2 years ago.
I have a dataframe and I want to replicate the input of a single cell n times dependent on the input of the next cell and display it in a new cell.
My dataframe looks like this:
data <- data.frame(c(1,1,2,3,4,4,4), c("A","B","A","C","D","E","A"), c(2,1,1,3,2,1,3))
colnames(data) <- c("document number", "term", "count")
data
This is my desired result:
datanew <- data.frame(c(1,2,3,4), c("A A B", "A", "C C C", "D D E A A A"))
colnames(datanew) <- c("document number", "term")
# document number term
# 1 1 A A B
# 2 2 A
# 3 3 C C C
# 4 4 D D E A A A
So basically, I like to multiplicate the input of the term cell with the input of the corresponding count cell. Does anyone has an idea how to code it in R?
We can use rep to replicate term count times and paste the data together.
library(dplyr)
data %>%
group_by(`document number`) %>%
summarise(new = paste(rep(term, count), collapse = " "))
# A tibble: 4 x 2
# `document number` new
# <dbl> <chr>
#1 1 A A B
#2 2 A
#3 3 C C C
#4 4 D D E A A A
Similarly with data.table
library(data.table)
setDT(data)[, (new = paste(rep(term, count), collapse = " ")),
by = `document number`]
We can do this with tidyverse methods
library(dplyr)
library(tidyr)
library(stringr)
data %>%
uncount(count) %>%
group_by(`document number`) %>%
summarise(term = str_c(term, collapse=' '))
# A tibble: 4 x 2
# `document number` term
# <dbl> <chr>
#1 1 A A B
#2 2 A
#3 3 C C C
#4 4 D D E A A A
Or with data.table
library(data.table)
setDT(data)[rep(seq_len(.N), count)][, .(term =
paste(term, collapse=' ')), `document number`]
Or using base R with aggregate
aggregate(term ~ `document number`, data[rep(seq_len(nrow(data)),
data$count),], FUN = paste, collapse= ' ')

Recursively sum data frames for matching rows

I would like to combine a set of data frames into a single data frame by summing columns that have matching variables (instead of appending columns).
For example, given
df1 <- data.frame(A = c(0,0,1,1,1,2,2), B = c(1,2,1,2,3,1,5), x = c(2,3,1,5,3,7,0))
df2 <- data.frame(A = c(0,1,1,2,2,2), B = c(1,1,3,2,4,5), x = c(4,8,4,1,0,3))
df3 <- data.frame(A = c(0,1,2), B = c(5,4,2), x = c(5,3,1))
I want to match by "A" and "B" and sum the values of "x". For this example, I can get the desired result as follows:
library(plyr)
library(dplyr)
# rename columns so that join_all preserves them all:
colnames(df1)[3] <- "x1"
colnames(df2)[3] <- "x2"
colnames(df3)[3] <- "x3"
# join the data frames by matching "A" and "B" values:
res <- join_all(list(df1, df2, df3), by = c("A", "B"), type = "full")
# get the sums and drop superfluous columns:
arrange(res, A, B) %>%
rowwise() %>%
mutate(x = sum(x1, x2, x3, na.rm = TRUE)) %>%
select(A, B, x)
Result:
A B x
<dbl> <dbl> <dbl>
1 0 1 6
2 0 2 3
3 0 5 5
4 1 1 9
5 1 2 5
6 1 3 7
7 1 4 3
8 2 1 7
9 2 2 2
10 2 4 0
11 2 5 3
A more general solution is
library(dplyr)
# function to get the desired result for two data frames:
my_merge <- function(df1, df2)
{
m1 <- merge(df1, df2, by = c("A", "B"), all = TRUE)
m1 <- rowwise(res) %>%
mutate(x = sum(x.x, x.y, na.rm = TRUE)) %>%
select(A, B, x)
return(m1)
}
l1 <- list(df2, df3) # omit the first data frame
res <- df1 # initial value of the result
for(df in l1) res <- my_merge(res, df) # call the function repeatedly
Is there a more efficient option for combining a large set of data frames? Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
An easier option is to bind the rows of the datasets, then group by the columns of interest and get the summarised output by getting the sum of 'x'
library(tidyverse)
bind_rows(df1, df2, df3) %>%
group_by(A, B) %>%
summarise(x = sum(x))
# A tibble: 11 x 3
# Groups: A [?]
# A B x
# <dbl> <dbl> <dbl>
# 1 0 1 6
# 2 0 2 3
# 3 0 5 5
# 4 1 1 9
# 5 1 2 5
# 6 1 3 7
# 7 1 4 3
# 8 2 1 7
# 9 2 2 2
#10 2 4 0
#11 2 5 3
If there are many objects in the global environment with the pattern "df" followed by some digits
mget(ls(pattern= "^df\\d+")) %>%
bind_rows %>%
group_by(A, B) %>%
summarise(x = sum(x))
As the OP mentioned about memory constraints, if we do the join first and then use rowSums or + with reduce, it would be more efficient
mget(ls(pattern= "^df\\d+")) %>%
reduce(full_join, by = c("A", "B")) %>%
transmute(A, B, x = rowSums(.[3:5], na.rm = TRUE)) %>%
arrange(A, B)
# A B x
#1 0 1 6
#2 0 2 3
#3 0 5 5
#4 1 1 9
#5 1 2 5
#6 1 3 7
#7 1 4 3
#8 2 1 7
#9 2 2 2
#10 2 4 0
#11 2 5 3
This could also be done with data.table
library(data.table)
rbindlist(mget(ls(pattern= "^df\\d+")))[, .(x = sum(x)), by = .(A, B)]
Ideally it should be recursive (i.e. it's better not to join all data frames into one massive data frame before calculating the sums).
If you're memory constrained and willing to sacrifice speed (vs #akrun's data.table approach), use one table at a time in a loop:
library(data.table)
tabs = c("df1", "df2", "df3")
# enumerate all combos for the results table
# initializing sum to 0
res = CJ(A = 0:2, B = 1:5, x = 0)
# loop over tabs, adding on
for (i in seq_along(tabs)){
tab = get(tabs[[i]])
res[tab, on=.(A, B), x := x + i.x][]
rm(tab)
}
If you need to read tables from disk, change tabs to file names and get to fread or whatever function.
I am skeptical that you can fit all the tables in memory, but cannot also fit an rbind-ed copy of them together.
Similarly (thanks to #akrun's comment), use his approach pairwise:
res = data.table(get(tabs[[1]]))[0L]
for (i in seq_along(tabs)){
tab = get(tabs[[i]])
res = rbind(res, tab)[, .(x = sum(x)), by=.(A,B)]
rm(tab)
}

Extract character list values from data.frame rows and reshape data

I have a variable x with character lists in each row:
dat <- data.frame(id = c(rep('a',2),rep('b',2),'c'),
x = c('f,o','f,o,o','b,a,a,r','b,a,r','b,a'),
stringsAsFactors = F)
I would like to reshape the data so that each row is a unique (id, x) pair such as:
dat2 <- data.frame(id = c(rep('a',2),rep('b',3),rep('c',2)),
x = c('f','o','a','b','r','a','b'))
> dat2
id x
1 a f
2 a o
3 b a
4 b b
5 b r
6 c a
7 c b
I've attempted to do this by splitting the character lists and keeping only the unique list values in each row:
dat$x <- sapply(strsplit(dat$x, ','), sort)
dat$x <- sapply(dat$x, unique)
dat <- unique(dat)
> dat
id x
1 a f, o
3 b a, b, r
5 c a, b
However, I'm not sure how to proceed with converting the row lists into individual row entries.
How would I accomplish this? Or is there a more efficient way of converting a list of strings to reshape the data as described?
You can use tidytext::unnest_tokens:
library(tidytext)
library(dplyr)
dat %>%
unnest_tokens(x1, x) %>%
distinct()
id x1
1 a f
2 a o
3 b b
4 b a
5 b r
6 c b
7 c a
A base R method with two lines is
#get list of X potential vars
x <- strsplit(dat$x, ",")
# construct full data.frame, then use unique to return desired rows
unique(data.frame(id=rep(dat$id, lengths(x)), x=unlist(x)))
This returns
id x
1 a f
2 a o
6 b b
7 b a
9 b r
13 c b
14 c a
If you don't want to write out the variable names yourself, you can use setNames.
setNames(unique(data.frame(rep(dat$id, lengths(x)), unlist(x))), names(dat))
We could use separate_rows
library(tidyverse)
dat %>%
separate_rows(x) %>%
distinct()
# id x
#1 a f
#2 a o
#3 b b
#4 b a
#5 b r
#6 c b
#7 c a
A solution can be achieved using splitstackshape::cSplit to split x column into mulltiple columns. Then gather and filter will help to achieve desired output.
library(tidyverse)
library(splitstackshape)
dat %>% cSplit("x", sep=",") %>%
mutate_if(is.factor, as.character) %>%
gather(key, value, -id) %>%
filter(!is.na(value)) %>%
select(-key) %>% unique()
# id value
# 1 a f
# 3 b b
# 5 c b
# 6 a o
# 8 b a
# 10 c a
# 13 b r
Base solution:
temp <- do.call(rbind, apply( dat, 1,
function(z){ data.frame(
id=z[1],
x = scan(text=z['x'], what="",sep=","),
stringsAsFactors=FALSE)} ) )
Read 2 items
Read 3 items
Read 4 items
Read 3 items
Read 2 items
Warning messages:
1: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
2: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
3: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
4: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
5: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
temp[!duplicated(temp),]
#------
id x
1 a f
2 a o
6 b b
7 b a
9 b r
13 c b
14 c a
To get rid of all the messages and warnings:
temp <- do.call(rbind, apply( dat, 1,
function(z){ suppressWarnings(data.frame(id=z[1],
x = scan(text=z['x'], what="",sep=",", quiet=TRUE), stringsAsFactors=FALSE)
)} ) )
temp[!duplicated(temp),]

Function of strings to tibble

I need to define a function f(x,y) such that:
x = "col1,1,2,3,4"
y = "col2,a,b,c,d"
becomes:
# A tibble: 4 x 2
col1 col2
<int> <chr>
1 1 a
2 2 b
3 3 c
4 4 d
Any thoughts? Thanks.
The most obvious idea that comes to mind is to split the input by comma, use paste to combine the output into a single string, and read that using read_csv.
Example:
paste(do.call(paste, c(strsplit(c(x, y), ","), sep = ", ")), collapse = "\n")
# [1] "col1, col2\n1, a\n2, b\n3, c\n4, d"
library(tidyverse)
read_csv(paste(do.call(paste, c(strsplit(c(x, y), ","), sep = ", ")), collapse = "\n"))
# # A tibble: 4 x 2
# col1 col2
# <int> <chr>
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
From there, I hope you're able to convert the approach to a function.

Concatenating all rows within a group using dplyr

Suppose I have a dataframe like this:
hand_id card_id card_name card_class
A 1 p alpha
A 2 q beta
A 3 r theta
B 2 q beta
B 3 r theta
B 4 s gamma
C 1 p alpha
C 2 q beta
I would like to concatenate the card_id, card_name, and card_class into one single row per hand level A, B, C. So the result would look something like this:
hand_id combo_1 combo_2 combo_3
A 1-2-3 p-q-r alpha-beta-theta
B 2-3-4 q-r-s beta-theta-gamma
....
I attempted to do this using group_by and mutate, but I can't seem to get it to work
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
mutate(combo_1 = paste(card_id),
combo_2 = paste(card_name),
combo_3 = paste(card_class))
Thank you for your help.
You were kind of close!
library(tidyr)
library(dplyr)
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
summarise(combo_1 = paste(card_id, collapse = "-"),
combo_2 = paste(card_name, collapse = "-"),
combo_3 = paste(card_class, collapse = "-"))
or using summarise_each:
byHand <- group_by(data, hand_id) %>%
summarise_each(funs(paste(., collapse = "-")))
Here is another option using data.table
library(data.table)
setDT(data)[, lapply(.SD, paste, collapse="-") , by = hand_id]
# hand_id card_id card_name card_class
#1: A 1-2-3 p-q-r alpha-beta-theta
#2: B 2-3-4 q-r-s beta-theta-gamma
#3: C 1-2 p-q alpha-beta
Not very familiar with dplyr... so here's my attempt without dplyr
df <- read_csv('data.csv')
res <- lapply(split(df, df$hand_id),function(x){
sL <- apply(x[,-1], 2, function(y) paste(y, collapse = "-"))
d <- data.frame(x$hand_id[1], rbind(sL))
names(d) <- c("hand_id", "combo_1", "combo_2", "combo_3")
return(d)
})
res <- do.call("rbind",res)
rownames(res) <- NULL
Here's the output:
## hand_id combo_1 combo_2 combo_3
## 1 A 1-2-3 p-q-r alpha-beta-theta
## 2 B 2-3-4 q-r-s beta-theta-gamma
## 3 C 1-2 p-q alpha-beta
If you have NAs in your data, you can use na.omit() inline with str_c(). unique() will also work if you only want the distinct.
data:
hand_id card_id card_name card_class
<chr> <dbl> <chr> <chr>
1 A 1 p alpha
2 A 2 q beta
3 A 3 r theta
4 A NA NA NA
5 B 2 q beta
6 B 3 r theta
7 B 4 s gamma
8 C 1 p alpha
9 C 2 q beta
code:
data %>%
group_by(hand_id) %>%
summarize(card_id = str_c(na.omit(card_id), collapse = "-"),
card_name = str_c(na.omit(card_name), collapse = "-"),
card_class = str_c(na.omit(card_class), collapse = "-"))
output:
hand_id card_id card_name card_class
* <chr> <chr> <chr> <chr>
1 A 1-2-3 p-q-r alpha-beta-the…
2 B 2-3-4 q-r-s beta-theta-gam…
3 C 1-2 p-q alpha-beta

Resources