Sum a set of numeric columns and collapse string column by group - r

Suppose I have a data frame (df) like this:
Names ID Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1 10 5 10 5 10
2: Gen2 id2 1 2 3 4 5
3: Gen1 id3 10 5 10 5 10
4: Gen2 id4 1 2 3 4 5
5: Gen3 id5 7 7 7 7 7
For each 'Names', I would like to sum 'Thing' columns, and collapse the strings in 'ID':
Names ID Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1|id3 20 10 20 10 20
2: Gen2 id2|id4 2 4 6 8 10
3: Gen3 id5 7 7 7 7 7
I am able to achieve this via dplyr:
df1 <- df %>%
group_by(Names)%>%
summarise_each(funs(paste(unique(.), collapse='|')),matches('^\\D+$'))
df2 <- df %>%
group_by(Names)%>%
summarise_each(funs(sum = sum(., na.rm=TRUE)), starts_with('Thing' ))
bind_cols(df1, df2[-1])
However, this solution takes very long since I have a data frame with more than 10k rows and more than 10k column!
Is there any possible solution with data.table?
The closest I have gotten is this here:
> setDT(df)[, c(paste(df$ID,collapse = "-", sep = ""), lapply(.SD, sum, na.rm = TRUE)),
by = Names, .SDcols = !"ID"]
Names Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1-id2-id3-id4-id5 20 10 20 10 20
2: Gen2 id1-id2-id3-id4-id5 2 4 6 8 10
3: Gen3 id1-id2-id3-id4-id5 7 7 7 7 7
Obviously this is not what I am going for since it will collapse all IDs and not just the ones that were aggregated by summarizing via "Names".
I would very much appreciate your help!
Here is the example data:
df <- structure(list(Names = c("Gen1", "Gen2", "Gen1", "Gen2","Gen3"),
ID=c("id1","id2","id3","id4","id5"),
Thing1 = c(10L, 1L, 10L, 1L, 7L),
Thing2 = c(5L, 2L, 5L, 2L,7L),
Thing3 = c(10L, 3L, 10L, 3L, 7L),
Thing4 = c(5L, 4L, 5L,4L, 7L),
Thing5 = c(10L, 5L, 10L, 5L, 7L)),
.Names = c("Names","ID","Thing1", "Thing2", "Thing3", "Thing4", "Thing5"),
class = "data.frame", row.names = c(1:5L))

If you don't heavily rely on data.table you could use aggregate two times and merge the results.
merge(aggregate(.~Names, df[-2], sum), aggregate(ID ~ Names, df, paste, collapse="|"))
# Names Thing1 Thing2 Thing3 Thing4 Thing5 ID
# 1 Gen1 20 10 20 10 20 id1|id3
# 2 Gen2 2 4 6 8 10 id2|id4
# 3 Gen3 7 7 7 7 7 id5

try it this way
use tidyverse
library(tidyverse)
df %>%
group_by(Names) %>%
summarise(across(where(is.character), str_c, collapse = "|"),
across(where(is.numeric), sum, na.rm = T))
# A tibble: 3 x 7
Names ID Thing1 Thing2 Thing3 Thing4 Thing5
<chr> <chr> <int> <int> <int> <int> <int>
1 Gen1 id1|id3 20 10 20 10 20
2 Gen2 id2|id4 2 4 6 8 10
3 Gen3 id5
use data.table
library(data.table)
dt <- copy(df)
setDT(dt)
out_sum <- dt[, lapply(.SD, sum), by = Names, .SDcols=!"ID"]
out_id <- dt[, list(id = sapply(list(ID), paste0, collapse = "|")), by = Names]
merge(out_id, out_sum)
Names id Thing1 Thing2 Thing3 Thing4 Thing5
1: Gen1 id1|id3 20 10 20 10 20
2: Gen2 id2|id4 2 4 6 8 10
3: Gen3 id5 7 7 7 7 7

Related

Join two data frames with partially common and unequal columns

Is there a better way to leverage the power of merge and join in R? Merge looses unique rows and join creates duplicate and partially filled columns.
Dataframe1
Key Col1 Col2 Col3
A 1 2 3
B 2 4 6
Dataframe2
Key Col1 Col2 Col4
A 1 2 4
C 3 6 12
D 4 8 20
Merged Dataframe
Key Col1 Col2 Col3 Col4
A 1 2 3 4
B 2 4 6 <NA>
C 3 6 <NA> 12
D 4 8 <NA> 20
We could bind the datasets with bind_rows and then do a group by summarise or reframe to return only the non-NA rows
library(dplyr)
bind_rows(df1, df2) %>%
group_by(Key) %>%
reframe(across(everything(), ~ .x[!is.na(.x)][1]))
-output
# A tibble: 4 × 5
Key Col1 Col2 Col3 Col4
<chr> <int> <int> <int> <int>
1 A 1 2 3 4
2 B 2 4 6 NA
3 C 3 6 NA 12
4 D 4 8 NA 20
Or may use powerjoin
library(powerjoin)
power_full_join(df1, df2, by = "Key", conflict = coalesce_xy) %>%
select(Key, order(names(.)[-1])+1)
-output
Key Col1 Col2 Col3 Col4
1 A 1 2 3 4
2 B 2 4 6 NA
3 C 3 6 NA 12
4 D 4 8 NA 20
data
df1 <- structure(list(Key = c("A", "B"), Col1 = 1:2, Col2 = c(2L, 4L
), Col3 = c(3L, 6L)), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(Key = c("A", "C", "D"), Col1 = c(1L, 3L, 4L),
Col2 = c(2L, 6L, 8L), Col4 = c(4L, 12L, 20L)),
class = "data.frame", row.names = c(NA,
-3L))
Here is an option with full_join combined with coalesce:
library(dplyr)
full_join(df1, df2, by="Key") %>%
mutate(Col1 = coalesce(Col1.x, Col1.y),
Col2 = coalesce(Col2.x, Col2.y), .before="Col3") %>%
select(-contains("."))
Key Col1 Col2 Col3 Col4
1 A 1 2 3 4
2 B 2 4 6 NA
3 C 3 6 NA 12
4 D 4 8 NA 20

Using complete to fill groups with NA to have same length as the maximum group

I have this dataframe:
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 3L), var = c("A", "B",
"C", "B", "C", "C")), class = "data.frame", row.names = c(NA,
-6L))
id var
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 3 C
I would like to get this dataframe:
id var
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
I would like to learn how to use complete or expand.grid in this situation
I have tried several ways but was not successful: One of my tries:
df %>%
complete(id, var, fill=list(NA))
Create a duplicate column of 'var' and then do the complete on the other column, which makes the NA in the 'var' column and then remove the duplicate 'var' column
library(dplyr)
library(tidyr)
df %>%
mutate(var1 = var) %>%
complete(id, var1) %>%
select(-var1)
-output
# A tibble: 9 × 2
id var
<int> <chr>
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C

Transpose Rows in batches to Columns in R

My data.frame df looks like this:
A 1
A 2
A 5
B 2
B 3
B 4
C 3
C 7
C 9
I want it to look like this:
A B C
1 2 3
2 3 7
5 4 9
I have tried spread() but probably not in the right way. Any ideas?
We can use unstack from base R
unstack(df1, col2 ~ col1)
# A B C
#1 1 2 3
#2 2 3 7
#3 5 4 9
Or with split
data.frame(split(df1$col2, df1$col1))
Or if we use spread or pivot_wider, make sure to create a sequence column
library(dplyr)
library(tidyr)
df1 %>%
group_by(col1) %>%
mutate(rn = row_number()) %>%
ungroup %>%
pivot_wider(names_from = col1, values_from = col2) %>%
# or use
# spread(col1, col2) %>%
select(-rn)
# A tibble: 3 x 3
# A B C
# <int> <int> <int>
#1 1 2 3
#2 2 3 7
#3 5 4 9
Or using dcast
library(data.table)
dcast(setDT(df1), rowid(col1) ~ col1)[, .(A, B, C)]
data
df1 <- structure(list(col1 = c("A", "A", "A", "B", "B", "B", "C", "C",
"C"), col2 = c(1L, 2L, 5L, 2L, 3L, 4L, 3L, 7L, 9L)),
class = "data.frame", row.names = c(NA,
-9L))
In data.table, we can use dcast :
library(data.table)
dcast(setDT(df), rowid(col1)~col1, value.var = 'col2')[, col1 := NULL][]
# A B C
#1: 1 2 3
#2: 2 3 7
#3: 5 4 9

Random sampling in R within Categorical variable

Suppose I have a data frame with categorical variable of n classes and a numerical variable. I need to randomize the numerical variable within each category. For example , consider the following table:
Col_1 Col_2
A 2
A 5
A 4
A 8
B 1
B 4
B 9
B 7
When I tried sample() function in R, it threw the result considering both the categories. Is there any function where I can get this kind of output? (with or without replacement, doesn't matter)
Col_1 Col_2
A 8
A 4
A 2
A 5
B 9
B 7
B 4
B 1
You could sample row numbers within groups. In base R, we can use ave
df[with(df, ave(seq_len(nrow(df)), Col_1, FUN = sample)), ]
# Col_1 Col_2
#2 A 5
#4 A 8
#1 A 2
#3 A 4
#7 B 9
#5 B 1
#8 B 7
#6 B 4
In dplyr, we can use sample_n
library(dplyr)
df %>% group_by(Col_1) %>% sample_n(n())
data
df <- structure(list(Col_1 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("A", "B"), class = "factor"), Col_2 = c(2L, 5L,
4L, 8L, 1L, 4L, 9L, 7L)), class = "data.frame", row.names = c(NA, -8L))
Here's a dplyr solution:
library(dplyr)
set.seed(2)
dat %>%
group_by(Col_1) %>%
mutate(Col_2 = sample(Col_2)) %>%
ungroup()
# # A tibble: 8 x 2
# Col_1 Col_2
# <chr> <int>
# 1 A 2
# 2 A 4
# 3 A 5
# 4 A 8
# 5 B 7
# 6 B 9
# 7 B 1
# 8 B 4
A data.table method:
library(data.table)
datDT <- as.data.table(dat)
set.seed(2)
datDT[, Col_2 := sample(Col_2), by = "Col_1"]
datDT
# Col_1 Col_2
# 1: A 2
# 2: A 4
# 3: A 5
# 4: A 8
# 5: B 7
# 6: B 9
# 7: B 1
# 8: B 4
Data
dat <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
Col_1 Col_2
A 2
A 5
A 4
A 8
B 1
B 4
B 9
B 7")

R split each row of a dataframe into two rows

I would like to splite each row of a data frame(numberic) into two rows. For example, part of the original data frame like this (nrow(original datafram) > 2800000):
ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47
And after spliting each row, we can get:
ID X Y Z
1 3 2 6
22 54 NA NA
6 11 5 9
52 71 NA NA
3 7 2 5
2 34 NA NA
5 10 7 1
23 47 NA NA
the "value_1" and "value_2" columns are split and each element is set to a new row. For example, value_1 = 22 and value_2 = 54 are set to a new row.
Here is one option with data.table. We convert the 'data.frame' to 'data.table' by creating a column of rownames (setDT(df1, keep.rownames = TRUE)). Subset the columns 1:5 and 1, 6, 7 in a list, rbind the list element with fill = TRUE option to return NA for corresponding columns that are not found in one of the datasets, order by the row number ('rn') and assign (:=) the row number column to 'NULL'.
library(data.table)
setDT(df1, keep.rownames = TRUE)[]
rbindlist(list(df1[, 1:5, with = FALSE], setnames(df1[, c(1, 6:7),
with = FALSE], 2:3, c("ID", "X"))), fill = TRUE)[order(rn)][, rn:= NULL][]
# ID X Y Z
#1: 1 3 2 6
#2: 22 54 NA NA
#3: 6 11 5 9
#4: 52 71 NA NA
#5: 3 7 2 5
#6: 2 34 NA NA
#7: 5 10 7 1
#8: 23 47 NA NA
A hadleyverse corresponding to the above logic would be
library(dplyr)
tibble::rownames_to_column(df1[1:4]) %>%
bind_rows(., setNames(tibble::rownames_to_column(df1[5:6]),
c("rowname", "ID", "X"))) %>%
arrange(rowname) %>%
select(-rowname)
# ID X Y Z
#1 1 3 2 6
#2 22 54 NA NA
#3 6 11 5 9
#4 52 71 NA NA
#5 3 7 2 5
#6 2 34 NA NA
#7 5 10 7 1
#8 23 47 NA NA
data
df1 <- structure(list(ID = c(1L, 6L, 3L, 5L), X = c(3L, 11L, 7L, 10L
), Y = c(2L, 5L, 2L, 7L), Z = c(6L, 9L, 5L, 1L), value_1 = c(22L,
52L, 2L, 23L), value_2 = c(54L, 71L, 34L, 47L)), .Names = c("ID",
"X", "Y", "Z", "value_1", "value_2"), class = "data.frame",
row.names = c(NA, -4L))
Here's a (very slow) pure R solution using no extra packages:
# Replicate your matrix
input_df <- data.frame(ID = rnorm(10000),
X = rnorm(10000),
Y = rnorm(10000),
Z = rnorm(10000),
value_1 = rnorm(10000),
value_2 = rnorm(10000))
# Preallocate memory to a data frame
output_df <- data.frame(
matrix(
nrow = nrow(input_df)*2,
ncol = ncol(input_df)-2))
# Loop through each row in turn.
# Put the first four elements into the current
# row, and the next two into the current+1 row
# with two NAs attached.
for(i in seq(1, nrow(output_df), 2)){
output_df[i,] <- input_df[i, c(1:4)]
output_df[i+1,] <- c(input_df[i, c(5:6)],NA,NA)
}
colnames(output_df) <- c("ID", "X", "Y", "Z")
Which results in
> head(output_df)
X1 X2 X3 X4
1 0.5529417 -0.93859275 2.0900276 -2.4023800
2 0.9751090 0.13357075 NA NA
3 0.6753835 0.07018647 0.8529300 -0.9844643
4 1.6405939 0.96133195 NA NA
5 0.3378821 -0.44612782 -0.8176745 0.2759752
6 -0.8910678 -0.37928353 NA NA
This should work
data <- read.table(text= "ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47", header=T)
data1 <- data[,1:4]
data2 <- setdiff(data,data1)
names(data2) <- names(data1)[1:ncol(data2)]
combined <- plyr::rbind.fill(data1,data2)
n <- nrow(data1)
combined[kronecker(1:n, c(0, n), "+"),]
Though why you would need to do this beats me.

Resources