How to make groups in a data.frame equal length? - r

I have this data.frame:
df <- data.frame(id=c('A','A','B','B','B','C'), amount=c(45,66,99,34,71,22))
id | amount
-----------
A | 45
A | 66
B | 99
B | 34
B | 71
C | 22
which I need to expand so that each by group in the data.frame is of equal length (filling it out with zeroes), like so:
id | amount
-----------
A | 45
A | 66
A | 0 <- added
B | 99
B | 34
B | 71
C | 22
C | 0 <- added
C | 0 <- added
What is the most efficient way of doing this?
NOTE
Benchmarking the some of the solutions provided with my actual 1 million row data.frame I got:
plyr | data.table | unstack
-----------------------------------
Elapsed: 139.87s | 0.09s | 2.00s

One way using data.table
df <- structure(list(V1 = structure(c(1L, 1L, 2L, 2L, 2L, 3L),
.Label = c("A ", "B ", "C "), class = "factor"),
V2 = c(45, 66, 99, 34, 71, 22)),
.Names = c("V1", "V2"),
class = "data.frame", row.names = c(NA, -6L))
require(data.table)
dt <- data.table(df, key="V1")
# get maximum index
idx <- max(dt[, .N, by=V1]$N)
# get final result
dt[, list(V2 = c(V2, rep(0, idx-length(V2)))), by=V1]
# V1 V2
# 1: A 45
# 2: A 66
# 3: A 0
# 4: B 99
# 5: B 34
# 6: B 71
# 7: C 22
# 8: C 0
# 9: C 0

I'm sure there is a base R solution, but here is one that uses ddply in the plyr package
library(plyr)
##N: How many values should be in each group
N = 3
ddply(df, "id", summarize,
amount = c(amount, rep(0, N-length(amount))))
gives:
id amount
1 A 45
2 A 66
3 A 0
4 B 99
5 B 34
6 B 71
7 C 22
8 C 0
9 C 0

Here's another way in base R using unstack and stack.
# ensure character id col
df <- transform(df, id=as.character(id))
# break into a list by id
u <- unstack(df, amount ~ id)
# get max length
max.len <- max(sapply(u, length))
# pad the short ones with 0s
filled <- lapply(u, function(x) c(x, numeric(max.len - length(x))))
# recombine into data.frame
stack(filled)
# values ind
# 1 45 A
# 2 66 A
# 3 0 A
# 4 99 B
# 5 34 B
# 6 71 B
# 7 22 C
# 8 0 C
# 9 0 C

How about this?
out <- by(df, INDICES = df$id, FUN = function(x, N) {
x <- droplevels(x)
lng <- nrow(x)
dif <- N - lng
if (dif == 0) return(x)
make.list <- lapply(1:dif, FUN = function(y) data.frame(id = levels(x$id), amount = 0))
rbind(x, do.call("rbind", make.list))
}, N = max(table(df$id))) # N could also be an integer
do.call("rbind", out)
id amount
A.1 A 45
A.2 A 66
A.3 A 0
B.3 B 99
B.4 B 34
B.5 B 71
C.6 C 22
C.2 C 0
C.3 C 0

Here is a dplyr option:
library(dplyr)
# Get maximum number of rows for all groups
N = max(count(df,id)$n)
df %>%
group_by(id) %>%
summarise(amount = c(amount, rep(0, N-length(amount))), .groups = "drop")
Output
id amount
<chr> <dbl>
1 A 45
2 A 66
3 A 0
4 B 99
5 B 34
6 B 71
7 C 22
8 C 0
9 C 0

Related

Identify pairs or groups of rows that have the same values across multiple columns

Say I have a data.frame:
file = read.table(text = "sex age num
M 32 5
F 31 2
M 91 2
M 30 1
M 23 1
F 19 1
F 31 2
F 21 2
M 32 5
F 65 3
M 24 5", header = T, sep = "")
I want to get a sorted data frame of all rows that have the exact same values of sex, age, and num with any other row in the data frame.
The result should look like this (note that the data frame is sorted by the pairs or groups that are duplicated with each other):
result = read.table(text = "sex age num
M 32 5
M 32 5
F 31 2
F 31 2", header = T, sep = "")
I have tried various combinations of distinct in dplyr and duplicated, but they don't quite get at this use case.
We need duplicated twice i.e. one duplicated in the normal direction from up to bottom and second from bottom to top (fromLast = TRUE) and then use | so that it can be TRUE in either direction for subsetting
out <- file[duplicated(file)|duplicated(file, fromLast = TRUE),]
out$sex <- factor(out$sex, levels = c("M", "F"))
out1 <- out[do.call(order, out),]
row.names(out1) <- NULL
-output
> out1
sex age num
1 M 32 5
2 M 32 5
3 F 31 2
4 F 31 2
The above can be written in tidyverse
library(dplyr)
file %>%
arrange(sex == "F", across(everything())) %>%
filter(duplicated(.)|duplicated(., fromLast = TRUE))
sex age num
1 M 32 5
2 M 32 5
3 F 31 2
4 F 31 2
An alternative approach:
Here all groups with more then 1 nrow will be kept:
library(dplyr)
file %>%
group_by(sex, age, num) %>%
filter(n() > 1) %>%
arrange(.by_group = T)
ungroup()
sex age num
<chr> <int> <int>
1 F 31 2
2 F 31 2
3 M 32 5
4 M 32 5
file = read.table(text = "sex age num
M 32 5
F 31 2
M 91 2
M 30 1
M 23 1
F 19 1
F 31 2
F 21 2
M 32 5
F 65 3
M 24 5", header = T, sep = "")
library(vctrs)
library(dplyr, warn = F)
#> Warning: package 'dplyr' was built under R version 4.1.2
file %>%
filter(vec_duplicate_detect(.)) %>%
arrange(across(everything()))
#> sex age num
#> 1 F 31 2
#> 2 F 31 2
#> 3 M 32 5
#> 4 M 32 5
Created on 2022-08-19 by the reprex package (v2.0.1.9000)
A base R option using subset + ave
> subset(file, ave(seq_along(num), sex, age, num, FUN = length) > 1)
sex age num
1 M 32 5
2 F 31 2
7 F 31 2
9 M 32 5
or rbind + split
> do.call(rbind, Filter(function(x) nrow(x) > 1, split(file, ~ sex + age + num)))
sex age num
F.31.2.2 F 31 2
F.31.2.7 F 31 2
M.32.5.1 M 32 5
M.32.5.9 M 32 5
Here is an approach, using .SD[.N>1] by group in data.table
library(data.table)
result = setDT(file)[, i:=.I][, .SD[.N>1],.(sex,age,num)][, i:=NULL]
Output:
sex age num
1: M 32 5
2: M 32 5
3: F 31 2
4: F 31 2

Matching based on different independent tables using data.table in R

I would like to match multiple conditions from independent data tables onto my main data table.
How can I do this using the data.table package?
What would be the most efficient/fastest way?
I have a mock example, with some mock conditions here to illustrate my question:
main_data <- data.frame( pnum = c(1,2,3,4,5,6,7,8,9,10),
age = c(24,35,43,34,55,24,36,43,34,54),
gender = c("f","m","f","f","m","f","m","f","f","m"))
data_1 <- data.frame( pnum = c(1,4,5,8,9),
value_data_1 = c(1, 2, 1, 1, 1),
date = as.Date(c("2019-01-01", "2018-07-01", "2018-01-01", "2016-07-01", "2016-07-01")))
data_2 <- data.frame( pnum = c(1,5,7,8,9),
value_data_2 = c(1, 2, 1, 1, 2),
date = as.Date(c("2019-01-01", "2018-07-01", "2018-01-01", "2016-07-01", "2016-07-01")))
I would like to create a new variable in my main_data table called "matching" of those rows that match between data_1 and data_2 under multiple conditions:
First, the value of data_1$value_data_1 has to be equal to 1.
Second, the value of data_2$value_data_2 also has to be equal to 1.
Third, the pnum and the date should match between data_1 and data_2.
When all these conditions are met, I would expect the new output of main_data to look like this:
> main_data
pnum age gender matching
1 1 24 f 1
2 2 35 m 0
3 3 43 f 0
4 4 34 f 0
5 5 55 m 0
6 6 24 f 0
7 7 36 m 0
8 8 43 f 1
9 9 34 f 0
10 10 54 m 0
Until now, I programmed each condition seperately and created new placeholder tables in between, but this is not very memory efficient. Is there an efficient way to chain all the conditions using the data.tables package specifically?
Here's one way:
library(data.table)
library(magrittr)
setDT(main_data)
setDT(data_1)
setDT(data_2)
main_data %>%
data_1[., on = .(pnum == pnum) ] %>%
data_2[., on = .(pnum == pnum, date == date) ] %>%
.[, matching := fcoalesce(+(value_data_1 == 1 & value_data_2 == 1), 0L) ] %>%
.[, .(pnum, age, gender, matching) ]
# pnum age gender matching
# 1: 1 24 f 1
# 2: 2 35 m 0
# 3: 3 43 f 0
# 4: 4 34 f 0
# 5: 5 55 m 0
# 6: 6 24 f 0
# 7: 7 36 m 0
# 8: 8 43 f 1
# 9: 9 34 f 0
# 10: 10 54 m 0
I used the magrittr package because I find it useful for portraying the flow code. It is not at all required, and the alternative pipeline for data.table for the same code could be:
data_2[
data_1[main_data, on = .(pnum == pnum) ]
,on = .(pnum == pnum, date == date)
][ ,matching := fcoalesce(+(value_data_1 == 1 & value_data_2 == 1), 0L)
][ ,.(pnum, age, gender, matching) ]
There are other ways to break it out, including the use of temporary (mid-step) variables. (This is mostly style and personal preference.)
You can use something like Reduce(merge, list(...))
library(data.table)
setDT(main_data); setDT(data_1); setDT(data_2)
res <- Reduce(function(x, y) {
merge(x, y, by = "pnum", all.x = TRUE)
}, list(main_data, data_1[, -"date"], data_2[, -"date"]))[, `:=`(
matching = 1L - (value_data_1 != 1 | value_data_2 != 1 | is.na(value_data_1) | is.na(value_data_2)),
value_data_1 = NULL,
value_data_2 = NULL
)]
Output
> res[]
pnum age gender matching
1: 1 24 f 1
2: 2 35 m 0
3: 3 43 f 0
4: 4 34 f 0
5: 5 55 m 0
6: 6 24 f 0
7: 7 36 m 0
8: 8 43 f 1
9: 9 34 f 0
10: 10 54 m 0

multiple rows in one after join

i have to join two dataframes. the joining is working properly, my only problem is:
sometimes i have 2 rows in a dataset, that have to be joined with the other dataset. after the joining the rows are displayed directly one below the other. I need them in one row:
here a small example:
df1 <- data.frame(A = c(1:5),
B = c(12, 13, 14, 15, 16),
C = c("a", "b", "c", "d", "e"))
df2 <- data.frame(A = c(1,1,2,2,3),
E = c(112, 145, 546, 674, 287),
J = c("t", "e", "v", "p", "s"))
merged <- inner_join(df1, df2, by = "A")
A | B | c | E | J |
---------------------
1 | 12| a | 112 | t
1 | 12| a | 145 | e
2 | 13| b | 546 | v
2 | 13| b | 674 | p
3 | 14| c | 287 | s
i need it this way:
A | B | c | E.x | J.x | E.y | J.y
--------------------------------
1 | 12| a | 112 | t | 145 | e
2 | 13| b | 546 | v | 674 | p
all joined rows together in one row in the new dataset
thank you for your help
derlu
**update:**
My original data frames have
354 items with 1535 observation
246 items 203 observation
They are merged by the first column id_merge. Sometimes I have 2 matches, sometimes 3 identical „id_merge“ values.
One approach is to split up the dataframe by occurrence of each key, and then do another join. First, we create a key (n) and split up the dataframe:
merged <- merged %>%
group_by(A, B, C) %>%
mutate(n = 1:n())
# A B C E J n
# <dbl> <dbl> <fct> <dbl> <fct> <int>
# 1 1 12 a 112 t 1
# 2 1 12 a 145 e 2
# 3 2 13 b 546 v 1
# 4 2 13 b 674 p 2
# 5 3 14 c 287 s 1
merged_list <- split(merged, merged$n) %>%
map(select, -n)
# $`1`
# # A tibble: 3 x 5
# # Groups: A [3]
# A B C E J
# <dbl> <dbl> <fct> <dbl> <fct>
# 1 1 12 a 112 t
# 2 2 13 b 546 v
# 3 3 14 c 287 s
# $`2`
# # A tibble: 2 x 5
# # Groups: A [2]
# A B C E J
# <dbl> <dbl> <fct> <dbl> <fct>
# 1 1 12 a 145 e
# 2 2 13 b 674 p
Then use Reduce to join the dataframes:
Reduce(function(x, y) left_join(x, y, by = c("A", "B", "C")),
merged_list)
# A B C E.x J.x E.y J.y
# <dbl> <dbl> <fct> <dbl> <fct> <dbl> <fct>
# 1 1 12 a 112 t 145 e
# 2 2 13 b 546 v 674 p
# 3 3 14 c 287 s NA NA
It's not cool these days, but Base R's reshape actually shines for this as it deals with unequal group sizes and labelling all in one go. You need to add a time variable first though to label each group:
merged <- merge(df1, cbind(df2, time=ave(df2$A, df2$A, FUN=seq_along)), by="A")
reshape(merged, idvar=names(df1), direction="wide", sep="_")
# A B C E_1 J_1 E_2 J_2
#1 1 12 a 112 t 145 e
#3 2 13 b 546 v 674 p
#5 3 14 c 287 s NA <NA>
It actually fits in a dplyr pipeline not too bad:
df2 %>%
group_by(A) %>%
mutate(time=row_number()) %>%
inner_join(df1,.,by="A") %>%
reshape(idvar=names(df1), direction="wide", sep="_")
With data.table, you can split each group of (E, J) rows having the same (A, B, C) values by row, and cbind that list together. Here .I is the row number, .N is the number of rows in the group, and .SD is the group of rows.
This only works if the number of repeated rows is the same in all groups (except for groups with 1 row)
library(data.table)
setDT(merged)
merged[, if(.N > 1) do.call(cbind, split(.SD, .I))
, by = .(A, B, C)]
# A B C 1.E 1.J 2.E 2.J
# 1: 1 12 a 112 t 145 e
# 2: 2 13 b 546 v 674 p
It will be like this and applicable for larger datasets as well
df1 <- data.frame(A = c(1:5),
B = c(12, 13, 14, 15, 16),
C = c("a", "b", "c", "d", "e"))
df2 <- data.frame(A = c(1,1,2,2,3),
E = c(112, 145, 546, 674, 287),
J = c("t", "e", "v", "p", "s"))
df3 <- merge(df1, df2, by=c("A"))
df4 <- data.frame()
for (j in 1:(nrow(df3)-1))
for (k in (j+1):nrow(df3))
for (i in 2:ncol(df3))
if(df3[j,1] == df3[k,1] && all(df3[j,i:ncol(df3)]!=df3[k,i:ncol(df3)])){
if(nrow(df4) == 0)
df4 <- data.frame(c(df3[j,],df3[k,i:ncol(df3)]))
else
df4 <- rbind(df4,c(df3[j,],df3[k,i:ncol(df3)]))
break
}

Aggregate dataframe in rolling blocks of 3 rows

I have the following data frame as an example
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
> df
score total1 total2
1 a 1 16
2 b 2 17
3 c 3 18
4 d 4 19
5 e 5 20
6 f 6 21
7 g 7 22
8 h 8 23
9 i 9 24
10 j 10 25
11 k 11 26
12 l 12 27
13 m 13 28
14 n 14 29
15 o 15 30
I would like to aggregate my data frame by sum by grouping the rows having different name, i.e.
groups sum1 sum2
'a-b-c' 6 51
'c-d-e' 21 60
etc
All the given answers to this kind of question assume that the strings repeat in the row.
The usual aggregate function that I use to obtain the summary delivers a different result:
aggregate(df$total1, by=list(sum1=df$score %in% c('a','b','c'), sum2=df$score %in% c('d','e','f')), FUN=sum)
sum1 sum2 x
1 FALSE FALSE 99
2 TRUE FALSE 6
3 FALSE TRUE 15
If you want a tidyverse solution, here is one possibility:
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
df %>%
mutate(groups = case_when(
score %in% c("a","b","c") ~ "a-b-c",
score %in% c("d","e","f") ~ "d-e-f"
)) %>%
group_by(groups) %>%
summarise_if(is.numeric, sum)
returns
# A tibble: 3 x 3
groups total1 total2
<chr> <int> <int>
1 a-b-c 6 51
2 d-e-f 15 60
3 <NA> 99 234
Add a "groups" column with the category value.
df$groups = NA
and then define each group like this:
df$groups[df$score=="a" | df$score=="b" | df$score=="c" ] = "a-b-c"
Finally aggregate by that column.
Here's a solution that works for any sized data frame.
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
# I'm adding a row to demonstrate that the grouping pattern works when the
# number of rows is not equally divisible by 3.
df <- rbind(df, data.frame(score = letters[16], total1 = 16, total2 = 31))
# A vector that represents the correct groupings for the data frame.
groups <- c(rep(1:floor(nrow(df) / 3), each = 3),
rep(floor(nrow(df) / 3) + 1, nrow(df) - length(1:(nrow(df) / 3)) * 3))
# Your method of aggregation by `groups`. I'm going to use `data.table`.
require(data.table)
dt <- as.data.table(df)
dt[, group := groups]
aggDT <- dt[, list(score = paste0(score, collapse = "-"),
total1 = sum(total1), total2 = sum(total2)), by = group][
, group := NULL]
aggDT
score total1 total2
1: a-b-c 6 51
2: d-e-f 15 60
3: g-h-i 24 69
4: j-k-l 33 78
5: m-n-o 42 87
6: p 16 31

How to recode multiple columns in R

I tried my best to recode multiple columns, but I still struggle to do it. Here what I have done:
df<-read.table(text="ZR1 Time1 ZR2 Time2 ZR3 Time3
A 60 A 56 B 44
C 61 B 44 D 78
D 62 C 78 E 66
E 58 D 46 B 45
A 54 B 23 B 23
A 57 E 24 B 100",h=T)
What I have done
for (i in 1) {
ZRi<-paste0("ZR", i)
Zi<-paste0("Z",i)}
df[,Zi]=c(A=4,B=3,C=2,D=1,E=0)
df[,Zi]=c(A=4,B=3,C=2,D=1,E=0)[df[,ZRi]]
I got this:
ZR1 Time1 ZR2 Time2 ZR3 Time3 Z1
1 A 60 A 56 B 44 4
2 C 61 B 44 D 78 3
3 D 62 C 78 E 66 2
4 E 58 D 46 B 45 1
5 A 54 B 23 B 23 4
6 A 57 E 24 B 100 4
As you can see, I could get Z1, which is wrong.
I want to get this:
ZR1 Time1 ZR2 Time2 ZR3 Time3 Z1 Z2 Z3
A 60 A 56 B 44 4 4 3
C 61 B 44 D 78 2 3 1
D 62 C 78 E 66 1 2 0
E 58 D 46 B 45 0 1 3
A 54 B 23 B 23 4 3 3
A 57 E 24 B 100 4 0 3
Here's the base approach (and probably fastest). You are just using the values of the ZR columns as an index into c(A=4,B=3,C=2,D=1,E=0) which becomes a translation table and then assigning those results to new columns in df:
df[ paste0("Z", 1:3) ] <-
lapply( df[ , grepl("^ZR", names(df))] , # passes "ZR" columns one-at-a-time
function(x) {c(A=4,B=3,C=2,D=1,E=0)[as.character(x)]})
Depending on what was intended as the purpose for these new columns, #User60 should be aware that this delivers numeric vectors
By playing with levels and labels you can get this:
for (i in 1:3) {
df[[paste0("Z",i)]] <-
factor(df[[paste0("ZR", i)]],levels=LETTERS[1:5],labels=4:0)
}
df
# ZR1 Time1 ZR2 Time2 ZR3 Time3 Z1 Z2 Z3
# 1 A 60 A 56 B 44 4 4 3
# 2 C 61 B 44 D 78 2 3 1
# 3 D 62 C 78 E 66 1 2 0
# 4 E 58 D 46 B 45 0 1 3
# 5 A 54 B 23 B 23 4 3 3
# 6 A 57 E 24 B 100 4 0 3
The created columns with this method will be factors, to have numeric instead use the following:
for (i in 1:3) {
df[[paste0("Z",i)]] <-
as.numeric(as.character(factor(df[[paste0("ZR", i)]],levels=LETTERS[1:5],labels=4:0)))
}
Maybe this one one-liner with dplyr could help
df %>%
mutate_at(setNames(paste0("ZR", 1:3), paste0("Z", 1:3)),
~5-as.numeric(factor(.x, levels = LETTERS[1:5])))
The trick here is to pass named vector to mutate_at to create new columns. You can coerce factor to numeric if you pre-specified the levels.
An alternative solution using dplyr + magrittr packages
library(dplyr); library(magrittr)
df2 <- select(df, starts_with("ZR")) %>%
lapply(as.character) %>%
mapply(`[`, list(c(A=4,B=3,C=2,D=1,E=0)), .) %>%
data.frame(df, .)
names(df2)[ncol(df2)-2:0] <- paste0("Z", 1:3)
Here's a more dplyr-esque method. Useful for recoding when the output isn't an integer.
library(dplyr)
# Make lookup table
lookup <- data.frame(let = LETTERS[1:5], num = 4:0, stringsAsFactors = F)
# Join with lookup table
df %>%
left_join(lookup, by = c('ZR1' = 'let')) %>%
left_join(lookup, by = c('ZR2' = 'let')) %>%
left_join(lookup, by = c('ZR3' = 'let')) %>%
rename_at(vars(matches('num')), ~paste0('Z', 1:3))
Or, with data.table
library(data.table)
lookup <- data.frame(let = LETTERS[1:5], num = 4:0, stringsAsFactors = F)
setDT(df)
df[, paste0('Z', 1:3) := lapply(df[,paste0('ZR', 1:3)],
function(x) lookup$num[match(x, lookup$let)])]

Resources