Aggregate dataframe in rolling blocks of 3 rows - r

I have the following data frame as an example
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
> df
score total1 total2
1 a 1 16
2 b 2 17
3 c 3 18
4 d 4 19
5 e 5 20
6 f 6 21
7 g 7 22
8 h 8 23
9 i 9 24
10 j 10 25
11 k 11 26
12 l 12 27
13 m 13 28
14 n 14 29
15 o 15 30
I would like to aggregate my data frame by sum by grouping the rows having different name, i.e.
groups sum1 sum2
'a-b-c' 6 51
'c-d-e' 21 60
etc
All the given answers to this kind of question assume that the strings repeat in the row.
The usual aggregate function that I use to obtain the summary delivers a different result:
aggregate(df$total1, by=list(sum1=df$score %in% c('a','b','c'), sum2=df$score %in% c('d','e','f')), FUN=sum)
sum1 sum2 x
1 FALSE FALSE 99
2 TRUE FALSE 6
3 FALSE TRUE 15

If you want a tidyverse solution, here is one possibility:
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
df %>%
mutate(groups = case_when(
score %in% c("a","b","c") ~ "a-b-c",
score %in% c("d","e","f") ~ "d-e-f"
)) %>%
group_by(groups) %>%
summarise_if(is.numeric, sum)
returns
# A tibble: 3 x 3
groups total1 total2
<chr> <int> <int>
1 a-b-c 6 51
2 d-e-f 15 60
3 <NA> 99 234

Add a "groups" column with the category value.
df$groups = NA
and then define each group like this:
df$groups[df$score=="a" | df$score=="b" | df$score=="c" ] = "a-b-c"
Finally aggregate by that column.

Here's a solution that works for any sized data frame.
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
# I'm adding a row to demonstrate that the grouping pattern works when the
# number of rows is not equally divisible by 3.
df <- rbind(df, data.frame(score = letters[16], total1 = 16, total2 = 31))
# A vector that represents the correct groupings for the data frame.
groups <- c(rep(1:floor(nrow(df) / 3), each = 3),
rep(floor(nrow(df) / 3) + 1, nrow(df) - length(1:(nrow(df) / 3)) * 3))
# Your method of aggregation by `groups`. I'm going to use `data.table`.
require(data.table)
dt <- as.data.table(df)
dt[, group := groups]
aggDT <- dt[, list(score = paste0(score, collapse = "-"),
total1 = sum(total1), total2 = sum(total2)), by = group][
, group := NULL]
aggDT
score total1 total2
1: a-b-c 6 51
2: d-e-f 15 60
3: g-h-i 24 69
4: j-k-l 33 78
5: m-n-o 42 87
6: p 16 31

Related

Identify pairs or groups of rows that have the same values across multiple columns

Say I have a data.frame:
file = read.table(text = "sex age num
M 32 5
F 31 2
M 91 2
M 30 1
M 23 1
F 19 1
F 31 2
F 21 2
M 32 5
F 65 3
M 24 5", header = T, sep = "")
I want to get a sorted data frame of all rows that have the exact same values of sex, age, and num with any other row in the data frame.
The result should look like this (note that the data frame is sorted by the pairs or groups that are duplicated with each other):
result = read.table(text = "sex age num
M 32 5
M 32 5
F 31 2
F 31 2", header = T, sep = "")
I have tried various combinations of distinct in dplyr and duplicated, but they don't quite get at this use case.
We need duplicated twice i.e. one duplicated in the normal direction from up to bottom and second from bottom to top (fromLast = TRUE) and then use | so that it can be TRUE in either direction for subsetting
out <- file[duplicated(file)|duplicated(file, fromLast = TRUE),]
out$sex <- factor(out$sex, levels = c("M", "F"))
out1 <- out[do.call(order, out),]
row.names(out1) <- NULL
-output
> out1
sex age num
1 M 32 5
2 M 32 5
3 F 31 2
4 F 31 2
The above can be written in tidyverse
library(dplyr)
file %>%
arrange(sex == "F", across(everything())) %>%
filter(duplicated(.)|duplicated(., fromLast = TRUE))
sex age num
1 M 32 5
2 M 32 5
3 F 31 2
4 F 31 2
An alternative approach:
Here all groups with more then 1 nrow will be kept:
library(dplyr)
file %>%
group_by(sex, age, num) %>%
filter(n() > 1) %>%
arrange(.by_group = T)
ungroup()
sex age num
<chr> <int> <int>
1 F 31 2
2 F 31 2
3 M 32 5
4 M 32 5
file = read.table(text = "sex age num
M 32 5
F 31 2
M 91 2
M 30 1
M 23 1
F 19 1
F 31 2
F 21 2
M 32 5
F 65 3
M 24 5", header = T, sep = "")
library(vctrs)
library(dplyr, warn = F)
#> Warning: package 'dplyr' was built under R version 4.1.2
file %>%
filter(vec_duplicate_detect(.)) %>%
arrange(across(everything()))
#> sex age num
#> 1 F 31 2
#> 2 F 31 2
#> 3 M 32 5
#> 4 M 32 5
Created on 2022-08-19 by the reprex package (v2.0.1.9000)
A base R option using subset + ave
> subset(file, ave(seq_along(num), sex, age, num, FUN = length) > 1)
sex age num
1 M 32 5
2 F 31 2
7 F 31 2
9 M 32 5
or rbind + split
> do.call(rbind, Filter(function(x) nrow(x) > 1, split(file, ~ sex + age + num)))
sex age num
F.31.2.2 F 31 2
F.31.2.7 F 31 2
M.32.5.1 M 32 5
M.32.5.9 M 32 5
Here is an approach, using .SD[.N>1] by group in data.table
library(data.table)
result = setDT(file)[, i:=.I][, .SD[.N>1],.(sex,age,num)][, i:=NULL]
Output:
sex age num
1: M 32 5
2: M 32 5
3: F 31 2
4: F 31 2

Dynamic Columns in Dplyr using NSE on the RHS

I am attempting to reference existing columns in dplyr through a loop. Effectively, I would like to evaluate the operations from one table (evaluation in below example) to be performed to another table (dt in below example). I do not want to hardcode the column names on the RHS within mutate(). I would like to control the evaluations being performed from the evaluation table below. So I am trying to make the process dynamic.
Here is a sample dataframe:
dt = data.frame(
A = c(1:20),
B = c(11:30),
C = c(21:40),
AA = rep(1, 20),
BB = rep(2, 20)
)
Here is a table of sample operations to be performed:
evaluation = data.frame(
New_Var = c("AA", "BB"),
Operation = c("(A*2) > B", "(B*2) <= C"),
Result = c("True", "False")
) %>% mutate_all(as.character)
What I am trying to do is the following:
for (i in 1:nrow(evaluation)) {
var = evaluation$New_Var[i]
dt = dt %>%
rowwise() %>%
mutate(!!var := ifelse(eval(parse(text = evaluation$Operation[i])),
evaluation$Result[i],
!!var))
}
my desired result would be something like this except for the "AA" in the AA column would be the original numeric values of the AA column of 1, 1, 1, 1, 1.
UPDATED:
I believe my syntax in the "False" part of the ifelse statement is incorrect. What is the correct syntax to specify "!!var" in the false portion of the ifelse statement?
I know there are other ways to do it using base R, but I would rather do it through dplyr as it is cleaner code to look at. I am leveraging "rowise()" to do it element by element.
Modified data to (a) enforce type consistency for columns AA and BB and (b) ensure that at least one row satisfies the second condition.
dt = tibble(
A = c(1:20),
B = c(10:29), ## Note the change
C = c(21:40),
AA = rep("a", 20), ## Note initialization with strings
BB = rep("b", 20) ## Ditto
)
To make your loop work, you need to convert your code strings into actual expressions. You can use rlang::sym() for variable names and rlang::parse_expr() for everything else.
for( i in 1:nrow(evaluation) )
{
var <- rlang::sym(evaluation$New_Var[i])
op <- rlang::parse_expr(evaluation$Operation[i])
dt = dt %>% rowwise() %>%
mutate(!!var := ifelse(!!op, evaluation$Result[i],!!var))
}
# # A tibble: 20 x 5
# A B C AA BB
# <int> <int> <int> <chr> <chr>
# 1 1 10 21 a False
# 2 2 11 22 a False
# 3 3 12 23 a b
# 4 4 13 24 a b
# 5 5 14 25 a b
# 6 6 15 26 a b
# 7 7 16 27 a b
# 8 8 17 28 a b
# 9 9 18 29 a b
# 10 10 19 30 True b
# 11 11 20 31 True b
# 12 12 21 32 True b
# 13 13 22 33 True b
# 14 14 23 34 True b
# 15 15 24 35 True b
# 16 16 25 36 True b
# 17 17 26 37 True b
# 18 18 27 38 True b
# 19 19 28 39 True b
# 20 20 29 40 True b
Assuming that Felipe's answer was the functionality you desired, here's a more "tidyverse"/pipe-oriented/functional approach.
Data
library(rlang)
library(dplyr)
library(purrr)
operations <- tibble(
old_var = exprs(A, B),
new_var = exprs(AA, BB),
test = exprs(2*A > B, 2*B <= C),
result = exprs("True", "False")
)
original <- tibble(
A = sample.int(30, 10),
B = sample.int(30, 10),
C = sample.int(30, 10)
)
original
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 4 20 5
2 30 29 11
3 1 27 14
4 2 21 4
5 17 19 24
6 14 25 9
7 5 22 22
8 6 13 7
9 25 4 21
10 12 11 12
Functions
# Here's your reusable functions
generic_mutate <- function(dat, new_var, test, result, old_var) {
dat %>% mutate(!!new_var := ifelse(!!test, !!result, !!old_var))
}
generic_ops <- function(dat, ops) {
pmap(ops, generic_mutate, dat = dat) %>%
reduce(full_join)
}
generic_mutate takes a single original dataframe, a single new_var, etc. It performs the test, adds the new column with the appropriate name and values.
generic_ops is the "vectorized" version. It takes the original dataframe as the first argument, and a dataframe of operations as the second. It then parallel maps over each column of new variable names, tests, etc, and calls generic_mutate on each one. That results in a list of dataframes, each with one added column. The reduce then combines them back all together with a sequential full_join.
Results
original %>%
generic_ops(operations)
Joining, by = c("A", "B", "C")
# A tibble: 10 x 5
A B C AA BB
<int> <int> <int> <chr> <chr>
1 4 20 5 4 20
2 30 29 11 True 29
3 1 27 14 1 27
4 2 21 4 2 21
5 17 19 24 True 19
6 14 25 9 True 25
7 5 22 22 5 22
8 6 13 7 6 13
9 25 4 21 True False
10 12 11 12 True 11
The magic here is using exprs(...) so you can store NSE names and operations in a tibble without forcing their evaluation. I think this is a lot cleaner than storing names and operations in strings with quotation marks.
How's this:
evaluation = data.frame(
Old_Var = c('A', 'B'),
New_Var = c("AA", "BB"),
Operation = c("(A*2) > B", "(B*2) <= C"),
Result = c("True", "False")
) %>% mutate_all(as.character)
for (i in 1:nrow(evaluation)) {
old <- sym(evaluation$Old_Var[i])
new <- sym(evaluation$New_Var[i])
op <- sym(evaluation$Operation[i])
res <- sym(evaluation$Result[i])
dt <- dt %>%
mutate(!!new := ifelse(!!op, !!res, !!old))
}
EDIT: My last answer doesn't work because rlang tries to find a variable named !!op (e.g. named (A*2) > B) instead of evaluating the expression. I got this to work using a mix of tidyselect and base R. You can of course follow #Brian's advice and use this solution with pmap. I honestly don't know how well this will perform though, as I think it will evaluate the ifelse once per row, and am not sure it's a vectorized operation...
dt <- tibble(
A = c(1:20),
B = c(11:30),
C = c(21:40),
AA = rep(1, 20),
BB = rep(2, 20)
)
evaluation = tibble(
Old_Var = c('A', 'B'),
New_Var = c("AA", "BB"),
Operation = c('(A*2) > B', '(B*2) <= C'),
Result = c("True", "False")
)
for (i in 1:nrow(evaluation)) {
old <- evaluation$Old_Var[i]
new <- evaluation$New_Var[i]
op <- evaluation$Operation[i]
res <- evaluation$Result[i]
dt <- dt %>%
mutate(!!sym(new) := eval(parse(text = sprintf('ifelse(%s, "%s", %s)', op, res, old))))
}
One way is to rework the conditions first, then pass them to mutate :
conds <- parse(text=evaluation$Operation) %>%
as.list() %>%
setNames(evaluation$New_Var) %>%
imap(~expr(ifelse(!!.,"True", !!sym(.y))))
conds
#> $AA
#> ifelse((A * 2) > B, "True", AA)
#>
#> $BB
#> ifelse((B * 2) <= C, "True", BB)
dt %>% mutate(!!!conds)
#> A B C AA BB
#> 1 1 11 21 1 2
#> 2 2 12 22 1 2
#> 3 3 13 23 1 2
#> 4 4 14 24 1 2
#> 5 5 15 25 1 2
#> 6 6 16 26 1 2
#> 7 7 17 27 1 2
#> 8 8 18 28 1 2
#> 9 9 19 29 1 2
#> 10 10 20 30 1 2
#> 11 11 21 31 True 2
#> 12 12 22 32 True 2
#> 13 13 23 33 True 2
#> 14 14 24 34 True 2
#> 15 15 25 35 True 2
#> 16 16 26 36 True 2
#> 17 17 27 37 True 2
#> 18 18 28 38 True 2
#> 19 19 29 39 True 2
#> 20 20 30 40 True 2

R: apply a function row-wise to a table, passing values from a look-up table as arguments

Here’s a row-wise iteration problem that I’ve been trying to solve with purrr::pmap, but no luck.
I start with a table of raw scores:
rawscore_table <- data.frame(rawscore = 10:14, SS1 = NA, SS2 = NA)
rawscore SS1 SS2
1 10 NA NA
2 11 NA NA
3 12 NA NA
4 13 NA NA
5 14 NA NA
There are two empty columns, SS1 and SS2, whose values I want to obtain by applying a function to each row:
SS1 = rawscore + x + y
SS2 = rawscore + x + y
The values of x and y are found in a lookup table:
lookup_table <- data.frame(SS = c('SS1', 'SS2'), x = 1:2, y = 3:4)
SS x y
1 SS1 1 3
2 SS2 2 4
The solution I’m looking for will calculate the values of column rawscore_table$SS1 by finding the values of x and y in the SS1 row of lookup_table, and it will calculate the values of column rawscore_table$SS2 by finding the values of x and y in the SS2 row of lookup_table.
So the code has to refer to the name of the column in rawscore_table in order to pluck values from the corresponding row of lookup_table.
The desired output looks like this:
rawscore SS1 SS2
1 10 14 16
2 11 15 17
3 12 16 18
4 13 17 19
5 14 18 20
Thanks in advance for any help!
An option would be to get the rowSums of the numeric columns of 'lookup_table', add (+) with the first column of 'rawscore_table', assign the output back to the columns except the first column
rawscore_table[-1] <- lapply(as.list(rowSums(lookup_table[-1])), `+`,
rawscore_table[,1])
Or replicate the rowSums output and add with the first column
rawscore_table[as.character(lookup_table$SS)] <- rawscore_table$rawscore +
rep(rowSums(lookup_table[-1]), each = nrow(rawscore_table))
Or using tidyverse
library(tidyverse)
lookup_table %>%
transmute(SS, xy = x + y) %>%
deframe %>%
as.list %>%
imap_dfc(~
rawscore_table %>%
transmute(!! .y := .x + rawscore)) %>%
bind_cols(rawscore_table[1], .)
# rawscore SS1 SS2
#1 10 14 16
#2 11 15 17
#3 12 16 18
#4 13 17 19
#5 14 18 20
Another option would be to join rawscore and lookup:
rawscore_table %>%
gather(SS, val, -rawscore) %>%
left_join(lookup_table, by = 'SS') %>%
mutate(val = rawscore + x + y, x = NULL, y = NULL) %>%
spread(SS, val)
# rawscore SS1 SS2
#1 10 14 16
#2 11 15 17
#3 12 16 18
#4 13 17 19
#5 14 18 20

Comparing two unequal data frame and selecting common values in the order of second data frame

I have two data frame as follows:
df1<-data.frame(st=c(1,2,3,4),v1=c(12,14,15,75),v2=c(43,32,12,18))
df1
st v1 v2
1 1 12 43
2 2 14 32
3 3 15 12
4 4 75 18
df2<-data.frame(st=c(1,2,3,4),v1=c(12,24,35,18),v2=c(48,32,121,82),v3=c(53,11,12,75))
df2
st v1 v2 v3
1 1 12 48 53
2 2 24 32 11
3 3 35 121 12
4 4 18 82 75
What i want is to match both the data frame at a "st" column level i.e. for st = 1 in df1 the corresponding values for v1 and v2 are 12 & 43. So if for st= 1 in df2 if any of the variables contain these values then I want to select st, and those values from df2.
So for the above example the output will be
St values
1 12(coming from v1 in df2)
2 32(coming from v2 in df2)
3 12(coming from v3 in df2)
4 18 75(coming from v1 & v3 in df2)
The important thing to note is, in the output data frame the order of selected variables should be as that of df2, as you can see that for st = 4, the values in df1 are 75 & 18 which matches with st = 2 but still the output is 18 and then 75 which is the order in df2. Also the variables in df2 will always be greater than df1.
If I understand you correctly...
Step 0. prepare data
You mentioned that you only want to select rows that fit your conditions, but the sample dataset has at least one match in each row. I tweaked it such that there's no match for St=3, to demonstrate the that the row will not be returned in the result.
df1<-data.frame(st=c(1,2,3,4),v1=c(12,14,15,75),v2=c(43,32,12,18))
df2<-data.frame(st=c(1,2,3,4),v1=c(12,24,35,18),v2=c(48,32,121,82),v3=c(53,11,13,75))
Step 1. combine the datasets
combined.df <- rbind(df1 %>% gather(v, n, -st) %>% mutate(df = "df1"),
df2 %>% gather(v, n, -st) %>% mutate(df = "df2"))
> head(combined.df)
st v n df
1 1 v1 12 df1
2 2 v1 14 df1
3 3 v1 15 df1
4 4 v1 75 df1
5 1 v2 43 df1
6 2 v2 32 df1
Step 2. compare & keep only matched ones from df2
res <- combined.df %>%
group_by(st) %>%
mutate(n = ifelse(df=="df1", n, ifelse(n %in% n[df=="df1"], n, NA))) %>%
ungroup() %>%
filter(df=="df2", !is.na(n)) %>%
arrange(st, v)
# if you just want the values, you can stop here.
> res
# A tibble: 4 × 4
st v n df
<dbl> <chr> <dbl> <chr>
1 1 v1 12 df2
2 2 v2 32 df2
3 4 v1 18 df2
4 4 v3 75 df2
# this part formats the result to follow that of the desired output
res <- res %>%
group_by(st) %>%
summarise(values = paste(as.character(n), collapse = " ")) %>%
ungroup()
> res
# A tibble: 3 × 2
st values
<dbl> <chr>
1 1 12
2 2 32
3 4 18 75
If you use merge function, you can create a unique df with this matches:
new<-merge(df1,df2,by="st")
new
st v1.x v2.x v1.y v2.y v3
1 1 12 43 12 48 53
2 2 14 32 24 32 11
3 3 15 12 35 121 12
4 4 75 18 18 82 75
And if you want, you then can order it in the way you want. For example:
new2<-new[,1:2]
new2$from<-"from v1"
names(new2)<-c("st","value","from")
for(i in 3:ncol(new)){
new3<-new[,c(1,i)]
new3$from<-pasteo("from v",i)
names(new3)<-c("st","value","from")
new2<-rbind(new2,new3)
}
This is not the most efficient way, but if you have few data, it will work

Find data frame column differences per multiple groupings

In R, I would like to subtract the sum of a value column (grouped by a letter in column 't1') from the sum of the same value column (grouped by the same letter in column 't2'). Repeat the process for every letter and for every year group.
Consider;
set.seed(3)
df <- data.frame(age = rep(1:3,each=25),
t1 = rep(expand.grid(LETTERS[1:5],LETTERS[1:5])[,1],3),
t2 = rep(expand.grid(LETTERS[1:5],LETTERS[1:5])[,2],3),
value = sample(1:10,75,replace=T))
This data frame shows 3 values in the 'age' column, 2 columns with categories (t1 and t2) and an associated value (value).
As an example, here is how it might work for 'A':
library(plyr);
# extract rows with A
df2 <- df[df$t1=="A" | df$t2=="A",]
# remove where t1 and t2 are the same (not needed)
df2 <- df2[df2$t1 != df2$t2,]
# use ddply to subtract sum of 'value' for A in t1 from t2
df2 <- ddply(df2, .(age), transform, change = sum(value[t2=="A"])-sum(value[t1=="A"]))
# create a name
df2$cat <- "A"
# remove all the duplicate rows, just need one summary value
df2 <- df2[ !duplicated(df2$change), ]
# keep summary data
df2 <- df2[,c(1,6,5)]
now I need to do this for all the values that occur in t1 and t2 (in this case A,B,C & D), creating a 12 line summary.
I tried a loop with;
for (c in as.character(unique(df$t1)))
but got nowehere
thanks a lot
Here is one base R solution that involves aggregation and merging:
# aggregate by age and t1 or t2
t1Agg <- aggregate(value ~ t1 + age, data=df, FUN=sum)
t2Agg <- aggregate(value ~ t2 + age, data=df, FUN=sum)
# merge aggregated data
aggData <- merge(t1Agg, t2Agg, by.x=c("age","t1"), by.y=c("age","t2"))
names(aggData) <- c("age", "t", "value.t1", "value.t2")
aggData$diff <- aggData$value.t1 - aggData$value.t2
I would recommend tidying your data first and then you can spread post-summarise and add a new column:
# Make reproducible
set.seed(4)
df <- data.frame(age = rep(1:3,each=25),
t1 = rep(expand.grid(LETTERS[1:5],LETTERS[1:5])[,1],3),
t2 = rep(expand.grid(LETTERS[1:5],LETTERS[1:5])[,2],3),
value = sample(1:10,75,replace=T))
library(tidyr)
library(dplyr)
df_tidy <- gather(df, t_var, t_val, -age, -value)
sample_n(df_tidy, 3)
# age value t_var t_val
# 104 2 6 t2 A
# 48 2 9 t1 C
# 66 3 7 t1 A
df_tidy %>%
group_by(age, t_var, t_val) %>%
summarise(val_sum = sum(value)) %>%
spread(t_var, val_sum) %>%
mutate(diff = t1 - t2)
# age t_val t1 t2 diff
# (int) (chr) (int) (int) (int)
# 1 1 A 30 22 8
# 2 1 B 32 32 0
# 3 1 C 27 28 -1
# 4 1 D 38 39 -1
# 5 1 E 30 36 -6
# 6 2 A 36 35 1
# 7 2 B 26 30 -4
# 8 2 C 40 27 13
# 9 2 D 27 31 -4
# 10 2 E 28 34 -6
# 11 3 A 26 39 -13
# 12 3 B 19 26 -7
# 13 3 C 31 29 2
# 14 3 D 41 33 8
# 15 3 E 39 29 10

Resources