R: aggregate data.frame with condition from second data frame

I have one table containing data that looks like this:
Samp depth value
A1 0 2
A1 1 4
A1 2 3
A1 3 6
A1 4 8
A1 5 6
A1 6 2
A1 7 3
A2 0 2
A2 1 8
A2 2 6
A2 3 3
A2 4 6
A2 5 6
A3 0 7
A3 1 3
A3 2 2
A3 3 8
A3 4 3
...
Second table with intervals
Samp d_top d_bot
A1 0 2
A2 0 5
A3 1 2
A4 3 5
...
Now I would like to query the first table using the intervals from the second table, to get something like this:
Samp d_int sum_value
A1 0-2 9
A2 0-5 31
A3 1-2 5
...
I tried aggregate and ddply, specifying a list of grouping variables, but the problem is that the intervals are not fixed; they are defined per sample in the second table. Any help is appreciated.

This 'group by another table' operation can be expressed in SQL with the help of the sqldf package.
Your first table is x in the code below and the range table is y. The trick is the pair of join conditions: an equality join on Samp in the JOIN clause, and a non-equi join implicit in the WHERE conditions.
library(sqldf)
sqldf('
  SELECT
    x.Samp,
    y.d_top || "-" || y.d_bot AS d_int,
    sum(x.value) AS sum_value
  FROM x
  JOIN y ON y.Samp = x.Samp
  WHERE y.d_top <= x.depth AND
        x.depth <= y.d_bot
  GROUP BY
    y.d_top, y.d_bot, x.Samp
')
Which yields
Samp d_int sum_value
1 A1 0-2 9
2 A2 0-5 31
3 A3 1-2 5
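The same non-equi join can also be written without SQL. A minimal sketch with dplyr's join_by (an assumption: this needs dplyr >= 1.1.0; x and y are named as above):
library(dplyr)
x %>%
  inner_join(y, by = join_by(Samp, between(depth, d_top, d_bot))) %>%
  summarise(sum_value = sum(value), .by = c(Samp, d_top, d_bot)) %>%
  mutate(d_int = paste(d_top, d_bot, sep = "-")) %>%
  select(Samp, d_int, sum_value)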

Here's one approach:
do.call(rbind, by(dat1, dat1$Samp, function(x) {
  Samp <- as.character(x$Samp[1])
  idx <- Samp == as.character(dat2$Samp)         # matching row of the interval table
  sequ <- seq(dat2$d_top[idx], dat2$d_bot[idx])  # depths covered by the interval
  idx2 <- x$depth %in% sequ
  data.frame(Samp,
             d_int = paste(range(sequ), collapse = "-"),
             sum_value = sum(x$value[idx2]))
}))
where dat1 is your larger data frame and dat2 your shorter data frame.
This returns:
Samp d_int sum_value
A1 A1 0-2 9
A2 A2 0-5 31
A3 A3 1-2 5

Another idea:
f = function(samp, dt, db) {
  inds = DF1$Samp == samp
  sum(DF1[inds, 'value'][DF1[inds, 'depth'] %in% dt:db])
}
# DF1 and DF2 are your large and small data frames, respectively
data.frame(Samp = DF2$Samp,
           d_int = paste(DF2$d_top, DF2$d_bot, sep = " - "),
           sum_value = mapply(f, DF2$Samp, DF2$d_top, DF2$d_bot, USE.NAMES = FALSE))
# Samp d_int sum_value
#1 A1 0 - 2 9
#2 A2 0 - 5 31
#3 A3 1 - 2 5
A benchmark:
set.seed(11)
DF1 = data.frame(Samp = rep(letters, each = 20),
                 depth = sample(1:10, 26*20, TRUE),
                 value = runif(26*20),
                 stringsAsFactors = FALSE)
set.seed(11)
DF2 = data.frame(Samp = letters,
                 d_top = sample(1:5, 26, TRUE),
                 d_bot = sample(3:10, 26, TRUE),
                 stringsAsFactors = FALSE)
dat1 = DF1; dat2 = DF2; x = DF1; y = DF2
# alex(), sven() and rick() below wrap the mapply, by() and sqldf answers, respectively
#> head(alex())
# Samp d_int sum_value
#1 a 2 - 6 5.127813
#2 b 1 - 3 4.043807
#3 c 3 - 4 3.356880
#4 d 1 - 6 9.209616
#5 e 1 - 7 7.452329
#6 f 5 - 5 2.241515
#> head(sven())
# Samp d_int sum_value
#a a 2-6 5.127813
#b b 1-3 4.043807
#c c 3-4 3.356880
#d d 1-6 9.209616
#e e 1-7 7.452329
#f f 5-5 2.241515
#> head(rick()[order(rick()[,1]),])
# Samp d_int sum_value
#10 a 2-6 5.127813
#1 b 1-3 4.043807
#16 c 3-4 3.356880
#4 d 1-6 9.209616
#6 e 1-7 7.452329
#22 f 5-5 2.241515
#> microbenchmark(alex(), sven(), rick())
#Unit: milliseconds
# expr min lq median uq max neval
# alex() 3.10070 3.230853 3.306196 3.461753 4.269292 100
# sven() 24.33163 25.525797 26.184391 26.868042 63.197223 100
# rick() 17.89463 18.622127 19.182584 19.820124 23.278920 100
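For much larger inputs, a keyed non-equi join tends to scale better still. A sketch with data.table (assuming the benchmark's DF1/DF2 and that d_top <= d_bot; the i. prefix refers to DF2's columns):
library(data.table)
X <- as.data.table(DF1); Y <- as.data.table(DF2)
X[Y, on = .(Samp, depth >= d_top, depth <= d_bot),
  .(d_int = paste(i.d_top, i.d_bot, sep = "-"), sum_value = sum(value)),
  by = .EACHI][, .(Samp, d_int, sum_value)]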

Related

R: creating combinations of elements within a group and adding up numbers associated with combinations in a new data frame

I have the following dataset:
Letter ID Number
A A1 1
A A2 2
A A3 3
B B1 1
B B2 2
B B3 3
B B4 4
My aim is first to create all possible combinations of IDs within the same "Letter" group. For example, for the letter A there are only three combinations: A1-A2, A2-A3, and A1-A3. The same IDs ordered differently don't count as a new combination, so for example A1-A2 is the same as A2-A1.
Then, within those combinations, I want to add up the numbers from the "Number" column associated with those IDs. So for the combination A1-A2, whose IDs are associated with 1 and 2 in the "Number" column, this would result in 1 + 2 = 3.
Finally, I want to place the ID combinations, added numbers and original Letter in a new data frame. Something like this:
Letter Combination Add.Number
A A1-A2 3
A A2-A3 5
A A1-A3 4
B B1-B2 3
B B2-B3 5
B B3-B4 7
B B1-B3 4
B B2-B4 6
B B1-B4 5
How can I do this in R, ideally using the package dplyr?
library(dplyr)
letter <- c("A","A","A","B","B","B","B")
df <-
  data.frame(letter) %>%
  group_by(letter) %>%
  mutate(
    number = row_number(),
    id = paste0(letter, number)
  )
df %>%
  full_join(df, by = "letter") %>%
  filter(number.x < number.y) %>%   # keeps each unordered pair exactly once
  mutate(
    combination = paste0(id.x, "-", id.y),
    add_number = number.x + number.y) %>%
  select(letter, combination, add_number)
# A tibble: 9 x 3
# Groups: letter [2]
letter combination add_number
<chr> <chr> <int>
1 A A1-A2 3
2 A A1-A3 4
3 A A2-A3 5
4 B B1-B2 3
5 B B1-B3 4
6 B B1-B4 5
7 B B2-B3 5
8 B B2-B4 6
9 B B3-B4 7
In base R, using combn:
df <- data.frame(
Letter = c("A","A","A","B","B","B","B"),
Id = c("A1","A2","A3","B1","B2","B3","B4"),
Number = c(1,2,3,1,2,3,4))
# combinations of IDs and of Numbers, per Letter group
l <- lapply(split(df$Id, df$Letter), function(x)
  setNames(data.frame(t(combn(x, 2))), c("L1", "L2")))
n <- lapply(split(df$Number, df$Letter), function(x)
  setNames(data.frame(t(combn(x, 2))), c("N1", "N2")))
# rbind all
result <- do.call(rbind, mapply(cbind, Letter = names(l), l, n, SIMPLIFY = FALSE))
result$combination <- paste(result$L1, result$L2, sep = "-")
result$sum <- result$N1 + result$N2
result
#> Letter L1 L2 N1 N2 combination sum
#> A.1 A A1 A2 1 2 A1-A2 3
#> A.2 A A1 A3 1 3 A1-A3 4
#> A.3 A A2 A3 2 3 A2-A3 5
#> B.1 B B1 B2 1 2 B1-B2 3
#> B.2 B B1 B3 1 3 B1-B3 4
#> B.3 B B1 B4 1 4 B1-B4 5
#> B.4 B B2 B3 2 3 B2-B3 5
#> B.5 B B2 B4 2 4 B2-B4 6
#> B.6 B B3 B4 3 4 B3-B4 7
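A compact variant of the same idea (a sketch, assuming the df defined above): run combn once over row indices per Letter group, so the IDs and the Numbers stay aligned automatically.
res <- do.call(rbind, lapply(split(seq_len(nrow(df)), df$Letter), function(ix) {
  cmb <- combn(ix, 2)  # all unordered index pairs within the group
  data.frame(Letter = df$Letter[cmb[1, ]],
             Combination = paste(df$Id[cmb[1, ]], df$Id[cmb[2, ]], sep = "-"),
             Add.Number = df$Number[cmb[1, ]] + df$Number[cmb[2, ]])
}))
res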

add row based on variable condition in R

I have df as follow
df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa
I want to add a new row whenever "ID" changes, with the value "F" in both the "type" and "other-col" columns:
new_df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
1 F F <- this row added
2 A1 cc
2 B1 aa
2 F F <- this row added
3 A2 aa
How can I do it in R?
Thanks!
This should be doable in a single replacement operation once you know the indexes of where each change occurs. E.g.:
idx <- match(unique(df$ID), df$ID)[-1] - 1    # last row of each ID group, except the final one
df <- df[sort(c(sequence(nrow(df)), idx)),]   # duplicate those rows in place
df[seq_along(idx) + idx, c("type","other_col")] <- "F"  # overwrite the duplicates with "F"
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
Where df was:
df <- read.table(text="ID type other_col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa", header=TRUE, stringsAsFactors=FALSE)
An option with group_split and add_row. We split by 'ID' with group_split into a list of data.frames, then loop through the list with map and add a row at the end of each (add_row appends by default, but this can be controlled with .before and .after). Finally, we slice out the last row, since the last 'ID' doesn't need an 'F' row.
library(tidyverse)
df1 %>%
  group_split(ID) %>%
  map_dfr(~ .x %>%
            add_row(ID = first(.$ID), type = 'F', `other-col` = 'F')) %>%
  slice(-n())
Here is another approach with a similar idea as #akrun's answer.
library(tidyverse)
dat2 <- dat %>%
  split(f = .$ID) %>%
  map_if(.p = function(x) unique(x$ID) < max(dat$ID),
         ~ bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F"))) %>%
  bind_rows()
dat2
# ID type other.col
# 1 1 A1 cc
# 2 1 A2 dd
# 3 1 A3 cc
# 4 1 F F
# 5 2 A1 cc
# 6 2 B1 aa
# 7 2 F F
# 8 3 A2 aa
Data
dat <- read.table(text = "ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Update
This update shows that if the ID column is character rather than integer, we can create a new column (ID2 in this case) by converting ID to a factor and then to an integer. The rest of the operation is the same as in the original answer, but based on ID2.
library(tidyverse)
dat2 <- dat %>%
  mutate(ID2 = as.integer(factor(ID, levels = unique(.$ID)))) %>%
  split(f = .$ID2) %>%
  map_if(.p = function(x) unique(x$ID2) != unique(last(.)$ID2),
         ~ bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F",
                                ID2 = unique(.x$ID2)))) %>%
  bind_rows() %>%
  select(-ID2)
dat2
# ID type other.col
# 1 C A1 cc
# 2 C A2 dd
# 3 C A3 cc
# 4 C F F
# 5 A A1 cc
# 6 A B1 aa
# 7 A F F
# 8 B A2 aa
DATA
dat <- read.table(text = "ID type other-col
C A1 cc
C A2 dd
C A3 cc
A A1 cc
A B1 aa
B A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Similar to akrun's answer, but in base R. Split the data frame by ID, rbind an extra row onto each piece, recombine, and remove the unneeded last row with head(..., -1):
head(n = -1,
     do.call(rbind,
             lapply(split(dat, dat$ID), function(x) {
               rbind(x, c(x$ID[1], "F", "F"))
             })))
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
Using base R you could do:
cbind(ID = sort(c(dat$ID, unique(dat$ID))),
      do.call(rbind, by(dat[-1], dat[1], rbind, 'F')))
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
3.2 3 F F
Or you could do:
do.call(rbind,by(dat,dat$ID,function(x)cbind(ID = unique(x[,1]),rbind(x[-1],"F"))))
Another base R option: use rle to build an index vector that repeats each row once and duplicates the last row of each group (the duplicate is named "F"; the final one is dropped by head(..., -1)), then overwrite the duplicated rows with "F".
inds = head(cumsum(with(rle(df$ID), unlist(lapply(lengths, function(i) c(rep(1, i), F = 0))))), -1)
df1 = df[inds,]
df1[which(names(inds) == "F"), c("type", "other_col")] = "F"
df1
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
A possible approach using data.table:
library(data.table)
m <- setDT(df)[, max(ID)]
df[, if (.BY$ID < m) rbind(.SD, as.list(rep("F", ncol(.SD)))) else .SD, ID]
output:
ID type other-col
1: 1 A1 cc
2: 1 A2 dd
3: 1 A3 cc
4: 1 F F
5: 2 A1 cc
6: 2 B1 aa
7: 2 F F
8: 3 A2 aa
Or, if you don't mind adding another row at the bottom, the code gets shorter: setDT(df)[, rbind(.SD, as.list(rep("F", ncol(.SD)))), ID]
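For completeness, a dplyr-only sketch of the same pattern (an assumption: it uses group_modify from dplyr >= 0.8.1 and add_row from tibble, applied to the df with columns ID, type, other_col read in the first answer):
library(dplyr)
library(tibble)
df %>%
  group_by(ID) %>%
  group_modify(~ add_row(.x, type = "F", other_col = "F")) %>%  # .x has no ID column; dplyr re-attaches it
  ungroup() %>%
  slice(-n())  # the last group doesn't need the "F" row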

Calculate direct dependencies among values of a dataframe in R

A data frame is given and the objective is to calculate the direct dependency value between two columns of the data frame.
c1 c2 N
a b 30
a c 5
a d 10
c a 5
b a 10
What we are looking for is to get the direct dependency relations; for example, for a and b this value is ab - ba = 20.
The final result should be like this:
c1 c2 N DepValue
a b 30 ab - ba = 20
a c 5 ac - ca = 0
a d 10 ad - 0 = 10
c a 5 ca - ac = 0
b a 10 ba - ab = -20
Thank you for your help.
D <- read.table(header=TRUE, stringsAsFactors = FALSE, text=
"c1 c2 N
a b 30
a c 5
a d 10
c a 5
b a 10")
N12 <- D$N
names(N12) <- paste0(D$c1, D$c2)  # lookup table: pair -> N
N21 <- N12[paste0(D$c2, D$c1)]    # N of the reversed pair (NA if it doesn't occur)
D$depValue <- D$N - ifelse(is.na(N21), 0, N21)
result:
> D
c1 c2 N depValue
1 a b 30 20
2 a c 5 0
3 a d 10 10
4 c a 5 0
5 b a 10 -20
One option is to create groups with the pmin and pmax values of c1 and c2 and take the difference between the two values. This will return NA for groups with only one row; we can replace those NAs with the first value in the group.
library(dplyr)
df %>%
  group_by(group1 = pmin(c1, c2), group2 = pmax(c1, c2)) %>%
  mutate(dep = N[1] - N[2],
         dep = replace(dep, is.na(dep), N[1])) %>%
  ungroup() %>%
  select(-group1, -group2)
# c1 c2 N dep
# <chr> <chr> <int> <int>
#1 a b 30 20
#2 a c 5 0
#3 a d 10 10
#4 c a 5 0
#5 b a 10 20
An idea via base R is to sort columns c1 and c2, split based on those values and subtract N, i.e.
i1 <- paste(pmin(df$c1, df$c2), pmax(df$c1, df$c2))
i1
#[1] "a b" "a c" "a d" "a c" "a b"
do.call(rbind, lapply(split(df, i1), function(i) {
  i['DepValue'] <- Reduce(`-`, i$N)
  i
}))
# c1 c2 N DepValue
#a b.1 a b 30 20
#a b.5 b a 10 20
#a c.2 a c 5 0
#a c.4 c a 5 0
#a d a d 10 10
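For completeness, a base R variant of the first answer's lookup idea, using match on the reversed pair instead of a named vector (a sketch, assuming the same D; signs follow the row order, as in the first answer):
rev_idx <- match(paste(D$c1, D$c2), paste(D$c2, D$c1))  # row of the reversed pair, NA if absent
D$DepValue <- D$N - ifelse(is.na(rev_idx), 0, D$N[rev_idx])
D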

Using lapply to transpose part of a column and add it as new columns to a data frame

I've been searching for some clarity on this one but cannot find anything that applies to my case. I constructed a DF very similar to this one (but with considerably more data, over a million rows in total):
library(dplyr)
Key1 <- c("A", "B", "C", "A", "C", "B", "B", "C", "A", "C")
Key2 <- c("A1", "B1", "C1", "A2", "C2", "B2", "B3", "C3", "A3", "C4")
NumVal <- c(2, 3, 1, 4, 6, 8, 2, 3, 1, 0)
DF1 <- as.data.frame(cbind(Key1, Key2, NumVal), stringsAsFactors = FALSE) %>% arrange(Key2)
ConsId <- c(1:10)
DF1 <- cbind(DF1, ConsId)
Now, what I want to do is add, let's say, 3 new columns (in real life I need 12, but to keep this toy example readable we'll use 3) to the data frame, where each row gets the values of $NumVal from the rows with the same $Key1 and a $ConsId greater than or equal to its own, filling the remaining spaces with NAs. Here is the expected result in case I wasn't very clear:
Key1 Key2 NumVal ConsId V1 V2 V3
A A1 2 1 2 4 1
A A2 4 2 4 1 NA
A A3 1 3 1 NA NA
B B1 3 4 3 8 2
B B2 8 5 8 2 NA
B B3 2 6 2 NA NA
C C1 1 7 1 6 3
C C2 6 8 6 3 0
C C3 3 9 3 0 NA
C C4 0 10 0 NA NA
Now I'm using do.call(rbind), and even though it works fine, it takes way too long for my real data with a bit over 1 million rows (around 6 hrs). I also tried the bind_rows dplyr function, but it took a bit longer, so I stuck with the do.call option. Here's an example of the code I'm using:
# Function
TranspNumVal <- function(i) {
  Id <- DF1[i, "Key1"]
  IdCons <- DF1[i, "ConsId"]
  myvect <- as.matrix(filter(DF1, Id == Key1, ConsId >= IdCons) %>% select(NumVal))
  Result <- as.data.frame(t(myvect[1:3]))
  return(Result)
}
# Applying the function to the entire data frame
DF2 <- do.call(rbind, lapply(1:NROW(DF1), function(i) TranspNumVal(i)))
DF3 <- cbind(DF1, DF2)
Maybe changing the class is causing the code to be so inefficient, or maybe I'm just not finding a better way to vectorize my problem (you don't want to know how long it took with a nested loop). I'm fairly new to R and have just started fooling around with dplyr, so I'm open to any suggestions on how to optimize my code.
We can use dplyr::lead
DF1 %>%
  group_by(Key1) %>%
  mutate(
    V1 = NumVal,
    V2 = lead(NumVal, n = 1),
    V3 = lead(NumVal, n = 2))
## A tibble: 10 x 7
## Groups: Key1 [3]
# Key1 Key2 NumVal ConsId V1 V2 V3
# <chr> <chr> <chr> <int> <chr> <chr> <chr>
# 1 A A1 2 1 2 4 1
# 2 A A2 4 2 4 1 NA
# 3 A A3 1 3 1 NA NA
# 4 B B1 3 4 3 8 2
# 5 B B2 8 5 8 2 NA
# 6 B B3 2 6 2 NA NA
# 7 C C1 1 7 1 6 3
# 8 C C2 6 8 6 3 0
# 9 C C3 3 9 3 0 NA
#10 C C4 0 10 0 NA NA
Explanation: We group entries by Key1 and then use lead to shift NumVal values for columns V2 and V3. V1 is simply a copy of NumVal.
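Since the real use case needs 12 such columns, here is a sketch generalizing the same idea with data.table::shift, which accepts a whole vector of lags (assuming DF1 as defined in the question; V1..V12 are just illustrative names):
library(data.table)
k <- 12  # number of columns wanted
setDT(DF1)[, paste0("V", seq_len(k)) := shift(NumVal, 0:(k - 1), type = "lead"),
           by = Key1]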
A dplyr pipeline.
The first utility function filters a (NumVal) based on the values of b (ConsId):
myfunc1 <- function(a, b) {
  lapply(seq_along(b), function(i) a[b >= b[i]])
}
The second utility function converts a ragged list into a data.frame. It works with an arbitrary number of columns to append, but we've limited it to 3 based on your requirements:
myfunc2 <- function(x, ncols = 3) {
  n <- min(ncols, max(lengths(x)))
  as.data.frame(do.call(rbind, lapply(x, `length<-`, n)))
}
Now the pipeline:
dat %>%
  group_by(Key1) %>%
  mutate(lst = myfunc1(NumVal, ConsId)) %>%
  ungroup() %>%
  bind_cols(myfunc2(.$lst)) %>%
  select(-lst) %>%
  arrange(Key1, ConsId)
# # A tibble: 10 × 7
# Key1 Key2 NumVal ConsId V1 V2 V3
# <chr> <chr> <int> <int> <int> <int> <int>
# 1 A A1 2 1 2 4 1
# 2 A A2 4 2 4 1 NA
# 3 A A3 1 3 1 NA NA
# 4 B B1 3 4 3 8 2
# 5 B B2 8 5 8 2 NA
# 6 B B3 2 6 2 NA NA
# 7 C C1 1 7 1 6 3
# 8 C C2 6 8 6 3 0
# 9 C C3 3 9 3 0 NA
# 10 C C4 0 10 0 NA NA
After grouping by 'Key1', use shift (from data.table) to get the next values of 'NumVal' as a list, convert it to a tibble, and unnest the nested list elements into individual columns of the dataset. By default, shift fills with NA at the end.
library(data.table)
library(tidyverse)
DF1 %>%
  group_by(Key1) %>%
  mutate(new = shift(NumVal, 0:(n() - 1), type = 'lead') %>%
           map(~
                 as.list(.x) %>%
                 set_names(paste0("V", seq_along(.))) %>%
                 as_tibble)) %>%
  unnest %>%
  select(-V4)
# A tibble: 10 x 7
# Groups: Key1 [3]
# Key1 Key2 NumVal ConsId V1 V2 V3
# <chr> <chr> <dbl> <int> <dbl> <dbl> <dbl>
# 1 A A1 2 1 2 4 1
# 2 A A2 4 2 4 1 NA
# 3 A A3 1 3 1 NA NA
# 4 B B1 3 4 3 8 2
# 5 B B2 8 5 8 2 NA
# 6 B B3 2 6 2 NA NA
# 7 C C1 1 7 1 6 3
# 8 C C2 6 8 6 3 0
# 9 C C3 3 9 3 0 NA
#10 C C4 0 10 0 NA NA
data
DF1 <- data.frame(Key1, Key2, NumVal, stringsAsFactors = FALSE) %>%
arrange(Key2)
DF1$ConsId <- 1:10

calculate summary by group and bring value back in the dataframe [duplicate]

This question already has answers here:
Calculate group mean, sum, or other summary stats. and assign column to original data
(4 answers)
Closed 5 years ago.
df <- data.frame(
id = c('A1','A2','A4','A2','A1','A4','A3','A2','A1','A3'),
value = c(4,3,1,3,4,6,6,1,8,4))
I want to get the max value within each id group. I tried the following but got an error saying "replacement has 4 rows, data has 10", which I understand, but I don't know how to correct it:
df$max.by.id <- aggregate(value ~ id, df, max)
This is how I ended up doing it successfully:
max.by.id <- aggregate(value ~ id, df, max)
names(max.by.id) <- c("id", "max")
df2 <- merge(df,max.by.id, by.x = "id", by.y = "id")
df2
# id value max
#1 A1 4 8
#2 A1 4 8
#3 A1 8 8
#4 A2 3 3
#5 A2 3 3
#6 A2 1 3
#7 A3 6 6
#8 A3 4 6
#9 A4 1 6
#10 A4 6 6
Is there a better way? Thanks in advance.
ave() is the function for that task:
df$max.by.id <- ave(df$value, df$id, FUN=max)
example:
df <- data.frame(
id = c('A1','A2','A4','A2','A1','A4','A3','A2','A1','A3'),
value = c(4,3,1,3,4,6,6,1,8,4))
df$max.by.id <- ave(df$value, df$id, FUN=max)
The result of ave() has the same length as the original vector of values (which is also the length of the grouping variables). The values of the result are placed in the right positions with respect to the grouping variables. For more information, read the documentation of ave().
With data.table, you can compute the max by id "inside" the data, automatically adding the newly computed value (one value per id, recycled across its rows):
library(data.table)
setDT(df)[, max.by.id := max(value), by=id]
df
# id value max.by.id
# 1: A1 4 8
# 2: A2 3 3
# 3: A4 1 6
# 4: A2 3 3
# 5: A1 4 8
# 6: A4 6 6
# 7: A3 6 6
# 8: A2 1 3
# 9: A1 8 8
#10: A3 4 6
tapply(df$value, df$id, max)
# A1 A2 A3 A4
#  8  3  6  6
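The tapply result is a per-id summary, not aligned with df. A small base R sketch to bring it back into the data frame (assuming df as above):
mx <- tapply(df$value, df$id, max)
df$max.by.id <- mx[as.character(df$id)]  # look up each row's group maximum by name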
library(plyr)
ddply(df, .(id), function(df){max(df$value)})
# id V1
# 1 A1 8
# 2 A2 3
# 3 A3 6
# 4 A4 6
library(dplyr)
df %>% group_by(id) %>% arrange(desc(value)) %>% do(head(., 1))
# Source: local data frame [4 x 2]
# Groups: id [4]
# id value
# (fctr) (dbl)
# 1 A1 8
# 2 A2 3
# 3 A3 6
# 4 A4 6
UPDATE:
If you need to keep the raw value, use the following code.
library(plyr)
ddply(df, .(id), function(df) {
  df$max.val <- max(df$value)
  return(df)
})
library(dplyr)
df %>% group_by(id) %>% mutate(max.val=max(value))
# Source: local data frame [10 x 3]
# Groups: id [4]
# id value max.val
# (fctr) (dbl) (dbl)
# 1 A1 4 8
# 2 A2 3 3
# 3 A4 1 6
# 4 A2 3 3
# 5 A1 4 8
# 6 A4 6 6
# 7 A3 6 6
# 8 A2 1 3
# 9 A1 8 8
# 10 A3 4 6
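With recent dplyr, the grouped mutate can also be written without group_by; a sketch (an assumption: this needs the .by argument introduced in dplyr 1.1.0):
library(dplyr)
df %>% mutate(max.by.id = max(value), .by = id)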
