Calculate adstock using data.table

I have a data frame as shown below (the columns ColA_0.2 and ColB_0.2 hold the expected result for ad = 0.2):
structure(list(ID = c(1, 1, 1, 1, 2, 2, 2, 2), ColA = c(2, 3,
4, 5, 2, 3, 4, 5), ColB = c(1, 2, 3, 4, 1, 2, 3, 4), ColA_0.2 = c(2,
3.4, 4.68, 5.936, 2, 3.4, 4.68, 5.936), ColB_0.2 = c(1, 2.2,
3.44, 4.688, 1, 2.2, 3.44, 4.688)), class = "data.frame", row.names = c(NA,
-8L))
What I need: for each ID, I want to calculate ColA_ad and ColB_ad, where the user passes a parameter 'ad'.
For example - if 'ad' is 0.2 then the values will be calculated as:
First row - same as ColA (i.e. 2)
Second row - second row of ColA plus 0.2 * first row of ColA_ad (i.e. 3 + 0.2*2 = 3.4)
Third row - third row of ColA plus 0.2 * second row of ColA_ad (i.e. 4 + 0.2*3.4 = 4.68)
and so on.
The same is calculated for every other column (here ColB); the column names can be passed in a separate vector.
Summary - each row takes 0.2 times the previously calculated row as a carry-over effect and adds it to the new row, i.e. col_ad[i] = col[i] + ad * col_ad[i-1], with col_ad[1] = col[1]. The results are displayed in the columns ColA_0.2 and ColB_0.2 above.
As my dataset is very large, I am looking for a data.table solution.
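As a plain-loop reference implementation of this recursion (a sketch to pin down the definition, not a fast solution):
adstock <- function(x, ad) {
  out <- numeric(length(x))
  out[1] <- x[1]
  for (i in seq_along(x)[-1]) out[i] <- x[i] + ad * out[i - 1]
  out
}
adstock(c(2, 3, 4, 5), 0.2)
# [1] 2.000 3.400 4.680 5.936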

Here is a base R solution, where a linear algebra property is applied to speed up your iterative calculation.
basic idea (taking ID = 1 as an example)
you first construct a lower triangular matrix for mapping from col to col_ad, i.e.,
l <- 0.2^abs(outer(seq(4), seq(4), "-"))
l[upper.tri(l)] <- 0
which gives
> l
[,1] [,2] [,3] [,4]
[1,] 1.000 0.00 0.0 0
[2,] 0.200 1.00 0.0 0
[3,] 0.040 0.20 1.0 0
[4,] 0.008 0.04 0.2 1
then you multiply l with the value columns; row t of the product is the sum of ad^(t-s) * col[s] over s <= t, which is exactly the unrolled recursion:
> l %*% as.matrix(subset(df,ID == 1)[-1])
ColA ColB
[1,] 2.000 1.000
[2,] 3.400 2.200
[3,] 4.680 3.440
[4,] 5.936 4.688
code
ad <- 0.2
col_ad <- do.call(rbind,
                  c(make.row.names = FALSE,
                    lapply(split(df, df$ID),
                           function(x) {
                             # lower triangular carry-over matrix for this group
                             l <- ad^abs(outer(seq(nrow(x)), seq(nrow(x)), "-"))
                             l[upper.tri(l)] <- 0
                             # multiply and rename the result columns to <col>_<ad>
                             `colnames<-`(data.frame(l %*% as.matrix(x[-1])),
                                          paste0(names(x[-1]), "_", ad))
                           })))
dfout <- cbind(df, col_ad)
such that
> dfout
ID ColA ColB ColA_0.2 ColB_0.2
1 1 2 1 2.000 1.000
2 1 3 2 3.400 2.200
3 1 4 3 4.680 3.440
4 1 5 4 5.936 4.688
5 2 2 1 2.000 1.000
6 2 3 2 3.400 2.200
7 2 4 3 4.680 3.440
8 2 5 4 5.936 4.688
DATA
df <- structure(list(ID = c(1, 1, 1, 1, 2, 2, 2, 2), ColA = c(2, 3,
4, 5, 2, 3, 4, 5), ColB = c(1, 2, 3, 4, 1, 2, 3, 4)), class = "data.frame", row.names = c(NA,
-8L))

A non-recursive option:
library(data.table)
setDT(DT)[, paste0(cols, "_", ad) := {
  # build the .N x .N lower triangular carry-over matrix, column by column
  m <- matrix(unlist(shift(ad^(seq_len(.N) - 1L), 0L:(.N - 1L), fill = 0)), nrow = .N)
  lapply(.SD, function(x) c(m %*% x))
}, by = ID, .SDcols = cols]
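To see the carry-over matrix this builds, here it is for a group of N = 4 rows with ad = 0.2 (the same lower triangular matrix as in the base R answer above):
library(data.table)
ad <- 0.2; N <- 4
m <- matrix(unlist(shift(ad^(seq_len(N) - 1L), 0L:(N - 1L), fill = 0)), nrow = N)
m
#      [,1] [,2] [,3] [,4]
# [1,] 1.000 0.00 0.0 0
# [2,] 0.200 1.00 0.0 0
# [3,] 0.040 0.20 1.0 0
# [4,] 0.008 0.04 0.2 1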
And a recursive option:
setDT(DT)[, paste0(cols, "_", ad) := {
  a <- 0
  b <- 0
  # each row forms its own group via seq_len(.N); the running values a and b
  # persist across groups, carrying the adstock from row to row
  .SD[, {
    a <- ColA + ad * a
    b <- ColB + ad * b
    .(a, b)
  }, seq_len(.N)][, (1) := NULL]  # drop the helper seq_len grouping column
}, by = ID]
output:
ID ColA ColB ColA_0.2 ColB_0.2
1: 1 2 1 2.000 1.000
2: 1 3 2 3.400 2.200
3: 1 4 3 4.680 3.440
4: 1 5 4 5.936 4.688
5: 2 2 1 2.000 1.000
6: 2 3 2 3.400 2.200
7: 2 4 3 4.680 3.440
8: 2 5 4 5.936 4.688
data:
DT <- structure(list(ID = c(1, 1, 1, 1, 2, 2, 2, 2), ColA = c(2, 3,
4, 5, 2, 3, 4, 5), ColB = c(1, 2, 3, 4, 1, 2, 3, 4), ColA_0.2 = c(2,
3.4, 4.68, 5.936, 2, 3.4, 4.68, 5.936), ColB_0.2 = c(1, 2.2,
3.44, 4.688, 1, 2.2, 3.44, 4.688)), class = "data.frame", row.names = c(NA,
-8L))
ad <- 0.2
cols <- c("ColA", "ColB")

Here is one way with data.table using Reduce:
#Columns to apply function to
cols <- names(df)[2:3]
#Create a function to apply
apply_fun <- function(col, ad) {
  Reduce(function(x, y) sum(y, x * ad), col, accumulate = TRUE)
}
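To see what apply_fun computes, here it is on the ColA values of a single ID:
apply_fun(c(2, 3, 4, 5), 0.2)
# [1] 2.000 3.400 4.680 5.936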
library(data.table)
#Convert dataframe to data.table
setDT(df)
#set ad value
ad <- 0.2
#Apply function to each column in cols
df[, (paste(cols, ad, sep = "_")) := lapply(.SD, apply_fun, ad), .SDcols = cols, by = ID]
df
# ID ColA ColB ColA_0.2 ColB_0.2
#1: 1 2 1 2.000 1.000
#2: 1 3 2 3.400 2.200
#3: 1 4 3 4.680 3.440
#4: 1 5 4 5.936 4.688
#5: 2 2 1 2.000 1.000
#6: 2 3 2 3.400 2.200
#7: 2 4 3 4.680 3.440
#8: 2 5 4 5.936 4.688

Related

How to arrange data frame elements using group column order stored in another data frame in R?

I have two data frames with an equal number of rows. Here is a snippet:
df1 <- data.frame(Id = c(123, 124, 125), X1 = c(1, 2, 1), X2 = c(NA_integer_, 1, 2))
df1
Id X1 X2
1 123 1 NA
2 124 2 1
3 125 1 2
df2 <- data.frame(Id = c(123, 124, 125), X1_1 = c(NA_integer_, 2, 1), X1_2 = c(1, NA_integer_, 2), X1_3 = c(2, 3, 3), X2_1 = c(NA_integer_, 1, NA_integer_), X2_2 = c(NA_integer_, 3, NA_integer_), X2_3 = c(NA_integer_, 2, 2))
df2
Id X1_1 X1_2 X1_3 X2_1 X2_2 X2_3
1 123 NA 1 2 NA NA NA
2 124 2 NA 3 1 3 2
3 125 1 2 3 NA NA 2
I need to create a list of df2's elements, arranged according to the group positions from df1, with NAs removed:
$`123`
[1] 1 2
$`124`
[1] 1 3 2 2 3
$`125`
[1] 1 2 3 2
How to create this list in R?
Here is one option
Map(function(u, v) {
  nm1 <- sub("_\\d+", "", names(v))
  grp <- match(nm1, unique(nm1))
  as.vector(na.omit(unlist(split.default(v, grp)[unlist(u)], use.names = FALSE)))
}, split(df1[-1], df1$Id),
   split(df2[-1], df2$Id))
#$`123`
#[1] 1 2
#$`124`
#[1] 1 3 2 2 3
#$`125`
#[1] 1 2 3 2
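To see what the anonymous function does, here is a walk-through for Id = 124 using the example data:
u <- df1[df1$Id == 124, -1]        # X1 = 2, X2 = 1: the desired order of the blocks
v <- df2[df2$Id == 124, -1]        # X1_1 = 2, X1_2 = NA, X1_3 = 3, X2_1 = 1, X2_2 = 3, X2_3 = 2
nm1 <- sub("_\\d+", "", names(v))  # "X1" "X1" "X1" "X2" "X2" "X2"
grp <- match(nm1, unique(nm1))     # 1 1 1 2 2 2
# split the columns into an X1 block and an X2 block, reorder the blocks by the
# positions in u (X2 block first, then X1 block), flatten, and drop the NAs
as.vector(na.omit(unlist(split.default(v, grp)[unlist(u)], use.names = FALSE)))
# [1] 1 3 2 2 3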
Another option using lapply, setNames, sort and grep:
lapply(setNames(df1$Id, df1$Id), function(i) {
  x <- df2[df2$Id == i,
           unlist(lapply(paste0("^", names(sort(df1[df1$Id == i, -1])), "_"),
                         grep, colnames(df2)))]
  x[!is.na(x)]
})
#$`123`
#[1] 1 2
#
#$`124`
#[1] 1 3 2 2 3
#
#$`125`
#[1] 1 2 3 2

Aggregate using different functions for each column

I have a data.table similar to the one below, but with around 3 million rows and a lot more columns.
key1 price qty status category
1: 1 9.26 3 5 B
2: 1 14.64 1 5 B
3: 1 16.66 3 5 A
4: 1 18.27 1 5 A
5: 2 2.48 1 7 A
6: 2 0.15 2 7 C
7: 2 6.29 1 7 B
8: 3 7.06 1 2 A
9: 3 24.42 1 2 A
10: 3 9.16 2 2 C
11: 3 32.21 2 2 B
12: 4 20.00 2 9 B
Here's the dput() output:
# (the .internal.selfref pointer printed by dput() is dropped here; it is not valid R syntax)
dados = structure(list(key1 = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4),
                       price = c(9.26, 14.64, 16.66, 18.27, 2.48, 0.15, 6.29, 7.06,
                                 24.42, 9.16, 32.21, 20),
                       qty = c(3, 1, 3, 1, 1, 2, 1, 1, 1, 2, 2, 2),
                       status = c(5, 5, 5, 5, 7, 7, 7, 2, 2, 2, 2, 9),
                       category = c("B", "B", "A", "A", "A", "C", "B", "A",
                                    "A", "C", "B", "B")),
                  .Names = c("key1", "price", "qty", "status", "category"),
                  row.names = c(NA, -12L), class = c("data.table", "data.frame"))
I need to transform this data so that I have one entry for each key, and in the process I need to create some additional variables. So far I have been using this:
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
key.aggregate = function(x){
  return(data.table(
    key1 = Mode(x$key1),
    perc.A = sum(x$price[x$category == "A"], na.rm = TRUE)/sum(x$price),
    perc.B = sum(x$price[x$category == "B"], na.rm = TRUE)/sum(x$price),
    perc.C = sum(x$price[x$category == "C"], na.rm = TRUE)/sum(x$price),
    status = Mode(x$status),
    qty = sum(x$qty),
    price = sum(x$price)
  ))
}
new_data = split(dados,by = "key1") #Runs out of RAM here
results = rbindlist(lapply(new_data,key.aggregate))
And expecting the following output:
> results
key1 perc.A perc.B perc.C status qty price
1: 1 0.5937447 0.4062553 0.00000000 5 8 58.83
2: 2 0.2780269 0.7051570 0.01681614 7 4 8.92
3: 3 0.4321208 0.4421414 0.12573782 2 6 72.85
4: 4 0.0000000 1.0000000 0.00000000 9 2 20.00
But I'm always running out of RAM when splitting the data by keys. I've tried using only a third of the data, and then only a sixth of it, but it still gives the same Error: cannot allocate vector of size 593 Kb.
I'm thinking this approach is very inefficient; what would be the best way to get this result?
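For reference, a grouped data.table aggregation avoids materialising the split list entirely; a minimal sketch reusing the Mode helper above (assuming the same percentage definitions):
library(data.table)
setDT(dados)  # re-establish a valid data.table after the dput() round trip
results <- dados[, .(
  perc.A = sum(price[category == "A"], na.rm = TRUE)/sum(price),
  perc.B = sum(price[category == "B"], na.rm = TRUE)/sum(price),
  perc.C = sum(price[category == "C"], na.rm = TRUE)/sum(price),
  status = Mode(status),
  qty = sum(qty),
  price = sum(price)
), by = key1]  # one pass per key; no intermediate list of sub-tables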

Matching column in dataframe by nearest values in column of other dataframe

Hello, I have a question about matching two data.frames.
Say I have two datasets:
Dataframe 1:
"A" "B"
91 1
92 3
93 11
94 4
95 10
96 6
97 7
98 8
99 9
100 2
structure(list(A = 91:100, B = c(1, 3, 11, 4, 10, 6, 7, 8, 9,
2)), .Names = c("A", "B"), row.names = c(NA, -10L), class = "data.frame")
Dataframe 2:
"C" "D"
91.12 1
92.34 3
93.65 11
94.23 4
92.14 10
96.98 6
97.22 7
98.11 8
93.15 9
100.67 2
91.25 1
96.45 3
83.78 11
84.66 4
100 10
structure(list(C = c(91.12, 92.34, 93.65, 94.23, 92.14, 96.98,
97.22, 98.11, 93.15, 100.67, 91.25, 96.45, 83.78, 84.66, 100),
D = c(1, 3, 11, 4, 10, 6, 7, 8, 9, 2, 1, 3, 11, 4, 10)), .Names = c("C",
"D"), row.names = c(NA, -15L), class = "data.frame")
Now I want to match column C against column A by rounding, and replace column D with the corresponding value from column B of Dataframe 1. Where there is no (rounded) match between A and C, the new column D should contain NaN.
result:
"C" "newD"
91.12 1
92.34 3
93.65 4
94.23 4
92.14 3
96.98 7
97.22 7
98.11 8
93.15 11
100.67 NaN
91.25 1
96.45 6
83.78 NaN
84.66 NaN
100 2
structure(list(C = c(91.12, 92.34, 93.65, 94.23, 92.14, 96.98,
97.22, 98.11, 93.15, 100.67, 91.25, 96.45, 83.78, 84.66, 100),
D = c(1, 3, 4, 4, 3, 7, 7, 8, 11, NaN, 1, 6, NaN, NaN, 2)), .Names = c("C",
"D"), row.names = c(NA, -15L), class = "data.frame")
Does anybody know how to do this, especially for large datasets?
Thanks a lot!
Making an update join with data.table:
library(data.table)
setDT(DF1); setDT(DF2)
DF2[, A := round(C)]
DF2[, D := DF1[DF2, on=.(A), x.B] ]
# alternately, chain together in one step:
DF2[, A := round(C)][, D := DF1[DF2, on=.(A), x.B] ]
This gives NA in unmatched rows. To switch those to NaN: DF2[is.na(D), D := NaN].
To drop the new DF2$A column, use DF2[, A := NULL].
Does anybody know how to do this, especially for large datasets?
This modifies DF2 in place (instead of making a new table like a vanilla join as in Mike's answer), so it should be fairly efficient for large tables. It might perform better if A is stored as an integer instead of a float in both tables.
On data.table 1.9.6, use on="A", B instead of on=.(A), x.B. Thanks to Mike H for checking this.
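For example, a quick sketch of storing the rounded key as integer (assuming you are free to change the key's type):
DF1[, A := as.integer(A)]             # may already be integer, as in the dput above
DF2[, A := as.integer(round(C))]
DF2[, D := DF1[DF2, on = .(A), x.B]]  # same update join, now on an integer key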
You can create a lookup table where the values in A are used to look up the values in B:
Lookup = df1$B
names(Lookup) = df1$A
# unname() the matched values so their duplicated names don't become row names
df3 = data.frame(C = df2$C, newD = unname(Lookup[as.character(round(df2$C))]))
df3$newD[is.na(df3$newD)] = NaN
For these types of merges I like SQL:
library(sqldf)
res <- sqldf("SELECT l.C, r.B
              FROM df2 AS l
              LEFT JOIN df1 AS r
              ON round(l.C) = round(r.A)")
res
# C B
#1 91.12 1
#2 92.34 3
#3 93.65 4
#4 94.23 4
#5 92.14 3
#6 96.98 7
#7 97.22 7
#8 98.11 8
#9 93.15 11
#10 100.67 NA
#11 91.25 1
#12 96.45 6
#13 83.78 NA
#14 84.66 NA
#15 100.00 2
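The question asks for NaN rather than NA in unmatched rows; if that distinction matters, one extra line converts them:
res$B[is.na(res$B)] <- NaN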

Remove group from data.frame if at least one group member meets condition

I have a data.frame where I'd like to remove entire groups if any of their members meets a condition.
In this first example, where the values are numeric and the condition is NA, the code below works.
df <- structure(list(world = c(1, 2, 3, 3, 2, NA, 1, 2, 3, 2), place = c(1,
1, 2, 2, 3, 3, 1, 2, 3, 1), group = c(1, 1, 1, 2, 2, 2, 3,
3, 3, 3)), .Names = c("world", "place", "group"), row.names = c(NA,
-10L), class = "data.frame")
library(plyr)
ans <- ddply(df, .(group), summarize, code = mean(world))
ans$code[is.na(ans$code)] <- 0
ans2 <- merge(df,ans)
final.ans <- ans2[ans2$code !=0,]
However, this ddply maneuver with the NA values will not work if the condition is something other than NA, or if the values are non-numeric.
For example, if I wanted to remove groups which have one or more rows with a world value of AF (as in the data frame below), this ddply trick would not work.
df2 <-structure(list(world = structure(c(1L, 2L, 3L, 3L, 3L, 5L, 1L,
4L, 2L, 4L), .Label = c("AB", "AC", "AD", "AE", "AF"), class = "factor"),
place = c(1, 1, 2, 2, 3, 3, 1, 2, 3, 1), group = c(1,
1, 1, 2, 2, 2, 3, 3, 3, 3)), .Names = c("world", "place",
"group"), row.names = c(NA, -10L), class = "data.frame")
I can envision a for-loop where for each group the value of each member is checked, and if the condition is met a code column is populated, and then a subset could be made based on that code.
But perhaps there is a vectorized, R way to do this?
Using dplyr:
library(dplyr)
df2 %>%
  group_by(group) %>%
  filter(!any(world == "AF"))
Or, as mentioned by @akrun, using data.table:
library(data.table)
setDT(df2)[, if(!any(world == "AF")) .SD, group]
#or
setDT(df2)[, if(all(world != "AF")) .SD, group]
Which gives:
#Source: local data frame [7 x 3]
#Groups: group
#
# world place group
#1 AB 1 1
#2 AC 1 1
#3 AD 2 1
#4 AB 1 3
#5 AE 2 3
#6 AC 3 3
#7 AE 1 3
Alternate data.table solution:
setDT(df2)
df2[!(group %in% df2[world == "AF",group])]
gives:
world place group
1: AB 1 1
2: AC 1 1
3: AD 2 1
4: AB 1 3
5: AE 2 3
6: AC 3 3
7: AE 1 3
Using keys we can be a bit faster:
setkey(df2,group)
df2[!J((df2[world == "AF",group]))]
base package (using %in%, so it also works when more than one row matches 'AF'):
df2[ !(df2$group %in% df2[ df2$world == 'AF', "group" ]), ]
Output:
world place group
1 AB 1 1
2 AC 1 1
3 AD 2 1
7 AB 1 3
8 AE 2 3
9 AC 3 3
10 AE 1 3
Using sqldf:
library(sqldf)
sqldf("SELECT df2.world, df2.place, [group] FROM df2
LEFT JOIN
(SELECT * FROM df2 WHERE world LIKE 'AF') AS t
USING([group])
WHERE t.world IS NULL")
Output:
world place group
1 AB 1 1
2 AC 1 1
3 AD 2 1
4 AB 1 3
5 AE 2 3
6 AC 3 3
7 AE 1 3
Base R option using ave
df2[with(df2, ave(world != "AF", group, FUN = all)),]
# world place group
#1 AB 1 1
#2 AC 1 1
#3 AD 2 1
#7 AB 1 3
#8 AE 2 3
#9 AC 3 3
#10 AE 1 3
Or we can also use subset
subset(df2, ave(world != "AF", group, FUN = all))
The above can also be written as
df2[with(df2, !ave(world == "AF", group, FUN = any)),]
and
subset(df2, !ave(world == "AF", group, FUN = any))

How to get row from R data.frame

I have a data.frame with column headers.
How can I get a specific row from the data.frame as a list (with the column headers as keys for the list)?
Specifically, my data.frame is
A B C
1 5 4.25 4.5
2 3.5 4 2.5
3 3.25 4 4
4 4.25 4.5 2.25
5 1.5 4.5 3
And I want to get a row that's the equivalent of
> c(a=5, b=4.25, c=4.5)
a b c
5.0 4.25 4.5
x[r,]
where r is the row you're interested in. Try this, for example:
#Add your data
x <- structure(list(A = c(5, 3.5, 3.25, 4.25, 1.5 ),
B = c(4.25, 4, 4, 4.5, 4.5 ),
C = c(4.5, 2.5, 4, 2.25, 3 )
),
.Names = c("A", "B", "C"),
class = "data.frame",
row.names = c(NA, -5L)
)
#The vector your result should match
y<-c(A=5, B=4.25, C=4.5)
#Test that the items in the row match the vector you wanted
x[1,]==y
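If you need the row as an actual list (with the column headers as keys, as asked), as.list() on the one-row data frame gives exactly that:
as.list(x[1, ])
# $A
# [1] 5
#
# $B
# [1] 4.25
#
# $C
# [1] 4.5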
Logical indexing is very R-ish. Try:
x[ x$A ==5 & x$B==4.25 & x$C==4.5 , ]
Or:
subset( x, A ==5 & B==4.25 & C==4.5 )
Try:
> d <- data.frame(a=1:3, b=4:6, c=7:9)
> d
a b c
1 1 4 7
2 2 5 8
3 3 6 9
> d[1, ]
a b c
1 1 4 7
> d[1, ]['a']
a
1 1
If you don't know the row number but do know some of the values, you can use subset:
x <- structure(list(A = c(5, 3.5, 3.25, 4.25, 1.5 ),
B = c(4.25, 4, 4, 4.5, 4.5 ),
C = c(4.5, 2.5, 4, 2.25, 3 )
),
.Names = c("A", "B", "C"),
class = "data.frame",
row.names = c(NA, -5L)
)
subset(x, A ==5 & B==4.25 & C==4.5)
Ten years later: using the tidyverse we can achieve this simply, borrowing a leaf from Christopher Bottoms. For a better grasp, see slice().
library(tidyverse)
x <- structure(list(A = c(5, 3.5, 3.25, 4.25, 1.5 ),
B = c(4.25, 4, 4, 4.5, 4.5 ),
C = c(4.5, 2.5, 4, 2.25, 3 )
),
.Names = c("A", "B", "C"),
class = "data.frame",
row.names = c(NA, -5L)
)
x
#> A B C
#> 1 5.00 4.25 4.50
#> 2 3.50 4.00 2.50
#> 3 3.25 4.00 4.00
#> 4 4.25 4.50 2.25
#> 5 1.50 4.50 3.00
y<-c(A=5, B=4.25, C=4.5)
y
#> A B C
#> 5.00 4.25 4.50
#The slice() verb allows one to subset data row-wise.
x <- x %>% slice(1) #(n) for the nth row, or (i:n) for range i to n, (i:n()) for i to last row...
x
#> A B C
#> 1 5 4.25 4.5
#Test that the items in the row match the vector you wanted
x[1,]==y
#> A B C
#> 1 TRUE TRUE TRUE
Created on 2020-08-06 by the reprex package (v0.3.0)
