I have the following data.table:
df <- data.table(
  id = c(rep(1, 6), rep(2, 6), rep(3, 6)),
  grp = c(rep("x", 6), rep("y", 6), rep("z", 6)),
  val1 = 1:18,
  val2 = 13:30
)
I want to apply two different functions depending on a row condition. For example:
cols <- paste0("val",1:2)
df[id == 1, lapply(.SD, function(x) tail(x, 2)), .SDcols = cols, by = list(id, grp)]
df[id != 1, lapply(.SD, function(x) tail(x, 3)), .SDcols = cols, by = list(id, grp)]
I'm quite new to working with data.table, so maybe there is a more efficient way than carrying out separate calculations and then joining the two tables above.
If the conditions are disjoint, i.e., id == 1 and id != 1, and if id is also one of the grouping variables (in the by = clause), the two different functions can be applied with:
df[, lapply(.SD, function (x) if (first(id) == 1) tail(x, 2) else tail(x, 3)),
.SDcols = cols, by = .(id, grp)]
   id grp val1 val2
1:  1   x    5   17
2:  1   x    6   18
3:  2   y   10   22
4:  2   y   11   23
5:  2   y   12   24
6:  3   z   16   28
7:  3   z   17   29
8:  3   z   18   30
So, subsetting is not by row but by grouping variable, and it has been moved into the anonymous function definition within lapply(). This avoids having to rbind() the subsets afterwards.
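To convince yourself of the equivalence, the grouped result can be compared against the two-step rbind approach directly (a quick sketch using the objects defined above):

res_grouped <- df[, lapply(.SD, function(x) if (first(id) == 1) tail(x, 2) else tail(x, 3)),
                  .SDcols = cols, by = .(id, grp)]
res_rbind <- rbindlist(list(
  df[id == 1, lapply(.SD, tail, 2), .SDcols = cols, by = .(id, grp)],
  df[id != 1, lapply(.SD, tail, 3), .SDcols = cols, by = .(id, grp)]
))
# data.table's all.equal method ignores internal attributes;
# TRUE here, since the group order coincides as well
all.equal(res_grouped, res_rbind)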
For the sake of completeness: in the particular case of the tail() function being called with different parameters, we can write more concisely:
df[, lapply(.SD, tail, n = fifelse(first(id) == 1, 2, 3)),
.SDcols = cols, by = .(id, grp)]
Here is another option:
df[.N:1L, ri := rowid(id, grp)]
rbindlist(list(
df[id == 1L & ri <= 2L], # equivalent to df[id == 1L, tail(.SD, 2L), .(id, grp), .SDcols = cols]
df[id != 1L & ri <= 3L]  # equivalent to df[id != 1L, tail(.SD, 3L), .(id, grp), .SDcols = cols]
))
Output:
   id grp val1 val2 ri
1:  1   x    5   17  2
2:  1   x    6   18  1
3:  2   y   10   22  3
4:  2   y   11   23  2
5:  2   y   12   24  1
6:  3   z   16   28  3
7:  3   z   17   29  2
8:  3   z   18   30  1
Would be interested to know the size of your dataset and the speedup.
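For what it's worth, here is a benchmark sketch along those lines; the data size is invented for illustration, not taken from the question:

library(data.table)
library(microbenchmark)

n <- 1e4  # number of ids, illustrative only
big <- data.table(id   = rep(seq_len(n), each = 6),
                  grp  = rep(rep(letters[1:4], length.out = n), each = 6),
                  val1 = seq_len(6 * n),
                  val2 = seq_len(6 * n))
cols <- c("val1", "val2")

microbenchmark(
  grouped = big[, lapply(.SD, tail, n = fifelse(first(id) == 1, 2, 3)),
                .SDcols = cols, by = .(id, grp)],
  rowid   = {
    big[.N:1L, ri := rowid(id, grp)]
    rbindlist(list(big[id == 1L & ri <= 2L], big[id != 1L & ri <= 3L]))
  },
  times = 10L
)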
Example:
data.table(x=1:3, y=4:6)
I want to insert a new column whose values would be:
z = (2*5 + 3*6, 3*6, NA) = (28, 18, NA)
i.e., for each row, the sum of the products x*y over all following rows.
I tried to write this function first, but it doesn't work:
sumprod <- function(x, y){
  z = vector()
  for (i in 1:length(x)-1){
    z = c(z, sum(shift(x, n=i+1, type="lag")*shift(y, n=i+1, type="lag"), na.rm=FALSE))
  }
  return(z)
}
We may do
library(data.table)
dt1[, z := rev(cumsum(rev(Reduce(`*`, shift(.SD, type = "lead", fill = 0)))))]
dt1[z == 0, z := NA_real_]
Output:
> dt1
   x y  z
1: 1 4 28
2: 2 5 18
3: 3 6 NA
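To see what the expression computes, inspect the intermediate values step by step (a sketch; the vectors in the comments are the values for this data):

shift(dt1[, .(x, y)], type = "lead", fill = 0)               # list(c(2, 3, 0), c(5, 6, 0))
Reduce(`*`, shift(dt1[, .(x, y)], type = "lead", fill = 0))  # c(10, 18, 0)
rev(cumsum(rev(c(10, 18, 0))))                               # c(28, 18, 0); the 0 is then set to NA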
Or with fcumsum
library(collapse)
dt1[, z := fcumsum(shift(do.call(`*`, .SD), type = "lead")[.N:1])[.N:1]]
data
dt1 <- data.table(x=1:3, y=4:6)
You could use Reduce() with the right = TRUE argument (accumulating from right to left):
dt1[, z := shift(Reduce(`+`, x*y, accumulate = TRUE, right = TRUE), -1)][]
       x     y     z
   <int> <int> <int>
1:     1     4    28
2:     2     5    18
3:     3     6    NA
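Step by step, this is what that pipeline computes (the intermediate names are mine):

xy <- dt1$x * dt1$y                                            # c(4, 10, 18)
cum_right <- Reduce(`+`, xy, accumulate = TRUE, right = TRUE)  # c(32, 28, 18)
shift(cum_right, -1)                                           # c(28, 18, NA): lead by one row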
Using my example below, how can I rank multiple columns using different orders? For example, rank y as descending and z as ascending.
require(data.table)
dt <- data.table(x = c(rep("a", 5), rep("b", 5)),
                 y = abs(rnorm(10)) * 10, z = abs(rnorm(10)) * 10)
cols <- c("y", "z")
dt[, paste0("rank_", cols) := lapply(.SD, function(x) frankv(x, ties.method = "min")), .SDcols = cols, by = .(x)]
data.table's frank() function has some useful features which aren't available in base R's rank() function (see ?frank). E.g., we can reverse the order of the ranking by prepending the variable with a minus sign:
library(data.table)
# create reproducible data
set.seed(1L)
dt <- data.table(x = c(rep("a", 5), rep("b", 5)),
                 y = abs(rnorm(10)) * 10, z = abs(rnorm(10)) * 10)
# rank y descending, z ascending
dt[, rank_y := frank(-y), x][, rank_z := frank(z), x][]
     x         y          z rank_y rank_z
 1:  a  6.264538 15.1178117      3      4
 2:  a  1.836433  3.8984324      5      1
 3:  a  8.356286  6.2124058      2      2
 4:  a 15.952808 22.1469989      1      5
 5:  a  3.295078 11.2493092      4      3
 6:  b  8.204684  0.4493361      1      2
 7:  b  4.874291  0.1619026      4      1
 8:  b  7.383247  9.4383621      2      5
 9:  b  5.757814  8.2122120      3      4
10:  b  3.053884  5.9390132      5      3
If there are many columns which are to be ranked individually, some descending, some ascending, we can do this in two steps:
# first rank all columns in descending order
cols_desc <- c("y")
dt[, paste0("rank_", cols_desc) := lapply(.SD, frankv, ties.method = "min", order = -1L),
.SDcols = cols_desc, by = x][]
# then rank all columns in ascending order
cols_asc <- c("z")
dt[, paste0("rank_", cols_asc) := lapply(.SD, frankv, ties.method = "min", order = +1L),
.SDcols = cols_asc, by = x][]
     x         y          z rank_y rank_z
 1:  a  6.264538 15.1178117      3      4
 2:  a  1.836433  3.8984324      5      1
 3:  a  8.356286  6.2124058      2      2
 4:  a 15.952808 22.1469989      1      5
 5:  a  3.295078 11.2493092      4      3
 6:  b  8.204684  0.4493361      1      2
 7:  b  4.874291  0.1619026      4      1
 8:  b  7.383247  9.4383621      2      5
 9:  b  5.757814  8.2122120      3      4
10:  b  3.053884  5.9390132      5      3
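If the mix of directions is known up front, one alternative (my sketch, not from the original answer) is to drive both steps from a single named vector of orders:

# +1L = ascending, -1L = descending; names are the columns to rank
ord <- c(y = -1L, z = +1L)
for (col in names(ord)) {
  dt[, paste0("rank_", col) := frankv(.SD[[1L]], ties.method = "min", order = ord[[col]]),
     .SDcols = col, by = x]
}
dt[]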
I have two vectors with common and repeated elements, and I want a table comparing the frequency of the common elements in both vectors. Here is a subset:
plyr::count(V1)
        x freq
1 A*02:01  106
2 A*02:02   88
3 A*03:01   95
4 A*03:02   60
plyr::count(V2)
        x freq
1 A*02:01   11
2 A*02:02   11
3 A*02:04    1
4 A*03:01   20
The output I want is:
        x freq.V1 freq.V2
1 A*02:01     106      11
2 A*02:02      88      11
3 A*03:01      95      20
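The vectors themselves are not shown in the question; for a reproducible test, vectors with matching counts can be simulated (this construction is mine):

V1 <- rep(c("A*02:01", "A*02:02", "A*03:01", "A*03:02"), times = c(106, 88, 95, 60))
V2 <- rep(c("A*02:01", "A*02:02", "A*02:04", "A*03:01"), times = c(11, 11, 1, 20))
# note: different lengths; fine for count()/table(), but the paired
# data.table construction further below assumes equal-length vectors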
I think merge seems a good choice here, as the default is to keep only observations common to both datasets. So the following should work:
merge(plyr::count(V1), plyr::count(V2), by="x")
Worked example:
plyr::count(mtcars$gear)
#   x freq
# 1 3   15
# 2 4   12
# 3 5    5
plyr::count(mtcars$gear[1:10])
#   x freq
# 1 3    4
# 2 4    6
merge(
  plyr::count(mtcars$gear),
  plyr::count(mtcars$gear[1:10]),
  by = "x")
#   x freq.x freq.y
# 1 3     15      4
# 2 4     12      6
Just use table:
tbl1 <- table(V1[V1 %in% (int <- intersect(unique(V1), unique(V2)))])
tbl2 <- table(V2[V2 %in% int])
data.frame(x = names(tbl1), freq.V1 = as.vector(tbl1), freq.V2 = as.vector(tbl2))
Or my favorite, data.table:
library(data.table)
DT <- data.table(V1 = V1, V2 = V2)
DT[V1 %in% unique(V2), .(freq.V1 = .N), by = .(x = V1)
][DT[V2 %in% unique(V1), .N, by = .(x = V2)],
freq.V2 := i.N, on = "x", nomatch = 0L]
Of course both options look much simpler if you know beforehand that V1 and V2 consist of the same set of elements:
data.frame(x = names(tbl1 <- table(V1)), freq.V1 = as.vector(tbl1),
           freq.V2 = as.vector(table(V2)))
and
DT[ , .(freq.V1 = .N), by = .(x = V1)
][DT[ , .(freq.V2 = .N), by = .(x = V2)], on = "x"]
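A minimal usage sketch of that last variant with simulated equal-length vectors (the data are mine, assuming V1 and V2 draw from the same set of elements):

library(data.table)
set.seed(42L)
V1 <- sample(c("a", "b", "c"), 20, replace = TRUE)
V2 <- sample(c("a", "b", "c"), 20, replace = TRUE)
DT <- data.table(V1, V2)
DT[ , .(freq.V1 = .N), by = .(x = V1)
  ][DT[ , .(freq.V2 = .N), by = .(x = V2)], on = "x"]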
I have a data set with individuals (ID) that can be part of more than one group.
Example:
library(data.table)
DT <- data.table(
  ID = rep(1:5, c(3:1, 2:3)),
  Group = c("A", "B", "C", "B",
            "C", "A", "A", "C",
            "A", "B", "C")
)
DT
#     ID Group
#  1:  1     A
#  2:  1     B
#  3:  1     C
#  4:  2     B
#  5:  2     C
#  6:  3     A
#  7:  4     A
#  8:  4     C
#  9:  5     A
# 10:  5     B
# 11:  5     C
I want to know the number of individuals shared by each pair of groups.
The result should look like this:
Group.1 Group.2 Sum
      A       B   2
      A       C   3
      B       C   3
Where Sum indicates the number of individuals the two groups have in common.
Here's my version:
# size-1 IDs can't contribute; skip
DT[ , if (.N > 1)
# simplify = FALSE returns a list;
# transpose turns the 3-length list of 2-length vectors
# into a length-2 list of 3-length vectors (efficiently)
transpose(combn(Group, 2L, simplify = FALSE)), by = ID
][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]
With output:
#    Group.1 Group.2 Sum
# 1:       A       B   2
# 2:       A       C   3
# 3:       B       C   3
As of version 1.9.8 (on CRAN 25 Nov 2016), data.table has gained the ability to do non-equi joins. So, a self non-equi join can be used:
library(data.table) # v1.9.8+
setDT(DT)[, Group:= factor(Group)]
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)][
, .N, by = .(x.Group, i.Group)]
   x.Group i.Group N
1:       A       B 2
2:       A       C 3
3:       B       C 3
Explanation
The non-equi join on ID, Group < Group is a data.table version of combn() (but applied group-wise):
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)]
   ID x.Group i.Group
1:  1       A       B
2:  1       A       C
3:  1       B       C
4:  2       B       C
5:  4       A       C
6:  5       A       B
7:  5       A       C
8:  5       B       C
We self-join the dataset with itself on 'ID', subset the rows where the 'Group' columns differ, count the rows (.N) grouped by the 'Group' columns, then sort the 'Group.1' and 'Group.2' columns within each row using pmin/pmax and take the unique value of 'N'.
library(data.table) # v1.9.6+
DT[DT, on = 'ID', allow.cartesian = TRUE][Group != i.Group, .N, .(Group, i.Group)][,
   list(Sum = unique(N)), .(Group.1 = pmin(Group, i.Group), Group.2 = pmax(Group, i.Group))]
#    Group.1 Group.2 Sum
# 1:       A       B   2
# 2:       A       C   3
# 3:       B       C   3
Or, as mentioned in the comments by @MichaelChirico and @Frank, we can convert 'Group' to factor class, subset the rows where as.integer(Group) < as.integer(i.Group), group by 'Group' and 'i.Group', and count the rows (.N):
DT[, Group:= factor(Group)]
DT[DT, on = 'ID', allow.cartesian = TRUE][as.integer(Group) < as.integer(i.Group), .N,
   by = .(Group.1 = Group, Group.2 = i.Group)]
Great answers above. Just an alternative using dplyr, in case you or someone else is interested.
library(dplyr)
cmb = combn(unique(DT$Group), 2)
data.frame(g1 = cmb[1, ],
           g2 = cmb[2, ]) %>%
  group_by(g1, g2) %>%
  summarise(l = length(intersect(DT[DT$Group == g1, ]$ID,
                                 DT[DT$Group == g2, ]$ID)))
#       g1     g2     l
#   (fctr) (fctr) (int)
# 1      A      B     2
# 2      A      C     3
# 3      B      C     3
Yet another solution (base R):
tmp <- split(DT, DT$Group)
ans <- apply(combn(LETTERS[1:3], 2), 2, FUN = function(ind) {
  out <- length(intersect(tmp[[ind[1]]]$ID, tmp[[ind[2]]]$ID))
  c(group1 = ind[1], group2 = ind[2], sum_ = out)
})
data.frame(t(ans))
#   group1 group2 sum_
# 1      A      B    2
# 2      A      C    3
# 3      B      C    3
First split the data into a list of groups; then, for each unique pairwise combination of two groups, count how many subjects they have in common, using length(intersect(...)).
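A slightly more general variant of the same idea that avoids hard-coding the group labels (my adaptation):

grps  <- sort(unique(as.character(DT$Group)))
pairs <- combn(grps, 2)
data.frame(
  Group.1 = pairs[1, ],
  Group.2 = pairs[2, ],
  Sum = apply(pairs, 2, function(p)
    length(intersect(DT$ID[DT$Group == p[1]], DT$ID[DT$Group == p[2]])))
)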
I would like to append columns to my data.frame in R that contain row sums and products.
Consider the following data frame:
x y z
1 2 3
2 3 4
5 1 2
I want to get the following:
x y z sum prod
1 2 3   6    6
2 3 4   9   24
5 1 2   8   10
I have tried:
sum = apply(ages,1,add)
but it gives me a row vector. Can someone please show me an efficient way to compute row sums and products and append them to the original data frame as shown above?
Try
transform(df, sum=rowSums(df), prod=x*y*z)
#   x y z sum prod
# 1 1 2 3   6    6
# 2 2 3 4   9   24
# 3 5 1 2   8   10
Or
transform(df, sum=rowSums(df), prod=Reduce(`*`, df))
#   x y z sum prod
# 1 1 2 3   6    6
# 2 2 3 4   9   24
# 3 5 1 2   8   10
Another option would be to use rowProds from matrixStats
library(matrixStats)
transform(df, sum=rowSums(df), prod=rowProds(as.matrix(df)))
If you are using apply:
df[,c('sum', 'prod')] <- t(apply(df, 1, FUN=function(x) c(sum(x), prod(x))))
df
#   x y z sum prod
# 1 1 2 3   6    6
# 2 2 3 4   9   24
# 3 5 1 2   8   10
Another approach.
require(data.table)
# Create data
dt <- data.table(x = c(1,2,5), y = c(2,3,1), z = c(3,4,2))
# Create index
dt[, i := .I]
# Compute sum and prod
dt[, sum := sum(x, y, z), by = i]
dt[, prod := prod(x, y, z), by = i]
dt
# Compute sum and prod using .SD
dt[, c("sum", "prod") := NULL]
dt
dt[, sum := sum(.SD), by = i, .SDcols = c("x", "y", "z")]
dt[, prod := prod(.SD), by = i, .SDcols = c("x", "y", "z")]
dt
# Compute sum and prod using .SD and list
dt[, c("sum", "prod") := NULL]
dt
dt[, c("sum", "prod") := list(sum(.SD), prod(.SD)), by = i,
.SDcols = c("x", "y", "z")]
dt
# Compute sum and prod using .SD and lapply
dt[, c("sum", "prod") := NULL]
dt
dt[, c("sum", "prod") := lapply(list(sum, prod), do.call, .SD), by = i,
.SDcols = c("x", "y", "z")]
dt
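As an aside, the row-by-row grouping (by = i) can be avoided entirely, since both quantities can be computed in a vectorized way; a sketch of that variant:

# Vectorized: no per-row grouping needed
dt[, c("sum", "prod") := list(rowSums(.SD), Reduce(`*`, .SD)),
   .SDcols = c("x", "y", "z")]
dt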
The following can also be done, but the column names need to be entered explicitly:
ddf$sum = with(ddf, x+y+z)
ddf$prod = with(ddf, x*y*z)
ddf
  x y z sum prod
1 1 2 3   6    6
2 2 3 4   9   24
3 5 1 2   8   10
With data.table, another form can be:
library(data.table)
cbind(dt, dt[, list(sum = x + y + z, product = x*y*z)])
   x y z sum product
1: 1 2 3   6       6
2: 2 3 4   9      24
3: 5 1 2   8      10
A simpler version is suggested by @David Arenberg in the comments:
dt[, ":="(sum = x+y+z, product = x*y*z)]
Only a partial answer, but if all values are greater than or equal to 0, rowSums() can be used to calculate products:
df <- data.frame(x = c(1, 2, 5), y = c(2, 3, 1), z = c(3, 4, 2))
# custom row-product-function
my_rowprod <- function(x) exp(rowSums(log(x)))
df$prod <- my_rowprod(df)
df
The generic version is (including negatives):
my_rowprod_2 <- function(x) {
  sign <- ifelse((rowSums(x < 0) %% 2) == 1, -1, 1)
  prod <- exp(rowSums(log(abs(x)))) * sign
  prod
}
df$prod <- my_rowprod_2(df[c("x", "y", "z")]) # exclude the prod column added above
df
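A quick sanity check of the generic version against a direct apply() product (the test frame is mine):

df2 <- data.frame(x = c(-1, 2, 0), y = c(3, -4, 5))
all.equal(my_rowprod_2(df2), apply(df2, 1, prod))  # TRUE: products are c(-3, -8, 0)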