Keep all the data.table when aggregating a data.table - r

I would like to aggregate a data.table by a list of columns and keep all the columns at the end.
A <- c(1,2,3,4,4,6,4)
B <- c("a","b","c","d","e","f","g")
C <- c(10,11,23,8,8,1,3)
D <- c(2,3,5,9,7,8,4)
dt <- data.table(A,B,C,D)
Now I want to aggregate column B (e.g. with paste(B, collapse=";")) by A and C, and keep column D at the end too. Do you know a way to do it, please?
EDIT
This is what I obtained using dt[, newCol := toString(B), .(A, C)]
A B C D newCol
1: 1 a 10 2 a
2: 2 b 11 3 b
3: 3 c 23 5 c
4: 4 d 8 9 d, e
5: 4 e 8 7 d, e
6: 6 f 1 8 f
7: 4 g 3 4 g
But I would like to obtain
A B C D newCol
1: 1 a 10 2 a
2: 2 b 11 3 b
3: 3 c 23 5 c
4: 4 d 8 9 d, e
6: 6 f 1 8 f
7: 4 g 3 4 g

Related

Is there an R function to merge two data frames based on two columns separately matching to the same column?

I would like to populate two values from "VAL" ("VALA", "VALB"), each based on one of two columns separately.
# Data
DF1 <- data.frame("colA" = rep(c("A","B"), 6),
"colB" = rep(c("C","D","E"), 4))
DF2 <- data.frame("colC" = c("A","B","C","D","E"),
"VAL" = 1:5)
# three join calls
tmp1 <- left_join(DF1, DF2, by=c("colA"="colC"))
names(tmp1)[3] <- "VALA"
tmp2 <- left_join(DF1, DF2, by=c("colB"="colC"))
names(tmp2)[3] <- "VALB"
left_join(tmp1, tmp2, by=c("colA", "colB"))
# colA colB VALA VALB
# 1 A C 1 3
# 2 A C 1 3
# 3 B D 2 4
# 4 B D 2 4
# 5 A E 1 5
# 6 A E 1 5
# 7 B C 2 3
# 8 B C 2 3
# 9 A D 1 4
# 10 A D 1 4
# 11 B E 2 5
# 12 B E 2 5
# 13 A C 1 3
# 14 A C 1 3
# 15 B D 2 4
# 16 B D 2 4
# 17 A E 1 5
# 18 A E 1 5
# 19 B C 2 3
# 20 B C 2 3
# 21 A D 1 4
# 22 A D 1 4
# 23 B E 2 5
# 24 B E 2 5
Why does the last operation give 24 rows as output instead of expected 12?
Is there any possibility to achieve the same expected output in a more elegant way (instead of 3 join operations)?
You can use match to find the corresponding value and cbind the resulting columns.
cbind(DF1, VALA=DF2$VAL[match(DF1$colA, DF2$colC)],
VALB=DF2$VAL[match(DF1$colB, DF2$colC)])
colA colB VALA VALB
#1 A C 1 3
#2 B D 2 4
#3 A E 1 5
#4 B C 2 3
#5 A D 1 4
#6 B E 2 5
#7 A C 1 3
#8 B D 2 4
#9 A E 1 5
#10 B C 2 3
#11 A D 1 4
#12 B E 2 5
or use names:
# Named lookup vector: names are the keys (colC), values are VAL.
x <- setNames(DF2$VAL, DF2$colC)
# Index by name via as.character(): if the columns are factors, x[factor]
# would index by the integer level codes instead of the labels.
cbind(DF1, VALA=x[as.character(DF1$colA)], VALB=x[as.character(DF1$colB)])
and, in the case of many columns, use match inside lapply:
cbind(DF1, setNames(lapply(DF1, function(x) DF2$VAL[match(x, DF2$colC)]),
sub("col", "VAL", names(DF1))))
# colA colB VALA VALB
#1 A C 1 3
#2 B D 2 4
#3 A E 1 5
#4 B C 2 3
#5 A D 1 4
#6 B E 2 5
#7 A C 1 3
#8 B D 2 4
#9 A E 1 5
#10 B C 2 3
#11 A D 1 4
#12 B E 2 5
Try chaining the left_join calls one after another using %>% and define the suffixes.
DF1 <- DF1 %>%
left_join(DF2, c("colA" = "colC")) %>%
left_join(DF2, c("colB" = "colC"),
suffix = c ("A", "B"))
> DF1
colA colB VALA VALB
1 A C 1 3
2 B D 2 4
3 A E 1 5
4 B C 2 3
5 A D 1 4
6 B E 2 5
7 A C 1 3
8 B D 2 4
9 A E 1 5
10 B C 2 3
11 A D 1 4
12 B E 2 5

R: reshape a data frame when more than 2 dimensions

I am trying to cast a data frame into another one, see below for the examples:
> start = data.frame(Aa = c('A','A','A','A','a','a','a','a'),Bb = c('B','B','b','b','B','B','b','b'),Cc = c('C','c','C','c','C','c','C','c'),v=c(1,2,3,4,5,6,7,8))
> start
Aa Bb Cc v
1 A B C 1
2 A B c 2
3 A b C 3
4 A b c 4
5 a B C 5
6 a B c 6
7 a b C 7
8 a b c 8
And I would like to have a data frame like this one:
1 A B 3
2 A b 7
3 a B 11
4 a b 15
5 B C 6
6 B c 8
7 b C 10
8 b c 12
9 A C 4
10 A c 6
11 a C 12
12 a c 14
Where line 1 is calculated because we have A-B-C -> 1 and A-B-c -> 2 so A-B -> 3
The fact is that I can imagine a solution with some for loops on the columns, but I need it to be time efficient: I can have 100,000 rows and up to 100 columns, so I need something fast, and I don't think that for loops are really efficient in R.
Do you have any ideas?
Thank you!
Perhaps you can use combn on the column names.
Here, I've used data.table for its efficient aggregation and for the convenience of rbindlist to put the data back together.
library(data.table)
setDT(start)
rbindlist(combn(names(start)[1:3], 2, FUN = function(x) {
start[, sum(v), x]
}, simplify = FALSE))
# Aa Bb V1
# 1: A B 3
# 2: A b 7
# 3: a B 11
# 4: a b 15
# 5: A C 4
# 6: A c 6
# 7: a C 12
# 8: a c 14
# 9: B C 6
# 10: B c 8
# 11: b C 10
# 12: b c 12

Counting how many times an element occurs in the column of a data.frame

Let's say I have a data.frame with a factor.
d = data.frame(f = c("a","a","a","b","b","b","b","d","d"))
f
1 a
2 a
3 a
4 b
5 b
6 b
7 b
8 d
9 d
And I want to add a column telling me how many times an element occurs.
Like this
f n
1 a 3
2 a 3
3 a 3
4 b 4
5 b 4
6 b 4
7 b 4
8 d 2
9 d 2
How would I do this?
Can also use some plyr functions - join & ddply
d <- data.frame(f = c("a","a","a","b","b","b","b","d","d"))
d2 <- join(d, ddply(d, .(f), 'nrow'))
d2
f nrow
1 a 3
2 a 3
3 a 3
4 b 4
5 b 4
6 b 4
7 b 4
8 d 2
9 d 2
You can use table like this:
d$n <- table(d$f)[d$f]
# f n
#1 a 3
#2 a 3
#3 a 3
#4 b 4
#5 b 4
#6 b 4
#7 b 4
#8 d 2
#9 d 2
You can use ave and length:
> d$n <- as.numeric(ave(as.character(d$f), d$f, FUN = length))
> d
f n
1 a 3
2 a 3
3 a 3
4 b 4
5 b 4
6 b 4
7 b 4
8 d 2
9 d 2
With the "data.table" package, you might do something like:
library(data.table)
D <- data.table(d)
D[, n := as.numeric(.N), by = f]

Multiple joins/merges with data.tables

I have two data.tables, DT and L:
> DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9,key="x")
> L=data.table(yv=c(1L:8L,12L),lu=c(letters[8:1],letters[12]),key="yv")
> DT
x y v
1: a 1 1
2: a 3 2
3: a 6 3
4: b 1 4
5: b 3 5
6: b 6 6
7: c 1 7
8: c 3 8
9: c 6 9
> L
yv lu
1: 1 h
2: 2 g
3: 3 f
4: 4 e
5: 5 d
6: 6 c
7: 7 b
8: 8 a
9: 12 l
I would like to independently look up the corresponding value of lu from L for column y and for column v in DT. The following syntax provides the correct result, but is cumbersome to generate and then understand at a glance later:
> L[setkey(L[setkey(DT,y)],v)][,list(x,y=yv.1,v=yv,lu.1=lu.1,lu.2=lu)]
x y v lu.1 lu.2
1: a 1 1 h h
2: a 2 3 g f
3: a 3 6 f c
4: b 4 1 e h
5: b 5 3 d f
6: b 6 6 c c
7: c 7 1 b h
8: c 8 3 a f
9: c 9 6 NA c
(Edit: original post had L[setkey(L[setkey(DT,y)],v)][,list(x,y=yv,v=yv.1,lu.1=lu,lu.2=lu.1)] above, which incorrectly mixed up the y and v columns and looked up values.)
In SQL this would be simple/straightforward:
SELECT DT.*, L1.lu AS lu1, L2.lu AS lu2
FROM DT
LEFT JOIN L AS L1 ON DT.y = L1.yv
LEFT JOIN L AS L2 ON DT.v = L2.yv
Is there a more elegant way to use data.table to perform multiple joins? Note that I'm joining one table to another table twice in this example, but I am also interested in joining one table to multiple different tables.
Great question. One trick is that i doesn't have to be keyed. Only x must be keyed.
There might be better ways. How about this:
> cbind( L[DT[,list(y)]], L[DT[,list(v)]], DT )
yv lu yv lu x y v
1: 1 h 1 h a 1 1
2: 3 f 2 g a 3 2
3: 6 c 3 f a 6 3
4: 1 h 4 e b 1 4
5: 3 f 5 d b 3 5
6: 6 c 6 c b 6 6
7: 1 h 7 b c 1 7
8: 3 f 8 a c 3 8
9: 6 c 9 NA c 6 9
or, to illustrate, this is the same :
> cbind( L[J(DT$y)], L[J(DT$v)], DT )
yv lu yv lu x y v
1: 1 h 1 h a 1 1
2: 3 f 2 g a 3 2
3: 6 c 3 f a 6 3
4: 1 h 4 e b 1 4
5: 3 f 5 d b 3 5
6: 6 c 6 c b 6 6
7: 1 h 7 b c 1 7
8: 3 f 8 a c 3 8
9: 6 c 9 NA c 6 9
merge could also be used, if the following feature request was implemented :
FR#2033 Add by.x and by.y to merge.data.table

Alternative to expand.grid for data.frames

I have a data.frame df and I want that every row in this df is duplicated lengthTime times and that a new column is added that counts from 1 to lengthTime for each row in df.
I know, it sounds pretty complicated, but what I basically want is to apply expand.grid to df. Here is an ugly workaround and I have the feeling that there must be an easier solution (maybe even a base-R function?):
df <- data.frame(ID = rep(letters[1:3], each=3),
CatA = rep(1:3, times = 3),
CatB = letters[1:9])
lengthTime <- 3
nrRow <- nrow(df)
intDF <- df
for (i in 1:(lengthTime - 1)) {
df <- rbind(df, intDF)
}
df$Time <- rep(1:lengthTime, each=nrRow)
I thought that I could just use expand.grid(df, 1:lengthTime), but that does not work. outer did not bring any luck either. So does anyone know a good solution?
It's been a while since this question was posted, but I recently came across it looking for just the thing in the title, namely, an expand.grid that works for data frames. The posted answers address the OP's more specific question, so in case anyone is looking for a more general solution for data frames, here's a slightly more general approach:
# Cross join (Cartesian product) of any number of data frames.
# Each argument in `...` is a data frame; they are combined left to right,
# pairing every row of the accumulated result with every row of the next one.
expand.grid.df <- function(...) {
  frames <- list(...)
  # merge() with by = NULL has no key columns, so it returns all row pairs.
  cross_join <- function(lhs, rhs) merge(lhs, rhs, by = NULL)
  Reduce(cross_join, frames)
}
# For the example in the OP
expand.grid.df(df, data.frame(1:lengthTime))
# More generally
df1 <- data.frame(A=1:3, B=11:13)
df2 <- data.frame(C=51:52, D=c("Y", "N"))
df3 <- data.frame(E=c("+", "-"))
expand.grid.df(df1, df2, df3)
You can also just do a simple merge by NULL (which will cause merge to do simple combinatorial data replication):
merge(data.frame(time=1:lengthTime), iris, by=NULL)
Why not just something like df[rep(1:nrow(df),times = 3),] to extend the data frame, and then add the extra column just as you have above, with df$Time <- rep(1:lengthTime, each=nrRow)?
Quick update
There is now also the crossing() function in package tidyr which can be used instead of merge, is somewhat faster, and returns a tbl_df / tibble.
data.frame(time=1:10) %>% merge(iris, by=NULL)
data.frame(time=1:10) %>% tidyr::crossing(iris)
This works:
REP <- rep(1:nrow(df), 3)
df2 <- data.frame(df[REP, ], Time = rep(1:3, each = 9))
rownames(df2) <- NULL
df2
A data.table solution:
> library(data.table)
> ( df <- data.frame(ID = rep(letters[1:3], each=3),
+ CatA = rep(1:3, times = 3),
+ CatB = letters[1:9]) )
ID CatA CatB
1 a 1 a
2 a 2 b
3 a 3 c
4 b 1 d
5 b 2 e
6 b 3 f
7 c 1 g
8 c 2 h
9 c 3 i
> # Repeat every column 3 times, then label each copy of the original rows with Time = 1, 2, 3.
> ( DT <- data.table(df)[, lapply(.SD, function(x) rep(x,3))][, Time:=rep(1:3, each=nrow(df))] )
ID CatA CatB Time
1: a 1 a 1
2: a 2 b 1
3: a 3 c 1
4: b 1 d 1
5: b 2 e 1
6: b 3 f 1
7: c 1 g 1
8: c 2 h 1
9: c 3 i 1
10: a 1 a 2
11: a 2 b 2
12: a 3 c 2
13: b 1 d 2
14: b 2 e 2
15: b 3 f 2
16: c 1 g 2
17: c 2 h 2
18: c 3 i 2
19: a 1 a 3
20: a 2 b 3
21: a 3 c 3
22: b 1 d 3
23: b 2 e 3
24: b 3 f 3
25: c 1 g 3
26: c 2 h 3
27: c 3 i 3
Another one :
> library(data.table)
> ( df <- data.frame(ID = rep(letters[1:3], each=3),
+ CatA = rep(1:3, times = 3),
+ CatB = letters[1:9]) )
> DT <- data.table(df)
> rbindlist(lapply(1:3, function(i) cbind(DT, Time=i)))
ID CatA CatB Time
1: a 1 a 1
2: a 2 b 1
3: a 3 c 1
4: b 1 d 1
5: b 2 e 1
6: b 3 f 1
7: c 1 g 1
8: c 2 h 1
9: c 3 i 1
10: a 1 a 2
11: a 2 b 2
12: a 3 c 2
13: b 1 d 2
14: b 2 e 2
15: b 3 f 2
16: c 1 g 2
17: c 2 h 2
18: c 3 i 2
19: a 1 a 3
20: a 2 b 3
21: a 3 c 3
22: b 1 d 3
23: b 2 e 3
24: b 3 f 3
25: c 1 g 3
26: c 2 h 3
27: c 3 i 3

Resources