Frequencies data table multiple columns - r

I have a data table like this
require(data.table)
dt <- data.table(a= c("a","a","b","b","b"), b= c("a","a","c","c","e"), c=c("d","d","b","b","b"))
I want to count frequencies from all the columns. I know how to do it one by one, but I want to do it in one instruction because my data has a lot of columns.
Result must be this one:
dt[,a1:=.N, by = c("a")]
dt[,a2:=.N, by = c("b")]
dt[,a3:=.N, by = c("c")]

require(data.table)
dt <- data.table(a= c("a","a","b","b","b"),
b= c("a","a","c","c","e"),
c=c("d","d","b","b","b"))
#dt
# a b c
#1: a a d
#2: a a d
#3: b c b
#4: b c b
#5: b e b
# For every column of dt, build a two-column data.table: the column itself
# plus x, the number of rows that share that column's value (.N per group).
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
l <- lapply(seq_along(colnames(dt)),
function(i) dt[, eval(colnames(dt)[i]), with = FALSE][, x := .N, by = eval(colnames(dt)[i])])
#l
#[[1]]
# a x
#1: a 2
#2: a 2
#3: b 3
#4: b 3
#5: b 3
#[[2]]
# b x
#1: a 2
#2: a 2
#3: c 2
#4: c 2
#5: e 1
#[[3]]
# c x
#1: d 2
#2: d 2
#3: b 3
#4: b 3
#5: b 3
df = as.data.frame(l)
# replacing alternate column names with concatenating "_count" to it
colnames(df)[seq(2,length(colnames(df)),2)]=
paste0(colnames(df)[seq(1,length(colnames(df)),2)],"_count")
#df
# a a_count b b_count c c_count
#1 a 2 a 2 d 2
#2 a 2 a 2 d 2
#3 b 3 c 2 b 3
#4 b 3 c 2 b 3
#5 b 3 e 1 b 3

Related

Is there an R function to merge two data frames based on two columns separately matching to the same column?

I would like to populate two value columns ("VALA", "VALB") from "VAL", each one looked up separately via one of two key columns.
# Data
DF1 <- data.frame("colA" = rep(c("A","B"), 6),
"colB" = rep(c("C","D","E"), 4))
DF2 <- data.frame("colC" = c("A","B","C","D","E"),
"VAL" = 1:5)
# three join calls
tmp1 <- left_join(DF1, DF2, by=c("colA"="colC"))
names(tmp1)[3] <- "VALA"
tmp2 <- left_join(DF1, DF2, by=c("colB"="colC"))
names(tmp2)[3] <- "VALB"
left_join(tmp1, tmp2, by=c("colA", "colB"))
# colA colB VALA VALB
# 1 A C 1 3
# 2 A C 1 3
# 3 B D 2 4
# 4 B D 2 4
# 5 A E 1 5
# 6 A E 1 5
# 7 B C 2 3
# 8 B C 2 3
# 9 A D 1 4
# 10 A D 1 4
# 11 B E 2 5
# 12 B E 2 5
# 13 A C 1 3
# 14 A C 1 3
# 15 B D 2 4
# 16 B D 2 4
# 17 A E 1 5
# 18 A E 1 5
# 19 B C 2 3
# 20 B C 2 3
# 21 A D 1 4
# 22 A D 1 4
# 23 B E 2 5
# 24 B E 2 5
Why does the last operation give 24 rows as output instead of expected 12?
Is there any possibility to achieve the same expected output in a more elegant way (instead of 3 join operations)?
You can use match to find the corresponding value and cbind the resulting columns.
cbind(DF1, VALA=DF2$VAL[match(DF1$colA, DF2$colC)],
VALB=DF2$VAL[match(DF1$colB, DF2$colC)])
colA colB VALA VALB
#1 A C 1 3
#2 B D 2 4
#3 A E 1 5
#4 B C 2 3
#5 A D 1 4
#6 B E 2 5
#7 A C 1 3
#8 B D 2 4
#9 A E 1 5
#10 B C 2 3
#11 A D 1 4
#12 B E 2 5
or use names:
x <- setNames(DF2$VAL, DF2$colC)
cbind(DF1, VALA=x[DF1$colA], VALB=x[DF1$colB])
and in case for many columns using match inside lapply
cbind(DF1, setNames(lapply(DF1, function(x) DF2$VAL[match(x, DF2$colC)]),
sub("col", "VAL", names(DF1))))
# colA colB VALA VALB
#1 A C 1 3
#2 B D 2 4
#3 A E 1 5
#4 B C 2 3
#5 A D 1 4
#6 B E 2 5
#7 A C 1 3
#8 B D 2 4
#9 A E 1 5
#10 B C 2 3
#11 A D 1 4
#12 B E 2 5
Try to combine left_join after one another using %>% and define its suffixes.
DF1 <- DF1 %>%
left_join(DF2, c("colA" = "colC")) %>%
left_join(DF2, c("colB" = "colC"),
suffix = c ("A", "B"))
> DF1
colA colB VALA VALB
1 A C 1 3
2 B D 2 4
3 A E 1 5
4 B C 2 3
5 A D 1 4
6 B E 2 5
7 A C 1 3
8 B D 2 4
9 A E 1 5
10 B C 2 3
11 A D 1 4
12 B E 2 5

Assign an ID in one column based on the ID in another column in R

I have first assigned a unique id based on column 1
column1 id.1 column2
A 1 C
A 1 B
B 2 B
C 3 A
C 3 D
and I would like to allocate the id.1 values to column2
column1 id.1 column2 id.2
A 1 C 3
A 1 B 2
B 2 B 2
C 3 A 1
C 3 D NA
I am sorry if this has been answered before. I have been trying for a long time to find an elegant solution for this. Thank you for your time
match is the usual way to do this:
df$id.2 = df$id.1[match(df$column2, df$column1)]
df
# column1 id.1 column2 id.2
# 1 A 1 C 3
# 2 A 1 B 2
# 3 B 2 B 2
# 4 C 3 A 1
# 5 C 3 D NA
Or using dplyr syntax:
mutate(df, id.2 = id.1[match(column2, column1)])
Using this data:
# Recreate the example data frame from the question; the first line of the
# text supplies the column names (header = TRUE).
df <- read.table(text = 'column1 id.1 column2
A 1 C
A 1 B
B 2 B
C 3 A
C 3 D', header = TRUE)
you could use factor as shown below:
transform(df, id.2 = factor(column2, unique(column1), unique(id.1)))
column1 id.1 column2 id.2
1 A 1 C 3
2 A 1 B 2
3 B 2 B 2
4 C 3 A 1
5 C 3 D <NA>

Enumerate groups within groups in a data.table [duplicate]

This question already has answers here:
How to create group indices for nested groups in r
(3 answers)
Closed 3 years ago.
This is related to multiple duplicates (1, 2, 3), but a slightly different problem that I'm stuck with. So far, I've seen pandas solution only.
In this data table:
dt = data.table(gr = rep(letters[1:2], each = 6),
cl = rep(letters[1:4], each = 3))
gr cl
1: a a
2: a a
3: a a
4: a b
5: a b
6: a b
7: b c
8: b c
9: b c
10: b d
11: b d
12: b d
I'd like to enumerate unique classes per group to obtain this:
gr cl id
1: a a 1
2: a a 1
3: a a 1
4: a b 2
5: a b 2
6: a b 2
7: b c 1
8: b c 1
9: b c 1
10: b d 2
11: b d 2
12: b d 2
Try
library(data.table)
dt[, id := rleid(cl), by=gr]
dt
# gr cl id
# 1: a a 1
# 2: a a 1
# 3: a a 1
# 4: a b 2
# 5: a b 2
# 6: a b 2
# 7: b c 1
# 8: b c 1
# 9: b c 1
#10: b d 2
#11: b d 2
#12: b d 2
You can do (maybe it will require to sort the data first):
dt[, id := cumsum(!duplicated(cl)), by = gr]
gr cl id
1: a a 1
2: a a 1
3: a a 1
4: a b 2
5: a b 2
6: a b 2
7: b c 1
8: b c 1
9: b c 1
10: b d 2
11: b d 2
12: b d 2
The same with dplyr:
dt %>%
group_by(gr) %>%
mutate(id = cumsum(!duplicated(cl)))
Or a rleid()-like possibility:
dt %>%
group_by(gr) %>%
mutate(id = with(rle(cl), rep(seq_along(lengths), lengths)))
An alternative solution using factor which will not require ordering first
dt %>%
group_by(gr) %>%
mutate(id = as.numeric(factor(cl))) %>%
ungroup()
# # A tibble: 12 x 3
# gr cl id
# <chr> <chr> <dbl>
# 1 a a 1
# 2 a a 1
# 3 a a 1
# 4 a b 2
# 5 a b 2
# 6 a b 2
# 7 b c 1
# 8 b c 1
# 9 b c 1
#10 b d 2
#11 b d 2
#12 b d 2
Note that this will automatically assign a number / id based on the alphabetical order of the cl values, within each gr group.

Keep all the data.table when aggregating a data.table

I would like to aggregate a data.table by a list of columns and keep all the columns at the end.
A <- c(1,2,3,4,4,6,4)
B <- c("a","b","c","d","e","f","g")
C <- c(10,11,23,8,8,1,3)
D <- c(2,3,5,9,7,8,4)
dt <- data.table(A,B,C,D)
Now I want to aggregate the column B paste(B,sep=";") by A and C and keep the column D too at the end. Do you know a way to do it please?
EDIT
this is what i obtained using dt[, newCol := toString(B), .(A, C)]
A B C D newCol
1: 1 a 10 2 a
2: 2 b 11 3 b
3: 3 c 23 5 c
4: 4 d 8 9 d, e
5: 4 e 8 7 d, e
6: 6 f 1 8 f
7: 4 g 3 4 g
But i would like to obtain
A B C D newCol
1: 1 a 10 2 a
2: 2 b 11 3 b
3: 3 c 23 5 c
4: 4 d 8 9 d, e
6: 6 f 1 8 f
7: 4 g 3 4 g

Alternative to expand.grid for data.frames

I have a data.frame df and I want that every row in this df is duplicated lengthTime times and that a new column is added that counts from 1 to lengthTime for each row in df.
I know, it sounds pretty complicated, but what I basically want is to apply expand.grid to df. Here is an ugly workaround and I have the feeling that there must be an easier solution (maybe even a base-R function?):
df <- data.frame(ID = rep(letters[1:3], each=3),
CatA = rep(1:3, times = 3),
CatB = letters[1:9])
lengthTime <- 3
nrRow <- nrow(df)
intDF <- df
for (i in 1:(lengthTime - 1)) {
df <- rbind(df, intDF)
}
df$Time <- rep(1:lengthTime, each=nrRow)
I thought that I could just use expand.grid(df, 1:lengthTime), but that does not work. outer did not bring any luck either. So does anyone know a good solution?
It's been a while since this question was posted, but I recently came across it looking for just the thing in the title, namely, an expand.grid that works for data frames. The posted answers address the OP's more specific question, so in case anyone is looking for a more general solution for data frames, here's a slightly more general approach:
# Cartesian product of any number of data frames: the inputs are folded
# pairwise from left to right, each step merging with by = NULL so that no
# key columns are matched.
expand.grid.df <- function(...) {
  cross_pair <- function(acc, nxt) merge(acc, nxt, by = NULL)
  Reduce(cross_pair, list(...))
}
# For the example in the OP
expand.grid.df(df, data.frame(1:lengthTime))
# More generally
df1 <- data.frame(A=1:3, B=11:13)
df2 <- data.frame(C=51:52, D=c("Y", "N"))
df3 <- data.frame(E=c("+", "-"))
expand.grid.df(df1, df2, df3)
You can also just do a simple merge by NULL (which will cause merge to do simple combinatorial data replication):
merge(data.frame(time=1:lengthTime), iris, by=NULL)
Why not just something like df[rep(1:nrow(df),times = 3),] to extend the data frame, and then add the extra column just as you have above, with df$Time <- rep(1:lengthTime, each=nrRow)?
Quick update
There is now also the crossing() function in package tidyr which can be used instead of merge, is somewhat faster, and returns a tbl_df / tibble.
data.frame(time=1:10) %>% merge(iris, by=NULL)
data.frame(time=1:10) %>% tidyr::crossing(iris)
This works:
REP <- rep(1:nrow(df), 3)
df2 <- data.frame(df[REP, ], Time = rep(1:3, each = 9))
rownames(df2) <- NULL
df2
A data.table solution:
> library(data.table)
> ( df <- data.frame(ID = rep(letters[1:3], each=3),
+ CatA = rep(1:3, times = 3),
+ CatB = letters[1:9]) )
ID CatA CatB
1 a 1 a
2 a 2 b
3 a 3 c
4 b 1 d
5 b 2 e
6 b 3 f
7 c 1 g
8 c 2 h
9 c 3 i
> # Repeat every column 3 times, then label each copy of the original rows with Time = 1, 2, 3
> ( DT <- data.table(df)[, lapply(.SD, function(x) rep(x,3))][, Time:=rep(1:3, each=nrow(df))] )
ID CatA CatB Time
1: a 1 a 1
2: a 2 b 1
3: a 3 c 1
4: b 1 d 1
5: b 2 e 1
6: b 3 f 1
7: c 1 g 1
8: c 2 h 1
9: c 3 i 1
10: a 1 a 2
11: a 2 b 2
12: a 3 c 2
13: b 1 d 2
14: b 2 e 2
15: b 3 f 2
16: c 1 g 2
17: c 2 h 2
18: c 3 i 2
19: a 1 a 3
20: a 2 b 3
21: a 3 c 3
22: b 1 d 3
23: b 2 e 3
24: b 3 f 3
25: c 1 g 3
26: c 2 h 3
27: c 3 i 3
Another one :
> library(data.table)
> ( df <- data.frame(ID = rep(letters[1:3], each=3),
+ CatA = rep(1:3, times = 3),
+ CatB = letters[1:9]) )
> DT <- data.table(df)
> rbindlist(lapply(1:3, function(i) cbind(DT, Time=i)))
ID CatA CatB Time
1: a 1 a 1
2: a 2 b 1
3: a 3 c 1
4: b 1 d 1
5: b 2 e 1
6: b 3 f 1
7: c 1 g 1
8: c 2 h 1
9: c 3 i 1
10: a 1 a 2
11: a 2 b 2
12: a 3 c 2
13: b 1 d 2
14: b 2 e 2
15: b 3 f 2
16: c 1 g 2
17: c 2 h 2
18: c 3 i 2
19: a 1 a 3
20: a 2 b 3
21: a 3 c 3
22: b 1 d 3
23: b 2 e 3
24: b 3 f 3
25: c 1 g 3
26: c 2 h 3
27: c 3 i 3

Resources