I wish to count the number of times each combination of two elements appears in the same group.
For example, with:
> dat = data.table(group = c(1,1,1,2,2,2,3,3), id=c(10,11,12,10,11,13,11,13))
> dat
group id
1: 1 10
2: 1 11
3: 1 12
4: 2 10
5: 2 11
6: 2 13
7: 3 11
8: 3 13
The expected result would be:
id.1 id.2 nb_common_appearances
10 11 2 (in groups 1 and 2)
10 12 1 (in group 1)
11 12 1 (in group 1)
10 13 1 (in group 2)
11 13 2 (in groups 2 and 3)
Here is a data.table approach (roughly the same as @josilber's plyr answer):
pairs <- dat[, c(id=split(combn(id,2),1:2)), by=group ]
pairs[, .N, by=.(id.1,id.2) ]
# id.1 id.2 N
# 1: 10 11 2
# 2: 10 12 1
# 3: 11 12 1
# 4: 10 13 1
# 5: 11 13 2
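To unpack the split(combn(id,2),1:2) idiom: combn gives a 2-row matrix with one pair per column, and splitting its elements by 1:2 separates the first and second members of each pair, which c(id=...) then names id.1 and id.2. A minimal sketch for one group's ids:
ids <- c(10, 11, 12)
combn(ids, 2)              # one pair per column
#      [,1] [,2] [,3]
# [1,]   10   10   11
# [2,]   11   12   12
split(combn(ids, 2), 1:2)  # first vs second member of each pair
# $`1`
# [1] 10 10 11
# $`2`
# [1] 11 12 12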
You might also consider viewing the results in a table:
pairs[, table(id.1,id.2) ]
# id.2
# id.1 11 12 13
# 10 2 1 1
# 11 0 1 2
You can use merges instead of combn:
setkey(dat, group)
dat[ dat, allow.cartesian=TRUE ][ id<i.id, .N, by=.(id,i.id) ]
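To see why the id<i.id filter is needed: the self-join yields every ordered pair within a group, including self-pairs, and the filter keeps each unordered pair exactly once. A sketch of the intermediate result for group 1:
dat[ dat, allow.cartesian=TRUE ][ group==1 ]
#    group id i.id
# 1:     1 10   10
# 2:     1 11   10
# 3:     1 12   10
# 4:     1 10   11
# 5:     1 11   11
# 6:     1 12   11
# 7:     1 10   12
# 8:     1 11   12
# 9:     1 12   12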
Benchmarks. For large data, the merges can be a little faster (as hypothesized by @DavidArenburg), and @Arun's answer is faster still:
DT <- data.table(g=1,id=1:(1.5e3),key="id")
system.time({a <- combn(DT$id,2)})
# user system elapsed
# 0.81 0.00 0.81
system.time({b <- DT[DT,allow.cartesian=TRUE][id<i.id]})
# user system elapsed
# 0.13 0.00 0.12
system.time({d <- DT[,.(rep(id,(.N-1L):0L),id[indices(.N-1L)])]}) # indices() is defined in @Arun's answer below
# user system elapsed
# 0.01 0.00 0.02
(I left out the group-by operation as I don't think it will be important to the timings.)
In defense of combn. The combn approach extends nicely to larger combos, whereas the merge and @Arun's approach, though much faster for pairs, do not (as far as I can see):
DT2 <- data.table(g=rep(1:2,each=5),id=1:5)
tuple_size <- 4
tuples <- DT2[, c(id=split(combn(id,tuple_size),1:tuple_size)), by=g ]
tuples[, .N, by=setdiff(names(tuples),"g")]
# id.1 id.2 id.3 id.4 N
# 1: 1 2 3 4 2
# 2: 1 2 3 5 2
# 3: 1 2 4 5 2
# 4: 1 3 4 5 2
# 5: 2 3 4 5 2
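One caveat: combn() errors when a group has fewer elements than tuple_size, so uneven group sizes need a guard. A sketch (groups returning NULL are simply dropped by data.table):
tuples <- DT2[, if (.N >= tuple_size)
                  c(id=split(combn(id,tuple_size),1:tuple_size)),
              by=g ]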
Another way using data.table:
require(data.table)
indices <- function(n) sequence(n:1L) + rep(1:n, n:1)
dat[, .(id1 = rep(id, (.N-1L):0L),
id2 = id[indices(.N-1L)]),
by=group
][, .N, by=.(id1, id2)]
# id1 id2 N
# 1: 10 11 2
# 2: 10 12 1
# 3: 11 12 1
# 4: 10 13 1
# 5: 11 13 2
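To unpack the indices() helper: for a group of three ids, .N-1L is 2, and
indices(2)
# sequence(2:1) = c(1, 2, 1); rep(1:2, 2:1) = c(1, 1, 2); their sum:
# [1] 2 3 3
so id2 = id[c(2,3,3)] lines up with id1 = rep(id, 2:0) = c(id[1], id[1], id[2]), producing the pairs (id[1],id[2]), (id[1],id[3]), (id[2],id[3]) without materializing a combn matrix.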
You could reshape your data so that each pair in each group is on a separate row (I've used split-apply-combine for that step) and then use count from the plyr package to tally the unique rows:
library(plyr)
count(do.call(rbind, lapply(split(dat, dat$group), function(x) t(combn(x$id, 2)))))
# x.1 x.2 freq
# 1 10 11 2
# 2 10 12 1
# 3 10 13 1
# 4 11 12 1
# 5 11 13 2
Here is a dplyr approach, using combn to make the combinations.
dat %>%
group_by(group) %>%
do(as.data.frame(t(combn(.[["id"]], 2)))) %>%
group_by(V1, V2) %>%
summarise(n())
Source: local data frame [5 x 3]
Groups: V1
V1 V2 n()
1 10 11 2
2 10 12 1
3 10 13 1
4 11 12 1
5 11 13 2
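For what it's worth, do() is superseded in current dplyr; here is a sketch of the same idea with reframe(), assuming dplyr >= 1.1.0:
dat %>%
  group_by(group) %>%
  reframe(as.data.frame(t(combn(id, 2)))) %>%
  count(V1, V2)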
Related
I have a dataset (dt) like this in R:
n id val
1 1&&2 10
2 3 20
3 4&&5 30
And what I want to get is
n id val
1 1 10
2 2 10
3 3 20
4 4 30
5 5 30
I know that to split ids I need to do something like this:
id_split <- strsplit(dt$id,"&&")
But how do I create new rows with the same val for ids which were initially together in a row?
You can cbind each split into a column and pair it with the corresponding val via Map (data.frame recycles val):
res <- do.call(rbind, Map(data.frame, id=lapply(strsplit(dt$id, "&&"), cbind),
                          val=dt$val))
res <- cbind(n=1:nrow(res), res)
res
# n id val
# 1 1 1 10
# 2 2 2 10
# 3 3 3 20
# 4 4 4 30
# 5 5 5 30
You can use the lengths of the split id to expand the rows, then set n to be a sequence along the expanded data frame, i.e.
l1 <- strsplit(as.character(dt$id), '&&')
res_df <- transform(dt[rep(seq_len(nrow(dt)), lengths(l1)),],
                    id = unlist(l1),
                    n = seq_along(unlist(l1)))
which gives:
n id val
1 1 1 10
1.1 2 2 10
2 3 3 20
3 4 4 30
3.1 5 5 30
You can remove the rownames with rownames(res_df) <- NULL
A data.table solution.
library(data.table)
DT <- fread('n id val
1 1&&2 10
2 3 20
3 4&&5 30')
DT[, .(id = unlist(strsplit(id, split="&&"))), by=.(n,val)][, n := .I][]
#> n val id
#> 1: 1 10 1
#> 2: 2 10 2
#> 3: 3 20 3
#> 4: 4 30 4
#> 5: 5 30 5
Note:
A more robust approach is by = 1:nrow(DT), though you would then need to handle the other columns explicitly.
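A sketch of that per-row grouping, using the same DT as above:
DT[, .(val = val, id = unlist(strsplit(id, split="&&"))), by = 1:nrow(DT)
   ][, .(n = .I, id, val)]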
If anyone is looking for a tidyverse solution:
dt %>%
separate(id, into = paste0("id", 1:2),sep = "&&") %>%
pivot_longer(cols = c(id1,id2), names_to = "id_name", values_to = "id") %>%
drop_na(id) %>%
select(n, id, val)
The output:
# A tibble: 5 x 3
n id val
<dbl> <chr> <dbl>
1 1 1 10
2 1 2 10
3 2 3 20
4 3 4 30
5 3 5 30
Edit:
As suggested by @sotos (and completely missed by me), a one-liner does it:
dt %>% separate_rows(id, sep = "&&")
which gives the same output:
# A tibble: 5 x 3
n id val
<dbl> <chr> <dbl>
1 1 1 10
2 1 2 10
3 2 3 20
4 3 4 30
5 3 5 30
tstrsplit from data.table can do the job:
library(data.table)
dt <- setDT(dt)[, .(id = tstrsplit(id, "&&")), by = c('n','val')]
dt[, n := seq(.N)]
dt
n val id
1: 1 10 1
2: 2 10 2
3: 3 20 3
4: 4 30 4
5: 5 30 5
I'm trying to better understand the data.table package in R. I want to do different types of calculations with some columns and assign the results to new columns with specific names. Here is an example:
set.seed(122)
df <- data.frame(rain = rep(5,10),temp=1:10, skip = sample(0:2,10,T),
windw_sz = sample(1:2,10,T),city =c(rep("a",5),rep("b",5)),ord=rep(sample(1:5,5),2))
df <- as.data.table(df)
vars <- c("rain","temp")
df[, paste0("mean.",vars) := lapply(mget(vars),mean), by="city" ]
This works just fine. But now I also want to calculate the sum of these variables, so I try:
df[, c(paste0("mean.",vars), paste("sum.",vars)) := list( lapply(mget(vars),mean),
lapply(mget(vars),sum)), by="city" ]
and I get an error.
How could I implement this last part?
Thanks a lot!
Instead of wrapping in list, we can use c. Each lapply call already returns a list, so wrapping the two in list produces a list of two lists, whereas c concatenates them end to end (compare c(as.list(1:5), as.list(6:10)) with list(as.list(1:5), as.list(6:10))). And instead of mget, make use of .SDcols:
library(data.table)
df[, paste0(rep(c("mean.", "sum."), each = 2), vars) :=
c(lapply(.SD, mean), lapply(.SD, sum)), by = .(city), .SDcols = vars]
df
# rain temp skip windw_sz city ord mean.rain mean.temp sum.rain sum.temp
# 1: 5 1 0 2 a 2 5 3 25 15
# 2: 5 2 1 1 a 5 5 3 25 15
# 3: 5 3 2 2 a 3 5 3 25 15
# 4: 5 4 2 1 a 4 5 3 25 15
# 5: 5 5 2 2 a 1 5 3 25 15
# 6: 5 6 0 1 b 2 5 8 25 40
# 7: 5 7 2 2 b 5 5 8 25 40
# 8: 5 8 1 2 b 3 5 8 25 40
# 9: 5 9 2 1 b 4 5 8 25 40
#10: 5 10 2 2 b 1 5 8 25 40
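A small usage note: each = 2 works here because length(vars) is 2; a more general form of the name vector would be
paste0(rep(c("mean.", "sum."), each = length(vars)), vars)
which keeps the names aligned with the c(lapply(...), lapply(...)) output for any number of variables.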
My question is similar to r count combinations of elements in groups; however, I first want to collect all potential combinations by group in a column Comb, and second, count the occurrences of each combination over the years in a column n.
Using the same mock dataset:
> dat = data.table(group = c(1,1,1,2,2,2,3,3), id=c(10,11,12,10,11,13,11,13), year=c(2010,2010,2010,2011,2011,2011,2012,2012))
> dat
group id year
1: 1 10 2010
2: 1 11 2010
3: 1 12 2010
4: 2 10 2011
5: 2 11 2011
6: 2 13 2011
7: 3 11 2012
8: 3 13 2012
The desired outcome:
> dat
group Comb year n
1: 1 10 11 2010 1
2: 1 11 12 2010 1
3: 1 12 10 2010 1
4: 2 10 11 2011 2
5: 2 11 13 2011 1
6: 2 13 10 2011 1
7: 3 11 13 2012 2
I would much appreciate a possible solution with dplyr.
Thanks!
Here's a solution, presented first as data.table and then as dplyr. The process is the same in both: we self-join on group, keep rows where the id combination is in a consistent order (any consistent order would work; we pick first id < second id), group by combination to number the rows, and drop the unused columns.
dat = data.table(group = c(1,1,1,2,2,2,3,3), id=c(10,11,12,10,11,13,11,13))
## with data.table
merge(dat, dat, by = "group", allow.cartesian = TRUE)[
id.x < id.y, ][
, Comb := paste(id.x, id.y)][
, n := 1:.N, by = .(Comb)
][, .(group, Comb, n)]
# group Comb n
# 1: 1 10 11 1
# 2: 1 10 12 1
# 3: 1 11 12 1
# 4: 2 10 11 2
# 5: 2 10 13 1
# 6: 2 11 13 1
# 7: 3 11 13 2
## with dplyr
dat %>% full_join(dat, by = "group") %>%
filter(id.x < id.y) %>%
group_by(Comb = paste(id.x, id.y)) %>%
mutate(n = row_number()) %>%
select(group, Comb, n)
# # A tibble: 7 x 3
# # Groups: Comb [5]
# group Comb n
# <dbl> <chr> <int>
# 1 1 10 11 1
# 2 1 10 12 1
# 3 1 11 12 1
# 4 2 10 11 2
# 5 2 10 13 1
# 6 2 11 13 1
# 7 3 11 13 2
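If you also need the year column, it can be carried through the same pipeline. A sketch, assuming dat includes year as in the question (the join then yields year.x and year.y, which are identical within a group):
dat %>% full_join(dat, by = "group") %>%
  filter(id.x < id.y) %>%
  group_by(Comb = paste(id.x, id.y)) %>%
  mutate(n = row_number()) %>%
  select(group, Comb, year = year.x, n)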
I have a problem combining two data frames of different dimensions, each of which has a huge number of rows. Say my sample data frames are d and e, and the expected new data frame is de. I would like to pair every value in a given row of d with every value in the same row of e, and collect those pairs in a new data frame (de). Any idea/help with solving my problem is really appreciated. Thanks
> d <- data.frame(v1 = c(1,3,5), v2 = c(2,4,6))
> d
v1 v2
1 1 2
2 3 4
3 5 6
> e <- data.frame(v1 = c(11, 14), v2 = c(12,15), v3=c(13,16))
> e
v1 v2 v3
1 11 12 13
2 14 15 16
> de <- data.frame(x = c(1,1,1,2,2,2,3,3,3,4,4,4), y = c(11,12,13,11,12,13,14,15,16,14,15,16))
> de
x y
1 1 11
2 1 12
3 1 13
4 2 11
5 2 12
6 2 13
7 3 14
8 3 15
9 3 16
10 4 14
11 4 15
12 4 16
One solution is to "melt" d and e into long format, then merge, and then drop the extra columns. If you have very large datasets, data.tables are much faster (there is no difference for this tiny dataset).
library(reshape2) # for melt(...)
library(data.table)
# add id column
d <- cbind(id=1:nrow(d),d)
e <- cbind(id=1:nrow(e),e)
# melt to long format
d.melt <- data.table(melt(d,id.vars="id"), key="id")
e.melt <- data.table(melt(e,id.vars="id"), key="id")
# data table join, remove extra columns
result <- d.melt[e.melt, allow.cartesian=T]
result[,":="(id=NULL,variable=NULL,variable.1=NULL)]
setnames(result,c("x","y"))
setkey(result,x,y)
result
x y
1: 1 11
2: 1 12
3: 1 13
4: 2 11
5: 2 12
6: 2 13
7: 3 14
8: 3 15
9: 3 16
10: 4 14
11: 4 15
12: 4 16
If your data are numeric, as they are in this example, this is pretty straightforward in base R too. Conceptually this is the same as @jlhoward's answer: get your data into a long format, and merge:
merge(cbind(id = rownames(d), stack(d)),
cbind(id = rownames(e), stack(e)),
by = "id")[c("values.x", "values.y")]
# values.x values.y
# 1 1 11
# 2 1 12
# 3 1 13
# 4 2 11
# 5 2 12
# 6 2 13
# 7 3 14
# 8 3 15
# 9 3 16
# 10 4 14
# 11 4 15
# 12 4 16
Or, with the "reshape2" package:
merge(melt(as.matrix(d)),
melt(as.matrix(e)),
by = "Var1")[c("value.x", "value.y")]
I have data arranged like this in R:
indv time val
A 6 5
A 10 10
A 12 7
B 8 4
B 10 3
B 15 9
For each individual (indv) at each time, I want to calculate the change in value (val) from the initial time. So I would end up with something like this:
indv time val val_1 val_change
A 6 5 5 0
A 10 10 5 5
A 12 7 5 2
B 8 4 4 0
B 10 3 4 -1
B 15 9 4 5
Can anyone tell me how I might do this? I can use
ddply(df, .(indv), function(x)x[which.min(x$time), ])
to get a table like
indv time val
A 6 5
B 8 4
However, I cannot figure out how to make a column val_1 where the minimum-time values are matched up for each individual. If I can do that, I should be able to add the val_change column using something like:
df['val_change'] = df['val'] - df['val_1']
EDIT: two excellent methods were posted below; however, both rely on my time column being sorted so that small time values sit above large ones. I'm not sure this will always be the case with my data. (I know I can sort first in Excel, but I'm trying to avoid that.) How could I deal with a case where the table looks like this:
indv time value
A 10 10
A 6 5
A 12 7
B 8 4
B 10 3
B 15 9
Here is a data.table solution that is memory efficient, as it assigns by reference within the data.table. Setting the key sorts by the key variables, which also takes care of the unsorted-data case:
library(data.table)
DT <- data.table(df)
# set key to sort by indv then time
setkey(DT, indv, time)
DT[, c('val1','change') := list(val[1], val - val[1]),by = indv]
# And to show it works....
DT
## indv time val val1 change
## 1: A 6 5 5 0
## 2: A 10 10 5 5
## 3: A 12 7 5 2
## 4: B 8 4 4 0
## 5: B 10 3 4 -1
## 6: B 15 9 4 5
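For comparison, here is a sketch of the same grouped computation in dplyr (assuming a reasonably current dplyr; the arrange step handles the unsorted case from your edit):
library(dplyr)
df %>%
  group_by(indv) %>%
  arrange(time, .by_group = TRUE) %>%  # sort within each individual
  mutate(val_1 = first(val),
         val_change = val - first(val)) %>%
  ungroup()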
Here's a plyr solution using ddply
ddply(df, .(indv), transform,
val_1 = val[1],
change = (val - val[1]))
indv time val val_1 change
1 A 6 5 5 0
2 A 10 10 5 5
3 A 12 7 5 2
4 B 8 4 4 0
5 B 10 3 4 -1
6 B 15 9 4 5
To get your second table try this:
ddply(df, .(indv), function(x) x[which.min(x$time), ])
indv time val
1 A 6 5
2 B 8 4
Edit 1
To deal with unsorted data, like the data you posted in your edit, try the following:
unsort <- read.table(text="indv time value
A 10 10
A 6 5
A 12 7
B 8 4
B 10 3
B 15 9", header=T)
do.call(rbind, lapply(split(unsort, unsort$indv),
function(x) x[order(x$time), ]))
indv time value
A.2 A 6 5
A.1 A 10 10
A.3 A 12 7
B.4 B 8 4
B.5 B 10 3
B.6 B 15 9
Now you can apply the procedure described above to this sorted data frame.
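For what it's worth, a single order() call also gets you the sorted frame in base R:
unsort[order(unsort$indv, unsort$time), ]
#   indv time value
# 2    A    6     5
# 1    A   10    10
# 3    A   12     7
# 4    B    8     4
# 5    B   10     3
# 6    B   15     9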
Edit 2
Another way to sort your data frame is the orderBy function from the doBy package:
library(doBy)
orderBy(~ indv + time, unsort)
indv time value
2 A 6 5
1 A 10 10
3 A 12 7
4 B 8 4
5 B 10 3
6 B 15 9
Edit 3
You can even sort your df using ddply (note that the columns come back reordered):
ddply(unsort, .(indv, time), sort)
value time indv
1 5 6 A
2 10 10 A
3 7 12 A
4 4 8 B
5 3 10 B
6 9 15 B
You can do this with base functions. Using your data:
df <- read.table(text = "indv time val
A 6 5
A 10 10
A 12 7
B 8 4
B 10 3
B 15 9", header = TRUE)
We first split() df on the indv variable
sdf <- split(df, df$indv)
Next we transform each component of sdf, adding the val_1 and val_change variables in a manner similar to the one you suggest:
sdf <- lapply(sdf, function(x) transform(x, val_1 = val[1],
val_change = val - val[1]))
Finally we arrange for the individual components to be bound row-wise into a single data frame:
df <- do.call(rbind, sdf)
df
Which gives:
R> df
indv time val val_1 val_change
A.1 A 6 5 5 0
A.2 A 10 10 5 5
A.3 A 12 7 5 2
B.4 B 8 4 4 0
B.5 B 10 3 4 -1
B.6 B 15 9 4 5
Edit
To address the sorting issue the OP raises in the comments, modify the lapply() call to include a sorting step prior to the transform(). For example:
sdf <- lapply(sdf, function(x) {
x <- x[order(x$time), ]
transform(x, val_1 = val[1],
val_change = val - val[1])
})
In use we have
## scramble `df`
df <- df[sample(nrow(df)), ]
## split
sdf <- split(df, df$indv)
## apply sort and transform
sdf <- lapply(sdf, function(x) {
x <- x[order(x$time), ]
transform(x, val_1 = val[1],
val_change = val - val[1])
})
## combine
df <- do.call(rbind, sdf)
which again gives:
R> df
indv time val val_1 val_change
A.1 A 6 5 5 0
A.2 A 10 10 5 5
A.3 A 12 7 5 2
B.4 B 8 4 4 0
B.5 B 10 3 4 -1
B.6 B 15 9 4 5