I have a data.table like this:
col1 col2 col3 col4
1: a a a 2
2: b b a 4.4
3: w w s 6.3
I want to get something like the following, without running a for loop:
col1 col2 col3 col4 count
1: a a a 2 1
2: b b a 4.4 2
3: w w s 6.3 2
I am counting the unique values of col1, col2, and col3 in each row and storing the result in a count column. How do I do this in one line?
This might be of use:
data.frame solution
df <- read.table(header=T, text=' col1 col2 col3 col4
1 a a a 2
2 b b a 4.4
3 w w s 6.3')
#one line using apply
df$count <- apply(df[1:3], 1, function(x) {length(unique(x))})
Output:
> df
col1 col2 col3 col4 count
1 a a a 2.0 1
2 b b a 4.4 2
3 w w s 6.3 2
data.table solution
It will be a bit trickier to use the data.table syntax in this example.
First I create an id column to group by:
#convert original df to data.table
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
and then I use my self-made luna function to calculate the number of unique elements:
#note: strsplit(x, '') splits the pasted string into single characters,
#so this only works when every value is a single character
luna <- function(x) length(unique(unlist(strsplit(x, ''))))
df2[, count := luna(paste0(col1, col2, col3)), by = id]
Output:
> df2
col1 col2 col3 col4 id count
1: a a a 2.0 1 1
2: b b a 4.4 2 2
3: w w s 6.3 3 2
Or as @Tensibai mentions in the comments, this is much faster:
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
luna <- function(x) length(unique(x))
df2[, count2 := luna(c(col1, col2, col3)), by=id ]
> df2
col1 col2 col3 col4 id count2
1: a a a 2.0 1 1
2: b b a 4.4 2 2
3: w w s 6.3 3 2
And if we combine @Frank's and @Tensibai's comments, this should be the fastest (data.table 1.9.5+):
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
#not run: requires data.table >= 1.9.5
df2[, count2 := uniqueN(c(col1, col2, col3)), by = id]
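On data.table 1.9.5+ you can also skip the helper id column and group on row numbers directly; a minimal sketch that gives the same count:
df2 <- as.data.table(df)
#by = 1:nrow(df2) puts each row in its own group
df2[, count2 := uniqueN(c(col1, col2, col3)), by = 1:nrow(df2)]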
How about the following:
dt <- CJ(1:5,1:3,1:4,1:2)
dt[, cnt:=apply(dt, 1, function(r) length(unique(r)))]
or if you only want to keep the rows with unique entries, you can try
dt <- CJ(1:5,1:3,1:4,1:2)
dt[apply(dt, 1, function(r) length(unique(r))==ncol(dt))]
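For context, CJ() is data.table's cross join: it returns a keyed data.table of all combinations of its vector arguments, so the dt above has 5 * 3 * 4 * 2 = 120 rows of sample data. A tiny illustration:
library(data.table)
CJ(1:2, c('a', 'b'))
#    V1 V2
# 1:  1  a
# 2:  1  b
# 3:  2  a
# 4:  2  b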
I've got two large data.tables, DT1 (2M rows x 300 cols) and DT2 (50M rows x 2 cols), and I would like to merge the values of DT1's columns into a new column in DT2, based on the column name specified in a DT2 column. I'd like to achieve this without having to melt DT1, and by using data.table operations only, if possible.
Here's a sample dataset:
> require(data.table)
> DT1 <- data.table(ID = c('A', 'B', 'C', 'D'), col1 = (1:4), col2 = (5:8), col3 = (9:12), col4 = (13:16))
> DT1
ID col1 col2 col3 col4
1: A 1 5 9 13
2: B 2 6 10 14
3: C 3 7 11 15
4: D 4 8 12 16
> DT2
ID col
1: A col1
2: B col2
3: B col3
4: C col1
5: A col4
#desired output
> DT2_merge
ID col col_value
1: A col1 1
2: B col2 6
3: B col3 10
4: C col1 3
5: A col4 13
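(DT2 is only shown as printed output above; for completeness, it can be constructed as:)
DT2 <- data.table(ID = c('A', 'B', 'B', 'C', 'A'),
                  col = c('col1', 'col2', 'col3', 'col1', 'col4'))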
Since I'm dealing with two large data.tables, I'm hoping to find the most efficient way of doing this.
Maybe there is a pure data.table way to do this, but one approach is to use matrix subsetting:
library(data.table)
setDF(DT1)
DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
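For context (my addition, not from the answer): subsetting a data.frame or matrix with a two-column integer matrix picks one element per row of that matrix, pairing row and column indices, which is exactly what the cbind(match(...), match(...)) above builds. A tiny illustration:
m <- matrix(1:6, nrow = 2)
# pick elements (1, 3) and (2, 1) in a single call
m[cbind(c(1, 2), c(3, 1))]  # 5 2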
Using set():
setkey(DT1, "ID")
setkey(DT2, "ID")
for (k in names(DT1)[-1]) {
  rows <- which(DT2[["col"]] == k)
  set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
}
ID col col_value
1: A col1 1
2: A col4 13
3: B col2 6
4: B col3 10
5: C col1 3
Note: Setting the key up front speeds up the process but reorders the rows.
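If the original order of DT2 matters, one hedged workaround (my addition, not from the answer) is to remember the order before keying and restore it afterwards:
DT2[, orig := .I]     # remember the original row order
setkey(DT2, ID)       # reorders DT2
# ... run the set() loop above ...
setorder(DT2, orig)   # restore the original order
DT2[, orig := NULL]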
You can use lookup tables to find the indices for subsetting, like this:
setDF(DT1)
DT2[, col_value := DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
Using a matrix for subsetting is currently not supported in data.table, so if you have a data.frame instead of a data.table you can do it in base R with:
DT2$col_value <- DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]
You can also change your data structure up front and switch from matrix- to vector-subsetting:
DT1ID <- setNames(seq_len(nrow(DT1)), DT1$ID)
DT1 <- as.matrix(DT1[,-1])
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(DT1ID[DT2$ID] + (DT2$col-1)*nrow(DT1))]
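The vector subscript here relies on R's column-major storage: element (i, j) of a matrix with n rows sits at linear position i + (j - 1) * n. A tiny check:
m <- matrix(1:6, nrow = 2)
identical(m[2 + (3 - 1) * 2], m[2, 3])  # TRUE: both pick element (2, 3)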
Maybe also try fastmatch:
library(fastmatch)
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID)
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1)) + (DT2$col-1)*nrow(DT1))]
Or you can avoid the lookup during subsetting and use levels when creating the factors:
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID, colnames=colnames(DT1)[-1])
DT2$ID <- factor(DT2$ID, levels=rownames(DT1))
DT2$col <- factor(DT2$col, levels=colnames(DT1))
DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))]
Here are three solutions that are also applicable to a data.frame():
Solution 1
DT2$col_value <- apply(DT2, 1, function(v) DT1[which(DT1$ID==v[1]),which(colnames(DT1)==v[2])])
Solution 2 (the same as @Ronak Shah's solution), which may be much faster than Solution 1 on a large dataset
DT2$col_value <- DT1[cbind(match(DT2$ID,DT1$ID),match(DT2$col,colnames(DT1)))]
Solution 3 (maybe the fastest): subset a named matrix with the two-column character matrix as.matrix(DT2), which matches row and column names directly
m <- as.matrix(DT1[-1])
rownames(m) <- DT1$ID
DT2$col_value <- m[as.matrix(DT2)]
Testing some of the methods on a larger dataset to show their performance:
#sindri_baldur
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setkey(DT1, "ID")
setkey(DT2, "ID")
system.time(for (k in names(DT1)[-1]) {
  rows <- which(DT2[["col"]] == k)
  set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
})
#User: 6.696
#Ronak Shah
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setDF(DT1)
system.time(DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]])
#User: 5.210
#Using fastmatch
library(fastmatch)
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1))
+ (fmatch(DT2$col, colnames(DT1))-1)*nrow(DT1))])
#User: 0.061
#Using factors
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))])
#User: 0.024
Data:
set.seed(7)
nrows <- 1e5
ncols <- 300
x1 <- matrix(sample(0:20, nrows*ncols, replace=TRUE), ncol=ncols
, dimnames=list(sample(do.call("paste0", expand.grid(rep(list(letters)
, ceiling(log(nrows, length(letters)))))), nrows), seq_len(ncols)))
x2 <- data.frame(ID=factor(sample(rownames(x1), nrows*10, replace=TRUE)
, levels=rownames(x1))
, col=factor(sample(colnames(x1), nrows*10, replace=TRUE), levels=colnames(x1)))
Is it possible to include a conditional count with other non-conditional aggregates? I understand that you can do this with most aggregate functions using which(), but I haven't been able to find a solution for counting rows.
For example, the following code sums col1 by col4 into sum1 and sums col2 for which col3 > 3 by col4 into sum2.
> DT[, .(sum1 = sum(col1)
, sum2 = sum(col2[which(col3 > 3)]))
, by = (col4)]
Is there anything similar for .N? e.g. .N[which(col3 > 3)]
I understand you can do this separately using:
> DT[col3 > 3, .N, by = .(col4)] # this works but not what I'm after
That's not what I'm after, though. I'd like to integrate the conditional count with the other non-conditional aggregates, using only data.table operations; I'd rather not use any other packages.
Here is the sample data:
> DT <- data.table(col1 = c(3,5,2,2,4), col2 = c(0,1,0,1,1), col3 = c(3,4,6,7,1), col4 = c('a', 'b', 'a', 'a' ,'b'))
> DT
col1 col2 col3 col4
1: 3 0 3 a
2: 5 1 4 b
3: 2 0 6 a
4: 2 1 7 a
5: 4 1 1 b
My desired output:
DT[, .( sum1 = sum(col1)
, sum2 = sum(col3[which(col3 > 3)])
, count3 = ???????) #.N[which(col3 > 3), but this doesn't work
, by = .(col4)]
col4 sum1 sum2 count3
1: a 7 13 2
2: b 9 4 1
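A minimal data.table-only sketch (my addition, not from the thread itself): sum the logical condition, since each TRUE counts as 1, which yields the conditional row count alongside the other aggregates:
DT[, .(sum1 = sum(col1)
       , sum2 = sum(col3[col3 > 3])
       , count3 = sum(col3 > 3))  # TRUEs sum to the number of qualifying rows
   , by = .(col4)]
#    col4 sum1 sum2 count3
# 1:    a    7   13      2
# 2:    b    9    4      1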
I have a data.table 'DT' with a column ('col2') that is a list of data frames:
require(data.table)
DT <- data.table(col1 = c('A','A','B'),
col2 = list(data.frame(colA = c(1,3,54, 23),
colB = c("aa", "bb", "cc", "hh")),
data.frame(colA =c(23, 1),
colB = c("hh", "aa")),
data.frame(colA = 1,
colB = "aa")))
> DT
col1 col2
1: A <data.frame>
2: A <data.frame>
3: B <data.frame>
> DT$col2
[[1]]
colA colB
1 1 aa
2 3 bb
3 54 cc
4 23 hh
[[2]]
colA colB
1 23 hh
2 1 aa
[[3]]
colA colB
1 1 aa
Each data.frame in col2 has two columns colA and colB.
I'd like to have a data.table output that binds each unique row of those data.frames based on col1 of DT.
I guess it's like using rbindlist in an aggregate function of the data.table.
This is the desired output:
> #desired output
> output
colA colB col1
1: 1 aa A
2: 3 bb A
3: 54 cc A
4: 23 hh A
5: 1 aa B
The data.frame in the second row of DT (DT[2, col2]) has duplicate entries, and only unique entries are desired for each unique col1.
I tried the following and I get an error.
desired_output <- DT[, lapply(col2, function(x) unique(rbindlist(x))), by = col1]
# Error in rbindlist(x) :
# Item 1 of list input is not a data.frame, data.table or list
This 'works', though it's not the desired output:
unique(rbindlist(DT$col2))
colA colB
1: 1 aa
2: 3 bb
3: 54 cc
4: 23 hh
Is there any way to use rbindlist in an aggregate function of a data.table?
Group by 'col1' and run rbindlist on 'col2'; within each group, col2 is already a list of data.frames, which is exactly the input rbindlist() expects:
unique(DT[ , rbindlist(col2), by = col1]) # trimmed thanks to #snoram
# col1 colA colB
# 1: A 1 aa
# 2: A 3 bb
# 3: A 54 cc
# 4: A 23 hh
# 5: B 1 aa
only unique entries are desired for each unique col1
Once col1 is included as a column of the result, the expression above means "unique entries" (unconditional on any particular columns).
Henrik's answer is one way to keep col1. Another is:
unique(DT[, rbindlist(setNames(col2, col1), id="col1")])
I guess this should be more efficient than
bycols = "col1"
unique(DT[, rbindlist(col2), by=bycols]) # Henrik's
though the extension to either (1) col1 not being a character column (it is character here, which is what lets setNames() work) or (2) having multiple by= columns is not so obvious. For either of these cases, I would make an .id column equal to the row numbers of DT and then copy them over:
bycols = "col1"
res = unique(DT[, rbindlist(col2, id="DT_row")])
res[, (bycols) := DT[DT_row, ..bycols]]
To put those columns first/leftmost, I think setcolorder(res, bycols) should work, but I'm on too old a data.table version to check.
There's also an open issue for a tidyr::unnest-like function.
This works:
DT1 <- apply(DT, 1, function(x) cbind(col1 = x$col1, x$col2))
unique(rbindlist(DT1))
# col1 colA colB
#1: A 1 aa
#2: A 3 bb
#3: A 54 cc
#4: A 23 hh
#5: B 1 aa
You could do something hackish like this:
nDT <- cbind(rbindlist(DT[[2]]), col1 = rep(DT[[1]], sapply(DT[[2]], nrow)))
nDT[!duplicated(nDT)]
colA colB col1
1: 1 aa A
2: 3 bb A
3: 54 cc A
4: 23 hh A
5: 1 aa B
Or using tidyr (inspired by @PKumar's comment; note that tidyr >= 1.0 wants the list column named explicitly, e.g. unnest(DT, cols = col2)):
unique(tidyr::unnest(DT))
Or more generalisable base R:
names(DT[[2]]) <- DT[[1]]
ndf <- do.call(rbind, DT[[2]])
# row names now look like "A.1", "A.2", ..., "B"; this assumes single-character col1 values
ndf$col1 <- substr(row.names(ndf), 1, 1)
unique(ndf)
I have a data frame like
col1 col2 col3
A 2 b1
A 3 b2
A 2 b2
A 2 b1
A 3 b2
I want to get the count of unique values of col3 for each combination of col1 and col2, as follows:
col1 col2 count_unique
A 2 2
A 3 1
What is the best one-line solution for this?
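(For reference, a construction of the sample data used by the answers below:)
df <- data.frame(col1 = rep('A', 5)
                 , col2 = c(2, 3, 2, 2, 3)
                 , col3 = c('b1', 'b2', 'b2', 'b1', 'b2'))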
As @Frank and @akrun pointed out in their comments, there are several possible solutions to your question - here are three of the most used ones:
in base R:
aggregate(col3~., df, function(x) length(unique(x)) )
using the data.table package (v1.9.5 and higher):
setDT(df)[, uniqueN(col3), by=.(col1,col2)]
using the dplyr package:
df %>% group_by(col1, col2) %>% summarise(col3=n_distinct(col3))
Two other options:
plyr
library(plyr)
count(unique(df), vars = c("col1", "col2"))
Output:
col1 col2 freq
1 A 2 2
2 A 3 1
sqldf
library(sqldf)
sqldf("SELECT col1, col2, COUNT(DISTINCT(col3)) n
FROM df GROUP BY col1, col2")
Output:
col1 col2 n
1 A 2 2
2 A 3 1
col1 <- c('A','B','C', 'D')
col2 <- c('B','A','C', 'C')
col3 <- c('B','C','C', 'A')
dat <- data.frame(cbind(col1, col2, col3))
dat
col1 col2 col3
1 A B B
2 B A C
3 C C C
4 D C A
I would like to remove rows 1 and 3 from dat as the letter B is present more than once in row 1 and the letter C is present more than once in row 3.
EDIT:
My actual data contains over 1 million rows and 14 columns, all of which contain character data. The fastest solution is preferred, as I am using the data frame in a live setting to make decisions, and the underlying data changes every few minutes.
You could try this (though I'm sure there is a better way):
cols <- ncol(dat)
indx <- apply(dat, 1, function(x) length(unique(x)) == cols)
dat[indx, ]
# col1 col2 col3
# 2 B A C
# 4 D C A
Another way (if your columns are characters and you don't have too many columns) is something like the following, which is vectorized:
indx <- with(dat, (col1 == col2) | (col1 == col3) | (col2 == col3))
dat[!indx, ]
# col1 col2 col3
# 2 B A C
# 4 D C A
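Since the question mentions 14 columns, here is a hedged sketch (my addition, not from the original answers) that builds all the pairwise comparisons programmatically while staying vectorized:
# every pair of column names; choose(14, 2) = 91 comparisons for 14 columns
pairs <- combn(names(dat), 2, simplify = FALSE)
# flag rows where any two columns match; as.character() guards against factor columns
indx <- Reduce(`|`, lapply(pairs, function(p)
  as.character(dat[[p[1]]]) == as.character(dat[[p[2]]])))
dat[!indx, ]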
You could do this in dplyr, if you don't mind specifying the columns:
library(dplyr)
dat %>%
rowwise() %>%
mutate(repeats = max(table(c(col1, col2, col3))) - 1) %>%
filter(repeats == 0) %>%
select(-repeats) # if you don't want that column to appear in results
Source: local data frame [2 x 3]
col1 col2 col3
1 B A C
2 D C A
Here is an alternative; I haven't tested it on a big dataset:
library(data.table) #devel version v1.9.5
#melt the matrix into (Var1 = row, Var2 = column, value), then keep the rows
#where the count of unique values equals the number of columns
dat[setDT(melt(as.matrix(dat)))[, uniqueN(value) == .N, Var1]$V1, ]
# col1 col2 col3
#2 B A C
#4 D C A
Or use anyDuplicated(), which returns the position of the first duplicate within a row (0 when all values are distinct), so rows with any repeated value are dropped:
dat[!apply(dat, 1, anyDuplicated),]
# col1 col2 col3
#2 B A C
#4 D C A
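Since speed matters here, a quick hedged way (my addition) to compare the candidates on your own data, using the microbenchmark package:
library(microbenchmark)
microbenchmark(
  apply_unique   = dat[apply(dat, 1, function(x) length(unique(x)) == ncol(dat)), ],
  any_duplicated = dat[!apply(dat, 1, anyDuplicated), ],
  times = 100
)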