Does Column of Sentences Contain Word in Another Column of Sentences? - r

I have two large tables, each containing a "sentence" column with a string of words. For each pair of sentences (one from each table), I want a true/false indicator of whether they share any word. My tables are very large and the code below can take a very long time. Is there a faster way to go about doing this?
Thank you!
# Determine if any "words" in either column of sentences match.
# Packages
library(tidyverse)
# Help functions
helper_in_2 <- function(b, a){
return(any(b %in% a))
}
helper_in <- function(a, b){
return(lapply(b, helper_in_2, a))
}
# Sample columns
sentence_col_a <- c("This is an example sentence.", "Here is another sample sentence?", "One more sentence that is not complicated.", "Last sentence to show an example!")
sentence_col_b <- c("Short string A.", "Another longer string.", "Final string example!")
# Extract words from each column
list_col_a <- str_to_lower(sentence_col_a) %>%
str_extract_all("[:alpha:]+")
list_col_b <- str_to_lower(sentence_col_b) %>%
str_extract_all("[:alpha:]+")
# Check for matches.
# (Code after first line isn't actually used in my code - it's just to show matches)
sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric) %>%
t() %>%
as.data.frame() %>%
rename_at(vars(names(.)), function(x) sentence_col_b) %>%
mutate(rownames = sentence_col_a) %>%
tibble::column_to_rownames(var = "rownames")
Output:
                                           Short string A. Another longer string. Final string example!
This is an example sentence.                             0                      0                     1
Here is another sample sentence?                         0                      1                     0
One more sentence that is not complicated.               0                      0                     0
Last sentence to show an example!                        0                      0                     1
Update after Ronak's Answer
library(microbenchmark)
microbenchmark("Original method:" = sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric),
"Ronak's method:" = sapply(list_col_a, function(x) as.integer(grepl(sprintf('\\b(%s)\\b', paste0(x, collapse = '|')), list_col_b))))
#Unit: microseconds
# expr min lq mean median uq max neval
#Original method: 72.9 76.65 88.082 82.35 86.1 173.9 100
# Ronak's method: 262.1 277.40 354.741 286.40 348.6 3724.3 100

Here are several options; the nested for-loop method appears to be the most efficient one so far:
outer
TIC1 <- function() {
+outer(list_col_a, list_col_b, FUN = Vectorize(function(x, y) any(x %in% y))) # unary "+" coerces the logical matrix to 0/1
}
nested sapply
TIC2 <- function() {
sapply(
list_col_b,
function(x) {
sapply(
list_col_a,
function(y) sum(y %in% x)
)
}
)
}
nested for loops
TIC3 <- function() {
res <- matrix(nrow = length(list_col_a), ncol = length(list_col_b))
for (a in seq_along(list_col_a)) {
for (b in seq_along(list_col_b)) {
res[a, b] <- any(list_col_a[[a]] %in% list_col_b[[b]])
}
}
+res # unary "+" coerces the logical matrix to 0/1
}
Benchmarking
# Original solution
original <- function() {
sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric) %>%
t() %>%
as.data.frame() %>%
rename_at(vars(names(.)), function(x) sentence_col_b) %>%
mutate(rownames = sentence_col_a) %>%
tibble::column_to_rownames(var = "rownames")
}
# Waldi's data.table solution
Waldi <- function() {
la <- data.table(id = 1:length(list_col_a), list_col_a)
lb <- data.table(id = 1:length(list_col_b), list_col_b)
la_long <- la[, .(words = unlist(list_col_a)), by = id]
lb_long <- lb[, .(words = unlist(list_col_b)), by = id]
unique(la_long[lb_long, on = .(words = words)][!is.na(id), .(idxa = id, idxb = i.id)])
}
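# mustafaakben's nested-lapply solution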
mustafaakben1 <- function(rows = list_col_a, cols = list_col_b) {
to_matrix <- function(X_t) {
matrix(unlist(X_t),
nrow = length(list_col_a),
ncol = length(list_col_b),
byrow = T
)
}
to_matrix(lapply(
1:length(cols),
FUN = function(X) {
lapply(
X = 1:length(rows),
FUN = function(Y) {
sum(rows[[Y]] %in% cols[[X]])
}
)
}
))
}
library(fastmatch)
mustafaakben2 <- function() {
search_keywords <- unlist(list_col_b)[unlist(list_col_b) %in% unlist(list_col_a)]
b_col_filter <- which(unlist(lapply(list_col_b, function(X) any(X %in% search_keywords))))
a_row_filter <- which(unlist(lapply(list_col_a, function(X) any(X %in% search_keywords))))
res <- matrix(0,
nrow = length(list_col_a),
ncol = length(list_col_b)
)
for (a in a_row_filter) {
for (b in b_col_filter) {
res[a, b] <- any(list_col_a[[a]] %fin% list_col_b[[b]])
}
}
+res
}
# ThomasIsCoding's outer solution
TIC1 <- function() {
+outer(list_col_a, list_col_b, FUN = Vectorize(function(x, y) any(x %in% y)))
}
TIC2 <- function() {
sapply(
list_col_b,
function(x) {
sapply(
list_col_a,
function(y) sum(y %in% x)
)
}
)
}
TIC3 <- function() {
res <- matrix(nrow = length(list_col_a), ncol = length(list_col_b))
for (a in seq_along(list_col_a)) {
for (b in seq_along(list_col_b)) {
res[a, b] <- any(list_col_a[[a]] %in% list_col_b[[b]])
}
}
+res
}
microbenchmark::microbenchmark(
original(),
Waldi(),
mustafaakben1(),
mustafaakben2(),
TIC1(),
TIC2(),
TIC3(),
unit = "relative"
)
and you will see
Unit: relative
            expr        min         lq      mean     median         uq      max neval
      original() 172.895884 149.346066 49.841448 142.676077 134.111459 3.130206   100
         Waldi() 107.441122  92.004380 30.290680  88.474026  83.690267 1.971249   100
 mustafaakben1()   1.596981   1.551978  1.646884   1.610160   1.553021 1.683034   100
 mustafaakben2()   1.635812   1.731991  2.186106   1.912535   1.831179 2.332797   100
          TIC1()   3.854043   3.845866  1.977066   3.943445   3.707308 1.041416   100
          TIC2()   2.888118   2.627955  1.607401   2.719427   2.538536 1.142211   100
          TIC3()   1.000000   1.000000  1.000000   1.000000   1.000000 1.000000   100

Try a nested lapply
to_matrix <- function(X_t){
  matrix(unlist(X_t),
         nrow = length(list_col_a),
         ncol = length(list_col_b),
         byrow = T)
}
nested_lappy <- function(rows=list_col_a, cols=list_col_b) {
to_matrix(lapply(
1:length(cols),
FUN = function (X)
lapply(
X = 1:length(rows),
FUN = function(Y)
sum(rows[[Y]] %in% cols[[X]])
)
))
}
> nested_lappy()
[,1] [,2] [,3]
[1,] 0 0 1
[2,] 0 1 0
[3,] 0 0 0
[4,] 0 0 1
Your result matrix will probably be huge too, so it would be better to use a sparse matrix. The Matrix package can help you carry out this analysis in a more memory-efficient way.
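For illustration, here is one way that could look with the Matrix package (a minimal sketch of my own, not benchmarked; it reuses list_col_a and list_col_b from the question):
library(Matrix)
# Find the matching (row, column) pairs first, then store only those
# positions as 1s in a sparse matrix instead of filling a dense one.
pairs <- do.call(rbind, lapply(seq_along(list_col_a), function(a) {
  b <- which(vapply(list_col_b, function(w) any(list_col_a[[a]] %in% w), logical(1)))
  if (length(b)) cbind(a, b) else NULL
}))
sparse_res <- sparseMatrix(i = pairs[, 1], j = pairs[, 2], x = 1,
                           dims = c(length(list_col_a), length(list_col_b)))
sparse_res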
Here is the benchmark
microbenchmark::microbenchmark(
original(),
Waldi(),
TIC(),
nested_lappy(),
unit = "relative"
)
Unit: relative
expr min lq mean median uq max neval
original() 99.97881 89.869163 83.011249 67.88434 69.883301 260.704657 100
Waldi() 56.55076 51.185905 45.436361 39.35327 42.730942 46.438114 100
TIC() 2.27000 2.249311 1.986625 1.84108 1.837013 3.974149 100
nested_lappy() 1.00000 1.000000 1.000000 1.00000 1.000000 1.000000 100
Edits
I will cheat here a little bit because @ThomasIsCoding is an amazing coder. I need to cheat :)
Since you have a huge table, you need to focus on searching the keyword space efficiently. As you may notice, not all keywords are shared between the two columns, nor are they distributed uniformly across the sentences. So, even before starting the search, you can eliminate those sentences from the search space. By doing so, we can focus only on the words shared by both the column and row dimensions.
search_keywords<- unlist(list_col_b)[unlist(list_col_b) %in% unlist(list_col_a)]
b_col_filter <- which(unlist(lapply(list_col_b, function(X) any(X %in% search_keywords))))
a_row_filter <- which(unlist(lapply(list_col_a, function(X) any(X %in% search_keywords))))
Then, use the fastmatch package to make the %in% faster.
library(fastmatch)
mustafaakben2 <- function() {
res <- matrix(0,
nrow = length(list_col_a),
ncol = length(list_col_b))
for (a in a_row_filter) {
for (b in b_col_filter) {
res[a, b] <- any(list_col_a[[a]] %fin% list_col_b[[b]])
}
}
+res
}
> mustafaakben2()
[,1] [,2] [,3]
[1,] 0 0 1
[2,] 0 1 0
[3,] 0 0 0
[4,] 0 0 1
Benchmark results
microbenchmark::microbenchmark(
original(),
Waldi(),
TIC1(),
TIC2(),
TIC3(),
mustafaakben(),
mustafaakben2(),
unit = "relative"
)
Unit: relative
expr min lq mean median uq max neval cld
original() 288.620155 254.429012 193.446439 190.457965 171.914286 115.0415822 100 c
Waldi() 182.751938 153.864198 115.182908 115.778761 103.518095 36.9411765 100 b
TIC1() 6.581395 6.277778 5.074523 5.066372 4.685714 2.3732252 100 a
TIC2() 4.705426 4.385802 3.503269 3.466814 3.281905 1.5811359 100 a
TIC3() 1.767442 1.685185 1.360847 1.338496 1.249524 0.7728195 100 a
mustafaakben() 2.589147 2.330247 1.944260 2.017699 1.864762 0.7322515 100 a
mustafaakben2() 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100 a

You could use a data.table join to get the id of the matching sentences.
library(data.table)
# Original solution
original <- function(){
sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric) %>%
t() %>%
as.data.frame() %>%
rename_at(vars(names(.)), function(x) sentence_col_b) %>%
mutate(rownames = sentence_col_a) %>%
tibble::column_to_rownames(var = "rownames")
}
# data.table solution
new <- function(){
la <- data.table(id = 1:length(list_col_a),list_col_a)
lb <- data.table(id = 1:length(list_col_b),list_col_b)
la_long <- la[,.(words=unlist(list_col_a)),by= id]
lb_long <- lb[,.(words=unlist(list_col_b)),by= id]
unique(la_long[lb_long, on=.(words=words)][!is.na(id),.(idxa=id, idxb = i.id)])
}
new()
idxa idxb
1: 2 2
2: 1 3
3: 4 3
microbenchmark::microbenchmark(original(),new())
Unit: milliseconds
expr min lq mean median uq max neval cld
original() 4.1623 5.1190 5.857155 5.5528 6.18345 23.5442 100 b
new() 2.2492 2.7993 3.255741 3.1298 3.68645 5.1872 100 a
As data.table allows indexing, this could be much more efficient on a larger number of sentences/words: to be tested on a bigger dataset.
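If you still need the full true/false matrix rather than the index pairs, a small follow-up sketch (my addition, not part of Waldi's answer) could rebuild it from the join result:
# Rebuild a logical matrix from the (idxa, idxb) pairs returned by new();
# rows correspond to sentences in column a, columns to sentences in column b.
idx <- new()
res <- matrix(FALSE, nrow = length(list_col_a), ncol = length(list_col_b))
res[cbind(idx$idxa, idx$idxb)] <- TRUE
res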

With the help of regular expressions, you can do this with one sapply call. We build a pattern from each element of list_col_a and check whether any of its words appears in list_col_b.
sapply(list_col_a, function(x) as.integer(grepl(sprintf('\\b(%s)\\b',
paste0(x, collapse = '|')), list_col_b)))
# [,1] [,2] [,3] [,4]
#[1,] 0 0 0 0
#[2,] 0 1 0 0
#[3,] 1 0 0 1
You can include your remaining code as it is to get the matches.
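If you want the result oriented and labelled like the matrix in the question, a small hedged addition on top of the one-liner above could look like this:
# Transpose so rows correspond to sentence_col_a and columns to sentence_col_b,
# then attach the original sentences as dimnames.
m <- t(sapply(list_col_a, function(x)
  as.integer(grepl(sprintf('\\b(%s)\\b', paste0(x, collapse = '|')), list_col_b))))
dimnames(m) <- list(sentence_col_a, sentence_col_b)
m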

Related

Vectorization of matrix operation in R matching string patterns

I'm using the code below to create a matrix that compares all strings in one vector to see if they contain any of the patterns in the second vector:
strngs <- c("hello there", "welcome", "how are you")
pattern <- c("h", "e", "o")
M <- matrix(nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)){
for(j in 1:length(pattern)){
M[i, j]<-str_count(strngs[i], pattern[j])
}
}
M
It works great, and returns the matrix I'm looking for:
[,1] [,2] [,3]
[1,] 2 3 1
[2,] 0 2 1
[3,] 1 1 2
However, my real data set is huge, and looping like this doesn't scale well to a matrix with 117,746,754 values. Does anyone know a way I could vectorize this or otherwise speed it up? Or should I just learn C++? ;)
Thanks!
You can use outer and stri_count_fixed as suggested by @snoram.
outer(strngs, pattern, stringi::stri_count_fixed)
# [,1] [,2] [,3]
#[1,] 2 3 1
#[2,] 0 2 1
#[3,] 1 1 2
Here is some marginal improvement by removing the inner loop and switching to stringi (which stringr is built upon).
M <- matrix(0L, nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)) {
M[i, ] <- stringi::stri_count_fixed(strngs[i], pattern)
}
And then a more standard R way:
t(sapply(strngs, stringi::stri_count_fixed, pattern))
Yet another solution, with sapply. Basically snoram's solution.
t(sapply(strngs, stringi::stri_count_fixed, pattern))
# [,1] [,2] [,3]
#hello there 2 3 1
#welcome 0 2 1
#how are you 1 1 2
Tests.
Since there are a total of 4 ways, here are some speed tests.
f0 <- function(){
M<-matrix(nrow=length(strngs),ncol=length(pattern))
for(i in 1:length(strngs)){
for(j in 1:length(pattern)){
M[i,j]<-stringr::str_count(strngs[i],pattern[j])
}
}
M
}
f1 <- function(){
M <- matrix(0L, nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)) {
M[i, ] <- stringi::stri_count_fixed(strngs[i], pattern)
}
M
}
f2 <- function() outer(strngs, pattern, stringi::stri_count_fixed)
f3 <- function() t(sapply(strngs, stringi::stri_count_fixed, pattern))
r0 <- f0()
r1 <- f1()
r2 <- f2()
r3 <- f3()
identical(r0, r1)
identical(r0, r2)
identical(r0, r3) # FALSE, the return has rownames
library(microbenchmark)
library(ggplot2)
mb <- microbenchmark(
op = f0(),
snoram = f1(),
markus = f2(),
rui = f3()
)
mb
#Unit: microseconds
# expr min lq mean median uq max
# op 333.425 338.8705 348.23310 341.7700 345.8060 542.699
# snoram 47.923 50.8250 53.96677 54.8500 56.3870 69.903
# markus 27.502 29.8005 33.17537 34.3670 35.7490 54.095
# rui 68.994 72.3020 76.77452 73.4845 77.1825 215.328
autoplot(mb)

mutate() with an if/else function

I have an example dataframe
df <- data.frame(cust = sample(1:100, 1000, TRUE),
channel = sample(c("WEB", "POS"), 1000, TRUE))
that I'm trying to mutate
get_channels <- function(data) {
d <- data
if(unique(d) %>% length() == 2){
d <- "Both"
} else {
if(unique(d) %>% length() < 2 && unique(d) == "WEB") {
d <- "Web"
} else {
d <- "POS"
}
}
return(d)
}
This works without issue, and on small data frames it takes no time at all.
start.time <- Sys.time()
df %>%
group_by(cust) %>%
mutate(chan = get_channels(channel)) %>%
group_by(cust) %>%
slice(1) %>%
group_by(chan) %>%
summarize(count = n()) %>%
mutate(perc = count/sum(count))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
Time difference of 0.34602 secs
However, when the data frame gets rather large, say on the order of 1,000,000 or more cust values, my basic if/else function takes much, much longer.
How can I streamline this function to make it run more quickly?
You should use a data.table for this.
setDT(df)
t1 = Sys.time()
df = df[ , .(channels = ifelse(uniqueN(channel) == 2, "both", as.character(channel[1]))), by = .(cust)]
> Sys.time() - t1
Time difference of 0.00500083 secs
> head(df)
cust channels
1: 37 both
2: 45 both
3: 74 both
4: 20 both
5: 1 both
6: 68 both
You can do it in base R using something like this:
web_cust <- unique(df$cust[df$channel=="WEB"])
pos_cust <- unique(df$cust[df$channel=="POS"])
both <- length(intersect(web_cust, pos_cust))
web_only <- length(setdiff(web_cust, pos_cust))
pos_only <- length(setdiff(pos_cust, web_cust))
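If you also want the proportions reported by the dplyr and data.table versions, a short follow-up (my addition, not part of the original answer):
# Share of customers that used both channels, web only, or POS only.
counts <- c(Both = both, Web = web_only, POS = pos_only)
counts / sum(counts)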
Data:
set.seed(1)
df <- data.frame(cust = sample(2e6, 1e7, TRUE),
channel = sample(c("WEB", "POS"), 1e7, TRUE),
stringsAsFactors = F)
A faster dplyr version that takes about 1/3 the time but is probably still slower than the data.table version. uniqueN is borrowed from @Kristofersen's answer.
df %>%
group_by(cust) %>%
summarize(chan = if_else(uniqueN(channel) == 2, "Both", as.character(channel[1]))) %>%
group_by(chan) %>%
summarize(n = n() ) %>%
mutate(perc = n /sum(n))
Also, your original can be improved significantly by optimizing your function like this:
get_channels <- function(data) {
ud <- unique(data)
udl <- length(ud)
if(udl == 2) {
r <- "Both"
} else {
if(udl < 2 && ud == "WEB") {
r <- "Web"
} else {
r <- "POS"
}
}
return(r)
}
And some timings...
I tried three different alternatives in both dplyr and data.table: (1) ifelse (see @Kristofersen's answer), (2) if / else (because the test is of length 1), and (3) vector indexing. Unsurprisingly, the main difference is between dplyr and data.table and not among alternatives 1-3.
For 1000 customers, data.table is about 7 times faster. For 10000 customers it's about 30 times faster. For 1e6 customers, I only tested data.table, not a very large difference between alternatives.
# 1000 customers, 2*1000 registrations
df <- data.frame(cust = sample(1e3, 2e3, replace = TRUE),
channel = sample(c("WEB", "POS"), 2e3, TRUE))
library(microbenchmark)
library(dplyr)
library(data.table)
microbenchmark(dp1 = df %>%
group_by(cust) %>%
summarise(res = ifelse(n_distinct(channel) == 1, channel[1], "both")),
dp2 = df %>%
group_by(cust) %>%
summarise(res = if(n_distinct(channel) == 1) channel[1] else "both"),
dp3 = df %>%
group_by(cust) %>%
summarise(res = c("both", channel[1])[(n_distinct(channel) == 1) + 1]),
dt1 = setDT(df)[ , .(channels = ifelse(uniqueN(channel) == 2, "both", channel[1])), by = cust],
dt2 = setDT(df)[ , .(channels = if(uniqueN(channel) == 2) "both" else channel[1]), by = cust],
dt3 = setDT(df)[ , .(res = c("both", channel[1])[(uniqueN(channel) == 1) + 1]), by = cust],
times = 5, unit = "relative")
# 1e3 customers
# Unit: relative
# expr min lq mean median uq max neval
# dp1 7.8985477 8.176139 7.9355234 7.676534 8.0359975 7.9166933 5
# dp2 7.8882707 8.018000 7.8965098 8.731935 7.8414478 7.3560530 5
# dp3 8.0851402 8.934831 7.7540060 7.653026 6.8305012 7.6887950 5
# dt1 1.1713088 1.180870 1.0350482 1.209861 1.0523597 0.7650059 5
# dt2 0.8272681 1.223387 0.9311628 1.047773 0.9028017 0.7795579 5
# dt3 1.0000000 1.000000 1.0000000 1.000000 1.0000000 1.0000000 5
# 1e4 customers
# Unit: relative
# expr min lq mean median uq max neval
# dp1 40.8725204 39.5297108 29.5755838 38.996075 38.246103 17.2784642 5
# dp2 40.7396141 39.4299918 27.4476811 38.819577 37.886320 12.7265756 5
# dp3 41.0940358 39.7819673 27.5532964 39.260488 38.317899 12.4685386 5
# dt1 1.0905470 1.0661613 0.7422082 1.053786 1.034642 0.3428945 5
# dt2 0.9052739 0.9008761 1.2813458 2.111642 2.356008 0.9005391 5
# dt3 1.0000000 1.0000000 1.0000000 1.000000 1.000000 1.0000000 5
# 1e6 customers, data.table only
# Unit: relative
# expr min lq mean median uq max neval
# dt1 1.146757 1.147152 1.155497 1.164471 1.156244 1.161660 5
# dt2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5
# dt3 1.084442 1.079734 1.253568 1.106833 1.098766 1.799935 5

Optimizing speed of nearest search function in R [closed]

I'm trying to make this function faster (ideally with RcppArmadillo or some other alternative). myfun takes a matrix, mat, that can get quite large but always has two columns. For each row, myfun finds the other rows that are +1 or -1 away in absolute value (i.e. whose absolute element-wise differences sum to 1).
In the example below, the first row of mat is (3, 3). Therefore, myfun outputs a list in which rows 2 and 3 are closest to row 1, but not row 5, which is +2 away.
library(microbenchmark)
dim(mat)
[1] 1000 2
head(mat)
x y
[1,] 3 3
[2,] 3 4
[3,] 3 2
[4,] 7 3
[5,] 4 4
[6,] 10 1
output
[[1]]
[1] 2 3
[[2]]
[1] 1
[[3]]
[1] 1
[[4]]
integer(0)
[[5]]
integer(0)
[[6]]
integer(0)
microbenchmark( myfun(mat), times = 100) #mat of 1000 rows
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(mat) 89.30126 90.28618 95.50418 90.91281 91.50875 180.1505 100
microbenchmark( myfun(mat), times = 100) #mat of 10,000 rows
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(layout.old) 5.912633 5.912633 5.912633 5.912633 5.912633 5.912633 1
This is what myfun looks like
myfun = function(x){
doo <- function(j) {
j.mat <- matrix(rep(j, length = length(x)), ncol = ncol(x), byrow = TRUE)
j.abs <- abs(j.mat - x)
return(which(rowSums(j.abs) == 1))
}
return(apply(x, 1, doo))
}
Below, I have a base R solution that is much faster than myfun provided by the OP.
myDistOne <- function(m) {
v1 <- m[,1L]; v2 <- m[,2L]
rs <- rowSums(m)
lapply(seq_along(rs), function(x) {
t1 <- which(abs(rs[x] - rs) == 1)
t2 <- t1[which(abs(v1[x] - v1[t1]) <= 1)]
t2[which(abs(v2[x] - v2[t2]) <= 1)]
})
}
Here are some benchmarks:
library(microbenchmark)
set.seed(9711)
m1 <- matrix(sample(50, 2000, replace = TRUE), ncol = 2) ## 1,000 rows
microbenchmark(myfun(m1), myDistOne(m1))
Unit: milliseconds
expr min lq mean median uq max neval cld
myfun(m1) 78.61637 78.61637 80.47931 80.47931 82.34225 82.34225 2 b
myDistOne(m1) 27.34810 27.34810 28.18758 28.18758 29.02707 29.02707 2 a
identical(myfun(m1), myDistOne(m1))
[1] TRUE
m2 <- matrix(sample(200, 20000, replace = TRUE), ncol = 2) ## 10,000 rows
microbenchmark(myfun(m2), myDistOne(m2))
Unit: seconds
expr min lq mean median uq max neval cld
myfun(m2) 5.219318 5.533835 5.758671 5.714263 5.914672 7.290701 100 b
myDistOne(m2) 1.230721 1.366208 1.433403 1.419413 1.473783 1.879530 100 a
identical(myfun(m2), myDistOne(m2))
[1] TRUE
Here is a very large example:
m3 <- matrix(sample(1000, 100000, replace = TRUE), ncol = 2) ## 50,000 rows
system.time(testJoe <- myDistOne(m3))
user system elapsed
26.963 10.988 37.973
system.time(testUser <- myfun(m3))
user system elapsed
148.444 33.297 182.639
identical(testJoe, testUser)
[1] TRUE
I'm sure there is a faster solution. Maybe sorting the rowSums upfront and working from there could yield an improvement (it could also get very messy).
Update
As I predicted, working from a sorted rowSums is much faster (and uglier!)
myDistOneFast <- function(m) {
v1 <- m[,1L]; v2 <- m[,2L]
origrs <- rowSums(m)
mySort <- order(origrs)
rs <- origrs[mySort]
myDiff <- c(0L, diff(rs))
brks <- which(myDiff > 0L)
lenB <- length(brks)
n <- nrow(m)
myL <- vector("list", length = n)
findRows <- function(v, s, r, u1, u2) {
lapply(v, function(x) {
sx <- s[x]
tv1 <- s[r]
tv2 <- tv1[which(abs(u1[sx] - u1[tv1]) <= 1)]
tv2[which(abs(u2[sx] - u2[tv2]) <= 1)]
})
}
t1 <- brks[1L]; t2 <- brks[2L]
## setting first index in myL
myL[mySort[1L:(t1-1L)]] <- findRows(1L:(t1-1L), mySort, t1:(t2-1L), v1, v2)
k <- t0 <- 1L
while (k < (lenB-1L)) {
t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- brks[k+2L]
vec <- t1:(t2-1L)
if (myDiff[t1] == 1L) {
if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
} else {
myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
}
} else if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
}
if (myDiff[t2] > 1L) {
if (myDiff[t3] > 1L) {
k <- k+2L; t0 <- t2
} else {
k <- k+1L; t0 <- t1
}
} else {k <- k+1L; t0 <- t1}
}
## setting second to last index in myL
if (k == lenB-1L) {
t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- n+1L; vec <- t1:(t2-1L)
if (myDiff[t1] == 1L) {
if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
} else {
myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
}
} else if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
}
k <- k+1L; t0 <- t1
}
t1 <- brks[k]; vec <- t1:n
if (myDiff[t1] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
}
myL
}
The results are not even close. myDistOneFast is over 100x faster than the OP's original myfun on very large matrices and also scales well. Below are some benchmarks:
microbenchmark(OP = myfun(m1), Joe = myDistOne(m1), JoeFast = myDistOneFast(m1))
Unit: milliseconds
expr min lq mean median uq max neval
OP 57.60683 59.51508 62.91059 60.63064 61.87141 109.39386 100
Joe 22.00127 23.11457 24.35363 23.87073 24.87484 58.98532 100
JoeFast 11.27834 11.99201 12.59896 12.43352 13.08253 15.35676 100
microbenchmark(OP = myfun(m2), Joe = myDistOne(m2), JoeFast = myDistOneFast(m2))
Unit: milliseconds
expr min lq mean median uq max neval
OP 4461.8201 4527.5780 4592.0409 4573.8673 4633.9278 4867.5244 100
Joe 1287.0222 1316.5586 1339.3653 1331.2534 1352.3134 1524.2521 100
JoeFast 128.4243 134.0409 138.7518 136.3929 141.3046 172.2499 100
system.time(testJoeFast <- myDistOneFast(m3))
user system elapsed
0.68 0.00 0.69 ### myfun took over 100s!!!
To test equality, we have to sort each vector of indices. We also can't use identical for comparison as myL is initialized as an empty list, thus some of the indices contain NULL values (these correspond to integer(0) in the result from myfun and myDistOne).
testJoeFast <- lapply(testJoeFast, sort)
all(sapply(1:50000, function(x) all(testJoe[[x]]==testJoeFast[[x]])))
[1] TRUE
unlist(testJoe[which(sapply(testJoeFast, is.null))])
integer(0)
Here is an example with 500,000 rows:
set.seed(42)
m4 <- matrix(sample(2000, 1000000, replace = TRUE), ncol = 2)
system.time(myDistOneFast(m4))
user system elapsed
10.84 0.06 10.94
Here is an overview of how the algorithm works:

1. Calculate rowSums.
2. Order the rowSums (i.e. return the indices that sort the original vector).
3. Call diff.
4. Mark each non-zero instance.
5. Determine which indices in the small range satisfy the OP's request.
6. Use the ordering vector calculated in 2 to map back to the original indices.

This is much faster than comparing one rowSum to all of the rowSums every time.
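To make steps 2-4 concrete, here is a tiny standalone illustration (my own sketch, separate from myDistOneFast) of how sorting the row sums and calling diff exposes the groups that can possibly be distance-1 neighbours:
set.seed(1)
m    <- matrix(sample(5, 12, replace = TRUE), ncol = 2)
rs   <- rowSums(m)
ord  <- order(rs)        # step 2: ordering permutation of the row sums
srs  <- rs[ord]          # row sums in increasing order
d    <- c(0L, diff(srs)) # step 3: gap between consecutive sorted sums
brks <- which(d > 0L)    # step 4: positions where a new rowSum value starts
# Only rows in adjacent groups separated by a gap of exactly 1 can be
# +1/-1 neighbours; all other pairs are skipped entirely.
rbind(sorted_sum = srs, gap = d)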

Repeating calculation based on conditions

What I am trying to do is pretty simple. However, I am new to R and have not learned much about loops and functions and am not sure what is the most efficient way to get the results. Basically, I want to count the number of rows that meet my conditions and do a division. Here is an example:
df1 <- data.frame(
Main = c(0.0089, -0.050667, -0.030379, 0.066484, 0.006439, -0.026076),
B = c(NA, 0.0345, -0.0683, -0.052774, 0.014661, -0.040537),
C = c(0.0181, 0, -0.056197, 0.040794, 0.03516, -0.022662),
D = c(-0.0127, -0.025995, -0.04293, 0.057816, 0.033458, -0.058382)
)
df1
# Main B C D
# 1 0.008900 NA 0.018100 -0.012700
# 2 -0.050667 0.034500 0.000000 -0.025995
# 3 -0.030379 -0.068300 -0.056197 -0.042930
# 4 0.066484 -0.052774 0.040794 0.057816
# 5 0.006439 0.014661 0.035160 0.033458
# 6 -0.026076 -0.040537 -0.022662 -0.058382
My criterion for the numerator is to count the number of B/C/D values that are >0 when Main is >0; for the denominator, count the number of B/C/D values that are != 0 when Main is != 0. I can use length(which(df1$Main >0 & df1$B>0)) / length(which(df1$Main !=0 & df1$B !=0)) to get the ratio for each column individually. But my data set has many more columns, and I am wondering if there is a way to get those ratios all at once so that my result will look like:
# B C D
# 1 0.2 0.6 0.3
Use apply:
apply(df1[,-1], 2, function(x) length(which(df1$Main >0 & x>0)) / length(which(df1$Main !=0 & x !=0)))
You could do this vectorized (no apply or for loop is needed):
tail(colSums(df1[df1$Main>0,]>0, na.rm = T) / colSums(df1[df1$Main!=0,]!=0, na.rm = T), -1)
# B C D
#0.2000000 0.6000000 0.3333333
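Broken into steps (my own annotation, not part of the original answer), the one-liner does the following:
# df1[df1$Main > 0, ] > 0   : TRUE where a value is positive, restricted to
#                             rows where Main is positive (numerator rows)
# df1[df1$Main != 0, ] != 0 : TRUE where a value is non-zero, restricted to
#                             rows where Main is non-zero (denominator rows)
# colSums(..., na.rm = TRUE): counts the TRUEs per column, ignoring NAs
# tail(..., -1)             : drops the Main column itself from the result
num   <- colSums(df1[df1$Main > 0, ] > 0, na.rm = TRUE)
denom <- colSums(df1[df1$Main != 0, ] != 0, na.rm = TRUE)
tail(num / denom, -1)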
One way to do this would be with a for loop that loops over the columns and applies the function that you wrote. Something like this:
ratio1<-vector()
for(i in 2:ncol(df1)){
ratio1[i-1] <- length(which(df1$Main >0 & df1[,i]>0)) / length(which(df1$Main !=0 & df1[,i] !=0))
}
Maybe there is a better way to do this with apply or data.table, but this is a simple solution that I could come up with. It works on any number of columns. Use round() if you want the answer to one decimal place.
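For example (my addition, assuming you also want the column names attached):
names(ratio1) <- names(df1)[-1]
round(ratio1, 1)
#   B   C   D
# 0.2 0.6 0.3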
criteria1 <- df1[which(df1$Main > 0), -1] > 0
criteria2 <- df1[which(df1$Main != 0), -1] != 0
colSums(criteria1, na.rm = T)/colSums(criteria2, na.rm = T)
## B C D
## 0.2000000 0.6000000 0.3333333
Edit: It appears Niek's method is quickest for this specific data
# Unit: microseconds
# expr min lq mean median uq max neval
# Jim(df1) 216.468 230.0585 255.3755 239.8920 263.6870 802.341 300
# emilliman5(df1) 120.109 135.5510 155.9018 142.4615 156.0135 1961.931 300
# Niek(df1) 97.118 107.6045 123.5204 111.1720 119.6155 1966.830 300
# nine89(df1) 211.683 222.6660 257.6510 232.2545 252.6570 2246.225 300
#[[1]]
# [,1] [,2] [,3] [,4]
#median 239.892 142.462 111.172 232.255
#ratio 1.000 0.594 0.463 0.968
#diff 0.000 -97.430 -128.720 -7.637
However, when there are many columns the vectorized approach is quicker.
Nrow <- 1000
Ncol <- 1000
mat <- matrix(runif(Nrow*Ncol),Nrow)
df1 <- data.frame(Main = sample(-2:2,Nrow,T), mat) #1001 columns
#Unit: milliseconds
# expr min lq mean median uq max
# Jim(df1) 46.75627 53.88500 66.93513 56.58143 62.04375 185.0460
#emilliman5(df1) 73.35257 91.87283 151.38991 178.53188 185.06860 292.5571
# Niek(df1) 68.17073 76.68351 89.51625 80.14190 86.45726 200.7119
# nine89(df1) 51.36117 56.79047 74.53088 60.07220 66.34270 191.8294
#[[1]]
# [,1] [,2] [,3] [,4]
#median 56.581 178.532 80.142 60.072
#ratio 1.000 3.155 1.416 1.062
#diff 0.000 121.950 23.560 3.491
functions
Jim <- function(df1){
criteria1 <- df1[which(df1$Main > 0), -1] > 0
criteria2 <- df1[which(df1$Main != 0), -1] != 0
colSums(criteria1, na.rm = T)/colSums(criteria2, na.rm = T)
}
emilliman5 <- function(df1){
apply(df1[,-1], 2, function(x) length(which(df1$Main >0 & x>0)) / length(which(df1$Main !=0 & x !=0)))
}
Niek <- function(df1){
ratio1<-vector()
for(i in 2:ncol(df1)){
ratio1[i-1] <- length(which(df1$Main >0 & df1[,i]>0)) / length(which(df1$Main !=0 & df1[,i] !=0))
}
ratio1
}
nine89 <- function(df){
tail(colSums(df[df$Main>0,]>0, na.rm = T) / colSums(df[df$Main!=0,]!=0, na.rm = T), -1)
}

What is the right way to multiply data frame by vector?

I'm trying to multiply a data frame df by a vector v, so that the product is a data frame, where the i-th row is given by df[i,]*v. I can do this, for example, by
df <- data.frame(A=1:5, B=2:6); v <- c(0,2)
as.data.frame(t(t(df) * v))
A B
1 0 4
2 0 6
3 0 8
4 0 10
5 0 12
I am sure there has to be a more R-style approach (and a very simple one!), but nothing comes to mind. I even tried something like
apply(df, MARGIN=1, function(x) x*v)
but still, non-readable constructions like as.data.frame(t(.)) are required.
How can I find an efficient and elegant workaround here?
This works too:
data.frame(mapply(`*`,df,v))
In that solution, you are taking advantage of the fact that data.frame is a type of list, so you can iterate over both the elements of df and v at the same time with mapply.
Unfortunately, you are limited in what you can output from mapply: a simple list, or a matrix. If your data are huge, this would likely be more efficient:
data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE))
This is because it returns a list, which is more efficient to convert to a data.frame.
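A quick way to see the difference, using the df and v from the question:
# With the default SIMPLIFY = TRUE, mapply() returns a matrix that data.frame()
# must then convert; with SIMPLIFY = FALSE it returns a plain list, which
# data.frame() absorbs directly.
str(mapply(`*`, df, v))                    # a 5 x 2 numeric matrix
str(mapply(`*`, df, v, SIMPLIFY = FALSE))  # a list of 2 numeric vectors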
If you're looking for speed and memory efficiency - data.table to the rescue:
library(data.table)
dt = data.table(df)
for (i in seq_along(dt))
dt[, (i) := dt[[i]] * v[i]]
eddi = function(dt) { for (i in seq_along(dt)) dt[, (i) := dt[[i]] * v[i]] }
arun = function(df) { df * matrix(v, ncol=ncol(df), nrow=nrow(df), byrow=TRUE) }
nograpes = function(df) { data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE)) }
N = 1e6
dt = data.table(A = rnorm(N), B = rnorm(N))
v = c(0,2)
microbenchmark(eddi(copy(dt)), arun(copy(dt)), nograpes(copy(dt)), times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# eddi(copy(dt)) 23.01106 24.31192 26.47132 24.50675 28.87794 34.28403 10
# arun(copy(dt)) 337.79885 363.72081 450.93933 433.21176 516.56839 644.70103 10
# nograpes(copy(dt)) 19.44873 24.30791 36.53445 26.00760 38.09078 95.41124 10
As Arun points out in the comments, one can also use the set function from the data.table package to do this in-place modification on data.frame's as well:
for (i in seq_along(df))
set(df, j = i, value = df[[i]] * v[i])
This of course also works for data.table's and could be significantly faster if the number of columns is large.
A language that lets you combine vectors with matrices has to make a decision at some point whether the matrices are row-major or column-major ordered. The reason:
> df * v
A B
1 0 4
2 4 0
3 0 8
4 8 0
5 0 12
is because R operates down the columns first. Doing the double-transpose trick subverts this. Sorry if this is just explaining what you know, but I don't know another way of doing it, except explicitly expanding v into a matrix of the same size.
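For completeness, the explicit expansion mentioned above would look something like this (it is essentially the arun() variant used in the benchmarks):
# Recycle v across the rows of df so the element-wise product acts row-wise.
df * matrix(v, nrow = nrow(df), ncol = ncol(df), byrow = TRUE)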
Or write a nice function that wraps the not very R-style code into something that is R-stylish.
What's wrong with
t(apply(df, 1, function(x)x*v))
?
library(purrr)
map2_dfc(df, v, `*`)
Benchmark
N = 1e6
dt = data.table(A = rnorm(N), B = rnorm(N))
v = c(0,2)
eddi = function(dt) { for (i in seq_along(dt)) dt[, (i) := dt[[i]] * v[i]]; dt }
arun = function(df) { df * matrix(v, ncol=ncol(df), nrow=nrow(df), byrow=TRUE) }
nograpes = function(df) { data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE)) }
ryan = function(df) {map2_dfc(df, v, `*`) }
library(microbenchmark)
microbenchmark(
eddi(copy(dt))
, arun(copy(dt))
, nograpes(copy(dt))
, ryan(copy(dt))
, times = 100)
# Unit: milliseconds
# expr min lq mean median uq max neval
# eddi(copy(dt)) 8.367513 11.06719 24.26205 12.29132 19.35958 171.6212 100
# arun(copy(dt)) 94.031272 123.79999 186.42155 148.87042 251.56241 364.2193 100
# nograpes(copy(dt)) 7.910739 10.92815 27.68485 13.06058 21.39931 172.0798 100
# ryan(copy(dt)) 8.154395 11.02683 29.40024 13.73845 21.77236 181.0375 100
I think the fastest way (without testing data.table) is data.frame(t(t(df)*v)).
My tests:
testit <- function(nrow, ncol)
{
df <- as.data.frame(matrix(rnorm(nrow*ncol),nrow=nrow,ncol=ncol))
v <- runif(ncol)
r1 <- data.frame(t(t(df)*v))
r2 <- data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE))
r3 <- df * rep(v, each=nrow(df))
stopifnot(identical(r1, r2) && identical(r1, r3))
microbenchmark(data.frame(t(t(df)*v)), data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE)), df * rep(v, each=nrow(df)))
}
Result
> set.seed(1)
>
> testit(100,100)
Unit: milliseconds
expr min lq median uq max neval
data.frame(t(t(df) * v)) 2.297075 2.359541 2.455778 3.804836 33.05806 100
data.frame(mapply(`*`, df, v, SIMPLIFY = FALSE)) 9.977436 10.401576 10.658964 11.762009 15.09721 100
df * rep(v, each = nrow(df)) 14.309822 14.956705 16.092469 16.516609 45.13450 100
> testit(1000,10)
Unit: microseconds
expr min lq median uq max neval
data.frame(t(t(df) * v)) 754.844 805.062 844.431 1850.363 27955.79 100
data.frame(mapply(`*`, df, v, SIMPLIFY = FALSE)) 1457.895 1497.088 1567.604 2550.090 4732.03 100
df * rep(v, each = nrow(df)) 5383.288 5527.817 5875.143 6628.586 32392.81 100
> testit(10,1000)
Unit: milliseconds
expr min lq median uq max neval
data.frame(t(t(df) * v)) 17.07548 18.29418 19.91498 20.67944 57.62913 100
data.frame(mapply(`*`, df, v, SIMPLIFY = FALSE)) 99.90103 104.36028 108.28147 114.82012 150.05907 100
df * rep(v, each = nrow(df)) 112.21719 118.74359 122.51308 128.82863 164.57431 100
