Find the highest value in an interval defined by an other file - r

I have these two dataset:
Before that contains 5 columns (chromsome, start, end, line number, score)
chrI 861 870 87 5
chrI 871 880 88 11
chrI 881 890 89 11
chrI 891 900 90 19
chrI 901 910 91 19
chrI 911 920 92 20
chrI 921 930 93 20
chrI 931 940 94 20
chrI 941 950 95 19
chrI 951 960 96 19
chrI 961 970 97 19
chrI 971 980 98 19
chrI 981 990 99 25
chrI 991 1000 100 20
chrI 1001 1010 101 20
chrI 1011 1020 102 20
chrI 1021 1030 103 20
chrI 1031 1040 104 15
chrI 1041 1050 105 14
chrI 1051 1060 106 14
chrI 1061 1070 107 13
chrI 1071 1080 108 13
chrI 1081 1090 109 13
chrI 1091 1100 110 7
chrI 1101 1110 111 7
Peaks that contains 4 columns (chromsome, start, end, value)
"chrI" 880 1091 383
"chrI" 1350 1601 302
"chrI" 1680 1921 241
"chrI" 2220 2561 322
"chrI" 2750 2761 18
"chrI" 3100 3481 420
"chrI" 3660 4211 793
"chrI" 4480 4491 20
"chrI" 4710 4871 195
"chrI" 5010 5261 238
For each lines of Peaks I would like to extract the corresponding lines (e.g all the lines between 880 and 1091 for the first line) in Before, find the highest score value and write it on a new file.
Output
chrI 981 990 99 25
To this end, I've written this function:
summit <- function(x,y,output){
y<- Before
chrom <- x[1]
start <-x[2]
end <-x[3]
startLine <- y[which((y$V1 == chrom) & (y$V2==start)),]
endLine <- y[which((y$V1 == chrom) & (y$V3==end)),]
Subset <- y[which((y$V2 >= startLine$V2) & (y$V3 <= endLine$V2))]
maximum <- Subset[which(Subset$V4 == max(Subset$V4))]
output <- print(maximum)
}
apply(Peaks,1,summit,output = 'peaks_list.bed')
I don't have an error message but It runs during the entire night without giving me results so I guess something is wrong with my code but I don't know what.
I also try this:
Peaks_Range <- GRanges(seqnames=Peaks$V1, ranges=IRanges(start=Peaks$V2, end=Peaks$V3))
Before_Range <- GRanges(seqnames=Before$V1, ranges=IRanges(start=Before$V2, end=Before$V3),score=Before$V5)
Merged <- mergeByOverlaps(Peaks_Range,Before_Range)
Merged <- as.data.frame(Merged)
for (i in 1:nrow(Peaks)){
start <-Peaks[i,2]
end <-Peaks[i,3]
Subset <- subset(Merged,Merged$Peaks_Range.start == start)
maximum <- as.numeric(max(Subset$score))
summit <- Subset[which(Subset$score == maximum),]
write.table(summit,'peaks_list.bed', sep="\t", append=T, col.name = FALSE, row.names = FALSE, quote = FALSE)
}
It works (I think) but this is very very slow so I search an alternative way to do it.
Does anyone have any idea?

You can use cut to help you get the index.
setwd("/home/wang/Downloads")
before <- read.table("before.txt", header = F, stringsAsFactors = F)
colnames(before) <- c("chromosome", "start", "end", "line number", "score")
peaks <- read.table("peaks.txt", header = F, stringsAsFactors = F, quote = "\"")
colnames(peaks) <- c("chromosome", "start", "end", "value")
summit <- function(peaks_vec){
chromosome = peaks_vec[1]
start = as.numeric(peaks_vec[2])
end = as.numeric(peaks_vec[3])
filter_before = subset(before, chromosome == chromosome)
up_index = cut(end, filter_before[,2], labels = F) +1
down_index = cut(start, filter_before[,2], labels = F) +1
if(!is.na(down_index) & !is.na(up_index)){
new_filter_before = filter_before[down_index : up_index, ]
max_index = which.max(new_filter_before[,5])
return(unlist(new_filter_before[max_index,]))
}else {
return(rep(NA, 5)) # you can input what you want.
}
}
result <- t(apply(as.matrix(peaks), 1, summit))
remove_na_result <- as.data.frame(na.omit(result))
colnames(remove_na_result) <- colnames(before)
And the final output is :
chromsome start end line number score
1 chrI 981 990 99 25
Wish my answer is helpful.

Related

Gompertz-Makeham parameter estimation

I would like estimate the parameters of the Gompert-Makeham distribution, but I haven't got a result.
I would like a method in R, like this Weibull parameter estimation code:
weibull_loglik <- function(parm){
gamma <- parm[1]
lambda <- parm[2]
loglik <- sum(dweibull(vec, shape=gamma, scale=lambda, log=TRUE))
return(-loglik)
}
weibull <- nlm(weibull_loglik,parm<-c(1,1), hessian = TRUE, iterlim=100)
weibull$estimate
c=weibull$estimate[1];b=weibull$estimate[2]
My data:
[1] 872 52 31 26 22 17 11 17 17 8 20 12 25 14 17
[16] 20 17 23 32 37 28 24 43 40 34 29 26 32 34 51
[31] 50 67 84 70 71 137 123 137 172 189 212 251 248 272 314
[46] 374 345 411 494 461 505 506 565 590 535 639 710 733 795 786
[61] 894 963 1019 1149 1185 1356 1354 1460 1622 1783 1843 2049 2262 2316 2591
[76] 2730 2972 3187 3432 3438 3959 3140 3612 3820 3478 4054 3587 3433 3150 2881
[91] 2639 2250 1850 1546 1236 966 729 532 375 256 168 107 65 39 22
[106] 12 6 3 2 1 1
summary(vec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 32.0 314.0 900.9 1355.0 4054.0
It would be nice to have a reproducible example, but something like:
library(bbmle)
library(eha)
set.seed(101)
vec <- rmakeham(1000, shape = c(2,3), scale = 2)
dmwrap <- function(x, shape1, shape2, scale, log) {
res <- try(dmakeham(x, c(shape1, shape2), scale, log = log), silent = TRUE)
if (inherits(res, "try-error")) return(NA)
res
}
m1 <- mle2(y ~ dmwrap(shape1, shape2, scale),
start = list(shape1=1,shape2=1, scale=1),
data = data.frame(y = vec),
method = "Nelder-Mead"
)
Define a wrapper that (1) takes shape parameters as separate values; (2) returns NA rather than throwing an error when e.g. parameters are negative
Use Nelder-Mead rather than default BFGS for robustness
the fitdistrplus package might help too
if you're going to do a lot of this it may help to fit parameters on the log scale (i.e. use parameters logshape1, etc., and use exp(logshape1) etc. in the fitting formula)
I had to work a little harder to fit your data; I scaled the variable by 1000 (and found that I could only compute the log-likelihood; the likelihood gave an error that I didn't bother trying to track down). Unfortunately, it doesn't look like a great fit (too many small values).
x <- scan(text = "872 52 31 26 22 17 11 17 17 8 20 12 25 14 17
20 17 23 32 37 28 24 43 40 34 29 26 32 34 51
50 67 84 70 71 137 123 137 172 189 212 251 248 272 314
374 345 411 494 461 505 506 565 590 535 639 710 733 795 786
894 963 1019 1149 1185 1356 1354 1460 1622 1783 1843 2049 2262 2316 2591
2730 2972 3187 3432 3438 3959 3140 3612 3820 3478 4054 3587 3433 3150 2881
2639 2250 1850 1546 1236 966 729 532 375 256 168 107 65 39 22
12 6 3 2 1 1")
m1 <- mle2(y ~ dmwrap(shape1, shape2, scale),
start = list(shape1=1,shape2=1, scale=10000),
data = data.frame(y = x/1000),
method = "Nelder-Mead"
)
cc <- as.list(coef(m1))
png("gm.png")
hist(x,breaks = 25, freq=FALSE)
with(cc,
curve(exp(dmwrap(x/1000, shape1, shape2, scale, log = TRUE))/1000, add = TRUE)
)
dev.off()

How to calculate Williams %R in RStudio?

I am trying to write a function to calculate Williams %R on data in R. Here is my code:
getSymbols('AMD', src = 'yahoo', from = '2018-01-01')
wr = function(high, low, close, n) {
highh = runMax((high),n)
lowl = runMin((low),n)
-100 * ((highh - close) / (highh - lowl))
}
williampr = wr(AMD$AMD.High, AMD$AMD.Low, AMD$AMD.Close, n = 10)
After implementing a buy/sell/hold signal, it returns integer(0):
## 1 = BUY, 0 = HOLD, -1 = SELL
## implement Lag to shift the time back to the previous day
tradingSignal = Lag(
## if wpr is greater than 0.8, BUY
ifelse(Lag(williampr) > 0.8 & williampr < 0.8,1,
## if wpr signal is less than 0.2, SELL, else, HOLD
ifelse(Lag(williampr) > 0.2 & williampr < 0.2,-1,0)))
## make all missing values equal to 0
tradingSignal[is.na(tradingSignal)] = 0
## see how many SELL signals we have
which(tradingSignal == "-1")
What am I doing wrong?
It would have been a good idea to identify that you were using the package quantmod in your question.
There are two things preventing this from working.
You didn't inspect what you expected! Your results in williampr are all negative. Additionally, you multiplied the values by 100, so 80% is 80, not .8. I removed -100 *.
I have done the same thing so many times.
wr = function(high, low, close, n) {
highh = runMax((high),n)
lowl = runMin((low),n)
((highh - close) / (highh - lowl))
}
That's it. It works now.
which(tradingSignal == "-1")
# [1] 13 15 19 22 39 71 73 84 87 104 112 130 134 136 144 146 151 156 161 171 175
# [22] 179 217 230 255 268 288 305 307 316 346 358 380 386 404 449 458 463 468 488 492 494
# [43] 505 510 515 531 561 563 570 572 574 594 601 614 635 642 644 646 649 666 668 672 691
# [64] 696 698 719 729 733 739 746 784 807 819 828 856 861 872 877 896 900 922 940 954 968
# [85] 972 978 984 986 1004 1035 1048 1060

fast partial match checking in R (or Python or Julia)

I have two dataset with names and I need to compare names in both datasets. I just need to keep the union of the two datasets based on the names. However, a name is still considered 'matched' if it is part of the another name even if it is not a full match and vice versa. For example, "seb" should match to "seb", but also to "sebas". I am using str_detect(), but it is too slow. I am wondering if there is any way to speed up this process. I tried some other packages and functions, but nothing really improved the speed. I am open for any R or Python solution.
Create two dummy datasets
library(dplyr)
library(stringr)
set.seed(1)
data_set_A <- tibble(name = unique(replicate(2000, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_A = 1:n())
set.seed(2)
data_set_B <- tibble(name_2 = unique(replicate(2000, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_B = 1:n())
Test matching of full matches only
# This is almost instant
data_set_A %>%
rowwise() %>%
filter(any(name %in% data_set_B$name_2) | any(data_set_B$name_2 %in% name)) %>%
ungroup()
# A tibble: 4 x 2
name ID_A
<chr> <int>
1 vnt 112
2 fly 391
3 cug 1125
4 xgv 1280
Include partial matches (This is what I want to optimize)
This of course only gives me the subset of dataset A, but that is ok.
# This takes way too long
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
ungroup()
A tibble: 237 x 2
name ID_A
<chr> <int>
1 wknrsauuj 2
2 lyw 7
3 igwsvrzpk 16
4 zozxjpu 18
5 cgn 22
6 oqo 45
7 gkritbe 47
8 uuq 92
9 lhwfyksz 94
10 tuw 100
Fuzzyjoin method.
This also works, but is equally slow
bind_rows(
fuzzyjoin::fuzzy_inner_join(
data_set_A,
data_set_B,
by = c("name" = "name_2"),
match_fun = stringr::str_detect
) %>%
select(name, ID_A),
fuzzyjoin::fuzzy_inner_join(
data_set_B,
data_set_A,
by = c("name_2" = "name"),
match_fun = stringr::str_detect
) %>%
select(name, ID_A)
) %>%
distinct()
data.table solution
not much faster unfortunately
library(data.table)
setDT(data_set_A)
setDT(data_set_B)
data_set_A[data_set_A[, .I[any(str_detect(name, data_set_B$name_2)) |
any(str_detect(data_set_B$name_2, name))], by = .(ID_A)]$V1]
This is an [r] option aimed at reducing the number of times you are calling str_detect() (i.e., your example is slow because the function is called several thousand times; and for not using fixed() or fixed = TRUE as jpiversen already pointed out). Answer explained in comments in the code; I will try to jump on tomorrow to explain a bit more.
This should scale reasonably well and be more memory efficient than the current approach too because reduces the rowwise computations to an absolute minimum.
Benchmarks:
n = 2000
# A tibble: 4 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int>
1 original() 6.67s 6.67s 0.150 31.95MB 0.300 1
2 using_fixed() 496.54ms 496.54ms 2.01 61.39MB 4.03 1
3 using_map_fixed() 493.35ms 493.35ms 2.03 60.27MB 6.08 1
4 andrew_fun() 167.78ms 167.78ms 5.96 1.59MB 0 1
n = 4000
Note: I am not sure if you need the answer to scale; but the approach of reducing the memory-intensive part does seem to do just that (although the time difference is negligible for n = 4000 for 1 iteration, IMO).
# A tibble: 4 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int>
1 original() 26.63s 26.63s 0.0376 122.33MB 0.150 1
2 using_fixed() 1.91s 1.91s 0.525 243.96MB 3.67 1
3 using_map_fixed() 1.87s 1.87s 0.534 236.62MB 3.20 1
4 andrew_fun() 674.36ms 674.36ms 1.48 7.59MB 0 1
Code w/ comments:
# This is so we do not retain the strings with the max number of
# characters in our pattern because we are checking with %in% already
nchar_a = nchar(data_set_A$name)
nchar_b = nchar(data_set_B$name_2)
# Creating large patterns (excluding values w/ max number of characters)
pattern_a = str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
pattern_b = str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
# First checking using %in%
idx_a = data_set_A$name %in% data_set_B$name_2
# Next, IDing when a(string) matches b(pattern)
idx_a[!idx_a] = str_detect(data_set_A$name[!idx_a], pattern_b)
# IDing a(pattern) matches b(string) so we do not run every row of
# a(as a pattern) against all of b
b_to_check = data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]
# Using unmatched values of a as a pattern for the reduced set for b
idx_a[!idx_a] = vapply(data_set_A$name[!idx_a], function(name) {
any(grepl(name, b_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)
data_set_A[idx_a, ]
# A tibble: 237 × 2
name ID_A
<chr> <int>
1 wknrsauuj 2
2 lyw 7
3 igwsvrzpk 16
4 zozxjpu 18
5 cgn 22
6 oqo 45
7 gkritbe 47
8 uuq 92
9 lhwfyksz 94
10 tuw 100
# … with 227 more rows
Reproducible R code for benchmarks
The following code is largely taken from jpiversen who provided a great answer:
library(dplyr)
library(stringr)
n = 2000
set.seed(1)
data_set_A <- tibble(name = unique(replicate(n, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_A = 1:n())
set.seed(2)
data_set_B <- tibble(name_2 = unique(replicate(n, paste(sample(letters, runif(1, 3, 10), replace = T), collapse = "")))) %>%
mutate(ID_B = 1:n())
original <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
ungroup()
}
using_fixed <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, fixed(data_set_B$name_2))) | any(str_detect(data_set_B$name_2, fixed(name)))) %>%
ungroup()
}
using_map_fixed <- function() {
logical_vec <- data_set_A$name %>%
purrr::map_lgl(
~any(stringr::str_detect(.x, fixed(data_set_B$name_2))) ||
any(stringr::str_detect(data_set_B$name_2, fixed(.x)))
)
data_set_A[logical_vec, ]
}
andrew_fun = function() {
nchar_a = nchar(data_set_A$name)
nchar_b = nchar(data_set_B$name_2)
pattern_a = str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
pattern_b = str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
idx_a = data_set_A$name %in% data_set_B$name_2
idx_a[!idx_a] = str_detect(data_set_A$name[!idx_a], pattern_b)
b_to_check = data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]
idx_a[!idx_a] = vapply(data_set_A$name[!idx_a], function(name) {
any(grepl(name, b_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)
data_set_A[idx_a, ]
}
bm = bench::mark(
original(),
using_fixed(),
using_map_fixed(),
andrew_fun(),
iterations = 1
)
TL;DR
The slow part is str_detect(string, pattern).
To speed it up, wrap pattern in fixed() if you got simple strings, and in coll() if you got longer, typical human text.
To get another slight speed boost, rewrite your code using purrr::map_lgl() and use this to subset your data.
Under follows examples, explanations and benchmarks.
Rewriting str_detect() using fixed() or coll()
I believe the easiest fix is to modify how str_detect() uses regex with e.g. stringr::fixed() or stringr::coll().
From ?stringr::str_detect():
Match a fixed string (i.e. by comparing only bytes), using fixed(). This is fast, but approximate. Generally, for matching human text, you'll want coll() which respects character matching rules for the specified locale.
Under is a comparison with your original code:
original <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, data_set_B$name_2)) | any(str_detect(data_set_B$name_2, name))) %>%
ungroup()
}
# Note the use of fixed()
using_fixed <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, fixed(data_set_B$name_2))) | any(str_detect(data_set_B$name_2, fixed(name)))) %>%
ungroup()
}
# Note the use of coll()
using_coll <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, coll(data_set_B$name_2))) | any(str_detect(data_set_B$name_2, coll(name)))) %>%
ungroup()
}
bm <- bench::mark(
original(),
using_fixed(),
using_coll(),
iterations = 20
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
bm
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 original() 6.58s 6.59s 0.152 32.4MB 0.371
#> 2 using_fixed() 501.64ms 505.51ms 1.97 61.4MB 3.94
#> 3 using_coll() 4.48s 4.5s 0.222 61.4MB 0.512
bm %>% ggplot2::autoplot(type = "violin")
#> Loading required namespace: tidyr
Created on 2022-04-02 by the reprex package (v2.0.1)
So, as we can see, wrapping your code in fixed() will make it very fast and works well on your test data. However, it might not work as well for real human text (especially non-ASCII character sets). You should test it on your original data, and use coll() as an alternative if fixed() doesn't work.
Removing rowwise()
Another step you can take to make your code a bit faster is to get rid of rowwise(). I would replace it using purrr::map_lgl() and use this logical vector to subset the dataframe. Under is an example and a benchmark against my functions defined above:
using_map_fixed <- function() {
logical_vec <- data_set_A$name %>%
purrr::map_lgl(
~any(stringr::str_detect(.x, fixed(data_set_B$name_2))) ||
any(stringr::str_detect(data_set_B$name_2, fixed(.x)))
)
data_set_A[logical_vec, ]
}
using_map_coll <- function() {
logical_vec <- data_set_A$name %>%
purrr::map_lgl(
~any(stringr::str_detect(.x, coll(data_set_B$name_2))) ||
any(stringr::str_detect(data_set_B$name_2, coll(.x)))
)
data_set_A[logical_vec, ]
}
bm <- bench::mark(
using_fixed(),
using_map_fixed(),
using_coll(),
using_map_coll(),
iterations = 20
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
bm
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 using_fixed() 503.4ms 507.24ms 1.95 62.9MB 5.37
#> 2 using_map_fixed() 474.28ms 477.63ms 2.09 60.3MB 3.14
#> 3 using_coll() 4.49s 4.5s 0.222 61.4MB 0.489
#> 4 using_map_coll() 4.37s 4.38s 0.228 60.2MB 0.354
Created on 2022-04-02 by the reprex package (v2.0.1)
As we see, this gives another slight speed boost.
Using fixed() with data.table or fuzzyjoin
You can also use fixed() with data.table and fuzzyjoin. I have not included it here for brevity, but my benchmark shows that data.table takes about the same amount of time as my using_map_fixed() above, and fuzzyjoin takes about twice as much time.
This makes sense to me, as the slow part is str_detect(), not the method of joining/filtering, or the underlying data structure.
If you would like to use base R, the code below might be one fast option
A <- data_set_A$name
B <- data_set_B$name_2
A2B <- sapply(A, function(x) grepl(x, B, fixed = TRUE))
B2A <- sapply(B, function(x) grepl(x, A, fixed = TRUE))
idx <- which(t(A2B) | B2A, arr.ind = TRUE)
res <- cbind(data_set_A[idx[, 1], ], data_set_B[idx[, 2], ])
which gives
> res
name ID_A name_2 ID_B
1 arh 1234 pimoarhd 8
2 qtj 720 aqtj 23
3 szcympsn 142 cym 43
4 cymvubnxg 245 cym 43
5 dppvtcymq 355 cym 43
6 kzi 690 kzii 48
7 eyajqchkn 498 chk 53
8 upfzh 522 upf 61
9 ioa 1852 ioadr 63
10 lya 1349 ibelyalvh 64
11 honod 504 ono 71
12 zozxjpu 18 zoz 72
13 jcz 914 cdjczpqg 88
14 ailmjf 623 ilm 99
15 upoux 609 oux 104
16 pouxifvp 1466 oux 104
17 mvob 516 vob 106
18 nqtotvhhm 1088 otv 115
19 wom 202 womtglapx 117
20 qkc 756 dqkcfqpps 118
21 qtl 600 ivqtlymzr 126
22 qqi 1605 owfsqqiyu 153
23 fmjalirze 1470 ali 172
24 ibwfwkyp 1588 fwk 175
25 iat 1258 iatjeg 185
26 osm 253 nviiqosm 199
27 wpj 373 wpjeb 204
28 hahx 515 ahx 213
29 keahxa 1565 ahx 213
30 psf 359 qnpsfo 223
31 saq 1859 saqhu 227
32 cvmkwtx 714 cvm 228
33 ilw 389 pyilwj 231
34 ohwysv 1590 ysv 237
35 utrl 698 trl 244
36 dmttrlcpj 1267 trl 244
37 cpv 236 btcpvmoc 247
38 uto 1047 utoi 257
39 yngunekl 1978 ekl 258
40 vceko 625 vce 265
41 fir 1934 firgk 278
42 qvd 983 eqvdfi 287
43 fir 1934 zwwefir 291
44 idvfkevdf 1380 vdf 312
45 qwdo 1921 qwd 322
46 kam 1205 tlkam 327
47 lck 488 clckjkyzn 329
48 gmspwckw 1015 msp 359
49 ynouuwqtz 1576 nou 360
50 tty 1209 bttyvt 361
51 vkc 999 fmrvkcl 366
52 ipw 1918 fipwjomdu 388
53 zdv 261 zdvkut 410
54 vku 1137 zdvkut 410
55 doby 246 oby 411
56 hycvuupgy 141 uup 421
57 uwlb 1249 wlb 431
58 auj 1452 lcmnauj 444
59 rwd 1667 ukwrwdczs 479
60 ylsihqqor 1290 ihq 483
61 feo 1649 feorvxbm 485
62 zff 755 dohzffujm 499
63 mqutujepu 904 epu 507
64 uiepu 1308 epu 507
65 vahepuk 1434 epu 507
66 cug 1125 accugl 509
67 fir 1934 firwe 517
68 dia 1599 dialeddd 527
69 temiwd 1725 tem 531
70 svofivl 1177 svo 545
71 flm 657 aflm 546
72 vnt 112 vnt 551
73 bhmoskrz 426 osk 558
74 wev 728 shemuwev 569
75 hzpi 1586 hzp 579
76 gvi 1064 mkgvivlfe 582
77 fjb 1398 vkfjbxnjl 589
78 qin 1013 qinp 593
79 ecn 1342 ecnzre 598
80 zre 1610 ecnzre 598
81 xvr 772 dpxvrfmo 623
82 tqr 1419 tqrmztdm 624
83 zmwnf 1571 mwn 626
84 ypil 1787 pil 630
85 mnxlqgfh 1132 nxl 643
86 gse 1563 gseice 646
87 ygk 1309 ygkqrk 655
88 fgm 933 vzfgmy 663
89 rlupd 977 upd 666
90 mcupdkuiy 1307 upd 666
91 fly 391 fly 669
92 vbkko 1603 kko 678
93 uvrew 465 rew 680
94 hgbhngwvd 901 wvd 690
95 wvdjprmo 1432 wvd 690
96 cgn 22 cgnd 698
97 dngnjv 967 njv 700
98 psqs 841 sqs 720
99 ywv 1180 ptywvlgc 730
100 ypil 1787 ypi 734
101 rwd 1667 srserwd 737
102 jqydasl 1294 jqy 742
103 ckujmc 717 ujm 751
104 dfzxta 662 xta 775
105 bjb 1562 jabjbei 779
106 adwknpll 1242 npl 780
107 kdv 1327 xhkdvqo 789
108 ghj 174 oghj 801
109 lhwfyksz 94 lhw 811
110 nwrrnlhw 929 lhw 811
111 xlhwm 1720 lhw 811
112 ncc 1602 wurhxnccn 814
113 jdslrf 1094 dsl 835
114 ktmw 1738 tmw 844
115 igwsvrzpk 16 gws 856
116 kug 591 pkugls 857
117 befgcpedr 339 fgc 862
118 ojf 1397 ojfpnkla 863
119 gyl 1203 gylxeqzw 872
120 ugcbb 1727 ugc 876
121 arh 1234 karhwhg 878
122 amm 458 ammqdc 883
123 azazryje 636 zaz 900
124 wczazw 1887 zaz 900
125 gkritbe 47 ritb 915
126 vku 1137 yjvkuxued 929
127 rnh 1633 kvyrnhugu 937
128 mzh 1135 xllmwmzhn 940
129 cug 1125 cug 960
130 xgv 1280 xgv 962
131 xusxgv 1436 xgv 962
132 umc 351 lwumcmvoo 980
133 zlb 1900 nkyazlb 991
134 llfkalao 1049 llf 1002
135 sflpbht 991 lpb 1048
136 rairmmcl 442 mmc 1087
137 mmckoln 780 mmc 1087
138 gfxmmcgb 1814 mmc 1087
139 aoj 402 taojlgp 1089
140 mypvzhp 121 ypv 1095
141 moctwaypv 611 ypv 1095
142 rngedn 306 ged 1106
143 djshecy 1408 ecy 1108
144 rairmmcl 442 rmm 1117
145 gzua 1594 zua 1124
146 ytj 416 yytj 1140
147 ubt 300 hubtcfr 1141
148 gqg 1854 ogqgsjqc 1144
149 tfg 1204 xiutfgru 1145
150 avrq 741 avr 1147
151 ytkpvss 440 tkp 1149
152 kug 591 yxsjkug 1176
153 vix 1846 vixsmn 1187
154 qtl 600 qtljkxz 1188
155 lgr 494 dlgrco 1189
156 ryg 864 xlmtryg 1203
157 yskvkxwj 1547 kvk 1205
158 kxhee 1795 xhe 1222
159 hzbcjs 1493 cjs 1224
160 kbi 270 itxlwkbi 1225
161 gdymcam 806 ymca 1232
162 tqr 1419 rxtqrdtl 1236
163 yyz 215 yyzw 1242
164 jyx 1735 mljjyxu 1248
165 aai 1928 umkpaaiwo 1254
166 dsd 1122 dndsdova 1257
167 tor 744 etor 1270
168 vhcyznp 1296 yzn 1278
169 xlc 1947 odxlcjwj 1280
170 mlm 1629 aomlmgtq 1303
171 owm 239 owmugb 1304
172 ynezwaml 507 nez 1308
173 jls 695 jlsve 1325
174 dvm 879 dvmv 1339
175 vsgx 944 dqpihvsgx 1352
176 wfo 768 wfokpjois 1354
177 tltbkinat 1986 nat 1362
178 gyl 1203 gylqte 1363
179 ngg 735 bsnggqbjd 1366
180 fkq 345 jdfkqf 1368
181 ojf 1397 ojfpgfga 1382
182 dqgd 1623 prqbndqgd 1398
183 siu 827 siuypucup 1412
184 yinsoivfd 1895 yin 1414
185 esm 1834 sesmeepz 1417
186 umc 351 umcj 1432
187 wny 866 wnyxamguw 1443
188 ujbhtvnin 399 vni 1444
189 dbq 630 bdbqq 1452
190 ebn 1405 ebngddw 1461
191 zcj 704 rbtjzcjod 1465
192 avn 500 avnspxv 1468
193 vkk 567 hvkk 1477
194 hmm 1441 bgjhmmthz 1483
195 aguakz 614 guak 1487
196 hycvuupgy 141 pgy 1493
197 tizpgymz 280 pgy 1493
198 guk 571 cncxdguk 1502
199 zyw 281 nzywuqs 1504
200 jnz 1558 rxdxsjnzw 1510
201 uuq 92 nxuuqtj 1514
202 qtj 720 nxuuqtj 1514
203 vkk 567 xpbpvkkdc 1518
204 iaa 460 sjiaa 1525
205 txsgmynng 1019 xsg 1526
206 yjvtwc 1107 jvt 1529
207 lnk 1113 hylnknwy 1546
208 szd 635 woszdm 1557
209 osm 253 sosmdp 1567
210 nbd 1067 nbdmmg 1570
211 mmg 1305 nbdmmg 1570
212 wqdsatbd 1536 sat 1585
213 sdlypo 1527 sdl 1596
214 inkynog 288 inky 1600
215 hpwoeclfy 1321 clf 1601
216 wodyqwqf 679 dyq 1603
217 lyw 7 xnalywyuw 1607
218 njm 1825 vjlnjmns 1617
219 njytqhaut 428 qha 1620
220 ilw 389 rilwbk 1647
221 oqo 45 ixoqowkpg 1650
222 odcbcvaun 1386 bcv 1652
223 mastn 434 stn 1662
224 xebhdssit 1091 xeb 1663
225 nmy 782 nmyxj 1671
226 fsqvgdw 673 gdw 1676
227 mwwczhs 482 wcz 1679
228 wczazw 1887 wcz 1679
229 anmryzm 915 ryz 1698
230 rteh 523 rte 1708
231 mlwrguae 817 lwr 1709
232 mbu 819 xpsuqmbuf 1729
233 mmckoln 780 cko 1733
234 lxpg 798 lxp 1734
235 ane 370 vxnanehvk 1746
236 tty 1209 vbttyozui 1752
237 igncdgyjx 332 ign 1753
238 ndignk 621 ign 1753
239 nmy 782 ivnmyba 1780
240 wknrsauuj 2 rsa 1799
241 tgd 165 qtgdidlf 1803
242 iaa 460 yziaazxto 1833
243 xto 1245 yziaazxto 1833
244 zff 755 dpzfft 1857
245 jyx 1735 jwjyxphe 1873
246 ytj 416 eytj 1881
247 lcggwonk 1596 onk 1882
248 zdv 261 zdvxfz 1889
249 xhskcb 417 kcb 1890
250 mrikqkcb 770 kcb 1890
251 psvxqnsap 1352 psv 1898
252 udjswzb 411 jsw 1900
253 rpfjswy 1840 jsw 1900
254 bjaywiso 1677 ayw 1902
255 zfli 130 fli 1906
256 vazx 1215 itvazxw 1918
257 tuw 100 tuwywtbwd 1921
258 vle 1437 ebvleaovm 1937
259 znycsygd 1757 nyc 1944
260 ynezwaml 507 ezw 1952
261 tseezwf 1276 ezw 1952
262 ezwzyfudo 1690 ezw 1952
263 oudiky 1503 dik 1964
264 dikjn 1615 dik 1964
265 oms 106 wpomsudi 1977
266 hhp 1864 hhpkm 1983
Benchmarking
It seems this base R option is slightly slower than #Andrew's approach.
TIC <- function() {
A <- data_set_A$name
B <- data_set_B$name_2
A2B <- sapply(A, function(x) grepl(x, B, fixed = TRUE))
B2A <- sapply(B, function(x) grepl(x, A, fixed = TRUE))
idx <- which(t(A2B) | B2A, arr.ind = TRUE)
cbind(data_set_A[idx[, 1], ], data_set_B[idx[, 2], ])
# data_set_A[unique(idx[, 1]), ]
}
jpiversen_fixed <- function() {
data_set_A %>%
rowwise() %>%
filter(any(str_detect(name, fixed(data_set_B$name_2))) | any(str_detect(data_set_B$name_2, fixed(name)))) %>%
ungroup()
}
andrew <- function() {
nchar_a <- nchar(data_set_A$name)
nchar_b <- nchar(data_set_B$name_2)
pattern_a <- str_c(unique(data_set_A$name[nchar_a != max(nchar_a, na.rm = TRUE)]), collapse = "|")
pattern_b <- str_c(unique(data_set_B$name_2[nchar_b != max(nchar_b, na.rm = TRUE)]), collapse = "|")
idx_a <- data_set_A$name %in% data_set_B$name_2
idx_a[!idx_a] <- str_detect(data_set_A$name[!idx_a], pattern_b)
b_to_check <- data_set_B$name_2[str_detect(data_set_B$name_2, pattern_a)]
idx_a[!idx_a] <- vapply(data_set_A$name[!idx_a], function(name) {
any(grepl(name, b_to_check, fixed = TRUE))
}, logical(1L), USE.NAMES = FALSE)
data_set_A[idx_a, ]
}
bm <- microbenchmark(
TIC(),
jpiversen_fixed(),
andrew(),
times = 20
)
shows that
> bm
Unit: milliseconds
expr min lq mean median uq max
TIC() 423.8410 441.3574 492.6091 478.2596 549.2376 611.3841
jpiversen_fixed() 1354.8954 1373.9502 1447.8649 1395.6766 1459.7058 1842.2574
andrew() 329.4821 335.3388 345.8890 341.4758 354.1298 381.6872
neval
20
20
20

Increment every nth range R

I am writing R code where there's a vector 'x' which contains values 1 to 100 and I want to create another vector 'y' which subsets a range of values at every nth range. I'm sure I can use the rep() and seq() but I can't figure out the code to get what I need. Here's what the output should look like
x <- 1:100
y <- 1 2 3 11 12 13 21 22 23 31 32 33 41 42 43 51 52 53 61 62 63 71 72 73 81 82 83 91 92 93
So if I was to do have a vector x <- 1001:1100, x[y] should return:
1001 1002 1003 1011 1012 1013 1021 1022 1023 1031 1032 1033 1041 1042 1043...etc
Any ideas?
You could use grepl for that:
x <- 1001:1100
y <- grepl("[1-3]$", x)
x[y]
# [1] 1001 1002 1003 1011 1012 1013 1021 1022 1023 1031 1032 1033 1041 1042 1043 1051 1052
#[18] 1053 1061 1062 1063 1071 1072 1073 1081 1082 1083 1091 1092 1093
It simply checks for each element of x whether the last digit is in the range of 1, 2 or 3 and if so, it returns TRUE, otherwise FALSE. This logical index is then used to subset x.
In case your objective is not to subset elements ending in 1,2 or 3 but instead, to always subset 3 elements, then leave out 7, and then subset 3 again etc... you could do:
x <- 1001:1100
y <- rep(c(TRUE, FALSE), c(3, 7))
x[y]
# [1] 1001 1002 1003 1011 1012 1013 1021 1022 1023 1031 1032 1033 1041 1042 1043 1051 1052
#[18] 1053 1061 1062 1063 1071 1072 1073 1081 1082 1083 1091 1092 1093
In this case, vector y which is again logical, is recycled - note that length(x) should be divisible by length(y) for this to work properly.
For fun, With outer:
x <- 1001:1100
y <- as.vector(outer(1:3, seq(0, length(x)-10, 10), "+"))
x[y]
# [1] 1001 1002 1003 1011 1012 1013 1021 1022 1023 1031 1032 1033 1041 1042 1043
# [16] 1051 1052 1053 1061 1062 1063 1071 1072 1073 1081 1082 1083 1091 1092 1093
Probably this may help you:
x <- 1:100
y <- as.integer()
for(i in seq(1, length(x), 10)) {
y <- append(y, c(x[i], x[i+1], x[i+2]))
}
Hm. This started out as fun, but now I happen to like it since it is constructed in basically the same way the author of the question put it:
> do.call("c",lapply(0:5,function(X) 1:3+10*X))
[1] 1 2 3 11 12 13 21 22 23 31 32 33 41 42 43 51 52 53

marking a point in the boxplot

I am plotting three different sets as three boxplot in 1 page using ggplot2. In each set there is a point that I would like to highlight, and illustrate where the point stands compare to the others, is it inside the box ? or the outside.
Here is my datapoint
CDH 1KG NHLBI
CDH 301 688 1762
RS0 204 560 21742
RS1 158 1169 1406
RS2 182 1945 1467
RS3 256 2371 1631
RS4 198 580 1765
RS5 193 524 1429
RS6 139 2551 1469
RS7 188 702 1584
RS8 142 4311 1461
RS9 223 916 1591
RS10 250 794 1406
RS11 185 539 1270
RS12 228 641 1786
RS13 152 557 1677
RS14 225 1970 1619
RS15 196 458 1543
RS16 203 2891 1528
RS17 221 1542 1780
RS18 258 1173 1850
RS19 202 718 1651
RS20 191 6314 1564
library(ggplot2)
rm(list = ls())
orig_table = read.table("thedata.csv", header = T, sep = ",")
bb = orig_table # have copy of the data
bb = bb[,-1] # since these points, the ones in the first raw are my interesting point, I exclude them from the sets for the time being
tt = bb
mydata = cbind(c(tt[,1], tt[,2], tt[,3]), c(rep(1,22),rep(2,22),rep(3,22))) # I form the dataframe
data2 = cbind(c(301,688,1762),c(1,2,3)) # here is the points that I want to highlight - similar to the first raw
colnames(data2) = c("num","gro")
data2 = as.data.frame(data2) # I form them as a dataframe
colnames(mydata) = c("num","gro")
mydata = as.data.frame(mydata)
mydata$gro = factor(mydata$gro, levels=c(1,2,3))
qplot(gro, num, data=mydata, geom=c("boxplot"))+scale_y_log10() # I am making the dataframe out of 21 other ponts
# and here I want to highlight those three values in the "data2" dataframe
I appreciate your help
First, ggplot is a lot easier to use if you use data in long format. melt from reshape2 helps with that:
library(reshape2)
library(ggplot2)
df$highlight <- c(TRUE, rep(FALSE, nrow(df) - 1L)) # tag first row as interesting
df.2 <- melt(df) # convert df to long format
ggplot(subset(df.2, !highlight), aes(x=variable, y=value)) +
geom_boxplot() + scale_y_log10() +
geom_point( # add the highlight points
data=subset(df.2, highlight),
aes(x=variable, y=value),
color="red", size=5
)
Now, all I did was add a TRUE, to the first row, melted the data to be compatible with ggplot, and plotted the points with highlight==TRUE in addition to the boxplots.
EDIT: this is how I made the data:
df <- read.table(text=" CDH 1KG NHLBI
CDH 301 688 1762
RS0 204 560 21742
RS1 158 1169 1406
RS2 182 1945 1467
RS3 256 2371 1631
RS4 198 580 1765
RS5 193 524 1429
RS6 139 2551 1469
RS7 188 702 1584
RS8 142 4311 1461
RS9 223 916 1591
RS10 250 794 1406
RS11 185 539 1270
RS12 228 641 1786
RS13 152 557 1677
RS14 225 1970 1619
RS15 196 458 1543
RS16 203 2891 1528
RS17 221 1542 1780
RS18 258 1173 1850
RS19 202 718 1651
RS20 191 6314 1564", header=T)

Resources