I'm analyzing Dual-Polarization radar data and I want to add the result of the Marshall Palmer relation as an ID-level variable in my data.
There's no CRAN function for this that I can find, but another R user has script wherein he applies the relation as an estimate of an expected value in the data:
# From Troy W. (thanks!)
# A few small changes by hack-r
## Someone better in R than me could probably clean up/refactor the code a bit.
library(dplyr)
library(data.table)
test <- fread('../input/test.csv')
mpalmer <- function(ref, minutes_past) {
# order reflectivity values and minutes_past
sort_min_index = order(minutes_past)
minutes_past <- minutes_past[sort_min_index]
ref <- ref[sort_min_index]
# calculate the length of time for which each reflectivity value is valid
valid_time <- rep(0, length(minutes_past))
valid_time[1] <- minutes_past[1]
if (length(valid_time) > 1) {
for (i in seq(2, length(minutes_past))) {
valid_time[i] <- minutes_past[i] - minutes_past[i-1]
}
valid_time[length(valid_time)] = valid_time[length(valid_time)] + 60 - sum(valid_time)
} else {
# if only 1 observation, make it valid for the entire hour
valid_time <- 60
}
valid_time = valid_time / 60
# calculate hourly rain rates using marshall-palmer weighted by valid times
sum <- 0
for (i in seq(length(ref))) {
if (!is.na(ref[i])) {
mmperhr <- ((10^(ref[i]/10))/200) ^ 0.625
sum <- sum + mmperhr * valid_time[i]
}
}
return(sum)
}
results <- test %>% group_by(Id) %>% summarize(Expected=sum)
write.csv(results, file='sample_solution.csv', row.names=FALSE)
In addition to being incredibly slow with Big Data, the problem with the code above is that it doesn't create a column of results within the original data, which would allow it to be productionalized in an ETL pipeline that created this relation at the ID level as 1 predictive variable in the dataset.
I tried rewriting the function like this:
mpalmer <- function(ref, minutes_past) {
# Credit to Troy for this
# edits by Jason Miller, hack-r.com
# order reflectivity values and minutes_past
sort_min_index = order(minutes_past)
minutes_past <- minutes_past[sort_min_index]
ref <- ref[sort_min_index]
# calculate the length of time for which each reflectivity value is valid
valid_time <- rep(0, length(minutes_past))
valid_time[1] <- minutes_past[1]
if (length(valid_time) > 1) {
for (i in seq(2, length(minutes_past))) {
valid_time[i] <- minutes_past[i] - minutes_past[i-1]
}
valid_time[length(valid_time)] = valid_time[length(valid_time)] + 60 - sum(valid_time)
} else {
# if only 1 observation, make it valid for the entire hour
valid_time <- 60
}
valid_time = valid_time / 60
# calculate hourly rain rates using marshall-palmer weighted by valid times
sum <- 0
for (i in seq(length(ref))) {
if (!is.na(ref[i])) {
mmperhr <- ((10^(ref[i]/10))/200) ^ 0.625
sum <- sum + mmperhr * valid_time[i]
}
}
return(sum)
}
and then applying it like this:
train.samp$mp <- aggregate(train.samp$Ref, by=list(train.samp$Id), FUN = mpalmer, minutes_past = train.samp$minutes_past)
which I think mostly works, however after running for a long time, it returned an error like this:
Error in `$<-.data.frame`(`*tmp*`, "mp", value = list(Group.1 = c(10L, :
replacement has 9765 rows, data has 10000
I've tried it on different samples of the data and the error message is always in that format, though the specific numbers may change. There's no missing data in the dataset.
Any idea how to fix this function (and/or make it faster)?
Update: I've got it working with a for loop but it is SO slow...
This is what I'm going with for now, but I'm still open to other solutions.
Basically, I went back to the original function then broke apart the overly large dataset into manageable chunks and ran for loops on each chunk:
train.samp <- train.samp[order(train.samp$Id),]
train.samp1 <- train.samp1[order(train.samp1$Id),]
train.samp.id <- unique(train.samp$Id)
train.samp.id.1 <- train.samp.id[1:25000]
train.samp.id.2 <- train.samp.id[25001:50000]
train.samp.id.3 <- train.samp.id[50001:75000]
train.samp.id.4 <- train.samp.id[75001:100000]
train.samp.id.6 <- train.samp.id[100001:125000]
train.samp.id.5 <- train.samp.id[125001:150000]
train.samp.id.7 <- train.samp.id[150001:175000]
train.samp.id.8 <- train.samp.id[175001:200000]
train.samp.id.9 <- train.samp.id[200001:length(train.samp.id)]
train.samp.1 <- train.samp[train.samp$Id %in% train.samp.id.1,]
train.samp.2 <- train.samp[train.samp$Id %in% train.samp.id.2,]
train.samp.3 <- train.samp[train.samp$Id %in% train.samp.id.3,]
train.samp.4 <- train.samp[train.samp$Id %in% train.samp.id.4,]
train.samp.5 <- train.samp[train.samp$Id %in% train.samp.id.5,]
train.samp.6 <- train.samp[train.samp$Id %in% train.samp.id.6,]
train.samp.7 <- train.samp[train.samp$Id %in% train.samp.id.7,]
train.samp.8 <- train.samp[train.samp$Id %in% train.samp.id.8,]
train.samp.9 <- train.samp[train.samp$Id %in% train.samp.id.9,]
system.time(
for(i in unique(train.samp.1$Id)){
train.samp.1$mp[train.samp.1$Id == i] <- mpalmer(train.samp.1$Ref[train.samp.1$Id == i], minutes_past = train.samp.1$minutes_past[train.samp.1$Id == i])
} )
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.2])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.3])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.4])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.5])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.6])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.7])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.8])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.9])){
train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
}
system.time(
for(i in unique(train.samp1$Id)){
train.samp1$mp[train.samp1$Id == i] <- mpalmer(train.samp1$Ref[train.samp1$Id == i], minutes_past = train.samp1$minutes_past[train.samp1$Id == i])
}
The function is not shown here, but I am about to take advantage of #Gregor's suggestion in the comment.
Related
I need help, my function not work correctly when i try make any sum with the results.
have a lot of NA values and I don't know why.
Craps <- function(jogadas){
for (i in 1:jogadas){
comeOut <- sample(1:6,1)+ sample(1:6,1)
if(comeOut %in% c(2,3,12)){
result <- F #False
}else if(comeOut %in% c(7,11)){
}
else{
dados <- sample(1:6,1) + sample(1:6,1)
if(dados == 7){
result <- F
}else {
result <- T
}
while (!(dados %in% c(7,comeOut))){
dados <- sample(1:6,1)+ sample(1:6,1)
}
if(dados == 7)
result <- F
else result <- T
}
print(result)
#probability
prob<-NULL
prob[i] <- result
prob2<-sum(prob)/jogadas
print(prob2)
}
}
Craps(1000)
You put prob=NULL inside your loop, so it will become NULL at each iteration of the loop, just create prob before the loop. Also you forgot one line as noticed in the comments :
Craps <- function(jogadas){
prob<-NULL
for (i in 1:jogadas){
comeOut <- sample(1:6,1)+ sample(1:6,1)
if(comeOut %in% c(2,3,12)){
result <- F #False
}else if(comeOut %in% c(7,11)){
result <- T
}
else{
dados <- sample(1:6,1) + sample(1:6,1)
if(dados == 7){
result <- F
}else {
result <- T
}
while (!(dados %in% c(7,comeOut))){
dados <- sample(1:6,1)+ sample(1:6,1)
}
if(dados == 7)
result <- F
else result <- T
}
print(result)
#probability
prob[i] <- result
prob2<-sum(prob)/jogadas
print(prob2)
}
}
I am a R beginner and I tried to make a median function from scratch.
Here is my code:
mymedian <- function(x) {
len <- length(x)
sorted <- sort(x)
if (len %% 2 == 0) {
med1 <- sorted[len / 2]
med2 <- sorted[(len + 1) %/% 2]
result <- sorted[med1 + med2 / 2]
return(result)
} else {
result <- sorted[(len + 1)/2]
return(result)
}
}
Im getting "NA" output. I couldn't find where the problem is.
Main issue is you're trying to index your sorted vector with a non-integer (e.g., 168.5). Compare your function to this:
mymedian <- function(x){
len <- length(x)
sorted <-sort(x)
if(len%%2==0){
i <- len/2
med1<-sorted[i]
med2 <- sorted[i+1]
result <- sum(med1,med2)/2
return(result)
}else{
result<-sorted[(len+1)/2]
return(result)
}
}
I'm trying to run some R code and it is crashing because of long vector error. I'm running R 3.5.1 and getting the following error:
"Error in for (n in 1:k) { : long vectors not supported yet: eval.c:6393"
The input FASTA file size is 1 GB. The error appears right after running the loop. I tried to make the input file smaller, but seems this was not the case and I guess could be more related to the used packages. The code that creates the troubles is the following:
library(biomaRt) #version 2.36.1
library(biomartr)#version 0.8.0
library(R.utils) #version 2.7.0
library(seqinr) #version 3.4.5
genmt <- read.fasta("genymt.fa")
gensize1 <- 16900
subsize1 <- 22*2
BinToDec <- function(x)
sum(2^(which(rev(unlist(strsplit(x, "")) == 1))-1))
DecToBin <- function(x)
{
b <- intToBin(x)
while(nchar(b) < subsize1)
b <- paste("0",b,sep = "")
b
}
bin1 <- gsub('A','00',genmt)
bin1 <- gsub('T','01',bin1)
bin1 <- gsub('C','10',bin1)
bin1 <- gsub('G','11',bin1)
for (i in 1:((gensize1*2)-subsize1)) {
print(i)
beg1 <- i
end1 <- i+(subsize1-1)
sub1 <- substr(bin1, beg1, end1)
dec1 <- BinToDec(sub1)
if (i == 1) {
exists1 <- dec1
rep1 <- 1
} else {
flag1 <- any(exists1 == dec1)
if (flag1) {
ind1 <- which(exists1 == dec1)
rep1[ind1] <- rep1[ind1]+1
} else {
exists1 <- c(exists1,dec1)
rep1 <- c(rep1,1)
}
}
}
dec_res <- -1
k <- 2^subsize1
for (n in 1:k) {
print(n)
flag1 <- any(exists1 == n)
if (!flag1) {
dec_res <- n
break
}
}
bin_res <- DecToBin(dec_res)
gen_res <- matrix(,nrow = 0,ncol = subsize1/2)
ind <- 0
for(i in seq(1,subsize1,2)) {
ind <- ind + 1
ifelse(substr(bin_res,i,i+1) == "00",gen_res[ind] <- "A",
ifelse(substr(bin_res,i,i+1) == "01",gen_res[ind] <- "T",
ifelse(substr(bin_res,i,i+1) == "10",gen_res[ind] <-"C",gen_res[ind] <- "G")))
}
Could you please help me to understand the situation and provide a fix for it?
I noticed a discrepancy with rbind behaviour between matrix and data.frame objects.
With matrix objects everything works as expected:
mat1 <- matrix(nrow=2, ncol=0)
mat2 <- matrix(nrow=2, ncol=0)
dim(rbind(mat1, mat2))
[1] 4 0
But if we turn them to data.frame all of a sudden it looses the number of rows:
> dim(rbind(as.data.frame(mat1), as.data.frame(mat2)))
[1] 0 0
What I would like to understand is - is this behaviour intentional? And if so what is the reasoning for dropping the number of rows in this situation?
EDIT: As noted by #PoGibas - this behaviour is documented in ?rbind. No reason is given and it would probably be hard to infer one. So the question becomes:
How to rbind an arbitrary number of data.frames while always preserving their number of rows?
Workaround could be to use cbind and transposition:
m <- matrix(nrow = 2, ncol = 0)
as.data.frame(t(cbind(as.data.frame(t(m)), as.data.frame(t(m)))))
# Returns: data frame with 0 columns and 4 rows
Here cbind creates a data.frame with 0 rows and 4 columns and we transpose it to matrix with 4 rows and 0 columns.
Another solution is just brutal modification of original base::rbind.data.frame (source on github) function.
You have to remove/comment out two parts there:
Removal of arguments if there length is not a positive integer (length(data.frame()) returns 0). Comment out this part:
allargs <- allargs[lengths(allargs) > 0L]
Return of empty data.frame if attribute names is empty (you can't set attribute to an empty data.frame - names(as.data.frame(mat1)) <- "" returns an error). Comment out this part:
if(nvar == 0L) return(structure(list(), class = "data.frame", row.names = integer()))
Result:
m <- matrix(nrow = 2, ncol = 0)
dim(rbind.data.frame2(as.data.frame(m), as.data.frame(m)))
# Returns: [1] 4 0
Code:
rbind.data.frame2 <- function(..., deparse.level = 1, make.row.names = TRUE,
stringsAsFactors = default.stringsAsFactors())
{
match.names <- function(clabs, nmi)
{
if(identical(clabs, nmi)) NULL
else if(length(nmi) == length(clabs) && all(nmi %in% clabs)) {
## we need 1-1 matches here
m <- pmatch(nmi, clabs, 0L)
if(any(m == 0L))
stop("names do not match previous names")
m
} else stop("names do not match previous names")
}
if(make.row.names)
Make.row.names <- function(nmi, ri, ni, nrow)
{
if(nzchar(nmi)) {
if(ni == 0L) character() # PR8506
else if(ni > 1L) paste(nmi, ri, sep = ".")
else nmi
}
else if(nrow > 0L && identical(ri, seq_len(ni)) &&
identical(unlist(rlabs, FALSE, FALSE), seq_len(nrow)))
as.integer(seq.int(from = nrow + 1L, length.out = ni))
else ri
}
allargs <- list(...)
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# allargs <- allargs[lengths(allargs) > 0L]
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
if(length(allargs)) {
## drop any zero-row data frames, as they may not have proper column
## types (e.g. NULL).
nr <- vapply(allargs, function(x)
if(is.data.frame(x)) .row_names_info(x, 2L)
else if(is.list(x)) length(x[[1L]])
# mismatched lists are checked later
else length(x), 1L)
if(any(nr > 0L)) allargs <- allargs[nr > 0L]
else return(allargs[[1L]]) # pretty arbitrary
}
n <- length(allargs)
if(n == 0L)
return(structure(list(),
class = "data.frame",
row.names = integer()))
nms <- names(allargs)
if(is.null(nms))
nms <- character(n)
cl <- NULL
perm <- rows <- vector("list", n)
rlabs <- if(make.row.names) rows # else NULL
nrow <- 0L
value <- clabs <- NULL
all.levs <- list()
for(i in seq_len(n)) {
## check the arguments, develop row and column labels
xi <- allargs[[i]]
nmi <- nms[i]
## coerce matrix to data frame
if(is.matrix(xi)) allargs[[i]] <- xi <-
as.data.frame(xi, stringsAsFactors = stringsAsFactors)
if(inherits(xi, "data.frame")) {
if(is.null(cl))
cl <- oldClass(xi)
ri <- attr(xi, "row.names")
ni <- length(ri)
if(is.null(clabs)) ## first time
clabs <- names(xi)
else {
if(length(xi) != length(clabs))
stop("numbers of columns of arguments do not match")
pi <- match.names(clabs, names(xi))
if( !is.null(pi) ) perm[[i]] <- pi
}
rows[[i]] <- seq.int(from = nrow + 1L, length.out = ni)
if(make.row.names) rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
nrow <- nrow + ni
if(is.null(value)) { ## first time ==> setup once:
value <- unclass(xi)
nvar <- length(value)
all.levs <- vector("list", nvar)
has.dim <- facCol <- ordCol <- logical(nvar)
for(j in seq_len(nvar)) {
xj <- value[[j]]
facCol[j] <-
if(!is.null(levels(xj))) {
all.levs[[j]] <- levels(xj)
TRUE # turn categories into factors
} else
is.factor(xj)
ordCol[j] <- is.ordered(xj)
has.dim[j] <- length(dim(xj)) == 2L
}
}
else for(j in seq_len(nvar)) {
xij <- xi[[j]]
if(is.null(pi) || is.na(jj <- pi[[j]])) jj <- j
if(facCol[jj]) {
if(length(lij <- levels(xij))) {
all.levs[[jj]] <- unique(c(all.levs[[jj]], lij))
ordCol[jj] <- ordCol[jj] & is.ordered(xij)
} else if(is.character(xij))
all.levs[[jj]] <- unique(c(all.levs[[jj]], xij))
}
}
}
else if(is.list(xi)) {
ni <- range(lengths(xi))
if(ni[1L] == ni[2L])
ni <- ni[1L]
else stop("invalid list argument: all variables should have the same length")
rows[[i]] <- ri <-
as.integer(seq.int(from = nrow + 1L, length.out = ni))
nrow <- nrow + ni
if(make.row.names) rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
if(length(nmi <- names(xi)) > 0L) {
if(is.null(clabs))
clabs <- nmi
else {
if(length(xi) != length(clabs))
stop("numbers of columns of arguments do not match")
pi <- match.names(clabs, nmi)
if( !is.null(pi) ) perm[[i]] <- pi
}
}
}
else if(length(xi)) { # 1 new row
rows[[i]] <- nrow <- nrow + 1L
if(make.row.names)
rlabs[[i]] <- if(nzchar(nmi)) nmi else as.integer(nrow)
}
}
nvar <- length(clabs)
if(nvar == 0L)
nvar <- max(lengths(allargs)) # only vector args
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# if(nvar == 0L)
# return(structure(list(), class = "data.frame",
# row.names = integer()))
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
pseq <- seq_len(nvar)
if(is.null(value)) { # this happens if there has been no data frame
value <- list()
value[pseq] <- list(logical(nrow)) # OK for coercion except to raw.
all.levs <- vector("list", nvar)
has.dim <- facCol <- ordCol <- logical(nvar)
}
names(value) <- clabs
for(j in pseq)
if(length(lij <- all.levs[[j]]))
value[[j]] <-
factor(as.vector(value[[j]]), lij, ordered = ordCol[j])
if(any(has.dim)) {
rmax <- max(unlist(rows))
for(i in pseq[has.dim])
if(!inherits(xi <- value[[i]], "data.frame")) {
dn <- dimnames(xi)
rn <- dn[[1L]]
if(length(rn) > 0L) length(rn) <- rmax
pi <- dim(xi)[2L]
length(xi) <- rmax * pi
value[[i]] <- array(xi, c(rmax, pi), list(rn, dn[[2L]]))
}
}
for(i in seq_len(n)) {
xi <- unclass(allargs[[i]])
if(!is.list(xi))
if(length(xi) != nvar)
xi <- rep(xi, length.out = nvar)
ri <- rows[[i]]
pi <- perm[[i]]
if(is.null(pi)) pi <- pseq
for(j in pseq) {
jj <- pi[j]
xij <- xi[[j]]
if(has.dim[jj]) {
value[[jj]][ri, ] <- xij
## copy rownames
rownames(value[[jj]])[ri] <- rownames(xij)
} else {
## coerce factors to vectors, in case lhs is character or
## level set has changed
value[[jj]][ri] <- if(is.factor(xij)) as.vector(xij) else xij
## copy names if any
if(!is.null(nm <- names(xij))) names(value[[jj]])[ri] <- nm
}
}
}
if(make.row.names) {
rlabs <- unlist(rlabs)
if(anyDuplicated(rlabs))
rlabs <- make.unique(as.character(rlabs), sep = "")
}
if(is.null(cl)) {
as.data.frame(value, row.names = rlabs, fix.empty.names = TRUE,
stringsAsFactors = stringsAsFactors)
} else {
structure(value, class = cl,
row.names = if(is.null(rlabs)) .set_row_names(nrow) else rlabs)
}
}
I am wondering whether a proper framework for interval manipulation and comparison does exist in R.
After some search, I was only able to find the following:
- function findInterval in base Package. (but I hardly understand it)
- some answers here and there about union and intersection (notably: http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html)
Would you know of an initiative to implement a comprehensive set of tools to easily handles frequent tasks in interval manipulation, like inclusion/setdiff/union/intersection/etc. (eg see here for a list of functionalities)?
or would you have advice in developing such an approach?
below are some drafts on my side for doing so. it is surely awkward and still has some bugs but it might illustrate what I am looking for.
preliminary aspects about the options taken
- should deal seamlessly with intervals or intervals set
- intervals are represented as 2 columns data.frames (lower boundary, higher boundary), on one row
- intervals sets are represented as 2 columns with several rows
- a third column might be needed for identification of intervals sets
UNION
interval_union <- function(df){ # for data frame
df <- interval_clean(df)
if(is.empty(df)){
return(as.data.frame(NULL))
} else {
if(is.POSIXct(df[,1])) {
dated <- TRUE
df <- colwise(as.numeric)(df)
} else {
dated <- FALSE
}
M <- as.matrix(df)
o <- order(c(M[, 1], M[, 2]))
n <- cumsum( rep(c(1, -1), each=nrow(M))[o])
startPos <- c(TRUE, n[-1]==1 & n[-length(n)]==0)
endPos <- c(FALSE, n[-1]==0 & n[-length(n)]==1)
M <- M[o]
if(dated == TRUE) {
df2 <- colwise(mkDateTime)(as.data.frame(cbind(M[startPos], M[endPos])), from.s = TRUE)
} else {
df2 <- as.data.frame(cbind(M[startPos], M[endPos]))
}
colnames(df2) <- colnames(df)
# print(df2)
return(df2)
}
}
union_1_1 <- function(test, ref){
names(ref) <- names(test)
tmp <- interval_union(as.data.frame(rbind(test, ref)))
return(tmp)
}
union_1_n <- function(test, ref){
return(union_1_1(test, ref))
}
union_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, union_1_n, ref, .expand = FALSE)
return(testnn)
}
ref_interval_union <- function(df, ref){
tmp0 <- adply(df, 1, union_1_1, ref, .expand = FALSE) # set to FALSE to keep ID
return(tmp0)
}
INTERSECTION
interval_intersect <- function(df){
# adapted from : http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html
M <- as.matrix(df)
L <- max(M[, 1])
R <- min(M[, 2])
Inew <- if (L <= R) c(L, R) else c()
if (!is.empty(Inew)){
df2 <- t(as.data.frame(Inew))
colnames(df2) <- colnames(df)
rownames(df2) <- NULL
} else {
df2 <- NULL
}
return(as.data.frame(df2))
}
ref_interval_intersect <- function(df, ref){
tmpfun <- function(a, b){
names(b) <- names(a)
tmp <- interval_intersect(as.data.frame(rbind(a, b)))
return(tmp)
}
tmp0 <- adply(df, 1, tmpfun, ref, .expand = FALSE) # [,3:4]
#if(!is.empty(tmp0)) colnames(tmp0) <- colnames(df)
return(tmp0)
}
int_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
tmp0 <- c(max(te[1, 1], re[1, 1]), min(te[1, 2], re[1, 2]))
if(tmp0[1]>tmp0[2]) tmp0 <- NULL # inverse of a correct interval --> VOID
if(!is.empty(tmp0)){
tmp1 <- colwise(mkDateTime)(as.data.frame(t(as.data.frame(tmp0))))
colnames(tmp1) <- colnames(test)
} else {
tmp1 <- data.frame(NULL)
}
return(tmp1)
}
int_1_n <- function(test, ref){
test1 <- adply(.data = ref, 1, int_1_1, test = test, .expand = FALSE)
if(is.empty(test1)){
return(data.frame(NULL))
} else {
testn <- interval_union(test1[,2:3])
return(testn)
}
}
int_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, int_1_n, ref, .expand = FALSE)
# return(testnn[,2:3]) # return interval set without index (1st column)
return(testnn) # return interval set with index (1st column) --> usefull to go with merge to keep metadata going alon g with interval description
}
int_intersect <- function(df, ref){
mycols <- colnames(df)
df$X1 <- 1:nrow(df)
test <- df[, 1:2]
tmp <- int_n_n(test, ref)
intersection <- merge(tmp, df, by = "X1", suffixes = c("", "init"))
return(intersection[,mycols])
}
EXCLUSION
excl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
if(te[1] < re[1]){ # Lower Bound
if(te[2] > re[1]){ # overlap
x <- unlist(c(te[1], re[1]))
} else { # no overlap
x <- unlist(c(te[1], te[2]))
}
} else { # test > ref on lower bound side
x <- NULL
}
if(te[2] > re[2]){ # Upper Bound
if(te[1] < re[2]){ # overlap
y <- unlist(c(re[2], te[2]))
} else { # no overlap
y <- unlist(c(te[1], te[2]))
}
} else { # test < ref on upper bound side
y <- NULL
}
if(is.empty(x) & is.empty(y)){
tmp0 <- NULL
tmp1 <- tmp0
} else {
tmp0 <- as.data.frame(rbind(x, y))
colnames(tmp0) <- colnames(test)
tmp1 <- interval_union(tmp0)
}
return(tmp1)
}
excl_1_n <- function(test, ref){
testn0 <- adply(.data = ref, 1, excl_1_1, test = test, .expand=FALSE)
# boucle pour intersecter successivement les intervalles sets, pour gérer les intervalles disjoints (identifiés par X1, col1)
tmp <- range(testn0)
names(tmp) <- colnames(testn0)[2:3]
tmp <- as.data.frame(t(tmp))
for(i in unique(testn0[,1])){
tmp <- int_n_n(tmp, testn0[testn0[,1]==i, 2:3])
}
return(tmp)
}
INCLUSION
incl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
if(te[1] >= re[1] & te[2] <= re[2]){ return(TRUE) } else { return(FALSE) }
}
incl_1_n <- function(test, ref){
testn <- adply(.data = ref, 1, incl_1_1, test = test)
return(any(testn[,ncol(testn)]))
}
incl_n_n <- function(test, ref){
testnn <- aaply(.data = test, 1, incl_1_n, ref, .expand = FALSE)
names(testnn) <- NULL
return(testnn)
}
flat_incl_n_n <- function(test, ref){
ref <- interval_union(ref)
return(incl_n_n(test, ref))
}
# testing for a vector, instead of an interval set
incl_x_1 <- function(x, ref){
test <- (x>=ref[1,1] & x<ref[1,2])
return(test)
}
incl_x_n <- function(x, ref){
test <- any(x>=ref[,1] & x<ref[,2])
return(test)
}
I think you might be able to make good use of the many interval-related functions in the sets package.
Here's a small example illustrating the package's support for interval construction, intersection, set difference, union, and complementation, as well as its test for inclusion in an interval. These and many other related functions are documented on the help page for ?interval.
library(sets)
i1 <- interval(1,6)
i2 <- interval(5,10)
i3 <- interval(200,400)
i4 <- interval(202,402)
i5 <- interval_union(interval_intersection(i1,i2),
interval_symdiff(i3,i4))
i5
# [5, 6] U [200, 202) U (400, 402]
interval_complement(i5)
# [-Inf, 5) U (6, 200) U [202, 400] U (402, Inf]
interval_contains_element(i5, 5.5)
# [1] TRUE
interval_contains_element(i5, 201)
# [1] TRUE
If your intervals are currently encoded in a two-column data.frame, you could use something like mapply() to convert them to intervals of the type used by the sets package:
df <- data.frame(lBound = c(1,5,100), uBound = c(10, 6, 200))
Ints <- with(df, mapply("interval", l=lBound, r=uBound, SIMPLIFY=FALSE))
Ints
# [[1]]
# [1, 10]
# [[2]]
# [5, 6]
# [[3]]
# [100, 200]