using intervals to assign categorical values - r

Take the following generic data
A <- c(5,7,11,10,23,30,24,6)
B <- c(1,2,3,1,2,3,1,2)
C <- data.frame(A,B)
and the following intervals
library(intervals)
interval1 <- Intervals(
matrix(
c(
5, 15,
15, 25,
25, 35,
35, 100
),
ncol = 2, byrow = TRUE
),
closed = c( TRUE, FALSE ),
type = "Z"
)
rownames(interval1) <- c("A","B","C", "D")
interval2 <- Intervals(
matrix(
c(
0, 10,
12, 20,
22, 30,
30, 100
),
ncol = 2, byrow = TRUE
),
closed = c( TRUE, FALSE ),
type = "Z"
)
rownames(interval2) <- c("P","Q","R", "S")
Now I want to create the following output table
So where the A value overlap the two invervals, I want to 'copy' all the data to a line below.
We also introduce data$X which is the interval1 value and data$y which is the interval2 value.
Where data does not fit within any of the interval, I want to remove it from the data.frame
I am not sure if the break() function would be better used to create the intervals or if the dplyr function can be used to make the reoccuring data rows

You can use foverlaps in data.table:
library(data.table)
C.DT <- data.table(C)
C.DT[, A1:=A] # required for `foverlaps` so we can do a range search
# `D` and `E` are your interval matrices
I1 <- data.table(cbind(data.frame(D), idX=LETTERS[1:4], idY=NA))
I2 <- data.table(cbind(data.frame(E), idX=NA, idY=LETTERS[16:19]))
setkey(I1, X1, X2) # set the keys on our interval ranges
setkey(I2, X1, X2)
rbind(
foverlaps(C.DT, I1, by.x=c("A", "A1"), nomatch=0), # match every value in `C.DT$A` to the ranges in `I1`
foverlaps(C.DT, I2, by.x=c("A", "A1"), nomatch=0)
)[order(A, B), .(A, B, X=idX, Y=idY)]
Produces:
A B X Y
1: 5 1 A NA
2: 5 1 NA P
3: 6 2 A NA
4: 6 2 NA P
5: 7 2 A NA
6: 7 2 NA P
7: 10 1 A NA
8: 10 1 NA P
9: 11 3 A NA
10: 23 2 B NA
11: 23 2 NA R
12: 24 1 B NA
13: 24 1 NA R
14: 30 3 C NA
15: 30 3 NA R
16: 30 3 NA S
Note you can easily change what you get instead of NA, by modifying the steps where I1 and I2 are created.

Related

Create a categorical variable

I want to categorized one variable with the next conditionals:
0 - 4: "fail"
5 - 7: "good"
8 - 10: "excellent"
None of the above: NA
I tried using the recode function
The values of variable is numeric
segur <- data$segur
Created a new variable using recode
dt1 <- recode(segur, "c(0,4)='suspenso';c(5, 7)='aceptable';c(8,10)='excelente'; else='NA'")
dt1
How can I fix?
using factor in base R
Data:
# set random seed
set.seed(1L)
# without any NA
x1 <- sample(x = 1:10, size = 20, replace=TRUE)
# with NA
x2 <- sample(x = c(1:10, NA), size = 20, replace=TRUE)
Code:
# without any NA
as.character(factor(x1, levels = c(0:10), labels = c(rep("fail", 5), rep("good", 3), rep("excellent", 3)), exclude=NA))
# with NA
as.character(factor(x2, levels = c(0:10), labels = c(rep("fail", 5), rep("good", 3), rep("excellent", 3)), exclude=NA))
I guess you can use cut like below
cut(segur, c(0, 4, 7, 10), labels = c("fail", "good", "excellent"))
Example
> segur
[1] 6 1 4 -2 -1 10 8 0 5 9
> cut(segur, c(0, 4, 7, 10), labels = c("fail", "good", "excellent"))
[1] good fail fail <NA> <NA> excellent excellent
[8] <NA> good excellent
Levels: fail good excellent
Here is a solution using the fmtr package. You can create a categorical format using the value and condition functions, and then apply the format to the numeric data using the fapply function. Here is an example:
library(fmtr)
# Create sample data
df <- read.table(header = TRUE, text = '
ID segur
1 0
2 8
3 5
4 11
5 7')
# Create format
fmt <- value(condition(x >= 0 & x <=4, "fail"),
condition(x >= 5 & x <=7, "good"),
condition(x >= 8 & x <= 10, "excellent"),
condition(TRUE, NA))
# Apply categorization
df$segur_cat <- fapply(df$segur, fmt)
# View results
df
# ID segur segur_cat
# 1 1 0 fail
# 2 2 8 excellent
# 3 3 5 good
# 4 4 11 <NA>
# 5 5 7 good

Add column inside named matrix

Suppose I have the following matrix:
m <- matrix(1:12, nrow = 3, dimnames = list(c("a", "b", "c"), c("w", "x", "y", "z")))
# w x y z
# a 1 4 7 10
# b 2 5 8 11
# c 3 6 9 12
How can I add a column with the values c(13, 14, 15) between column x and y without knowing where x and y are?
Using number ranges I know how to do this using cbind.
cbind(m[,1:2], c(13, 14, 15), m[,3:4])
# w x y z
# a 1 4 13 7 10
# b 2 5 14 8 11
# c 3 6 15 9 12
For named columns, it'd be neat if I could supply the column ranges with m[,:"x"] and m[,"y":] of some sort, but unfortunately that doesn't work.
Additionally, if possible, giving that column its own header name during the insertion process would be nice.
EDIT: I should have specified that x and y always are in order, so adding the column after x would have been enough. Thanks for the more general answers as well!
When you can not assume that x comes before y and there is no need that they are following each without a gap you can try:
i <- seq_len(min(match(c("x", "y"), colnames(m))))
cbind(m[,i], v=c(13, 14, 15), m[,-i])
# w x v y z
#a 1 4 13 7 10
#b 2 5 14 8 11
#c 3 6 15 9 12
In case they are ordered, that it will be enough to put it after x like:
i <- seq_len(match("x", colnames(m)))
cbind(m[,i], v=c(13, 14, 15), m[,-i])
you may found the columns positions by names and insert the new column properly:
x_pos <- which(colnames(m) == "x")
y_pos <- which(colnames(m) == "y")
m <- cbind(m[,1:x_pos], new=c(13, 14, 15), m[,y_pos:ncol(m)])
You can use which to find the desired column and assign a name in cbind, i.e.
cbind(m[, seq(which(colnames(m) == 'x'))],
w = c(13, 14, 15),
m[, (which(colnames(m) == 'y'):ncol(m))])
# w x w y z
#a 1 4 13 7 10
#b 2 5 14 8 11
#c 3 6 15 9 12
It's not exactly pretty but you can do this
cbind(m[,1:(which(dimnames(m)[[2]]=="x"))],
t=c(13, 14, 15),
m[,(which(dimnames(m)[[2]]=="y")):dim(m)[2]])
You can use this function :
insert_a_column <- function(mat, first_col,second_col, new_col, vec) {
#Get index of first column to match
one <- match(first_col, colnames(mat))
#Get index of second column to match
two <- match(second_col, colnames(mat))
#Add the middle column and combine the data
new_mat <- cbind(mat[,1:one, drop = FALSE], vec,
mat[, (one + 1):ncol(mat), drop = FALSE])
#rename the new column
colnames(new_mat)[one + 1] <- new_col
#Return the matrix.
return(new_mat)
}
insert_a_column(m, "x", "y", "a", c(13, 14, 15))
# w x a y z
#a 1 4 13 7 10
#b 2 5 14 8 11
#c 3 6 15 9 12
insert_a_column(m, "y", "z", "a", c(13, 14, 15))
# w x y a z
#a 1 4 7 13 10
#b 2 5 8 14 11
#c 3 6 9 15 12

data.table - efficiently manipulate large data set

I am amazed by the blazing speed of data.table. The coding below does exactly what I need however when executed on a large table it does not perform very well.
convinced that this can be done faster with data.table but I do not see how.
Output
The output needs to be a matrix with the rownames a regular sequence of days.
For each column separately:
All values before the first value need to be NA
All values after the last value need to be NA
Between the first and the last value 0 need to be added as the do not exist in the input table
The following coding shows how the result should look like:
M <-
matrix(c(NA, NA, NA, 2, 0, 1, 3, 0, 2 , NA,
NA, NA, 3, 1, 3, 2, 1, 2, NA, NA),
ncol = 2,
dimnames = list(as.character((Sys.Date() + 0:9)),
c("E1", "E2")))
Output example
## E1 E2
## 2017-01-27 NA NA
## 2017-01-28 NA NA
## 2017-01-29 NA 2
## 2017-01-30 2 2
## 2017-01-31 0 2
## 2017-02-01 3 1
## 2017-02-02 1 3
## 2017-02-03 0 3
## 2017-02-04 2 NA
## 2017-02-05 NA NA
Input
The following table shows the source/input for the coding/function:
DS <- data.table(
E = c(rep("E1", 4), rep("E2", 6)),
C = c(c(Sys.Date() + c(3, 5, 6, 8)),
c(Sys.Date() + c(2, 3, 4, 5, 6, 7))),
S = round(runif(n = 10,min = 1, max = 3), 0),
key = c("E", "C"))
## E C S
## 1: E1 2017-01-30 3
## 2: E1 2017-02-01 1
## 3: E1 2017-02-02 2
## 4: E1 2017-02-04 1
## 5: E2 2017-01-29 3
## 6: E2 2017-01-30 2
## 7: E2 2017-01-31 3
## 8: E2 2017-02-01 1
## 9: E2 2017-02-02 2
## 10: E2 2017-02-03 3
Input example
Code working
The following few lines do exactly what I need and is simple. However it is not efficient.
The real table has 700 unique C values and 2 Million E values.
# Create the regular time line per day
CL <- c(C= (Sys.Date() + 0:9))
# Determine first and last per E
DM <- DS[, .(MIN = min(C), MAX = max(C)), by =.(E)]
# Generate all combinations
CJ <- CJ(E = DS$E, C = CL, unique = TRUE)
# Join
DC <- DS[CJ, on = .(E, C)][!is.na(E)]
# replace NA by 0
DC[is.na(S), S:=0]
# Lead-in
DC[DM, on=.(E, C<MIN), S:=NA]
# Lead-out
DC[DM, on=.(E, C>MAX), S:=NA]
# Cast to matrix format
DC2 <- dcast(
data = DC, formula = C ~ E,
fun.aggregate = sum, value.var = "S")
# coerce to matrix
M3 <- as.matrix(DC2[, -1])
# add row nanes
rownames(M3) <- format(CL, "%Y-%m-%d")
I made some long, un-readable, clumsy coding which creates the matrix with 1.2B cells in 35 secs. This must be possible as quick but far more elegant with data.table, however not like this.
A data.table, like a data.frame underneath everything is a list (with length = number of columns)
200 Million columns is a lot of columns - this will make anything slow.
The description of the conversion to "wide" will bloat the data with large number of NA values. You can almost certainly perform the analysis you need on the "long form" and using keys.
It isn't clear from your question what you need, but you can calculate the various sums
# convert to an IDate
DT[, CALDAY := as.IDate(CALDAY)]
# get range of dates
rangeDays <- DT[,range(CALDAY)]
all_days <- as.IDate(seq(rangeDays[1],rangeDays[2], by=1))
# create sums
DT_sum <- DT[, list(VALUE= sum(VALUE)), keyby = list(ENTITY, CALDAY)]
and then index using entity and dates.
DT_sum[.("2a8605e2-e283-11e6-a3bb-bbe3fd226f8d", all_days)]
and if you need to replace NA with 0
na_replace <- function(x,repl=0){x[is.na(x)]<-repl;x}
DT_sum[.("2a8605e2-e283-11e6-a3bb-bbe3fd226f8d", all_days), na_replace(VALUE)]
This does the trick. But still the performance is not good.
It takes DS as input parameter. The result is a data.table which should be coerced to matrix by:
M <- as.matrix(build_timeseries_DT(DS))
Function
build_timeseries_DT <- function(DS){
# regular time serie for complete range with index
dtC <- data.table(
CAL = seq(min(DS$C), max(DS$C), by = "day"))[, idx:= 1:.N]
# add row index (idx) to sales
DQ <- dtC[DS, on = "CAL"]
setkey(DQ, "ENT")
# calculate min index per ENT
DM <- DQ[, .(MIN = min(idx), MAX = max(idx)), by = .(ENT)]
# allocate memory, assign 0 and set rownames by reference
DT <- dtC[, .(CAL)][, (DM[, ENT]):= 0L][, CAL:= NULL]
setattr(DT, "row.names", format(dtC$CAL, "%Y-%m-%d"))
# Set NA for the Lead-in and out, next populate values by ref
for(j in colnames(DT)){
set(x = DT,
i = c(1L:(DM[j, MIN]), (DM[j, MAX]):DT[, .N]),
j = j,
value = NA )
set(x = DT,
i = DQ[j, idx],
j = j,
value = DQ[j, SLS] )}
return(DT)
}
Test Data
DS <- data.table(
ENT = c("A", "A", "A", "B", "B", "C", "C", "C", "D", "D"),
CAL = c(Sys.Date() + c(0, 5, 6, 3, 8, 1, 2, 9, 3, 5)),
SLS = as.integer(c(1, 2, 1, 2, 3, 1, 2, 3, 2, 1)),
key = c("ENT", "CAL"))
ENT CAL SLS
1: A 2017-01-28 1
2: A 2017-02-02 2
3: A 2017-02-03 1
4: B 2017-01-31 2
5: B 2017-02-05 3
6: C 2017-01-29 1
7: C 2017-01-30 2
8: C 2017-02-06 3
9: D 2017-01-31 2
10: D 2017-02-02 1
Result
as.matrix(build_timeseries_DT(DS))
A B C D
[1,] 1 NA NA NA
[2,] 0 NA 1 NA
[3,] 0 NA 2 NA
[4,] 0 2 0 2
[5,] 0 0 0 0
[6,] 2 0 0 1
[7,] 1 0 0 NA
[8,] NA 0 0 NA
[9,] NA 3 0 NA
[10,] NA NA 3 NA
result with colors

how to merge matrices in R with different number of rows

I would like to merge several matrices using their row names.
These matrices do not have the same number of rows and columns.
For instance:
m1 <- matrix(c(1, 2, 3, 4, 5, 6), 3, 2)
rownames(m1) <- c("a","b","c")
m2 <- matrix(c(1, 2, 3, 5, 4, 5, 6, 2), 4, 2)
rownames(m2) <- c("a", "b", "c", "d")
m3 <- matrix(c(1, 2, 3, 4), 2,2)
rownames(m3) <- c("d", "e")
mlist <- list(m1, m2, m3)
For them I would like to get:
Row.names V1.x V2.x V1.y V2.y V1.z V2.z
a 1 4 1 4 NA NA
b 2 5 2 5 NA NA
c 3 6 3 6 NA NA
d NA NA 5 2 1 3
e NA NA NA NA 2 4
I have tried to use lapply with the function merge:
M <- lapply(mlist, merge, mlist, by = "row.names", all = TRUE)
However, it did not work:
Error in data.frame(c(1, 2, 3, 4, 5, 6), c(1, 2, 3, 5, 4, 5, 6, 2), c(1, :
arguments imply differing number of rows: 3, 4, 2
Is there an elegant way to merge these matrices?
You are trying to apply a reduction (?Reduce) to the list of matrices, where the reduction is basically merge. The problem is that merge(m1, m2, by = "row.names", all = T) doesn't give you a new merged matrix with row names, but instead returns the row names in the first column. This is why we need additional logic in the reduction function.
Reduce(function(a,b) {
res <- merge(a,b,by = "row.names", all = T);
rn <- res[,1]; # Row.names column of merge
res <- res[,-1]; # Actual data
row.names(res) <- rn; # Assign row.names
return(res) # Return the merged data with proper row.names
},
mlist[-1], # Reduce (left-to-right) by applying function(a,b) repeatedly
init = mlist[[1]] # Start with the first matrix
)
Or alternatively:
df <- mlist[[1]]
for (i in 2:length(mlist)) {
df <- merge(df, mlist[[i]], by = "row.names", all=T)
rownames(df) <- df$Row.names
df <- df[ , !(names(df) %in% "Row.names")]
}
# V1.x V2.x V1.y V2.y V1 V2
# a 1 4 1 4 NA NA
# b 2 5 2 5 NA NA
# c 3 6 3 6 NA NA
# d NA NA 5 2 1 3
# e NA NA NA NA 2 4
This could also be conceptualised as a reshape operation if the right long-form data.frame is passed to the function:
tmp <- do.call(rbind, mlist)
tmp <- data.frame(tmp, id=rownames(tmp),
time=rep(seq_along(mlist),sapply(mlist,nrow)) )
reshape(tmp, direction="wide")
# id X1.1 X2.1 X1.2 X2.2 X1.3 X2.3
#a a 1 4 1 4 NA NA
#b b 2 5 2 5 NA NA
#c c 3 6 3 6 NA NA
#d d NA NA 5 2 1 3
#e e NA NA NA NA 2 4

How to squeeze in missing values into a vector

Let me try to make this question as general as possible.
Let's say I have two variables a and b.
a <- as.integer(runif(20, min = 0, max = 10))
a <- as.data.frame(a)
b <- as.data.frame(a[c(-7, -11, -15),])
So b has 17 observations and is a subset of a which has 20 observations.
My question is the following: how I would use these two variables to generate a third variable c which like a has 20 observations but for which observations 7, 11 and 15 are missing, and for which the other observations are identical to b but in the order of a?
Or to put it somewhat differently: how could I squeeze in these missing observations into variable b at locations 7, 11 and 15?
It seems pretty straightforward (and it probably is) but I have been not getting this to work for a bit too long now.
1) loop Try this loop:
# test data
set.seed(123) # for reproducibility
a <- as.integer(runif(20, min = 0, max = 10))
a <- as.data.frame(a)
b <- as.data.frame(a[c(-7, -11, -15),])
# lets work with vectors
A <- a[[1]]
B <- b[[1]]
j <- 1
C <- A
for(i in seq_along(A)) if (A[i] == B[j]) j <- j+1 else C[i] <- NA
which gives:
> C
[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
2) Reduce Here is a loop-free version:
f <- function(j, a) j + (a == B[j])
r <- Reduce(f, A, acc = TRUE)
ifelse(duplicated(r), NA, A)
giving:
[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
3) dtw. Using dtw in the package of the same name we can get a compact loop-free one-liner:
library(dtw)
ifelse(duplicated(dtw(A, B)$index2), NA, A)
giving:
[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
REVISED Added additional solutions.
Here's a more complicated way of doing it, using the Levenshtein distance algorithm, that does a better job on more complicated examples (it also seemed faster in a couple of larger tests I tried):
# using same data as G. Grothendieck:
set.seed(123) # for reproducibility
a <- as.integer(runif(20, min = 0, max = 10))
a <- as.data.frame(a)
b <- as.data.frame(a[c(-7, -11, -15),])
A = a[[1]]
B = b[[1]]
# compute the transformation between the two, assigning infinite weight to
# insertion and substitution
# using +1 here because the integers fed to intToUtf8 have to be larger than 0
# could also adjust the range more dynamically based on A and B
transf = attr(adist(intToUtf8(A+1), intToUtf8(B+1),
costs = c(Inf,1,Inf), counts = TRUE), 'trafos')
C = A
C[substring(transf, 1:nchar(transf), 1:nchar(transf)) == "D"] <- NA
#[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
More complex matching example (where the greedy algorithm would perform poorly):
A = c(1,1,2,2,1,1,1,2,2,2)
B = c(1,1,1,2,2,2)
transf = attr(adist(intToUtf8(A), intToUtf8(B),
costs = c(Inf,1,Inf), counts = TRUE), 'trafos')
C = A
C[substring(transf, 1:nchar(transf), 1:nchar(transf)) == "D"] <- NA
#[1] NA NA NA NA 1 1 1 2 2 2
# the greedy algorithm would return this instead:
#[1] 1 1 NA NA 1 NA NA 2 2 2
The data frame version, which isn't terribly different from G.'s above.
(Assumes a,b setup as above).
j <- 1
c <- a
for (i in (seq_along(a[,1]))) {
if (a[i,1]==b[j,1]) {
j <- j+1
} else
{
c[i,1] <- NA
}
}

Resources