Fill in elements recursively after the first non-NA value

Example of data:
>w
date V1 V2 V3
1 1 NA NA NA
2 2 NA NA NA
3 3 -0.2357066 NA -0.5428883
4 4 NA NA NA
5 5 NA -0.4333103 NA
6 6 NA NA NA
7 7 -0.6494716 0.7267507 1.1519118
8 8 NA NA NA
9 9 NA NA NA
10 10 NA NA NA
> r
date V1 V2 V3
1 1 1.262954285 0.7635935 -0.22426789
2 2 -0.326233361 -0.7990092 0.37739565
3 3 1.329799263 -1.1476570 0.13333636
4 4 1.272429321 -0.2894616 0.80418951
5 5 0.414641434 -0.2992151 -0.05710677
6 6 -1.539950042 -0.4115108 0.50360797
7 7 -0.928567035 0.2522234 1.08576936
8 8 -0.294720447 -0.8919211 -0.69095384
9 9 -0.005767173 0.4356833 -1.28459935
10 10 2.404653389 -1.2375384 0.04672617
I am trying to fill in w using the rule w(t+1) <- w(t)*r(t), but only for the values after the first non-NA element in each column. The for-loop equivalent is:
for (i in 1:(nrow(w)-1)) {
  for (j in 2:ncol(w)) {
    if (is.na(w[i+1,j])) {
      w[i+1,j] <- w[i,j]*r[i,j]
    }
  }
}
and gives:
> w
date V1 V2 V3
1 1 NA NA NA
2 2 NA NA NA
3 3 -0.235706556 NA -0.542888255
4 4 -0.313442405 NA -0.072386744
5 5 -0.398833307 -0.43331032 -0.058212660
6 6 -0.165372814 0.12965300 0.003324337
7 7 -0.649471647 0.72675075 1.151911754
8 8 0.603077961 0.18330358 1.250710490
9 9 -0.177739406 -0.16349234 -0.864183216
10 10 0.001025054 -0.07123088 1.110129201
This is a bit similar to cumprod, maybe, but I am stuck. Is it possible to avoid the for-loops (or at least one of them), so as to speed things up? (The cumprod connection is sketched just after the replication code below.)
The data can be replicated by:
set.seed(0)
r <- as.data.frame(matrix(data = rnorm(30), nrow = 10, ncol = 3))
w <- as.data.frame(matrix(data = NA, nrow =10, ncol = 3))
w[3, c(1,3)] <- rnorm(2)
w[5, 2] <- rnorm(1)
w[7,] <- rnorm(ncol(w))
date <- 1:nrow(w)
w <- cbind(date, w)
r <- cbind(date, r)
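The cumprod hunch can be made precise: within a run that starts at a non-NA value w[i], the recursion unrolls to w[i+k] = w[i] * prod(r[i], ..., r[i+k-1]), i.e. the starting value times a lagged cumulative product of r. A one-column sketch of that identity (my own illustration; the answers below build on it):
z0 <- w$V1[3]                    # start of the first run in column V1
rs <- r$V1[3:6]                  # the r values driving rows 4 to 6
z0 * c(1, cumprod(rs)[-length(rs)])
# [1] -0.2357066 -0.3134424 -0.3988333 -0.1653728   (matches rows 3:6 of the looped result)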

If you have only a few columns, you can keep the loop over columns and replace the row iteration with the following data.table operation.
library(data.table) # v1.9.5
fdt <- function(w, r){
  for (j in 2:ncol(w)){
    w[, j] <- data.table(x = r[, j], z = w[, j])[, ifelse(is.na(z), z[1L]*shift(cumprod(x)), z), cumsum(!is.na(z))][, V1]
  }
  w
}
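The cumsum(!is.na(z)) expression does the grouping here: it increments at every non-NA entry, so each run starting at an observed value gets its own group id, and z[1L] within a group is that run's starting value. A quick illustration:
z <- c(NA, NA, -0.24, NA, NA, 0.73, NA)
cumsum(!is.na(z))
# [1] 0 0 1 1 1 2 2   (leading NAs fall in group 0; each observed value opens a new group)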
For data frames with 100000 rows, it takes about 3s on my computer.
w <- do.call('rbind', lapply(1:10000, function(i)w))
r <- do.call('rbind', lapply(1:10000, function(i)r))
system.time(fdt(w, r))
#user system elapsed
#2.923 0.004 2.928
whereas the nested loop takes over 200 s:
f <- function(w, r){
  for (i in 1:(nrow(w)-1)) {
    for (j in 2:ncol(w)){
      if (is.na(w[i+1,j])) {
        w[i+1,j] <- w[i,j]*r[i,j]
      }
    }
  }
  w
}
system.time(f(w, r))
#   user  system elapsed
#206.406   0.043 206.559
[Edit]
The dplyr version runs slightly faster than fdt.
library(dplyr)
fdp <- function(w, r){
  for (j in 2:ncol(w)){
    d <- data_frame(x = r[, j], z = w[, j]) %>%   # data_frame() is deprecated; tibble() in current dplyr
      group_by(cumsum(!is.na(z))) %>%
      mutate(v = ifelse(is.na(z), z[1L]*lag(cumprod(x)), z))
    w[, j] <- d$v
  }
  w
}
system.time(fdp(w, r))
# user system elapsed
# 2.458 0.008 2.467
[Edit2]
For a couple of million rows, the data.table solution is still pretty slow. You can speed things up nicely with Rcpp.
Rcpp::cppFunction('NumericMatrix fill(NumericMatrix w, NumericMatrix r) {
  for (int i = 0; i < w.nrow()-1; i++) {
    for (int j = 0; j < w.ncol(); j++) {
      if (NumericVector::is_na(w(i+1, j))) {
        w(i+1, j) = w(i, j) * r(i, j);
      }
    }
  }
  return w;
}')
For 1M rows, it takes less than a second.
system.time(fill(as.matrix(w[,-1]), as.matrix(r[,-1])))
# user system elapsed
# 0.913 0.004 0.917
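One usage note, as a sketch of my own: fill() takes and returns plain matrices without the date column, so rebuilding a data frame shaped like w takes one extra step:
filled <- fill(as.matrix(w[, -1]), as.matrix(r[, -1]))   # filled values, date column dropped
w_out <- data.frame(date = w$date, filled)               # reattach the date column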

Here is an alternative approach:
library(zoo)
cumprodsplit <- function(col, r, w){
  # carry the last observation forward to fill the NAs
  fill_w <- na.locf(w)[[col]]
  # group id: increments at each non-NA value
  f <- cumsum(!is.na(w[[col]]))
  # split the filled column by group
  splits <- split(fill_w, f)
  # lagged cumulative products of r within each group
  cumprods <- lapply(split(r[[col]], f),
                     function(x) c(1, cumprod(x)[-length(x)]))
  # multiply the carried-forward values by the lagged cumprods
  vec <- mapply('*', splits, cumprods, SIMPLIFY = FALSE)
  # unlist back into a one-column data frame
  setNames(data.frame(unlist(vec, use.names = FALSE)), col)
}
do.call("cbind", lapply(names(w)[-1], cumprodsplit, r, w))
V1 V2 V3
1 NA NA NA
2 NA NA NA
3 -0.235706556 NA -0.542888255
4 -0.313442405 NA -0.072386744
5 -0.398833307 -0.43331032 -0.058212660
6 -0.165372814 0.12965300 0.003324337
7 -0.649471647 0.72675075 1.151911754
8 0.603077961 0.18330358 1.250710490
9 -0.177739406 -0.16349234 -0.864183216
10 0.001025054 -0.07123088 1.110129201

Related

The usage of "If", "else", "is.na()" in one function of R

I have Length and Weight values in a data frame; however, some of them are missing. The data frame looks like this:
df <- data.frame(
  L = c(13, 15, 19, NA, NA, 32, 35, NA, NA, 18, 15),
  W = c(NA, NA, 50, NA, NA, NA, 80, NA, NA, 30, NA)
)
I need a function that, when length is not NA and weight is NA, calculates the weight from the length, and otherwise does nothing.
lwr <- function(data, length, weight, a, b) {
  if(!is.na(data$length) && is.na(data$weight)) {
    data$weight = 10^(log(a) + b*log(data$length))
  } else {
    data$weight
  }
}
Here we go
lwr(data=df, length = L, weight = W, a=0.003, b=3.2)
but it does not work.
I would appreciate any help. Thank you very much for your time.
You could probably do this more simply.
f <- \(x, a, b) 10^(log(a) + b*log(x))
naw <- is.na(df$W)                    # rows with missing weight
df$W[naw] <- f(df$L[naw], .003, 3.2)  # fill only those rows
df
# L W
# 1 13 250.4350
# 2 15 718.8159
# 3 19 50.0000
# 4 NA NA
# 5 NA NA
# 6 32 191078.5331
# 7 35 80.0000
# 8 NA NA
# 9 NA NA
# 10 18 30.0000
# 11 15 718.8159
You should use the vectorized ifelse() instead of if()...else....
lwr <- function(length, weight, a, b) {
  ifelse(is.na(weight), 10^(log(a) + b*log(length)), weight)
}
df |>
transform(W2 = lwr(L, W, a=0.003, b=3.2))
# equivalent:
# df$W2 <- lwr(df$L, df$W, a=0.003, b=3.2)
# L W W2
# 1 13 NA 250.4350
# 2 15 NA 718.8159
# 3 19 50 50.0000
# 4 NA NA NA
# 5 NA NA NA
# 6 32 NA 191078.5331
# 7 35 80 80.0000
# 8 NA NA NA
# 9 NA NA NA
# 10 18 30 30.0000
# 11 15 NA 718.8159
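For completeness, a dplyr variant of the same fill-if-missing idea (a sketch assuming the dplyr package is available; coalesce() keeps W where it is present and falls back to the computed value):
library(dplyr)
df %>% mutate(W2 = coalesce(W, 10^(log(0.003) + 3.2 * log(L))))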

Remove leading NAs to align data

I have a large data.frame with 'staggered' data and would like to align it. What I mean is that I would like to take a data frame like the one constructed below and remove the leading (top) NAs from every column, so that each column is shifted up to start at its first non-NA value.
I know about the na.trim function from the zoo package, but this didn't work on either the initial data.frame presented above or its transpose. For this I used, with the transposed data frame t.df,
t.df <- na.trim(t.df, sides = 'left')
This only returned an empty data.frame, and wouldn't work the way I wanted anyway since it would create vectors of different lengths. Can anyone point me to a package or function that might be more helpful?
Here is the code for my example used above:
# example of what I have
var1 <- c(1,2,3,4,5,6,7,8,9,10)
var2 <- c(6,2,4,7,3,NA,NA,NA,NA,NA)
var3 <- c(NA,NA,8,6,3,7,NA,NA,NA,NA)
var4 <- c(NA,NA,NA,NA,5,NA,2,6,2,9)
df <- data.frame(var1, var2, var3, var4)
# transpose and (unsuccessful) attempt to remove leading NAs
t.df <- t(df)
t.df <- na.trim(t.df, sides = 'left')
We can loop over the columns with lapply() and apply na.trim to each. Then pad NAs at the end of each list element by assigning its length to be the maximum length among the list elements (the length<- padding is illustrated a bit further below).
library(zoo)
lst <- lapply(df, na.trim)
df[] <- lapply(lst, `length<-`, max(lengths(lst)))
df
# var1 var2 var3 var4
#1 1 6 8 5
#2 2 2 6 NA
#3 3 4 3 2
#4 4 7 7 6
#5 5 3 NA 2
#6 6 NA NA 9
#7 7 NA NA NA
#8 8 NA NA NA
#9 9 NA NA NA
#10 10 NA NA NA
Or, as @G.Grothendieck mentioned in the comments:
replace(df, TRUE, do.call("merge", lapply(lst, zoo)))
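As an aside, the length<- padding above works because extending a vector's length in R fills the new slots with NA:
x <- 1:3
length(x) <- 5   # extending pads with NA
x
# [1]  1  2  3 NA NA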
You can do it with base functions:
my.na.trim <- function(x) {
  r <- rle(is.na(x))
  # no leading NAs: nothing to do
  if (!r$value[1]) return(x)
  # rotate the leading NA run to the end
  x[c(((r$length[1]+1):length(x)), 1:r$length[1])]
}
df[,] <- lapply(df, my.na.trim)
df
# var1 var2 var3 var4
# 1 1 6 8 5
# 2 2 2 6 NA
# 3 3 4 3 2
# 4 4 7 7 6
# 5 5 3 NA 2
# 6 6 NA NA 9
# 7 7 NA NA NA
# 8 8 NA NA NA
# 9 9 NA NA NA
# 10 10 NA NA NA
An alternative coding of the function:
my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  r1 <- r$length[1]
  c(tail(x, -r1), head(x, r1))
}
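In both versions, rle(is.na(x)) is what detects the leading run of NAs; its first length is the number of elements to rotate to the back. For example:
x <- c(NA, NA, 8, 6, 3, 7, NA)
rle(is.na(x))
# Run Length Encoding
#   lengths: int [1:3] 2 4 1
#   values : logi [1:3] TRUE FALSE TRUE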
We can use the cbind.na() function from the qpcR package and combine it with the na.trim() function from the zoo package:
do.call(qpcR:::cbind.na, lapply(df, zoo::na.trim))
# var1 var2 var3 var4
# [1,] 1 6 8 5
# [2,] 2 2 6 NA
# [3,] 3 4 3 2
# [4,] 4 7 7 6
# [5,] 5 3 NA 2
# [6,] 6 NA NA 9
# [7,] 7 NA NA NA
# [8,] 8 NA NA NA
# [9,] 9 NA NA NA
#[10,] 10 NA NA NA
If speed matters, you can use this data.table solution.
library(data.table)
dt_foo <- function(dt) {
  # number of leading NAs per column
  shift_v <- sapply(dt, function(col) min(which(+(is.na(col)) == 0)) - 1)
  # build one shift(..., type = 'lead') call per column
  shift_expr <- parse(text = paste0("list(", paste("shift(", names(shift_v), ", n = ", shift_v, ", type = 'lead')", collapse = ", "), ")"))
  dt[, names(shift_v) := eval(shift_expr), with = FALSE]
  dt[]
}
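The idea behind dt_foo: count each column's leading NAs, then shift the whole column up by that amount with data.table's shift(type = 'lead'), which pads the vacated tail with NA. A quick illustration:
shift(c(NA, NA, 8, 6, 3, 7), n = 2, type = "lead")
# [1]  8  6  3  7 NA NA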
Some benchmarking follows.
library(zoo)
library(microbenchmark)
set.seed(1)
DT <- as.data.table(matrix(sample(c(0:9L, NA), 1e8, TRUE, prob = c(rep(.01, 10), .9)), ncol = 1000))
zoo_foo <- function(df) {
  lst <- lapply(df, na.trim)
  df[] <- lapply(lst, `length<-`, max(lengths(lst)))
  df
}
my.na.trim <- function(x) {
  r <- rle(is.na(x))
  if (!r$value[1]) return(x)
  x[c(((r$length[1]+1):length(x)), 1:r$length[1])]
}
microbenchmark(dt_foo(copy(DT)), zoo_foo(DT),
               as.data.frame(lapply(DT, my.na.trim)), times = 10)
Unit: seconds
                                  expr      min       lq     mean   median       uq      max neval cld
                      dt_foo(copy(DT)) 1.468749 1.618289 1.690293 1.699926 1.725534 1.893018    10 a
                           zoo_foo(DT) 6.493227 6.516247 6.834768 6.779045 7.190705 7.319058    10 c
 as.data.frame(lapply(DT, my.na.trim)) 4.988514 5.013340 5.384399 5.385273 5.508889 6.517748    10 b

Calculate cumsum() while ignoring NA values

Consider the following named vector x.
( x <- setNames(c(1, 2, 0, NA, 4, NA, NA, 6), letters[1:8]) )
# a b c d e f g h
# 1 2 0 NA 4 NA NA 6
I'd like to calculate the cumulative sum of x while ignoring the NA values. Many R functions have an argument na.rm which removes NA elements prior to calculations. cumsum() is not one of them, which makes this operation a bit tricky.
I can do it this way.
y <- setNames(numeric(length(x)), names(x))
z <- cumsum(na.omit(x))
y[names(y) %in% names(z)] <- z
y[!names(y) %in% names(z)] <- x[is.na(x)]
y
# a b c d e f g h
# 1 3 3 NA 7 NA NA 13
But this seems excessive, and makes a lot of new assignments/copies. I'm sure there's a better way.
What better methods are there to return the cumulative sum while effectively ignoring NA values?
You can do this in one line with:
cumsum(ifelse(is.na(x), 0, x)) + x*0
# a b c d e f g h
# 1 3 3 NA 7 NA NA 13
The x*0 term is what restores the NAs: NA times zero is still NA, while adding zero leaves the other entries unchanged. Or, similarly:
library(dplyr)
cumsum(coalesce(x, 0)) + x*0
# a b c d e f g h
# 1 3 3 NA 7 NA NA 13
It's an old question, but tidyr gives a newer solution based on the idea of replacing NA with zero. (Note that, unlike the answers above, this keeps the running total at the NA positions rather than restoring NA there.)
require(tidyr)
cumsum(replace_na(x, 0))
a b c d e f g h
1 3 3 3 7 7 7 13
Do you want something like this:
x2 <- x
x2[!is.na(x)] <- cumsum(x2[!is.na(x)])
x2
[edit] Alternatively, as suggested by a comment above, you can change NAs to 0s and restore them afterwards:
miss <- is.na(x)
x[miss] <- 0
cs <- cumsum(x)
cs[miss] <- NA
# cs is the requested cumsum
Here's a function I came up with from the answers to this question. Thought I'd share it, since it seems to work well so far. It calculates the cumulative FUNC of x while ignoring NA. FUNC can be any one of sum(), prod(), min(), or max(), and x is a numeric vector.
cumSkipNA <- function(x, FUNC) {
  d <- deparse(substitute(FUNC))
  funs <- c("max", "min", "prod", "sum")
  stopifnot(is.vector(x), is.numeric(x), d %in% funs)
  FUNC <- match.fun(paste0("cum", d))
  x[!is.na(x)] <- FUNC(x[!is.na(x)])
  x
}
set.seed(1)
x <- sample(15, 10, TRUE)
x[c(2,7,5)] <- NA
x
# [1] 4 NA 9 14 NA 14 NA 10 10 1
cumSkipNA(x, sum)
# [1] 4 NA 13 27 NA 41 NA 51 61 62
cumSkipNA(x, prod)
# [1] 4 NA 36 504 NA 7056 NA
# [8] 70560 705600 705600
cumSkipNA(x, min)
# [1] 4 NA 4 4 NA 4 NA 4 4 1
cumSkipNA(x, max)
# [1] 4 NA 9 14 NA 14 NA 14 14 14
Definitely nothing new, but maybe useful to someone.
Another option is the fcumsum() function from the collapse package, which skips NAs out of the box:
( x <- setNames(c(1, 2, 0, NA, 4, NA, NA, 6), letters[1:8]) )
#> a b c d e f g h
#> 1 2 0 NA 4 NA NA 6
library(collapse)
fcumsum(x)
#> a b c d e f g h
#> 1 3 3 NA 7 NA NA 13

How to squeeze in missing values into a vector

Let me try to make this question as general as possible.
Let's say I have two variables a and b.
a <- as.integer(runif(20, min = 0, max = 10))
a <- as.data.frame(a)
b <- as.data.frame(a[c(-7, -11, -15),])
So b has 17 observations and is a subset of a which has 20 observations.
My question is the following: how would I use these two variables to generate a third variable c which, like a, has 20 observations, but in which observations 7, 11 and 15 are missing, and whose other observations are identical to b, in the order of a?
Or, to put it somewhat differently: how could I squeeze these missing observations into variable b at locations 7, 11 and 15?
It seems pretty straightforward (and it probably is), but I have not managed to get this to work for a bit too long now.
1) loop Try this loop:
# test data
set.seed(123) # for reproducibility
a <- as.integer(runif(20, min = 0, max = 10))
a <- as.data.frame(a)
b <- as.data.frame(a[c(-7, -11, -15),])
# lets work with vectors
A <- a[[1]]
B <- b[[1]]
j <- 1   # pointer into B
C <- A
for(i in seq_along(A)) if (A[i] == B[j]) j <- j+1 else C[i] <- NA   # no match: A[i] was dropped from B
which gives:
> C
[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
2) Reduce Here is a loop-free version:
f <- function(j, a) j + (a == B[j])
r <- Reduce(f, A, acc = TRUE)
ifelse(duplicated(r), NA, A)
giving:
[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
3) dtw. Using dtw in the package of the same name we can get a compact loop-free one-liner:
library(dtw)
ifelse(duplicated(dtw(A, B)$index2), NA, A)
giving:
[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
REVISED Added additional solutions.
Here's a more complicated way of doing it, using the Levenshtein distance algorithm, that does a better job on trickier examples (it also seemed faster in a couple of larger tests I tried):
# using same data as G. Grothendieck:
set.seed(123) # for reproducibility
a <- as.integer(runif(20, min = 0, max = 10))
a <- as.data.frame(a)
b <- as.data.frame(a[c(-7, -11, -15),])
A = a[[1]]
B = b[[1]]
# compute the transformation between the two, assigning infinite weight to
# insertion and substitution
# using +1 here because the integers fed to intToUtf8 have to be larger than 0
# could also adjust the range more dynamically based on A and B
transf = attr(adist(intToUtf8(A+1), intToUtf8(B+1),
                    costs = c(Inf, 1, Inf), counts = TRUE), 'trafos')
C = A
C[substring(transf, 1:nchar(transf), 1:nchar(transf)) == "D"] <- NA
#[1] 2 7 4 8 9 0 NA 8 5 4 NA 4 6 5 NA 8 2 0 3 9
More complex matching example (where the greedy algorithm would perform poorly):
A = c(1,1,2,2,1,1,1,2,2,2)
B = c(1,1,1,2,2,2)
transf = attr(adist(intToUtf8(A), intToUtf8(B),
costs = c(Inf,1,Inf), counts = TRUE), 'trafos')
C = A
C[substring(transf, 1:nchar(transf), 1:nchar(transf)) == "D"] <- NA
#[1] NA NA NA NA 1 1 1 2 2 2
# the greedy algorithm would return this instead:
#[1] 1 1 NA NA 1 NA NA 2 2 2
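Some background on the trick: the 'trafos' attribute returned when counts = TRUE spells out the edit script as a string of M/I/D/S characters, for match, insertion, deletion and substitution; the D positions are exactly the elements of A with no counterpart in B. For instance:
attr(adist("abcd", "abd", counts = TRUE), "trafos")
#      [,1]
# [1,] "MMDM"   ("c", the third character, must be deleted to reach "abd")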
The data frame version, which isn't terribly different from G.'s above.
(Assumes a,b setup as above).
j <- 1
c <- a
for (i in seq_along(a[, 1])) {
  if (a[i, 1] == b[j, 1]) {
    j <- j + 1
  } else {
    c[i, 1] <- NA
  }
}

Extracting off-diagonal slice of large matrix

I've got a large nxn matrix and would like to take off-diagonal slices of varying sizes. For example:
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
I'd like an R function which, when given the matrix and a "width of diagonal slice", would return an nxn matrix of just those values. So for the matrix above and, say, 3, I'd get:
1 x x x x x
1 2 x x x x
1 2 3 x x x
x 2 3 4 x x
x x 3 4 5 x
x x x 4 5 6
At the moment I'm using (forgive me) a for loop which is incredibly slow:
getDiags <- function(ndiags, cormat){
  resmat <- matrix(ncol = ncol(cormat), nrow = nrow(cormat))
  dimnames(resmat) <- dimnames(cormat)
  for(j in 1:ndiags){
    resmat[row(resmat) == col(resmat) + j] <-
      cormat[row(cormat) == col(cormat) + j]
  }
  return(resmat)
}
I realise that this is a very "un-R" way to go about solving this problem. Is there a better way to do it, probably using diag or lower.tri?
size <- 6
mat <- matrix(seq_len(size ^ 2), ncol = size)
low <- 0
high <- 3
delta <- rep(seq_len(ncol(mat)), nrow(mat)) -
  rep(seq_len(nrow(mat)), each = ncol(mat))
#or Ben Bolker's better alternative
delta <- row(mat) - col(mat)
mat[delta < low | delta > high] <- NA
mat
This works with 5000 x 5000 matrices on my machine.
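Wrapped up as the function the question asks for, a minimal sketch of my own (band_slice is a made-up name; low and high generalize the single width argument, width w being low = 0, high = w - 1):
band_slice <- function(mat, low = 0, high = 2) {
  delta <- row(mat) - col(mat)             # offset below the main diagonal
  mat[delta < low | delta > high] <- NA    # blank everything outside the band
  mat
}
mat <- matrix(1:6, nrow = 6, ncol = 6, byrow = TRUE)
band_slice(mat, 0, 2)   # the width-3 slice from the question's example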
If you want to use upper.tri and lower.tri you could write functions like these:
cormat <- mapply(rep, 1:6, 6)
u.diags <- function(X, n) {
  X[n:nrow(X), ][lower.tri(X[n:nrow(X), ])] <- NA
  return(X)
}
or
l.diags <- function(X, n) {
  X[, n:ncol(X)][upper.tri(X[, n:ncol(X)])] <- NA
  return(X)
}
or
n.diags <- function(X, n.u, n.l) {
  X[n.u:nrow(X), ][lower.tri(X[n.u:nrow(X), ])] <- NA
  X[, n.l:ncol(X)][upper.tri(X[, n.l:ncol(X)])] <- NA
  return(X)
}
l.diags(cormat, 3)
u.diags(cormat, 3)
n.diags(cormat, 3, 1)
You can do:
matrix:
m <- matrix(1:6, ncol = 6, nrow = 6, byrow = TRUE)
function:
n_diag <- function(x, n) {
  d <- dim(x)
  ndiag <- .row(d) - n >= .col(d)
  x[upper.tri(x) | ndiag] <- NA
  return(x)
}
call:
n_diag(m,3)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 1 NA NA NA NA NA
#[2,] 1 2 NA NA NA NA
#[3,] 1 2 3 NA NA NA
#[4,] NA 2 3 4 NA NA
#[5,] NA NA 3 4 5 NA
#[6,] NA NA NA 4 5 6
just for fun:
#lapply(1:6, n_diag, x = m)
