I have Length and Weight values in a data frame. However some of them are missing. The data frame is like:
df <- data.frame(
L = c(13,15,19,NA,NA,32,35,NA,NA,18,15),
W = c(NA,NA,50, NA,NA,NA,80,NA,NA,30,NA)
)
I need a function which will work when length is not NA and weight is NA. it will calculate the weight for length, and else it will do nothing.
lwr <- function(data, length, weight, a, b) {
if(!is.na(data$length) && is.na(data$weight)) {
data$weight = 10^(log(a) + b*log(data$length))
} else {
data$weight
}
}
Here we go
lwr(data=df, length = L, weight = W, a=0.003, b=3.2)
but it does not work.
If you help me, I would be appreciated. Thank you very much for your time.
You probably could do that easier.
f <- \(x, a, b) 10^(log(a) + b*log(x))
naw <- is.na(df$W)
df$W[naw] <- f(df$L[naw], .003, 3.2)
# L W
# 1 13 250.4350
# 2 15 718.8159
# 3 19 50.0000
# 4 NA NA
# 5 NA NA
# 6 32 191078.5331
# 7 35 80.0000
# 8 NA NA
# 9 NA NA
# 10 18 30.0000
# 11 15 718.8159
You should use the vectorized ifelse(), instead of if()...else....
lwr <- function(length, weight, a, b) {
ifelse(is.na(weight), 10^(log(a) + b*log(length)), weight)
}
df |>
transform(W2 = lwr(L, W, a=0.003, b=3.2))
# equivalent:
# df$W2 <- lwr(df$L, df$W, a=0.003, b=3.2)
# L W W2
# 1 13 NA 250.4350
# 2 15 NA 718.8159
# 3 19 50 50.0000
# 4 NA NA NA
# 5 NA NA NA
# 6 32 NA 191078.5331
# 7 35 80 80.0000
# 8 NA NA NA
# 9 NA NA NA
# 10 18 30 30.0000
# 11 15 NA 718.8159
Related
My data is structured as follows:
set.seed(20)
RawData <- data.frame(Trial = c(rep(1, 10), rep(2, 10)),
X_Velocity = runif(20, 1, 3),
Y_Velocity = runif(20, 4, 6))
I now wish to calculate an average for X_Velocity and Y_Velocity across every two rows, for each Trial. My anticipated output, for the first four rows would be:
X_Velocity_AVG Y_Velocity_AVG
NA NA
2.6460545 4.522224
NA NA
1.8081265 4.5175165
How do I complete this?
You could do this using function f in which the average of every two elements is computed:
f <- function(a) tapply(a, rep(1:(length(a)/2), each = 2), FUN = mean)
res <- data.frame(X_Velocity_AVG=rep(NA, nrow(RawData)),
Y_Velocity_AVG=rep(NA, nrow(RawData)))
res$X_Velocity_AVG[c(F,T)] <- f(RawData$X_Velocity)
res$Y_Velocity_AVG[c(F,T)] <- f(RawData$Y_Velocity)
# X_Velocity_AVG Y_Velocity_AVG
# 1 NA NA
# 2 2.646055 4.522224
# 3 NA NA
# 4 1.808127 4.517517
# 5 NA NA
# 6 2.943262 4.334551
# 7 NA NA
# 8 1.162082 5.899396
# 9 NA NA
# 10 1.697668 4.739195
# 11 NA NA
# 12 2.473324 4.778723
# 13 NA NA
# 14 1.744730 5.020097
# 15 NA NA
# 16 1.644518 4.986245
# 17 NA NA
# 18 1.431219 5.375815
# 19 NA NA
# 20 2.108719 4.909284
Example of data:
>w
date V1 V2 V3
1 1 NA NA NA
2 2 NA NA NA
3 3 -0.2357066 NA -0.5428883
4 4 NA NA NA
5 5 NA -0.4333103 NA
6 6 NA NA NA
7 7 -0.6494716 0.7267507 1.1519118
8 8 NA NA NA
9 9 NA NA NA
10 10 NA NA NA
> r
date V1 V2 V3
1 1 1.262954285 0.7635935 -0.22426789
2 2 -0.326233361 -0.7990092 0.37739565
3 3 1.329799263 -1.1476570 0.13333636
4 4 1.272429321 -0.2894616 0.80418951
5 5 0.414641434 -0.2992151 -0.05710677
6 6 -1.539950042 -0.4115108 0.50360797
7 7 -0.928567035 0.2522234 1.08576936
8 8 -0.294720447 -0.8919211 -0.69095384
9 9 -0.005767173 0.4356833 -1.28459935
10 10 2.404653389 -1.2375384 0.04672617
I am trying to fill in w using the following rule: w(t+1) <- w(t)*r(t), but only the values after the first non-NA element. The for-loop equivalent is:
for (i in 1:(nrow(w)-1)) {
for (j in 2:ncol(w)){
if (is.na(w[i+1,j])) {
w[i+1,j] <- w[i,j]*r[i,j]
}
}
}
and gives:
> w
date V1 V2 V3
1 1 NA NA NA
2 2 NA NA NA
3 3 -0.235706556 NA -0.542888255
4 4 -0.313442405 NA -0.072386744
5 5 -0.398833307 -0.43331032 -0.058212660
6 6 -0.165372814 0.12965300 0.003324337
7 7 -0.649471647 0.72675075 1.151911754
8 8 0.603077961 0.18330358 1.250710490
9 9 -0.177739406 -0.16349234 -0.864183216
10 10 0.001025054 -0.07123088 1.110129201
This is a bit similar to cumprod, maybe, but I am stuck. Is it possible to avoid the for-loops (or at least one of them), so to speed things up?
The data can be replicated by:
set.seed(0)
r <- as.data.frame(matrix(data = rnorm(30), nrow = 10, ncol = 3))
w <- as.data.frame(matrix(data = NA, nrow =10, ncol = 3))
w[3, c(1,3)] <- rnorm(2)
w[5, 2] <- rnorm(1)
w[7,] <- rnorm(ncol(w))
date <- 1:nrow(w)
w <- cbind(date, w)
r <- cbind(date, r)
If you have a few columns, you can replace the inner loop by following data.table operation.
library(data.table) # v1.9.5
fdt <- function(w, r){
for (j in 2:ncol(w)){
w[,j] <- data.table(x=r[, j], z=w[, j])[,ifelse(is.na(z), z[1L]*shift(cumprod(x)), z), cumsum(!is.na(z))][,V1]
}
w
}
For data frames with 100000 rows, it takes about 3s on my computer.
w <- do.call('rbind', lapply(1:10000, function(i)w))
r <- do.call('rbind', lapply(1:10000, function(i)r))
system.time(fdt(w, r))
#user system elapsed
#2.923 0.004 2.928
whereas, the nested loop takes 200s
system.time(f(w, r))
# user system elapsed
#206.406 0.043 206.559
f <- function(w, r){
for (i in 1:(nrow(w)-1)) {
for (j in 2:ncol(w)){
if (is.na(w[i+1,j])) {
w[i+1,j] <- w[i,j]*r[i,j]
}
}
}
w
}
[Edit]
dplyr version runs slightly faster than fd.
library(dplyr)
fdp <- function(w, r){
for (j in 2:ncol(w)){
d <- data_frame(x=r[, j], z=w[, j]) %>%
group_by(cumsum(!is.na(z))) %>%
mutate(v=ifelse(is.na(z), z[1L]*lag(cumprod(x)), z))
w[, j] <- d$v
}
w
}
system.time(fdp(w, r))
# user system elapsed
# 2.458 0.008 2.467
[Edit2]
For a couple million rows, data.table solution is still pretty slow. You can speed the things up nicely with Rcpp.
Rcpp::cppFunction('NumericMatrix fill(NumericMatrix w, NumericMatrix r) {
for (int i = 0; i < w.nrow()-1; i++) {
for (int j = 0; j < w.ncol(); j++) {
if (NumericVector::is_na(w(i+1,j))) {
w(i+1,j) = w(i,j)*r(i,j);
}
}
}
return w;
}')
For 1M rows, it takes less than a sec.
system.time(fill(as.matrix(w[,-1]), as.matrix(r[,-1])))
# user system elapsed
# 0.913 0.004 0.917
Here is an alternative approach:
library(zoo)
cumprodsplit <- function(col, r, w){
# fill the NAs
fill_w <- na.locf(w)[[col]]
# groups
f <- cumsum(!is.na(w[[col]]))
# split w
splits <- split(fill_w, f)
# generate the cumprods
cumprods <- lapply(split(r[[col]], f),
function(x) c(1, cumprod(x)[-length(x)]))
# multiply
vec <- mapply('*', splits, cumprods, SIMPLIFY = FALSE)
#unlist
setNames(data.frame(unlist(vec, use.names = FALSE)), col)
}
do.call("cbind", lapply(names(w)[-1], cumprodsplit, r, w))
V1 V2 V3
1 NA NA NA
2 NA NA NA
3 -0.235706556 NA -0.542888255
4 -0.313442405 NA -0.072386744
5 -0.398833307 -0.43331032 -0.058212660
6 -0.165372814 0.12965300 0.003324337
7 -0.649471647 0.72675075 1.151911754
8 0.603077961 0.18330358 1.250710490
9 -0.177739406 -0.16349234 -0.864183216
10 0.001025054 -0.07123088 1.110129201
Consider the following named vector x.
( x <- setNames(c(1, 2, 0, NA, 4, NA, NA, 6), letters[1:8]) )
# a b c d e f g h
# 1 2 0 NA 4 NA NA 6
I'd like to calculate the cumulative sum of x while ignoring the NA values. Many R functions have an argument na.rm which removes NA elements prior to calculations. cumsum() is not one of them, which makes this operation a bit tricky.
I can do it this way.
y <- setNames(numeric(length(x)), names(x))
z <- cumsum(na.omit(x))
y[names(y) %in% names(z)] <- z
y[!names(y) %in% names(z)] <- x[is.na(x)]
y
# a b c d e f g h
# 1 3 3 NA 7 NA NA 13
But this seems excessive, and makes a lot of new assignments/copies. I'm sure there's a better way.
What better methods are there to return the cumulative sum while effectively ignoring NA values?
You can do this in one line with:
cumsum(ifelse(is.na(x), 0, x)) + x*0
# a b c d e f g h
# 1 3 3 NA 7 NA NA 13
Or, similarly:
library(dplyr)
cumsum(coalesce(x, 0)) + x*0
# a b c d e f g h
# 1 3 3 NA 7 NA NA 13
It's an old question but tidyr gives a new solution.
Based on the idea of replacing NA with zero.
require(tidyr)
cumsum(replace_na(x, 0))
a b c d e f g h
1 3 3 3 7 7 7 13
Do you want something like this:
x2 <- x
x2[!is.na(x)] <- cumsum(x2[!is.na(x)])
x2
[edit] Alternatively, as suggested by a comment above, you can change NA's to 0's -
miss <- is.na(x)
x[miss] <- 0
cs <- cumsum(x)
cs[miss] <- NA
# cs is the requested cumsum
Here's a function I came up from the answers to this question. Thought I'd share it, since it seems to work well so far. It calculates the cumulative FUNC of x while ignoring NA. FUNC can be any one of sum(), prod(), min(), or max(), and x is a numeric vector.
cumSkipNA <- function(x, FUNC)
{
d <- deparse(substitute(FUNC))
funs <- c("max", "min", "prod", "sum")
stopifnot(is.vector(x), is.numeric(x), d %in% funs)
FUNC <- match.fun(paste0("cum", d))
x[!is.na(x)] <- FUNC(x[!is.na(x)])
x
}
set.seed(1)
x <- sample(15, 10, TRUE)
x[c(2,7,5)] <- NA
x
# [1] 4 NA 9 14 NA 14 NA 10 10 1
cumSkipNA(x, sum)
# [1] 4 NA 13 27 NA 41 NA 51 61 62
cumSkipNA(x, prod)
# [1] 4 NA 36 504 NA 7056 NA
# [8] 70560 705600 705600
cumSkipNA(x, min)
# [1] 4 NA 4 4 NA 4 NA 4 4 1
cumSkipNA(x, max)
# [1] 4 NA 9 14 NA 14 NA 14 14 14
Definitely nothing new, but maybe useful to someone.
Another option is using the collapse package with fcumsum function like this:
( x <- setNames(c(1, 2, 0, NA, 4, NA, NA, 6), letters[1:8]) )
#> a b c d e f g h
#> 1 2 0 NA 4 NA NA 6
library(collapse)
fcumsum(x)
#> a b c d e f g h
#> 1 3 3 NA 7 NA NA 13
Created on 2022-08-24 with reprex v2.0.2
I'm trying to tabulate/map the counts of 2 factor-class vectors (b1 & b2) into a bigger dataframe. Summary of the vectors are as below:
> summary(b1)
(4,6] (6,8] NA's
16 3 1
> summary(b2)
(4,6] (6,8] NA's
9 0 11
I would like to map the above counts into a bigger dataframe:
Intervals b1 b2
1 (-Inf,0] NA NA
2 (0,2] NA NA
3 (2,4] NA NA
4 (4,6] NA NA
5 (6,8] NA NA
6 (8,10] NA NA
7 (10,12] NA NA
8 (12, Inf] NA NA
My question: is there a vectorized or more direct way to do the above without resorting to a 'for' loop + if-else condition checking? It seems like something easily done, but I'm have been having this mental block and I haven't been successful in finding relevant help online. Any help/hint is appreciated. Thanks in advance!
My sample code is attached:
NoOfElement <- 20
MyBreaks <- c(seq(4, 8, by=2))
MyBigBreaks <- c(-Inf, seq(0,12, by=2), Inf)
set.seed(1)
a1 <- rnorm(NoOfElement, 5); a2 <- rnorm(NoOfElement, 4)
b1 <- cut(a1, MyBreaks); b2 <- cut(a2, MyBreaks)
c <- seq(-10, 10)
d <- cut(c, MyBigBreaks)
e <- data.frame( Intervals=levels(d), b1=NA, b2=NA )
The table function does the tabulation that you need. It returns a named vector, and you can compare the names against the column e$Intervals to assign the correct values.
This relies on the fact that the order of the factor levels is the same in e$Intervals and b1 and b2. This is so because these all come from cut.
e$b1[e$Intervals %in% names(table(b1))] <- table(b1)
e$b2[e$Intervals %in% names(table(b2))] <- table(b2)
e
## Intervals b1 b2
## 1 (-Inf,0] NA NA
## 2 (0,2] NA NA
## 3 (2,4] NA NA
## 4 (4,6] 16 9
## 5 (6,8] 3 0
## 6 (8,10] NA NA
## 7 (10,12] NA NA
## 8 (12, Inf] NA NA
Here is my problem. I have a large vector of positive data. My goal is to remove the sequences of at least N consecutive values that are repeated in the vector (all of repeated values must be strictly > 0).
I've written a program that works and is as follows :
X is my vector of numeric values ;
N is the minimum length of repeated sequences.
rmpParNASerieRepetee <- function(X, N)
{
X_ <- paste("T", paste(X, collapse="T"), "T", sep="")
ind.parcours <- 1
ind.sup <- c()
# Loop on the values
while ( ind.parcours <= (length(X)-N+1) )
{
# indices of my sequence of N values
deb <- ind.parcours
fin <- ind.parcours + N-1
# sequence of N values to search in the vector
serie <- X[deb:fin]
serie_ <- paste("T", paste(serie, collapse="T"), "T", sep="")
borne <- 1*(ind.parcours < (length(X)-N+1)) + 0*(ind.parcours == (length(X)-N+1))
if (sum(X[(length(X)-N+1):length(X)]==serie)==3) borne <- 0
# split my string vector by my sequence vector of N values and count the pieces of result
if ( length(unlist(strsplit(X_, serie_)))-1 > borne && length(which(serie!=0))>=N)
{ ind.sup <- unique(c(ind.sup, deb:fin)) }
ind.parcours <- ind.parcours+1
}
if (length(ind.sup !=0)) { X[ind.sup] <- NA }
list_return <- list(X=X, Ind.sup=unique(sort(ind.sup)))
return (list_return)
}
I think my function is really not optimal (calculation time of 1:15 for a vector of 92,000 values, N=18). And I have to do this step 1600 times... It would take around 3 months...
Does anyone have a better idea ?
Example :
x <- c(1,2,3,4,0,4,1,2,3,8,9,1,2,3,4,0)
N <- 3
# (1,2,3) is a sequence of 3 elements which is repeated
# (1,2,3,4) is sequence of 4 elements which is repeated
# no other sequence of length at least 3 repeats
# my result should also be :
# NA NA NA NA 0 4 NA NA NA 8 9 NA NA NA NA 0
# The result of my program is :
# $X
# [1] NA NA NA NA 0 4 NA NA NA 8 9 NA NA NA NA 0
#$Ind.sup
# [1] 1 2 3 4 7 8 9 12 13 14 15
A way:
f <- function(X, N)
{
.rle <- rle(sort(X))
res <- .rle$values[.rle$lengths >= N]
res <- res[res > 0]
inds <- X %in% res
X[inds] <- NA
list(X = X, Ind = which(inds))
}
#> f(X, 3)
#$X
# [1] NA NA NA NA 0 0 0 0 NA NA NA NA NA NA 8 9 NA NA NA NA NA NA 0 0 0
#
#$Ind
# [1] 1 2 3 4 9 10 11 12 13 14 17 18 19 20 21 22
Try using table and %in% to get faster speed due to vectorisation.
rmpParNASerieRepetee<-function(X,N){
tab<-table(X[X>0])
over.n<-as.numeric(names(tab)[tab>=N])
ind<-X %in% over.n
Ind.sup<-which(ind)
X<-ifelse(ind,NA,X)
list(Ind.sup,X)
}
X <- c(1,2,3,4,0,0,0,0,1,4,1,2,3,4,8,9,1,2,3,1,4,1,0,0,0)
rmpParNASerieRepetee(X,3)
# [[1]]
# [1] 1 2 3 4 9 10 11 12 13 14 17 18 19 20 21 22
#
# [[2]]
# [1] NA NA NA NA 0 0 0 0 NA NA NA NA NA NA 8 9 NA NA NA NA NA NA 0 0 0
And a little test for 92000 values:
X<-sample(1:10000,92000,TRUE)
system.time(rmpParNASerieRepetee(X,3))
# user system elapsed
# 0.14 0.00 0.14
One way to think about this is that in a sequence, each element differs from the last one by 1, so:
X <- c(1,2,3,4,0,0,0,0,1,4,1,2,3,4,8,9,1,2,3,1,4,1,0,0,0)
y <- X[-1]
diff <- y-X[1:length(X)-1]
diff
[1] 1 1 1 -4 0 0 0 1 3 -3 1 1 1 4 1 -8 1 1 -2 3 -3 -1 0 0
And now you're looking for sequences of > N 1's in diff.
I have optimized my function, and now it takes "only" 10 minutes for a vector of length 92000.
Maybe someone could find an other solution more faster than mine.
Imagine my vector is X<-c(1,2,3,4,0,7,8,1,2,3,NA,8,9,1,2,3,4) and N=3.
c(1,2,3) et c(1,2,3,4) are the only repeated sequences of length at least N without NA or 0. So my result should be NA NA NA NA 0 7 8 NA NA NA NA 8 9 NA NA NA NA.
To answer my problem, I use this principle :
I create a big string like this : X_ <- T1T2T3T4T0T7T8T1T2T3TNAT8T9T1T2T3T4 in which, all X values are concatened by T. For each little string of length N=3 (ex : the first is T1T2T3T), I break my big string X_ using strsplit function with the pattern "little string". If the length of the result is more than 2, the sequence is repeated.
Care must be taken not to take null values in the series, and some adaptation must be done to avoid edge phenomena (borne in my function)...
I created these functions which work :
# Function to count NA in a vector
count.na <- function(vec) { return (length(which(is.na(vec)))) }
# Function to detect sequence of stricly postive numbers of length at least N
rmpParNASerieRepetee <- function(X, N, val.min=0)
{
# Collapse the vector to make a big string
X_ <- paste("T", paste(X, collapse="T"), "T", sep="")
# Index term
ind.parcours <- 1
ind.sup <- c()
# Loop on X values
while ( ind.parcours <= (length(X)-N+1) )
{
# Selection of the sequence to be detected
deb <- ind.parcours
fin <- ind.parcours + N-1
serie <- X[deb:fin]
# All values are > 0
if ( length(which(serie>0)) >= (N-count.na(serie)) )
{
# Research of repetition with strsplit
serie_ <- paste("T", paste(serie, collapse="T"), "T", sep="")
borne <- 1*(ind.parcours < (length(X)-N+1)) + 0*(ind.parcours == (length(X)-N+1))
if (sum(X[(length(X)-N+1):length(X)]==serie, na.rm=TRUE)==N) borne <- 0
if (length(unlist(strsplit(X_, serie_)))-1 > borne)
ind.sup <- unique( c(ind.sup, deb:fin) )
# Incrementation
ind.parcours <- ind.parcours + 1
}
# Contains 0
else
{ ind.parcours <- ind.parcours + max(which(serie==0))
}
}
# Invalidaion of repeated sequences
if (length(ind.sup !=0)) { X[ind.sup] <- NA }
# Return
list_return <- list(X=X, Ind.sup=unique(sort(ind.sup)))
return (list_return)
}
I hope someone will find an other way to solve my problem !