Inserting NA rows when missing data - r

I have a data set with some missing values in a sequence:
Seq<-c(1,2,3,4,6,7,10,11,12,18,19,20)
Data<-c(3,4,5,4,3,2,1,2,3,5,4,3)
DF<-data.frame(Seq, Data)
I'd like to add rows to this data set approximating where I have missing values, and filling in the data with NA. So any time where I have a gap larger than 2, I add a NA row (or multiple rows if the gap is large). The result would look something like this:
NewSeq<-c(1,2,3,4,6,7,8.5,10,11,12,14,16,18,19,20)
NewData<-c(3,4,5,4,3,2,NA,1,2,3,NA,NA,18,19,20)
NewDF<-data.frame(NewSeq,NewData)
So I ignore when the gap is only < 2, but I add an NA row anytime there is a gap > 2. If there is still a > 2 gap after adding an NA row, I add another until the gap is filled.

Not very elegant, but this is how I would do it:
Seq<-c(1,2,3,4,6,7,10,11,12,18,19,20)
Data<-c(3,4,5,4,3,2,1,2,3,5,4,3)
DF<-data.frame(Seq, Data)
first <- DF$Seq
second <- DF$Data
for(i in length(first):2) {
gap <- first[i] - first[i - 1]
if(gap > 2) {
steps <- ifelse(gap %% 2 == 1, gap %/% 2, (gap %/% 2) -1)
new_values_gap <- gap / (steps + 1)
new_values <- vector('numeric')
for(j in 1:steps) {
new_values <- c(new_values, first[i - 1] + j * new_values_gap)
}
first <- c(first[1:i - 1], new_values, first[i:length(first)])
second <- c(second[1:i - 1], rep(NA, length(new_values)), second[i:length(second)])
}
}
NewDF <- data.frame(NewSeq = first, NewData = second)
> NewDF
## NewSeq NewData
## 1 1.0 3
## 2 2.0 4
## 3 3.0 5
## 4 4.0 4
## 5 6.0 3
## 6 7.0 2
## 7 8.5 NA
## 8 10.0 1
## 9 11.0 2
## 10 12.0 3
## 11 14.0 NA
## 12 16.0 NA
## 13 18.0 5
## 14 19.0 4
## 15 20.0 3

Seem to works for your example but not sure how it will perform on data I haven't seen. You'll need to adjust the intervals in the ifelse statement depending on how you want to account for different difference intervals.
Seq<-c(1,2,3,4,6,7,10,11,12,18,19,20)
Data<-c(3,4,5,4,3,2,1,2,3,5,4,3)
DF<-data.frame(Seq, Data)
diffs <- diff(Seq)
inds <- which(diffs > 2)
new.vals <- sapply(inds, function(x)
if(diffs[x] %% 2 != 0){
seq(Seq[x]+1.5, Seq[x+1]-1.5,1.5)
}else{
seq(Seq[x]+2, Seq[x+1]-2,2)
})
add.length <- unlist(lapply(new.vals, function(x) length(x)))
Seq.new <- c(Seq, unlist(new.vals))
id <- c(seq_along(Seq),
rep(inds+0.5,add.length))
Seq.new <- Seq.new[order(id)]
Data.new <- c(Data, rep(NA, sum(add.length)))
id <- c(seq_along(Seq),
rep(inds+0.5,add.length))
Data.new <- Data.new[order(id)]
NewDF <- data.frame(Seq.new, Data.new)

Related

Finding all sum of 2 power value combination values of a given number in R

R data frame 1 :
Index
Powervalue
0
1
1
2
2
4
3
8
4
16
5
32
R dataframe 2 :
CombinedValue
20
50
Expected Final Result :
Can we get the output as in the image. If yes please help.
One of stackoverflow mate provided below code. Am looking how to seperate , values as columns with 1 and 0.
df <- data.frame(sum = c(50, 20, 6))
values_list <- list()
for (i in 1:nrow(df)) {
sum <- df$sum[i]
values <- c()
while (sum > 0) {
value <- 2^floor(log2(sum))
values <- c(values, value)
sum <- sum - value
}
values_list[[i]] <- values
}
df$values <- values_list
Can we fix columns till power 31 as shown in attached image. The columns match with possiblecodes then place 1 and 0 else 0 for the remaining columns. Please help.
Here is a function whose output matches the expected output.
toCodes <- function(x) {
n <- floor(log2(x))
pow <- rev(seq.int(max(n)))
# 'y' is the matrix of codes
y <- t(sapply(x, \(.x) (.x %/% 2^pow) %% 2L))
i_cols <- apply(y, 2, \(.y) any(.y != 0L))
colnames(y) <- sprintf("code_%d", 2^pow)
#
possiblecodes <- apply(y, 1, \(p) {
codes <- 2^pow[as.logical(p)]
paste(rev(codes), collapse = ",")
})
data.frame(combinedvalue = x, possiblecodes, y[, i_cols])
}
x <- c(20L, 50L)
toCodes(x)
#> combinedvalue possiblecodes code_32 code_16 code_4 code_2
#> 1 20 4,16 0 1 1 0
#> 2 50 2,16,32 1 1 0 1
Created on 2022-12-19 with reprex v2.0.2

R how to find a series of common values in a vector (identifying growing season)

I'm looking for a way to identify a growing season which consists of a number of days greater than say 60 between the last frost day of spring and the first frost day in the fall. A general version of this problem is this. If I have a vector of numbers like testVec, I want the item numbers of the beginning and end range of values where the number of items is 5 or greater and all of them are greater than 0.
testVec <- c(1,3,4,0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
In this example, the relevant range is 1,3,4,6,7,5,9 which is testVec[9] to testVec[15]
One option could be:
testVec[with(rle(testVec > 0), rep(lengths * values >= 5, lengths))]
[1] 1 3 4 6 7 5 9
Here, the idea is to, first, create runs of values that are smaller or equal to zero and bigger than zero. Second, it checks whether the runs of values bigger than zero are of length 5 or more. Finally, it subsets the original vector for the runs of values bigger than zero with length 5 or more.
1) rleid This also handles any number of sequences including zero. rleid(ok) is a vector the same length as ok such that the first run of identical elements is replaced with 1, the second run with 2 and so on. The result is a list of vectors where each vector has its positions in the original input as its names.
library(data.table)
getSeq <- function(x) {
names(x) <- seq_along(x)
ok <- x > 0
s <- split(x[ok], rleid(ok)[ok])
unname(s)[lengths(s) >= 5]
}
getSeq(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
getSeq(numeric(16))
## list()
getSeq(c(testVec, 10 * testVec))
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
##
## [[2]]
## 25 26 27 28 29 30 31
## 10 30 40 60 70 50 90
If a data frame were desired then following gives the values and which sequence the row came from. The row names indicate the positions in the original input.
gs <- getSeq(c(testVec, 10 * testVec))
names(gs) <- seq_along(gs)
if (length(gs)) stack(gs) else gs
## values ind
## 9 1 1
## 10 3 1
## 11 4 1
## 12 6 1
## 13 7 1
## 14 5 1
## 15 9 1
## 25 10 2
## 26 30 2
## 27 40 2
## 28 60 2
## 29 70 2
## 30 50 2
## 31 90 2
2) gregexpr Replace each element that is > 0 with 1 and each other element with 0 pasting the 0's and 1's into a single character string. Then use gregexpr to look for sequences of 1's at least 5 long and for the ith such nonoverlapping sequence return the first positions, g, and lengths, attr(g, "match.length"). Define a function vals which extracts the values at the required positions from testVec of the ith such nonoverlapping sequence returning a list such that the ith component of the list is the ith such sequence. The names in the output vector are its positions in the input.
getSeq2 <- function(x) {
g <- gregexpr("1{5,}", paste(+(x > 0), collapse = ""))[[1]]
vals <- function(i) {
ix <- seq(g[i], length = attr(g, "match.length")[i])
setNames(x[ix], ix)
}
if (length(g) == 1 && g == -1) list() else lapply(seq_along(g), vals)
}
getSeq2(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
The above handles any number of sequences including 0 but if we knew there were exactly one sequence (which is the case for the example in the question) then it could be simplified to the following where the return value is just that vector:
g <- gregexpr("1{5,}", paste(+(testVec > 0), collapse = ""))[[1]]
ix <- seq(g, length = attr(g, "match.length"))
setNames(testVec[ix], ix)
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
You could "fix" #tmfmnk's solution like this:
f1 <- function(x, threshold, n) {
range(which(with(rle(x > threshold), rep(lengths * values >= n, lengths))))
}
x <- c(1, 3, 4, 0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
f1(x, 0, 5)
#[1] 9 15
But that does not work well when there are multiple runs
xx <- c(x, x)
f1(xx, 0, 5)
#[1] 9 31
Here is another, not so concise approach that returns the start and end of the longest run (the first one if there are ties).
f2 <- function(x, threshold, n) {
y <- x > threshold
y[is.na(y)] <- FALSE
a <- ave(y, cumsum(!y), FUN=cumsum)
m <- max(a)
if (m < n) return (c(NA, NA))
i <- which(a == m)[1]
c(i-m+1, i)
}
f2(x, 0, 5)
#[1] 9 15
f2(xx, 0, 5)
#[1] 9 15
or with rle
f3 <- function(x, threshold, n) {
y <- x > threshold
r <- rle(y)
m <- max(r$lengths)
if (m < n) return (c(NA, NA))
i <- sum(r$lengths[1:which.max(r$lengths)[1]])
c(i-max(r$lengths)+1, i)
}
f3(x, 0, 5)
#[1] 9 15
f3(xx, 0, 5)
#[1] 9 15
If you wanted the first run that is at least n, that is you do not want a next run, even if it is longer, you could do
f4 <- function(x, threshold, n) {
y <- with(rle(x > threshold), rep(lengths * values >= n, lengths))
i <- which(y)[1]
j <- i + which(!y[-c(1:i)])[1] - 1
c(i, j)
}

Split by percentage

I would like to know how to split a vector by a percentage. I tried to use the stats :: quantile function but it doesn't manage to separate correctly when there are several times the same values.
I would like a method that does the split only by taking into account the length of the vector without taking into account the values.
vector <- c(1,1,1, 4:10)
minProb <- 0.1
maxProb <- 0.9
l <- length(vector)
dt <- data.frame("id" = 1:l, "value" = vector)
dt <- dt %>% arrange(act)
#min <- l*minProb
#max <- l*maxProb
#data1 <- dt$id[min:max]
#data2 <- dt$id[-c(min:max)]
#q <- quantile(dt$act, probs=c(minProb,maxProb))
#w <- which(dt$act >= q[1] & dt$act <= q[2])
expected result (index of elements)
> g2
1 10
> g1
2 3 4 5 6 7 8 9
The following does split the vector, whether that's what the question asks for is not clear.
l <- length(vector)
qq <- quantile(seq_along(vector), probs = c(minProb, maxProb))
f <- logical(l)
f[round(qq[1])] <- TRUE
f[round(qq[2])] <- TRUE
split(vector, cumsum(f))
#$`0`
#[1] 1
#
#$`1`
#[1] 1 1 4 5 6 7 8
#
#$`2`
#[1] 9 10
In order to have the indices, like it is asked in a comment, do
split(seq_along(vector), cumsum(f))

How to remove near zero variance without employing the caret package?

I am new to programming. But here is the piece of code that I have tried to remove the nearZeroVar function of the caret package from:
N <- 200 # number of points per class
D <- 2 # dimensionality
K <- 4 # number of classes
X <- data.frame() # data matrix (each row = single example)
y <- data.frame() # class labels
...(some lines are omitted)...
X <- as.matrix(X)
Y <- matrix(0, N * K, K)
for (i in 1:(N * K)) { Y[i, y[i,]] <- 1}
...(some lines are omitted)...
nzv <- nearZeroVar(train)
nzv.nolabel <- nzv-1
inTrain <- createDataPartition(y=train$label, p=0.7, list=F)
training <- train[inTrain, ]
CV <- train[-inTrain, ]
X <- as.matrix(training[, -1])
N <- nrow(X)
y <- training[, 1]
K <- length(unique(y))
X.proc <- X[, -nzv.nolabel]/max(X)
D <- ncol(X.proc)
Xcv <- as.matrix(CV[, -1])
ycv <- CV[, 1]
Xcv.proc <- Xcv[, -nzv.nolabel]/max(X)
Y <- matrix(0, N, K)
So, to get rid of the nearZeroVar function, I have tried to use the Filter function and the following foo function:
foo <- function(data) {
out <- lapply(data, function(x) length(unique(x)))
want <- which(!out > 1)
unlist(want)
}
nzv <- foo(trainingSet)
nzv.nolabel <- nzv - 1
But I get error messages: "Error in X[, training.nolabel]: incorrect number of dimensions. Execution halted" or something like "Non-conformable arrays". Any ideas on how to work around the `nearZeroVar" are strongly appreciated. Please, let me know if I should share some more details.
It's not evident from the code posted that how Filter() was being used.
Try the following;
# create sample data
R> df <- data.frame(a=1:10, b=sample(10:19), c=rep(5,10))
R> df
a b c
1 1 16 5
2 2 17 5
3 3 18 5
4 4 13 5
5 5 15 5
6 6 14 5
7 7 11 5
8 8 12 5
9 9 19 5
10 10 10 5
creating a custom function like;
R> zeroVarianceCol<- function(df){
new_df<-Filter(var,df)
}
passing the dataframe to this custom function like, x<- zeroVarianceCol(df) will remove the near zero variance column, in this case column c.
R> x
a b
1 1 16
2 2 17
3 3 18
4 4 13
5 5 15
6 6 14
7 7 11
8 8 12
9 9 19
10 10 10

How do I get the column name (or number) of a multi series zoo object when I apply by row?

I have a zoo object of 12 sets of monthly returns on stock tickers. I want to get the symbol, which is the name of the series, or at least the column, of each month's best performing stock. I've been trying to do this with applying the max function, by row. How do I get the column name?
#Apply 'max' function across each row. I need to get the col number out of this.
apply(tsPctChgs, 1, max, na.rm = TRUE)
The usual answer would be via which.max() however, do note that this will return only the first of the maximums if there are two or more observations taking the maximum value.
An alternative is which(x == max(x)), which would return all value taking the maximum in the result of a tie.
You can then use the index returned to select the series maximum. Handling NAs is covered below to try to keep the initial discussion simple.
require("zoo")
set.seed(1)
m <- matrix(runif(50), ncol = 5)
colnames(m) <- paste0("Series", seq_len(ncol(m)))
ind <- seq_len(nrow(m))
mz <- zoo(m, order.by = ind)
> apply(mz, 1, which.max)
1 2 3 4 5 6 7 8 9 10
3 5 5 1 4 1 1 2 3 2
> apply(mz, 1, function(x) which(x == max(x)))
1 2 3 4 5 6 7 8 9 10
3 5 5 1 4 1 1 2 3 2
So use that to select the series name
i1 <- apply(mz, 1, function(x) which(x == max(x)))
colnames(mz)[i1]
> i1 <- apply(mz, 1, function(x) which(x == max(x)))
> colnames(mz)[i1]
[1] "Series3" "Series5" "Series5" "Series1" "Series4" "Series1" "Series1"
[8] "Series2" "Series3" "Series2"
Handling tied maximums
To illustrate the different behaviour, copy the maximum from month 1 (series 3) into series 1
mz2 <- mz ## copy
mz2[1,1] <- mz[1,3]
mz2[1,]
> mz2[1,]
1 0.9347052 0.2059746 0.9347052 0.4820801 0.8209463
Now try the two approaches again
> apply(mz2, 1, which.max)
1 2 3 4 5 6 7 8 9 10
1 5 5 1 4 1 1 2 3 2
> apply(mz2, 1, function(x) which(x == max(x)))
$`1`
Series1 Series3
1 3
.... ## truncated output ###
Notice how which.max only returns the maximum in series 1.
To use this approach to select the series name, you need to apply something to the list returned by apply(), e.g.
i2 <- apply(mz2, 1, function(x) which(x == max(x)))
lapply(i2, function (i, zobj) colnames(zobj)[i], zobj = mz2)
$`1`
[1] "Series1" "Series3"
$`2`
[1] "Series5"
$`3`
[1] "Series5"
$`4`
[1] "Series1"
$`5`
[1] "Series4"
$`6`
[1] "Series1"
$`7`
[1] "Series1"
$`8`
[1] "Series2"
$`9`
[1] "Series3"
$`10`
[1] "Series2"
Handling NAs
As you have potential for NAs, I would do the following:
apply(mz, 1, which.max, na.rm = TRUE) ## as you did already
apply(mz, 1, function(x, na.rm = TRUE) {
if(na.rm) {
x <- x[!is.na(x)]
}
which(x == max(x))
})
Since apply converts to matrix, I would use rollapply with width=1:
require("zoo")
set.seed(1)
m <- matrix(runif(50), ncol=5)
mz <- setNames(zoo(m, seq(nrow(m))), paste0("Series",seq(ncol(m))))
rollapply(mz, 1, function(r) colnames(mz)[which.max(r)], by.column=FALSE)

Resources