I use quantmod, to calculate the moving average over 2000 dataframes with loop
price = xts object
price <- cbind(price, SMA(price, 5), SMA(price, 10),
SMA(price, 20), SMA(price, 60), SMA(price, 120),
SMA(price, 180), SMA(price, 240))
But some data don't exceed the number of width, stop running in the middle. In that case, I just want to fill NA only.
I need some support to solve this problem.
Or if I need to use any other package for solving this problem, let me know
Thanks
Moving average functions give an error when the chosen period is longer than the available data. As #RuiBarradas mentions in the comment, for a SMA zoo::rollmean could work. As you need to loop over quite a few data.frames a function is easier. The function below could be used in an lapply function or just in a loop.
I created a sub function inside the bigger function to check if the chosen period is bigger than the rows supplied. If so, return a vector of NA's else return a SMA. After that, loop over the periods to return a data.frame with the supplied price column and all the SMA columns with a name so you can see which SMA is in which column.
Note that there is no error handling in case of incorrect inputs. Sample data below.
# periods for the SMA
periods <- c(5, 10, 20, 60, 120, 180, 240)
get_smas <- function(price, n) {
my_sma <- function(x, n = 10) {
if (n < 1 || n > NROW(x)) {
out <- rep(NA_real_, NROW(x))
} else {
# change SMA for EMA if you want the EMA's
out <- TTR::SMA(x, n = n)
}
out
}
# combine the price column with the ma's. Reduce works backwards, so price column last
price_combined <- Reduce(cbind, lapply(n, function(x) my_sma(price, n = x)), price)
# turn matrix into data.frame
price_combined <- data.frame(price_combined)
# rename columns, assuming price column has a column name.
# change paste0 value from SMA to EMA if EMA is used.
names(price_combined) <- c(names(price_combined)[1], paste0("SMA_", n))
price_combined
}
# supply a price and a vector of periods
my_prices <- get_smas(price, periods)
head(my_prices, 2)
Close SMA_5 SMA_10 SMA_20 SMA_60 SMA_120 SMA_180 SMA_240
1 182.01 NA NA NA NA NA NA NA
2 179.70 NA NA NA NA NA NA NA
tail(my_prices, 2)
Close SMA_5 SMA_10 SMA_20 SMA_60 SMA_120 SMA_180 SMA_240
142 156.79 154.156 152.053 147.475 145.4393 156.1770 NA NA
143 157.35 154.556 152.941 148.381 145.4292 156.0474 NA NA
data:
# close prices of aapl from 2022-01-03 to 2022-07-28
price <- structure(list(Close = c(182.009995, 179.699997, 174.919998,
172, 172.169998, 172.190002, 175.080002, 175.529999, 172.190002,
173.070007, 169.800003, 166.229996, 164.509995, 162.410004, 161.619995,
159.779999, 159.690002, 159.220001, 170.330002, 174.779999, 174.610001,
175.839996, 172.899994, 172.389999, 171.660004, 174.830002, 176.279999,
172.119995, 168.639999, 168.880005, 172.789993, 172.550003, 168.880005,
167.300003, 164.320007, 160.070007, 162.740005, 164.850006, 165.119995,
163.199997, 166.559998, 166.229996, 163.169998, 159.300003, 157.440002,
162.949997, 158.520004, 154.729996, 150.619995, 155.089996, 159.589996,
160.619995, 163.979996, 165.380005, 168.820007, 170.210007, 174.070007,
174.720001, 175.600006, 178.960007, 177.770004, 174.610001, 174.309998,
178.440002, 175.059998, 171.830002, 172.139999, 170.089996, 165.75,
167.660004, 170.399994, 165.289993, 165.070007, 167.399994, 167.229996,
166.419998, 161.789993, 162.880005, 156.800003, 156.570007, 163.639999,
157.649994, 157.960007, 159.479996, 166.020004, 156.770004, 157.279999,
152.059998, 154.509995, 146.5, 142.559998, 147.110001, 145.539993,
149.240005, 140.820007, 137.350006, 137.589996, 143.110001, 140.360001,
140.520004, 143.779999, 149.639999, 148.839996, 148.710007, 151.210007,
145.380005, 146.139999, 148.710007, 147.960007, 142.639999, 137.130005,
131.880005, 132.759995, 135.429993, 130.059998, 131.559998, 135.869995,
135.350006, 138.270004, 141.660004, 141.660004, 137.440002, 139.229996,
136.720001, 138.929993, 141.559998, 142.919998, 146.350006, 147.039993,
144.869995, 145.860001, 145.490005, 148.470001, 150.169998, 147.070007,
151, 153.039993, 155.350006, 154.089996, 152.949997, 151.600006,
156.789993, 157.350006)), class = "data.frame", row.names = c(NA,
-143L))
rollmeanr and rollapplyr can handle the situation with fewer data items than width.
library(zoo)
price <- 1:6
rollmeanr(price, 10, fill = NA)
## [1] NA NA NA NA NA NA
w <- c(5, 10, 20, 60, 120, 180, 240)
sapply(setNames(w, w), rollmeanr, x = price, fill = NA)
## 5 10 20 60 120 180 240
## [1,] NA NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA NA
## [5,] 3 NA NA NA NA NA NA
## [6,] 4 NA NA NA NA NA NA
Given a uncertain number of columns containing source values for the same variable I would like to create a column that defines the final value to be selected depending on source importance and availability.
Reproducible data:
set.seed(123)
actuals = runif(10, 500, 1000)
get_rand_vector <- function(){return (runif(10, 0.95, 1.05))}
get_na_rand_ixs <- function(){return (round(runif(5,0,10),0))}
df = data.frame("source_1" = actuals*get_rand_vector(),
"source_2" = actuals*get_rand_vector(),
"source_n" = actuals*get_rand_vector())
df[["source_1"]][get_na_rand_ixs()] <- NA
df[["source_2"]][get_na_rand_ixs()] <- NA
df[["source_n"]][get_na_rand_ixs()] <- NA
My manual solution is as follows:
df$available <- ifelse(
!is.na(df$source_1),
df$source_1,
ifelse(
!is.na(df$source_2),
df$source_2,
df$source_n
)
)
Given the desired result of:
source_1 source_2 source_n available
1 NA NA NA NA
2 NA NA 930.1242 930.1242
3 716.9981 NA 717.9234 716.9981
4 NA 988.0446 NA 988.0446
5 931.7081 NA 924.1101 931.7081
6 543.6802 533.6798 NA 543.6802
7 744.6525 767.4196 783.8004 744.6525
8 902.8788 955.1173 NA 902.8788
9 762.3690 NA 761.6135 762.3690
10 761.4092 702.6064 708.7615 761.4092
How could I automatically iterate over the available sources to set the data to be considered? Given in some cases n_sources could be 1,2,3..,7 and priority follows the natural order (1 > 2 >..)
Once you have all of the candidate vectors in order and in an appropriate data structure (e.g., data.frame or matrix), you can use apply to apply a function over the rows. In this case, we just look for the first non-NA value. Thus, after the first block of code above, you only need the following line:
df$available <- apply(df, 1, FUN = function(x) x[which(!is.na(x))[1]])
coalesce() from dplyr is designed for this:
library(dplyr)
df %>%
mutate(available = coalesce(!!!.))
source_1 source_2 source_n available
1 NA NA NA NA
2 NA NA 930.1242 930.1242
3 716.9981 NA 717.9234 716.9981
4 NA 988.0446 NA 988.0446
5 931.7081 NA 924.1101 931.7081
6 543.6802 533.6798 NA 543.6802
7 744.6525 767.4196 783.8004 744.6525
8 902.8788 955.1173 NA 902.8788
9 762.3690 NA 761.6135 762.3690
10 761.4092 702.6064 708.7615 761.4092
I have datasets that have sulfate and nitrate columns in them. Depending on what the user chooses, either sulfate mean or nitrate mean is returned. I have a for loop and within it I have an IF and ELSE statement to sort this out. The following error arises when attempting to compile data.frame(datada,vec1):
"Error in data.frame(datada, vec1) : object 'datada' not found"
Also, the else statement is considered unexpected. The following error is given:
"Error: unexpected 'else' in " else"
complete <- function(directory,pollutant = "sulfate", id = 1:332) {
datada <- id
filelist <- list.files(path = directory, pattern = ".csv", full.names = TRUE)
vec <- numeric()
vec1 <- numeric()
vec2 <- numeric()
for(i in datada) {
if (pollutant == "sulfate"){
data <- read.csv(filelist[i])
vec1<- c(vec1, colMeans(data$sulfate,na.rm = TRUE )
}
data.frame(datada,vec1) #datada is not "found"
else (pollutant == "nitrate"){ #else is "unexpected"
data <- read.csv(filelist[i])
vec2<- c(vec2, colMeans(data$sulfate,na.rm = TRUE )
}
data.frame(datada,vec2)
}
Here is what one dataset looks like:
Date sulfate nitrate ID
1 2001-01-01 NA NA 2
2 2001-01-02 NA NA 2
3 2001-01-03 NA NA 2
4 2001-01-04 NA NA 2
5 2001-01-05 NA NA 2
6 2001-01-06 NA NA 2
7 2001-01-07 NA NA 2
8 2001-01-08 NA NA 2
9 2001-01-09 NA NA 2
10 2001-01-10 NA NA 2
11 2001-01-11 NA NA 2
12 2001-01-12 NA NA 2
13 2001-01-13 NA NA 2
14 2001-01-14 NA NA 2
15 2001-01-15 NA NA 2
16 2001-01-16 NA NA 2
17 2001-01-17 NA NA 2
18 2001-01-18 NA NA 2
19 2001-01-19 2.30 0.699 2
20 2001-01-20 NA NA 2
21 2001-01-21 NA NA 2
22 2001-01-22 NA NA 2
23 2001-01-23 NA NA 2
24 2001-01-24 NA NA 2
25 2001-01-25 2.19 4.970 2
Its expected to return something like this:
datada vec
1 1 117
2 3 243
3 5 402
4 7 442
5 9 275
Generated by the data.frame(datada,vec1)
Unless you want to manipulate environment objects, the easiest thing to do is to declare your variable outside the function and use <<- form of assignment inside the function.
datada <- NULL
...
complete <- function(directory,pollutant = "sulfate", id = 1:332) {
datada <<- id
...
}
I have no idea why datada is not found - when I tried a simplified version of the function on my system it seems to work fine.
As to the else -- an else must come directly after the end of the if's statement. It's not expected because you placed data.frame(datada,vec1) before it. If you put that line into the {}, everything should be fine.
But generally speaking your code is unnecessarily complex, plus it doesn't actually return anything.
Try something like this:
complete <- function(directory,pollutant = "sulfate", id = 1:332) {
datada <- id
filelist <- list.files(path = directory, pattern = ".csv", full.names = TRUE)
if (!(pollutant) %in% c("sulfate","nitrate")) stop("Unknown pollutant")
lapply(filelist, function(x) {
data<-read.csv(x)
colMeans(data[,pollutant],na.rm=TRUE)
})
}
This will output a list where each element is the vector of colMeans of each of the files. You could replace lapply with sapply which will (probably) give you a matrix instead of a list.
(note I couldn't test it because I don't have the dataset, so there may be some errors here)
I have been trying to write a while command to stop the looping function when one value generated by the loop exceeds the other. However, I have failed to figure out the proper way to do it.
The for loop runs for 30 days, but I want it to stop as soon as the last value of parasite_l.A is less than than parasite_l.B.
I have included the working code I have for generating the data and the for loop.
Alternative solutions without a limit on the loop would also be greatly appreciated.
# Subject A, initially 400 parasites, growing by 10 %
subA = 400
infA = 1.1
# Subject B, initially 120 parasites, growing by 20 %
subB = 120
infB = 1.2
# How many days to model
days = 30
days_seq = seq(1, days, 1)
# Parasite load for A
parasite_l.A = rep(NA, days)
parasite_l.A[1] = subA
# Parasite load for B
parasite_l.B = rep(NA, days)
parasite_l.B[1] = subB
# Loop for subject A and B
for(i in 1:(days)){
parasite_l.A[i+1] = parasite_l.A[i]*(infA)
parasite_l.B[i+1] = parasite_l.B[i]*(infB)
}
parasite_l.A
parasite_l.B
There is a built-in control flow function for what you are referring to named while. As long as the conditions are met it will continue to loop.
i <- 1
while (parasite_l.A[i] > parasite_l.B[i]) {
parasite_l.A[i+1] = parasite_l.A[i]*(infA)
parasite_l.B[i+1] = parasite_l.B[i]*(infB)
i <- i + 1
}
# parasite_l.A
# [1] 400.0000 440.0000 484.0000 532.4000 585.6400 644.2040 708.6244
# [8] 779.4868 857.4355 943.1791 1037.4970 1141.2467 1255.3714 1380.9085
# [15] 1518.9993 NA NA NA NA NA NA
# [22] NA NA NA NA NA NA NA
# [29] NA NA
# parasite_l.B
# [1] 120.0000 144.0000 172.8000 207.3600 248.8320 298.5984 358.3181
# [8] 429.9817 515.9780 619.1736 743.0084 891.6100 1069.9321 1283.9185
# [15] 1540.7022 NA NA NA NA NA NA
# [22] NA NA NA NA NA NA NA
# [29] NA NA
Use an index value (i), a couple of counters (A.index.value, B.index.value), and a while loop:
# Subject A, initially 400 parasites, growing by 10 %
subA <- A.index.value <- 400
infA <- 1.1
# Subject B, initially 120 parasites, growing by 20 %
subB <- B.index.value <- 120
infB <- 1.2
# How many days to model
days <- 30
days_seq <- seq(1, days, 1)
# Parasite load for A
parasite_l.A <- rep(NA, days)
parasite_l.A[1] <- subA
# Parasite load for B
parasite_l.B <- rep(NA, days)
parasite_l.B[1] <- subB
# While Loop for subject A and B
i <- 1
while (A.index.value > B.index.value) {
parasite_l.A[i+1] <- A.index.value <- parasite_l.A[i]*(infA)
parasite_l.B[i+1] <- B.index.value <- parasite_l.B[i]*(infB)
i <- i + 1
}
parasite_l.A
parasite_l.B
With the results being:
> parasite_l.A
[1] 400.00 440.00 484.00 532.40 585.64 644.20 708.62 779.49 857.44 943.18 1037.50
[12] 1141.25 1255.37 1380.91 1519.00 NA NA NA NA NA NA NA
[23] NA NA NA NA NA NA NA NA
> parasite_l.B
[1] 120.00 144.00 172.80 207.36 248.83 298.60 358.32 429.98 515.98 619.17 743.01
[12] 891.61 1069.93 1283.92 1540.70 NA NA NA NA NA NA NA
[23] NA NA NA NA NA NA NA NA
>
if (parasite_l.A < parasite_l.B) { // if parasite a is less than b, do the following //
for(i in 1:(days)){
parasite_l.A[i+1] = parasite_l.A[i]*(infA)
parasite_l.B[i+1] = parasite_l.B[i]*(infB)
}
}
Use inside the loop something like:
if (parasite_l.A > parasite_l.B) {
break
}