Related
I am trying to create a vector with the sum of the forecasts created but the vector is only storing the last value, how to create a vector with each sum?
library(forecast)
setwd("C:\\Users\\Note\\Documents\\Ssl")
n <- dir(pattern = ".csv")
forecastOut <- lapply(n, function(x) {
dat <- read.csv(x, sep=";", header = TRUE)
predic <- auto.arima(dat[,1])
forecast(predic, h = 4)
})
names(forecastOut) <- n
for (i in forecastOut){
sum<-c(sum(unlist((i[4]))))
}
>sum
[1] 2944.32
> class(forecastOut)
[1] "list"
We can create sum initialized as a vector
Sum <- numeric(length(forecatsOut))
for(i in seq_along(forecastOut)) {
Sum[i] <- sum(forecastOut[[4]], na.rm = TRUE)
}
Or append the output with c()
Sum <- c()
for(out in forecastOut) {
Sum <- c(Sum, sum(out[[4]]))
}
In the OP's loop, the sum is getting updated in each iteration and returns the last sum value
I want to create two lists of data frames in a for loop, but I cannot use assign:
dat <- data.frame(name = c(rep("a", 10), rep("b", 13)),
x = c(1,3,4,4,5,3,7,6,5,7,8,6,4,3,9,1,2,3,5,4,6,3,1),
y = c(1.1,3.2,4.3,4.1,5.5,3.7,7.2,6.2,5.9,7.3,8.6,6.3,4.2,3.6,9.7,1.1,2.3,3.2,5.7,4.8,6.5,3.3,1.2))
a <- dat[dat$name == "a",]
b <- dat[dat$name == "b",]
samp <- vector(mode = "list", length = 100)
h <- list(a,b)
hname <- c("a", "b")
for (j in 1:length(h)) {
for (i in 1:100) {
samp[[i]] <- sample(1:nrow(h[[j]]), nrow(h[[j]])*0.5)
assign(paste("samp", hname[j], sep="_"), samp[[i]])
}
}
Instead of lists named samp_a and samp_b I get vectors which contain the result of the 100th sample. I want to get a list samp_a and samp_b, which have all the different samples for dat[dat$name == a,] and dat[dat$name == a,].
How could I do this?
How about creating two different lists and avoiding using assign:
Option 1:
# create empty list
samp_a <-list()
samp_b <- list()
for (j in seq(h)) {
# fill samp_a list
if(j == 1){
for (i in 1:100) {
samp_a[[i]] <- sample(1:nrow(h[[j]]), nrow(h[[j]])*0.5)
}
# fill samp_b list
} else if(j == 2){
for (i in 1:100) {
samp_b[[i]] <- sample(1:nrow(h[[j]]), nrow(h[[j]])*0.5)
}
}
}
You could use assign too, shorter answer:
Option 2:
for (j in seq(hname)) {
l = list()
for (i in 1:100) {
l[[i]] <- sample(1:nrow(h[[j]]), nrow(h[[j]])*0.5)
}
assign(paste0('samp_', hname[j]), l)
rm(l)
}
You could easily use an lapply for this using the rep function. Unless you want a random x, paired with a random y. This will maintain the existing paired order.
dat <- data.frame(name = c(rep("a", 10), rep("b", 13)),
x = c(1,3,4,4,5,3,7,6,5,7,8,6,4,3,9,1,2,3,5,4,6,3,1),
y = c(1.1,3.2,4.3,4.1,5.5,3.7,7.2,6.2,5.9,7.3,8.6,6.3,4.2,3.6,9.7,1.1,2.3,3.2,5.7,4.8,6.5,3.3,1.2))
a <- dat[dat$name == "a",]
b <- dat[dat$name == "b",]
h <- list(a,b)
hname <- c("a", "b")
testfunc <- function(df) {
#df[sample(nrow(df), nrow(df)*0.5), ] #gives you the values in your data frame
sample(nrow(df), nrow(df)*0.5) # just gives you the indices
}
lapply(h, testfunc) # This gives you the standard lapply format, and only gives one a, and one b
samp <- lapply(rep(h, 100), testfunc) # This shows you how to replicate the function n times, giving you 100 a and 100 b data.frames in a list
samp_a <- samp[c(TRUE, FALSE)] # Applies a repeating T/F vector, selecting the odd data.frames, which in this case are the `a` frames.
samp_b <- samp[c(FALSE, TRUE)] # And here, the even data.frames, which are the `b` frames.
I have a large list that stored measurements (a product of other lapply() runs). I now want to gather these measurements and calculate median/mean/sd etc but I don't know how to access them. The structure of this list is like this:
foo[[i]][[j]][[k]][[1]]
foo[[i]][[j]][[k]][[2]]$bar
I can't figure out a function that would return e.g. mean of $bar (but not of $x) and keep relation the values of the indices i,j,k.
A sample list can be generated with the following R code:
library(purrr)
metrics <- function(y){
tt10r <- median(y)
list(y, flatten(list(bar = tt10r)))
}
example_list <- list()
for (i in 1:10)
{
v <- list()
for (j in 1:10)
{
w <- 1:10
v[j] <- list(w)
}
example_list[[i]] <- v
}
foo <- list()
for (i in 1:length(example_list))
{
u <- list()
values <- list()
for (j in 1:length(example_list[[i]]))
{
u[[j]] <- lapply(example_list[[i]][[j]], function(x) mean(x))
values[[j]] <- lapply(u[[j]], function(x) metrics(x))
}
foo[[i]] <- values
}
The following code works nicely, but I am not sure if it is efficient (loops!). Gives the anticipated result:
final <- matrix(nrow = tail(cumsum(unlist(lapply(foo, function(x) lengths(x) -2))), n=1), ncol = 3)
final <- data.frame(final)
j=1
i=1
all_js <- c(0, cumsum(lengths(foo)))
starts <- c(0, cumsum(unlist(lapply(foo, function(x) lengths(x) -2)))) + 1
ends <- c(0, cumsum(unlist(lapply(foo, function(x) lengths(x) -2))))
for (i in 1:length(foo))
{
a <- foo[[i]]
for (j in 1:length(a))
{
b <- a[[j]]
data <- unlist(lapply(lapply(b[1], '[', 2), '[[', 1))
for (k in 2:c(length(b)-2))
{
data <- rbind(data,unlist(lapply(lapply(b[k], '[', 2), '[[', 1)))
}
row.names(data) <- NULL
colnames(final) <- c("i", "j", colnames(data))
first <- starts[all_js[i] + j]
last <- ends[all_js[i] + j+1]
final[first:last,] <- data.frame(cbind(i = i, j = j, data))
}
}
I was reading a book and I came across this function in R. This function basically finds out patterns in the input string having a minimum threshold of 3.
vec <- "da0abcab0abcaab0d0"
find_rep_path <- function(vec, reps) {
regexp <- paste0(c("(.+)", rep("\\1", reps - 1L)), collapse = "")
match <- regmatches(vec, regexpr(regexp, vec, perl = TRUE))
substr(match, 1, nchar(match) / reps)
}
vals <- unique(strsplit(vec, "")[[1]])
str <- NULL
for (i in seq.int(nchar(vec))) {
x <- vec
for (v in vals) {
substr(x, i, i) <- v
tmp <- find_rep_path(x, 3)
if (length(tmp) > 0)
str <- c(str, tmp)
}
}
nc <- nchar(str)
unique(str[which(nc == max(nc))])
Now, I wish to convert this function into the form like,
function("da0abcab0abcaab0d0"). This means, that I can easily pass a string to the function directly and not hardcode it in the original function. How can I modify this?
I know this is a beginner question but I am completely at sea right now as far as R is concerned. Please help!
I don't see how it's hardcoded. But you can just wrap your code into a function if that's what you mean?
# Function 1
find_rep_path <- function(vec, reps) {
regexp <- paste0(c("(.+)", rep("\\1", reps - 1L)), collapse = "")
match <- regmatches(vec, regexpr(regexp, vec, perl = TRUE))
substr(match, 1, nchar(match) / reps)
}
# Function 2
foo <- function(vec) {
vals <- unique(strsplit(vec, "")[[1]])
str <- NULL
for (i in seq.int(nchar(vec))) {
x <- vec
for (v in vals) {
substr(x, i, i) <- v
tmp <- find_rep_path(x, 3)
if (length(tmp) > 0)
str <- c(str, tmp)
}
}
nc <- nchar(str)
return(unique(str[which(nc == max(nc))]))
}
vec <- "da0abcab0abcaab0d0"
foo(vec)
#[1] "0ab" "abc"
Edit1
To get the place of the matches you can use gregexr:
a <- foo(vec)
gregexpr(a[1], vec)
#[[1]]
#[1] 3 9
#attr(,"match.length")
#[1] 3 3
#attr(,"useBytes")
#[1] TRUE
This tells you that a[1] ("0ab") was matched in vec at positions 3 and 9. Run ?gregexpr for more informations.
Edit2
To add this information to each match, we can do something like
bar <- function(vec) {
m <- foo(vec)
ans <- sapply(m, gregexpr, vec, fixed = TRUE)
ans <- lapply(ans, function(x) {attributes(x) <- NULL; x})
return(ans)
}
bar(vec)
#$`0ab`
#[1] 3 9
#
#$abc
#[1] 4 10
I am wondering whether a proper framework for interval manipulation and comparison does exist in R.
After some search, I was only able to find the following:
- function findInterval in base Package. (but I hardly understand it)
- some answers here and there about union and intersection (notably: http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html)
Would you know of an initiative to implement a comprehensive set of tools to easily handles frequent tasks in interval manipulation, like inclusion/setdiff/union/intersection/etc. (eg see here for a list of functionalities)?
or would you have advice in developing such an approach?
below are some drafts on my side for doing so. it is surely awkward and still has some bugs but it might illustrate what I am looking for.
preliminary aspects about the options taken
- should deal seamlessly with intervals or intervals set
- intervals are represented as 2 columns data.frames (lower boundary, higher boundary), on one row
- intervals sets are represented as 2 columns with several rows
- a third column might be needed for identification of intervals sets
UNION
interval_union <- function(df){ # for data frame
df <- interval_clean(df)
if(is.empty(df)){
return(as.data.frame(NULL))
} else {
if(is.POSIXct(df[,1])) {
dated <- TRUE
df <- colwise(as.numeric)(df)
} else {
dated <- FALSE
}
M <- as.matrix(df)
o <- order(c(M[, 1], M[, 2]))
n <- cumsum( rep(c(1, -1), each=nrow(M))[o])
startPos <- c(TRUE, n[-1]==1 & n[-length(n)]==0)
endPos <- c(FALSE, n[-1]==0 & n[-length(n)]==1)
M <- M[o]
if(dated == TRUE) {
df2 <- colwise(mkDateTime)(as.data.frame(cbind(M[startPos], M[endPos])), from.s = TRUE)
} else {
df2 <- as.data.frame(cbind(M[startPos], M[endPos]))
}
colnames(df2) <- colnames(df)
# print(df2)
return(df2)
}
}
union_1_1 <- function(test, ref){
names(ref) <- names(test)
tmp <- interval_union(as.data.frame(rbind(test, ref)))
return(tmp)
}
union_1_n <- function(test, ref){
return(union_1_1(test, ref))
}
union_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, union_1_n, ref, .expand = FALSE)
return(testnn)
}
ref_interval_union <- function(df, ref){
tmp0 <- adply(df, 1, union_1_1, ref, .expand = FALSE) # set to FALSE to keep ID
return(tmp0)
}
INTERSECTION
interval_intersect <- function(df){
# adapted from : http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html
M <- as.matrix(df)
L <- max(M[, 1])
R <- min(M[, 2])
Inew <- if (L <= R) c(L, R) else c()
if (!is.empty(Inew)){
df2 <- t(as.data.frame(Inew))
colnames(df2) <- colnames(df)
rownames(df2) <- NULL
} else {
df2 <- NULL
}
return(as.data.frame(df2))
}
ref_interval_intersect <- function(df, ref){
tmpfun <- function(a, b){
names(b) <- names(a)
tmp <- interval_intersect(as.data.frame(rbind(a, b)))
return(tmp)
}
tmp0 <- adply(df, 1, tmpfun, ref, .expand = FALSE) # [,3:4]
#if(!is.empty(tmp0)) colnames(tmp0) <- colnames(df)
return(tmp0)
}
int_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
tmp0 <- c(max(te[1, 1], re[1, 1]), min(te[1, 2], re[1, 2]))
if(tmp0[1]>tmp0[2]) tmp0 <- NULL # inverse of a correct interval --> VOID
if(!is.empty(tmp0)){
tmp1 <- colwise(mkDateTime)(as.data.frame(t(as.data.frame(tmp0))))
colnames(tmp1) <- colnames(test)
} else {
tmp1 <- data.frame(NULL)
}
return(tmp1)
}
int_1_n <- function(test, ref){
test1 <- adply(.data = ref, 1, int_1_1, test = test, .expand = FALSE)
if(is.empty(test1)){
return(data.frame(NULL))
} else {
testn <- interval_union(test1[,2:3])
return(testn)
}
}
int_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, int_1_n, ref, .expand = FALSE)
# return(testnn[,2:3]) # return interval set without index (1st column)
return(testnn) # return interval set with index (1st column) --> usefull to go with merge to keep metadata going alon g with interval description
}
int_intersect <- function(df, ref){
mycols <- colnames(df)
df$X1 <- 1:nrow(df)
test <- df[, 1:2]
tmp <- int_n_n(test, ref)
intersection <- merge(tmp, df, by = "X1", suffixes = c("", "init"))
return(intersection[,mycols])
}
EXCLUSION
excl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
if(te[1] < re[1]){ # Lower Bound
if(te[2] > re[1]){ # overlap
x <- unlist(c(te[1], re[1]))
} else { # no overlap
x <- unlist(c(te[1], te[2]))
}
} else { # test > ref on lower bound side
x <- NULL
}
if(te[2] > re[2]){ # Upper Bound
if(te[1] < re[2]){ # overlap
y <- unlist(c(re[2], te[2]))
} else { # no overlap
y <- unlist(c(te[1], te[2]))
}
} else { # test < ref on upper bound side
y <- NULL
}
if(is.empty(x) & is.empty(y)){
tmp0 <- NULL
tmp1 <- tmp0
} else {
tmp0 <- as.data.frame(rbind(x, y))
colnames(tmp0) <- colnames(test)
tmp1 <- interval_union(tmp0)
}
return(tmp1)
}
excl_1_n <- function(test, ref){
testn0 <- adply(.data = ref, 1, excl_1_1, test = test, .expand=FALSE)
# boucle pour intersecter successivement les intervalles sets, pour gérer les intervalles disjoints (identifiés par X1, col1)
tmp <- range(testn0)
names(tmp) <- colnames(testn0)[2:3]
tmp <- as.data.frame(t(tmp))
for(i in unique(testn0[,1])){
tmp <- int_n_n(tmp, testn0[testn0[,1]==i, 2:3])
}
return(tmp)
}
INCLUSION
incl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
if(te[1] >= re[1] & te[2] <= re[2]){ return(TRUE) } else { return(FALSE) }
}
incl_1_n <- function(test, ref){
testn <- adply(.data = ref, 1, incl_1_1, test = test)
return(any(testn[,ncol(testn)]))
}
incl_n_n <- function(test, ref){
testnn <- aaply(.data = test, 1, incl_1_n, ref, .expand = FALSE)
names(testnn) <- NULL
return(testnn)
}
flat_incl_n_n <- function(test, ref){
ref <- interval_union(ref)
return(incl_n_n(test, ref))
}
# testing for a vector, instead of an interval set
incl_x_1 <- function(x, ref){
test <- (x>=ref[1,1] & x<ref[1,2])
return(test)
}
incl_x_n <- function(x, ref){
test <- any(x>=ref[,1] & x<ref[,2])
return(test)
}
I think you might be able to make good use of the many interval-related functions in the sets package.
Here's a small example illustrating the package's support for interval construction, intersection, set difference, union, and complementation, as well as its test for inclusion in an interval. These and many other related functions are documented on the help page for ?interval.
library(sets)
i1 <- interval(1,6)
i2 <- interval(5,10)
i3 <- interval(200,400)
i4 <- interval(202,402)
i5 <- interval_union(interval_intersection(i1,i2),
interval_symdiff(i3,i4))
i5
# [5, 6] U [200, 202) U (400, 402]
interval_complement(i5)
# [-Inf, 5) U (6, 200) U [202, 400] U (402, Inf]
interval_contains_element(i5, 5.5)
# [1] TRUE
interval_contains_element(i5, 201)
# [1] TRUE
If your intervals are currently encoded in a two-column data.frame, you could use something like mapply() to convert them to intervals of the type used by the sets package:
df <- data.frame(lBound = c(1,5,100), uBound = c(10, 6, 200))
Ints <- with(df, mapply("interval", l=lBound, r=uBound, SIMPLIFY=FALSE))
Ints
# [[1]]
# [1, 10]
# [[2]]
# [5, 6]
# [[3]]
# [100, 200]