xts merge memory performance - r

I am trying to improve the memory performance of the following example.
Baseline df with 4 rows:
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L), .Label =
c("AAA1", "AAA2","AAA3", "AAA4"), class = "factor"), bitrateinbps = c(10000000,
10000000, 10000000, 10000000), startdate = structure(c(1326758507, 1326758671,
1326759569, 1326760589), class = c("POSIXct", "POSIXt"), tzone = ""), enddate =
structure(c(1326765780, 1326758734, 1326760629, 1326761592), class = c("POSIXct",
"POSIXt"), tzone = "")), .Names = c("sessionid", "bitrateinbps", "startdate",
"enddate"), row.names = c(NA, 4L), class =
"data.frame")
Alternate df with 8 rows:
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L),
.Label = c("AAA1", "AAA2", "AAA3", "AAA4", "AAA5", "AAA6", "AAA7", "AAA8"),
class = "factor"), bitrateinbps =c(10000000, 10000000, 10000000, 10000000,
10000000, 10000000, 10000000, 10000000), startdate = structure(c(1326758507,
1326758671, 1326759569, 1326760589, 1326761589, 1326762589, 1326763589, 1326764589),
class = c("POSIXct",
"POSIXt"), tzone = ""), enddate = structure(c(1326765780, 1326758734, 1326760629,
1326761592, 1326767592,
1326768592, 1326768700, 1326769592), class = c("POSIXct", "POSIXt"), tzone = "")),
.Names = c("sessionid",
"bitrateinbps", "startdate", "enddate"), row.names = c(NA, 8L), class =
"data.frame")
Run the analysis and check memory usage, first for the baseline df and again for the alternate df:
library(xts)
fun0 <- function(i, d) {
  idx0 <- seq(d$startdate[i], d$enddate[i], 1)  # second-by-second index for this session
  dat0 <- rep(1, length(idx0))                  # data over the sequence
  xts(dat0, idx0, dimnames = list(NULL, d$sessionid[i]))  # one xts object per session
}
# loop over each row and put each row into its own xts object
xl0 <- lapply(1:NROW(df), fun0, d = df)
# merge all the xts objects
xx0 <- do.call(merge, xl0)
# sum each column over each 15-minute period, then divide by 900 s for the mean
xa0 <- period.apply(xx0, endpoints(xx0, 'minutes', 15), colSums, na.rm = TRUE) / 900
xa1 <- t(xa0)
# convert from matrix to data frame
xa1 <- as.data.frame(xa1)
# bind to df
out1 <- cbind(df, xa1)
# print aggregate memory usage statistics (memory.size() and memory.limit() are Windows-only)
print(paste('R is using', memory.size(), 'MB out of limit', memory.limit(), 'MB'))
# create a function that returns a sorted vector of per-object memory consumption
object.sizes <- function() {
  rev(sort(sapply(ls(envir = .GlobalEnv), function(object.name)
    object.size(get(object.name)))))
}
# print the per-object sizes to the console
object.sizes()
Results are as follows:
4-row df:
xx0 = 292104 bytes .... do.call(merge, xl0)
xl0 = 154648 bytes .... lapply(1:NROW(df), fun0, d=df)
8-row df:
xx0 = 799480 bytes .... do.call(merge, xl0)
xl0 = 512808 bytes .... lapply(1:NROW(df), fun0, d=df)
I'm looking for something more memory-efficient than the lapply/merge approach above so that I can scale out the number of rows. Suggestions, ideally with comparative memory results for the alternatives, would be appreciated.
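One direction that may be worth sketching (an untested assumption on my part, not a measured drop-in replacement): the second-by-second expansion is what dominates memory here, since every session becomes one value per second before the merge. The per-15-minute means can instead be computed arithmetically from each session's overlap with the bins:

# a minimal sketch: compute each session's fractional overlap with every
# 900-second bin it touches, without materialising per-second data
bin_overlaps <- function(start, end, bin = 900) {
  s <- as.numeric(start); e <- as.numeric(end)
  bins <- seq(floor(s / bin) * bin, ceiling(e / bin) * bin - bin, by = bin)
  ov <- pmin(e, bins + bin) - pmax(s, bins)  # seconds of overlap per bin
  data.frame(bin_start = as.POSIXct(bins, origin = "1970-01-01"),
             fraction  = ov / bin)
}
# one small data frame per session instead of one merged xts column per session
xl1 <- lapply(1:NROW(df), function(i)
  cbind(sessionid = df$sessionid[i],
        bin_overlaps(df$startdate[i], df$enddate[i])))
out2 <- do.call(rbind, xl1)

This yields long-format output (one row per session/bin pair), so memory grows with the number of bins touched rather than with session duration in seconds. bin_overlaps and out2 are hypothetical names, and the counts differ from the xts version by the one extra endpoint second that seq(start, end, 1) includes.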

Related

time average for specific time range in r

I am trying to extract the average values of all variables between 0 and 40 minutes of every hour.
dput(head(df))
structure(list(DateTime = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), date = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), Date = structure(c(18095, 18095, 18095,
18095, 18095, 18095), class = "Date"), TimeCtr = structure(c(1563467460,
1563468060, 1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), MassConc = c(0.397627, 0.539531, 0.571902,
0.608715, 0.670382, 0.835773), VolConc = c(175.038, 160.534,
174.386, 183.004, 191.074, 174.468), NumbConc = c(234.456, 326.186,
335.653, 348.996, 376.018, 488.279), MassD = c(101.426, 102.462,
101.645, 102.145, 101.255, 101.433)), .Names = c("DateTime",
"date", "Date", "TimeCtr", "MassConc", "VolConc", "NumbConc",
"MassD"), row.names = c(NA, 6L), class = "data.frame")
What I've tried so far:
hourly_mean <- df %>%
  filter(between(as.numeric(format(DateTime, "%M")), 0, 40)) %>%
  group_by(DateTime = format(DateTime, "%Y-%m-%d %H")) %>%
  summarise(variable1_mean = mean(variable1))
But it gives me a single average value for the whole period. Any help is very much welcomed.
We can parse DateTime, use ceiling_date with the "hour" unit to round each DateTime up to the next hour, extract the minutes from DateTime and filter to keep rows whose minutes are 40 or less, then group_by the hour and take the mean of the values.
library(lubridate)
library(dplyr)
df %>%
  dplyr::mutate(DateTime = ymd_hm(DateTime),
                hour = ceiling_date(DateTime, "hour"),
                minutes = minute(DateTime)) %>%
  filter(minutes <= 40) %>%
  group_by(hour) %>%
  summarise_at(vars(ends_with("Conc")), mean)
data
df <- structure(list(DateTime = structure(1:7, .Label = c("2019-08-0810:07",
"2019-08-0810:17", "2019-08-0810:27", "2019-08-0810:37", "2019-08-0810:47",
"2019-08-0810:57", "2019-08-0811:07"), class = "factor"), MassConc = c(0.556398,
1.06868, 0.777654, 0.87289, 0.789704, 0.51948, 0.416676), NumbConc = c(588.069,
984.018, 964.634, 997.678, 1013.52, 924.271, 916.357), VolConc = c(582.887,
979.685, 963.3, 994.178, 1009.52, 922.104, 916.856), Conc = c(281.665,
486.176, 420.058, 422.101, 429.841, 346.539, 330.282)), class =
"data.frame", row.names = c(NA, -7L))

Change data to numeric type to determine which distribution fits better

I am trying to figure out which distribution fits best logarithmic stock returns. Here is my code:
library(TTR)
sign <- "^GSPC"
start <- 19900101
end <- 20160101
x <- getYahooData(sign, start = start, end = end, freq = "daily")
x$logret <- log(x$Close) - lag(log(x$Close))
x <- x[, 6]
I want to use the function descdist(x, discrete = FALSE), which I got from this amazing post: https://stats.stackexchange.com/questions/132652/how-to-determine-which-distribution-fits-my-data-best. Nonetheless, R gives me this error:
Error in descdist(x, discrete = FALSE) : data must be a numeric vector
How do I transform my data to a numeric vector?
The output from dput(head(x)) is:
structure(c(NA, -0.00258888580664607, -0.00865029791190164, -0.00980414107803274,
0.00450431207515223, -0.011856706127011), class = c("xts", "zoo"
), .indexCLASS = "Date", .indexTZ = "UTC", tclass = "Date", tzone = "UTC", index = structure(c(631238400,
631324800, 631411200, 631497600, 631756800, 631843200), tzone = "UTC", tclass = "Date"), .Dim = c(6L,
1L), .Dimnames = list(NULL, "logret"))
Pre-process x using as.numeric(na.omit(x)): na.omit() drops the leading NA produced by the lag, and as.numeric() strips the xts class so that descdist() receives a plain numeric vector. Or simply run
descdist(as.numeric(na.omit(x)), discrete = FALSE)
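For completeness, a minimal end-to-end sketch (assuming descdist() comes from the fitdistrplus package, as in the linked post):

library(fitdistrplus)

ret <- as.numeric(na.omit(x))    # plain numeric vector of log returns
descdist(ret, discrete = FALSE)  # Cullen-Frey plot of skewness vs kurtosis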

optimisation of a condition for loop in r

I have 2 datasets, one of which contains measurements of temperature at 30-minute intervals:
ordered.temp<-structure(list(time = structure(c(1385244720, 1385246520, 1385248320,
1385250120, 1385251920, 1385253720, 1385255520, 1385257320, 1385259120,
1385260920), class = c("POSIXct", "POSIXt"), tzone = ""), temp = c(30.419,
29.34, 28.965, 28.866, 28.891, 28.866, 28.692, 28.419, 28.122,
27.85), hoboID = c(2392890L, 2392890L, 2392890L, 2392890L, 2392890L,
2392890L, 2392890L, 2392890L, 2392890L, 2392890L)), .Names = c("time",
"temp", "hoboID"), row.names = c(NA, 10L), class = "data.frame")
The other I created to be able to assign each measurement to a 3-hour bin:
df<-structure(list(start = structure(c(1385182800, 1385193600, 1385204400,
1385215200, 1385226000, 1385236800, 1385247600, 1385258400, 1385269200,
1385280000), class = c("POSIXct", "POSIXt"), tzone = ""), end = structure(c(1385193600,
1385204400, 1385215200, 1385226000, 1385236800, 1385247600, 1385258400,
1385269200, 1385280000, 1385290800), class = c("POSIXct", "POSIXt"
), tzone = ""), b = 1:10), .Names = c("start", "end", "b"), row.names = c(NA,
10L), class = "data.frame")
For simplicity, I created a subset of the data, but in reality the temperature data frame (ordered.temp) is 460k rows long and grows bigger every year. I wrote a for loop that compares each line in ordered.temp with the lines in df and assigns it the corresponding b value from df.
m <- length(ordered.temp$time)
b <- numeric(m)
n <- length(df$start)
for (i in 1:m) {
  for (j in 1:n) {
    if (df$start[j] < ordered.temp$time[i] & ordered.temp$time[i] <= df$end[j]) {
      b[i] <- df$b[j]
      print(i / dim(ordered.temp)[1] * 100)
    }
  }
}
Running this loop over 460k rows takes a very long time (I ran it for 1 minute and extrapolated that it would take roughly 277 hours to complete). It is therefore imperative to speed this loop up, or to find an alternative method if that is not possible. I have no idea how to achieve the desired result, though; any help would be greatly appreciated. Thanks.
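One common vectorised replacement for this kind of double loop (a sketch, assuming the bins in df are contiguous and sorted, as they are in the example): findInterval() places every timestamp into its bin in a single pass.

# the loop's condition is start[j] < time <= end[j]; with contiguous bins,
# end[j] equals start[j+1], so one call against the bin starts suffices
idx <- findInterval(as.numeric(ordered.temp$time),
                    as.numeric(df$start), left.open = TRUE)
ordered.temp$b <- df$b[idx]
# note: timestamps before the first start or after the last end would need
# an extra check; the example data falls entirely inside the bins

left.open = TRUE (available since R 3.3.0) makes the intervals open on the left, matching the strict < on df$start.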

BioConductor IRanges coverage counts and identify segments

I have a dataset with interval information for a bunch of manufacturing circuits:
df <- data.frame(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L,
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200,
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000,
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""),
end = structure(c(1393626600, 1393631519, 1393639200, 1393632000,
1393660500, 1393673400, 1393667999, 1393671600, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001",
"1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009"
), class = "factor")), .Names = c("circuit", "start", "end",
"id"), class = "data.frame", row.names = c(NA, -9L)))
circuit: identifier for the circuit
start: time the circuit started running
end: time the circuit stopped running
id: unique identifier for the row
I'm able to create a new data set that counts the number of overlapping intervals:
ir <- IRanges(start = as.numeric(df$start), end = as.numeric(df$end), names = df$id)
cov <- coverage(ir)
start_time <- as.POSIXlt(start(cov), origin = "1970-01-01")
end_time <- as.POSIXlt(end(cov), origin = "1970-01-01")
seconds <- runLength(cov)
circuits_running <- runValue(cov)
res <- data.frame(start_time,end_time,seconds,circuits_running)[-1,]
But what I really need is something that looks more like this:
sqldf("select
res.start_time,
res.end_time,
res.seconds,
res.circuits_running,
df.circuit,
df.id
from res left join df on (res.start_time between df.start and df.end)")
The problem is that the sqldf way of using an inequality join is unbearably slow on my full dataset.
How can I get something similar using IRanges alone?
I suspect it has something to do with RangedData but I've not been able to see how to get what I want. Here's what I've tried...
rd <- RangedData(ir, circuit = df$circuit, id = df$id)
coverage(rd) # works but seems to lose the circuit/id info
The coverage can be represented as ranges, dropping the first (the range from 1970 to the first start point)
cov <- coverage(ir)
intervals <- ranges(cov)[-1]
Your query is to find the start of the interval for each circuit, so I narrow the intervals to their start coordinates and find overlaps (the first argument is the 'query', the second the 'subject'):
olaps <- findOverlaps(narrow(intervals, width(intervals)), ir)
The number of circuits running in a particular interval is
tabulate(queryHits(olaps), queryLength(olaps))
and the actual circuits are
df[subjectHits(olaps), c("circuit", "id")]
The pieces can be knit together as, perhaps,
df1 <- cbind(uid = seq_along(intervals),
             as.data.frame(intervals),
             circuits_running = tabulate(queryHits(olaps), queryLength(olaps)))
df2 <- cbind(uid = queryHits(olaps),
             df[subjectHits(olaps), c("circuit", "id")])
merge(df1, df2, by = "uid", all = TRUE)
Ranges can have associated with them 'metadata' that is accessible and subset in a coordinated way, so the connection between data.frame and ranges does not have to be so loose and ad hoc. I might instead have
ir <- IRanges(start = as.numeric(df$start), end = as.numeric(df$end))
mcols(ir) <- DataFrame(df)
## ...
mcols(ir[subjectHits(olaps)])
perhaps with as.data.frame() when done with IRanges-land.
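Putting the metadata pieces together, a minimal sketch of that version (same objects as above, assuming a recent IRanges where mcols() and DataFrame() are available):

library(IRanges)

ir <- IRanges(start = as.numeric(df$start), end = as.numeric(df$end))
mcols(ir) <- DataFrame(df[, c("circuit", "id")])

cov <- coverage(ir)
intervals <- ranges(cov)[-1]   # drop the range before the first start point
olaps <- findOverlaps(narrow(intervals, width(intervals)), ir)

# circuits (with their metadata) running during each coverage interval
as.data.frame(mcols(ir[subjectHits(olaps)]))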
It's better to ask your questions about IRanges on the Bioconductor mailing list; no subscription required.

How to remove a row from zoo/xts object, given a timestamp

I was happily running with this code:
z <- lapply(filename_list, function(fname) {
  read.zoo(file = fname, header = TRUE, sep = ",", tz = "")
})
xts(do.call(rbind, z))
until Dirty Data came along with this at the end of one file:
Open High Low Close Volume
2011-09-20 21:00:00 1.370105 1.370105 1.370105 1.370105 1
and this at the start of the next file:
Open High Low Close Volume
2011-09-20 21:00:00 1.370105 1.371045 1.369685 1.3702 2230
So rbind.zoo complains about a duplicate.
I can't use something like:
y <- x[ ! duplicated( index(x) ), ]
as they are in different zoo objects, inside a list. And I cannot use aggregate, as suggested here, because they are a list of zoo objects, not one big zoo object. And I can't get one big object because of the duplicates. Catch-22.
So, when the going gets tough, the tough hack together some for loops (excuse the prints and a stop, as this isn't working code yet):
indexes <- do.call("c", unname(lapply(z, index)))
dups <- duplicated(indexes)
if (any(dups)) {
  duplicate_timestamps <- indexes[dups]
  for (tix in 1:length(duplicate_timestamps)) {
    t <- duplicate_timestamps[tix]
    print("We have a duplicate:"); print(t)
    for (zix in 1:length(z)) {
      if (t %in% index(z[[zix]])) {
        print(z[[zix]][t])
        if (z[[zix]][t]$Volume == 1) {
          print("-->Deleting this one")
          z[[zix]][t] <- NULL   #<-- PROBLEM
        }
      }
    }
  }
  stop("There are duplicate bars!!")
}
The bit I'm stumped on is that assigning NULL to a zoo row doesn't delete it (Error in NextMethod("[<-") : replacement has length zero). OK, so I'll do a filter-copy without the offending item... but I'm tripping up on these:
> z[[zix]][!t,]
Error in Ops.POSIXt(t) : unary '!' not defined for "POSIXt" objects
> z[[zix]][-t,]
Error in `-.POSIXt`(t) : unary '-' is not defined for "POSIXt" objects
P.S. While high-level solutions to my real problem of "duplicates rows across a list of zoo objects" are very welcome, the question here is specifically about how to delete a row from a zoo object given a POSIXt index object.
A small bit of test data:
list(structure(c(1.36864, 1.367045, 1.370105, 1.36928, 1.37039,
1.370105, 1.36604, 1.36676, 1.370105, 1.367065, 1.37009, 1.370105,
5498, 3244, 1), .Dim = c(3L, 5L), .Dimnames = list(NULL, c("Open",
"High", "Low", "Close", "Volume")), index = structure(c(1316512800,
1316516400, 1316520000), class = c("POSIXct", "POSIXt"), tzone = ""), class = "zoo"),
structure(c(1.370105, 1.370115, 1.36913, 1.371045, 1.37023,
1.37075, 1.369685, 1.36847, 1.367885, 1.3702, 1.36917, 1.37061,
2230, 2909, 2782), .Dim = c(3L, 5L), .Dimnames = list(NULL,
c("Open", "High", "Low", "Close", "Volume")), index = structure(c(1316520000,
1316523600, 1316527200), class = c("POSIXct", "POSIXt"), tzone = ""), class = "zoo"))
UPDATE: Thanks to G. Grothendieck for the row-deleting solution. In the actual code I followed the advice of Joshua and GSee to get a list of xts objects instead of a list of zoo objects. So my code became:
z <- lapply(filename_list, function(fname) {
  xts(read.zoo(file = fname, header = TRUE, sep = ",", tz = ""))
})
x <- do.call.rbind(z)
(As an aside, please note the call to do.call.rbind. This is because rbind.xts has some serious memory issues. See https://stackoverflow.com/a/12029366/841830 )
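(do.call.rbind is not in base R; a rough reconstruction of the pairwise idea from that linked answer, hedged as my own sketch rather than the original implementation:)

do.call.rbind <- function(lst) {
  # combine neighbours pairwise on each pass, so no object gets copied
  # once per element the way sequential rbind-ing would
  while (length(lst) > 1) {
    idx <- seq(1, length(lst), by = 2)
    lst <- lapply(idx, function(i) {
      if (i == length(lst)) lst[[i]] else rbind(lst[[i]], lst[[i + 1]])
    })
  }
  lst[[1]]
}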
Then I remove duplicates as a post-process step:
dups <- duplicated(index(x))
if (any(dups)) {
  duplicate_timestamps <- index(x)[dups]
  to_delete <- x[(index(x) %in% duplicate_timestamps) & x$Volume <= 1]
  if (nrow(to_delete) > 0) {
    # Next line keeps all rows that are not in the duplicate_timestamps group,
    # OR are in the duplicate timestamps but have a volume greater than 1.
    print("Will delete the volume=1 entry")
    x <- x[!(index(x) %in% duplicate_timestamps) | x$Volume > 1]
  } else {
    stop("Duplicate timestamps, and we cannot easily remove them just based on low volume.")
  }
}
If z1 and z2 are your zoo objects then to rbind while removing any duplicates in z2:
rbind( z1, z2[ ! time(z2) %in% time(z1) ] )
Regarding deleting points in a zoo object having specified times, the above already illustrates this but in general if tt is a vector of times to delete:
z[ ! time(z) %in% tt ]
or, if we know there is a single element in tt, z[time(z) != tt].
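Applied to the two-element list from the test data above, a quick sketch (the duplicated stamp carries the Volume==1 bar at the end of z[[1]]):

library(zoo)

tt <- index(z[[2]])[1]                # the timestamp both objects share
z[[1]] <- z[[1]][time(z[[1]]) != tt]  # drop the Volume==1 bar from z[[1]]
x <- do.call(rbind, z)                # rbind.zoo now succeeds, no duplicates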
rbind.xts will allow duplicate index values, so it could work if you convert to xts first.
x <- lapply(z, as.xts)
y <- do.call(rbind, x)
# keep the last value of any duplicates
y <- y[!duplicated(index(y), fromLast = TRUE), ]
I think you'll have better luck if you convert to xts first.
a <- structure(c(1.370105, 1.370105, 1.370105, 1.370105, 1), .Dim = c(1L,
5L), index = structure(1316570400, tzone = "", tclass = c("POSIXct",
"POSIXt")), .indexCLASS = c("POSIXct", "POSIXt"), tclass = c("POSIXct",
"POSIXt"), .indexTZ = "", tzone = "", .Dimnames = list(NULL,
c("Open", "High", "Low", "Close", "Volume")), class = c("xts",
"zoo"))
b <- structure(c(1.370105, 1.371045, 1.369685, 1.3702, 2230), .Dim = c(1L,
5L), index = structure(1316570400, tzone = "", tclass = c("POSIXct",
"POSIXt")), .indexCLASS = c("POSIXct", "POSIXt"), tclass = c("POSIXct",
"POSIXt"), .indexTZ = "", tzone = "", .Dimnames = list(NULL,
c("Open", "High", "Low", "Close", "Volume")), class = c("xts",
"zoo"))
(comb <- rbind(a, b))
# Open High Low Close Volume
#2011-09-20 21:00:00 1.370105 1.370105 1.370105 1.370105 1
#2011-09-20 21:00:00 1.370105 1.371045 1.369685 1.370200 2230
dupidx <- index(comb)[duplicated(index(comb))] # indexes of duplicates
tail(comb[dupidx], 1) #last duplicate
# now rbind the last duplicated row with all non-duplicated data
rbind(comb[!index(comb) %in% dupidx], tail(comb[dupidx], 1))
