Splitting Time Series for dtwclust - r

I'm attempting to prepare a dataframe of time series data (panel data) for dtwclust::tsclust. My current code looks like this:
library(dtwclust)
library(parallel)
library(dplyr)
library(zoo)
library(data.table)
Panel_VisitPages = data.table(
RecordID = c(1,1,1,1,2,3,3,5,5,5,5,5,5),
VisitPages = c(5,3,5,6,3,100,5,3,5,22,3,22,11),
DayDif = c(0,-1,-2,-15,0,0,-5,0,-3,-5,-15,-20)
)
Panel_VisitPages = #df_view_ts[, TotalVisitPages:=sum(VisitPages),by=list(RecordID, DayDif)]
df_view_ts %>% group_by(RecordID, DayDif) %>% summarise(TotalVisitPages = sum(VisitPages))
groups = unique(Panel_VisitPages$RecordID)
set.seed(1)
groups = sample(groups, 3)
Panel_VisitPages = Panel_VisitPages[Panel_VisitPages$RecordID %in% groups,]
PageSeries = read.zoo(Panel_VisitPages
#, format =
, split='RecordID'
, index='DayDif'
,FUN = identity
)
PageSeries = na.fill(PageSeries, fill=0)
# Cluster Test------------------------------------------------------------------
workers <- makeCluster(detectCores()-4)
# load dtwclust in each one, and make them use 1 thread per worker
invisible(clusterEvalQ(workers, {
library(dtwclust)
library(bigmemory)
RcppParallel::setThreadOptions(1L)
}))
# register your workers, e.g. with doParallel
require(doParallel)
registerDoParallel(workers)
clustTest = tsclust(
series = t(PageSeries)
, type = 'partitional'
, k = 2L:10L
, distance = 'SBD'
, seed = 1
, trace = 1
)
That aggregation step is of course much slower than it would be if I could use the data.table command I have commented out, but when I reach the tsclust command I get
Error in merge.zoo(--blocked out--, :
series cannot be merged with non-unique index entries in a series
Can anyone tell me how the data.table code at the top of this block differs from the similar dplyr code and what I can do to get it to perform the same as the dplyr line? Thanks!
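A likely explanation for the difference: the commented-out data.table expression uses :=, which adds TotalVisitPages as a new column by reference and keeps every row, so duplicate (RecordID, DayDif) pairs survive into read.zoo and trigger the non-unique index error, whereas summarise() collapses the data to one row per group. A minimal data.table sketch that collapses the same way (assuming Panel_VisitPages holds the raw rows):
# Collapse to one row per (RecordID, DayDif), mirroring the dplyr summarise():
# .() builds a new, aggregated data.table instead of adding a column by
# reference with :=, so the DayDif index becomes unique within each RecordID.
Panel_VisitPages <- Panel_VisitPages[
  , .(TotalVisitPages = sum(VisitPages))
  , by = .(RecordID, DayDif)
]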

Related

How to calculate the shortest path in R efficiently?

I have more than 3500 origins and more than 3500 destinations connected by more than 54000 links and 24000 nodes. I am modeling a real street network (the Chicago Metropolitan Area) in R using igraph and cppRouting. The following code is an "all or nothing traffic assignment (AON)", which has to be executed more than 40 times to reach equilibrium in the network. Each AON execution currently takes more than 10 minutes, which is too long. I would appreciate any suggestion besides parallel computing to reduce the execution time of the following code:
demand_matrix <- demand_matrix[order(demand_matrix$ORG, demand_matrix$DEST), ]
tic()
for (i in 1:length(unique(demand_matrix$ORG))) {
  # I think I have to iterate over every origin
  org <- unique(demand_matrix$ORG)[i]
  destinations <- demand_matrix$DEST[demand_matrix$ORG == org]
  demand <- demand_matrix[demand_matrix$ORG == org, 2:3]
  # the igraph version is also included here, which requires even more time to run!
  # destinations <- demand_matrix$DEST[demand_matrix$ORG == org]
  # sht_path <- unlist(shortest_paths(network_igraph, from = c(org), to = c(destinations), mode = c("out"), weights = resolved.Network[[5]]$t0, output = c("epath")), recursive = FALSE)
  # sht_path <- sapply(sht_path, as_ids)
  # the procedure with cppRouting
  sht_path <- get_multi_paths(network_cpprouting_graph, from = org, to = destinations, long = TRUE)
  sht_path$end <- c(sht_path$node[2:nrow(sht_path)], 0)
  sht_path <- sht_path[sht_path$from != sht_path$node, ]
  sht_path$paste <- paste(sht_path$end, sht_path$node)
  edge_id_node_sequence <- as.integer(unlist(strsplit(sht_path$paste, split = " ")))
  sht_path$edge_ids <- get.edge.ids(network_igraph, edge_id_node_sequence)
  ### I changed the sequence of nodes to edge ids in the shortest path.
  sht_path$to <- as.integer(sht_path$to) # "to" is character; converting it to integer speeds up the left_join
  sht_path <- left_join(sht_path, demand, by = c("to" = "DEST"))
  V2[sht_path$edge_ids] <- V2[sht_path$edge_ids] + sht_path$TRIPS # add traffic to each link (the goal is each link's volume)
}
The demand matrix has more than 4e6 non-zero values, and I tried to calculate the shortest paths with get_path_pair for all origin-destination pairs, but it never finished and I had to restart my laptop. I have only 8 GB of RAM.
I tried computing the shortest paths for only 8e5 pairs at a time (splitting my matrix into 5 sections), but the third section almost never finished.
length_group <- min(nrow(demand_matrix)/4,800000)
path_pair <- get_path_pair(Graph = test_net , from = demand_matrix$ORG[1:length_group],to = demand_matrix$DEST[1:length_group], long = TRUE)
path_pair <- rbind(path_pair , get_path_pair(Graph = test_net , from = demand_matrix$ORG[(length_group+1):(2*length_group)],to = demand_matrix$DEST[(length_group+1):(2*length_group)],long = TRUE))
path_pair <- rbind(path_pair , get_path_pair(Graph = test_net , from = demand_matrix$ORG[((2*length_group)+1):(3*length_group)],to = demand_matrix$DEST[((2*length_group)+1):(3*length_group)],long = TRUE))
path_pair <- rbind(path_pair , get_path_pair(Graph = test_net , from = demand_matrix$ORG[((3*length_group)+1):(4*length_group)],to = demand_matrix$DEST[((3*length_group)+1):(4*length_group)],long = TRUE))
path_pair <- rbind(path_pair , get_path_pair(Graph = test_net , from = demand_matrix$ORG[((4*length_group)+1):(5*length_group)],to = demand_matrix$DEST[((4*length_group)+1):(5*length_group)],long = TRUE))
path_pair <- rbind(path_pair , get_path_pair(Graph = test_net , from = demand_matrix$ORG[((5*length_group)+1):nrow(demand_matrix)],to = demand_matrix$DEST[((5*length_group)+1):nrow(demand_matrix)],long = TRUE))
If I understand correctly, demand_matrix contains all possible combinations of origin and destination nodes? (3500² = 12,250,000)
Since cppRouting functions are vectorized, why not try:
get_multi_paths(graph, origin, dest, long = TRUE)
with origin and dest being your origin and destination nodes, each of length ~3500.
get_multi_paths is the equivalent of the get_distance_matrix function: it exploits the main property of Dijkstra's algorithm, which finds the shortest paths between one origin node and all other nodes. So the full Dijkstra algorithm is run N times, with N being the number of origins.
On the other hand, the get_*_pair functions run Dijkstra's algorithm with a stopping criterion (stop once the destination node is reached), so you basically increase runtime by a factor of ~1500 (not 3500, because Dijkstra's algorithm is aborted early in that case).
If you have memory issues, splitting all combinations into smaller chunks is a good strategy. However, I suggest splitting the origin nodes into 10 chunks, then running get_multi_paths between each origin chunk and all destination nodes. At each iteration you can aggregate the result to get the cumulated flow on each node of the network.
Finally, use lapply() and data.table::rbindlist() instead of multiple rbind() calls.
EDIT: If you want to accumulate traffic on edges, here is a piece of code:
library(data.table)
# 'or' holds the origin nodes (assumed to be of length 3500)
# 'dest' holds the destination nodes
chunk_size = 350
test <- lapply(seq(1, 3500, chunk_size), function(x){
  print(x)
  res = get_multi_paths(graph, or[x:(x + chunk_size - 1)],
                        dest,
                        long = TRUE)
  setDT(res)
  # eventually merge demand for each trip (origin-destination)
  # reconstruct edges (by reference using data.table)
  res[, edge_from := c(node[-1], NA), .(from, to)]
  # aggregate demand on each edge
  res <- res[!is.na(edge_from), .(traffic = sum(demand)), .(edge_from, node)]
  gc()
  return(res)
})
test <- rbindlist(test)
test <- test[, .(traffic = sum(traffic)), .(edge_from, node)]
Of course, you can modify chunk_size depending on your available memory.

Exporting Seurat Object Data by Cluster

I'm using Seurat to perform a single cell analysis and am interested in exporting the data for all cells within each of my clusters. I tried to use the below code but have had no success.
My Seurat object is called Patients. I also attached a screenshot of my Seurat object. I am looking to extract all the clusters (i.e. Ductal1, Macrophage1, Macrophage2, etc...)
meta.data.cluster <- unique(x = Patients@meta.data$active.ident)
for(group in meta.data.cluster) {
group.cells <- WhichCells(object = Patients, subset.name = "active.ident" , accept.value = group)
data_to_write_out <- as.data.frame(x = as.matrix(x = Patients@raw.data[, group.cells]))
write.csv(x = data_to_write_out, row.names = TRUE, file = paste0(save_dir,"/",group, "_cluster_outfile.csv"))
}
I am new to R and coding so any help is greatly appreciated! :)
It doesn't work because there is no active.ident column in your metadata. For example, if we use an example dataset like yours and set the ident:
library(Seurat)
M = matrix(rnbinom(5000,mu=20,size=1),ncol=50)
colnames(M) = paste0("P",1:50)
rownames(M) = paste0("gene",1:100)
Patients = CreateSeuratObject(M)
Patients$grp = sample(c("Ductal1","Macrophage1","Macrophage2"),50,replace=TRUE)
Idents(Patients) = Patients$grp
You can see this line of code gives you no value:
meta.data.cluster <- unique(x = Patients@meta.data$active.ident)
meta.data.cluster
NULL
You can do:
meta.data.cluster <- unique(Idents(Patients))
for(group in meta.data.cluster) {
group.cells <- WhichCells(object = Patients, idents = group)
data_to_write_out <- as.data.frame(GetAssayData(Patients,slot = 'counts')[,group.cells])
write.csv(data_to_write_out, row.names = TRUE, file = paste0(save_dir,"/",group, "_cluster_outfile.csv"))
}
Note also that you can get the counts out using GetAssayData. You can subset one group and write it out like this:
wh <- which(Idents(Patients) =="Macrophage1" )
da = as.data.frame(GetAssayData(Patients,slot = 'counts')[,wh])
write.csv(da,...)

R foreach multiple cores accessing a function at the same time

I have 1000 csv files in my working directory and each file has a location Id, rainfall and temperature. The structure of one file is shown below:
set.seed(123)
my.dat <- data.frame(Id = rep(1, each = 365),
rain = runif(365, min = 0, max = 20),
tmean = sample(20:40, 365, replace = T))
I wrote an Rcpp function that is also stored in my working directory. This function takes in rainfall and temperature data and calculates some derived variables var1 and var2. I want to read each location's weather data, apply the function, and save the corresponding output using the foreach package.
library(foreach)
library(doParallel)

location.vec <- 1:1000
myClusters <- makeCluster(6)
registerDoParallel(myClusters)

foreach(i = 1:length(location.vec),
        .packages = c('Rcpp', 'dplyr', 'data.table'),
        .noexport = c('myRcppFunc'),
        .verbose = T) %dopar%
  {
    # compile the Rcpp function on the worker
    Rcpp::sourceCpp('myRcppFunc.cpp')
    idRef <- location.vec[i]
    # read the weather data
    temp_weather <- fread(paste0('weather_', idRef, '.csv'))
    # apply my Rcpp function
    temp_weather[, c("var1", "var2") := myRcppFunc(rain, tmean)]
    # save my output
    fwrite(temp_weather, paste0('weather_', idRef, '_modified.csv'))
  }
stopCluster(myClusters)
This loop seems to behave strangely. Sometimes it gets stuck on iteration 10, sometimes on 40, etc., every time I run it, and then I have to kill the job.
My suspicion is that this is driven by multiple processes trying to access (and compile) the Rcpp function at the same time. How can I fix it? Can I load the Rcpp function via the foreach arguments so that I don't have to keep sourcing it? Any other advice?
Thanks
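One possible culprit, offered as a hedged sketch rather than a tested fix: every worker calls Rcpp::sourceCpp() inside the loop body, and concurrent compilations of the same file can collide in a shared build cache. Compiling once per worker, in a worker-specific cacheDir, before registering the backend avoids both the repeated compilation and the potential collision (the file names and the myRcppFunc call are taken from the question; the cache handling is an assumption):
library(doParallel)
library(foreach)

location.vec <- 1:1000
myClusters <- makeCluster(6)

# Compile the Rcpp function once on each worker, using a cache directory that is
# private to that worker, so workers don't trample each other's build artifacts.
clusterEvalQ(myClusters, {
  library(data.table)
  cache <- file.path(tempdir(), "rcpp_cache")  # tempdir() is unique per worker session
  dir.create(cache, showWarnings = FALSE)
  Rcpp::sourceCpp('myRcppFunc.cpp', cacheDir = cache)
})

registerDoParallel(myClusters)

foreach(i = 1:length(location.vec), .packages = 'data.table', .verbose = TRUE) %dopar% {
  idRef <- location.vec[i]
  temp_weather <- fread(paste0('weather_', idRef, '.csv'))
  # myRcppFunc already exists in the worker's global environment
  temp_weather[, c("var1", "var2") := myRcppFunc(rain, tmean)]
  fwrite(temp_weather, paste0('weather_', idRef, '_modified.csv'))
  NULL  # avoid collecting the data.tables back on the master
}

stopCluster(myClusters)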

R HTS package: combinef and aggts not working with gts object

I'm trying to apply the combinef and aggts functions from the R hts package to a time series matrix in order to obtain an optimized set of forecasts across a hierarchy. I've run the same code every month without issue, and am now seeing errors after upgrading to hts package v4.5.
Reproducible example (I can share data file offline if needed)
#Read in forecast data for all levels of hierarchy#
fcast<-read.csv("SampleHierarchyForecast.csv", header = TRUE, check.names = FALSE)
#Convert to time series#
fcast<-ts(fcast, start = as.numeric(2010.25) + (64)/12, end = as.numeric(2010.25) + (75)/12, f= 12)
#Create time series of only the bottom level of the hierarchy#
index <- c()
fcastBottom <- fcast
for (i in 1:length(fcastBottom[1, ])) {
  if (nchar(colnames(fcastBottom)[i]) != 28)
    index[i] <- i
  else
    index[i] <- 0
}
fcastBottom <- fcastBottom[, -index]
#Create grouped time series from the bottom level forecast #
GtsForecast <- gts(fcastBottom, characters = list(c(12,12), c(4)), gnames = c("Category", "Item", "Customer", "Category-Customer"))
#Use combinef function to optimally combine the full hierarchy forecast using the groups from the full hierarchy gts#
combo <- combinef(fcast, groups = GtsForecast$groups)
Warning message:
In mapply(rep, as.list(gnames), times, SIMPLIFY = FALSE) :
  longer argument not a multiple of length of shorter
traceback()
2: stop("Argument fcasts requires all the forecasts.")
1: combinef(fcast, groups = GtsForecast$groups)
There's a little bug when the combinef() function calls gts(). I've now fixed it on GitHub, so you can run your code above without any trouble after installing the development version.
Alternatively, if you don't want to install the newest version, you can tweak your code a bit:
combo <- combinef(fcast, groups = GtsForecast$groups, keep = "bottom")
combo <- ts(combo, start = as.numeric(2010.25) + (64)/12,
end = as.numeric(2010.25) + (75)/12, f = 12)
colnames(combo) <- colnames(fcastBottom)
newGtsForecast <- gts(combo, characters = list(c(12,12), c(4)),
gnames = c("Category", "Item", "Customer",
"Category-Customer"))
Aggregate <- aggts(newGtsForecast)
Hope it helps.

Avoid for-loop: Define blocks of actions within a time range

I need to define blocks of actions: I want to group together all actions for a single id that take place less than 30 days after the previous action. If it has been 30 days or more since the last action, the label increments by one (so label 2, 3, 4...). Every new id starts again at 1.
Here's the data:
dat = data.frame(cbind(
id = c(rep(1,2), rep(16,3), rep(17,24)),
##day_id is the action date in %Y%m%d format - I keep it as numeric but could potentially turn to a date.
day_id = c(20130702, 20130121, 20131028, 20131028, 20130531, 20140513, 20140509,
20140430, 20140417, 20140411, 20140410, 20140404,
20140320, 20140313, 20140305, 20140224, 20140213, 20140131, 20140114,
20130827, 20130820, 20130806, 20130730, 20130723,
20130719, 20130716, 20130620, 20130620, 20130614 ),
###diff is the # of days between actions/day_ids
diff =c(NA,162,NA,0,150,NA,4,9,13,6,1,6,15,7,8,9,11,13,17,140,7,14,
7,7,4,3,26,0,6),
###Just a flag to say whether it's a new id
new_id = c(1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
))
I've done it with a for loop and managed to avoid loops within loops (see below), but I can't seem to get rid of that outer loop, and of course it gets extremely slow with thousands of ids. In the example below, 'call_block' is what I'm trying to reproduce, but without the for loop. Can anyone help me get this out of a loop?
max_days = 30
r = NULL
for(i in unique(dat$id)){
d = dat$diff[dat$id==i]
w = c(1,which(d>=max_days) , length(d)+1)
w2 = diff(w)
r = c(r,rep(1:(length(w)-1), w2))
}
dat$call_block = r
Thank you!
Posting @alexis_laz's answer here to close out the question
library(data.table)
f = function(x){
  ret = c(1, cumsum((x >= 30)[-1]) + 1)
  return(ret)
}
df = data.table(dat)
df2 = df[, list(call_block = f(diff)), by = id]
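For intuition on the helper: after dropping the leading NA, (x >= 30) flags every gap of at least 30 days, cumsum() counts how many such breaks have occurred so far, and adding 1 (plus prepending 1 for the id's first action) turns that running count into a block label. For example, with the diff values for id 16 from the data above:
x <- c(NA, 0, 150)                 # diff values for id 16
(x >= 30)[-1]                      # FALSE TRUE : does each later action start a new block?
cumsum((x >= 30)[-1]) + 1          # 1 2       : running count of breaks, shifted to start at 1
c(1, cumsum((x >= 30)[-1]) + 1)    # 1 1 2     : call_block labels for id 16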
