Transform list of dataframes to list of time-series - r

The package IMFdata returns a list of dataframes.
For example:
library(IMFData)
databaseID <- "IFS"
startdate = "2019-01-01"
enddate = "2019-03-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_IFS = c("AU", "BR"), CL_INDICATOR_IFS = "FIDR_PA")
IFS_ex <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery)
This code creates the list of dataframes IFS_ex:
structure(list(`#FREQ` = c("M", "M"), `#REF_AREA` = c("AU", "BR"
), `#INDICATOR` = c("FIDR_PA", "FIDR_PA"), `#UNIT_MULT` = c("0",
"0"), `#TIME_FORMAT` = c("P1M", "P1M"), Obs = list(structure(list(
`#TIME_PERIOD` = c("2019-01", "2019-02", "2019-03"), `#OBS_VALUE` = c("1.95",
"1.95", "1.9")), class = "data.frame", row.names = c(NA,
3L)), structure(list(`#TIME_PERIOD` = c("2019-01", "2019-02",
"2019-03"), `#OBS_VALUE` = c("6.41275285614977", "5.70499999999921",
"6.18810104544945")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1:2, class = "data.frame")
I would like to transform this list of dataframes in a list of time-series (list_ts). This is the expected output:
list_ts <- list(AU = structure(c(1.95, 1.95, 1.90), .Tsp = c(2019, 2019.16666666667, 12), class = "ts"), BR = structure(c(6.41275285614977,
5.70499999999921, 6.18810104544945), .Tsp = c(2019, 2019.16666666667,
12), class = "ts"))

Try running it with tidy = TRUE. You can then split the dataframe and apply over the new lists. It is possible to use the original format you showed, but it would be a lot more work.
library(IMFData)
library(zoo)
databaseID <- "IFS"
startdate = "2019-01-01"
enddate = "2019-03-01"
checkquery = FALSE
IFS_ex <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery,
tidy = TRUE)
lst_df <- split(IFS_ex, IFS_ex$`#REF_AREA`)
list_ts <- lapply(lst_df, function(x) ts(zoo(x$`#OBS_VALUE`, x$`#TIME_PERIOD`), start = c(2019, 1), frequency = 12))
# and to get rid of an attribute you do not want
list_ts <- lapply(list_ts, function(x) {attr(x, "index") <- NULL; x})
list_ts
# $AU
# Jan Feb Mar
# 2014 1.95 1.95 1.9
#
# $BR
# Jan Feb Mar
# 2014 6.41275285614977 5.70499999999921 6.18810104544945

Related

How to loop dataframe in R

I want to get data from IMF.However the API data is limited
Therefor I get the data by continent.
How to loop the dateframe? (The data can get from "Before loop part",load data from api)
The reference cannot work.https://stackoverflow.com/questions/25284539/loop-over-a-string-variable-in-r
Before the loop
library(imfr)
library(countrycode)
data(codelist)
country_set <- codelist
country_set<- country_set %>%
select(country.name.en , iso2c, iso3c, imf, continent, region) %>% filter(!is.na(imf) & !is.na(iso2c))
africa_iso2<- country_set$iso2c[country_set$continent=="Africa"]
asia_iso2<- country_set$iso2c[country_set$continent=="Asia"]
americas_iso2<- country_set$iso2c[country_set$continent=="Americas"]
europe_iso2<- country_set$iso2c[country_set$continent=="Europe"]
oceania_iso2<- country_set$iso2c[country_set$continent=="Oceania"]
loop part
continent <- c("africa", "asia", "americas","europe","oceania")
for(i in 1:length(continent)){
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- paste0(continent[i],"_iso2")
[[var]]<- imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),country =[[var1]],start = 2010, end = 2022,return_raw = TRUE)
[[var]]<- [[var]]$CompactData$DataSet$Series
}
data sample is
list(CompactData = list(`#xmlns:xsi` = "http://www.w3.org/2001/XMLSchema-instance",
`#xmlns:xsd` = "http://www.w3.org/2001/XMLSchema", `#xsi:schemaLocation` = "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message https://registry.sdmx.org/schemas/v2_0/SDMXMessage.xsd http://dataservices.imf.org/compact/IFS http://dataservices.imf.org/compact/IFS.xsd",
`#xmlns` = "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message",
Header = list(ID = "18e0aeae-09ec-4dfe-ab72-60aa16aaea84",
Test = "false", Prepared = "2022-10-19T12:02:28", Sender = list(
`#id` = "1C0", Name = list(`#xml:lang` = "en", `#text` = "IMF"),
Contact = list(URI = "http://www.imf.org", Telephone = "+ 1 (202) 623-6220")),
Receiver = list(`#id` = "ZZZ"), DataSetID = "IFS"), DataSet = list(
`#xmlns` = "http://dataservices.imf.org/compact/IFS",
Series = list(`#FREQ` = "Q", `#REF_AREA` = "US", `#INDICATOR` = "NGDP_NSA_XDC",
`#UNIT_MULT` = "6", `#TIME_FORMAT` = "P3M", Obs = structure(list(
`#TIME_PERIOD` = c("2020-Q1", "2020-Q2", "2020-Q3",
"2020-Q4", "2021-Q1", "2021-Q2", "2021-Q3", "2021-Q4",
"2022-Q1", "2022-Q2"), `#OBS_VALUE` = c("5254152",
"4930197", "5349433", "5539370", "5444406", "5784816",
"5883177", "6203369", "6010733", "6352982")), class = "data.frame", row.names = c(NA,
10L))))))
I suggest you create a list first, to which you will assign the value you want your loop to create. The following code creates a named list, and then at the end of the loop, assigns the value of each iteration to that named list:
continent <-
sapply(c("africa", "asia", "americas","europe","oceania"),
c, simplify = FALSE, USE.NAMES = TRUE)
for(i in seq_len(length(continent))) {
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- get(paste0(continent[i],"_iso2"))
var <- imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),
country = var1, start = 2010, end = 2022,
return_raw = TRUE)
continent[[i]] <- var$CompactData$DataSet$Series
}
I don't necessarily understand the double brackets around [[var]]. Let me know if my answer does not correspond to what you were looking for!
We could use assign to create objects in the global env
for(i in 1:length(continent)){
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- paste0(continent[i],"_iso2")
assign(var, imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),country =[[var1]],start = 2010, end = 2022,
return_raw = TRUE))
assign(var, get(var)$CompactData$DataSet$Series)
}

Fetch daily data for each variable using mapply

I've a function which objective is to fetch daily data for each variable on a column on a data.frame. Range is a complete month, but could be any other range.
My df has a column unit_id, so I need my function to take the first id of col unit_id and fetch the data for every single date of march.
| unit | unit_id |
|:-----:|----------|
| AE | 123 |
| AD | 456 |
| AN | 789 |
But right now, my function loops the ids in unit_id col. So as I've 3 ids, the 4th day the function uses the 1st id again, and then for the 5th day uses the 2nd id and so on. And this repeats until the last day of the month.
I need it to use each id for every day of the month.
code:
my_dates <- seq(as.Date("2020-03-01"), as.Date("2020-03-31"), by = 1)
my_fetch <- function(unit, unit_id, d) {
df <- google_analytics(unit_id,
date_range = c(d, d),
metrics = c("totalEvents"),
dimensions = c("ga:date", "ga:eventCategory", "ga:eventAction", "ga:eventLabel"),
anti_sample = TRUE)
df$unidad_de_negocio <- unit
filename <- paste0(unit, "-", "total-events", "-", d, ".csv")
path <- "D:\\america\\costos_protv\\total_events"
write.csv(df, file.path(path, filename), row.names = FALSE)
print(filename)
rm(df)
gc()
}
monthly_fetches <- mapply(my_fetch, df$unit,
df$unit_id,
my_dates, SIMPLIFY = FALSE)
Variation 2: By monthly ranges
Thank you, Akrun. Your answer works.
I'ven trying to edit it, ot use it in this other similar scenario:
1.- Monthly starts and ends: Now the loops isn't a single day date, but has an start and end. I've called this monthly_dates
| starts | ends |
|:-----------:|------------|
| 2020-02-01 | 2020-02-29 |
| 2020-03-01 | 2020-03-31 |
I've tried to adapt the solution, but it is not working. May you see it and tell me why? Thank you.
monthly_fetches <- Map(function(x, y)
lapply(monthly_dates, function(d1, d2) my_fetch(x, y, monthly_dates$starts, monthly_dates$ends)))
Main function adapted to use 2 dates (start "d1" and end "d2"):
my_fetch <- function(udn, udn_id, d1, d2) {
df <- google_analytics(udn_id,
date_range = c(d1, d2),
metrics = c("totalEvents"),
dimensions = c("ga:month"),
anti_sample = TRUE)
df$udn <- udn
df$udn_id <- udn_id
df
}
** Code to make the monthly date ranges:**
make_date_ranges <- function(start, end){
starts <- seq(from = start,
to = Sys.Date()-1 ,
by = "1 month")
ends <- c((seq(from = add_months(start, 1),
to = end,
by = "1 month" ))-1,
(Sys.Date()-1))
data.frame(starts,ends)
}
## useage
monthly_dates <- make_date_ranges(as.Date("2020-02-01"), Sys.Date())
Update 1:
dput(monthly_fetches[1])
list(AE = list(structure(list(month = "02", totalEvents = 19670334,
udn = "AE", udn_id = 74415341), row.names = 1L, totals = list(
list(totalEvents = "19670334")), minimums = list(list(totalEvents = "19670334")), maximums = list(
list(totalEvents = "19670334")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "03", totalEvents = 19765253, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "19765253")), minimums = list(list(totalEvents = "19765253")), maximums = list(
list(totalEvents = "19765253")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "04", totalEvents = 1319087, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "1319087")), minimums = list(list(totalEvents = "1319087")), maximums = list(
list(totalEvents = "1319087")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame")))
Update 2:
dput(monthly_fetches[[1]])
list(structure(list(month = "02", totalEvents = 19670334, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(totalEvents = "19670334")), minimums = list(
list(totalEvents = "19670334")), maximums = list(list(totalEvents = "19670334")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "03", totalEvents = 19765253, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "19765253")), minimums = list(list(totalEvents = "19765253")), maximums = list(
list(totalEvents = "19765253")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "04", totalEvents = 1319087, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "1319087")), minimums = list(list(totalEvents = "1319087")), maximums = list(
list(totalEvents = "1319087")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"))
As Map/mapply requires all arguments to be of same length and 'df' with number of rows of 3 and 'my_dates' length 31, one option is to loop over the 'df', columns and then do a further loop inside the Map/mapply
monthly_fetches <- Map(function(x, y)
lapply(my_dates, function(date) my_fetch(x, y, date)),
df$unit, d$unit_id)
Or we can have outer loop for 'my_dates'
lapply(my_dates, function(date) Map(my_fetch, df$unit, df$unit_id, date))
Update
If we need to pass two columns, use Map
Map(function(start, end)
Map(my_fetch, df$unit, df$unit_id, start, end),
monthly_dates$starts, monthly_dates$ends))
Or
monthly_fetches <- Map(function(x, y) Map(function(start, end)
my_fetch(x, y, start, end),
monthly_dates$starts, monthly_dates$ends), df$unit, df$unit_id)
Then rbind
do.call(rbind,lapply(monthly_fetches, function(x) do.call(rbind, x)))
Or use map
library(purrr)
library(dplyr)
map_dfr(monthly_fetches, bind_rows, .id = 'grp')

Using literal month names with year in ramcharts

Here is my code to generate barplot using rAmChart,
library(rAmCharts)
amBarplot(x = "month", y = "value", data = dataset,
dataDateFormat = "MM/YYYY", minPeriod = "MM",
show_values = FALSE, labelRotation = -90, depth = 0.1)
However, is there a way to use month names & year in my x axis? I am trying to use MMM-YY formats.
Sample dataset,
structure(list(value = c(11544, 9588, 9411, 10365, 11154, 12688
), month = c("05/2012", "06/2012", "07/2012", "08/2012", "09/2012",
"10/2012")), .Names = c("value", "month"), row.names = c(NA,
6L), class = "data.frame")
Thanks.
It appears that rAmCharts doesn't expose AmCharts' dateFormats setting in the categoryAxis, so you have to access it through the init event and create your own dateFormats array with a modified format string for the MM period. I'm not very experienced with R, but here's how I managed to make it work using R 3.4.2 and rAmCharts 2.1.5
chart <- amBarplot( ... settings omitted ... )
addListener(.Object = chart,
name = 'init',
expression = paste(
"function(e) {",
"e.chart.categoryAxis.dateFormats = ",
'[{"period":"fff","format":"JJ:NN:SS"},{"period":"ss","format":"JJ:NN:SS"},',
'{"period":"mm","format":"JJ:NN"},{"period":"hh","format":"JJ:NN"},{"period":"DD","format":"MMM DD"},',
'{"period":"WW","format":"MMM DD"},',
'{"period":"MM","format":"MMM-YY"},', # "add YY to default MM format
'{"period":"YYYY","format":"YYYY"}]; ',
'e.chart.validateData();',
"}")
)
Here is a different solution:
library(rAmCharts)
dataset <- structure(list(value = c(11544, 9588, 9411, 10365, 11154, 12688
), month = c("05/2012", "06/2012", "07/2012", "08/2012", "09/2012",
"10/2012")), .Names = c("value", "month"), row.names = c(NA,
6L), class = "data.frame")
dataset$month <- as.character(
format(
as.Date(paste0("01/",dataset$month), "%d/%m/%Y"),
"%B %Y"))
amBarplot(x = "month", y = "value", data = dataset,
show_values = FALSE, labelRotation = -90, depth = 0.1)

Join data frames faster

I'm currently merging 12 different data frames that are each 480,00 obs by an id and adding the columns, so it becomes a 48k obs x 14 variable data frame. However, this is taking too long to process and I'm looking for a faster way to do this.
Example
dput:
# January data
jan <- structure(list(gridNumber = c("17578", "18982", "18983", "18984",
"18985"), PRISM_ppt_stable_4kmM2_193301_bil = c(35.7099990844727,
36, 35.4199981689453, 33.7299995422363, 33.2799987792969)), .Names = c("gridNumber",
"PRISM_ppt_stable_4kmM2_193301_bil"), row.names = c("17578",
"18982", "18983", "18984", "18985"), class = "data.frame")
# February data
feb <- structure(list(gridNumber = c("17578", "18982", "18983", "18984",
"18985"), PRISM_ppt_stable_4kmM2_193302_bil = c(14.6199998855591,
14.5600004196167, 14.9899997711182, 15.4700002670288, 15.5799999237061
)), .Names = c("gridNumber", "PRISM_ppt_stable_4kmM2_193302_bil"
), row.names = c("17578", "18982", "18983", "18984", "18985"), class = "data.frame")
# March Data
mar <- structure(list(gridNumber = c("17578", "18982", "18983", "18984",
"18985"), PRISM_ppt_stable_4kmM2_193303_bil = c(23.8400001525879,
23.9200000762939, 24.3400001525879, 25.7900009155273, 26.5900001525879
)), .Names = c("gridNumber", "PRISM_ppt_stable_4kmM2_193303_bil"
), row.names = c("17578", "18982", "18983", "18984", "18985"), class = "data.frame")
dplyr Code:
library(dplyr)
datalist <- list(jan, feb, mar)
full <- Reduce(function(x,y) {full_join(x,y, by = "gridNumber")}, datalist)
This code obviously runs much faster because of the low obs, but is there a faster way to do this?
Here is an approach using data.table and reshape2
library(data.table)
library(reshape2)
# create a list of data frames, and coerce to data.tables
month_list <- lapply(list(jan,feb,mar),setDT)
# add id column with old variable name and rename value column
for(i in seq_along(month_list)){
set(month_list[[i]],j="ID",value = names(month_list[[i]])[2])
setnames(month_list[[i]], names(month_list[[i]])[2], "value")
}
# put in long form
long_data <- rbindlist(month_list)
# then use `dcast.data.table` to make wide
wide <- dcast.data.table(long_data, gridNumber~ID, value = 'value')
Dunno if this will be faster, but:
list(jan = jan %>% rename(PRISM = PRISM_ppt_stable_4kmM2_193301_bil),
feb = feb %>% rename(PRISM = PRISM_ppt_stable_4kmM2_193302_bil),
mar = mar %>% rename(PRISM = PRISM_ppt_stable_4kmM2_193303_bil)) %>%
bind_rows(.id = "month") %>%
spread(month, PRISM)

JSON in a R list form to single merged dataframe

I currently am connecting to a mongodb through the rmongodb package (which isnt necessary for this example), it provides me with a list in the form of "mongo_data" list supplied in the code below. I want to take a list in this format and end up with a single dataframe. I currently use a loop (which I have setup to run in parallel but even still it is quite slow when using large amounts of data. Any ideas on how to do this without using a loop?
Reproducible data
symbols <- c("SPXT", "BCOM", "MXWO")
mongo_data <- list(structure(list(`_id` = "561369b8e756f3e744d9d697", symbol = "BCOM",
year = 2014, field = "PX_LAST", monthly = list(structure(list(
`2014-01-31` = 126.1206, `2014-02-28` = 133.9783, `2014-03-31` = 134.5234,
`2014-04-30` = 137.7964, `2014-05-30` = 133.8324, `2014-06-30` = 134.6268,
`2014-07-31` = 127.9143, `2014-08-29` = 126.5745, `2014-09-30` = 118.6922,
`2014-10-31` = 117.7365, `2014-11-28` = 112.9451, `2014-12-31` = 104.3285), .Names = c("2014-01-31",
"2014-02-28", "2014-03-31", "2014-04-30", "2014-05-30", "2014-06-30",
"2014-07-31", "2014-08-29", "2014-09-30", "2014-10-31", "2014-11-28",
"2014-12-31")))), .Names = c("_id", "symbol", "year", "field",
"monthly")), structure(list(`_id` = "561369b8e756f3e744d9d698",
symbol = "BCOM", year = 2015, field = "PX_LAST", monthly = list(
structure(list(`2015-01-30` = 100.8413, `2015-02-27` = 103.4379,
`2015-03-31` = 98.123, `2015-04-30` = 103.7471, `2015-05-29` = 100.9465,
`2015-06-30` = 102.6892, `2015-07-31` = 91.7827,
`2015-08-31` = 90.9328, `2015-09-30` = 87.8214), .Names = c("2015-01-30",
"2015-02-27", "2015-03-31", "2015-04-30", "2015-05-29",
"2015-06-30", "2015-07-31", "2015-08-31", "2015-09-30"
)))), .Names = c("_id", "symbol", "year", "field", "monthly"
)), structure(list(`_id` = "561353f1e756f3e744d97955", symbol = "MXWO",
year = 2014, field = "PX_LAST", monthly = list(structure(list(
`2014-01-31` = 1598.46, `2014-02-28` = 1675.4, `2014-03-31` = 1673.87,
`2014-04-30` = 1687.74, `2014-05-30` = 1715.18, `2014-06-30` = 1743.42,
`2014-07-31` = 1714.35, `2014-08-29` = 1748.69, `2014-09-30` = 1698.41,
`2014-10-31` = 1708.09, `2014-11-28` = 1739.5, `2014-12-31` = 1709.67), .Names = c("2014-01-31",
"2014-02-28", "2014-03-31", "2014-04-30", "2014-05-30", "2014-06-30",
"2014-07-31", "2014-08-29", "2014-09-30", "2014-10-31", "2014-11-28",
"2014-12-31")))), .Names = c("_id", "symbol", "year", "field",
"monthly")), structure(list(`_id` = "561353f1e756f3e744d97956",
symbol = "MXWO", year = 2015, field = "PX_LAST", monthly = list(
structure(list(`2015-01-30` = 1677.54, `2015-02-27` = 1772.86,
`2015-03-31` = 1740.81, `2015-04-30` = 1778.4, `2015-05-29` = 1779.31,
`2015-06-30` = 1735.61, `2015-07-31` = 1765.6, `2015-08-31` = 1645.43,
`2015-09-30` = 1581.92), .Names = c("2015-01-30",
"2015-02-27", "2015-03-31", "2015-04-30", "2015-05-29",
"2015-06-30", "2015-07-31", "2015-08-31", "2015-09-30"
)))), .Names = c("_id", "symbol", "year", "field", "monthly"
)), structure(list(`_id` = "5613542fe756f3e744d97a69", symbol = "SPXT",
year = 2014, field = "PX_LAST", monthly = list(structure(list(
`2014-01-31` = 3200.95, `2014-02-28` = 3347.3799, `2014-03-31` = 3375.51,
`2014-04-30` = 3400.46, `2014-05-30` = 3480.29, `2014-06-30` = 3552.1799,
`2014-07-31` = 3503.1899, `2014-08-29` = 3643.3401, `2014-09-30` = 3592.25,
`2014-10-31` = 3679.99, `2014-11-28` = 3778.96, `2014-12-31` = 3769.4399), .Names = c("2014-01-31",
"2014-02-28", "2014-03-31", "2014-04-30", "2014-05-30", "2014-06-30",
"2014-07-31", "2014-08-29", "2014-09-30", "2014-10-31", "2014-11-28",
"2014-12-31")))), .Names = c("_id", "symbol", "year", "field",
"monthly")), structure(list(`_id` = "5613542fe756f3e744d97a6a",
symbol = "SPXT", year = 2015, field = "PX_LAST", monthly = list(
structure(list(`2015-01-30` = 3656.28, `2015-02-27` = 3866.4199,
`2015-03-31` = 3805.27, `2015-04-30` = 3841.78, `2015-05-29` = 3891.1799,
`2015-06-30` = 3815.8501, `2015-07-31` = 3895.8,
`2015-08-31` = 3660.75, `2015-09-30` = 3570.1699), .Names = c("2015-01-30",
"2015-02-27", "2015-03-31", "2015-04-30", "2015-05-29",
"2015-06-30", "2015-07-31", "2015-08-31", "2015-09-30"
)))), .Names = c("_id", "symbol", "year", "field", "monthly"
)))
Code Using Currently
library(foreach)
library(doParallel)
cl <- makeCluster((detectCores()))
registerDoParallel(cl)
total_df_list <- foreach(i=(1:length(symbols))) %dopar% {
# pull out the data for symbol i
symbol_data <- mongo_data[which(sapply(lapply(mongo_data, "[[", "symbol"), function(x) x == symbols[i]))]
# pull out the particular frequency of data & put into a single list
freq_data <- lapply(symbol_data, "[[", "monthly")
freq_data <- do.call(Map, c(c, freq_data))
# if the frequency doesnt exist then add symbol to filler vector
if (length(freq_data) > 0) {
# convert NULLs to NAs
freq_data[[1]][which(sapply(freq_data[[1]], is.null))] <- NA
# transform list into a proper dataframe
mongo_df <- data.frame("Date" = names(unlist(freq_data[[1]])),"Value"=as.numeric(unlist(freq_data[[1]])))
mongo_df[,"Date"] <- as.Date(mongo_df[,"Date"])
colnames(mongo_df)[2] <- paste0(symbols[i])
# put dataframe into the master list
results <- mongo_df
} else {
filler_vector <- c(filler_vector,symbols[i])
results <- NULL
}
results
}
I did some benchmarking of my previous solution for this question, and I noticed that nearly all the runtime was spent in the as.Date call, which converts string dates to Date objects. As a result, it seems like optimizing this operation would be needed to further push the efficiency. One observation would be that many of the dates are reported multiple times (for different symbols), so we are wasting computation by performing date conversions for the same string literal multiple times. To address this, we could use the following procedure:
Compute all unique date strings
Convert them to Date objects with as.Date
Look up the converted value for each date string, yielding the final converted vector
In code, we could do this with:
vals <- unlist(lapply(mongo_data, "[[", "monthly"))
s <- unlist(lapply(mongo_data, function(x) rep(x$symbol, length(x$monthly[[1]]))))
dates <- unique(names(vals))
date.map <- as.Date(dates)
names(date.map) <- dates
partial <- data.frame(Date=date.map[names(vals)], val=as.numeric(vals),
symbol=s)
tdl2 <- unname(lapply(split(partial, partial$s), function(x) {
names(x)[2] <- as.character(x$symbol[1])
rownames(x) <- NULL
x[,-3]
})[symbols])
identical(total_df_list, tdl2)
# [1] TRUE
Let's benchmark this on a large instance that is obtained by repeating each row in mongo_data 10,000 times. OP is the function from the original question, josilber1 is from my other answer, and josilber2 is from this answer:
big.mongo <- mongo_data[rep(seq_along(mongo_data), 10000)]
system.time(OP(big.mongo, symbols))
# user system elapsed
# 5.359 0.071 5.427
system.time(josilber1(big.mongo, symbols))
# user system elapsed
# 4.345 0.047 4.385
system.time(josilber2(big.mongo, symbols))
# user system elapsed
# 0.560 0.048 0.625
My other answer yielded a 20% improvement in the runtime, while this solution yielded roughly a 10x speedup.
I see a few opportunities to speed this up:
Don't loop through the list extracting the symbol name for each symbol you process. Instead you can extract all the symbol names at the beginning and use that to quickly identify the part of the list you want to process.
sapply with == to look up matching symbols is calling == separately on each element. It would be more efficient to just call == once on the whole vector.
sapply with is.null is calling is.null separately on each element. It would be more efficient to just call is.null once on the whole vector.
Here's my updated implementation:
mongo.sym <- unlist(lapply(mongo_data, "[[", "symbol"))
tdl <- lapply(symbols, function(sym) {
vals <- unlist(lapply(mongo_data[mongo.sym == sym], function(x) unlist(x$monthly)))
vals[is.null(vals)] <- NA
ret <- data.frame(as.Date(names(vals)), as.numeric(vals))
names(ret) <- c("Date", sym)
ret
})
identical(total_df_list, tdl)
# [1] TRUE

Resources