Extracting elements from a data.frame object in R - r

For each country ("BR", "MX", "RU"), I would like to create a data.frame object containing three columns: #INDICATOR, #TIME_PERIOD, and #OBS_VALUE.
library(IMFData)
IRFCL.available.codes <- DataStructureMethod("IRFCL")
names(IRFCL.available.codes)
indicators_IRFCL <- IRFCL.available.codes[[3]]
databaseID <- "IRFCL"
startdate = "2006-01-01"
enddate = " "
checkquery = FALSE
queryfilter <- list(CL_FREQ = "Q", CL_AREA_IRFCL = c("BR", "MX", "RU"), CL_INDICATOR_IRFCL = "", CL_SECTOR_IRFCL = "")
RESERVES <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery)
Inside RESERVES, one can find #INDICATOR and Obs (which contains #TIME_PERIOD and #OBS_VALUE for each #INDICATOR).

You can do:
library(IMFData)
library(dplyr)
RESERVES <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery, tidy = T)
and then
RESERVES %>%
select(`#REF_AREA`, `#TIME_PERIOD`, `#INDICATOR`, `#OBS_VALUE`) %>%
group_by(`#REF_AREA`) %>%
group_split(keep = F) -> list_of_tibbles
This way you get a list of data.frames (tibbles)

Related

How to loop dataframe in R

I want to get data from IMF.However the API data is limited
Therefor I get the data by continent.
How to loop the dateframe? (The data can get from "Before loop part",load data from api)
The reference cannot work.https://stackoverflow.com/questions/25284539/loop-over-a-string-variable-in-r
Before the loop
library(imfr)
library(countrycode)
data(codelist)
country_set <- codelist
country_set<- country_set %>%
select(country.name.en , iso2c, iso3c, imf, continent, region) %>% filter(!is.na(imf) & !is.na(iso2c))
africa_iso2<- country_set$iso2c[country_set$continent=="Africa"]
asia_iso2<- country_set$iso2c[country_set$continent=="Asia"]
americas_iso2<- country_set$iso2c[country_set$continent=="Americas"]
europe_iso2<- country_set$iso2c[country_set$continent=="Europe"]
oceania_iso2<- country_set$iso2c[country_set$continent=="Oceania"]
loop part
continent <- c("africa", "asia", "americas","europe","oceania")
for(i in 1:length(continent)){
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- paste0(continent[i],"_iso2")
[[var]]<- imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),country =[[var1]],start = 2010, end = 2022,return_raw = TRUE)
[[var]]<- [[var]]$CompactData$DataSet$Series
}
data sample is
list(CompactData = list(`#xmlns:xsi` = "http://www.w3.org/2001/XMLSchema-instance",
`#xmlns:xsd` = "http://www.w3.org/2001/XMLSchema", `#xsi:schemaLocation` = "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message https://registry.sdmx.org/schemas/v2_0/SDMXMessage.xsd http://dataservices.imf.org/compact/IFS http://dataservices.imf.org/compact/IFS.xsd",
`#xmlns` = "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message",
Header = list(ID = "18e0aeae-09ec-4dfe-ab72-60aa16aaea84",
Test = "false", Prepared = "2022-10-19T12:02:28", Sender = list(
`#id` = "1C0", Name = list(`#xml:lang` = "en", `#text` = "IMF"),
Contact = list(URI = "http://www.imf.org", Telephone = "+ 1 (202) 623-6220")),
Receiver = list(`#id` = "ZZZ"), DataSetID = "IFS"), DataSet = list(
`#xmlns` = "http://dataservices.imf.org/compact/IFS",
Series = list(`#FREQ` = "Q", `#REF_AREA` = "US", `#INDICATOR` = "NGDP_NSA_XDC",
`#UNIT_MULT` = "6", `#TIME_FORMAT` = "P3M", Obs = structure(list(
`#TIME_PERIOD` = c("2020-Q1", "2020-Q2", "2020-Q3",
"2020-Q4", "2021-Q1", "2021-Q2", "2021-Q3", "2021-Q4",
"2022-Q1", "2022-Q2"), `#OBS_VALUE` = c("5254152",
"4930197", "5349433", "5539370", "5444406", "5784816",
"5883177", "6203369", "6010733", "6352982")), class = "data.frame", row.names = c(NA,
10L))))))
I suggest you create a list first, to which you will assign the value you want your loop to create. The following code creates a named list, and then at the end of the loop, assigns the value of each iteration to that named list:
continent <-
sapply(c("africa", "asia", "americas","europe","oceania"),
c, simplify = FALSE, USE.NAMES = TRUE)
for(i in seq_len(length(continent))) {
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- get(paste0(continent[i],"_iso2"))
var <- imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),
country = var1, start = 2010, end = 2022,
return_raw = TRUE)
continent[[i]] <- var$CompactData$DataSet$Series
}
I don't necessarily understand the double brackets around [[var]]. Let me know if my answer does not correspond to what you were looking for!
We could use assign to create objects in the global env
for(i in 1:length(continent)){
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- paste0(continent[i],"_iso2")
assign(var, imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),country =[[var1]],start = 2010, end = 2022,
return_raw = TRUE))
assign(var, get(var)$CompactData$DataSet$Series)
}

What its mean by Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument in r

Code below used to web scrape a website using API call. I just have to change the startDate and endDate to get data set that I want. Previously it works fine, doing its loops wonderfully but after I did some modification on the html_nodes() part - try to extract different section in the webpage, it keep return me data of the same date. with error warning 'Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument in r'. What should be done here?
library(tidyverse)
library(readr)
library(tidyr)
library(dplyr)
library(xlsx)
library(beepr)
get_sounding_data <- function(region = c("naconf", "samer", "pac", "nz", "ant",
"np", "europe", "africa", "seasia", "mideast"),
date,
from_hr = c("00", "12", "all"),
to_hr = c("00", "12", "all"),
station_number = 48615) {
# we use these pkgs (the readr and dplyr dependencies removed)
suppressPackageStartupMessages({
require("xml2", quietly = TRUE)
require("httr", quietly = TRUE)
require("rvest", quietly = TRUE)
})
# validate region
region <- match.arg(
arg = region,
choices = c(
"naconf", "samer", "pac", "nz", "ant",
"np", "europe", "africa", "seasia", "mideast"
)
)
# validates the date for us if it's a character string
date <- as.Date(date)
# get year and month
year <- as.integer(format(date, "%Y"))
stopifnot(year %in% 1973:as.integer(format(Sys.Date(), "%Y")))
year <- as.character(year)
month <- format(date, "%m")
# we need these to translate day & *_hr to the param the app needs
c(
"0100", "0112", "0200", "0212", "0300", "0312", "0400", "0412",
"0500", "0512", "0600", "0612", "0700", "0712", "0800", "0812",
"0900", "0912", "1000", "1012", "1100", "1112", "1200", "1212",
"1300", "1312", "1400", "1412", "1500", "1512", "1600", "1612",
"1700", "1712", "1800", "1812", "1900", "1912", "2000", "2012",
"2100", "2112", "2200", "2212", "2300", "2312", "2400", "2412",
"2500", "2512", "2600", "2612", "2700", "2712", "2800", "2812",
"2900", "2912", "3000", "3012", "3100", "3112"
) -> hr_vals
c(
"01/00Z", "01/12Z", "02/00Z", "02/12Z", "03/00Z", "03/12Z", "04/00Z",
"04/12Z", "05/00Z", "05/12Z", "06/00Z", "06/12Z", "07/00Z", "07/12Z",
"08/00Z", "08/12Z", "09/00Z", "09/12Z", "10/00Z", "10/12Z", "11/00Z",
"11/12Z", "12/00Z", "12/12Z", "13/00Z", "13/12Z", "14/00Z", "14/12Z",
"15/00Z", "15/12Z", "16/00Z", "16/12Z", "17/00Z", "17/12Z", "18/00Z",
"18/12Z", "19/00Z", "19/12Z", "20/00Z", "20/12Z", "21/00Z", "21/12Z",
"22/00Z", "22/12Z", "23/00Z", "23/12Z", "24/00Z", "24/12Z", "25/00Z",
"25/12Z", "26/00Z", "26/12Z", "27/00Z", "27/12Z", "28/00Z", "28/12Z",
"29/00Z", "29/12Z", "30/00Z", "30/12Z", "31/00Z", "31/12Z"
) -> hr_inputs
hr_trans <- stats::setNames(hr_vals, hr_inputs)
o_from_hr <- from_hr <- as.character(tolower(from_hr))
o_to_hr <- to_hr <- as.character(tolower(to_hr))
if ((from_hr == "all") || (to_hr == "all")) {
from_hr <- to_hr <- "all"
} else {
from_hr <- hr_trans[sprintf("%s/%02dZ", format(date, "%d"), as.integer(from_hr))]
match.arg(from_hr, hr_vals)
to_hr <- hr_trans[sprintf("%s/%02dZ", format(date, "%d"), as.integer(to_hr))]
match.arg(to_hr, hr_vals)
}
# clean up the station number if it was entered as a double
station_number <- as.character(as.integer(station_number))
# execute the API call
httr::GET(
url = "http://weather.uwyo.edu/cgi-bin/sounding",
query = list(
region = region,
TYPE = "TEXT:LIST",
YEAR = year,
MONTH = sprintf("%02d", as.integer(month)),
FROM = from_hr,
TO = to_hr,
STNM = station_number
)
) -> res
# check for super bad errors (that we can't handle nicely)
httr::stop_for_status(res)
# get the page content
doc <- httr::content(res, as="text")
# if the site reports no data, issue a warning and return an empty data frame
if (grepl("Can't get", doc)) {
doc <- xml2::read_html(doc)
msg <- rvest::html_nodes(doc, "body")
msg <- rvest::html_text(msg, trim=TRUE)
msg <- gsub("\n\n+.*$", "", msg)
warning(msg)
return(data.frame(stringsAsFactors=FALSE))
}
# if the site reports no data, issue a warning and return an empty data frame
if (grepl("Can't get", doc)) {
doc <- xml2::read_html(doc)
msg <- rvest::html_nodes(doc, "body")
msg <- rvest::html_text(msg, trim=TRUE)
msg <- gsub("\n\n+.*$", "", msg)
warning(msg)
return(data.frame(stringsAsFactors=FALSE))
}
# turn it into something we can parse
doc <- xml2::read_html(doc)
raw_dat <- doc %>%
html_nodes("h3+ pre") %>%
html_text()
indices <- raw_dat %>%
str_split(pattern = "\n", simplify = T) %>%
map_chr(str_squish) %>%
tibble(x = .) %>%
separate(x, into = c("Station", "Value"), sep = ": ") %>%
filter(!is.na(Value))
data <- tidyr::spread(indices, Station, Value)
data
}
startDate <- as.Date("01-11-1979", format="%d-%m-%y")
endDate <- as.Date("31-01-1980",format="%d-%m-%y")
days <- seq(startDate, endDate, "1 day")
lapply(days[1:92], function(day) {
get_sounding_data(
region = "seasia",
date = day,
from_hr = "00",
to_hr = "00",
station_number = "48615"
)
}) -> soundings_48615
warnings()
new_df <- map(soundings_48615, . %>% mutate_all(parse_guess))
dat <- bind_rows(new_df)
dat <- dat %>% separate(col =`Observation time`, into = c('Date', 'time'), sep = '/')
dat$Date <- as.Date(dat$Date, format = "%y%m%d")
#save in text file
write.csv(dat, 'c:/Users/Hp/Documents/yr/climatology/yr_SoundingIndexLowerPart/1979.csv')
get_sounding_data <- NULL
beep()
It seems that the error you are having right now, is date format based. More specific the following times
as.Date("01-11-1979", format="%d-%m-%y")
as.Date("31-01-1980",format="%d-%m-%y")
outputs
"2019-11-01"
"2019-01-31"
R's date/time format is international standard yyyy-mm-dd. As such the "2019-11-01" comes after "2019-01-31" time wise. As such this will cause sequence to fail if it tries to iterate 1 positive day at a time. The formatting is the problem here, for this to be solved there is a very simple solution. Always work in international standard date formats, as these will be recognized by (almost) all programs.
As such change the date sequence part of your code to
....
startDate <- as.Date("1979-11-01")
endDate <- as.Date("1980-01-31")
days <- seq(startDate, endDate, "1 day")
....
Note the format change. As for why it changes 1979 to 2019, i am not entirely sure, someone else might have a more intricate answer for this weird behaviour.
I got this error when I was trying to create a sequence of dates using the seq() function. Turns out I was putting the start and end dates in the wrong order.
This would give me the error:
seq(as.Date("2012-12-30"), as.Date("2011-01-04"), by="days")
Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument
But this would not give me the error and would give me my date sequence:
seq(as.Date("2011-12-30"), as.Date("2012-01-04"), by="days")

Merging data frames on a specific key in R/Stripping asterisks from Column

Hey guys I need help merging in R and here is a reproducible sample of my code which I will explain.
My issue I'm running into is certain guys in the fixedstats df are hall of famers and therefor they have asterisks in their name. I want to clean that so I can join them to the draft database and grab their draft year. When I do so with the code I'm running into an error that gives me:
Error in order(Player = list("Tariq Abdul-Wahad", "Shareef Abdur-Rahim", :
unimplemented type 'list' in 'orderVector1'
How do you guys recommend I go about stripping the asterisks from the columns in order to properly join the two data frames on the Player Key? Thanks in Advance.
Code:
library(htmltab)
library(sqldf)
library(plyr)
library(readr)
stats0 <- ""
draftbank0 <- ""
for (i in 20003:2017){
url <- paste0("http://www.basketball-reference.com/leagues/NBA_",i,"_advanced.html")
stats <- htmltab(doc = url, which = 1, header = 1, stringsasfactors = FALSE)
stats$year <- i
stats0 <- rbind(stats0,stats)
stats0[rowSums(is.na(stats0)) != ncol(stats0),]
}
colnames(stats0) <- c("Rank",
"Player",
"Pos",
"Age",
"Tm",
"G",
"MP",
"PER",
"TSp",
"ThreePAr",
"FTr",
"ORBp",
"DRBp",
"TRBp",
"ASTp",
"STLp",
"BLKp",
"TOVp",
"USGp",
"Null", #comment out null if needed
"OWS",
"DWS",
"WS",
"WS48",
"Null2", #comment out null if needed
"OBPM",
"DBPM",
"BPM",
"VORP",
"Year")
fixedstats <- sqldf("SELECT Rank, Player, Pos, Age, Tm, G, MP, PER, TSp, ThreePAr, FTr, ORBp, DRBp, TRBp, ASTp, STLp, BLKp, TOVp,
USGp, OWS, DWS, WS, WS48, OBPM, DBPM, BPM, VORP, Year FROM stats0 WHERE player != 'Player'")
fixedstats <- fixedstats[-1,]
for (i in 1980:2016){
url2 <- paste0("http://www.draftexpress.com/nba-mock-history/",i,"/all/all/")
draftbank <- htmltab(doc = url2, which = 1, header = 1, stringsasfactors = FALSE)
draftbank0 <- rbind(draftbank0,draftbank)
}
colnames(draftbank0) <- c("Draft_Year",
"Pick",
"Null1",
"Player",
"Null2",
"Position",
"Age",
"Height",
"Weight",
"Wingspan",
"Points",
"Rebounds",
"Assists",
"PER",
"Null3",
"League",
"EWA")
draftbankfinal <- sqldf("SELECT Player, Position, Age, Height, Wingspan, Draft_Year FROM draftbank0")
draftbankfinal <- draftbank0[-1,]
#Multiple drafts appendix getting rid of guys with similar names
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="Corey Brewer" & draftbankfinal$Draft_Year==1998),]
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="Patrick Ewing" & draftbankfinal$Draft_Year==1985),]
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="Charles Smith" & draftbankfinal$Draft_Year==1988),]
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="Ray McCallum" & draftbankfinal$Draft_Year==1983),]
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="James Anderson" & draftbankfinal$Draft_Year==1985),]
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="Ken Johnson" & draftbankfinal$Draft_Year==1985),]
draftbankfinal<-draftbankfinal[!(draftbankfinal$Player=="Dee Brown" & draftbankfinal$Draft_Year==1990),]
fixedstats$Player <- lapply(fixedstats$Player, sub, pattern = "[*]", replacement = "")
fixedstats$Player <- lapply(fixedstats$Player, sub, pattern = "[']", replacement = "")
fixedstats$Player <- ifelse(fixedstats$Player == 'Jermaine ONeal', 'Jermaine O\'Neal', fixedstats$Player)
fixedstats$Player <- ifelse(fixedstats$Player == 'J.J Obrien', 'J.J O\'Brien', fixedstats$Player)
fixedstats$Player <- ifelse(fixedstats$Player == 'Johnny OBryant', 'Johnny O\'Bryant', fixedstats$Player)
fixedstats$Player <- ifelse(fixedstats$Player == 'Patrick OBryant', 'Patrick O\'Bryant', fixedstats$Player)
fixedstats$Player <- ifelse(fixedstats$Player == 'Shaquille ONeal', 'Shaquille O\'Neal', fixedstats$Player)
fixedstats$Player <- as.vector(fixedstats$Player)
draftbankfinal$Player <- as.vector(draftbankfinal$Player)
df <- merge(x = fixedstats, y = draftbankfinal, by = "Player", all.x = TRUE)
df2 <- df[c(1,3:5,22,28:29,32:35)]
So if anyone is curious and runs into this issue in the future, the answer is I used lapply (returning as list) instead of using vapply (returning as vector) so
fixedstats$Player <- lapply(fixedstats$Player, sub, pattern = "[*]", replacement = "")
fixedstats$Player <- lapply(fixedstats$Player, sub, pattern = "[']", replacement = "")
fixedstats$Player <- as.vector(fixedstats$Player)
draftbankfinal$Player <- as.vector(draftbankfinal$Player)
should actually be
fixedstats$Player <- vapply(fixedstats$Player, sub, pattern = "[*]", replacement = "",character(1))
fixedstats$Player <- vapply(fixedstats$Player, sub, pattern = "[']", replacement = "",character(1))

How to insert an element in the middle of an unnamed list based on chronological dates

I've got a list of lists and am looking for an appropriate method for inserting a new list into this list based on chronological order. Here's an example:
my_list <- list(list("text" = list("headline" = "Hello, World!",
"text" = "This is some text"),
"start_date" = list("year" = 2015,
"month" = "01",
"day" = "01")),
list("text" = list("headline" = "Hola, Mundo!",
"text" = "Este es algo palabras"),
"start_date" = list("year" = 2015,
"month" = "01",
"day" = "03")))
Now, if I want to add a new element into this list where, say, the start_date is 2015-01-02, I'd want to append it into the middle of the list at index 2 and push the 2nd element "down". If the start_date was 2014-12-31 then I'd want it at the very beginning and push everything else "down", and if it was anything after 2015-01-03 I'd want it at the end. Is there a more efficient method for approaching this than the following?
new_list_item <- list("text" = list("headline" = "Bonjour le monde", "text" = "ceci est un texte"), "start_date" = list("year" = 2015, "month" = "01", "day" = "02"))
counter <- 0
index <- lapply(my_list, function(elem) {
date1 <- as.Date(paste(elem$start_date$year, elem$start_date$month, elem$start_date$day, sep = "-"))
date2 <- as.Date(paste(new_list_item$start_date$year, new_list_item$start_date$month, new_list_item$start_date$day, sep = "-"))
counter <<- counter + 1
if (date2 > date1) {
return(NULL)
} else {
return(counter)
}
})
index <- min(unlist(index)[!is.null(index)])
my_list <- list(my_list[1:(index - 1)], new_list_item, my_list[index:length(my_list)])
Especially since the above method adds extra indexing on the list elements (i.e [[1]][[1]]$text vs [[1]]$text), which isn't ideal. Any help is greatly appreciated.
If you define your new_list_item like this (to match the structure of my_list):
new_list_item <- list(list("text" = list("headline" = "Bonjour le monde", "text" = "ceci est un texte"), "start_date" = list("year" = 2015, "month" = "01", "day" = "02")))
then the following function works:
insert_new_list_item <- function(old_list, new_item){
# Get the date from new_item
new_item_date <- as.Date(paste(new_item[[1]]$start_date$year,
new_item[[1]]$start_date$month,
new_item[[1]]$start_date$day, sep = "-"))
# Get a vector of dates from old_list
dates_vector <- as.Date(vapply(old_list, function(x) paste(x$start_date$year,
x$start_date$month,
x$start_date$day, sep = "-"),
FUN.VALUE = character(1)))
# Append the date from the new list item and sort
dates_vector_new <- sort(c(dates_vector, new_item_date))
# Get the position of the new list item date
new_position <- which(dates_vector_new == new_item_date)
# Append the new list item to the list
if(new_position == 1){
new_list <- append(new_item, old_list)
} else {
new_list <- append(old_list, new_item, after = new_position-1)
}
new_list
}
insert_new_list_item(old_list = my_list, new_item = new_list_item)
This question also covers the append function: How to insert an element in the middle of a list?

JSON in a R list form to single merged dataframe

I currently am connecting to a mongodb through the rmongodb package (which isnt necessary for this example), it provides me with a list in the form of "mongo_data" list supplied in the code below. I want to take a list in this format and end up with a single dataframe. I currently use a loop (which I have setup to run in parallel but even still it is quite slow when using large amounts of data. Any ideas on how to do this without using a loop?
Reproducible data
symbols <- c("SPXT", "BCOM", "MXWO")
mongo_data <- list(structure(list(`_id` = "561369b8e756f3e744d9d697", symbol = "BCOM",
year = 2014, field = "PX_LAST", monthly = list(structure(list(
`2014-01-31` = 126.1206, `2014-02-28` = 133.9783, `2014-03-31` = 134.5234,
`2014-04-30` = 137.7964, `2014-05-30` = 133.8324, `2014-06-30` = 134.6268,
`2014-07-31` = 127.9143, `2014-08-29` = 126.5745, `2014-09-30` = 118.6922,
`2014-10-31` = 117.7365, `2014-11-28` = 112.9451, `2014-12-31` = 104.3285), .Names = c("2014-01-31",
"2014-02-28", "2014-03-31", "2014-04-30", "2014-05-30", "2014-06-30",
"2014-07-31", "2014-08-29", "2014-09-30", "2014-10-31", "2014-11-28",
"2014-12-31")))), .Names = c("_id", "symbol", "year", "field",
"monthly")), structure(list(`_id` = "561369b8e756f3e744d9d698",
symbol = "BCOM", year = 2015, field = "PX_LAST", monthly = list(
structure(list(`2015-01-30` = 100.8413, `2015-02-27` = 103.4379,
`2015-03-31` = 98.123, `2015-04-30` = 103.7471, `2015-05-29` = 100.9465,
`2015-06-30` = 102.6892, `2015-07-31` = 91.7827,
`2015-08-31` = 90.9328, `2015-09-30` = 87.8214), .Names = c("2015-01-30",
"2015-02-27", "2015-03-31", "2015-04-30", "2015-05-29",
"2015-06-30", "2015-07-31", "2015-08-31", "2015-09-30"
)))), .Names = c("_id", "symbol", "year", "field", "monthly"
)), structure(list(`_id` = "561353f1e756f3e744d97955", symbol = "MXWO",
year = 2014, field = "PX_LAST", monthly = list(structure(list(
`2014-01-31` = 1598.46, `2014-02-28` = 1675.4, `2014-03-31` = 1673.87,
`2014-04-30` = 1687.74, `2014-05-30` = 1715.18, `2014-06-30` = 1743.42,
`2014-07-31` = 1714.35, `2014-08-29` = 1748.69, `2014-09-30` = 1698.41,
`2014-10-31` = 1708.09, `2014-11-28` = 1739.5, `2014-12-31` = 1709.67), .Names = c("2014-01-31",
"2014-02-28", "2014-03-31", "2014-04-30", "2014-05-30", "2014-06-30",
"2014-07-31", "2014-08-29", "2014-09-30", "2014-10-31", "2014-11-28",
"2014-12-31")))), .Names = c("_id", "symbol", "year", "field",
"monthly")), structure(list(`_id` = "561353f1e756f3e744d97956",
symbol = "MXWO", year = 2015, field = "PX_LAST", monthly = list(
structure(list(`2015-01-30` = 1677.54, `2015-02-27` = 1772.86,
`2015-03-31` = 1740.81, `2015-04-30` = 1778.4, `2015-05-29` = 1779.31,
`2015-06-30` = 1735.61, `2015-07-31` = 1765.6, `2015-08-31` = 1645.43,
`2015-09-30` = 1581.92), .Names = c("2015-01-30",
"2015-02-27", "2015-03-31", "2015-04-30", "2015-05-29",
"2015-06-30", "2015-07-31", "2015-08-31", "2015-09-30"
)))), .Names = c("_id", "symbol", "year", "field", "monthly"
)), structure(list(`_id` = "5613542fe756f3e744d97a69", symbol = "SPXT",
year = 2014, field = "PX_LAST", monthly = list(structure(list(
`2014-01-31` = 3200.95, `2014-02-28` = 3347.3799, `2014-03-31` = 3375.51,
`2014-04-30` = 3400.46, `2014-05-30` = 3480.29, `2014-06-30` = 3552.1799,
`2014-07-31` = 3503.1899, `2014-08-29` = 3643.3401, `2014-09-30` = 3592.25,
`2014-10-31` = 3679.99, `2014-11-28` = 3778.96, `2014-12-31` = 3769.4399), .Names = c("2014-01-31",
"2014-02-28", "2014-03-31", "2014-04-30", "2014-05-30", "2014-06-30",
"2014-07-31", "2014-08-29", "2014-09-30", "2014-10-31", "2014-11-28",
"2014-12-31")))), .Names = c("_id", "symbol", "year", "field",
"monthly")), structure(list(`_id` = "5613542fe756f3e744d97a6a",
symbol = "SPXT", year = 2015, field = "PX_LAST", monthly = list(
structure(list(`2015-01-30` = 3656.28, `2015-02-27` = 3866.4199,
`2015-03-31` = 3805.27, `2015-04-30` = 3841.78, `2015-05-29` = 3891.1799,
`2015-06-30` = 3815.8501, `2015-07-31` = 3895.8,
`2015-08-31` = 3660.75, `2015-09-30` = 3570.1699), .Names = c("2015-01-30",
"2015-02-27", "2015-03-31", "2015-04-30", "2015-05-29",
"2015-06-30", "2015-07-31", "2015-08-31", "2015-09-30"
)))), .Names = c("_id", "symbol", "year", "field", "monthly"
)))
Code Using Currently
library(foreach)
library(doParallel)
cl <- makeCluster((detectCores()))
registerDoParallel(cl)
total_df_list <- foreach(i=(1:length(symbols))) %dopar% {
# pull out the data for symbol i
symbol_data <- mongo_data[which(sapply(lapply(mongo_data, "[[", "symbol"), function(x) x == symbols[i]))]
# pull out the particular frequency of data & put into a single list
freq_data <- lapply(symbol_data, "[[", "monthly")
freq_data <- do.call(Map, c(c, freq_data))
# if the frequency doesnt exist then add symbol to filler vector
if (length(freq_data) > 0) {
# convert NULLs to NAs
freq_data[[1]][which(sapply(freq_data[[1]], is.null))] <- NA
# transform list into a proper dataframe
mongo_df <- data.frame("Date" = names(unlist(freq_data[[1]])),"Value"=as.numeric(unlist(freq_data[[1]])))
mongo_df[,"Date"] <- as.Date(mongo_df[,"Date"])
colnames(mongo_df)[2] <- paste0(symbols[i])
# put dataframe into the master list
results <- mongo_df
} else {
filler_vector <- c(filler_vector,symbols[i])
results <- NULL
}
results
}
I did some benchmarking of my previous solution for this question, and I noticed that nearly all the runtime was spent in the as.Date call, which converts string dates to Date objects. As a result, it seems like optimizing this operation would be needed to further push the efficiency. One observation would be that many of the dates are reported multiple times (for different symbols), so we are wasting computation by performing date conversions for the same string literal multiple times. To address this, we could use the following procedure:
Compute all unique date strings
Convert them to Date objects with as.Date
Look up the converted value for each date string, yielding the final converted vector
In code, we could do this with:
vals <- unlist(lapply(mongo_data, "[[", "monthly"))
s <- unlist(lapply(mongo_data, function(x) rep(x$symbol, length(x$monthly[[1]]))))
dates <- unique(names(vals))
date.map <- as.Date(dates)
names(date.map) <- dates
partial <- data.frame(Date=date.map[names(vals)], val=as.numeric(vals),
symbol=s)
tdl2 <- unname(lapply(split(partial, partial$s), function(x) {
names(x)[2] <- as.character(x$symbol[1])
rownames(x) <- NULL
x[,-3]
})[symbols])
identical(total_df_list, tdl2)
# [1] TRUE
Let's benchmark this on a large instance that is obtained by repeating each row in mongo_data 10,000 times. OP is the function from the original question, josilber1 is from my other answer, and josilber2 is from this answer:
big.mongo <- mongo_data[rep(seq_along(mongo_data), 10000)]
system.time(OP(big.mongo, symbols))
# user system elapsed
# 5.359 0.071 5.427
system.time(josilber1(big.mongo, symbols))
# user system elapsed
# 4.345 0.047 4.385
system.time(josilber2(big.mongo, symbols))
# user system elapsed
# 0.560 0.048 0.625
My other answer yielded a 20% improvement in the runtime, while this solution yielded roughly a 10x speedup.
I see a few opportunities to speed this up:
Don't loop through the list extracting the symbol name for each symbol you process. Instead you can extract all the symbol names at the beginning and use that to quickly identify the part of the list you want to process.
sapply with == to look up matching symbols is calling == separately on each element. It would be more efficient to just call == once on the whole vector.
sapply with is.null is calling is.null separately on each element. It would be more efficient to just call is.null once on the whole vector.
Here's my updated implementation:
mongo.sym <- unlist(lapply(mongo_data, "[[", "symbol"))
tdl <- lapply(symbols, function(sym) {
vals <- unlist(lapply(mongo_data[mongo.sym == sym], function(x) unlist(x$monthly)))
vals[is.null(vals)] <- NA
ret <- data.frame(as.Date(names(vals)), as.numeric(vals))
names(ret) <- c("Date", sym)
ret
})
identical(total_df_list, tdl)
# [1] TRUE

Resources