Fetch daily data for each variable using mapply - r

I've a function which objective is to fetch daily data for each variable on a column on a data.frame. Range is a complete month, but could be any other range.
My df has a column unit_id, so I need my function to take the first id of col unit_id and fetch the data for every single date of march.
| unit | unit_id |
|:-----:|----------|
| AE | 123 |
| AD | 456 |
| AN | 789 |
But right now, my function loops the ids in unit_id col. So as I've 3 ids, the 4th day the function uses the 1st id again, and then for the 5th day uses the 2nd id and so on. And this repeats until the last day of the month.
I need it to use each id for every day of the month.
code:
my_dates <- seq(as.Date("2020-03-01"), as.Date("2020-03-31"), by = 1)
my_fetch <- function(unit, unit_id, d) {
df <- google_analytics(unit_id,
date_range = c(d, d),
metrics = c("totalEvents"),
dimensions = c("ga:date", "ga:eventCategory", "ga:eventAction", "ga:eventLabel"),
anti_sample = TRUE)
df$unidad_de_negocio <- unit
filename <- paste0(unit, "-", "total-events", "-", d, ".csv")
path <- "D:\\america\\costos_protv\\total_events"
write.csv(df, file.path(path, filename), row.names = FALSE)
print(filename)
rm(df)
gc()
}
monthly_fetches <- mapply(my_fetch, df$unit,
df$unit_id,
my_dates, SIMPLIFY = FALSE)
Variation 2: By monthly ranges
Thank you, Akrun. Your answer works.
I'ven trying to edit it, ot use it in this other similar scenario:
1.- Monthly starts and ends: Now the loops isn't a single day date, but has an start and end. I've called this monthly_dates
| starts | ends |
|:-----------:|------------|
| 2020-02-01 | 2020-02-29 |
| 2020-03-01 | 2020-03-31 |
I've tried to adapt the solution, but it is not working. May you see it and tell me why? Thank you.
monthly_fetches <- Map(function(x, y)
lapply(monthly_dates, function(d1, d2) my_fetch(x, y, monthly_dates$starts, monthly_dates$ends)))
Main function adapted to use 2 dates (start "d1" and end "d2"):
my_fetch <- function(udn, udn_id, d1, d2) {
df <- google_analytics(udn_id,
date_range = c(d1, d2),
metrics = c("totalEvents"),
dimensions = c("ga:month"),
anti_sample = TRUE)
df$udn <- udn
df$udn_id <- udn_id
df
}
** Code to make the monthly date ranges:**
make_date_ranges <- function(start, end){
starts <- seq(from = start,
to = Sys.Date()-1 ,
by = "1 month")
ends <- c((seq(from = add_months(start, 1),
to = end,
by = "1 month" ))-1,
(Sys.Date()-1))
data.frame(starts,ends)
}
## useage
monthly_dates <- make_date_ranges(as.Date("2020-02-01"), Sys.Date())
Update 1:
dput(monthly_fetches[1])
list(AE = list(structure(list(month = "02", totalEvents = 19670334,
udn = "AE", udn_id = 74415341), row.names = 1L, totals = list(
list(totalEvents = "19670334")), minimums = list(list(totalEvents = "19670334")), maximums = list(
list(totalEvents = "19670334")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "03", totalEvents = 19765253, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "19765253")), minimums = list(list(totalEvents = "19765253")), maximums = list(
list(totalEvents = "19765253")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "04", totalEvents = 1319087, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "1319087")), minimums = list(list(totalEvents = "1319087")), maximums = list(
list(totalEvents = "1319087")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame")))
Update 2:
dput(monthly_fetches[[1]])
list(structure(list(month = "02", totalEvents = 19670334, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(totalEvents = "19670334")), minimums = list(
list(totalEvents = "19670334")), maximums = list(list(totalEvents = "19670334")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "03", totalEvents = 19765253, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "19765253")), minimums = list(list(totalEvents = "19765253")), maximums = list(
list(totalEvents = "19765253")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"),
structure(list(month = "04", totalEvents = 1319087, udn = "AE",
udn_id = 74415341), row.names = 1L, totals = list(list(
totalEvents = "1319087")), minimums = list(list(totalEvents = "1319087")), maximums = list(
list(totalEvents = "1319087")), isDataGolden = TRUE, rowCount = 1L, class = "data.frame"))

As Map/mapply requires all arguments to be of same length and 'df' with number of rows of 3 and 'my_dates' length 31, one option is to loop over the 'df', columns and then do a further loop inside the Map/mapply
monthly_fetches <- Map(function(x, y)
lapply(my_dates, function(date) my_fetch(x, y, date)),
df$unit, d$unit_id)
Or we can have outer loop for 'my_dates'
lapply(my_dates, function(date) Map(my_fetch, df$unit, df$unit_id, date))
Update
If we need to pass two columns, use Map
Map(function(start, end)
Map(my_fetch, df$unit, df$unit_id, start, end),
monthly_dates$starts, monthly_dates$ends))
Or
monthly_fetches <- Map(function(x, y) Map(function(start, end)
my_fetch(x, y, start, end),
monthly_dates$starts, monthly_dates$ends), df$unit, df$unit_id)
Then rbind
do.call(rbind,lapply(monthly_fetches, function(x) do.call(rbind, x)))
Or use map
library(purrr)
library(dplyr)
map_dfr(monthly_fetches, bind_rows, .id = 'grp')

Related

How to loop dataframe in R

I want to get data from IMF.However the API data is limited
Therefor I get the data by continent.
How to loop the dateframe? (The data can get from "Before loop part",load data from api)
The reference cannot work.https://stackoverflow.com/questions/25284539/loop-over-a-string-variable-in-r
Before the loop
library(imfr)
library(countrycode)
data(codelist)
country_set <- codelist
country_set<- country_set %>%
select(country.name.en , iso2c, iso3c, imf, continent, region) %>% filter(!is.na(imf) & !is.na(iso2c))
africa_iso2<- country_set$iso2c[country_set$continent=="Africa"]
asia_iso2<- country_set$iso2c[country_set$continent=="Asia"]
americas_iso2<- country_set$iso2c[country_set$continent=="Americas"]
europe_iso2<- country_set$iso2c[country_set$continent=="Europe"]
oceania_iso2<- country_set$iso2c[country_set$continent=="Oceania"]
loop part
continent <- c("africa", "asia", "americas","europe","oceania")
for(i in 1:length(continent)){
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- paste0(continent[i],"_iso2")
[[var]]<- imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),country =[[var1]],start = 2010, end = 2022,return_raw = TRUE)
[[var]]<- [[var]]$CompactData$DataSet$Series
}
data sample is
list(CompactData = list(`#xmlns:xsi` = "http://www.w3.org/2001/XMLSchema-instance",
`#xmlns:xsd` = "http://www.w3.org/2001/XMLSchema", `#xsi:schemaLocation` = "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message https://registry.sdmx.org/schemas/v2_0/SDMXMessage.xsd http://dataservices.imf.org/compact/IFS http://dataservices.imf.org/compact/IFS.xsd",
`#xmlns` = "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message",
Header = list(ID = "18e0aeae-09ec-4dfe-ab72-60aa16aaea84",
Test = "false", Prepared = "2022-10-19T12:02:28", Sender = list(
`#id` = "1C0", Name = list(`#xml:lang` = "en", `#text` = "IMF"),
Contact = list(URI = "http://www.imf.org", Telephone = "+ 1 (202) 623-6220")),
Receiver = list(`#id` = "ZZZ"), DataSetID = "IFS"), DataSet = list(
`#xmlns` = "http://dataservices.imf.org/compact/IFS",
Series = list(`#FREQ` = "Q", `#REF_AREA` = "US", `#INDICATOR` = "NGDP_NSA_XDC",
`#UNIT_MULT` = "6", `#TIME_FORMAT` = "P3M", Obs = structure(list(
`#TIME_PERIOD` = c("2020-Q1", "2020-Q2", "2020-Q3",
"2020-Q4", "2021-Q1", "2021-Q2", "2021-Q3", "2021-Q4",
"2022-Q1", "2022-Q2"), `#OBS_VALUE` = c("5254152",
"4930197", "5349433", "5539370", "5444406", "5784816",
"5883177", "6203369", "6010733", "6352982")), class = "data.frame", row.names = c(NA,
10L))))))
I suggest you create a list first, to which you will assign the value you want your loop to create. The following code creates a named list, and then at the end of the loop, assigns the value of each iteration to that named list:
continent <-
sapply(c("africa", "asia", "americas","europe","oceania"),
c, simplify = FALSE, USE.NAMES = TRUE)
for(i in seq_len(length(continent))) {
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- get(paste0(continent[i],"_iso2"))
var <- imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),
country = var1, start = 2010, end = 2022,
return_raw = TRUE)
continent[[i]] <- var$CompactData$DataSet$Series
}
I don't necessarily understand the double brackets around [[var]]. Let me know if my answer does not correspond to what you were looking for!
We could use assign to create objects in the global env
for(i in 1:length(continent)){
var <- paste0("gdp_nsa_xdc_", continent[i])
var1 <- paste0(continent[i],"_iso2")
assign(var, imf_data(database_id = "IFS" , indicator = c("NGDP_NSA_XDC"),country =[[var1]],start = 2010, end = 2022,
return_raw = TRUE))
assign(var, get(var)$CompactData$DataSet$Series)
}

R: Error: sample_info must be a data frame with columns sample_name, file_bam, paired_end, read_length, frag_length, lib_size

I have a list of bam files in chr16_bam folder and also an annotation file sgseq_sam.txt.
library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(org.Hs.eg.db)
library(tidyverse)
library(AnnotationDbi)
library(dplyr)
library(stringr)
# Rename the "file_bam" column values to the full path where the BAMs are stored
setwd("C:/Users/User/Downloads/")
bamPath = "C:/Users/User/Downloads/chr16_bam"
samFile <- read.delim("C:/Users/User/Downloads/sgseq_sam.txt", header=T)
samFile <- samFile %>%
mutate(file_bam = paste0(bamPath, file_bam))
# Save the TxDb.Hsapiens.UCSC.hg19.knownGene object into a variable.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- keepSeqlevels(txdb, "chr16")
seqlevelsStyle(txdb) <- "NCBI"
# Read in the gene list from Supplementary Table 1
gene.list <- read.table("Table_1_Differential Expression Analysis Revealing CLCA1 to Be a Prognostic and Diagnostic Biomarker for Colorectal Cancer.xls", header=T)
# Convert the gene symbols to Entrez IDs and remove the genes that don't map to an Entrez ID.
entrez = mapIds(org.Hs.eg.db, keys=gene.list$Name, column = "ENTREZID", keytype="SYMBOL")
gene.list <- gene.list %>%
mutate(entrez) %>%
filter(!is.na(entrez))
# Convert the TxDb.Hsapiens.UCSC.hg19.knownGene object
txf_ucsc <- convertToTxFeatures(txdb)
# Cast this object to a dataframe and save as another variable name
txf_df <- as.data.frame(txf_ucsc)
# Subset by the Entrez IDs
txf_df <- txf_df %>% filter(geneName %in% gene.list$entrez)
# Find the number of common transcripts
unique <- unique(txf_df$geneName)
gene.list <- gene.list[gene.list$entrez %in% unique,]
nrow(gene.list)
# Cast the seqnames column to factor
txf_df$seqnames <- as.factor(as.character(txf_df$seqnames))
# Remove the "chr" prefix from the seqnames column
txf_df %>%
rename_all(~stringr::str_replace(.,"^chr",""))
# Recast this dataframe back to a GRanges object
txf_grange <- makeGRangesFromDataFrame(txf_df, keep.extra.columns=T)
Now, I want to create a loop for each of the genes, where upon iteration, subset Granges objects in txf_grange by only the gene, use the reduce function to collapse the ranges of the genes into a single vector, run analyzeFeatures, then annotate functions, and finally plotFeatures.
for (i in txf_grange$geneName) {
if (i=="343") {
next
}
else{
# For each of the 15 genes, subset the Granges objects by only the gene
grange.subset <- txf_grange[i == toString(i)]
# Collapse the ranges of the genes into a single vector
grange.subset <- unlist(IRanges::reduce((split(grange.subset, grange.subset$geneName))))
# Run analyzeFeatures
for (j in 1:dim(samFile)[[1]]) {
si <- samFile
for (k in si) {
for (l in list.files(path="C:/Users/User/Downloads/chr16_bam", pattern=".bam$", all.files=F, full.names=F)) {
sgfc_pred <- analyzeFeatures(k, which=grange.subset, features=txf_ucsc, predict=T)
# Annotate predicted features
sgfc_pred <- annotate(sgfc_pred, txf_ucsc)
# Plot features
pdf(paste("plot", l, ".pdf", sep=""))
plotFeatures(sgfc_pred, geneID=1)
dev.off()
}
}
}
}
}
Data
> dput(head(txf_grange))
new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = "16", class = "factor"),
lengths = 6L, elementMetadata = NULL, metadata = list()),
ranges = new("IRanges", start = c(12058964L, 12059311L, 12059311L,
12060052L, 12060198L, 12060198L), width = c(348L, 742L, 2117L,
147L, 680L, 1230L), NAMES = NULL, elementType = "ANY", elementMetadata = NULL,
metadata = list()), strand = new("Rle", values = structure(1L, .Label = c("+",
"-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL,
metadata = list()), seqinfo = new("Seqinfo", seqnames = "16",
seqlengths = NA_integer_, is_circular = NA, genome = NA_character_),
elementMetadata = new("DFrame", rownames = NULL, nrows = 6L,
listData = list(type = structure(c(3L, 1L, 1L, 2L, 1L,
1L), .Label = c("J", "I", "F", "L", "U"), class = "factor"),
txName = structure(list(c("uc002dbv.3", "uc010buy.3",
"uc010buz.3"), c("uc002dbv.3", "uc010buy.3"), "uc010buz.3",
c("uc002dbv.3", "uc010buy.3"), "uc010buy.3",
"uc002dbv.3"), class = "AsIs"), geneName = structure(list(
"608", "608", "608", "608", "608", "608"), class = "AsIs")),
elementType = "ANY", elementMetadata = NULL, metadata = list()),
elementType = "ANY", metadata = list())
change line:
analyzeFeatures(samFile, grange.subset)
Also you do not need that many loops to run for the question. The question asks for 14 plots and you might be plotting for much more with the number of loops you have.

Count Backwards in String until pattern R

I'm trying to extract UPCs from item descriptions. There is a varying number of /'s in the front of the description, but the UPC is always right before the last /, so I was using a count of characters, however, there is a variable number of characters at the end based on pack size. In the replication, you can see on the first row what this is supposed to look like at the end, but the second row has dropped the first digit of the UPC and picked up the /. Looking for a way to do this inline with DPLYR. My original code is under the replication.
test <- structure(list(Month = structure(c(17987, 17987), class = "Date"),store_id = c("7005", "7005"), UPC = c("000004150860081","00001200050404/"), `Item Description` = c("ACQUA PANNA SPRING WATER/EACH/000004150860081/1","AQUAFINA 24PK/24PK/000001200050404/24"), `Cals Item Description` = c(NA_character_,NA_character_), `Sub-Category` = c(NA_character_, NA_character_), Category = c(NA_character_, NA_character_), Department = c(NA_character_,NA_character_), `Sales Dollars` = c(17.43, 131.78), Units = c(7,528), Cost = c(8.4, 112.2), `Gross Margin` = c(9.03, 19.58), `Gross Margin %` = c(0.5181, 0.1486)), row.names = c(NA,-2L), class = c("tbl_df", "tbl", "data.frame"))
foo <- list.files(pattern = "*.csv", full.names = T) %>%
map_df(~read_csv(.)) %>%
mutate(date = lubridate::mdy(str_sub(textbox43, start = -10))) %>%
mutate(store_id = str_sub(textbox6, start = 1, end = 4)) %>%
mutate(item_desc = textbox57) %>%
filter(!is.na(item_desc), item_desc != "") %>%
mutate(dollars = textbox58,
units = textbox59,
cost = textbox61,
gm = textbox66,
gm_pct = textbox67) %>%
mutate(UPC = str_sub(item_desc, start = -17, end = -3))
Is this what you want?
sub("^.*/([^/]+)/[^/]*$",
"\\1",
test$`Item Description`)
Returns:
[1] "000004150860081" "000001200050404"
Edit: You were asking for dplyr style:
test %>%
mutate(item_id = sub("^.*/([^/]+)/[^/]*$",
"\\1",
test$`Item Description`))

Transform list of dataframes to list of time-series

The package IMFdata returns a list of dataframes.
For example:
library(IMFData)
databaseID <- "IFS"
startdate = "2019-01-01"
enddate = "2019-03-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_IFS = c("AU", "BR"), CL_INDICATOR_IFS = "FIDR_PA")
IFS_ex <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery)
This code creates the list of dataframes IFS_ex:
structure(list(`#FREQ` = c("M", "M"), `#REF_AREA` = c("AU", "BR"
), `#INDICATOR` = c("FIDR_PA", "FIDR_PA"), `#UNIT_MULT` = c("0",
"0"), `#TIME_FORMAT` = c("P1M", "P1M"), Obs = list(structure(list(
`#TIME_PERIOD` = c("2019-01", "2019-02", "2019-03"), `#OBS_VALUE` = c("1.95",
"1.95", "1.9")), class = "data.frame", row.names = c(NA,
3L)), structure(list(`#TIME_PERIOD` = c("2019-01", "2019-02",
"2019-03"), `#OBS_VALUE` = c("6.41275285614977", "5.70499999999921",
"6.18810104544945")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1:2, class = "data.frame")
I would like to transform this list of dataframes in a list of time-series (list_ts). This is the expected output:
list_ts <- list(AU = structure(c(1.95, 1.95, 1.90), .Tsp = c(2019, 2019.16666666667, 12), class = "ts"), BR = structure(c(6.41275285614977,
5.70499999999921, 6.18810104544945), .Tsp = c(2019, 2019.16666666667,
12), class = "ts"))
Try running it with tidy = TRUE. You can then split the dataframe and apply over the new lists. It is possible to use the original format you showed, but it would be a lot more work.
library(IMFData)
library(zoo)
databaseID <- "IFS"
startdate = "2019-01-01"
enddate = "2019-03-01"
checkquery = FALSE
IFS_ex <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery,
tidy = TRUE)
lst_df <- split(IFS_ex, IFS_ex$`#REF_AREA`)
list_ts <- lapply(lst_df, function(x) ts(zoo(x$`#OBS_VALUE`, x$`#TIME_PERIOD`), start = c(2019, 1), frequency = 12))
# and to get rid of an attribute you do not want
list_ts <- lapply(list_ts, function(x) {attr(x, "index") <- NULL; x})
list_ts
# $AU
# Jan Feb Mar
# 2014 1.95 1.95 1.9
#
# $BR
# Jan Feb Mar
# 2014 6.41275285614977 5.70499999999921 6.18810104544945

Automatically split function output (list) into component data.frames

I have a functions which yields 2 dataframes. As functions can only return one object, I combined these dataframes as a list. However, I need to work with both dataframes separately. Is there a way to automatically split the list into the component dataframes, or to write the function in a way that both objects are returned separately?
The function:
install.packages("plyr")
require(plyr)
fun.docmerge <- function(x, y, z, crit, typ, doc = checkmerge) {
mergedat <- paste(deparse(substitute(x)), "+",
deparse(substitute(y)), "=", z)
countdat <- nrow(x)
check_t1 <- data.frame(mergedat, countdat)
z1 <- join(x, y, by = crit, type = typ)
countdat <- nrow(z1)
check_t2 <- data.frame(mergedat, countdat)
doc <- rbind(doc, check_t1, check_t2)
t1<-list()
t1[["checkmerge"]]<-doc
t1[[z]]<-z1
return(t1)
}
This is the call to the function, saving the result list to the new object results.
results <- fun.docmerge(x = df1, y = df2, z = "df3", crit = c("id"), typ = "left")
In the following sample data to replicate the problem:
df1 <- structure(list(id = c("XXX1", "XXX2", "XXX3",
"XXX4"), tr.isincode = c("ISIN1", "ISIN2",
"ISIN3", "ISIN4")), .Names = c("id", "isin"
), row.names = c(NA, 4L), class = "data.frame")
df2 <- structure(list(id= c("XXX1", "XXX5"), wrong= c(1L,
1L)), .Names = c("id", "wrong"), row.names = 1:2, class = "data.frame")
checkmerge <- structure(list(mergedat = structure(integer(0), .Label = character(0), class = "factor"),
countdat = numeric(0)), .Names = c("mergedat", "countdat"
), row.names = integer(0), class = "data.frame")
In the example, a list with the dataframes df3 and checkmerge are returned. I would need both dataframes separately. I know that I could do it via manual assignment (e.g., checkmerge <- results$checkmerge) but I want to eliminate manual changes as much as possible and am therefore looking for an automated way.

Resources