Looping through a list of data frames in R - r

I'm having trouble looping through a list of data frames. I'll attach my full code below with my file pathways redacted and then explain specifics.
# load packages ----
xlib <- c("rnoaa","devtools","dplyr","plyr","hydroTSM","chron","gdata","date", "rowr")
lapply(xlib, require, character.only = T)
rm(xlib)
# se token ----
# get this from NOAA site - https://www.ncdc.noaa.gov/cdo-web/token
options(noaakey = "QlzFUrMWVrLFKkZijFohYRmbtVvEaqUB")
# set working folder ----
setwd("filepathway")
# read weather station information
statns <- read.csv(file = "filepathway/weather_station_locations_v1.csv", header = T)
# define dates ----
st.date <- as.Date("1950-01-01")
end.date <- as.Date("2018-04-30")
date1 <- data.frame(date = dip(st.date, end.date))
date1$year <- as.numeric(format(date1$date, "%Y"))
date1$month <- as.numeric(format(date1$date, "%m"))
date1$day <- as.numeric(format(date1$date, "%d"))
date1$jday <- as.numeric(format(date1$date, "%j"))
n1 <- 1
#n1 <- 100
for(id1 in (1:n1)) {
# pull data from NOAA server ----
# 1. precip
prcp.pull <- meteo_pull_monitors(monitors = statns$STATION[ id1 ],
keep_flags = T,
date_min = st.date,
date_max = end.date,
var = "PRCP")
prcp.pull$prcp <- prcp.pull$prcp / 10 # convert to mm/day
# 2. max. temperature
tmax.pull <- meteo_pull_monitors(monitors = statns$STATION[ id1 ],
keep_flags = T,
date_min = st.date,
date_max = end.date,
var = "TMAX")
tmax.pull$tmax <- tmax.pull$tmax / 10 # convert to dec. C
#3. min. temperature
tmin.pull <- meteo_pull_monitors(monitors = statns$STATION[ id1 ],
keep_flags = T,
date_min = st.date,
date_max = end.date,
var = "TMIN")
tmin.pull$tmin <- tmin.pull$tmin / 10 # convert to dec. C
statns2 <- split(statns, statns$"STATION")
colnames(prcp.pull)[1] <- "STATION"
colnames(tmin.pull)[1] <- "STATION"
colnames(tmax.pull)[1] <- "STATION"
prcpA <- rbind.fill(statns2, prcp.pull)
tminA <- rbind.fill(statns2, tmin.pull)
tmaxA <- rbind.fill(statns2, tmax.pull)
prcpB <- cbind.fill(statns2, prcpA)
tminB <- cbind.fill(statns2, tminA)
tmaxB <- cbind.fill(statns2, tmaxA)
tminC <- merge(tminB, statns2, by.x = 2, by.y = 2)
tmaxC <- merge(tmaxB, statns2, by.x = 2, by.y = 2)
prcpC <- merge(prcpB, statns2, by.x = 2, by.y = 2)
colnames(tminC)[2] <- "A"
colnames(tminC)[3] <- "B"
colnames(tminC)[4] <- "C"
colnames(tminC)[5] <- "D"
colnames(tminC)[6] <- "E"
colnames(tminC)[7] <- "G"
tminD = subset(tminC, select = -c(A, B, C, D, E, G ))
colnames(tmaxC)[2] <- "A"
colnames(tmaxC)[3] <- "B"
colnames(tmaxC)[4] <- "C"
colnames(tmaxC)[5] <- "D"
colnames(tmaxC)[6] <- "E"
colnames(tmaxC)[7] <- "G"
tmaxD = subset(tmaxC, select = -c(A, B, C, D, E, G ))
colnames(prcpC)[2] <- "A"
colnames(prcpC)[3] <- "B"
colnames(prcpC)[4] <- "C"
colnames(prcpC)[5] <- "D"
colnames(prcpC)[6] <- "E"
colnames(prcpC)[7] <- "G"
prcpD = subset(prcpC, select = -c(A, B, C, D, E, G ))
# save output as text file
fout <- paste("filepathway", c("prcpD", "tmaxD", "tminD"), statns$STATION[ id1 ],
".csv", sep = "")
# 1. precip
if (nrow(prcpD) > 0) {
write.table(file = fout[1], x = prcpD, col.names = T, row.names = T, append = F, sep = ",", quote = F)
}
# 2. tmax
if (nrow(tmaxD) > 0) {
write.table(file = fout[2], x = tmaxD, col.names = T, row.names = T, append = F, sep = ",", quote = F)
}
# 3. tmin
if (nrow(tminD) > 0) {
write.table(file = fout[3], x = tminD, col.names = T, row.names = T, append = F, sep = ",", quote = F)
}
}
}
After this line:statns2 <- split(statns, statns$"STATION") I get a list of data frames and would like to run the loop through each of these individual data frames - as in, when id1 (a number from 1 to 13926) matches FID + 1 (FID starts at 0 and goes to the end of the list), such that the commands after the split are run through the list one at a time making sure to match the info between my precipitation, temperature data, and weather station
Without the split into the list of data frames, it just gives each weather station one row of data whereas I'd like a row identifying the station then one for every date from start to end.
update: I made a small subset of my list and used head(statns.small) and have pasted my results below
$`CA003030525`
FID STATION LAT LON ELEV NAME CODE
1 0 CA003030525 49.8 -112.3 824 AB BARNWELL AGDM 71346
$CA003030720
FID STATION LAT LON ELEV NAME CODE
2 1 CA003030720 49.5667 -113.05 980 AB BLOOD TRIBE AGDM 71517
$CA003030768
FID STATION LAT LON ELEV NAME CODE
3 2 CA003030768 49.7333 -111.45 817 AB BOW ISLAND 71231

Related

Extract values from list of named lists in R

Based on the names of sublists with xyz values of a list, I would like to extract a sample of the xyz values from a sublist. Note: the lists do not start at 1.
Example data
set.seed(123)
data <- list('4' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'5' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'6' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'7' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'8' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)))
Function to extract random values (derived from here)
I have the following function to sample random xyz values from the list:
get_elements <- function(data, i) {
#select the main list
tmp <- data[[i]]
#Check the length of each sublist, select minimum value
#and sample 1 number from 1 to that number
rand_int <- sample(min(lengths(tmp)), 1)
#select that element from each sub-list
sapply(tmp, `[[`, rand_int)
}
Example of function
# Show list number 8
data[['8']]
#> $x
#> [1] 0.3796395 -0.5023235 -0.3332074 -1.0185754 -1.0717912
#> $y
#> [1] 0.30352864 0.44820978 0.05300423 0.92226747 2.05008469
#> $z
#> [1] -0.4910312 -2.3091689 1.0057385 -0.7092008 -0.6880086
# Extract random combination from list 8
get_elements(data, '8')
#> x y z
#> -0.33320738 0.05300423 1.00573852
Rewrite function
Using the same function as above, I replaced i with 'i':
get_elements <- function(data, i) {
tmp <- data[['i']] # <-- changed i to 'i'
rand_int <- sample(min(lengths(tmp)), 1)
sapply(tmp, `[[`, rand_int)
}
Error & Question
get_elements(data, 8)
Warning in min(lengths(tmp)): no non-missing arguments to min;
returning Inf list()
Suddenly the function breaks, and I do not understand why? What is the reason for this error?
Try this:
get_elements <- function(data, i)
{
tmp <- data[[paste(i)]]
rand_int <- sample(min(lengths(tmp)), 1)
sapply(tmp, `[[`, rand_int)
}
The reason your initial code failed is because "i" is understood as "character i", not as "convert variable i into character". See:
i = 1
print("i") # i
print(i) # 1

Demography package issue with aggregating data

# Function to construct a mortality demogdata object from HMD
hmd.mx <- function(country, username, password, label=country){
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Mx_1x1.txt", sep = "")
}
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
mx <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(mx)=="try-error")
stop("Connection error at www.mortality.org. Please check username, password and country label.")
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Exposures_1x1.txt", sep = "")
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
pop <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(pop)=="try-error")
stop("Exposures file not found at www.mortality.org")
obj <- list(type="mortality",label=label,lambda=0)
obj$year <- sort(unique(mx[, 1]))
#obj$year <- ts(obj$year, start=min(obj$year))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
obj$rate <- obj$pop <- list()
for (i in 1:n.mort)
{ obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
obj$rate[[i]][obj$rate[[i]] < 0] <- NA
obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
obj$pop[[i]][obj$pop[[i]] < 0] <- NA
dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age, obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m])) {
obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2] }
return(structure(obj, class = "demogdata"))
}
Above is the code that we are using to import our population data into r.
NLdata <- hmd.mx(country = "NLD",username = "username",password="password")
This would be the specific code to obtain the Dutch data.
Would anyone happen to know how to add multiple countries into one, and put that data into one dataframe (same format as the demography data packages that we download)? So for example the mortality rates for the (Netherlands + France + Norway) / 3 into one package.
You can try this code. However I could not run your demography package. So you might need to edit the code a bit. Perhaps someone else can fill in the second part? I saw that no one has reacted yet.
C1 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5), Country = "France")
C2 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5),Country = "England")
C3 <- data.frame(Year = 1970:2018, value1 = rnorm(49), value2 = rnorm(49), Cat =rbinom(49,1,0.5),Country = "Netherlands")
C1 <- split(C1, C1$Cat)
C2 <- split(C2, C2$Cat)
C3 <- split(C3, C3$Cat)
list_all <- list(rbind(C1[[1]],C2[[1]],C3[[1]]),rbind(C1[[2]],C2[[2]],C3[[2]]))
Final_list <- lapply(list_all, function(x) x %>% group_by(Year) %>% summarise(Val1 = mean(value1), Val2 = mean(value2), Country = "All") %>% as.data.frame)

Optimize the for loop in R

DUMMY DATA SET: (difference from my data set is item_code is string in my case)
in_cluster <- data.frame(item_code = c(1:500))
in_cluster$cluster <-
sample(5, size = nrow(in_cluster), replace = TRUE)
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0))
real_sales <-
data.frame(
item_code = sample(500, size = 100000, replace = TRUE),
sales = sample(500, size = 100000, replace = TRUE)
)
mean_trajectory <- data.frame(sales = c(1:52))
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE)
training_df <- data.frame(
LTF_t_minus_1 = numeric(0),
LTF_t = numeric(0),
LTF_t_plus_1 = numeric(0),
RS_t_minus_1 = numeric(0),
RS_t = numeric(0),
STF_t_plus_1 = numeric(0)
)
training_df[nrow(training_df) + 1, ] <-
c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0
week = 2
I have a simple function in R in which all I do is:
system.time({
for (r in 1:nrow(in_cluster)) {
item <- in_cluster[r,]
sale_row <-
dplyr::filter(real_sales, item_code == item$item_code)
if (nrow(sale_row) > 2) {
new_df <- data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
RS_t_minus_1 = sale_row$sales[[week - 1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week + 1]]
)
training_df <-
bind_rows(training_df, new_df)
}
}
})
I am quite new to R and found this really weird looking at how small the data really is yet how long (421.59 seconds to loop through 500 rows) it is taking to loop through the data frame.
EDIT_IMPORTANT: However for above given dummy data set all it took was 1.10 seconds to get the output > could this be because of having string for item_code? does it take that much time to process a string item_code. (I didn't use string for dummy data sets because I do not know how to have 500 unique strings for item_code in in_cluster, and have the same strings in real_sales as item_code)
I read through few other articles which suggested ways to optimize the R code and used bind_rows instead of rbind or using:
training_df[nrow(training_df) + 1,] <-
c(mean_trajectory$sales[[week-1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week+1]], sale_row$sales[[week-1]], sale_row$sales[[week]], sale_row$sales[[week+1]])
using bind_rows seems to have improved the performance by 36 seconds when looping through 500 rows of data frame in_cluster
Is it possible to use lapply in this scenario? I tried code below and got an error:
Error in filter_impl(.data, dots) : $ operator is invalid for
atomic vectors
myfun <- function(item, sales, mean_trajectory, week) {
sale_row<- filter(sales, item_code == item$item_code)
data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week-1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week+1]],
RS_t_minus_1 = sale_row$sales[[week-1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week+1]])
}
system.time({
lapply(in_cluster, myfun, sales= sales, mean_trajectory = mean_trajectory) %>% bind_rows()
})
Help with lapply would be appreciated, however my main target is to speed up the loop.
Ok, so there a lot of bad practices in your code.
You are operating per row
You are creating 2(!) new data frames per row (very expensive)
You are growing objects in a loop )that training_df <- bind_rows(training_df, new_df) keeps growing in each iteration while running a pretty expensive operation (bind_rows))
You are running the same operation over and over again when you could just run them once (why are you running mean_trajectory$sales[[week-1]] and al per row while mean_trajectory has nothing to do with the loop? You could just assign it afterwards).
And the list goes on...
I would suggest an alternative simple data.table solution which will perform much better. The idea is to first make a binary join between in_cluster and real_sales (and run all the operations while joining without creating extra data frames and then binding them). Then, run all the mean_trajectoryrelated lines only once. (I ignored the training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) initialization as it's irrelevant here and you can just add it afterwards using and rbind)
library(data.table) #v1.10.4
## First step
res <-
setDT(real_sales)[setDT(in_cluster), # binary join
if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do
RS_t = sales[week], # by condition
STF_t_plus_1 = sales[week + 1]),
on = "item_code", # The join key
by = .EACHI] # Do the operations per each join
## Second step (run the `mean_trajectory` only once)
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
LTF_t = mean_trajectory$sales[week],
LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
Some benchmarks:
### Creating your data sets
set.seed(123)
N <- 1e5
N2 <- 5e7
in_cluster <- data.frame(item_code = c(1:N))
real_sales <-
data.frame(
item_code = sample(N, size = N2, replace = TRUE),
sales = sample(N, size = N2, replace = TRUE)
)
mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE))
training_df <- data.frame(
LTF_t_minus_1 = numeric(0),
LTF_t = numeric(0),
LTF_t_plus_1 = numeric(0),
RS_t_minus_1 = numeric(0),
RS_t = numeric(0),
STF_t_plus_1 = numeric(0)
)
week = 2
###############################
################# Your solution
system.time({
for (r in 1:nrow(in_cluster)) {
item <- in_cluster[r,, drop = FALSE]
sale_row <-
dplyr::filter(real_sales, item_code == item$item_code)
if (nrow(sale_row) > 2) {
new_df <- data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
RS_t_minus_1 = sale_row$sales[[week - 1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week + 1]]
)
training_df <-
bind_rows(training_df, new_df)
}
}
})
### Ran forever- I've killed it after half an hour
######################
########## My solution
library(data.table)
system.time({
res <-
setDT(real_sales)[setDT(in_cluster),
if(.N > 2) .(RS_t_minus_1 = sales[week - 1],
RS_t = sales[week],
STF_t_plus_1 = sales[week + 1]),
on = "item_code",
by = .EACHI]
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
LTF_t = mean_trajectory$sales[week],
LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
})
# user system elapsed
# 2.42 0.05 2.47
So for 50MM rows the data.table solution ran for about 2 secs, while your solution ran endlessly until I've killed it (after half an hour).

Apply a function based on column name in data.tables R

I'm looking to apply a user define function based on the name given to a column
dt <- data.table(gr_id = 1, id = seq(1,10),min_c = runif(10,10,30),
ml_c = runif(10,30,50),mx_c = runif(10,50,100),
min_t = runif(10,10,20),ml_t = runif(10,20,25),
mx_t = runif(10,25,30))
I would like to apply a function which calculates (min(min)+min(ml))/mx for both "c" columns and "t" columns. Currently, I've done as follows. However, becomes hard when I want to add more columns (lets say, "a")
dt[,{
temp1 = min(min_c)
temp2 = min(ml_c)
temp3 = min(mx_c)
score_c = (temp1+temp2)/temp3
temp4 = min(min_t)
temp5 = min(ml_t)
temp6 = min(mx_t)
score_t = (temp4+temp5)/temp6
list(score_c = score_c,
score_t = score_t)
},by = gr_id
]
I think this will work. the basic idea is using get.
# the original code could be simplified to:
dt[, .(
score_c = (min(min_c) + min(ml_c)) / min(mx_c),
score_t = (min(min_t) + min(ml_t)) / min(mx_t)
), by = gr_id]
#
# gr_id score_c score_t
# 1: 1 0.9051556 1.28054
# using `get`
cols <- c('c', 't')
dt[, {
res <- lapply(cols, function(i){
vars <- paste(c('min', 'ml', 'mx'), i, sep = '_')
(min(get(vars[1])) + min(get(vars[2]))) / min(get(vars[3]))
})
names(res) <- paste('score', cols, sep = '_')
res
}, by = gr_id]
# gr_id score_c score_t
# 1: 1 0.9051556 1.28054

Apply loop in automated forecast

I am trying to forecast individual variables from a data.frame in long format. I get stuck in the loop [apply] part. The question is: how can I replace the manual forecasting with an apply?
library(forecast)
library(data.table)
# get time series
www = "http://staff.elena.aut.ac.nz/Paul-Cowpertwait/ts/cbe.dat"
cbe = read.table(www, header = T)
# in this case, there is a data.frame in long format to start with
df = data.table(cbe[, 2:3])
df[, year := 1958:1990]
dfm = melt(df, id.var = "year", variable.name = "indicator", variable.factor = F) # will give warning because beer = num and others are int
dfm[, site := "A"]
dfm2= copy(dfm) # make duplicate to simulate other site
dfm2[, site := "B"]
dfm = rbind(dfm, dfm2)
# function to make time series & forecast
f.forecast = function(df, mysite, myindicator, forecast.length = 6, frequency = freq) {
# get site and indicator
x = df[site == mysite & indicator == myindicator,]
# convert to time series
start.date = min(x$year)
myts = ts(x$value, frequency = freq, start = start.date)
# forecast
myfc = forecast(myts, h = forecast.length, fan = F, robust = T)
plot(myfc, main = paste(mysite, myindicator, sep = " / "))
grid()
return(myfc)
}
# the manual solution
par(mfrow = c(2,1))
f1 = f.forecast(dfm, mysite = "A", myindicator = "beer", forecast.length = 6, freq = 12)
f2 = f.forecast(dfm, mysite = "A", myindicator = "elec", forecast.length = 6, freq = 12)
# how to loop? [in the actual data set there are many variables per site]
par(mfrow = c(2,1))
myindicators = unique(dfm$indicator)
sapply(myindicator, f.forecast(dfm, "A", myindicator = myindicators, forecast.length = 6, freq = 12)) # does not work
I'd suggest using split and dropping the second and third argument of f.forecast. You directly pass the subset of the data.frame you want to forecast. For instance:
f.forecast = function(x, forecast.length = 6, frequency = freq) {
#comment the first line
#x = df[site == mysite & indicator == myindicator,]
#here goes the rest of the body
#modify the plot line
plot(myfc, main = paste(x$site[1], x$indicator[1], sep = " / "))
}
Now you split the entire df and call f.forecast for each subset:
dflist<-split(df,df[,c("site","indicator")],drop=TRUE)
lapply(dflist,f.forecast)

Resources