Join data frames faster - r

I'm currently merging 12 different data frames that are each 480,00 obs by an id and adding the columns, so it becomes a 48k obs x 14 variable data frame. However, this is taking too long to process and I'm looking for a faster way to do this.
Example
dput:
# January data
jan <- structure(list(gridNumber = c("17578", "18982", "18983", "18984",
"18985"), PRISM_ppt_stable_4kmM2_193301_bil = c(35.7099990844727,
36, 35.4199981689453, 33.7299995422363, 33.2799987792969)), .Names = c("gridNumber",
"PRISM_ppt_stable_4kmM2_193301_bil"), row.names = c("17578",
"18982", "18983", "18984", "18985"), class = "data.frame")
# February data
feb <- structure(list(gridNumber = c("17578", "18982", "18983", "18984",
"18985"), PRISM_ppt_stable_4kmM2_193302_bil = c(14.6199998855591,
14.5600004196167, 14.9899997711182, 15.4700002670288, 15.5799999237061
)), .Names = c("gridNumber", "PRISM_ppt_stable_4kmM2_193302_bil"
), row.names = c("17578", "18982", "18983", "18984", "18985"), class = "data.frame")
# March Data
mar <- structure(list(gridNumber = c("17578", "18982", "18983", "18984",
"18985"), PRISM_ppt_stable_4kmM2_193303_bil = c(23.8400001525879,
23.9200000762939, 24.3400001525879, 25.7900009155273, 26.5900001525879
)), .Names = c("gridNumber", "PRISM_ppt_stable_4kmM2_193303_bil"
), row.names = c("17578", "18982", "18983", "18984", "18985"), class = "data.frame")
dplyr Code:
library(dplyr)
datalist <- list(jan, feb, mar)
full <- Reduce(function(x,y) {full_join(x,y, by = "gridNumber")}, datalist)
This code obviously runs much faster because of the low obs, but is there a faster way to do this?

Here is an approach using data.table and reshape2
library(data.table)
library(reshape2)
# create a list of data frames, and coerce to data.tables
month_list <- lapply(list(jan,feb,mar),setDT)
# add id column with old variable name and rename value column
for(i in seq_along(month_list)){
set(month_list[[i]],j="ID",value = names(month_list[[i]])[2])
setnames(month_list[[i]], names(month_list[[i]])[2], "value")
}
# put in long form
long_data <- rbindlist(month_list)
# then use `dcast.data.table` to make wide
wide <- dcast.data.table(long_data, gridNumber~ID, value = 'value')

Dunno if this will be faster, but:
list(jan = jan %>% rename(PRISM = PRISM_ppt_stable_4kmM2_193301_bil),
feb = feb %>% rename(PRISM = PRISM_ppt_stable_4kmM2_193302_bil),
mar = mar %>% rename(PRISM = PRISM_ppt_stable_4kmM2_193303_bil)) %>%
bind_rows(.id = "month") %>%
spread(month, PRISM)

Related

Transform list of dataframes to list of time-series

The package IMFdata returns a list of dataframes.
For example:
library(IMFData)
databaseID <- "IFS"
startdate = "2019-01-01"
enddate = "2019-03-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_IFS = c("AU", "BR"), CL_INDICATOR_IFS = "FIDR_PA")
IFS_ex <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery)
This code creates the list of dataframes IFS_ex:
structure(list(`#FREQ` = c("M", "M"), `#REF_AREA` = c("AU", "BR"
), `#INDICATOR` = c("FIDR_PA", "FIDR_PA"), `#UNIT_MULT` = c("0",
"0"), `#TIME_FORMAT` = c("P1M", "P1M"), Obs = list(structure(list(
`#TIME_PERIOD` = c("2019-01", "2019-02", "2019-03"), `#OBS_VALUE` = c("1.95",
"1.95", "1.9")), class = "data.frame", row.names = c(NA,
3L)), structure(list(`#TIME_PERIOD` = c("2019-01", "2019-02",
"2019-03"), `#OBS_VALUE` = c("6.41275285614977", "5.70499999999921",
"6.18810104544945")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1:2, class = "data.frame")
I would like to transform this list of dataframes in a list of time-series (list_ts). This is the expected output:
list_ts <- list(AU = structure(c(1.95, 1.95, 1.90), .Tsp = c(2019, 2019.16666666667, 12), class = "ts"), BR = structure(c(6.41275285614977,
5.70499999999921, 6.18810104544945), .Tsp = c(2019, 2019.16666666667,
12), class = "ts"))
Try running it with tidy = TRUE. You can then split the dataframe and apply over the new lists. It is possible to use the original format you showed, but it would be a lot more work.
library(IMFData)
library(zoo)
databaseID <- "IFS"
startdate = "2019-01-01"
enddate = "2019-03-01"
checkquery = FALSE
IFS_ex <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery,
tidy = TRUE)
lst_df <- split(IFS_ex, IFS_ex$`#REF_AREA`)
list_ts <- lapply(lst_df, function(x) ts(zoo(x$`#OBS_VALUE`, x$`#TIME_PERIOD`), start = c(2019, 1), frequency = 12))
# and to get rid of an attribute you do not want
list_ts <- lapply(list_ts, function(x) {attr(x, "index") <- NULL; x})
list_ts
# $AU
# Jan Feb Mar
# 2014 1.95 1.95 1.9
#
# $BR
# Jan Feb Mar
# 2014 6.41275285614977 5.70499999999921 6.18810104544945

How to plot layers of tupples on same plot in R?

I am trying to plot the time and NDVI for each region on the same plot. I think to do this I have to convert the date column from characters to time and then plot each layer. However I cannot figure out how to do this. Any thoughts?
list(structure(list(observation = 1L, HRpcode = NA_character_,
timeseries = NA_character_), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3793765496776215", "0.21686891782421552", "0.3785652933528299",
"0.41027240624704164", "0.4035578030242673", "0.341299793064468"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.4071076986818826", "0.09090719657570319", "0.35214166081795284",
"0.4444311032927228", "0.5220702877666005", "0.5732370503295022"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3412131556625801", "0.18815996897460135", "0.5218904976415136",
"0.6970128777711452", "0.7229657162729096", "0.535967435470161"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)))
111
First we need to clean your data. The first element in this list is empty
df = df[-1]
Now we need to make a data.frame
df = do.call(rbind, df)
I am going to add a region variable, change the name of NDVI to remove the space,
change ndvi into a numeric vector, and change time into a Date object
library(dplyr)
df = df %>%
mutate(region = factor(rep(1:3, rep(6, 3)))) %>%
rename(ndvi = ' NDVI') %>%
mutate(ndvi = as.numeric(ndvi)) %>%
mutate(time = as.Date(time))
Now we can use ggplot2 to plot the data by region
library(ggplot2)
g = df %>%
ggplot(aes(x = time, y = ndvi, col = region)) +
geom_line()
g
Which gives this plot:
Here's an approach with lubridate to handle dates and dplyr to make the binding of the data.frames easier to understand.
Note that the group names are taken from the names of the list, and since those don't exist in the data you provided, we have to set them in advance.
library(lubridate)
library(ggplot2)
library(dplyr)
names(data) <- 1:3
data <- bind_rows(data, .id = "group")
data$time <- ymd(data$time)
setnames(data," NDVI","NDVI")
data$NDVI <- as.numeric(data$NDVI)
ggplot(data, aes(x=time,y=NDVI,color=Group)) + geom_line()

iterate over certain elements of a list, not a data.frame

I am trying to modify certain items from a list based on a criteria (starts with "rr_esp") in the render.data list.
library(tidyr)
library(dplyr)
library(purrr)
per <- 2015:2019
render.data <- list(
emision = structure(
list(
AÑO = c(2017, 2018, 2019),
TRABAJADORESMES_r = c(58147, 57937, 24818),
MASA_r = c(3439195127, 4091347036.2, 2441068565.77),
TRABAJADORESMESsinDOM = c(58147L, 57928L, 24818L),
MESES = c(12, 12, 5)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -3L)
),
siniestros = structure(
list(
AÑO = c(2017, 2018, 2019),
N = c(388L, 327L, 115L),
GR_66 = c(64, 53, 15),
JU = c(41L, 5L, 0L),
JN = c(20, 19, 6),
PORINC_66s = c(437.22, 293.73, 82.12),
EDADs = c(15142L, 12886L, 4712L),
SALARIOs = c(13707950.67, 15151144.7, 4800075.4)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -3L)
),
rr_esp1 = structure(
list(
AÑO = c(2017, 2018, 2019),
MESES = c(12, 12, 5),
TRAB_PROM = c(4845.58, 4828.08, 4963.60),
PORINC = c(6.83, 5.54, 5.47),
SALARIO = c(35329.76, 46333.77, 41739.78),
EDAD = c(39.02, 39.40, 40.97)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -3L)
),
rr_esp7 = structure(
list(
AÑO = c(2017, 2018, 2019),
JUI_LIQ = c(1539624.21, 318726, 0),
JUI_RVA = c(24434809.51, 2292925.89, 0),
JUI_IBNR = c(0, 25284030.0174036, 22434092.26),
JUI_ULT = c(25974433.72, 27895681.90, 22434092.26),
CM_JUICIO = c(1505898.34, 1806002.14, 1557923.07)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -3L)
)
)
When apply a loop over their elements, they loses their original itemnames
Afterwards, I dont know a better way to iterate a subset of list elements and assign them a new value. I google it but I do not find a critical solution for list instead of data.frames.
render.data <- invisible(lapply(seq_along(render.data), function(i){
if(startsWith(names(render.data)[i], prefix = "rr_esp")){
render.data[[i]] %>%
complete(`AÑO` = per) %>%
gather(
key = "metrica", value = "valor", -`AÑO`
) %>%
mutate(# orden de las metricas
metrica = factor(metrica, levels = unique(metrica))
) %>%
spread(
key = `AÑO`, value = "valor"
)} else{
render.data[[i]]
}
setNames(render.data[[i]], names(render.data)[i])
}))
This seems like a case where a for loop is much clearer than an lapply. The main advantages of lapply are (a) that it pre-allocates a data structure for the result and (b) has simple syntax to apply a simple function. You already have a data structure for the result, and your function is complex. I don't know what your expected output is, but I would try this:
# find elements to modify
rr_elements = which(startsWith(names(render.data), prefix = "rr_esp"))
# modify in for loop
for (i in rr_elements) {
render.data[[i]] = render.data[[i]] %>%
complete(`AÑO` = per) %>%
gather(key = "metrica", value = "valor",-`AÑO`) %>%
mutate(# orden de las metricas
metrica = factor(metrica, levels = unique(metrica))) %>%
spread(key = `AÑO`, value = "valor")
}
If you want to make this code more re-usable, create a function for the operation on one data frame, and then you can use it easily with for or lapply. In general, I'd say that picking the data frames on which to use the function is better done externally than internally. (That is, I don't like how you have an if() statement checking the name inside the function. Do this logic outside the function, and only give the function the data you want it to use.)
foo = function(data) {
data %>%
complete(`AÑO` = per) %>%
gather(key = "metrica", value = "valor",-`AÑO`) %>%
mutate(# orden de las metricas
metrica = factor(metrica, levels = unique(metrica))) %>%
spread(key = `AÑO`, value = "valor")
}
# now the for loop or lapply is simple:
rr_elements = which(startsWith(names(render.data), prefix = "rr_esp"))
# for loop version
for (i in rr_elements) {
render.data[[i]] = foo(render.data[[i]])
}
# lapply version
render.data[rr_elements] = lapply(render.data[rr_elements], foo)

Automatically split function output (list) into component data.frames

I have a functions which yields 2 dataframes. As functions can only return one object, I combined these dataframes as a list. However, I need to work with both dataframes separately. Is there a way to automatically split the list into the component dataframes, or to write the function in a way that both objects are returned separately?
The function:
install.packages("plyr")
require(plyr)
fun.docmerge <- function(x, y, z, crit, typ, doc = checkmerge) {
mergedat <- paste(deparse(substitute(x)), "+",
deparse(substitute(y)), "=", z)
countdat <- nrow(x)
check_t1 <- data.frame(mergedat, countdat)
z1 <- join(x, y, by = crit, type = typ)
countdat <- nrow(z1)
check_t2 <- data.frame(mergedat, countdat)
doc <- rbind(doc, check_t1, check_t2)
t1<-list()
t1[["checkmerge"]]<-doc
t1[[z]]<-z1
return(t1)
}
This is the call to the function, saving the result list to the new object results.
results <- fun.docmerge(x = df1, y = df2, z = "df3", crit = c("id"), typ = "left")
In the following sample data to replicate the problem:
df1 <- structure(list(id = c("XXX1", "XXX2", "XXX3",
"XXX4"), tr.isincode = c("ISIN1", "ISIN2",
"ISIN3", "ISIN4")), .Names = c("id", "isin"
), row.names = c(NA, 4L), class = "data.frame")
df2 <- structure(list(id= c("XXX1", "XXX5"), wrong= c(1L,
1L)), .Names = c("id", "wrong"), row.names = 1:2, class = "data.frame")
checkmerge <- structure(list(mergedat = structure(integer(0), .Label = character(0), class = "factor"),
countdat = numeric(0)), .Names = c("mergedat", "countdat"
), row.names = integer(0), class = "data.frame")
In the example, a list with the dataframes df3 and checkmerge are returned. I would need both dataframes separately. I know that I could do it via manual assignment (e.g., checkmerge <- results$checkmerge) but I want to eliminate manual changes as much as possible and am therefore looking for an automated way.

R: drop columns from tibbles inside a function

This is a followthrough of this topic. Here are my 3 tibbles:
dftest_tw <- structure(list(text = c("RT #BitMEXdotcom: A new high: US$500M turnover in the last 24 hours, over 80% of it on $XBTUSD. Congrats to the team and thank you to our u…",
"RT #Crowd_indicator: Thank you for this nice video, #Nicholas_Merten",
"RT #Crowd_indicator: Review of #Cindicator by DataDash: t.co/D0da3u5y3V"
), Tweet.id = c("896858423521837057", "896858275689398272", "896858135314538497"
), created.date = structure(c(17391, 17391, 17391), class = "Date"),
created.week = c(33, 33, 33)), .Names = c("text", "Tweet.id",
"created.date", "created.week"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
dftest1_tw <- dftest_tw
dftest2_tw <- dftest_tw
myUserList <- ls(,pattern = "_tw")
Following yesterday topic, I have the wanted result when running this:
library(tidyverse)
lst <- mget(myUserList) %>%
map2(myUserList, ~mutate(.data = .x, Twitter.name = .y)) %>%
list2env(lst, envir = .GlobalEnv)
I need to drop a few columns for each df. This do the job when running on one df:
select_(dftest_tw, quote(-text), quote(-Tweet.id), quote(-created.date))
It seems like I have a serious probelm when it comes to apply code to each member of a list. I can't find a way to apply it to all df when using lapply, or writing a function:
MySelect <- function(x){
select_(x, quote(-text), quote(-Tweet.id), quote(-created.date))
x
}
for(var in myUserList){MySelect(get(var))}
Thank you for your help.

Resources