Add multiple columns to multiple data tables (frames) at the same time in R

I have a couple of tables with identical column names but different row counts.
Some of the columns are Dates, and I need to extract the year, month, and day from them, both as factors and as numeric values, and create those new columns within every table.
What I need is a function that does this faster than adding the columns manually as in the example below.
StartDate <- seq(as.Date("2014-01-19"), length = 20, by ="days")
EndDate <- seq(as.Date("2015-01-19"), length = 20, by ="days")
dt1 <- data.frame(x = sample(1:20, 20, TRUE), y = sample(1:20, 20, TRUE), StartDate, EndDate )
setDT(dt1)
StartDate <- seq(as.Date("2014-01-19"), length = 25, by ="days")
EndDate <- seq(as.Date("2015-01-19"), length = 25, by ="days")
dt2 <- data.frame(x = sample(1:40, 25, TRUE), y = sample(1:40, 25, TRUE), StartDate, EndDate )
setDT(dt2)
StartDate <- seq(as.Date("2014-01-19"), length = 30, by ="days")
EndDate <- seq(as.Date("2015-01-19"), length = 30, by ="days")
dt3 <- data.frame(x = sample(1:40, 30, TRUE), y = sample(1:40, 30, TRUE), StartDate, EndDate )
setDT(dt3)
My manual solution:
dt1[, year := as.factor(year(EndDate)), by = year(EndDate)]
dt1[, year_num := year(EndDate), by = year(EndDate)]
dt1[, months := as.factor(month(EndDate)), by = month(EndDate)]
dt1[, months_num := month(EndDate), by = month(EndDate)]
dt1[, days := x]
dt1[, weekday := weekdays(EndDate), by = weekdays(EndDate)]
dt2 .....
dt3 .....
I learned that I should get all the datasets into a list with mget and then use lapply to loop over the list elements, but I got some errors and am not sure how to make it work.
Any help on this?
Thank you

You are on the right track: you need to get all the data tables into a list and iterate over it with lapply. However, I don't think the operations you want to perform need to be grouped by year, month, or weekday; you can perform them directly on each data table.
library(data.table)
list_data <- mget(paste0('dt', 1:3))
list_data <- lapply(list_data, function(dt) {
  # inside j, bare names (EndDate, x) refer to columns of dt
  dt[, c('year', 'year_num', 'months', 'months_num', 'days', 'weekday') :=
       list(as.factor(year(EndDate)), year(EndDate),
            as.factor(month(EndDate)), month(EndDate),
            x, weekdays(EndDate))]
})
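Two things worth knowing here: := modifies each data.table by reference, and mget() does not copy, so dt1, dt2 and dt3 are updated in place; the assignment back to list_data is optional. If you ever build the list from copies instead, you can write the results back with list2env():
list2env(list_data, envir = .GlobalEnv)  # only needed when list_data holds copies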

Related

Calculating the mean of the absolute value of all numerical columns

I want to calculate the mean of the absolute value of all numerical columns for the example dataset DT:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50, 50),  # creates a panel ID
                 Country = c(rep("Albania", 30), rep("Belarus", 50), rep("Chilipepper", 20)),
                 some_NA = sample(0:5, 100, replace = TRUE),  # full length: a length-6 vector does not recycle into 100 rows
                 some_NA_factor = sample(0:5, 100, replace = TRUE),
                 Group = c(rep(1, 20), rep(2, 20), rep(3, 20), rep(4, 20), rep(5, 20)),
                 Time = rep(seq(as.Date("2010-01-03"), length = 20, by = "1 month") - 1, 5),
                 norm = round(runif(100) / 10, 2),
                 Income = round(rnorm(10, -5, 5), 2),
                 Happiness = sample(10, 10),
                 Sex = round(rnorm(10, 0.75, 0.3), 2),
                 Age = sample(100, 100),
                 Educ = round(rnorm(10, 0.75, 0.3), 2))
DT[, uniqueID := .I]  # creates a unique ID
DT[DT == 0] <- NA     # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
I tried to calculate the means and the absolute means as follows:
mean_of_differences <- DT[,lapply(Filter(is.numeric,.SD),mean, na.rm=TRUE)]
mean_of_differences <- as.data.frame(t(mean_of_differences))
mean_of_differences <- round(mean_of_differences, digits=2)
mean_of_absolute_diff <- DT[,lapply(Filter(is.numeric,.SD),function(x) mean(abs(x),na.rm=TRUE))]
mean_of_absolute_diff <- as.data.frame(t(mean_of_absolute_diff))
mean_of_absolute_diff <- round(mean_of_differences, digits=2)
The mean of Income for the absolute differences is, however, negative (just as it is for the normal mean), which obviously is not possible. Looking at my code, I don't see what I am doing wrong. What am I overlooking?
Here is a solution using data.table. It (i) identifies numeric columns and (ii) obtains the mean of the absolute value of each numeric column.
Data
library(data.table)
dt = data.table(
  num1 = rnorm(100),
  num2 = rnorm(100),
  strv = sample(LETTERS, 100, replace = TRUE)
)
Code
numcols = colnames(dt)[unlist(lapply(dt, is.numeric))]  # which columns are numeric?
# > numcols
# [1] "num1" "num2"
meandt = dt[, lapply(.SD, function(x) mean(abs(x))), .SDcols = numcols]
newcols = paste('mean_abs_', numcols, sep = '')
colnames(meandt) = newcols
# > meandt
#    mean_abs_num1 mean_abs_num2
# 1:     0.8287523     0.8325123
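As for the original code: the final line rounds mean_of_differences a second time instead of mean_of_absolute_diff, so the absolute means were computed correctly but never kept; that is where the negative Income value comes from. Applied to the OP's DT, which contains NAs, the same pattern needs na.rm = TRUE. A minimal sketch, assuming data.table >= 1.12.0, where .SDcols accepts a predicate function:
# mean of |x| for every numeric column, ignoring NAs
DT[, lapply(.SD, function(x) mean(abs(x), na.rm = TRUE)), .SDcols = is.numeric]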

Bin data within a group using breaks from another DF

How can I avoid the for loop in the following code to speed up the computation? (The real data is about 1e6 times larger.)
id = rep(1:5, 20)
v = 1:100
df = data.frame(groupid = id, value = v)
df = dplyr::arrange(df, groupid)
bkt = rep(seq(0, 100, length.out = 4), 5)
id = rep(1:5, each = 4)
bktpts = data.frame(groupid = id, value = bkt)
for (i in 1:5) {
  df[df$groupid == i, "bin"] = cut(df[df$groupid == i, "value"],
                                   bktpts[bktpts$groupid == i, "value"],
                                   include.lowest = TRUE, labels = FALSE)
}
I'm not sure why your bktpts is formatted the way it is, but here is a data.table solution that should be (at least a bit) faster than your for loop.
library(data.table)
setDT(df)[setDT(bktpts)[, `:=`(id = seq_len(.N),
                               value_next = shift(value, type = "lead", fill = 99999999)),
                        by = .(groupid)],
          bin := i.id,
          on = .(groupid, value >= value, value < value_next)][]
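For intuition, the inner expression turns each group of bktpts into labelled half-open intervals; for group 1 it looks roughly like this (values rounded for display, and fill = 99999999 is just a sentinel larger than any real value):
setDT(bktpts)[, `:=`(id = seq_len(.N),
                     value_next = shift(value, type = "lead", fill = 99999999)),
              by = .(groupid)][groupid == 1]
#    groupid  value id value_next
#          1   0.00  1      33.33
#          1  33.33  2      66.67
#          1  66.67  3     100.00
#          1 100.00  4   99999999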
Another way:
library(data.table)
setDT(df); setDT(bktpts)
bktpts[, b := rowid(groupid) - 1L]
df[, b := bktpts[copy(.SD), on=.(groupid, value), roll = -Inf, x.b]]
# check result
df[, any(b != bin)]
# [1] FALSE
See ?data.table for how rolling joins work.
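In short, roll = -Inf performs a "next observation carried backward" join: a value with no exact match rolls to the next larger key. A minimal sketch on toy data (names made up for illustration):
library(data.table)
breaks <- data.table(value = c(0, 5, 10), b = 0:2)
x      <- data.table(value = c(1, 5, 7))
# each x row takes b from the nearest break whose value is >= its own
breaks[x, on = .(value), roll = -Inf, x.b]
# [1] 1 1 2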
I came up with another data.table answer:
library(data.table) # load package
# set to data.table
setDT(df)
setDT(bktpts)
# Make a join
df[bktpts[, list(.(value)), by = groupid], bks := V1, on = "groupid"]
# define the bins:
df[, bin := cut(value, bks[[1]], include.lowest = TRUE, labels = FALSE), by = groupid]
# remove the unneeded bks column
df[, bks := NULL]
Explaining the code:
bktpts[, list(.(value)), by = groupid] is a new table that holds, as a list, all the values of value for each groupid. If you run it alone, you'll see where we're going.
bks := V1 assigns to the variable bks in df whatever exists in V1, which is the name of the list column in the previous table. Of course, on = "groupid" is the variable on which we make the join.
The code defining the bins needs little explanation, except for the bks[[1]] bit. It needs [[ in order to access the list values and provide a vector, as required by the cut function.
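To see the intermediate structure for yourself (jt is just an illustrative name):
jt <- bktpts[, list(.(value)), by = groupid]  # one row per groupid, breaks in a list column
jt[1, V1[[1]]]                                # the break points of the first group
# [1]   0.00000  33.33333  66.66667 100.00000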
EDIT TO ADD:
All the data.table commands can be chained into a single (rather unintelligible) call:
df[bktpts[, list(.(value)), by = groupid],
   bks := V1,
   on = "groupid"
   ][, bin := cut(value,
                  bks[[1]],
                  include.lowest = TRUE,
                  labels = FALSE),
     by = groupid
   ][, bks := NULL]

How to count matches between a vector and a data frame of sequence coordinates?

Given a data table with start and end coordinates for sequences of integers:
library(data.table)
set.seed(1)
df1 <- data.table(
  START = c(seq(1, 10000000, 10), seq(1, 10000000, 10), seq(1, 10000000, 10)),
  END = c(seq(10, 10000000, 10), seq(10, 10000000, 10), seq(10, 10000000, 10))
)
And a vector of integers:
vec1 <- sample(1:100000, 10000)
How can I count the number of integers in vec1 that are within the start and end coordinates of each sequence in df1? I am currently using a for loop:
COUNT <- rep(NA, nrow(df1))
for (i in 1:nrow(df1)) {
  vec2 <- seq(from = df1$START[i], to = df1$END[i])
  COUNT[i] <- table(vec2 %in% vec1)[2]
  print(i)
}
df1$COUNT <- COUNT
However, the data table and vector I am applying this to are very large. Is anyone able to suggest a way to improve performance?
Any help will be greatly appreciated!
One option is to use between():
library(data.table)
df1[, count := sum(between(vec1, START, END)), by = seq_len(nrow(df1))]
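between() is vectorized over its first argument, so for each row of df1 the whole vector is tested against that row's interval and the TRUEs are summed; a toy illustration (values made up):
between(c(3, 12, 7), 1, 10)
# [1]  TRUE FALSE  TRUE
sum(between(c(3, 12, 7), 1, 10))  # how many fall inside [1, 10]
# [1] 2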
We can do this with a non-equi join:
df1[data.table(val = vec1), count := .N,
    on = .(START < val, END >= val), by = .EACHI]
head(df1)
If we want the output the other way around, using @minem's example data:
data.table(START = vec1, END = vec1)[df1, .N,
                                     on = .(START >= START, END < END), by = .EACHI]
#    START END N
# 1:     1   4 2
# 2:     8   9 1
# 3:    11  30 0
### example data:
# df1 <- data.table(START = c(1, 8, 11), END = c(4, 9, 30))
# vec1 <- c(3, 2, 8)
#
df1[, ind := .I]                          # add a unique index to data.table
dt2 <- as.data.table(vec1, key = 'vec1')  # convert to data.table
dt2[, vec2 := vec1]                       # duplicate the column
setkey(df1)                               # set keys // order data by all columns
# fast overlap join:
ans1 = foverlaps(dt2, df1, by.x = c('vec1', 'vec2'), by.y = c('START', 'END'),
                 type = "within", nomatch = 0L)
counts <- ans1[, .N, keyby = ind]         # count by ind
# merge back to the initial data
df1[, COUNT := counts[df1, on = .(ind), x.N]]
df1
setorder(df1, ind)                        # reorder by ind to restore the initial order
df1[, ind := NULL]                        # delete the ind column
df1[is.na(COUNT), COUNT := 0L]            # NAs are zero counts
df1
#    START END COUNT
# 1:     1   4     2
# 2:     8   9     1
# 3:    11  30     0

split join data.table R

Objective
Join DT1 (as i in data.table syntax) to DT2 on key column(s), within each group of DT2 specified by the Date column.
I cannot simply run DT2[DT1, on = 'key'], as that would be incorrect: the key column is repeated across the Date column, but unique within a single date.
Reproducible example with a working solution
DT3 is my expected output. Is there any way to achieve this without the split manoeuvre, which does not feel very data.table-y?
library(data.table)
set.seed(1)
DT1 <- data.table(
  Segment = sample(paste0('S', 1:10), 100, TRUE),
  Activity = sample(paste0('A', 1:5), 100, TRUE),
  Value = runif(100)
)
dates <- seq(as.Date('2018-01-01'), as.Date('2018-11-30'), by = '1 day')
DT2 <- data.table(
  Date = rep(dates, each = 5),
  Segment = sample(paste0('S', 1:10), 3340, TRUE),
  Total = runif(3340, 1, 2)
)
rm(dates)
# To ensure that each Date Segment combination is unique
DT2 <- unique(DT2, by = c('Date', 'Segment'))
iDT2 <- split(DT2, by = 'Date')
iDT2 <- lapply(
  iDT2,
  function(x) {
    x[DT1, on = 'Segment', nomatch = 0]
  }
)
DT3 <- rbindlist(iDT2, use.names = TRUE)
You can achieve the same result with a cartesian merge:
DT4 <- merge(DT2, DT1, by = 'Segment', allow.cartesian = TRUE)
Here is the proof:
> all(DT3[order(Segment, Date, Total, Activity, Value),
          c('Segment', 'Date', 'Total', 'Activity', 'Value')] ==
      DT4[order(Segment, Date, Total, Activity, Value),
          c('Segment', 'Date', 'Total', 'Activity', 'Value')])
[1] TRUE
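The same result can also be written in native data.table join syntax; a sketch (DT5 is just an illustrative name, and the column order will differ from merge()):
# every row of DT1 picks up all Date rows of DT2 with the same Segment;
# allow.cartesian permits the many-to-many expansion, nomatch = 0 keeps it inner
DT5 <- DT2[DT1, on = 'Segment', allow.cartesian = TRUE, nomatch = 0]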

Converting a raw data frame into workable time series

I have a spreadsheet documenting prices of 40 similar products at various dates. It looks like this.
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
data.frame(date_1, price_1, date_2, price_2, date_3, price_3,
           date_4, price_4, date_5, price_5, date_6, price_6)
The problem is, the columns representing dates and prices alternate (convenient for record keeping). How can I transform the above data into a new data frame consisting solely of the prices of these 40 products as rows, with dates as column names? This will generate a lot of NAs because the dates in each column differ, but that's OK.
When working with time series data it is often helpful to have it in long form (one row per observation), even if your target output is wide (one row per time series). Here are three possible approaches to get it into long form, then widen:
1. base reshape()
To get long form, base reshape is definitely a powerful option. The following solution improves on the accepted solution because it works for any numbers of products and observations and eliminates an unnecessary step:
df <- data.frame(date_1, price_1, date_2, price_2, date_3, price_3,
                 date_4, price_4, date_5, price_5, date_6, price_6)
# no need to create an id variable
long_form <- reshape(df,                 # idvar = "id" by default
                     varying = list(grep('date_', names(df), value = TRUE),
                                    grep('price_', names(df), value = TRUE)),
                     v.names = c("date", "price"),
                     direction = "long",
                     sep = "_")
And reshape can also widen it. (We'll use spread in another approach below.)
wide_form <- reshape(long_form, drop='time', timevar='date', direction='wide')
2. data.table melt() and dcast() (likely faster on real dataset)
Make sure you have data.table v1.9.6 or later, which allows you to melt multiple columns.
library(data.table)
setDT(df)
melt.data.table(df[, prod_id := .I],  # product id = original row number
                measure.vars = list(grep('date_', names(df), value = TRUE),
                                    grep('price_', names(df), value = TRUE)),
                variable.name = 'sequence',
                value.name = c('date', 'price'),
                id.vars = 'prod_id') -> long_form
In this case you don't use the sequence, so to get wide form is just:
dcast.data.table(long_form[, !'sequence', with = FALSE],
                 value.var = 'price',  # optional (the function guesses correctly)
                 prod_id ~ date) -> wide_form
3. tidyr & dplyr split-apply-combine (easy to understand)
It doesn't require the mental gymnastics that reshape does (at least for me). It is a column-wise variant on the "split-apply-combine" paradigm.
library(dplyr); library(tidyr)

# Create long-form time series data:
# split the table into sequenced dates and prices, then combine on product and sequence
full_join(
  df %>%
    select(starts_with('date_')) %>%       # left side = date component
    mutate(prod_id = 1:nrow(df)) %>%       # product id = original row number
    gather(sequence, date, -prod_id) %>%   # long form = 1 row per product per sequence
    mutate(sequence =                      # cols: prod_id, sequence, date
             sub('^date_(\\d+)$', '\\1', sequence)),
  df %>%
    select(starts_with('price_')) %>%      # right side = price component
    mutate(prod_id = 1:nrow(df)) %>%
    gather(sequence, price, -prod_id) %>%  # same idea
    mutate(sequence =                      # cols: prod_id, sequence, price
             sub('^price_(\\d+)$', '\\1', sequence))
) -> long_form
In this case you don't need the sequence, so to get to wide form it's simply:
long_form %>% select(-sequence) %>% spread(date, price) -> wide_form
as noted by others above.
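If you are on tidyr >= 1.0.0, gather()/spread() are superseded by pivot_longer()/pivot_wider(), which handle the paired date_*/price_* columns in one call via the ".value" sentinel. A minimal sketch under that assumption:
library(dplyr)
library(tidyr)

df %>%
  mutate(prod_id = row_number()) %>%
  # ".value" routes each stem (date/price) to its own column;
  # the trailing digit lands in `sequence`
  pivot_longer(-prod_id, names_to = c(".value", "sequence"), names_sep = "_") %>%
  select(-sequence) %>%
  pivot_wider(names_from = date, values_from = price) -> wide_form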
Here is one way I came up with using dplyr/tidyr packages:
library(tidyr)
library(dplyr)
# df as constructed in the question
df <- data.frame(date_1, price_1, date_2, price_2, date_3, price_3,
                 date_4, price_4, date_5, price_5, date_6, price_6)
dates <- df[, grep('date', names(df))]
dates <- dates %>% gather(date_type, date) %>% select(-date_type)
prices <- df[, grep('price', names(df))]
prices <- prices %>% gather(price_type, price) %>% select(-price_type)
df <- cbind(dates, prices)
Then, to spread dates to columns and prices to rows, you can do something like this:
df <- arrange(df, price)
df <- spread(df, date, price)
Using base R and tidyr you could do:
library(tidyr)

# add an id to identify the products
df$id <- 1:40

# transform the data to long format
long_data <- reshape(df, idvar = "id",
                     varying = list(paste0("date_", 1:6), paste0("price_", 1:6)),
                     v.names = c("date", "price"),
                     direction = "long", sep = "_")
long_data <- long_data[, !grepl("time", colnames(long_data))]

# put it back into wide format
wide_data <- spread(long_data, date, price)
