Fit model by group using Data.Table package - r

How can I fit multiple models by group using data.table syntax? I want my output to be a data.frame with columns for each "by group" and one column for each model fit. Currently I am able to do this using the dplyr package, but can't do this in data.table.
# example data frame
df <- data.table(
id = sample(c("id01", "id02", "id03"), N, TRUE),
v1 = sample(5, N, TRUE),
v2 = sample(round(runif(100, max = 100), 4), N, TRUE)
)
# equivalent code in dplyr
group_by(df, id) %>%
do( model1= lm(v1 ~v2, .),
model2= lm(v2 ~v1, .)
)
# attempt in data.table
df[, .(model1 = lm(v1~v2, .SD), model2 = lm(v2~v1, .SD) ), by = id ]
# Brodie G's solution
df[, .(model1 = list(lm(v1~v2, .SD)), model2 = list(lm(v2~v1, .SD))), by = id ]

Try:
df[, .(model1 = list(lm(v1~v2, .SD)), model2 = list(lm(v2~v1, .SD))), by = id ]
or slightly more idiomatically:
formulas <- list(v1~v2, v2~v1)
df[, lapply(formulas, function(x) list(lm(x, data=.SD))), by=id]

Related

R fast cosine distance between consecutive rows of a data.table

How can I efficiently calculate distances between (almost) consecutive rows of a large-ish (~4m rows) of a data.table? I've outlined my current approach, but it is very slow. My actual data has up to a few hundred columns. I need to calculate lags and leads for future use, so I create these and use them to calculate distances.
library(data.table)
library(proxy)
set_shift_col <- function(df, shift_dir, shift_num, data_cols, byvars = NULL){
df[, (paste0(data_cols, "_", shift_dir, shift_num)) := shift(.SD, shift_num, fill = NA, type = shift_dir), byvars, .SDcols = data_cols]
}
set_shift_dist <- function(dt, shift_dir, shift_num, data_cols){
stopifnot(shift_dir %in% c("lag", "lead"))
shift_str <- paste0(shift_dir, shift_num)
dt[, (paste0("dist", "_", shift_str)) := as.numeric(
proxy::dist(
rbindlist(list(
.SD[,data_cols, with=FALSE],
.SD[, paste0(data_cols, "_" , shift_str), with=FALSE]
), use.names = FALSE),
method = "cosine")
), 1:nrow(dt)]
}
n <- 10000
test_data <- data.table(a = rnorm(n), b = rnorm(n), c = rnorm(n), d = rnorm(n))
cols <- c("a", "b", "c", "d")
set_shift_col(test_data, "lag", 1, cols)
set_shift_col(test_data, "lag", 2, cols)
set_shift_col(test_data, "lead", 1, cols)
set_shift_col(test_data, "lead", 2, cols)
set_shift_dist(test_data, "lag", 1, cols)
I'm sure this is a very inefficient approach, any suggestions would be appreciated!
You aren't using the vectorisation efficiencies in the proxy::dist function - rather than call it once for each row you can get all the distances you need from a single call.
Try this replacement function and compare the speed:
set_shift_dist2 <- function(dt, shift_dir, shift_num, data_cols){
stopifnot(shift_dir %in% c("lag", "lead"))
shift_str <- paste0(shift_dir, shift_num)
dt[, (paste0("dist2", "_", shift_str)) := proxy::dist(
.SD[,data_cols, with=FALSE],
.SD[, paste0(data_cols, "_" , shift_str), with=FALSE],
method = "cosine",
pairwise = TRUE
)]
}
You could also do it in one go without storing copies of the data in the table
test_data[, dist_lag1 := proxy::dist(
.SD,
as.data.table(shift(.SD, 1)),
pairwise = TRUE,
method = 'cosine'
), .SDcols = c('a', 'b', 'c', 'd')]

Custom dcasting via data.table in R

Here is my data
dt = data.table(x=sample(8,20,TRUE),
y=sample(2,20,TRUE),
w = sample(letters[5:20], 20, TRUE),
u = sample(letters[2:25], 20, TRUE),
z=sample(letters[1:4], 20,TRUE),
d1 = runif(20), d2=runif(20))
Here is my dcasting code.
DC_1 = dcast.data.table(dt,x+w ~ z, value.var = "d1")
This works fine. However my data could also additionally include column 'a' and column 's' as shown below. Both of them could be included, either one, or none of them.
dt = data.table(x=sample(8,20,TRUE),
y=sample(2,20,TRUE),
w = sample(letters[5:20], 20, TRUE),
u = sample(letters[2:25], 20, TRUE),
z=sample(letters[1:4], 20,TRUE),
a = sample(letters[1:25], 20, T),
s = sample(letters[2:17], 20, T),
d1 = runif(20), d2=runif(20))
The additional columns however would always be characters . Also my data always has to be cast on column 'z' and value variable would always be 'd1'
How do I dcast via data.table such that it takes all the character columns (except z) available in the data table and casts them on z?
We could subset the dataset column and use ... on the lhs of ~ to specify for all columns and on the rhs of formula it would be 'z'
dcast(dt[, setdiff(names(dt), 'd2'), with = FALSE], ... ~ z, value.var = 'd1')
Or get the column names of the character columns programmatically
nm1 <- dt[, names(which(unlist(lapply(.SD, is.character))))]
nm2 <- setdiff(nm1, 'z')
dcast(dt,paste0(paste(nm2, collapse="+"), "~ z"), value.var = 'd1')
Or another option is select from dplyr
library(dplyr) #1.0.0
dcast(dt[, select(.SD, where(is.character), d1)], ... ~ z, value.var = 'd1')
A similar option in tidyverse would be
library(tidyr)
dt %>%
select(where(is.character), d1) %>%
pivot_wider(names_from = z, values_from = d1)

It is possible to compare the multiple regression models using AIC scores

It is possible to compare the multiple regression models using AIC scores, with the models ordered from best-supported to worst-supported?
Here is my code
library(data.table)
Regressions<-
data.table(February)[,
.(Lm = lapply(.SD, function(x) summary(lm(February$PPNA ~ February$Acum1 + x)))),
.SDcols = 80:157]
We can extract the AIC values and order based on the 'AIC' values
library(data.table)
dt <- as.data.table(February)
dt1 <- dt[, .(Lm = lapply(.SD, function(x) lm(February$PPNA ~ February$Acum1 + x))),
.SDcols = 80:157]
dt2 <- dt1[, .(Lm = Lm[order(unlist(lapply(Lm, AIC)))])]
Or using a reproducible example
dt1 <- as.data.table(iris)[, .(Lm = lapply(.SD, function(x)
lm(iris$Petal.Length ~ iris$Species + x)))]
dt2 <- dt1[, .(Lm = Lm[order(unlist(lapply(Lm, AIC)))])]

Creating single column dataframe with the means of another dataset

I have a dataset which looks as follows:
set.seed(1)
DF <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("A",30),rep("B",50), rep("C",20)),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
norm = round(runif(100)/10,2),
Income = sample(100,100),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = round(rnorm(10,0.75,0.3),2),
Educ = round(rnorm(10,0.75,0.3),2))
DF [, uniqueID := .I]
DF <- as.data.table(DF) # Make sure it is a data.table
DF [, uniqueID := .I] # Add a unique ID
cols = sapply(DF, is.numeric) # Check numerical columns
DFm <- melt(DF[, cols, with = FALSE][, !"uniqueID"], id = "panelID") # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
DFm[, value := c(NA, diff(value)), by = .(panelID, variable)] # https://stackoverflow.com/questions/57406654/speeding-up-a-function/57407959#57407959
DF <- dcast(DFm, panelID + rowidv(DFm, cols = c("panelID", "variable")) ~ variable, value.var = "value") # ""
DF <- DF[DF[, !Reduce(`&`, lapply(.SD , is.na)), .SDcols = 3:ncol(DF)]] # Removes T1 for which there is no difference
Now what I would like to do is fairly simple. I want the mean of each column stored in a single column.
I tried:
mean_of_differences <- DF [, mean(sapply(.SD, is.numeric), na.rm=TRUE)]
mean_of_differences <- DF[,.SD[mean(sapply(.SD, is.numeric), na.rm=TRUE)]]
But somehow I cannot seems to get it right. I just end up with NA's or errors.
What am I overlooking?

Aggregate if string contains specific text in R

I've seen a lot of posts on this topic so apologies if this is a duplicate but I couldn't figure out my problem.
I have
df <- data.frame(name = c('bike+ride','shoe+store','ride','mountian%20bike','ride+along'),
count = c(2,5,8,7,6))
and want to sum each count if it name contains a string group
group <- data.frame(group = c('ride','bike'))
So the end result looks as follows:
Group Count
bike 9
ride 16
Can anyone help?
A base R idea,
sapply(sapply(as.character(group$group), function(i) grep(i, df$name)), function(i) sum(df$count[i]))
#or make it a function
aggr1 <- function(var1, grp, cnt){
m1 <- sapply(as.character(grp), function(i) grep(i, var1))
final_d <- sapply(m1, function(i) sum(cnt[i]))
return(data.frame(Group = names(final_d),
Count = as.integer(final_d), stringsAsFactors = FALSE)
)
}
aggr1(df$name, group$group, df$count)
# Group Count
#1 ride 16
#2 bike 9
One way is
do.call(rbind, sapply(group$group, FUN = function(x, df) {
out <- df[grepl(pattern = x, x = df$name), ]
data.frame(group = x, count = sum(out$count))
}, df = df, simplify = FALSE))
group count
1 ride 16
2 bike 9
In two steps:
# make a data.frame which locates where each group level is located
grp <- as.data.frame(sapply(group$group, FUN = function(x) grepl(pattern = x, x = df$name)))
names(grp) <- group$group
# based on above location (TRUE/FALSE), sum accordingly
data.frame(count = apply(grp, MARGIN = 2, FUN = function(x, df) {
sum(df[x, "count"])
}, df = df))
count
ride 16
bike 9
A way using tidyverse packages purrr, dplyr and tidyr:
library(tidyverse) # for dplyr, purr and tidyr
groups <- c('ride','bike')
map_df(groups, ~setNames(summarize_(df, interp(~sum(df$count[grepl(var, name)], na.rm = TRUE), var = .x)), .x)) %>%
gather(group, count, na.rm = TRUE)

Resources