Window function / For loop in R - r

I have a database of matches with players and players'scores for each game. I am trying to create a rating variable for my prediction model. I am using formula from a blogpost.
Here is the dummy dataset:
df = data.frame(
matchid = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4),
playerid = c(2,3,4,5,6,7,8,9,10,11,5,2,3,4,6,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,17,19,21,18,20,22,26,24,25,23),
point = c(52,38,34,33,16,19,16,8,10,2,38,37,31,34,21,18,18,13,9,-2,45,34,37,39,12,9,7,-3,-1,-8,47,38,31,17,26,32,28,17,16,9))
Here is my attempt using for loop. The for loop run extremely slow for 30000 games database. Please give me some pointers on how to improve this process / loop. I really have no idea.
## Initialize initial rating for each player
players_ratings = data.frame(playerid = unique(df$playerid),rating = 1000, stringsAsFactors = FALSE)
## Initialize unique matches
unique_matches = df$matchid %>% unique
## Matches with rating
relative_rating_matches = list(length(df))
### GENERATE RATING
for(index in 1:length(unique_matches)){
match = df %>% filter(matchid == unique_matches[[index]])
position = index
## UPDATE RATING
match = match %>% left_join(players_ratings,by = 'playerid')
relative_rating_matches[[position]] = match
print(match)
## BUILD ACTUAL RESULTS MATRIX
S = matrix(nrow = 10, ncol = 10)
rownames(S) = match$playerid
colnames(S) = match$playerid
for(i in 1:nrow(S)) {
for(j in 1:ncol(S)) {
player_row_point = as.numeric(match %>% filter(playerid == rownames(S)[i]) %>% select(point))
player_col_point = as.numeric(match %>% filter(playerid == colnames(S)[j]) %>% select(point))
S[i,j] = ifelse(player_col_point == player_row_point,0.5,
ifelse(player_col_point > player_row_point,1,0))
}
}
diag(S)= 0
print(S)
## BUILD EXPECTED WIN/LOSS MATRIX
E = matrix(nrow = 10, ncol = 10)
rownames(E) = match$playerid
colnames(E) = match$playerid
for(i in 1:nrow(E)) {
for(j in 1:ncol(E)) {
player_row_rating = as.numeric(match %>% filter(playerid == rownames(E)[i]) %>% select(rating))
player_col_rating = as.numeric(match %>% filter(playerid == colnames(E)[j]) %>% select(rating))
r = 1 + 10^((player_row_rating - player_col_rating)/400)
expected_result = 1/r
E[i,j] = expected_result
}
}
diag(E) = 0
print(E)
## GENERATE INCREMENTAL RATING
R = 20 * (S-E)
R = as.data.frame(colSums(R)) %>% rownames_to_column()
print(R)
## UPDATE EXISTING RATING DATABASE
for(i in 1:nrow(R)){
player_id = R[i,1]
incre_rating = ifelse(is.na(R[i,2]),0,R[i,2])
cur_rating = players_ratings[players_ratings$playerid == player_id,2]
players_ratings[players_ratings$playerid == player_id,2] = cur_rating + incre_rating
}
}

Related

Error in mclapply(1:20, function(ii) { : Windows does not supprot 'mc.cores' > 1

I try to reproduce and run the code from GitHub: https://github.com/sangwon-hyun/omd. I first try to run the code from the file: 01-climatology.Rmd (https://github.com/sangwon-hyun/omd/blob/master/main/01-climatology.Rmd)
When I run the code after the 296th line "Toy distances" part.
library(parallel)
start.time = Sys.time()
dists = mclapply(1:20, function(ii){
printprogress(ii, 20, start.time = start.time)
dat1 = darwindat %>% add_blob(c(-150, -25))
dat2 = darwindat %>% add_blob(c(-150 + 2*ii, -25), rotate = (ii / 20) * 90)
val1 = dat1 %>% pull(val)
val2 = dat2 %>% pull(val)
costm = form_cost_matrix(dat1)
res = omd(M1_long = dat1,
M2_long = dat2,
p = 1,
## costm = costm,
geodesic = TRUE)
diffval = val1 - val2
twodists = c(omd = res$dist, rmse = sqrt(sum((diffval)^2)))
return(twodists)
}, mc.cores = 3)
distmat = do.call(rbind, dists) %>% as_tibble() %>%
tibble::add_column(shift = 2*(1:20)) %>%
mutate(omd = omd-min(omd), rmse = rmse-min(rmse)) %>%
mutate(omd = omd/max(omd), rmse = rmse/max(rmse)) %>%
tidyr::pivot_longer(cols = c('omd', 'rmse'))
saveRDS(distmat, file = file.path(datadir, "distmat-toy.RDS"))
It reports the error:
Error in mclapply(1:20, function(ii) { : Windows does not supprot 'mc.cores' > 1
I use R 4.2.0. How to fix this error?

Incorrect number of probabilities dimsensions when optimizing a multi-season elo model

My data frame looks like this:
and the code to compute the initial/optimized elo
# Elo Rating System
library(eurolig)
library(tidyverse)
library(lubridate)
plldf2 <- read_csv('pll_elor.csv',show_col_types = FALSE)
plldf3 <- plldf2[(plldf2$season == 2019),]
# Helpers -----------------------------------------------------------------
# Expected win probability before a game
getExpectedProb <- function(r_team, r_opp, home_adv, s) {
1 / (1 + 10 ^ ((r_opp - r_team - home_adv) / s))
}
# Get Elo rating for next season
getCarryOver <- function(rating, c) {
c * rating + 1505 * (1 - c)
}
# Get margin of victory multiplier
getMovMultiplier <- function(points_diff, elo_diff) {
((points_diff + 3) ^ 0.8) / (7.5 + 0.006 * elo_diff)
}
getEloSummary <- function(df) {
df %>%
pivot_longer(
cols = ends_with("_new"),
names_to = "type",
values_to = "elo"
) %>%
select(
season,
date,
team,
opp,
type,
elo
) %>%
mutate(
team = ifelse(type == "elo_home_new", team, opp),
order = rank(date),
team_id = paste0(team, "-", season)
) %>%
select(season, date, team, elo, order, team_id)
}
# Algorithm ---------------------------------------------------------------
# For a single season
getSeasonElo <- function(df, k, home_adv, s, initial_elo) {
team_ratings <- initial_elo
# Data frame to store the subsequent values obtained by the algorithm
ratings_df <- df %>%
mutate(
home_adv = NA,
win_points_home = NA,
win_points_away = NA,
expected_prob_home = NA,
expected_prob_away = NA,
mov_home = NA,
mov_away = NA,
elo_home_prev = NA,
elo_away_prev = NA,
elo_home_new = NA,
elo_away_new = NA,
prob_pred = NA
)
for (i in 1:nrow(df)) {
team_home <- df$team[i]
team_away <- df$opp[i]
elo_home <- team_ratings[[team_home]]
elo_away <- team_ratings[[team_away]]
# Home advantage set to 0 for Final 4 games
h <- ifelse(df$phase[i] == "ff", 0, home_adv)
# Assign 1 for wins and 0 for losses
win_points_home <- ifelse(
df$score[i] > df$opp_score[i],
1,
0
)
win_points_away <- ifelse(win_points_home == 0, 1, 0)
# Find pre-game win probabilities
expected_prob_home <- getExpectedProb(
r_team = elo_home,
r_opp = elo_away,
home_adv = h,
s = s)
expected_prob_away <- 1 - expected_prob_home
# Margin of victory multiplier
points_diff_abs <- abs(df$score[i] - df$opp_score[i])
elo_diff_home <- elo_home + h - elo_away
elo_diff_away <- elo_away - elo_home - h
mov_home <- getMovMultiplier(points_diff_abs, elo_diff_home)
mov_away <- getMovMultiplier(points_diff_abs, elo_diff_away)
# Update Elo ratings
elo_home_new <- elo_home + k * (win_points_home - expected_prob_home) * mov_home
elo_away_new <- elo_away + k * (win_points_away - expected_prob_away) * mov_away
team_ratings[[team_home]] <- elo_home_new
team_ratings[[team_away]] <- elo_away_new
prob_pred <- sample(
x = c(team_home, team_away),
size = length(list(expected_prob_home,expected_prob_away)),
prob = c(expected_prob_home, expected_prob_away) #ISSUE HERE
)
ratings_df$home_adv[i] <- h
ratings_df$win_points_home[i] <- win_points_home
ratings_df$win_points_away[i] <- win_points_away
ratings_df$expected_prob_home[i] <- expected_prob_home
ratings_df$expected_prob_away[i] <- expected_prob_away
ratings_df$mov_home[i] <- mov_home
ratings_df$mov_away[i] <- mov_away
ratings_df$elo_home_prev[i] <- elo_home
ratings_df$elo_away_prev[i] <- elo_away
ratings_df$elo_home_new[i] <- elo_home_new
ratings_df$elo_away_new[i] <- elo_away_new
ratings_df$prob_pred[i] <- prob_pred
}
ratings_df <- ratings_df %>%
mutate(
winner = ifelse(score > opp_score, team, opp),
winner_pred = ifelse(elo_home_prev + home_adv >= elo_away_prev,
team, opp),
correct_pred = ifelse(winner == winner_pred, TRUE, FALSE)
)
list(ratings_df = ratings_df, team_elo = team_ratings)
}
# Algorithm for several seasons
getElo <- function(df, k, home_adv, s, carry) {
df <- arrange(df, season)
season_results <- split(df, df$season)
# Start with first season
teams <- sort(unique(season_results[[1]]$team))
# Since it is the first season overall, all teams start with 1300 Elo points
initial_ratings <- as.list(rep(1300, length(teams)))
names(initial_ratings) <- teams
first_season_ratings <- getSeasonElo(
season_results[[1]],
k = k,
home_adv = home_adv,
s = s,
initial_elo = initial_ratings
)
# TODO: Use the last recorded Elo rating, not last season
elo_final <- tibble(
season = unique(season_results[[1]]$season),
team = names(first_season_ratings$team_elo),
elo = unlist(first_season_ratings$team_elo)
)
season_ratings <- vector("list", length(season_results))
season_ratings[[1]] <- first_season_ratings
for (i in 2:length(season_ratings)) {
teams <- sort(unique(season_results[[i]]$team))
teams_new <- teams[!teams %in% elo_final$team]
teams_new_elo <- as.list(rep(1300, length(teams_new)))
names(teams_new_elo) <- teams_new
teams_old <- teams[teams %in% elo_final$team]
teams_old_elo <- vector("list", length(teams_old))
names(teams_old_elo) <- teams_old
for (j in seq_along(teams_old)) {
elo_team <- elo_final %>%
filter(team == teams_old[j])
teams_old_elo[[j]] <- elo_team$elo[which.max(elo_team$season)]
}
teams_old_elo <- lapply(teams_old_elo, getCarryOver, c = carry)
initial_elo <- c(teams_new_elo, teams_old_elo)
season_ratings[[i]] <- getSeasonElo(
season_results[[i]],
k = k,
home_adv = home_adv,
s = s,
initial_elo = initial_elo
)
elo_final_season <- tibble(
season = unique(season_results[[i]]$season),
team = names(season_ratings[[i]]$team_elo),
elo = unlist(season_ratings[[i]]$team_elo)
)
elo_final <- bind_rows(elo_final, elo_final_season)
}
output_df <- map_df(season_ratings, function(x) x$ratings_df)
output_df
}
# Tunning -----------------------------------------------------------------
# Grid optimization
k <- seq(10, 50, by = 5)
h <- seq(0, 150, by = 25)
c <- seq(0.5, 1, by = 0.1)
grid_df <- expand_grid(k, h, c)
checkAccuracy <- function(df, k, h, c) {
df <- getElo(df, k, h, s = 400, c)
sum(df$correct_pred) / nrow(df)
}
n <- nrow(grid_df)
accuracy <- numeric(n)
for (i in 1:n) {
acc <- checkAccuracy(
df = results,
k = grid_df$k[i],
h = grid_df$h[i],
c = grid_df$c[i]
)
accuracy[i] <- acc
}
acc_df <- cbind(grid_df, accuracy) %>%
as_tibble() %>%
arrange(desc(accuracy))
# Ratings -----------------------------------------------------------------
elo_df <- getElo(plldf2, k = 25, home_adv = 100, s = 400, carry = 0.8)
elo_summary <- getEloSummary(elo_df) %>%
left_join(teaminfo, by = c("team" = "team", "season"))
tester <- getSeasonElo(
plldf3,
k = 25,
home_adv = 100,
s = 400,
initial_elo = initial_ratings)
getEloSummary(tester)
elo_summary %>%
ggplot(aes(order, elo, group = team_id)) +
geom_line()
but when I attempt to run the 'getElo' function to optimize the model it says there is a incorrect number of probabilities and I get the error below. However when I subset my data and run it for a single season using the 'getSeasonElo' function it computes it without issue. I assumed the problem was coming from the 'size' parameter being set to 1 by default, which I've fixed to account for the multi-season calculation, but the incorrect number is still happening? Not sure what I missed.
prob_pred <- sample(
x = c(team_home, team_away),
size = length(list(expected_prob_home,expected_prob_away)),
prob = c(expected_prob_home, expected_prob_away) #ISSUE HERE
)

randomize observations by groups (blocks) without replacement

This is a follow up question. The answers in the previous question are doing the random sampling with replacement. How can I change the code so that I assign each observation to on of J "urn" without putting the observation back in the 'lottery'?
This is the code I have right now:
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, lenght = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
lenght, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, lenght = 16),
group = sample(1:g, I, T))
J <- 3
p <- c(0.25, 0.5, 0.25)
randomize <- function(data, urns=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, urns = J, block_id = "group", p = p, seed = 9782)
If I change replace = T to replace = F I get the following error:
Error: cannot take a sample larger than the population when 'replace = FALSE'
Clarification of my objective:
Suppose that I have 10 classrooms (or villages, or something like that). To keep it simple, suppose each classroom has 20 students (in reality they will have N_j). Classroom per classroom, I want to assign each student to one of J groups, for example J=3. P says the fraction that will be assigned to each group. For example 25% to group 1 40% to group 2 and 35% to group 3.
This solution is based on #Frank's comment. I created one function that does the randomization for block j and another that calls that function for every block.
randomize_block <- function(data, block=NULL, block_name=NULL, urns, p, seed=9782) {
set.seed(seed)
if(!is.null(block)) {
condition <- paste0(block_name,"==",block)
df <- data %>% filter_(condition)
} else df <- data
if(is.null(p)) p <- rep(1/urns, urns)
N <- nrow(df)
Np <- round(N*p,0)
if(sum(Np)!=N) Np[1] <- N - sum(Np[2:length(Np)])
Urns = rep(seq_along(p), Np)
Urns = sample(Urns)
df$urn <- Urns
return(df)
}
randomize <- function(data, block_name=NULL, urns, p, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(!is.null(block_name)){
blocks <- unique(data[,block_name])
df <- lapply(blocks, randomize_block,
data = data,
block_name=block_name,
urns = urns,
p = p,
seed=seed)
return(data.table::rbindlist(df))
}else {
df <- randomize_block(data = data,
urns = urns, p = p,
seed=seed)
}
}
test <- randomize(data = df, block_name = "group",
urns = 3, p = c(0.25, 0.5, 0.25),
seed=4222016)
I'm trying to figure out if it is possible to use dplyr to do this, alternative solutions implementing that are more than welcome!
My answer to your other question is without replacement, as can be seen below:
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
df2 %>%
group_by(id) %>%
filter(n()>1) %>%
str()

Replacement has length zero: can't find the issue with my loop

I'm trying to modify some code from a chapter of Quantitative Trading with R to work with returns instead of raw prices. Everythings to be going okay with the exception of the "PROFIT AND LOSS" section of my code. It keeps returning "Error in qty_x[i] = (vec[i] + prev_x_qty) : replacement has length zero" When looking at my variables I can't seem to find any problems. I've included the code for reproduction.
# LOAD LIBRARIES
library(quantmod)
library(xts)
# FUNCTIONS
# ROLLING BETA
pcbeta = function(dF){
r = prcomp( ~ dF$x[-1] + dF$y[-1])
return(r$rotation[2, 1] / r$rotation[1,1])
}
rolling_beta = function(z, width){
rollapply(z, width = width, FUN = pcbeta,
by.column = FALSE, align = 'right')
}
# GET TICKER DATA
SPY = getSymbols('SPY', adjust=T, auto.assign=FALSE)
AAPL = getSymbols('AAPL', adjust=T, auto.assign=FALSE)
# IN-SAMPLE DATE RANGE
in_start_date = '2011-01-01'
in_end_date = '2011-12-31'
in_range = paste(in_start_date, '::', in_end_date, sep='')
# RETRIEVE IN-SAMPLE DATA
x_in = SPY[in_range, 6]
y_in = AAPL[in_range, 6]
dF_in = cbind(x_in, y_in)
names(dF_in) = c('x','y')
# OUT-OF-SAMPLE DATE RANGE
out_start_date= '2012-01-01'
out_end_date = '2012-12-31'
out_range = paste(out_start_date, '::', out_end_date, sep='')
# RETRIEVE OUT-OF-SAMPLE DATA
x_out = SPY[out_range, 6]
y_out = AAPL[out_range, 6]
dF_out = cbind(x_out, y_out)
names(dF_out) = c('x', 'y')
# CALCULATE RETURNS (IN AND OUT OF SAMPLE)
returns_in = diff(dF_in) / dF_in
returns_out = diff(dF_out) / dF_out
# DEFINE ROLLING WINDOW LENGTH
window_length = 10
# FIND BETAS
betas_in = rolling_beta(returns_in, window_length)
betas_out = rolling_beta(returns_out, window_length)
# FIND SPREADS
spreadR_in = returns_in$y - betas_in * returns_in$x
spreadR_out = returns_out$y - betas_out * returns_out$x
names(spreadR_in) = c('spread')
names(spreadR_out) = c('spread')
# FIND THRESHOLD
threshold = sd(spreadR_in, na.rm=TRUE)
# FORM DATA SETS
data_in = merge(returns_in, betas_in, spreadR_in)
data_out = merge(x_out, y_out, returns_out, betas_out, spreadR_out)
names(data_out) = c('xp', 'yp', 'x', 'y', 'betas_out', 'spread')
data_in = data_in[-1]
data_out = data_out[-1]
# GENERATE BUY AND SELL SIGNALS FOR OUT OF SAMPLE
buys = ifelse(data_out$spread > threshold, 1, 0)
sells = ifelse(data_out$spread < -threshold, -1, 0)
data_out$signal = buys+sells
# PROFIT AND LOSS
prev_x_qty = 0
position = 0
trade_size = 100
signal = as.numeric(data_out$signal)
signal[is.na(signal)] = 0
beta = as.numeric(data_out$betas_out)
ratio = (data_out$yp/data_out$xp)
vec = round(beta*trade_size*ratio)
qty_x = rep(0, length(signal))
qty_y = rep(0, length(signal))
for(i in 1:length(signal)){
if(signal[i] == 1 && position == 0){
#buy the spread
prev_x_qty = vec[i]
qty_x[i] = -prev_x_qty
qty_y[i] = trade_size
position = 1
}
if(signal[i] == -1 && position == 0){
#buy the spread
prev_x_qty = vec[i]
qty_x[i] = prev_x_qty
qty_y[i] = -trade_size
position = -1
}
if(signal[i] == 1 && position == -1){
# we are short the spread and need to buy
qty_x[i] = -(vec[i] + prev_x_qty)
prev_x_qty = vec[i]
qty_y[i] = 2 * trade_size
position = 1
}
if(signal[i] == -1 && position == 1){
# we are short the spread and need to buy
qty_x[i] = (vec[i] + prev_x_qty)
prev_x_qty = vec[i]
qty_y[i] = -2 * trade_size
position = -1
}
}

Parallel computing in R (Windows): changing code from foreach %do% to foreach %dopar%

I have written a code to run several time-series rolling-regressions for multiple securities. Since the number of securities is more than 10,000, and having more than 200 rolling windows for each security, the runtime for a sequential set-up (using foreach %do%) is about 30min.
I would like to implement foreach %dopar% for parallel computing instead, using the "doParrallel" backend. Simply changing %do% with %dopar% in the code doesn't do the trick. I am very new to this parrallel computing method, and would hope to get some help.
Here is the foreach %do% code:
sec = ncol(ret.zoo)
num.factors = 2
rows = nrow(ret.zoo) - 60 + 1
beta.temp = matrix(nc = num.factors + 1, nr = sec*rows)
gvkey.vec = matrix(nc = 1, nr = sec*rows)
d = 1
foreach(i=1:sec) %do% {
df = merge(ret.zoo[,i], data)
names(df) <- c("return", names(data))
gvkey = substr(colnames(ret.zoo)[i],2,9)
reg = function(z) {
z.df = as.data.frame(z)
ret = z.df[,which(names(z.df) == "return")]
ret.no.na = ret[!is.na(ret)]
if(length(ret.no.na) >= 30) {
coef(lm(return ~ VAL + SIZE, data = as.data.frame(z), na.action = na.omit))
}
else {
as.numeric(rep(NA,num.factors + 1)) ## the "+1" is for the intercept value
}
}
beta = rollapply(df, width = 60, FUN = reg, by.column = FALSE, align = "right")
beta.temp[d:(d+rows-1),] = beta
gvkey.vec[d:(d+rows-1),] = gvkey
d = d+rows
}
beta.df = data.frame(secId = gvkey.vec, date = rep(index(beta), sec), beta.temp)
colnames(beta.df) <- c("gvkey", "date", "intercept", "VAL", "SIZE")
In order to enable parallel computing using %dopar%, I have called and registered the backend "doParallel".
Thank you very much!
UPDATE
Here is my first try:
library(doParallel) ## parallel backend for the foreach function
registerDoParallel()
sec = ncol(ret.zoo)
num.factors = 2
rows = nrow(ret.zoo) - 60 + 1
result <- foreach(i=1:sec) %dopar% {
library(zoo)
library(stats)
df = merge(ret.zoo[,i], data)
names(df) <- c("return", names(data))
gvkey = substr(colnames(ret.zoo)[i],2,9)
reg = function(z) {
z.df = as.data.frame(z)
ret = z.df[,which(names(z.df) == "return")]
ret.no.na = ret[!is.na(ret)]
if(length(ret.no.na) >= 30) {
coef(lm(return ~ VAL + SIZE, data = as.data.frame(z), na.action = na.omit))
}
else {
as.numeric(rep(NA,num.factors + 1)) ## the "+1" is for the intercept value
}
}
rollapply(df, width = 60, FUN = reg, by.column = FALSE, align = "right")
}
beta.df = do.call('combine', result)
This works perfectly up until the end of the loop. However, the beta.df = do.call('combine', result) gives the following error: Error in do.call("combine", result) : could not find function "combine".
How can I combine the output of result. Now it is a list rather than a dataframe.
Thanks,
Here is the way of combining the results from the different clusters into a dataframe (very efficient from a runtime standpoint):
lstData <- Map(as.data.frame, result)
dfData <- rbindlist(lstData)
beta.df = as.data.frame(dfData)

Resources