I would like to calculate the moving recency-weighted mean finishing positions of a horse given the times (day) and finishing positions (pos) for a sequence of races in which the horse participated. Such statistics are useful in handicapping.
Currently, I am using a "loop-inside-a-loop" approach. Is there a faster or more elegant R-language approach to this problem?
#
# Test data
#
day <- c(0, 6, 10, 17, 21, 26, 29, 31, 34, 38, 41, 47, 48, 51, 61)
pos <- c(3, 5, 6, 1, 1, 3, 4, 1, 2, 2, 2, 6, 4, 5, 6)
testdata <- data.frame(id = 1, day = day, pos = pos, wt.pos = NA)
#
# No weight is given to observations earlier than cutoff
#
cutoff <- 30
#
# Rolling recency-weighted mean (wt.pos)
#
for(i in 2:nrow(testdata)) {
wt <- numeric(i-1)
for(j in 1:(i-1))
wt[j] <- max(0, cutoff - day[i] + day[j] + 1)
if (sum(wt) > 0)
testdata$wt.pos[i] <- sum(pos[1:j] * wt) / sum(wt)
}
> testdata
id day pos wt.pos
1 1 0 3 NA
2 1 6 5 3.000000
3 1 10 6 4.125000
4 1 17 1 4.931034
5 1 21 1 3.520548
6 1 26 3 2.632911
7 1 29 4 2.652174
8 1 31 1 2.954128
9 1 34 2 2.436975
10 1 38 2 2.226891
11 1 41 2 2.119048
12 1 47 6 2.137615
13 1 48 4 3.030534
14 1 51 5 3.303704
15 1 61 6 4.075000
I'd go for
# Calculate `wt` for all values of `i` in one go
wt <- lapply(2:nrow(testdata), function(i)
pmax(0, cutoff - day[i] + day[1:(i-1)] + 1))
# Fill in the column
testdata$wt.pos[-1] <- mapply(
function(i, w) if(sum(w) > 0) sum(pos[1:i]*w)/sum(w) else NA,
1:(nrow(testdata)-1), wt)
Note that by calculating the second argument to max for all values of j at the same time we have vectorized the computation, which improves the speed by many orders of magnitude.
I found no easy way to vectorize the outer loop and the if case though (apart from rewriting it in C which seems like overkill), but lapply, mapply and similar are still faster than for loops.
This version demonstrates how to calculate moving recency-weighted means for 1 or more variables (e.g., finishing position, speed rating, etc.) and 1 or more subjects (horses).
library(plyr)
day <- c(0, 6, 10, 17, 21, 26, 29, 31, 34, 38, 41, 47, 48, 51, 61)
pos <- c(3, 5, 6, 1, 1, 3, 4, 1, 2, 2, 2, 6, 4, 5, 6)
dis <- 100 + 0.5 * (pos - 1)
testdata1 <- data.frame(id = 1, day = day, pos = pos, dis = dis)
day <- c(0, 4, 7, 14, 22, 23, 31, 38, 42, 47, 52, 59, 68, 69, 79)
pos <- c(1, 3, 2, 6, 4, 5, 2, 1, 4, 5, 2, 1, 5, 5, 2)
dis <- 100 + 0.5 * (pos - 1)
testdata2 <- data.frame(id = 2, day = day, pos = pos, dis = dis)
testdata <- rbind(testdata1, testdata2)
# Moving recency-weighted mean
rollmean <- function(day, obs, cutoff = 90) {
obs <- as.matrix(obs)
wt <- lapply(2:nrow(obs), function(i)
pmax(0, cutoff - day[i] + day[1:(i-1)] + 1))
wt.obs <- lapply(1:(nrow(obs)-1), FUN =
function(i)
if(sum(wt[[i]]) > 0) {
apply(obs[1:i, , drop = F] * wt[[i]], 2, sum) / sum(wt[[i]])
} else {
rep(NA, ncol(obs))
}
)
answer <- rbind(rep(NA, ncol(obs)), do.call(rbind, wt.obs))
if (!is.null(dimnames(answer)))
dimnames(answer)[[2]] <- paste("wt", dimnames(answer)[[2]], sep = ".")
return(answer)
}
x <- dlply(testdata, .(id), .fun =
function(DF) rollmean(DF$day, DF[, c("pos", "dis"), drop = F])
)
y <- do.call(rbind, x)
Related
From the following dataframe:
df1 <- data.frame(id=c(1, 2, 3, 4, 5, 6, 7),
revenue=c(34, 1000, 40, 49, 43, 55, 99))
df2 <- data.frame(id=c(1, 2, 3, 4, 5, 6, 7),
expenses=c(22, 26, 31, 40, 20, 2000, 22))
df3 <- data.frame(id=c(1, 2, 3, 4, 5, 6, 7),
profit=c(12, 10, 14, 12, 9, 15, 16))
df_list <- list(df1, df2, df3)
test <- Reduce(function(x, y) merge(x, y, all=TRUE), df_list)
rownames(test) <- test[,1]
test[,1] <- NULL
test
I would like to eliminate extreme values (e.g. 1000 and 2000). I need to cutoff everything that is greater than 100. When I check test<100 I see TRUE and FALSE positions but I would like to replace them with NA or zeroes.
To replace all values in a dataframe (df) which values are higher than 100 with a 0 simply use: df[df > 100] = 0
We can use replace()
replace(test, test>100, NA)
revenue expenses profit
1 34 22 12
2 NA 26 10
3 40 31 14
4 49 40 12
5 43 20 9
6 55 NA 15
7 99 22 16
I have the following data frame of an experiment with two replicates. I want to filter df based on score == 0 in both replicates for each timestamp & ID.
df <- data.frame(timestamp = c(1, 1, 1, 1, 2, 2, 2, 2),
ID = c(57, 57, 55, 55, 57, 57, 55, 55),
replicate= c(1, 2, 1, 2, 1, 2, 1, 2),
score = c(0, 1, 0, 0, 0, 1, 0, 0))
E.g. the desired output would be:
target <- data.frame(timestamp = c(1, 1, 2, 2),
ID = c(55, 55, 55, 55),
replicate = c(1, 2, 1, 2),
score = c(0, 0, 0, 0))
I've come up with a solution in a double-loop, which is inelegant and most likely inefficient:
tsvec <- df$timestamp %>% unique
idvec <- df$ID %>% unique
df_out <- c()
for(i in seq_along(tsvec)){ # loop along timestamps
innerdat <- df %>% filter(timestamp == tsvec[i])
for(j in seq_along(idvec)){ # loop along IDs
innerdat2 <- innerdat %>% filter(ID == idvec[j])
if(sum(innerdat2$score) == 0){
df_out <- rbind(df_out, innerdat2)
} else {
NULL
}
}
}
Does anybody have a dplyr way of making this more efficient?
library(dplyr)
df %>% group_by(ID) %>% filter(all(score==0))
# A tibble: 4 x 4
# Groups: ID [1]
timestamp ID replicate score
<dbl> <dbl> <dbl> <dbl>
1 1 55 1 0
2 1 55 2 0
3 2 55 1 0
4 2 55 2 0
An approach with data.table
library(data.table)
setDT(df)[, .SD[all(score == 0)], by = ID]
I am trying to write code that subtracts a given value from a variable until each row has a predicted probability at or above .05.
train <- data.frame('cost'= c(120, 3, 2, 4, 10, 110, 200, 43, 1, 51, 22, 14),
'price' = c(120, 20, 10, 4, 3, 4, 30, 43, 56, 88, 75, 44),
'dich' = c(0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0))
train$dich <- as.factor(train$dich)
test <- data.frame('cost'= c(13, 5, 32, 22, 14, 145, 54, 134, 11, 14, 33, 21),
'price' = c(32, 11, 210, 6, 3, 7, 22, 423, 19, 99, 192, 32)
)
model <- glm(dich ~ cost + price,
data = train,
family = "binomial")
pred <- predict(model, test, type = "response")
1 2 3 4
3.001821e-01 4.442316e-01 4.507495e-04 6.310900e-01
5 6 7 8
5.995459e-01 9.888085e-01 7.114101e-01 1.606681e-06
9 10 11 12
4.096450e-01 2.590474e-02 9.908167e-04 3.572890e-01
So in the above output the cases 4, 5, 6, and 7 would remain the same because they are already above .05 but for the rest of the cases I would like to subtract 1 from the price column and then run the prediction again and repeat until all cases have a probability at or above .05.
If you want to subtract 1 for each row (or "customer") individually, rather than 1 across the board:
test$pred_prob <- NA
for (n in 1:nrow(test)) {
print("-----------------------------")
print(n)
while (TRUE) {
pred <- predict(model, test[n,], type = "response")
print(pred)
test$pred_prob[n] <- pred
if (sum(pred > 0.05) == length(pred)) {
print(test$price[n])
break
}
test$price[n] <- test$price[n] - 1
}
print(test)
}
# cost price pred_prob
# 1 13 32 0.30018209
# 2 5 11 0.44423163
# 3 32 96 0.05128337
# 4 22 6 0.63109001
# 5 14 3 0.59954586
# 6 145 7 0.98880854
# 7 54 22 0.71141007
# 8 134 175 0.05074762
# 9 11 19 0.40964501
# 10 14 82 0.05149897
# 11 33 97 0.05081947
# 12 21 32 0.35728897
I see what you are trying to do but the results are quite hilarious. This is if you want to subtract 1 from all elements of price each time:
x <- 1
while (TRUE) {
print("----------------------------------------")
print(x)
test$price <- test$price - 1
pred <- predict(model, test, type = "response")
print(pred)
x <- x + 1
if (sum(pred > 0.05) == length(pred)) {
print(test)
break
}
}
# ... loops 247 times
# [1] "----------------------------------------"
# [1] 248
# 1 2 3 4 5 6 7 8 9 10 11 12
# 0.99992994 0.99996240 0.93751936 0.99998243 0.99997993 0.99999966 0.99998781 0.05074762 0.99995669 0.99887117 0.97058913 0.99994594
# cost price
# 1 13 -216
# 2 5 -237
# 3 32 -38
# 4 22 -242
# 5 14 -245
# 6 145 -241
# 7 54 -226
# 8 134 175
# 9 11 -229
# 10 14 -149
# 11 33 -56
# 12 21 -216
In case anyone else wants to run the same thing with a xgboost model.
train <- data.frame('cost'= c(120, 3, 2, 4, 10, 110, 200, 43, 1, 51, 22, 14),
'price' = c(120, 20, 10, 4, 3, 4, 30, 43, 56, 88, 75, 44))
label <- data.frame('dich' = c(0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0))
train <- as.matrix(train)
label <- as.matrix(label)
model <- xgboost(data = train,
label = label,
max.depth = 3,
nround = 1,
objective = "binary:logistic")
test <- data.frame('cost'= c(13, 5, 32, 22, 14, 145, 54, 134, 11, 14, 33, 21),
'price' = c(32, 11, 210, 6, 3, 7, 22, 423, 19, 99, 192, 32)
)
test <- as.matrix(test)
#FOR A MATRIX
test <- cbind(test, rep(NA, nrow(test)))
colnames(test)[3] <- c("pred_prob")
for (n in 1:nrow(test)) {
print("-----------------------------")
print(n)
while (TRUE) {
pred <- predict(model, t(test[n,]), type = "response")
print(pred)
test[,"pred_prob"][n] <- pred
if (sum(pred > 0.5) == length(pred)) {
print(test[,"pred_prob"][n])
break
}
test[,"price"][n] <- test[,"price"][n] - .01
}
print(test)
}
It seems to take a while to run on 12 rows. I need to do some thinking about the thresholds of a tree model and how that'll effect a range of different changes in the price to obtain at or above .5 probability (which I meant in my first question but I wrote .05 haha).
Let's say we have two list
x <- c(1, 3, 4, 2, 6, 5)
y <- c(12, 14, 15, 61, 71, 21)
I want to combine into a list so that we have 2 column x and y and values should be in same order.
x <- c(1, 3, 4, 2, 6, 5)
y <- c(12, 14, 15, 61, 71, 21)
After you have a list I want to sort it on y so the final list looks like
x <- c(1, 3, 4, 5, 2, 6)
y <- c(12, 14, 15, 21, 61, 71)
I am really new to R.
I tried list(x,y) but it seems to make a
list(1, 3, 4, 2, 6, 5, 12, 14, 15, 61, 71, 21)
so I was wondering someone could help me.
You need to put them in a data.frame first and then use order:
x <- c(1, 3, 4, 2, 6, 5)
y <- c(-12, 14, 15, 61, 71, 21)
DF <- data.frame(x, y)
> DF[order(DF$y),]
x y
1 1 -12
2 3 14
3 4 15
6 5 21
4 2 61
5 6 71
keeping as a list, using lapply:
x <- c(1, 3, 4, 2,6,5)
y <- c(12, 14,15,61,71,21)
l <- list(x = x, y = y)
## thelatemail
lapply(l, `[`, order(l$y))
# $x
# [1] 1 3 4 5 2 6
#
# $y
# [1] 12 14 15 21 61 71
a more explicit version of the short one given by #thelatemail above but doesn't preserve the names:
lapply(seq_along(l), function(x) l[[x]][order(l$y)])
# [[1]]
# [1] 1 3 4 5 2 6
#
# [[2]]
# [1] 12 14 15 21 61 71
or rapply:
rapply(l, function(x) x[order(l$y)], how = 'list')
# $x
# [1] 1 3 4 5 2 6
#
# $y
# [1] 12 14 15 21 61 71
I have the following data and nested for loop:
x <- c(12, 27, 21, 16, 12, 21, 18, 16, 20, 23, 21, 10, 15, 26, 21, 22, 22, 19, 26, 26)
y <- c(8, 10, 7, 7, 9, 5, 7, 7, 10, 4, 10, 3, 9, 6, 4, 2, 4, 2, 3, 6)
a <- c(20,25)
a.sub <- c()
df <- c()
for(j in 1:length(a)){
a.sub <- which(x >= a[j])
for(i in 1:length(a.sub)){
df[i] <- y[a.sub[i]]
}
print(df)
}
I'd like the loop to return values for df as:
[1] 10 6 3 6 4 10 6 4 2 4 3 6
[1] 10 6 3 6
As I have it, however, the loop returns the same values twice of df for a <- 20 but not a <- 25:
[1] 10 7 5 10 4 10 6 4 2 4 3 6
[1] 10 6 3 6 4 10 6 4 2 4 3 6
for(i in 1:length(a.sub)){
df[i] <- y[a.sub[i]]
}
can become
df <- y[a.sub]
neither a.sub nor df need to be predefined then and thus...
x <- c(12, 27, 21, 16, 12, 21, 18, 16, 20, 23, 21, 10, 15, 26, 21, 22, 22, 19, 26, 26)
y <- c(8, 10, 7, 7, 9, 5, 7, 7, 10, 4, 10, 3, 9, 6, 4, 2, 4, 2, 3, 6)
a <- c(20,25)
for(j in 1:length(a)){
a.sub <- which(x >= a[j])
df <- y[a.sub]
print(df)
}
It could be made shorter. df is unnecessary if you're just printing the subset of y anyway. Just print it directly. And the selector is so short it wouldn't make a single line confusing. Furthermore, why use length of a and index.. loop through a directly. So, it could be...
a <- c(20,25)
for(ax in a){
print( y[ which(x >= ax) ] )
}
Not sure if this is a simplified version of a more complex problem, but I'd probably solve this using some direct indexing and an apply function. Something like this:
z <- cbind(x,y)
sapply(c(20,25), function(x) z[z[, 1] >= x, 2])
[[1]]
[1] 10 7 5 10 4 10 6 4 2 4 3 6
[[2]]
[1] 10 6 3 6