Loop for Correlation in R

I am trying to find a way to write a nested for loop in R to get every possible correlation combination of this form:
cor(y, column1 * column2),
cor(y, column1 * column3),
cor(y, column1 * column4)
and so on.
This is what I have tried so far:
for(i in 1:length(dataframe))
{
  for(j in 1:length(dataframe))
  {
    joint_correlation(i,j)=cor(y ~ dataframe(i) * dataframe(j));
  }
}
My dataframe has 115 columns like shown with a small sample:
FG_pct FGA FT FT_pct FTA GP GS GmSc MP ORB
0.625 8 0 0.00 0 1 0 6.6 28.4 2
0.500 4 0 0.00 1 2 0 2.1 17.5 0
0.000 1 0 0.00 0 3 0 1.2 6.6 1
0.500 6 0 0.00 0 4 0 3.6 13.7 1
0.500 2 0 0.00 0 5 0 0.9 7.4 1
I want to find the correlation cor(MP, column1 * column2) for every possible pair of columns substituted for column1 and column2. That way I wouldn't have to do every single one of them separately. If possible, I would like to save the output of each correlation combination cor(MP, column1 * column2), cor(MP, column1 * column3), cor(MP, column2 * column4), etc. in a separate column.
This is an example of what I want:
cor(MP, FG_pct*FT_pct)

Edit: Jean-Claude Arbaut gives a better answer, as commented on this answer: use cor(df).
Here is my botched answer: using the corrgram library (which is mainly a visual tool), we can easily get all pairwise correlations in a dataset. Example:
library(corrgram)

# Example data
df <- data.frame(x = rnorm(50, 5, 5),
                 y = rnorm(50, 2, 5))
df$z   <- df$x / df$y
df$abc <- df$x * df$y * df$z

# panel arguments are necessary if you want to visualize correlations
corr <- corrgram(df,
                 order = FALSE,
                 lower.panel = panel.cor,
                 upper.panel = panel.pts,
                 text.panel = panel.txt,
                 diag.panel = panel.minmax,
                 main = "Correlation")
# calling corr gives the correlation matrix
corr
x y z abc
x 1.00000000 0.07064179 0.1402051 0.89166002
y 0.07064179 1.00000000 0.2495239 0.08024278
z 0.14020508 0.24952388 1.0000000 0.14649093
abc 0.89166002 0.08024278 0.1464909 1.00000000
There is absolutely a better way of doing this with base functions and without a package, but it's early here, and if you are desperate to get the results this will probably do fine.
P.S. Using the corrgram() function without assigning it will give you a nice visualization of your correlations.
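For completeness, the cor(df) route mentioned in the edit above needs no package at all and returns the same matrix. A minimal sketch using the example df defined here:

# Base R: full pairwise correlation matrix, no package needed
cor(df)
# correlations of every column with a single column, e.g. x
cor(df)[, "x"]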

Assuming you want the correlation of MP with the product of each combination of two of the remaining columns.
We can find the names of the corresponding combinations using combn(names(dat), 2), which we put into an lapply.
combs <- do.call(cbind.data.frame,
                 lapply("MP", rbind, combn(names(dat)[names(dat) != "MP"], 2)))
combs
# 1 2 3
# 1 MP MP MP
# 2 FG_pct FG_pct FGA
# 3 FGA FT FT
In another lapply we subset the data on the name combinations and calculate cor(x1, x2 * x3) for each. At the same time we store the pasted names in an attribute, so we remember later what was calculated in each iteration.
res.l <- lapply(combs, function(x) {
  `attr<-`(cor(dat[, x[1]], dat[, x[2]] * dat[, x[3]]),
           "what",
           paste0(x[1], ", ", paste(x[2], "*", x[3])))
})
Finally, we unlist and apply setNames according to the attributes.
res <- setNames(unlist(res.l), sapply(res.l, attr, "what"))
res
Result
# MP, FG_pct * FGA MP, FG_pct * FT MP, FGA * FT
# 0.2121374 0.2829003 0.4737892
Check:
(Note that you can put the names, e.g. MP, FG_pct * FGA, directly into the cor function.)
with(dat, cor(MP, FG_pct * FGA))
# [1] 0.2121374
with(dat, cor(MP, FG_pct * FT))
# [1] 0.2829003
with(dat, cor(MP, FGA * FT))
# [1] 0.4737892
To sort, use e.g. sort(res) or rev(sort(res)).
Toy data:
set.seed(42)
dat <- as.data.frame(`colnames<-`(MASS::mvrnorm(n=1e4,
mu=c(0.425, 4.2, 0.2, 3),
Sigma=matrix(c(1, .3, .7, 0,
.3, 1, .5, 0,
.7, .5, 1, 0,
0, 0, 0, 1), nrow=4),
empirical=T), c("FG_pct", "MP", "FGA", "FT")))

Related

How to optimize my function by dropping loops

I have the following function that uses nested loops, and honestly I'm not sure how to proceed with making the code run more efficiently. It runs fine for 100 sims in my opinion, but when I ran it for 2000 sims it took almost 12 seconds.
This code will generate any n Brownian motion simulations and works well; the issue is that once the simulation size is increased to, say, 500+ it starts to bog down, and when it hits 2k it's pretty slow, i.e. around 12 seconds.
Here is the function:
ts_brownian_motion <- function(.time = 100, .num_sims = 10, .delta_time = 1,
                               .initial_value = 0) {
  # TidyEval ----
  T <- as.numeric(.time)
  N <- as.numeric(.num_sims)
  delta_t <- as.numeric(.delta_time)
  initial_value <- as.numeric(.initial_value)

  # Checks ----
  if (!is.numeric(T) | !is.numeric(N) | !is.numeric(delta_t) | !is.numeric(initial_value)) {
    rlang::abort(
      message = "All parameters must be numeric values.",
      use_cli_format = TRUE
    )
  }

  # Initialize empty data.frame to store the simulations
  sim_data <- data.frame()

  # Generate N simulations
  for (i in 1:N) {
    # Initialize the current simulation with a starting value of 0
    sim <- c(initial_value)
    # Generate the brownian motion values for each time step
    for (t in 1:(T / delta_t)) {
      sim <- c(sim, sim[t] + rnorm(1, mean = 0, sd = sqrt(delta_t)))
    }
    # Bind the time steps, simulation values, and simulation number together
    # in a data.frame and add it to the result
    sim_data <- rbind(
      sim_data,
      data.frame(
        t = seq(0, T, delta_t),
        y = sim,
        sim_number = i
      )
    )
  }

  # Clean up
  sim_data <- sim_data %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(sim_number = forcats::as_factor(sim_number)) %>%
    dplyr::select(sim_number, t, y)

  # Return ----
  attr(sim_data, ".time") <- .time
  attr(sim_data, ".num_sims") <- .num_sims
  attr(sim_data, ".delta_time") <- .delta_time
  attr(sim_data, ".initial_value") <- .initial_value

  return(sim_data)
}
Here is some output of the function:
> ts_brownian_motion(.time = 10, .num_sims = 25)
# A tibble: 275 × 3
sim_number t y
<fct> <dbl> <dbl>
1 1 0 0
2 1 1 -2.13
3 1 2 -1.08
4 1 3 0.0728
5 1 4 0.562
6 1 5 0.255
7 1 6 -1.28
8 1 7 -1.76
9 1 8 -0.770
10 1 9 -0.536
# … with 265 more rows
# ℹ Use `print(n = ...)` to see more rows
As suggested in the comments, if you want speed, you should use cumsum. You need to be clear what type of Brownian motion you want (arithmetic or geometric). For geometric Brownian motion, you'll need to correct the approximation error by adjusting the mean. As an example, the NMOF package (which I maintain) contains a function gbm that implements geometric Brownian motion through cumsum. Here is an example call for 2000 paths with 100 timesteps each.
library("NMOF")
library("zoo") ## for plotting
timesteps <- 100
system.time(b <- NMOF::gbm(2000, tau = 1, timesteps = 100, r = 0, v = 1))
## user system elapsed
## 0.013 0.000 0.013
dim(b) ## each column is one path, starting at time zero
## [1] 101 2000
plot(zoo(b[, 1:5], 0:timesteps), plot.type = "single")
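If you want to stay in base R and keep the arithmetic Brownian motion of the original function, a minimal vectorised sketch using cumsum might look like the following. The function name and output layout are only illustrative (they mimic the question's function, not any package API):

# Sketch: vectorised arithmetic Brownian motion, one cumsum per path
ts_brownian_motion_fast <- function(.time = 100, .num_sims = 10,
                                    .delta_time = 1, .initial_value = 0) {
  steps <- .time / .delta_time
  # all increments at once: a (steps x .num_sims) matrix of N(0, delta_t) draws
  increments <- matrix(rnorm(steps * .num_sims, mean = 0, sd = sqrt(.delta_time)),
                       nrow = steps, ncol = .num_sims)
  # column-wise cumulative sums give the paths; prepend the starting value
  paths <- rbind(.initial_value, .initial_value + apply(increments, 2, cumsum))
  data.frame(
    sim_number = factor(rep(seq_len(.num_sims), each = steps + 1)),
    t          = rep(seq(0, .time, by = .delta_time), times = .num_sims),
    y          = as.vector(paths)
  )
}

head(ts_brownian_motion_fast(.time = 10, .num_sims = 25))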

Looking for an apply, tidyr or dplyr solution to a nested for loop situation in R

Weirdly for this one, I think it's easier to start by viewing the df.
# reproducible data
quantiles <- c("50", "90")
var <- c("w", "d")
df <- data.frame(a = runif(20, 0.01, .5), b = runif(20, 0.02, .5),
                 c = runif(20, 0.03, .5), e = runif(20, 0.04, .5),
                 q50 = runif(20, 1, 5), q90 = runif(20, 10, 50))
head(df)
I want to automate a function that I've created (below) to calculate vars using different combinations of values from my df.
For example, the calculation of w needs to use a and b, and d needs to use c and e, such that w = a * q ^ b and d = c * q ^ e. Further, q is a quantile, so I actually want w50, w90, etc., which correspond to q50, q90, etc. from the df.
The tricky part, as I see it, is setting the condition to use a & b vs. c & e without using nested loops.
I have a function to calculate the vars using the appropriate columns; however, I can't get all the pieces together efficiently.
# function to calculate the w, d
calc_wd <- function(df, col_name, col1, col2, col3){
  # Calculate and create new column col_name for each combo of var and quantile, e.g. "w_50", "d_50", etc.
  df[[col_name]] <- df[[col1]] * (df[[col2]] ^ (df[[col3]]))
  df
}
I can get this to work for a single case, but not by automating the coefficient swap... you'll see I specify "a" and "b" below.
wd<-c("w_","d_")
make_wd_list<-apply(expand.grid(wd, quantiles), 1, paste,collapse="")
calc_wdv(df,make_wd_list[1],"a",paste0("q",sapply(strsplit(make_wd_list[1],"_"),tail,1)),"b")
Alternatively, I have tried to make this work using nested for loops, but can't seem to append the data correctly. And it's ugly.
var=c("w","d")
dataf<-data.frame()
for(j in unique(var)){
if(j=="w"){
coeff1="a"
coeff2="b"
}else if(j=="d"){
coeff1="c"
coeff1="e"
}
print(coeff1)
print(coeff2)
for(k in unique(quantiles)){
dataf<-calc_wd(df,paste0(j,k),coeff1,paste0("q",k),coeff2)
dataf[k,j]=rbind(df,dataf) #this aint right. tried to do.call outside, etc.
}
}
In the end, I'm looking to have new columns with w_50, w_90, etc., which use q50, q90 and the corresponding coefficients as defined originally.
One approach I find easy to type is using purrr::pmap. I like this because when you use with(list(...),), you can access the column names of your data.frame by name. Additionally, you can supply additional arguments.
library(purrr)
pmap_df(df, quant = "q90", ~with(list(...),{
list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)
}))
## A tibble: 20 x 2
# w d
# <dbl> <dbl>
# 1 0.239 0.295
# 2 0.152 0.392
# 3 0.476 0.828
# 4 0.344 0.236
# 5 0.439 1.00
You could combine this with for example a second map call to iterate over quantiles.
library(dplyr)
map(setNames(quantiles, quantiles),
    ~ pmap_df(df, quant = paste0("q", .x),
              ~ with(list(...), {list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)}))
) %>% do.call(cbind, .)
# 50.w 50.d 90.w 90.d
#1 0.63585897 0.11045837 1.7276019 0.1784987
#2 0.17286184 0.22033649 0.2333682 0.5200265
#3 0.32437528 0.72502654 0.5722203 1.4490065
#4 0.68020897 0.33797621 0.8749206 0.6179557
#5 0.73516886 0.38481785 1.2782923 0.4870877
Then assigning a custom function is trivial.
calcwd <- function(df, quantiles){
  map(setNames(quantiles, quantiles),
      ~ pmap_df(df, quant = paste0("q", .x),
                ~ with(list(...), {list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)}))
  ) %>% do.call(cbind, .)
}
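A usage sketch, with the quantiles vector from the question (calcwd pastes the "q" prefix on internally):

calcwd(df, quantiles)        # quantiles was c("50", "90") in the question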
I love @Ian's answer for its completeness and the use of classics like with and do.call. I'm late to the scene with my solution, but since I have been trying to get better with rowwise operations (including the use of rowwise()), I thought I would offer up a less elegant but simpler and faster solution using just mutate, formula.tools and map_dfc.
library(dplyr)
library(purrr)
require(formula.tools)
# same type example data plus a much larger version in df2 for
# performance testing
df <- data.frame(a = runif(20, 0.01, .5),
b = runif(20, 0.02, .5),
c = runif(20, 0.03, .5),
e = runif(20, 0.04, .5),
q50 = runif(20,1,5),
q90 = runif(20,10,50)
)
df2 <- data.frame(a = runif(20000, 0.01, .5),
b = runif(20000, 0.02, .5),
c = runif(20000, 0.03, .5),
e = runif(20000, 0.04, .5),
q50 = runif(20000,1,5),
q90 = runif(20000,10,50)
)
# from your original post
quantiles <- c("q50", "q90")
wd <- c("w_", "d_")
make_wd_list <- apply(expand.grid(wd, quantiles),
1,
paste, collapse = "")
make_wd_list
#> [1] "w_q50" "d_q50" "w_q90" "d_q90"
# an empty list to hold our formulas
eqn_list <- vector(mode = "list",
length = length(make_wd_list)
)
# populate the list; this makes it very extensible to more outcomes
# or to more quantile levels
for (i in seq_along(make_wd_list)) {
  if (substr(make_wd_list[[i]], 1, 1) == "w") {
    eqn_list[[i]] <- as.formula(paste(make_wd_list[[i]], "~ a * ", substr(make_wd_list[[i]], 3, 5), " ^ b"))
  } else if (substr(make_wd_list[[i]], 1, 1) == "d") {
    eqn_list[[i]] <- as.formula(paste(make_wd_list[[i]], "~ c * ", substr(make_wd_list[[i]], 3, 5), " ^ e"))
  }
}
# formula.tools helps us grab both left and right sides
add_column <- function(df, equation){
  df <- transmute_(df, rhs(equation))
  colnames(df)[ncol(df)] <- as.character(lhs(equation))
  return(df)
}
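# (Aside: a tiny illustration of what formula.tools gives us here; the trailing
#  comments show what rhs()/lhs() should return for this formula.)
f <- w_q50 ~ a * q50 ^ b
rhs(f)   # the expression: a * q50^b
lhs(f)   # the name: w_q50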
result <- map_dfc(eqn_list, ~ add_column(df = df, equation = .x))
#> w_q50 d_q50 w_q90 d_q90
#> 1 0.10580863 0.29136904 0.37839737 0.9014040
#> 2 0.34798729 0.35185585 0.64196417 0.4257495
#> 3 0.79714122 0.37242915 1.57594506 0.6198531
#> 4 0.56446922 0.43432160 1.07458217 1.1082825
#> 5 0.26896574 0.07374273 0.28557366 0.1678035
#> 6 0.36840408 0.72458466 0.72741030 1.2480547
#> 7 0.64484009 0.69464045 1.93290705 2.1663690
#> 8 0.43336109 0.21265672 0.46187366 0.4365486
#> 9 0.61340404 0.47528697 0.89286358 0.5383290
#> 10 0.36983212 0.53292900 0.53996112 0.8488402
#> 11 0.11278412 0.12532491 0.12486156 0.2413191
#> 12 0.03599639 0.25578020 0.04084221 0.3284659
#> 13 0.26308183 0.05322304 0.87057854 0.1817630
#> 14 0.06533586 0.22458880 0.09085436 0.3391683
#> 15 0.11625845 0.32995233 0.12749040 0.4730407
#> 16 0.81584442 0.07733376 2.15108243 0.1041342
#> 17 0.38198254 0.60263861 0.68082354 0.8502999
#> 18 0.51756058 0.43398089 1.06683204 1.3397900
#> 19 0.34490492 0.13790601 0.69168711 0.1580659
#> 20 0.39771037 0.33286225 1.32578056 0.4141457
microbenchmark::microbenchmark(result <- map_dfc(eqn_list, ~ add_column(df = df2, equation = .x)), times = 10)
#> Unit: milliseconds
#>                                                                expr      min       lq     mean  median       uq      max neval
#>  result <- map_dfc(eqn_list, ~add_column(df = df2, equation = .x)) 10.58004 11.34603 12.56774 11.6257 13.24273 16.91417    10
The mutate and formula solution is about fifty times faster, although both rip through 20,000 rows in less than a second.
Created on 2020-04-30 by the reprex package (v0.3.0)

How to plot correlation matrix in R with already derived correlation values between variables?

I have an output table which looks like:
Vars Corr SE
1_2 0.51 0.003
1_3 0.32 0.001
...
49_50 0.23 0.006
where the correlation values were derived in other software for the variable pairs stated in Vars (1_2 refers to the correlation between Variable 1 and Variable 2). What is the best way to convert this into a format that displays the correlation matrix between all 50 variables?
I'm assuming there needs to be a way to make the diagonals 1 as well?
Thanks!
So, supposing you have the data in a single column, you can restructure it and use corrplot:
library(dplyr)
library(corrplot)

cordata = data.frame(
  Vars = paste0(rep(1:50, times = 50), "_",
                rep(1:50, each = 50)),
  Corr = rnorm(n = 50*50, mean = 0, sd = .3)
) %>%
  # for the sake of demonstration, set Corrs beyond -1 and 1 to 0
  mutate(Corr = replace(Corr, Corr > 1 | Corr < -1, 0))
head(cordata)
Vars Corr
1 1_1 0.453807195
2 2_1 0.237179163
3 3_1 0.303635874
4 4_1 -0.314318833
5 5_1 0.008682918
6 6_1 -0.067164730
cormat = matrix(cordata$Corr, byrow = TRUE, ncol = 50)
# You can use corrplot::corrplot
corrplot(cormat)
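To address the diagonal question: once the matrix is built, you can force the diagonal to 1 and attach variable labels with base R. A small sketch, assuming the variables are simply numbered 1 to 50:

diag(cormat) <- 1                              # a variable correlates perfectly with itself
rownames(cormat) <- colnames(cormat) <- 1:50   # label rows/columns for the plot
corrplot(cormat)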

Creating R function to find both distance and angle between two points

I am trying to create or find a function that calculates the distance and angle between two points. The idea is that I can have two data.frames with x, y coordinates as follows:
Example dataset
From <- data.frame(x = c(0.5,1, 4, 0), y = c(1.5,1, 1, 0))
To <- data.frame(x =c(3, 0, 5, 1), y =c(3, 0, 6, 1))
Current function
For now, I've managed to develop the distance part using Pythagoras:
distance <- function(from, to){
  D <- sqrt((abs(from[,1] - to[,1])^2) + (abs(from[,2] - to[,2])^2))
  return(D)
}
Which works fine:
distance(from = From, to = To)
[1] 2.915476 1.414214 5.099020 1.414214
but I can't figure out how to get the angle part.
What I tried so far:
I tried adapting the second solution of this question
angle <- function(x, y){
  dot.prod <- x %*% y
  norm.x <- norm(x, type = "2")
  norm.y <- norm(y, type = "2")
  theta <- acos(dot.prod / (norm.x * norm.y))
  as.numeric(theta)
}
x <- as.matrix(c(From[,1],To[,1]))
y <- as.matrix(c(From[,2],To[,2]))
angle(t(x),y)
But I am clearly making a mess of it
Desired output
I would like having the angle part of the function added to my first function, where I get both the distance and angle between the from and to dataframes
By the angle between two points, I am assuming you mean the angle between the two vectors defined by those points as endpoints (and assuming both start at the origin).
The example you adapted was designed around a single pair of points, which is why it uses the transpose; it is, however, robust enough to work in more than two dimensions.
Your function should be vectorised, as your distance function is, since it is expecting a number of pairs of points (and we are only considering 2-dimensional points).
angle <- function(from, to){
  dot.prods <- from$x*to$x + from$y*to$y
  norms.x <- distance(from = `[<-`(from,,,0), to = from)
  norms.y <- distance(from = `[<-`(to,,,0), to = to)
  thetas <- acos(dot.prods / (norms.x * norms.y))
  as.numeric(thetas)
}
angle(from=From,to=To)
[1] 0.4636476 NaN 0.6310794 NaN
The NaNs are due to you having zero-length vectors.
How about:
library(useful)
df <- To - From
cart2pol(df$x, df$y, degrees = F)
which returns:
# A tibble: 4 x 4
r theta x y
<dbl> <dbl> <dbl> <dbl>
1 2.92 0.540 2.50 1.50
2 1.41 3.93 -1.00 -1.00
3 5.10 1.37 1.00 5.00
4 1.41 0.785 1.00 1.00
where r is the distance and theta is the angle.
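If what you want is the direction from each from point to its paired to point (as the cart2pol call above computes), a base-R sketch without any package could look like this; the function name dist_angle is just illustrative, and note that atan2 returns angles in (-pi, pi], whereas the cart2pol output above appears to use [0, 2*pi):

# Sketch: distance and direction angle (in radians) for paired points
dist_angle <- function(from, to){
  dx <- to$x - from$x
  dy <- to$y - from$y
  data.frame(distance = sqrt(dx^2 + dy^2),
             angle    = atan2(dy, dx))
}
dist_angle(From, To)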

BTYD Individual Level Estimations For All Observations

I am using the BG/NBD model from the BTYD package in R and did the individual-level estimates.
For instance, following the documentation on page 20 of:
BTYD Walkthrough
Code for Data Prep:
system.file("data/cdnowElog.csv", package = "BTYD")%>%
dc.ReadLines(., cust.idx = 2, date.idx = 3, sales.idx = 5)%>%
dc.MergeTransactionsOnSameDate()%>%
mutate(date = parse_date_time(date, "%Y%m%d")) -> elog
end.of.cal.period <- as.Date("1997-09-30")
elog.cal <- elog[which(elog$date <= end.of.cal.period), ]
split.data <- dc.SplitUpElogForRepeatTrans(elog.cal);
birth.periods <- split.data$cust.data$birth.per
last.dates <- split.data$cust.data$last.date
clean.elog <- split.data$repeat.trans.elog;
freq.cbt <- dc.CreateFreqCBT(clean.elog);
tot.cbt <- dc.CreateFreqCBT(elog)
cal.cbt <- dc.MergeCustomers(tot.cbt, freq.cbt)
cal.cbs.dates <- data.frame(birth.periods, last.dates, end.of.cal.period)
cal.cbs <- dc.BuildCBSFromCBTAndDates(cal.cbt, cal.cbs.dates,per="week")
params <- pnbd.EstimateParameters(cal.cbs);
one could get estimates for a particular observation.
Code for Individual Level Estimation:
cal.cbs["1516",]
# x t.x T.cal
# 26.00 30.86 31.00
x <- cal.cbs["1516", "x"]
t.x <- cal.cbs["1516", "t.x"]
T.cal <- cal.cbs["1516", "T.cal"]
bgnbd.ConditionalExpectedTransactions(params, T.star = 52,
x, t.x, T.cal)
# [1] 25.76
My question is: is it possible to run this over every row, so that I get a data frame containing the expectations for each customer, instead of hard-coding a particular ID number such as "1516"?
Thanks!
Yes, it is straightforward with dplyr's mutate()
cal.cbs %>%
  data.frame() %>%
  mutate(`Conditional Expectation` = bgnbd.ConditionalExpectedTransactions(params, T.star = 52, x, t.x, T.cal))
x t.x T.cal Conditional Expectation
1 2 30.428571 38.85714 2.3224971
2 1 1.714286 38.85714 1.0646350
3 0 0.000000 38.85714 0.5607707
4 0 0.000000 38.85714 0.5607707
5 0 0.000000 38.85714 0.5607707
6 7 29.428571 38.85714 6.0231497
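If you also want to keep the customer IDs (which live in the row names of cal.cbs, e.g. "1516") as a column of the result, a small variation is possible; this is a sketch assuming the tibble package is available:

library(dplyr)
library(tibble)

cal.cbs %>%
  as.data.frame() %>%
  rownames_to_column("cust") %>%      # keep customer IDs as a column
  mutate(`Conditional Expectation` =
           bgnbd.ConditionalExpectedTransactions(params, T.star = 52, x, t.x, T.cal))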
