Divide each column of a dataframe by one row of the dataframe

I would like to divide each column of my dataframe by the values of one of its rows.
I tried transforming my dataframe into a matrix, extracting one row of the dataframe as a vector, and then dividing the matrix by the vector, but it did not work: only the first row of the matrix got divided by the vector.
Here is my original dataframe.
And this is the code I tried to run:
library(readxl)  # for read_excel()
data <- read_excel("Documents/TFB/xlsx_geochimie/solfatara_maj.xlsx")
View(data)
data.mat <- as.matrix(data[, 2:20])
vector <- data[12, 2:20]
data.mat / vector
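The reason the naive division misbehaves (my note, not part of the question): R recycles a vector element-wise in column-major order, not one element per column, so the divisors do not line up with the columns. A minimal demonstration:
m <- matrix(1:6, nrow = 2)  # 2 x 3 matrix
v <- c(1, 10, 100)
m / v  # divisors recycle down the columns as 1, 10, 100, 1, 10, 100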

We replicate the vector to match the matrix's shape and then do the division: col(data.mat) gives each cell's column number, so indexing the unlisted vector with it lines up the right divisor with every cell.
data.mat/unlist(vector)[col(data.mat)]
# FeO Total S SO4 Total N SiO2 Al2O3 Fe2O3 MnO MgO CaO Na2O K2O
#[1,] 0.10 16.5555556 NA NA 0.8908607 0.8987269 0.1835206 0.08333333 0.03680982 0.04175365 0.04823151 0.5738562
#[2,] 0.40 125.8333333 NA NA 0.5510204 0.4456019 0.2359551 0.08333333 0.04294479 0.01878914 0.04501608 0.2588235
#[3,] 0.85 0.6111111 NA NA 1.0021295 1.0162037 0.7715356 1.08333333 0.53987730 0.69728601 1.03858521 1.0457516
#[4,] 0.15 48.0555556 NA NA 1.1027507 0.2569444 NA 0.08333333 0.01840491 0.01878914 0.04180064 0.1647059
#[5,] 0.85 NA NA NA 1.0889086 1.0271991 0.6591760 0.75000000 0.59509202 0.53862213 1.02250804 1.1228758
#[6,] NA NA NA NA 1.3426797 0.6319444 0.0411985 0.08333333 0.03067485 0.11899791 0.65594855 0.7764706
# TiO2 P2O5 LOI LOI2 Total Total 2 Fe2O3(T)
#[1,] 0.7924528 0.3928571 7.0841837 6.6963855 0.9922233 0.9894632 0.14489796
#[2,] 0.5094340 0.3214286 14.5561224 13.7710843 0.9958126 0.9936382 0.31020408
#[3,] 0.8679245 0.6428571 1.5637755 1.5228916 0.9990030 0.9970179 0.80612245
#[4,] 1.4905660 0.2857143 7.4056122 7.0024096 0.9795613 0.9769384 0.05510204
#[5,] 1.0377358 0.2500000 0.3520408 0.3783133 0.9969093 0.9960239 0.74489796
#[6,] 0.3018868 0.2500000 1.2551020 1.1879518 1.0019940 1.0000000 0.04489796
Or use sweep() to apply the division column-wise (MARGIN = 2):
sweep(data.mat, MARGIN = 2, unlist(vector), FUN = `/`)
Or use mapply() with asplit() to split the matrix into columns and divide each one by its matching element:
mapply(`/`, asplit(data.mat, 2), vector)
The data:
data_mat <- structure(c(0.2, 0.8, 1.7, 0.3, 1.7, NA, 5.96, 45.3, 0.22, 17.3,
NA, NA, NA, 6.72, NA, 4.08, 0.06, 0.16, NA, NA, NA, NA, NA, NA,
50.2, 31.05, 56.47, 62.14, 61.36, 75.66, 15.53, 7.7, 17.56, 4.44,
17.75, 10.92, 0.49, 0.63, 2.06, NA, 1.76, 0.11, 0.01, 0.01, 0.13,
0.01, 0.09, 0.01, 0.06, 0.07, 0.88, 0.03, 0.97, 0.05, 0.2, 0.09,
3.34, 0.09, 2.58, 0.57, 0.15, 0.14, 3.23, 0.13, 3.18, 2.04, 4.39,
1.98, 8, 1.26, 8.59, 5.94, 0.42, 0.27, 0.46, 0.79, 0.55, 0.16,
0.11, 0.09, 0.18, 0.08, 0.07, 0.07, 27.77, 57.06, 6.13, 29.03,
1.38, 4.92, 27.79, 57.15, 6.32, 29.06, 1.57, 4.93, 99.52, 99.88,
100.2, 98.25, 99.99, 100.5, 99.54, 99.96, 100.3, 98.28, 100.2,
100.6, 0.71, 1.52, 3.95, 0.27, 3.65, 0.22), .Dim = c(6L, 19L), .Dimnames = list(
NULL, c("FeO", "Total S", "SO4", "Total N", "SiO2", "Al2O3",
"Fe2O3", "MnO", "MgO", "CaO", "Na2O", "K2O", "TiO2", "P2O5",
"LOI", "LOI2", "Total", "Total 2", "Fe2O3(T)")))
vector <- structure(list(FeO = 2, `Total S` = 0.36, SO4 = NA_real_, `Total N` = NA_real_,
SiO2 = 56.35, Al2O3 = 17.28, Fe2O3 = 2.67, MnO = 0.12, MgO = 1.63,
CaO = 4.79, Na2O = 3.11, K2O = 7.65, TiO2 = 0.53, P2O5 = 0.28,
LOI = 3.92, LOI2 = 4.15, Total = 100.3, `Total 2` = 100.6,
`Fe2O3(T)` = 4.9), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))

More generally, to divide a data frame df by its third row:
df/df[rep(3, nrow(df)), ]
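Applied to the question's data (assuming, as in the original code, that row 12 holds the divisor values and columns 2:20 are the numeric ones), that idiom becomes:
# divide every row of the numeric columns by row 12 of those columns
data[, 2:20] / data[rep(12, nrow(data)), 2:20]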


geom_smooth not working for trendline, too few points?

I am trying to get a trendline for my two sets of averages. In my main graph I will be putting error bars on the points to show the SDs, but below is a simplified version:
ggplot(sl, aes(x=Stresslevel, y=Final, color=Treatment)) +
geom_point() +
geom_smooth(method = "lm")
In my output I can see in the legend that a smooth is being added, but the line is not showing on the graph.
Edit: here is my data (thank you for the advice on getting it):
dput(sl)
structure(list(Stresslevel = structure(c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 3L, 4L, 5L), .Label = c("0", "1", "2 (30%)", "3 (50%)",
"4 (70%)", "5", "Recovered"), class = "factor"), WL = c(0, 15.5,
32.8, 52.9, 69.8, 89.2, 13.5, 30, 50, 70), WLsd = c(5, 6.5, 8.1,
8.8, 10.6, 4.2, 9.8, 5, 5, 5), Final = c(0.0292, 0.0276, 0.0263,
0.0248, 0.0208, 0.0199, 0.0249, 0.0274, 0.0235, 0.0121), Treatment = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Stressed", "Treated"
), class = "factor"), Finalsd = c(0.0039, 0.0019, 0.0026, 0.0033,
0.002, 0.0021, 0.0028, 0.0049, 0.0048, 0.0026), Dry = c(0.006,
0.008, 0.0107, 0.0139, 0.0138, 0.0174, 0.0047, 0.008, 0.0116,
0.0105), Drysd = c(0.0015, 0.0015, 0.0017, 0.0024, 0.0011, 0.0022,
0.001, 0.0016, 0.0033, 0.0021), Delta = c(0.0231, 0.0196, 0.0155,
0.0109, 0.007, 0.0025, 0.0201, 0.0194, 0.012, 0.0016), Deltasd = c(0.0034,
0.0015, 0.0019, 0.002, 0.0024, 0.001, 0.0025, 0.0043, 0.0035,
0.0013), WC = c(4.07, 2.54, 1.48, 0.81, 0.52, 0.15, 4.44, 2.48,
1.11, 0.16), WCsd = c(1.22, 0.59, 0.26, 0.21, 0.2, 0.08, 1.06,
0.56, 0.45, 0.12), CD = c(1, 1.33, 1.78, 2.31, 2.29, 2.89, 0.78,
1.33, 1.92, 1.75), CDsd = c(0.24, 0.25, 0.28, 0.4, 0.19, 0.37,
0.16, 0.26, 0.54, 0.35)), class = "data.frame", row.names = c(NA,
-10L))
Any help would be greatly appreciated.
Your x variable is a factor, meaning it is a categorical variable, so it's not clear how to fit a regression line through that:
str(sl)
'data.frame': 10 obs. of 14 variables:
$ Stresslevel: Factor w/ 7 levels "0","1","2 (30%)",..: 1 2 3 4 5 6 7 3 4 5
$ WL : num 0 15.5 32.8 52.9 69.8 89.2 13.5 30 50 70
I am not sure it makes sense to convert your categories to numeric (stress level 0 becomes 1, stress level 1 becomes 2, and so on) and force a line through them:
ggplot(sl, aes(x = Stresslevel, y = Final, color = Treatment)) +
geom_point() +
geom_smooth(aes(x = as.numeric(Stresslevel)), method = "lm", se = FALSE)
It might make more sense to connect the points with lines, if looking at the progression of your dependent variable from stress level 0 to 5 is meaningful:
ggplot(sl, aes(x = Stresslevel, y = Final, color = Treatment)) +
geom_point() +
geom_line(aes(x = as.numeric(Stresslevel)), linetype = "dashed")
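As a quick sanity check (my addition, not part of the original answer), you can fit the linear model directly and inspect it; a full-interaction lm gives the same per-group lines geom_smooth(method = "lm") would draw:
# separate slope and intercept per treatment group, with the factor
# treated as an equally spaced numeric scale
fit <- lm(Final ~ as.numeric(Stresslevel) * Treatment, data = sl)
summary(fit)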

Canonical Correlation in R with different matrix dimensions

I'm having difficulty doing a CC analysis in R.
The assignment I'm doing is from "Applied Multivariate Analysis" by Sharma, exercise 13.7, if you're familiar with it.
Basically, I'm asked to conduct a CCA on a set of variables. There are seven X variables but only five Y variables, so R complains that the dimensions are not compatible. The data, called CETNEW, is reproduced below.
Edited (changed from an image to dput):
structure(list(
`...1` = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "Y1", "Y2", "Y3", "Y4", "Y5"),
`...2` = c(2.72, 1.2, 0.82, 0.92, 1.19, 1, 1.45, 0.68, 0.98, 0.57, 1.07, 0.91),
`...3` = c(1.2, 3.78, 0.7, 1.04, 1.06, 1.32, 1.31, 0.56, 1, 0.79, 1.13, 1.38),
`...4` = c(0.82, 0.7, 1.7, 0.59, 0.83, 1.08, 1.01, 0.65, 0.78, 0.66, 0.93, 0.77),
`...5` = c(0.92, 1.04, 0.59, 3.09, 1.06, 0.93, 1.47, 0.62, 1.26, 0.51, 0.94, 0.85),
`...6` = c(1.19, 1.06, 0.83, 1.06, 2.94, 1.36, 1.66, 0.68, 1.16, 0.77, 1.37, 1.11),
`...7` = c(1, 1.32, 1.08, 0.93, 1.36, 2.94, 1.56, 0.9, 1.23, 0.78, 1.65, 1.31),
`...8` = c(1.45, 1.31, 1.01, 1.47, 1.66, 1.56, 3.11, 1.03, 1.7, 0.81, 1.63, 1.44),
`...9` = c(0.68, 0.56, 0.65, 0.62, 0.68, 0.9, 1.03, 1.71, 0.99, 0.65, 0.86, 0.72),
`...10` = c(0.98, 1, 0.78, 1.26, 1.16, 1.23, 1.7, 0.99, 3.07, 0.61, 1.43, 1.28),
`...11` = c(0.57, 0.79, 0.66, 0.51, 0.77, 0.78, 0.81, 0.65, 0.61, 2.83, 1.04, 0.84),
`...12` = c(1.07, 1.13, 0.93, 0.94, 1.37, 1.65, 1.63, 0.86, 1.43, 1.04, 2.83, 1.6),
`...13` = c(0.91, 1.38, 0.77, 0.85, 1.11, 1.31, 1.44, 0.72, 1.28, 0.84, 1.6, 4.01)
), row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame"))
What I've Done so Far
CETNEW <- CETNEW[, -1]  # to remove the non-numeric values
Create two variables (criterion and predictor variables) as:
CETNEWx <- CETNEW[1:7,]
CETNEWy <- CETNEW[8:12,]
Then I've been using various packages such as CCA, CCP and candisc. From CCA:
ccCETNEW <- cc(CETNEWx,CETNEWy)
Yields the following error message:
Error in cov(X, Y, use = "pairwise") : incompatible dimensions
The matcor function, also from CCA, yields the following error message:
Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 7, 5
Thus, it would seem that it all boils down to the dimension mismatch. I've talked to my professor about it, but since he uses SAS, which apparently can handle this case, he could not help me.
Please, if you're familiar with canonical correlation and have had a similar problem before, any help regarding this topic is highly appreciated.
If you look at your data, notice the first column is divided into X and Y labels. That suggests to me that your data are transposed. If so, each column is an observation and the X and Y labels indicate various measurements taken on each observation. Canonical correlations are performed on two groups of measurements/variables from a single set of observations. First, here is the transposed data:
CETNEW.T <- structure(list(X1 = c(2.72, 1.2, 0.82, 0.92, 1.19, 1, 1.45, 0.68,
0.98, 0.57, 1.07, 0.91), X2 = c(1.2, 3.78, 0.7, 1.04, 1.06, 1.32,
1.31, 0.56, 1, 0.79, 1.13, 1.38), X3 = c(0.82, 0.7, 1.7, 0.59,
0.83, 1.08, 1.01, 0.65, 0.78, 0.66, 0.93, 0.77), X4 = c(0.92,
1.04, 0.59, 3.09, 1.06, 0.93, 1.47, 0.62, 1.26, 0.51, 0.94, 0.85
), X5 = c(1.19, 1.06, 0.83, 1.06, 2.94, 1.36, 1.66, 0.68, 1.16,
0.77, 1.37, 1.11), X6 = c(1, 1.32, 1.08, 0.93, 1.36, 2.94, 1.56,
0.9, 1.23, 0.78, 1.65, 1.31), X7 = c(1.45, 1.31, 1.01, 1.47,
1.66, 1.56, 3.11, 1.03, 1.7, 0.81, 1.63, 1.44), Y1 = c(0.68,
0.56, 0.65, 0.62, 0.68, 0.9, 1.03, 1.71, 0.99, 0.65, 0.86, 0.72
), Y2 = c(0.98, 1, 0.78, 1.26, 1.16, 1.23, 1.7, 0.99, 3.07, 0.61,
1.43, 1.28), Y3 = c(0.57, 0.79, 0.66, 0.51, 0.77, 0.78, 0.81,
0.65, 0.61, 2.83, 1.04, 0.84), Y4 = c(1.07, 1.13, 0.93, 0.94,
1.37, 1.65, 1.63, 0.86, 1.43, 1.04, 2.83, 1.6), Y5 = c(0.91,
1.38, 0.77, 0.85, 1.11, 1.31, 1.44, 0.72, 1.28, 0.84, 1.6, 4.01
)), class = "data.frame", row.names = c(NA, -12L))
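Equivalently (a sketch of mine; the answer above pasted the transposed dput directly), the same object can be derived from the original CETNEW tibble, before its label column is dropped:
# transpose the numeric part and use the first column (the X/Y labels)
# as the new column names
labels <- CETNEW[[1]]
CETNEW.T <- as.data.frame(t(CETNEW[, -1]))
names(CETNEW.T) <- labels
rownames(CETNEW.T) <- NULL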
Now the analysis runs fine:
library("CCA")
str(CETNEW.T)
# 'data.frame': 12 obs. of 12 variables:
# $ X1: num 2.72 1.2 0.82 0.92 1.19 1 1.45 0.68 0.98 0.57 ...
# $ X2: num 1.2 3.78 0.7 1.04 1.06 1.32 1.31 0.56 1 0.79 ...
# $ X3: num 0.82 0.7 1.7 0.59 0.83 1.08 1.01 0.65 0.78 0.66 ...
# $ X4: num 0.92 1.04 0.59 3.09 1.06 0.93 1.47 0.62 1.26 0.51 ...
# $ X5: num 1.19 1.06 0.83 1.06 2.94 1.36 1.66 0.68 1.16 0.77 ...
# $ X6: num 1 1.32 1.08 0.93 1.36 2.94 1.56 0.9 1.23 0.78 ...
# $ X7: num 1.45 1.31 1.01 1.47 1.66 1.56 3.11 1.03 1.7 0.81 ...
# $ Y1: num 0.68 0.56 0.65 0.62 0.68 0.9 1.03 1.71 0.99 0.65 ...
# $ Y2: num 0.98 1 0.78 1.26 1.16 1.23 1.7 0.99 3.07 0.61 ...
# $ Y3: num 0.57 0.79 0.66 0.51 0.77 0.78 0.81 0.65 0.61 2.83 ...
# $ Y4: num 1.07 1.13 0.93 0.94 1.37 1.65 1.63 0.86 1.43 1.04 ...
# $ Y5: num 0.91 1.38 0.77 0.85 1.11 1.31 1.44 0.72 1.28 0.84 ...
X <- CETNEW.T[, 1:7]
Y <- CETNEW.T[, 8:12]
ccCETNEW <- cc(X, Y)
ccCETNEW is a list with five components containing the results.
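For reference (assuming the CCA package's documented return value), those components can be inspected as:
ccCETNEW$cor     # the canonical correlations
ccCETNEW$xcoef   # canonical coefficients for the X variables
ccCETNEW$ycoef   # canonical coefficients for the Y variables
ccCETNEW$scores  # canonical variate scores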

How to subset columns based on value in a different column?

EDITED:
I have a dataframe that stores information about when a particular assessment happened ('when'). This assessment happened at different times (t1-t3), which vary by participant.
The dataframe also contains all the assessments ever completed by every participant (including the one referenced in the 'when' column). I only want the assessment information represented in the 'when' column. So if the number is 1, I want to keep all the data related to that assessment and remove all the data that was not collected at that assessment. Please note that I have many more variables in my actual data set than are represented in this shortened data set so any solution should not rely on repeating variable names.
Here's the best I can do. The problem with this solution is that it would have to be repeated for every variable name.
df2 <- mutate(.data = df,
a1G_when = if_else(when == 1, a1G_t1, NA_real_))
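Spelled out for one variable stem with dplyr::case_when (my expansion of the attempt above, to show why it does not scale), the complete version would be:
library(dplyr)
# one case_when per variable stem; this would have to be repeated
# for every repeating column in the real data set
df2 <- mutate(df, a1G_when = case_when(
  when == 1 ~ a1G_t1,
  when == 2 ~ a1G_t2,
  when == 3 ~ a1G_t3
))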
# here is what we start with
df <- structure(list(id = 1:10, when = c(1, 3, 2, 1, 2, 1, 3, 2, 3,
1), a1G_t1 = c(0.78, 0.21, 0.04, 0.87, 0.08, 0.25, 0.9, 0.77,
0.51, 0.5), Stqo_t1 = c(0.68, 0.77, 0.09, 0.66, 0.94, 0.05, 0.97,
0.92, 1, 0.04), Twcdz_t1 = c(0.95, 0.41, 0.29, 0.54, 0.06, 0.45,
0.6, 0.24, 0.17, 0.55), Kgh_t1 = c(0.25, 0.86, 0.37, 0.34, 0.97,
0.75, 0.73, 0.68, 0.37, 0.66), `2xWX_t1` = c(0.47, 0.52, 0.23,
0.5, 0.88, 0.71, 0.21, 0.98, 0.76, 0.21), `2IYnS_t1` = c(0.32,
0.75, 0.03, 0.46, 0.89, 0.71, 0.51, 0.83, 0.34, 0.32), a1G_t2 = c(0.97,
0.01, 0.58, 0.33, 0.58, 0.37, 0.76, 0.33, 0.39, 0.56), Stqo_t2 = c(0.78,
0.42, 0.5, 0.69, 0.09, 0.72, 0.84, 0.94, 0.46, 0.83), Twcdz_t2 = c(0.62,
0.34, 0.72, 0.62, 0.8, 0.26, 0.3, 0.88, 0.42, 0.53), Kgh_t2 = c(0.99,
0.66, 0.02, 0.17, 0.51, 0.03, 0.03, 0.74, 0.1, 0.26), `2xWX_t2` = c(0.68,
0.97, 0.56, 0.27, 0.66, 0.71, 0.96, 0.24, 0.37, 0.76), `2IYnS_t2` = c(0.24,
0.88, 0.58, 0.31, 0.8, 0.92, 0.91, 0.9, 0.55, 0.52), a1G_t3 = c(0.73,
0.6, 0.66, 0.06, 0.33, 0.34, 0.09, 0.44, 0.73, 0.56), Stqo_t3 = c(0.28,
0.88, 0.56, 0.75, 0.85, 0.33, 0.88, 0.4, 0.63, 0.61), Twcdz_t3 = c(0.79,
0.95, 0.41, 0.07, 0.99, 0.06, 0.74, 0.17, 0.89, 0.4), Kgh_t3 = c(0.06,
0.52, 0.35, 0.91, 0.43, 0.74, 0.72, 0.96, 0.39, 0.4), `2xWX_t3` = c(0.25,
0.09, 0.64, 0.32, 0.15, 0.14, 0.18, 0.33, 0.97, 0.6), `2IYnS_t3` = c(0.92,
0.49, 0.09, 0.95, 0.3, 0.83, 0.82, 0.56, 0.29, 0.36)), row.names = c(NA,
-10L), class = "data.frame")
# here is an example of what I want with the first column. I would also want all other repeating columns to look like this (Stqo_when, Twcdz_when, etc.)
id when a1G_when
1 1 1 0.78
2 2 3 0.88
3 3 2 0.58
4 4 1 0.87
5 5 2 0.58
6 6 1 0.25
7 7 3 0.09
8 8 2 0.33
9 9 3 0.73
10 10 1 0.50
Using data.table, you could do something like:
library(data.table)
# build the new column names: one "<stem>_when" per repeating variable
cols <- unique(paste0(gsub("_.*", "", setdiff(names(df), c("id", "when"))), "_when"))
setDT(df)[
  # 1. fill each new column with the name of the column to pull from (e.g. "a1G_t2")
  , (cols) := lapply(cols, function(x) paste0(gsub("_.*", "", x), "_t", when))][
  # 2. grouping by those names makes get(x) a single column name per group;
  #    replace each name with the values of the column it points to
  , (cols) := lapply(cols, function(x) as.character(.SD[[get(x)]])), by = cols][
  # 3. the lookup went through character, so cast back to numeric
  , (cols) := lapply(.SD, as.numeric), .SDcols = cols
]
Output (only the relevant _when columns shown):
a1G_when Stqo_when Twcdz_when Kgh_when 2xWX_when 2IYnS_when
1: 0.78 0.68 0.95 0.25 0.47 0.32
2: 0.60 0.88 0.95 0.52 0.09 0.49
3: 0.58 0.50 0.72 0.02 0.56 0.58
4: 0.87 0.66 0.54 0.34 0.50 0.46
5: 0.58 0.09 0.80 0.51 0.66 0.80
6: 0.25 0.05 0.45 0.75 0.71 0.71
7: 0.09 0.88 0.74 0.72 0.18 0.82
8: 0.33 0.94 0.88 0.74 0.24 0.90
9: 0.73 0.63 0.89 0.39 0.97 0.29
10: 0.50 0.04 0.55 0.66 0.21 0.32
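A base-R alternative (a sketch of mine, not from the thread) produces the same _when columns with matrix indexing, again without typing each variable name:
# for each variable stem, stack its _t1.._t3 columns into a matrix and
# pick, per row, the column given by `when`
df0 <- as.data.frame(df)  # work on a plain data.frame copy
stems <- unique(sub("_t\\d+$", "", setdiff(names(df0), c("id", "when"))))
for (s in stems) {
  wide <- as.matrix(df0[paste0(s, "_t", 1:3)])
  df0[[paste0(s, "_when")]] <- wide[cbind(seq_len(nrow(df0)), df0$when)]
}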
Here is an opportunity to use the new tidyr::pivot_longer. We can use this to reshape the data so that var and t are in their own columns, filter to just the rows with the data we want (i.e. where t equals when) and then pivot the data back out to wide.
library(tidyverse)
df1 <- structure(list(ID = c(101, 102, 103, 104, 105), when = c(1, 2, 3, 1, 2), var1_t1 = c(5, 6, 4, 5, 6), var2_t1 = c(2, 3, 4, 2, 3), var1_t2 = c(7, 8, 9, 7, 8), var2_t2 = c(5, 4, 5, 4, 5), var1_t3 = c(3, 4, 3, 4, 3), var2_t3 = c(6, 7, 6, 7, 6)), row.names = c(NA, 5L), class = "data.frame")
df1 %>%
  pivot_longer(
    cols = starts_with("var"),
    names_to = c("var", "t"),
    names_sep = "_t",
    values_to = "val",
    names_transform = list(t = as.numeric)  # parse the t suffix so it compares with `when`
  ) %>%
  filter(when == t) %>%
  select(-t) %>%
  pivot_wider(names_from = "var", values_from = "val")
#> # A tibble: 5 x 4
#> ID when var1 var2
#> <dbl> <dbl> <dbl> <dbl>
#> 1 101 1 5 2
#> 2 102 2 8 4
#> 3 103 3 3 6
#> 4 104 1 5 2
#> 5 105 2 8 5
Created on 2019-07-16 by the reprex package (v0.3.0)

How to get count of pairs giving rise to correlation values using the cor() function

I like the cor() function, but I would like to know how to get a count of the number of pairs in a sparse matrix that give rise to each correlation value.
Many thanks.
Here's a very simplified version of the kind of data table I have:
[image: example data table]
I've also added a cut down version of the actual file I'm working on here.
What I'd like is a way to get something like the following matrix (below is from the picture rather than the file in the hyperlink):
[image: example desired output]
This shows how many pairs of values columnA and columnB have in common, so I can see which columns are worth comparing in a correlation. Does that make sense?
dput(mat)
structure(list(A = c(9.4, 9.4, 4.7, 1.2, NA, 0.6, 7.712, 0.2,
NA, NA, 3.13, NA, 1.56, 6.25, NA, NA, 0.9471, NA, 1.56, 1.2,
0.78, NA, NA, NA, NA, NA, NA), B = c(4.7, 12.5, 2.3, 2.3, 9.4,
0.78, 9.45, 0.6, NA, NA, 3.13, NA, 2.3, 6.25, NA, NA, 10.72,
NA, 2.3, 12.5, 6.25, NA, NA, NA, NA, NA, NA), C = c(4.7, 9.4,
4.7, 0.6, NA, 0.6, 10.84, 0.2, 3.67, 2.345, 3.13, 3.288, 1.56,
9.4, 11.21, 0.6, 2.256, 50, 1.56, 3.13, 0.78, 18.7, 0.66, 1.2,
6.26, 6.258, 50)), .Names = c("A", "B", "C"), class = "data.frame", row.names = c(NA,
-27L))
library(reshape2)  # for dcast()

outdf <- c()
for (x in colnames(mat)) {
  for (y in colnames(mat)) {
    pair <- mat[, c(x, y)]
    # count the rows where both columns are non-NA
    number_complete <- nrow(pair[complete.cases(pair), ])
    outdf <- rbind(outdf, c(x, y, number_complete))
  }
}
outdf <- data.frame(outdf)
dcast(outdf, X1 ~ X2)
# X1 A B C
# 1 A 14 14 14
# 2 B 14 15 14
# 3 C 14 14 26
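A vectorized alternative (my addition): !is.na(mat) is a 0/1 indicator matrix, so crossprod() counts, for every pair of columns, the rows where both values are present; these are the same pairs cor(mat, use = "pairwise.complete.obs") uses.
# pairwise complete-case counts in one call
crossprod(!is.na(mat))
#    A  B  C
# A 14 14 14
# B 14 15 14
# C 14 14 26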

How to compute/plot efficient frontiers per time period in one graph in R?

Currently we compute and sort stock data (X1 to X10). Historical data is stored in Excel and R for the time periods 1950-1980, 1980-1999 and 1950-1999.
The dataset:
date X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 1950-01-01 5.92 6.35 4.61 4.08 5.47 3.90 2.35 1.49 2.27 0.82
2 1950-02-01 2.43 2.16 2.10 1.58 -0.05 1.14 1.51 1.52 2.02 1.12
3 1950-03-01 -0.81 0.21 -1.67 -0.02 -0.79 0.18 -0.22 1.03 0.12 1.75
4 1950-04-01 5.68 6.45 5.41 5.94 6.10 5.87 3.82 3.34 3.44 3.97
5 1950-05-01 3.84 1.60 1.64 3.33 2.54 2.12 4.46 2.83 3.82 4.75
6 1950-06-01 -9.88 -10.56 -8.02 -7.86 -7.27 -7.44 -7.13 -7.76 -6.32 -5.04
7 1950-07-01 9.09 8.76 7.31 5.88 3.84 4.61 3.09 3.07 1.41 0.42
...
598 1999-10-01 -0.95 -1.88 -1.25 -0.52 1.65 0.72 5.41 4.38 5.58 6.59
599 1999-11-01 11.57 9.15 8.17 7.14 6.15 4.95 5.78 4.21 1.55 2.15
600 1999-12-01 12.32 14.97 9.29 11.77 11.09 5.89 11.88 11.26 6.23 5.64
The main question is: we would like to compute/plot efficient frontiers for these time periods, to see in one graph how the efficient frontier has evolved. Is there a way to do this in R?
The efficient frontier is the set of optimal portfolios that offers the highest expected return for a defined level of risk or the lowest risk for a given level of expected return.
In modern portfolio theory, the efficient frontier (or portfolio frontier) is an investment portfolio which occupies the 'efficient' parts of the risk-return spectrum. Formally, it is the set of portfolios which satisfy the condition that no other portfolio exists with a higher expected return but with the same standard deviation of return.
So, how would one go about computing this in R?
dput sample data (first 50 rows)
> dput(head(data,50))
structure(list(X__1 = structure(c(-631152000, -628473600, -626054400,
-623376000, -620784000, -618105600, -615513600, -612835200, -610156800,
-607564800, -604886400, -602294400, -599616000, -596937600, -594518400,
-591840000, -589248000, -586569600, -583977600, -581299200, -578620800,
-576028800, -573350400, -570758400, -568080000, -565401600, -562896000,
-560217600, -557625600, -554947200, -552355200, -549676800, -546998400,
-544406400, -541728000, -539136000, -536457600, -533779200, -531360000,
-528681600, -526089600, -523411200, -520819200, -518140800, -515462400,
-512870400, -510192000, -507600000, -504921600, -502243200), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), X__2 = c(5.92, 2.43, -0.81, 5.68,
3.84, -9.88, 9.09, 4.93, 3.99, -0.5, 3.09, 15.77, 8.22, 0.36,
-7.36, 3.84, -2.81, -7.12, 3.57, 6.59, 1.04, -1.41, -1.42, -0.53,
1.86, -3.25, 0.68, -4.4, 0.57, 2.5, -0.36, -0.74, -1.11, -0.58,
3.22, 0.33, 5.01, 2.75, -1.25, -2.13, 1.3, -4.42, 0.25, -5.56,
-4.09, 2.71, 2.01, -3.15, 8.48, -0.16), X__3 = c(6.35, 2.16,
0.21, 6.45, 1.6, -10.56, 8.76, 4.63, 3.52, -1.2, 3.36, 10.98,
8.41, 0.81, -4.01, 3.56, -4.27, -6.11, 4.7, 5.3, 2.73, -3.07,
-0.13, 0.6, 1.1, -2.77, 2.37, -4.5, 1.87, 3.18, 1.51, 0.43, -1.91,
-1.52, 4.91, 1.43, 3.4, 3.03, -2.25, -2, 0.34, -4.75, 2.24, -6.53,
-1.87, 1.97, 1.78, -2.96, 7.38, 0.43), X__4 = c(4.61, 2.1, -1.67,
5.41, 1.64, -8.02, 7.31, 4.56, 5.18, -0.46, 3.52, 10.78, 8.46,
0.28, -4.88, 4.26, -3.25, -6.76, 6.78, 4.99, 3.86, -2.57, 0.59,
0.16, 1.75, -2.04, 2.49, -5.29, 1.76, 2.88, 0.76, 0.67, -1.67,
-1.45, 5.69, 2.95, 3.66, 1.15, -1.58, -2.34, 0.51, -3.82, 0.72,
-6.25, -2.33, 3.1, 2.19, -2.63, 7.3, 1.82), X__5 = c(4.08, 1.58,
-0.02, 5.94, 3.33, -7.86, 5.88, 4.68, 5.99, 0.75, 2.68, 9.29,
8, 1.08, -3.13, 4.21, -3.35, -5.01, 5.77, 4.85, 2.73, -3.44,
0.27, 1.56, 1.62, -2.35, 2.93, -4.62, 2.36, 2.56, 0.86, 0.16,
-1.8, -2.04, 5.12, 2.72, 3.21, 1.21, -2.17, -1.84, 0.32, -3.63,
1.47, -5.16, -0.65, 3.33, 1.34, -1.36, 6.24, 1.19), X__6 = c(5.47,
-0.05, -0.79, 6.1, 2.54, -7.27, 3.84, 6.29, 4.46, -0.24, 2.42,
6.12, 8.63, 0.88, -3.31, 4.56, -2.14, -5.62, 5.73, 5.36, 2.44,
-1.88, 0.83, 0.65, 1.47, -1.81, 2.31, -4.48, 2.56, 2.69, 0.9,
0.34, -0.62, -1.58, 6.59, 0.86, 3.58, 1.92, -1.85, -2.79, 0.7,
-3.4, 1.26, -5.26, -1.18, 4.26, 1.35, -0.97, 6.66, 1.77), X__7 = c(3.9,
1.14, 0.18, 5.87, 2.12, -7.44, 4.61, 4.57, 6.14, -0.84, 4.22,
8.37, 7.44, 0.69, -4.26, 4.13, -2.24, -6.75, 5.81, 4.35, 1.98,
-2.87, 0.93, 0.61, 1.27, -2.18, 2.97, -4.09, 2.27, 2.96, 1.16,
-0.38, -2.37, -0.71, 5.53, 2.45, 1.3, 0.31, -0.47, -2.03, 0.14,
-3.26, 1.79, -5.5, -1.47, 4.18, 1.96, -1.35, 7.06, 1.69), X__8 = c(2.35,
1.51, -0.22, 3.82, 4.46, -7.13, 3.09, 5.01, 5.84, -1.05, 3.81,
7.54, 6.46, 0.71, -3.56, 4.42, -1.87, -4.52, 7.3, 3.66, 2.11,
-2.92, 2.25, 2.17, 1.32, -1.71, 3.17, -4.63, 2.59, 3.89, 0.49,
0.21, -1.71, -1.18, 4.95, 3.21, 1.41, 0.89, -1.02, -2.89, 0.59,
-2.67, 1.47, -4.62, -0.69, 4.07, 2.83, -1.44, 6.11, 1.58), X__9 = c(1.49,
1.52, 1.03, 3.34, 2.83, -7.76, 3.07, 3.72, 6.21, -1.66, 3.46,
6.14, 7.17, 2.13, -3.19, 4.59, -2.65, -3.5, 7.43, 3.5, 2.41,
-2.73, 1.35, 1.97, 1.72, -1.8, 4.06, -5.35, 2.57, 3.14, 1.89,
-0.86, -1.73, -0.95, 6.07, 1.73, 1.09, 0.37, -1.34, -2.48, 0.31,
-3.2, 1.34, -4.99, -0.18, 4.35, 3.03, 0.09, 5.65, 2.39), X__10 = c(2.27,
2.02, 0.12, 3.44, 3.82, -6.32, 1.41, 4.54, 5.55, -0.97, 3.8,
5.69, 5.65, 1.78, -2.6, 4.21, -1.29, -2.63, 7.15, 3.52, 1.85,
-2.32, 0.96, 2.74, 1.9, -2.6, 3.83, -4.31, 3.15, 2.76, 0.93,
-0.39, -1.86, -1.57, 7.05, 2.36, -0.33, -0.23, -0.54, -2.6, 0.61,
-2.37, 2.12, -3.76, 0.47, 3.98, 3.03, 0.2, 5.63, 1.26), X__11 = c(0.82,
1.12, 1.75, 3.97, 4.75, -5.04, 0.42, 4.96, 4.32, 0.25, 2.26,
4.71, 5.05, 1.63, -1.53, 5.12, -2.59, -1.92, 6.89, 4.48, -0.09,
-2.49, 0.26, 4.03, 1.37, -2.82, 4.95, -5.1, 3.4, 4.29, 0.89,
-1.06, -2.18, -0.31, 5.76, 3.32, -1.04, -0.63, -1.78, -2.97,
0.55, -1.3, 2.75, -4.47, 0.48, 4.83, 2.85, 0.27, 4.4, 1.93)), .Names = c("date",
"X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8",
"X9", "X10"), row.names = c(NA, 50L), class = c("tbl_df",
"tbl", "data.frame"))
After some correspondence via the comments with @Jonathan, I widened the example data from 3 columns to 12 columns with some sampling. The code in the "With short-selling" section of the blog scales well to 10K observations:
# using code at:
# https://www.r-bloggers.com/a-gentle-introduction-to-finance-using-r-efficient-frontier-and-capm-part-1/
# https://datashenanigan.wordpress.com/2016/05/24/a-gentle-introduction-to-finance-using-r-efficient-frontier-and-capm-part-1/
library(data.table)
calcEFParams <- function(rets) {
  retbar <- colMeans(rets, na.rm = TRUE)  # mean return per asset
  covs <- var(rets, na.rm = TRUE)         # covariance matrix of the returns
  invS <- solve(covs)                     # inverse of the covariance matrix
  i <- matrix(1, nrow = length(retbar))   # vector of ones

  # scalar parameters of the frontier: sigma^2(mu) = (alpha*mu^2 - 2*beta*mu + gamma) / delta
  alpha <- t(i) %*% invS %*% i
  beta <- t(i) %*% invS %*% retbar
  gamma <- t(retbar) %*% invS %*% retbar
  delta <- alpha * gamma - beta * beta

  list(alpha = as.numeric(alpha), beta = as.numeric(beta),
       gamma = as.numeric(gamma), delta = as.numeric(delta))
}
# load data
link <- "https://raw.githubusercontent.com/DavZim/Efficient_Frontier/master/data/mult_assets.csv"
df <- data.table(read.csv(link))

# widen from 3 to 12 columns by cbinding shuffled copies of the returns
df2 <- df[, lapply(.SD, sample), ]
df3 <- cbind(df, df2)
df4 <- df3[, lapply(.SD, sample), ]
df5 <- cbind(df3, df4)
Loading the microbenchmark package, the performance is as follows:
> library(microbenchmark)
> microbenchmark(calcEFParams(df5), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
calcEFParams(df5) 2.692514 2.764053 2.795127 2.777547 2.805447 3.024349 10
It seems that David Zimmermann's code is scalable and efficient enough!
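To actually draw the frontiers per period (a sketch of mine; the thread stops at benchmarking, and the period boundaries and the use of the asker's `data` tibble are assumptions), the alpha/beta/gamma/delta parameters give the frontier sigma^2(mu) = (alpha*mu^2 - 2*beta*mu + gamma)/delta directly:
library(ggplot2)

# one frontier curve per period, evaluated over a grid of target returns mu
efCurve <- function(rets, mu = seq(0, 2, length.out = 100)) {
  p <- calcEFParams(rets)
  data.frame(mu = mu,
             sigma = sqrt((p$alpha * mu^2 - 2 * p$beta * mu + p$gamma) / p$delta))
}

cutoff <- as.POSIXct("1980-01-01", tz = "UTC")
periods <- list("1950-1980" = data$date < cutoff,
                "1980-1999" = data$date >= cutoff,
                "1950-1999" = rep(TRUE, nrow(data)))

curves <- do.call(rbind, lapply(names(periods), function(nm)
  cbind(efCurve(data[periods[[nm]], -1]), period = nm)))

ggplot(curves, aes(x = sigma, y = mu, color = period)) +
  geom_line() +
  labs(x = "risk (standard deviation)", y = "expected return", color = "period")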
