I have a dataframe where some column headers are character names and others are dates. I want to select all the columns that don't have dates as headers, plus all the columns whose header dates are earlier than the current/system date (Sys.Date()). How can I do this using a select statement in dplyr?
Below is the dataframe:
> dput(job_times[1:5,])
structure(list(Skill = c("KAC", "KAC", "KAC", "KAC", "KAC"),
Patch = c("A1", "A2", "A3", "A4", "A5"), `Work Code` = c("W01",
"W01", "W01", "W01", "W01"), Product = c("KAC Repair", "KAC Repair",
"KAC Repair", "KAC Repair", "KAC Repair"), `Visit Time` = c(45.68,
42.55, 46.45, 51.86, 43.49), Travel = c(32.5, 21.66, 26.33,
28.63, 27.03), Success = c(0.69, 0.66, 0.67, 0.65, 0.67),
`Completion Time` = c(1.9, 1.61, 1.8, 2.05, 1.74), `28-12-2020` = c(1.9,
1.61, 1.8, 2.05, 1.74), `04-01-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `11-01-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`18-01-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `25-01-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `01-02-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `08-02-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`15-02-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `22-02-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `01-03-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `08-03-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`15-03-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `22-03-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `29-03-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `05-04-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`12-04-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `19-04-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `26-04-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `03-05-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`10-05-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `17-05-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `24-05-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `31-05-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`07-06-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `14-06-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `21-06-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `28-06-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`05-07-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `12-07-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `19-07-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `26-07-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`02-08-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `09-08-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `16-08-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `23-08-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`30-08-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `06-09-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `13-09-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `20-09-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`27-09-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `04-10-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `11-10-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `18-10-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`25-10-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `01-11-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `08-11-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `15-11-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`22-11-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `29-11-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `06-12-2021` = c(1.9, 1.61, 1.8,
2.05, 1.74), `13-12-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`20-12-2021` = c(1.9, 1.61, 1.8, 2.05, 1.74), `27-12-2021` = c(1.9,
1.61, 1.8, 2.05, 1.74), `03-01-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `10-01-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`17-01-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `24-01-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `31-01-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `07-02-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`14-02-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `21-02-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `28-02-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `07-03-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`14-03-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `21-03-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `28-03-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `04-04-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`11-04-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `18-04-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `25-04-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `02-05-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`09-05-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `16-05-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `23-05-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `30-05-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`06-06-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `13-06-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `20-06-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `27-06-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`04-07-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `11-07-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `18-07-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `25-07-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`01-08-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `08-08-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `15-08-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `22-08-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`29-08-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `05-09-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `12-09-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `19-09-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`26-09-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `03-10-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `10-10-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `17-10-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`24-10-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `31-10-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `07-11-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `14-11-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`21-11-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `28-11-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74), `05-12-2022` = c(1.9, 1.61, 1.8,
2.05, 1.74), `12-12-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74),
`19-12-2022` = c(1.9, 1.61, 1.8, 2.05, 1.74), `26-12-2022` = c(1.9,
1.61, 1.8, 2.05, 1.74)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I want the Skill, Patch, Work Code, Product, Visit Time, Travel, Success and Completion Time columns, along with all the columns whose header dates are less than or equal to Sys.Date(), using dplyr and select.
This is how I would solve it:
cols <- grep('\\d{2}-\\d{2}-\\d{2}', names(job_times), value = TRUE)
result <- job_times[, c(setdiff(names(job_times), cols),
                        cols[Sys.Date() > as.Date(cols, '%d-%m-%Y')])]
You can integrate this in a dplyr pipe as:
library(dplyr)

job_times %>%
  select({
    cols <- grep('\\d{2}-\\d{2}-\\d{2}', names(.), value = TRUE)
    c(setdiff(names(.), cols),
      cols[Sys.Date() > as.Date(cols, '%d-%m-%Y')])
  })
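Because the date headers follow a fixed dd-mm-yyyy pattern, a tidyselect-only variant is also possible. This is only a sketch, not part of the answer above: the !any_of()/all_of() combination and the <= comparison (keeping headers on or before today, as the question asks) are my own choices.
library(dplyr)

date_cols <- grep('\\d{2}-\\d{2}-\\d{4}', names(job_times), value = TRUE)
past_cols <- date_cols[as.Date(date_cols, '%d-%m-%Y') <= Sys.Date()]

job_times %>%
  select(!any_of(date_cols), all_of(past_cols))  # non-date columns first, then past dates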
I would suggest creating a helper function and then you can use select like this:
library(tidyverse)
library(lubridate)
is_before_today <- function(x) {
  # dmy() returns NA for names that are not dates; coalesce() turns those NAs into FALSE
  (dmy(x, quiet = TRUE) < Sys.Date()) %>% coalesce(FALSE)
}

df %>%
  select(
    matches("^\\D"), all_of(colnames(.) %>% keep(is_before_today))
  )
#> # A tibble: 5 x 38
#> Skill Patch `Work Code` Product `Visit Time` Travel Success `Completion Tim~
#> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 KAC A1 W01 KAC Repa~ 45.7 32.5 0.69 1.9
#> 2 KAC A2 W01 KAC Repa~ 42.6 21.7 0.66 1.61
#> 3 KAC A3 W01 KAC Repa~ 46.4 26.3 0.67 1.8
#> 4 KAC A4 W01 KAC Repa~ 51.9 28.6 0.65 2.05
#> 5 KAC A5 W01 KAC Repa~ 43.5 27.0 0.67 1.74
#> # ... with 30 more variables: 28-12-2020 <dbl>, 04-01-2021 <dbl>,
#> # 11-01-2021 <dbl>, 18-01-2021 <dbl>, 25-01-2021 <dbl>, 01-02-2021 <dbl>,
#> # 08-02-2021 <dbl>, 15-02-2021 <dbl>, 22-02-2021 <dbl>, 01-03-2021 <dbl>,
#> # 08-03-2021 <dbl>, 15-03-2021 <dbl>, 22-03-2021 <dbl>, 29-03-2021 <dbl>,
#> # 05-04-2021 <dbl>, 12-04-2021 <dbl>, 19-04-2021 <dbl>, 26-04-2021 <dbl>,
#> # 03-05-2021 <dbl>, 10-05-2021 <dbl>, 17-05-2021 <dbl>, 24-05-2021 <dbl>,
#> # 31-05-2021 <dbl>, 07-06-2021 <dbl>, 14-06-2021 <dbl>, 21-06-2021 <dbl>,
#> # 28-06-2021 <dbl>, 05-07-2021 <dbl>, 12-07-2021 <dbl>, 19-07-2021 <dbl>
Created on 2021-07-20 by the reprex package (v1.0.0)
Ronak Shah's answer is great. Here is how I would do it.
## Get the list of all column names
ColumnNames <- names(TestDF)
## Retain only those that don't contain dates
CharacterColumnNames <- ColumnNames[grepl("[[:alpha:]]", names(TestDF))]
## Get the list of all date column names
DateColumns <- setdiff(names(TestDF), CharacterColumnNames)
## Keep only the date columns that fall before the current date
RequiredDateColumns <- DateColumns[Sys.Date() > as.Date(DateColumns, '%d-%m-%Y')]
## Build the modified DF
ModifiedDF <- TestDF[, c(CharacterColumnNames, RequiredDateColumns)]
I have a dataframe with 33 variables and 1 dependent variable. I need to perform a two-way ANOVA to see their impacts.
Right now I have to type the variables manually:
two.way <- aov(`Yield t/ha` ~
TypeP*PreviousCulture *
T1may*T2may*T3may*T1june*
T2june*T3june*T1july*T2july*
T3july*T1aug*T2aug*T3aug*
T1sept*T2sept*T3sept*
P1may*P2may*P3may*P1june*
P2june*P3june*P1july*P2july*
P3july*P1aug*P2aug*P3aug*
P1sept*P2sept*P3sept,
data = KemData)
summary(two.way)
Is there another way to pass these variables to the aov() function?
A sample of data:
> dput(head(KemData, 6))
structure(list(TypeP = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Combined deep",
"Deep moldboard"), class = "factor"), PreviousCulture = structure(c(1L,
3L, 2L, 1L, 3L, 2L), .Label = c("Pure steam", "Sideral steam (melilot)",
"Sideral steam (rapeseed)"), class = "factor"), `Yield t/ha` = c(1.53,
1.33, 1.46, 0.5, 0.66, 0.58), T1may = c(9.55, 9.55, 9.55, 11.04,
11.04, 11.04), T2may = c(5.92, 5.92, 5.92, 6.89, 6.89, 6.89),
T3may = c(9.26, 9.26, 9.26, 7.61, 7.61, 7.61), T1june = c(11.43,
11.43, 11.43, 8.02, 8.02, 8.02), T2june = c(16.37, 16.37,
16.37, 18.28, 18.28, 18.28), T3june = c(15.89, 15.89, 15.89,
22.34, 22.34, 22.34), T1july = c(16.01, 16.01, 16.01, 21.1,
21.1, 21.1), T2july = c(20.02, 20.02, 20.02, 20.85, 20.85,
20.85), T3july = c(19.02, 19.02, 19.02, 18, 18, 18), T1aug = c(18.57,
18.57, 18.57, 17.32, 17.32, 17.32), T2aug = c(16.53, 16.53,
16.53, 20.82, 20.82, 20.82), T3aug = c(15.36, 15.36, 15.36,
13.64, 13.64, 13.64), T1sept = c(12.46, 12.46, 12.46, 10.45,
10.45, 10.45), T2sept = c(6.89, 6.89, 6.89, 7.33, 7.33, 7.33
), T3sept = c(6.64, 6.64, 6.64, 5.98, 5.98, 5.98), P1may = c(1.69,
1.69, 1.69, 0.06, 0.06, 0.06), P2may = c(2.44, 2.44, 2.44,
2.8, 2.8, 2.8), P3may = c(2.04, 2.04, 2.04, 3.94, 3.94, 3.94
), P1june = c(1, 1, 1, 2.23, 2.23, 2.23), P2june = c(1.73,
1.73, 1.73, 0.87, 0.87, 0.87), P3june = c(1.34, 1.34, 1.34,
0.31, 0.31, 0.31), P1july = c(5.65, 5.65, 5.65, 0.44, 0.44,
0.44), P2july = c(0.18, 0.18, 0.18, 2.18, 2.18, 2.18), P3july = c(6.7,
6.7, 6.7, 3.57, 3.57, 3.57), P1aug = c(3.38, 3.38, 3.38,
0.62, 0.62, 0.62), P2aug = c(7.65, 7.65, 7.65, 1.26, 1.26,
1.26), P3aug = c(2.73, 2.73, 2.73, 4.5, 4.5, 4.5), P1sept = c(0.31,
0.31, 0.31, 1.44, 1.44, 1.44), P2sept = c(2.94, 2.94, 2.94,
3.13, 3.13, 3.13), P3sept = c(1.65, 1.65, 1.65, 0.64, 0.64,
0.64)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
TypeP PreviousCulture `Yield t/ha` T1may T2may T3may T1june T2june T3june T1july T2july T3july T1aug T2aug T3aug T1sept T2sept T3sept P1may P2may P3may P1june P2june P3june
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Deep mo~ Pure steam 1.53 9.55 5.92 9.26 11.4 16.4 15.9 16.0 20.0 19.0 18.6 16.5 15.4 12.5 6.89 6.64 1.69 2.44 2.04 1 1.73 1.34
2 Deep mo~ Sideral steam (ra~ 1.33 9.55 5.92 9.26 11.4 16.4 15.9 16.0 20.0 19.0 18.6 16.5 15.4 12.5 6.89 6.64 1.69 2.44 2.04 1 1.73 1.34
3 Deep mo~ Sideral steam (me~ 1.46 9.55 5.92 9.26 11.4 16.4 15.9 16.0 20.0 19.0 18.6 16.5 15.4 12.5 6.89 6.64 1.69 2.44 2.04 1 1.73 1.34
4 Combine~ Pure steam 0.5 11.0 6.89 7.61 8.02 18.3 22.3 21.1 20.8 18 17.3 20.8 13.6 10.4 7.33 5.98 0.06 2.8 3.94 2.23 0.87 0.31
5 Combine~ Sideral steam (ra~ 0.66 11.0 6.89 7.61 8.02 18.3 22.3 21.1 20.8 18 17.3 20.8 13.6 10.4 7.33 5.98 0.06 2.8 3.94 2.23 0.87 0.31
6 Combine~ Sideral steam (me~ 0.58 11.0 6.89 7.61 8.02 18.3 22.3 21.1 20.8 18 17.3 20.8 13.6 10.4 7.33 5.98 0.06 2.8 3.94 2.23 0.87 0.31
Building the formula with paste() inside a loop:
Get the variable names, and exclude the dependent one:
var.names = colnames(KemData)
var.names = var.names[-which(var.names == "Yield t/ha")]
Now the loop:
formula = "`Yield t/ha` ~ "
for (i in var.names) {
  formula = paste0(formula, "`", i, "`", " * ")
}
formula = substr(formula, 1, nchar(formula) - 3)
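A shorter route is to build the right-hand side in one step with paste0(collapse = " * ") and convert the string with as.formula(). A sketch on the same KemData (the backticks are needed because the column name contains spaces and a slash):
var.names <- setdiff(colnames(KemData), "Yield t/ha")
f <- as.formula(paste0("`Yield t/ha` ~ ", paste0("`", var.names, "`", collapse = " * ")))
two.way <- aov(f, data = KemData)
summary(two.way)
Note that crossing this many predictors with * expands to every possible interaction (2^k - 1 terms for k predictors), which aov() cannot realistically fit; joining them with + (main effects only) may be the practical choice with this few rows.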
I have a table of numbers and can plot a 3D histogram in Excel.
Here is my histogram in Excel:
How can I do the same in R with plot3d?
In their example they use three values (x, y, z) per bar.
Here is their dataset and histogram in R:
But I have only one value per bar (its height).
My table:
-2.88 -1.76 -0.41 -2.25 -0.83 -0.62 -1.25 -2.68 -2.41 -1.74 -2.51 -0.78 -1.97 -2.67 -1.41 -1.56 0.49 -1.54 -1.37 -1.47 -2.32 0.66
-2.39 -1.98 -0.65 -2.33 -1.98 -1.19 -2.44 -2.13 -2.16 -2.44 -2.20 -1.77 -0.60 -0.73 -0.77 -1.59 -1.01 -1.37 -1.68 -0.92 -1.28 -0.12
-1.99 -2.48 -0.43 -1.75 -1.81 -2.37 -1.08 -1.18 -0.80 -3.30 -2.04 -1.96 -0.65 -2.44 -0.83 -1.67 -0.48 -1.03 -1.76 0.04 -1.30 -0.71
-2.73 -2.22 -0.98 -1.24 -2.21 -1.29 -1.37 -0.89 -0.86 -2.22 -1.32 -2.13 -1.04 -1.12 -0.60 -1.58 0.20 0.01 -1.81 -0.17 -0.38 -1.74
-1.63 -1.29 -1.31 -1.94 -2.39 -1.20 -1.66 -0.14 -0.96 -1.10 -0.40 -1.29 -0.44 -0.26 0.01 -2.71 -0.55 0.17 -3.44 -0.95 0.75 -1.08
-0.95 -0.15 -1.13 -1.18 -1.74 0.09 -1.12 -0.37 -0.80 -0.44 -1.18 -1.53 -1.28 0.36 -0.56 -1.54 -0.58 0.71 -1.53 -0.57 -0.91 -1.29
-0.67 0.02 -1.82 -0.84 -2.11 -0.38 -1.12 -0.57 -0.81 -1.04 -1.22 -0.93 -1.29 -0.26 0.02 -0.76 -0.28 -0.24 -0.43 -0.37 -1.30 -1.61
-3.45 -2.79 -0.44 -2.25 -0.81 -1.00 -1.20 -2.90 -1.96 -2.79 -2.91 -0.58 -1.65 -3.10 -1.23 -2.20 -0.15 -1.60 -1.51 -0.97 -2.35 0.38
-3.03 -3.12 -0.62 -2.01 -2.25 -1.84 -2.29 -2.51 -1.86 -2.93 -2.32 -1.63 -0.35 -1.05 -1.09 -2.04 -0.79 -1.18 -2.39 -0.54 -0.60 -0.71
-2.78 -2.60 -0.49 -1.69 -1.96 -2.10 -1.70 -1.26 -0.37 -2.80 -2.40 -2.23 -0.61 -2.26 -0.80 -2.11 -0.17 -0.21 -2.61 -0.09 -1.18 -1.26
-3.13 -1.96 -1.19 -1.17 -2.76 -0.87 -1.96 -0.22 -0.49 -2.75 -1.81 -2.48 -1.26 -1.04 0.08 -2.52 0.21 0.80 -2.28 -0.14 -0.27 -1.69
-1.52 -1.85 -1.36 -1.42 -2.28 -0.49 -1.58 -0.34 -1.11 -0.59 -0.74 -1.63 -0.58 -0.23 0.12 -2.97 0.17 0.68 -3.14 -0.64 0.21 -1.70
-1.05 -0.42 -1.50 -1.46 -2.32 -0.57 -0.63 -0.17 -0.79 -0.92 -1.52 -1.69 -1.25 0.34 -0.46 -1.94 0.27 0.82 -1.48 0.35 -1.25 -1.89
-1.03 0.28 -1.39 -0.82 -2.44 -0.75 -0.86 -0.69 -1.07 -1.38 -1.46 -1.09 -1.71 -0.50 0.59 -1.42 -0.54 -0.13 -0.86 -0.14 -1.28 -1.84
UPD:
I tried to insert the full dataset into one of the examples, just to see how the plot handles a huge number of bars. It pretty much freezes.
Also, I don't see negative bars. I assume positive bars should appear above 0 and negative bars below, as in my first picture.
So I realize that I first need to render the large dataset in order to choose the right library.
I also assume that full real-time 3D rendering may be impossible for this amount of data, so it would be fine if the library rendered just a single static picture, as hist3d does.
m <- structure(c(-2.88, -1.76, -0.41, -2.25, -0.83, -0.62, -1.25, -2.68, -2.41, -1.74, -2.51, -0.78, -1.97, -2.67, -1.41, -1.56, 0.49, -1.54, -1.37, -1.47, -2.32, 0.66,
-2.39, -1.98, -0.65, -2.33, -1.98, -1.19, -2.44, -2.13, -2.16, -2.44, -2.20, -1.77, -0.60, -0.73, -0.77, -1.59, -1.01, -1.37, -1.68, -0.92, -1.28, -0.12,
-1.99, -2.48, -0.43, -1.75, -1.81, -2.37, -1.08, -1.18, -0.80, -3.30, -2.04, -1.96, -0.65, -2.44, -0.83, -1.67, -0.48, -1.03, -1.76, 0.04, -1.30, -0.71,
<=-=-=-=-=-=-=-=-=-=-=-skipped ==============>>
-2.64, -0.89, -1.60, -2.28, -3.56, -0.84, 0.31, 0.48, -0.31, 0.03, -2.42, 0.92, -3.10, -2.35, 0.03, -2.56, -0.91, 1.01, -5.90, -0.40, 2.95, -1.32,
-3.06, -0.69, -0.74, -2.46, -4.16, 0.46, 0.97, 0.46, -0.47, -0.79, -3.12, 1.09, -3.53, -1.08, -0.25, -1.26, -0.57, 0.67, -4.76, 0.01, -0.08, -1.56,
-2.70, -0.89, -0.97, -2.40, -5.45, -1.26, 1.65, 0.24, -1.60, -1.79, -2.05, 0.18, -3.01, -0.39, 0.47, -2.21, -0.50, 0.77, -3.05, 0.81, -0.36, -1.98), .Dim = c(700L, 22L))
library(graph3d)

dat <- cbind(
  expand.grid(x = 1:700, y = 1:22),
  z = c(m)
)
graph3d(
  dat,
  ~x, ~y, ~z,
  type = "bar"
)
Please help me plot a histogram from the full txt file, with positive bars pointing up and negative bars pointing down.
My full txt file is here https://pastebin.com/2zyyRDy8
I've read my txt file into res_cut, but I see a data structure different from your examples; mine has 700 obs. of 23 variables.
res_cut <- read.delim("d:/result_cut.txt",sep = "\t", header = FALSE)
With the graph3d package:
m <- structure(c(-2.88, -2.39, -1.99, -2.73, -1.63, -0.95, -0.67,
-3.45, -3.03, -2.78, -3.13, -1.52, -1.05, -1.03, -1.76, -1.98,
-2.48, -2.22, -1.29, -0.15, 0.02, -2.79, -3.12, -2.6, -1.96,
-1.85, -0.42, 0.28, -0.41, -0.65, -0.43, -0.98, -1.31, -1.13,
-1.82, -0.44, -0.62, -0.49, -1.19, -1.36, -1.5, -1.39, -2.25,
-2.33, -1.75, -1.24, -1.94, -1.18, -0.84, -2.25, -2.01, -1.69,
-1.17, -1.42, -1.46, -0.82, -0.83, -1.98, -1.81, -2.21, -2.39,
-1.74, -2.11, -0.81, -2.25, -1.96, -2.76, -2.28, -2.32, -2.44,
-0.62, -1.19, -2.37, -1.29, -1.2, 0.09, -0.38, -1, -1.84, -2.1,
-0.87, -0.49, -0.57, -0.75, -1.25, -2.44, -1.08, -1.37, -1.66,
-1.12, -1.12, -1.2, -2.29, -1.7, -1.96, -1.58, -0.63, -0.86,
-2.68, -2.13, -1.18, -0.89, -0.14, -0.37, -0.57, -2.9, -2.51,
-1.26, -0.22, -0.34, -0.17, -0.69, -2.41, -2.16, -0.8, -0.86,
-0.96, -0.8, -0.81, -1.96, -1.86, -0.37, -0.49, -1.11, -0.79,
-1.07, -1.74, -2.44, -3.3, -2.22, -1.1, -0.44, -1.04, -2.79,
-2.93, -2.8, -2.75, -0.59, -0.92, -1.38, -2.51, -2.2, -2.04,
-1.32, -0.4, -1.18, -1.22, -2.91, -2.32, -2.4, -1.81, -0.74,
-1.52, -1.46, -0.78, -1.77, -1.96, -2.13, -1.29, -1.53, -0.93,
-0.58, -1.63, -2.23, -2.48, -1.63, -1.69, -1.09, -1.97, -0.6,
-0.65, -1.04, -0.44, -1.28, -1.29, -1.65, -0.35, -0.61, -1.26,
-0.58, -1.25, -1.71, -2.67, -0.73, -2.44, -1.12, -0.26, 0.36,
-0.26, -3.1, -1.05, -2.26, -1.04, -0.23, 0.34, -0.5, -1.41, -0.77,
-0.83, -0.6, 0.01, -0.56, 0.02, -1.23, -1.09, -0.8, 0.08, 0.12,
-0.46, 0.59, -1.56, -1.59, -1.67, -1.58, -2.71, -1.54, -0.76,
-2.2, -2.04, -2.11, -2.52, -2.97, -1.94, -1.42, 0.49, -1.01,
-0.48, 0.2, -0.55, -0.58, -0.28, -0.15, -0.79, -0.17, 0.21, 0.17,
0.27, -0.54, -1.54, -1.37, -1.03, 0.01, 0.17, 0.71, -0.24, -1.6,
-1.18, -0.21, 0.8, 0.68, 0.82, -0.13, -1.37, -1.68, -1.76, -1.81,
-3.44, -1.53, -0.43, -1.51, -2.39, -2.61, -2.28, -3.14, -1.48,
-0.86, -1.47, -0.92, 0.04, -0.17, -0.95, -0.57, -0.37, -0.97,
-0.54, -0.09, -0.14, -0.64, 0.35, -0.14, -2.32, -1.28, -1.3,
-0.38, 0.75, -0.91, -1.3, -2.35, -0.6, -1.18, -0.27, 0.21, -1.25,
-1.28, 0.66, -0.12, -0.71, -1.74, -1.08, -1.29, -1.61, 0.38,
-0.71, -1.26, -1.69, -1.7, -1.89, -1.84), .Dim = c(14L, 22L))
library(graph3d)

dat <- cbind(
  expand.grid(x = 1:14, y = 1:22),
  z = c(m)
)
graph3d(
  dat,
  ~x, ~y, ~z,
  type = "bar"
)
You could use hist3D from the plot3D package with the z parameter:
Using the same 14 x 22 matrix m as defined in the previous answer:
plot3D::hist3D(z=m)
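To run this on the full file rather than the 14-row sample, something like the following sketch should work. The read.delim() call is taken from the question; dropping a 23rd column is an assumption based on the question's "700 obs. of 23 variables" (the matrix above has 22 columns):
res_cut <- read.delim("d:/result_cut.txt", sep = "\t", header = FALSE)
m_full <- as.matrix(res_cut[, 1:22])  # assumed: keep the 22 numeric columns
plot3D::hist3D(z = m_full)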
I'm having difficulty doing a CC (canonical correlation) analysis in R.
The assignment I'm doing is from "Applied Multivariate Analysis" by Sharma, exercise 13.7, if you're familiar with it.
Basically, I'm asked to conduct a CCA on a set of variables. There are seven X variables but only five Y variables, so R complains that the dimensions are not compatible. Below is a representation of the data, called CETNEW.
Edited (Changed from image to dput):
structure(list(...1 = c("X1", "X2", "X3", "X4", "X5", "X6", "X7",
"Y1", "Y2", "Y3", "Y4", "Y5"), ...2 = c(2.72, 1.2, 0.82, 0.92,
1.19, 1, 1.45, 0.68, 0.98, 0.57, 1.07, 0.91), ...3 = c(1.2, 3.78,
0.7, 1.04, 1.06, 1.32, 1.31, 0.56, 1, 0.79, 1.13, 1.38), ...4 = c(0.82,
0.7, 1.7, 0.59, 0.83, 1.08, 1.01, 0.65, 0.78, 0.66, 0.93, 0.77),
...5 = c(0.92, 1.04, 0.59, 3.09, 1.06, 0.93, 1.47, 0.62, 1.26,
0.51, 0.94, 0.85), ...6 = c(1.19, 1.06, 0.83, 1.06, 2.94, 1.36,
1.66, 0.68, 1.16, 0.77, 1.37, 1.11), ...7 = c(1, 1.32, 1.08,
0.93, 1.36, 2.94, 1.56, 0.9, 1.23, 0.78, 1.65, 1.31), ...8 = c(1.45,
1.31, 1.01, 1.47, 1.66, 1.56, 3.11, 1.03, 1.7, 0.81, 1.63, 1.44),
...9 = c(0.68, 0.56, 0.65, 0.62, 0.68, 0.9, 1.03, 1.71, 0.99,
0.65, 0.86, 0.72), ...10 = c(0.98, 1, 0.78, 1.26, 1.16, 1.23,
1.7, 0.99, 3.07, 0.61, 1.43, 1.28), ...11 = c(0.57, 0.79, 0.66,
0.51, 0.77, 0.78, 0.81, 0.65, 0.61, 2.83, 1.04, 0.84), ...12 = c(1.07,
1.13, 0.93, 0.94, 1.37, 1.65, 1.63, 0.86, 1.43, 1.04, 2.83, 1.6),
...13 = c(0.91, 1.38, 0.77, 0.85, 1.11, 1.31, 1.44, 0.72, 1.28,
0.84, 1.6, 4.01)), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"))
What I've Done so Far
CETNEW <- CETNEW[,-1] #To remove the non-numeric values
Create two variables (criterion and predictor variables) as:
CETNEWx <- CETNEW[1:7,]
CETNEWy <- CETNEW[8:12,]
Then I've been using various packages such as CCA, CCP and candisc. From CCA:
ccCETNEW <- cc(CETNEWx,CETNEWy)
Yields the following error message:
Error in cov(X, Y, use = "pairwise") : incompatible dimensions
The matcor function also from CCA, yields the following error message:
Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 7, 5
Thus, it would seem that it all boils down to the dimension mismatch. I've talked to my professor about it, but since he uses SAS, which apparently can handle this situation, he could not help me with the R side.
Please, if you're familiar with canonical correlation and have had a similar problem before, any help regarding this topic is highly appreciated.
If you look at your data, notice the first column is divided into X and Y labels. That suggests to me that your data are transposed. If so, each column is an observation and the X and Y labels indicate various measurements taken on each observation. Canonical correlations are performed on two groups of measurements/variables from a single set of observations. First, here is the transposed data:
CETNEW.T <- structure(list(X1 = c(2.72, 1.2, 0.82, 0.92, 1.19, 1, 1.45, 0.68,
0.98, 0.57, 1.07, 0.91), X2 = c(1.2, 3.78, 0.7, 1.04, 1.06, 1.32,
1.31, 0.56, 1, 0.79, 1.13, 1.38), X3 = c(0.82, 0.7, 1.7, 0.59,
0.83, 1.08, 1.01, 0.65, 0.78, 0.66, 0.93, 0.77), X4 = c(0.92,
1.04, 0.59, 3.09, 1.06, 0.93, 1.47, 0.62, 1.26, 0.51, 0.94, 0.85
), X5 = c(1.19, 1.06, 0.83, 1.06, 2.94, 1.36, 1.66, 0.68, 1.16,
0.77, 1.37, 1.11), X6 = c(1, 1.32, 1.08, 0.93, 1.36, 2.94, 1.56,
0.9, 1.23, 0.78, 1.65, 1.31), X7 = c(1.45, 1.31, 1.01, 1.47,
1.66, 1.56, 3.11, 1.03, 1.7, 0.81, 1.63, 1.44), Y1 = c(0.68,
0.56, 0.65, 0.62, 0.68, 0.9, 1.03, 1.71, 0.99, 0.65, 0.86, 0.72
), Y2 = c(0.98, 1, 0.78, 1.26, 1.16, 1.23, 1.7, 0.99, 3.07, 0.61,
1.43, 1.28), Y3 = c(0.57, 0.79, 0.66, 0.51, 0.77, 0.78, 0.81,
0.65, 0.61, 2.83, 1.04, 0.84), Y4 = c(1.07, 1.13, 0.93, 0.94,
1.37, 1.65, 1.63, 0.86, 1.43, 1.04, 2.83, 1.6), Y5 = c(0.91,
1.38, 0.77, 0.85, 1.11, 1.31, 1.44, 0.72, 1.28, 0.84, 1.6, 4.01
)), class = "data.frame", row.names = c(NA, -12L))
Now the analysis runs fine:
library("CCA")
str(CETNEW.T)
# 'data.frame': 12 obs. of 12 variables:
# $ X1: num 2.72 1.2 0.82 0.92 1.19 1 1.45 0.68 0.98 0.57 ...
# $ X2: num 1.2 3.78 0.7 1.04 1.06 1.32 1.31 0.56 1 0.79 ...
# $ X3: num 0.82 0.7 1.7 0.59 0.83 1.08 1.01 0.65 0.78 0.66 ...
# $ X4: num 0.92 1.04 0.59 3.09 1.06 0.93 1.47 0.62 1.26 0.51 ...
# $ X5: num 1.19 1.06 0.83 1.06 2.94 1.36 1.66 0.68 1.16 0.77 ...
# $ X6: num 1 1.32 1.08 0.93 1.36 2.94 1.56 0.9 1.23 0.78 ...
# $ X7: num 1.45 1.31 1.01 1.47 1.66 1.56 3.11 1.03 1.7 0.81 ...
# $ Y1: num 0.68 0.56 0.65 0.62 0.68 0.9 1.03 1.71 0.99 0.65 ...
# $ Y2: num 0.98 1 0.78 1.26 1.16 1.23 1.7 0.99 3.07 0.61 ...
# $ Y3: num 0.57 0.79 0.66 0.51 0.77 0.78 0.81 0.65 0.61 2.83 ...
# $ Y4: num 1.07 1.13 0.93 0.94 1.37 1.65 1.63 0.86 1.43 1.04 ...
# $ Y5: num 0.91 1.38 0.77 0.85 1.11 1.31 1.44 0.72 1.28 0.84 ...
X <- CETNEW.T[, 1:7]
Y <- CETNEW.T[, 8:12]
ccCETNEW <- cc(X, Y)
ccCETNEW is a list with 5 components containing the results.
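For example, the main pieces can be pulled out of that list (a brief usage sketch; cc() stores the canonical correlations in cor and the raw coefficients in xcoef/ycoef):
ccCETNEW$cor      # the canonical correlations
ccCETNEW$xcoef    # coefficients for the X variables
ccCETNEW$ycoef    # coefficients for the Y variables
plt.cc(ccCETNEW)  # CCA's built-in plot of the canonical variates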
Currently we compute and sort stock data (X1 to X10). Historical data is stored in Excel and R for the periods 1950-1980, 1980-1999 and 1950-1999.
The dataset:
date X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 1950-01-01 5.92 6.35 4.61 4.08 5.47 3.90 2.35 1.49 2.27 0.82
2 1950-02-01 2.43 2.16 2.10 1.58 -0.05 1.14 1.51 1.52 2.02 1.12
3 1950-03-01 -0.81 0.21 -1.67 -0.02 -0.79 0.18 -0.22 1.03 0.12 1.75
4 1950-04-01 5.68 6.45 5.41 5.94 6.10 5.87 3.82 3.34 3.44 3.97
5 1950-05-01 3.84 1.60 1.64 3.33 2.54 2.12 4.46 2.83 3.82 4.75
6 1950-06-01 -9.88 -10.56 -8.02 -7.86 -7.27 -7.44 -7.13 -7.76 -6.32 -5.04
7 1950-07-01 9.09 8.76 7.31 5.88 3.84 4.61 3.09 3.07 1.41 0.42
598 1999-10-01 -0.95 -1.88 -1.25 -0.52 1.65 0.72 5.41 4.38 5.58 6.59
599 1999-11-01 11.57 9.15 8.17 7.14 6.15 4.95 5.78 4.21 1.55 2.15
600 1999-12-01 12.32 14.97 9.29 11.77 11.09 5.89 11.88 11.26 6.23 5.64
The main question: we would like to compute and plot efficient frontiers for these time periods in one graph, to see how the efficient frontier has evolved. Are there ways to do this in R?
The efficient frontier is the set of optimal portfolios that offers the highest expected return for a defined level of risk or the lowest risk for a given level of expected return.
In modern portfolio theory, the efficient frontier (or portfolio frontier) is the set of investment portfolios that occupy the 'efficient' part of the risk-return spectrum. Formally, it is the set of portfolios satisfying the condition that no other portfolio exists with a higher expected return at the same standard deviation of return.
So, how would one go about computing this in R?
dput sample data (first 50 rows)
> dput(head(data,50))
structure(list(X__1 = structure(c(-631152000, -628473600, -626054400,
-623376000, -620784000, -618105600, -615513600, -612835200, -610156800,
-607564800, -604886400, -602294400, -599616000, -596937600, -594518400,
-591840000, -589248000, -586569600, -583977600, -581299200, -578620800,
-576028800, -573350400, -570758400, -568080000, -565401600, -562896000,
-560217600, -557625600, -554947200, -552355200, -549676800, -546998400,
-544406400, -541728000, -539136000, -536457600, -533779200, -531360000,
-528681600, -526089600, -523411200, -520819200, -518140800, -515462400,
-512870400, -510192000, -507600000, -504921600, -502243200), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), X__2 = c(5.92, 2.43, -0.81, 5.68,
3.84, -9.88, 9.09, 4.93, 3.99, -0.5, 3.09, 15.77, 8.22, 0.36,
-7.36, 3.84, -2.81, -7.12, 3.57, 6.59, 1.04, -1.41, -1.42, -0.53,
1.86, -3.25, 0.68, -4.4, 0.57, 2.5, -0.36, -0.74, -1.11, -0.58,
3.22, 0.33, 5.01, 2.75, -1.25, -2.13, 1.3, -4.42, 0.25, -5.56,
-4.09, 2.71, 2.01, -3.15, 8.48, -0.16), X__3 = c(6.35, 2.16,
0.21, 6.45, 1.6, -10.56, 8.76, 4.63, 3.52, -1.2, 3.36, 10.98,
8.41, 0.81, -4.01, 3.56, -4.27, -6.11, 4.7, 5.3, 2.73, -3.07,
-0.13, 0.6, 1.1, -2.77, 2.37, -4.5, 1.87, 3.18, 1.51, 0.43, -1.91,
-1.52, 4.91, 1.43, 3.4, 3.03, -2.25, -2, 0.34, -4.75, 2.24, -6.53,
-1.87, 1.97, 1.78, -2.96, 7.38, 0.43), X__4 = c(4.61, 2.1, -1.67,
5.41, 1.64, -8.02, 7.31, 4.56, 5.18, -0.46, 3.52, 10.78, 8.46,
0.28, -4.88, 4.26, -3.25, -6.76, 6.78, 4.99, 3.86, -2.57, 0.59,
0.16, 1.75, -2.04, 2.49, -5.29, 1.76, 2.88, 0.76, 0.67, -1.67,
-1.45, 5.69, 2.95, 3.66, 1.15, -1.58, -2.34, 0.51, -3.82, 0.72,
-6.25, -2.33, 3.1, 2.19, -2.63, 7.3, 1.82), X__5 = c(4.08, 1.58,
-0.02, 5.94, 3.33, -7.86, 5.88, 4.68, 5.99, 0.75, 2.68, 9.29,
8, 1.08, -3.13, 4.21, -3.35, -5.01, 5.77, 4.85, 2.73, -3.44,
0.27, 1.56, 1.62, -2.35, 2.93, -4.62, 2.36, 2.56, 0.86, 0.16,
-1.8, -2.04, 5.12, 2.72, 3.21, 1.21, -2.17, -1.84, 0.32, -3.63,
1.47, -5.16, -0.65, 3.33, 1.34, -1.36, 6.24, 1.19), X__6 = c(5.47,
-0.05, -0.79, 6.1, 2.54, -7.27, 3.84, 6.29, 4.46, -0.24, 2.42,
6.12, 8.63, 0.88, -3.31, 4.56, -2.14, -5.62, 5.73, 5.36, 2.44,
-1.88, 0.83, 0.65, 1.47, -1.81, 2.31, -4.48, 2.56, 2.69, 0.9,
0.34, -0.62, -1.58, 6.59, 0.86, 3.58, 1.92, -1.85, -2.79, 0.7,
-3.4, 1.26, -5.26, -1.18, 4.26, 1.35, -0.97, 6.66, 1.77), X__7 = c(3.9,
1.14, 0.18, 5.87, 2.12, -7.44, 4.61, 4.57, 6.14, -0.84, 4.22,
8.37, 7.44, 0.69, -4.26, 4.13, -2.24, -6.75, 5.81, 4.35, 1.98,
-2.87, 0.93, 0.61, 1.27, -2.18, 2.97, -4.09, 2.27, 2.96, 1.16,
-0.38, -2.37, -0.71, 5.53, 2.45, 1.3, 0.31, -0.47, -2.03, 0.14,
-3.26, 1.79, -5.5, -1.47, 4.18, 1.96, -1.35, 7.06, 1.69), X__8 = c(2.35,
1.51, -0.22, 3.82, 4.46, -7.13, 3.09, 5.01, 5.84, -1.05, 3.81,
7.54, 6.46, 0.71, -3.56, 4.42, -1.87, -4.52, 7.3, 3.66, 2.11,
-2.92, 2.25, 2.17, 1.32, -1.71, 3.17, -4.63, 2.59, 3.89, 0.49,
0.21, -1.71, -1.18, 4.95, 3.21, 1.41, 0.89, -1.02, -2.89, 0.59,
-2.67, 1.47, -4.62, -0.69, 4.07, 2.83, -1.44, 6.11, 1.58), X__9 = c(1.49,
1.52, 1.03, 3.34, 2.83, -7.76, 3.07, 3.72, 6.21, -1.66, 3.46,
6.14, 7.17, 2.13, -3.19, 4.59, -2.65, -3.5, 7.43, 3.5, 2.41,
-2.73, 1.35, 1.97, 1.72, -1.8, 4.06, -5.35, 2.57, 3.14, 1.89,
-0.86, -1.73, -0.95, 6.07, 1.73, 1.09, 0.37, -1.34, -2.48, 0.31,
-3.2, 1.34, -4.99, -0.18, 4.35, 3.03, 0.09, 5.65, 2.39), X__10 = c(2.27,
2.02, 0.12, 3.44, 3.82, -6.32, 1.41, 4.54, 5.55, -0.97, 3.8,
5.69, 5.65, 1.78, -2.6, 4.21, -1.29, -2.63, 7.15, 3.52, 1.85,
-2.32, 0.96, 2.74, 1.9, -2.6, 3.83, -4.31, 3.15, 2.76, 0.93,
-0.39, -1.86, -1.57, 7.05, 2.36, -0.33, -0.23, -0.54, -2.6, 0.61,
-2.37, 2.12, -3.76, 0.47, 3.98, 3.03, 0.2, 5.63, 1.26), X__11 = c(0.82,
1.12, 1.75, 3.97, 4.75, -5.04, 0.42, 4.96, 4.32, 0.25, 2.26,
4.71, 5.05, 1.63, -1.53, 5.12, -2.59, -1.92, 6.89, 4.48, -0.09,
-2.49, 0.26, 4.03, 1.37, -2.82, 4.95, -5.1, 3.4, 4.29, 0.89,
-1.06, -2.18, -0.31, 5.76, 3.32, -1.04, -0.63, -1.78, -2.97,
0.55, -1.3, 2.75, -4.47, 0.48, 4.83, 2.85, 0.27, 4.4, 1.93)), .Names = c("date",
"X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8",
"X9", "X10"), row.names = c(NA, 50L), class = c("tbl_df",
"tbl", "data.frame"))
After some correspondence with @Jonathan via the comments, I widened the example data from 3 columns to 12 columns with some sampling. The code in the "With short-selling" section of the blog scales well to 10K observations:
# using code at:
# https://www.r-bloggers.com/a-gentle-introduction-to-finance-using-r-efficient-frontier-and-capm-part-1/
# https://datashenanigan.wordpress.com/2016/05/24/a-gentle-introduction-to-finance-using-r-efficient-frontier-and-capm-part-1/
library(data.table)

calcEFParams <- function(rets) {
  retbar <- colMeans(rets, na.rm = TRUE)
  covs <- var(rets, na.rm = TRUE)  # covariance matrix of the returns
  invS <- solve(covs)              # inverse of the covariance matrix
  i <- matrix(1, nrow = length(retbar))
  alpha <- t(i) %*% invS %*% i
  beta <- t(i) %*% invS %*% retbar
  gamma <- t(retbar) %*% invS %*% retbar
  delta <- alpha * gamma - beta * beta
  list(alpha = as.numeric(alpha),
       beta = as.numeric(beta),
       gamma = as.numeric(gamma),
       delta = as.numeric(delta))
}

# load data
link <- "https://raw.githubusercontent.com/DavZim/Efficient_Frontier/master/data/mult_assets.csv"
df <- data.table(read.csv(link))

# widen the data by repeated shuffling and binding: 3 -> 6 -> 12 columns
df2 <- df[, lapply(.SD, sample), ]
df3 <- cbind(df, df2)
df4 <- df3[, lapply(.SD, sample), ]
df5 <- cbind(df3, df4)
Loading the microbenchmark package, the performance is as follows:
> library(microbenchmark)
> microbenchmark(calcEFParams(df5), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
calcEFParams(df5) 2.692514 2.764053 2.795127 2.777547 2.805447 3.024349 10
It seems that David Zimmermann's code is scalable and efficient enough!
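To get from these parameters to the plot the question asks for: along the frontier, the minimum standard deviation for a target return mu satisfies sigma^2 = (alpha*mu^2 - 2*beta*mu + gamma)/delta, so each period's frontier can be traced as a curve and overlaid. A sketch (the mu grid and the row ranges for the sub-periods are assumptions):
efSigma <- function(p, mu) {
  # frontier standard deviation for a vector of target returns mu
  sqrt((p$alpha * mu^2 - 2 * p$beta * mu + p$gamma) / p$delta)
}

mu <- seq(0, max(colMeans(df5)) * 1.5, length.out = 200)
p_all <- calcEFParams(df5)
plot(efSigma(p_all, mu), mu, type = "l",
     xlab = "risk (sd of return)", ylab = "expected return")
# overlay another period by subsetting the return rows, e.g.:
# lines(efSigma(calcEFParams(df5[1:360]), mu), mu, col = "red")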