I have character strings that I want to convert to tables. The identifier in each row can have white spaces and I need them removed without also removing spaces between the numbers. Is it possible to use a regular expression to achieve this?
For example, the data would look like this:
A B C 5.65 7.8
DC 5.65 7.8
D AB 7.9 12.2
D AB C 7.9 1.2
A BC 13.88 2.4
AB C 7.9 12.2
And I want to get to this:
ABC 5.65 7.8
DC 5.65 7.8
DAB 7.9 12.2
DABC 7.9 1.2
ABC 13.88 2.4
ABC 7.9 12.2
EDIT: As requested, this is an example of the data type and the form in which I receive it. This has 17 rows, each with an alphabetic identifier in the first column followed by 6 columns of numeric data.
# Data as I receive it.
data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
"Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Required format
data2 <- c("Aa", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86",
"DEt", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "GfHI",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "JK",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "LM",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "OP", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "QstR", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "VWC", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Zabi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Reshape the flat vector into 7 columns (identifier + 6 values), filling
# the matrix row by row. TRUE is spelled out because T can be reassigned.
df <- data.frame(matrix(data2, ncol = 7, byrow = TRUE))
To do as you request within your R environment, one approach is to convert the vector to a string, apply a regular expression filter to the string, then convert the string back to a vector.
See details below, hopefully this points you in the right direction.
Solution
data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
"Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Use the stringi regular expression engine (it supports look-behind / look-ahead)
library(stringi)
# Convert the vector data to be a string sequence - so we can manipulate as text.
# toString() joins the elements using ", " (comma + space) as the separator.
data1 <- toString(data)
# Now we can apply the regular expression substitution to the data (formatted as a string).
# Here we do a:
#
# (?<!\d) - Negative look behind: the separator must not be preceded by a digit.
# ,       - The literal ", " separator (comma + space) inserted by toString().
# (?!\d)  - Negative look ahead: the separator must not be followed by a digit.
#
# Only a separator that sits between two non-numeric pieces matches, so the
# identifier fragments are glued together while the numbers stay separated.
data3 <- stri_replace_all_regex(str = data1, pattern = "(?<!\\d), (?!\\d)", replacement = "")
# OK, check the string data...
data3
# Now we convert the string back to be a vector. Split on the complete ", "
# separator: splitting on " " alone leaves a trailing comma on every element.
newData <- strsplit(data3, ", ", fixed = TRUE)[[1]]
newData
# Now we convert to a dataframe (7 columns: identifier + 6 values, filled by row)...
df <- data.frame(matrix(newData, ncol = 7, byrow = TRUE))
df
# Done
Output
> data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
+ "B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
+ "Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
+ "2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
+ "2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
+ "2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
+ "1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
+ "1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
+ "2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
+ "2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
+ "2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
+ "40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
+ "81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
+ "9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
+ "82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
+ "9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
+ "84.20", "9.6", "1.69")
>
> # Use stringi base regular expression engine
> require(stringi)
>
> # Convert the vector data to be a string sequence - so we can manipulate as text
> data1 <- toString(data)
>
> # Now we can apply the regular expression substitution to the data (formatted as a string...
> # Here we do a:
> #
> # (?<!\d) - Negative look behind to prevent a digit.
> # , - A literal combination of quotes, comma and space. We drop the ", " in conversion to string...
> # (?!\d) - Negative look ahead to prevent a digit.
> #
> data3 = stri_replace_all_regex(str = data1, pattern = '(?<!\\d), (?!\\d)', replacement = '')
> # OK, check the string data...
> data3
[1] "Aa, 2.07, 2.35, 39.00, 82.20, 8.8, 3.80, B, 2.26, 2.25, 40.00, 80.80, 8.1, 1.86, DEt, 2.07, 2.22, 41.00, 83.80, 8.8, 3.87, F, 2.05, 2.15, 43.00, 82.20, 8.4, 3.11, Bc, 2.08, 2.12, 48.00, 82.60, 8.3, 2.47, GfHI, 2.08, 2.10, 46.00, 82.20, 8.1, 2.90, JK, 1.95, 2.08, 38.00, 83.40, 8.7, 1.63, LM, 1.89, 2.07, 45.00, 83.80, 9.0, 1.84, N, 2.06, 2.05, 41.00, 80.60, 9.0, 4.09, OP, 1.86, 2.04, 48.00, 81.60, 8.6, 2.60, QstR, 1.95, 2.03, 44.00, 82.80, 8.8, 1.40, S, 2.03, 2.02, 40.00, 81.40, 8.2, 1.74, T, 1.95, 2.01, 43.00, 81.80, 9.0, 2.30, Unh, 1.96, 2.00, 44.00, 82.60, 9.2, 2.40, VWC, 1.98, 1.97, 40.00, 82.00, 8.1, 1.15, Yu, 1.90, 1.96, 41.00, 82.80, 9.6, 2.08, Zabi, 1.90, 1.95, 42.00, 84.20, 9.6, 1.69"
>
> # Now we convert the string back to be a vector...
> newData = strsplit(data3, " ")[[1]]
> newData
[1] "Aa," "2.07," "2.35," "39.00," "82.20," "8.8," "3.80," "B," "2.26," "2.25," "40.00," "80.80,"
[13] "8.1," "1.86," "DEt," "2.07," "2.22," "41.00," "83.80," "8.8," "3.87," "F," "2.05," "2.15,"
[25] "43.00," "82.20," "8.4," "3.11," "Bc," "2.08," "2.12," "48.00," "82.60," "8.3," "2.47," "GfHI,"
[37] "2.08," "2.10," "46.00," "82.20," "8.1," "2.90," "JK," "1.95," "2.08," "38.00," "83.40," "8.7,"
[49] "1.63," "LM," "1.89," "2.07," "45.00," "83.80," "9.0," "1.84," "N," "2.06," "2.05," "41.00,"
[61] "80.60," "9.0," "4.09," "OP," "1.86," "2.04," "48.00," "81.60," "8.6," "2.60," "QstR," "1.95,"
[73] "2.03," "44.00," "82.80," "8.8," "1.40," "S," "2.03," "2.02," "40.00," "81.40," "8.2," "1.74,"
[85] "T," "1.95," "2.01," "43.00," "81.80," "9.0," "2.30," "Unh," "1.96," "2.00," "44.00," "82.60,"
[97] "9.2," "2.40," "VWC," "1.98," "1.97," "40.00," "82.00," "8.1," "1.15," "Yu," "1.90," "1.96,"
[109] "41.00," "82.80," "9.6," "2.08," "Zabi," "1.90," "1.95," "42.00," "84.20," "9.6," "1.69"
>
> # Now we convert to a dataframe...
> df <- data.frame(matrix(newData, ncol=7, byrow=T))
> df
X1 X2 X3 X4 X5 X6 X7
1 Aa, 2.07, 2.35, 39.00, 82.20, 8.8, 3.80,
2 B, 2.26, 2.25, 40.00, 80.80, 8.1, 1.86,
3 DEt, 2.07, 2.22, 41.00, 83.80, 8.8, 3.87,
4 F, 2.05, 2.15, 43.00, 82.20, 8.4, 3.11,
5 Bc, 2.08, 2.12, 48.00, 82.60, 8.3, 2.47,
6 GfHI, 2.08, 2.10, 46.00, 82.20, 8.1, 2.90,
7 JK, 1.95, 2.08, 38.00, 83.40, 8.7, 1.63,
8 LM, 1.89, 2.07, 45.00, 83.80, 9.0, 1.84,
9 N, 2.06, 2.05, 41.00, 80.60, 9.0, 4.09,
10 OP, 1.86, 2.04, 48.00, 81.60, 8.6, 2.60,
11 QstR, 1.95, 2.03, 44.00, 82.80, 8.8, 1.40,
12 S, 2.03, 2.02, 40.00, 81.40, 8.2, 1.74,
13 T, 1.95, 2.01, 43.00, 81.80, 9.0, 2.30,
14 Unh, 1.96, 2.00, 44.00, 82.60, 9.2, 2.40,
15 VWC, 1.98, 1.97, 40.00, 82.00, 8.1, 1.15,
16 Yu, 1.90, 1.96, 41.00, 82.80, 9.6, 2.08,
17 Zabi, 1.90, 1.95, 42.00, 84.20, 9.6, 1.69
> # Done
Related
This error is thrown when I try to run this code. The anova and the anova results table work without issue. Why is the column in pairwise_t_test not the same as the dv = column in the anova?
Error in pull():
! Can't extract columns that don't exist.
✖ Column column doesn't exist.
library(rstatix)
# Names of all numeric columns of df; each one is analysed in turn below.
nnames <- names(df)[unlist(lapply(df, is.numeric))]
# Result containers, filled per column name inside the loop.
res.aov <- list()
aov_tab <- list()
pc <- list()
pc1 <- list()
# `column` holds the NAME of the current variable as a string.
for (column in nnames) {
# Mixed ANOVA for the current column (within = Timepoint, between = Genotype).
res.aov[[column]] <- anova_test(data = df, dv = column,
wid = `Subject`, within = `Timepoint`, between = `Genotype`)
aov_tab[[column]] <- get_anova_table(res.aov[[column]])
# NOTE(review): inside a formula `column` is taken literally, i.e. as a
# variable called "column", not as the name stored in the loop variable.
# This is the call that raises "Column `column` doesn't exist".
pc[[column]]<- df %>% pairwise_t_test(column ~`Timepoint`, paired=TRUE, p.adjust.method = "holm")
pc[[column]]<- pc[[column]] %>% add_xy_position(x="Timepoint")
# Same literal-`column` problem as above, here per Timepoint group.
pc1[[column]]<- df %>% group_by(Timepoint) %>% pairwise_t_test(column ~ `Genotype`)
pc1[[column]]<- pc1[[column]] %>% add_xy_position(x= "Timepoint")
}
dataframe
dput(df)
structure(list(Subject = c("ASCVD002", "ASCVD002", "ASCVD002",
"ASCVD003", "ASCVD003", "ASCVD003", "ASCVD004", "ASCVD004", "ASCVD004",
"ASCVD005", "ASCVD005", "ASCVD005", "ASCVD006", "ASCVD006", "ASCVD006",
"ASCVD008", "ASCVD008", "ASCVD008", "ASCVD009", "ASCVD009", "ASCVD009",
"ASCVD010", "ASCVD010", "ASCVD010", "ASCVD011", "ASCVD011", "ASCVD011"
), Timepoint = c("0", "0.25", "0.5", "0", "0.25", "0.5", "0",
"0.25", "0.5", "0", "0.25", "0.5", "0", "0.25", "0.5", "0", "0.25",
"0.5", "0", "0.25", "0.5", "0", "0.25", "0.5", "0", "0.25", "0.5"
), Genotype = c("Heterozygote", "Heterozygote", "Heterozygote",
"Heterozygote", "Heterozygote", "Heterozygote", "Heterozygote",
"Heterozygote", "Heterozygote", "GG", "GG", "GG", "AA", "AA",
"AA", "GG", "GG", "GG", "AA", "AA", "AA", "AA", "AA", "AA", "GG",
"GG", "GG"), `Tregs CD127lo CD25+` = c(2702, 2175, 2651, 1672.8,
3762, 4264, 1975, 3208, 3285, 3457, 3383, 2619.9, 11872, 16101,
13443, 3935, 1894, 2297, 7385, 8901, 9522, 7100, 8789, 9309,
371, 379, 514), `Monocytes % of Live by Size` = c(1.38, 2.66,
4.74, 5.83, 3.9, 5.06, 6.36, 3.45, 2.64, 6.33, 10.7, 9.41, 3.42,
3.46, 2.73, 2.38, 3.12, 4.44, 5.31, 3.59, 4.91, 1.53, 6.54, 4.85,
6.87, 3.66, 5.07), `NK cells` = c(90.62, 153.6, 159.8, 88, 118,
159, 74, 82, 64, 30, 344, 73, 29, 198, 79, 145, 258, 307, 30,
74.4, 0, 47.3, 32, 0, 52.6, 95.3, 51.7)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -27L))
I have run it outside of the loop using the specific column, and it works without the error.
In pairwise_t_test you provide a formula that contains "column". The object column is a length 1 vector containing the name (!) of the variable you are interested in but not the values of the variable itself. The formula needs to have the actual variable name in it, not an object referring to the variable.
You can avoid this by constructing the formula like this:
pairwise_t_test(as.formula(paste0("`",column,"`", "~ Timepoint")), ...)
And likewise in the second call to pairwise_t_test.
By the way, you have very unhandy variable names in nnames. With more simple variable names (no spaces or special characters) you do not need the "`" in the code.
I have the following data containing three Funds (A, B and C) and their respective data for (Return minus Risk Free Rate) and (Market Return minus Risk Free Rate):
structure(list(`Fund Name` = c("A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C"), Date = c("2018-08-01",
"2018-08-02", "2018-08-03", "2018-10-22", "2018-10-23", "2018-10-24",
"2018-12-18", "2019-01-08", "2019-01-09", "2019-01-10", "2019-01-11",
"2019-01-14", "2019-01-15", "2019-01-16", "2019-02-07", "2019-02-08",
"2019-02-11", "2019-02-12", "2019-02-13", "2019-02-14", "2019-02-15",
"2019-02-18", "2019-02-19", "2019-02-20", "2019-03-15", "2019-03-18",
"2019-03-19", "2019-04-01", "2019-04-02", "2019-04-03", "2019-04-04",
"2019-04-10", "2019-04-11", "2019-04-12", "2019-04-15", "2018-08-01",
"2018-08-02", "2018-08-10", "2018-08-13", "2018-08-14", "2018-08-16",
"2018-08-17", "2018-10-23", "2018-10-24", "2018-10-25", "2018-10-26",
"2018-10-29", "2018-10-30", "2018-10-31", "2018-11-13", "2018-11-14",
"2018-11-22", "2018-11-23", "2018-12-06", "2018-12-07", "2018-12-10",
"2018-12-11", "2018-12-12", "2018-12-13", "2018-12-14", "2018-12-17",
"2018-12-18", "2019-02-06", "2019-02-07", "2019-02-08", "2019-02-11",
"2019-02-12", "2019-02-13", "2019-02-14", "2019-02-15", "2019-03-04",
"2019-03-05", "2019-03-06", "2019-03-07", "2019-03-08", "2019-03-11",
"2019-03-26", "2019-03-27", "2019-04-05", "2019-04-08", "2019-04-12",
"2019-04-15", "2018-08-01", "2018-08-02", "2018-08-03", "2018-08-06",
"2018-08-07", "2018-08-08", "2018-08-09", "2018-08-10", "2018-08-13",
"2018-08-14", "2018-08-23", "2019-01-29", "2019-03-01", "2019-03-04",
"2019-03-05", "2019-03-06", "2019-03-07", "2019-03-27", "2019-03-28",
"2019-03-29", "2019-04-01", "2019-04-02", "2019-04-03", "2019-04-04",
"2019-04-12", "2019-04-15"), `Return-RF` = c(NA, -0.031053409,
-0.004149784, -0.019431914, -0.025985785, -0.022325086, -0.013000177,
-0.005969802, 0.003743827, -0.005973689, -0.012279585, -0.012621233,
-0.014248868, -0.000850313, -0.038296552, -0.020249538, -0.002319941,
-0.003117846, -0.006643616, -0.012684205, 0.00480718, -0.000708029,
-0.007510481, -0.001464912, -0.008793153, -0.003356718, -0.005595538,
0.00592619, -0.006444843, 0.007778815, -0.01019018, -0.008793842,
-0.003549589, 0.000596707, -0.005270976, NA, -0.024337163, -0.030609843,
-0.012780354, -0.011857873, NA, -0.00906015, -0.035681946, -0.007920997,
-0.020963305, -0.013154577, 0.002038879, -0.019934722, 0.007708796,
-0.019404458, 0.000443959, -0.008925886, -0.017543139, -0.033810649,
-0.002362211, -0.02975915, -0.002819632, -0.000687416, -0.006733802,
-0.02423122, -0.017747687, -0.009444599, -0.006353213, -0.020454878,
-0.028563249, -0.005726489, -0.003094262, -0.001040783, -0.012626742,
-0.001097087, -0.009497361, -0.015542972, 5.53889e-05, -0.020560822,
-0.023744172, -0.00744049, -0.00193544, -0.013016594, -0.008529772,
-0.005602241, -0.004651093, -0.005644803, NA, -0.02207606, -0.006369491,
-0.012551725, -0.003201358, -0.01153393, -0.010203346, -0.033352688,
-0.01224557, -0.011346633, -0.012929118, -0.006728953, -0.004243723,
-0.012659234, -0.009103863, -0.011760838, -0.023812576, -0.013908016,
-0.013459074, -0.004005417, 0.004751808, -0.007972052, 0.006040872,
-0.011324789, -0.000427748, -0.007779257), `Mkt-RF` = c(-0.64,
-1.36, 0.36, -0.85, -1.53, -1.26, -0.41, 0.61, 1.51, -0.13, -0.21,
-0.6, -0.01, 0.19, -1.63, -0.75, 0.33, 0.94, 0.07, 0.01, 1.22,
0.46, 0.12, 0.55, 0.93, 0.39, 0.62, 1.09, 0.45, 1.01, -0.28,
0.25, 0.11, 0.63, 0.3, -0.64, -1.36, -2.01, -0.28, -0.54, 0.71,
0.41, -1.53, -1.26, 0.5, -0.61, 0.65, -0.07, 1.37, 1.01, -0.28,
-0.44, -0.29, -2.49, 0.45, -1.98, 0.8, 1.98, -0.13, -1.23, -0.93,
-0.41, -0.28, -1.63, -0.75, 0.33, 0.94, 0.07, 0.01, 1.22, 0.03,
-0.03, -0.19, -1.44, -0.47, 0.85, 0.31, -0.14, 0.15, 0.24, 0.63,
0.3, -0.64, -1.36, 0.36, -0.18, 0.73, -0.08, -0.42, -2.01, -0.28,
-0.54, -0.54, 0.43, 0.52, 0.03, -0.03, -0.19, -1.44, -0.14, -0.34,
0.67, 1.09, 0.45, 1.01, -0.28, 0.63, 0.3)), class = "data.frame", row.names = c(NA,
-108L))
Now I would like to compute the beta via the CAPM for the three different funds.
I tried with the lm function, but it gives only one beta for all three funds together.
I tried with the following code:
# Mark the panel as grouped by fund.
Panel <- Panel %>%
group_by(`Fund Name`)
# NOTE(review): the formula uses Panel$... which extracts the complete
# columns, so the grouping above is not used and a single regression is
# fitted over all funds together.
Regression <- lm(Panel$`Return-RF`~ Panel$`Mkt-RF`)
Could someone help me here with the code?
You can split() your dataframe by fund, then run the regression on each subset using lapply():
# One regression per fund: cut the panel into per-fund data frames, then
# regress `Return-RF` on `Mkt-RF` within each piece.
Panel_Funds <- split(Panel, f = Panel$`Fund Name`)
Regressions <- lapply(X = Panel_Funds, FUN = function(x) {
  lm(`Return-RF` ~ `Mkt-RF`, data = x)
})
# Print every fitted model; the list is named by fund.
Regressions
Output:
$A
Call:
lm(formula = `Return-RF` ~ `Mkt-RF`, data = x)
Coefficients:
(Intercept) `Mkt-RF`
-0.00964 0.01205
$B
Call:
lm(formula = `Return-RF` ~ `Mkt-RF`, data = x)
Coefficients:
(Intercept) `Mkt-RF`
-0.010538 0.008266
$C
Call:
lm(formula = `Return-RF` ~ `Mkt-RF`, data = x)
Coefficients:
(Intercept) `Mkt-RF`
-0.009401 0.010676
If you want to save the coefficients to a table, you can use broom::tidy(); see my answer here for an example.
Are you trying to calculate the variance and covariance to compute the beta?
I would turn your data into a tibble then drop the NA values,
# Assign the cleaned tibble back to `data` (the outer parentheses still print
# it); without the assignment the NA rows remain in `data` for later steps.
(data <- data %>% as_tibble() %>% drop_na())
then you can easily extract variance for each company,
# Compare against the string "A"; a bare A would be looked up as an object.
fundA <- data %>% filter(`Fund Name` == "A")
then get variance,
var(fundA$`Return-RF`)
I have data from a split-plot experiment, and I'd like to evaluate some contrasts between different factorial combinations. Evaluating contrasts of a complete randomized design, or a randomized complete block design seems straightforward: create an lm() object, construct a contrast matrix, then analyze contrasts with glht() from the multcomp package. However, I'm not aware of how to make an lm() object that takes into account the error terms needed for a split-plot analysis. I can make an aov() object that reflects the experiment design, but I'm unsure how to use that to evaluate contrasts. Below is a reprex with some example data. I feel an obvious answer must be out there, but my own searching has failed me. Any pointers would be appreciated.
library(tidyverse)
structure(list(Treatment = c("on", "on", "on", "on", "on", "on",
"on", "on", "on", "on", "on", "on", "on", "on", "on", "al", "al",
"al", "al", "al", "al", "al", "al", "al", "al", "al", "al", "al",
"al", "al", "ix", "ix", "ix", "ix", "ix", "ix", "ix", "ix", "ix",
"ix", "ix", "ix", "ix", "ix", "ix"), Material = c("T", "T", "T",
"A", "A", "A", "E", "E", "E", "N", "N", "N", "O", "O", "O", "T",
"T", "T", "A", "A", "A", "E", "E", "E", "N", "N", "N", "O", "O",
"O", "T", "T", "T", "A", "A", "A", "E", "E", "E", "N", "N", "N",
"O", "O", "O"), Rep = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1,
2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1,
2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), Measurement = c(20.67, 20.74,
20.6, 14.35, 14.42, 14.39, 15.04, 15.08, 15, 18.08, 18.08, 18.14,
18.61, 18.55, 18.08, 20.89, 20.89, 20.67, 14.46, 14.53, 14.39,
15.12, 15.16, 15.04, 18.25, 18.31, 18.14, 18.61, 18.8, 18.55,
20.74, 21.5, 20.89, 14.6, 14.53, 14.46, 15.4, 15.48, 15.16, 18.73,
18.73, 18.31, 18.55, 19.17, 18.73)), row.names = c(NA, -45L), class = c("tbl_df",
"tbl", "data.frame"))
# NOTE(review): spd.data is not assigned in the visible snippet; presumably
# the structure(...) printed above is meant to be stored under that name.
# Convert the design variables to factors so they are treated as categorical.
spd.data$Treatment = as.factor(spd.data$Treatment)
spd.data$Material = as.factor(spd.data$Material)
spd.data$Rep = as.factor(spd.data$Rep)
# Split-plot ANOVA: Error(Rep/Treatment) declares the error strata; the data
# frame is piped in through the `.` placeholder as the second argument.
spd.data.aov = spd.data %>%
aov(Measurement~
Rep +
Treatment +
Error(Rep/Treatment) +
Material +
Material:Treatment, .)
spd.data.aov %>% summary()
# Same formula passed to lm(). This is the call that fails with
# 'could not find function "Error"'.
spd.data.lm = spd.data %>%
lm(Measurement~
Rep +
Treatment +
Error(Rep/Treatment) +
Material +
Material:Treatment, .)
The final lm() evaluation outputs Error in Error(Rep/Treatment) : could not find function "Error". Also, apologies for not using a built-in dataset, but I couldn't find any that reasonably fit into a split-plot design.
I have data with several data aggregations (Year, Quarter, Month).
I try to leave a space between each Date aggregation in x-axis label.
I obtain this for the moment :
And I want to obtain :
Here my data :
My dataframe (dput) :
r8_plot = structure(list(DATE = c(2016L, 2017L, 2018L, 201701L, 201702L,
201703L, 201704L, 201801L, 201802L, 201803L, 201804L, 201801L,
201802L, 201803L, 201804L, 201805L, 201806L, 201807L, 201808L,
201809L, 201810L, 201811L, 201812L, 201844L, 201845L, 201846L,
201847L, 201848L, 201849L, 201850L), Var1 = c(6.64, 6.21, 6.53,
6.31, 6.01, 6.36, 6.17, 6.76, 6.37, 6.68, 6.27, 7.5, 6.49, 6.4,
6.54, 6.18, 6.37, 5.98, 6.37, 7.48, 6.6, 5.97, 6.25, 5.42, 6.18,
5.81, 6.46, 6.36, 6.05, 6.35), Var2 = c(2.38, 2.25, 2.36, 2.22,
2.52, 1.98, 2.27, 2.44, 2.31, 2.27, 2.41, 2.53, 2.25, 2.51, 2.35,
2.42, 2.19, 2.51, 1.91, 2.38, 2.34, 2.29, 2.68, 2.15, 1.89, 2.6,
2.52, 2.37, 2.97, 2.71), Var3 = c(4.26, 3.96, 4.17, 4.09, 3.5,
4.38, 3.9, 4.32, 4.06, 4.4, 3.86, 4.96, 4.23, 3.9, 4.19, 3.77,
4.18, 3.47, 4.46, 5.1, 4.26, 3.68, 3.57, 3.27, 4.29, 3.2, 3.95,
3.99, 3.09, 3.64), Var4 = c(35.84, 36.17, 36.08, 35.2, 41.86,
31.17, 36.76, 36.07, 36.27, 34.07, 38.43, 33.78, 34.76, 39.18,
35.95, 39.07, 34.35, 42.04, 29.91, 31.8, 35.48, 38.38, 42.86,
39.72, 30.53, 44.85, 38.94, 37.24, 48.98, 42.63), Var5 = c("Y",
"Y", "Y", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "M", "M", "M",
"M", "M", "M", "M", "M", "M", "M", "M", "M", "W", "W", "W", "W",
"W", "W", "W"), Var6 = structure(1:30, .Label = c("2016", "2017",
"2018", "Q1-2017", "Q2-2017", "Q3-2017", "Q4-2017", "Q1-2018",
"Q2-2018", "Q3-2018", "Q4-2018", "M01-2018", "M02-2018", "M03-2018",
"M04-2018", "M05-2018", "M06-2018", "M07-2018", "M08-2018", "M09-2018",
"M10-2018", "M11-2018", "M12-2018", "W44-2018", "W45-2018", "W46-2018",
"W47-2018", "W48-2018", "W49-2018", "W50-2018"), class = "factor"),
Var7 = c(7.1, 6.7, 6.7, 6.7, 6.7, 6.6, 6.6, 6.7, 6.7, 6.6,
6.6, 6.7, 6.7, 6.7, 6.7, 6.7, 6.6, 6.6, 6.6, 6.6, 6.6, 6.6,
6.6, 6.6, 6.6, 6.6, 6.6, 6.6, 6.6, 6.6)), .Names = c("DATE",
"Var1", "Var2", "Var3", "Var4", "Var5", "Var6", "Var7"), row.names = c(NA,
30L), class = "data.frame")
r8_plot$Var6 = factor(r8_plot$Var6, labels = unique(r8_plot$Var6), levels=unique(r8_plot$Var6))
library(plotly)
# Stacked bars (Var1, Var2) plus two line traces; Var4 is drawn against a
# secondary y axis on the right-hand side.
# The figure size is given to plot_ly(): passing height/width to layout() is
# deprecated in plotly and raises a warning. `barmode` is now set only once
# (it was passed twice to the same layout() call).
r8_plot %>%
  plot_ly(x = ~Var6, height = 750, width = 1000) %>%
  add_bars(
    y = ~Var1,
    name = "Var1",
    marker = list(color = "#00802b")
  ) %>%
  add_bars(
    y = ~Var2,
    name = "Var2",
    marker = list(color = "#ff9933")
  ) %>%
  add_lines(
    y = ~Var4,
    name = "Var4",
    yaxis = "y2",
    line = list(color = "#1a1aff")
  ) %>%
  add_lines(
    y = ~Var7,
    name = "Var7",
    yaxis = "y1"
  ) %>%
  layout(
    title = "Chart",
    barmode = "stack",
    xaxis = list(title = "DATE"),
    yaxis = list(title = "All quantity"),
    yaxis2 = list(overlaying = "y", side = "right"),
    hovermode = "closest",
    margin = list(b = 115)
  )
Thanks in advance
You can add NaN y-values to any data frame and Plotly will interrupt the plot at this position.
For example
library(plotly)
# Five points with a NaN row in the middle; the line is interrupted there.
data <- data.frame(
  x = c(1, 2, NaN, 3, 4),
  y = c(1, 2, NaN, 3, 4)
)
data %>%
  plot_ly(x = ~x) %>%
  add_lines(y = ~y)
will give you
For more complex data, like in the question, a function introducing the empty rows might be easier than doing it manually.
# Insert an all-NaN spacer row wherever the aggregation level changes.
#
# The aggregation level of a row is the first character of its Var6 label.
# Whenever that character differs from the previous row's, a row filled with
# NaN is inserted in front of the current row. The spacer's Var6 is then
# overwritten with a run of spaces that grows with every spacer, so each
# spacer gets a distinct x label.
#
# Args:
#   data: data frame with a Var6 column of labels. The example passes Var6 as
#         a character column; NOTE(review): a factor column would presumably
#         not accept the blank labels - confirm before using one.
#
# Returns:
#   `data` with the spacer rows inserted and row names renumbered from 1.
#   If no level change is found, `data` is returned unchanged.
split_by_date <- function(data) {
  # nrow() is correct for tibbles too, unlike length(data[, 1]).
  data_length <- nrow(data)
  blank_row <- rep(NaN, ncol(data))
  level_key <- substr(as.character(data$Var6), 1, 1)
  new_data <- data
  index <- 0  # number of spacer rows inserted so far
  # seq_len(n)[-1] is 2..n and is empty (rather than c(2, 1)) when n < 2.
  for (i in seq_len(data_length)[-1]) {
    if (level_key[i] != level_key[i - 1]) {
      # Rows already in place: i - 1 originals plus `index` spacers.
      # The parentheses matter: `1:i + index - 1` parses as
      # `(1:i) + index - 1`, which drops leading rows once index >= 2.
      head_rows <- seq_len(i + index - 1)
      new_data <- rbind.data.frame(
        new_data[head_rows, ],
        blank_row,
        data[i:data_length, ]
      )
      # The spacer sits at position i + index; give it a unique blank label.
      new_data$Var6[[index + i]] <- paste(replicate(index + 1, " "), collapse = " ")
      index <- index + 1
      rownames(new_data) <- seq_len(data_length + index)
    }
  }
  new_data
}
We just need to make sure that the x-value is always unique, i.e. simply concatenate an increasing number of spaces. Otherwise we'll just get a single break in the graph.
Also, another column x is introduced to help with plotting the x-values in the correct order.
r8_plot = structure(list(DATE = c(2016L, 2017L, 2018L, 201701L, 201702L, 201703L, 201704L, 201801L, 201802L, 201803L, 201804L, 201801L, 201802L, 201803L, 201804L, 201805L, 201806L, 201807L, 201808L, 201809L, 201810L, 201811L, 201812L, 201844L, 201845L, 201846L, 201847L, 201848L, 201849L, 201850L),
Var1 = c(6.64, 6.21, 6.53, 6.31, 6.01, 6.36, 6.17, 6.76, 6.37, 6.68, 6.27, 7.5, 6.49, 6.4, 6.54, 6.18, 6.37, 5.98, 6.37, 7.48, 6.6, 5.97, 6.25, 5.42, 6.18, 5.81, 6.46, 6.36, 6.05, 6.35),
Var2 = c(2.38, 2.25, 2.36, 2.22, 2.52, 1.98, 2.27, 2.44, 2.31, 2.27, 2.41, 2.53, 2.25, 2.51, 2.35, 2.42, 2.19, 2.51, 1.91, 2.38, 2.34, 2.29, 2.68, 2.15, 1.89, 2.6, 2.52, 2.37, 2.97, 2.71),
Var3 = c(4.26, 3.96, 4.17, 4.09, 3.5, 4.38, 3.9, 4.32, 4.06, 4.4, 3.86, 4.96, 4.23, 3.9, 4.19, 3.77, 4.18, 3.47, 4.46, 5.1, 4.26, 3.68, 3.57, 3.27, 4.29, 3.2, 3.95, 3.99, 3.09, 3.64),
Var4 = c(35.84, 36.17, 36.08, 35.2, 41.86, 31.17, 36.76, 36.07, 36.27, 34.07, 38.43, 33.78, 34.76, 39.18, 35.95, 39.07, 34.35, 42.04, 29.91, 31.8, 35.48, 38.38, 42.86, 39.72, 30.53, 44.85, 38.94, 37.24, 48.98, 42.63),
Var5 = c("Y", "Y", "Y", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "Q", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "W", "W", "W", "W", "W", "W", "W"),
Var6 = c("2016", "2017", "2018", "Q1-2017", "Q2-2017", "Q3-2017", "Q4-2017", "Q1-2018", "Q2-2018", "Q3-2018", "Q4-2018", "M01-2018", "M02-2018", "M03-2018","M04-2018", "M05-2018", "M06-2018", "M07-2018", "M08-2018", "M09-2018", "M10-2018", "M11-2018", "M12-2018", "W44-2018", "W45-2018", "W46-2018", "W47-2018", "W48-2018", "W49-2018", "W50-2018"),
Var7 = c(7.1, 6.7, 6.7, 6.7, 6.7, 6.6, 6.6, 6.7, 6.7, 6.6, 6.6, 6.7, 6.7, 6.7, 6.7, 6.7, 6.6, 6.6, 6.6, 6.6, 6.6, 6.6,6.6, 6.6, 6.6, 6.6, 6.6, 6.6, 6.6, 6.6)),
.Names = c("DATE", "Var1", "Var2", "Var3", "Var4", "Var5", "Var6", "Var7"), row.names = c(NA, 30L), class = "data.frame")
# Insert the spacer rows between the aggregation levels.
plot <- split_by_date(r8_plot)
# Helper factor for the x axis: the integer codes are the row positions and
# the labels are the Var6 strings, so the axis follows the row order.
# seq_along() replaces 1:length(), which misbehaves on empty input.
plot$x <- structure(seq_along(plot$Var6), .Label = plot$Var6, class = "factor")
# Stacked bars (Var1, Var2) plus two lines; Var4 uses the right-hand y axis.
plot %>% plot_ly(x = ~x, height = 750, width = 1000) %>%
  add_bars(y = ~Var1,
           marker = list(color = '#00802b'),
           name = "Var1") %>%
  add_bars(y = ~Var2,
           marker = list(color = '#ff9933'),
           name = "Var2") %>%
  add_lines(y = ~Var4,
            name = "Var4",
            yaxis = "y2",
            line = list(color = '#1a1aff')) %>%
  add_lines(y = ~Var7,
            name = "Var7",
            yaxis = "y1") %>%
  layout(barmode = "stack",
         # NOTE(review): this range presumably limits the initial view to
         # about the first ten categories - confirm that this is intended.
         xaxis = list(title = 'DATE', range = c(-0.1, 10)),
         yaxis = list(title = 'All quantity'),
         yaxis2 = list(overlaying = "y",
                       side = "right"),
         title ="Chart",
         hovermode = 'closest')
I have 4 columns: date & time, stage_duration, various_stages, Vehicle_ID. I want to plot date and time in mins on X-axis and id, stage_duration on Y-axis and fill by various stages on line or bar chart.
Something like this would be good:
Here is my data:
var_events time_date event_duration veh_id
LD 17-06-2018 13:25 6.52 B33
WL 17-06-2018 13:25 14.52 B31
TL 17-06-2018 13:26 0.32 B32
TE 17-06-2018 13:26 4.58 B13
UL 17-06-2018 13:26 3.45 B12
WT 17-06-2018 13:26 5.46 B25
UL 17-06-2018 13:26 1.56 B17
TL 17-06-2018 13:26 13.6 B33
SL 17-06-2018 13:26 0.05 B32
Here is a minimal example that creates the plot
# ggplot2 must be attached first: the block calls ggplot() and loads the
# `presidential` and `economics` datasets by name.
library(ggplot2)
#install.packages("dplyr")
library(dplyr)
# load data
data(presidential)
data(economics)
# events of interest
events <- presidential[-(1:3), ]
# add the calendar year of each observation to the economics data frame
economics$year <- as.numeric(format(economics$date, format = "%Y"))
# use dplyr to summarise data by year
economics_mean <- economics %>%
  group_by(year) %>%
  summarise(mean_unemployment = mean(unemploy))
# add president terms to the summarised data frame as a character column;
# the vector has 49 entries and must match the number of summarised years
president <- c(
  rep(NA, 14), rep("Reagan", 8), rep("Bush", 4),
  rep("Clinton", 8), rep("Bush", 8), rep("Obama", 7)
)
economics_mean$president <- president
# create ggplot
p <- ggplot(data = economics_mean, aes(x = year, y = mean_unemployment)) +
  geom_point(aes(color = president)) +
  geom_line(alpha = 1 / 3)
Update
This is the output:
structure(list(Event_stage = c("SE", "MN", "MN", "TE", "TE",
"TE", "TE", "TE", "TE", "TE", "TE", "WL", "TE", "TE", "SE", "TE",
"TE", "WL", "WT", "MN", "WL", "TE", "WL", "WL", "WT", "WL", "LD",
"WT", "WL", "WT", "WT", "TE", "WL", "LD", "WT", "LD", "MN", "TL",
"TE", "WL", "TL", "TL", "WT", "TE", "TE", "LD", "WT", "TL", "LD"),
event_date = structure(c(1529573704, 1529573710, 1529573713,
1529573724, 1529573855, 1529573874, 1529573880, 1529573895, 1529573906,
1529573918, 1529573925, 1529573931, 1529573931, 1529573941, 1529573947,
1529573969, 1529574006, 1529574054, 1529574088, 1529574114, 1529574120,
1529574123, 1529574134, 1529574137, 1529574148, 1529574163, 1529574164,
1529574148, 1529574169, 1529574170, 1529574178, 1529574188, 1529574189,
1529574196, 1529574178, 1529574188, 1529574203, 1529574213, 1529574214,
1529574214, 1529574215, 1529574227, 1529574231, 1529574242, 1529574244,
1529574245, 1529574248, 1529574260, 1529574262), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), stage_duration = c(3.78, 3.47, 2.78,
3.45, 3.32, 4.93, 4.23, 4.22, 3.85, 3.37, 5.88, 5.92, 3.97, 3.7,
NA, 4.08, 3.05, 0.57, 11.18, 12.08, 2.6, 3.3, 0.23, 0.85, 0.27,
0.25, 0.82, 10.42, 0.15, 0.43, 1.4, 0.25, 0.7, 0.52, 1.12, 0.45,
12.87, 12.18, 2.92, 0.57, 14.07, 12.72, 17.12, 4.13, 3.13, 0.25,
0.33, 18.98, 1.05), veh_id = c("B35", "B05", "B04", "B08", "B14",
"B13", "B04", "B17", "B41", "B05", "B26", "B08", "B35", "B19a",
"B10a", "B01a", "B28", "B14", "B14", "B18", "B05", "B37", "B04",
"B41", "B04", "B19a", "B04", "B17", "B35", "B13", "B35", "B02b",
"B28", "B13", "B19a", "B41", "B02b", "B04", "B15", "B01a", "B41",
"B13", "B28", "B27", "B33", "B19a", "B01a", "B19a", "B35")),
.Names = c("Event_stage", "event_date", "stage_duration", "veh_id"),
row.names = c(NA, -49L), class = c("tbl_df", "tbl", "data.frame"))
require(ggplot2)
require(dplyr)
df = structure(list(Event_stage = c("SE", "MN", "MN", "TE", "TE", "TE", "TE", "TE", "TE", "TE", "TE", "WL", "TE", "TE", "SE", "TE", "TE", "WL", "WT", "MN", "WL", "TE", "WL", "WL", "WT", "WL", "LD", "WT", "WL", "WT", "WT", "TE", "WL", "LD", "WT", "LD", "MN", "TL", "TE", "WL", "TL", "TL", "WT", "TE", "TE", "LD", "WT", "TL", "LD" ), event_date = structure(c(1529573704, 1529573710, 1529573713, 1529573724, 1529573855, 1529573874, 1529573880, 1529573895, 1529573906, 1529573918, 1529573925, 1529573931, 1529573931, 1529573941, 1529573947, 1529573969, 1529574006, 1529574054, 1529574088, 1529574114, 1529574120, 1529574123, 1529574134, 1529574137, 1529574148, 1529574163, 1529574164, 1529574148, 1529574169, 1529574170, 1529574178, 1529574188, 1529574189, 1529574196, 1529574178, 1529574188, 1529574203, 1529574213, 1529574214, 1529574214, 1529574215, 1529574227, 1529574231, 1529574242, 1529574244, 1529574245, 1529574248, 1529574260, 1529574262), class = c("POSIXct", "POSIXt"), tzone = "UTC"), stage_duration = c(3.78, 3.47, 2.78, 3.45, 3.32, 4.93, 4.23, 4.22, 3.85, 3.37, 5.88, 5.92, 3.97, 3.7, NA, 4.08, 3.05, 0.57, 11.18, 12.08, 2.6, 3.3, 0.23, 0.85, 0.27, 0.25, 0.82, 10.42, 0.15, 0.43, 1.4, 0.25, 0.7, 0.52, 1.12, 0.45, 12.87, 12.18, 2.92, 0.57, 14.07, 12.72, 17.12, 4.13, 3.13, 0.25, 0.33, 18.98, 1.05), veh_id = c("B35", "B05", "B04", "B08", "B14", "B13", "B04", "B17", "B41", "B05", "B26", "B08", "B35", "B19a", "B10a", "B01a", "B28", "B14", "B14", "B18", "B05", "B37", "B04", "B41", "B04", "B19a", "B04", "B17", "B35", "B13", "B35", "B02b", "B28", "B13", "B19a", "B41", "B02b", "B04", "B15", "B01a", "B41", "B13", "B28", "B27", "B33", "B19a", "B01a", "B19a", "B35")), .Names = c("Event_stage", "event_date", "stage_duration", "veh_id"), row.names = c(NA, -49L), class = c("tbl_df", "tbl", "data.frame"))
# Stage duration over time, one panel per vehicle: points coloured by event
# stage, joined by a semi-transparent line.
ggplot(df, mapping = aes(x = event_date, y = stage_duration)) +
  geom_point(mapping = aes(colour = Event_stage), size = 3) +
  geom_line(alpha = 0.5) +
  facet_wrap(~veh_id, nrow = 4) +
  labs(x = "Event date", y = "Stage duration")