I have character strings that I want to convert to tables. The identifier in each row can have white spaces and I need them removed without also removing spaces between the numbers. Is it possible to use a regular expression to achieve this?
For example, the data would look like this:
A B C 5.65 7.8
DC 5.65 7.8
D AB 7.9 12.2
D AB C 7.9 1.2
A BC 13.88 2.4
AB C 7.9 12.2
And I want to get to this:
ABC 5.65 7.8
DC 5.65 7.8
DAB 7.9 12.2
DABC 7.9 1.2
ABC 13.88 2.4
ABC 7.9 12.2
EDIT: As requested, this is an example of the data type and the form in which I receive it. This has 16 rows, each with 6 columns of data, but the first column is an alphabetic identifier.
# Data as I receive it.
data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
"Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Required format
data2 <- c("Aa", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86",
"DEt", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "GfHI",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "JK",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "LM",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "OP", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "QstR", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "VWC", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Zabi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
df <- data.frame(matrix(data2, ncol=7, byrow=T))
To do as you request within your R environment, one approach is to convert the vector to a string, apply a regular expression filter to the string, then convert the string back to a vector.
See details below, hopefully this points you in the right direction.
Solution
data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
"Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Use stringi base regular expression engine
require(stringi)
# Convert the vector data to be a string sequence - so we can manipulate as text
data1 <- toString(data)
# Now we can apply the regular expression substitution to the data (formatted as a string...
# Here we do a:
#
# (?<!\d) - Negative look behind to prevent a digit.
# , - A literal combination of quotes, comma and space. We drop the ", " in conversion to string...
# (?!\d) - Negative look ahead to prevent a digit.
#
data3 = stri_replace_all_regex(str = data1, pattern = '(?<!\\d), (?!\\d)', replacement = '')
# OK, check the string data...
data3
# Now we convert the string back to be a vector...
newData = strsplit(data3, " ")[[1]]
newData
# Now we convert to a dataframe...
df <- data.frame(matrix(newData, ncol=7, byrow=T))
df
# Done
Output
> data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
+ "B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
+ "Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
+ "2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
+ "2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
+ "2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
+ "1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
+ "1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
+ "2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
+ "2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
+ "2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
+ "40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
+ "81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
+ "9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
+ "82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
+ "9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
+ "84.20", "9.6", "1.69")
>
> # Use stringi base regular expression engine
> require(stringi)
>
> # Convert the vector data to be a string sequence - so we can manipulate as text
> data1 <- toString(data)
>
> # Now we can apply the regular expression substitution to the data (formatted as a string...
> # Here we do a:
> #
> # (?<!\d) - Negative look behind to prevent a digit.
> # , - A literal combination of quotes, comma and space. We drop the ", " in conversion to string...
> # (?!\d) - Negative look ahead to prevent a digit.
> #
> data3 = stri_replace_all_regex(str = data1, pattern = '(?<!\\d), (?!\\d)', replacement = '')
> # OK, check the string data...
> data3
[1] "Aa, 2.07, 2.35, 39.00, 82.20, 8.8, 3.80, B, 2.26, 2.25, 40.00, 80.80, 8.1, 1.86, DEt, 2.07, 2.22, 41.00, 83.80, 8.8, 3.87, F, 2.05, 2.15, 43.00, 82.20, 8.4, 3.11, Bc, 2.08, 2.12, 48.00, 82.60, 8.3, 2.47, GfHI, 2.08, 2.10, 46.00, 82.20, 8.1, 2.90, JK, 1.95, 2.08, 38.00, 83.40, 8.7, 1.63, LM, 1.89, 2.07, 45.00, 83.80, 9.0, 1.84, N, 2.06, 2.05, 41.00, 80.60, 9.0, 4.09, OP, 1.86, 2.04, 48.00, 81.60, 8.6, 2.60, QstR, 1.95, 2.03, 44.00, 82.80, 8.8, 1.40, S, 2.03, 2.02, 40.00, 81.40, 8.2, 1.74, T, 1.95, 2.01, 43.00, 81.80, 9.0, 2.30, Unh, 1.96, 2.00, 44.00, 82.60, 9.2, 2.40, VWC, 1.98, 1.97, 40.00, 82.00, 8.1, 1.15, Yu, 1.90, 1.96, 41.00, 82.80, 9.6, 2.08, Zabi, 1.90, 1.95, 42.00, 84.20, 9.6, 1.69"
>
> # Now we convert the string back to be a vector...
> newData = strsplit(data3, " ")[[1]]
> newData
[1] "Aa," "2.07," "2.35," "39.00," "82.20," "8.8," "3.80," "B," "2.26," "2.25," "40.00," "80.80,"
[13] "8.1," "1.86," "DEt," "2.07," "2.22," "41.00," "83.80," "8.8," "3.87," "F," "2.05," "2.15,"
[25] "43.00," "82.20," "8.4," "3.11," "Bc," "2.08," "2.12," "48.00," "82.60," "8.3," "2.47," "GfHI,"
[37] "2.08," "2.10," "46.00," "82.20," "8.1," "2.90," "JK," "1.95," "2.08," "38.00," "83.40," "8.7,"
[49] "1.63," "LM," "1.89," "2.07," "45.00," "83.80," "9.0," "1.84," "N," "2.06," "2.05," "41.00,"
[61] "80.60," "9.0," "4.09," "OP," "1.86," "2.04," "48.00," "81.60," "8.6," "2.60," "QstR," "1.95,"
[73] "2.03," "44.00," "82.80," "8.8," "1.40," "S," "2.03," "2.02," "40.00," "81.40," "8.2," "1.74,"
[85] "T," "1.95," "2.01," "43.00," "81.80," "9.0," "2.30," "Unh," "1.96," "2.00," "44.00," "82.60,"
[97] "9.2," "2.40," "VWC," "1.98," "1.97," "40.00," "82.00," "8.1," "1.15," "Yu," "1.90," "1.96,"
[109] "41.00," "82.80," "9.6," "2.08," "Zabi," "1.90," "1.95," "42.00," "84.20," "9.6," "1.69"
>
> # Now we convert to a dataframe...
> df <- data.frame(matrix(newData, ncol=7, byrow=T))
> df
X1 X2 X3 X4 X5 X6 X7
1 Aa, 2.07, 2.35, 39.00, 82.20, 8.8, 3.80,
2 B, 2.26, 2.25, 40.00, 80.80, 8.1, 1.86,
3 DEt, 2.07, 2.22, 41.00, 83.80, 8.8, 3.87,
4 F, 2.05, 2.15, 43.00, 82.20, 8.4, 3.11,
5 Bc, 2.08, 2.12, 48.00, 82.60, 8.3, 2.47,
6 GfHI, 2.08, 2.10, 46.00, 82.20, 8.1, 2.90,
7 JK, 1.95, 2.08, 38.00, 83.40, 8.7, 1.63,
8 LM, 1.89, 2.07, 45.00, 83.80, 9.0, 1.84,
9 N, 2.06, 2.05, 41.00, 80.60, 9.0, 4.09,
10 OP, 1.86, 2.04, 48.00, 81.60, 8.6, 2.60,
11 QstR, 1.95, 2.03, 44.00, 82.80, 8.8, 1.40,
12 S, 2.03, 2.02, 40.00, 81.40, 8.2, 1.74,
13 T, 1.95, 2.01, 43.00, 81.80, 9.0, 2.30,
14 Unh, 1.96, 2.00, 44.00, 82.60, 9.2, 2.40,
15 VWC, 1.98, 1.97, 40.00, 82.00, 8.1, 1.15,
16 Yu, 1.90, 1.96, 41.00, 82.80, 9.6, 2.08,
17 Zabi, 1.90, 1.95, 42.00, 84.20, 9.6, 1.69
> # Done
I made a barplot with error bars and labels written on the bars.
My problem is: I want the labels to appear on the bars and also next to the error bars. That is, I don't want labels and error bars to overlap.
An example with my code:
i <- data.frame(
nbr =c(15.18 ,11.53 ,13.37 ,9.2, 10.9, 12.23 ,9.53, 9.81, 7.86, 12.79,
22.03 ,17.64 ,18.1, 16.78 ,17.53 ,16.97 ,17.76 ,18.35 ,12.82 ,20.91,
22.09 ,19.18 ,17.54 ,18.45 ,19.83 ,16.99 ,19.69 ,19.45 ,13.07 ,21.41,
12.13 ,9.76, 10.79 ,10.74 ,12.43 ,9.65, 12.18 ,11.63 ,6.74, 12.31,
17.5, 14.75 ,15.2, 13.89 ,15.24 ,17.43 ,15.22 ,14.04,9.49, 15.86,
8.09, 5.86, 6.68, 7.34, 8.01, 6.35, 8.4, 7.4, 3.88, 6.92 ),
SD = c(4.46, 4.19, 2.27, 2.19, 5.10, 7.25, 8.42, 6.47, 6.04, 7.48, 6.38, 6.05, 3.58, 3.85,
6.94, 6.87, 6.32, 4.28, 4.10, 7.34, 7.46, 6.62, 4.28, 5.24, 8.00, 8.10, 7.73, 5.18,
5.53, 7.96, 7.46, 7.05, 4.47, 4.73, 8.15, 6.95, 5.88, 3.20, 4.01, 7.34, 7.24, 6.98,
5.98, 4.53, 4.22, 7.21, 4.02, 4.30, 1.96, 2.11, 4.98, 7.16, 8.45, 6.39, 6.20, 7.03,
6.10, 6.42, 3.77, 3.53),
x2=rep(c("a", "b", "c", "d", "e", "f", "g",
"h", "i", "j"),6),
s = c(rep(c(rep(c("3"),10),
rep(c("4"),10),
rep(c("5"),10),
rep(c("6"),10),
rep(c("7"),10),
rep(c("8"),10)),1)))
ii <- i[order(i$s, i$nbr ), ]
sn <- factor(x = 1:60, labels = ii$x2)
ii$sn <- sn
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
dummy2 <- data.frame(s = levels(i$s)[-1], Z = c( 4,16,16,8,4))
dummy2$s <- factor(dummy2$s)
ggplot(ii, aes(reorder_within(sn, nbr, s), nbr,
label =x2)) +
geom_bar(stat = 'identity') +
geom_text(aes(y = 0,fontface=2), angle = 90, hjust = -.05, size = 4)+
scale_x_reordered() +
facet_wrap(.~ s, scales = "free_x", ncol=2)+
#geom_text(aes(label=nbr), vjust=1.6, color="white", size=3.5)+
theme(axis.text.x = element_blank(),
axis.title=element_text(size=16),
axis.text=element_text(face = "bold"),
strip.text.x = element_text(size = 14,face="bold")
)+ geom_errorbar(aes(reorder_within(sn, nbr, s),ymin=nbr-SD, ymax=nbr+SD), width=.2, position=position_dodge(.9))
Example of expected parcel:
I want all the labels to be written next to the error bars on the bars.
Thanks for your help !
I found this solution and wanted to share it with you:
geom_text(aes(y = 0,fontface=2), angle = 90, vjust = -1, hjust = -.05, size = 4)
R::How to Plot Single Horizontal Bar Showing different stages on Continous Time-series Data from startdate to present date and for navigating time a horizontal scrollbar in R?
This is my data:
var_events time_date event_duration veh_id
LD 17-06-2018 13:25 6.52 B33
WL 17-06-2018 13:25 14.52 B31
TL 17-06-2018 13:26 0.32 B32
TE 17-06-2018 13:26 4.58 B13
UL 17-06-2018 13:26 3.45 B12
WT 17-06-2018 13:26 5.46 B25
UL 17-06-2018 13:26 1.56 B17
TL 17-06-2018 13:26 13.6 B33
SL 17-06-2018 13:26 0.05 B32
Here is a Example of line chart of Previous code:
require(ggplot2)
require(dplyr)
df = structure(list(Event_stage = c("SE", "MN", "MN", "TE", "TE", "TE", "TE", "TE", "TE", "TE", "TE", "WL", "TE", "TE", "SE", "TE", "TE", "WL", "WT", "MN", "WL", "TE", "WL", "WL", "WT", "WL", "LD", "WT", "WL", "WT", "WT", "TE", "WL", "LD", "WT", "LD", "MN", "TL", "TE", "WL", "TL", "TL", "WT", "TE", "TE", "LD", "WT", "TL", "LD" ), event_date = structure(c(1529573704, 1529573710, 1529573713, 1529573724, 1529573855, 1529573874, 1529573880, 1529573895, 1529573906, 1529573918, 1529573925, 1529573931, 1529573931, 1529573941, 1529573947, 1529573969, 1529574006, 1529574054, 1529574088, 1529574114, 1529574120, 1529574123, 1529574134, 1529574137, 1529574148, 1529574163, 1529574164, 1529574148, 1529574169, 1529574170, 1529574178, 1529574188, 1529574189, 1529574196, 1529574178, 1529574188, 1529574203, 1529574213, 1529574214, 1529574214, 1529574215, 1529574227, 1529574231, 1529574242, 1529574244, 1529574245, 1529574248, 1529574260, 1529574262), class = c("POSIXct", "POSIXt"), tzone = "UTC"), stage_duration = c(3.78, 3.47, 2.78, 3.45, 3.32, 4.93, 4.23, 4.22, 3.85, 3.37, 5.88, 5.92, 3.97, 3.7, NA, 4.08, 3.05, 0.57, 11.18, 12.08, 2.6, 3.3, 0.23, 0.85, 0.27, 0.25, 0.82, 10.42, 0.15, 0.43, 1.4, 0.25, 0.7, 0.52, 1.12, 0.45, 12.87, 12.18, 2.92, 0.57, 14.07, 12.72, 17.12, 4.13, 3.13, 0.25, 0.33, 18.98, 1.05), veh_id = c("B35", "B05", "B04", "B08", "B14", "B13", "B04", "B17", "B41", "B05", "B26", "B08", "B35", "B19a", "B10a", "B01a", "B28", "B14", "B14", "B18", "B05", "B37", "B04", "B41", "B04", "B19a", "B04", "B17", "B35", "B13", "B35", "B02b", "B28", "B13", "B19a", "B41", "B02b", "B04", "B15", "B01a", "B41", "B13", "B28", "B27", "B33", "B19a", "B01a", "B19a", "B35")), .Names = c("Event_stage", "event_date", "stage_duration", "veh_id"), row.names = c(NA, -49L), class = c("tbl_df", "tbl", "data.frame"))
# create ggplot
ggplot(data = df %>% filter(veh_id == "B35"), aes(x = event_date,
y = stage_duration)) +
geom_point(aes(color = Event_stage), size= 3) +
geom_line(alpha = 1/2)+
labs(x = "Event date", y = "Stage duration")
enter image description here
This is Sample bar plot, Everything same as in above line chart but instead of line with spikes a Horizontal line or I just want a single bar which is interactive with a Slider/Scrollbar to navigate time ::
enter image description here
Something resembling this plot,But only a Single Horizontal bar with a scrollbar from start-time to present-time::
enter image description here
df %>% filter(veh_id == "B35") %>%
ggplot(
aes(
x = event_date,
y = stage_duration)
) +
geom_bar(stat = "identity") +
labs(x = "Event date", y = "Stage duration") +
coord_flip()
Appreciate this may have been asked before but I have not found a clear solution to work over a data frame.
I want to run a rolling linear regression over a look back of 5 days. (small so can illustrate here)
So far I am trying:
rollingbeta <- rollapply(df,
width=5,
FUN = function(Z)
{
t = lm(formula=y_Close ~ x_Close+0, data = as.data.frame(Z));
return(t$coef)[1]
},
by.column=FALSE, align="right",fill = NA)
head(rollingbeta,100)
However, I expect to have the beta for the rolling lookback window. Instead I have and output with 10 columns.
> NCOL(rollingbeta)
[1] 10
Can anyone assist?
Here is dummy data (save to .txt and read)
df <- read.table("your_dir\df.txt",header=TRUE, sep="", stringsAsFactors=FALSE)
Date open.x high.x low.x x_Close volume.x open.y high.y low.y y_Close volume.y x.y.cor
1451 2010-01-04 57.32 58.13 57.32 57.85 442900 6.61 6.8400 6.61 6.83 833100 NA
1452 2010-01-05 57.90 58.33 57.54 58.20 436900 6.82 7.1200 6.80 7.12 904500 NA
1453 2010-01-06 58.20 58.56 58.01 58.42 850600 7.05 7.3800 7.05 7.27 759800 NA
1454 2010-01-07 58.31 58.41 57.14 57.90 463600 7.24 7.3000 7.06 7.11 557800 NA
1455 2010-01-08 57.45 58.62 57.45 58.47 206500 7.08 7.3500 6.95 7.29 588100 NA
1456 2010-01-11 58.79 59.00 57.22 57.73 331900 7.38 7.4500 7.17 7.22 450500 NA
1457 2010-01-12 57.20 57.21 56.15 56.34 428500 7.15 7.1900 6.87 7.00 694700 NA
1458 2010-01-13 56.32 56.66 54.83 56.56 577500 7.05 7.1700 6.98 7.15 528800 NA
1459 2010-01-14 56.51 57.05 55.37 55.53 368100 7.08 7.1701 7.08 7.11 279900 NA
1460 2010-01-15 56.59 56.59 55.19 55.84 417900 7.03 7.0500 6.95 7.03 407600 NA
The output should for the first rolling linear regression should be:
NA NA NA NA NA 0.1229065
A faster alternative than wibeasley's answer is to use the rollRegres package as follows
ds <- structure(list(Date = structure(
c(14613, 14614, 14615, 14616, 14617, 14620, 14621, 14622, 14623, 14624), class = "Date"),
open.x = c(57.32, 57.9, 58.2, 58.31, 57.45, 58.79, 57.2, 56.32, 56.51, 56.59),
high.x = c(58.13, 58.33, 58.56, 58.41, 58.62, 59, 57.21, 56.66, 57.05, 56.59),
low.x = c(57.32, 57.54, 58.01, 57.14, 57.45, 57.22, 56.15, 54.83, 55.37, 55.19),
x_Close = c(57.85, 58.2, 58.42, 57.9, 58.47, 57.73, 56.34, 56.56, 55.53, 55.84),
volume.x = c(442900L, 436900L, 850600L, 463600L, 206500L, 331900L, 428500L, 577500L, 368100L, 417900L),
open.y = c(6.61, 6.82, 7.05, 7.24, 7.08, 7.38, 7.15, 7.05, 7.08, 7.03),
high.y = c(6.84, 7.12, 7.38, 7.3, 7.35, 7.45, 7.19, 7.17, 7.1701, 7.05),
low.y = c(6.61, 6.8, 7.05, 7.06, 6.95, 7.17, 6.87, 6.98, 7.08, 6.95),
y_Close = c(6.83, 7.12, 7.27, 7.11, 7.29, 7.22, 7, 7.15, 7.11, 7.03),
volume.y = c(833100L, 904500L, 759800L, 557800L, 588100L, 450500L, 694700L, 528800L, 279900L, 407600L)),
row.names = c(NA, -10L), class = "data.frame")
# we get the same
library(roll)
library(rollRegres)
X <- as.matrix(ds$x_Close)
Y <- ds$y_Close
Ymat <- as.matrix(Y)
all.equal(
roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L)$coefficients[, 2],
drop(roll_regres.fit(x = X, y = Y, width = 5L)$coefs),
check.attributes = FALSE)
#R [1] TRUE
You can also fit the model with a formula as with lm using the roll_regres function
all.equal(
roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L)$coefficients[, 2],
drop(roll_regres(y_Close ~ x_Close - 1, ds, width = 5L)$coefs),
check.attributes = FALSE)
#R [1] TRUE
Here is a benchmark of the computation speed
# We add a few more observation to get an interesting example
set.seed(1)
n <- 250 * 5 # 5 years of trading data
X <- as.matrix(rnorm(n))
Y <- rnorm(n)
Ymat <- as.matrix(Y)
microbenchmark::microbenchmark(
roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L),
roll_regres.fit(x = X, y = Y, width = 5L),
times = 1e3)
#R Unit: microseconds
#R expr min lq mean median uq max neval
#R roll_lm(x = X, y = Ymat, intercept = FALSE, width = 5L) 663.7 739.9 834.2 777.1 860.2 3972.3 1000
#R roll_regres.fit(x = X, y = Y, width = 5L) 186.9 204.6 237.4 224.8 248.3 546.4 1000
Consider using the roll package.
library(magrittr); requireNamespace("roll")
ds <- readr::read_csv(
" Date, open.x, high.x, low.x, x_Close, volume.x, open.y, high.y, low.y, y_Close, volume.y
2010-01-04, 57.32, 58.13, 57.32, 57.85, 442900, 6.61, 6.8400, 6.61, 6.83, 833100
2010-01-05, 57.90, 58.33, 57.54, 58.20, 436900, 6.82, 7.1200, 6.80, 7.12, 904500
2010-01-06, 58.20, 58.56, 58.01, 58.42, 850600, 7.05, 7.3800, 7.05, 7.27, 759800
2010-01-07, 58.31, 58.41, 57.14, 57.90, 463600, 7.24, 7.3000, 7.06, 7.11, 557800
2010-01-08, 57.45, 58.62, 57.45, 58.47, 206500, 7.08, 7.3500, 6.95, 7.29, 588100
2010-01-11, 58.79, 59.00, 57.22, 57.73, 331900, 7.38, 7.4500, 7.17, 7.22, 450500
2010-01-12, 57.20, 57.21, 56.15, 56.34, 428500, 7.15, 7.1900, 6.87, 7.00, 694700
2010-01-13, 56.32, 56.66, 54.83, 56.56, 577500, 7.05, 7.1700, 6.98, 7.15, 528800
2010-01-14, 56.51, 57.05, 55.37, 55.53, 368100, 7.08, 7.1701, 7.08, 7.11, 279900
2010-01-15, 56.59, 56.59, 55.19, 55.84, 417900, 7.03, 7.0500, 6.95, 7.03, 407600"
)
runs <- roll::roll_lm(
x = as.matrix(ds$x_Close),
y = as.matrix(ds$y_Close),
width = 5,
intercept = FALSE
)
# Nested in a named-column, within a matrix, within a list.
ds$beta <- runs$coefficients[, "x1"]
ds$beta
# [1] NA NA NA NA 0.1224813
# [6] 0.1238653 0.1242478 0.1246279 0.1256553 0.1259121
Double-check the alignment of the variables in your dataset. x_Close is around 50, while y_Close is around 7. That might explain the small disparity between the expected 0.1229065 and the 0.1224813 value above.