I have the following table:
# A tibble: 40 x 5
# Groups: stock [1]
stock date mkt_cap week returns
<chr> <date> <dbl> <int> <dbl>
1 A 2019-03-04 10522834. NA NA
2 A 2019-03-05 11659707. NA 9.70
3 A 2019-03-06 11464531. NA -2.25
4 A 2019-03-07 12217241. NA 5.80
5 A 2019-03-08 11619351. 1 -5.57
6 A 2019-03-09 11578687. NA -0.899
7 A 2019-03-10 11658368. NA 0.141
8 A 2019-03-11 12722921. NA 8.20
9 A 2019-03-12 15429934. NA 18.8
10 A 2019-03-13 16801600. NA 7.98
11 A 2019-03-14 17898334. NA 5.79
12 A 2019-03-15 18492686. 2 2.74
13 A 2019-03-16 20686683. NA 10.7
14 A 2019-03-17 22299970. NA 6.98
15 A 2019-03-18 22924182. NA 2.24
16 A 2019-03-19 24174351. NA 4.79
17 A 2019-03-20 24661467. NA 1.48
18 A 2019-03-21 23351810. NA -5.97
19 A 2019-03-22 27826601. 3 17.0
20 A 2019-03-23 30670482. NA 9.22
21 A 2019-03-24 32802772. NA 6.21
22 A 2019-03-25 31778387. NA -3.68
23 A 2019-03-26 33237006. NA 3.99
24 A 2019-03-27 34971479. NA 4.59
25 A 2019-03-28 36774005. NA 4.53
26 A 2019-03-29 37594815. 4 1.71
27 A 2019-03-30 38321816. NA 1.42
28 A 2019-03-31 35167070. NA -9.08
29 A 2019-04-01 35625396. NA 0.808
30 A 2019-04-02 35764747. NA -0.0940
31 A 2019-04-03 28316242. NA -23.8
32 A 2019-04-04 26124803. NA -8.53
33 A 2019-04-05 30390295. 5 14.6
34 A 2019-04-06 28256485. NA -7.76
35 A 2019-04-07 29807837. NA 4.87
36 A 2019-04-08 30970364. NA 3.36
37 A 2019-04-09 30470093. NA -2.10
38 A 2019-04-10 30860276. NA 0.806
39 A 2019-04-11 27946472. NA -10.4
40 A 2019-04-12 27662766. 6 -1.48
Over this table, I want to run a rolling regression where the rolling regression contains the past month of data. I want to run these rolling regressions over the weeks. That is, over week==1, week ==2 etc., where we use the past month of data. The regression should be lm(return~mkt_cap). I have tried a number of things using the slide_period() function, however, this did not work out for me. For example, I have tried to run
tbl.data %>% group_by(stock, week) %>% slide_period(date, date, "month", ~.x, .before = 1). There are some gaps in my data, therefore I prefer a solution that considers the date.
Could someone help me out? Kind regards.
I would use a tidyverse rowwise approach.
Not clear to me is how models should be created by week and go back to the last month. In the approach below I calculate max_date per week and from this I go back 30 days.
# setup
library(tidyverse)
library(lubridate)
dat <- tribble(~stock, ~date, ~mkt_cap, ~week, ~returns,
"A", "2019-03-04", 10522834., NA, NA,
"A", "2019-03-05", 11659707., NA, 9.70,
"A", "2019-03-06", 11464531., NA, -2.25,
"A", "2019-03-07", 12217241., NA, 5.80,
"A", "2019-03-08", 11619351., 1, -5.57,
"A", "2019-03-09", 11578687., NA, -0.899,
"A", "2019-03-10", 11658368., NA, 0.141,
"A", "2019-03-11", 12722921., NA, 8.20,
"A", "2019-03-12", 15429934., NA, 18.8,
"A", "2019-03-13", 16801600., NA, 7.98,
"A", "2019-03-14", 17898334., NA, 5.79,
"A", "2019-03-15", 18492686., 2, 2.74,
"A", "2019-03-16", 20686683., NA, 10.7,
"A", "2019-03-17", 22299970., NA, 6.98,
"A", "2019-03-18", 22924182., NA, 2.24,
"A", "2019-03-19", 24174351., NA, 4.79,
"A", "2019-03-20", 24661467., NA, 1.48,
"A", "2019-03-21", 23351810., NA, -5.97,
"A", "2019-03-22", 27826601., 3, 17.0,
"A", "2019-03-23", 30670482., NA, 9.22,
"A", "2019-03-24", 32802772., NA, 6.21,
"A", "2019-03-25", 31778387., NA, -3.68,
"A", "2019-03-26", 33237006., NA, 3.99,
"A", "2019-03-27", 34971479., NA, 4.59,
"A", "2019-03-28", 36774005., NA, 4.53,
"A", "2019-03-29", 37594815., 4, 1.71,
"A", "2019-03-30", 38321816., NA, 1.42,
"A", "2019-03-31", 35167070., NA, -9.08,
"A", "2019-04-01", 35625396., NA, 0.808,
"A", "2019-04-02", 35764747., NA, -0.0940,
"A", "2019-04-03", 28316242., NA, -23.8,
"A", "2019-04-04", 26124803., NA, -8.53,
"A", "2019-04-05", 30390295., 5, 14.6,
"A", "2019-04-06", 28256485., NA, -7.76,
"A", "2019-04-07", 29807837., NA, 4.87,
"A", "2019-04-08", 30970364., NA, 3.36,
"A", "2019-04-09", 30470093., NA, -2.10,
"A", "2019-04-10", 30860276., NA, 0.806,
"A", "2019-04-11", 27946472., NA, -10.4,
"A", "2019-04-12", 27662766., 6, -1.48) %>%
mutate(date = as.Date(date)) %>%
fill(week, .direction = "up")
# summarised data.frame by week with min and max date
dat2 <- dat %>%
group_by(week) %>%
summarise(max_date = max(date),
min_date = max_date %m-% months(1))
#> `summarise()` ungrouping output (override with `.groups` argument)
# create the models
dat3 <- dat2 %>%
rowwise() %>%
mutate(mod = list(lm(returns ~ mkt_cap,
data = filter(dat,
date <= .env$max_date,
date >= .env$min_date))))
# get the relevant informationen per week
dat3 %>%
mutate(res = list(broom::tidy(mod)),
broom::glance(mod)) %>%
select(week,
res,
adj.r.squared,
mod_p.value = p.value,
nobs) %>%
unnest(res) %>%
filter(term != "(Intercept)")
#> # A tibble: 6 x 9
#> week term estimate std.error statistic p.value adj.r.squared mod_p.value
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 mkt_~ 1.01e-5 1.34e-5 0.756 0.529 -0.167 0.529
#> 2 2 mkt_~ 9.26e-7 7.45e-7 1.24 0.245 0.0520 0.245
#> 3 3 mkt_~ 2.56e-7 2.97e-7 0.864 0.400 -0.0152 0.400
#> 4 4 mkt_~ 2.00e-8 1.42e-7 0.141 0.889 -0.0426 0.889
#> 5 5 mkt_~ -1.18e-7 1.61e-7 -0.736 0.467 -0.0150 0.467
#> 6 6 mkt_~ -3.23e-7 2.37e-7 -1.37 0.182 0.0271 0.182
#> # ... with 1 more variable: nobs <int>
Created on 2021-04-27 by the reprex package (v0.3.0)
Update
This approach can be easily expanded when working with more than one stock:
# lets append the same data and change stock to "B":
dat <- dat %>%
bind_rows({mutate(., stock = "B")})
# summarised data.frame by week and group with min and max date
dat2 <- dat %>%
group_by(stock, week) %>%
summarise(max_date = max(date),
min_date = max_date %m-% months(1))
#> `summarise()` has grouped output by 'stock'. You can override using the `.groups` argument.
# create the models, and this time also filer for .env$stock
dat3 <- dat2 %>%
rowwise() %>%
mutate(mod = list(lm(returns ~ mkt_cap,
data = filter(dat,
stock == .env$stock,
date <= .env$max_date,
date >= .env$min_date))))
# get the relevant informationen per week (this stays the same!)
dat3 %>%
mutate(res = list(broom::tidy(mod)),
broom::glance(mod)) %>%
select(week,
res,
adj.r.squared,
mod_p.value = p.value,
nobs) %>%
unnest(res) %>%
filter(term != "(Intercept)")
#> Adding missing grouping variables: `stock`
#> # A tibble: 12 x 10
#> # Groups: stock [2]
#> stock week term estimate std.error statistic p.value adj.r.squared
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 mkt_cap 0.0000101 0.0000134 0.756 0.529 -0.167
#> 2 A 2 mkt_cap 0.000000926 0.000000745 1.24 0.245 0.0520
#> 3 A 3 mkt_cap 0.000000256 0.000000297 0.864 0.400 -0.0152
#> 4 A 4 mkt_cap 0.0000000200 0.000000142 0.141 0.889 -0.0426
#> 5 A 5 mkt_cap -0.000000118 0.000000161 -0.736 0.467 -0.0150
#> 6 A 6 mkt_cap -0.000000323 0.000000237 -1.37 0.182 0.0271
#> 7 B 1 mkt_cap 0.0000101 0.0000134 0.756 0.529 -0.167
#> 8 B 2 mkt_cap 0.000000926 0.000000745 1.24 0.245 0.0520
#> 9 B 3 mkt_cap 0.000000256 0.000000297 0.864 0.400 -0.0152
#> 10 B 4 mkt_cap 0.0000000200 0.000000142 0.141 0.889 -0.0426
#> 11 B 5 mkt_cap -0.000000118 0.000000161 -0.736 0.467 -0.0150
#> 12 B 6 mkt_cap -0.000000323 0.000000237 -1.37 0.182 0.0271
#> # … with 2 more variables: mod_p.value <dbl>, nobs <int>
Created on 2021-04-27 by the reprex package (v0.3.0)
An ugly Base R solution (assuming you just want the predicted values returned):
# Allocate some memory such that each stock in data.frame
# can become an element in a list: df_list => empty list:
df_list <- vector("list", length(unique(df$stock)))
# Split the data.frame into the list: df_list => list of data.frames:
df_list <- with(df, split(df, stock))
# Number of weeks to consider in rolling regression in this case 4,
# approximating a month: n_weeks => integer scalar:
n_weeks <- 4
# For each stock in the list: nested lists => stdout(console)
lapply(df_list, function(x){
# Clean the week vector, filling NAs with values:
# week => integer vector
x$week <- with(x, rev(na.omit(rev(week))[cumsum(!is.na(rev(week)))]))
# Impute the first return value if it is missing:
x$returns[1] <- with(x,
ifelse(is.na(returns[1]), returns[which.min(!(is.na(returns)))],
returns[1]
)
)
# Interpolate the return using the previous value:
# returns => numeric vector
x$returns <- with(x, na.omit(returns)[cumsum(!is.na(returns))])
# For each week:
y <- lapply(unique(x$week), function(z){
# Calculate the range for the regression:
rng <- if(z - n_weeks <= 0){
seq_len(z)
}else{
seq(from = (z - n_weeks), to = z, by = 1)
}
# Subset the data: sbst => data.frame
sbst <- x[x$week %in% rng,]
# Calculate the regression:
predict(lm(returns ~ mkt_cap, data = sbst))
}
)
# Return the list of regressions:
y
}
)
Data:
df <- structure(list(stock = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A"), date = structure(17959:17998, class = c("IDate",
"Date")), mkt_cap = c(10522834, 11659707, 11464531, 12217241,
11619351, 11578687, 11658368, 12722921, 15429934, 16801600, 17898334,
18492686, 20686683, 22299970, 22924182, 24174351, 24661467, 23351810,
27826601, 30670482, 32802772, 31778387, 33237006, 34971479, 36774005,
37594815, 38321816, 35167070, 35625396, 35764747, 28316242, 26124803,
30390295, 28256485, 29807837, 30970364, 30470093, 30860276, 27946472,
27662766), week = c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA,
2L, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, NA, 4L, NA,
NA, NA, NA, NA, NA, 5L, NA, NA, NA, NA, NA, NA, 6L), returns = c(NA,
9.7, -2.25, 5.8, -5.57, -0.899, 0.141, 8.2, 18.8, 7.98, 5.79,
2.74, 10.7, 6.98, 2.24, 4.79, 1.48, -5.97, 17, 9.22, 6.21, -3.68,
3.99, 4.59, 4.53, 1.71, 1.42, -9.08, 0.808, -0.094, -23.8, -8.53,
14.6, -7.76, 4.87, 3.36, -2.1, 0.806, -10.4, -1.48)), class = "data.frame", row.names = c(NA,
-40L))
Does slide_index() from the slider package do what you want?
library(tidyverse)
library(slider)
library(broom)
set.seed(1001)
## more or less the slider help page for slide_index()
df <- data.frame(
y = rnorm(100),
x = rnorm(100),
i = as.Date("2019-08-15") + c(0, 2, 4, 6:102) # <- irregular
)
head(df)
#> y x i
#> 1 2.1886481 0.07862339 2019-08-15
#> 2 -0.1775473 -0.98708727 2019-08-17
#> 3 -0.1852753 -1.17523226 2019-08-19
#> 4 -2.5065362 1.68140888 2019-08-21
#> 5 -0.5573113 0.75623228 2019-08-22
#> 6 -0.1435595 0.30309733 2019-08-23
# 20 day rolling regression. Current day + 10 days back.
out <- df %>%
mutate(model = slide_index(df, i, ~ lm(y ~ x, df),
.before = 10, .complete = TRUE)) %>%
as_tibble()
out %>%
filter(!(map_lgl(model, ~ is_empty(.x)))) %>%
mutate(results = map(model, tidy)) %>%
unnest(cols = c(results))
#> # A tibble: 186 x 9
#> y x i model term estimate std.error statistic p.value
#> <dbl> <dbl> <date> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.623 0.741 2019-08-25 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 2 -0.623 0.741 2019-08-25 <lm> x -0.0825 0.144 -0.575 0.567
#> 3 -0.907 0.495 2019-08-26 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 4 -0.907 0.495 2019-08-26 <lm> x -0.0825 0.144 -0.575 0.567
#> 5 -1.59 -1.13 2019-08-27 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 6 -1.59 -1.13 2019-08-27 <lm> x -0.0825 0.144 -0.575 0.567
#> 7 0.303 -1.16 2019-08-28 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 8 0.303 -1.16 2019-08-28 <lm> x -0.0825 0.144 -0.575 0.567
#> 9 1.63 -0.713 2019-08-29 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 10 1.63 -0.713 2019-08-29 <lm> x -0.0825 0.144 -0.575 0.567
#> # … with 176 more rows
Related
I have a subset of my data in a dataframe (dput codeblock below) containing dates in which a storm occurred ("Date_AR"). I'd like to know if a storm occurred in the north, south or both, by determining whether the same date occurred in the "Date_N" and/or "Date_S" column/s.
For example, the first date is Jan 17, 1989 in the "Date_AR" column. In the location column, I would like "S" to be printed, since this date is found in the "Date_S" column. If Apr 5. 1989 occurs in "Date_N" and "Date_S", the I would like a "B" (for both) to be printed in the location column.
Thanks in advance for the help! Apologies if this type of question is already out there. I may not know the keywords to search.
structure(list(Date_S = structure(c(6956, 6957, 6970, 7008, 7034,
7035, 7036, 7172, 7223, 7224, 7233, 7247, 7253, 7254, 7255, 7262, 7263, 7266, 7275,
7276), class = "Date"),
Date_N = structure(c(6968, 6969, 7035, 7049, 7103, 7172, 7221, 7223, 7230, 7246, 7247,
7251, 7252, 7253, 7262, 7266, 7275, 7276, 7277, 7280), class = "Date"),
Date_AR = structure(c(6956, 6957, 6968, 6969, 6970, 7008,
7034, 7035, 7036, 7049, 7103, 7172, 7221, 7223, 7224, 7230,
7233, 7246, 7247, 7251), class = "Date"), Precip = c(23.6,
15.4, 3, 16.8, 0.2, 3.6, 22, 13.4, 0, 30.8, 4.6, 27.1, 0,
19, 2.8, 11.4, 2, 57.6, 9.4, 39), Location = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA)), row.names = c(NA, 20L), class = "data.frame")
Using dplyr::case_when you could do:
library(dplyr)
dat |>
mutate(Location = case_when(
Date_AR %in% Date_S & Date_AR %in% Date_N ~ "B",
Date_AR %in% Date_S ~ "S",
Date_AR %in% Date_N ~ "N"
))
#> Date_S Date_N Date_AR Precip Location
#> 1 1989-01-17 1989-01-29 1989-01-17 23.6 S
#> 2 1989-01-18 1989-01-30 1989-01-18 15.4 S
#> 3 1989-01-31 1989-04-06 1989-01-29 3.0 N
#> 4 1989-03-10 1989-04-20 1989-01-30 16.8 N
#> 5 1989-04-05 1989-06-13 1989-01-31 0.2 S
#> 6 1989-04-06 1989-08-21 1989-03-10 3.6 S
#> 7 1989-04-07 1989-10-09 1989-04-05 22.0 S
#> 8 1989-08-21 1989-10-11 1989-04-06 13.4 B
#> 9 1989-10-11 1989-10-18 1989-04-07 0.0 S
#> 10 1989-10-12 1989-11-03 1989-04-20 30.8 N
#> 11 1989-10-21 1989-11-04 1989-06-13 4.6 N
#> 12 1989-11-04 1989-11-08 1989-08-21 27.1 B
#> 13 1989-11-10 1989-11-09 1989-10-09 0.0 N
#> 14 1989-11-11 1989-11-10 1989-10-11 19.0 B
#> 15 1989-11-12 1989-11-19 1989-10-12 2.8 S
#> 16 1989-11-19 1989-11-23 1989-10-18 11.4 N
#> 17 1989-11-20 1989-12-02 1989-10-21 2.0 S
#> 18 1989-11-23 1989-12-03 1989-11-03 57.6 N
#> 19 1989-12-02 1989-12-04 1989-11-04 9.4 B
#> 20 1989-12-03 1989-12-07 1989-11-08 39.0 N
I currently have a list with columns as individual elements.
I would like to combine list elements with the same column names (i.e. bind rows) and merge across the different columns (i.e. bind columns) into a single data frame. I'm having difficulty finding examples of how to do this.
l = list(est = c(0, 0.062220390087795, 1.1020213968139, 0.0359939361491544
), se = c(0.0737200634874046, 0.237735179934829, 0.18105632705918,
0.111359438298789), rf = structure(c(NA, NA, NA, 4L), levels = c("Never\nsmoker",
"Occasional\nsmoker", "Ex-regular\nsmoker", "Smoker"), class = "factor"),
n = c(187L, 18L, 32L, 82L), model = c("Crude", "Crude", "Crude",
"Crude"), est = c(0, 0.112335510453586, 0.867095253670329,
0.144963556944891), se = c(0.163523775933409, 0.237039485900481,
0.186247776987999, 0.119887623484768), rf = structure(c(NA,
NA, NA, 4L), levels = c("Never\nsmoker", "Occasional\nsmoker",
"Ex-regular\nsmoker", "Smoker"), class = "factor"), n = c(187L,
18L, 32L, 82L), model = c("Model 1", "Model 1", "Model 1",
"Model 1"), est = c(0, 0.107097305324242, 0.8278765140371,
0.0958220447859447), se = c(0.164787596943329, 0.237347836229364,
0.187201880036661, 0.120882616647714), rf = structure(c(NA,
NA, NA, 4L), levels = c("Never\nsmoker", "Occasional\nsmoker",
"Ex-regular\nsmoker", "Smoker"), class = "factor"), n = c(187L,
18L, 32L, 82L), model = c("Model 2", "Model 2", "Model 2",
"Model 2"))
I would like the data to have the following format:
data.frame(
est = c(),
se = c(),
rf = c(),
model = c()
)
Any help would be appreciated. Thank you!
In this solution, first the elements of l are grouped by name and then are combined using c. Finally, the resulting list is converted to a dataframe using map_dfc.
library(dplyr)
library(purrr)
cols <- c("est", "se", "rf", "model")
setNames(cols,cols) |>
map(~l[names(l) == .x]) |>
map_dfc(~do.call(c, .x))
#> # A tibble: 12 × 4
#> est se rf model
#> <dbl> <dbl> <fct> <chr>
#> 1 0 0.0737 NA Crude
#> 2 0.0622 0.238 NA Crude
#> 3 1.10 0.181 NA Crude
#> 4 0.0360 0.111 Smoker Crude
#> 5 0 0.164 NA Model 1
#> 6 0.112 0.237 NA Model 1
#> 7 0.867 0.186 NA Model 1
#> 8 0.145 0.120 Smoker Model 1
#> 9 0 0.165 NA Model 2
#> 10 0.107 0.237 NA Model 2
#> 11 0.828 0.187 NA Model 2
#> 12 0.0958 0.121 Smoker Model 2
another option
library(purrr)
grp <- (seq(length(l)) - 1) %/% 5
l_split <- split(l, grp)
map_df(l_split, c)
#> # A tibble: 12 × 5
#> est se rf n model
#> <dbl> <dbl> <fct> <int> <chr>
#> 1 0 0.0737 <NA> 187 Crude
#> 2 0.0622 0.238 <NA> 18 Crude
#> 3 1.10 0.181 <NA> 32 Crude
#> 4 0.0360 0.111 Smoker 82 Crude
#> 5 0 0.164 <NA> 187 Model 1
#> 6 0.112 0.237 <NA> 18 Model 1
#> 7 0.867 0.186 <NA> 32 Model 1
#> 8 0.145 0.120 Smoker 82 Model 1
#> 9 0 0.165 <NA> 187 Model 2
#> 10 0.107 0.237 <NA> 18 Model 2
#> 11 0.828 0.187 <NA> 32 Model 2
#> 12 0.0958 0.121 Smoker 82 Model 2
I would like to make a connection between the x and df2 datasets. Notice that the dataset x, I have a percentage value, which in this case for the day 03-01-2021 is 0.1 and for the days 01-02-2021 and 01-01-2022 it is 0.45. So from that information, I know the percentage value for 03-01-2021 is 0.1, so this value falls into category I of my dataset df2 (since the values range from 0.1 to 0.2). As for the days 02-01-2021 and 01-01-2022, they correspond to category F of the df2,since the values range from 0.4 to 0.5. So, I would like to generate an output table as follows:
library(dplyr)
df1<- structure(
list(date2= c("01-01-2022","01-01-2022","03-01-2021","03-01-2021","01-02-2021","01-02-2021"),
Category= c("ABC","CDE","ABC","CDE","ABC","CDE"),
coef= c(5,4,0,2,4,5)),
class = "data.frame", row.names = c(NA, -6L))
x<-df1 %>%
group_by(date2) %>%
summarize(across("coef", sum),.groups = 'drop')%>%
arrange(date2 = as.Date(date2, format = "%d-%m-%Y"))
number<-20
x$Percentage<-x$coef/number
date2 coef Percentage
<chr> <dbl> <dbl>
1 03-01-2021 2 0.1
2 01-02-2021 9 0.45
3 01-01-2022 9 0.45
df2 <- structure(
list(
Category = c("A", "B", "C", "D",
"E", "F", "G", "H", "I", "J"),
From = c(0.9,
0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0),
Until = c(
1,
0.8999,
0.7999,
0.6999,
0.5999,
0.4999,
0.3999,
0.2999,
0.1999,
0.0999
),
`1 Val` = c(
2222,
2017.8,
1793.6,
1621.5,
1522.4,
1457.3,
1325.2,
1229.15,
1223.1,
1177.05
),
`2 Val` = c(3200, 2220, 2560,
2200, 2220, 2080, 1220, 1240, 1720, 1620),
`3 Val` = c(
4665,
4122.5,
3732,
3498.75,
3265.5,
3032.25,
2799,
2682.375,
2565.75,
2449.125
),
`4 Val` = c(
6112,
5222.8,
4889.6,
4224,
4278.4,
3972.8,
3667.2,
3224.4,
3361.6,
3222.8
)
),
row.names = c(NA,-10L),
class = c("tbl_df",
"tbl", "data.frame")
)
Category From Until 1 Val 2 Val 3 Val 4 Val
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0.9 1 2222 3200 4665 6112
2 B 0.8 0.900 2018 2220 4122 5223
3 C 0.7 0.800 1794 2560 3732 4890
4 D 0.6 0.700 1622 2200 3499 4224
5 E 0.5 0.600 1522 2220 3266 4278
6 F 0.4 0.500 1457 2080 3032 3973
7 G 0.3 0.400 1325 1220 2799 3667
8 H 0.2 0.300 1229 1240 2682 3224
9 I 0.1 0.200 1223 1720 2566 3362
10 J 0 0.0999 1177 1620 2449 3223
Using tidyverse, we do a rowwise on the 'x' dataset, slice the rows of 'df2' where the 'Percentage' falls between the 'From' and 'Until', and unpack the data.frame/tibble column
library(dplyr)
library(tidyr)
x %>%
rowwise %>%
mutate(out = df2 %>%
slice(which(Percentage>= From &
Percentage <= Until)[1]) %>%
select(-(1:3)) ) %>%
ungroup %>%
unpack(out)
-output
# A tibble: 3 × 7
date2 coef Percentage `1 Val` `2 Val` `3 Val` `4 Val`
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 03-01-2021 2 0.1 1223. 1720 2566. 3362.
2 01-02-2021 9 0.45 1457. 2080 3032. 3973.
3 01-01-2022 9 0.45 1457. 2080 3032. 3973.
Or this could be done with a non-equi join
library(data.table)
nm1 <- names(df2)[endsWith(names(df2), 'Val')]
setDT(x)[setDT(df2), (nm1) := mget(nm1),
on = .(Percentage >= From, Percentage <= Until)]
-output
> x
date2 coef Percentage 1 Val 2 Val 3 Val 4 Val
1: 03-01-2021 2 0.10 1223.1 1720 2565.75 3361.6
2: 01-02-2021 9 0.45 1457.3 2080 3032.25 3972.8
3: 01-01-2022 9 0.45 1457.3 2080 3032.25 3972.8
I have a datafrema with land use data of 6 points between the years 2005 to 2018. I would like to calculate the percentage change between 2005 to 2018.
df<-structure(list(place = c("F01", "F01", "F02", "F02", "F03", "F03",
"F04", "F04", "F05", "F05", "F06", "F06"), year = c(2005, 2018, 2005,
2018, 2005, 2018, 2005, 2018, 2005, 2018, 2005, 2018), Veg =
c(12281.5824712026, 12292.2267477317, 7254.98919713131,
7488.9138055415, 864.182200710528, 941.602680778032, 549.510775817472, 584.104674537216, 5577.10195081334, 5688.28474549675, 1244.96456185886, 1306.41862713264), Agri = c(113.178596532624, 1376.68748390712, 85.2373706436, 1048.71071335262, 0, 46.236076173504, 0, 46.236076173504, 85.2373706436, 1002.47463717912, 1.413692976528,
228.851945376768 ), Past = c(9190.16856517738, 7855.55923692456, 5029.33750161394, 3776.9718412309, 983.015569149264, 800.981808818688, 710.255983089744, 572.213021852304, 3726.66100294858, 2700.40306039963, 879.982298683488, 597.410020198656), Urb = c(146.026168634304, 200.910719487744, 146.026168634304,
200.910719487744, 141.119822421648, 194.840155529712, 141.119822421648, 194.840155529712, 4.906346212656, 6.070563958032, NA, NA), SoloExp = c(61.12143163224, 67.940421283728, 61.12143163224,
62.451966198384, 50.144521461552, 54.801392443056, 49.146620536944, 52.639273773072, 9.895850835696, 7.650573755328, 6.320039189184, 1.164217745376), Hidro = c(9.230583552624, 7.983207396864, 9.230583552624, 7.983207396864, NA, NA, NA, NA, 7.401098524176, 6.320039189184, 5.654771906112, 4.490554160736), total = c(691953.981181971, 691953.981181971, 691953.981181971,
691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971, 691953.981181971)), row.names = c(NA, -12L), class = "data.frame")
I tried using the lead command to calculate the difference between 2005 and 2018, but I was not successful:
df2<-df%>%
select(-c(total))%>%
replace(is.na(.), 0)%>%
pivot_longer(cols = c(3:8),
names_to = 'classe',
values_to = 'area')%>%
group_by(place, classe)%>%
mutate(percent=(((((area)-lead(area))/area)*100)*-1))%>%
pivot_wider(names_from = 'classe',
values_from = 'percent')%>%
select(-c(area, year))
For example for the Veg class I expected to get:
place Veg
F01 0.09
F02 3.22
F03 8.96
F04 6.30
F05 1.99
F06 4.94
Here is one solution if you need percentages for all the parameters
library(dplyr)
library(tidyr)
df_new<-df %>%
select(-(total))%>%
replace(is.na(.), 0)%>%
pivot_longer(cols = c(3:8),
names_to = 'classe',
values_to = 'area') %>%
pivot_wider(names_from=year, values_from=area) %>%
mutate(percent=(`2018`-`2005`)/`2005`) %>%
select(-`2018`,-`2005`) %>%
pivot_wider( names_from="classe", values_from="percent")
df_new
#> # A tibble: 6 × 7
#> place Veg Agri Past Urb SoloExp Hidro
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F01 0.000867 11.2 -0.145 0.376 0.112 -0.135
#> 2 F02 0.0322 11.3 -0.249 0.376 0.0218 -0.135
#> 3 F03 0.0896 Inf -0.185 0.381 0.0929 NaN
#> 4 F04 0.0630 Inf -0.194 0.381 0.0711 NaN
#> 5 F05 0.0199 10.8 -0.275 0.237 -0.227 -0.146
#> 6 F06 0.0494 161. -0.321 NaN -0.816 -0.206
Created on 2022-01-09 by the reprex package (v2.0.1)
Another possible solution:
library(tidyverse)
df %>%
group_by(place) %>%
summarise(across(-year, ~ 100*(last(.x) / first(.x) - 1)))
#> # A tibble: 6 × 8
#> place Veg Agri Past Urb SoloExp Hidro total
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F01 0.0867 1116. -14.5 37.6 11.2 -13.5 0
#> 2 F02 3.22 1130. -24.9 37.6 2.18 -13.5 0
#> 3 F03 8.96 Inf -18.5 38.1 9.29 NA 0
#> 4 F04 6.30 Inf -19.4 38.1 7.11 NA 0
#> 5 F05 1.99 1076. -27.5 23.7 -22.7 -14.6 0
#> 6 F06 4.94 16088. -32.1 NA -81.6 -20.6 0
I want to convert a cox table to forest plot as showed below. Unforunatly I’ve lost my original data (coxph object) so I have to use the data from the table. Data below are just examples:
Desired output:
Reprex for the two tables:
GRP1<-tibble::tribble(
~Variable, ~Level, ~Number, ~`HR.(univariable)`, ~`HR.(multivariable)`,
"Sex", "Female", "2204 (100.0)", NA, NA,
NA, "Male", "2318 (100.0)", "1.13 (0.91-1.40, p=0.265)", "1.13 (0.91-1.40, p=0.276)",
"Score", "1", "2401 (100.0)", NA, NA,
NA, "1-2", "1637 (100.0)", "1.49 (1.19-1.87, p=0.001)", "1.15 (0.90-1.47, p=0.250)",
NA, "3-4", "412 (100.0)", "1.71 (1.14-2.56, p=0.010)", "1.09 (0.71-1.67, p=0.710)",
NA, ">=5", "42 (100.0)", "1.67 (0.53-5.21, p=0.381)", "0.96 (0.30-3.05, p=0.943)",
"Treatment", "A", "1572 (100.0)", NA, NA,
NA, "B", "2951 (100.0)", "1.74 (1.26-2.40, p=0.001)", "1.53 (1.09-2.13, p=0.013)"
)
GRP2<-tibble::tribble(
~Variable, ~Level, ~Number, ~`HR.(univariable)`, ~`HR.(univariable)`,
"Sex", "Female", "2204 (100.0)", NA, NA,
NA, "Male", "2318 (100.0)", "1.70 (1.36-2.13, p<0.001)", "1.62 (1.28-2.04, p<0.001)",
"Score", "1", "2401 (100.0)", NA, NA,
NA, "1-2", "1637 (100.0)", "2.76 (1.21-6.29, p=0.016)", "2.69 (1.18-6.13, p=0.019)",
NA, "3-4", "412 (100.0)", "5.11 (2.26-11.58, p<0.001)", "4.46 (1.95-10.23, p<0.001)",
NA, ">=5", "42 (100.0)", "5.05 (2.19-11.64, p<0.001)", "4.08 (1.73-9.59, p=0.001)",
"Treatment", "A", "1572 (100.0)", NA, NA,
NA, "B", "2951 (100.0)", "1.48 (1.16-1.88, p=0.001)", "1.23 (0.95-1.59, p=0.114)"
)
Is it doable?
Best regards, H
The difficult thing about this task is not making the plot; it is converting your data from a bunch of text strings into a single long-format data frame that can be used for plotting. This involves using regular expressions to capture the appropriate number for each column, pivoting the result, then repeating that process for the second data frame before binding the two frames together. This is unavoidably ugly and complicated, but that is one of the reasons why having data stored in the correct format is so important.
Anyway, the following code performs the necessary operations:
library(dplyr)
wrangler <- function(data){
grp <- as.character(match.call()$data)
data %>%
tidyr::fill(Variable) %>%
mutate(Variable = paste(Variable, Level),
Number = as.numeric(gsub("^(\\d+).*$", "\\1", Number)),
univariable_HR = as.numeric(gsub("^((\\d+|\\.)+).*$", "\\1", `HR.(univariable)`)),
univariable_lower = as.numeric(gsub("^.+? \\((.+?)-.*$", "\\1", `HR.(univariable)`)),
univariable_upper = as.numeric(gsub("^.+?-(.+?),.*$", "\\1", `HR.(univariable)`)),
univariable_p = gsub("^.+?p=*(.+?)\\).*$", "\\1", `HR.(univariable)`),
multivariable_HR = as.numeric(gsub("^((\\d+|\\.)+).*$", "\\1", `HR.(multivariable)`)),
multivariable_lower = as.numeric(gsub("^.+? \\((.+?)-.*$", "\\1", `HR.(multivariable)`)),
multivariable_upper = as.numeric(gsub("^.+?-(.+?),.*$", "\\1", `HR.(multivariable)`)),
multivariable_p = gsub("^.+?p=*(.+?)\\).*$", "\\1", `HR.(multivariable)`),
group = grp) %>%
filter(!is.na(univariable_HR)) %>%
select(-Level, -`HR.(multivariable)`, - `HR.(univariable)`) %>%
tidyr::pivot_longer(cols = -(c(1:2, 11)), names_sep = "_", names_to = c("type", ".value"))
}
df <- rbind(wrangler(GRP1), wrangler(GRP2))
This now gives us the data in the correct format for plotting. Each row will become a single pointrange in our plot, so it needs a hazard ratio, a lower confidence bound, an upper confidence bound, a variable label, the type (multivariable versus univariable), and the group it originally came from (GRP1 or GRP2):
df
#> # A tibble: 20 x 8
#> Variable Number group type HR lower upper p
#> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
#> 1 Sex Male 2318 GRP1 univariable 1.13 0.91 1.4 0.265
#> 2 Sex Male 2318 GRP1 multivariable 1.13 0.91 1.4 0.276
#> 3 Score 1-2 1637 GRP1 univariable 1.49 1.19 1.87 0.001
#> 4 Score 1-2 1637 GRP1 multivariable 1.15 0.9 1.47 0.250
#> 5 Score 3-4 412 GRP1 univariable 1.71 1.14 2.56 0.010
#> 6 Score 3-4 412 GRP1 multivariable 1.09 0.71 1.67 0.710
#> 7 Score >=5 42 GRP1 univariable 1.67 0.53 5.21 0.381
#> 8 Score >=5 42 GRP1 multivariable 0.96 0.3 3.05 0.943
#> 9 Treatment B 2951 GRP1 univariable 1.74 1.26 2.4 0.001
#> 10 Treatment B 2951 GRP1 multivariable 1.53 1.09 2.13 0.013
#> 11 Sex Male 2318 GRP2 univariable 1.7 1.36 2.13 <0.001
#> 12 Sex Male 2318 GRP2 multivariable 1.62 1.28 2.04 <0.001
#> 13 Score 1-2 1637 GRP2 univariable 2.76 1.21 6.29 0.016
#> 14 Score 1-2 1637 GRP2 multivariable 2.69 1.18 6.13 0.019
#> 15 Score 3-4 412 GRP2 univariable 5.11 2.26 11.6 <0.001
#> 16 Score 3-4 412 GRP2 multivariable 4.46 1.95 10.2 <0.001
#> 17 Score >=5 42 GRP2 univariable 5.05 2.19 11.6 <0.001
#> 18 Score >=5 42 GRP2 multivariable 4.08 1.73 9.59 0.001
#> 19 Treatment B 2951 GRP2 univariable 1.48 1.16 1.88 0.001
#> 20 Treatment B 2951 GRP2 multivariable 1.23 0.95 1.59 0.114
Now that we have the data in this format, the plot itself is straightforward:
library(ggplot2)
ggplot(df, aes(HR, Variable)) +
geom_pointrange(aes(xmin = lower, xmax = upper, colour = type),
position = position_dodge(width = 0.5)) +
facet_grid(group~., switch = "y") +
geom_vline(xintercept = 0, linetype = 2) +
theme_bw() +
theme(strip.placement = "outside",
strip.text= element_text(angle = 180),
strip.background = element_blank(),
panel.spacing = unit(0, "mm"))
Created on 2021-11-01 by the reprex package (v2.0.0)