Custom function: update old variables and create new variables/ non standard evaluation - r

I would use some help with a tidyverse solution for a function that I've custom written. I have a dataset with a binary phenotype and an associated diagnosis date, as well as 2 other dates I hope to use to update and create new variables.
I want to:
update the value of the supplied variables to NA if vardt < other_dt
generate new variables, {var}_incid & {var}_incid_dt, if the vardt variable is before baseline_dt
Here's my go at a function; I know that it likely requires some non-standard evaluation techniques, so I've tried to use assign() and eval(substitute()) around the names to no avail. Any tips? Thanks in advance for the help.
# load lib
library(tidyverse)
library(lubridate)
rdate <- function(x,
min = paste0(format(Sys.Date(), '%Y'), '-01-01'),
max = paste0(format(Sys.Date(), '%Y'), '-12-31'),
sort = TRUE) {
dates <- sample(seq(as.Date(min), as.Date(max), by = "day"), x, replace = TRUE)
if (sort == TRUE) {
sort(dates)
} else {
dates
}
}
# set seed for reproducibility
set.seed(42)
# Beginning dataset
das <- data.frame(id = rep(letters[1:3], each = 5),
pheno = rbinom(n=15, size = 1, prob = 0.30),
pheno_dt = rdate(15),
baseline_dt = rdate(15),
other_dt = rdate(15))
update_pheno <- function(var, vardt){
outds <- das %>%
mutate(eval(substitute(var)) = ifelse(var == 1 & pheno_dt < other_dt, NA, var),
# update vardt to NA if var value is NA
vardt = ifelse(is.na(var), NA, vardt))
# create incidence variable based on nomenclature of variable
paste0(var, "_incid") = ifelse(var == 1 & vardt < baseline_dt, NA, var),
# create associated dt variable
paste0(var, "_incid_dt" = ifelse(is.na(paste0(var, "_incid")), NA, vardt)))
return(outds)
}
test <- update_pheno(var = pheno, vardt = phenodt)

Limitations, Assumptions, and Simplifications
# Since we're talking *tidyverse*, let's make this a tibble:
das <- as_tibble( das )
das
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a 0 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24
Update the value of the supplied variables to NA if vardt < other_dt
# Do this directly:
das[ das$pheno_dt < das$other_dt , "pheno" ] <- NA
das
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a NA 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24
Generate new variables, {var}_incid & {var}_incid_dt, if the vardt variable is before baseline_dt
# What are the names of these new variables?
potential_new_names <- paste0( das$id, '_incid' )
potential_new_names
[1] "a_incid" "a_incid" "a_incid" "a_incid" "a_incid" "b_incid" "b_incid" "b_incid" "b_incid" "b_incid" "c_incid"
[12] "c_incid" "c_incid" "c_incid" "c_incid"
# To which rows does this apply?
these_rows <- which( das$pheno_dt < das$baseline_dt )
these_rows
[1] 2 3 4 5 6 7 8 10 11 12 13
# Remove duplicates
new_value_variables <- unique( potential_new_names[ these_rows ] )
# Create corresponding date variables
new_date_variables <- paste0( new_value_variables, "_dt" )
# Combine value variables and date variables
new_column_names <- c( new_value_variables, new_date_variables )
new_column_names
[1] "a_incid" "b_incid" "c_incid" "a_incid_dt" "b_incid_dt" "c_incid_dt"
code_to_make_new_columns <- sprintf(
'das %%>%% mutate( %s )'
, paste0( new_column_names, "=NA", collapse="," )
)
code_to_make_new_columns
[1] "das %>% mutate( a_incid=NA,b_incid=NA,c_incid=NA,a_incid_dt=NA,b_incid_dt=NA,c_incid_dt=NA )"
new_das <- eval( parse( text = code_to_make_new_columns ))
new_das
# A tibble: 15 × 11
id pheno pheno_dt baseline_dt other_dt a_incid b_incid c_incid a_incid_dt b_incid_dt c_incid_dt
<chr> <int> <date> <date> <date> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 a 1 2022-01-05 2022-01-04 2022-01-03 NA NA NA NA NA NA
2 a 1 2022-01-20 2022-04-19 2022-01-05 NA NA NA NA NA NA
3 a NA 2022-01-24 2022-05-16 2022-02-02 NA NA NA NA NA NA
4 a 1 2022-03-30 2022-05-26 2022-02-09 NA NA NA NA NA NA
5 a 0 2022-04-19 2022-06-07 2022-04-13 NA NA NA NA NA NA
6 b 0 2022-04-20 2022-07-16 2022-04-19 NA NA NA NA NA NA
7 b 1 2022-06-14 2022-08-03 2022-04-24 NA NA NA NA NA NA
8 b 0 2022-07-31 2022-08-14 2022-05-10 NA NA NA NA NA NA
9 b 0 2022-09-16 2022-09-02 2022-05-18 NA NA NA NA NA NA
10 b 1 2022-10-10 2022-10-19 2022-07-05 NA NA NA NA NA NA
11 c 0 2022-10-24 2022-10-26 2022-08-16 NA NA NA NA NA NA
12 c 1 2022-10-25 2022-11-10 2022-09-15 NA NA NA NA NA NA
13 c 1 2022-11-10 2022-11-20 2022-09-19 NA NA NA NA NA NA
14 c 0 2022-12-14 2022-12-14 2022-11-25 NA NA NA NA NA NA
15 c 0 2022-12-26 2022-12-21 2022-12-24 NA NA NA NA NA NA
Now update the values for the new variables
incident_value_columns <- grep( pattern = "incid$" , names( new_das ))
incident_date_columns <- grep( pattern = "incid_dt$", names( new_das ))
rows_to_update <- das$pheno_dt >= das$baseline_dt
new_das[ rows_to_update, incident_value_columns ] <- new_das[ rows_to_update, 'pheno' ]
new_das[ rows_to_update, incident_date_columns ] <- new_das[ rows_to_update, 'pheno_dt' ]
new_das
# A tibble: 15 × 11
id pheno pheno_dt baseline_dt other_dt a_incid b_incid c_incid a_incid_dt b_incid_dt c_incid_dt
<chr> <int> <date> <date> <date> <int> <int> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03 1 1 1 2022-01-05 2022-01-05 2022-01-05
2 a 1 2022-01-20 2022-04-19 2022-01-05 NA NA NA NA NA NA
3 a NA 2022-01-24 2022-05-16 2022-02-02 NA NA NA NA NA NA
4 a 1 2022-03-30 2022-05-26 2022-02-09 NA NA NA NA NA NA
5 a 0 2022-04-19 2022-06-07 2022-04-13 NA NA NA NA NA NA
6 b 0 2022-04-20 2022-07-16 2022-04-19 NA NA NA NA NA NA
7 b 1 2022-06-14 2022-08-03 2022-04-24 NA NA NA NA NA NA
8 b 0 2022-07-31 2022-08-14 2022-05-10 NA NA NA NA NA NA
9 b 0 2022-09-16 2022-09-02 2022-05-18 0 0 0 2022-09-16 2022-09-16 2022-09-16
10 b 1 2022-10-10 2022-10-19 2022-07-05 NA NA NA NA NA NA
11 c 0 2022-10-24 2022-10-26 2022-08-16 NA NA NA NA NA NA
12 c 1 2022-10-25 2022-11-10 2022-09-15 NA NA NA NA NA NA
13 c 1 2022-11-10 2022-11-20 2022-09-19 NA NA NA NA NA NA
14 c 0 2022-12-14 2022-12-14 2022-11-25 0 0 0 2022-12-14 2022-12-14 2022-12-14
15 c 0 2022-12-26 2022-12-21 2022-12-24 0 0 0 2022-12-26 2022-12-26 2022-12-26
The non-standard-evaluation part
When you need to access something referenced by a combination of names and values of the parameters passed to a function, eval and sym can be used as follows:
example_within_a_function <- function(
the_data
, var_column_name
, var_datestamp_column_name
, baseline_column_name
, other_column_name
){
# Skip the first argument, which is the function, itself,
# and get all the rest of the arguments,
# which are passed parameters
arguments <- match.call()[ -1 ] %>% as.list
# Extract the value passed to each argument
values <- seq( arguments ) %>% map_chr( ~rlang::as_string( arguments[[.]] ))
# Return the names of the arguments, their values,
# the data table (using non-standard evaluation), and
# the data table (using a straight-forward reference).
list(
labels_within_function = names( arguments )
, labels_in_parent_env = values
, data = eval( sym( values[[ 1 ]] ))
, also_data = the_data
)
}
example_within_a_function(
the_data = das
, var_column_name = pheno
, var_datestamp_column_name = pheno_dt
, baseline_column_name = baseline_dt
, other_column_name = other_dt
)
$labels_within_function
[1] "the_data" "var_column_name" "var_datestamp_column_name" "baseline_column_name"
[5] "other_column_name"
$labels_in_parent_env
[1] "das" "pheno" "pheno_dt" "baseline_dt" "other_dt"
$data
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a NA 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24
$also_data
# A tibble: 15 × 5
id pheno pheno_dt baseline_dt other_dt
<chr> <int> <date> <date> <date>
1 a 1 2022-01-05 2022-01-04 2022-01-03
2 a 1 2022-01-20 2022-04-19 2022-01-05
3 a NA 2022-01-24 2022-05-16 2022-02-02
4 a 1 2022-03-30 2022-05-26 2022-02-09
5 a 0 2022-04-19 2022-06-07 2022-04-13
6 b 0 2022-04-20 2022-07-16 2022-04-19
7 b 1 2022-06-14 2022-08-03 2022-04-24
8 b 0 2022-07-31 2022-08-14 2022-05-10
9 b 0 2022-09-16 2022-09-02 2022-05-18
10 b 1 2022-10-10 2022-10-19 2022-07-05
11 c 0 2022-10-24 2022-10-26 2022-08-16
12 c 1 2022-10-25 2022-11-10 2022-09-15
13 c 1 2022-11-10 2022-11-20 2022-09-19
14 c 0 2022-12-14 2022-12-14 2022-11-25
15 c 0 2022-12-26 2022-12-21 2022-12-24

Related

Joining dates are changed to certain key in data.table

I have the following dataframes df and df_dates (dput below):
> df
group start end
1 A 2022-12-01 2022-12-04
2 A 2022-12-04 2022-12-07
3 A 2022-12-07 2022-12-10
4 A 2022-12-10 2022-12-13
5 A 2022-12-13 2022-12-16
6 A 2022-12-16 2022-12-19
7 B 2022-12-01 2022-12-04
8 B 2022-12-04 2022-12-07
9 B 2022-12-07 2022-12-10
10 B 2022-12-10 2022-12-13
11 B 2022-12-13 2022-12-16
12 B 2022-12-16 2022-12-19
> df_dates
group date value
1 A 2022-12-02 1
2 A 2022-12-14 3
3 B 2022-12-06 2
4 B 2022-12-13 4
I would like to join the rows of df_dates by group where the date column is between the column start and end of df. When I join these two dataframes, the dates of the date column return the same as the dates from the start column of df. Here is the code with output:
df <- data.frame(group = rep(c('A', 'B'), each = 6),
start = c(seq.Date(as.Date('2022-12-01'), as.Date('2022-12-16'), '3 days')),
end = c(seq.Date(as.Date('2022-12-04'), as.Date('2022-12-19'), '3 days')))
df_dates <- data.frame(group = c('A', 'A', 'B', 'B'),
date = as.Date(c('2022-12-02', '2022-12-14', '2022-12-06', '2022-12-13')),
value = c(1,3,2,4))
library(data.table)
setDT(df)
setDT(df_dates)
df_dates[df,
.(group, date, start, end, value),
on = .(group, date >= start, date <= end)]
#> group date start end value
#> 1: A 2022-12-01 2022-12-01 2022-12-04 1
#> 2: A 2022-12-04 2022-12-04 2022-12-07 NA
#> 3: A 2022-12-07 2022-12-07 2022-12-10 NA
#> 4: A 2022-12-10 2022-12-10 2022-12-13 NA
#> 5: A 2022-12-13 2022-12-13 2022-12-16 3
#> 6: A 2022-12-16 2022-12-16 2022-12-19 NA
#> 7: B 2022-12-01 2022-12-01 2022-12-04 NA
#> 8: B 2022-12-04 2022-12-04 2022-12-07 2
#> 9: B 2022-12-07 2022-12-07 2022-12-10 NA
#> 10: B 2022-12-10 2022-12-10 2022-12-13 4
#> 11: B 2022-12-13 2022-12-13 2022-12-16 4
#> 12: B 2022-12-16 2022-12-16 2022-12-19 NA
Created on 2022-12-12 with reprex v2.0.2
As you can see, the dates of the date column are now changed to the dates of the start column, while I want them to be the same as in df_dates dataframe. The desired output should look like this:
#> group date start end value
#> 1: A 2022-12-02 2022-12-01 2022-12-04 1
#> 2: A NA 2022-12-04 2022-12-07 NA
#> 3: A NA 2022-12-07 2022-12-10 NA
#> 4: A NA 2022-12-10 2022-12-13 NA
#> 5: A 2022-12-14 2022-12-13 2022-12-16 3
#> 6: A NA 2022-12-16 2022-12-19 NA
#> 7: B NA 2022-12-01 2022-12-04 NA
#> 8: B 2022-12-06 2022-12-04 2022-12-07 2
#> 9: B NA 2022-12-07 2022-12-10 NA
#> 10: B 2022-12-13 2022-12-10 2022-12-13 4
#> 11: B 2022-12-13 2022-12-13 2022-12-16 4
#> 12: B NA 2022-12-16 2022-12-19 NA
So I was wondering if anyone knows how to join these two dataframes in the right way using data.table?
dput of df and df_dates:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), start = structure(c(19327, 19330, 19333,
19336, 19339, 19342, 19327, 19330, 19333, 19336, 19339, 19342
), class = "Date"), end = structure(c(19330, 19333, 19336, 19339,
19342, 19345, 19330, 19333, 19336, 19339, 19342, 19345), class = "Date")), class = "data.frame", row.names = c(NA,
-12L))
df_dates <- structure(list(group = c("A", "A", "B", "B"), date = structure(c(19328,
19340, 19332, 19339), class = "Date"), value = c(1, 3, 2, 4)), class = "data.frame", row.names = c(NA,
-4L))
library(data.table)
setDT(df)
setDT(df_dates)
Just specifying you want to the original date (you can use prefix x. or i. (to reference df in this case) )
df_dates[df,
.(group, x.date, start, end, value),
on = .(group, date >= start, date <= end)]
Or modifying the original:
df[, c("date", "value") :=
df_dates[.SD, on = .(group, date >= start, date <= end), .(x.date, value)]]
# group start end date value
# <char> <Date> <Date> <Date> <num>
# 1: A 2022-12-01 2022-12-04 2022-12-02 1
# 2: A 2022-12-04 2022-12-07 <NA> NA
# 3: A 2022-12-07 2022-12-10 <NA> NA
# 4: A 2022-12-10 2022-12-13 <NA> NA
# 5: A 2022-12-13 2022-12-16 2022-12-14 3
# 6: A 2022-12-16 2022-12-19 <NA> NA
# 7: B 2022-12-01 2022-12-04 <NA> NA
# 8: B 2022-12-04 2022-12-07 2022-12-06 2
# 9: B 2022-12-07 2022-12-10 <NA> NA
# 10: B 2022-12-10 2022-12-13 2022-12-13 4
# 11: B 2022-12-13 2022-12-16 2022-12-13 4
# 12: B 2022-12-16 2022-12-19 <NA> NA

How to search upwards a column for a value based on whether another column is NA or not?

I need to find the previous date for which value is not NA and then also use the value on that row. I have tried to use shift, but I have met a problem because shift works well for row 9 but not for when there are consecutive non-NAs on type, such as on rows 5,6.
dtihave = data.table(date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-03", "2020-04-02", "2020-05-09", "2020-06-10", "2020-07-18", "2020-08-23", "2020-09-09")),
type = c(1,1,NA,NA,1,1,NA,NA,1),
value = c(7,NA,6,8,NA,NA,5,9,NA))
> dtihave
date type value
1: 2020-01-01 1 7
2: 2020-02-01 1 NA
3: 2020-03-03 NA 6
4: 2020-04-02 NA 8
5: 2020-05-09 1 NA
6: 2020-06-10 1 NA
7: 2020-07-18 NA 5
8: 2020-08-23 NA 9
9: 2020-09-09 1 NA
dtiwant = data.table(date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-03", "2020-04-02", "2020-05-09", "2020-06-10", "2020-07-18", "2020-08-23", "2020-09-09")),
type = c(1,1,NA,NA,1,1,NA,NA,1),
value = c(7,NA,6,8,NA,NA,5,9,NA),
iwantdate = c(NA, as.Date("2020-01-01"), NA, NA, as.Date("2020-04-02"), as.Date("2020-04-02"), NA, NA, as.Date("2020-08-23")),
iwantvalue = c(NA,7,NA,NA,8,8,NA,NA,9))
dtiwant[, iwantdate := as.Date(iwantdate, origin = "1970-01-01")]
> dtiwant
date type value iwantdate iwantvalue
1: 2020-01-01 1 7 <NA> NA
2: 2020-02-01 1 NA 2020-01-01 7
3: 2020-03-03 NA 6 <NA> NA
4: 2020-04-02 NA 8 <NA> NA
5: 2020-05-09 1 NA 2020-04-02 8
6: 2020-06-10 1 NA 2020-04-02 8
7: 2020-07-18 NA 5 <NA> NA
8: 2020-08-23 NA 9 <NA> NA
9: 2020-09-09 1 NA 2020-08-23 9
My current progress using shift, but I need row 6's iwantdate = "2020-04-02". The number of shifts I need to make is unknown, so I can not just use n=2 in shift.
dtprogress = copy(dtihave)
dtprogress[, iwantdate := ifelse(!is.na(type) & is.na(value), shift(date), NA)]
dtprogress[, iwantdate := ifelse(!is.na(type) & !is.na(value), date, iwantdate)]
dtprogress[, iwantdate := as.Date(iwantdate, origin = "1970-01-01")]
> dtprogress
date type value iwantdate
1: 2020-01-01 1 7 2020-01-01
2: 2020-02-01 1 NA 2020-01-01
3: 2020-03-03 NA 6 <NA>
4: 2020-04-02 NA 8 <NA>
5: 2020-05-09 1 NA 2020-04-02
6: 2020-06-10 1 NA 2020-05-09
7: 2020-07-18 NA 5 <NA>
8: 2020-08-23 NA 9 <NA>
9: 2020-09-09 1 NA 2020-08-23
You could do:
dtihave[, idx := cummax((!is.na(value)) * .I) * NA^!is.na(value)][,
c('want_date', 'want_value') := lapply(.SD, '[', idx),
.SDcols = c('date', 'value')][, idx:=NULL]
dtihave
date type value want_date want_value
1: 2020-01-01 1 7 <NA> NA
2: 2020-02-01 1 NA 2020-01-01 7
3: 2020-03-03 NA 6 <NA> NA
4: 2020-04-02 NA 8 <NA> NA
5: 2020-05-09 1 NA 2020-04-02 8
6: 2020-06-10 1 NA 2020-04-02 8
7: 2020-07-18 NA 5 <NA> NA
8: 2020-08-23 NA 9 <NA> NA
9: 2020-09-09 1 NA 2020-08-23 9
with tidyverse. Hopefully this solves the grouping. ie just add %>%group_by(...) before mutate and you are good to go
dtihave %>%
mutate(val_na = !is.na(value),
idx = nafill(na_if(row_number() * val_na, 0), "locf"),
idx = idx * NA ^ val_na,
date1 = date[idx], value1 = value[idx],
val_na = NULL, idx = NULL)
You can use lag to get previous values, e.g.
library(dplyr)
dtihave %>%
mutate(iwantdate = ifelse(is.na(value), lag(date), NA) %>% as.Date(., origin = "1970-01-01"),
iwantvalue = ifelse(is.na(value), lag(value), NA))
date type value iwantdate iwantvalue
1: 2020-01-01 1 7 <NA> NA
2: 2020-02-01 1 NA 2020-01-01 7
3: 2020-03-03 NA 6 <NA> NA
4: 2020-04-02 NA 8 <NA> NA
5: 2020-05-09 1 NA 2020-04-02 8
6: 2020-06-10 1 NA 2020-05-09 NA
7: 2020-07-18 NA 5 <NA> NA
8: 2020-08-23 NA 9 <NA> NA
9: 2020-09-09 1 NA 2020-08-23 9

How to Make a stock returns data set using R?

I have a dataset as below:
stockCode date Closeprice
A 2022-01-24 100
A 2022-01-25 101
A 2022-01-26 103
A 2022-01-27 104
A 2022-01-28 103
B 2022-01-24 200
B 2022-01-25 180
B 2022-01-26 177
B 2022-01-27 192
B 2022-01-28 202
C 2022-01-24 304
C 2022-01-25 333
C 2022-01-26 324
C 2022-01-27 360
C 2022-01-28 335
and then, I wish to add some return columns as below:
enter image description here
I tried to make a new column, and calculating the return,
but always shows errors.
> data$newclose <- data$Closeprice[2:length(data$Closeprice)-2]
Error in `$<-.data.frame`(`*tmp*`, newclose, value = c(8900, 9090, 9200, :
replacement has 126626 rows, data has 126628
The assignment should have the same length on the lhs and rhs. Perhaps we need to get the lead
library(dplyr)
data1 <- data %>%
mutate(newcolose = lead(Closeprice, n = 1))
I first create new columns with the values from 1 to 4 days using lead. Then, I calculate the percentage change for each day for each group.
library(tidyverse)
df %>%
group_by(stockCode) %>%
mutate(day1 = lead(Closeprice, n = 1),
day2 = lead(Closeprice, n = 2),
day3 = lead(Closeprice, n = 3),
day4 = lead(Closeprice, n = 4)) %>%
mutate(across(starts_with("day"), ~((. - Closeprice)/Closeprice)*100))
Output
# A tibble: 15 × 5
# Groups: stockCode [3]
stockCode day1 day2 day3 day4
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1 3 4 3
2 A 1.98 2.97 1.98 NA
3 A 0.971 0 NA NA
4 A -0.962 NA NA NA
5 A NA NA NA NA
6 B -10 -11.5 -4 1
7 B -1.67 6.67 12.2 NA
8 B 8.47 14.1 NA NA
9 B 5.21 NA NA NA
10 B NA NA NA NA
11 C 9.54 6.58 18.4 10.2
12 C -2.70 8.11 0.601 NA
13 C 11.1 3.40 NA NA
14 C -6.94 NA NA NA
15 C NA NA NA NA

Turn a loop based code into a vectorised one in R?

I´ve got this dataset and want to perform some calculations based on certain conditions:
library(tidyverse)
library(lubridate)
filas <- structure(list(Año = c(rep(2020,4),rep(2021,4),2022),
Mes = c(2:5,3:4,9,11,1),
Id = c(rep(1,7),2,2)),
row.names = c(NA, -9L),
class = c("tbl_df", "tbl", "data.frame")) %>%
mutate(fecha = make_date(Año,Mes,1),
meses_imp = make_date(2999,1,1))
Año
Mes
Id
fecha
meses_imp
2020
2
1
2020-02-01
2999-01-01
2020
3
1
2020-03-01
2999-01-01
2020
4
1
2020-04-01
2999-01-01
2020
5
1
2020-05-01
2999-01-01
2021
3
1
2021-03-01
2999-01-01
2021
4
1
2021-04-01
2999-01-01
2021
9
1
2021-09-01
2999-01-01
2021
11
2
2021-11-01
2999-01-01
2022
1
2
2022-01-01
2999-01-01
I need to add rows for each "Id" when there are "holes" between two consecutive ones, and count those added rows later. I´ve achieved this using a "while" loop:
i <- 2
while(!is.na(filas[i,]$Id)) {
if (as.double(difftime(filas[i,]$fecha,filas[i-1,]$fecha)) > 31 &
filas[i,]$Id == filas[i-1,]$Id) {
filas <- add_row(filas,
Id = filas[i,]$Id,
fecha = filas[i-1,]$fecha + months(1),
meses_imp = pmin(filas[i-1,]$fecha,
filas[i-1,]$meses_imp),
.after = i-1)}
i=i+1}
filas2 <- filas %>%
group_by(Id,meses_imp) %>%
summarise(cant_meses_imp = n()) %>%
ungroup() %>%
filter(meses_imp != "2999-01-01")
filas <- left_join(filas,
filas2,
by=c("Id","meses_imp"))
Año
Mes
Id
fecha
meses_imp
cant_meses_imp
2020
2
1
2020-02-01
2999-01-01
NA
2020
3
1
2020-03-01
2999-01-01
NA
2020
4
1
2020-04-01
2999-01-01
NA
2020
5
1
2020-05-01
2999-01-01
NA
NA
NA
1
2020-06-01
2020-05-01
9
NA
NA
1
2020-07-01
2020-05-01
9
NA
NA
1
2020-08-01
2020-05-01
9
NA
NA
1
2020-09-01
2020-05-01
9
NA
NA
1
2020-10-01
2020-05-01
9
NA
NA
1
2020-11-01
2020-05-01
9
NA
NA
1
2020-12-01
2020-05-01
9
NA
NA
1
2021-01-01
2020-05-01
9
NA
NA
1
2021-02-01
2020-05-01
9
2021
3
1
2021-03-01
2999-01-01
NA
2021
4
1
2021-04-01
2999-01-01
NA
NA
NA
1
2021-05-01
2021-04-01
4
NA
NA
1
2021-06-01
2021-04-01
4
NA
NA
1
2021-07-01
2021-04-01
4
NA
NA
1
2021-08-01
2021-04-01
4
2021
9
1
2021-09-01
2999-01-01
NA
2021
11
2
2021-11-01
2999-01-01
NA
NA
NA
2
2021-12-01
2021-11-01
1
2022
1
2
2022-01-01
2999-01-01
NA
Since I`d like to apply this to a much larger dataset (~ 300k rows), how could I rewrite it in a vectorised way so it´s more efficient (and elegant maybe)?
Thanks!
You can apply the following code using padr and zoo packages.
This idea is to:
Add missing dates with the padr::pad() function.
Remove unwanted lines (non-integer Id values)
Create na and grp columns to identify rows added in 1.
Group by grp and create a column cant_meses_imp to count the number of consecutive na in each group
Select only desired columns
library(dplyr)
library(padr)
library(zoo)
filas %>%
pad(by = "fecha") %>% # add missing dates
mutate(Id = na.approx(Id)) %>% # interpolate NA values in Id column
subset(Id%%1 == 0) %>% # Keep only Id interger
# This part is for generating the cant_meses_imp column
mutate(na = ifelse(is.na(Mes), 1, 0),
grp = rle(na)$lengths %>% {rep(seq(length(.)), .)}) %>%
group_by(grp) %>%
mutate(cant_meses_imp = ifelse(na == 0, NA, n())) %>%
ungroup() %>%
select(-c(na, grp))
The code does not reproduce exactly the fecha column as there is no guidelines for its values.

How to merge two columns in R?

newdf=data.frame(id=c(1,3,2),admission=c("2020-05-18","2020-04-30","2020-05-08"),
vent=c("mechanical_vent","self_vent","mechanical_vent"))
newdf$admission=as.Date(newdf$admission)
newdf1=data.frame(id=c(1,3,1,2,1,3,2,2),
date=c("2020-05-19","2020-05-02","2020-05-20","2020-05-09","2020-05-21","2020-05-04","2020-05-10","2020-05-11"),
vent=c("self_vent","mechanical_vent","mechanical_vent","mechanical_vent","self_vent","mechanical_vent","mechanical_vent","self_vent"))
newdf1$date=as.Date(newdf1$date)
newdf=newdf %>% group_by(id) %>% bind_rows(newdf,newdf1)
newdf$dates=paste(newdf$admission,newdf$date)
I want to merge admission and date columns as dates. I used paste function but it gives output with NA values. I have attached image of data set herewith. Could you please suggest a method to solve this?
If you want to transfer the dates from admissionto date, where dateis NA, this will work:
newdf %>%
mutate(across(c(admission, date), ~ as.character(.))) %>%
mutate(date = ifelse(is.na(date), admission, date))
We could use pmax:
newdf$dates <- pmax(newdf$admission, newdf$date, na.rm = TRUE)
Output:
id admission vent date dates
<dbl> <date> <chr> <date> <date>
1 1 2020-05-18 mechanical_vent NA 2020-05-18
2 3 2020-04-30 self_vent NA 2020-04-30
3 2 2020-05-08 mechanical_vent NA 2020-05-08
4 1 2020-05-18 mechanical_vent NA 2020-05-18
5 3 2020-04-30 self_vent NA 2020-04-30
6 2 2020-05-08 mechanical_vent NA 2020-05-08
7 1 NA self_vent 2020-05-19 2020-05-19
8 3 NA mechanical_vent 2020-05-02 2020-05-02
9 1 NA mechanical_vent 2020-05-20 2020-05-20
10 2 NA mechanical_vent 2020-05-09 2020-05-09
11 1 NA self_vent 2020-05-21 2020-05-21
12 3 NA mechanical_vent 2020-05-04 2020-05-04
13 2 NA mechanical_vent 2020-05-10 2020-05-10
14 2 NA self_vent 2020-05-11 2020-05-11
You can use coalesce -
library(dplyr)
newdf %>% ungroup %>% mutate(dates = coalesce(admission, date))
# id admission vent date dates
# <dbl> <date> <chr> <date> <date>
# 1 1 2020-05-18 mechanical_vent NA 2020-05-18
# 2 3 2020-04-30 self_vent NA 2020-04-30
# 3 2 2020-05-08 mechanical_vent NA 2020-05-08
# 4 1 2020-05-18 mechanical_vent NA 2020-05-18
# 5 3 2020-04-30 self_vent NA 2020-04-30
# 6 2 2020-05-08 mechanical_vent NA 2020-05-08
# 7 1 NA self_vent 2020-05-19 2020-05-19
# 8 3 NA mechanical_vent 2020-05-02 2020-05-02
# 9 1 NA mechanical_vent 2020-05-20 2020-05-20
#10 2 NA mechanical_vent 2020-05-09 2020-05-09
#11 1 NA self_vent 2020-05-21 2020-05-21
#12 3 NA mechanical_vent 2020-05-04 2020-05-04
#13 2 NA mechanical_vent 2020-05-10 2020-05-10
#14 2 NA self_vent 2020-05-11 2020-05-11

Resources