dplyr to summarize by groups while allocating hours across dates and increments - r

I have a df with timesheet data and am looking for an easy way to summarize it. My data looks like df1 and I want to summarize it as df2. I am having a hard time devising a way to create the increments and allocate hours across them. The tricky part is allocating the hours that span across dates, ID 1 and 3, for example.
df1
ID Garage Unit_Name START_DATE_TIME END_DATE_TIME
<chr> <chr> <chr> <dttm> <dttm>
1 A Truck 1/26/2015 21:00 1/27/2015 7:00
2 B Truck 5/13/2015 6:00 5/13/2015 16:00
3 C Car 8/21/2015 21:00 8/22/2015 7:00
6 C Car 8/21/2015 11:00 8/21/2015 21:00
structure(list(ID = c("<chr>", "1", "2", "3", "6", NA, NA, NA,
NA, NA, NA), Garage = c("<chr>", "A", "B", "C", "C", NA, NA,
NA, NA, NA, NA), Unit_Name = c("<chr>", "Truck", "Truck", "Car",
"Car", NA, NA, NA, NA, NA, NA), START_DATE_TIME = c("<dttm>",
"1/26/2015 21:00", "5/13/2015 6:00", "8/21/2015 21:00", "8/21/2015 11:00",
NA, NA, NA, NA, NA, NA), END_DATE_TIME = c("<dttm>", "1/27/2015 7:00",
"5/13/2015 16:00", "8/22/2015 7:00", "8/21/2015 21:00", NA, NA,
NA, NA, NA, NA)), .Names = c("ID", "Garage", "Unit_Name", "START_DATE_TIME",
"END_DATE_TIME"), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame"), spec = structure(list(cols = structure(list(
ID = structure(list(), class = c("collector_character", "collector"
)), Garage = structure(list(), class = c("collector_character",
"collector")), Unit_Name = structure(list(), class = c("collector_character",
"collector")), START_DATE_TIME = structure(list(), class = c("collector_character",
"collector")), END_DATE_TIME = structure(list(), class = c("collector_character",
"collector"))), .Names = c("ID", "Garage", "Unit_Name", "START_DATE_TIME",
"END_DATE_TIME")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
df2
Garage Unit_Name Date Increment Hours
<chr> <chr> <dttm> <chr> <dbl>
A Truck 1/26/2015 18:01-00:00 3
A Truck 1/27/2015 00:01-6:00 6
A Truck 1/27/2015 6:01-12:00 1
B Truck 5/13/2015 6:01-12:00 6
B Truck 5/13/2015 12:01-18:00 4
C Car 8/21/2015 6:01-12:00 1
C Car 8/21/2015 12:01-18:00 6
C Car 8/21/2015 18:01-00:00 6
C Car 8/22/2015 00:01-6:00 6
C Car 8/23/2015 6:01-12:00 1

library(tidyverse)
library(lubridate)
times=c("00:00","06:00","12:00","18:00")
times1=c("00:01","06:01","12:01","18:01")
df1%>%
group_by(Garage,Unit_Name)%>%
mutate(size=n())%>%
summarise(START_DATE_TIME=min(START_DATE_TIME),
END_DATE_TIME=max(END_DATE_TIME))%>%
mutate(S=mdy_hm(START_DATE_TIME),
b=floor(hour(S)/24*4)+1,
m=ymd_hm(paste(format(S,"%F"),get("times",.GlobalEnv)[b])),
n=ymd_hm(paste(format(S,"%F"),get("times",.GlobalEnv)[(b+1)%%4%>%replace(.,.==0,4)]))%>%
if_else(m>.,.+days(1),.),
rem=as.numeric(mdy_hm(END_DATE_TIME)-n),
HOURS=list(as.numeric(c(n-S,rep(6,rem%/%6),rem%%6))))%>%
unnest()%>%
mutate(Date=S+hours(cumsum(lag(HOURS,default = 0))),
b=floor(hour(Date)/24*4)+1,
increament=paste0(get("times1",.GlobalEnv)[b],"-",
get("times",.GlobalEnv)[replace(d<-(b+1)%%4,d==0,4)]),
Date=as.Date(Date))%>%
select(Garage,Date,HOURS,increament)
Groups: Garage [3]
Garage Date HOURS increament
<chr> <date> <dbl> <chr>
1 A 2015-01-26 3. 18:01-00:00
2 A 2015-01-27 6. 00:01-06:00
3 A 2015-01-27 1. 06:01-12:00
4 B 2015-05-13 6. 06:01-12:00
5 B 2015-05-13 4. 12:01-18:00
6 C 2015-08-21 1. 06:01-12:00
7 C 2015-08-21 6. 12:01-18:00
8 C 2015-08-21 6. 18:01-00:00
9 C 2015-08-22 6. 00:01-06:00
10 C 2015-08-22 1. 06:01-12:00

Related

R:Sorting rows with time within multiple time interval

I want to pick up rows of which time data is between multiple intervals.
The data frame is like this:
dputs
structure(list(ID = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B"), score_time = c("2022/09/01 9:00:00", "2022/09/02 18:00:00",
"2022/09/03 12:00:00", NA, NA, "2022/09/15 18:00:00", "2022/09/18 20:00:00",
NA, NA, NA), score = c(243, 232, 319, NA, NA, 436, 310, NA, NA,
NA), treatment_start = c(NA, NA, NA, "2022/09/02 8:00:00", "2022/09/03 11:00:00",
NA, NA, "2022/09/15 8:00:00", "2022/09/16 14:00:00", "2022/09/16 23:00:00"
), treatment_end = c(NA, NA, NA, "2022/09/02 22:00:00", "2022/09/09 12:00:00",
NA, NA, "2022/09/16 2:00:00", "2022/09/16 22:00:00", "2022/09/17 0:00:00"
)), row.names = c(NA, -10L), spec = structure(list(cols = list(
ID = structure(list(), class = c("collector_character", "collector"
)), score_time = structure(list(), class = c("collector_character",
"collector")), score = structure(list(), class = c("collector_double",
"collector")), treatment_start = structure(list(), class = c("collector_character",
"collector")), treatment_end = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x6000000190b0>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
ID score_time score treatment_start treatment_end
<chr> <chr> <dbl> <chr> <chr>
1 A 2022/09/01 9:00:00 243 NA NA
2 A 2022/09/02 18:00:00 232 NA NA
3 A 2022/09/03 12:00:00 319 NA NA
4 A NA NA 2022/09/02 8:00:00 2022/09/02 22:00:00
5 A NA NA 2022/09/03 11:00:00 2022/09/09 12:00:00
6 B 2022/09/15 18:00:00 436 NA NA
7 B 2022/09/18 20:00:00 310 NA NA
8 B NA NA 2022/09/15 8:00:00 2022/09/16 2:00:00
9 B NA NA 2022/09/16 14:00:00 2022/09/16 22:00:00
10 B NA NA 2022/09/16 23:00:00 2022/09/17 0:00:00
Multiple score values are given for each ID with the measurement time.
And each ID has more than one information of treatment duration shown by start and end time.
My target is score values that are measured during treatment periods.
I tried with the package lubridate and tidyverse to mutate intervals but could not apply "%in%" method.
Here is my attempt until putting intervals in the same rows with score values.
data %>%
mutate(trt_interval = interval(start = treatment_start, end = treatment_end)) %>%
group_by(ID) %>%
mutate(num = row_number()) %>%
pivot_wider(names_from = num, names_prefix = "intvl", values_from = trt_interval) %>%
fill(c(intvl1:last_col()), .direction = "up")
Desired output is like this.
(The first score of A and the last score of B dismissed because their score_time are out of interval.)
ID score
<chr> <dbl>
1 A 232
2 A 319
3 B 436
I want to know the smarter way to put data in a row and how to apply "%in%" for multiple intervals.
Sorry that the question is not qualified and include multiple steps but any advices will be a great help for me.
Hi I would first create two seperate data frames. One for the scores and one for the intervalls. Then would I join them both and filter the score that are within an treatment intervall.
data_score <- data %>%
filter(!is.na(score_time)) %>%
select(-starts_with("treat")) %>%
mutate(score_time = ymd_hms(score_time))
data_score
data_interval <- data %>%
filter(is.na(score_time)) %>%
select(ID,starts_with("treat")) %>%
mutate(trt_interval = interval(start = treatment_start, end = treatment_end))
data_score %>%
inner_join(
data_interval
) %>%
filter(
lubridate::`%within%`(score_time,trt_interval )
)
Hope this helps!!

How to join tables on prefix equality?

I have a table with prefixes (here in csv format):
PREFIX,LABEL
A,Infectious diseases
B,Infectious diseases
C,Tumor
D1,Tumor
D2,Tumor
D31,Tumor
D32,Tumor
D33,Blood disorder
D4,Blood disorder
D5,Blood disorder
And I want to join it with this one:
AGE,DEATH_CODE
67,A02
85,D318
75,C007+X
62,D338
To get obviously:
AGE,LABEL
67,Infectious diseases
85,Tumor
75,Tumor
62,Blood disorder
I know how to do that with SQL and LIKE but not with tidyverse left_join or base R.
Dput of data
Table 1: CIM_CODES
structure(list(PREFIX = c("A", "B", "C", "D1", "D2", "D31", "D32",
"D33", "D4", "D5"), LABEL = c("Infectious diseases", "Infectious diseases",
"Tumor", "Tumor", "Tumor", "Tumor", "Tumor", "Blood disorder",
"Blood disorder", "Blood disorder")), row.names = c(NA, -10L), spec = structure(list(
cols = list(PREFIX = structure(list(), class = c("collector_character",
"collector")), LABEL = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x000002527d306190>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Table 2: DEATH_CAUSES
structure(list(AGE = c(67, 85, 75, 62), DEATH_CODE = c("A02",
"D318", "C007+X", "D338")), row.names = c(NA, -4L), spec = structure(list(
cols = list(AGE = structure(list(), class = c("collector_double",
"collector")), DEATH_CODE = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x0000025273898c60>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
You could do a partial string match that has the lowest difference between the two columns:
library(tidyverse)
DEATH_CAUSES |>
mutate(LABEL = map_chr(DEATH_CODE,
~CIM_CODES$LABEL[
which.min(stringdist::stringdist(.x, CIM_CODES$PREFIX))
]))
#> # A tibble: 4 x 3
#> AGE DEATH_CODE LABEL
#> <dbl> <chr> <chr>
#> 1 67 A02 Infectious diseases
#> 2 85 D318 Tumor
#> 3 75 C007+X Tumor
#> 4 62 D338 Blood disorder
UPDATE
not using the stringdist package as requested.
library(tidyverse)
get_match <- function(code, prefix, target){
map(code, \(x){
map(prefix, \(y){
grepl(paste0("^", y), x)
})
}) |>
map_chr(\(z) target[unlist(z) |> which()] )
}
DEATH_CAUSES |>
mutate(LABEL = get_match(DEATH_CAUSES$DEATH_CODE,
CIM_CODES$PREFIX,
CIM_CODES$LABEL))
#> # A tibble: 4 x 3
#> AGE DEATH_CODE LABEL
#> <dbl> <chr> <chr>
#> 1 67 A02 Infectious diseases
#> 2 85 D318 Tumor
#> 3 75 C007+X Tumor
#> 4 62 D338 Blood disorder
EDIT
how to do this with a join:
library(tidyverse)
library(fuzzyjoin)
fuzzy_left_join(DEATH_CAUSES,
CIM_CODES,
by = c("DEATH_CODE" = "PREFIX"),
str_detect)
#> # A tibble: 4 x 4
#> AGE DEATH_CODE PREFIX LABEL
#> <dbl> <chr> <chr> <chr>
#> 1 67 A02 A Infectious diseases
#> 2 85 D318 D31 Tumor
#> 3 75 C007+X C Tumor
#> 4 62 D338 D33 Blood disorder
My code below, I used mysql:
select a.age, p.label
from prefix p
left join age a on a.death_code like CONCAT("%",p.prefix,"%");
You can refer here: how to use a like with a join in sql?

R, find character string from vector, return true/false column and a multiple choice column

This question is very similar to one I had previously asked here:
R, find character string from vector, create new TRUE/FALSE columns
and that solution worked perfectly, but now I have an added twist.
df<-structure(list(Date = c("5/20/2019", "5/20/2019", "5/20/2019",
"5/20/2019", "5/20/2019", "5/20/2019", "5/20/2019", "5/20/2019",
"10/22/2018", "5/20/2019"), ESRD_1 = c("CKD (chronic kidney disease), stage III [N18.30]; CKD (chronic kidney disease), stage III [N18.30]; Type 2 diabetes mellitus [E11.9]",
NA, NA, NA, NA, NA, NA, NA, NA, NA), ESRD_10 = c(NA, NA, NA,
NA, NA, NA, NA, NA, "End stage renal disease on dialysis [N18.6, Z99.2]; End stage renal disease on dialysis [N18.6, Z99.2]; Type 2 diabetes mellitus with stage 4 chronic kidney disease, with long-term current use of insulin [E11.22, N18.4, Z79.4]; Type 2 diabetes mellitus with stage 4 chronic kidney disease, with long-term current use of insulin [E11.22, N18.4, Z79.4]; Type 2 diabetes mellitus with stage 4 chronic kidney disease, with long-term current use of insulin [E11.22, N18.4, Z79.4]",
NA), ESRD_11 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "ESRD (end stage renal disease) on dialysis [N18.6, Z99.2]; ESRD (end stage renal disease) on dialysis [N18.6, Z99.2]; Peripheral vascular disease due to secondary diabetes [E13.51]; Type 2 diabetes mellitus with chronic kidney disease on chronic dialysis, with long-term current use of insulin [E11.22, N18.6, Z99.2, Z79.4]; Type 2 diabetes mellitus with chronic kidney disease on chronic dialysis, with long-term current use of insulin [E11.22, N18.6, Z99.2, Z79.4]; Type 2 diabetes mellitus with chronic kidney disease on chronic dialysis, with long-term current use of insulin [E11.22, N18.6, Z99.2, Z79.4]"
), ESRD_12 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), ESRD_13 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), ESRD_2 = c(NA, "CKD (chronic kidney disease), stage III [N18.30]; CKD (chronic kidney disease), stage III [N18.30]; Diabetic gastroparesis [E11.43, K31.84]; Type 2 diabetes mellitus [E11.9]",
NA, NA, NA, NA, NA, NA, NA, NA), ESRD_3 = c(NA, NA, "CKD (chronic kidney disease), stage III [N18.30]; CKD (chronic kidney disease), stage III [N18.30]; Type 2 diabetes mellitus [E11.9]",
NA, NA, NA, NA, NA, NA, NA), ESRD_4 = c(NA, NA, NA, "CKD (chronic kidney disease), stage III [N18.30]; CKD (chronic kidney disease), stage III [N18.30];",
NA, NA, NA, NA, NA, NA), ESRD_5 = c(NA, NA, NA, NA, "CKD (chronic kidney disease), stage II [N18.20]; CKD (chronic kidney disease), stage II [N18.20]; Type 2 diabetes mellitus [E11.9]",
NA, NA, NA, NA, NA), ESRD_6 = c(NA, NA, NA, NA, NA, "CKD (chronic kidney disease), stage III [N18.30]; CKD (chronic kidney disease), stage III [N18.30]; Type 2 diabetes mellitus [E11.9]",
NA, NA, NA, NA), ESRD_7 = c(NA, NA, NA, NA, NA, NA, "CKD (chronic kidney disease), stage III [N18.30]; CKD (chronic kidney disease), stage III [N18.30]; Diabetic gastroparesis [E11.43, K31.84]; Type 2 diabetes mellitus [E11.9]",
NA, NA, NA), ESRD_8 = c(NA, NA, NA, NA, NA, NA, NA, "CKD (chronic kidney disease), stage II [N18.20]; CKD (chronic kidney disease), stage II [N18.20];",
NA, NA), ESRD_9 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), spec = structure(list(
cols = list(Date = structure(list(), class = c("collector_character",
"collector")), ESRD_1 = structure(list(), class = c("collector_character",
"collector")), ESRD_10 = structure(list(), class = c("collector_character",
"collector")), ESRD_11 = structure(list(), class = c("collector_character",
"collector")), ESRD_12 = structure(list(), class = c("collector_logical",
"collector")), ESRD_13 = structure(list(), class = c("collector_logical",
"collector")), ESRD_2 = structure(list(), class = c("collector_character",
"collector")), ESRD_3 = structure(list(), class = c("collector_character",
"collector")), ESRD_4 = structure(list(), class = c("collector_character",
"collector")), ESRD_5 = structure(list(), class = c("collector_character",
"collector")), ESRD_6 = structure(list(), class = c("collector_character",
"collector")), ESRD_7 = structure(list(), class = c("collector_character",
"collector")), ESRD_8 = structure(list(), class = c("collector_character",
"collector")), ESRD_9 = structure(list(), class = c("collector_logical",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
I want to look through those "ESRD" columns and look for specific patterns, creating TRUE/FALSE columns of whether specific patterns are found. For instance R would look through the ESRD columns for either the word "diabetes" or the code "E11.9" and it would create a true/false diabetes column based on what it found.
I could do that, the code would be very similar to what was posted in that other question.
My question now is: for one of those particular columns I am creating now, it wouldn't just be true/false. I'd like to create a column that was labeled "CKD" that has possible values of: "No" (if a code was not found), "Stage 1", "Stage 2", "stage 3", "Stage 4", "Stage 5", "End Stage Renal Disease" and "Unspecified" (a specific code for unspecified). Following these rules:
So my end result would look like this:
For the flag, we can use str_detect with if_any, then create the 'N_val' column by extracting the substring that matches 'N' followed by one or digits, (\\d+), a dot (\\.) and one or more digits, coalecse them and join on a key/val dataset to return the corresponding 'stage'
library(dplyr)
library(stringr)
library(tidyr)
library(purrr)
keydat <- tibble(N_val = str_c('N18.', c(1:6, 9)),
CRD = c(str_c('Stage ', 1:5), 'ESRD', 'unspecified'))
out <- df %>%
mutate(Diabetes = replace_na(if_any(starts_with("ESRD"),
~ str_detect(., regex('diabetes|E11\\.9', ignore_case = TRUE))), FALSE),
N_val = invoke(coalesce, across(starts_with("ESRD"),
~ str_remove(str_extract(., "N\\d+\\.\\d+"), "0+$")))) %>%
left_join(keydat)
-output
> out$Diabetes
[1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
> out$CRD
[1] "Stage 3" "Stage 3" "Stage 3" "Stage 3" "Stage 2" "Stage 3" "Stage 3" "Stage 2" "ESRD" "ESRD"
I wanted to give it a shot aswell using unite from tidyr,
library(tidyverse)
df %>% unite(
col = "dummy",
na.rm = TRUE,
-"Date"
) %>% mutate(
diabetes = case_when(
str_detect(dummy, pattern = "diabetes|E11.9") ~ TRUE,
TRUE ~ FALSE
),
CKD = str_extract(
dummy, pattern = "stage I+|stage [:digit:]+|ESRD|CKD, unspecified"
)
) %>% select(-"dummy") %>% mutate(
dummy = str_extract(
CKD, pattern = "I+"
) %>% as.roman() %>% as.numeric(),
CKD = str_remove(
CKD, pattern = "I+"
)
) %>% unite(
col = "CKD",
na.rm = TRUE,
CKD:dummy,sep = ""
)
# A tibble: 10 x 3
Date diabetes CKD
<chr> <lgl> <chr>
1 5/20/2019 TRUE stage 3
2 5/20/2019 TRUE stage 3
3 5/20/2019 TRUE stage 3
4 5/20/2019 FALSE stage 3
5 5/20/2019 TRUE stage 2
6 5/20/2019 TRUE stage 3
7 5/20/2019 TRUE stage 3
8 5/20/2019 FALSE stage 2
9 10/22/2018 TRUE stage 4
10 5/20/2019 TRUE ESRD
Update: Converted Romans into Numeric.

how to keep the variables that is match another dataset's one col

I have a dataset :
df<-structure(list(EDC_file_name = c("e1", "e2", "e3",
"e4", "e5", "e6", "e7", "e8"), Tab = c("Demographics",
"Demographics", "PatientRegister", "PatientRegister", "PatientRegister",
"PatientRegister", "PatientConsent", "PatientConsent"), DatasetName = c("Demographics Merged",
NA, "Patient Register", NA, NA, NA, "Patient Consent", NA), GroupVar1 = c( "Subject",
NA, "Subject", NA, NA, NA,
NA, NA)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
then I have another dataset
structure(list(varlist = c("Tab", "DatasetName"), lable = c("Tab",
"Name of Dataset")), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"))
how can I only keep the variables that is within varlist$Varlist?
df_New<- df %>% select( %in% ?) or filter ...?. Sorry for such simple question.
You can use all_of in select :
library(dplyr)
df %>% select(all_of(varlist$varlist))
# Tab DatasetName
# <chr> <chr>
#1 Demographics Demographics Merged
#2 Demographics NA
#3 PatientRegister Patient Register
#4 PatientRegister NA
#5 PatientRegister NA
#6 PatientRegister NA
#7 PatientConsent Patient Consent
#8 PatientConsent NA
Or if there are some values in varlist$varlist which are not present as column name in df use any_of.
df %>% select(any_of(varlist$varlist))

Create function based on condition of another column R

I have a df attached and I would like to create a loop that would apply a specific sequence (set by the user in R) based on conditions in column "x9". I would like to be able to set the sequence myself so I can try different sequences for this data frame, I will explain more below.
I have a df of losses and wins for an algorithm. On the first instance of a win I want to take the value in "x9" and divide it by the sequence value. I want to keep iterating through the sequence values until a loss is achieved. Once a loss is achieved the sequence will restart, when "x9" <0 to be specific.
I would like to create the two columns in my example "Risk Control" and "Sequence". Ideally I would like the function to iterate through the entire data frame so I can compare the column "x9" to "Risk Control".
Sample Data:
structure(list(x1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), x2 = c("2016.01.04 01:05",
"2016.01.04 01:12", "2016.01.04 01:13", "2016.01.04 01:17", "2016.01.04 01:20",
"2016.01.04 01:23", "2016.01.04 01:25", "2016.01.04 01:30", "2016.01.04 01:31",
"2016.01.04 01:59"), x3 = c("buy", "close", "buy", "close", "buy",
"close", "buy", "t/p", "buy", "close"), x4 = c(1, 1, 2, 2, 3,
3, 4, 4, 5, 5), x5 = c(8.46, 8.46, 8.6, 8.6, 8.69, 8.69, 8.83,
8.83, 9, 9), x6 = c(1.58873, 1.58955, 1.5887, 1.58924, 1.58862,
1.58946, 1.58802, 1.58902, 1.58822, 1.58899), x7 = c(1.57873,
1.57873, 1.5787, 1.5787, 1.57862, 1.57862, 1.57802, 1.57802,
1.57822, 1.57822), x8 = c(1.58973, 1.58973, 1.5897, 1.5897, 1.58962,
1.58962, 1.58902, 1.58902, 1.58922, 1.58922), x9 = c(0, 478.69,
0, 320.45, 0, 503.7, 0, 609.3, 0, 478.19), x10 = c(30000, 30478.69,
30478.69, 30799.14, 30799.14, 31302.84, 31302.84, 31912.14, 31912.14,
32390.33), `Risk Control` = c(NA, 478.69, NA, 320.45, NA, 251.85,
NA, 304.65, NA, 159.3966667), ...12 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Sequence = c(NA, 1, NA, 1, NA, 2, NA, 2, NA,
3)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
), spec = structure(list(cols = list(x1 = structure(list(), class = c("collector_double",
"collector")), x2 = structure(list(), class = c("collector_character",
"collector")), x3 = structure(list(), class = c("collector_character",
"collector")), x4 = structure(list(), class = c("collector_double",
"collector")), x5 = structure(list(), class = c("collector_double",
"collector")), x6 = structure(list(), class = c("collector_double",
"collector")), x7 = structure(list(), class = c("collector_double",
"collector")), x8 = structure(list(), class = c("collector_double",
"collector")), x9 = structure(list(), class = c("collector_double",
"collector")), x10 = structure(list(), class = c("collector_double",
"collector")), `Risk Control` = structure(list(), class = c("collector_double",
"collector")), ...12 = structure(list(), class = c("collector_logical",
"collector")), Sequence = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"))
In short I need assistance in:
1.Constructing a sequence to apply to my df, would like to be able to alter this sequence to try different sequences;
2.Take values in "x9" and create a new column that would apply the sequence values set. The sequence is taking the value in "x9" and dividing it by the sequence number
3.Construct a loop to iterate through the entire df to apply this over all of the values of the dataframe.
In the example above I have manually created "Risk Control" and the sample "Sequence". The sequence in the example is 1,1,2,2,3,3,4. The sequence in the sample uses each number twice before iterating to the next number. Once a loss is achieved in "x9" the sequence restarts.
I would appreciate any help with this function and loop. Thank you
Starting with input data only (not desired columns)
df1 <- df %>% select(1:10)
Reducing this data to only data with x9 not zero
This may not be intended and the user may prefer to key off an x3 event, but hopefully is illustrative.
df1 <- df1 %>% filter(x9 != 0)
Initiate seq column and insert dummy data.
df1$seq <- c(1, NA, 1, NA, NA)
Fill in, thanks to Allan Cameron for this answer to my post link
df1$seq <- unlist(sapply(diff(c(which(!is.na(df1$seq)), nrow(df1) + 1)), seq))
Apply user's rule 2:
df1$risk_control <- df1$x9 / df1$seq
# A tibble: 5 x 12
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 seq risk_control
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 2 2016.01.04 0~ close 1 8.46 1.59 1.58 1.59 479. 30479. 1 479.
2 4 2016.01.04 0~ close 2 8.6 1.59 1.58 1.59 320. 30799. 2 160.
3 6 2016.01.04 0~ close 3 8.69 1.59 1.58 1.59 504. 31303. 1 504.
4 8 2016.01.04 0~ t/p 4 8.83 1.59 1.58 1.59 609. 31912. 2 305.
5 10 2016.01.04 0~ close 5 9 1.59 1.58 1.59 478. 32390. 3 159.
Recombining this with the original data can be performed if desired with:
df2 <- dplyr::left_join(df[, -c(11:13)], df1)
# A tibble: 10 x 12
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 seq risk_control
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 1 2016.01.04 ~ buy 1 8.46 1.59 1.58 1.59 0 30000 NA NA
2 2 2016.01.04 ~ close 1 8.46 1.59 1.58 1.59 479. 30479. 1 479.
3 3 2016.01.04 ~ buy 2 8.6 1.59 1.58 1.59 0 30479. NA NA
4 4 2016.01.04 ~ close 2 8.6 1.59 1.58 1.59 320. 30799. 2 160.
5 5 2016.01.04 ~ buy 3 8.69 1.59 1.58 1.59 0 30799. NA NA
6 6 2016.01.04 ~ close 3 8.69 1.59 1.58 1.59 504. 31303. 1 504.
7 7 2016.01.04 ~ buy 4 8.83 1.59 1.58 1.59 0 31303. NA NA
8 8 2016.01.04 ~ t/p 4 8.83 1.59 1.58 1.59 609. 31912. 2 305.
9 9 2016.01.04 ~ buy 5 9 1.59 1.58 1.59 0 31912. NA NA
10 10 2016.01.04 ~ close 5 9 1.59 1.58 1.59 478. 32390. 3 159.

Resources