For Loop List Not Storing Values in R

I have a data set of behaviours performed by individuals repeatedly at different temperatures, e.g.:
ID Test Behaviour Temperature
A12.4.2 ONE 8.64 4
A12.4.2 TWO 7.63 5
A6.3.3 ONE 1.81 3
A6.3.3 TWO 2.47 9
B12.4.1 ONE 1.17 12
B12.4.1 TWO 3.96 2
E9.4.2 ONE 13.04 13
E9.4.2 TWO 9.51 6
...
I use the following code to randomly subset that data set and then run a repeatability analysis on each subset, producing R values and CI values at the end.
P <- 10000
R_value <- numeric(length = P)
CI_value <- numeric(length = P)
for (i in 1:P) {
  newdata <- Data[Data$ID %in% sample(unique(Data$ID), 16), ]
  m1 <- rptR::rpt(Behaviour ~ Temperature + (1 | ID), grname = "ID",
                  data = newdata, datatype = "Gaussian",
                  nboot = 1000, npermut = 1000)
  R_value[i] <- m1$R
  CI_value[i] <- m1$CI
}
Unfortunately this doesn't seem to be working. When I call R_value or CI_value, I am greeted with a vector of zeros. Upon calling newdata or m1, R tells me the object cannot be found.
When I run the repeatability analysis outside the for loop, everything works fine.
Can anyone help?

Your code does run. The problem is in the sample() call: the example data contain only four unique IDs, so sample(unique(Data$ID), 16) fails because it asks for a sample larger than the population. I have changed it to
sample(unique(Data$ID), 4), and then it runs. Alternatively, you could sample with replacement, as in sample(unique(Data$ID), 16, replace = TRUE); that works too. I have also reduced the numbers in nboot and npermut to keep the example fast.
library(rptR)
Data <- read.table(text = "
ID Test Behaviour Temperature
A12.4.2 ONE 8.64 4
A12.4.2 TWO 7.63 5
A6.3.3 ONE 1.81 3
A6.3.3 TWO 2.47 9
B12.4.1 ONE 1.17 12
B12.4.1 TWO 3.96 2
E9.4.2 ONE 13.04 13
E9.4.2 TWO 9.51 6
", header =T)
Data
#> ID Test Behaviour Temperature
#> 1 A12.4.2 ONE 8.64 4
#> 2 A12.4.2 TWO 7.63 5
#> 3 A6.3.3 ONE 1.81 3
#> 4 A6.3.3 TWO 2.47 9
#> 5 B12.4.1 ONE 1.17 12
#> 6 B12.4.1 TWO 3.96 2
#> 7 E9.4.2 ONE 13.04 13
#> 8 E9.4.2 TWO 9.51 6
P <- 10
R_value <- numeric(length = P)
CI_value <- numeric(length = P)
for (i in 1:P) {
  newdata <- Data[Data$ID %in% sample(unique(Data$ID), 4), ]
  m1 <- rptR::rpt(Behaviour ~ Temperature + (1 | ID), grname = "ID",
                  data = newdata, datatype = "Gaussian",
                  nboot = 10, npermut = 10)
  R_value[i] <- m1$R
  CI_value[i] <- m1$CI
}
R_value
#> [[1]]
#> [1] 0.8324396
#>
#> [[2]]
#> [1] 0.8324396
#>
#> [[3]]
#> [1] 0.8324396
#>
#> [[4]]
#> [1] 0.8324396
#>
#> [[5]]
#> [1] 0.8324396
#>
#> [[6]]
#> [1] 0.8324396
#>
#> [[7]]
#> [1] 0.8324396
#>
#> [[8]]
#> [1] 0.8324396
#>
#> [[9]]
#> [1] 0.8324396
#>
#> [[10]]
#> [1] 0.8324396
CI_value
#> [1] 0.95 0.95 0.95 0.95 0.95 0.95 0.95 0.95 0.95 0.95
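One caveat about what gets stored: m1$R is a one-row data frame, which is why R_value prints as a list of data frames above, and m1$CI appears to hold only the confidence level passed to rpt() (hence the row of 0.95s), not the interval bounds themselves. A minimal sketch for flattening the repeatabilities; where the actual interval lives is an assumption that may differ across rptR versions:
# Flatten the list of one-row data frames into a plain numeric vector
R_vec <- unlist(R_value)
R_vec
# The interval bounds live elsewhere in the fitted object, e.g.
# m1$CI_emp (slot name and layout are an assumption; check str(m1))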

Related

Why does a tidymodels survival fit require more than one predictor?

Can a survival model with just the treatment as a predictor be fit
with a tidymodels survival function?
Here I reproduce the example, which uses several predictors, then try
to duplicate it with only one predictor. This fails.
https://www.tidyverse.org/blog/2021/11/survival-analysis-parsnip-adjacent/
has code to fit a survival tidymodel
library(survival)
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx + size + number + strata(enum),
             data = bladder_train)
But with only the treatment in the model, it does not work:
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
Why? What am I missing?
It seems the error has more to do with glmnet than tidymodels. This is the error:
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
#> Error in glmnet::glmnet(data_obj$x, data_obj$y, family = "cox", alpha = alpha, : x should be a matrix with 2 or more columns
Created on 2021-12-30 by the reprex package (v2.0.1)
glmnet needs a matrix with 2 or more columns. Using just rx means you'd have just one column. If I add size as an additional feature, it works just fine.
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
cox_spec <- proportional_hazards(penalty = 0.123) %>%
  set_engine("glmnet")
f_fit <- fit(cox_spec,
             Surv(stop, event) ~ rx + size,
             data = bladder_train)
f_fit
#> parsnip model object
#>
#> Fit time: NA
#>
#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = "cox", alpha = alpha, lambda = lambda)
#>
#> Df %Dev Lambda
#> 1 0 0.00 0.070850
#> 2 1 0.10 0.064560
#> 3 1 0.18 0.058820
#> 4 1 0.24 0.053600
#> 5 1 0.30 0.048840
#> 6 2 0.35 0.044500
#> 7 2 0.43 0.040550
#> 8 2 0.50 0.036940
#> 9 2 0.55 0.033660
#> 10 2 0.60 0.030670
#> 11 2 0.64 0.027950
#> 12 2 0.67 0.025460
#> 13 2 0.70 0.023200
#> 14 2 0.72 0.021140
#> 15 2 0.74 0.019260
#> 16 2 0.75 0.017550
#> 17 2 0.77 0.015990
#> 18 2 0.78 0.014570
#> 19 2 0.79 0.013280
#> 20 2 0.79 0.012100
#> 21 2 0.80 0.011020
#> 22 2 0.81 0.010040
#> 23 2 0.81 0.009151
#> 24 2 0.81 0.008338
#> 25 2 0.82 0.007597
#> 26 2 0.82 0.006922
#> 27 2 0.82 0.006308
#> 28 2 0.82 0.005747
#> 29 2 0.82 0.005237
#> 30 2 0.83 0.004771
#> 31 2 0.83 0.004348
#> 32 2 0.83 0.003961
#> 33 2 0.83 0.003609
#> 34 2 0.83 0.003289
#> 35 2 0.83 0.002997
#> 36 2 0.83 0.002730
#> 37 2 0.83 0.002488
#> 38 2 0.83 0.002267
#> 39 2 0.83 0.002065
#> 40 2 0.83 0.001882
#> 41 2 0.83 0.001715
#> The training data has been saved for prediction.
Created on 2021-12-30 by the reprex package (v2.0.1)
If you want to use just the one feature rx, consider other models, e.g. decision trees:
library(survival)
library(censored)
#> Loading required package: parsnip
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
bladder_train <- bladder[-c(1:3), ]
bladder_test <- bladder[1:3, ]
dt_spec <- decision_tree() %>%
  set_engine("rpart") %>%
  set_mode("censored regression")
f_fit <- fit(dt_spec,
             Surv(stop, event) ~ rx,
             data = bladder_train)
f_fit
#> parsnip model object
#>
#> $rpart
#> n= 337
#>
#> node), split, n, deviance, yval
#> * denotes terminal node
#>
#> 1) root 337 403.0968 1.0000000
#> 2) rx>=1.5 152 166.6335 0.7751669 *
#> 3) rx< 1.5 185 231.2927 1.1946030 *
#>
#> $survfit
#>
#> Call: prodlim::prodlim(formula = form, data = data)
#> Stratified Kaplan-Meier estimator for the conditional event time survival function
#> Discrete predictor variable: rpartFactor (0.775166899958249, 1.19460305107131)
#>
#> $levels
#> [1] "0.775166899958249" "1.19460305107131"
#>
#> attr(,"class")
#> [1] "pecRpart"
Created on 2021-12-30 by the reprex package (v2.0.1)
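If the penalty is not essential, an unpenalized Cox model has no such column requirement. A minimal sketch fitting the single-predictor model directly with survival::coxph, outside tidymodels:
library(survival)
# Unpenalized Cox proportional hazards fit with one predictor;
# coxph() does not build a glmnet-style predictor matrix.
f_cox <- coxph(Surv(stop, event) ~ rx, data = bladder_train)
summary(f_cox)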

Improve parallel performance with batching in a static-dynamic branching pipeline

BLUF: I am struggling to understand how to use batching in the R targets package to improve performance in a static and dynamic branching pipeline processed in parallel using tar_make_future(). I presume that I need to batch within each dynamic branch, but I am unsure how to go about doing that.
Here's a reprex that uses dynamic branching nested inside static branching, similar to what my actual pipeline is doing. It first branches statically for each value in all_types, and then dynamically branches within each category. This code produces 1,000 branches and 1,010 targets total. In the actual workflow I obviously don't use replicate, and the dynamic branches vary in number depending on the type value.
# _targets.R
library(targets)
library(tarchetypes)
library(future)
library(future.callr)
plan(callr)
all_types <- data.frame(type = LETTERS[1:10])
tar_map(values = all_types, names = "type",
  tar_target(
    make_data,
    replicate(100,
      data.frame(x = seq(1000) + rnorm(1000, 0, 5),
                 y = seq(1000) + rnorm(1000, 20, 20)),
      simplify = FALSE
    ),
    iteration = "list"
  ),
  tar_target(
    fit_model,
    lm(make_data),
    pattern = map(make_data),
    iteration = "list"
  )
)
And here's a timing comparison of tar_make() vs tar_make_future() with eight workers:
# tar_destroy()
t1 <- system.time(tar_make())
# tar_destroy()
t2 <- system.time(tar_make_future(workers = 8))
rbind(serial = t1, parallel = t2)
## user.self sys.self elapsed user.child sys.child
## serial 2.12 0.11 25.59 NA NA
## parallel 2.07 0.24 184.68 NA NA
I don't think the user or system fields are useful here since the work gets dispatched to separate R processes, but the elapsed time for the parallel job is about 7 times longer than for the serial job.
I presume this slowdown is caused by the large number of targets. Will batching improve performance in this case, and if so how can I implement batching within the dynamic branch?
You are on the right track with batching. In your case, that is a matter of breaking up your list of 100 datasets into groups of, say, 10 or so. You could do this with a nested list of datasets, but that's a lot of work. Luckily, there is an easier way.
Your question is actually really well-timed. I just wrote some new target factories in tarchetypes that could help. To access them, you will need the development version of tarchetypes from GitHub:
remotes::install_github("ropensci/tarchetypes")
Then, with tar_map2_count(), it will be much easier to batch your list of 100 datasets for each scenario.
library(targets)
tar_script({
  library(broom)
  library(targets)
  library(tarchetypes)
  library(tibble)
  make_data <- function(n) {
    datasets_per_batch <- replicate(
      100,
      tibble(
        x = seq(n) + rnorm(n, 0, 5),
        y = seq(n) + rnorm(n, 20, 20)
      ),
      simplify = FALSE
    )
    tibble(dataset = datasets_per_batch, rep = seq_along(datasets_per_batch))
  }
  tar_map2_count(
    name = model,
    command1 = make_data(n = rows),
    command2 = tidy(lm(y ~ x, data = dataset)), # Need dataset[[1]] in tarchetypes 0.4.0
    values = data_frame(
      scenario = LETTERS[seq_len(10)],
      rows = seq(10, 100, length.out = 10)
    ),
    columns2 = NULL,
    batches = 10
  )
})
tar_make(reporter = "silent")
#> Warning message:
#> `data_frame()` was deprecated in tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
tar_read(model)
#> # A tibble: 2,000 × 8
#> term estimate std.error statistic p.value scenario rows tar_group
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <int>
#> 1 (Intercept) 17.1 12.8 1.34 0.218 A 10 10
#> 2 x 1.39 1.35 1.03 0.333 A 10 10
#> 3 (Intercept) 6.42 14.0 0.459 0.658 A 10 10
#> 4 x 1.75 1.28 1.37 0.209 A 10 10
#> 5 (Intercept) 32.8 7.14 4.60 0.00176 A 10 10
#> 6 x -0.300 1.14 -0.263 0.799 A 10 10
#> 7 (Intercept) 29.7 3.24 9.18 0.0000160 A 10 10
#> 8 x 0.314 0.414 0.758 0.470 A 10 10
#> 9 (Intercept) 20.0 13.6 1.47 0.179 A 10 10
#> 10 x 1.23 1.77 0.698 0.505 A 10 10
#> # … with 1,990 more rows
Created on 2021-12-10 by the reprex package (v2.0.1)
There is also tar_map_rep(), which may be easier if all your datasets are randomly generated, but I am not sure if I am overfitting your use case.
library(targets)
tar_script({
  library(broom)
  library(targets)
  library(tarchetypes)
  library(tibble)
  make_one_dataset <- function(n) {
    tibble(
      x = seq(n) + rnorm(n, 0, 5),
      y = seq(n) + rnorm(n, 20, 20)
    )
  }
  tar_map_rep(
    name = model,
    command = tidy(lm(y ~ x, data = make_one_dataset(n = rows))),
    values = data_frame(
      scenario = LETTERS[seq_len(10)],
      rows = seq(10, 100, length.out = 10)
    ),
    batches = 10,
    reps = 10
  )
})
tar_make(reporter = "silent")
#> Warning message:
#> `data_frame()` was deprecated in tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
tar_read(model)
#> # A tibble: 2,000 × 10
#> term estimate std.error statistic p.value scenario rows tar_batch tar_rep
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <int> <int>
#> 1 (Inter… 37.5 7.50 5.00 0.00105 A 10 1 1
#> 2 x -0.701 1.17 -0.601 0.564 A 10 1 1
#> 3 (Inter… 21.5 9.64 2.23 0.0567 A 10 1 2
#> 4 x -0.213 1.55 -0.138 0.894 A 10 1 2
#> 5 (Inter… 20.6 9.51 2.17 0.0620 A 10 1 3
#> 6 x 1.40 1.79 0.783 0.456 A 10 1 3
#> 7 (Inter… 11.6 11.2 1.04 0.329 A 10 1 4
#> 8 x 2.34 1.39 1.68 0.131 A 10 1 4
#> 9 (Inter… 26.8 9.16 2.93 0.0191 A 10 1 5
#> 10 x 0.288 1.10 0.262 0.800 A 10 1 5
#> # … with 1,990 more rows, and 1 more variable: tar_group <int>
Created on 2021-12-10 by the reprex package (v2.0.1)
Unfortunately, futures do come with overhead. Maybe it will be faster in your case if you try tar_make_clustermq()?
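For reference, a sketch of that route, assuming clustermq is installed; the multicore scheduler here is just one local option (set it before the pipeline runs, e.g. in .Rprofile or _targets.R):
# choose a clustermq scheduler (assumption: local multicore; on an HPC
# system you would use a cluster scheduler such as "slurm" instead)
options(clustermq.scheduler = "multicore")
targets::tar_make_clustermq(workers = 8)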

What is the correct way to use dplyr's slice_sample() within my apply function?

In the below code, I've simulated dice rolls at increasing sample sizes and computed the average roll at each sample size. My lapply function works, but I'm uncomfortable with it since sample_n() has been superseded by slice_sample() in dplyr. I would like to make my code better with a slice_sample() solution rather than sample_n() within the lapply. I think I may have other syntactic errors within the lapply as well. Here is the code:
# Dice
dice <- c(1, 2, 3, 4, 5, 6) # the set of possible outcomes of a dice roll
dice_probs <- c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6) # the probability of each option per roll
dice_df <- data.frame(dice, dice_probs)
# Simulate dice rolls for each of these sample sizes and record the average of the rolls
sample_sizes <- c(10, 25, 50, 100, 1000, 10000, 100000, 1000000, 100000000) # compute at each sample size
output <- lapply(X = sample_sizes, FUN = function(var) {
  obs <- sample_n(dice_df, var, replace = TRUE)
  sample_mean <- mean(obs$dice)
  new.df <- data.frame(sample_mean, var)
  return(new.df)
})
The final step is computing the difference from the expected value, 3.5. I want a column that shows the difference between 3.5 and each sample mean. We should see the difference decrease as the sample size increases.
output <- output %>%
mutate(difference = across(sample_mean, ~3.5 - .x))
When I run this, it's throwing this error:
Error in UseMethod("mutate") :
no applicable method for 'mutate' applied to an object of class "list"
I've tried using sapply but I get a similar error: no applicable method for 'mutate' applied to an object of class "c('matrix', 'array', 'list')"
If it helps, here was my failed attempt at using slice_sample:
output <- lapply(X = sample_sizes, FUN = function(...) {
  obs <- slice_sample(dice_df, ..., .preserve = TRUE)
  sample_mean <- mean(obs$dice)
  new.df <- data.frame(sample_mean, ...)
  return(new.df)
})
I got this error: Error: '...' used in an incorrect context
The output is a list of single-row data.frames. We can bind them with bind_rows() and subtract once, instead of doing it once per list element:
library(dplyr)
bind_rows(output) %>%
  mutate(difference = 3.5 - sample_mean)
sample_mean var difference
1 3.500000 10 0.00000000
2 2.800000 25 0.70000000
3 3.440000 50 0.06000000
4 3.510000 100 -0.01000000
5 3.495000 1000 0.00500000
6 3.502200 10000 -0.00220000
7 3.502410 100000 -0.00241000
8 3.498094 1000000 0.00190600
9 3.500183 100000000 -0.00018332
The n argument of slice_sample() corresponds to sample_n()'s size argument.
And to calculate the differences from your output list, we can use purrr::map instead of dplyr::across:
library(dplyr)
library(purrr)
set.seed(123)
# Dice
dice <- c(1, 2, 3, 4, 5, 6) # the set of possible outcomes of a dice roll
dice_probs <- c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6) # the probability of each option per roll
dice_df <- data.frame(dice, dice_probs)
# Simulate dice rolls for each of these sample sizes and record the average of the rolls
sample_sizes <- c(10, 25, 50, 100, 1000, 10000, 100000, 1000000, 100000000) # compute at each sample size
output <- lapply(X = sample_sizes, FUN = function(var) {
  obs <- slice_sample(dice_df, n = var, replace = TRUE)
  sample_mean <- mean(obs$dice)
  new.df <- data.frame(sample_mean, var)
  return(new.df)
})
output %>%
  map(~ 3.5 - .x$sample_mean)
#> [[1]]
#> [1] -0.5
#>
#> [[2]]
#> [1] 0.42
#>
#> [[3]]
#> [1] -0.04
#>
#> [[4]]
#> [1] -0.34
#>
#> [[5]]
#> [1] 0.025
#>
#> [[6]]
#> [1] 0.0317
#>
#> [[7]]
#> [1] 0.00416
#>
#> [[8]]
#> [1] -2.6e-05
#>
#> [[9]]
#> [1] -4.405e-05
Created on 2021-08-02 by the reprex package (v0.3.0)
Alternatively, we can use purrr::map_df and add a diff column inside each tibble, as proposed by Martin Gal in the comments:
output %>%
  map_df(~ tibble(.x, diff = 3.5 - .x$sample_mean))
#> # A tibble: 9 x 3
#> sample_mean var diff
#> <dbl> <dbl> <dbl>
#> 1 2.6 10 0.9
#> 2 3.28 25 0.220
#> 3 3.66 50 -0.160
#> 4 3.5 100 0
#> 5 3.53 1000 -0.0270
#> 6 3.50 10000 -0.00180
#> 7 3.50 100000 -0.00444
#> 8 3.50 1000000 -0.000226
#> 9 3.50 100000000 -0.0000669
Here is a base R way -
transform(do.call(rbind, output), difference = 3.5 - sample_mean)
# sample_mean var difference
#1 3.80 10 -0.300000
#2 3.44 25 0.060000
#3 3.78 50 -0.280000
#4 3.30 100 0.200000
#5 3.52 1000 -0.015000
#6 3.50 10000 -0.004200
#7 3.50 100000 -0.004370
#8 3.50 1000000 0.002696
#9 3.50 100000000 0.000356
If you just need the difference values, you can do -
3.5 - sapply(output, `[[`, 'sample_mean')
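A compact variant of the same idea, collecting the means and differences in one step with purrr::map_dfr (a sketch assuming dplyr and purrr are loaded):
library(dplyr)
library(purrr)
results <- map_dfr(sample_sizes, function(n) {
  obs <- slice_sample(dice_df, n = n, replace = TRUE)
  tibble(sample_mean = mean(obs$dice), var = n,
         difference = 3.5 - mean(obs$dice))
})
results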

How to index the features() function to iterate over a list of data frames using the map() function in R?

Plotting my soil compaction data gives a convex-up curve. I need to determine the maximum y-value and the x-value which produces that maximum.
The 'features' package fits a smooth spline to the data and returns the features of the spline, including the y-maximum and critical x-value. I am having difficulty iterating the features() function over multiple samples, which are contained in a tidy list.
It seems that the features package is having trouble indexing to the data. The code works fine when I use data for only one sample, but when I try to use the dot placeholder and square brackets it loses track of the data.
Below is the code showing how this process works correctly for one sample, but not for an iteration.
#load packages
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 3.6.3
#> Warning: package 'forcats' was built under R version 3.6.3
library(features)
#> Warning: package 'features' was built under R version 3.6.3
#> Loading required package: lokern
#> Warning: package 'lokern' was built under R version 3.6.3
# generate example data
df <- tibble(
  sample = rep(LETTERS[1:3], each = 4),
  w = c(seq(0.08, 0.12, by = 0.0125),
        seq(0.09, 0.13, by = 0.0125),
        seq(0.10, 0.14, by = 0.0125)),
  d = c(1.86, 1.88, 1.88, 1.87,
        1.90, 1.92, 1.92, 1.91,
        1.96, 1.98, 1.98, 1.97)
)
df
#> # A tibble: 12 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 A 0.08 1.86
#> 2 A 0.0925 1.88
#> 3 A 0.105 1.88
#> 4 A 0.118 1.87
#> 5 B 0.09 1.9
#> 6 B 0.102 1.92
#> 7 B 0.115 1.92
#> 8 B 0.128 1.91
#> 9 C 0.1 1.96
#> 10 C 0.112 1.98
#> 11 C 0.125 1.98
#> 12 C 0.138 1.97
# use the 'features' package to fit a smooth spline and extract the spline features,
# including local y-maximum and critical point along x-axis.
# This works fine for one sample at a time:
sample1_data <- df %>% filter(sample == 'A')
sample1_features <- features(x = sample1_data$w,
                             y = sample1_data$d,
                             smoother = "smooth.spline")
sample1_features
#> $f
#> fmean fmin fmax fsd noise
#> 1.880000e+00 1.860000e+00 1.880000e+00 1.000000e-02 0.000000e+00
#> snr d1min d1max fwiggle ncpts
#> 2.707108e+11 -9.100000e-01 1.970000e+00 9.349000e+01 1.000000e+00
#>
#> $cpts
#> [1] 0.1
#>
#> $curvature
#> [1] -121.03
#>
#> $outliers
#> [1] NA
#>
#> attr(,"fits")
#> attr(,"fits")$x
#> [1] 0.0800 0.0925 0.1050 0.1175
#>
#> attr(,"fits")$y
#> [1] 1.86 1.88 1.88 1.87
#>
#> attr(,"fits")$fn
#> [1] 1.86 1.88 1.88 1.87
#>
#> attr(,"fits")$d1
#> [1] 1.9732965 0.8533784 -0.5868100 -0.9061384
#>
#> attr(,"fits")$d2
#> [1] 4.588832e-03 -1.791915e+02 -5.123866e+01 1.461069e-01
#>
#> attr(,"class")
#> [1] "features"
# But when attempting to use the pipe and the map() function
# to iterate over a list containing data for multiple samples,
# using the typical map() placeholder dot will not index to the
# list element/columns that are being passed to .f
df_split <- split(df, f= df[['sample']])
df_split
#> $A
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 A 0.08 1.86
#> 2 A 0.0925 1.88
#> 3 A 0.105 1.88
#> 4 A 0.118 1.87
#>
#> $B
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 B 0.09 1.9
#> 2 B 0.102 1.92
#> 3 B 0.115 1.92
#> 4 B 0.128 1.91
#>
#> $C
#> # A tibble: 4 x 3
#> sample w d
#> <chr> <dbl> <dbl>
#> 1 C 0.1 1.96
#> 2 C 0.112 1.98
#> 3 C 0.125 1.98
#> 4 C 0.138 1.97
df_split %>% map(.f = features, x = .[['w']], y= .[['d']], smoother = "smooth.spline")
#> Warning in min(x): no non-missing arguments to min; returning Inf
#> Warning in max(x): no non-missing arguments to max; returning -Inf
#> Error in seq.default(min(x), max(x), length = max(npts, length(x))): 'from' must be a finite number
Created on 2020-04-04 by the reprex package (v0.3.0)
You could use group_split() to split the data based on sample and map() to apply the features() function to each subset of the data:
library(features)
library(dplyr)
library(purrr)
list_model <- df %>%
  group_split(sample) %>%
  map(~ features(x = .x$w, y = .x$d, smoother = "smooth.spline"))
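From there, the quantities of interest can be pulled out of each list element. A sketch extracting the critical x-value and the fitted maximum, assuming each features object stores them in $cpts and $f as in the single-sample output above:
# critical x-value (location of the y-maximum) for each sample
crit_x <- map_dbl(list_model, ~ .x$cpts[1])
# maximum fitted y-value for each sample
max_y <- map_dbl(list_model, ~ .x$f[["fmax"]])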

Extracting the slope for individual observation

I'm a newbie in R. I have a data set with 3 sets of lung function measurements and 3 corresponding dates for each observation, given below. I would like to extract the slope (decline in lung function) for each observation using R and insert it in a new column.
1. How should I approach the problem?
2. Is my data set arranged in the right format?
ID FEV1_Date11 FEV1_Date12 FEV1_Date13 DATE11 DATE12 DATE13
18105 1.35 1.25 1.04 6/9/1990 8/16/1991 8/27/1993
18200 0.87 0.85 9/12/1991 3/11/1993
18303 0.79 4/23/1992
24204 4.05 3.95 3.99 6/8/1992 3/22/1993 11/5/1994
28102 1.19 1.04 0.96 10/31/1990 7/24/1991 6/27/1992
34104 1.03 1.16 1.15 7/25/1992 12/8/1993 12/7/1994
43108 0.92 0.83 0.79 6/23/1993 1/12/1994 1/11/1995
103114 2.43 2.28 2.16 6/5/1994 6/21/1995 4/7/1996
114101 0.73 0.59 0.6 6/25/1989 8/5/1990 8/24/1991
For example, for the 1st observation, slope = 0.0003.
Thanks..
If I understood the question, I think you want the slope between each set of visits:
library(dplyr)
group_by(df, ID) %>%
  mutate_at(vars(starts_with("DATE")), funs(as.Date(., "%m/%d/%Y"))) %>%
  do(data_frame(slope = diff(unlist(.[, 2:4])) / diff(unlist(.[, 5:7])),
                after_visit = 1 + (1:length(slope))))
## Source: local data frame [18 x 3]
## Groups: ID [9]
##
## ID slope after_visit
## <int> <dbl> <dbl>
## 1 18105 -2.309469e-04 2
## 2 18105 -2.830189e-04 3
## 3 18200 -3.663004e-05 2
## 4 18200 NA 3
## 5 18303 NA 2
## 6 18303 NA 3
## 7 24204 -3.484321e-04 2
## 8 24204 6.745363e-05 3
## 9 28102 -5.639098e-04 2
## 10 28102 -2.359882e-04 3
## 11 34104 2.594810e-04 2
## 12 34104 -2.747253e-05 3
## 13 43108 -4.433498e-04 2
## 14 43108 -1.098901e-04 3
## 15 103114 -3.937008e-04 2
## 16 103114 -4.123711e-04 3
## 17 114101 -3.448276e-04 2
## 18 114101 2.604167e-05 3
Alternate munging:
group_by(df, ID) %>%
  mutate_at(vars(starts_with("DATE")), funs(as.Date(., "%m/%d/%Y"))) %>%
  do(data_frame(date = as.Date(unlist(.[, 5:7]), origin = "1970-01-01"), # keeps one observation per row and preserves the Date class
                reading = unlist(.[, 2:4]))) %>%
  do(data_frame(slope = diff(.$reading) / unclass(diff(.$date))))
This is a bit of a "hacky" solution but if I understand your question correctly (some clarification may be needed), this should work in your case. Note, this is somewhat specific to your case since the column pairs are expected to be in the order you specified.
library(dplyr)
library(lubridate)
### Load Data
tdf <- read.table(header=TRUE, stringsAsFactors = FALSE, text = '
ID FEV1_Date11 FEV1_Date12 FEV1_Date13 DATE11 DATE12 DATE13
18105 1.35 1.25 1.04 6/9/1990 8/16/1991 8/27/1993
18200 0.87 0.85 NA 9/12/1991 3/11/1993 NA
18303 0.79 NA NA 4/23/1992 NA NA
24204 4.05 3.95 3.99 6/8/1992 3/22/1993 11/5/1994
28102 1.19 1.04 0.96 10/31/1990 7/24/1991 6/27/1992
34104 1.03 1.16 1.15 7/25/1992 12/8/1993 12/7/1994
43108 0.92 0.83 0.79 6/23/1993 1/12/1994 1/11/1995
103114 2.43 2.28 2.16 6/5/1994 6/21/1995 4/7/1996
114101 0.73 0.59 0.6 6/25/1989 8/5/1990 8/24/1991') %>% tbl_df
#####################################
### Reshape the data by column pairs.
#####################################
### Function to reshape a single column pair
xform_data <- function(x) {
  df <- data.frame(tdf[, 'ID'],
                   names(tdf)[x],
                   tdf[, names(tdf)[x]],
                   tdf[, names(tdf)[x + 3]], stringsAsFactors = FALSE)
  names(df) <- c('ID', 'DateKey', 'Val', 'Date')
  df
}
### Create a new data frame with the data in a deep format (i.e. reshaped)
### 'lapply' is used to reshape each pair of columns (date and value).
### 'lapply' returns a list of data frames (one df per pair) and 'bind_rows'
### combines them into one data frame.
newdf <-
  bind_rows(lapply(2:4, function(x) xform_data(x))) %>%
  mutate(Date = mdy(Date, tz = 'utc'))
#####################################
### Calculate the slopes per ID
#####################################
slopedf <-
  newdf %>%
  arrange(DateKey, Date) %>%
  group_by(ID) %>%
  do(slope = lm(Val ~ Date, data = .)$coefficients[[2]]) %>%
  mutate(slope = as.vector(slope)) %>%
  ungroup()
slopedf
## # A tibble: 9 x 2
## ID slope
## <int> <dbl>
## 1 18105 -3.077620e-09
## 2 18200 -4.239588e-10
## 3 18303 NA
## 4 24204 -5.534095e-10
## 5 28102 -4.325210e-09
## 6 34104 1.690414e-09
## 7 43108 -2.490139e-09
## 8 103114 -4.645589e-09
## 9 114101 -1.924497e-09
##########################################
### Adding slope column to original data.
##########################################
tdf %>% left_join(slopedf, by = 'ID')
## # A tibble: 9 x 8
## ID FEV1_Date11 FEV1_Date12 FEV1_Date13 DATE11 DATE12 DATE13 slope
## <int> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 18105 1.35 1.25 1.04 6/9/1990 8/16/1991 8/27/1993 -3.077620e-09
## 2 18200 0.87 0.85 NA 9/12/1991 3/11/1993 <NA> -4.239588e-10
## 3 18303 0.79 NA NA 4/23/1992 <NA> <NA> NA
## 4 24204 4.05 3.95 3.99 6/8/1992 3/22/1993 11/5/1994 -5.534095e-10
## 5 28102 1.19 1.04 0.96 10/31/1990 7/24/1991 6/27/1992 -4.325210e-09
## 6 34104 1.03 1.16 1.15 7/25/1992 12/8/1993 12/7/1994 1.690414e-09
## 7 43108 0.92 0.83 0.79 6/23/1993 1/12/1994 1/11/1995 -2.490139e-09
## 8 103114 2.43 2.28 2.16 6/5/1994 6/21/1995 4/7/1996 -4.645589e-09
## 9 114101 0.73 0.59 0.60 6/25/1989 8/5/1990 8/24/1991 -1.924497e-09
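On current dplyr versions, funs() and do() are deprecated. A hedged modern equivalent of the same per-ID regression, reusing the reshaped newdf from above:
library(dplyr)
slopedf <-
  newdf %>%
  group_by(ID) %>%
  summarise(slope = if (sum(!is.na(Val)) >= 2)
    coef(lm(Val ~ as.numeric(Date)))[[2]] else NA_real_)
# slopes are per second because Date is POSIXct here, matching the
# magnitudes in the output above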