Randomly sampling iteratively from a data frame - r

I have the following dataframe that simulates a panel data set (i.e., multiple waves per unit).
dat <- structure(list(x = c(-0.32, -0.26, 0.05, -0.37, -0.37, -0.08,
-0.01, 0.05, 0.19, -0.48, 0.37, 0.05, -0.58, -0.18, -0.04, -0.28,
-0.44, -0.48, 1.05, 0.62, 0.85, 0.42, 0.7, 0.64, -0.19, -0.11,
-0.65, -0.01, 0.39, -0.02, -0.23, -0.6, -0.1, 0.39, 0.33, 0.39,
-0.09, -0.16, 0.26, -0.62, -0.44, -0.6, -0.17, -0.27, -0.12,
-0.53, -0.38, -0.33, -0.17, -0.11, -0.25, -0.92, -0.6, -0.81,
0.75, 0.52, 0.57, 1.32, 1.21, 1.21), y = c(-0.42, -2.01, -1.19,
0.7, 1.28, 1.37, 0.52, 2.04, 2.34, -1.45, 2.84, 0.1, -3.12, 0.22,
-0.06, -1.65, -0.9, -1.5, -0.98, -0.69, 0.15, 1.7, 1.47, 0.15,
0.26, 0.84, 0.35, 0.86, -1.23, -0.74, -1.79, -0.56, -2.15, 2.11,
2.34, 0.57, 0.38, 0.57, 0.97, 0.32, -1.71, -0.8, 1.45, -0.12,
1.93, 2.76, 0.08, -2.8, -0.06, 1.09, -0.4, 0.41, 0.02, -1.61,
1.75, 1.6, -0.19, 0.13, -0.89, -1.1), unit = c(1, 1, 1, 2, 2,
2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9,
9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14,
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20,
20, 20), wave = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(unit = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20), .rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18,
19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42, 43:45,
46:48, 49:51, 52:54, 55:57, 58:60), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))
I now want to simulate attrition in this data set: some units drop out with a certain probability in wave 2; some of those who remain drop out in wave 3; and so forth until wave n. The probability is the same at each step. Note that the approach should be flexible regarding the number of waves.
Here's what I came up with. It works, but it feels slow to me, and because the number of waves can vary, I am not sure how to avoid the loop.
library(dplyr)

# number of units and number of observations per unit:
n <- 20
n_perunit <- 3
# define attrition probability:
attrition <- 2/3
# Start with a vector of all units
remaining <- 1:n
# loop through waves beginning with 2
system.time(for (i in 2:n_perunit) {
  n_remaining <- round(length(remaining) * attrition)
  remaining <- sample(remaining, n_remaining)
  dat <- dat %>%
    mutate(drop = ifelse(wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
    filter(drop == FALSE) %>%
    mutate(drop = NULL)
})
Efficiency:
user system elapsed
0.016 0.000 0.016
Any ideas how to improve this?
EDIT:
Based on @jpsmith's answer (which, as far as I can see, does not work for units in which nobody drops out, because min(which(dropout == "Yes")) returns Inf for those), I came up with the following:
set.seed(1234)
system.time(if (!is.null(attrition)) {
  # assign a 1 or 0 indicating dropout
  dat <- dat %>%
    mutate(dropout = ifelse(
      wave > 1,
      sample(0:1, n(), prob = c(attrition, 1 - attrition), replace = TRUE),
      0))
  # first get the first (minimum) dropout in each unit ...
  dat <- dat %>%
    group_by(unit) %>%
    mutate(min = ifelse(
      length(which(dropout == 1)) > 0, min(which(dropout == 1)), n_perunit)) %>%
    # ... then slice out rows up to that row
    slice(1:min) %>%
    # as this also includes the first dropout row, drop it
    filter(dropout == 0)
})
Efficiency:
user system elapsed
0.01 0.00 0.01
However, slice produces some annoying warnings - any idea why?
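A hedged guess at those warnings: inside slice(1:min), min refers to the new column, which has one value per row of the group, so 1:min uses only its first element and R emits a "numerical expression has 3 elements: only the first used" warning for every group. Reducing it to a single per-group value should avoid the warning, for example:
# sketch: give slice() a single cutoff per group so 1:<vector> does not warn
dat %>%
  group_by(unit) %>%
  mutate(min = ifelse(
    length(which(dropout == 1)) > 0, min(which(dropout == 1)), n_perunit)) %>%
  slice(seq_len(min[1])) %>%
  filter(dropout == 0)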

Perhaps I'm wrong, but in effect the attrition is iid after the first wave: each subsequent wave has a probability of dropout - so if you made it to wave 3 then that probability is not conditioned on anything (akin to the probability of flipping a third heads if the first two were heads). If I am reading this correctly, you could assign the dropout simultaneously across waves > 1 and then drop all observations after the first "dropout". This would vectorize everything and be faster.
Code
set.seed(123)
attrition <- 2/3
# Assign "dropout" position
dat$dropout <- ifelse(dat$wave > 1, sample(c("Yes", "No"), nrow(dat), prob = c(attrition, 1 - attrition), replace = TRUE), "No")
# Drop all observations after first dropout recorded
dat %>% group_by(unit) %>% slice(seq_len(min(which(dropout == "Yes") - 1)))
Output:
# Groups: unit [20]
# x y unit wave dropout
# <dbl> <dbl> <dbl> <dbl> <chr>
# 1 -0.32 -0.42 1 1 No
# 2 -0.26 -2.01 1 2 No
# 3 -0.37 0.7 2 1 No
# 4 -0.01 0.52 3 1 No
# 5 0.05 2.04 3 2 No
# 6 -0.48 -1.45 4 1 No
# 7 -0.58 -3.12 5 1 No
# 8 -0.18 0.22 5 2 No
# 9 -0.28 -1.65 6 1 No
# 10 1.05 -0.98 7 1 No
# # … with 20 more rows
Since you didn't set a seed or provide a desired output dataset, I can't compare, but I'm happy to test this if you provide one.
user system elapsed
0.008 0.001 0.009
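As the edit above points out, min(which(dropout == "Yes") - 1) becomes Inf for units in which nobody drops out. A small tweak (my own sketch, not part of this answer) is to add n() as a fallback inside slice(), so all rows of such units are kept:
# sketch: when a unit has no "Yes", which() is empty and n() becomes the cutoff,
# so every row of that unit is kept
dat %>%
  group_by(unit) %>%
  slice(seq_len(min(c(which(dropout == "Yes") - 1, n()))))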

Since the number of units remaining after each wave is deterministic, we can do the sampling all in one go.
library(dplyr)
set.seed(5)

n <- 20
n_perunit <- 3
# define attrition probability:
attrition <- 2/3
# Start with a vector of all units
remaining <- 1:n

# loop through waves beginning with 2
fOriginal <- function(df, remaining) {
  for (i in 2:n_perunit) {
    n_remaining <- round(length(remaining) * attrition)
    remaining <- sample(remaining, n_remaining)
    df <- df %>%
      mutate(drop = ifelse(wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
      filter(drop == FALSE) %>%
      mutate(drop = NULL)
  }
  df
}

# single pass: work out how many units survive each wave, randomly assign each
# unit a last wave, then keep only the rows up to that wave
fNew <- function(df) {
  nleft <- numeric(n_perunit + 1)
  nleft[1] <- n
  for (i in 2:n_perunit) nleft[i] <- round(nleft[i - 1] * attrition)
  df[df$wave <= sample(rep.int(1:n_perunit, -diff(nleft)))[df$unit], ]
}
dfOrig <- fOriginal(dat, remaining)
dfNew <- fNew(dat)
# the resulting data.frames are not identical due to different random sampling
# methods, but they both have the same number of rows and same wave counts
identical(tabulate(dfOrig$wave), tabulate(dfNew$wave))
#> [1] TRUE
microbenchmark::microbenchmark(fOriginal = fOriginal(dat, remaining),
fNew = fNew(dat))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> fOriginal 12.0433 13.24815 14.52889 14.02410 15.0525 23.5338 100
#> fNew 1.2956 1.41915 1.73176 1.56935 1.7398 5.0738 100
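As a quick sanity check (an added illustration, not part of the answer): because the number of units surviving each wave is deterministic, the per-wave row counts from fNew() are always the same; only which units drop out is random.
# always 20 rows in wave 1, 13 in wave 2 and 9 in wave 3
table(fNew(dat)$wave)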

Related

How do I change confidence interval calculation to log-log on R?

If I wanted to calculate confidence intervals using the coxph and confint functions, how do I change the confidence interval calculation to log-log? My understanding is that log is the default.
I tried conf.type = "log-log", but it did not work; I just got an error message.
library(survival)
coxph(formula = Surv(futime, fustat) ~ tx, data = tki, conf.type="log-log")
fit <- coxph(formula = Surv(futime, fustat) ~ tx, data = tki)
summary(fit)
#output provides HR CIs
confint(fit)
#coefficient CIs
exp(confint(fit))
dput(tki)
structure(list(futime = c(9.26, 11.06, 2.35, 3.75, 12.4, 10.3, 8.11, 7.29,
6.75, 6.56, 0.26, 1.9, 0.34, 1.63, 1.55, 1.6, 4.78, 2.65, 1.72, 3.63),
fustat = c(1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1),
tx = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L))
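One hedged pointer rather than a tested answer: in the survival package, conf.type is an argument of survfit(), not coxph(), and "log-log" intervals are computed for the survival curve S(t); the hazard-ratio intervals printed by summary(coxph(...)) are not affected by it. Something along these lines may be what's needed:
library(survival)
fit <- coxph(Surv(futime, fustat) ~ tx, data = tki)
# conf.type is accepted here; "log-log" gives CIs based on log(-log S(t))
sf <- survfit(fit, conf.type = "log-log")
summary(sf)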

Finding the range based on minimum values that increases

I have a dataset with multiple stations, depths and concentrations. I am trying to find the difference in depth (or the thickness) based on where the minimum concentration increases by 0.1.
For example: At station 1, the maximum depth is 14m. There is a conc of 0.1 at 4m and it increases to 0.2 at 6m. But then it goes down again to 0.1 at 10m and stays that way till 12m before it increases. It increases only by 0.05 at 13m. At 14m, the concentration is increased by 0.1. So 14m is the deepest (or maximum depth) where the lowest conc is found. I need to find a way to fix my code to find that 14... (i.e. where concentration increases by 0.1). I can find the max depth for a given station and the minimum concentration.
This code gives me a column with maximum depth for each station (max_depth) and another column on what the minimum concentration is for each station (min_conc).
How do I find at what depth does the lowest concentration increase by 0.1?
I'm trying to use which() with max and min, but I can't figure out the code: How to use Dplyr's Summarize and which() to lookup min/max values
station <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4)
depth <- c(1, 2, 3, 6, 8, 9, 10, 11, 12, 13, 14, 1, 3, 4, 6, 8, 10, 11, 14, 1, 2, 4, 6, 8, 9, 10, 15, 18, 20, 1, 2, 4, 6, 8, 10, 11)
conc <- c(0.4, 0.4, 0.3, 0.1, 0.2, NA, 0.2, 0.1, 0.1, 0.1, 0.15, 0.2, 0.5, 0.4, 0.3, 0.6, 0.4, 0.2, 0.1, 0.2, 0.3, 0.2, 0.5, 0.5, 0.3, 0.2, 0.1, 0.2, 0.2, 0.2, 0.8, 0.6, 0.4, 0.3, 0.2, 0.3, 0.3)
df <- cbind(station, depth, conc)
(df <- as.data.frame(df))
(depth <- df %>%
  group_by(station) %>%
  summarize(
    Max_depth = miss(max(depth)),
    min_conc = miss(min(conc, na.rm = TRUE)),
    press_depth = depth[tail(which(conc == min(conc, na.rm = TRUE)), 1)]))
when I try this instead:
press_depth = depth[tail(which(conc == min(conc > 0.1, na.rm = TRUE)), 1)])
I get an error: Column press_depth must be length 1 (a summary value), not 0
I'm not sure I understood completely what you are asking, but hopefully this can help you start. If it's different from what you have in mind, let me know:
library(dplyr)
df <- data_frame(
station = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4),
depth = c(1, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 1, 3, 4, 6, 8, 10, 11, 14, 1, 2, 4, 6, 8, 9, 10, 15, 18, 20, 1, 2, 4, 6, 8, 10, 11),
conc = c(0.4, 0.6, 0.3, 0.2, 0.2, NA, 0.2, 0.2, 0.2, 0.1, 0.2, 0.5, 0.4, 0.3, 0.6, 0.4, 0.2, 0.1, 0.2, 0.3, 0.2, 0.5, 0.5, 0.3, 0.2, 0.1, 0.2, 0.2, 0.2, 0.8, 0.6, 0.4, 0.3, 0.2, 0.3, 0.3)
)
df %>%
  group_by(station) %>%
  mutate(conc_diff = lead(conc) - conc,
         dept_diff = lead(depth) - depth) %>%
  filter(conc_diff == .1, conc == min(conc, na.rm = TRUE)) %>%
  filter(depth == max(depth))
#> # A tibble: 3 x 5
#> # Groups: station [3]
#> station depth conc conc_diff dept_diff
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 13 0.1 0.1 1
#> 2 2 11 0.1 0.1 3
#> 3 3 10 0.1 0.1 5
Created on 2020-06-17 by the reprex package (v0.3.0)

Coarsened Exact Matching with cem package - Error in .subset2(x, i, exact = exact)

I'm trying to perform coarsened exact matching on the following data.
> dput(head(cem_data))
structure(list(sex = c(1, 1, 1, 2, 2, 2), age = c(40, 59, 53,
60, 49, 60), edlev = c(3, 3, 3, 2, 3, 3), sw = c(44, 17, 10,
41, 26, 23), sw2 = c(15, 1, 5, 34, 5, 6), som = c(2.14, 0.14,
1.86, 3, 1.71, 2.14), som_2 = c(0.71, 0.14, 2, 2.57, 1.71, 2.14
), ap = c(3.5, 1.5, 1.33, 3.33, 2.67, 2.17), ap_2 = c(3, 0.17,
2.33, 3, 0.83, 1.67), dep = c(2.83, 0.17, 0.33, 2.83, 2.17, 2.33
), dep_2 = c(1.17, 0, 0.33, 2.33, 0.83, 1), int = c(2.86, 1.43,
1, 2, 2.29, 2.14), int_2 = c(2.29, 0.57, 0.14, 2.57, 1.71, 1.43
), pho = c(3.2, 0, 0, 3.4, 0.8, 0.4), pho_2 = c(1.6, 0, 0, 3.2,
0, 0.4), psy_b = c(2.67, 0.11, 0.83, 3.06, 1.61, 1.72), psy_b_2 = c(1.11,
0.06, 0.89, 2.67, 0.94, 1.28), s_wirk = c(4, 2.2, 1.6, 3.2, 1.4,
2.2), s_wirk_2 = c(2.8, 0.8, 1.8, 2.6, 1.6, 1.4), soz_b = c(2.75,
1.5, 1, 2.25, 1.25, 1.5), soz_b_2 = c(2.75, 1, 1, 2.25, 1.5,
1.25), soz_u = c(0.75, 0.75, 1.75, 3.25, 1, 3.25), soz_u_2 = c(1,
0.25, 1.75, 2.5, 2.5, 2), wohl = c(3.6, 1.4, 1.8, 3.4, 3, 3),
wohl_2 = c(2, 0.6, 1.4, 2.8, 2.2, 1.2), au_bei_aufn = c(1,
1, 1, 1, 1, 1), age_reha = c(40.9890410958904, 59.3945205479452,
53.372602739726, 60.2, 49.3342465753425, 60.7534246575342
), group_format = c(0, 0, 0, 0, 0, 0)), row.names = c(6L,
7L, 10L, 15L, 20L, 29L), class = "data.frame")
With the following code:
require(cem)
voll_data <- voll_data %>%
select(-c("auf_nr", "icd_10_1", "icd_10_2", "icd_10_3", "icd_10_4","icd_10_5", "bdi_date", "aufnahme", "entlassung")) %>%
mutate_if(is.factor,as.numeric) %>%
mutate_if(is.character, as.numeric)
cem_data <- data.frame(na.omit(voll_data))
#cem_data_s <- scale(cem_data[,5:26])
#cem_data <- cbind.data.frame(cem_data[, 1:4], cem_data_s, cem_data[, 27:36])
variables <- c("age", "sex", "edlev", "sw","au_bei_aufn")
ungleich2 <- imbalance(cem_data$group_format, data=cem_data)
However, the following error is shown when calculating "matt":
Error in .subset2(x, i, exact = exact) : attempt to select less than one element in get1index
7.
(function(x, i, exact) if (is.matrix(i)) as.matrix(x)[[i]] else .subset2(x, i, exact = exact))(x, ..., exact = exact)
6.
[[.data.frame(data, treatment)
5.
data[[treatment]]
4.
is.factor(x)
3.
as.factor(data[[treatment]])
2.
cem.main(treatment = treatment, data = data, cutpoints = cutpoints, drop = drop, k2k = k2k, method = method, mpower = mpower, verbose = verbose, baseline.group = baseline.group, keep.all = keep.all)
1.
cem(treatment = cem_data$group_format, data = cem_data, drop = "sw2", cutpoints = list(age = agecut), grouping = list(edlev_gr))
# automated coarsening
matt <- cem(cem_data$group_format, data = cem_data, drop= "sw2")
print(matt)
Does anyone have an idea what I am doing wrong?
Thanks a lot!!
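One possible culprit, offered as a guess: cem()'s treatment argument expects the name of the treatment column as a character string, whereas the call above passes the vector cem_data$group_format itself, which matches the failing data[[treatment]] step in the traceback. A minimal sketch with the column name instead:
require(cem)
# sketch: pass the treatment column by name rather than as a vector
matt <- cem(treatment = "group_format", data = cem_data, drop = "sw2")
print(matt)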

Stratify then impute in R - using mi()

I want to "stratify-then-impute" using the packages available in R.
That is, I am hoping to:
1) stratify my dataset using a binary variable called "arm". This variable has no missing data.
2) run an imputation model for the two subsets
3) combine the two imputed data sets
4) run a pooled analysis.
My dataset looks like:
dataSim <- structure(list(pid = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20), arm = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), X1 = c(0.1, NA, 0.51,
0.56, -0.82, NA, NA, NA, -0.32, 0.4, 0.58, NA, 0.22, -0.23, 1.49,
-1.88, -1.77, -0.94, NA, -1.34), X2 = c(NA, -0.13, NA, 1.2, NA,
NA, NA, 0.02, -0.04, NA, NA, 0.25, -0.81, -1.67, 1.01, 1.69,
-0.06, 0.07, NA, -0.11)), .Names = c("pid", "arm", "X1", "X2"
), row.names = c(NA, 20L), class = "data.frame")
To impute the data, I'm currently using the mi() function as follows:
library(mi)
data.1 <- dataSim[dataSim[,"arm"]==1,]
data.0 <- dataSim[dataSim[,"arm"]==0,]
data.miss.1 <- missing_data.frame(data.1)
data.miss.0 <- missing_data.frame(data.0)
imputations.1 <- mi(data.1, n.iter=5, n.chains=5, max.minutes=20, parallel=FALSE)
imputations.0 <- mi(data.0, n.iter=5, n.chains=5, max.minutes=20, parallel=FALSE)
complete(imputations.1) # viewing the imputed datasets
complete(imputations.0)
Then I don't know how to combine the 2 imputations in order to do a pooled analysis. I have unsuccessfully tried:
imputations <- rbind(imputations.0, imputations.1) # This doesn't work
# analysis.X1 <- pool(X1 ~ arm, data = imputations ) # This is what I want to run
I assume this method is a simplified version of including an interaction term when imputing, but I don't know how this is possible either.
Thanks
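One way to sketch steps 3) and 4) by hand (my own rough outline, assuming complete() returns a list of m completed data frames per stratum and that both strata share the same columns): stack the k-th completed data set from each arm, fit the analysis model on every stacked data set, and pool the coefficient with Rubin's rules.
# rough sketch: manual stacking and Rubin's-rules pooling (assumptions noted above)
completed.1 <- complete(imputations.1, m = 5)   # list of 5 completed data sets, arm == 1
completed.0 <- complete(imputations.0, m = 5)   # list of 5 completed data sets, arm == 0
# stack the k-th completed data set of arm 0 with the k-th of arm 1
stacked <- Map(rbind, completed.0, completed.1)
# fit the analysis model on every stacked data set
fits <- lapply(stacked, function(d) lm(X1 ~ arm, data = d))
# pool the "arm" coefficient with Rubin's rules
est  <- sapply(fits, function(f) coef(f)["arm"])
se2  <- sapply(fits, function(f) vcov(f)["arm", "arm"])
m    <- length(est)
qbar <- mean(est)                         # pooled point estimate
tvar <- mean(se2) + (1 + 1/m) * var(est)  # within- plus between-imputation variance
c(estimate = qbar, se = sqrt(tvar))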

Multiple stat_function on grouped data with ggplot2

I am studying a data set with multiple observations of a parameter over time. The data looks like this:
test<-data.frame(t = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.67, 0.67, 0.67, 0.67, 0.67, 0.67, 0.67, 0.67, 0.67, 0.67, 0.67, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.33, 1.67, 1.67, 1.67, 1.67, 1.67, 1.67, 1.67, 1.67, 1.67, 1.67, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), int = c(76.44609375, 94.6619686800895, 112.148907103825, 75.1003097802036, 74.1037037037037, 76.7526662128432, 74.0734830988873, 87.9052100068855, 81.0525931336742, 92.1907873244038, 84.0708929788684, 88.8232221775814, 98.1323678006063, 115.175322139789, 91.2653104925053, 76.3661620658949, 152.637799717913, 107.054702135631, 83.4693197755961, 91.658991910392, 81.3991787335206, 106.153762268266, 100.919789842382, 67.2119436084271, 137.558914728682, 89.1182608695652, 156.10352233677, 108.180911207183, 87.9794680354643, 77.7501400560224, 80.7675382653061, 95.6662793399954, 92.5649630541872, 88.3301402668491, 84.3891875746714, 76.4318673395818, 111.413893510815, 82.4753828420879, 119.099190283401, 192.539417212559, 208.49203187251, 106.919937512205, 105.370936371214, 180.028767711464, 130.29369773608, 170.193357597816, 172.703180212014, 178.061569518042, 182.097607918614, 227.066976984743, 153.856101031661, 432.991580916745, 299.143735224586, 144.118156808803, 396.36644895153, 334.538796516231, 350.186359610275, 200.781101530882, 279.866079790223, 122.542700519331, 235.199555308505, 204.924140655867, 229.181848967152, 225.542753383955, 468.308974987739, 269.306058221873, 229.969282013323, 255.553846153846, 621.021220159151, 255.017211703959, 396.658265826583, 273.300663227708, 232.449965010497, 303.343894502483, 276.952483801296, 327.419805194805, 241.136864249474, 457.961489497136, 498.901714285714, 280.9558101473, 322.089588377724, 386.754533152909, 364.356809338521, 340.416035518412, 428.482916666667, 668.447197400487, 387.671341748481, 471.049545829893, 255.8802020688, 361.979536152797, 192.224629418472, 284.088954468803, 170.763997760358, 237.869065100343, 365.08237271854, 294.266488413547, 718.279750479846, 211.599427030671, 294.045375597047, 207.099267015707, 194.209973045822, 251.306358381503, 190.786794766966, 400.396083385976, 183.133240482823, 130.442107867392, 167.231452991453, 345.110896351776, 299.304645622394, 192.078204692282, 121.273544841369, 153.996295438759, 97.6034616378197, 362.80049522462, 130.498551774077, 106.031656035908, 117.682936668011, 90.1247837370242, 140.855475040258, 169.050049067713, 244.290241606527, 120.603356419819, 173.413333333333, 125.896389002872, 206.543873212215, 186.668320340184, 85.0988108720272, 106.57849117175, 102.867232728676, 216.232957110609, 86.6538461538462, 149.459777852575, 212.498573059361, 93.3816390633923, 105.567730417318, 120.095470383275, 137.205696941396, 141.156985871272, 90.578857338351, 84.8457760314342, 127.092660685395, 136.859870967742, 188.406440382942, 86.0879705400982))
class(test)
I managed to plot the density for each time point using:
ggplot(test, aes(int, group = as.factor(t),colour=t))+ geom_density()
But I would like to make the same graph, except that instead of the density I would like to plot a lognormal fit of the density.
I know how to plot the lognormal fit for one time point by using fitdistr and passing the parameters to stat_function with this code:
library(MASS)
fit <- fitdistr(subset(test, t == 0,select='int')$int, "lognormal")
ggplot(data=subset(test, t == 0,select='int'), aes(x=int)) +stat_function(fun = dlnorm,args = list(mean = fit$estimate[1], sd = fit$estimate[2]))
But how can I do it for all t, with the colour of the line given by the value of t? Is it possible to provide a function in the args list?
I thought of another naive solution: Predicting the values of every dlnorm().
## Split up the data according to t
tt <- split(test, test$t)
## Fit a lognormal to every dataset
fits <- lapply(tt, function(x) fitdistr(x$int, "lognormal"))
## Predict values
fitted <- lapply(fits, function(x) dlnorm(x = 1:max(test$int),
                                          mean = x$estimate[1],
                                          sd = x$estimate[2]))
## Wrap everything into a data.frame ggplot can handle
plot.data <- data.frame(y = unlist(fitted),
                        int = 1:max(test$int),
                        t = rep(unique(test$t),
                                each = length(unlist(fitted)) / length(unique(test$t))))
## Plot
ggplot(test, aes(int, group = as.factor(t), colour = t)) +
  #geom_density() +
  geom_line(data = plot.data, aes(y = y), lwd = 1)
What about a naive solution: iteratively adding stat_function()?
library(RColorBrewer)  # needed for brewer.pal()

cols <- brewer.pal(length(unique(test$t)), "Set1")
g <- ggplot(data = subset(test, t == 0, select = 'int'), aes(x = int))
n <- 1
for (i in unique(test$t)) {
  fit <- fitdistr(subset(test, t == i, select = 'int')$int, "lognormal")
  g <- g + stat_function(fun = dlnorm,
                         args = list(mean = fit$estimate[1], sd = fit$estimate[2]),
                         col = cols[n])
  n <- n + 1
}
g
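The same idea can also be written without the explicit counter by building the list of stat_function() layers with lapply() and adding the whole list to the plot at once (a sketch, assuming MASS and ggplot2 are loaded as above; rainbow() is used only to guarantee one colour per time point):
# sketch: one stat_function() layer per time point, built with lapply()
ts <- unique(test$t)
cols2 <- rainbow(length(ts))  # simple palette with one colour per time point
layers <- lapply(seq_along(ts), function(n) {
  fit <- fitdistr(subset(test, t == ts[n])$int, "lognormal")
  stat_function(fun = dlnorm,
                args = list(meanlog = fit$estimate[1], sdlog = fit$estimate[2]),
                col = cols2[n])
})
ggplot(test, aes(x = int)) + layers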
