R Extract Data From SurvFit - r

library(survival)
# Event time: futime where pstat is 0, otherwise ptime.
etime <- ifelse(mgus2$pstat == 0, mgus2$futime, mgus2$ptime)
# Event code: 2 * death where pstat is 0, otherwise 1.
event <- ifelse(mgus2$pstat == 0, 2 * mgus2$death, 1)
# Codes 0, 1, 2 are labelled "censor", "pcm" and "death" respectively.
event <- factor(event, levels = 0:2, labels = c("censor", "pcm", "death"))

# Fit the curves for the factor-coded event, stratified by sex.
mfit2 <- survfit(Surv(etime, event) ~ sex, data = mgus2)

# Base-graphics plot; xscale = 12 rescales the time axis (labelled in years).
plot(
  mfit2,
  col = c(1, 2, 1, 2), lty = c(2, 2, 1, 1), lwd = 2,
  mark.time = FALSE, xscale = 12,
  xlab = "Years post diagnosis", ylab = "Probability in State"
)
legend(
  240, 0.6,
  legend = c("death:female", "death:male", "pcm:female", "pcm:male"),
  col = c(1, 2, 1, 2), lty = c(1, 1, 2, 2), lwd = 2, bty = "n"
)
This is a reproducible example. How can I extract the data from 'mfit2' so that it can be plotted with ggplot2?

You can extract the data from the summary of the fitted object using lapply
sfit <- summary(mfit2)
str(sfit)
List of 24
$ n : int [1:2] 631 753
$ time : num [1:359] 1 2 3 4 5 6 7 8 9 10 ...
$ n.risk : int [1:359, 1:3] 631 610 599 595 588 587 581 580 573 569 ...
$ n.event : int [1:359, 1:3] 0 0 0 0 0 0 0 0 0 0 ...
$ n.censor : num [1:359] 1 0 0 0 0 0 0 0 0 1 ...
$ pstate : num [1:359, 1:3] 0.968 0.951 0.944 0.933 0.932 ...
$ p0 : num [1:2, 1:3] 1 1 0 0 0 0
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "sex=F" "sex=M"
.. ..$ : chr [1:3] "(s0)" "pcm" "death"
$ strata : Factor w/ 2 levels "sex=F","sex=M": 1 1 1 1 1 1 1 1 1 1 ...
...
I think the columns you need are `time`, `pstate` and `strata`, but some others, such as the numbers at risk, may also be useful.
# Pick the summary components by name instead of by position, so the
# extraction does not depend on the order of elements inside the summary
# object. Single-bracket indexing (sfit[x]) keeps each component as a named
# one-element list, which lets data.frame() derive the column names later.
cols <- lapply(
  c("time", "n.risk", "n.event", "n.censor", "pstate", "strata",
    "lower", "upper"),
  function(x) sfit[x]
)
Then combine these columns into a data frame with do.call
# do.call() passes every element of cols as a separate argument to
# data.frame(), binding them column-wise. Matrix components are expanded into
# one column per matrix column (e.g. n.risk.1, n.risk.2, n.risk.3).
data <- do.call(data.frame, cols)
str(data)
'data.frame': 359 obs. of 21 variables:
$ time : num 1 2 3 4 5 6 7 8 9 10 ...
$ n.risk.1 : int 631 610 599 595 588 587 581 580 573 569 ...
$ n.risk.2 : int 0 0 0 0 0 0 0 0 0 0 ...
$ n.risk.3 : int 0 0 0 0 0 0 0 0 0 0 ...
$ n.event.1: int 0 0 0 0 0 0 0 0 0 0 ...
$ n.event.2: int 0 2 0 1 0 1 0 0 2 1 ...
$ n.event.3: int 20 9 4 6 1 5 1 7 2 2 ...
$ n.censor : num 1 0 0 0 0 0 0 0 0 1 ...
$ pstate.1 : num 0.968 0.951 0.944 0.933 0.932 ...
$ pstate.2 : num 0 0.00317 0.00317 0.00476 0.00476 ...
$ pstate.3 : num 0.0317 0.046 0.0523 0.0619 0.0634 ...
$ strata : Factor w/ 2 levels "sex=F","sex=M": 1 1 1 1 1 1 1 1 1 1 ...
$ lower.1 : num 0.955 0.934 0.927 0.914 0.912 ...
$ lower.2 : num NA 0.000796 0.000796 0.00154 0.00154 ...
$ lower.3 : num 0.0206 0.0322 0.0375 0.0456 0.047 ...
$ upper.1 : num 0.982 0.968 0.963 0.953 0.952 ...
$ upper.2 : num NA 0.0127 0.0127 0.0147 0.0147 ...
$ upper.3 : num 0.0488 0.0656 0.0729 0.0838 0.0856 ...
The data are in wide format; it is best to reshape them to long format for the graph.
# The pipe, filter() and mutate() come from dplyr; pivot_longer() from tidyr.
library(dplyr)
library(tidyr)

# Reshape the wide table to long format: one row per
# time / stratum / state combination, with n.risk, n.event, pstate, lower and
# upper as value columns.
mgus3 <- data %>%
  pivot_longer(
    cols = -c(time, strata, n.censor),
    names_to = c(".value", "state"),
    # Split names such as "pstate.2" at their last dot. The dot is escaped so
    # that it matches a literal "." rather than any character.
    names_pattern = "(.+)\\.(.+)"
  ) %>%
  # Drop state 1, the initial "(s0)" state; states 2 and 3 are pcm and death.
  # `state` is a character column, so compare against the string "1".
  filter(state != "1") %>%
  mutate(
    state = factor(state, labels = c("pcm", "death")),
    group = interaction(strata, state)
  )
Then plot it.
# The package that provides ggplot() is called ggplot2, not ggplot.
library(ggplot2)

# One line per stratum/state combination, distinguished by both colour and
# line type.
ggplot(mgus3, aes(x = time, y = pstate, colour = group)) +
  geom_line(aes(linetype = group)) +
  ylab("Probability in State") +
  theme_bw()
You should be able to add confidence bands (using the `lower` and `upper` columns) and polish the appearance from here.

Related

issue with gbm.step() function in R

I'm trying to run cross-validation for boosted regression/classification trees using the function gbm.step() from the R package dismo, but it returns an empty output and I can't figure out why. This is the code I'm using:
# Build the modelling table first, so that the column look-ups below refer to
# an object that already exists.
# NOTE(review): the original line read `DFbrt#data`; everything after `#` is a
# comment in R, so slot access with `@` is presumably what was intended.
# Confirm that DFbrt is an S4 object with a `data` slot.
DFbrt_df <- DFbrt@data
DFbrt_df2 <- na.omit(DFbrt_df)

# Column positions of the covariates (named after names(myRS)) and of the
# response variable.
ColIndexCov <- match(names(myRS), colnames(DFbrt_df2))
ColIndexResp <- match("HasRes", colnames(DFbrt_df2))
# match() returns NA for names that are not found; fail early with a clear
# message instead of passing NA column indices to gbm.step().
stopifnot(!anyNA(ColIndexCov), !is.na(ColIndexResp))

# NOTE(review): 10^(-8) is an unusually small learning rate, and
# silent = TRUE presumably suppresses the diagnostic output of gbm.step().
# Consider silent = FALSE while debugging the empty result; confirm against
# the dismo documentation.
myBRT <- gbm.step(
  data = DFbrt_df2,
  gbm.x = ColIndexCov,
  gbm.y = ColIndexResp,
  tree.complexity = 3,
  learning.rate = 10^(-8),
  n.trees = 50,
  family = "bernoulli",
  n.folds = 4,
  fold.vector = DFbrt_df2$Region.num,
  step.size = 50,
  verbose = FALSE,
  silent = TRUE
)
str(DFbrt_df2)
'data.frame': 560845 obs. of 18 variables:
$ Nsamples : num 310 310 310 310 310 310 310 310 310 310 ...
$ cluster : num 39 39 39 39 39 39 39 39 39 39 ...
$ R : num 44.9 44.9 44.9 44.9 44.9 ...
$ P50 : num 0.565 0.544 0.609 0.605 0.593 ...
$ regions : Factor w/ 6 levels "China_east","China_middlesouth",..: 1 1 1 1 1 1 1 1 1 1 ...
$ HasRes : num 1 0 1 0 0 0 1 1 0 0 ...
$ use : num 10.02 9.75 0 9.38 8.77 ...
$ acc : num 0 0 0.4103 0.0769 0.0779 ...
$ tmp : num 2.46 2.46 2.46 2.46 2.45 ...
$ irg : num 1.788 0.399 1.205 1.836 1.841 ...
$ PgExt : num 3.11 0 3.7 3.11 3.18 ...
$ PgInt : num 4.69 2.76 0 3.99 2.22 ...
$ ChExt : num 3.74 0 4.33 3.74 3.81 ...
$ ChInt : num 5.01 5.99 5.35 4.88 4.97 ...
$ Ca : num 0 0 2.71 0 2.8 ...
$ veg : num 0 0 0 0 0 0 0 0 0 0 ...
$ Region.num: num 4 4 4 4 4 4 4 4 4 4 ...
$ Region : num 4 4 4 4 4 4 4 4 4 4 ...
- attr(*, "na.action")= 'omit' Named int 1 2 3 4 5 6 7 8 9 10 ...
..- attr(*, "names")= chr "1" "2" "3" "4" ...
The response variable is HasRes and the covariates are the variables use, acc, tmp, irg, PgExt, PgInt, ChExt, ChInt, Ca and veg.

Loop through list programmatically

I have a list in R that I want to loop through all the elements.
This is the structure of the object:
> str(AAPL.OPT[c])
List of 1
$ jun.12.2020:List of 2
..$ calls:'data.frame': 52 obs. of 7 variables:
.. ..$ Strike: num [1:52] 180 185 200 210 240 ...
.. ..$ Last : num [1:52] 123 118 131 120 85 ...
.. ..$ Chg : num [1:52] 0 0 7.61 9.48 0 ...
.. ..$ Bid : num [1:52] 149 144 129 119 89 ...
.. ..$ Ask : num [1:52] 153.3 148.5 133.5 123.7 93.5 ...
.. ..$ Vol : int [1:52] NA 15 16 2 1 1 3 36 1 2 ...
.. ..$ OI : int [1:52] 0 15 25 4 50 3 4 36 6 10 ...
..$ puts :'data.frame': 56 obs. of 7 variables:
.. ..$ Strike: num [1:56] 150 165 170 180 185 190 195 200 205 210 ...
.. ..$ Last : num [1:56] 0.05 0.02 0.14 0.05 0.03 0.02 0.01 0.02 0.01 0.01 ...
.. ..$ Chg : num [1:56] 0 0 0 0 0 0 0 0 0 0 ...
.. ..$ Bid : num [1:56] NA 0 0 0 0 0 0 0 0 0 ...
.. ..$ Ask : num [1:56] 2.13 0.11 0.11 1.8 1.87 0.01 1.88 0.5 1.88 2.13 ...
.. ..$ Vol : int [1:56] NA 1 1 2 1 16 1 17 1 21 ...
.. ..$ OI : int [1:56] 1 10 7 9 76 201 113 314 92 264 ...
I cannot access the next level of the object programmatically (by indexing the value).
I want to do something like this:
AAPL.OPT[c][1]
instead of this
AAPL.OPT[c]$jun.12.2020
Sample data of AAPL.OPT[c]
$`jun.12.2020`$`calls`
Strike Last Chg Bid Ask Vol OI
AAPL200612C00180000 180.0 123.29 0.00000000 149.00 153.35 NA 0
AAPL200612C00185000 185.0 117.60 0.00000000 144.00 148.50 15 15
AAPL200612C00200000 200.0 131.15 7.60999300 129.00 133.50 16 25
AAPL200612C00210000 210.0 119.95 9.47999600 119.30 123.65 2 4
....
AAPL.OPT[c] gives a list of length 1 whose single element holds two data frames. Using [[c]] instead returns that inner list of length 2, and to access each data frame you can subset it further with [[, i.e. AAPL.OPT[[c]][[1]] and AAPL.OPT[[c]][[2]].
We can use
AAPL.OPT[[c]]$jun.12.2020

Null model fails to converge in an AICc analysis

I am doing an AICc analysis of my insect biomass per hour data in R to find which of the environmental predictors I've measured influence the biomass the most. I am fitting GLMMs with a Gamma distribution and a log link function for my model comparison. All my models converge except my null model, and I am still struggling to understand why this is happening. Does anybody have an idea? Here is my code in R:
What my data look like:
> str(insectnona)
'data.frame': 76 obs. of 28 variables:
$ TIME : Factor w/ 7 levels "2016_6","2016_7",..: 4 6 7 3 2 4 6 7 2 3 ...
$ JULIAN : Factor w/ 28 levels "147","148","149",..: 3 9 23 24 16 2 11 20 10 19 ...
$ SITE : Factor w/ 8 levels "1","3","5","12",..: 1 1 1 1 2 3 3 3 4 4 ...
$ HABITAT : Factor w/ 3 levels "C","E","F": 1 1 1 1 1 1 1 1 1 1 ...
$ TEMP_CIVIL : num 17.8 18.9 21.1 15 16 ...
$ BIO_ZONE : Factor w/ 3 levels "ESSFwh3","ICHdw1",..: 2 2 2 2 2 2 2 2 1 1 ...
$ AGE_CLASS : Factor w/ 3 levels "6","7","8": 2 2 2 2 2 1 1 1 3 3 ...
$ RICHNESS : int 4 9 9 9 8 6 8 8 3 2 ...
$ ARANEAE_Btot: num 0 0.1 0.1 6.9 3.73 ...
$ COL_Btot : num 2152.4 66.8 88.4 6.9 80.4 ...
$ DIP_Btot : num 72.8 39.6 17.7 20.9 132.4 ...
$ EPH_Btot : num 0 0 0 10.2 0.0333 ...
$ HEM_Btot : num 0 0.1 18.5 0 0 ...
$ HOM_Btot : num 0 14.9 30 6.2 0 ...
$ HYM_Btot : num 40.9 65.6 36.5 38 36.7 ...
$ LEP_Btot : num 161 2625 696 390 869 ...
$ NEU_Btot : num 0 0.1 3 15.5 10.6 ...
$ ORT_Btot : num 0 24.8 0 0 0 0 0 0 0 0 ...
$ PSO_Btot : num 0 0 0 0 0 0 9.3 0 0 0 ...
$ THY_Btot : num 0 0 0 0 0 0 0 0 0 0 ...
$ TRI_Btot : num 0 0 34.5 20.3 4.4 ...
$ BIOMASS_tot : num 2427 2837 924 515 1138 ...
$ OTHER_Btot : num 114 145 140 118 188 ...
$ COL_bhr : num 321.254 10.603 10.914 0.843 11.518 ...
$ LEP_bhr : num 24.1 416.7 85.9 47.7 124.5 ...
$ BIOMASS_hr : num 362.3 450.3 114.1 62.9 162.9 ...
$ RICHNESS_hr : num 0.597 1.429 1.111 1.1 1.146 ...
$ sTEMP_CIVIL : num 0.6228 0.8304 1.263 0.0796 0.2736 ...
My model competition:
# Candidate models for the AICc comparison. Every model is a Gamma GLMM with a
# log link and random intercepts for SITE and TIME; the models differ only in
# their fixed effects. Model 10 is the intercept-only (null) model.
modl <- list()
modl[[1]] <- glmer(
  BIOMASS_hr ~ AGE_CLASS + HABITAT + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[2]] <- glmer(
  BIOMASS_hr ~ HABITAT + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[3]] <- glmer(
  BIOMASS_hr ~ AGE_CLASS + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[4]] <- glmer(
  BIOMASS_hr ~ BIO_ZONE + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[5]] <- glmer(
  BIOMASS_hr ~ sTEMP_CIVIL + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[6]] <- glmer(
  BIOMASS_hr ~ AGE_CLASS + HABITAT + sTEMP_CIVIL + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[7]] <- glmer(
  BIOMASS_hr ~ HABITAT + sTEMP_CIVIL + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[8]] <- glmer(
  BIOMASS_hr ~ HABITAT + BIO_ZONE + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[9]] <- glmer(
  BIOMASS_hr ~ AGE_CLASS + HABITAT + BIO_ZONE + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
modl[[10]] <- glmer(
  BIOMASS_hr ~ 1 + (1 | SITE) + (1 | TIME),
  data = insectnona, family = Gamma(link = "log")
)
# Rank the candidate models in a model-selection table.
aictab(modl)
And then I get this warning message only for the null model (model 10):
Warning message: In checkConv(attr(opt, "derivs"), opt$par, ctrl =
control$checkConv, : Model failed to converge with max|grad| =
0.0169244 (tol = 0.001, component 1)
Thanks in advance for your help!

Building time series from lists in R?

Hi, I am trying to make a list of time-series objects by applying the ts() function to the list of data frames that I have. My list of data frames is called whisk.basic.lst and it contains 69 data frames. Its structure is shown below:
> str(whisk.basic.lst)
$ SI_VALUES_IMV_EU28_INTRA :List of 12
..$ 220830 : num [1:74] 218674 255317 327377 363219 335267 ...
..$ 22083011: num [1:74] 9363 10129 19672 20631 10403 ...
..$ 22083019: num [1:74] 0 1978 0 0 7116 ...
..$ 22083030: num [1:74] 3837 15684 14588 20487 30870 ...
..$ 22083041: num [1:74] 18979 5123 7176 36842 9390 ...
..$ 22083049: num [1:74] 688 0 0 0 0 0 0 0 0 0 ...
..$ 22083061: num [1:74] 0 0 3452 4225 96 ...
..$ 22083069: num [1:74] 0 0 0 40 0 0 7520 0 0 0 ...
..$ 22083071: num [1:74] 139915 204803 256095 218105 185088 ...
..$ 22083079: num [1:74] 0 3219 0 0 3381 ...
..$ 22083082: num [1:74] 45892 14381 26394 62889 88527 ...
..$ 22083088: num [1:74] 0 0 0 0 396 0 0 0 642 105 ...
$ SK_VALUES_IMV_EU28_EXTRA :List of 11
..$ 220830 : num [1:74] 7155 12311 237 2705 7419 ...
..$ 22083011: num [1:74] 0 0 0 0 35 0 0 0 122 0 ...
..$ 22083019: num [1:74] 0 0 0 0 0 0 0 0 0 0 ...
..$ 22083030: num [1:74] 0 0 0 0 935 235 732 0 669 0 ...
..$ 22083041: num [1:74] 0 0 0 0 2603 ...
..$ 22083049: num [1:74] 0 0 0 0 0 0 0 0 0 0 ...
..$ 22083061: num [1:74] 0 0 0 0 0 0 0 0 0 0 ...
..$ 22083071: num [1:74] 0 0 0 0 27 0 546 0 0 0 ...
..$ 22083079: num [1:74] 0 0 0 0 0 0 0 0 0 0 ...
..$ 22083082: num [1:74] 7155 12311 237 2705 3819 ...
..$ 22083088: num [1:74] 0 0 0 0 0 0 0 0 0 0 ...
$ SK_VALUES_IMV_EU28_INTRA :List of 11
..$ 220830 : num [1:74] 380007 459653 155033 205879 297446 ...
..$ 22083011: num [1:74] 26772 68577 8585 24567 17996 ...
..$ 22083019: num [1:74] 0 0 0 0 0 0 0 0 0 0 ...
..$ 22083030: num [1:74] 60521 15068 1830 9788 5557 ...
..$ 22083041: num [1:74] 110461 71551 66317 47354 46776 ...
..$ 22083049: num [1:74] 100 19718 4115 201 201 ...
..$ 22083061: num [1:74] 0 29706 0 0 8177 ...
..$ 22083071: num [1:74] 418 21760 3138 68164 46300 ...
..$ 22083079: num [1:74] 0 738 0 738 2213 ...
..$ 22083082: num [1:74] 181487 186179 68737 53360 170226 ...
..$ 22083088: num [1:74] 248 46356 2311 1707 0 ...
# Apply ts() to every element of whisk.basic.lst, treating each element as
# monthly data (frequency = 12) starting in January 2010.
# NOTE(review): the str() output shows the elements as plain lists of numeric
# vectors rather than data frames; ts() only coerces data frames to a matrix,
# so this may be related to the error reported below. Confirm, and check
# whether any element of the list is empty.
Ts.whisk.lst <- lapply(whisk.basic.lst, function(x) ts(x, frequency= 12, start=c(2010,1)))
I am getting the following error:
Error in ts(x, frequency = 12, start = c(2010, 1)) :
'ts' object must have one or more observations
Called from: ts(x, frequency = 12, start = c(2010, 1))
I have been creating time series from my dataframes without any problem in the past. But since I created a list of these dataframes, I have run into this problem. Does anyone know why?

Calculate y-value of curve maximum of a smooth line in R and ggplot2

I'm following up an old question addressed here:
calculate x-value of curve maximum of a smooth line in R and ggplot2
How could I calculate the Y-value of curve maximum?
Cheers
It would seem to me that code changes of "x" to "y" and 'vline' to 'hline' and "xintercept" to "yintercept" would be all that were needed:
# Build the plot so that the values computed for each layer can be inspected.
gb <- ggplot_build(p1)
# y values of the first layer.
layer_y <- gb$data[[1]]$y
# A local maximum is where the sign of the slope flips from +1 to -1, i.e.
# where the difference of the signs equals -2; "+ 1" moves the index onto the
# peak itself.
peak_idx <- which(diff(sign(diff(layer_y))) == -2) + 1
exact_y_value_of_the_curve_maximum <- layer_y[peak_idx]
# Mark the maximum on the plot, then print its value.
p1 + geom_hline(yintercept = exact_y_value_of_the_curve_maximum)
exact_y_value_of_the_curve_maximum
I don't think I would call these "exact" since they are only numerical estimates. The other way to get that value would be
max(gb$data[[1]]$y)
As the $data element of that build-object can be examined:
> str(gb$data)
List of 2
$ :'data.frame': 80 obs. of 7 variables:
..$ x : num [1:80] 1 1.19 1.38 1.57 1.76 ...
..$ y : num [1:80] -123.3 -116.6 -109.9 -103.3 -96.6 ...
..$ ymin : num [1:80] -187 -177 -166 -156 -146 ...
..$ ymax : num [1:80] -59.4 -56.5 -53.5 -50.3 -46.9 ...
..$ se : num [1:80] 29.3 27.6 25.9 24.3 22.8 ...
..$ PANEL: int [1:80] 1 1 1 1 1 1 1 1 1 1 ...
..$ group: int [1:80] 1 1 1 1 1 1 1 1 1 1 ...
$ :'data.frame': 16 obs. of 4 variables:
..$ x : num [1:16] 1 2 3 4 5 6 7 8 9 10 ...
..$ y : num [1:16] -79.6 -84.7 -88.4 -74.1 -29.6 ...
..$ PANEL: int [1:16] 1 1 1 1 1 1 1 1 1 1 ...
..$ group: int [1:16] 1 1 1 1 1 1 1 1 1 1 ...

Resources