R: Data Average Not Close to the Age I Chose

I have a function called sim.lifeAnnuity that takes three parameters: nsim (number of samples), a (age), and g (gender, either "Male" or "Female"). I am drawing data from another table, LifeTable (Lab1 in the code below), which covers ages 0 to 119. The function is supposed to return the number of payments purchased by an individual of age a and the given gender. The probability depends on gender and age: it should use the prob values from ages a to 119. This is my code so far:
sim.lifeAnnuity <- function(nsim, a, g = c("Female", "Male")) {
  age <- 1:(120 - a)
  malep <- Lab1$Male_Prob[(a + 1):120]
  falep <- Lab1$Female_Prob[(a + 1):120]
  if (a >= 0 & a <= 119 & g == "Male") {
    male <- sample(age, size = nsim, replace = TRUE, prob = malep)
    return(male)
  }
  if (a >= 0 & a <= 119 & g == "Female") {
    female <- sample(age, size = nsim, replace = TRUE, prob = falep)
    return(female)
  }
}
sim.lifeAnnuity(nsim = 5, a = 40, g = "Male")
I created two vectors, malep and falep, and used them in sample(). The function should return a vector of nsim payment counts whose average is close to the age I pick. However, the values I receive are not close to the age I chose.
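One possible source of the mismatch (a sketch, not a definitive diagnosis): sample(age, ...) above draws from 1:(120 - a), i.e. from the possible numbers of future payments, so the returned values are payment counts rather than ages, and their mean will not sit near a. If the averages are meant to be compared with the chosen age, one option is to sample the ages a:119 directly with the same probability weights (this assumes, as in the code above, that Lab1 holds one probability per age from 0 to 119):
sim.lifeAnnuity2 <- function(nsim, a, g = c("Female", "Male")) {
  g <- match.arg(g)
  stopifnot(a >= 0, a <= 119)
  p <- if (g == "Male") Lab1$Male_Prob[(a + 1):120] else Lab1$Female_Prob[(a + 1):120]
  sample(a:119, size = nsim, replace = TRUE, prob = p)  # ages, not payment counts
}
mean(sim.lifeAnnuity2(nsim = 1e5, a = 40, g = "Male"))  # lands at or above 40 by construction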

R: rowwise Confidence Intervals for a Difference of Binomials

I have a table with frequencies for the control and treatment groups of a multinomial factor (`response`) with three levels (Negative, Neutral, Positive). I want to calculate, for each level, the difference between treatment and control along with confidence intervals, and add them to the table.
I am looking for something that can be applied to several similar frequency tables comparing treatment and control groups, where the response categories vary (e.g. unlikely, 50-50, likely).
Here is the table:
N_A <- data.frame(response = c("Negative", "Neutral", "Positive"),
                  n_T = c(48, 43, 42), # treatment group
                  n_C = c(36, 40, 51)  # control group
)
I have tried to use the BinomDiffCI function from the DescTools package. I managed to write a function that runs BinomDiffCI for the first row and extracts the lower CI.
library(DescTools)
lci.diff <- function() {
  xci <- BinomDiffCI(x1 = N_A[1, 2], n1 = sum(N_A[2]),
                     x2 = N_A[1, 3], n2 = sum(N_A[3]), method = "waldcc")
  xci[, 2]
}
It's not great, but maybe a start. I want to 1) add the difference and the upper CI, 2) do the same for all rows, 3) attach the results to the dataset, and 4) apply the same to other frequency tables comparing treatment and control.
Here is the code to create the lower and upper bounds of the confidence interval:
library(DescTools)
ci_diff <- function(df, i) {
  tbl <- BinomDiffCI(x1 = df[i, 2], n1 = sum(df[2]),
                     x2 = df[i, 3], n2 = sum(df[3]), method = "waldcc")
  tbl[, c("lwr.ci", "upr.ci")]
}
N_A <- cbind(N_A, t(sapply(1:nrow(N_A), \(i) ci_diff(N_A, i))))
response n_T n_C lwr.ci upr.ci
1 Negative 48 36 -0.04342071 0.1982961
2 Neutral 43 40 -0.11268594 0.1293812
3 Positive 42 51 -0.20971246 0.0381418
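A possible extension of ci_diff() (a sketch, assuming BinomDiffCI's usual return columns "est", "lwr.ci", and "upr.ci"): keep the point estimate of the difference as well, and wrap the row loop so the same function can be applied to any table with the response/n_T/n_C layout:
library(DescTools)
ci_table <- function(df) {
  out <- t(sapply(seq_len(nrow(df)), function(i) {
    BinomDiffCI(x1 = df[i, 2], n1 = sum(df[2]),
                x2 = df[i, 3], n2 = sum(df[3]),
                method = "waldcc")[1, c("est", "lwr.ci", "upr.ci")]
  }))
  cbind(df, out)  # difference estimate plus lower/upper CI per row
}
ci_table(N_A)  # reusable for other treatment/control frequency tables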

How to fix a vector size discrepancy in lmeInfo::g_mlm between r_const and utils::combn(1:cor_q, 2): n < m

Using this code block in a fresh install of R,
if (!require('readr')) install.packages('readr')
data <- as.data.frame(readr::read_delim("/Users/{le me}/Desktop/{path}/Histology Data for R.csv", delim = ","))
DataGroupLabel <- "3groupAnova"
subdata <- data[3:80,1:7] # Select a single trial of data to analyze for 3-way ANOVA
colnames(subdata) <- c("Tx","XCoordinate","Channel","VarianceLabel", "VarianceCode", "BrainRegion", "CellCount") # give the data simple labels
subdata$CellCount <- as.numeric(subdata$CellCount) # convert CellCount to numeric
# Calculate intraclass correlation (ICC)
ANOVAresult <- aov(CellCount ~ XCoordinate, data = subdata)
ANOVAtable <- summary(ANOVAresult)
names(ANOVAtable) <- "Ftable"
Atable <- as.data.frame(ANOVAtable$Ftable)
icc <- Atable[1,3]/(Atable[1,3] + Atable[2,3])
icc
# dummy code the 3 groups (sham = baseline = 0 in $CCI and 0 in $Xpro)
subdata$CCI <- 0
subdata$CCI[subdata$Tx=="CCI"]<- 1
subdata$Xpro <- 0
subdata$Xpro[subdata$Tx=="Xpro"]<- 1
subdata$CCI=as.factor(subdata$CCI)
subdata$Xpro=as.factor(subdata$Xpro)
#subdata$XCoordinate <- type_convert(subdata$XCoordinate)
head(subdata, 100) # Preview & inspect
if (!require('nlme')) install.packages('nlme') # fit multilevel model to run t-test
HCSresult <- nlme::gls(CellCount ~ CCI + Xpro,
                       # heterogeneous variances
                       weights = varIdent(form = ~1 | VarianceCode),
                       correlation = corSymm(value = numeric(0), form = ~1 | XCoordinate, fixed = TRUE),
                       data = subdata)
residual <- HCSresult$residuals #setup plots for inspection
par(mfrow=c(1,2))
h <- hist(residual, xlab="Residual")
xfit <- seq(min(residual), max(residual), length = 40)
yfit <- dnorm(xfit, mean = mean(residual), sd = sd(residual))
yfit <- yfit * diff(h$mids[1:2]) * length(residual)
lines(xfit, yfit, col = "black", lwd = 2)
qqnorm(residual, frame = FALSE)
qqline(residual)
par(mfrow=c(1,1))
summary(HCSresult)
# fit null model for comparison
NULresult <- nlme::gls(CellCount ~ 1,
                       # heterogeneous variances
                       weights = varIdent(form = ~1 | VarianceCode),
                       correlation = corSymm(value = numeric(0), form = ~1 | XCoordinate, fixed = TRUE),
                       data = subdata)
logLik(NULresult, REML=FALSE)
logLik(HCSresult, REML=FALSE)
chidiff <- -2*as.numeric(logLik(NULresult, REML=FALSE) - logLik(HCSresult, REML=FALSE))
chidiff # print delta chi-square value
qchisq(.95, df=2) # print critical value
round(pchisq(chidiff, df=2, lower.tail=FALSE), 4) # p-value
if (!require('lmeInfo')) install.packages('lmeInfo') # asymmetric conf interval for ES
ctrl <- lmeControl(opt = 'optim')
result <- lme(fixed = CellCount ~ CCI + Xpro, control = ctrl,
              random = ~ 1 | XCoordinate,
              correlation = corSymm(value = numeric(0), form = ~1 | XCoordinate, fixed = TRUE),
              # heterogeneous variances
              weights = varIdent(form = ~1 | VarianceCode),
              data = subdata)
#control=lmeControl(opt="optim") # Switching optimizers for occasional convergence errors
summary(result)
# extract_varcomp(result,separate_variances = TRUE)
ESresult <- lmeInfo::g_mlm(result, p_const = c(0,1,1), r_const = c(1,1,0,0,1,1,1),
infotype = "expected", separate_variances = TRUE)
summary(ESresult)
CI_g(ESresult, symmetric = FALSE)
I fixed a bunch of errors to get this to run all the way to the final ESresult <- lmeInfo::g_mlm() call. Unfortunately, the lmeInfo package's GitHub page ( https://github.com/jepusto/lmeInfo ) doesn't go into detail on how to specify the r_const = c() vector. My advisor said to use these packages to run this analysis, but is functionally unavailable to teach me how to use them; specifically, how to understand the string of integers in r_const. I think I'll need to adjust r_const for other blocks of data in this set, as many of the blocks have unique sample sizes.
Got this error: Error in utils::combn(1:cor_q, 2) : n < m
...against the following example block of data:
CCI,2108,FITC,Ipsi Group 1,A,Cortex,12.2222222
CCI,1218,FITC,Ipsi Group 1,A,Cortex,19.4736842
CCI,2380,FITC,Ipsi Group 1,A,Cortex,15
CCI,4479,FITC,Ipsi Group 1,A,Cortex,9.47368421
CCI,949,FITC,Ipsi Group 1,A,Cortex,25.2631579
CCI,2129,FITC,Ipsi Group 1,A,Cortex,0
CCI,618,FITC,Ipsi Group 1,A,Cortex,15.5555556
CCI,5061,FITC,Ipsi Group 1,A,Cortex,28.3333333
CCI,2768,FITC,Ipsi Group 1,A,Cortex,25
CCI,2733,FITC,Ipsi Group 1,A,Cortex,31
CCI,2063,FITC,Ipsi Group 1,A,Cortex,25.5
CCI,5307,FITC,Ipsi Group 1,A,Cortex,36
CCI,1923,FITC,Ipsi Group 1,A,Cortex,16.3157895
CCI,9414,FITC,Contra Group 1,D,Cortex,1.05263158
CCI,9123,FITC,Contra Group 1,D,Cortex,4
CCI,10723,FITC,Contra Group 1,D,Cortex,5.2631579
CCI,9737,FITC,Contra Group 1,D,Cortex,6.5
CCI,7422,FITC,Contra Group 1,D,Cortex,8.42105263
CCI,9012,FITC,Contra Group 1,D,Cortex,5
CCI,8924,FITC,Contra Group 1,D,Cortex,5
CCI,9550,FITC,Contra Group 1,D,Cortex,9
CCI,8070,FITC,Contra Group 1,D,Cortex,6
CCI,8734,FITC,Contra Group 1,D,Cortex,7.5
CCI,10154,FITC,Contra Group 1,D,Cortex,9.5
CCI,10203,FITC,Contra Group 1,D,Cortex,38.5
CCI,11305,FITC,Contra Group 1,D,Cortex,24.4444444
Sham,803,FITC,Ipsi Group 2,B,Cortex,1.49342891
Sham,1937,FITC,Ipsi Group 2,B,Cortex,1.88101926
Sham,278,FITC,Ipsi Group 2,B,Cortex,1.38966092
Sham,1414,FITC,Ipsi Group 2,B,Cortex,0
Sham,2762,FITC,Ipsi Group 2,B,Cortex,2.86418056
Sham,3218,FITC,Ipsi Group 2,B,Cortex,1.06126694
Sham,157,FITC,Ipsi Group 2,B,Cortex,5.68860572
Sham,1174,FITC,Ipsi Group 2,B,Cortex,3.09463391
Sham,4481,FITC,Ipsi Group 2,B,Cortex,5.33253701
Sham,4011,FITC,Ipsi Group 2,B,Cortex,4.53607312
Sham,2784,FITC,Ipsi Group 2,B,Cortex,1.1154863
Sham,2379,FITC,Ipsi Group 2,B,Cortex,1.05037604
Sham,1417,FITC,Ipsi Group 2,B,Cortex,0
Sham,6739,FITC,Contra Group 2,E,Cortex,9.59452079
Sham,5166,FITC,Contra Group 2,E,Cortex,2.18957325
Sham,7252,FITC,Contra Group 2,E,Cortex,5.16175655
Sham,7115,FITC,Contra Group 2,E,Cortex,9.05843988
Sham,8000,FITC,Contra Group 2,E,Cortex,11.5952485
Sham,5843,FITC,Contra Group 2,E,Cortex,0
Sham,6361,FITC,Contra Group 2,E,Cortex,4.84261501
Sham,8444,FITC,Contra Group 2,E,Cortex,6.10209357
Sham,8247,FITC,Contra Group 2,E,Cortex,5.22677678
Sham,5260,FITC,Contra Group 2,E,Cortex,3.88888889
Sham,7526,FITC,Contra Group 2,E,Cortex,3.97645936
Sham,7495,FITC,Contra Group 2,E,Cortex,5.7390456
Sham,8052,FITC,Contra Group 2,E,Cortex,3.17302506
Xpro,2845,FITC,Ipsi Group 3,C,Cortex,33.6842105
Xpro,1379,FITC,Ipsi Group 3,C,Cortex,9.09090909
Xpro,1592,FITC,Ipsi Group 3,C,Cortex,21.6666667
Xpro,968,FITC,Ipsi Group 3,C,Cortex,6.25
Xpro,2982,FITC,Ipsi Group 3,C,Cortex,17.2222222
Xpro,1212,FITC,Ipsi Group 3,C,Cortex,6.31578947
Xpro,2563,FITC,Ipsi Group 3,C,Cortex,29
Xpro,4615,FITC,Ipsi Group 3,C,Cortex,30
Xpro,630,FITC,Ipsi Group 3,C,Cortex,23
Xpro,5014,FITC,Ipsi Group 3,C,Cortex,27
Xpro,1409,FITC,Ipsi Group 3,C,Cortex,16.5
Xpro,3969,FITC,Ipsi Group 3,C,Cortex,23
Xpro,3527,FITC,Ipsi Group 3,C,Cortex,15
Xpro,7253,FITC,Contra Group 3,F,Cortex,7
Xpro,7891,FITC,Contra Group 3,F,Cortex,9
Xpro,6711,FITC,Contra Group 3,F,Cortex,15.625
Xpro,7830,FITC,Contra Group 3,F,Cortex,11.1111111
Xpro,7218,FITC,Contra Group 3,F,Cortex,10.2857143
Xpro,7849,FITC,Contra Group 3,F,Cortex,4.57085395
Xpro,7303,FITC,Contra Group 3,F,Cortex,12.6019498
Xpro,6928,FITC,Contra Group 3,F,Cortex,33.8199036
Xpro,7991,FITC,Contra Group 3,F,Cortex,5.81483248
Xpro,8452,FITC,Contra Group 3,F,Cortex,1.28539661
Xpro,6717,FITC,Contra Group 3,F,Cortex,36.0048967
Xpro,7328,FITC,Contra Group 3,F,Cortex,23.0207289
Xpro,5772,FITC,Contra Group 3,F,Cortex,7
So, I've troubleshot all the way to the end and I'm stuck on these last lines of code. Much appreciation to anyone for their wisdom.
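One way to see what r_const has to line up with (a diagnostic sketch rather than a definitive fix, since the component ordering depends on the fitted model): extract_varcomp(), which the script above already references in a commented-out line, lists every variance/correlation component that g_mlm() knows about, and r_const needs one entry per component, in that order:
vc <- lmeInfo::extract_varcomp(result, separate_variances = TRUE)
str(vc)             # names each variance/correlation component in order
length(unlist(vc))  # total component count = required length of r_const
The Error in utils::combn(1:cor_q, 2) : n < m suggests that, for this block of data, the corSymm structure contributes fewer than two correlation parameters, so an r_const written for a block with more parameters no longer matches.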

R: for loop to simulate predictions when random sampling is applied

I am trying to simulate how replacement/reassignment of values in random samples affects the predictions conveyed by AUC.
I have a tumor classification in a dataframe denoted df$who which has levels 1, 2, 3 corresponding to the severity of the tumor lesion.
Intro to the question
Let's say the baseline data look like this:
set.seed(1)
df <- data.frame(
  who = as.factor(sample(1:3, size = 6000, replace = TRUE, prob = c(0.8, 0.15, 0.05))),
  age = round(runif(n = 6000, min = 18, max = 95), digits = 1),
  gender = sample(c("m", "f"), size = 6000, replace = TRUE, prob = c(1/3, 2/3)),
  event.time = runif(n = 6000, min = 8, max = 120),
  event = as.factor(sample(0:2, size = 6000, replace = TRUE, prob = c(0.25, 0.2, 0.55)))
)
And a standard cause-specific Cox regression looks like:
library(survival)
a_baseline <- coxph(Surv(event.time, event == 1) ~ who + age + gender, data = df, x = TRUE)
From this, the AUC can be obtained as a measure of predictive performance. Here, a leave-one-out bootstrap for 5-year prediction of df$event == 1:
library(riskRegression)
u <- Score(list("baseline" = a_baseline),
           Surv(event.time, event == 1) ~ 1,
           data = df,
           times = 60,
           plots = "cal",
           B = 50,
           split.method = "loob",
           metrics = c("auc", "brier"))
# The AUC is then obtained
u$AUC$score$AUC[2]
Question
I want to simulate how re-classifying a random 5% of df$who == 1 to df$who == 2 affects the 5-year prediction of df$event == 1.
I want to create 10 separate simulated copies of the baseline data df, each containing a random reallocation of 5% of df$who == 1 to df$who == 2. Then I want to apply each of these 10 simulated datasets to predict the 5-year risk of df$event == 1.
I have applied a for loop to this. The expected output is a dataframe that tells me which of the 10 simulated datasets yielded the highest and lowest u$AUC$score$AUC[2] (i.e., the best and worst prediction).
I am new to for loops, but here is my attempt (which obviously did not work):
all_auc <- data.frame() ## dataframe to collect AUC from all 10 simulated sub-datasets
for (i in 1:10) { # 1:10 represent the simulated datasets 1 to 10
  df[i] <- df # allocate baseline data to the i'th dataset
  df[i]$who[sample(which(df[i]$who == 1), round(0.05 * length(which(df[i]$who == 1))))] <- 2 # randomly reallocate 5% of who == 1 to who == 2 in the i'th simulated dataset
  ith_cox <- coxph(Surv(event.time, event == 1) ~ who + age + gender, data = df[i], x = TRUE) # Cox regression on the i'th dataset
  # predictions based on the i'th Cox model
  u[i] <- Score(list("baseline" = ith_cox),
                Surv(event.time, event == 1) ~ 1,
                data = df[i],
                times = 60,
                plots = "cal",
                B = 50,
                split.method = "loob",
                metrics = c("auc", "brier"))
  # summarize AUC from all 10 sub-datasets
  all_auc <- u[i]$AUC$score$AUC[2]
}
(1) I could not get this for loop to work as described, and
(2) the final dataframe all_auc should report only which of the 10 datasets yielded the worst and best predictions (I will then use these two datasets for further analysis). A minimal working sketch follows.
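A minimal working sketch (assuming the survival/riskRegression setup above; the AUC indexing mirrors u$AUC$score$AUC[2] from the question): keep the simulated datasets in a list, collect the AUCs in a numeric vector, and look up the best and worst runs with which.max()/which.min():
sim_list <- vector("list", 10)
auc <- numeric(10)
for (i in 1:10) {
  d <- df                                             # start from the baseline data
  idx <- which(d$who == 1)
  d$who[sample(idx, round(0.05 * length(idx)))] <- 2  # reassign a random 5% of who == 1
  sim_list[[i]] <- d
  fit <- coxph(Surv(event.time, event == 1) ~ who + age + gender, data = d, x = TRUE)
  s <- Score(list("sim" = fit),
             Surv(event.time, event == 1) ~ 1,
             data = d, times = 60, B = 50,
             split.method = "loob", metrics = c("auc", "brier"))
  auc[i] <- s$AUC$score$AUC[2]
}
best <- which.max(auc)   # sim_list[[best]] is the dataset to carry forward
worst <- which.min(auc)
data.frame(dataset = c(best, worst), AUC = auc[c(best, worst)], row.names = c("best", "worst"))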
A final note
This is only a reproducible example. The for loop will be applied to 10,000 simulated datasets in our analysis. I do not know if this affects the answer, but it illustrates the importance of the result: a dataframe (or vector?) that simply tells me which simulated dataset yielded the best vs. worst predictions, so that I can subsequently use those two datasets (e.g. df2930 and df8939) for further analysis.

Extracting individual growth constants using population growth curve model in R

I would like to derive individual growth rates from our growth model directly, similar to this OP and this OP.
I am working with a dataset that contains the age and weight (wt) measurements for ~2000 individuals in a population. Each individual is represented by a unique id number.
A sample of the data can be found here. Here is what the data looks like:
id age wt
1615 6 15
3468 32 61
1615 27 50
1615 60 145
6071 109 209
6071 125 207
10645 56 170
10645 118 200
I have developed a non-linear growth curve to model growth for this dataset (at the population level). It looks like this:
wt ~ A*atan(k*age - t0) + m
which predicts weight (wt) for a given age and has modifiable parameters A, k, t0, and m. I have fit this model to the dataset at the population level using an nlme regression in which I specified individual id as a random effect and used pdDiag to specify the parameters as uncorrelated. (Note: the random effect would need to be dropped when looking at the individual level.)
The code for this looks like:
nlme.k = nlme(wt ~ A*atan(k*age - t0) + m,
              data = df,
              fixed = A + k + t0 + m ~ 1,
              random = list(id = pdDiag(A + t0 + k + m ~ 1)), # cannot include when looking at the individual level
              start = c(A = 99.31, k = 0.02667, t0 = 1.249, m = 103.8), # population-level values; might need to be changed for individual models
              na.action = na.omit,
              control = nlmeControl(maxIter = 200, pnlsMaxIter = 10, msMaxIter = 100))
I have our population level growth model (nlme.k), but I would like to use it to derive/extract individual values for each growth constant.
How can I extract individual growth constants for each id using my population level growth model (nlme.k)? Note that I don't need it to be a solution that uses nlme, that is just the model I used for the population growth model.
Any suggestions would be appreciated!
I think this is not possible due to the nature of how random effects are designed. According to this post, the effect size (your growth constant) is estimated using partial pooling, which involves using data points from other groups. Thus you cannot estimate the effect size of each group (your individual id) in isolation.
Strictly speaking (see here), random effects are not really a part of the model at all, but more a part of the error.
However, you can estimate the R2 for all groups together. If you want it at an individual level (e.g. parameter estimates for id 1), then run the same model on only the data points of that particular individual. This gives you n models with n parameter sets for n individuals.
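A minimal sketch of that per-individual refit (assuming df has the id, age, and wt columns shown above, and reusing the population-level estimates as starting values; tryCatch skips individuals whose fits fail to converge):
fit_one <- function(d) {
  nls(wt ~ A*atan(k*age - t0) + m, data = d,
      start = c(A = 99.31, k = 0.02667, t0 = 1.249, m = 103.8))
}
ind_fits <- lapply(split(df, df$id), function(d) tryCatch(fit_one(d), error = function(e) NULL))
ok <- !sapply(ind_fits, is.null)
ind_coefs <- t(sapply(ind_fits[ok], coef))  # one row of A, k, t0, m estimates per id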
We ended up using a few loops to do this.
Note that our answer builds off a model posted in this OP if anyone wants the background script. We will also link to the published script when it is posted.
For now, this should give a general idea of how we did it.
#Individual fits dataframe generation
yid_list <- unique(young_inds$squirrel_id)
indf_prs <- list('df', 'squirrel_id', 'A_value', 'k_value', 'mx_value', 'my_value', 'max_grate', 'hit_asymptote', 'age_asymptote', 'ind_asymptote', 'ind_mass_asy', 'converge') #List of parameters
ind_fits <- data.frame(matrix(ncol = length(indf_prs), nrow = length(yid_list))) #Blank dataframe for all individual fits
colnames(ind_fits) <- indf_prs
#Calculates individual fits for all individuals and appends into ind_fits
for (i in 1:length(yid_list)) {
yind_df <-young_inds%>%filter(squirrel_id %in% yid_list[i]) #Extracts a dataframe for each squirrel
ind_fits[i , 'squirrel_id'] <- as.numeric(yid_list[i]) #Appends squirrel i's id into individual fits dataframe
sex_lab <- unique(yind_df$sex) #Identifies and extracts squirrel "i"s sex
mast_lab <- unique(yind_df$b_mast) #Identifies and extracts squirrel "i"s mast value
Hi_dp <- max(yind_df$wt) #Extracts the largest mass for each squirrel
ind_long <- unique(yind_df$longevity) #Extracts the individual death date
#Sets corresponding values for squirrel "i"
if (mast_lab==0 && sex_lab=="F") { #Female no mast
ind_fits[i , 'df'] <- "fnm" #Squirrel dataframe (appends into ind_fits dataframe)
df_asm <- af_asm #average asymptote value corresponding to sex
df_B_guess <- guess_df[1, "B_value"] #Initial guesses for nls fits corresponding to sex and mast
df_k_guess <- guess_df[1, "k_value"]
df_mx_guess <- guess_df[1, "mx_value"]
df_my_guess <- guess_df[1, "my_value"]
ind_asyr <- indf_asy #growth rate at individual asymptote
} else if (mast_lab==0 && sex_lab=="M") { #Male no mast
ind_fits[i , 'df'] <- "mnm"
df_asm <- am_asm
df_B_guess <- guess_df[2, "B_value"]
df_k_guess <- guess_df[2, "k_value"]
df_mx_guess <- guess_df[2, "mx_value"]
df_my_guess <- guess_df[2, "my_value"]
ind_asyr <- indm_asy
} else if (mast_lab==1 && sex_lab=="F") { #Female mast
ind_fits[i , 'df'] <- "fma"
df_asm <- af_asm
df_B_guess <- guess_df[3, "B_value"]
df_k_guess <- guess_df[3, "k_value"]
df_mx_guess <- guess_df[3, "mx_value"]
df_my_guess <- guess_df[3, "my_value"]
ind_asyr <- indm_asy
} else if (mast_lab==1 && sex_lab=="M") { #Males mast
ind_fits[i , 'df'] <- "mma"
df_asm <- am_asm
df_B_guess <- guess_df[4, "B_value"]
df_k_guess <- guess_df[4, "k_value"]
df_mx_guess <- guess_df[4, "mx_value"]
df_my_guess <- guess_df[4, "my_value"]
ind_asyr <- indf_asy
} else { #If sex or mast is not identified or is identified improperly in the data
print("NA")
} #End of if/else chain
#Arctangent
#Fits nls model to the created dataframe
nls.floop <- tryCatch({data.frame(tidy(nls(wt~ B*atan(k*(age - mx)) + my, #tryCatch lets nls have alternate results instead of "code stopping" errors
data=yind_df,
start = list(B = df_B_guess, k = df_k_guess, mx = df_mx_guess, my = df_my_guess),
control= list(maxiter = 200000, minFactor = 1/100000000))))
},
error = function(e){
nls.floop <- data.frame(c(0,0), c(0,0)) #Specifies nls.floop as a dummy dataframe if no convergence
},
warning = function(w) {
nls.floop <- data.frame(tidy(nls.floop)) #Fit is the same if warning is displayed
}) #End of nls.floop
#Creates a dummy numerical index from nls.floop for the if statement below
numeric_floop <- as.numeric(nls.floop[1, 2])
#print(numeric_floop) #Taking a look at the values. If numeric_floop...
# == 0, the fit did not converge on iteration "i"
# != 0, the fit did converge on iteration "i" and the code will run through the calculations
if (numeric_floop != 0) {
results_DF <- nls.floop
ind_fits[i , 'converge'] <- 1 #converge = 1 for converging fit
#Extracting, calculating, and appending values into dataframe
B_value <- as.numeric(results_DF[1, "estimate"]) #B value
k_value <- as.numeric(results_DF[2, "estimate"]) #k value
mx_value <- as.numeric(results_DF[3, "estimate"]) #mx value
my_value <- as.numeric(results_DF[4, "estimate"]) #my value
A_value <- ((B_value*pi)/2)+ my_value #A value calculation
ind_fits[i , 'A_value'] <- A_value
ind_fits[i , 'k_value'] <- k_value
ind_fits[i , 'mx_value'] <- mx_value
ind_fits[i , 'my_value'] <- my_value #appends my_value into df
ind_fits[i , 'max_grate'] <- adr(mx_value, B_value, k_value, mx_value, my_value) #Calculates max growth rate
}
} #End of individual fits loop
Which gives this output:
> head(ind_fits%>%select(df, squirrel_id, A_value, k_value, mx_value, my_value))
df squirrel_id A_value k_value mx_value my_value
1 mnm 332 257.2572 0.05209824 52.26842 126.13183
2 mnm 1252 261.0728 0.02810033 42.37454 103.02102
3 mnm 3466 260.4936 0.03946594 62.27705 131.56665
4 fnm 855 437.9569 0.01347379 86.18629 158.27641
5 fnm 2409 228.7047 0.04919819 63.99252 123.63404
6 fnm 1417 196.0578 0.05035963 57.67139 99.65781
Note that you need to create a blank dataframe first before running the loops.

Learning hidden markov model in R

A hidden Markov model (HMM) is one in which you observe a sequence of observations, but do not know the sequence of states the model went through to generate the observations. Analyses of hidden Markov models seek to recover the sequence of hidden states from the observed data.
I have data with both observations and hidden states (observations are of continuous values) where the hidden states were tagged by an expert. I would like to train a HMM that would be able - based on a (previously unseen) sequence of observations - to recover the corresponding hidden states.
Is there any R package to do that? The existing packages I have studied (depmixS4, HMM, seqHMM; the latter handles categorical data only) only let you specify the number of hidden states.
EDIT:
Example:
data.tagged.by.expert = data.frame(
  hidden.state = c("Wake", "REM", "REM", "NonREM1", "NonREM2", "REM", "REM", "Wake"),
  sensor1 = c(1, 1.2, 1.2, 1.3, 4, 2, 1.78, 0.65),
  sensor2 = c(7.2, 5.3, 5.1, 1.2, 2.3, 7.5, 7.8, 2.1),
  sensor3 = c(0.01, 0.02, 0.08, 0.8, 0.03, 0.01, 0.15, 0.45)
)
data.newly.measured = data.frame(
  sensor1 = c(2, 3, 4, 5, 2, 1, 2, 4, 5, 8, 4, 6, 1, 2, 5, 3, 2, 1, 4),
  sensor2 = c(2.1, 2.3, 2.2, 4.2, 4.2, 2.2, 2.2, 5.3, 2.4, 1.0, 2.5, 2.4, 1.2, 8.4, 5.2, 5.5, 5.2, 4.3, 7.8),
  sensor3 = c(0.23, 0.25, 0.23, 0.54, 0.36, 0.85, 0.01, 0.52, 0.09, 0.12, 0.85, 0.45, 0.26, 0.08, 0.01, 0.55, 0.67, 0.82, 0.35)
)
I would like to create an HMM with discrete time t, where the random variable x(t) represents the hidden state at time t, x(t) ∈ {"Wake", "REM", "NonREM1", "NonREM2"}, and three continuous random variables sensor1(t), sensor2(t), sensor3(t) represent the observations at time t.
model.hmm = learn.model(data.tagged.by.expert)
Then I would like to use the created model to estimate hidden states responsible for newly measured observations
hidden.states = estimate.hidden.states(model.hmm, data.newly.measured)
Data (training/testing)
To be able to run learning methods for the Naive Bayes classifier, we need a longer data set:
states = c("NonREM1", "NonREM2", "NonREM3", "REM", "Wake")
artificial.hypnogram = rep(c(5,4,1,2,3,4,5), times = c(40,150,200,300,50,90,30))
data.tagged.by.expert = data.frame(
  hidden.state = states[artificial.hypnogram],
  sensor1 = log(artificial.hypnogram) + runif(n = length(artificial.hypnogram), min = 0.2, max = 0.5),
  sensor2 = 10*artificial.hypnogram + sample(c(-8:8), size = length(artificial.hypnogram), replace = T),
  sensor3 = sample(1:100, size = length(artificial.hypnogram), replace = T)
)
hidden.hypnogram = rep(c(5,4,1,2,4,5), times = c(10,10,15,10,10,3))
data.newly.measured = data.frame(
  sensor1 = log(hidden.hypnogram) + runif(n = length(hidden.hypnogram), min = 0.2, max = 0.5),
  sensor2 = 10*hidden.hypnogram + sample(c(-8:8), size = length(hidden.hypnogram), replace = T),
  sensor3 = sample(1:100, size = length(hidden.hypnogram), replace = T)
)
Solution
In the solution, we used Viterbi algorithm - combined with Naive Bayes classifier.
At each clock time t, a hidden Markov model consists of
an unobserved state (denoted hidden.state in this case) taking one of a finite number of values
states = c("NonREM1", "NonREM2", "NonREM3", "REM", "Wake")
a set of observed variables (sensor1, sensor2, sensor3 in this case)
Transition matrix
A new state is entered based upon a transition probability distribution (transition matrix). This can easily be computed from data.tagged.by.expert, e.g. using
library(markovchain)
trans_p <- markovchainFit(data.tagged.by.expert$hidden.state)$estimate
Emission matrix
After each transition is made, an observation (sensor_i) is produced according to a conditional probability distribution (emission matrix) that depends only on the current state H of hidden.state. We will replace the emission matrices with a Naive Bayes classifier.
library(caret)
library(klaR)
library(e1071)
model = train(hidden.state ~ .,
              data = data.tagged.by.expert,
              method = 'nb',
              trControl = trainControl(method = 'cv', number = 10))
Viterbi algorithm
To solve the problem, we use the Viterbi algorithm with an initial probability of 1 for the "Wake" state and 0 otherwise (we expect the patient to be awake at the beginning of the experiment).
# we expect the patient to be awake in the beginning
start_p = c(NonREM1 = 0,NonREM2 = 0,NonREM3 = 0, REM = 0, Wake = 1)
# Naive Bayes model
model_nb = model$finalModel
# the observations
observations = data.newly.measured
nObs <- nrow(observations) # number of observations
nStates <- length(states) # number of states
# T1, T2 initialization
T1 <- matrix(0, nrow = nStates, ncol = nObs) #define two 2-dimensional tables
row.names(T1) <- states
T2 <- T1
# init first column of T1
Byj <- predict(model_nb, newdata = observations[1, ])$posterior
for (s in states)
  T1[s, 1] = start_p[s] * Byj[1, s]
# fill T1 and T2 tables
for (j in 2:nObs) {
  Byj <- predict(model_nb, newdata = observations[j, ])$posterior
  for (s in states) {
    res <- (T1[, j-1] * trans_p[, s]) * Byj[1, s]
    T2[s, j] <- states[which.max(res)]
    T1[s, j] <- max(res)
  }
}
# backtrack best path
result <- rep("", times = nObs)
result[nObs] <- names(which.max(T1[, nObs]))
for (j in nObs:2) {
  result[j-1] <- T2[result[j], j]
}
# show the result
result
# show the original artificial data
states[hidden.hypnogram]
References
To read more about the problem, see: Vomlel, Jiří and Kratochvíl, Václav: Dynamic Bayesian Networks for the Classification of Sleep Stages. In Proceedings of the 11th Workshop on Uncertainty Processing (WUPES'18), Eds: Kratochvíl, Václav and Vejnarová, Jiřina, Třeboň, CZ, 2018, pp. 205-215.
