Trouble visualizing K-means clusters with fviz_clusters() - r

Currently trying to visualize k-means clusters and running into a bit of trouble. I'm getting this error message when I run the code below:
Error in fviz_cluster(res.km, data = nci[, 5], palette = c("#2E9FDF", :
The dimension of the data < 2! No plot.
Here's my code:
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidyverse)
library(hrbrthemes)
library(factoextra)
library(ggpubr)
nci <- read.csv('/Users/KyleHammerberg/Desktop/ML Extra Credit/nci.datanames.csv')
names(nci)[1] <- "gene"
# Compute k-means with k = 3
set.seed(123)
res.km <- kmeans(scale(nci[,2]), 3, nstart = 25)
# K-means clusters showing the group of each individuals
res.km$cluster
fviz_cluster(res.km, data = nci[,5 ],
palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
geom = "point",
ellipse.type = "convex",
ggtheme = theme_bw()
)
res.km$cluster
[1] 1 2 1 2 3 1 1 3 3 3 3 3 1 1 1 3 3 3 1 3 3 3 3 1 1 1 3 3 3 3 1 3 3 1 3 3 1 1 1 1 1 3
[43] 1 3 3 3 1 1 1 1 3 3 3 3 3 3 3 1 1 3 3 1 1 1 1 1 1 1 3 1 3 1 1 1 3 3 1 2 1 1 3 2 1 3
[85] 1 1 1 1 1 1 1 2 3 1 1 1 3 3 1 1 1 1 1 1 1 3 2 1 2 1 3 3 1 1 1 1 3 3 1 3 3 3 3 1 1 1
[127] 3 3 1 3 1 1 1 3 1 1 1 2 2 2 1 2 2 2 3 1 1 3 3 1 3 1 2 1 3 3 3 3 3 3 1 1 3 1 1 3 3 3
[169] 1 3 3 3 3 1 1 3 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 1 2 3 3 3 1 3 3 1 1 3 3 1 3 1 1 3 3 1
[211] 3 1 3 1 3 3 1 3 3 1 1 1 1 3 3 1 3 1 3 3 3 3 1 1 1 1 1 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3
[253] 3 3 1 3 3 3 3 3 1 2 1 3 1 3 3 1 1 3 1 1 1 1 1 3 1 3 3 3 3 1 1 3 3 1 3 3 1 1 1 3 1 1
[295] 2 3 1 3 1 3 1 3 1 3 3 3 1 3 3 3 3 3 3 3 1 1 1 1 3 1 1 1 3 1 3 1 1 1 1 3 3 1 3 1 1 1
[337] 3 1 1 2 1 1 1 1 1 1 3 1 3 3 1 3 1 3 3 1 1 3 3 1 1 1 3 1 1 3 3 1 1 1 1 1 1 1 3 1 3 1
[379] 1 1 1 1 1 1 1 1 3 3 1 3 1 1 1 2 1 1 1 3 1 1 1 1 1 3 3 1 3 3 3 1 1 1 1 1 1 1 1 1 3 1
[421] 1 1 1 3 1 3 1 2 1 3 3 3 1 1 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1 3 1 3 3 3 1 1 3 3 1 1 1 3
[463] 3 3 1 3 3 1 3 3 3 3 1 3 1 1 1 3 1 3 3 3 3 3 3 3 3 3 1 3 1 1 3 3 1 1 3 3 3 3 3 3 3 3
[505] 3 3 3 1 3 1 3 3 2 1 1 3 3 1 3 3 3 1 1 3 3 3 1 1 1 1 1 3 3 1 3 3 1 1 1 3 3 1 3 3 1 3
[547] 1 1 1 1 3 3 3 1 3 3 3 3 3 3 1 2 1 1 3 3 3 3 1 1 3 3 3 3 3 1 3 1 1 3 1 3 3 3 3 3 3 3
[589] 1 1 1 1 1 1 3 1 3 1 3 3 3 3 3 1 3 3 3 3 3 1 1 3 3 3 3 3 3 1 3 1 3 3 3 3 3 3 1 3 3 3
[631] 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 3 1 3 3 1 3 3 3 1 3
[673] 1 3 3 1 1 1 3 1 3 3 3 3 1 3 3 1 3 1 1 1 1 3 1 3 1 3 3 3 1 1 1 3 1 1 1 1 3 3 3 3 3 3
[715] 1 1 1 1 1 1 1 3 1 1 1 3 1 1 3 3 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 3 1 3 1 1 3 3
[757] 1 1 1 1 1 1 1 3 3 3 3 1 3 1 1 3 1 3 3 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1
[799] 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 3 3 1 3 3 1 3 1 3 1 3 1 3 1 3 1 3 1 1 1 1 3 3 1 3 3
[841] 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 1 3 3 1 2 1 1 1 3 3 1 3 1 1 1 1 1 1 3 1 3 1 1 1
[883] 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 3 3 3 3 3 3 1 1 2 1 3 1 1 1 1 1 1 1 3 1 3 1 3 1 1 1 1
[925] 1 1 1 3 3 1 1 3 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 3 3 3 3 3 3 3 1 1 1 3 1 3 1 1 1 1 1 1
[967] 1 1 1 3 1 1 3 1 3 1 3 1 1 3 1 3 3 3 3 3 3 3 1 3 1 3 3 3 3 1 3 1 1 1
[ reached getOption("max.print") -- omitted 5830 entries ]
Here's a look at the data if that helps:
head(nci)
gene CNS CNS.1 CNS.2 RENAL BREAST CNS.3 CNS.4 BREAST.1 NSCLC NSCLC.1
1 g1 0.300 0.679961 0.940 2.80e-01 0.485 0.310 -0.830 -0.190 0.460 0.760
2 g2 1.180 1.289961 -0.040 -3.10e-01 -0.465 -0.030 0.000 -0.870 0.000 1.490
3 g3 0.550 0.169961 -0.170 6.80e-01 0.395 -0.100 0.130 -0.450 1.150 0.280
4 g4 1.140 0.379961 -0.040 -8.10e-01 0.905 -0.460 -1.630 0.080 -1.400 0.100
5 g5 -0.265 0.464961 -0.605 6.25e-01 0.200 -0.205 0.075 0.005 -0.005 -0.525
6 g6 -0.070 0.579961 0.000 -1.39e-17 -0.005 -0.540 -0.360 0.350 -0.700 0.360
RENAL.1 RENAL.2 RENAL.3 RENAL.4 RENAL.5 RENAL.6 RENAL.7 BREAST.2 NSCLC.2 RENAL.8 UNKNOWN
1 0.270 -0.450 -0.030 0.710 -0.360 -0.210 -0.500 -1.060 0.150 -0.290 -0.200
2 0.630 -0.060 -1.120 0.000 -1.420 -1.950 -0.520 -2.190 -0.450 0.000 0.740
3 -0.360 0.150 -0.050 0.160 -0.030 -0.700 -0.660 -0.130 -0.320 0.050 0.080
4 -1.040 -0.610 0.000 -0.770 -2.280 -1.650 -2.610 0.000 -1.610 0.730 0.760
5 0.015 -0.395 -0.285 0.045 0.135 -0.075 0.225 -0.485 -0.095 0.385 -0.105
6 -0.040 0.150 -0.250 -0.160 -0.320 0.060 -0.050 -0.430 -0.080 0.390 -0.080
OVARIAN MELANOMA PROSTATE OVARIAN.1 OVARIAN.2 OVARIAN.3 OVARIAN.4 OVARIAN.5 PROSTATE.1
1 0.430 -0.490 -0.530 -0.010 0.640 -0.480 0.140 0.640 0.070
2 0.500 0.330 -0.050 -0.370 0.550 0.970 0.720 0.150 0.290
3 -0.730 0.010 -0.230 -0.160 -0.540 0.300 -0.240 -0.170 0.070
4 0.600 -1.660 0.170 0.930 -1.780 0.470 0.000 0.550 1.310
5 -0.635 -0.185 0.825 0.395 0.315 0.425 1.715 -0.205 0.085
6 -0.430 -0.140 0.010 -0.100 0.810 0.020 0.260 0.290 -0.620
NSCLC.3 NSCLC.4 NSCLC.5 LEUKEMIA K562B.repro X6K562B.repro LEUKEMIA.1 LEUKEMIA.2
1 0.130 0.320 0.515 0.080 0.410 -0.200 -0.36998050 -0.370
2 2.240 0.280 1.045 0.120 0.000 0.000 -1.38998000 0.180
3 0.640 0.360 0.000 0.060 0.210 0.060 -0.05998047 0.000
4 0.680 -1.880 0.000 0.400 0.180 -0.070 0.07001953 -1.320
5 0.135 0.475 0.330 0.105 -0.255 -0.415 -0.07498047 -0.825
6 0.300 0.110 -0.155 -0.190 -0.110 0.020 0.04001953 -0.130
LEUKEMIA.3 LEUKEMIA.4 LEUKEMIA.5 COLON COLON.1 COLON.2 COLON.3 COLON.4
1 -0.430 -0.380 -0.550 -0.32003900 -0.620 -4.90e-01 0.07001953 -0.120
2 -0.590 -0.550 0.000 0.08996101 0.080 4.20e-01 -0.82998050 0.000
3 -0.500 -1.710 0.100 -0.29003900 0.140 -3.40e-01 -0.59998050 -0.010
4 -1.520 -1.870 -2.390 -1.03003900 0.740 7.00e-02 -0.90998050 0.130
5 -0.785 -0.585 -0.215 0.09496101 0.205 -2.05e-01 0.24501950 0.555
6 0.520 0.120 -0.620 0.05996101 0.000 -1.39e-17 -0.43998050 -0.550
COLON.5 COLON.6 MCF7A.repro BREAST.3 MCF7D.repro BREAST.4 NSCLC.6 NSCLC.7
1 -0.290 -0.8100195 0.200 0.37998050 0.3100195 0.030 -0.42998050 0.160
2 0.030 0.0000000 -0.230 0.44998050 0.4800195 0.220 -0.38998050 -0.340
3 -0.310 0.2199805 0.360 0.65998050 0.9600195 0.150 -0.17998050 -0.020
4 1.500 0.7399805 0.180 0.76998050 0.9600195 -1.240 0.86001950 -1.730
5 0.005 0.1149805 -0.315 0.05498047 -0.2149805 -0.305 0.78501950 -0.625
6 -0.540 0.1199805 0.410 0.54998050 0.3700195 0.050 0.04001953 -0.140
NSCLC.8 MELANOMA.1 BREAST.5 BREAST.6 MELANOMA.2 MELANOMA.3 MELANOMA.4 MELANOMA.5
1 0.010 -0.620 -0.380 0.04998047 0.650 -0.030 -0.270 0.210
2 -1.280 -0.130 0.000 -0.72001950 0.640 -0.480 0.630 -0.620
3 -0.770 0.200 -0.060 0.41998050 0.150 0.070 -0.100 -0.150
4 0.940 -1.410 0.800 0.92998050 -1.970 -0.700 1.100 -1.330
5 -0.015 1.585 -0.115 -0.09501953 -0.065 -0.195 1.045 0.045
6 0.270 1.160 0.180 0.19998050 0.130 0.410 0.080 -0.400
MELANOMA.6 MELANOMA.7
1 -5.00e-02 0.350
2 1.40e-01 -0.270
3 -9.00e-02 0.020
4 -1.26e+00 -1.230
5 4.50e-02 -0.715
6 -2.71e-20 -0.340

nci[,5 ] is data with only one column. fviz_cluster requires data with at least 2 columns. This check is performed in these lines https://github.com/kassambara/factoextra/blob/master/R/fviz_cluster.R#L184-L203 .
Using mtcars as example -
Passing a single column in data :
res.km <- kmeans(scale(mtcars[,2]), 3, nstart = 25)
factoextra::fviz_cluster(res.km, data = mtcars[,5],
palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
geom = "point",
ellipse.type = "convex",
ggtheme = theme_bw())
Error in factoextra::fviz_cluster(res.km, data = mtcars[, 5], palette = c("#2E9FDF", :
The dimension of the data < 2! No plot.
Passing two columns in data :
factoextra::fviz_cluster(res.km, data = mtcars[,5:6],
palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
geom = "point",
ellipse.type = "convex",
ggtheme = theme_bw())

Related

How to extract or predict latent class membership in gmnl?

Let's say you run the example for a latent class model from ?gmnl:
library(mlogit)
library(gmnl)
## Examples using the Electricity data set from the mlogit package
data("Electricity", package = "mlogit")
Electr <- mlogit.data(Electricity, id.var = "id", choice = "choice",
varying = 3:26, shape = "wide", sep = "")
## Estimate a LC model with 2 classes
Elec.lc <- gmnl(choice ~ pf + cl + loc + wk + tod + seas| 0 | 0 | 0 | 1,
data = Electr,
subset = 1:3000,
model = 'lc',
panel = TRUE,
Q = 2)
summary(Elec.lc)
You get a fitted model with coefficient estimates for two classes (class 1 & 2). Is there a way to extract (or predict) for each observation, what the most likely class is that this observation belongs to?
After several helpful comments and lots of digging, it seems that there is an undocumented feature that allows you to get predicted class probabilities, which are stored in Wnq. You get one entry per observation and the number of columns matches the number of latent classes (Q = 2 from above), and entries sum to 1.
## Get class probabilities
head(Elec.lc$Wnq)
init
[1,] 0.5547805 0.4452195
[2,] 0.5547805 0.4452195
[3,] 0.5547805 0.4452195
[4,] 0.5547805 0.4452195
[5,] 0.5547805 0.4452195
[6,] 0.5547805 0.4452195
The fitted model contains a matrix called prob.alt which gives the probability of each choice, so you can do:
predictions <- apply(Elec.lc$prob.alt, 1, which.max)
predictions
#> [1] 1 1 2 3 1 4 4 3 3 3 2 1 2 2 3 1 1 1 2 3 4 4 4 1 1 4 1 1 4 4 4 2 4 3 1 2 4
#> [38] 4 4 1 1 4 1 1 4 4 4 2 1 1 2 3 4 4 4 2 4 3 4 2 1 4 2 2 2 2 4 2 1 3 4 3 4 4
#> [75] 4 1 4 2 3 2 2 1 3 3 4 3 4 1 1 4 2 1 4 4 2 2 2 2 2 2 1 4 2 2 2 2 1 2 2 4 3
#> [112] 1 1 1 2 3 4 4 4 2 4 3 4 1 1 4 2 1 4 4 2 2 1 4 2 2 2 2 1 2 1 2 4 3 2 2 2 2
#> [149] 1 4 2 2 2 1 2 1 4 3 2 2 2 1 2 1 1 4 2 1 4 2 2 2 2 1 2 1 1 4 3 2 2 2 2 1 4
#> [186] 2 2 2 2 4 2 1 4 3 2 2 2 2 2 1 1 4 2 1 4 4 3 2 2 4 4 1 3 4 1 2 4 3 1 1 1 2
#> [223] 3 4 4 4 1 2 4 2 3 4 4 1 3 4 2 3 3 2 4 1 1 4 4 4 2 1 3 1 2 1 1 2 3 1 4 4 2
#> [260] 4 3 2 1 2 4 2 3 3 4 1 3 4 2 3 3 4 4 4 4 4 1 3 2 3 1 3 3 1 4 2 1 4 4 2 2 1
#> [297] 3 1 1 4 2 4 1 2 4 1 1 4 4 4 2 1 1 2 3 4 4 4 2 4 3 4 1 1 1 2 3 1 4 4 3 4 3
#> [334] 2 1 1 4 1 1 4 4 2 2 1 3 1 3 1 4 2 2 2 2 1 2 1 3 4 3 2 2 2 2 1 4 3 2 2 2 1
#> [371] 2 4 4 1 3 4 2 3 3 2 1 3 3 3 3 4 1 1 4 1 1 4 4 2 2 2 4 2 3 4 4 4 1 4 2 3 2
#> [408] 1 4 3 2 2 2 1 2 1 1 4 3 1 1 2 3 4 4 4 3 3 3 2 1 2 4 3 4 4 4 3 4 3 4 3 4 1
#> [445] 1 4 1 1 4 4 4 2 1 4 2 2 2 2 1 2 1 3 4 3 1 4 2 2 2 2 1 2 4 2 4 3 3 3 4 1 1
#> [482] 4 2 1 4 4 2 2 2 2 3 1 1 1 2 3 4 4 4 2 2 4 2 3 4 4 4 3 4 2 3 2 2 4 2 3 4 4
#> [519] 1 1 4 2 3 2 2 4 1 1 4 4 4 2 2 3 1 3 2 1 2 2 1 4 4 2 2 2 4 2 1 4 3 2 2 2 4
#> [556] 2 1 1 4 2 1 4 2 2 2 2 1 2 1 2 4 3 1 1 2 3 4 4 4 2 4 3 4 2 4 4 4 3 4 2 3 3
#> [593] 3 1 3 3 1 1 2 3 1 4 4 3 4 3 2 1 2 2 2 2 1 4 3 2 2 2 2 2 2 4 2 3 3 4 1 3 4
#> [630] 2 3 3 2 3 1 1 4 4 4 2 2 3 1 3 1 1 2 3 1 4 4 3 3 3 4 1 4 4 4 3 4 1 4 3 1 1
#> [667] 3 3 2 2 3 1 1 1 2 3 1 4 4 2 1 4 2 2 2 2 1 2 1 1 4 2 1 1 2 3 4 4 4 2 4 3 4
#> [704] 1 2 2 2 2 1 4 2 2 2 2 4 2 2 2 2 2 1 4 3 2 2 2 4 2 1 4 2 2 2 2 4 2 1 3 4 3
#> [741] 1 4 3 2 2 2 2 2 1 1
If we compare these predictions to the actual choice, we see that the prediction is correct about 50% of the time (the values in the diagonal are correct):
table(predictions, Electricity$choice[1:750])
#>
#> predictions 1 2 3 4
#> 1 78 35 28 32
#> 2 40 129 40 33
#> 3 16 27 57 24
#> 4 27 36 38 110
Created on 2022-08-06 by the reprex package (v2.0.1)
I have a feeling that this object Wnq is not class membership probabilities though.
Even in your example above, when calling Elec.lc$Wnq, you seem to have obtained a list of probabilities of class membership for your individuals, but critically they are all equal across individuals.
When looking for this I also found myself with the same problem. I think Elec.lc$Wnq is just the mean of class membership probabilities.
I have not looked thoroughly in the gmnl code, but I think the object Qir is what you should look for?

CTMC problem: How to calculate mean MLE, bias, mean SE, SE and CI for a Continuous Time Markov Chain simulation dataset in R?

Given a 3-state CTMC, I would like to generate 1000 datasets and record all their transitions and transition times on the time duration [0,100] for a 3-state continuous-time Markov chain with transition rate matrix and find the MLE for all parameters and their mean, SE, SD and bias of the MLEs and CI for each parameter.
I'm using the msm package in R, and I'm not sure if the first step is correct. What should I do next to find the mean, SE, etc.? Thank you!
Here's the code:
qmatrix<-rbind(c( -1.55,0.5,0.7),c(1.0,-2.5,0.8),c(0.55,2.0,-1.5 ))
sim.msm(qmatrix, maxtime=100, covs=NULL, beta=NULL, obstimes=0, start=1, mintime=0)
Here's the output:
$states
[1] 1 2 3 2 3 2 1 3 2 1 3 2 1 2 3 1 2 3 2 1 2 1 2 3 2 1 3 2 1 3 2 1 3 2 3 2 1 3 2 1 3 2 3 2 1 3 1 3 2 1 3 2 1 3 2 3 2
[58] 3 2 3 2 3 2 3 1 3 2 1 2 3 2 3 1 2 3 2 1 3 2 1 3 2 1 3 1 2 1 2 1 3 2 1 2 3 2 1 2 1 3 2 3 1 2 1 2 3 2 1 3 1 3 1 3 2
[115] 3 2 1 3 1 3 1 2 1 2 1 2 3 1 3 2 3 2 1 3 1 3 2 1 2 3 2 1 3 2 3 2 3 1 3 2 1 3 2 1 3 2 3 1 2 1 2 3 1 3 2 1 2 3 1 3 2
[172] 1 2 1 2 3 2 3 2 1 2 1 3 2 3 2 3 2 3 2 1 2 3 2 1 3 2 1 1
$times
[1] 0.000000 1.337596 1.362191 1.476702 1.555734 1.861048 1.872354 2.028488 2.424991 2.656919
[11] 3.442822 3.536488 3.748460 5.456584 5.613555 5.660168 5.867178 6.011677 6.085615 6.141217
[21] 7.500027 7.831883 8.741804 9.022980 9.075131 10.008952 10.587310 10.724296 11.720231 11.773875
[31] 11.975656 12.256983 12.472417 12.783526 12.848953 12.953589 14.743587 15.832113 15.874583 16.017451
[41] 16.461453 16.731139 16.874843 17.035627 17.597629 19.856766 19.872131 20.431187 20.702377 21.098359
[51] 21.137319 21.365903 22.790554 23.148535 23.442444 24.257172 24.621436 24.831465 26.093660 26.308910
[61] 27.453272 27.582364 27.698166 27.937213 27.940383 28.397250 28.721288 28.729847 28.830485 30.165057
[71] 30.321897 30.447654 30.452361 30.777617 31.018373 32.301805 32.993770 33.191359 33.729895 33.871943
[81] 34.119537 34.194971 34.253327 35.725821 35.884895 36.925904 38.274757 39.107009 39.277357 40.227662
[91] 40.609264 40.901967 42.510962 43.171889 43.605517 44.313961 44.541594 45.683055 46.879402 46.905675
[101] 47.481479 48.257792 48.544124 48.971349 49.013105 50.707812 51.074451 52.061199 52.154260 52.612230
[111] 53.565019 53.728064 54.495656 54.786598 54.873416 54.988014 55.234529 55.423862 57.328118 57.928870
[121] 57.960640 59.943583 59.961192 60.841125 61.443912 62.087233 62.219001 62.265142 62.461134 62.830230
[131] 62.984355 63.351908 63.423539 63.728888 64.180341 64.459818 64.909733 65.034425 66.138596 66.606237
[141] 67.459424 67.593525 67.802345 68.291826 68.787457 68.811446 69.303668 69.435817 70.666894 70.754037
[151] 70.858464 71.311939 71.769933 74.052097 75.697696 76.148180 76.389240 77.002067 77.510761 77.748987
[161] 78.974831 79.153566 79.814880 79.959816 80.409660 80.762432 84.893508 85.828107 86.498693 87.739144
[171] 87.963883 87.989974 88.628014 89.716151 90.027518 90.383951 90.439742 90.730203 90.772182 90.906435
[181] 91.088374 91.281104 91.716445 91.885433 92.063158 92.063926 93.110963 94.336095 95.043027 95.661818
[191] 96.170785 98.203715 98.548922 98.566083 98.910060 99.289348 99.358420 99.579823 100.000000
$qmatrix
[,1] [,2] [,3]
[1,] -1.55 0.5 0.7
[2,] 1.00 -2.5 0.8
[3,] 0.55 2.0 -1.5

Add data row(s) to a tibble depending on the content of a single column

I want to automatically add one or more rows to an existing tibble depending on the values present in one of the tibble columns.
Data
A B C D E
1 1 1 1 5 7.81
2 1 1 1 4 13.12
3 1 1 1 5 3.39
4 1 1 1 4 3.28
5 1 1 1 5 2.69
6 1 1 1 2 5.70
7 1 1 1 1 8.22
Expected Output if '3' is missing from 'D'
A B C D E
1 1 1 1 5 7.81
2 1 1 1 4 13.12
3 1 1 1 5 3.39
4 1 1 1 4 3.28
5 1 1 1 5 2.69
6 1 1 1 2 5.70
7 1 1 1 1 8.22
8 1 1 1 3 0.00
In the tibble column D the values should range from 1:5 depending on the data set.
I want to be able to identify whether one or more numbers from this range are missing from column D, which if one value (e.g. 3) is missing I want to add a new row that copies the data in columns A : C and enters 3 in column D and 0 in column E. If two or more values are missing (e.g. 3 and 4) I want to add two rows, etc.
You can use tidyr::complete:
library(tidyverse)
Data %>%
complete(nesting(A,B,C), D = seq(min(D), max(D), 1L))
#> # A tibble: 8 x 5
#> A B C D E
#> <int> <int> <int> <int> <dbl>
#> 1 1 1 1 1 8.22
#> 2 1 1 1 2 5.7
#> 3 1 1 1 3 NA
#> 4 1 1 1 4 13.1
#> 5 1 1 1 4 3.28
#> 6 1 1 1 5 7.81
#> 7 1 1 1 5 3.39
#> 8 1 1 1 5 2.69
I'd recommend not using 0 instead of NA but if you want to add them at the bottom and have them as 0 then this works:
Data %>%
complete(nesting(A,B,C), D = seq(min(D), max(D), 1L)) %>%
arrange(is.na(E)) %>%
mutate(E = replace_na(E, 0))
#> # A tibble: 8 x 5
#> A B C D E
#> <int> <int> <int> <int> <dbl>
#> 1 1 1 1 1 8.22
#> 2 1 1 1 2 5.7
#> 3 1 1 1 4 13.1
#> 4 1 1 1 4 3.28
#> 5 1 1 1 5 7.81
#> 6 1 1 1 5 3.39
#> 7 1 1 1 5 2.69
#> 8 1 1 1 3 0
Created on 2019-06-20 by the reprex package (v0.3.0)

Taking means over `sam` and `dup`

I am trying to take the means over the columns sam and dup of the following dataset:
fat co lab sam dup
1 0.62 1 1 1 1
2 0.55 1 1 1 2
3 0.34 1 1 2 1
4 0.24 1 1 2 2
5 0.80 1 1 3 1
6 0.68 1 1 3 2
7 0.76 1 1 4 1
8 0.65 1 1 4 2
9 0.30 1 2 1 1
10 0.40 1 2 1 2
11 0.33 1 2 2 1
12 0.43 1 2 2 2
13 0.39 1 2 3 1
14 0.40 1 2 3 2
15 0.29 1 2 4 1
16 0.18 1 2 4 2
17 0.46 1 3 1 1
18 0.38 1 3 1 2
19 0.27 1 3 2 1
20 0.37 1 3 2 2
21 0.37 1 3 3 1
22 0.42 1 3 3 2
23 0.45 1 3 4 1
24 0.54 1 3 4 2
25 0.18 2 1 1 1
26 0.47 2 1 1 2
27 0.53 2 1 2 1
28 0.32 2 1 2 2
29 0.40 2 1 3 1
30 0.37 2 1 3 2
31 0.31 2 1 4 1
32 0.43 2 1 4 2
33 0.35 2 2 1 1
34 0.39 2 2 1 2
35 0.37 2 2 2 1
36 0.33 2 2 2 2
37 0.42 2 2 3 1
38 0.36 2 2 3 2
39 0.20 2 2 4 1
40 0.41 2 2 4 2
41 0.37 2 3 1 1
42 0.43 2 3 1 2
43 0.28 2 3 2 1
44 0.36 2 3 2 2
45 0.18 2 3 3 1
46 0.20 2 3 3 2
47 0.26 2 3 4 1
48 0.06 2 3 4 2
The output should be this:
lab co fat
1 1 1 0.58000
2 2 1 0.34000
3 3 1 0.40750
4 1 2 0.37625
5 2 2 0.35375
6 3 2 0.26750
These are both in the form of .RData files.
How can this be done?
An example with part of the data you posted:
dt = read.table(text = "
fat co lab sam dup
0.62 1 1 1 1
0.55 1 1 1 2
0.34 1 1 2 1
0.24 1 1 2 2
0.80 1 1 3 1
0.68 1 1 3 2
0.76 1 1 4 1
0.65 1 1 4 2
0.30 1 2 1 1
0.40 1 2 1 2
0.33 1 2 2 1
0.43 1 2 2 2
0.39 1 2 3 1
0.40 1 2 3 2
0.29 1 2 4 1
0.18 1 2 4 2
", header= T)
library(dplyr)
dt %>%
group_by(lab, co) %>% # for each lab and co combination
summarise(fat = mean(fat)) %>% # get the mean of fat
ungroup() # forget the grouping
# # A tibble: 2 x 3
# lab co fat
# <int> <int> <dbl>
# 1 1 1 0.58
# 2 2 1 0.34

Adding confidence bands for log growth curve

I'm working with data that shows a log growth curve. I was able to fit a non-linear mixed effects regression using the nlme package nicely. However, I am uncertain as to how to add confidence bands around the estimated lines. Can anyone help?
Please find data and code below:
Data:
Harvest Plot Irrigation Graft Rep AwtRun 1 11b 1 b 1 0 2 11b 1 b 1 1.6 3 11b 1 b 1 7.67 4 11b 1 b 1 11.96 5 11b 1 b 1 18.82 6 11b 1 b 1 31.43 11b 1 b 1 41.84 8 11b 1 b 1 45.08 9 11b 1 b 1 48.09 10 11b 1 b 1 48.8 11 11b 1 b 1 51.73 12 11b 1 b 1 54.13 13 11b 1 b 1 60.56 14 11b 1 b 1 63.44 15 11b 1 b 1 65.44 16 11b 1 b 1 67.33 1 11c 1 c 1 0 2 11c 1 c 1 0.86 3 11c 1 c 1 1.6 4 11c 1 c 1 5.41 5 11c 1 c 1 10.17 6 11c 1 c 1 20.4 7 11c 1 c 1 23.32 8 11c 1 c 1 23.99 9 11c 1 c 1 25.23 10 11c 1 c 1 25.89 11 11c 1 c 1 27.71 12 11c 1 c 1 29.64 13 11c 1 c 1 30.81 14 11c 1 c 1 33.09 15 11c 1 c 1 35.66 16 11c 1 c 1 36.59 1 11s 1 s 1 0.82 2 11s 1 s 1 0.82 3 11s 1 s 1 1.19 4 11s 1 s 1 4.39 5 11s 1 s 1 11.77 6 11s 1 s 1 15.81 7 11s 1 s 1 21.9 8 11s 1 s 1 28.16 9 11s 1 s 1 33.63 10 11s 1 s 1 45.22 11 11s 1 s 1 49.45 12 11s 1 s 1 51.71 13 11s 1 s 1 54.82 14 11s 1 s 1 57.44 15 11s 1 s 1 57.61 16 11s 1 s 1 58.38 1 12b 2 b 1 0 2 12b 2 b 1 0.9 3 12b 2 b 1 2.19 4 12b 2 b 1 7.1 5 12b 2 b 1 10.98 6 12b 2 b 1 26.48 7 12b 2 b 1 32.08 8 12b 2 b 1 37.58 9 12b 2 b 1 40.45 10 12b 2 b 1 48.27 11 12b 2 b 1 53.03 12 12b 2 b 1 55.05 13 12b 2 b 1 55.05 14 12b 2 b 1 55.75 15 12b 2 b 1 56.57 16 12b 2 b 1 57.57 1 12c 2 c 1 0 2 12c 2 c 1 0 3 12c 2 c 1 5.05 4 12c 2 c 1 10.08 5 12c 2 c 1 13.65 6 12c 2 c 1 25.03 7 12c 2 c 1 26.9 8 12c 2 c 1 27.47 9 12c 2 c 1 28.66 10 12c 2 c 1 31.98 11 12c 2 c 1 34.79 12 12c 2 c 1 35.2 13 12c 2 c 1 36.65 14 12c 2 c 1 38.41 15 12c 2 c 1 38.68 16 12c 2 c 1 38.94 1 12s 2 s 1 0 2 12s 2 s 1 0 3 12s 2 s 1 0.39 4 12s 2 s 1 4.59 5 12s 2 s 1 8.02 6 12s 2 s 1 17.45 7 12s 2 s 1 25.83 8 12s 2 s 1 33.04 9 12s 2 s 1 35.87 10 12s 2 s 1 52.42 11 12s 2 s 1 57.91 12 12s 2 s 1 57.91 13 12s 2 s 1 57.91 14 12s 2 s 1 57.91 15 12s 2 s 1 57.91 16 12s 2 s 1 58.38 1 21b 1 b 2 0 2 21b 1 b 2 0 3 21b 1 b 2 1.36 4 21b 1 b 2 6.2 5 21b 1 b 2 10.08 6 21b 1 b 2 17.53 7 21b 1 b 2 21.36 8 21b 1 b 2 24.92 9 21b 1 b 2 31.62 10 21b 1 b 2 47.42 11 21b 1 b 2 50.85 12 21b 1 b 2 50.85 13 21b 1 b 2 53.27 14 21b 1 b 2 
53.66 15 21b 1 b 2 53.93 16 21b 1 b 2 56.48 1 21c 1 c 2 0 2 21c 1 c 2 0 3 21c 1 c 2 0.74 4 21c 1 c 2 6.44 5 21c 1 c 2 13.8 6 21c 1 c 2 20.12 7 21c 1 c 2 20.75 8 21c 1 c 2 23.58 9 21c 1 c 2 23.58 10 21c 1 c 2 28.69 11 21c 1 c 2 30.4 12 21c 1 c 2 31.74 13 21c 1 c 2 33.86 14 21c 1 c 2 34.06 15 21c 1 c 2 35.15 16 21c 1 c 2 36 1 21s 1 s 2 0 2 21s 1 s 2 0 3 21s 1 s 2 1.67 4 21s 1 s 2 3.41 5 21s 1 s 2 8.36 6 21s 1 s 2 16.97 7 21s 1 s 2 23.85 8 21s 1 s 2 28.16 9 21s 1 s 2 30.54 10 21s 1 s 2 37.33 11 21s 1 s 2 40.11 12 21s 1 s 2 40.41 13 21s 1 s 2 42.03 14 21s 1 s 2 42.03 15 21s 1 s 2 42.03 16 21s 1 s 2 42.03 1 22b 2 b 2 0 2 22b 2 b 2 2.06 3 22b 2 b 2 3.99 4 22b 2 b 2 6.7 5 22b 2 b 2 9.67 6 22b 2 b 2 14.8 7 22b 2 b 2 20.64 8 22b 2 b 2 28.33 9 22b 2 b 2 34.15 10 22b 2 b 2 44.86 11 22b 2 b 2 53.06 12 22b 2 b 2 54.44 13 22b 2 b 2 57.14 14 22b 2 b 2 60.16 15 22b 2 b 2 61.32 16 22b 2 b 2 61.32 1 22c 2 c 2 0 2 22c 2 c 2 0 3 22c 2 c 2 1.55 4 22c 2 c 2 4.93 5 22c 2 c 2 13.63 6 22c 2 c 2 21.98 7 22c 2 c 2 26.7 8 22c 2 c 2 27.23 9 22c 2 c 2 30.56 10 22c 2 c 2 40.73 11 22c 2 c 2 42.01 12 22c 2 c 2 45.52 13 22c 2 c 2 51.7 14 22c 2 c 2 53.59 15 22c 2 c 2 53.59 16 22c 2 c 2 53.59 1 22s 2 s 2 0 2 22s 2 s 2 0 3 22s 2 s 2 1.15 4 22s 2 s 2 9.27 5 22s 2 s 2 13.5 6 22s 2 s 2 23.78 7 22s 2 s 2 24.38 8 22s 2 s 2 27.7 9 22s 2 s 2 33.63 10 22s 2 s 2 41.23 11 22s 2 s 2 44.84 12 22s 2 s 2 48.26 13 22s 2 s 2 51.96 14 22s 2 s 2 54.83 15 22s 2 s 2 54.83 16 22s 2 s 2 54.83 1 31b 1 b 3 0 2 31b 1 b 3 0 3 31b 1 b 3 0 4 31b 1 b 3 0 5 31b 1 b 3 1.32 6 31b
Code based on answer from Rob Hall # https://stats.stackexchange.com/questions/67049/non-linear-mixed-effects-regression-in-r
#nonlinear mixed effects model with self start logistic (SSlogis) for starting values
library(nlme)
#base Model: y = (Asym+u)/(1+exp((Harvest-xmid)/scale)), u ~ N(0,s2u); (no graft)
initVals <- getInitial(sqrtawtrun ~ SSlogis(Harvest, Asym, xmid, scal), data = Data)
#base model without graft, based on starting points found earlier
baseModel<- nlme(sqrtawtrun ~ SSlogis(Harvest, Asym, xmid, scal),
data = Data,
fixed = list(Asym ~ 1, xmid ~ 1, scal ~ 1),
random = Asym ~ 1|Plot,
start = initVals
)
#creating dummy variables for graft; releveling so 's' is the reference category to match SAS code
graft.dummy=model.matrix(~relevel(Data[["Graft"]],"s"))[,2:3]
#updating to include graft in model -- same as above but allowing for diff asym, xmid, and scal vars for each graft
#starting values based on fitted base model values (for those in the base model) & zero for all new parameters
nestedModel <- update(baseModel,fixed=list(Asym ~graft.dummy, xmid ~graft.dummy, scal~graft.dummy),
start = c(fixef(baseModel)[1], 0, 0, fixef(baseModel)[2], 0, 0, fixef(baseModel)[3], 0, 0))
#growth curve plots -- line for each plot, colored on graft
#currently only predicted population level mean at each observed point, plotted as smooth line
ggplot(data=Data,aes(x=Harvest,y=sqrtawtrun,
color=Graft,na.rm=T)) +
geom_point(cex=0.6) +
geom_line(aes(x=Harvest,y=nestedModel$fitted[,1],color=Graft),size=2)

Resources