error with extracting data from a list table - r

I am trying to extract the median values from the following data
df<-structure(list(n = 26L, time = c(64, 77, 142, 148, 167, 175,
181, 218, 286, 294, 323, 362, 375, 414, 427, 442, 455, 460, 505,
543, 544, 548, 598, 604, 771, 951), n.risk = c(26, 25, 24, 23,
22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
6, 5, 4, 3, 2, 1), n.event = c(1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0), n.censor = c(0,
1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 1, 0, 1), surv = c(0.961538461538462, 0.961538461538462, 0.921474358974359,
0.881410256410256, 0.881410256410256, 0.839438339438339, 0.839438339438339,
0.839438339438339, 0.839438339438339, 0.839438339438339, 0.786973443223443,
0.734508547008547, 0.682043650793651, 0.629578754578755, 0.577113858363858,
0.524648962148962, 0.524648962148962, 0.4663546330213, 0.408060303893637,
0.349765974765975, 0.349765974765975, 0.27981277981278, 0.209859584859585,
0.209859584859585, 0.104929792429792, 0.104929792429792), type = "right",
std.err = c(0.0392232270276368, 0.0392232270276368, 0.0578796660439579,
0.0729817807835649, 0.0729817807835649, 0.0877911880959172,
0.0877911880959172, 0.0877911880959172, 0.0877911880959172,
0.0877911880959172, 0.108967698764172, 0.128980092013706,
0.148762796526449, 0.168939711260041, 0.190043109889266,
0.212620066567793, 0.212620066567793, 0.24309706208875, 0.277404622263805,
0.317431643449181, 0.317431643449181, 0.388281918537096,
0.483834870173886, 0.483834870173886, 0.856794130229766,
0.856794130229766), upper = c(1, 1, 1, 1, 1, 0.997049673308717,
0.997049673308717, 0.997049673308717, 0.997049673308717,
0.997049673308717, 0.974346771572688, 0.945768634864856,
0.912933812389795, 0.876701615980298, 0.837580372384821,
0.795886882462859, 0.795886882462859, 0.751001648029994,
0.70283210436471, 0.651592180391947, 0.651592180391947, 0.598926755204663,
0.541713673163476, 0.541713673163476, 0.56260462703826, 0.56260462703826
), lower = c(0.890389006776242, 0.890389006776242, 0.822651689473135,
0.763934098528765, 0.763934098528765, 0.706741845048289,
0.706741845048289, 0.706741845048289, 0.706741845048289,
0.706741845048289, 0.635633245173389, 0.570438462156972,
0.509547937949868, 0.45211438075625, 0.397645905392106, 0.345848812876783,
0.345848812876783, 0.289595428067216, 0.236917480831754,
0.187749701094333, 0.187749701094333, 0.130725820922461,
0.0812994900059442, 0.0812994900059442, 0.019570157816371,
0.019570157816371), conf.type = "log", conf.int = 0.95, call = survfit(formula = Surv(as.numeric(as.character(all_clin$new_death))[ind_clin],
all_clin$death_event[ind_clin]) ~ event_rna[ind_gene,
ind_tum])), .Names = c("n", "time", "n.risk", "n.event",
"n.censor", "surv", "type", "std.err", "upper", "lower", "conf.type",
"conf.int", "call"), class = "survfit")
I try to get it like below
# Read the first median from the survfit summary table; a missing median is
# replaced by the string 'NA' instead of a real NA.
# NOTE(review): `s` is not defined in this snippet - presumably it is the
# survfit object shown above (assigned to `df`); confirm.
# NOTE(review): the two-subscript form `[, 'median']` assumes the table is a
# matrix; the reported "incorrect number of dimensions" error suggests that it
# is not one for this fit - verify with str(summary(s)$table).
x1 <- ifelse (is.na(as.numeric(summary(s)$table[,'median'][1])),'NA',as.numeric(summary(s)$table[,'median'][1]))
# Second median, kept numeric (no 'NA' string substitution is applied here).
x2 <- as.numeric(summary(s)$table[,'median'][2])
# Draw the guide lines only when neither value equals the string 'NA'.
# NOTE(review): x2 is never given the 'NA' string above, so a missing x2 would
# be a real NA in this comparison - confirm the intended check.
if(x1 != 'NA' & x2 != 'NA'){
# Horizontal blue segment at survival 0.5, from time 0 to x1.
lines(c(0,x1),c(0.5,0.5),col='blue')
# Vertical black segment at x1, from 0 up to 0.5.
lines(c(x1,x1),c(0,0.5),col='black')
# Vertical red segment at x2, from 0 up to 0.5.
lines(c(x2,x2),c(0,0.5),col='red')
}
I get the following error for both commands:
Error in summary(s)$table[, "median"] : incorrect number of dimensions

Related

ggplot barplot to boxplot

I have code to produce a good barplot and I'm trying to create a boxplot with the same data.
The barplot displays the count of "response" across all people (id). I'd like to create a boxplot for each type of "response" to replace the 3 bars. Boxplots should be calculated from the count of that specific "response" for each participant. So far no luck because I'm stuck on how to count the response for each participant.
current code:
# Bar chart of how often each response value occurs across all rows of df.
df %>%
# NOTE(review): `dodge` is not defined anywhere in this snippet - confirm what
# position was intended here.
ggplot(position = dodge) +
labs(title= "question") +
# geom_bar() counts the rows for each distinct response value.
geom_bar(aes(x = response), fill="red") +
labs(y = "count", x = "responses") +
# Fix the y axis to 0-100 with a tick every 20.
scale_y_continuous(breaks=seq(0,100,20), limits = c(0,100))
output:
data sample:
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5),
response = c(0, 1, 1, 0, 0, 0, 1, -1, 1, -1, 0, 1, -1, 1,
0, 0, 0, 0, 1, 1, 1, -1, 0, 1, 0, 1, 1, -1, 0, 1, 1, 1, 0,
1, 0, 0, 1, -1, 0, 1, 1, 1, -1, 1, 1, 1, 0, 0, -1, 1, 1,
-1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
1, 1, 0, 0, 0), iscorrect = c(0, 1, 1, 0, 0, 0, 1, 0, 1,
0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
0, 1, 0, 0, 1, 1, 0, 0, 0), min = c(100, 150, 150,
50, 50, 50, 150, 100, 100, 100, 50, 100, 50, 150, 150, 150,
50, 100, 100, 100, 150, 150, 50, 50, 50, 150, 150, 100, 50,
100, 100, 150, 150, 50, 50, 50, 150, 100, 100, 100, 50, 100,
50, 150, 150, 150, 50, 100, 100, 100, 150, 150, 50, 50, 50,
150, 150, 100, 50, 100, 100, 150, 100, 50, 50, 50, 150, 100,
100, 50, 150, 100, 50, 150, 150), max = c(125.4, 180.8,
180.8, 62.4, 62.4, 62.4, 180.8, 125.4, 125.4, 125.4, 62.4,
125.4, 62.4, 180.8, 180.8, 180.8, 62.4, 125.4, 125.4, 125.4,
180.8, 180.8, 62.4, 62.4, 62.4, 180.8, 180.8, 125.4, 62.4,
125.4, 125.4, 180.8, 180.8, 62.4, 62.4, 62.4, 180.8, 125.4,
125.4, 125.4, 62.4, 125.4, 62.4, 180.8, 180.8, 180.8, 62.4,
125.4, 125.4, 125.4, 180.8, 180.8, 62.4, 62.4, 62.4, 180.8,
180.8, 125.4, 62.4, 125.4, 125.4, 180.8, 125.4, 62.4, 62.4,
62.4, 180.8, 125.4, 125.4, 62.4, 180.8, 125.4, 62.4, 180.8,
180.8), time = c(5, 7, 9, 5, 1, 7, 1, 1, 7, 3, 9, 9,
3, 5, 3, 1, 9, 5, 1, 7, 9, 3, 5, 7, 1, 5, 7, 3, 3, 9, 5,
7, 9, 5, 1, 7, 1, 1, 7, 3, 9, 9, 3, 5, 3, 1, 9, 5, 1, 7,
9, 3, 5, 7, 1, 5, 7, 3, 3, 9, 9, 7, 5, 7, 5, 9, 5, 3, 1,
1, 9, 7, 3, 3, 1)), row.names = c(NA, -75L), class = c("tbl_df",
"tbl", "data.frame"))
You can use this code:
# One boxplot per response type, built from the per-participant counts.
# id and response are turned into factors first so that .drop = FALSE can
# report a count of 0 when a participant never gave a particular response;
# otherwise that participant is silently left out of that response's box.
# The input is `df`, the data frame name used in the question.
df %>%
  mutate(id = factor(id), response = factor(response)) %>%
  count(id, response, .drop = FALSE) %>%
  ggplot(aes(x = response, y = n)) +
  geom_boxplot(fill = "red") +
  labs(y = "count", x = "responses")
Output:
You can try:
library(dplyr)
library(ggplot2)

# Count every response type for every participant, then draw one box per
# response. Converting id and response to factors before counting lets
# .drop = FALSE keep id/response combinations that never occur (count 0), so a
# participant who never gave a response still contributes to that box.
df %>%
  mutate(id = factor(id), response = factor(response)) %>%
  count(id, response, .drop = FALSE) %>%
  ggplot(aes(response, n)) +
  geom_boxplot(fill = "red") +
  scale_y_continuous(name = "Number of responses per participant")
Note that boxplots don't work well for discrete data like small counts (unless your actual data has a far higher number of participants with a far higher count per response)

How to calculate the conditional expectation Weibull model?

I would like to calculate the conditional expectation of the Weibull model. In specific, I would like to estimate the remaining tenure of a client looking at random moments (time = t) in his total tenure.
To do so, I have calculated the total tenure for each client (currently active or inactive) and based on the random moment for each client, calculated his/her tenure at that moment.
The example below is a snapshot of my attempt. I use 2 variables STED and TemporalTenure to predict the dependent variable tenure which has either status 0 = active or 1 = inactive. I use the survival package for obtaining the survival object (km_surv).
df = structure(list(ID = c(16008, 21736, 18851, 20387, 30749,
42159), STED = c(2,
5, 1, 3, 2, 2), TemporalTenure = c(84, 98, 255, 392, 108, 278
), tenure = c(152, 166, 273, 460, 160, 289), status = c(0, 0,
1, 0, 1, 1)), row.names = c(NA,
6L), class = "data.frame")
# Build the survival response from total tenure and the status flag
# (per the description above: 0 = active, 1 = inactive).
km_surv <- Surv(time = df$tenure, event = df$status)
# Rebuild df with the Surv object as column y, dropping tenure, status and ID
# so that only the remaining columns are left as predictors.
df <- data.frame(y = km_surv, df[,!(names(df) %in% c("tenure","status", "ID"))])
# Weibull fit of y on every remaining column of df.
# NOTE(review): psm() and Quantile() are not loaded in this snippet -
# presumably they come from the rms package; confirm.
weibull_fit <- psm(y ~. , dist="weibull", data = df)
# Quantile function derived from the fitted model.
quantsurv <- Quantile(weibull_fit, df)
# Linear predictor for each row of df.
lp <- predict(weibull_fit, df, type="lp")
# Print the 0.5 quantile (median) evaluated at each linear predictor.
print(quantsurv(0.5, lp))
The outputs of these estimations are way too high. I assume this is caused by including TemporalTenure, but I can't find out how psm calculates this, or whether there are other packages with which it is possible to estimate the remaining tenure of client i at time t.
How can I obtain the predicted tenure conditioned over the time that a client is already active (random moment in time: TemporalTenure) where the dependent tenure can either be a client that is still active or one that is inactive?
EDIT
To clarify, whenever I add time-conditional variables such as TemporalTenure, the number of received payments and the number of complaints until time t, the predicted lifetime explodes in many cases. Therefore, I suspect that psm is not the right way to go. A similar question is asked here, but the solution given doesn't work for the same reasons.
Below a slightly bigger dataset which already causes problems.
df = structure(list(ID= c(16008, 21736, 18851, 20387, 30749,
42159, 34108, 47511, 47917, 61116, 66600, 131380, 112668, 90799,
113615, 147562, 166247, 191603, 169698, 1020841, 1004077, 1026953,
1125673, 1129788, 22457, 1147883, 1163870, 1220268, 2004623,
1233924, 2009026, 2026688, 2031284, 2042982, 2046137, 2043214,
2033631, 2034252, 2068467, 2070284, 2070697, 2084859, 2090567,
2087133, 2087685, 2095100, 2095720, 2100482, 2105150, 2109353,
28852, 29040, 29592, 29191, 31172, 2126369, 2114207, 2111947,
2102678, 237687, 1093221, 2111607, 2031732, 2105275, 2020226,
1146777, 1028487, 1030165, 1098033, 1142093, 1186763, 2005605,
2007182, 2021092, 2027676, 2027525, 2070471, 2070621, 2072706,
2081862, 2085084, 2085353, 2094429, 2096216, 2109774, 2114526,
2115510, 2117329, 2122045, 2119764, 2122522, 2123080, 2128547,
2130005, 30025, 24166, 61529, 94568, 70809, 159214), STED = c(2,
5, 1, 3, 2, 2, 3, 1, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 4, 3, 2, 4,
1, 1, 2, 1, 4, 1, 1, 1, 2, 4, 2, 5, 4, 1, 4, 2, 5, 3, 2, 1, 4,
2, 1, 5, 3, 1, 1, 5, 2, 2, 2, 2, 3, 4, 3, 5, 1, 1, 5, 2, 5, 1,
3, 5, 3, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 3, 5, 2, 2, 1, 2, 1, 2,
3, 1, 1, 3, 5, 1, 2, 2, 2, 2, 1, 2, 1, 3, 1), TemporalTenure = c(84,
98, 255, 392, 108, 278, 120, 67, 209, 95, 224, 198, 204, 216,
204, 190, 36, 160, 184, 95, 140, 256, 142, 216, 56, 79, 194,
172, 155, 158, 78, 24, 140, 87, 134, 111, 15, 126, 41, 116, 66,
60, 0, 118, 22, 116, 110, 52, 66, 0, 325, 323, 53, 191, 60, 7,
45, 73, 42, 161, 30, 17, 30, 12, 87, 85, 251, 120, 7, 6, 38,
119, 156, 54, 11, 141, 50, 25, 33, 3, 48, 58, 13, 113, 25, 18,
23, 2, 102, 5, 90, 0, 101, 83, 44, 125, 226, 213, 216, 186),
tenure = c(152, 166, 273, 460, 160, 289, 188, 72, 233, 163,
266, 266, 216, 232, 247, 258, 65, 228, 252, 99, 208, 324,
201, 284, 124, 84, 262, 180, 223, 226, 146, 92, 208, 155,
202, 179, 80, 185, 64, 184, 120, 65, 6, 186, 45, 120, 170,
96, 123, 12, 393, 391, 64, 259, 73, 42, 69, 141, 47, 229,
37, 19, 37, 17, 155, 99, 319, 188, 75, 11, 49, 187, 180,
55, 52, 209, 115, 93, 88, 6, 53, 126, 31, 123, 26, 26, 24,
9, 114, 6, 111, 4, 168, 84, 112, 193, 294, 278, 284, 210),
status = c(0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 0, 0, 1, 0, 1), TotalValue = c(2579.35, 2472.85,
581.19, 2579.35, 2472.85, 0, 1829.18, 0, 936.79, 2098.2,
850.47, 2579.35, 463.68, 463.68, 2171.31, 3043.03, 561.16,
3043.03, 3043.03, -68.06, 2098.2, 2504.4, 1536.67, 2719.7,
3043.03, 109.91, 2579.35, 265.57, 3560.34, 2266.95, 3123.16,
3544.4, 1379.19, 2288.35, 2472.85, 2560.48, 1414.45, 3741.49,
202.2, 2856.23, 1457.75, 313.68, 191.32, 2266.95, 661.01,
0, 2050.81, 298.76, 1605.44, 373.86, 3043.03, 2579.35, 448.63,
3043.03, 463.68, 977.28, 818.06, 2620.06, 0, 3235.8, 280.99,
0, 0, 194.04, 3212.75, -23.22, 1833.46, 1829.18, 2786.7,
0, 0, 3250.38, 936.79, 0, 1045.21, 3043.03, 1988.36, 2472.85,
1197.94, 0, 313.68, 3212.75, 1419.33, 531.14, 0, 96.28, 0,
142.92, 174.79, 0, 936.79, 156.19, 2472.85, 463.68, 3520.69,
2579.35, 3328.87, 2567.88, 3043.03, 1081.14)), row.names = c(NA,
100L), class = "data.frame")
So here's what I have done: 1) added a library call to load pkg:rms, 2) removed the attempt to place a Surv object in a dataframe column, 3) built the Surv object inside the formula, as Therneau expects formulas to be built, and 4) removed ID from the covariates, where it most probably does not belong.
# survival and rms are loaded up front for Surv(), psm() and Quantile().
library(survival)
library(rms)

# The Surv object is now built inside the model formula; the earlier approach
# of storing it in a data-frame column is kept only for reference:
# km_surv <- Surv(time = df$tenure, event = df$status)
# df <- data.frame(y = km_surv, df[,!(names(df) %in% c("tenure","status"))])
weibull_fit <- psm(
  Surv(time = tenure, event = status) ~ TemporalTenure + STED,
  dist = "weibull",
  data = df
)

# Quantile function of the fitted model and the linear predictor of each row.
quantsurv <- Quantile(weibull_fit, df)
lp <- predict(weibull_fit, df, type = "lp")
Results#
print(quantsurv(0.5, lp))
1 2 3 4 5 6
151.4129 176.0490 268.4644 466.8266 164.8640 301.2630

Replicating marginal effects from logit model example in Gujarati & Porter on R

I am replicating a logit model example from the econometrics book by Gujarati and Porter (Spanish edition). I have no problems with the model estimation, but I can't replicate the marginal effects. In the book, the regression results are the following:
In the following lines I show my regression results:
> dat <- data.frame(
+ debito = c(0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
+ 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
+ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1),
+ saldo = c(1756, 748, 1501, 1831, 1622, 1886, 740, 1593, 1169, 2125,
+ 1554, 1474, 1913, 1218, 1006, 2215, 137, 167, 343, 2557,
+ 2276, 1494, 2144, 1995, 1053, 1526, 1120, 1838, 1746, 1616,
+ 1958, 634, 580, 1320, 1675, 789, 1735, 1784, 1326, 2051, 1044,
+ 1885, 1790, 765, 1645, 32, 1266, 890, 2204, 2409, 1338, 2076,
+ 1708, 2138, 2375, 1455, 1487, 1125, 1989, 2156),
+ cajero = c(13, 9, 10, 10, 14, 17, 6, 10, 6, 18, 12, 12, 6, 10, 12, 20,
+ 7, 5, 7, 20, 15, 11, 17, 10, 8, 8, 8, 7, 11, 10, 6, 2, 4, 4,
+ 6, 8, 12, 11, 16, 14, 7, 10, 11, 4, 6, 2, 11, 7, 14, 16, 14,
+ 12, 13, 18, 12, 9, 8, 6, 12, 14),
+ interes = c(1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0)
+ )
> mod <- glm(debito ~ saldo + cajero + interes,data = dat,
+ family = "binomial")
> summary(mod)
Call:
glm(formula = debito ~ saldo + cajero + interes, family = "binomial",
data = dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6619 -0.9712 -0.6629 1.1440 1.7076
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.5749004 0.7857866 -0.732 0.4644
saldo 0.0012481 0.0006973 1.790 0.0735 .
cajero -0.1202251 0.0939842 -1.279 0.2008
interes -1.3520856 0.6809872 -1.985 0.0471 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 82.108 on 59 degrees of freedom
Residual deviance: 75.500 on 56 degrees of freedom
AIC: 83.5
Number of Fisher Scoring iterations: 4
As you can see, I obtained the same results. My problem is when I try to replicate marginal effects, which in book are the following:
I tried to replicate these results with the following method:
> coef(mod) * mean(dlogis(predict(mod, type = "link")))
(Intercept) saldo cajero interes
-0.1262098468 0.0002740024 -0.0263934252 -0.2968279711
As you can see, I failed. I know the results in the book were obtained with Stata, so I don't know the details of the calculation or whether it is possible to replicate it in R. Is there a way?
Use the package "margins" for marginal effects.
# Install margins only when it is missing, so that re-running the script does
# not reinstall the package every time.
if (!requireNamespace("margins", quietly = TRUE)) {
  install.packages("margins")
}
library(margins)

# Refit the logit model and compute its marginal effects.
mod <- glm(debito ~ saldo + cajero + interes, data = dat, family = "binomial")
# The outer parentheses print the result as well as storing it in m.
(m <- margins(mod))
Here are the outputs:
Average marginal effects
glm(formula = debito ~ saldo + cajero + interes, family = "binomial", data = dat)
saldo cajero interes
0.000274 -0.02639 -0.2968
Are your column names correct?

Clustering with binary variables

I have a dataset with some variables having a binary type.
The first column contains names, so applying cluster analysis raises an error.
kc <- kmeans(j1,4) ## j1 is the stored data frame
Error in do_one(nmeth) : NA/NaN/Inf in foreign function call (arg 1)
In addition: Warning message: In storage.mode(x) <- "double" : NAs
introduced by coercion –
Here is the head of the data, obtained with dput(j1[1:5,]):
structure(list(OUTPUT_NAME = c("nonsaturation_fba268_2ch_0_out.wav",
"nonsaturation_fba268_2ch_32_out.wav", "substreaminfo_fba268_2ch_96_out.wav",
"substreaminfo_fba268_2ch_201_out.wav", "substreaminfo_fba268_2ch_93_out.wav"
), PEAK_MIPS = c(82.47, 82.5, 82.63, 82.73, 82.73), PRESENTATION = c(0,
0, 0, 0, 0), DTHD_ATMOS_PRE = c(0, 0, 0, 0, 0), FBAFBBDETECTER = c(1,
1, 1, 1, 1), DIAL_NORM = c(31, 31, 31, 31, 31), NORMAL_DRC = c(0,
0, 0, 0, 0), ANALOG_DB_GAIN_REQ = c(0, 0, 0, 0, 0), DECODER_CH_ASSIGN = c(1,
1, 1, 1, 1), DECODER_6_CH_ASSIGN = c(1, 1, 13, 1, 1), DECODER_8_CH_ASSIGN = c(1,
1, 13, 1, 1), DECODER_16_CH_ASSIGN = c(0, 0, 0, 0, 0), CH_MODIFIER = c(0,
0, 0, 0, 0), CH_ASSIGNMENT_TYPE = c(0, 0, 0, 0, 0), FILTER_ORDER = c(0,
0, 0, 0, 0), COEFF_BITS = c(9, 9, 9, 9, 9), COEFF_SHIFT = c(7,
7, 7, 7, 7), STATE_BITS = c(4, 4, 6, 6, 6), STATE_SHIFT = c(0,
0, 0, 0, 0), `31EC_PRIMITIVE_MATRIX_CNT` = c(16, 16, 8, 8, 8),
LSB_BYPASS_COUNT = c(0, 0, 0, 0, 0), DITHER_SCALE = c(1,
1, 1, 1, 1), `31EC_FRAC_BITS` = c(14, 14, 12, 12, 12), INTERPOLATION_USED = c(1,
1, 0, 0, 0), `31EA_31EB_PRIMITIVE_MATIX_CNT` = c(0, 0, 0,
0, 0), `31EA_31EB_FRAC_BITS` = c(14, 14, 12, 12, 12), LSB_BYPASS_USED = c(0,
0, 0, 0, 0), AU_LENGTH = c(937, 937, 937, 937, 937), VARIABLE_RATE = c(1,
1, 1, 1, 1), PEAK_DATA_RATE = c(6000, 6000, 6000, 6000, 6000
), SUBSTREAM_CNT = c(1, 1, 2, 2, 2), EXTENDED_SUBSTREAM_CNT = c(0,
0, 0, 0, 0), SUBSTREAM_INFO = c(20, 20, 40, 24, 24), SPEAKER_LAYOUT = c(0,
0, 0, 0, 0), CONTROL_EN_2 = c(0, 0, 0, 0, 0), CONTROL_EN_6 = c(0,
0, 0, 0, 0), CONTROL_EN_8 = c(0, 0, 0, 0, 0), MIX_LEVEL_2 = c(35,
35, 35, 35, 35), MIX_LEVEL_6 = c(35, 35, 35, 35, 35), MIX_LEVEL_8 = c(35,
35, 35, 35, 35), DIALOGUE_NORM_2 = c(31, 31, 31, 31, 31),
DIALOGUE_NORM_6 = c(31, 31, 31, 31, 31), DIALOGUE_NORM_8 = c(31,
31, 31, 31, 31), SOURCE_FORMAT_6 = c(0, 0, 0, 0, 0), SOURCE_FORMAT_8 = c(0,
0, 0, 0, 0), DRC_STARTUP_GAIN = c(0, 0, 0, 0, 0), DIALOGUE_NORM_16 = c(28,
28, 31, 31, 31), MIX_LEVEL_16 = c(35, 35, 35, 35, 35), CHANNEL_CNT_16 = c(16,
16, 16, 16, 16), DYNAMIC_OBJ_ONLY = c(1, 1, 1, 1, 1), DYNAMIC_CHANNEL_CNT_16 = c(0,
0, 0, 0, 0), LFE_PRE = c(1, 1, 0, 0, 0), CHANNEL_CONTENT_DES_16 = c(0,
0, 0, 0, 0), MIN_CHAN = c(0, 0, 0, 0, 0), MAX_CHAN = c(1,
1, 1, 1, 1), RESTART_SYNC_WORD = c(12778, 12778, 12778, 12778,
12778), MAX_MATRIX_CHAN = c(1, 1, 1, 1, 1), DITHER_SHIFT = c(0,
0, 0, 0, 0), ERROR_PROTECT = c(1, 1, 1, 1, 1), LOSSLESS_PROTECT = c(0,
0, 1, 1, 1), BLOCK_SIZE = c(32, 32, 40, 40, 40), OUTPUT_SHIFT = c(0,
0, 0, 0, 0), QUANT_STEP_SIZE = c(0, 0, 0, 0, 0), HUFF_OFFSET = c(0,
0, 0, 0, 0), HUFF_TYPE = c(1, 1, 0, 2, 2), HUFF_LSBS = c(6,
6, 8, 5, 5), SAMPLE_RATE = c(0, 3, 0, 3, 0), OUTPUT_SAMPLE_COUNT = c(40,
40, 40, 40, 40), RESTART_HEADER_EXISTS = c(0, 0, 0, 0, 0)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
You're using a variable that is not numeric, look at this:
# Inspect the type of the first column. `[[` extracts the column itself for
# both data frames and tibbles; on a tibble, `j1[, 1]` would return a
# one-column tibble and report the tibble classes instead of "character".
class(j1[[1]])
[1] "character"
You have to remove it to make kmeans work:
# Seed the RNG so the randomly chosen starting centres are reproducible, then
# cluster all columns except the first (the names column) into 2 groups.
set.seed(1234)
kmeans(x = j1[, -1], centers = 2)

How would I chart three percent columns into a stacked percent barplot using R?

I have the raw totals for three values that I was looking to display over time in a stacked bar chart, but I don't know how to display this.
I have the percentage values (.22, et cetera), and the raw numbers.
How would I create a stacked bar chart using ggplot2 considering I have three proportions I am trying to graph. Do I need to melt the data?
I would like to do something like:
# Template of the desired chart: one bar per `specie`, filled by `condition`,
# with position = "fill" so each bar is stacked and scaled to proportions.
# NOTE(review): `data`, `condition`, `value` and `specie` are placeholder names
# that do not exist in the data frame shown below.
ggplot(data, aes(fill=condition, y=value, x=specie)) +
geom_bar( stat="identity", position="fill")
But I do not know how to do this as my data isn't formatted right. Should I use dplyr?
Here is my df:
structure(list(date = structure(c(17405, 17406, 17407, 17408,
17409, 17410, 17411, 17412, 17413, 17414), class = "Date"), total_membership = c(1,
1, 1, 1, 1, 188, 284, 324, 354, 390), full_members = c(1, 1,
1, 1, 1, 188, 284, 324, 354, 390), guests = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), daily_active_members = c(1, 1, 1, 1, 1, 169,
225, 214, 203, 254), daily_members_posting_messages = c(1, 0,
1, 0, 1, 111, 110, 96, 67, 70), weekly_active_members = c(1,
1, 1, 1, 1, 169, 270, 309, 337, 378), weekly_members_posting_messages = c(1,
1, 1, 1, 1, 111, 183, 218, 234, 255), messages_in_public_channels = c(4,
0, 0, 0, 1, 252, 326, 204, 155, 135), messages_in_private_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), messages_in_shared_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), messages_in_d_ms = c(1, 0, 0, 0,
0, 119, 46, 71, 70, 122), percent_of_messages_public_channels = c(0.8,
0, 0, 0, 1, 0.6792, 0.8763, 0.7418, 0.6889, 0.5253), percent_of_messages_private_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), percent_of_messages_d_ms = c(0.2,
0, 0, 0, 0, 0.3208, 0.1237, 0.2582, 0.3111, 0.4747), percent_of_views_public_channels = c(0.2857,
1, 1, 1, 1, 0.8809, 0.9607, 0.945, 0.9431, 0.9211), percent_of_views_private_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), percent_of_views_d_ms = c(0.7143,
0, 0, 0, 0, 0.1191, 0.0393, 0.055, 0.0569, 0.0789), name = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), public_channels_single_workspace = c(10,
10, 11, 11, 12, 12, 12, 13, 13, 13), messages_posted = c(35,
35, 37, 38, 66, 1101, 1797, 2265, 2631, 3055)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
Here is an example using a toy data set, where the original data are first grouped and summarised to get the 'proportions', then piped to ggplot, which will automatically create a stacked bar plot
# Toy data: 1000 rows, each with a random group (a-j), species (1-4) and
# amount (10-30). `replace = TRUE` is spelled out because the `T` shorthand is
# an ordinary variable that can be reassigned.
df <- data.frame(
  group = sample(letters[1:10], 1000, replace = TRUE),
  species = sample(1:4, 1000, replace = TRUE),
  amount = sample(10:30, 1000, replace = TRUE)
)
# dplyr supplies %>%, group_by() and summarise(); ggplot2 supplies the plot.
library(dplyr)
library(ggplot2)

# Mean amount for every group/species combination, drawn as stacked bars
# (one bar per group, one coloured segment per species).
df %>%
  group_by(group, species) %>%
  summarise(perc = mean(amount)) %>%
  ggplot(aes(group, perc, fill = factor(species))) +
  geom_bar(stat = "identity")
UPDATE
This will calculate the proportion that 'species' occurs within each 'group'.
# Share of each species within its group: count the rows per group/species
# pair, divide by the group's total, and stack the shares in one bar per group.
df %>%
  count(group, species) %>%
  group_by(group) %>%
  mutate(perc = n / sum(n)) %>%
  ggplot(aes(group, perc, fill = factor(species))) +
  geom_bar(stat = "identity")

Resources