How to make plot labels clear - r

I made a plot showing the number of occurrences of each modality for many variables.
It is part of a clustering problem: the goal is to show which variables explain each cluster.
Here is a sample of the data:
> dput(DATA1[1:20,])
structure(list(TYPE_PEAU = structure(c(1L, 2L, 1L, 3L, 1L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L), .Label = c("Sèche",
"Mixte", "Normale", "Grasse"), class = "factor"), SENSIBILITE = structure(c(2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 1L), .Label = c("Aucune", "Fréquente", "Occasionnelle"
), class = "factor"), IMPERFECTIONS = structure(c(2L, 2L, 3L,
2L, 3L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 2L,
3L), .Label = c("Fréquente", "Occasionnelle", "Rares"), class = "factor"),
BRILLANCE = structure(c(3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L), .Label = c("Aucune",
"Partout", "Zone T"), class = "factor"), GRAIN_PEAU = structure(c(1L,
2L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 1L, 2L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L), .Label = c("Fin", "Moyen", "Dilaté"), class = "factor"),
RIDES_VISAGE = structure(c(3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L), .Label = c("Aucune",
"Très visibles", "Visibles"), class = "factor"), ALLERGIES = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Non", "Oui"), class = "factor"),
MAINS = structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 3L, 2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("Sèches",
"Normales", "Moites"), class = "factor"), PEAU_CORPS = structure(c(3L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 3L, 2L, 3L,
2L, 3L, 3L, 1L), .Label = c("Normale", "Sèche", "Très sèche"
), class = "factor"), INTERET_ALIM_NATURELLE = structure(c(3L,
1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), INTERET_ORIGINE_GEO = structure(c(3L,
1L, 1L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 1L,
3L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), INTERET_VACANCES = structure(c(1L,
1L, 2L, 2L, 3L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 2L), .Label = c("À la mer", "À la montagne",
"En ville"), class = "factor"), INTERET_ENVIRONNEMENT = structure(c(3L,
1L, 1L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), INTERET_COMPOSITION = structure(c(3L,
1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), PRIORITE_1 = structure(c(1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L,
3L, 1L), .Label = c("éclatante", "hydratée", "lisse", "matifiée",
"nourrie", "purifiée", "reposée"), class = "factor"), MILIEU_RESIDENCE = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 2L), .Label = c("nature", "urbain"), class = "factor")), .Names = c("TYPE_PEAU",
"SENSIBILITE", "IMPERFECTIONS", "BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE",
"ALLERGIES", "MAINS", "PEAU_CORPS", "INTERET_ALIM_NATURELLE",
"INTERET_ORIGINE_GEO", "INTERET_VACANCES", "INTERET_ENVIRONNEMENT",
"INTERET_COMPOSITION", "PRIORITE_1", "MILIEU_RESIDENCE"), row.names = c(1L,
2L, 11L, 13L, 15L, 16L, 17L, 20L, 23L, 32L, 33L, 34L, 37L, 38L,
39L, 40L, 42L, 43L, 45L, 48L), class = "data.frame")
Then I use this code:
library(tidyverse)
# Reshape DATA1 to long format (column name in `k`, modality in `v`), then
# draw one horizontal bar chart of modality counts per original column.
ggplot(gather(DATA1, k, v), aes(x = v)) +
  geom_bar(width = 0.7, fill = "orange") +
  facet_wrap(~k) +
  coord_flip()
Then I get as result this plot:
But as you can see, the labels on the vertical axis are not clear.
How can I resolve this issue?

You can try to resize the text:
# Same bar chart as in the question, with a smaller bold font for the
# vertical-axis text so that the modality labels no longer overlap.
ggplot(gather(DATA1, k, v), aes(x = v)) +
  geom_bar(width = 0.7, fill = "orange") +
  coord_flip() +
  facet_wrap(~k) +
  theme(
    axis.text.y = element_text(size = 4, face = "bold", colour = "black")
  )
And/or abbreviate the labels with:
+ scale_x_discrete(labels = abbreviate)

Related

Plot linear regression analysis with error bar for variability

I wanted to make plots that look like figure 1 (source: link)
In figure 1, they have plotted the regression analysis with one-year yield variability. In my case, I would like to plot variability between two locations and 4 blocks for each treatment group. So the plot I wanted would have three facets for factors B.glucosidase, Protein, POX.C of variable and four colors for treatments factors. Also, in my current plot I have legend for block and treatment. I should only have treatment because the block should be used for making error bar for variability.
I tried with this code, which obviously doesn't work for what I want. (Data for df.melted included below.)
# Yield against each measured variable (one facet per variable), with points
# shaped by block, a linear fit, R^2 labels and error bars on yield.
ggplot(df.melted, aes(x = value, y = yield, color = as.factor(treatment))) +
  geom_point(aes(shape = as.factor(block))) +
  stat_smooth(
    method = "lm", formula = y ~ x, col = "darkslategrey", se = FALSE
  ) +
  stat_poly_eq(
    formula = y ~ x,
    # aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    aes(label = after_stat(rr.label)),
    parse = TRUE
  ) +
  theme_classic() +
  # Refer to columns by bare name inside aes(): `df.melted$yield` bypasses
  # ggplot2's data handling and can misalign rows once the plot is faceted.
  # sd(yield) is still computed over all rows of the layer data, as before.
  geom_errorbar(
    aes(ymax = yield + sd(yield), ymin = yield - sd(yield)),
    width = 0.05
  ) +
  facet_wrap(~variable)
Data:
df.melted <- structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("M", "U"), class = "factor"),
treatment = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("CC",
"CCS", "CS", "SCS"), class = "factor"), block = c(1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L), yield = c(5156L, 5157L, 5551L, 5156L, 4804L,
4720L, 4757L, 5021L, 4826L, 4807L, 4475L, 4596L, 4669L, 4588L,
4542L, 4592L, 5583L, 5442L, 5693L, 5739L, 5045L, 4902L, 5006L,
5086L, 4639L, 4781L, 4934L, 4857L, 4537L, 4890L, 4842L, 4608L,
5156L, 5157L, 5551L, 5156L, 4804L, 4720L, 4757L, 5021L, 4826L,
4807L, 4475L, 4596L, 4669L, 4588L, 4542L, 4592L, 5583L, 5442L,
5693L, 5739L, 5045L, 4902L, 5006L, 5086L, 4639L, 4781L, 4934L,
4857L, 4537L, 4890L, 4842L, 4608L, 5156L, 5157L, 5551L, 5156L,
4804L, 4720L, 4757L, 5021L, 4826L, 4807L, 4475L, 4596L, 4669L,
4588L, 4542L, 4592L, 5583L, 5442L, 5693L, 5739L, 5045L, 4902L,
5006L, 5086L, 4639L, 4781L, 4934L, 4857L, 4537L, 4890L, 4842L,
4608L), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("B.glucosidase",
"Protein", "POX.C"), class = "factor"), value = c(1.600946,
1.474084, 1.433078, 1.532492, 1.198667, 1.193193, 1.214941,
1.360981, 1.853056, 1.690117, 1.544357, 1.825132, 1.695409,
1.764123, 1.903743, 1.538684, 0.845077, 1.011463, 0.857032,
0.989803, 0.859022, 0.919467, 1.01717, 0.861689, 0.972332,
0.952922, 0.804431, 0.742634, 1.195837, 1.267285, 1.08571,
1.20097, 6212.631579, 5641.403509, 4392.280702, 7120.701754,
5305.964912, 4936.842105, 5383.157895, 6077.894737, 5769.122807,
5016.842105, 5060.350877, 5967.017544, 5576.842105, 5174.035088,
5655.438596, 5468.77193, 7933.333333, 7000, 6352.982456,
8153.684211, 6077.894737, 4939.649123, 5002.807018, 6489.122807,
4694.035088, 5901.052632, 4303.859649, 6768.421053, 6159.298246,
6090.526316, 4939.649123, 5262.45614, 810.3024, 835.5242,
856.206, 759.8589, 726.2298, 792.6472, 724.7165, 699.3266,
500.9153, 634.8698, 637.9536, 648.8814, 641.0357, 623.3822,
555.2834, 520.8119, 683.3528, 595.9173, 635.4315, 672.4234,
847.2944, 745.5665, 778.3548, 735.8141, 395.2647, 570.4148,
458.0383, 535.3851, 678.0293, 670.7419, 335.2923, 562.5674
)), row.names = c(NA, -96L), class = "data.frame")
library(dplyr)
library(ggplot2)
library(ggpmisc)
Summarize the data frame first (this could also be done with stat_summary(), but it's often clearer and more transparent to do it explicitly up front). Because your data set is balanced, I think you could collapse/average over the block structure first and then build the whole plot from the reduced data set: it shouldn't change the outcome of the linear regressions at all, at least not the mean values, and any statistical comparisons should probably be done on block-level summaries anyway.
# Average over blocks: one row per Location x treatment x variable, keeping
# the between-block standard deviation of yield for the error bars.
df.sum <- df.melted %>%
  group_by(Location, treatment, variable) %>%
  summarise(
    value = mean(value),
    yield_sd = sd(yield),
    ## collapse yield to mean *after* computing sd!
    yield = mean(yield),
    # Return an ungrouped result instead of one still grouped by
    # Location and treatment; this also silences the regrouping message.
    .groups = "drop"
  )
Plot:
# The raw observations in df.melted drive the regression line and the R^2
# label; the summarised rows in df.sum are drawn as points with error bars
# of +/- one standard deviation of yield.
ggplot(df.melted, aes(x = value, y = yield, color = treatment)) +
  stat_smooth(
    method = "lm", formula = y ~ x, col = "darkslategrey", se = FALSE
  ) +
  stat_poly_eq(
    formula = y ~ x,
    ## aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    # group = 1 pools the treatments, giving a single R^2 label per facet.
    aes(group = 1, label = after_stat(rr.label)),
    parse = TRUE
  ) +
  theme_classic() +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_shape(guide = "none") +
  geom_point(data = df.sum) +
  geom_errorbar(
    data = df.sum,
    aes(ymax = yield + yield_sd, ymin = yield - yield_sd),
    width = 0.05
  ) +
  # Spell out `scales` rather than relying on partial argument matching.
  facet_wrap(~variable, scales = "free_x")
(adding group=1 to the stat_poly_eq() aesthetics means we only plot a single R^2 value per facet)
Since you're no longer using the shape aesthetic for anything, you could consider using it to show the Location variable ...

Geom Bar Plot, sum of count not visible

I have a dataframe tag, with 51X5 structure
structure(list(Tagging = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("CIRCLE CAMPIAGN",
"NATIONAL CAMPIAGN"), class = "factor"), Status = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Negative", "Positive"), class = "factor"),
Month = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("JUL",
"JUN", "MAY"), class = "factor"), Category = structure(c(1L,
4L, 6L, 1L, 2L, 4L, 6L, 1L, 2L, 4L, 5L, 6L, 1L, 2L, 4L, 5L,
6L, 1L, 2L, 4L, 5L, 6L, 1L, 2L, 4L, 6L, 1L, 4L, 6L, 2L, 3L,
4L, 6L, 1L, 2L, 3L, 4L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L,
3L, 4L, 5L, 6L, 6L), .Label = c("Data", "Other", "Roaming",
"Unlimited", "VAS", "Voice"), class = "factor"), count = c(3L,
2L, 1L, 4L, 5L, 2L, 1L, 2L, 6L, 7L, 2L, 3L, 4L, 9L, 6L, 2L,
3L, 3L, 3L, 10L, 2L, 5L, 5L, 5L, 4L, 3L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 4L, 1L, 1L, 3L, 3L, 2L,
1L, 1L, 1L, 3L, 4L, 2L)), class = "data.frame", row.names = c(NA,
-51L))
I want to create a bar plot (ggplot) in which each bar is labelled with the sum of count over all categories, month by month. I am using the code below:
# Stacked bars of count by Tagging and Status, one panel per Month, with
# every row's count written at the middle of its bar segment.
ggplot(tag, aes(Tagging, count, fill = Status)) +
  geom_col() +
  geom_text(
    aes(label = count),
    position = position_stack(vjust = 0.5),
    size = 3,
    colour = "white"
  ) +
  facet_wrap(~Month) +
  labs(
    title = "FlyTxt ROI", subtitle = "Statistics",
    x = "Tagging", y = "Count"
  ) +
  theme_minimal()
But I am getting split count values:
Help as I want only sum of count for each status
The problem is that geom_col accumulates the counts over all categories when it draws each bar, but geom_text does not: it writes one label per row.
One option is to pre-summarize the data (to get rid of the category split) and then plot the graph.
library(tidyverse)
# Sum the counts over Category so that each Tagging/Status/Month bar
# segment corresponds to exactly one row, and therefore one label.
tag_sum <- tag %>%
  group_by(Tagging, Status, Month) %>%
  # .groups = "drop" returns an ungrouped result and silences the
  # regrouping message that summarise() would otherwise print.
  summarise(count_sm = sum(count), .groups = "drop")

ggplot(tag_sum, aes(x = Tagging, y = count_sm, fill = Status)) +
  geom_col() +
  geom_text(
    aes(label = count_sm),
    color = "white", size = 3,
    position = position_stack(vjust = 0.5)
  ) +
  facet_wrap(~Month) +
  labs(
    x = "Tagging", y = "Count",
    title = "FlyTxt ROI", subtitle = "Statistics"
  ) +
  theme_minimal()

bestglm : Error in levels(x)[x] : only 0's may be mixed with negative subscripts

I was trying to use the bestglm function with the AIC criterion to come up with a logistic regression model.
The following is a summary of the data set I ran it on:
dataset summary
The following is the line I ran:
best1 <- bestglm(trainset, IC="AIC", family=binomial)
The following is the error message I have received:
Error in levels(x)[x] : only 0's may be mixed with negative subscripts
In addition: Warning messages:
1: In model.response(mf, "numeric") :
using type = "numeric" with a factor response will be ignored
2: In Ops.factor(y, z$residuals) : ‘-’ not meaningful for factors
dput(testset)
structure(list(EyeContact = structure(c(2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
2L), .Label = c("N", "Y"), class = "factor"), Post.Processing = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor"),
HairColour = structure(c(3L, 2L, 2L, 2L, 2L, 2L, 4L, 2L,
1L, 3L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 4L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 4L, 1L,
2L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 2L, 1L, 4L, 2L, 2L, 1L, 1L,
4L, 1L, 2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 4L,
1L, 2L, 1L, 1L, 4L), .Label = c("BL", "BR", "NULL", "O"), class = "factor"),
Animals = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
Age = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("21", "22", "23"), class = "factor"),
Backview = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
SkinTone = structure(c(3L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 3L, 3L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 3L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 2L), .Label = c("Dark", "Fair", "NULL"), class = "factor"),
Smile = structure(c(5L, 3L, 1L, 1L, 5L, 4L, 1L, 1L, 5L, 1L,
4L, 4L, 1L, 1L, 4L, 3L, 1L, 2L, 2L, 1L, 4L, 3L, 5L, 5L, 1L,
3L, 1L, 5L, 5L, 2L, 5L, 1L, 2L, 5L, 1L, 2L, 2L, 1L, 4L, 5L,
5L, 4L, 3L, 5L, 2L, 4L, 2L, 3L, 5L, 3L, 5L, 4L, 1L, 5L, 5L,
4L, 5L, 5L, 5L, 1L, 5L, 2L, 2L, 1L, 5L, 5L, 3L, 5L, 4L, 4L,
5L, 4L, 1L, 3L, 2L, 1L, 1L, 5L, 4L, 5L, 4L, 5L, 5L, 1L, 2L,
4L, 3L, 5L, 5L, 1L, 5L, 1L, 4L, 1L, 4L, 5L, 1L, 5L, 4L, 4L,
5L, 5L, 1L), .Label = c("CS", "NS", "NULL", "O", "ST"), class = "factor"),
HairLength = structure(c(1L, 3L, 2L, 2L, 2L, 1L, 3L, 3L,
1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 3L, 3L, 1L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 3L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 3L, 1L, 3L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 3L), .Label = c("L", "NULL", "SM"), class = "factor"),
HairTexture = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 1L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 2L, 1L, 3L, 1L, 3L, 1L, 3L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 2L, 1L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 2L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L,
1L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L,
1L, 3L, 3L, 3L, 3L), .Label = c("C", "NULL", "S"), class = "factor"),
HairStyle = structure(c(1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L), .Label = c("LD", "NULL", "T"), class = "factor"),
Outfit = structure(c(2L, 1L, 2L, 1L, 3L, 1L, 1L, 4L, 1L,
4L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 2L, 3L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 4L, 4L, 2L, 1L, 1L, 2L, 3L, 3L, 4L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
4L, 3L, 4L, 1L, 1L, 1L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L,
2L, 2L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 4L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 1L, 1L,
2L, 4L, 1L, 4L), .Label = c("D", "I", "NULL", "O"), class = "factor"),
Background = structure(c(2L, 4L, 1L, 4L, 3L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 4L, 2L, 1L, 4L, 1L, 4L, 1L, 1L, 4L,
1L, 3L, 2L, 1L, 1L, 4L, 2L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L,
1L, 4L, 2L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 4L, 1L, 3L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 3L, 2L, 1L, 2L, 4L, 4L, 4L, 1L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 3L, 2L, 4L, 2L, 4L, 1L, 1L, 4L, 3L, 3L, 1L, 2L, 4L,
1L, 3L, 4L, 4L, 3L), .Label = c("I", "N", "NULL", "P"), class = "factor"),
TypeofShot = structure(c(1L, 4L, 1L, 4L, 2L, 4L, 1L, 1L,
4L, 1L, 1L, 2L, 1L, 1L, 4L, 3L, 4L, 1L, 1L, 3L, 4L, 3L, 3L,
3L, 4L, 4L, 2L, 1L, 3L, 1L, 3L, 4L, 1L, 4L, 1L, 1L, 2L, 1L,
1L, 4L, 1L, 1L, 4L, 4L, 2L, 1L, 3L, 4L, 1L, 1L, 2L, 1L, 4L,
4L, 3L, 1L, 4L, 1L, 3L, 1L, 4L, 1L, 1L, 1L, 1L, 3L, 1L, 1L,
2L, 2L, 1L, 4L, 1L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L,
3L, 4L, 2L, 3L, 3L, 1L, 3L, 4L, 1L, 3L, 2L, 1L, 1L, 1L, 3L,
2L, 1L, 4L, 3L, 4L), .Label = c("CU", "ECU", "LS", "MS"), class = "factor"),
Obstruction = structure(c(1L, 2L, 1L, 1L, 1L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 2L, 1L,
1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 1L, 2L, 1L, 1L, 3L,
1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("N", "NULL", "Y"), class = "factor"),
Makeup = structure(c(4L, 4L, 2L, 2L, 3L, 2L, 2L, 3L, 2L,
2L, 1L, 1L, 2L, 2L, 3L, 4L, 1L, 2L, 2L, 4L, 1L, 4L, 2L, 3L,
4L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 3L, 2L,
4L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L,
3L, 2L, 3L, 2L, 4L, 4L, 2L, 3L, 2L, 1L, 2L, 3L, 3L, 1L, 2L,
1L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 4L, 2L, 2L,
1L, 3L, 1L, 1L), .Label = c("H", "L", "N", "NULL"), class = "factor"),
Results = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
prediction = c(9.32475933917106e-09, 0.0385259384817495,
0.0678681154072461, 0.234968717458685, 0.0290199853775816,
0.171162293958793, 0.00129264601900783, 0.00675484440459677,
0.128155946032347, 0.133539709174044, 0.118744423809008,
0.060206929901843, 0.128155946032347, 0.146426608321148,
0.0552623520735392, 0.227991153820736, 2.54581077993876e-08,
0.0195543511193415, 0.128155946032347, 0.256310568145846,
0.0520386124569491, 0.111383163512112, 0.0402597164944323,
0.0141022328039524, 0.55471858422641, 0.128155946032347,
0.35526622136263, 0.128155946032347, 0.382743622548627, 0.00485338573377989,
0.128155946032347, 0.0324058895421302, 0.320728574893713,
0.320728574893713, 0.35526622136263, 0.146426608321148, 0.0179540767871002,
0.398798221640772, 0.362407391381727, 0.00485338573377989,
0.00129264601900783, 0.128155946032347, 0.0823507208338033,
0.00675484440459677, 0.0195543511193415, 0.320728574893713,
0.128155946032347, 0.174534177022049, 0.0477307982973154,
0.0625662879441275, 0.0174929064796301, 0.135882446473831,
0.00696631574219797, 0.419831884479578, 0.0862150002573959,
0.128155946032347, 0.0698582713166507, 0.128155946032347,
0.174534177022049, 0.146426608321148, 0.0234463612462439,
0.0141022328039524, 0.0239924885903984, 0.0290199853775816,
3.15391485574326e-09, 1.14002192545012e-08, 0.0345251778805331,
0.208346726243955, 0.0203551415502053, 0.020830802150735,
0.128155946032347, 0.197915823620481, 0.146426608321148,
9.32475933917106e-09, 9.32475933917106e-09, 0.128155946032347,
0.0552623520735392, 0.016802787713206, 0.0345251778805331,
0.146426608321148, 0.00675484440459677, 0.00579370288906212,
0.320728574893713, 0.00316694181006374, 0.320728574893713,
0.146426608321148, 1.66951123737628e-08, 0.0466701670833381,
0.0402597164944323, 0.382743622548627, 0.128155946032347,
0.128155946032347, 0.118744423809008, 0.171162293958793,
0.0402597164944323, 0.146426608321148, 0.0895467055067367,
0.0110101302622226, 0.05872534886842, 0.35526622136263, 0.0141022328039524,
0.118744423809008, 0.00414031965843898)), .Names = c("EyeContact",
"Post.Processing", "HairColour", "Animals", "Age", "Backview",
"SkinTone", "Smile", "HairLength", "HairTexture", "HairStyle",
"Outfit", "Background", "TypeofShot", "Obstruction", "Makeup",
"Results", "prediction"), row.names = c(2L, 3L, 9L, 17L, 19L,
22L, 23L, 28L, 29L, 41L, 42L, 45L, 47L, 53L, 55L, 67L, 68L, 69L,
72L, 78L, 80L, 81L, 82L, 83L, 84L, 90L, 94L, 95L, 101L, 103L,
106L, 111L, 113L, 116L, 118L, 119L, 120L, 122L, 123L, 128L, 130L,
134L, 136L, 138L, 144L, 146L, 148L, 150L, 152L, 161L, 162L, 163L,
165L, 168L, 174L, 175L, 180L, 181L, 183L, 194L, 204L, 207L, 210L,
213L, 214L, 215L, 221L, 224L, 230L, 234L, 235L, 236L, 237L, 239L,
240L, 244L, 249L, 250L, 255L, 259L, 262L, 272L, 277L, 278L, 280L,
281L, 284L, 289L, 296L, 297L, 304L, 306L, 308L, 316L, 321L, 323L,
327L, 329L, 332L, 335L, 337L, 339L, 340L), class = "data.frame")
The model is running, it gives some output but the print method doesn't work.
> print(best1)
AIC
Best Model:
Error in levels(x)[x] : only 0's may be mixed with negative subscripts
In addition: Warning messages:
1: In model.response(mf, "numeric") :
using type = "numeric" with a factor response will be ignored
2: In Ops.factor(y, z$residuals) : '-' not meaningful for factors
but best1 structure is correct and best1$BestModel is provided
best1$BestModel
Call: glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Coefficients:
(Intercept) Post.ProcessingY Age22 Age23
-40.416 -244.338 59.277 -41.652
SkinToneFair SkinToneNULL SmileNS SmileNULL
245.316 -5.102 -80.986 -142.908
SmileO SmileST HairLengthNULL HairLengthSM
-121.258 -80.482 -159.677 -20.045
OutfitI OutfitNULL OutfitO BackgroundN
41.652 -41.653 -410.492 19.895
BackgroundNULL BackgroundP TypeofShotECU TypeofShotLS
-82.640 -208.283 16.369 -101.467
TypeofShotMS MakeupL MakeupN MakeupNULL
101.819 39.438 -122.850 285.187
Degrees of Freedom: 102 Total (i.e. Null); 79 Residual
Null Deviance: 69.99
Residual Deviance: 5.545 AIC: 53.55
You could replace the print.bestglm method with
# Replacement print method for "bestglm" objects.
#
# Arguments:
#   x    An object carrying `Title`, `ModelReport` (with `Bestk`,
#        `IncludeInterceptQ` and `NumDF`) and `BestModel`.
#   ...  Further arguments forwarded to print() for the model summary.
#
# Prints the title, then either the summary of the best model or a note that
# the best model is the null model. Returns `x` invisibly, as print methods
# conventionally do, so the call can be piped or assigned without echoing.
print.bestglm <- function(x, ...) {
  cat(x$Title, fill = TRUE)
  report <- x$ModelReport
  if (!(report$Bestk > 0 || report$IncludeInterceptQ)) {
    cat("Best Model is the null model with no parameters.", fill = TRUE)
    return(invisible(x))
  }
  cat("Best Model:", fill = TRUE)
  fit_summary <- summary(x$BestModel)
  # Show the full summary when any NumDF entry exceeds 1 (presumably a term
  # spanning several degrees of freedom, such as a multi-level factor);
  # otherwise the coefficient table alone is printed.
  out <- if (any(report$NumDF > 1)) fit_summary else fit_summary$coefficients
  print(out, ...)
  invisible(x)
}
The problem is that the code uses a deprecated feature. It calls aov on a glm object, which is wrong. I think that using the replacement function should solve the problem.

Finding Outliers and ChiSquare Matrix of a purely Categorical Dataset

I have been assigned the task of making a prediction model. The data set given to me is purely categorical and consists of 92 variables. A portion of it is given below:
Dataset <- structure(list(Age.Group = structure(c(1L, 2L, 3L, 3L, 4L, 4L,
4L, 1L, 4L, 4L, 2L, 1L, 2L, 5L, 3L, 2L, 1L, 4L, 1L, 4L, 4L, 3L,
4L, 2L, 2L, 1L, 4L, 2L, 3L, 2L, 4L, 4L, 3L, 3L, 3L, 3L, 5L, 3L,
2L, 2L, 2L, 2L, 4L, 2L, 3L, 4L, 3L, 3L, 1L, 4L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), Sex = structure(c(2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
1L), .Label = c("Female", "Male"), class = "factor"), LOS = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L), .Label = c("Abnormal", "Normal"), class = "factor"), Day.to.Operation = structure(c(1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
1L), .Label = c("Abnormal", "Normal"), class = "factor"), Admit.Source = structure(c(2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Emergency", "Outpatient clinic"), class = "factor"),
Insurance.Payors = structure(c(3L, 1L, 3L, 3L, 1L, 1L, 1L,
3L, 1L, 3L, 1L, 3L, 1L, 1L, 5L, 1L, 1L, 2L, 1L, 5L, 1L, 5L,
1L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 5L, 1L, 1L, 1L, 5L, 5L,
1L, 1L, 1L, 1L, 1L, 3L, 5L, 1L, 1L, 1L, 1L, 3L, 4L), .Label = c("Basic medical insurance for urban residents",
"Basic medical insurance for urban residents Others", "Free Medical Care",
"New Rural Cooperative Medical Care", "Self payment"), class = "factor"),
Current.Recent.Smoker...1.year. = structure(c(1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L), .Label = c("No", "Yes"), class = "factor"), Hypertension = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
Dyslipidemia = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), Family.History.of.Premature.CAD = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
MI.History = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), Heart.Failure.History = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
PCI.History = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), BMI.Group = structure(c(3L, 2L,
3L, 2L, 3L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 3L,
3L, 3L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 3L,
3L, 4L, 2L), .Label = c("2", "3", "4", "5"), class = "factor"),
Cerebrovascular.Disease = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("No", "Yes"), class = "factor"), Peripheral.Arterial.Disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Chronic.Lung.Disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), Diabetes.Mellitus = structure(c(2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
Diabetes.Therapy = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 3L, 4L, 2L, 4L, 4L, 1L, 2L, 4L, 4L, 4L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L,
2L, 4L, 4L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 4L, 4L, 2L), .Label = c("Diet",
"Insulin", "N/A", "Oral"), class = "factor"), Heart.Rate = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 2L), .Label = c("Abnormal", "Normal"), class = "factor"),
CAD.Presentation = structure(c(3L, 5L, 5L, 4L, 5L, 5L, 4L,
1L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 1L, 5L, 5L, 5L, 3L, 5L, 5L,
5L, 1L, 5L, 5L, 5L, 5L, 5L, 3L, 4L, 1L, 5L, 5L, 5L, 5L, 3L,
5L, 4L, 3L, 5L, 4L, 5L, 5L, 2L, 5L, 5L, 3L, 1L, 1L), .Label = c("Non STEMI 7 days",
"Silent myocardial ischemia 14 days", "Stable angina 42 days",
"STEMI 7 days", "Unstable angina 60 days"), class = "factor"),
STEMI.Non.STEMI.Onset.Date = structure(c(1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L), .Label = c("0", "1", "17"), class = "factor"), STEMI.Non.STEMI.Estimated.Time = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Anginal.Classification.w.in.2.Weeks = structure(c(2L, 4L,
3L, 5L, 1L, 5L, 4L, 1L, 5L, 4L, 5L, 2L, 2L, 3L, 1L, 1L, 2L,
5L, 5L, 3L, 2L, 5L, 2L, 2L, 2L, 4L, 1L, 2L, 3L, 5L, 2L, 4L,
3L, 5L, 4L, 4L, 5L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 5L, 2L, 3L,
2L, 1L, 2L), .Label = c("CCS I", "CCS II", "CCS III", "CCS IV",
"No symptoms"), class = "factor"), Anti.Anginal.Drug.Therapy.within.2.Weeks = structure(c(2L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor")), .Names = c("Age.Group",
"Sex", "LOS", "Day.to.Operation", "Admit.Source", "Insurance.Payors",
"Current.Recent.Smoker...1.year.", "Hypertension", "Dyslipidemia",
"Family.History.of.Premature.CAD", "MI.History", "Heart.Failure.History",
"PCI.History", "BMI.Group", "Cerebrovascular.Disease", "Peripheral.Arterial.Disease",
"Chronic.Lung.Disease", "Diabetes.Mellitus", "Diabetes.Therapy",
"Heart.Rate", "CAD.Presentation", "STEMI.Non.STEMI.Onset.Date",
"STEMI.Non.STEMI.Estimated.Time", "Anginal.Classification.w.in.2.Weeks",
"Anti.Anginal.Drug.Therapy.within.2.Weeks"), class = "data.frame", row.names = c(NA,
-50L))
So far I have performed string cleaning and missing-data treatment. My next task, which I need help with, is to remove outliers and compute a chi-square matrix from this categorical dataset. I am new to data analysis and quite confused at this point, so I would be extremely grateful for any help.

Bar chart for factorial designs in R

I'm currently trying to create a clustered bar chart using ggplot2. It's basically just mean response times for a 2x2x2 factorial design. The three factors are load, compatibility and salience. I'm having a hard time fitting the third factor (salience) in there, and it shouldn't be a stacked graph.
This is what I currently have:
# Asker's attempt: dodged bars of the mean plus error bars, added to an
# existing plot object `bar` (its definition is not shown here).
# NOTE(review): R parses a line that begins with `+` as unary plus applied to
# the layer. If the leading `+` on the next two lines is literal (and not a
# copied console continuation prompt), that is what raises
# "invalid argument to unary operator" - confirm against the asker's script.
bar+stat_summary(fun.y = mean, geom = "bar", position = "dodge") +
+ stat_summary(fun.data = mean_cl_normal, geom = "errorbar", position = position_dodge(width = 0.90), width = 0.2)+
+ labs(x = "Compatibility", y = "Mean RT", fill = "Load")
Here's a small sample of the data I'm trying to graph:
ID load comp sal rt
1 1 High Incompatible Non_Salient 787
2 1 Low Compatible Salient 754
3 2 High Incompatible Salient 654
I've seen graphs like these numerous times before but I have no idea how to get ggplot2 to display three independent variables at the same time.
I've tried splitting the graphs by adding
+ facet_wrap( ~ sal)
but that doesn't work either. It just says "Invalid argument to unary operator"
Any help would be appreciated.
Is this the kind of plot you are looking for?
I used the Wii data from the book "Discovering Statistics Using R", which is in a similar format to yours.
# Example data in dput() form: 120 rows with three two-level factors
# (athlete, stretch, wii) and an integer response (injury).
# Bound to `Wii` so that the plotting code that follows can refer to it.
Wii <- structure(list(athlete = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Athlete", "Non-Athlete"), class = "factor"),
stretch = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No Stretching", "Stretching"
), class = "factor"), wii = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Playing Wii",
"Watching Wii"), class = "factor"), injury = c(2L, 2L, 1L,
2L, 0L, 1L, 2L, 0L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 0L, 0L, 3L,
3L, 3L, 2L, 1L, 0L, 2L, 2L, 3L, 2L, 2L, 3L, 1L, 2L, 4L, 1L,
2L, 2L, 2L, 1L, 4L, 4L, 1L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L,
2L, 2L, 1L, 0L, 3L, 3L, 2L, 1L, 2L, 4L, 1L, 2L, 5L, 5L, 3L,
6L, 4L, 3L, 4L, 5L, 5L, 2L, 6L, 4L, 4L, 4L, 3L, 4L, 3L, 2L,
1L, 4L, 3L, 2L, 2L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 7L, 8L, 6L,
9L, 4L, 7L, 5L, 9L, 6L, 4L, 8L, 5L, 4L, 7L, 10L, 1L, 3L,
2L, 1L, 3L, 3L, 2L, 3L, 4L, 2L, 0L, 1L, 3L, 2L, 0L)),
.Names = c("athlete", "stretch", "wii", "injury"),
class = "data.frame", row.names = c(NA, -120L))
Here is how to produce the plot.
# Clustered bar chart of mean injury with confidence-interval error bars:
# `stretch` on the x axis, `wii` as the dodged fill colour, and one panel per
# `athlete` level, so all three factors are shown at once.
# `Wii` must hold the data frame given by the dput() output that precedes this
# code (i.e. Wii <- structure(...)).
library(ggplot2)
library(Hmisc)  # mean_cl_normal() needs Hmisc to be installed

ggplot(data = Wii, aes(x = stretch, y = injury, fill = wii)) +
  facet_wrap(~athlete) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  stat_summary(
    fun.data = mean_cl_normal,
    geom = "errorbar",
    # Dodge the error bars explicitly so they line up with the dodged bars.
    position = position_dodge(width = 0.90),
    width = 0.2
  )

Resources