I have a dataset, shown below, with many columns. Some of the column headings are:
baked_hamburgur,spinach,mashed_potato,cabbages,jello,rolls,brown,milk,coffee,water,cakes,vanilla,chocolate,fruitsalad
There are other columns as well, but I am only interested as of now in the above columns.
The value in each row of these columns is either yes or no.
A screenshot of the data is shown below, as I am not able to attach or share the file in the question itself.
The dput(head()) output is as under:
> dput(head(illness_data))
structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L
), .Label = c("10", "106", "11", "12", "14", "15", "16", "17",
"18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28",
"30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43",
"44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"),
sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L), .Label = c("-1",
"Female", "Male"), class = "factor"), timesupper = c(2000L,
1830L, 1830L, 1930L, 1930L, 1930L), ill = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "yes", class = "factor"), onsetdate = structure(c(4L,
4L, 4L, 1L, 1L, 4L), .Label = c("18-Apr", "18-Jun", "18/4",
"19-Apr"), class = "factor"), onsettime = c(30L, 30L, 30L,
2230L, 2230L, 200L), baked_hamburgur = structure(c(2L, 2L,
2L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"),
spinach = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("no",
"yes"), class = "factor"), mashed_potato = structure(c(2L,
2L, 1L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"),
cabbages = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("no",
"yes"), class = "factor"), jello = structure(c(1L, 1L, 1L,
2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"),
rolls = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("no",
"yes"), class = "factor"), brown = structure(c(1L, 1L, 1L,
1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"),
milk = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no",
"yes"), class = "factor"), coffee = structure(c(2L, 2L, 2L,
1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"),
water = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("no",
"yes"), class = "factor"), cakes = structure(c(1L, 1L, 2L,
1L, 1L, 1L), .Label = c("no", "yes"), class = "factor"),
vanilla = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no",
"yes"), class = "factor"), chocolate = structure(c(1L, 2L,
2L, 1L, 1L, 2L), .Label = c("no", "yes"), class = "factor"),
fruitsalad = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no",
"yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001"
), class = "factor")), .Names = c("Age", "sex", "timesupper",
"ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach",
"mashed_potato", "cabbages", "jello", "rolls", "brown", "milk",
"coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad"
), row.names = c(NA, 6L), class = "data.frame")
A complete dput command output is as under:
> dput(illness_data)
structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L,
36L, 8L, 11L, 7L, 24L, 10L, 8L, 35L, 34L, 6L, 22L, 39L, 12L,
9L, 36L, 17L, 9L, 20L, 37L, 27L, 32L, 30L, 21L, 24L, 3L, 18L,
33L, 16L, 5L, 31L, 28L, 14L, 19L, 38L, 2L, 4L, 23L, 1L, 18L,
15L), .Label = c("10", "106", "11", "12", "14", "15", "16", "17",
"18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28",
"30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43",
"44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"),
sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L, 3L, 3L, 2L, 3L,
3L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 1L, 3L, 3L, 3L,
2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 2L, 3L, 2L, 3L), .Label = c("-1", "Female", "Male"
), class = "factor"), timesupper = c(2000L, 1830L, 1830L,
1930L, 1930L, 1930L, 2200L, 1900L, 1930L, NA, NA, NA, NA,
2200L, NA, NA, NA, 2200L, NA, NA, 2200L, 2200L, NA, NA, 2200L,
NA, NA, NA, NA, NA, 1900L, NA, 1100L, NA, NA, NA, 2200L,
1930L, 1930L, 2200L, NA, NA, 1930L, 1930L, NA, NA), ill = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = "yes", class = "factor"), onsetdate = structure(c(4L,
4L, 4L, 1L, 1L, 4L, 4L, 2L, 4L, 4L, 4L, 1L, 1L, 4L, 1L, 3L,
1L, 4L, 1L, 1L, 4L, 4L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 1L, 4L, 1L
), .Label = c("18-Apr", "18-Jun", "18/4", "19-Apr"), class = "factor"),
onsettime = c(30L, 30L, 30L, 2230L, 2230L, 200L, 100L, 2300L,
200L, 1030L, 30L, 2215L, 2200L, 100L, 2300L, 2145L, 2145L,
100L, 2300L, 2100L, 100L, 100L, 2115L, 2330L, 100L, 2130L,
230L, 200L, 2130L, 30L, 100L, 2230L, 1500L, 2400L, 2300L,
2230L, 100L, 230L, 2330L, 100L, 30L, 30L, 100L, 2400L, 215L,
2300L), baked_hamburgur = structure(c(2L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("no",
"yes"), class = "factor"), spinach = structure(c(2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("no",
"yes"), class = "factor"), mashed_potato = structure(c(2L,
2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), cabbages = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L
), .Label = c("no", "yes"), class = "factor"), jello = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), rolls = structure(c(2L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), brown = structure(c(1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), milk = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("no", "yes"), class = "factor"), coffee = structure(c(2L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), water = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L
), .Label = c("no", "yes"), class = "factor"), cakes = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), vanilla = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("no", "yes"), class = "factor"), chocolate = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, NA, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L
), .Label = c("no", "yes"), class = "factor"), fruitsalad = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L
), .Label = c("no", "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001"
), class = "factor")), .Names = c("Age", "sex", "timesupper",
"ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach",
"mashed_potato", "cabbages", "jello", "rolls", "brown", "milk",
"coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad"
), class = "data.frame", row.names = c(NA, -46L))
R has correctly read these columns as Factor type variable(s).
Now, each of these columns corresponds to a food item that an ill patient in a hospital has consumed.
I'd like to know the most consumed food item by ill patients, using R.
Kindly advise a good way to do so. Thanks!
Note: I have not tried any options other than the ones mentioned at the URL below, and I could not make those work.
Count Factor Columns Using R
Since all the rows have ill = 'yes' we can count in each column the number of values with yes in it. A base R approach could be :
# Count the "yes" answers in each food column (columns 7 to 20) and list the
# five most consumed items. na.rm = TRUE matters here: `chocolate` contains an
# NA, and without it that column's total is NA, which sort() then silently
# drops from the ranking.
head(
  sort(colSums(illness_data[7:20] == "yes", na.rm = TRUE), decreasing = TRUE),
  5
)
#        vanilla baked_hamburgur           cakes         spinach       chocolate
#             43              29              27              26              25
I selected columns 7 to 20 because those are the only columns that contain food items. I also kept only the top 5 values; you can show a different number by changing the 5 in the head() call.
I am not completely sure what you are looking for, but this will calculate how often foods are consumed (using the tidyverse package):
library(tidyverse)
# Recode every non-excluded column to 1 where the answer is "yes" and 0 for any
# other non-missing answer, then total each food column; missing answers are
# skipped by na.rm = TRUE.
illness_data_summed <- illness_data %>%
  mutate_at(
    vars(-Age, -sex, -timesupper, -onsetdate, -onsettime),
    function(answer) ifelse(answer == "yes", 1, 0)
  ) %>%
  summarise_at(
    vars(-Age, -sex, -timesupper, -onsetdate, -onsettime, -ill),
    function(eaten) sum(eaten, na.rm = TRUE)
  )
# Keep only the column(s) whose total equals the largest total.
illness_data_summed[which(illness_data_summed == max(illness_data_summed))]
So first I convert yes into 1 and no into 0, so that the sum of a column is the number of times that specific food was eaten. I do this for all columns except those you are not interested in (indicated by the - in vars), but you can also reverse that if desirable (e.g., when the number of variables you want to convert is lower than the number you do not want to convert).
The last part will result in:
vanilla
1 43
Related
For my experiment, I have 3 independent variables: trial type, sex and gaming experience (all of which are categorical).
I have one dependent variable: proportion of correct trials (which is continuous).
When I tried running a 3-way ANOVA, the assumptions were not met, and so I used an aligned-rank transformation ANOVA.
# Aligned-rank-transform model of Proportioncorrect on the three factors, with
# every two-way interaction and the three-way interaction spelled out.
# NOTE(review): the doubled `+ +` before Trialtype parses as a unary plus, so it
# looks like a harmless typo rather than the cause of the error below - confirm.
m1 <- art(Proportioncorrect ~ Videogamefrequency + Biologicalsex + + Trialtype + Videogamefrequency:Biologicalsex + Videogamefrequency:Trialtype + Biologicalsex:Trialtype + Biologicalsex:Trialtype:Videogamefrequency, data = Gaming)
The model gave me the error:
Error in Anova.III.lm(mod, error, singular.ok = singular.ok, ...) :
there are aliased coefficients in the model
Could anyone give me a helping hand?
My data is here:
structure(list(ID = c("P_200214123342", "P_200224092247", "P_200219163622",
"P_200220130332", "P_200219091823", "P_200225184226", "P_200219123120",
"P_200219175102", "P_200214103155", "P_200219111605", "P_200217101213",
"P_200219102411", "P_200221101028", "P_200220145557", "P_200225171612",
"P_200224092247", "P_200219163622", "P_200220130332", "P_200214123342",
"P_200219091823", "P_200225184226", "P_200219123120", "P_200219175102",
"P_200214103155", "P_200219111605", "P_200217101213", "P_200219102411",
"P_200221101028", "P_200220145557", "P_200225171612"), Trialtype = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Beaconed",
"Probe"), class = "factor"), Proportioncorrect = c(0.729727660699102,
1.33933990048532, 0.729727660699102, 1.075862200454, 0.578378233982015,
1.16808048521424, 1.33933990048532, 1.13531397797248, 1.28700221758657,
1.13531397797248, 1.28700221758657, 1.13531397797248, 1.28700221758657,
1.28700221758657, 1.20358829695229, 0.297711691252463, 0.160690652951911,
0.147197653346961, 0.0667161517509908, 0.080085580033659, 0.160690652951911,
0.133731586046578, 0.214985569478799, 0.160690652951911, 0.269932799291976,
0.339836905918588, 0.242365851038963, 0.214985569478799, 0.677268408841807,
1.20358829695229), Videogamefrequency = structure(c(2L, 1L, 1L,
1L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L), .Label = c("Monthly",
"Never", "Weekly", "Yearly"), class = "factor"), Biologicalsex = structure(c(1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("Female",
"Male"), class = "factor")), row.names = c(NA, -30L), class = "data.frame")
I did a plot explaining occurrences of each modality for many variables.
It is about clustering problem to show which variables are explaining each cluster.
So
> dput(DATA1[1:20,])
structure(list(TYPE_PEAU = structure(c(1L, 2L, 1L, 3L, 1L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L), .Label = c("Sèche",
"Mixte", "Normale", "Grasse"), class = "factor"), SENSIBILITE = structure(c(2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 1L), .Label = c("Aucune", "Fréquente", "Occasionnelle"
), class = "factor"), IMPERFECTIONS = structure(c(2L, 2L, 3L,
2L, 3L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 2L,
3L), .Label = c("Fréquente", "Occasionnelle", "Rares"), class = "factor"),
BRILLANCE = structure(c(3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L), .Label = c("Aucune",
"Partout", "Zone T"), class = "factor"), GRAIN_PEAU = structure(c(1L,
2L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 1L, 2L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L), .Label = c("Fin", "Moyen", "Dilaté"), class = "factor"),
RIDES_VISAGE = structure(c(3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L), .Label = c("Aucune",
"Très visibles", "Visibles"), class = "factor"), ALLERGIES = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Non", "Oui"), class = "factor"),
MAINS = structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 3L, 2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("Sèches",
"Normales", "Moites"), class = "factor"), PEAU_CORPS = structure(c(3L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 3L, 2L, 3L,
2L, 3L, 3L, 1L), .Label = c("Normale", "Sèche", "Très sèche"
), class = "factor"), INTERET_ALIM_NATURELLE = structure(c(3L,
1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), INTERET_ORIGINE_GEO = structure(c(3L,
1L, 1L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 1L,
3L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), INTERET_VACANCES = structure(c(1L,
1L, 2L, 2L, 3L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 2L), .Label = c("À la mer", "À la montagne",
"En ville"), class = "factor"), INTERET_ENVIRONNEMENT = structure(c(3L,
1L, 1L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), INTERET_COMPOSITION = structure(c(3L,
1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor"), PRIORITE_1 = structure(c(1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L,
3L, 1L), .Label = c("éclatante", "hydratée", "lisse", "matifiée",
"nourrie", "purifiée", "reposée"), class = "factor"), MILIEU_RESIDENCE = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 2L), .Label = c("nature", "urbain"), class = "factor")), .Names = c("TYPE_PEAU",
"SENSIBILITE", "IMPERFECTIONS", "BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE",
"ALLERGIES", "MAINS", "PEAU_CORPS", "INTERET_ALIM_NATURELLE",
"INTERET_ORIGINE_GEO", "INTERET_VACANCES", "INTERET_ENVIRONNEMENT",
"INTERET_COMPOSITION", "PRIORITE_1", "MILIEU_RESIDENCE"), row.names = c(1L,
2L, 11L, 13L, 15L, 16L, 17L, 20L, 23L, 32L, 33L, 34L, 37L, 38L,
39L, 40L, 42L, 43L, 45L, 48L), class = "data.frame")
Then I use this code:
library(tidyverse)
# One bar-chart panel per variable: reshape DATA1 to long form (k = variable
# name, v = answer), count each answer, and flip the bars to run horizontally.
ggplot(gather(DATA1, k, v), aes(v)) +
  geom_bar(fill = "orange", width = 0.7) +
  facet_wrap(~k) +
  coord_flip()
Then I get as result this plot:
But as you can see, the labels on the vertical axis are not legible.
How can I resolve this issue?
You can try to resize the text:
# Same faceted bar chart as in the question, with the category labels on the
# vertical axis drawn in bold black at size 4.
ggplot(gather(DATA1, k, v), aes(v)) +
  geom_bar(fill = "orange", width = 0.7) +
  coord_flip() +
  facet_wrap(~k) +
  theme(axis.text.y = element_text(face = "bold", color = "black", size = 4))
And/or abbreviate the labels with:
# Append this to the plot call above. The categories are mapped to x (aes(v)),
# so scale_x_discrete() is the scale to adjust even though coord_flip() draws
# them on the vertical axis; abbreviate() shortens each label.
+ scale_x_discrete(labels = abbreviate)
I have several dataframes with data from the same survey. I want to combine them for analysis. The dataframes contain both unique variables and two variables (ID and Contest_no) that are shared across all the dataframes; the two shared variables contain information about the respondent and the contest number (1,2,3, as respondents were asked the same questions three times).
The difficulty is that the dataframes have missing values:
# First survey frame: complete, one row per respondent (ID) and contest.
# Character columns are converted to factors by stringsAsFactors = TRUE.
DF1 <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)
# Second survey frame: has no row for respondent x1 in contest 2.
# Character columns are converted to factors by stringsAsFactors = TRUE.
DF2 <- data.frame(
  V3 = c("A", "C", "D"),
  V4 = c("A", "C", "D"),
  ID = c("x1", "y2", "y2"),
  Contest_no = c("1", "1", "2"),
  stringsAsFactors = TRUE
)
# Third survey frame: has no row for respondent y2 in contest 2.
# Character columns are converted to factors by stringsAsFactors = TRUE.
DF3 <- data.frame(
  V5 = c("A", "B", "C"),
  V6 = c("A", "B", "C"),
  ID = c("x1", "x1", "y2"),
  Contest_no = c("1", "2", "1"),
  stringsAsFactors = TRUE
)
As a result, respondent IDs and contest numbers aren't aligned across the dataframes. I want to match the data on respondent ID and contest number so that the merged dataframe looks like this:
# Desired result: one row per respondent/contest pair, with NA wherever a
# source frame had no matching row. Character columns become factors.
DF_merged <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  V3 = c("A", NA, "C", "D"),
  V4 = c("A", NA, "C", "D"),
  V5 = c("A", "B", "C", NA),
  V6 = c("A", "B", "C", NA),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)
I thought that full_join would do the trick, but DF_merged <- full_join(DF1, DF2, DF3, by="ID") gives me nonsensical results.
How can disparate data like this be combined?
New, updated example (to address the problem of multiplied rows). In this example there are no missing values at all, and both dataframes have the same number of rows, but the code results in multiplied rows. First, the two dataframes to be merged:
df1:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L,
2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234",
"V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
df2:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L,
3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252",
"V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
and now the unsuccessful attempt to combine the two dataframes:
# Attempted merge, keyed on respondent ID and contest number only.
df_merge_attempt <- dplyr::full_join(
  x = df1,
  y = df2,
  by = c("ID", "Contest_no")
)
results in:
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
), class = "factor"), Attribute1.x = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor"), Option.y = structure(c(1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L),
Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L,
2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
), class = "factor"), Attribute1.y = structure(c(2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), class = "data.frame", row.names = c(NA,
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x",
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x",
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y",
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))
You can try dplyr::full_join with by=c("ID","Contest_no") argument as:
library(dplyr)
# Full-join the three frames one after another on the shared respondent and
# contest keys, so that key combinations missing from a frame are kept and
# filled with NA. The merged frame is then printed.
df1 <- Reduce(
  function(left, right) full_join(left, right, by = c("ID", "Contest_no")),
  list(DF1, DF2, DF3)
)
df1
# V1 V2 V3 V4 V5 V6 ID Contest_no
#1 A A A A A A x1 1
#2 B B <NA> <NA> B B x1 2
#3 C C C C C C y2 1
#4 D D D D <NA> <NA> y2 2
Updated: Answer has been modified to consider another column Option in full_join as:
# Full outer join of DF1 and DF2 keyed on ID, Contest_no and Option.
df1 <- DF1 %>%
  full_join(DF2, by = c("ID", "Contest_no", "Option"))
Note: I had to tweak my dplyr to match what is suggested by @Gregor in order to get the expected result.
I have some data structured in the same way as the following:
structure(list(respectfromsuperior = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, NA, 2L, 1L, 1L, 1L, 1L, 2L), .Label = c("agree",
"disagree"), class = "factor"), respectideserve = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("agree",
"disagree"), class = "factor"), undesirablechange = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, NA, 2L, 2L,
2L, 2L, 2L, 1L, 1L, NA, 1L, 2L, 1L, 2L, 2L, 2L), .Label = c("agree",
"disagree"), class = "factor"), jobsecuritypoor = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("agree",
"disagree"), class = "factor"), promotionprospectsadequate = structure(c(2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L), .Label = c("agree",
"disagree"), class = "factor"), salaryadequate = structure(c(2L,
1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("agree",
"disagree"), class = "factor"), branch = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Edinburgh",
"Head Office", "Manchester"), class = "factor")), .Names = c("respectfromsuperior",
"respectideserve", "undesirablechange", "jobsecuritypoor", "promotionprospectsadequate",
"salaryadequate", "branch"), class = "data.frame", row.names = c(1L,
2L, 4L, 6L, 10L, 11L, 13L, 15L, 16L, 17L, 19L, 20L, 22L, 23L,
25L, 27L, 29L, 30L, 32L, 33L, 34L, 35L, 39L, 40L, 41L, 42L, 43L,
44L, 45L))
I would like to use ggplot2 to plot a bar graph with the following features:
the bars representing percentage of respondents who agree with
statements in columns 2:6 of the data (disagree not plotted). Percentage calculated as a
percentage of branch members (not as percentage of total
respondents)
bars grouped by branch on the x axis
the questions (columns 2:6) are used as the 'Fill' argument
I've tried playing around with the code below but have not been able to work it out:
# Reshape to long format, keeping branch as the identifier column.
data.r <- melt(rewitemsbr, id.vars = "branch")
# Count the answers, filling by question and dodging the bars side by side.
ggplot(data.r, aes(x = value, fill = variable)) +
  geom_bar(stat = "count", position = "dodge")
this is the best I've come up with:
Any help would be very much appreciated, thank you.
You can try the following.
# get the stats using aggregate
# Share of "agree" answers per branch for each question (columns 1:6).
# NA answers are dropped from the numerator but still counted by length(x),
# so each value is a fraction of all rows in the branch, not of respondents.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
res <- aggregate(
  d[, 1:6],
  by = list(d$branch),
  FUN = function(x) sum(x == "agree", na.rm = TRUE) / length(x)
)
res
Group.1 respectfromsuperior respectideserve undesirablechange jobsecuritypoor promotionprospectsadequate salaryadequate
1 Edinburgh 1.0 0.8888889 0.1111111 0.0 0.6666667 0.4444444
2 Head Office 0.7 0.3000000 0.4000000 0.2 0.2000000 0.0000000
3 Manchester 0.8 0.8000000 0.2000000 0.1 0.6000000 0.2000000
# to long format
library(reshape2)
# ggplot2 must be attached as well, otherwise ggplot() is not found
# in a fresh session.
library(ggplot2)
# One row per (branch, question) pair; `value` holds the share that agree.
res_long <- melt(res, id.vars = "Group.1")
# plot
# Branches on the x axis, one dodged bar per question. geom_col() draws the
# precomputed values directly (same as geom_bar(stat = "identity")).
ggplot(data = res_long, aes(x = Group.1, y = value, fill = variable)) +
  geom_col(position = position_dodge())
I have been assigned the task of making a prediction model. The data set given to me is purely categorical and consists of 92 variables. A portion of it is given below:
Dataset <- structure(list(Age.Group = structure(c(1L, 2L, 3L, 3L, 4L, 4L,
4L, 1L, 4L, 4L, 2L, 1L, 2L, 5L, 3L, 2L, 1L, 4L, 1L, 4L, 4L, 3L,
4L, 2L, 2L, 1L, 4L, 2L, 3L, 2L, 4L, 4L, 3L, 3L, 3L, 3L, 5L, 3L,
2L, 2L, 2L, 2L, 4L, 2L, 3L, 4L, 3L, 3L, 1L, 4L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), Sex = structure(c(2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
1L), .Label = c("Female", "Male"), class = "factor"), LOS = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L), .Label = c("Abnormal", "Normal"), class = "factor"), Day.to.Operation = structure(c(1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
1L), .Label = c("Abnormal", "Normal"), class = "factor"), Admit.Source = structure(c(2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Emergency", "Outpatient clinic"), class = "factor"),
Insurance.Payors = structure(c(3L, 1L, 3L, 3L, 1L, 1L, 1L,
3L, 1L, 3L, 1L, 3L, 1L, 1L, 5L, 1L, 1L, 2L, 1L, 5L, 1L, 5L,
1L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 5L, 1L, 1L, 1L, 5L, 5L,
1L, 1L, 1L, 1L, 1L, 3L, 5L, 1L, 1L, 1L, 1L, 3L, 4L), .Label = c("Basic medical insurance for urban residents",
"Basic medical insurance for urban residents Others", "Free Medical Care",
"New Rural Cooperative Medical Care", "Self payment"), class = "factor"),
Current.Recent.Smoker...1.year. = structure(c(1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L), .Label = c("No", "Yes"), class = "factor"), Hypertension = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
Dyslipidemia = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), Family.History.of.Premature.CAD = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
MI.History = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), Heart.Failure.History = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
PCI.History = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), BMI.Group = structure(c(3L, 2L,
3L, 2L, 3L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 3L,
3L, 3L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 3L,
3L, 4L, 2L), .Label = c("2", "3", "4", "5"), class = "factor"),
Cerebrovascular.Disease = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("No", "Yes"), class = "factor"), Peripheral.Arterial.Disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Chronic.Lung.Disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), Diabetes.Mellitus = structure(c(2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
Diabetes.Therapy = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 3L, 4L, 2L, 4L, 4L, 1L, 2L, 4L, 4L, 4L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L,
2L, 4L, 4L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 4L, 4L, 2L), .Label = c("Diet",
"Insulin", "N/A", "Oral"), class = "factor"), Heart.Rate = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 2L), .Label = c("Abnormal", "Normal"), class = "factor"),
CAD.Presentation = structure(c(3L, 5L, 5L, 4L, 5L, 5L, 4L,
1L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 1L, 5L, 5L, 5L, 3L, 5L, 5L,
5L, 1L, 5L, 5L, 5L, 5L, 5L, 3L, 4L, 1L, 5L, 5L, 5L, 5L, 3L,
5L, 4L, 3L, 5L, 4L, 5L, 5L, 2L, 5L, 5L, 3L, 1L, 1L), .Label = c("Non STEMI 7 days",
"Silent myocardial ischemia 14 days", "Stable angina 42 days",
"STEMI 7 days", "Unstable angina 60 days"), class = "factor"),
STEMI.Non.STEMI.Onset.Date = structure(c(1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L), .Label = c("0", "1", "17"), class = "factor"), STEMI.Non.STEMI.Estimated.Time = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Anginal.Classification.w.in.2.Weeks = structure(c(2L, 4L,
3L, 5L, 1L, 5L, 4L, 1L, 5L, 4L, 5L, 2L, 2L, 3L, 1L, 1L, 2L,
5L, 5L, 3L, 2L, 5L, 2L, 2L, 2L, 4L, 1L, 2L, 3L, 5L, 2L, 4L,
3L, 5L, 4L, 4L, 5L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 5L, 2L, 3L,
2L, 1L, 2L), .Label = c("CCS I", "CCS II", "CCS III", "CCS IV",
"No symptoms"), class = "factor"), Anti.Anginal.Drug.Therapy.within.2.Weeks = structure(c(2L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor")), .Names = c("Age.Group",
"Sex", "LOS", "Day.to.Operation", "Admit.Source", "Insurance.Payors",
"Current.Recent.Smoker...1.year.", "Hypertension", "Dyslipidemia",
"Family.History.of.Premature.CAD", "MI.History", "Heart.Failure.History",
"PCI.History", "BMI.Group", "Cerebrovascular.Disease", "Peripheral.Arterial.Disease",
"Chronic.Lung.Disease", "Diabetes.Mellitus", "Diabetes.Therapy",
"Heart.Rate", "CAD.Presentation", "STEMI.Non.STEMI.Onset.Date",
"STEMI.Non.STEMI.Estimated.Time", "Anginal.Classification.w.in.2.Weeks",
"Anti.Anginal.Drug.Therapy.within.2.Weeks"), class = "data.frame", row.names = c(NA,
-50L))
So far I have performed the string cleaning and missing-data treatment. I need help with my next task, which is to remove outliers and compute a chi-square matrix from this categorical dataset. I am new to data analysis and am quite confused at this point. I would be extremely grateful if I could get help regarding this.