Related
I have to draw a bar chart in R ggplot2 with multiple variables (i.e each bar for BMI, weight, cholesterol, Blood pressure etc) in each group ( i.e. different populations ex: Indian, Korean, Philipinos etc.) But the bars are overflowing to the next group in the axis. for example: the bars of the Indian group is overflowing to Korean group. The axis marks are not adjusted accordingly. I have attached the figure .. can someone please help. Following is my code. dput(data) is also given.
p = ggplot(data = t,
aes(x = factor(Population, levels = names(sort(table(Population), increasing = TRUE))),
y = Snp_Count,
group = factor(Trait, levels = c("BMI", "DBP", "HDL", "Height", "LDL", "TC", "TG", "WC", "Weight"),
ordered = TRUE)))
p = p + geom_bar(aes(fill = Trait),
position = position_dodge(preserve = "single"),
stat = "identity") +
scale_fill_manual(values = c("#28559A", "#3EB650", "#E56B1F", "#A51890", "#FCC133", "#663300", "#6666ff", "#ff3300", "#ff66ff")) +
coord_flip()
structure(list(Trait = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("BMI",
"DBP", "HDL", "HT", "LDL", "TC", "TG", "WC", "Weight"), class = "factor"),
Association = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "Direct", class = "factor"), TraitClass = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Anthropometric",
"BP", "Lipid"), class = "factor"), Population = structure(c(2L,
3L, 4L, 5L, 7L, 8L, 10L, 11L, 12L, 13L, 22L, 24L, 3L, 5L,
11L, 22L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 18L, 20L, 28L, 5L, 7L, 13L, 14L, 1L, 3L, 5L, 7L,
9L, 11L, 12L, 16L, 18L, 20L, 22L, 5L, 6L, 7L, 10L, 12L, 18L,
20L, 3L, 5L, 6L, 7L, 8L, 11L, 12L, 13L, 14L, 15L, 18L, 19L,
20L, 21L, 22L, 23L, 26L, 28L, 3L, 4L, 5L, 8L, 12L, 22L, 24L,
3L, 5L, 7L, 8L, 17L, 25L, 27L), .Label = c("ACB", "AFR",
"ASW", "ASW/ACB", "CEU", "CHB", "EAS", "Filipino", "FIN",
"GBR", "Hispanic", "Hispanic/Latinos", "JPT", "Korean", "Kuwaiti",
"Micronesian", "Moroccan", "MXL", "Mylopotamos", "Orcadian",
"Pomak", "SAS", "Saudi_Arabian", "Seychellois", "Surinamese",
"Taiwanese", "Turkish", "YRI"), class = "factor"), Snp_Count = c(3L,
12L, 6L, 17L, 2L, 10L, 1L, 6L, 3L, 3L, 10L, 6L, 1L, 1L, 1L,
1L, 2L, 1L, 10L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 3L, 1L, 1L,
2L, 1L, 2L, 20L, 5L, 4L, 1L, 1L, 2L, 7L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 8L, 2L, 4L, 3L, 1L, 2L, 1L, 4L, 20L, 5L,
11L, 2L, 4L, 3L, 4L, 2L, 3L, 4L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 3L, 2L, 4L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L
), Gene_Count = c(3L, 9L, 7L, 9L, 2L, 8L, 1L, 7L, 3L, 2L,
8L, 7L, 1L, 1L, 1L, 1L, 2L, 1L, 4L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 9L, 6L, 5L, 1L, 1L, 2L, 5L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 6L, 2L, 3L, 3L, 1L, 2L, 1L, 3L,
10L, 4L, 7L, 1L, 3L, 3L, 4L, 1L, 3L, 5L, 1L, 1L, 1L, 3L,
3L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 3L, 2L, 3L, 3L, 2L, 3L, 2L,
2L, 2L)), class = "data.frame", row.names = c(NA, -86L))
The total width of each group in your barchart is 0.9 by default, which means that 90% of the area is covered. When you increase the width of the individual bars to 3 they will overlap with other groups, the maximum value for with should thus be 1 and then it will touch the other groups.
I'd suggest in your situation to use facet_wrap instead of a dodged barchart.
Note: geom_col is the same as geom_bar(stat = "identity).
my.df$Trait <- factor(my.df$Trait, levels = c("BMI", "DBP", "HDL", "HT", "LDL", "TC", "TG", "WC", "Weight"))
my.df$Population <- factor(my.df$Population, levels = names(sort(table(my.df$Population), increasing = TRUE)))
ggplot(my.df, aes(x = Trait, y = Snp_Count, fill = Trait)) +
geom_col(width = 1) +
scale_fill_manual(values = c("#28559A", "#3EB650", "#E56B1F", "#A51890", "#FCC133", "#663300", "#6666ff", "#ff3300", "#ff66ff")) +
# Split the data by Population, allow flexible scales and spacing for y axis (Trait)
facet_grid(Population ~ ., scales = "free_y", space = "free_y", switch = "y") +
coord_flip() +
theme(axis.text.y = element_blank(), # Remove Trait labels (indicated by color)
axis.ticks.y = element_blank(), # Remove tick marks
strip.background = element_blank(),
strip.text.y = element_text(angle = 180, hjust = 1), # Rotate Population labels
panel.spacing.y = unit(3, "pt")) # Spacing between groups
Data
my.df <-
structure(list(Trait = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L),
.Label = c("BMI", "DBP", "HDL", "HT", "LDL", "TC", "TG", "WC", "Weight"), class = "factor"),
Population = structure(c(2L, 3L, 4L, 5L, 7L, 8L, 10L, 11L,
12L, 13L, 22L, 24L, 3L, 5L, 11L, 22L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 20L, 28L, 5L,
7L, 13L, 14L, 1L, 3L, 5L, 7L, 9L, 11L, 12L, 16L, 18L, 20L,
22L, 5L, 6L, 7L, 10L, 12L, 18L, 20L, 3L, 5L, 6L, 7L, 8L,
11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 21L, 22L, 23L, 26L,
28L, 3L, 4L, 5L, 8L, 12L, 22L, 24L, 3L, 5L, 7L, 8L, 17L,
25L, 27L),
.Label = c("ACB", "AFR", "ASW", "ASW/ACB", "CEU",
"CHB", "EAS", "Filipino", "FIN", "GBR", "Hispanic", "Hispanic/Latinos",
"JPT", "Korean", "Kuwaiti", "Micronesian", "Moroccan", "MXL",
"Mylopotamos", "Orcadian", "Pomak", "SAS", "Saudi_Arabian",
"Seychellois", "Surinamese", "Taiwanese", "Turkish", "YRI"), class = "factor"),
Snp_Count = c(3L, 12L, 6L, 17L, 2L,
10L, 1L, 6L, 3L, 3L, 10L, 6L, 1L, 1L, 1L, 1L, 2L, 1L, 10L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 2L, 20L,
5L, 4L, 1L, 1L, 2L, 7L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 8L,
2L, 4L, 3L, 1L, 2L, 1L, 4L, 20L, 5L, 11L, 2L, 4L, 3L, 4L,
2L, 3L, 4L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 3L, 2L, 4L, 4L, 1L,
4L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -86L))
I am using the rms library and the lrm function to do a penalized logistic regression.
Just look to my data:
> dput(cs_data_train[1:50,])
structure(list(DataCRMSanoflore.Year_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.HOURS_INSCR = c(14L,
18L, 17L, 16L, 11L, 22L, 23L, 17L, 9L, 21L, 18L, 19L, 12L, 11L,
17L, 16L, 21L, 20L, 14L, 19L, 22L, 17L, 22L, 13L, 19L, 13L, 21L,
16L, 23L, 19L, 11L, 21L, 11L, 22L, 20L, 13L, 11L, 17L, 15L, 12L,
15L, 21L, 17L, 14L, 10L, 17L, 10L, 12L, 18L, 13L), DataCRMSanoflore.Month_Sales = structure(c(9L,
2L, 5L, 9L, 4L, 7L, 3L, 9L, 7L, 12L, 3L, 3L, 12L, 3L, 3L, 6L,
3L, 4L, 5L, 8L, 8L, 1L, 4L, 10L, 9L, 5L, 4L, 9L, 2L, 12L, 9L,
4L, 4L, 3L, 6L, 8L, 6L, 4L, 12L, 5L, 6L, 9L, 7L, 9L, 1L, 9L,
7L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05", "06",
"07", "08", "09", "10", "11", "12"), class = "factor"), DataCRMSanoflore.Date_Sales = structure(c(3L,
10L, 22L, 23L, 26L, 13L, 12L, 2L, 25L, 11L, 10L, 9L, 4L, 10L,
18L, 9L, 9L, 1L, 14L, 24L, 4L, 2L, 2L, 22L, 17L, 4L, 14L, 22L,
2L, 5L, 29L, 13L, 2L, 10L, 25L, 5L, 10L, 1L, 6L, 20L, 7L, 9L,
1L, 3L, 17L, 22L, 3L, 9L, 20L, 13L), .Label = c("01", "02", "03",
"04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31"), class = "factor"), DataCRMSanoflore.HOURS_INSCR.1 = c(14L,
18L, 17L, 16L, 11L, 22L, 23L, 17L, 9L, 21L, 18L, 19L, 12L, 11L,
17L, 16L, 21L, 20L, 14L, 19L, 22L, 17L, 22L, 13L, 19L, 13L, 21L,
16L, 23L, 19L, 11L, 21L, 11L, 22L, 20L, 13L, 11L, 17L, 15L, 12L,
15L, 21L, 17L, 14L, 10L, 17L, 10L, 12L, 18L, 13L), DataCRMSanoflore.Year_Creation_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.Month_Creation_Sales = structure(c(9L,
2L, 10L, 10L, 9L, 7L, 12L, 9L, 7L, 12L, 3L, 4L, 2L, 6L, 3L, 6L,
10L, 4L, 5L, 8L, 3L, 1L, 4L, 11L, 9L, 5L, 4L, 9L, 2L, 12L, 10L,
4L, 4L, 3L, 10L, 8L, 6L, 4L, 12L, 8L, 6L, 2L, 10L, 5L, 1L, 9L,
8L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05", "06",
"07", "08", "09", "10", "11", "12"), class = "factor"), DataCRMSanoflore.Day_Creation_Sales = structure(c(11L,
15L, 2L, 31L, 26L, 23L, 5L, 2L, 25L, 16L, 10L, 13L, 7L, 3L, 18L,
9L, 8L, 27L, 18L, 24L, 6L, 2L, 4L, 16L, 17L, 12L, 15L, 22L, 10L,
5L, 1L, 14L, 2L, 10L, 5L, 5L, 10L, 25L, 6L, 5L, 28L, 8L, 10L,
18L, 17L, 22L, 31L, 9L, 21L, 22L), .Label = c("01", "02", "03",
"04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31"), class = "factor"), DataCRMSanoflore.Year_Validation_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.Month_Validation_Sales = structure(c(9L,
2L, 10L, 11L, 10L, 7L, 12L, 9L, 7L, 12L, 3L, 4L, 2L, 6L, 3L,
6L, 10L, 4L, 5L, 8L, 3L, 1L, 4L, 11L, 9L, 5L, 4L, 9L, 2L, 12L,
10L, 4L, 4L, 3L, 10L, 8L, 6L, 4L, 12L, 8L, 6L, 2L, 10L, 5L, 1L,
9L, 9L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05",
"06", "07", "08", "09", "10", "11", "12"), class = "factor"),
DataCRMSanoflore.Day_Validation_Sales = structure(c(14L,
16L, 3L, 3L, 1L, 27L, 6L, 5L, 27L, 21L, 19L, 27L, 8L, 5L,
21L, 10L, 9L, 30L, 26L, 27L, 7L, 4L, 15L, 17L, 18L, 13L,
20L, 29L, 11L, 7L, 2L, 16L, 3L, 20L, 6L, 6L, 13L, 29L, 8L,
6L, 30L, 9L, 12L, 20L, 18L, 29L, 1L, 10L, 23L, 25L), .Label = c("01",
"02", "03", "04", "05", "06", "07", "08", "09", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "24", "25", "26", "27", "28", "29", "30", "31"
), class = "factor"), DataCRMSanoflore.AGE_CUSTUMER = c(37L,
23L, 34L, 32L, 45L, 52L, 44L, 55L, 37L, 29L, 33L, 29L, 30L,
37L, 56L, 48L, 44L, 42L, 45L, 33L, 37L, 53L, 55L, 60L, 57L,
33L, 51L, 32L, 35L, 54L, 41L, 47L, 59L, 33L, 45L, 35L, 36L,
28L, 42L, 24L, 32L, 39L, 33L, 36L, 49L, 56L, 45L, 39L, 54L,
55L), DataCRMSanoflore.MEAN_PURCHASE = c(71.75, 50.7142857142857,
18.6666666666667, 0, 0, 54.7, 0.666666666666667, 38, 6.5,
0, 83.3333333333333, 44.3333333333333, 25.7777777777778,
24.1818181818182, 23.3846153846154, 35.5294117647059, 21.6363636363636,
1.125, 6, 8.66666666666667, 18.4, 16.9285714285714, 0, 0,
36.5, 21.5, 18.5714285714286, 28.125, 101.333333333333, 0,
2, 0, 20.9166666666667, 69.1428571428571, 16.6666666666667,
1.5, 87.1666666666667, 48.25, 13.3333333333333, 20.5833333333333,
12, 0, 23, 15.1428571428571, 0, 30.4375, 30.3076923076923,
24.625, 23.4285714285714, 20.0833333333333), DataCRMSanoflore.NUMBER_GIFTS = c(1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 3L,
4L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 3L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 3L, 1L, 3L, 1L, 4L, 1L, 1L, 1L,
2L, 5L, 2L, 2L), SENSIBILITE = c(4L, 4L, 1L, 3L, 1L, 1L,
2L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 3L, 4L, 1L, 1L, 1L, 4L,
1L, 1L, 4L, 1L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L,
1L, 4L, 1L, 3L, 2L, 1L, 3L, 4L, 1L, 1L, 4L, 3L, 1L, 4L),
IMPERFECTIONS = c(4L, 3L, 1L, 2L, 1L, 1L, 4L, 1L, 1L, 1L,
3L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L,
3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 3L, 1L, 2L,
3L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 1L, 3L), BRILLANCE = c(2L,
2L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 4L, 1L, 4L, 4L,
4L, 1L, 1L, 1L, 4L, 1L, 1L, 3L, 1L, 4L, 4L, 4L, 4L, 1L, 1L,
1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 1L,
4L, 4L, 1L, 4L), GRAIN_PEAU = c(4L, 4L, 1L, 4L, 1L, 1L, 2L,
1L, 1L, 1L, 4L, 1L, 2L, 1L, 2L, 4L, 4L, 1L, 1L, 1L, 3L, 1L,
1L, 2L, 1L, 2L, 4L, 4L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 4L, 4L, 1L, 2L, 4L, 1L, 1L, 4L, 3L, 1L, 4L), RIDES_VISAGE = c(2L,
2L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 2L, 1L, 4L, 2L,
4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 2L, 4L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 4L, 1L, 2L, 4L, 1L, 2L, 4L, 1L, 1L,
4L, 4L, 1L, 4L), ALLERGIES = c(2L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 3L, 2L, 1L, 2L), MAINS = c(4L,
4L, 1L, 4L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L,
3L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 3L, 4L, 4L, 3L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 4L, 3L, 1L, 3L, 4L, 1L, 1L,
3L, 3L, 1L, 4L), PEAU_CORPS = c(3L, 3L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 3L, 1L, 1L, 1L, 2L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
3L, 1L, 3L, 2L, 1L, 2L, 4L, 1L, 1L, 3L, 3L, 1L, 3L), INTERET_ALIM_NATURELLE = c(4L,
4L, 1L, 2L, 1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 4L, 4L, 1L, 4L, 2L, 1L, 1L,
4L, 2L, 1L, 2L), INTERET_ORIGINE_GEO = c(4L, 2L, 1L, 2L,
1L, 1L, 5L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 5L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 5L, 5L, 1L, 4L, 2L, 1L, 1L, 2L, 2L, 1L,
2L), INTERET_VACANCES = c(4L, 2L, 1L, 3L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 1L, 2L, 1L, 3L, 4L, 3L, 1L, 1L, 1L, 2L, 1L, 1L,
3L, 1L, 4L, 3L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 2L, 2L, 1L, 2L), INTERET_ENVIRONNEMENT = c(5L,
5L, 1L, 5L, 1L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L,
3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 5L, 1L, 5L, 3L, 1L, 1L,
3L, 5L, 1L, 3L), INTERET_COMPOSITION = c(2L, 2L, 1L, 4L,
1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 4L, 1L, 2L, 4L, 1L, 4L, 2L, 1L, 1L, 2L, 2L, 1L,
2L), DataCRMSanoflore.Nb_achats = c(4, 7, 3, 3, 4, 10, 3,
4, 14, 4, 6, 6, 9, 22, 26, 17, 22, 8, 3, 9, 10, 14, 3, 7,
12, 6, 14, 16, 3, 3, 3, 3, 12, 7, 3, 6, 6, 12, 18, 12, 15,
6, 21, 7, 6, 16, 13, 16, 14, 12), OUTCOME = structure(c(1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor")), .Names = c("DataCRMSanoflore.Year_Sales",
"DataCRMSanoflore.HOURS_INSCR", "DataCRMSanoflore.Month_Sales",
"DataCRMSanoflore.Date_Sales", "DataCRMSanoflore.HOURS_INSCR.1",
"DataCRMSanoflore.Year_Creation_Sales", "DataCRMSanoflore.Month_Creation_Sales",
"DataCRMSanoflore.Day_Creation_Sales", "DataCRMSanoflore.Year_Validation_Sales",
"DataCRMSanoflore.Month_Validation_Sales", "DataCRMSanoflore.Day_Validation_Sales",
"DataCRMSanoflore.AGE_CUSTUMER", "DataCRMSanoflore.MEAN_PURCHASE",
"DataCRMSanoflore.NUMBER_GIFTS", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "ALLERGIES", "MAINS",
"PEAU_CORPS", "INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO",
"INTERET_VACANCES", "INTERET_ENVIRONNEMENT", "INTERET_COMPOSITION",
"DataCRMSanoflore.Nb_achats", "OUTCOME"), row.names = c(22L,
33L, 40L, 48L, 54L, 59L, 74L, 78L, 87L, 89L, 104L, 115L, 121L,
141L, 159L, 161L, 163L, 165L, 196L, 202L, 211L, 222L, 272L, 300L,
318L, 325L, 327L, 349L, 374L, 380L, 392L, 393L, 394L, 398L, 427L,
440L, 449L, 456L, 470L, 477L, 479L, 490L, 505L, 508L, 514L, 520L,
528L, 531L, 534L, 543L), class = "data.frame")
Then when I want to fit the model using this code:
fit = lrm(OUTCOME ~ .-1,data = cs_data_train,x=T, y=T)
It gives an error:
singular information matrix in lrm.fit (rank= 148 ). Offending
variable(s): DataCRMSanoflore.HOURS_INSCR.1 Error in lrm(OUTCOME ~ .
- 1, data = cs_data_train, x = T, y = T) : Unable to fit model using “lrm.fit”
I searched but I could not resolve this issue. Thank you for your help!
EDIT:
As Said in the comment below. I need to remove one of each both correlated variables. So I write this code :
> highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=(0.7),verbose = FALSE)
> print(highlyCorrelated)
[1] 21 20 26 15 18 17 22 16 25 19 23 24 6 9 7 10 28 2
> important_var=colnames(DATA_BASE[,-highlyCorrelated])
> important_var
[1] "DataCRMSanoflore.Year_Sales" "DataCRMSanoflore.Date_Sales" "DataCRMSanoflore.HOURS_INSCR.1"
[4] "DataCRMSanoflore.Day_Creation_Sales" "DataCRMSanoflore.MEAN_PURCHASE" "OUTCOME"
> DATA_BASE<-DATA_BASE[,-highlyCorrelated]
> str(DATA_BASE)
'data.frame': 5775 obs. of 6 variables:
$ DataCRMSanoflore.Year_Sales : num 2 1 2 1 2 1 1 1 1 2 ...
$ DataCRMSanoflore.Date_Sales : num 13 3 10 22 23 26 13 1 12 2 ...
$ DataCRMSanoflore.HOURS_INSCR.1 : num 17 14 18 17 16 11 22 14 23 17 ...
$ DataCRMSanoflore.Day_Creation_Sales: num 13 11 15 2 31 26 23 1 5 2 ...
$ DataCRMSanoflore.MEAN_PURCHASE : num 0 71.8 50.7 18.7 0 ...
$ OUTCOME : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 2 1 1 ...
But I get then the same error
Error in lrm(OUTCOME ~ . - 1, data = train, x = T, y = T) : Unable
to fit model using “lrm.fit”
This really weird!
How can I resolve this please ?
I have the following
t <- structure(list(name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Alice", "Bob",
"Jane Doe", "John Doe"), class = "factor"), school = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Alice School",
"Bob School", "Someother School", "Someschool College"), class = "factor"),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
question = structure(c(2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L, 2L,
4L, 6L, 8L, 1L, 3L, 5L, 7L, 2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L,
2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L), .Label = c("q1", "q2", "q3",
"q4", "q5", "q6", "q7", "q8"), class = "factor"), mark = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
1L), subject = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("C", "M"), class = "factor")), .Names = c("name",
"school", "group", "question", "mark", "subject"), row.names = c(7L,
15L, 23L, 31L, 3L, 11L, 19L, 27L, 8L, 16L, 24L, 32L, 4L, 12L,
20L, 28L, 6L, 14L, 22L, 30L, 2L, 10L, 18L, 26L, 5L, 13L, 21L,
29L, 1L, 9L, 17L, 25L), class = "data.frame")
and I need to produce a data frame in which each student has one combined mark for each subject. The combination is simply a sum of the marks on each question. So, for example, Jane Doe will have 3 on subject C and 2 on subject M. I've been banging my head for long enough with Reduce and other approaches. I could possibly solve this in a very procedural way, but if I could do that with a one-liner (or close approximation), I'd be happier. I'm sure it can be done...
You said it in your question; you want to group_by student and subject and compute the sum
library(tidyverse)
asdf %>%
group_by(name, subject) %>%
summarise(score = sum(mark))
Here a data.table solution:
library(data.table)
setDT(t)[, sum(mark), by = list(name, subject)]
And just for completeness, base R:
aggregate(mark ~ name + subject, data=t, sum)
This says "aggregate the response variable mark by the grouping variables name and subject, using sum as the aggregation function".
I am currently trying to make a 'heat map' using ggplot2 to display a series of p-values, but can't figure out how to tailor the actual color assignments and legend.
sampledata.m <- melt(sampledata)
sampledata.m$var2 <- as.character(sampledata.m$var2)
sampledata.m$var2 <- factor(sampledata.m$var2, levels=unique(sampledata.m$var2),ordered=TRUE)
sampledata.m$var1 <- as.character(sampledata.m$var1)
sampledata.m$var1 <- factor(sampledata.m$var1, levels=unique(sampledata.m$var1),ordered=TRUE)
This was done so that I could maintain the order of my variables.
p <- ggplot(sampledata.m, aes(var2, var1)) +
geom_tile(aes(fill = value), colour = "transparent") +
scale_fill_gradientn(colours=c("light green","dark green", "black"),
values=rescale(c(0,0.0003,0.05,0.5,1)),limits=c(0,1)))
p + theme_bw(base_size = base_size) + labs(x = "", y = "") +
scale_x_discrete(expand = c(0,0)) +
theme(legend.position = "bottom", axis.ticks = element_blank(),
axis.text.x = element_text(size = base_size * 0.8, angle = 310,
hjust = 0, colour = "black"))
This creates a nice looking plot, however my legend and my color gradient don't represent the rescale that I assigned. Forgive my ignorance if this is a simple fix, but I've only been coding R for about 2 weeks now. Ideally, I would love my plot and legend to mimic the color scheme and legend labeling similar to this paper: http://www.ncbi.nlm.nih.gov/pubmed/22496159
structure(list(var1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L
), .Label = c("A", "B", "C",
"D", "E"), class = "factor"), var2 = structure(c(1L,
5L, 23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L,
3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L, 23L, 18L, 9L,
8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L, 3L, 19L, 16L,
4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L, 23L, 18L, 9L, 8L, 14L, 12L,
20L, 6L, 21L, 11L, 2L, 22L, 10L, 3L, 19L, 16L, 4L, 7L, 15L, 17L,
13L, 24L, 1L, 5L, 23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L,
2L, 22L, 10L, 3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L,
23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L,
3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L), .Label = c("1", "2",
"3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21",
"22", "23", "24"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "pvalue", class = "factor"),
value = c(0.810172671, 0.596026338, 0.076550169, 0.908670635,
0.300418653, 0.051553286, 0.124196482, 0.601568833, 0.058431468,
0.341726981, 0.876674726, 0.002698295, 0.812059425, 0.068199656,
0.758383287, 0.60362134, 0.89265723, 0.246111936, 0.156348035,
0.909574522, 0.020202377, 0.388843992, 0.769441835, 0.102272916,
0.38895717, 0.882296525, 0.792438683, 0.000491393, 0.004233434,
0.202424095, 0.426941568, 0.08520186, 0.763036306, 0.602828564,
0.037278697, 0.121642743, 0.669123606, 0.974328438, 0.834329923,
0.050413697, 0.078476666, 0.387647156, 0.000540422, 0.379576632,
0.361428444, 0.502439758, 0.001326035, 0.027652693, 0.188885638,
0.579244445, 0.471985778, 0.677458228, 0.119307242, 0.364857868,
0.238260538, 0.53472206, 0.204344281, 0.291888993, 0.295809688,
0.00029, 0.005476157, 0.960975822, 0.00029, 0.055915429,
0.618284682, 0.040605253, 0.521649682, 0.421086546, 0.164333061,
0.755528982, 0.306854182, 0.012832628, 0.270393143, 0.946675764,
0.59227376, 0.112658388, 0.429091426, 0.01662083, 0.017342483,
0.065817234, 0.012140224, 0.359828816, 0.031969725, 0.00029,
0.14555102, 0.18865081, 0.00029, 0.064107531, 0.505257768,
0.070224536, 0.017082975, 0.375864198, 0.00029, 0.104103689,
0.898979883, 0.004879605, 0.003597954, 0.036722932, 0.849058218,
0.00029, 0.003739938, 0.00029, 0.00029, 0.00029, 0.008179017,
0.193870353, 0.460181712, 0.389475522, 0.00029, 0.8785017,
0.070414642, 0.584977921, 0.990764677, 0.767253318, 0.002234906,
0.051331823, 0.00446149, 0.234477639, 0.275139791)), .Names = c("var1", "var2", "variable", "value"), row.names = c(NA, -119L), class = "data.frame")
I'm not going to get into all of the theme settings you've got - as I understand it the key of your problem is the scale of the fill gradient. You can set this in scale_fill_gradient() with a log transformation:
p <- ggplot(sampledata.m, aes(var2, var1)) +
geom_tile(aes(fill = value), colour = "transparent") +
scale_fill_gradient(trans = "log", low = "light green", high = "black",
breaks = c(0, 0.001, 0.05, 0.5))
dt <- data.frame(
N=letters[5:11],
a=c(0.01,0.05,0.1,0.5,1,5,10),
b=c(10,20,50,100,200,1000,2000))
dt.mlt <- melt(dt,variable.name="Cls",value.name="Val")
ggplot(dt.mlt,aes(x=N,y=Cls,fill=Val))+
geom_tile()+
scale_fill_gradient2(
low="green",high="red",mid="black",trans="log",breaks=c(0,0.01,0.1,1,10,100,1000))+
geom_text(data=dt.mlt,aes(x=N,y=Cls,label=Val))
But if I add the midpoint=10 to the scale_fill_gradient2, the picture will become:
I am running a negative binomial regression.
I would like to know why I have the following errors:
In sqrt(1/i) : NaNs produced
It appears that there are some negative values in "i", but how do I avoid that?
Another one is:
In loglik(n, th, mu, Y, w) : value out of range in 'lgamma'
It is probably a consequences of the first error, so if I fix the first one, the second might be gone. Or maybe not.
In some other cases I am able to calculate the regression but the following output seems strange for me:
(Dispersion parameter for Negative Binomial(10684331573) family taken
to be 1)
Null deviance: 8779.49 on 359 degrees of freedom
Residual deviance: 270.32 on 200 degrees of freedom
AIC: 2074.7
Number of Fisher Scoring iterations: 1
Theta: 10684331573
Std. Err.: 615849693813
2 x log-likelihood: -1752.749
Do these numbers seem okay? I mean the dispersion parameter, theta and standard error. They look enormously big to me and therefore I am not sure if the results are okay.
I never had any problems like that using poisson regression, but then I realized that I have an overdispersed data and that is why I am using negative binomial. However, I am having a lot of troubles with this one.
Here is the code:
negbin <- glm.nb(Freq ~ cluster*gender*agecombined*educ, maxit=100)
mod.good <- step(negbin, direction='both', maxit=100)
And here is the dput of the whole dataset:
structure(list(gender = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("1",
"2"), class = "factor"), agecombined = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L,
5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L,
2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L,
1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L,
3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L,
5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L,
1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L,
4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L,
6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L,
5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L,
2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L,
1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L,
3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L,
5L, 5L, 6L, 6L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L),
.Label = c("18-24", "25-34", "35-44", "45-54", "55-64", "65 and
older"), class = "factor"), educ = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), .Label =
c("2-year college", "BA", "Illiterate", "MA or higher", "Primary",
"Secondary"), class = "factor"),
cluster = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L), .Label = c("E", "A", "B", "C", "D"
), class = "factor"), Freq = c(27L, 18L, 48L, 29L, 18L, 19L,
14L, 10L, 2L, 1L, 2L, 0L, 48L, 36L, 69L, 54L, 33L, 15L, 12L,
4L, 5L, 1L, 0L, 0L, 2L, 4L, 12L, 14L, 17L, 17L, 23L, 32L,
16L, 17L, 18L, 6L, 4L, 2L, 17L, 7L, 8L, 4L, 5L, 0L, 1L, 0L,
0L, 0L, 53L, 42L, 82L, 58L, 81L, 60L, 42L, 35L, 16L, 14L,
22L, 6L, 83L, 40L, 62L, 54L, 43L, 46L, 26L, 12L, 15L, 3L,
3L, 3L, 11L, 13L, 11L, 23L, 16L, 18L, 11L, 5L, 1L, 3L, 1L,
1L, 26L, 44L, 34L, 54L, 25L, 41L, 19L, 17L, 10L, 3L, 3L,
0L, 4L, 4L, 7L, 14L, 22L, 31L, 14L, 34L, 14L, 33L, 14L, 20L,
7L, 11L, 22L, 11L, 14L, 8L, 8L, 1L, 2L, 0L, 1L, 2L, 29L,
65L, 34L, 84L, 36L, 65L, 28L, 39L, 16L, 15L, 16L, 9L, 25L,
51L, 12L, 38L, 23L, 29L, 22L, 19L, 7L, 5L, 5L, 1L, 7L, 16L,
14L, 35L, 6L, 27L, 8L, 5L, 1L, 1L, 1L, 0L, 24L, 57L, 29L,
53L, 24L, 28L, 11L, 9L, 7L, 2L, 0L, 0L, 3L, 7L, 1L, 8L, 2L,
18L, 5L, 13L, 10L, 11L, 5L, 10L, 3L, 1L, 5L, 13L, 4L, 2L,
2L, 1L, 1L, 0L, 0L, 0L, 14L, 51L, 21L, 77L, 23L, 50L, 25L,
31L, 17L, 16L, 13L, 13L, 19L, 52L, 24L, 59L, 18L, 44L, 9L,
20L, 6L, 3L, 7L, 2L, 14L, 28L, 34L, 47L, 29L, 47L, 15L, 13L,
9L, 3L, 2L, 0L, 46L, 75L, 124L, 81L, 67L, 45L, 33L, 15L,
9L, 4L, 5L, 3L, 0L, 10L, 6L, 19L, 12L, 28L, 22L, 37L, 31L,
41L, 26L, 31L, 7L, 6L, 21L, 13L, 6L, 7L, 8L, 2L, 2L, 1L,
0L, 0L, 67L, 89L, 116L, 159L, 99L, 102L, 64L, 80L, 42L, 25L,
25L, 8L, 108L, 123L, 60L, 97L, 68L, 66L, 44L, 35L, 12L, 5L,
9L, 2L, 7L, 3L, 53L, 15L, 33L, 3L, 8L, 3L, 4L, 0L, 0L, 0L,
48L, 19L, 76L, 40L, 55L, 11L, 16L, 1L, 4L, 1L, 2L, 0L, 6L,
7L, 21L, 22L, 18L, 23L, 32L, 37L, 40L, 13L, 23L, 10L, 4L,
2L, 19L, 2L, 8L, 3L, 6L, 0L, 1L, 0L, 1L, 0L, 68L, 37L, 90L,
42L, 76L, 38L, 47L, 16L, 29L, 5L, 18L, 2L, 82L, 32L, 62L,
27L, 44L, 22L, 20L, 8L, 8L, 2L, 1L, 0L)), .Names = c("gender", "agecombined", "educ", "cluster", "Freq"), row.names = c(NA,
-360L), class = "data.frame")