Comparing multiple categorical variables in R - r

So I would like to stack the two bars from each of these graphs into one big graph. That is, I would like Black State Claim (from plot a) to be right next to Black Civil Rights Claim (from plot b) and consequently for all races into one graph.
Since some of the counts (e.g. for Asian) are so low, is there a better way to compare State Claim / Civil Rights Claim status with Race?
# (a) State claim: tabulate Race against State_Claim_Made, turn the
# contingency table into a long data frame (Race, Claim, Count) and draw the
# counts as stacked bars, one bar per race filled by claim value.
race_claim <- setNames(
  as.data.frame(table(jail$Race, jail$State_Claim_Made)),
  c("Race", "Claim", "Count")
)
ggplot(race_claim, aes(Race, Count, fill = Claim)) +
  geom_col()

# (b) Civil rights claim: same construction for the non-statutory column.
# NOTE(review): the posted data sample names this column
# `Non-Statutory_Case_Filed`; confirm that `Non_Statutory` exists in the
# full data set.
race_claim_civ <- setNames(
  as.data.frame(table(jail$Race, jail$Non_Statutory)),
  c("Race", "Claim", "Count")
)
ggplot(race_claim_civ, aes(Race, Count, fill = Claim)) +
  geom_col()
DATA SAMPLE:
structure(list(Last_Name = c("Banks", "Beamon", "Dandridge",
"Deakle, Jr.", "Doyle", "Drinkard", "Ellis", "Embry", "Gaines",
"Gurley", "Hinton", "Holemon", "Holsomback", "Hunt", "Jones",
"Mahan", "Mahan", "McMillian", "Moore", "Padgett"), First_Name = c("Medell",
"Melvin Todd", "Beniah Alton", "Evan Lee", "Robert E.", "Gary",
"Andre", "Anthony", "Freddie Lee", "Timothy", "Anthony", "Jeffrey",
"John", "H. Guy", "Lydia Diane", "Dale", "Ronnie", "Walter",
"Daniel Wade", "Larry Randal"), Age = c("27", "24", "29", "59",
"44", "37", "35", "23", "22", "22", "29", "23", "33", "54", "40",
"22", "26", "45", "24", "40"), Race = c("Black", "Asian", "Caucasian",
"Caucasian", "Other", "Asian", "Black", "Black", "Black",
"Caucasian", "Black", "Caucasian", "Caucasian", "Other",
"Black", "Caucasian", "Asian", "Black", "Native American", "Caucasian"
), Sex = c("Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Female",
"Male", "Male", "Male", "Male", "Male"), State = c("Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama"), CIU = c(0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0), Guilty_Plea = c(1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IO = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Worst_Crime = c(6, 1,
1, 4, 4, 1, 2, 1, 1, 6, 1, 2, 4, 6, 3, 2, 2, 1, 1, 1), Occurred = c(1999,
1988, 1994, 2014, 1991, 1993, 2012, 1992, 1972, 1999, 1985, 1987,
1987, 1987, 1997, 1983, 1983, 1986, 1999, 1990), Convicted = c(2001,
1989, 1996, 2015, 1992, 1995, 2013, 1993, 1974, 2000, 1986, 1988,
1988, 1993, 2000, 1986, 1986, 1988, 2002, 1992), Exonerated = c(2003,
1990, 2015, 2015, 2001, 2001, 2014, 1997, 1991, 2002, 2015, 1999,
2000, 1998, 2006, 1998, 1998, 1993, 2009, 1997), Sentence = c("15",
"25", "Life", "Not sentenced", "20", "Death", "85", "20", "30",
"35", "Death", "Life", "25", "Probation", "Life without parole",
"35", "Life without parole", "Death", "Death", "Death"), Death_Penalty = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1), DNA_Only = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0), FC = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), MWID = c(0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0), F_MFE = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1), P_FA = c(1,
1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0), OM = c(1,
1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), ILD = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0), State_Statute = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y"), State_Claim_Made = c(0, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1 0), Zero_time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Prem = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Pending = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Denied = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), State_Award = c("0",
"0", "2", "0", "1", "0", "0", "0", "1", "0", "2", "0", "0", "0",
"0", "0", "0", "0", "0", "0"), Amount = c("0", "0", NA, "0",
"129041.88", "0", "0", "0", "1000000", "0", NA, "0", "0", "0",
"0", "0", "0", "0", "0", "0"), `Non-Statutory_Case_Filed` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0), No_Time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Unfiled = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1), Dismissed = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), Pending__1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Award = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), Premature = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Amount__1 = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "$ undisclosed", "0", "0"), Years_Lost = c(1.7,
0.1, 19.5, 0, 2.6, 5.7, 1.8, 4, 10.7, 1.5, 28.5, 10.6, 10.1,
0, 5.8, 11.4, 11.4, 4.5, 5.4, 5.5), State_Award2 = c("0", "0",
"0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))

I think two requirements clash here: the bars should be stacked and, at the same time, dodged. My solution probably isn't the best and someone may do better, but this is what I have right now:
Preprocessing
library(tidyverse)
# Reshape `jail` into one row per race x action x claim value, with the number
# of cases (`n`) and a numeric x position (`x`) for every race/action bar.
dat <- jail %>%
  # Lower-case all column names so they can be referenced uniformly.
  rename_with(tolower) %>%
  select(race, state_claim_made, non_statutory_case_filed) %>%
  # Long format: `action` holds the source column name, `claim` its value.
  # Columns are named explicitly instead of relying on their positions.
  pivot_longer(
    cols = c(state_claim_made, non_statutory_case_filed),
    names_to = "action",
    values_to = "claim"
  ) %>%
  # count() also sorts the result by race, action and claim.
  count(race, action, claim) %>%
  mutate(action = ifelse(action == "state_claim_made", "state", "civil")) %>%
  # Number the race/action combinations in row order; rows that share a
  # combination (claim 0 and 1) get the same x and therefore stack.
  mutate(x = as.numeric(reorder(interaction(race, action), seq_len(n()))))
Output:
# # A tibble: 15 x 5
# race action claim n x
# <chr> <chr> <dbl> <int> <dbl>
# 1 Asian civil 0 3 1
# 2 Asian state 0 2 2
# 3 Asian state 1 1 2
# 4 Black civil 0 6 3
# 5 Black civil 1 1 3
# 6 Black state 0 3 4
# 7 Black state 1 4 4
# 8 Caucasian civil 0 7 5
# 9 Caucasian state 0 6 6
# 10 Caucasian state 1 1 6
# 11 Native American civil 1 1 7
# 12 Native American state 1 1 8
# 13 Other civil 0 2 9
# 14 Other state 0 1 10
# 15 Other state 1 1 10
Some necessary tweaks for x-axis labels:
Adapted from this answer:
# Tick positions: every bar position plus one extra tick half-way between the
# bars of each race group (used to carry the race name).
breaks <- local({
  bar_positions <- unique(dat$x)
  group_centres <- seq(
    from = min(dat$x) + .5,
    to = max(dat$x) + .5,
    by = length(unique(dat$action))
  )
  sort(c(bar_positions, group_centres))
})

# Tick labels, three per race in tick order: "civil", the race name pushed
# onto a second line, then "state".
labels <- unlist(
  lapply(unique(dat$race), function(race) {
    c("civil", paste0("\n", race), "state")
  })
)
Plot data
# Stacked bars of case counts: one bar per race/action position `x`, stacked
# and filled by claim value.
ggplot(dat, aes(x = x, y = n, fill = factor(claim))) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_col(show.legend = TRUE) +
  ggthemes::theme_few() +
  # Grey fill per claim value, relabelled as false/true in the legend.
  scale_fill_manual(
    name = NULL,
    values = c("gray75", "gray25"),
    breaks = c("0", "1"),
    labels = c("false", "true")
  ) +
  # Custom ticks: action names under the bars, race names between them.
  scale_x_continuous(breaks = breaks, labels = labels) +
  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank()) +
  labs(title = "Jail Plot", y = "Count")
Data
The data you attached are corrupted: there is a missing comma or `$` somewhere in the table (I don't remember which). Here are the same data, without the variables we don't need to solve the problem.
structure(
list(Race = c("Black", "Asian", "Caucasian", "Caucasian", "Other", "Asian",
"Black", "Black", "Black", "Caucasian", "Black", "Caucasian",
"Caucasian", "Other", "Black", "Caucasian", "Asian", "Black",
"Native American", "Caucasian"),
State_Claim_Made = c(0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
0, 1, 0),
Non_Statutory_Case_Filed = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0)
),
row.names = c(NA, -20L),
class = c("tbl_df", "tbl", "data.frame")
)

Related

Error: Length of `labels` should be as same as number of slices

I want to use ComplexHeatmap to plot the "types" (type 1, type 2) and "subtypes" as top annotations using block annotation.
The row annotations would be the column names of met.resolv.
Code Part 1:
library(ComplexHeatmap)
library(GetoptLong)
# Colour palettes for the heatmap annotations.
clust.col <- c(
  "#003f5c", "#374c80", "#7a5195", "#bc5090", "#ef5675", "#ff764a",
  "#ffa600", "#84E4F7", "#FFB480", "#FDFD86", "#00a692"
)
meth.col <- c("#ff816f", "#ffbaae", "#f1f1f1", "#7e959e", "#004252")
met.immune.col <- c(
  "#01444f", "#01575e", "#0d6a6a", "#217d74", "#38917c", "#52a482",
  "#6eb786", "#8eca89", "#b0dc8c", "#d5ed90", "#fdfd96"
)

# Everything drawn from here on goes to this PDF file.
pdf("Plots/heatmap_methylresolver.pdf")

# Top annotation: an empty strip plus one block labelled per distinct value of
# meta$clust.
# NOTE(review): the reported error says the number of `labels` must match the
# number of slices, and the Heatmap() call below sets no row/column split --
# presumably that is where the slices would come from; confirm.
ha <- HeatmapAnnotation(
  name = "Sub-Cluster",
  empty = anno_empty(border = TRUE, height = unit(8, "mm")),
  foo = anno_block(gp = gpar(fill = clust.col), labels = unique(meta$clust))
)

# Left annotation: blocks labelled with the column names of met.resolv.
la <- rowAnnotation(
  foo = anno_block(
    gp = gpar(col = met.immune.col),
    labels = colnames(met.resolv),
    labels_gp = gpar(col = "white", fontsize = 10)
  )
)

Heatmap(
  met.resolv,
  name = "SubClust",
  top_annotation = ha,
  left_annotation = la,
  column_title = NULL
)
Traceback:
Error: Length of `labels` should be as same as number of slices.
Code part 2:
# Draw one rectangle, optionally with a centred label, spanning from the
# viewport of the first slice to the viewport of the last slice of a named
# annotation.
#
# group      : values whose min() and max() are interpolated into the
#              viewport names of the first and last slice to cover.
# empty_anno : annotation name interpolated into the viewport names.
# gp         : graphical parameters for the rectangle.
# label      : optional text drawn at the centre of the rectangle.
# label_gp   : graphical parameters for the label text.
group_block_anno <- function(group, empty_anno, gp = gpar(),
                             label = NULL, label_gp = gpar()) {
  # Bottom-left corner of the first slice's viewport.
  seekViewport(qq("annotation_#{empty_anno}_#{min(group)}"))
  lower_left <- deviceLoc(x = unit(0, "npc"), y = unit(0, "npc"))

  # Top-right corner of the last slice's viewport.
  seekViewport(qq("annotation_#{empty_anno}_#{max(group)}"))
  upper_right <- deviceLoc(x = unit(1, "npc"), y = unit(1, "npc"))

  # Switch to the "global" viewport before drawing the spanning rectangle.
  seekViewport("global")
  grid.rect(
    x = lower_left$x,
    y = lower_left$y,
    width = upper_right$x - lower_left$x,
    height = upper_right$y - lower_left$y,
    just = c("left", "bottom"),
    gp = gp
  )

  # Nothing more to draw when no label was requested.
  if (is.null(label)) {
    return(invisible(NULL))
  }
  grid.text(
    label,
    x = (lower_left$x + upper_right$x) * 0.5,
    y = (lower_left$y + upper_right$y) * 0.5,
    gp = label_gp
  )
}
# Draw one labelled block per type over the "empty" annotation.
# NOTE(review): group_block_anno() interpolates min(group) and max(group) into
# viewport names, but a row subset of the `meta` data frame is passed as
# `group` here -- presumably slice indices were intended; confirm.
group_block_anno(meta[meta$type==1,], "empty", gp = gpar(fill = "#003f5c"), label = "type 1")
group_block_anno(meta[meta$type==2,], "empty", gp = gpar(fill = "#ffa600"), label = "type 2")
# Close the current graphics device (the pdf() opened before the heatmap).
dev.off()
Data:
met.resolv
> dput(met.resolv[1:20,])
structure(list(Monocytes = c(0, 0, 0, 0, 0.0691477875220381,
0.0461824156116519, 0.00777223000960038, 0, 0, 0, 0.00165316191239164,
0.0245461060386295, 0.026342142484403, 0, 0, 0, 0.0362473177899938,
0, 0, 0.0615459951223746), `Dendritic Cells` = c(0, 0, 0.00772620422001257,
0, 0, 0, 0.0480402297895918, 0, 0, 0.00898992233305366, 0.057888955860833,
0.0362367878235371, 0, 0.0472205793224695, 0.0286203273050095,
0, 0, 0, 0, 0), Macrophages = c(0, 0.0664642500649833, 0, 0,
0.0371204658284402, 0, 0, 0.0225187084795453, 0.0603416047052193,
0, 0, 0, 0, 0, 0, 0.0313730144635087, 0.0704265029977412, 0,
0.00934366999330129, 0.0411264824824766), Neutrophils = c(0.173202855063056,
0, 0, 0.0643464479529596, 0, 0.0187142163615865, 0.0117918312263748,
0, 0, 0.115244141262919, 0.0520653071278115, 0.00997874098002133,
0, 0.00754706466322519, 0.0885236230551497, 0.0144246971006176,
0.000602296924347016, 0, 0.0195266392400734, 0.00343527794086701
), Eosinophils = c(0, 0.00809451621782635, 0, 0, 0, 0, 0, 0.0026662337469062,
0.0126433025837339, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00796674767545607,
0), `Regulatory T cells` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Naive T cells` = c(0, 0.00984426904423764,
0, 0.111694279700795, 0.0123267828167452, 0.0121761009946946,
0.015487451006506, 0, 0.0231848393777138, 0, 0.00278269237244245,
0.0200645732264692, 0, 0.0147361795082149, 0.0526711496398388,
0, 0.00992032127196248, 0, 0, 0.030586635289606), `Memory T cells` = c(0,
0.0312258875142767, 0.124409625779986, 0, 0.0135351004994425,
0.0537156172200875, 0.0540049513012593, 0.0297542571267331, 0,
0.0363411597373587, 0.0464268327265193, 0.0397546685980086, 0.0425232243321057,
0.0491394530734343, 0, 0.0512205034016493, 0.023265025230139,
0.130162735781893, 0, 0.00172924583134173), `CD8 T cells` = c(0.00282626493694126,
0.0225524838253428, 0, 0.0030508623462426, 0, 0.0128041131453121,
0, 0.102208367313482, 0, 0, 0, 0, 0.0668565430396047, 0.0343785834326558,
0, 0.0418137510155405, 0.0039045724524825, 0.0142647475514386,
0.0757110710314276, 0), `NK cells` = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `B cells` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.0158949534772194, 0, 0, 0.0174286273520171,
0, 0, 0.0240398020597407, 0)), row.names = c("TCGA.2K.A9WE.01A",
"TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A", "TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A",
"TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A", "TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A",
"TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A", "TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A",
"TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A", "TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A",
"TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A", "TCGA.A4.7583.01A"), class = "data.frame")
meta
> dput(meta[1:20,])
structure(list(clust = c("1a", "2a", "2b", "1b", "2a", "2c",
"1c", "1c", "1b", "1d", "1e", "2c", "2b", "1c", "1e", "1c", "2c",
"1f", "1c", "2a"), type = c("1", "2", "2", "1", "2", "2", "1",
"1", "1", "1", "1", "2", "2", "1", "1", "1", "2", "1", "1", "2"
)), row.names = c("TCGA.2K.A9WE.01A", "TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A",
"TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A", "TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A",
"TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A", "TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A",
"TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A", "TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A",
"TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A", "TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A",
"TCGA.A4.7583.01A"), class = "data.frame")
Expected output (Example):

Marginal Effect Plot with ggeffects package

The command predict <- ggpredict(fit_tw1, terms = "pko_dummy") does not work and it gives me the following error. Do you know how to solve my problem?
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : factor as.factor(pa_dummy) has new level 0.0906593406593407
Can you help me?
Model: (The Model has fixed effects for countries (cown) and years (year))
# Linear model of parl_wom.per on the pko_dummy x pa_dummy interaction, with
# cown and year entered as factors plus further covariates.
# The data argument drops rows where `rownames` equals "639" (assumes data9
# has a column named `rownames` -- TODO confirm).
# NOTE(review): the dummies are converted with as.factor() inside the formula,
# and the reported error names `as.factor(pa_dummy)` with a non-integer level.
# Presumably ggpredict() fills in a numeric value for pa_dummy that as.factor()
# then turns into an unseen level; converting the columns to factors in the
# data before fitting may avoid this -- confirm.
fit_tw1 <- lm(parl_wom.per ~ as.factor(pko_dummy)*as.factor(pa_dummy) + as.factor(cown) + as.factor(year) + female_pko.per + lf_wom.per + ss.per + fdi.per + jud_ind.per + polity + as.factor(intensity_level) + as.factor(cons_ref),
data = subset(data9, rownames!="639"))
Reproducible sample of the dataset
structure(list(cown = c(432, 432, 432, 432, 432, 432, 432, 432,
432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432, 432),
year = c(1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
2008, 2009), intensity_level = c("1", "1", "0", "0", "1",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"1", "1", "1"), pa_dummy = c(0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pko_dummy = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), parl_wom.per = c(NA,
NA, 0.023, 0.023, 0.023, 0.023, 0.023, 0.122449, 0.122449,
0.122449, 0.122449, 0.122449, 0.1020408, 0.1020408, 0.1020408,
0.1020408, 0.1020408, 0.1020408, 0.1020408, 0.1020408), exe_wom.per = c(0.0588235,
0.1052632, 0.0526316, 0.0952381, 0.1111111, 0.0555556, 0.125,
0.1176471, 0.2608696, 0.2727273, 0.45, 0.4210526, 0.15, 0.15,
0.15, 0.1923077, 0.1923077, 0.1923077, 0.1851852, 0.1785714
), gender_mean = c(0, 0, 1.75, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), gender_art = c(0, 0, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), female_pko.per = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
lf_wom.per = c(0.60855, 0.60834, 0.6082, 0.60815, 0.6082,
0.60838, 0.60806, 0.60798, 0.60804, 0.60811, 0.60813, 0.60782,
0.60752, 0.60725, 0.60701, 0.60681, 0.60616, 0.60564, 0.60525,
0.60495), ss.per = c(0.0679798984527588, 0.0723097991943359,
0.0827134037017822, 0.0837932968139648, 0.0957365036010742,
0.107322397232056, 0.112752199172974, 0.122838802337646,
0.133676099777222, 0.151076498031616, 0.174537200927734,
NA, NA, 0.221253795623779, 0.239939594268799, 0.25832540512085,
0.277074604034424, 0.303055400848389, 0.33731990814209, 0.36671989440918
), fdi.per = c(0.0021364, 0.0004424, -0.0077276, 0.001441,
0.0083661, 0.0411724, 0.009786, 0.0275705, 0.0032724, 0.0090061,
0.0203215, 0.0602065, -0.0031506, 0.0153489, 0.015555, 0.0256452,
0.0214593, 0.0252638, 0.0270809, 0.0631946), ele.sy = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
polity = c(-7, NA, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
7, 7, 7, 7, 7), mus.per = c(0.944, 0.944, 0.944, 0.944, 0.944,
0.944, 0.944, 0.944, 0.944, 0.944, 0.944, 0.944, 0.944, 0.944,
0.944, 0.944, 0.944, 0.944, 0.944, 0.944), cons_ref = c(0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
jud_ind.per = c(0.476311308991478, 0.523786338536123, 0.557528417528326,
0.548066004702523, 0.548066004702523, 0.548066004702523,
0.548066004702523, 0.548066004702523, 0.548066004702523,
0.548066004702523, 0.548066004702523, 0.548066004702523,
0.548066004702523, 0.548066004702523, 0.548066004702523,
0.539288342106394, 0.539288342106394, 0.548066004702523,
0.539288342106394, 0.539288342106394)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))

Percentage of 2 responses in a variable using Dplyr

For an assignment, I would like to see the number of subjects who have 0 for the variable CIU vs. 1 for CIU.
structure(list(Last_Name = c("Banks", "Beamon", "Dandridge",
"Deakle, Jr.", "Doyle", "Drinkard", "Ellis", "Embry", "Gaines",
"Gurley", "Hinton", "Holemon", "Holsomback", "Hunt", "Jones",
"Mahan", "Mahan", "McMillian", "Moore", "Padgett"), First_Name = c("Medell",
"Melvin Todd", "Beniah Alton", "Evan Lee", "Robert E.", "Gary",
"Andre", "Anthony", "Freddie Lee", "Timothy", "Anthony", "Jeffrey",
"John", "H. Guy", "Lydia Diane", "Dale", "Ronnie", "Walter",
"Daniel Wade", "Larry Randal"), Age = c("27", "24", "29", "59",
"44", "37", "35", "23", "22", "22", "29", "23", "33", "54", "40",
"22", "26", "45", "24", "40"), Race = c("Black", "Black", "Caucasian",
"Caucasian", "Caucasian", "Caucasian", "Black", "Black", "Black",
"Caucasian", "Black", "Caucasian", "Caucasian", "Caucasian",
"Black", "Caucasian", "Caucasian", "Black", "Caucasian", "Caucasian"
), Sex = c("Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Female",
"Male", "Male", "Male", "Male", "Male"), State = c("Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama"), CIU = c(0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0), Guilty_Plea = c(1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IO = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Worst_Crime = c(6, 1,
1, 4, 4, 1, 2, 1, 1, 6, 1, 2, 4, 6, 3, 2, 2, 1, 1, 1), Occurred = c(1999,
1988, 1994, 2014, 1991, 1993, 2012, 1992, 1972, 1999, 1985, 1987,
1987, 1987, 1997, 1983, 1983, 1986, 1999, 1990), Convicted = c(2001,
1989, 1996, 2015, 1992, 1995, 2013, 1993, 1974, 2000, 1986, 1988,
1988, 1993, 2000, 1986, 1986, 1988, 2002, 1992), Exonerated = c(2003,
1990, 2015, 2015, 2001, 2001, 2014, 1997, 1991, 2002, 2015, 1999,
2000, 1998, 2006, 1998, 1998, 1993, 2009, 1997), Sentence = c("15",
"25", "Life", "Not sentenced", "20", "Death", "85", "20", "30",
"35", "Death", "Life", "25", "Probation", "Life without parole",
"35", "Life without parole", "Death", "Death", "Death"), Death_Penalty = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1), DNA_Only = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0), FC = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), MWID = c(0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0), F_MFE = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1), P_FA = c(1,
1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0), OM = c(1,
1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), ILD = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0), State_Statute = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y"), State_Claim_Made = c(0, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Zero_time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Prem = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Pending = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Denied = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), State_Award = c("0",
"0", "2", "0", "1", "0", "0", "0", "1", "0", "2", "0", "0", "0",
"0", "0", "0", "0", "0", "0"), Amount = c("0", "0", NA, "0",
"129041.88", "0", "0", "0", "1000000", "0", NA, "0", "0", "0",
"0", "0", "0", "0", "0", "0"), `Non-Statutory_Case_Filed` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0), No_Time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Unfiled = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1), Dismissed = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), Pending__1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Award = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), Premature = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Amount__1 = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "$ undisclosed", "0", "0"), Years_Lost = c(1.7,
0.1, 19.5, 0, 2.6, 5.7, 1.8, 4, 10.7, 1.5, 28.5, 10.6, 10.1,
0, 5.8, 11.4, 11.4, 4.5, 5.4, 5.5), State_Award2 = c("0", "0",
"0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Using the dplyr package, I accomplished this much:
# Number of subjects in each CIU group: one row per CIU value with its count.
CUI <- summarize(group_by(jail, CIU), count = n())
Now I would like to create a table showing the percentage of each group within the "State_Claim_Made" category, but I am unsure what to do from here. In the end I would like to see the percent of CUI=0 that have State_Claim_Made=0 vs. State_Claim_Made=1 and same for CUI=1; a 2-2 table of sorts. I also prefer to continue to use the dplyr package but not necessary.
Your example doesn't really show the full picture, so let
# Toy data: 20 rows with CIU = 0 followed by 30 rows with CIU = 1, and
# 15 rows with State_Claim_Made = 1 followed by 35 rows with 0.
df <- data.frame(
  CIU = rep(c(0L, 1L), times = c(20, 30)),
  State_Claim_Made = rep(c(1L, 0L), times = c(15, 35))
)
Then
# Cross-tabulate once (CIU in rows, State_Claim_Made in columns) and reuse the
# table for both the counts and the shares.
claim_by_ciu <- table(CIU = df$CIU, State_Claim_Made = df$State_Claim_Made)
claim_by_ciu
# State_Claim_Made
# CIU 0 1
# 0 5 15
# 1 30 0
# Share of each State_Claim_Made value within every CIU group: margin = 1
# divides each row by its row total, so every row sums to 1.
prop.table(claim_by_ciu, margin = 1)
# State_Claim_Made
# CIU 0 1
# 0 0.25 0.75
# 1 1.00 0.00
Using base R you can just use the table command:
# Cross-tabulate CIU (rows) against State_Claim_Made (columns).
# NOTE(review): `data` presumably stands for the question's `jail` data frame
# -- confirm.
table(data$CIU, data$State_Claim_Made)
Output:
0 1
0 15 5
If you have data including CUI =1 then the output would be a 2x2 table like you need

Calculate euclidean distance between profiles stored in data frame. Using one row as a reference

I have a data frame like that one below:
> dput(data)
structure(list(`28` = c(0, 0, 0, 0, 0, 0), `38` = c(0, 0, 0,
0, 0, 0), `45` = c(0, 0, 0, 0, 0, 0), `53` = c(0, 0, 0, 0, 0,
0), `60` = c(0, 0, 0, 0, 0, 0), `78` = c(0, 0, 0, 0, 0, 0), `116` = c(0,
0, 0, 0, 0, 0.983309489747258), `145` = c(0, 0, 0, 0, 0, 1),
`189` = c(0, 1, 0.560384508734634, 0, 0, 0.875695437927198
), `223` = c(0, 0.988158197286733, 1, 0, 0, 0.492500108379937
), `281` = c(1, 0.677856978615774, 0.448525741750624, 0,
0.362088745790311, 0.180474270603026), `362` = c(0.79151704397606,
0.763278914693033, 0.35864682503004, 1, 1, 0.114178985852806
), `440` = c(0.662841530054645, 0.818636468153598, 0.448488769756909,
0, 0.448447503793346, 0), `524` = c(0, 0.638192687974247,
0, 0, 0, 0), `634` = c(0, 0, 0, 0, 0, 0), `759` = c(0, 0,
0, 0, 0, 0), `848` = c(0, 0, 0, 0, 0, 0), `979` = c(0, 0,
0, 0, 0, 0), `1120` = c(0, 0, 0, 0, 0, 0), `1248` = c(0,
0, 0, 0, 0, 0)), .Names = c("28", "38", "45", "53", "60",
"78", "116", "145", "189", "223", "281", "362", "440", "524",
"634", "759", "848", "979", "1120", "1248"), row.names = c("Mark",
"Gregg", "Tim", "Oscar", "Tom", "Matthew"
), class = "data.frame")
I would like to calculate euclidean distance between all the profiles from this data and Tim should be used as a reference. The results can be stored in additional column.
Mark to Tim
Gregg to Tim
Oscar to Tim
and etc
You can use the dist function (which actually computes the distances between all pairs of profiles):
# Coerce the profiles to a matrix, one row per person.
m <- as.matrix(DF)
# dist() computes the Euclidean distance between every pair of rows;
# as.matrix() expands the result into a full square matrix that can be
# indexed by row name, e.g. distances["Mark", "Tim"].
distances <- as.matrix(dist(m, method = "euclidean"))
> distances['Mark','Tim']
[1] 1.36069
> distances['Gregg','Tim']
[1] 0.9767401
> distances['Oscar','Tim']
[1] 1.458658

R plotting rows of a list

Final edit: Result using matplot()
I'll see to it that I work with ~3 digit values to get a more distinctive result, but basically it's what I wanted
Original Question
I want to create a graph that should look like the attached image
What I got are the values for each line (representing a different topic) for both the y and x axis.
Close to an example on R plotting I found, I tried the following:
# Take the first row of pt as the series to plot.
# NOTE(review): pt is a data.frame, so c(pt[1,]) yields a list rather than a
# numeric vector -- presumably the cause of the xy.coords error reported
# below; unlist(pt[1,]) would give a plain vector. Confirm.
arts=c(pt[1,])
# Value range covering 0 and every value of the series, used as y limits.
g_range <- range(0, arts)
# Blue points joined by lines, without the default axes and annotations.
plot(arts, type="o", col="blue", ylim=g_range,axes=FALSE, ann=FALSE)
# Custom x axis.
# NOTE(review): `at` gives 23 positions (1:23) but 24 labels (0 to 23) are
# supplied -- the lengths look mismatched; confirm.
axis(1, at=1:23, lab=c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23))
# Frame around the plot region.
box()
This results in the error
Error in xy.coords(x, y, xlabel, ylabel, log) :
'x' is a list, but does not have components 'x' and 'y'
So apparently arts is not the right parameter for plot here, right?
Sidenote: the values in the arts vector are ordered to fit the 0-23 scale
In addition: this is what the head of pt looks like
Edit: on request, here the output of dput(head(pt))
structure(list(`0` = c(2, 1, 0, 0, 0, 0), `1` = c(1, 0, 0, 0,
0, 0), `2` = c(1, 0, 0, 0, 0, 0), `3` = c(0, 0, 0, 0, 0, 0),
`4` = c(0, 0, 0, 0, 0, 0), `5` = c(0, 0, 0, 0, 0, 0), `6` = c(1,
0, 0, 0, 0, 0), `7` = c(1, 0, 0, 0, 0, 0), `8` = c(1, 0,
0, 0, 0, 0), `9` = c(2, 0, 0, 0, 0, 0), `10` = c(2, 1, 0,
0, 0, 0), `11` = c(2, 1, 0, 0, 0, 0), `12` = c(2, 1, 0, 0,
0, 0), `13` = c(2, 1, 0, 0, 0, 0), `14` = c(2, 1, 0, 0, 0,
0), `15` = c(3, 1, 0, 0, 0, 0), `16` = c(3, 1, 0, 0, 0, 0
), `17` = c(3, 1, 0, 0, 0, 0), `18` = c(3, 1, 1, 1, 0, 0),
`19` = c(3, 1, 1, 1, 0, 0), `20` = c(3, 1, 1, 1, 0, 0), `21` = c(3,
1, 1, 1, 0, 0), `22` = c(3, 1, 1, 0, 0, 0), `23` = c(2, 1,
0, 0, 0, 0)), .Names = c("0", "1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17",
"18", "19", "20", "21", "22", "23"), row.names = c(NA, 6L), class = "data.frame")
matplot can be used to plot each column of a matrix as a separate line in a figure. You just need to transpose the matrix and you are all set. For details on the arguments to matplot see ?matplot and ?par that controls general graphics parameters.
# Generate data
categories <- c("Entertainment", "Games", "Health", "Personal Finance",
                "Shopping", "Music", "USSites", "Porn")
colors <- c("green", "blue", "cyan", "yellow", "magenta", "orange",
            "red", "black")
# Random demo values: one row per category, one column per time point.
n_times <- 23
pt <- matrix(runif(length(categories) * n_times),
             nrow = length(categories), ncol = n_times)

# Plot
# matplot() draws one line per column, so the matrix is transposed to get one
# line per category. The x positions are derived from the data instead of a
# hard-coded 1:23, so a matrix with a different number of columns still plots.
time_index <- seq_len(ncol(pt))
matplot(time_index, t(pt), type = "l", col = colors, lty = 1,
        bty = "n", las = 1, main = "Categorical Percent over Time")
# The legend shows lines only, matching type = "l" above (no point markers
# are drawn in the plot).
legend("topright", legend = categories, col = colors, lty = 1, lwd = 1,
       bg = "white")
It looks terribly messy since I just sampled uniformly distributed random data, but it will look much better with yours.

Resources