Related
I want to use ComplexHeatmap to plot the "types" (type 1, type 2) and "subtypes" as top annotations using block annotation.
The row annotations would be the column names of met.resolv.
Code Part 1:
library(ComplexHeatmap)
library(GetoptLong)

# Colour palettes: sub-cluster blocks, methylation scale, immune cell types.
clust.col <- c("#003f5c", "#374c80", "#7a5195", "#bc5090", "#ef5675", "#ff764a", "#ffa600", "#84E4F7", "#FFB480", "#FDFD86", "#00a692")
meth.col <- c("#ff816f", "#ffbaae", "#f1f1f1", "#7e959e", "#004252")
met.immune.col <- c("#01444f", "#01575e", "#0d6a6a", "#217d74", "#38917c", "#52a482", "#6eb786", "#8eca89", "#b0dc8c", "#d5ed90", "#fdfd96")

# met.resolv stores samples in rows and cell types in columns. The sample-level
# annotations (type / sub-cluster) belong on top, so the heatmap is drawn on
# the transposed matrix: columns = samples, rows = cell types.
mat <- t(as.matrix(met.resolv))

# anno_block() draws one block per heatmap slice, so `labels` needs exactly one
# entry per slice. Without column_split / row_split there is a single slice,
# which is what raised "Length of `labels` should be as same as number of
# slices". Factors are used for the splits so that, with slice clustering
# switched off below, the slice order equals the level order of the labels.
clust <- factor(meta[colnames(mat), "clust"])  # aligned to samples by name
cell_type <- factor(rownames(mat), levels = rownames(mat))

pdf("Plots/heatmap_methylresolver.pdf")
ha <- HeatmapAnnotation(
  name = "Sub-Cluster",
  # Placeholder track; the type-level blocks are drawn into it afterwards.
  empty = anno_empty(border = TRUE, height = unit(8, "mm")),
  foo = anno_block(
    gp = gpar(fill = rep_len(clust.col, nlevels(clust))),
    labels = levels(clust)
  )
)
la <- rowAnnotation(
  foo = anno_block(
    # `fill` (not `col`) so the white labels sit on a coloured block.
    gp = gpar(fill = rep_len(met.immune.col, nlevels(cell_type))),
    labels = levels(cell_type),
    labels_gp = gpar(col = "white", fontsize = 10)
  )
)
Heatmap(mat,
  name = "SubClust",
  top_annotation = ha, left_annotation = la,
  column_split = clust, row_split = cell_type,
  # Keep slices in factor-level order so they match the block labels.
  cluster_column_slices = FALSE, cluster_row_slices = FALSE,
  column_title = NULL, row_title = NULL
)
Traceback:
Error: Length of `labels` should be as same as number of slices.
Code part 2:
# Draw one rectangle (optionally labelled) that spans several adjacent slices
# of an annotation, e.g. to put a shared header above a group of slices.
#
# group      : values identifying the slices to cover; only min(group) and
#              max(group) are used, to find the first and last viewport
# empty_anno : annotation name used to build the viewport names
#              "annotation_<empty_anno>_<slice>"
# gp         : graphics parameters for the rectangle
# label      : optional text drawn at the centre of the rectangle
# label_gp   : graphics parameters for the label
group_block_anno <- function(group, empty_anno, gp = gpar(),
                             label = NULL, label_gp = gpar()) {
  first_slice <- min(group)
  last_slice <- max(group)

  # Bottom-left corner of the first slice, in device coordinates.
  seekViewport(qq("annotation_#{empty_anno}_#{first_slice}"))
  lower_left <- deviceLoc(x = unit(0, "npc"), y = unit(0, "npc"))

  # Top-right corner of the last slice, in device coordinates.
  seekViewport(qq("annotation_#{empty_anno}_#{last_slice}"))
  upper_right <- deviceLoc(x = unit(1, "npc"), y = unit(1, "npc"))

  # Draw from the "global" viewport so the device coordinates apply directly.
  seekViewport("global")
  block_width <- upper_right$x - lower_left$x
  block_height <- upper_right$y - lower_left$y
  grid.rect(lower_left$x, lower_left$y,
    width = block_width, height = block_height,
    just = c("left", "bottom"), gp = gp
  )

  if (!is.null(label)) {
    centre_x <- (lower_left$x + upper_right$x) * 0.5
    centre_y <- (lower_left$y + upper_right$y) * 0.5
    grid.text(label, x = centre_x, y = centre_y, gp = label_gp)
  }
}
# `group` must hold the indices of the column slices to cover, not rows of
# `meta`: min()/max() fail on a data frame with character columns.
# This requires the heatmap to be split into one column slice per sub-cluster,
# ordered by sorted cluster name, so that each type's slices are contiguous
# (true for names such as "1a".."1f", "2a".."2c").
clust_levels <- sort(unique(meta$clust))
clust_type <- meta$type[match(clust_levels, meta$clust)]  # type of each slice
group_block_anno(which(clust_type == "1"), "empty", gp = gpar(fill = "#003f5c"), label = "type 1")
group_block_anno(which(clust_type == "2"), "empty", gp = gpar(fill = "#ffa600"), label = "type 2")
dev.off()
Data:
met.resolv
> dput(met.resolv[1:20,])
structure(list(Monocytes = c(0, 0, 0, 0, 0.0691477875220381,
0.0461824156116519, 0.00777223000960038, 0, 0, 0, 0.00165316191239164,
0.0245461060386295, 0.026342142484403, 0, 0, 0, 0.0362473177899938,
0, 0, 0.0615459951223746), `Dendritic Cells` = c(0, 0, 0.00772620422001257,
0, 0, 0, 0.0480402297895918, 0, 0, 0.00898992233305366, 0.057888955860833,
0.0362367878235371, 0, 0.0472205793224695, 0.0286203273050095,
0, 0, 0, 0, 0), Macrophages = c(0, 0.0664642500649833, 0, 0,
0.0371204658284402, 0, 0, 0.0225187084795453, 0.0603416047052193,
0, 0, 0, 0, 0, 0, 0.0313730144635087, 0.0704265029977412, 0,
0.00934366999330129, 0.0411264824824766), Neutrophils = c(0.173202855063056,
0, 0, 0.0643464479529596, 0, 0.0187142163615865, 0.0117918312263748,
0, 0, 0.115244141262919, 0.0520653071278115, 0.00997874098002133,
0, 0.00754706466322519, 0.0885236230551497, 0.0144246971006176,
0.000602296924347016, 0, 0.0195266392400734, 0.00343527794086701
), Eosinophils = c(0, 0.00809451621782635, 0, 0, 0, 0, 0, 0.0026662337469062,
0.0126433025837339, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00796674767545607,
0), `Regulatory T cells` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Naive T cells` = c(0, 0.00984426904423764,
0, 0.111694279700795, 0.0123267828167452, 0.0121761009946946,
0.015487451006506, 0, 0.0231848393777138, 0, 0.00278269237244245,
0.0200645732264692, 0, 0.0147361795082149, 0.0526711496398388,
0, 0.00992032127196248, 0, 0, 0.030586635289606), `Memory T cells` = c(0,
0.0312258875142767, 0.124409625779986, 0, 0.0135351004994425,
0.0537156172200875, 0.0540049513012593, 0.0297542571267331, 0,
0.0363411597373587, 0.0464268327265193, 0.0397546685980086, 0.0425232243321057,
0.0491394530734343, 0, 0.0512205034016493, 0.023265025230139,
0.130162735781893, 0, 0.00172924583134173), `CD8 T cells` = c(0.00282626493694126,
0.0225524838253428, 0, 0.0030508623462426, 0, 0.0128041131453121,
0, 0.102208367313482, 0, 0, 0, 0, 0.0668565430396047, 0.0343785834326558,
0, 0.0418137510155405, 0.0039045724524825, 0.0142647475514386,
0.0757110710314276, 0), `NK cells` = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `B cells` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.0158949534772194, 0, 0, 0.0174286273520171,
0, 0, 0.0240398020597407, 0)), row.names = c("TCGA.2K.A9WE.01A",
"TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A", "TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A",
"TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A", "TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A",
"TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A", "TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A",
"TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A", "TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A",
"TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A", "TCGA.A4.7583.01A"), class = "data.frame")
meta
> dput(meta[1:20,])
structure(list(clust = c("1a", "2a", "2b", "1b", "2a", "2c",
"1c", "1c", "1b", "1d", "1e", "2c", "2b", "1c", "1e", "1c", "2c",
"1f", "1c", "2a"), type = c("1", "2", "2", "1", "2", "2", "1",
"1", "1", "1", "1", "2", "2", "1", "1", "1", "2", "1", "1", "2"
)), row.names = c("TCGA.2K.A9WE.01A", "TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A",
"TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A", "TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A",
"TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A", "TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A",
"TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A", "TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A",
"TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A", "TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A",
"TCGA.A4.7583.01A"), class = "data.frame")
Expected output (Example):
So I would like to stack the two bars from each of these graphs into one big graph. That is, I would like Black State Claim (from plot a) to be right next to Black Civil Rights Claim (from plot b) and consequently for all races into one graph.
Since some of the counts (for example, for Asian) are so low, is there a better way to compare State Claim / Civil Rights Claim status across races?
# a) State claim made?
# Cross-tabulate race against the state-claim flag, flatten the table to one
# row per (race, claim) pair, and draw one bar per race stacked by claim.
race_claim <- setNames(
  data.frame(table(jail$Race, jail$State_Claim_Made)),
  c("Race", "Claim", "Count")
)
ggplot(race_claim, aes(x = Race, y = Count, fill = Claim)) +
  geom_col()
# b) Civil rights claim?
# Same layout as plot (a), using the non-statutory (civil rights) claim flag.
race_claim_civ <- setNames(
  data.frame(table(jail$Race, jail$Non_Statutory)),
  c("Race", "Claim", "Count")
)
ggplot(race_claim_civ, aes(x = Race, y = Count, fill = Claim)) +
  geom_col()
DATA SAMPLE:
structure(list(Last_Name = c("Banks", "Beamon", "Dandridge",
"Deakle, Jr.", "Doyle", "Drinkard", "Ellis", "Embry", "Gaines",
"Gurley", "Hinton", "Holemon", "Holsomback", "Hunt", "Jones",
"Mahan", "Mahan", "McMillian", "Moore", "Padgett"), First_Name = c("Medell",
"Melvin Todd", "Beniah Alton", "Evan Lee", "Robert E.", "Gary",
"Andre", "Anthony", "Freddie Lee", "Timothy", "Anthony", "Jeffrey",
"John", "H. Guy", "Lydia Diane", "Dale", "Ronnie", "Walter",
"Daniel Wade", "Larry Randal"), Age = c("27", "24", "29", "59",
"44", "37", "35", "23", "22", "22", "29", "23", "33", "54", "40",
"22", "26", "45", "24", "40"), Race = c("Black", "Asian", "Caucasian",
"Caucasian", "Other", "Asian", "Black", "Black", "Black",
"Caucasian", "Black", "Caucasian", "Caucasian", "Other",
"Black", "Caucasian", "Asian", "Black", "Native American", "Caucasian"
), Sex = c("Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Female",
"Male", "Male", "Male", "Male", "Male"), State = c("Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama"), CIU = c(0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0), Guilty_Plea = c(1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IO = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Worst_Crime = c(6, 1,
1, 4, 4, 1, 2, 1, 1, 6, 1, 2, 4, 6, 3, 2, 2, 1, 1, 1), Occurred = c(1999,
1988, 1994, 2014, 1991, 1993, 2012, 1992, 1972, 1999, 1985, 1987,
1987, 1987, 1997, 1983, 1983, 1986, 1999, 1990), Convicted = c(2001,
1989, 1996, 2015, 1992, 1995, 2013, 1993, 1974, 2000, 1986, 1988,
1988, 1993, 2000, 1986, 1986, 1988, 2002, 1992), Exonerated = c(2003,
1990, 2015, 2015, 2001, 2001, 2014, 1997, 1991, 2002, 2015, 1999,
2000, 1998, 2006, 1998, 1998, 1993, 2009, 1997), Sentence = c("15",
"25", "Life", "Not sentenced", "20", "Death", "85", "20", "30",
"35", "Death", "Life", "25", "Probation", "Life without parole",
"35", "Life without parole", "Death", "Death", "Death"), Death_Penalty = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1), DNA_Only = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0), FC = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), MWID = c(0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0), F_MFE = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1), P_FA = c(1,
1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0), OM = c(1,
1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), ILD = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0), State_Statute = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y"), State_Claim_Made = c(0, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1 0), Zero_time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Prem = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Pending = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Denied = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), State_Award = c("0",
"0", "2", "0", "1", "0", "0", "0", "1", "0", "2", "0", "0", "0",
"0", "0", "0", "0", "0", "0"), Amount = c("0", "0", NA, "0",
"129041.88", "0", "0", "0", "1000000", "0", NA, "0", "0", "0",
"0", "0", "0", "0", "0", "0"), `Non-Statutory_Case_Filed` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0), No_Time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Unfiled = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1), Dismissed = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), Pending__1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Award = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), Premature = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Amount__1 = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "$ undisclosed", "0", "0"), Years_Lost = c(1.7,
0.1, 19.5, 0, 2.6, 5.7, 1.8, 4, 10.7, 1.5, 28.5, 10.6, 10.1,
0, 5.8, 11.4, 11.4, 4.5, 5.4, 5.5), State_Award2 = c("0", "0",
"0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
I think there is a clash between two requirements: making the bar plot stacked and, at the same time, dodged. My solution is probably not the best one and someone may do better, but this is what I have right now:
Preprocessing
library(tidyverse)

# Build one row per (race, action, claim) combination with its count `n`:
# lower-case the column names, keep race plus the two claim flags, stack the
# two flags into action/claim pairs and count them. `x` then numbers the
# race/action pairs in order of appearance so each pair gets its own bar.
dat <- jail %>%
  rename_all(tolower) %>%
  select(race, state_claim_made, non_statutory_case_filed) %>%
  gather(
    key = "action", value = "claim",
    state_claim_made, non_statutory_case_filed
  ) %>%
  count(race, action, claim) %>%
  mutate(
    action = ifelse(action == "state_claim_made", "state", "civil"),
    x = as.numeric(reorder(interaction(race, action), 1:n()))
  )
Output:
# # A tibble: 15 x 5
# race action claim n x
# <chr> <chr> <dbl> <int> <dbl>
# 1 Asian civil 0 3 1
# 2 Asian state 0 2 2
# 3 Asian state 1 1 2
# 4 Black civil 0 6 3
# 5 Black civil 1 1 3
# 6 Black state 0 3 4
# 7 Black state 1 4 4
# 8 Caucasian civil 0 7 5
# 9 Caucasian state 0 6 6
# 10 Caucasian state 1 1 6
# 11 Native American civil 1 1 7
# 12 Native American state 1 1 8
# 13 Other civil 0 2 9
# 14 Other state 0 1 10
# 15 Other state 1 1 10
Some necessary tweaks for x-axis labels:
Adapted from this answer:
# Tick positions: every bar position in `dat$x`, plus one extra tick half a
# unit above the first bar and then every <number of actions> units; the
# extra ticks carry the race names.
breaks <- local({
  bar_pos <- unique(dat$x)
  group_pos <- seq(
    from = min(dat$x) + .5,
    to = max(dat$x) + .5,
    by = length(unique(dat$action))
  )
  sort(c(bar_pos, group_pos))
})

# Three labels per race, in tick order: "civil", the race name pushed onto a
# second line, and "state".
labels <- unlist(
  lapply(unique(dat$race), function(race) {
    c("civil", paste0("\n", race), "state")
  })
)
Plot data
# Stacked bars (claim false/true) at the numeric x positions computed above;
# the custom breaks/labels turn the continuous axis into a grouped
# "civil | race | state" axis.
ggplot(dat, aes(x = x, y = n, fill = factor(claim))) +
  # TRUE rather than T: T is an ordinary variable and can be reassigned.
  geom_col(show.legend = TRUE) +
  ggthemes::theme_few() +
  scale_fill_manual(
    name = NULL,
    values = c("gray75", "gray25"),
    breaks = c("0", "1"),
    labels = c("false", "true")
  ) +
  scale_x_continuous(breaks = breaks, labels = labels) +
  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank()) +
  labs(title = "Jail Plot", y = "Count")
Data
The data you attached are corrupted: a comma (or a `$`) is missing somewhere in the table (I don't remember exactly which). Here are the same data, but without the variables we don't need to solve the problem.
structure(
list(Race = c("Black", "Asian", "Caucasian", "Caucasian", "Other", "Asian",
"Black", "Black", "Black", "Caucasian", "Black", "Caucasian",
"Caucasian", "Other", "Black", "Caucasian", "Asian", "Black",
"Native American", "Caucasian"),
State_Claim_Made = c(0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
0, 1, 0),
Non_Statutory_Case_Filed = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0)
),
row.names = c(NA, -20L),
class = c("tbl_df", "tbl", "data.frame")
)
For an assignment, I would like to see the number of subjects who have 0 for the variable CIU vs. 1 for CIU.
structure(list(Last_Name = c("Banks", "Beamon", "Dandridge",
"Deakle, Jr.", "Doyle", "Drinkard", "Ellis", "Embry", "Gaines",
"Gurley", "Hinton", "Holemon", "Holsomback", "Hunt", "Jones",
"Mahan", "Mahan", "McMillian", "Moore", "Padgett"), First_Name = c("Medell",
"Melvin Todd", "Beniah Alton", "Evan Lee", "Robert E.", "Gary",
"Andre", "Anthony", "Freddie Lee", "Timothy", "Anthony", "Jeffrey",
"John", "H. Guy", "Lydia Diane", "Dale", "Ronnie", "Walter",
"Daniel Wade", "Larry Randal"), Age = c("27", "24", "29", "59",
"44", "37", "35", "23", "22", "22", "29", "23", "33", "54", "40",
"22", "26", "45", "24", "40"), Race = c("Black", "Black", "Caucasian",
"Caucasian", "Caucasian", "Caucasian", "Black", "Black", "Black",
"Caucasian", "Black", "Caucasian", "Caucasian", "Caucasian",
"Black", "Caucasian", "Caucasian", "Black", "Caucasian", "Caucasian"
), Sex = c("Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Female",
"Male", "Male", "Male", "Male", "Male"), State = c("Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama"), CIU = c(0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0), Guilty_Plea = c(1, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IO = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Worst_Crime = c(6, 1,
1, 4, 4, 1, 2, 1, 1, 6, 1, 2, 4, 6, 3, 2, 2, 1, 1, 1), Occurred = c(1999,
1988, 1994, 2014, 1991, 1993, 2012, 1992, 1972, 1999, 1985, 1987,
1987, 1987, 1997, 1983, 1983, 1986, 1999, 1990), Convicted = c(2001,
1989, 1996, 2015, 1992, 1995, 2013, 1993, 1974, 2000, 1986, 1988,
1988, 1993, 2000, 1986, 1986, 1988, 2002, 1992), Exonerated = c(2003,
1990, 2015, 2015, 2001, 2001, 2014, 1997, 1991, 2002, 2015, 1999,
2000, 1998, 2006, 1998, 1998, 1993, 2009, 1997), Sentence = c("15",
"25", "Life", "Not sentenced", "20", "Death", "85", "20", "30",
"35", "Death", "Life", "25", "Probation", "Life without parole",
"35", "Life without parole", "Death", "Death", "Death"), Death_Penalty = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1), DNA_Only = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0), FC = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), MWID = c(0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0), F_MFE = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1), P_FA = c(1,
1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0), OM = c(1,
1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), ILD = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0), State_Statute = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y"), State_Claim_Made = c(0, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Zero_time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Prem = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Pending = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Denied = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), State_Award = c("0",
"0", "2", "0", "1", "0", "0", "0", "1", "0", "2", "0", "0", "0",
"0", "0", "0", "0", "0", "0"), Amount = c("0", "0", NA, "0",
"129041.88", "0", "0", "0", "1000000", "0", NA, "0", "0", "0",
"0", "0", "0", "0", "0", "0"), `Non-Statutory_Case_Filed` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0), No_Time = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Unfiled = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1), Dismissed = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), Pending__1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Award = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), Premature = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Amount__1 = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "$ undisclosed", "0", "0"), Years_Lost = c(1.7,
0.1, 19.5, 0, 2.6, 5.7, 1.8, 4, 10.7, 1.5, 28.5, 10.6, 10.1,
0, 5.8, 11.4, 11.4, 4.5, 5.4, 5.5), State_Award2 = c("0", "0",
"0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Using the dplyr package, I accomplished this much:
# Number of subjects for each value of CIU, stored in a column named `count`.
CUI <- jail %>%
  count(CIU, name = "count")
Now I would like to create a table showing the percentage of each group within the "State_Claim_Made" category, but I am unsure what to do from here. In the end I would like to see the percentage of CIU=0 that have State_Claim_Made=0 vs. State_Claim_Made=1, and the same for CIU=1; a 2x2 table of sorts. I would also prefer to continue using the dplyr package, but that is not necessary.
Your example doesn't really let us see the full picture, so let
# Toy data: 20 rows with CIU = 0 followed by 30 rows with CIU = 1; the first
# 15 rows have State_Claim_Made = 1 and the remaining 35 have 0.
df <- data.frame(
  CIU = c(rep(0L, 20), rep(1L, 30)),
  State_Claim_Made = c(rep(1L, 15), rep(0L, 35))
)
Then
# 2x2 table of counts: rows are CIU, columns are State_Claim_Made.
with(df, table(CIU, State_Claim_Made))
#    State_Claim_Made
# CIU  0  1
#   0  5 15
#   1 30  0
# Same table as row proportions: each count divided by its CIU row total, so
# every row sums to 1.
prop.table(
  table(CIU = df$CIU, State_Claim_Made = df$State_Claim_Made),
  margin = 1
)
#    State_Claim_Made
# CIU    0    1
#   0 0.25 0.75
#   1 1.00 0.00
Using base R you can just use the table command:
# Cross-tabulate CIU (rows) against State_Claim_Made (columns).
table(data[["CIU"]], data[["State_Claim_Made"]])
Output:
0 1
0 15 5
If your data include rows with CIU = 1, the output will be the 2x2 table you need.
I would like to find which column contains the highest number of 1s. The number 1 should appear only once per row. Once the column with the highest number of 1s has been located, the script should also check the neighboring columns (+1 / -1), and if either of them contains a 1 it should be selected as well. All of the corresponding rows should be kept using the subset function.
Let's put part of original data:
# Example 1: four replicate profiles; column `506` holds the most 1s.
structure(list( `10` = c(0, 0, 0, 0), `34` = c(0, 0, 0, 0),
`59` = c(0, 0, 0, 0), `84` = c(0, 0, 0, 0),
`110` = c(0, 0, 0, 0), `134` = c(0, 0, 0, 0),
`165` = c(0, 0, 0, 0), `199` = c(0, 0, 0, 0),
`234` = c(0, 0, 0, 0),
`257` = c(0.0160178986200301, 0, 0.0409772658686249, 0.0289710439505515),
`362` = c(0.0679054515644214, 0.126933274414494, 0.0855598028367368, 0.0596214721268868),
`433` = c(0.490914059297718, 0.604765061128296, 0.813348757670254, 1),
`506` = c(1, 1, 1, 0.971410482822965),
`581` = c(0.198244295668807, 0.234158197083517, 0.269655970224324, 0.195318383259472),
`652` = c(0.271177756524115, 0.223018854028576, 0.301352982597324, 0.142584385725234),
`733` = c(0.212426561005602, 0.212778023272942, 0.228513228045468, 0),
`818` = c(0.213816778248395, 0.168570481661511, 0.264465345538678, 0),
`896` = c(0.137102063123377, 0, 0.320234382858867, 0),
`972` = c(0.108932231179123, 0, 0.179106729705261, 0),
`1039` = c(0.101762535865555, 0, 0, 0),
EOD = c("Peter", "Peter", "Peter", "Peter"),
Complex = c("FT team", "FT team", "FT team", "FT team")),
.Names = c("10", "34", "59", "84", "110", "134", "165", "199",
"234", "257", "362", "433", "506", "581", "652", "733",
"818", "896", "972", "1039", "EOD", "Complex"),
row.names = c("Peter_1_Rep_1_E", "Peter_1_Rep_2_E",
"Peter_1_Rep_3_E", "Peter_1_Rep_4_E"),
class = "data.frame")
As you can clearly see, in the original data column 506 should be selected as the one containing the highest number of 1s, and the data should be subsetted based on it. However, the output would be exactly the same as the input, because in this data the neighboring fraction (-1, i.e. 433) also contains a 1. That's an easy example.
Situation might be more complicated, like in that case:
structure(list( `10` = c(0, 0, 0, 0, 0, 0, 0, 0),
`34` = c(0, 0, 0, 0, 0, 0, 0, 0),
`59` = c(0, 0, 0, 0, 0, 0, 0, 0),
`84` = c(0, 0, 0, 0, 0, 0, 0, 0),
`110` = c(0, 0, 0, 0, 0, 0, 0, 0),
`134` = c(0.168783347110543, 0, 0.382618775924215, 0, 0.530638724516877, 0, 0.169526042048202, 0),
`165` = c(1, 0.36380544964196, 1, 0.13979454361738, 1, 0.239652477288689, 1, 0.240341578327444),
`199` = c(0.355158938904336, 1, 0.646724265971128, 1, 0.582637073151552, 1, 0.20319390520841, 1),
`234` = c(0.0963628165627114, 0.575436312346942, 0.229853828180188, 0.433555069046817, 0.247567185011894, 0.508529485059242, 0.138356164383562, 0.389880251276011),
`257` = c(0, 0.17393595585728, 0, 0.127787133715056, 0, 0.117147323350173, 0, 0),
`362` = c(0, 0, 0, 0.0919333108790839, 0, 0, 0, 0),
`433` = c(0, 0, 0, 0.0745570899292691, 0, 0, 0, 0),
`506` = c(0, 0, 0, 0, 0, 0, 0, 0),
`581` = c(0, 0, 0, 0, 0, 0, 0, 0),
`652` = c(0, 0, 0, 0, 0, 0, 0, 0),
`733` = c(0, 0, 0, 0, 0, 0, 0, 0),
`818` = c(0, 0, 0, 0, 0, 0, 0, 0),
`896` = c(0, 0, 0, 0, 0, 0, 0, 0),
`972` = c(0, 0, 0, 0, 0, 0, 0, 0),
`1039` = c(0, 0, 0, 0, 0, 0, 0, 0),
EOD = c("Paul", "Paul", "Paul", "Paul", "Paul", "Paul", "Paul", "Paul"),
Complex = c("GG Team", "GG Team", "GG Team", "GG Team", "GG Team", "GG Team", "GG Team", "GG Team")),
.Names = c("10", "34", "59", "84", "110", "134", "165", "199", "234", "257", "362", "433", "506", "581", "652", "733", "818", "896", "972", "1039", "EOD", "Complex"),
row.names = c("PaulG_1_Rep_1_E", "Paul_1_Rep_1_E", "PaulN_1_Rep_2_E", "PaulG_1_Rep_2_E", "Paul_1_Rep_3_E", "PaulC_1_Rep_3_E", "PaulC_1_Rep_4_E", "Paul_1_Rep_4_E"),
class = "data.frame")
In that situation there are two columns which contain the same number of 1s. In this case column with bigger colsum should be selected.
let df1 be your input:
# Keep only the numeric columns to build the filter. drop = FALSE keeps a data
# frame even if there is a single numeric column.
df_num <- df1[, vapply(df1, is.numeric, logical(1)), drop = FALSE]
n1 <- colSums(df_num == 1)  # number of 1s per column
i <- which(n1 == max(n1))   # index of the column(s) with the most 1s
if (length(i) > 1) {
  # Tie on the number of 1s: take the column with the largest column sum.
  max_cs <- which.max(colSums(df_num[, i, drop = FALSE]))
  i <- i[max_cs]  # our column index
}
# Chosen column plus its left/right neighbours, clamped to valid column
# positions (the lower bound is 1, not 0).
neighbours <- seq(max(i - 1, 1), min(i + 1, ncol(df_num)))
# TRUE for rows with a 1 in the chosen column or in either neighbour.
filter <- rowSums(df_num[, neighbours, drop = FALSE] == 1) > 0
df1[filter, ]  # your result
In both of your examples, all rows are kept
I'd use the tidyverse to convert it to long format then pull in the column sums to determine where the first one (with the largest sum) is:
library(tidyverse)

# add rownames to the data frame
df2$id <- rownames(df2)

# names of the numeric (profile) columns; EOD, Complex and id are left out so
# that `val` below stays numeric instead of being coerced to character
numcols <- names(df2)[map_lgl(df2, is.numeric)]

# make a data frame of each column's sum
thecolsums <- colSums(df2[, numcols, drop = FALSE]) %>%
  enframe(name = "colname", value = "colsum")

# change the numeric columns to long format: one row per (rowid, colname)
dflong <- df2[, numcols, drop = FALSE] %>%
  mutate(rowid = row_number()) %>%
  gather(colname, val, -rowid)

# which column has the most 1 values; ties are broken by the larger column sum
whichcol <- dflong %>%
  filter(val == 1) %>%
  group_by(colname) %>%
  summarize(
    n_ones = n(),
    firstone = min(rowid, na.rm = TRUE)
  ) %>%
  left_join(thecolsums, by = "colname") %>%
  filter(n_ones == max(n_ones)) %>%
  filter(colsum == max(colsum)) %>%
  pluck("colname", 1)
# fail early if no column contains a 1 at all
stopifnot(length(whichcol) == 1L)

# what's the numerical index of the column
whichcolindex <- which(names(df2) == whichcol)

# get previous and next columns if they exist (0 means "no such column")
prevcolindex <- if (whichcolindex > 1) whichcolindex - 1 else 0
nextcolindex <- if (whichcolindex < ncol(df2)) whichcolindex + 1 else 0

# do the previous and next columns have 1s in them?
col_has_one <- function(idx) {
  idx > 0 && any(df2[[idx]] == 1, na.rm = TRUE)
}
prevcolhasone <- col_has_one(prevcolindex)
nextcolhasone <- col_has_one(nextcolindex)

# create a vector with 1, 2 or 3 column indexes
finalindex <- c(
  prevcolindex[prevcolhasone],
  whichcolindex,
  nextcolindex[nextcolhasone]
)

# subset the original data frame, only preserving the columns in question
# (drop = FALSE keeps a data frame even when a single column is selected)
results <- df2[, finalindex, drop = FALSE]
I have a data frame like that one below:
> dput(data)
structure(list(`28` = c(0, 0, 0, 0, 0, 0), `38` = c(0, 0, 0,
0, 0, 0), `45` = c(0, 0, 0, 0, 0, 0), `53` = c(0, 0, 0, 0, 0,
0), `60` = c(0, 0, 0, 0, 0, 0), `78` = c(0, 0, 0, 0, 0, 0), `116` = c(0,
0, 0, 0, 0, 0.983309489747258), `145` = c(0, 0, 0, 0, 0, 1),
`189` = c(0, 1, 0.560384508734634, 0, 0, 0.875695437927198
), `223` = c(0, 0.988158197286733, 1, 0, 0, 0.492500108379937
), `281` = c(1, 0.677856978615774, 0.448525741750624, 0,
0.362088745790311, 0.180474270603026), `362` = c(0.79151704397606,
0.763278914693033, 0.35864682503004, 1, 1, 0.114178985852806
), `440` = c(0.662841530054645, 0.818636468153598, 0.448488769756909,
0, 0.448447503793346, 0), `524` = c(0, 0.638192687974247,
0, 0, 0, 0), `634` = c(0, 0, 0, 0, 0, 0), `759` = c(0, 0,
0, 0, 0, 0), `848` = c(0, 0, 0, 0, 0, 0), `979` = c(0, 0,
0, 0, 0, 0), `1120` = c(0, 0, 0, 0, 0, 0), `1248` = c(0,
0, 0, 0, 0, 0)), .Names = c("28", "38", "45", "53", "60",
"78", "116", "145", "189", "223", "281", "362", "440", "524",
"634", "759", "848", "979", "1120", "1248"), row.names = c("Mark",
"Gregg", "Tim", "Oscar", "Tom", "Matthew"
), class = "data.frame")
I would like to calculate the Euclidean distance between each profile in this data and a reference profile; Tim should be used as the reference. The results can be stored in an additional column.
Mark to Tim
Gregg to Tim
Oscar to Tim
and etc
You can use the dist function (which actually computes the distances between all pairs of profiles):
# Pairwise Euclidean distances between all rows (profiles) of DF, expanded to
# a full square matrix so any pair can be looked up by row name.
m <- as.matrix(DF)
profile_dist <- dist(m, method = "euclidean")
distances <- as.matrix(profile_dist)
> distances['Mark','Tim']
[1] 1.36069
> distances['Gregg','Tim']
[1] 0.9767401
> distances['Oscar','Tim']
[1] 1.458658