how to show average, median, and stdv in a histogram - r

How would I go about showing the average, median, and standard deviation of the data in these histograms?
Here is my histogram code:
hist(PRE$Productivity...Productivité, main = "PRE", xlab = "Productivity")
hist(DBN$Productivity...Productivité, main = "DBN", xlab = "Productivity")
hist(DBG$Productivity...Productivité, main = "DBG", xlab = "Productivity")
hist(POST$Productivity...Productivité, main = "POST", xlab = "Productivity")
And here is its output:
dput(head(DBN))
structure(list(Participant.Code = c("AE1_02", "AE1_02", "AE1_02",
"AE1_02", "AE2_08", "AE2_08"), Condition = structure(c(5L, 5L,
5L, 5L, 5L, 5L), levels = c("", "DBG", "DBG DBN", "DBG POST",
"DBN", "DBN DBG", "DBN POST", "POST", "PRE", "PRE DBG", "PRE DBN"
), class = "factor"), Start.time = c("3-9-22 8:39:27", "3-9-22 16:27:44",
"3-10-22 8:48:34", "3-10-22 16:09:33", "3-18-22 8:36:15", "3-18-22 17:26:13"
), Stiffness...Raideur = c(7L, 7L, 7L, 7L, 4L, 4L), Fatigue...Fatigue = c(7L,
8L, 8L, 8L, 4L, 6L), Discomfort...Inconfort = c(7L, 7L, 7L, 7L,
3L, 6L), Happiness...Joie = c(8L, 8L, 8L, 8L, 6L, 5L), Productivity...Productivité = c(6L,
8L, 7L, 7L, 5L, 4L), Ability.to.concentrate...Capacité.de.se.concentrer = c(7L,
8L, 7L, 6L, 5L, 4L), Alertness...Vigilance = c(7L, 8L, 7L, 6L,
5L, 5L), Stress...Stress = c(6L, 8L, 7L, 6L, 5L, 5L), Back.Pain...Mal.de.dos = c(8L,
7L, 8L, 8L, 3L, 4L), Neck.Pain...Douleur.au.cou = c(5L, 4L, 7L,
7L, 3L, 4L), Head.Pain...Mal.de.tête = c(1L, 1L, 1L, 1L, 2L,
4L), Eye.Pain...Douleur.oculaire = c(1L, 1L, 1L, 1L, 3L, 4L)), row.names = c(17L,
18L, 21L, 22L, 57L, 58L), class = "data.frame")

You can use the function abline immediately after the histogram call to add a vertical line intersecting the x axis. In this case, I am drawing a line to show the location of the mean in each dataset; the median can be marked the same way with abline(v = median(x)). Then, to display the value, you can add it directly as a text label or put it into a legend. I am adding some padding to the ylim so the legend or label doesn't overlap with the title. Finally, to arrange the plots in a grid similar to the one you want, you can prepare the panel layout using the function par():
# Simulated stand-ins for the Productivity columns of each condition.
n <- 10000
example_a <- rgamma(n, 5, 2)
example_b <- rnorm(n, 5, 2)
example_c <- rbeta(n, 5, 2)

# Height of the tallest bar of each histogram, computed without drawing
# anything, so ylim can be padded to leave room for the legend/label.
max_a <- max(hist(example_a, plot = FALSE)$counts)
max_b <- max(hist(example_b, plot = FALSE)$counts)
max_c <- max(hist(example_c, plot = FALSE)$counts)

mean_a <- mean(example_a)
mean_b <- mean(example_b)
mean_c <- mean(example_c)

par(mfrow = c(2, 2)) # 2 x 2 grid of panels (three of them are used below)

# Panel 1: mean shown as a dashed line, value reported in a legend.
hist(example_a, main = "PRE", xlab = "Productivity",
     col = "slategray1", border = "gray",
     ylim = c(0, max_a + 200))
abline(v = mean_a, col = "darkred", lwd = 3, lty = 2)
legend("topright", legend = c("Mean", round(mean_a, 3)),
       lwd = c(3, NA), lty = c(2, NA), col = c("darkred", NA))

# Panel 2: mean shown as a dashed line, value written as a text label.
# The label height is derived from the tallest bar so it always lands inside
# the padded area, whatever the sample size or distribution.
hist(example_b, main = "DBN", xlab = "Productivity",
     col = "slategray1", border = "gray",
     ylim = c(0, max_b + 200))
abline(v = mean_b, col = "forestgreen", lwd = 3, lty = 2)
text(x = mean_b - 2, y = max_b + 100, paste("Mean = ", round(mean_b, 3)))

# Panel 3: as panel 1, with the legend on the left; the legend colour matches
# the colour of the line it describes.
hist(example_c, main = "DBG", xlab = "Productivity",
     col = "slategray1", border = "gray",
     ylim = c(0, max_c + 200))
abline(v = mean_c, col = "purple4", lwd = 3, lty = 2)
legend("topleft", legend = c("Mean", round(mean_c, 3)),
       lwd = c(3, NA), lty = c(2, NA), col = c("purple4", NA))
And that gives us the following plots:

Related

R circos plot calculating percent of total for each category

My data has two variables, TRV and TRJ, and I am seeing how often they match with each other. I would like to place on a circos figure what percent each variable shows up in total on the outer layer. It should add up to 200%, 100% for TRV, 100% for TRJ.
library(circlize)
library(plyr)
df <- structure(list(TRV = structure(c(1L, 1L, 1L, 2L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L), .Label = c("TRAV29/DV5", "TRAV36/DV7",
"TRDV1", "TRDV2", "TRDV3", "TRGV8", "TRGV9"), class = "factor"),
TRJ = structure(c(64L, 65L, 67L, 64L, 64L, 65L, 66L, 67L,
64L, 65L, 66L, 64L, 65L, 66L), .Label = c("", "mTRAJ22",
"mTRAJ30", "mTRAJ34", "mTRAJ37", "mTRAJ45", "mTRAJ49", "mTRBJ1-1",
"mTRBJ2-5", "mTRDJ1", "mTRDJ2", "mTRGJ1", "mTRGJ4", "TRAJ10",
"TRAJ15", "TRAJ16", "TRAJ19", "TRAJ2", "TRAJ20", "TRAJ21",
"TRAJ22", "TRAJ23", "TRAJ24", "TRAJ26", "TRAJ27", "TRAJ30",
"TRAJ32", "TRAJ34", "TRAJ36", "TRAJ37", "TRAJ38", "TRAJ39",
"TRAJ40", "TRAJ41", "TRAJ42", "TRAJ43", "TRAJ44", "TRAJ45",
"TRAJ49", "TRAJ5", "TRAJ50", "TRAJ52", "TRAJ53", "TRAJ54",
"TRAJ56", "TRAJ57", "TRAJ58", "TRAJ6", "TRAJ7", "TRAJ8",
"TRBJ1-1", "TRBJ1-2", "TRBJ1-3", "TRBJ1-4", "TRBJ1-5", "TRBJ1-6",
"TRBJ2-1", "TRBJ2-2", "TRBJ2-3", "TRBJ2-4", "TRBJ2-5", "TRBJ2-6",
"TRBJ2-7", "TRDJ1", "TRDJ2", "TRDJ3", "TRDJ4", "TRGJ1", "TRGJ2",
"TRGJP", "TRGJP1", "TRGJP2"), class = "factor"), freq = c(387L,
3L, 1L, 1L, 3533L, 445L, 132L, 55L, 563L, 15L, 5L, 830L,
4L, 72L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame")
grid.col = c(`TRAV29/DV5` = "red", TRDV3 = "green", TRDV2 = "blue", TRDV1 = "purple", `TRAV36/DV7` = "pink",
TRDJ4 = "orange", TRDJ1 = "palegreen", TRDJ2 = "lightsteelblue", TRDJ3 = "thistle", TRGJP = "yellow", TRGJ2 = "grey", TRGJP2 = "brown", TRGJ1 = "lightpink")
# Draw a chord diagram for `df` on an enlarged canvas, then label every sector
# with its name and add an axis to it. Sector colours come from the
# `grid.col` vector defined outside this function.
circ_plot <- function(df){
  canvas_range <- c(-1.5, 1.5)
  circos.par(canvas.xlim = canvas_range, canvas.ylim = canvas_range)

  unit_widths <- matrix(1, nrow = nrow(df), ncol = ncol(df))
  chordDiagram(
    df,
    annotationTrack = "grid",
    grid.col = grid.col,
    link.lwd = unit_widths,
    link.border = "black"
  )

  # Tick positions 0, 0.5, ..., 5 used on every sector's axis.
  tick_positions <- seq(0, 5, by = 0.5)

  for (sector in get.all.sector.index()) {
    x_range <- get.cell.meta.data("xlim", sector.index = sector, track.index = 1)
    y_range <- get.cell.meta.data("ylim", sector.index = sector, track.index = 1)

    # Sector name, placed at the middle of the sector's x range.
    circos.text(
      mean(x_range), y_range[1], sector,
      sector.index = sector, track.index = 1,
      facing = "clockwise",
      cex = 0.8,
      adj = c(-1, 0),
      niceFacing = TRUE
    )

    circos.axis(
      h = 0,
      major.at = tick_positions,
      labels.cex = 0.2,
      labels.facing = "inside",
      sector.index = sector, track.index = 1
    )
  }

  # Reset the circular layout parameters for the next plot.
  circos.clear()
}
circ_plot(df)

Standard column width in facetted and grouped ggplot bar plot

I've made a bar chart using ggplot with grouped data, and facetted with facet_grid. The column widths are inconsistent, so I want to make them all the same. I've read this can be done with preserve="single", but it seems to mess up the position dodging. Any idea how to prevent this from happening?
Here is a small sample of the data:
data <- structure(list(grp2 = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
7L, 7L, 7L, 7L, 7L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 7L, 7L, 7L, 7L,
7L), .Label = c("CSF1", "CSF2", "PC", "NC", "GPC", "GNC", "standard"
), class = "factor"), label2 = structure(c(7L, 8L, 9L, 7L, 8L,
9L, 7L, 15L, 15L, 15L, 15L, 15L, 7L, 8L, 9L, 7L, 8L, 9L, 7L,
15L, 15L, 15L, 15L, 15L), .Label = c("CSF1_raw", "CSF1_supernatant",
"CSF1_pellet", "CSF2_raw", "CSF2_supernatant", "CSF2_pellet",
"PC_raw", "PC_supernatant", "PC_pellet", "NC_raw", "NC_supernatant",
"NC_pellet", "GPC", "GNC", "standard", "NC"), class = "factor"),
mda_label = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 1L, 1L, 1L, 1L, 1L
), .Label = c("none", "mda_20", "mda_200"), class = "factor"),
conc = c(`7` = 0, `8` = 0, `9` = 0.324886127298521, `55` = 4.14765656994934,
`56` = 1.16840050032707, `57` = 8.33529714053568, `76` = 10.6220645144775,
`77` = 48.9241552191721, `78` = 4.51513315624087, `79` = 1.03887911533275,
`80` = 0.0445944796011582, `81` = 0.00484116548901831, `89` = 0,
`90` = 0, `91` = 0.322922569348207, `137` = 6.38488684568018,
`138` = 1.68909814271646, `139` = 7.61828609738757, `158` = 15.3082130743032,
`159` = 41.3127531345335, `160` = 4.64193087683391, `161` = 0.411672491030815,
`162` = 0.0568193835425769, `163` = 0.00439419098560105)), row.names = c(NA,
-24L), class = c("tbl_df", "tbl", "data.frame"))
Here's the initial plot:
ggplot(data, aes(x=label2, y=conc, colour=mda_label, fill=mda_label)) +
facet_grid(. ~ grp2, scales="free_x", space="free") +
stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
stat_summary(fun.data = mean_se, geom = "errorbar", colour="black", width=0.5,
position = position_dodge(width=0.9)) +
geom_point(position = position_dodge(width=0.9), pch=21, colour="black") +
scale_y_continuous(trans='pseudo_log',
labels = scales::number_format(accuracy=0.01)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
But when I try to standardise the column widths with preserve="single", it gets messed up:
ggplot(data, aes(x=label2, y=conc, colour=mda_label, fill=mda_label)) +
facet_grid(. ~ grp2, scales="free_x", space="free") +
stat_summary(fun = mean, geom = "bar", position = position_dodge(preserve="single")) +
stat_summary(fun.data = mean_se, geom = "errorbar", colour="black", width=0.5,
position = position_dodge(width=0.9, preserve="single")) +
geom_point(position = position_dodge(width=0.9, preserve="single"), pch=21, colour="black") +
scale_y_continuous(trans='pseudo_log',
labels = scales::number_format(accuracy=0.01)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Since you're using data that has 0 values, you could add rows with a value of 0 for the other 'mda_label' levels in the "standard" grp2/label2 category.
data <- rbind(data, data.frame(grp2 = c("standard", "standard"),
label2 = c("standard", "standard"),
mda_label = c("mda_20", "mda_200"),
conc = c(0, 0)))
Also you never actually make the bar plot
# Dodged bar chart of conc per label2, one facet per grp2. The data frame is
# passed straight to ggplot(), so no pipe operator (and no extra package
# providing one) is required.
ggplot(data, aes(label2, conc, fill = mda_label)) +
  geom_col(position = position_dodge(width = 1)) +
  facet_grid(. ~ grp2, scales = "free", space = "free")

Subset of values for geom_smooth() wrapped in a function

I've been unable to make my function work in R.
Here are my test data:
df.summary <- structure(list(sample = structure(c(1L, 11L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L), .Label = c("P1",
"P10", "P11", "P12", "P13", "P14", "P15", "P16", "P18", "P19",
"P2", "P20", "P3", "P4", "P5", "P6", "P7", "P8", "P9"), class = "factor"),
my_col1 = c(0.18933457306591, 0.235931461802108, 0.189103550993512,
0.125949595916727, 0.0534753960389538, 0.147040309859083,
0.0911609796692189, 0.175136203125972, 0.116254981602728,
0.133480302179393, 0.109994771038499, 0.149204159468607,
0.105682126016057, 0.0967607072540045, 0.172893104456964,
0.115091434919033, 0.0653509609616037, 0.113300972345115,
0.0801326785643683), my_col2 = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("F", "M"), class = "factor"), my_col3 = c(0,
0, 0, 20.9715009722175, 13.3519208510716, 24.0257081096482,
19.2584928826721, 0, 0, 22.3923771843906, 16.6293335002717,
26.5622107372171, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-19L))
library(ggplot2)
## read data in
## df.summary <- read.csv('data_test.csv',header = TRUE,sep=';', check.names = FALSE)
# Build a scatter plot from `my_df` (points coloured by a grouping column, a
# dotted y = x reference line, and a linear fit on the rows where the group
# is "M") and save it to a file with ggsave().
#
# Arguments:
#   my_df    data frame holding the columns to plot
#   my_col1  column name (string); that column is multiplied by 100 first
#   my_col3  column name (string)
#   my_col2  column name (string)
#   output   file name handed to ggsave()
plot_correlation <- function(my_df, my_col1, my_col3, my_col2, output) {
# Here the argument is used as a string, so the column it names is rescaled.
my_df[, my_col1] <- my_df[, my_col1] * 100
# NOTE(review): aes() and subset() below receive the bare symbols
# my_col1 / my_col2 / my_col3 rather than the strings held in those arguments;
# presumably this only picks the intended columns when the columns are
# literally named that way -- confirm.
lm_plot <- ggplot(my_df, aes(my_col1, my_col3)) +
geom_point(data = my_df, aes(colour = my_col2), size = 2.5) +
scale_color_manual(values=c("violetred1", "royalblue1", "gold")) +
labs(x = "", y = "") +
geom_abline(intercept = 0, slope = 1,linetype="dotted") +
geom_smooth(data=subset(my_df, my_col2 == "M"),method="lm", color="royalblue1")
# Write the finished plot to the requested file.
my_output <- output
ggsave(filename=my_output, plot=lm_plot,width = 9, height = 9, pointsize = 10)
}
plot_correlation(df.summary,'my_col1','my_col3','my_col2','test_outfig.pdf')
this code is giving me this plot:
When this code:
df.summary[,my_col1] <- df.summary[,my_col1]*100
ggplot(df.summary, aes(my_col1,my_col3)) +
geom_point(data = df.summary, aes(colour = my_col2), size = 2.5) +
scale_color_manual(values=c("violetred1", "royalblue1", "gold")) +
labs(x = "", y = "") +
geom_abline(intercept = 0, slope = 1,linetype="dotted") +
geom_smooth(data=subset(df.summary, my_col2 == "M"), method="lm", color="royalblue1")
Is giving me this plot (which is giving me exactly what I want):
It looks like (maybe I'm wrong) R is unable to link my column names inside the function, and I can't figure out the right syntax ...
Replace aes with aes_string. Your code may somewhat work because the variable name (my_col1 etc) is exactly the variable value ("my_col1" etc). Since you want to specify column names using function arguments you'll need to either use tidyeval or use aes_string, which takes string values rather than unquoted symbols.
Also, there's no reason to copy output to my_output in the function body.
library("ggplot2")
df.summary <- structure(list(sample = structure(c(1L, 11L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L), .Label = c("P1",
"P10", "P11", "P12", "P13", "P14", "P15", "P16", "P18", "P19",
"P2", "P20", "P3", "P4", "P5", "P6", "P7", "P8", "P9"), class = "factor"),
my_col1 = c(0.18933457306591, 0.235931461802108, 0.189103550993512,
0.125949595916727, 0.0534753960389538, 0.147040309859083,
0.0911609796692189, 0.175136203125972, 0.116254981602728,
0.133480302179393, 0.109994771038499, 0.149204159468607,
0.105682126016057, 0.0967607072540045, 0.172893104456964,
0.115091434919033, 0.0653509609616037, 0.113300972345115,
0.0801326785643683), my_col2 = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("F", "M"), class = "factor"), my_col3 = c(0,
0, 0, 20.9715009722175, 13.3519208510716, 24.0257081096482,
19.2584928826721, 0, 0, 22.3923771843906, 16.6293335002717,
26.5622107372171, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-19L))
# Scatter plot of one column against another, coloured by a grouping column,
# with a dotted y = x reference line and a linear fit on the "M" group only.
#
# Arguments:
#   my_df    data frame holding the columns to plot
#   my_col1  name (string) of the x column; it is multiplied by 100 first
#   my_col3  name (string) of the y column
#   my_col2  name (string) of the grouping column
#
# Returns the ggplot object.
#
# Every column is looked up through the string passed in, so the function
# works whatever the columns are called. aes_string() parses its arguments as
# R expressions, so non-syntactic column names would need backticks; newer
# ggplot2 releases recommend aes(.data[[name]]) for the same purpose.
plot_correlation <- function(my_df, my_col1, my_col3, my_col2) {
  my_df[[my_col1]] <- my_df[[my_col1]] * 100

  # Rows used for the fit; %in% leaves out rows whose group is NA, as
  # subset() does.
  fit_rows <- my_df[my_df[[my_col2]] %in% "M", , drop = FALSE]

  ggplot(my_df, aes_string(my_col1, my_col3)) +
    geom_point(aes_string(colour = my_col2), size = 2.5) +
    scale_color_manual(values = c("violetred1", "royalblue1", "gold")) +
    labs(x = "", y = "") +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    geom_smooth(data = fit_rows, method = "lm", color = "royalblue1")
}
plot_correlation(df.summary,'my_col1','my_col3','my_col2')
Created on 2019-12-16 by the reprex package (v0.3.0)

ggplot2: axis line at 0 with negative datapoints

I am doing boxplot with positive and negative data and would like to have the axis at y=0.
Adding a line afterwards is not elegant, since the line would be on top of the boxes and not behind them.
(the goal is to have the line at y=0 black, while the lines at 1 and -1 should be gray)
In addition I would like to have only the axis lines. I therefore used
axis.line=element_line()
,panel.border = element_blank()
in the theme. however the vertical line goes above 1, which does not look good (my data is by definition between -1 and 1).
here is the code:
require (ggplot2)
# Custom theme: theme_bw() with selected elements replaced (larger titles, no
# x-axis title, grey horizontal major grid lines only, axis lines instead of
# a panel border, no legend).
theme_jack <- function (base_size = 10, base_family = "") {
  base_theme <- theme_bw(base_size = base_size, base_family = base_family)

  overrides <- theme(
    title = element_text(size = 12),
    # x-axis title made invisible: white text at size 0
    axis.title.x = element_text(colour = "white", size = 0),
    axis.title.y = element_text(size = 12, angle = 90),
    axis.text.x = element_text(size = 8),
    panel.grid.major = element_line(colour = "grey"),
    # draw axis lines in place of the panel border
    axis.line = element_line(),
    panel.border = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )

  base_theme %+replace% overrides
}
theme_set(theme_jack())
datatest2=structure(list(datatest2.genotype = structure(c(1L, 1L, 5L, 5L,
1L, 5L, 5L, 5L, 1L, 5L, 1L, 5L, 1L, 5L, 5L, 1L, 1L, 1L, 5L, 5L,
1L, 1L, 1L, 5L, 1L, 1L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L), .Label = c("CS_d42-chG80", "CS_G_c380cha", "CS_Gd42-chG80",
"PKC_CS", "PKC_d42-chG80", "PKC_G_c380cha", "PKC_Gd42-chG80"), class = "factor"),
datatest2.score = c(0.8882, -0.3775, -0.4053, 0.1962, 0.9982,
0.5627, -0.4865, 0.7267, 0.3276, 0.5017, 0.9731, 0.1525,
0.7857, 0.6121, 0.8508, 0.1311, -0.2457, 0.8848, -0.1254,
0.1047, -0.2715, 0.7189, 0.4115, 0.9704, -0.8328, -0.1301,
0.9756, 0.2317, 0.4297, 0.9967, 0.6423, 0.8516, 0.3386, 0.5208,
0.9148, 0.2539, 0.8581, 0.5621, 0.5969, 0.7435)), .Names = c("genotype",
"score"), row.names = c(NA, 40L), class = "data.frame")
p=ggplot(datatest2, aes(x=factor(genotype),y= score))
plot=p+ geom_boxplot()+ labs(x="genotype",y="PI during final test")+
scale_fill_grey(start = 0.9, end = 0.9)+ ##allow good bw prints
scale_y_continuous(minor_breaks=NULL,breaks = seq(-1 , 1, 1) )
plot
Try the expand argument to `scale_y_continuous`. Replace the final section with this and you should get something closer to what you want. You may have to experiment with the values below.
p = ggplot(datatest2, aes(x = factor(genotype), y = score))
plot = p+ geom_boxplot()+ labs(x = "genotype",
y = "PI during final test")+
scale_fill_grey(start = 0.9, end = 0.9)+ ##allow good bw prints
scale_y_continuous(expand = c(0,0.005),
minor_breaks = NULL,
breaks = seq(-1 , 1, 1) ) +
theme()
plot

visualize associations between two groups of data

Each datapoint has a pairing of A and B, and there are multiple entries in A and multiple entries in B, i.e. multiple syndromes and multiple diagnoses, although for each datapoint there is one single syndrome-diagnosis pair.
Examples, suggestions, or ideas much appreciated
here's what the data is like. And I want to see connections between values of A and B (how many GG's are linked to TTs etc). Both are nominal datatypes.
ID,A ,B
1,GG,TT
2,AA,SS
3,BB,XX
4,DD,SS
5,DD,TT
6,CC,XX
7,HH,ZZ
8,AA,TT
9,CC,RR
10,DD,ZZ
11,AA,XX
12,AA,TT
13,DD,SS
14,DD,XX
15,AA,YY
16,CC,ZZ
17,FF,SS
18,FF,XX
19,BB,VV
20,GG,VV
21,GG,SS
22,AA,RR
23,AA,TT
24,AA,SS
25,CC,VV
26,CC,TT
27,FF,RR
28,GG,UU
29,CC,TT
30,BB,ZZ
31,II,TT
32,FF,RR
33,BB,SS
34,GG,YY
35,FF,RR
36,BB,VV
37,II,RR
38,CC,YY
39,FF,VV
40,AA,XX
41,AA,ZZ
42,GG,VV
43,BB,UU
44,II,UU
45,II,SS
46,DD,SS
47,AA,UU
48,BB,VV
49,GG,TT
50,BB,TT
Since your data is bipartite, I would suggest plotting points in the first factor on one side, points in the other factor on the other, with lines between them, like so:
The code I used to generate this was:
## Make up data.
data <- data.frame(X1=sample(state.region, 10),
X2=sample(state.region, 10))
## Set up plot window.
plot(0, xlim=c(0,1), ylim=c(0,1),
type="n", axes=FALSE, xlab="", ylab="")
# Map the levels of factor `f` to evenly spaced positions on [0, 1]:
# the first level goes to 0 and the last level to 1.
#
# A factor with fewer than two levels has no spread to scale by (the original
# formula would divide by zero and give NaN), so its values are placed at the
# centre, 0.5. NA entries stay NA.
#
# Returns a numeric vector the same length as `f`.
factor.to.int <- function(f) {
  n_levels <- length(levels(f))
  if (n_levels < 2L) {
    centred <- rep(0.5, length(f))
    centred[is.na(f)] <- NA
    return(centred)
  }
  (as.integer(f) - 1) / (n_levels - 1)
}
segments(factor.to.int(data$X1), 0, factor.to.int(data$X2), 1,
col=data$X1)
axis(1, at = seq(0, 1, by = 1 / (length(levels(data$X1)) - 1)),
labels = levels(data$X1))
axis(3, at = seq(0, 1, by = 1 / (length(levels(data$X2)) - 1)),
labels = levels(data$X2))
This is what I do. A darker colour indicates a more important combination of A and B.
dataset <- data.frame(A = sample(LETTERS[1:5], 200, prob = runif(5), replace = TRUE), B = sample(LETTERS[1:5], 200, prob = runif(5), replace = TRUE))
Counts <- as.data.frame(with(dataset, table(A, B)))
library(ggplot2)
ggplot(Counts, aes(x = A, y = B, fill = Freq)) + geom_tile() + scale_fill_gradient(low = "white", high = "black")
Or if you prefer lines
library(ggplot2)
dataset <- data.frame(
  A = sample(letters[1:5], 200, prob = runif(5), replace = TRUE),
  B = sample(letters[1:5], 200, prob = runif(5), replace = TRUE)
)
# One row per A/B combination with its number of occurrences.
Counts <- as.data.frame(with(dataset, table(A, B)))
# table() also lists combinations that never occur (Freq == 0). Drop them,
# otherwise they are drawn as thin segments and suggest links that do not
# exist in the data.
Counts <- Counts[Counts$Freq > 0, ]
# Each segment runs from the A side (x = 0) to the B side (x = 1); the
# vertical positions are the integer codes of the factor levels.
Counts$X <- 0
Counts$Xend <- 1
Counts$Y <- as.numeric(Counts$A)
Counts$Yend <- as.numeric(Counts$B)
ggplot(Counts, aes(x = X, xend = Xend, y = Y, yend = Yend, size = Freq)) +
  geom_segment() +
  scale_x_continuous(breaks = 0:1, labels = c("A", "B")) +
  scale_y_continuous(breaks = 1:5, labels = letters[1:5])
This third option adds labels to the data points using geom_text().
library(ggplot2)
dataset <- data.frame(
  A = sample(letters[1:5], 200, prob = runif(5), replace = TRUE),
  B = sample(LETTERS[20:26], 200, prob = runif(7), replace = TRUE)
)
# One row per A/B combination with its number of occurrences.
Counts <- as.data.frame(with(dataset, table(A, B)))
# table() also lists combinations that never occur (Freq == 0). Drop them,
# otherwise they are drawn as thin segments and suggest links that do not
# exist in the data.
Counts <- Counts[Counts$Freq > 0, ]
# Each segment runs from the A side (x = 0) to the B side (x = 1); the
# vertical positions are the integer codes of the factor levels.
Counts$X <- 0
Counts$Xend <- 1
Counts$Y <- as.numeric(Counts$A)
Counts$Yend <- as.numeric(Counts$B)
ggplot(Counts, aes(x = X, xend = Xend, y = Y, yend = Yend)) +
  geom_segment(aes(size = Freq)) +
  scale_x_continuous(breaks = 0:1, labels = c("A", "B")) +
  # breaks = -1 lies outside the data, so no y-axis ticks are shown; the
  # level names are written next to the segment ends instead.
  scale_y_continuous(breaks = -1) +
  geom_text(aes(x = X, y = Y, label = A), colour = "red", size = 7,
            hjust = 1, vjust = 1) +
  geom_text(aes(x = Xend, y = Yend, label = B), colour = "red", size = 7,
            hjust = 0, vjust = 0)
Maybe mosaicplot:
X <- structure(list(
ID = 1:50,
A = structure(c(6L, 1L, 2L, 4L, 4L, 3L, 7L, 1L, 3L, 4L, 1L, 1L, 4L, 4L, 1L, 3L, 5L, 5L, 2L, 6L, 6L, 1L, 1L, 1L, 3L, 3L, 5L, 6L, 3L, 2L, 8L, 5L, 2L, 6L, 5L, 2L, 8L, 3L, 5L, 1L, 1L, 6L, 2L, 8L, 8L, 4L, 1L, 2L, 6L, 2L), .Label = c("AA","BB", "CC", "DD", "FF", "GG", "HH", "II"), class = "factor"),
B = structure(c(3L, 2L, 6L, 2L, 3L, 6L, 8L, 3L, 1L, 8L, 6L, 3L, 2L, 6L, 7L, 8L, 2L, 6L, 5L, 5L, 2L, 1L, 3L, 2L, 5L, 3L, 1L, 4L, 3L, 8L, 3L, 1L, 2L, 7L, 1L, 5L, 1L, 7L, 5L, 6L, 8L, 5L, 4L, 4L, 2L, 2L, 4L, 5L, 3L, 3L), .Label = c("RR", "SS", "TT", "UU", "VV", "XX", "YY", "ZZ"), class = "factor")
), .Names = c("ID", "A", "B"), class = "data.frame", row.names = c(NA, -50L)
)
mosaicplot(with(X,table(A,B)))
For your example dataset:
Thanks! I think that the connectivity between elements in each class is best visualized by the link graph examples given by both Jonathon and Thierry. Thierry's 2nd, which shows the magnitude, is definitely where I will start.
update
Thanks, everyone, for your ideas and tips!
I came across the bipartite package, which has functions to visualize this kind of data. I think it's a clean visualization of the relationships I am trying to show.
did:
library(bipartite)
dataset <- data.frame(
A = sample(letters[1:5], 200, prob = runif(5), replace = TRUE),
B = sample(LETTERS[20:26], 200, prob = runif(7), replace = TRUE)
)
datamat <- as.matrix(table(dataset$A, dataset$B))
visweb(datamat, text = "interaction", textsize = .8)
giving:
visweb result
couldnt put image in as a new user :(

Resources