Subset of values for geom_smooth() wrapped in a function - r

I've been unable to make my function work into R
Here are my test data:
df.summary <- structure(list(sample = structure(c(1L, 11L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L), .Label = c("P1",
"P10", "P11", "P12", "P13", "P14", "P15", "P16", "P18", "P19",
"P2", "P20", "P3", "P4", "P5", "P6", "P7", "P8", "P9"), class = "factor"),
my_col1 = c(0.18933457306591, 0.235931461802108, 0.189103550993512,
0.125949595916727, 0.0534753960389538, 0.147040309859083,
0.0911609796692189, 0.175136203125972, 0.116254981602728,
0.133480302179393, 0.109994771038499, 0.149204159468607,
0.105682126016057, 0.0967607072540045, 0.172893104456964,
0.115091434919033, 0.0653509609616037, 0.113300972345115,
0.0801326785643683), my_col2 = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("F", "M"), class = "factor"), my_col3 = c(0,
0, 0, 20.9715009722175, 13.3519208510716, 24.0257081096482,
19.2584928826721, 0, 0, 22.3923771843906, 16.6293335002717,
26.5622107372171, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-19L))
library(ggplot2)
## read data in
## df.summary <- read.csv('data_test.csv',header = TRUE,sep=';', check.names = FALSE)
plot_correlation <- function(my_df, my_col1, my_col3, my_col2, output) {
my_df[, my_col1] <- my_df[, my_col1] * 100
lm_plot <- ggplot(my_df, aes(my_col1, my_col3)) +
geom_point(data = my_df, aes(colour = my_col2), size = 2.5) +
scale_color_manual(values=c("violetred1", "royalblue1", "gold")) +
labs(x = "", y = "") +
geom_abline(intercept = 0, slope = 1,linetype="dotted") +
geom_smooth(data=subset(my_df, my_col2 == "M"),method="lm", color="royalblue1")
my_output <- output
ggsave(filename=my_output, plot=lm_plot,width = 9, height = 9, pointsize = 10)
}
plot_correlation(df.summary,'my_col1','my_col3','my_col2','test_outfig.pdf')
this code is giving me this plot:
When this code:
df.summary[,my_col1] <- df.summary[,my_col1]*100
ggplot(df.summary, aes(my_col1,my_col3)) +
geom_point(data = df.summary, aes(colour = my_col2), size = 2.5) +
scale_color_manual(values=c("violetred1", "royalblue1", "gold")) +
labs(x = "", y = "") +
geom_abline(intercept = 0, slope = 1,linetype="dotted") +
geom_smooth(data=subset(df.summary, my_col2 == "M"), method="lm", color="royalblue1")
Is giving me this plot (which is giving me exactly what I want):
It's looks like (maybe I'm wrong) inside the function, R is unable to link my col names and I can't figure out which is the right syntax ...

Replace aes with aes_string. Your code may somewhat work because the variable name (my_col1 etc) is exactly the variable value ("my_col1" etc). Since you want to specify column names using function arguments you'll need to either use tidyeval or use aes_string, which takes string values rather than unquoted symbols.
Also, there's no reason to copy output to my_output in the function body.
library("ggplot2")
df.summary <- structure(list(sample = structure(c(1L, 11L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L), .Label = c("P1",
"P10", "P11", "P12", "P13", "P14", "P15", "P16", "P18", "P19",
"P2", "P20", "P3", "P4", "P5", "P6", "P7", "P8", "P9"), class = "factor"),
my_col1 = c(0.18933457306591, 0.235931461802108, 0.189103550993512,
0.125949595916727, 0.0534753960389538, 0.147040309859083,
0.0911609796692189, 0.175136203125972, 0.116254981602728,
0.133480302179393, 0.109994771038499, 0.149204159468607,
0.105682126016057, 0.0967607072540045, 0.172893104456964,
0.115091434919033, 0.0653509609616037, 0.113300972345115,
0.0801326785643683), my_col2 = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("F", "M"), class = "factor"), my_col3 = c(0,
0, 0, 20.9715009722175, 13.3519208510716, 24.0257081096482,
19.2584928826721, 0, 0, 22.3923771843906, 16.6293335002717,
26.5622107372171, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-19L))
plot_correlation <- function(my_df, my_col1, my_col3, my_col2) {
my_df[, my_col1] <- my_df[, my_col1] * 100
ggplot(my_df, aes_string(my_col1, my_col3)) +
geom_point(data = my_df, aes(colour = my_col2), size = 2.5) +
scale_color_manual(values=c("violetred1", "royalblue1", "gold")) +
labs(x = "", y = "") +
geom_abline(intercept = 0, slope = 1,linetype="dotted") +
geom_smooth(data=subset(my_df, my_col2 == "M"),method="lm", color="royalblue1")
}
plot_correlation(df.summary,'my_col1','my_col3','my_col2')
Created on 2019-12-16 by the reprex package (v0.3.0)

Related

ggplot 'race track' plot with polar_coord + points

I'm struggling to get polar_coords to work as I had hoped. I want each item to be represented by a coloured track, with a range of 1:50000. I then wanted to plot points over these tracks at the corresponding locations, with symbols representing the different categories. The points would then be annotated with the id.
Dataframe:
structure(list(item = structure(c(1L, 2L, 2L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L), .Label = c("AA", "AB", "AC", "AD", "AE",
"BA", "BB", "BC", "BD", "BE"), class = "factor"), location = c(10045L,
12041L, 15035L, 22054L, 19023L, 49411L, 39012L, 3041L, 23065L,
33015L, 42069L, 26859L), category = structure(c(1L, 1L, 2L, 3L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L), .Label = c("X", "Y", "Z"), class = "factor"),
id = structure(c(1L, 8L, 2L, 7L, 6L, 10L, 5L, 1L, 1L, 3L,
4L, 9L), .Label = c("Apple", "Banana", "Cherry", "Grape",
"Mango", "Melon", "Orange", "Pear", "Raspberry", "Strawberry"
), class = "factor")), .Names = c("item", "location", "category",
"id"), class = "data.frame", row.names = c(NA, -12L))
my_data %>%
ggplot(aes(item, location, shape = category, label = id)) +
geom_col(aes(y = Inf), fill = "gray80") +
geom_point(size = 3) +
geom_text(vjust = -1) +
scale_x_discrete(expand = expand_scale(add = c(5,0))) +
coord_polar(theta = "y") +
theme_void()
If you want a break in the middle, you could change the item to a numeric value relating to it's desired position:
my_data %>%
mutate(item_pos = as.numeric(item),
item_pos = item_pos + if_else(item_pos > 5, 1, 0)) %>%
ggplot(aes(item_pos, location, shape = category, label = id)) +
...
Maybe you can work from this:
ggplot(data,aes(x=location, color=id, y=id)) +
geom_linerange(aes(y=id, xmin=0, xmax=50000, color=category), size=2, alpha=0.5) +
geom_point(size=3) +
coord_polar()

Change color for specific data in ggplot2

I have 15 measurement points and i defined "renkler" color palette for them. I want to change the color of 2 (red: DEF-2 and DEF-13 points in the ps_no column) in these 15.
My codes are
library(ggplot2)
library(reshape)
dat <- read.delim("a.txt")
dat$Date <- as.Date(dat$Date,"%d/%m/%Y")
# order
dat$parameter <- factor(dat$parameter, levels = c("DEF-2", "DEF-13"))
dat$ps_no <- factor(dat$ps_no, levels = c("DEF-2", "PS.584", "PS.585", "PS.586", "PS.603", "PS.630", "DEF-13", "PS.600", "PS.667", "PS.690", "PS.714", "PS.734", "PS.754", "PS.811", "PS.813"))
# create own color palette
library(RColorBrewer)
renkler = c(brewer.pal(name="Set2", n = 7), brewer.pal(name="Set2", n = 8))
# Setup plot without facets
p <- ggplot(data = dat, aes(x = Date, y = value)) +
geom_line(aes(color = ps_no)) +
geom_point(aes(color = ps_no)) +
scale_color_manual(values = renkler) + # oluşturduğumuz paleti yüklemek için
scale_x_date(date_breaks = "1 months",date_labels = "%Y-%m",
limits = as.Date.character(c("01/12/2017","31/12/2018"),
format = "%d/%m/%Y")) +
ylab("[mm/year]") +
xlab("") +
facet_grid(parameter ~ .) +
theme_bw()
p + theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
)
and the data output with dput(dat):
structure(list(parameter = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("DEF-13",
"DEF-2"), class = "factor"), ps_no = structure(c(3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 8L, 8L, 8L, 2L,
2L, 2L, 6L, 6L, 6L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L,
12L, 12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 1L,
1L, 1L), .Label = c("DEF-13", "DEF-2", "PS.584", "PS.585", "PS.586",
"PS.600", "PS.603", "PS.630", "PS.667", "PS.690", "PS.714", "PS.734",
"PS.754", "PS.811", "PS.813"), class = "factor"), Date = structure(c(17534,
17546, 17870, 17882, 17534, 17546, 17870, 17882, 17534, 17546,
17870, 17882, 17534, 17546, 17882, 17534, 17546, 17882, 17536,
17557, 17879, 17534, 17546, 17882, 17534, 17546, 17882, 17534,
17546, 17882, 17534, 17546, 17882, 17534, 17546, 17882, 17534,
17546, 17882, 17534, 17546, 17882, 17534, 17546, 17882, 17536,
17549, 17886), class = "Date"), value = c(0, 1.23684, -12.15729097,
-11.4102363, 0, 2.45200798, 1.12950398, -2.76779102, 0, 0.924571,
-7.1917482, -6.2764626, 0, -4.0725265, 0.4847485, 0, 0.290382,
-6.098794, 0, 0.813289109, -0.426076522, 0, 1.7502, -5.139665,
0, -29.67012, -14.956098, 0, 12.8852143, 7.4377433, 0, 1.404183,
-12.426633, 0, -24.09551, -7.619493, 0, -4.194441, -16.258703,
0, -0.835691, -10.504454, 0, 1.311699, 6.30102, 0, -1.49366556,
-1.835284539)), row.names = c(NA, -48L), class = "data.frame")
And also I need to change legend tittle (ps_no) and the texts on the right side of plots (DEF-2 and DEF-13).
Thank you.
Edit:
I filter the data which I want to show different color with using filter command. After filter command, I add a command line for geom_line and another command line for geom_point. It is working in the plot. But this is not the answer literally because the colors in the legend do not change.
So this the the new version of codes:
library(ggplot2)
library(reshape)
dat <- read.delim("aroundDEF.txt")
dat$Date <- as.Date(dat$Date,"%d/%m/%Y")
# order
dat$parameter <- factor(dat$parameter, levels = c("DEF-2", "DEF-13"))
dat$ps_no <- factor(dat$ps_no, levels = c("DEF-2", "PS.584", "PS.585", "PS.586", "PS.603", "PS.630", "DEF-13", "PS.600", "PS.667", "PS.690", "PS.714", "PS.734", "PS.754", "PS.811", "PS.813"))
# create own color palette
library(RColorBrewer)
renkler = c(brewer.pal(name="Set2", n = 7), brewer.pal(name="Set2", n = 8))
geom_line(aes(color = ps_no)) +
geom_line(data=highlight_df, aes(color = ps_no), color='#da0018') +
geom_point(aes(color = ps_no)) +
geom_point(data=highlight_df, aes(color = ps_no), color='#da0018') +
# filter dataframe to get data to be highligheted
highlight_df <- dat %>%
filter(ps_no=="DEF-2" | ps_no=="DEF-13")
# Setup plot without facets
p <- ggplot(data = dat, aes(x = Date, y = value)) +
scale_color_manual(values = renkler) +
scale_x_date(date_breaks = "1 months",date_labels = "%Y-%m",
limits = as.Date.character(c("01/12/2017","31/12/2018"),
format = "%d/%m/%Y")) +
ylab("[mm/year]") +
xlab("") +
facet_grid(parameter ~ .
, labeller = as_labeller( c("DEF-2" = "DEF-2 and around", "DEF-13" = "DEF-13 and around"))) +
theme_bw()
p + theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
)
In short, still I need an answer...
After renkler variable:
renkler[1]= "#DA0018"
renkler[7]= "#DA0018"
For the legend title:
scale_color_manual(values = renkler, name="new name")

Multiple vertical shaded area

I am plotting the proportion of deep sleep (y axis) vs days (x axis). I would like to add vertical shaded area for a better understanding (e.g. grey for week-ends, orange for sick period...).
I have tried using geom_ribbon (I created a variable taking the value of 30, with is the top of my y axis if the data is during the WE - information given in another column), but instead of getting rectangles, I get trapezes.
In another post, someone proposed the use of "geom_rect", or "annotate" if one's know the x and y coordinates, but I don't see how to adapt it in my case, when I want to have the colored area repeated to all week-end (it is not exactly every 7 days because some data are missing).
Do you have any idea ?
Many thanks in advance !
ggplot(Sleep.data, aes(x = DATEID)) +
geom_line(aes(y = P.DEEP, group = 1), col = "deepskyblue3") +
geom_point(aes(y = P.DEEP, group = 1, col = Sign.deep)) +
guides(col=FALSE) +
geom_ribbon(aes(ymin = min, ymax = max.WE), fill = '#6495ED80') +
facet_grid(MONTH~.) +
geom_hline(yintercept = 15, col = "forestgreen") +
geom_hline(yintercept = 20, col = "forestgreen", linetype = "dashed") +
geom_vline(xintercept = c(7,14,21,28), col = "grey") +
scale_x_continuous(breaks=seq(0,28,7)) +
scale_y_continuous(breaks=seq(0,30,5)) +
labs(x = "Days",y="Proportion of deep sleep stage", title = "Deep sleep")
Proportion of deep sleep vs time
Head(Sleep.data)
> dput(head(Sleep.data))
structure(list(DATE = structure(c(1L, 4L, 7L, 10L, 13L, 16L), .Label = c("01-Dec-17",
"01-Feb-18", "01-Jan-18", "02-Dec-17", "02-Feb-18", "02-Jan-18",
"03-Dec-17", "03-Feb-18", "03-Jan-18", "04-Dec-17", "04-Feb-18",
"04-Jan-18", "05-Dec-17", "05-Feb-18", "05-Jan-18", "06-Dec-17",
"06-Feb-18", "06-Jan-18", "07-Dec-17", "07-Feb-18", "07-Jan-18",
"08-Dec-17", "08-Jan-18", "09-Dec-17", "09-Feb-18", "09-Jan-18",
"10-Dec-17", "10-Jan-18", "11-Dec-17", "11-Feb-18", "11-Jan-18",
"12-Dec-17", "12-Jan-18", "13-Dec-17", "13-Feb-18", "13-Jan-18",
"14-Dec-17", "14-Feb-18", "14-Jan-18", "15-Dec-17", "15-Jan-18",
"16-Dec-17", "16-Jan-18", "17-Dec-17", "17-Jan-18", "18-Dec-17",
"18-Jan-18", "19-Dec-17", "19-Jan-18", "20-Dec-17", "21-Dec-17",
"21-Jan-18", "22-Dec-17", "22-Jan-18", "23-Dec-17", "23-Jan-18",
"24-Dec-17", "24-Jan-18", "25-Dec-17", "25-Jan-18", "26-Dec-17",
"26-Jan-18", "27-Dec-17", "27-Jan-18", "28-Dec-17", "28-Jan-18",
"29-Dec-17", "29-Jan-18", "30-Dec-17", "30-Jan-18", "31-Dec-17",
"31-Jan-18"), class = "factor"), DATEID = 1:6, MONTH = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Decembre", "Janvier", "Février"
), class = "factor"), DURATION = c(8.08, 7.43, 6.85, 6.23, 7.27,
6.62), D.DEEP = c(1.67, 1.37, 1.62, 1.75, 1.95, 0.9), P.DEEP = c(17L,
17L, 21L, 24L, 25L, 12L), STIMS = c(0L, 0L, 0L, 0L, 390L, 147L
), D.REM = c(1.7, 0.95, 0.95, 1.43, 1.47, 0.72), P.REM = c(17L,
11L, 12L, 20L, 19L, 9L), D.LIGHT = c(4.7, 5.12, 4.27, 3.05, 3.83,
4.98), P.LIGHT = c(49L, 63L, 55L, 43L, 49L, 66L), D.AWAKE = c(1.45,
0.58, 0.47, 0.87, 0.37, 0.85), P.AWAKE = c(15L, 7L, 6L, 12L,
4L, 11L), WAKE.UP = c(-2L, 0L, 2L, -1L, 3L, 1L), AGITATION = c(-1L,
-3L, -1L, -2L, 2L, -1L), FRAGMENTATION = c(1L, -2L, 2L, 1L, 0L,
-1L), PERIOD = structure(c(3L, 3L, 4L, 4L, 4L, 4L), .Label = c("HOLIDAYS",
"SICK", "WE", "WORK"), class = "factor"), SPORT = structure(c(2L,
1L, 2L, 2L, 2L, 1L), .Label = c("", "Day", "Evening"), class = "factor"),
ACTIVITY = structure(c(6L, 1L, 3L, 4L, 5L, 1L), .Label = c("",
"Bkool", "eBike", "Gym", "Natation", "Run"), class = "factor"),
TABLETS = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5), Ratio = c(1.15,
2.36, 3.45, 2.01, 5.27, 1.06), Sign = structure(c(2L, 2L,
2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
Sign.ratio = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), Sign.deep = structure(c(2L, 2L,
2L, 2L, 2L, 1L), .Label = c("0", "1"), class = "factor"),
Sign.awake = structure(c(1L, 2L, 2L, 1L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), Sign.light = structure(c(2L, 1L,
1L, 2L, 2L, 1L), .Label = c("0", "1"), class = "factor"),
index = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), min = c(0, 0, 0, 0, 0, 0), max.WE = c(30,
30, 0, 0, 0, 0)), .Names = c("DATE", "DATEID", "MONTH", "DURATION",
"D.DEEP", "P.DEEP", "STIMS", "D.REM", "P.REM", "D.LIGHT", "P.LIGHT",
"D.AWAKE", "P.AWAKE", "WAKE.UP", "AGITATION", "FRAGMENTATION",
"PERIOD", "SPORT", "ACTIVITY", "TABLETS", "Ratio", "Sign", "Sign.ratio",
"Sign.deep", "Sign.awake", "Sign.light", "index", "min", "max.WE"
), row.names = c(NA, 6L), class = "data.frame")
Thanks for adding the data, that makes it easier to understand exactly what you're working with and to confirm that an answer actually addresses your question.
I thought it would be helpful to make a separate table with just the start and end of each contiguous set of rows with the same PERIOD. I did this using dplyr::case_when, assuming we should mark dates as a "start" if they are the first row in the table (row_number() == 1), or they have a different PERIOD value than the prior row. I mark dates as an "end" if they are the last row of the table, or have a different PERIOD than the next row. I only keep the starts and ends, and spread these into new columns called start and end.
library(tidyverse)
Period_ranges <- Sleep.data %>%
mutate(period_status = case_when(row_number() == 1 ~ "start",
PERIOD != lag(PERIOD) ~ "start",
row_number() == n() ~ "end",
PERIOD != lead(PERIOD) ~ "end",
TRUE ~ "other")) %>%
filter(period_status %in% c("start", "end")) %>%
select(DATEID, PERIOD, period_status) %>%
mutate(PERIOD_NUM = cumsum(PERIOD != lag(PERIOD) | row_number() == 1)) %>%
spread(period_status, DATEID)
# Output based on sample data only. If there's a problem with the full data, please add more. To share full data, use `dput(Sleep.data)` or to share 20 rows use `dput(head(Sleep.data, 20))`.
>Period_ranges
PERIOD PERIOD_NUM end start
1 WE 1 2 1
2 WORK 2 6 3
We can now use that in the plot. If you want to toggle the inclusion or fiddle with the appearance separately of different PERIOD types, you could modify the code below with Period_ranges %>% filter(PERIOD == "WE"),
ggplot(Sleep.data, aes(x = DATEID)) +
# Here I specify that this geom should use its own data.
# I start the rectangles half a day before and end half a day after to fill the space.
geom_rect(data = Period_ranges, inherit.aes = F,
aes(xmin = start - 0.5, xmax = end + 0.5,
ymin = 0, ymax = 30,
fill = PERIOD), alpha = 0.5) +
# Here we can specify the shading color for each type of PERIOD
scale_fill_manual(values = c(
"WE" = '#6495ED80',
"WORK" = "gray60"
)) +
# rest of your code
Chart based on data sample:

ggplot change color of one bar from stacked bar chart

Is there way to change colors of one bar( x - value) manualy in ggplot
data
for_plot_test=structure(list(name = c("A", "B",
"C", "A1", "A2", "A3",
"A4", "BI", "A", "B",
"C", "A1", "A2", "A3",
"A4", "BI"), n = c(1L, 3L, 5L, 7L, 9L, 11L, 13L, 15L, 2L, 4L, 6L, 8L, 10L, 12L, 14L, 16L),
value = c(0, 0.05, 0, 0.05, 0.05, 0.1, 0.05, 0, 1, 0.7, 0.6, 0.5, 0.4, 0.2, 0.2, 0.1),
variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("PROGRESS", "prev_progress"), class = "factor")),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
row.names = c(NA, -16L), vars = "name", labels = structure(list(name = c("Applications", "BI", "Clients", "CRE & Scoring", "Portfolio & Production", "SG Russia", "Transactions", "УКЛ & Prescoring")),
row.names = c(NA, -8L), class = "data.frame", vars = "name", drop = TRUE,
indices = list(0:1, 14:15, 6:7, 10:11, 2:3, 12:13, 8:9, 4:5),
group_sizes = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
biggest_group_size = 2L, .Names = "name"),
indices = list(c(0L, 8L), c(7L, 15L), c(3L, 11L), c(5L, 13L), c(1L, 9L), c(6L, 14L), c(4L, 12L), c(2L, 10L)),
drop = TRUE, group_sizes = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), biggest_group_size = 2L,
.Names = c("name", "n", "value", "variable"))
Current plot
colot_progress=c("#be877a","#dcbfad")
s <- ggplot(for_plot_test, aes(x= reorder(name, -n),y = value, fill = variable,label=ifelse(for_plot$value==0,"",scales::percent(for_plot$value))))+
geom_bar(stat='identity',position = "stack")+
scale_fill_manual(values=colot_progress,aesthetics = "fill")+
coord_flip()+
theme_minimal() + theme(
axis.title = element_blank(),
axis.text.x=element_blank(),
panel.grid = element_blank(),
legend.position="none"
)+
geom_text(size = 5, position = position_stack(vjust = 0.5))
s
Illustration of desire result
Creating another level for the column variable.
library(dplyr)
for_plot_test1 <-
for_plot_test %>%
group_by(name) %>%
summarise(n = n()) %>%
mutate(value = ifelse(name == "A", 1, 0), variable = "dummy") %>%
full_join(for_plot_test %>% mutate(value = replace(value, name == "A", 0)))
for_plot_test1$variable <- factor(for_plot_test1$variable,
levels = c("dummy", "PROGRESS", "prev_progress"))
colot_progress <- c("limegreen", "#be877a", "#dcbfad")
s <- ggplot(for_plot_test1,
aes(
x = reorder(name,-n),
y = value,
fill = variable,
label = ifelse(value == 0, "", scales::percent(value))
)) +
geom_bar(stat = 'identity', position = "stack") +
scale_fill_manual(values = colot_progress, aesthetics = "fill") +
coord_flip() +
theme_minimal() + theme(
axis.title = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank(),
legend.position = "none"
) +
geom_text(size = 5, position = position_stack(vjust = 0.5))
s

visualize associations between two groups of data

Where each datapoint has a pairing of A and B and there multiple entries in A and multiple entires in B. IE multiple syndromes and multiple diagnoses, although for each datapoint there is one single syndrome-diagnoses pair.
Examples, suggestions, or ideas much appreciated
here's what the data is like. And I want to see connections between values of A and B (how many GG's are linked to TTs etc). Both are nominal datatypes.
ID,A ,B
1,GG,TT
2,AA,SS
3,BB,XX
4,DD,SS
5,DD,TT
6,CC,XX
7,HH,ZZ
8,AA,TT
9,CC,RR
10,DD,ZZ
11,AA,XX
12,AA,TT
13,DD,SS
14,DD,XX
15,AA,YY
16,CC,ZZ
17,FF,SS
18,FF,XX
19,BB,VV
20,GG,VV
21,GG,SS
22,AA,RR
23,AA,TT
24,AA,SS
25,CC,VV
26,CC,TT
27,FF,RR
28,GG,UU
29,CC,TT
30,BB,ZZ
31,II,TT
32,FF,RR
33,BB,SS
34,GG,YY
35,FF,RR
36,BB,VV
37,II,RR
38,CC,YY
39,FF,VV
40,AA,XX
41,AA,ZZ
42,GG,VV
43,BB,UU
44,II,UU
45,II,SS
46,DD,SS
47,AA,UU
48,BB,VV
49,GG,TT
50,BB,TT
Since your data is bipartite, I would suggest plotting points in the first factor on one side, points in the other factor on the other, with lines between them, like so:
The code I used to generate this was:
## Make up data.
data <- data.frame(X1=sample(state.region, 10),
X2=sample(state.region, 10))
## Set up plot window.
plot(0, xlim=c(0,1), ylim=c(0,1),
type="n", axes=FALSE, xlab="", ylab="")
factor.to.int <- function(f) {
(as.integer(f) - 1) / (length(levels(f)) - 1)
}
segments(factor.to.int(data$X1), 0, factor.to.int(data$X2), 1,
col=data$X1)
axis(1, at = seq(0, 1, by = 1 / (length(levels(data$X1)) - 1)),
labels = levels(data$X1))
axis(3, at = seq(0, 1, by = 1 / (length(levels(data$X2)) - 1)),
labels = levels(data$X2))
This is what I do. A darker colour indicates a more important combination of A and B.
dataset <- data.frame(A = sample(LETTERS[1:5], 200, prob = runif(5), replace = TRUE), B = sample(LETTERS[1:5], 200, prob = runif(5), replace = TRUE))
Counts <- as.data.frame(with(dataset, table(A, B)))
library(ggplot2)
ggplot(Counts, aes(x = A, y = B, fill = Freq)) + geom_tile() + scale_fill_gradient(low = "white", high = "black")
Or if you prefer lines
library(ggplot2)
dataset <- data.frame(A = sample(letters[1:5], 200, prob = runif(5), replace = TRUE), B = sample(letters[1:5], 200, prob = runif(5), replace = TRUE))
Counts <- as.data.frame(with(dataset, table(A, B)))
Counts$X <- 0
Counts$Xend <- 1
Counts$Y <- as.numeric(Counts$A)
Counts$Yend <- as.numeric(Counts$B)
ggplot(Counts, aes(x = X, xend = Xend, y = Y, yend = Yend, size = Freq)) +
geom_segment() + scale_x_continuous(breaks = 0:1, labels = c("A", "B")) +
scale_y_continuous(breaks = 1:5, labels = letters[1:5])
This third options add labels to the data points using geom_text().
library(ggplot2)
dataset <- data.frame(
A = sample(letters[1:5], 200, prob = runif(5), replace = TRUE),
B = sample(LETTERS[20:26], 200, prob = runif(7), replace = TRUE)
)
Counts <- as.data.frame(with(dataset, table(A, B)))
Counts$X <- 0
Counts$Xend <- 1
Counts$Y <- as.numeric(Counts$A)
Counts$Yend <- as.numeric(Counts$B)
ggplot(Counts, aes(x = X, xend = Xend, y = Y, yend = Yend)) +
geom_segment(aes(size = Freq)) +
scale_x_continuous(breaks = 0:1, labels = c("A", "B")) +
scale_y_continuous(breaks = -1) +
geom_text(aes(x = X, y = Y, label = A), colour = "red", size = 7, hjust = 1, vjust = 1) +
geom_text(aes(x = Xend, y = Yend, label = B), colour = "red", size = 7, hjust = 0, vjust = 0)
Maybe mosaicplot:
X <- structure(list(
ID = 1:50,
A = structure(c(6L, 1L, 2L, 4L, 4L, 3L, 7L, 1L, 3L, 4L, 1L, 1L, 4L, 4L, 1L, 3L, 5L, 5L, 2L, 6L, 6L, 1L, 1L, 1L, 3L, 3L, 5L, 6L, 3L, 2L, 8L, 5L, 2L, 6L, 5L, 2L, 8L, 3L, 5L, 1L, 1L, 6L, 2L, 8L, 8L, 4L, 1L, 2L, 6L, 2L), .Label = c("AA","BB", "CC", "DD", "FF", "GG", "HH", "II"), class = "factor"),
B = structure(c(3L, 2L, 6L, 2L, 3L, 6L, 8L, 3L, 1L, 8L, 6L, 3L, 2L, 6L, 7L, 8L, 2L, 6L, 5L, 5L, 2L, 1L, 3L, 2L, 5L, 3L, 1L, 4L, 3L, 8L, 3L, 1L, 2L, 7L, 1L, 5L, 1L, 7L, 5L, 6L, 8L, 5L, 4L, 4L, 2L, 2L, 4L, 5L, 3L, 3L), .Label = c("RR", "SS", "TT", "UU", "VV", "XX", "YY", "ZZ"), class = "factor")
), .Names = c("ID", "A", "B"), class = "data.frame", row.names = c(NA, -50L)
)
mosaicplot(with(X,table(A,B)))
For you example dataset:
Thanks! I think that the connectivity between elements in each class is best visualized by the link graph examples given by both Jonathon and Thierry. Thierry's 2nd which shows the magnitude is definitely where i will start.
update
thanks everyone for you ideas and tips!
I came acrossthe bipartite package that has functions to visualize this kind of data. I think its a clean visualization of the relationships I am trying to show.
did:
library(bipartite)
dataset <- data.frame(
A = sample(letters[1:5], 200, prob = runif(5), replace = TRUE),
B = sample(LETTERS[20:26], 200, prob = runif(7), replace = TRUE)
)
datamat <- as.matrix(table(dataset$A, dataset$B))
visweb(datamat, text = "interaction", textsize = .8)
giving:
visweb result
couldnt put image in as a new user :(

Resources