R code of scatter plot for four variables - r

I tried plotting ASB vs YOI for each Child grouped by Race
I got something like:
library(tidyverse)
Antisocial <- structure(list(Child = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L), ASB = c(1L, 1L, 1L, 0L, 0L, 0L, 5L, 5L, 5L, 2L), Race = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Y92 = c(0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L), Y94 = c(0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L), YOI = c(90L, 92L, 94L, 90L, 92L, 94L, 90L, 92L, 94L, 90L)), row.names = c(NA, 10L), class = "data.frame")
ggplot(data = Antisocial, aes(x = YOI, y = ASB)) +
geom_point( colour = "Black", size = 2) +
geom_line(data = Antisocial, aes(x= Child), size = 1) +
facet_grid(.~ Race)
Plot Image I generated: https://drive.google.com/file/d/1sZVsRFiGC0dIGg0GWhHhNDCaiW2iB-ky/view?usp=sharing
Full dataset- https://drive.google.com/file/d/1UeVTJ1M_eKQDNtvyUHRB77VDpSF1ASli/view?usp=sharing
I want to use 2 charts side by side Race=0, Race= 1 to plot ASB vs YOI for each Child grouped by Race. The line, however, should only connect to dots of the same child. As it is right now, all the dots are connected. Furthermore the scale of YOI should be (90,94).
Can you suggest what I should change?
Thanks!

Thanks for providing the data. I changed 4 observations to race 0 to have some variation:
library(tidyverse)
Antisocial <- structure(list(Child = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L), ASB = c(1L, 1L, 1L, 0L, 0L, 0L, 5L, 5L, 5L, 2L), Race = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L), Y92 = c(0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L), Y94 = c(0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L), YOI = c(90L, 92L, 94L, 90L, 92L, 94L, 90L, 92L, 94L, 90L)), row.names = c(NA, 10L), class = "data.frame")
# Scatter plot of ASB against YOI with one panel per Race.
# `group = Child` makes geom_line() join only the points of the same child
# instead of connecting every observation in the panel.
ggplot(data = Antisocial, aes(x = YOI, y = ASB, group = Child)) +
  geom_point(colour = "Black", size = 2) +
  geom_line() +
  # YOI takes the values 90, 92 and 94 in the data shown, so pin the axis to
  # the requested 90-94 range and label exactly those years.
  scale_x_continuous(breaks = c(90, 92, 94), limits = c(90, 94)) +
  facet_grid(. ~ Race)
To connect the dots for each child, you need to include group = Child in the code. I think this is what you want? Let me know if this solved your problem :)

Related

DLNM: Error: coef/vcov not consistent with basis matrix. See help(crosspred)

I am using distributed lag non-linear models . I ran a glm model with a cross-basis matrix from the DLNM package. When I tried to get the predictions, I got this error:
Error in crosspred(cbpm1, Tp1, by = 1, bylag = 1, at = speimin:speimax) :
coef/vcov not consistent with basis matrix. See help(crosspred).
This happened when I tried lag 1,2, and 3, but there was no error when I tried lag 0, 4, and 5. I read about a similar question from this link. But still, I cannot figure it out with my own code. Your help is really meaningful for me. Thanks.
The code is:
# Response series and the observed range of the SPEI exposure.
Dis <- ss$dis1
speimin <- min(ss$spei3, na.rm = TRUE)
speimax <- max(ss$spei3, na.rm = TRUE)

# Knot placement: equally spaced knots over each exposure, log-scale knots
# over the lag dimension.
vkt <- equalknots(ss$T, nk = 2)
lkt <- logknots(1, nk = 2)
vkpm <- equalknots(ss$spei3, nk = 2)
lkpm <- logknots(1, nk = 2)

# Cross-basis for temperature (cbt1) and for SPEI (cbpm1), both with lag = 1
# and a quadratic B-spline over the exposure.
cbt1 <- crossbasis(
  ss$T,
  lag = 1,
  argvar = list(fun = "bs", degree = 2, knots = vkt),
  arglag = list(knots = lkt)
)
cbpm1 <- crossbasis(
  ss$spei3,
  lag = 1,
  argvar = list(fun = "bs", degree = 2, knots = vkpm),
  arglag = list(knots = lkpm)
)

# Poisson regression of the counts on both cross-bases plus natural splines
# of relative humidity and of the time index.
Tp1 <- glm(
  Dis ~ cbt1 + cbpm1 + ns(RH, 3) + ns(timeseries, 2 * 5),
  family = poisson(link = log),
  data = ss
)

# Exposure values at which predictions are requested.
# NOTE(review): `:` steps by 1 starting from speimin, which is not an integer
# in the sample data -- confirm this grid is what is intended.
at <- speimin:speimax
predsltp1 <- crosspred(cbpm1, Tp1, by = 1, bylag = 1, at = at)
Here is the used library:
library(splines);library(class);library(stats);library(mda)
library(akima);library(gam);library(mgcv);library(foreign);library(som)
library(dlnm) #equalknots logknots crossbasis
library(splines) #ns
library(magrittr)
Here is the reproducible sample of my dataset:
a<-structure(list(job = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "all", class = "factor"),
age3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "00_05", class = "factor"),
sexA = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 2L, 1L), .Label = c("F", "M"), class = "factor"),
All = c(65L, 53L, 92L, 68L, 81L, 103L, 144L, 92L, 44L, 40L,
54L, 19L, 55L, 61L, 72L, 89L, 77L, 68L, 71L, 27L, 15L, 18L,
39L, 52L, 52L, 58L, 27L, 44L, 32L, 37L), dis1 = c(6L, 0L,
9L, 0L, 0L, 0L, 9L, 0L, 3L, 6L, 3L, 0L, 0L, 3L, 6L, 0L, 9L,
3L, 0L, 3L, 6L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L), dis2 = c(3L,
6L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 3L, 0L, 0L, 6L, 6L,
0L, 0L, 0L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
T = c(20.39032258, 20.39032258, 19.78387097, 19.78387097,
19.64193548, 19.64193548, 18.78709677, 18.78709677, 19.17419355,
19.17419355, 20.46774194, 21.63870968, 21.85806452, 21.85806452,
19.73448276, 19.73448276, 20.55357143, 20.55357143, 19.925,
29.12580645, 29.12580645, 29.39354839, 29.39354839, 28.96129032,
28.96129032, 27.36666667, 27.40333333, 27.40333333, 27.82333333,
27.82333333), RH = c(70.09677419, 70.09677419, 70.03225806,
70.03225806, 70.35483871, 70.35483871, 72.32258065, 72.32258065,
69.80645161, 69.80645161, 74.58064516, 77.58064516, 71.32258065,
71.32258065, 75.82758621, 75.82758621, 62.28571429, 62.28571429,
72.60714286, 77.61290323, 77.61290323, 75.06451613, 75.06451613,
75.61290323, 75.61290323, 76.03333333, 76.23333333, 76.23333333,
75.03333333, 75.03333333), PP = c(11.5, 11.5, 44.5, 44.5,
25.9, 25.9, 14, 14, 5, 5, 35.7, 34.1, 30.8, 30.8, 44.4, 44.4,
15.6, 15.6, 40.7, 184, 184, 137.1, 137.1, 377, 377, 110.5,
129.8, 129.8, 292, 292), spei3 = c(0.447495072, 0.447495072,
1.537295165, 1.537295165, 1.285067571, 1.285067571, 0.441010834,
0.441010834, 1.505630159, 1.505630159, 1.725831329, 1.075029338,
-1.227673724, -1.227673724, 0.329690702, 0.329690702, 0.724314874,
0.724314874, 1.228544608, 0.60782059, 0.60782059, 0.191804009,
0.191804009, 1.752145476, 1.752145476, 1.94554333, 1.139058482,
1.139058482, -0.554472376, -0.554472376), timeseries = 1:30), class = "data.frame", row.names = c(NA,
-30L))

How to insert blank space on x-axis in boxplot using ggplot /R?

I have My Data stored in p, which can be found below.
I have four specific categories for a group of tumor patients. Three of the groups correspond to the tumor stage and are stored as p$WHO.Grade=1,2,3. The last group is all tumor patients combined.
I am producing a specific plot consisting of multiple boxplots demonstrating the distribution of a continuous covariate (p$ki67pro) in the four groups described as above and in relation to the event of recurrence (p$recurrence==0 for no and p$recurrence==1 for yes).
As it turns out, there are no events for p$WHO.Grade==3, which means that I want my plot to look exactly like this (manipulated in Photoshop):
However, I get the picture below when I use the following script:
library(ggplot2)
library(dplyr)
p %>%
bind_rows(p %>% mutate(WHO.Grade = 4)) %>%
mutate(WHO.Grade = factor(WHO.Grade),
recurrence = factor(recurrence)) %>%
ggplot(aes(WHO.Grade, ki67pro,
fill = recurrence, colour = recurrence)) +
geom_boxplot(outlier.alpha = 0,
position = position_dodge(width = 0.78, preserve = "single")) +
geom_point(size = 3, shape = 21,
position = position_jitterdodge()) +
scale_x_discrete(name = "",
label = c("WHO-I","WHO-II","WHO-III","All")) +
scale_y_continuous(name = "x", breaks=seq(0,30,5), limits=c(0,30)) +
scale_fill_manual(values = c("#edf1f9", "#fcebeb"), name = "",
labels = c("", "")) +
scale_colour_manual(values = c("#1C73C2", "red"), name = "",
labels = c("","")) +
theme(legend.position="none",
panel.background = element_blank(),
axis.line = element_line(colour = "black"))
It seems that the p$WHO.Grade==All group is automatically inserted into the p$WHO.Grade==3 slot, which should be left blank.
Therefore, my question is: how can I graphically insert a blank space at p$WHO.Grade==3 given my
script above?
p <- structure(list(WHO.Grade = c(1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), ki67pro = c(1L, 12L, 3L, 3L, 5L, 3L,
25L, 7L, 4L, 5L, 12L, 3L, 15L, 4L, 5L, 7L, 8L, 3L, 12L, 10L,
4L, 10L, 7L, 3L, 2L, 3L, 7L, 4L, 7L, 10L, 4L, 5L, 5L, 3L, 5L,
2L, 5L, 3L, 3L, 3L, 4L, 4L, 3L, 2L, 5L, 1L, 5L, 2L, 3L, 1L, 2L,
3L, 3L, 5L, 4L, 20L, 5L, 0L, 4L, 3L, 0L, 3L, 4L, 1L, 2L, 20L,
2L, 3L, 5L, 4L, 8L, 1L, 4L, 5L, 4L, 3L, 6L, 12L, 3L, 4L, 4L,
2L, 5L, 3L, 3L, 3L, 2L, 5L, 4L, 2L, 3L, 4L, 3L, 3L, 2L, 2L, 4L,
7L, 4L, 3L, 4L, 2L, 3L, 6L, 2L, 3L, 10L, 5L, 10L, 3L, 10L, 3L,
4L, 5L, 2L, 4L, 3L, 4L, 4L, 4L, 5L, 3L, 12L, 5L, 4L, 3L, 2L,
4L, 3L, 4L, 2L, 1L, 6L, 1L, 4L, 12L, 3L, 4L, 3L, 2L, 6L, 5L,
4L, 3L, 4L, 4L, 4L, 3L, 5L, 4L, 5L, 4L, 1L, 3L, 3L, 4L, 0L, 3L
), recurrence = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L)), class = "data.frame", row.names = c(1L, 2L, 3L,
9L, 10L, 11L, 13L, 14L, 15L, 16L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 40L, 41L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L, 57L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L,
67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L,
80L, 81L, 82L, 83L, 84L, 85L, 87L, 89L, 90L, 91L, 92L, 93L, 94L,
96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L,
107L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L,
119L, 120L, 121L, 123L, 124L, 125L, 126L, 127L, 128L, 130L, 131L,
132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L,
143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L, 153L,
154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L, 164L,
165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L, 173L, 174L, 175L
))
The simplest way is to adjust your WHO.Grade factor to include all 4 levels - c("WHO-I","WHO-II","WHO-III","All"). Here's the first adjustment, on the third line of the pipeline:
p %>%
bind_rows(p %>% mutate(WHO.Grade = 4)) %>%
mutate(WHO.Grade = factor(WHO.Grade, levels = 1:4, labels = c("WHO-I","WHO-II","WHO-III","All")),
recurrence = factor(recurrence))
Now that we've named our factors, we can modify the scale_x_discrete() call to remove the label and add drop = FALSE:
scale_x_discrete(name = "",
# label = c("WHO-I","WHO-II","WHO-III","All"),
drop = FALSE)
Putting everything together we get:
# Pool every patient into an extra grade 4 ("All"), then draw ki67pro box
# plots per grade, dodged and coloured by recurrence. WHO.Grade is built with
# all four levels so that, together with drop = FALSE on the x scale, the
# empty "WHO-III" slot is still drawn.
bind_rows(p, mutate(p, WHO.Grade = 4)) %>%
  as_tibble() %>%
  mutate(
    recurrence = factor(recurrence),
    WHO.Grade = factor(
      WHO.Grade,
      levels = 1:4,
      labels = c("WHO-I", "WHO-II", "WHO-III", "All")
    )
  ) %>%
  ggplot(aes(x = WHO.Grade, y = ki67pro,
             colour = recurrence, fill = recurrence)) +
  geom_boxplot(
    position = position_dodge(width = 0.78, preserve = "single"),
    outlier.alpha = 0
  ) +
  geom_point(position = position_jitterdodge(), shape = 21, size = 3) +
  # Tick labels now come from the factor labels, so none are passed here;
  # drop = FALSE keeps levels that have no observations.
  scale_x_discrete(name = "", drop = FALSE) +
  scale_y_continuous(name = "x", limits = c(0, 30), breaks = seq(0, 30, 5)) +
  scale_colour_manual(name = "", values = c("#1C73C2", "red"),
                      labels = c("", "")) +
  scale_fill_manual(name = "", values = c("#edf1f9", "#fcebeb"),
                    labels = c("", "")) +
  theme(
    axis.line = element_line(colour = "black"),
    panel.background = element_blank(),
    legend.position = "none"
  )
I could not reproduce exactly the result shown in your Photoshop image, but you can get the following image:
It is close to what you want, except that all the entries of "WHO-III" are zeros.
The code that generates it is:
library(ggplot2)
library(dplyr)
# Append two relabelled copies of the data: one as grade 3 (a placeholder so
# the "WHO-III" slot exists on the axis) and one as grade 4 (the pooled "All"
# group). Both copies are taken from the original `p`.
p <- p %>%
  bind_rows(p %>% mutate(WHO.Grade = 3)) %>%
  bind_rows(p %>% mutate(WHO.Grade = 4))
# Flatten the placeholder group to zero. The column is addressed by name
# rather than by position so this does not depend on the column order of `p`.
p$ki67pro[p$WHO.Grade == 3] <- 0

# Box plots of ki67pro per grade, dodged and coloured by recurrence.
p %>%
  mutate(WHO.Grade = factor(WHO.Grade),
         recurrence = factor(recurrence)) %>%
  ggplot(aes(WHO.Grade, ki67pro,
             fill = recurrence, colour = recurrence)) +
  geom_boxplot(outlier.alpha = 0,
               position = position_dodge(width = 0.78, preserve = "single")) +
  geom_point(size = 3, shape = 21,
             position = position_jitterdodge()) +
  scale_x_discrete(name = "",
                   labels = c("WHO-I", "WHO-II", "WHO-III", "All"),
                   drop = FALSE) +
  scale_y_continuous(name = "x", breaks = seq(0, 30, 5), limits = c(0, 30)) +
  scale_fill_manual(values = c("#edf1f9", "#fcebeb"), name = "",
                    labels = c("", "")) +
  scale_colour_manual(values = c("#1C73C2", "red"), name = "",
                      labels = c("", "")) +
  theme(legend.position = "none",
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"))
Hope this helps

How to add different boxplots to the same plot based on different data sources in ggplot /R?

Please find My Data below. Please note that picture below is an example of the design I wish to copy and does not correlate to My Data specifically.
My Data is stored in p. I have a continuous covariate p$ki67pro which denotes the percentage of cells actively dividing in a tumor sample (thus, ranging from 0 to 100). I have three different stages of the tumor, which correspond to p$WHO.Grade==1,2,3. Each sample represents a tumor patient that either had recurrence (p$recurrence==1) or not (p$recurrence==0).
Therefore:
head(p)
WHO.Grade recurrence ki67pro
1 1 0 1
2 2 0 12
3 1 0 3
9 1 0 3
10 1 0 5
11 1 0 3
I wish to produce the boxplot below. As you can see, there are four points on the x-axis, which correspond to each p$WHO.Grade and to All samples. There are two boxplots per p$WHO.Grade + All.
Per p$WHO.Grade and All, I want one boxplot to represent p$ki67pro for recurrent tumors (p$recurrence==1) and the other boxplot to represent p$ki67pro for non-recurrent tumors (p$recurrence==0).
I.e.
p$ki67pro[p$WHO.Grade==1 & p$recurrence==0] versus
p$ki67pro[p$WHO.Grade==1 & p$recurrence==1]
p$ki67pro[p$WHO.Grade==2 & p$recurrence==0] versus
p$ki67pro[p$WHO.Grade==2 & p$recurrence==1]
p$ki67pro[p$WHO.Grade==3 & p$recurrence==0] versus
p$ki67pro[p$WHO.Grade==3 & p$recurrence==1]
And for All
p$ki67pro[p$recurrence==0] versus
p$ki67pro[p$recurrence==1]
I have used the following script so far, but I can't figure out how to get the All group included. Please note that there is only one case with p$WHO.Grade==3.
df <- data.frame(x = as.factor(c(p$WHO.Grade)),
y = c(p$ki67pro),
f = rep(c("ki67pro"), c(nrow(p))))
df <- df[!is.na(df$x),]
ggplot(df) +
geom_boxplot(aes(x, y, fill = f, colour = f), outlier.alpha = 0, position = position_dodge(width = 0.78)) +
scale_x_discrete(name = "", label=c("WHO-I","WHO-II","WHO-III","All")) +
scale_y_continuous(name="x", breaks=seq(0,30,5), limits=c(0,30)) +
stat_boxplot(aes(x, y, colour = f), geom = "errorbar", width = 0.3,position = position_dodge(0.7753)) +
geom_point(aes(x, y, fill = f, colour = f), size = 3, shape = 21, position = position_jitterdodge()) +
scale_fill_manual(values = c("#edf1f9", "#fcebeb"), name = "",
labels = c("", "")) +
scale_colour_manual(values = c("#1C73C2", "red"), name = "",
labels = c("","")) + theme(legend.position="none")
My Data p
p <- structure(list(WHO.Grade = c(1L, 2L, 1L, 1L, 1L, 1L, 3L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), recurrence = c(0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), ki67pro = c(1L, 12L,
3L, 3L, 5L, 3L, 20L, 25L, 7L, 4L, 5L, 12L, 3L, 15L, 4L, 5L, 7L,
8L, 3L, 12L, 10L, 4L, 10L, 7L, 3L, 2L, 3L, 7L, 4L, 7L, 10L, 4L,
5L, 5L, 3L, 5L, 2L, 5L, 3L, 3L, 3L, 4L, 4L, 3L, 2L, 5L, 1L, 5L,
2L, 3L, 1L, 2L, 3L, 3L, 5L, 4L, 20L, 5L, 0L, 4L, 3L, 0L, 3L,
4L, 1L, 2L, 20L, 2L, 3L, 5L, 4L, 8L, 1L, 4L, 5L, 4L, 3L, 6L,
12L, 3L, 4L, 4L, 2L, 5L, 3L, 3L, 3L, 2L, 5L, 4L, 2L, 3L, 4L,
3L, 3L, 2L, 2L, 4L, 7L, 4L, 3L, 4L, 2L, 3L, 6L, 2L, 3L, 10L,
5L, 10L, 3L, 10L, 3L, 4L, 5L, 2L, 4L, 3L, 4L, 4L, 4L, 5L, 3L,
12L, 5L, 4L, 3L, 2L, 4L, 3L, 4L, 2L, 1L, 6L, 1L, 4L, 12L, 3L,
4L, 3L, 2L, 6L, 5L, 4L, 3L, 4L, 4L, 4L, 3L, 5L, 4L, 5L, 4L, 1L,
3L, 3L, 4L, 0L, 3L)), class = "data.frame", row.names = c(1L,
2L, 3L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 44L, 45L, 46L, 47L, 48L,
49L, 50L, 51L, 52L, 53L, 54L, 55L, 57L, 59L, 60L, 61L, 62L, 63L,
64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L,
77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 87L, 89L, 90L, 91L,
92L, 93L, 94L, 96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L,
105L, 106L, 107L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L,
117L, 118L, 119L, 120L, 121L, 123L, 124L, 125L, 126L, 127L, 128L,
130L, 131L, 132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L,
141L, 142L, 143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L,
152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L,
163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L, 173L,
174L, 175L))
A trick that can be used is to create a new level in WHO.Grade, since it only has 3 levels. This should be a temporary level, so a good way of doing it is with package dplyr, function mutate.
Note that there is no need to create a new dataframe df.
library(ggplot2)
library(dplyr)
# Append a copy of the data relabelled as grade 4 so the pooled "All" group
# becomes a fourth x-axis category, then draw ki67pro box plots per grade,
# dodged and coloured by recurrence.
bind_rows(p, mutate(p, WHO.Grade = 4)) %>%
  mutate(
    recurrence = factor(recurrence),
    WHO.Grade = factor(WHO.Grade)
  ) %>%
  ggplot(aes(x = WHO.Grade, y = ki67pro,
             colour = recurrence, fill = recurrence)) +
  geom_boxplot(
    position = position_dodge(width = 0.78, preserve = "single"),
    outlier.alpha = 0
  ) +
  geom_point(position = position_jitterdodge(), shape = 21, size = 3) +
  scale_x_discrete(name = "",
                   labels = c("WHO-I", "WHO-II", "WHO-III", "All")) +
  scale_y_continuous(name = "x", limits = c(0, 30), breaks = seq(0, 30, 5)) +
  scale_colour_manual(name = "", values = c("#1C73C2", "red"),
                      labels = c("", "")) +
  scale_fill_manual(name = "", values = c("#edf1f9", "#fcebeb"),
                    labels = c("", "")) +
  theme(legend.position = "none")
What about something like this:
# here you duplicate your original data
# Duplicate the original data and relabel every row of the copy as "all".
p1 <- p
p1$WHO.Grade <- "all"
# Stack the copy on the original in a separate object. Overwriting `p` here
# would double the data again on every re-run and leave `p` with a character
# WHO.Grade column for any code that uses it afterwards.
p_all <- rbind(p1, p)

library(ggplot2)

# x, y and colour are shared by every layer; fill is mapped on the box plots
# only.
ggplot(p_all, aes(x = as.factor(WHO.Grade), y = ki67pro,
                  colour = factor(recurrence))) +
  geom_boxplot(aes(fill = factor(recurrence)),
               outlier.alpha = 0, position = position_dodge(width = 0.78)) +
  # Factor levels sort as "1", "2", "3", "all", matching this label order.
  scale_x_discrete(name = "",
                   labels = c("WHO-I", "WHO-II", "WHO-III", "All")) +
  scale_y_continuous(name = "x", breaks = seq(0, 30, 5), limits = c(0, 30)) +
  stat_boxplot(geom = "errorbar", width = 0.3,
               position = position_dodge(0.7753)) +
  geom_point(size = 3, shape = 21, position = position_jitterdodge()) +
  scale_fill_manual(values = c("#edf1f9", "#fcebeb"), name = "",
                    labels = c("", "")) +
  scale_colour_manual(values = c("#1C73C2", "red"), name = "",
                      labels = c("", "")) +
  theme(legend.position = "none",
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"))
In case your dataset is too large to simply double it in size, you can create two plots and put them next to each other via grid.arrange().
library(ggplot2)
library(gridExtra)
#the data
# Plotting data: WHO grade (x), ki67pro (y) and recurrence (f).
df <- data.frame(x = as.factor(c(p$WHO.Grade)),
                 y = p$ki67pro,
                 f = as.factor(p$recurrence))
df <- df[!is.na(df$x), ]

# Plot 1: one pair of box plots per WHO grade. Only the three grade labels
# are supplied here -- the "All" group lives in plot 2, so a fourth label
# would not match the three levels of `x`.
plot1 <- ggplot(df) +
  geom_boxplot(aes(x, y, fill = f, colour = f), outlier.alpha = 0,
               position = position_dodge(width = 0.78)) +
  scale_x_discrete(name = "", labels = c("WHO-I", "WHO-II", "WHO-III")) +
  scale_y_continuous(name = "x", breaks = seq(0, 30, 5), limits = c(0, 30)) +
  stat_boxplot(aes(x, y, colour = f), geom = "errorbar", width = 0.3,
               position = position_dodge(0.7753)) +
  geom_point(aes(x, y, fill = f, colour = f), size = 3, shape = 21,
             position = position_jitterdodge()) +
  scale_fill_manual(values = c("#edf1f9", "#fcebeb"), name = "",
                    labels = c("", "")) +
  scale_colour_manual(values = c("#1C73C2", "red"), name = "",
                      labels = c("", "")) +
  # Negative right margin pulls plot 2 up against this panel.
  theme(legend.position = "none",
        plot.margin = unit(c(1, -0.5, 1, 1), "cm"))

# Plot 2: the same data pooled into a single "All" category. The y axis is
# hidden because plot 1 already shows it on the same 0-30 scale.
plot2 <- ggplot(df) +
  geom_boxplot(aes(x = "All", y = y, fill = f, colour = f),
               outlier.alpha = 0, position = position_dodge(width = 0.78)) +
  scale_x_discrete(name = "") +
  scale_y_continuous(name = "x", breaks = seq(0, 30, 5), limits = c(0, 30)) +
  stat_boxplot(aes(x = "All", y = y, colour = f), geom = "errorbar",
               width = 0.3, position = position_dodge(0.7753)) +
  geom_point(aes(x = "All", y = y, fill = f, colour = f), size = 3,
             shape = 21, position = position_jitterdodge()) +
  scale_fill_manual(values = c("#edf1f9", "#fcebeb"), name = "",
                    labels = c("", "")) +
  scale_colour_manual(values = c("#1C73C2", "red"), name = "",
                      labels = c("", "")) +
  theme(legend.position = "none",
        axis.line.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        plot.margin = unit(c(1, 1, 1, -0.5), "cm"))

# Put it together: plot 1 takes three quarters of the width, plot 2 the rest.
# The layout matrix is not called `lm`, which would mask stats::lm().
panel_layout <- rbind(c(1, 1, 1, 2))
grid.arrange(plot1, plot2, layout_matrix = panel_layout)
If I understood correctly, you just want to show all of your data in the last boxplot.
You can do this easily by just duplicating the data while creating the data frame and labelling the duplicate with All.
# Two stacked copies of the measurements: the first keeps each sample's WHO
# grade, the second is labelled "All" so it forms an extra x-axis category.
df <- data.frame(
  x = factor(c(p$WHO.Grade, rep("All", times = nrow(p)))),
  y = c(p$ki67pro, p$ki67pro),
  f = "ki67pro"
)
The plotting remains the same and you can easily add recurrence.
However, the plot you're showing above looks weird as the All boxplot doesn't contain all the data.

Express relations between three variables using ggplot2 in R

I have a data frame like this
structure(list(cli_exp = c(1L, 1L, 2L, 1L, 1L, 0L, 2L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 2L, 2L, 0L, 1L, 0L,
1L, 1L, 2L, 0L, 1L), vcs_exp = c(0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 0L, 2L, 1L, 0L), web_exp = c(2L, 2L, 2L, 1L, 0L,
0L, 1L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 2L, 0L, 0L)), .Names = c("cli_exp", "vcs_exp",
"web_exp"), row.names = c(NA, 30L), class = "data.frame")
I want to use ggplot2 to express the relation between these three variables and tried the simple point plot
ggplot(data = data) +
geom_point(mapping = aes(x = web_exp, y = vcs_exp, color = cli_exp))
But apparently, there are many overlapping data points, which are not suitable for point display. Are there any better ways?
I would use ggpairs from GGally package
tmp_df <- structure(list(cli_exp = c(1L, 1L, 2L, 1L, 1L, 0L, 2L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 2L, 2L, 0L, 1L, 0L,
1L, 1L, 2L, 0L, 1L), vcs_exp = c(0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 0L, 2L, 1L, 0L), web_exp = c(2L, 2L, 2L, 1L, 0L,
0L, 1L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 2L, 0L, 0L)), .Names = c("cli_exp", "vcs_exp",
"web_exp"), row.names = c(NA, 30L), class = "data.frame")
library(GGally)
ggpairs(tmp_df,
upper = list(continuous = wrap("cor", size = 10)),
lower = list(continuous = "smooth"))
Edit: use pairs from base R
pairs(tmp_df)
Use pairs.panels from psych package
library(psych)
pairs.panels(tmp_df,
method = "pearson",
density = TRUE,
ellipses = TRUE
)
As you mentioned, the points overlap, so some points aren't visible when using geom_point.
ggplot(data = df, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
geom_point()
This can be solved by adding a small amount of jitter. Also, making the points slightly transparent will make any overlaps more clear.
ggplot(data = df, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
geom_jitter(width = 0.05, height = 0.05, alpha = 0.8)

R create variable IF ELSE leads to wrong values

I have a dataframe with:
"serial" the number of households, each one with a variable number of components "head, spouse, parent and child or grandchild" and total number of children in the house "nchild"
I want to create a new variable (in the dput I added an example for clarity: withCM 'living with male child' and withCF). I have tried various combinations but I cannot discriminate on the sex of the child within the same "serial", so that for withCM=1 only when relate=="child"&sex==1, but the 1 would appear on a different row (that of the head, spouse or parent)
mydata$withCM<- ifelse(mydata$nchild>0&mydata$relate!="child",1,0)
mydata <- structure(list(serial = c(12345L, 12345L, 12345L, 12345L, 12346L,
12346L, 12347L, 12347L, 12347L, 12348L, 12348L, 12348L, 12348L,
12348L, 12348L, 12348L, 12349L, 12350L, 12350L, 12351L, 12351L,
12351L, 12352L, 12352L, 12352L, 12352L, 12352L, 12353L, 12354L,
12354L), age = c(45L, 44L, 13L, 11L, 29L, 28L, 65L, 61L, 35L,
68L, 61L, 35L, 34L, 6L, 2L, 1L, 62L, 54L, 52L, 67L, 67L, 12L,
49L, 50L, 28L, 21L, 22L, 70L, 89L, 55L), sex = c(1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), relate = structure(c(4L,
7L, 1L, 1L, 4L, 7L, 6L, 6L, 4L, 4L, 7L, 1L, 2L, 3L, 3L, 3L, 4L,
4L, 7L, 4L, 7L, 3L, 4L, 7L, 1L, 5L, 5L, 4L, 6L, 4L), .Label = c("child",
"childinlaw", "grandchild", "head", "nonrelative", "parent",
"spouse"), class = "factor"), nchild = c(2L, 2L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 1L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L), conhija = c(1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), conhijo = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("serial",
"age", "sex", "relate", "nchild", "conhija", "conhijo"), class = "data.frame", row.names = c(NA,
-30L))
You can tabulate the gender, family, and role-within-family as:
xtab <- table(mydata$serial, mydata$sex, mydata$relate)
And then choose the heads of the families (or, in the commented line, anyone who has the specific relationship), and alter their tallies as follows:
# Rows that receive the tallies: the head of each household.
ind <- mydata$relate == "head"
#ind <- mydata$relate %in% c("head","spouse","parent")
# The first dimension of xtab is looked up by household serial, as a name.
household_keys <- as.character(mydata$serial[ind])
# sex1 / sex2: count of children with sex 1 / sex 2 in the household; rows
# not selected by `ind` stay at 0.
mydata$sex1 <- replace(numeric(nrow(mydata)), ind,
                       xtab[household_keys, "1", "child"])
mydata$sex2 <- replace(numeric(nrow(mydata)), ind,
                       xtab[household_keys, "2", "child"])
Use lapply to split into families, then test if they are an adult, and there is at least one male child in the unit.
# Flag the adults of one household who live with at least one male child.
#
# serial: household identifier to look up in `data$serial`.
# data:   data frame with columns `serial`, `relate` and `sex`; defaults to
#         the global `mydata`, so existing one-argument calls still work.
#
# Returns a logical vector with one element per row of that household, in
# row order: TRUE for a head, spouse or parent when the household contains a
# row with relate == "child" and sex == 1, FALSE otherwise.
lives_with_boy <- function(serial, data = mydata) {
  unit <- data[data$serial == serial, ]
  is_adult <- as.character(unit$relate) %in% c("head", "spouse", "parent")
  has_boy <- any(unit$relate == "child" & unit$sex == 1)
  is_adult & has_boy
}
# Evaluate each household once, then put the per-household results back in
# the original row order with unsplit(). Simply concatenating the results in
# unique(serial) order lines up with the rows only when every household's
# rows are contiguous in `mydata`. Wrap in as.integer() for 0/1 coding.
mydata$withCM <- unsplit(
  lapply(split(mydata$serial, mydata$serial),
         function(s) lives_with_boy(s[1])),
  mydata$serial
)

Resources