Scale circle size Venn diagram by relative proportion

Scale circle size Venn diagram by relative proportion - r

I am plotting a Venn diagram using the function draw.triple.venn() library(VennDiagram). This is my code in R:
g = draw.triple.venn(
area1 = 4.1, area2 = 5.6, area3 = 15.9, n12 = 1.3,n23 = 4.2, n13 = 2.3, n123 = 1.2,
category = c("Land use", "Environment", "Space"), lwd = c(1.2, 1.2, 1.2), lty = c(1, 1, 1),
fill = c("darkgray", "gray", "lightgrey"), alpha = c(0.8, 0.8, 0.8),
cat.pos = c(330, 30, 150), cat.dist = c(0.06, 0.06, 0.05), sigdig=2, cex=2, cat.cex=2,
print.mode = c("raw", "percent"), cat.fontfamily = rep("serif", 3), margin = 0.01,
ind = T)
grid.arrange(gTree(children=g))
grid::grid.text("Residual variance: 80.8%", x=0.18, y=0.03, gp=gpar(col="black", fontsize=16, fontfamily="serif", fontface=1))
This is the current figure:
My question, is it possible to scale the circle size by the relative proportion?
This is a link to the package https://cran.r-project.org/web/packages/VennDiagram/VennDiagram.pdf
Thank you very much for any advice.

Using library(eulerr)
VennDiag <- euler(c("A" = 1.8, "B" = 1.5, "C" = 10.6, "A&B" = 0, "B&C" = 3.0,
"A&C" = 1.1, "A&B&C" = 1.2))
plot(VennDiag, counts = TRUE, font=1, cex=1, alpha=0.5,
fill=c("grey", "lightgrey", "darkgrey"))
It comes with this error estimate:
> VennDiag
original fitted residuals region_error
A 1.8 1.776 0.024 0.002
B 1.5 1.471 0.029 0.002
C 10.6 10.597 0.003 0.005
A&B 0.0 0.210 -0.210 0.011
A&C 1.1 1.158 -0.058 0.002
B&C 3.0 3.024 -0.024 0.000
A&B&C 1.2 1.145 0.055 0.003
diag_error: 0.011
stress: 0

Related

Better illustration of very small differences in graphs

I'm having to generate a graph using different internet links. As the data input has very close values, the graph has practically no differences.
Is there a way to make this difference more apparent?
Here is the code used:
generateGraphic <- function(gFile, wifiFile, wifigFile){
gData <- read.csv(file=gFile, header=FALSE)
wifiData <- read.csv(file=wifiFile, header=FALSE)
wifigData <- read.csv(file=wifigFile, header=FALSE)
x = gData[,1]
y1 = gData[,2]
print(y1)
y2 = wifiData[,2]
print(y2)
y3 = wifigData[,2]
print(y3)
plot(x, y1, type="b", pch = 16, xlab="Tempo (s)", ylab="Probabilidade", xaxt="n", yaxt="n")
axis(1, at = seq(0, 4, by = 0.4), las=2)
axis(2, at = seq(0, 1, by = 0.1), las=2)
lines(x, y2, type="b", col="dark gray", pch = 15)
lines(x, y3, type="b", col="light gray", pch = 8)
legend("topleft",
legend = c("4G", "WiFi", "WiFi4G"),
col = c("black", "dark gray", "light gray"),
pch = c(16, 15, 8))
}
Here is a example of the input data:
Wifi Link
0 0.0
0.1 0.20326429999999998
0.2 0.4248706
0.3 0.5867006
0.4 0.7030588999999999
0.5 0.7866605
0.6 0.8467247999999999
0.7 0.8898784999999999
0.8 0.9208824999999999
0.9 0.9431575999999999
1 0.9591613
1.1 0.9706591999999999
1.2 0.9789199999999999
1.3 0.9848549999999999
1.4 0.9891190999999999
1.5 0.9921825999999999
1.6 0.9943835999999999
1.7 0.9959648999999999
1.8 0.9971009999999999
1.9 0.9979172
2 0.9985035999999999
4G Link
0 0.0
0.1 0.2032832
0.2 0.4249017
0.3 0.5867356
0.4 0.703093
0.5 0.7866915
0.6 0.8467515999999999
0.7 0.8899009999999999
0.8 0.9209010999999999
0.9 0.9431725999999999
1 0.9591732
1.1 0.9706686
1.2 0.9789274
1.3 0.9848606999999999
1.4 0.9891234999999999
1.5 0.9921859
1.6 0.9943862
1.7 0.9959669
1.8 0.9971025
1.9 0.9979184
2 0.9985044999999999
2.1 0.9989256
2.2 0.9992281
2.3 0.9994455
2.4 0.9996016

You could plot the differences to 4G as reference.
I would put the value columns into a list, and adapt their lengths by filling with NA.
values <- list(gData=gData$V2, wifiData=wifiData$V2, wifigData=wifigData$V2)
After that you may cbind to a matrix m.
m <- do.call(cbind, lapply(values, `length<-`, max(lengths(values))))
Since 4G is in first column, subtract iot from the other columns
m <- m[, 2:3] - m[, 1]
and use matplot.
matplot(m, type='b', xaxt='n', pch=c(15, 8), col=1, xlab='Tempo (s)',
ylab='Probabilidade', main='Diferencias')
sq <- seq.int(0, by=.1, length.out=nrow(m))
axis(1, sq[seq_along(sq) %% 4 == 1], at=seq_along(sq)[seq_along(sq) %% 4 == 1])
abline(h=0, col=8)
mtext('4G', 4, -2, at=-.5e-5, las=2, col=8, font=2)
legend("bottomright", leg=c("WiFi", "WiFi4G"), horiz=T, pch=c(15, 8), bty='n')
You could also turn it around of course.
m <- do.call(cbind, lapply(values, `length<-`, max(lengths(values))))
m <- m[, 1] - m[, 2:3]
matplot(m, type='b', xaxt='n', pch=c(15, 8), col=1, xlab='Tempo (s)',
ylab='Probabilidade', main='Diferencias')
sq <- seq.int(0, by=.1, length.out=nrow(m))
axis(1, sq[seq_along(sq) %% 4 == 1], at=seq_along(sq)[seq_along(sq) %% 4 == 1])
legend("bottomright", leg=c("4G vs. WiFi", "4G vs. WiFi4G"), horiz=T, pch=c(15, 8),
bty='n')
Data:
gData <- structure(list(V1 = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1,
2.2, 2.3, 2.4), V2 = c(0, 0.203385231442384, 0.425002665444611,
0.586833487363752, 0.703191551616986, 0.7867914923001, 0.846852450869833,
0.889999103106634, 0.921001598894712, 0.943272154043885, 0.959273034150037,
0.970768887392538, 0.979028541813682, 0.984959224443878, 0.989223699201295,
0.992284899754003, 0.994485588797307, 0.996068229156018, 0.997202959924694,
0.998019756784877, 0.998605023394967, 0.999025567967921, 0.999327477772277,
0.999546146749613, 0.999701921229212)), row.names = c(NA, -25L
), class = "data.frame")
wifiData <- structure(list(V1 = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2), V2 = c(0,
0.2032643, 0.4248706, 0.5867006, 0.7030589, 0.7866605, 0.8467248,
0.8898785, 0.9208825, 0.9431576, 0.9591613, 0.9706592, 0.97892,
0.984855, 0.9891191, 0.9921826, 0.9943836, 0.9959649, 0.997101,
0.9979172, 0.9985036)), class = "data.frame", row.names = c(NA,
-21L))
wifigData <- structure(list(V1 = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1,
2.2, 2.3, 2.4), V2 = c(0, 0.2032832, 0.4249017, 0.5867356, 0.703093,
0.7866915, 0.8467516, 0.889901, 0.9209011, 0.9431726, 0.9591732,
0.9706686, 0.9789274, 0.9848607, 0.9891235, 0.9921859, 0.9943862,
0.9959669, 0.9971025, 0.9979184, 0.9985045, 0.9989256, 0.9992281,
0.9994455, 0.9996016)), class = "data.frame", row.names = c(NA,
-25L))

Since you are dealing with probabilities, you can convert them to odds very easily:
odds_wifi <- wifiData[[2]] / (1-wifiData[[2]])
odds_gData <- gData[[2]] / (1-gData[[2]])
And depending on your set-up, it might make sense to compare them with an odds ratio:
OR <- odds_wifi / odds_gData
plot(x, OR, type="b", pch = 16, xlab="Tempo (s)",
ylab="Relación de probabilidades")

How do I find best starting values for nlimb optimization?

I am trying to run the following code with the attached dataset. How do I solve the error of hessian matrix inversion?
library(stats4)
library(bbmle)
library(stats)
library(numDeriv)
library('bbmle')
x<- c(1.1, 1.4, 1.3, 1.7,1.9, 1.8, 1.6, 2.2, 1.7, 2.7, 4.1, 1.8, 1.5, 1.2, 1.4, 3, 1.7, 2.3, 1.6, 2.0)
hist(x)
fEHLKUMW<-function(a,b,alpha,vartheta)
{
-sum(log( (2*a*b*alpha*vartheta*(x^(vartheta-1))*(exp(-x^(vartheta)))*((1- (exp(-x^(vartheta))))^(a-1))*((1-((1-((1-(exp(-x^(vartheta))))^a))^b))^(alpha-1)))/((1-((1-(exp(-x^(vartheta))))^a))^(b*(alpha+1)))
))
}
EHLKUMW.result<-mle2(fEHLKUMW,hessian = NULL,start=list(a=0.01,b=0.01,alpha=.3,vartheta=0.01),optimizer="nlminb",lower=0)
summary(EHLKUMW.result)
I get the error as;
**
Warning messages:
1: In nlminb(start = start, objective = objectivefunction, hessian = NULL, :
NA/NaN function evaluation
2: In mle2(fEHLKUML, hessian = NULL, start = list(a = 1, b = 0.4, c = 0.5, :
couldn't invert Hessian
**

This is a very open question I think, so I'll present some tools and an approach to this, and maybe others can comment, etc.
First, the main part:
library(bbmle)
library(stats)
library(numDeriv)
library(bbmle)
x<- c(1.1, 1.4, 1.3, 1.7,1.9, 1.8, 1.6, 2.2, 1.7, 2.7, 4.1, 1.8, 1.5, 1.2, 1.4, 3, 1.7, 2.3, 1.6, 2.0)
hist(x)
fEHLKUMW <- function(a,b,alpha,vartheta) {
-sum(log( (2*a*b*alpha*vartheta*(x^(vartheta-1))*(exp(-x^(vartheta)))*((1- (exp(-x^(vartheta))))^(a-1))*((1-((1-((1-(exp(-x^(vartheta))))^a))^b))^(alpha-1)))/((1-((1-(exp(-x^(vartheta))))^a))^(b*(alpha+1)))
))
}
Now, we can of course run it like you've done:
EHLKUMW.result <- mle2(
fEHLKUMW,
hessian = NULL,
start = list(
a = 0.01,
b = 0.01,
alpha = .3,
vartheta = 0.01
),
optimizer = "nlminb",
lower = 0
)
But we can also run it with a distribution on each of these parameters, to get a new input all the time:
EHLKUMW.result <- mle2(
fEHLKUMW,
hessian = NULL,
start =
list(
# a = 0.01,
# a = rt(1, 10, ncp = 0.01),
a = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# b = 0.01,
b = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# alpha = .3,
alpha = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.3, df = 10),
# vartheta = 0.01
vartheta = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10)
),
optimizer = "nlminb",
lower = 0
)
I've chosen to use trundist to get a t-distribution, centralised around
the values you provided, and lower is 0 through the a = -argument.
If you know what the upper limit of these parameters, this can be done with b = -argument.
The output that I think is most relevant are the attained logLik and the coef.
library(tidyverse)
exec(fEHLKUMW, !!!list(
a = 0.01,
b = 0.01,
alpha = .3,
vartheta = 0.01
))
replicate(
250,
exec(fEHLKUMW, !!!list(
# a = 0.01,
# a = rt(1, 10, ncp = 0.01),
a = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# b = 0.01,
b = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# alpha = .3,
alpha = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.3, df = 10),
# vartheta = 0.01
vartheta = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10)
)))
I used this to fine-tune the distributions the parameters
The following is a multiple runs and their output compared to
logLik.
tibble(n = seq_len(100),
output = map(n, ~mle2(
fEHLKUMW,
hessian = NULL,
start =
list(
# a = 0.01,
# a = rt(1, 10, ncp = 0.01),
a = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# b = 0.01,
b = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# alpha = .3,
alpha = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.3, df = 10),
# vartheta = 0.01
vartheta = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10)
),
optimizer = "nlminb",
lower = 0
))) ->
outputs_df
This code gives a nice print
outputs_df %>%
mutate(coef = output %>% map(coef),
logLik = output %>% map_dbl(logLik)) %>%
unnest_wider(coef) %>%
arrange(logLik) %>%
print(n=Inf)

Iterate over character vector in dplyR summarise and use it to assign new column names

I have a character vector with the name of my variables:
variables -> c("w", "x", "y", "z")
I need to create a function that calculates the mean of every variable for a specified parameter (as below for alpha). However, it doesn't rename the columns with the iterating variable names and does not reduce the alpha columns down to one on the left.
calc <- function(df,
parameter,
iteration,
variables){
variable <- sym(variables[iteration])
mean <- df %>% group_by(.dots = parameter) %>%
summarise(variable = mean(!!variable),sd_variable = sd(!!variable))
return(mean)
}
means <- map_dfc(1:length(variables), ~calc(df = input,
parameter = "alpha",
iteration = .,
variables = variables))
Ideally the output df (means) would look like this:
alpha | w | sd_w | x | sd_x | y | sd_y | z | sd_z |
Here is what the input df looks like:
structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 999.5, 999.6,
999.7, 999.8, 999.9, 1000), w = c(10, 10.0057192322758, 10.0198266325956,
10.040096099625, 10.0637654242843, 10.087779652849, 0.889708853982268,
0.890916575744663, 0.892121389863897, 0.89332329218135, 0.894522278550115,
0.895718344834999), x = c(10, 11.0467963604334, 12.1782709261765,
13.3728962503142, 14.6035317074526, 15.8398164069251, 62.6631746231113,
62.6583134156356, 62.6534565303638, 62.648604016965, 62.6437559251575,
62.6389123047088), y = c(10, 9.89605687874935, 9.59253574727296,
9.11222320249057, 8.48917353431654, 7.76447036695841, 0.00833796964522317,
0.00835876233547079, 0.00837957883570158, 0.00840041916631544,
0.00842128334742553, 0.00844217139885453), z = c(10, 9.05439359565339,
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183,
-7.50539460838544, -7.48234149534558, -7.45927733670319, -7.43620225192078,
-7.41311636057114, -7.39001978233681), alpha = c(0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), eta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), zeta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), lambda = c(0.95,
0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95
), phi = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), kappa = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), ode_outputs..iteration.. = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"1.1", "1.2", "1.3", "1.4", "1.5", "3.9995", "3.9996", "3.9997",
"3.9998", "3.9999", "3.10000"), class = "data.frame")
Ideally the function would use dplyr and/or baseR.

If I understand you correctly, there's no need to iterate over columns. It can all be done directly in dplyr...
library(tidyverse)
df %>%
group_by(alpha) %>%
summarise(
across(
c(w, x, y, z),
list(mean=mean, sd=sd)
),
.groups="drop"
) %>%
rename_with(function(x) str_sub(x,1,1), ends_with("mean"))
# A tibble: 2 x 9
alpha w w_sd x x_sd y y_sd z z_sd
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.1 10.0 0.0345 12.8 2.20 9.14 0.875 7.99 1.38
2 0.5 0.893 0.00225 62.7 0.00908 0.00839 0.0000390 -7.45 0.0432

R sample from sequence stepwise vs replicate with different results

EDIT (completely revised question as requested)
I get some unexpected behavior when sampling one index from a sequence stepwise vs sampling the whole sequence. If I set seed once
set.seed(123)
and execute
sample(c(0.9,0.95,1,1.01,1.02,1.03,1.04,1.05))
I get e.g.
[1] 1.03 0.90 1.02 1.00 0.95 1.04 1.05 1.01
[1] 1.05 0.95 1.01 1.04 0.90 1.00 1.03 1.02
[1] 0.90 1.04 1.01 1.05 1.00 0.95 1.03 1.02
However, if I repeatedly execute (very often, e.g. 100 times)
sample(c(0.9,0.95,1,1.01,1.02,1.03,1.04,1.05))[3]
R will never sample anything but 0.9, 0.95, 1 or 1.0. I also changed the seed but behavior is the same. What am I missing?
R version 3.1.3 (2015-03-09)
Platform: x86_64-w64-mingw32/x64 (64-bit)

No repro:
> set.seed(123)
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.96
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 1.06
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.98
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 1.08
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 1.09
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.9
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 1.01
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 1.08
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 1.01
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.99
And:
> set.seed(123)
> replicate(10,sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T))
[1] 0.96 1.06 0.98 1.08 1.09 0.90 1.01 1.08 1.01 0.99
Exact same list of values (as expected) as replicate is just a wrapper around sapply:
> replicate
function (n, expr, simplify = "array")
sapply(integer(n), eval.parent(substitute(function(...) expr)),
simplify = simplify)
With a small test I can find a seed replicating your problem (I think):
for(i in 1000:2000) {
set.seed(i)
if( all(replicate(10,sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)) < 1 )) {
print(i)
break
}
}
Gives me 1887 and so:
> set.seed(1887)
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.99
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.92
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.96
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.99
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.95
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.99
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.96
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.93
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.94
> sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T)[1]
[1] 0.99
> replicate(10,sample(seq(from = 0.9, to = 1.1, by = 0.01), size=1, replace=T))
[1] 1.07 1.06 0.97 1.07 1.00 0.99 0.91 1.01 1.05 0.97

The problem was the sequence creation which happened under digit constraints (options("digits"=2)). See here for an answer "R seq function produces wrong result"

connecting line like tree in r

I have following type data for human family:
indvidual <- c("John", "Kris", "Peter", "King", "Marry", "Renu", "Kim", "Ken", "Lu")
Parent1 <- c( NA, NA, "John", "John", "John", NA, "Peter", NA, NA)
Parent2 <- c( NA, NA, "Kris", "Kris", "Renu", NA, "Lu", NA, NA)
X <- c( 2, 3, 2, 3, 4, 5, 1.5, 1, 1)
Y <- c( 3, 3, 2, 2, 2, 3, 1, 3, 2)
pchsize <- c( 4.5, 4.3, 9.2, 6.2, 3.2, 6.4, 2.1, 1.9, 8)
fillcol <- c( 8.5, 8.3, 1.2, 3.2, 8.2, 2.4, 2.6, 6.1, 3.2)
myd <- data.frame (indvidual, Parent1, Parent2, X, Y, pchsize,fillcol)
indvidual Parent1 Parent2 X Y pchsize fillcol
1 John <NA> <NA> 2.0 3 4.5 8.5
2 Kris <NA> <NA> 3.0 3 4.3 8.3
3 Peter John Kris 2.0 2 9.2 1.2
4 King John Kris 3.0 2 6.2 3.2
5 Marry John Renu 4.0 2 3.2 8.2
6 Renu <NA> <NA> 5.0 3 6.4 2.4
7 Kim Peter Lu 1.5 1 2.1 2.6
8 Ken <NA> <NA> 1.0 3 1.9 6.1
9 Lu <NA> <NA> 1.0 2 8.0 3.2
I want plot something like the following, individuals points are connected to parents (Preferably different line color to Parent1 and Parent2 listed). Also pch size and pch fill is scaled to other variables pchsize and fillcol. Thus plot outline is:
Here is my progress in ggplot2:
require(ggplot2)
ggplot(data=myd, aes(X, Y,fill = fillcol)) +
geom_point(aes(size = pchsize, fill = fillcol), pch = "O") +
geom_text(aes (label = indvidual, vjust=1.25))
Issues unsolved: connecting lines, making size of pch big and fill color at the sametime.

Here is ggplot2 solution
library(ggplot2)
individual <- c("John", "Kris", "Peter", "King", "Marry", "Renu", "Kim", "Ken", "Lu")
Parent1 <- c( NA, NA, "John", "John", "John", NA, "Peter", NA, NA)
Parent2 <- c( NA, NA, "Kris", "Kris", "Renu", NA, "Lu", NA, NA)
X <- c( 2, 3, 2, 3, 4, 5, 1.5, 1, 1)
Y <- c( 3, 3, 2, 2, 2, 3, 1, 3, 2)
pchsize <- c( 4.5, 4.3, 9.2, 6.2, 3.2, 6.4, 2.1, 1.9, 8)
fillcol <- c( 8.5, 8.3, 1.2, 3.2, 8.2, 2.4, 2.6, 6.1, 3.2)
myd <- data.frame (individual, Parent1, Parent2, X, Y, pchsize,fillcol)
SegmentParent1 <- merge(
myd[, c("individual", "X", "Y")],
myd[!is.na(myd$Parent1), c("Parent1", "X", "Y")],
by.x = "individual", by.y = "Parent1")
SegmentParent2 <- merge(
myd[, c("individual", "X", "Y")],
myd[!is.na(myd$Parent1), c("Parent2", "X", "Y")],
by.x = "individual", by.y = "Parent2")
Segments <- rbind(SegmentParent1, SegmentParent2)
ggplot(data=myd, aes(X, Y)) +
geom_segment(data = Segments, aes(x = X.x, xend = X.y, y = Y.x, yend = Y.y)) +
geom_point(aes(size = pchsize, colour = fillcol)) +
geom_text(aes (label = indvidual), vjust = 0.5, colour = "red", fontface = 2) +
scale_x_continuous("", expand = c(0, 0.6), breaks = NULL) +
scale_y_continuous("", expand = c(0, 0.4), breaks = NULL) +
scale_size(range = c(20, 40)) +
theme_bw()

Here is a solution just using plot(), text(), and arrows(). The for loop is a bit cluttered, but will work for larger data sets and it should be easy to play with the plot and arrows:
plot(myd$X,myd$Y, col='white', type="p", main="", ylab="", xlab="",
axes = FALSE, ylim = c(min(myd$Y)*.8, max(myd$Y)*1.2),
xlim = c(min(myd$X)*.8, max(myd$X)*1.2))
child = data.frame()
child = myd[!is.na(myd$Parent1),]
DArrows = matrix(0,nrow(child),4);
MArrows = matrix(0,nrow(child),4);
for (n in 1:nrow(child)){
d<-child[n,];
c1<-myd$indvidual==as.character(d$Parent1);
b1<-myd[t(c1)];
c2<-myd$indvidual==as.character(d$Parent2);
b2<-myd[t(c2)];
DArrows[n, 1]=as.double(d$X)
DArrows[n, 2]=as.double(d$Y)
DArrows[n, 3]=as.double(b1[4])
DArrows[n, 4]=as.double(b1[5])
MArrows[n, 1]=as.double(d$X)
MArrows[n, 2]=as.double(d$Y)
MArrows[n, 3]=as.double(b2[4])
MArrows[n, 4]=as.double(b2[5])
}
arrows(DArrows[,3],DArrows[,4],DArrows[,1],DArrows[,2],lwd= 2, col = "blue",length=".1")
arrows(MArrows[,3],MArrows[,4],MArrows[,1],MArrows[,2],lwd=2, col = "red",length=".1")
par(new=TRUE)
plot(myd$X,myd$Y,type = "p", main = "", ylab = "", xlab = "",cex = myd$pchsize,
axes = FALSE, pch = 21, ylim = c(min(myd$Y)*.8, max(myd$Y)*1.2),
xlim=c(min(myd$X)*.8, max(myd$X)*1.2), bg = myd$fillcol,fg = 'black')
text(1.12*myd$X, .85*myd$Y, myd$indvidual)
arrows((DArrows[,3]+DArrows[,1])/2, (DArrows[,4]+DArrows[,2])/2,
DArrows[,1], DArrows[,2], lwd = 2, col = "blue", length = ".1")
arrows((MArrows[,3]+MArrows[,1])/2, (MArrows[,4]+MArrows[,2])/2,
MArrows[,1], MArrows[,2], lwd = 2, col = "red", length = ".1")

One thing that jumped out to me was to treat this is a network - R has many packages to plot these.
Here's a very simple solution:
First, I used your parent list to make a sociomatrix - you can generally input networks using edge lists as well - here I put 1 for the first parental relationship and 2 for the second.
psmat <- rbind(c(0, 0, 1, 1, 1, 0, 0, 0, 0),
c(0, 0, 2, 2, 0, 0, 0, 0, 0),
c(0, 0, 0, 0, 0, 0, 1, 0, 0),
rep(0, 9),
rep(0, 9),
c(0, 0, 0, 0, 2, 0, 0, 0, 0),
rep(0, 9),
rep(0, 9),
c(0, 0, 0, 0, 0, 0, 2, 0, 0))
Then, using the network package I just hit:
require(network)
plot(network(psmat), coord = cbind(X, Y), vertex.cex = pchsize,
vertex.col = fillcol, label = indvidual, edge.col = psmat)
This isn't terribly pretty in itself, but I think gives you all the basic elements you wanted.
For the colors, I believe the decimal places are just rounded - I wasn't sure what to do with those.
I know I've seen people plot networks in ggplot, so that might give you a better result.
Edit:
So here's a really messy way of turning your data into a network object directly - someone else might be able to fix it. Additionally, I add an edge attribute (named 'P' for parental status) and give the first set a value of 1 and the second set a value of 2. This can be used when plotting to set the colors.
P1 <- match(Parent1, indvidual)
e1 <- cbind(P1, 1:9); e1 <- na.omit(e1); attr(e1, 'na.action') <- NULL
P2 <- match(Parent2, indvidual)
e2 <- cbind(P2, 1:9); e2 <- na.omit(e2); attr(e2, 'na.action') <- NULL
en1 <- network.initialize(9)
add.edges(en1, e1[,1], e1[,2])
set.edge.attribute(en1, 'P', 1)
add.edges(en1, e2[,1], e2[,2], names.eval = 'P', vals.eval = 2)
plot(en1, coord = cbind(X, Y), vertex.cex = pchsize,
vertex.col = fillcol, label = indvidual, edge.col = 'P')

Alternative solution use igraph
library(igraph)
mm<-data.frame(dest=c(as.character(myd$Parent1),as.character(myd$Parent2)))
mm$orig<-myd$individual
g<-graph.edgelist(as.matrix(mm[!is.na(mm$dest),]))
rownames(myd)<-as.character(myd[,1])
l<-as.matrix(myd[V(g)$name,4:5])
plot(g,layout=l,vertex.color=myd[V(g)$name,6],vertex.size=myd[V(g)$name,6])
Just play a bit with color a sizes!

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Scale circle size Venn diagram by relative proportion - r

Related

Better illustration of very small differences in graphs

How do I find best starting values for nlimb optimization?

Iterate over character vector in dplyR summarise and use it to assign new column names

R sample from sequence stepwise vs replicate with different results

connecting line like tree in r

Categories

Resources