How to scale the axes of a plot in R

I have two data tables:
vah_p_1
x y
0 4
0.25 5
0.27 6
0.29 7
0.31 8
0.33 10
0.34 13
0.36 16
0.37 20
0.38 23
0.39 28
0.4 37
0.41 43
0.42 55
0.43 67
0.44 81
0.45 94
0.46 118
0.47 143
0.48 187
0.49 225
vah_o_1
x y
-17.2 -9
-14.2 -8
-9.27 -7
-6.9 -6
-4.09 -5
0 -4
I need to plot the data from both tables in one graph (code below).
# Fit log2(|y|) ~ x for each table, keep the tidied coefficient tables, and
# draw the raw points of both tables in a single scatter plot.
library(ggplot2)

# First table: regress log2(y) on x.
vah_p <- read.table(file = "vah_p_1", header = TRUE)
mat_p <- data.frame(x = vah_p$x, y = log2(vah_p$y))
# broom is never attached, so tidy() must be namespace-qualified.
error_p <- broom::tidy(lm(y ~ x, data = mat_p))

# Second table: y is negated before taking the logarithm.
vah_o <- read.table(file = "vah_o_1", header = TRUE)
mat_o <- data.frame(x = vah_o$x, y = log2(-vah_o$y))
error_o <- broom::tidy(lm(y ~ x, data = mat_o))

# Both point layers use the same x/y mapping; the second one swaps the data.
p <- ggplot(vah_p, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = vah_o, aes(x = x, y = y))
p
After compilation I will get a graph.
(source: savepice.ru)
This graph looks very bad. I tried to rescale the axes so that the plot would look better, but I did not succeed. Please help.

If I understand the problem correctly and you would like to change the scale, use
ggplot() + ylim(min, max)

Related

Why does my random trajectory show a systematic bias in angle?

I have a trajectory in 2D (a list of x,y positions).
I am trying to measure the angles of the motion between consecutive points.
So I calculate the scalar product of the two consecutive vectors and divide by the vector norms, which gives me the cosine of the angle I am looking for.
However, when I generate totally random trajectories (by generating random x and random y), I always get a high number of cosine values very close to -1 or 1, while I was expecting all values between -1 and 1 to be equally likely.
Here's my code to generate the trajectories (after correction from the comments below) and calculate the cosines:
# Simulate a 2-D random walk and collect, in `cost`, the cosine of the angle
# between each pair of consecutive displacement vectors.
t <- seq(0, 500, 0.5)

# Uniform increments in [-1, 1], accumulated into positions.
x <- cumsum(runif(length(t), -1, 1))
y <- cumsum(runif(length(t), -1, 1))

step <- 1
dstep <- 2

# Start indices j for which j + step and j + dstep are still inside the path.
j <- seq_len(length(t) - dstep)

# First displacement (j -> j + step) and second one (j + step -> j + dstep).
x1 <- x[j + step] - x[j]
y1 <- y[j + step] - y[j]
x2 <- x[j + dstep] - x[j + step]
y2 <- y[j + dstep] - y[j + step]

# Vector norms and their product, computed for all pairs at once.
n1 <- sqrt(x1 * x1 + y1 * y1)
n2 <- sqrt(x2 * x2 + y2 * y2)
norm_prod <- n1 * n2

# Pairs with a zero-length displacement have no defined angle and are dropped.
keep <- norm_prod > 0
scal <- x1 * x2 + y1 * y2
cost <- scal[keep] / norm_prod[keep]
When I look at the histogram of the cost results, I always have a high number of values very close to -1 and 1:
> hist(cost, plot=F)
$breaks
[1] -1.00 -0.95 -0.90 -0.85 -0.80 -0.75 -0.70 -0.65 -0.60 -0.55 -0.50 -0.45
[13] -0.40 -0.35 -0.30 -0.25 -0.20 -0.15 -0.10 -0.05 0.00 0.05 0.10 0.15
[25] 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75
[37] 0.80 0.85 0.90 0.95 1.00
$counts
[1] 108 43 32 20 22 21 19 20 19 17 16 19 8 19 23 17 15 10 18
[20] 22 15 19 14 15 18 16 21 11 18 20 16 35 23 24 24 20 23 33
[39] 37 107
Any idea where I'm wrong or why it should do that ?
Thanks for help
In case somebody else meets this problem, here's the summary of the solution from the comments:
Actually this distribution of the cos is what you get when angles are uniformly distributed! Consider hist(cos(runif(1000, min = 0, max = 2*pi))). So it's working as expected. cos just moves quickly over 0 and slowly over 1 and -1. See plot(cos, from = 0, to = 2*pi).
Which is indeed explained there: https://math.stackexchange.com/questions/1153339/distribution-of-cosine-of-uniformly-random-variables
The solution is thus that it is normal to have more cosine values close to 1 and -1 when the angles are distributed uniformly at random.

ggplot() color each point manually

How do I create a scatter-plot in ggplot() with each points coloured manually? The necessary colours are given in my dataframe.
> head(df)
x y col
1 0.72 2757 #2AAE89
2 0.72 2757 #2DFE83
3 0.72 2757 #40FE89
4 0.70 2757 #28FE97
5 0.86 2757 #007C7D
6 0.75 2757 #24FEA1
The colour of the points must be exactly as given in the dataframe
Luckily there is a relatively easy solution by using scale_colour_identity(), see the following example:
library(ggplot2)

# Example data; the header has one field fewer than the rows, so read.table()
# uses the first column as row names.
z <- " x y z col
1 0.72 2757 86 #2AAE89
2 0.72 2757 86 #2DFE83
3 0.72 2757 86 #40FE89
4 0.70 2757 82 #28FE97
5 0.86 2757 26 #007C7D
6 0.75 2757 79 #24FEA1"

# read.table() treats "#" as the start of a comment by default, which would
# strip every hex colour and shift the columns; comment.char = "" turns that
# off so `col` is read as data.
df <- read.table(
  text = z, header = TRUE, comment.char = "", stringsAsFactors = FALSE
)

# scale_colour_identity() uses the values of `col` directly as the colours.
ggplot(df, aes(x, y, colour = col)) +
  geom_point() +
  scale_colour_identity()
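EDIT: Note that read.table() treats # as a comment character by default, so the hex colours are only read in correctly when comment.char = "" is passed; the plotting syntax is valid either way.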

Can't add legend panel to a certain scatter plot with multiple data sets

I simply can't find a way to plot legends panel in this specific ggplot with ggplot2 on R. Just want to make it appear.
For context, I'm plotting chemical abundances of sample versus the atomic number of the elements.
For background, I tried many things that are described here:
Reasons that ggplot2 legend does not appear
including links therein, however could not find a solution for my specific data set.
I know the problem could be within the structure of the data set, since I've been able to do that with other data, but I can't solve it. I also know that the problem should have to do with the theme() described in the code below, because when I use the default ggplot configuration the legends actually appear. I use this personalized theme for consistency throughout my work.
This is what I have so far removing cosmetics:
ggplot(atomic, aes(x=atomic$Z, y = atomic$avg, group=1), fill = atomic$Z) +
# plot dots for average of values
geom_point(data=atomic, aes(x=atomic$Z, y=atomic$avg, group=1, color="black"), size=0.5, alpha=1, shape=16 ) +
# connect dots for average of values
geom_line(data=atomic, aes(x=atomic$Z, y=atomic$avg, group=1), color="black", linetype= "dashed") +
# plot dots for actual values from the samples
geom_point(data=atomic, aes(x=atomic$Z, y=atomic$SDSS, group=1, color="#00ba38"), size=5, alpha=1, shape=16, color="#00ba38") +
geom_point(data=atomic, aes(x=atomic$Z, y=atomic$HE22, group=1, color="#619cff"), size=5, alpha=1, shape=16, color="#619cff") +
geom_point(data=atomic, aes(x=atomic$Z, y=atomic$HE12, group=1, color="#F8766D"), size=5, alpha=1, shape=16, color="#F8766D") +
EDIT: the Definition of base_breaks (used below)
# Build the pieces for a trimmed x axis: a segment drawn along the bottom edge
# of the panel (y = -Inf) from the smallest to the largest pretty() break of
# `x`, plus a continuous x scale that uses those same breaks.
# Returns a list of two ggplot2 components that can be added to a plot with `+`.
base_breaks_x <- function(x) {
  breaks <- pretty(x)
  axis_df <- data.frame(
    y = -Inf, yend = -Inf,
    x = min(breaks), xend = max(breaks)
  )
  # inherit.aes = FALSE keeps the plot-level mapping out of this layer.
  axis_line <- geom_segment(
    data = axis_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE
  )
  list(axis_line, scale_x_continuous(breaks = breaks))
}
# Build the pieces for a trimmed y axis: a segment drawn along the left edge
# of the panel (x = -Inf) from the smallest to the largest pretty() break of
# `x`, plus a continuous y scale that uses those same breaks.
# Returns a list of two ggplot2 components that can be added to a plot with `+`.
base_breaks_y <- function(x) {
  breaks <- pretty(x)
  axis_df <- data.frame(
    x = -Inf, xend = -Inf,
    y = min(breaks), yend = max(breaks)
  )
  # inherit.aes = FALSE keeps the plot-level mapping out of this layer.
  axis_line <- geom_segment(
    data = axis_df,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE
  )
  list(axis_line, scale_y_continuous(breaks = breaks))
}
# the problem might be here
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
text = element_text(size=20),
legend.position="bottom",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
base_breaks_x(atomic$Z) +
base_breaks_y(atomic$HE22)
The data set is the following
Z Name HE22 SDSS HE12 avg
1 3 Li NA 1.00 NA 1.00
2 6 C 6.16 5.50 6.06 5.91
3 7 N NA NA 6.49 6.49
4 11 Na NA NA 3.53 3.53
5 12 Mg 5.32 4.43 4.99 4.91
6 13 Al 2.90 NA 3.08 2.99
7 14 Si NA 4.90 4.89 4.90
8 20 Ca 4.07 3.37 3.56 3.67
9 21 Sc 0.72 -0.07 0.24 0.30
10 22 Ti 2.74 1.79 2.47 2.33
11 23 V NA NA 1.18 1.18
12 24 Cr 2.88 2.14 2.67 2.56
13 25 Mn 2.34 1.59 2.44 2.12
14 26 Fe 4.92 4.14 4.59 4.55
15 27 Co 2.57 1.72 2.36 2.22
16 28 Ni 3.63 2.96 3.51 3.37
17 29 Cu NA NA 0.31 0.31
18 30 Zn 2.29 NA 2.44 2.37
19 38 Sr 0.62 0.29 0.41 0.44
20 39 Y -0.22 -0.44 -0.33 -0.33
21 40 Zr 0.60 NA 0.30 0.45
22 56 Ba 0.13 -0.10 0.12 0.05
23 57 La -0.77 -0.49 -0.77 -0.68
24 58 Ce NA NA -0.39 -0.39
25 59 Pr NA NA -0.78 -0.78
26 60 Nd -0.47 NA -0.37 -0.42
27 62 Sm NA NA -0.57 -0.57
28 63 Eu -1.02 -0.92 -0.85 -0.93
29 64 Gd NA NA -0.39 -0.39
30 66 Dy NA NA -0.16 -0.16
31 68 Er NA -0.40 NA -0.40
32 70 Yb NA -0.60 NA -0.60
33 90 Th NA -0.60 NA -0.60
as Z = atomic number, Name = element, HE12/HE22/SDSS = samples, avg = average of the samples.
I would like to know how I can add legend panel coherent with the colors of my scatter plots.
Thank you so much! Hope I could describe the problem properly.
This is personally what I would do.
I converted the data from wide format to long format since it's easier to manipulate colors that way (Sorry I just used generic "key" and "value" since I'm not sure what you would want your columns to be named). Hopefully this will get you at least part of the way to where you want to go. Let me know if you have questions!
library(ggplot2)
library(tidyr)

# Colours keyed by sample name, matching the colours used in the question.
# Naming the vector pins each colour to its sample; an unnamed vector would be
# assigned in the alphabetical order of the keys (HE12, HE22, SDSS).
sample_colours <- c(SDSS = "#00ba38", HE22 = "#619cff", HE12 = "#F8766D")

# Reshape to long format (one row per element/sample pair) so that colour can
# be mapped to the sample, which is what makes ggplot2 draw a legend.
p <- atomic %>%
  pivot_longer(c(SDSS, HE22, HE12), names_to = "key", values_to = "value") %>%
  ggplot(aes(Z, value, color = key)) +
  geom_point() +
  geom_text(aes(x = Z, y = avg, label = Name), # EDITED
            color = "black") +
  # The `+` above is required: without it the scale is a separate statement
  # and is never attached to the plot.
  scale_color_manual(values = sample_colours)

# Dashed line through the per-element averages, then the personalised theme.
# Columns are referenced by bare name inside aes() rather than via `atomic$`.
p +
  geom_line(data = atomic, aes(x = Z, y = avg, group = 1), color = "black",
            linetype = "dashed") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 20),
        legend.position = "bottom",
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  base_breaks_x(atomic$Z) +
  base_breaks_y(atomic$HE22)
EDITED
I added the geom_text() command so labels show up. You can adjust the arguments so the labels look better. I've also heard geom_text_repel() in the ggrepel package is helpful for creating nice labels: https://cran.r-project.org/web/packages/ggrepel/vignettes/ggrepel.html#examples

dplyr: categorical counts from a single column across multiple variables

So I have been trying to do a boxplot of "yes/no" counts for hours now.
My dataset looks like this
> stack
Site Plot Treatment Meters Retrieved
2 Southern 18 Control -5.00 y
3 Southern 18 Control 9.55 y
4 Southern 18 Control 4.70 y
5 Southern 27 Control -5.00 y
6 Southern 27 Control 20.00 n
9 Southern 18 Control -0.10 y
17 Southern 18 Control 20.00 y
23 Southern 31 Control 100.00 y
53 Southern 25 Mu 3.55 n
54 Southern 20 Mu 5.90 y
55 Southern 25 Mu -0.10 y
56 Southern 29 Mu 9.55 y
58 Southern 25 Mu 4.70 y
60 Southern 20 Mu 2.90 y
61 Southern 24 Mu 5.90 n
62 Southern 24 Mu 3.55 y
63 Southern 20 Mu 3.55 y
65 Southern 24 Mu 0.55 y
66 Southern 29 Mu 8.90 y
68 Southern 25 Mu 8.90 y
69 Southern 29 Mu 0.55 y
70 Southern 24 Mu 1.70 y
72 Southern 29 Mu -5.00 y
76 Southern 29 Mu 1.70 y
77 Southern 25 Mu 9.55 y
78 Southern 25 Mu 13.20 y
79 Southern 29 Mu 3.55 y
80 Southern 25 Mu 15.00 y
81 Southern 25 Mu -5.00 n
84 Southern 24 Mu 8.90 y
85 Southern 20 Mu 6.55 y
86 Southern 29 Mu 2.90 y
92 Southern 24 Mu -0.10 y
93 Southern 20 Mu 100.00 y
I want to get counts of both y(yes) and n(no) of the variable "Retrieved" while grouping for "Treatment" and "Meters".
So it should look something like this
Treatment Meters Yes No
Control -5.00 2 0
Control 9.55 1 2
Control 4.70 1 1
Control 20.00 0 2
Mu 3.55 4 0
Mu 5.90 0 1
Mu -0.10 2 2
Mu 9.55 1 0
With this data I want to do a stacked boxplot with x=Meters, y= count and Treatment as grid or something. like this
This is my code but it's not working
# Tally the rows for every Retrieved/Treatment/Meters combination, then copy
# the tally column `n` into a column named `count`.
plot_data <- stack %>%
count(Retrieved, Treatment, Meters) %>%
group_by(Treatment, Meters) %>%
mutate(count= n)
plot_data
# Columns of the tallies per Meters value, filled by Treatment and rescaled
# with position = "fill".
ggplot(plot_data, aes(x = Meters, y = count, fill = Treatment)) +
geom_col(position = "fill") +
# NOTE(review): count(count) calls the count() function on the `count` column
# instead of using the column itself as the label -- likely a cause of the
# failure; confirm.
geom_label(aes(label = count(count)), position = "fill", color = "white", vjust = 1, show.legend = FALSE) +
# NOTE(review): `labels = count` hands the count() function to the scale as a
# label formatter -- presumably not what was intended; confirm.
scale_y_continuous(labels = count)
Could you please tell me what I'm doing wrong.
geom_bar is for precisely this case, and you won't even need to use group_by or count. (From the docs: "geom_bar makes the height of the bar proportional to the number of cases in each group".)
This should do what you're looking for:
# One bar per Meters value: geom_bar() counts the rows itself, and the bars
# are split (stacked) by Treatment.
ggplot(data = stack, mapping = aes(x = Meters, fill = Treatment)) +
  geom_bar(position = "stack")
However, the bars will be very narrow because "Meters" is continuous and has a large range. You could address this by converting it into a factor. One way to do that would be to do this first:
# Treat every distinct Meters value as its own category so the bars get a
# usable width. The data set is named `stack` (as in the plot call); `data`
# was never defined and refers to a base R function.
stack <- stack %>%
  mutate(Meters = as.factor(Meters))
If you want to get the counts in the format that you mentioned (in addition to creating the plot), you could do:
# Tally every Treatment/Meters/Retrieved combination, spread the y/n tallies
# into their own columns (0 where a combination never occurs), and give the
# columns readable names. The data set is named `stack`; `data` was never
# defined and refers to a base R function.
stack %>%
  count(Treatment, Meters, Retrieved) %>%
  spread(Retrieved, n, fill = 0) %>%
  rename(Yes = y, No = n)
count does group_by for you, so I didn't need to carry that over from your code. Then, spread creates the separate columns for y and n. Finally, I rename those columns to Yes and No.

How to add shaded confidence intervals to line plot with specified values

I have a small table of summary data with the odds ratio, upper and lower confidence limits for four categories, with six levels within each category. I'd like to produce a chart using ggplot2 that looks similar to the usual one created when you specify a lm and its se, but I'd like R just to use the pre-specified values I have in my table. I've managed to create the line graph with error bars, but these overlap and make it unclear. The data look like this:
interval OR Drug lower upper
14 0.004 a 0.002 0.205
30 0.022 a 0.001 0.101
60 0.13 a 0.061 0.23
90 0.22 a 0.14 0.34
180 0.25 a 0.17 0.35
365 0.31 a 0.23 0.41
14 0.84 b 0.59 1.19
30 0.85 b 0.66 1.084
60 0.94 b 0.75 1.17
90 0.83 b 0.68 1.01
180 1.28 b 1.09 1.51
365 1.58 b 1.38 1.82
14 1.9 c 0.9 4.27
30 2.91 c 1.47 6.29
60 2.57 c 1.52 4.55
90 2.05 c 1.31 3.27
180 2.422 c 1.596 3.769
365 2.83 c 1.93 4.26
14 0.29 d 0.04 1.18
30 0.09 d 0.01 0.29
60 0.39 d 0.17 0.82
90 0.39 d 0.2 0.7
180 0.37 d 0.22 0.59
365 0.34 d 0.21 0.53
I have tried this:
# Error-bar limits taken from the pre-computed `upper` and `lower` columns.
limits <- aes(ymax=upper, ymin=lower)
# Dodge position used to offset the error bars horizontally.
dodge <- position_dodge(width=0.9)
# Line per drug with the pre-computed limits drawn as error bars.
# NOTE(review): x is mapped to `days`, but the table shown above names that
# column `interval` -- confirm which name the real data uses.
ggplot(data, aes(y=OR, x=days, colour=Drug)) +
geom_line(stat="identity") +
geom_errorbar(limits, position=dodge)
and searched for a suitable answer to create a pretty plot, but I'm flummoxed!
Any help greatly appreciated!
You need the following lines:
# Odds ratio per interval as points joined by lines, one colour per drug.
p <- ggplot(data = data, aes(x = interval, y = OR, colour = Drug)) +
  geom_point() +
  geom_line()
# Shaded band between the pre-computed confidence limits. The columns are
# referenced by bare name so they are looked up in the plot's data; using
# `data$lower` inside aes() bypasses that lookup and can misalign rows when
# the data is grouped, faceted or reordered.
p <- p +
  geom_ribbon(aes(ymin = lower, ymax = upper), linetype = 2, alpha = 0.1)
Here is a base R approach using polygon() since @jmb requested a solution in the comments. Note that I have to define two sets of x-values and associated y-values for the polygon to plot. It works by plotting the outer perimeter of the polygon. I define plot type = 'n' and use points() separately to get the points on top of the polygon. My personal preference is the ggplot solutions above when possible since polygon() is pretty clunky.
library(tidyverse)
data("mtcars") # built-in dataset

# Per-cylinder mean mpg together with a band of one standard error on either
# side of the mean.
mean.mpg <- summarise(
  group_by(mtcars, cyl),
  N = n(),
  avg.mpg = mean(mpg),
  SE.low = avg.mpg - (sd(mpg) / sqrt(N)),
  SE.high = avg.mpg + (sd(mpg) / sqrt(N))
)

# Set up empty axes first (type = "n") so the points can be drawn over the
# band afterwards.
plot(avg.mpg ~ cyl, data = mean.mpg, ylim = c(10, 30), type = "n")

# Trace the band's outline: the lower edge left to right, then the upper edge
# back again, i.e. c(x1, x2) and c(y1, y2).
with(mean.mpg, polygon(
  c(cyl, rev(cyl)),
  c(SE.low, rev(SE.high)),
  density = 200, col = "grey90"
))

points(avg.mpg ~ cyl, data = mean.mpg, pch = 19, col = "firebrick")

Resources