ggplot counting observations in each quadrant - r

I have some data as below:
# A tibble: 158 x 2
X Y
<dbl> <dbl>
1 -0.71 -2.39
2 0.92 0.573
3 -2.52 -1.61
4 3.88 5.43
5 0.15 0.106
6 3.49 6.66
7 -0.54 0.613
8 1.4 4.21
9 1.16 0.107
10 -3.37 -3.62
# ... with 148 more rows
I plot the observations and draw a line horizontally and vertically at 0 using:
# Scatter plot of Y against X with horizontal and vertical reference lines
# through zero.
ggplot(data = df, mapping = aes(x = X, y = Y)) +
  geom_point() +
  # geom_smooth(method = "lm") +
  geom_hline(mapping = aes(yintercept = 0)) +
  geom_vline(mapping = aes(xintercept = 0))
What I would like to do now is count the number of points that fall in each quadrant of the plot and display that count inside the corresponding quadrant.
Data:
df <- structure(list(X = c(-0.71, 0.92, -2.52, 3.88, 0.15, 3.49, -0.54,
1.4, 1.16, -3.37, -0.55, -0.74, 2.13, 1.33, 3.27, 1.74, 0.65,
1.23, -1.75, 0.9, 3.86, 3.69, -1.74, -3.43, 0.67, 3.83, 2.32,
-5.46, -0.55, -6.39, -2.23, -1.3, 4.72, 2.42, -7.9, -1.54, 0.99,
-9.97, -18.41, -7.73, 1.5, -7.5, -9.88, 8.82, 10.48, 6.7, -0.23,
8.15, 3.02, 4.54, -2.76, 5.77, 3.03, -3.63, 3.71, 6.27, 1.92,
-7.86, -5.5, -4.44, 9.47, 3.89, 0.81, 6.83, 1.98, 4.01, 0.43,
2.79, -1.48, -1.87, -5.93, -8.58, 11.56, -0.46, 0.33, 5.27, 4.32,
2.4, -0.64, -6.7, 3.74, 1.01, 2.76, 2.8, -1.63, 0.65, 1.3, 5.33,
0.96, 3.71, 1.27, 2.53, -1.52, 5.69, -2.53, 3.82, 4.09, 2.79,
2.64, -3.42, 4.72, 0.62, 0.25, 1.98, 2.82, -2.06, 4.06, -2.45,
2.03, 2.22, -0.2, -3.47, 6.15, -1.2, 1.11, 1, -1.71, 1.05, -5.93,
-3.35, 7.53, 0.45, -2.45, -5.73, 0.26, 7, 1.12, 1.39, -0.11,
0.43, 0.34, -2.05, 4.54, 1.76, 2.15, 3.26, 0.2, 0.84, 0.93, 0.98,
1.97, 0.07, 2.48, 1.98, 2.88, 1.18, 5.23, -3.95, -2.17, 0.35,
2.51, 0.39, 3.11, 3.09, 0.06, -7.81, 1.62, -9.53), Y = c(-2.38916419707325,
0.572675136581781, -1.61130358515631, 5.42706994951004, 0.105533424368025,
6.65697289481407, 0.613486039256266, 4.21013704773222, 0.106990463992386,
-3.62352710962904, -0.203607589793183, -4.24563967581072, 2.97070300267885,
2.92544516479698, 5.02538739147422, 2.25461465260415, 1.66492554339803,
3.5690423154001, 0.108411247307002, 0.961008630173696, 3.79172784045593,
1.94108347244724, -2.12992072359958, -5.87473482253699, -1.45100684091412,
1.47842234462587, 1.43196010231586, -7.74290369146724, -2.79056547363334,
-5.03532133668577, -1.99400739381075, -2.92320856826413, 3.93394610595585,
3.29451174347621, -10.0410470556235, 3.34517672842812, 2.41625183369762,
-10.3476519710384, -21.791966984666, -11.1142687331988, 3.32761656369176,
-3.96223311815655, -11.093184503697, 11.6694167237026, 22.2461574652919,
9.28255170483023, 4.63817899423635, 11.8553670456421, 8.27889381692159,
8.19911670446593, -6.470817611772, 3.09218109975165, 7.5825172514382,
0.0284717847140023, 4.90864483240255, 10.0311544305095, 8.55401150272708,
-8.84107625063785, -8.04105369987643, -6.65872061590883, 10.8577722872979,
4.03706922467202, 3.04148092466194, 8.90634921641063, 1.56555573277521,
4.42535372370123, 0.841035482771217, 1.75578768128183, -2.67241757153407,
-2.25418139889371, -8.7723458397205, -11.2420616969584, 11.4836809985778,
-1.8649021388476, 0.832085873992507, 11.6062841497052, 2.59039949751966,
2.28509371230735, -1.97715071813135, -7.3280081242774, 3.97121830333205,
-0.569284938256821, 2.31082313266322, 3.02490478503254, -1.38512132143018,
-0.866847983058995, 2.97552563660034, 5.95976111047322, -0.102502393594657,
4.58003409048615, 0.842834319309465, 3.06786040532266, 0.250639945095402,
6.78696057469418, -1.62606880448011, 5.46367912370997, 2.53357559730344,
4.73895950607308, 2.50934817572881, -0.312149263565189, 4.82621271905962,
-0.79009628184665, -3.12115495501355, -0.461711220579862, 4.27359516836912,
-4.60871127364226, 3.84488020178729, -5.26245849925393, 3.54222359765326,
1.04191534953213, 1.4982293818719, -3.56618092951384, 4.95478586278666,
-0.270584959088251, -0.900452947549406, 0.901254072925249, -0.254483190258712,
-2.63217404877559, -4.71624328721887, -7.1747474980974, 4.86036342835152,
3.24549729559669, -4.19219918146311, -10.128570960197, 0.803895306904637,
9.33865112323734, 2.85517888612945, 0.316844258915139, -0.151669189522978,
1.00839469793829, 1.57398998124214, -5.0607247073979, 8.91704977465508,
2.59984205825244, 1.31737969318745, 2.70804837397023, 1.80193676584248,
1.48362026996833, -2.11380109244311, 3.54300752215851, 1.6501194298151,
-1.01504840432201, 6.74326962933175, 0.1866931051541, 2.9825290286452,
1.42593783576641, 2.71110274944611, -4.09572797775837, 1.50144422897237,
-0.552818435076999, 5.23843746771127, 1.33321908169899, 1.28745947800351,
2.60490918566195, -1.54038908822145, -9.6363012621261, -0.190177144865133,
-13.0653210889016)), row.names = c(NA, -158L), class = c("tbl_df",
"tbl", "data.frame"))

library(dplyr)
# One row per quadrant: `right` / `top` flag strictly positive X / Y and `n`
# is the number of points with that combination.
quad_count <- mutate(
  count(df, right = X > 0, top = Y > 0),
  # Logical flags behave as 0/1 in arithmetic, so 2 * flag - 1 is -1 or +1;
  # these values are the label coordinates for each quadrant.
  X = 2 * right - 1,
  Y = 2 * top - 1
)

ggplot(df, aes(x = X, y = Y)) +
  geom_point() +
  geom_hline(aes(yintercept = 0)) +
  geom_vline(aes(xintercept = 0)) +
  # The label layer uses the summary table; x and y are inherited from the
  # plot-level mapping, so the labels land at the X / Y computed above.
  geom_text(mapping = aes(label = n), data = quad_count, size = 10)

# Scatter plot with the per-quadrant point counts drawn at (+/-10, +/-10).
ggplot(df, aes(x = X, y = Y)) +
  geom_point() +
  # geom_smooth(method = "lm") +
  geom_hline(aes(yintercept = 0)) +
  geom_vline(aes(xintercept = 0)) +
  geom_text(
    data = df %>%
      # Replace X and Y by "is non-negative" flags and count each combination.
      count(X = X >= 0, Y = Y >= 0) %>%
      # Logical flags are 0/1 in arithmetic: FALSE -> -10, TRUE -> +10.
      mutate(X = 20 * X - 10, Y = 20 * Y - 10),
    mapping = aes(x = X, y = Y, label = n),
    size = 10
  )

Related

ggplot2 grouped replicate samples

My csv file contains replicate values, e.g., Mab1, Mab1rep, Mab2, Mab2rep, etc.
The data frame Data is as below:
Data <- structure(list(Samples = c("Isotype_L", "Isotype_L", "Isotype_L",
"Isotype_L", "Mab1", "Mab1", "Mab1", "Mab1", "Mab1-GL", "Mab1-GL",
"Mab1-GL", "Mab1-GL", "Mab2", "Mab2", "Mab2", "Mab2", "Mab2-GL",
"Mab2-GL", "Mab2-GL", "Mab2-GL", "Mab3", "Mab3", "Mab3", "Mab3",
"Mab4", "Mab4", "Mab4", "Mab4", "Mab4", "Mab5", "Mab5", "Mab5",
"Mab5", "Mab5", "Isotype_K", "Isotype_K", "Isotype_K", "Isotype_K",
"Isotype_Lrep", "Isotype_Lrep", "Isotype_Lrep", "Isotype_Lrep",
"Mab1rep", "Mab1rep", "Mab1rep", "Mab1rep", "Mab1rep", "Mab1-GLrep",
"Mab1-GLrep", "Mab1-GLrep", "Mab1-GLrep", "Mab2rep", "Mab2rep",
"Mab2rep", "Mab2rep", "Mab2-GLrep", "Mab2-GLrep", "Mab2-GLrep",
"Mab2-GLrep", "Mab3rep", "Mab3rep", "Mab3rep", "Mab3rep", "Mab4rep",
"Mab4rep", "Mab4rep", "Mab4rep", "Mab4rep", "Mab5rep", "Mab5rep",
"Mab5rep", "Mab5rep", "Mab5rep", "Isotype_Krep", "Isotype_Krep",
"Isotype_Krep", "Isotype_Krep", "PosCtrl", "PosCtrl", "PosCtrl",
"PosCtrl", "PosCtrl", "neg-AF488", "neg-AF488", "neg-AF488",
"neg-AF488", "Negative", "Negative", "Negative", "Negative",
"PosCtrl_rep", "PosCtrl_rep", "PosCtrl_rep", "PosCtrl_rep", "neg-AF488rep",
"neg-AF488rep", "neg-AF488rep", "neg-AF488rep", "Negative_rep",
"Negative_rep", "Negative_rep", "Negative_rep"), Blue = c(128.3952818,
120.2831546, 143.243713, 132.0577827, 133.8880534, 133.7664632,
121.0706891, 157.5932623, 182.4168577, 160.3366789, 205.4662033,
194.5710452, 136.0504487, 130.899206, 158.7230946, 146.368408,
152.2359201, 135.2182368, 142.0670308, 117.5533153, 135.8317231,
147.2705529, 130.3724567, 137.7607945, 135.0915241, 114.6307573,
153.3744009, 148.6203231, 141.6522212, 142.7500602, 129.3132835,
133.323963, 161.4505614, 120.3986388, 149.467766, 131.8404767,
121.1891517, 134.1152953, 142.9095762, 148.1782023, 133.1172244,
132.8860874, 124.8857092, 140.9295437, 122.7443303, 142.281986,
148.2327674, 138.7267188, 147.7389215, 157.2358721, 153.455753,
135.239042, 168.5716308, 122.357492, 141.6833326, 125.6991336,
121.3251682, 142.712414, 174.2987679, 140.9524518, 121.1017373,
154.801132, 126.8055734, 145.4754619, 168.1953102, 121.5520058,
137.4914411, 142.5554603, 147.9192906, 123.1908202, 134.2369485,
132.6270733, 143.3067567, 120.2250493, 127.5301465, 142.1151132,
125.718732, 117.2397291, 134.7169574, 120.9030571, 138.0262017,
121.5363059, 140.1157374, 171.9441906, 179.801995, 157.7747676,
135.5647523, 130.947343, 124.0994119, 117.3040363, 120.2912237,
128.9369029, 129.2967454, 134.7686437, 127.5407896, 155.7879164,
134.9068068, 121.4993647, 146.2323789, 131.6257992, 161.208799,
137.8464021), Green = c(204.0416907, 179.8289799, 192.7909809,
185.1904749, 119.5289134, 116.1968717, 119.8961343, 119.3418334,
114.7639073, 113.7169804, 118.3994388, 118.0875025, 120.7343683,
119.6826046, 121.079657, 124.4646777, 118.125646, 114.1900465,
114.0732686, 110.5228171, 115.0555818, 118.7761173, 113.2995208,
118.5396075, 167.6058496, 149.1461499, 189.7257013, 207.9481807,
177.2098519, 118.5133042, 118.6931648, 119.4754029, 128.9372642,
129.7043945, 112.037337, 111.9090535, 110.2099861, 112.2431433,
191.4316539, 201.5396396, 190.3129216, 192.7112734, 114.2036743,
115.6031688, 115.5844771, 115.7509866, 118.9890215, 112.9275697,
115.6021348, 119.0952462, 117.3730964, 113.6875097, 117.9319529,
114.2584918, 182.7833727, 111.9750247, 114.6643268, 117.7445263,
119.7687462, 113.3304581, 146.4097633, 114.3161156, 111.3511068,
200.9120144, 218.8782048, 169.1520322, 161.2219501, 266.5332884,
117.3344686, 117.3277836, 118.1452713, 115.3104536, 127.5856625,
112.5214363, 116.5449408, 115.1459536, 111.7753407, 349.2590405,
385.2193187, 439.0155097, 490.8051766, 394.1068064, 107.9149422,
108.005748, 108.1659999, 109.7366457, 107.8067543, 108.471598,
108.4746003, 108.6726188, 330.5756935, 329.6602842, 243.6285135,
266.0160698, 107.21539, 108.4953225, 104.6257189, 108.7797861,
108.3317481, 107.2107311, 107.6584237, 106.8200559), Green_norm = c(1.59,
1.5, 1.35, 1.4, 0.89, 0.87, 0.99, 0.76, 0.63, 0.71, 0.58, 0.61,
0.89, 0.91, 0.76, 0.85, 0.78, 0.84, 0.8, 0.94, 0.85, 0.81, 0.87,
0.86, 1.24, 1.3, 1.24, 1.4, 1.25, 0.83, 0.92, 0.9, 0.8, 1.08,
0.75, 0.85, 0.91, 0.84, 1.34, 1.36, 1.43, 1.45, 0.91, 0.82, 0.94,
0.81, 0.8, 0.81, 0.78, 0.76, 0.76, 0.84, 0.7, 0.93, 1.29, 0.89,
0.95, 0.83, 0.69, 0.8, 1.21, 0.74, 0.88, 1.38, 1.3, 1.39, 1.17,
1.87, 0.79, 0.95, 0.88, 0.87, 0.89, 0.94, 0.91, 0.81, 0.89, 2.98,
2.86, 3.63, 3.56, 3.24, 0.77, 0.63, 0.6, 0.7, 0.8, 0.83, 0.87,
0.93, 2.75, 2.56, 1.88, 1.97, 0.84, 0.7, 0.78, 0.9, 0.74, 0.81,
0.67, 0.77)), class = "data.frame", row.names = c(NA, -102L))
I plotted box plot using ggplot2 library:
Firstly, I wanted to see the distribution of samples and appreciate the difference between replicates.
# Box plot of Green_norm per sample with the raw points overlaid; samples are
# ordered on the x-axis by reorder() applied to the negated Green_norm.
ggplot(
  data = Data,
  mapping = aes(
    x = reorder(Samples, -Green_norm),
    y = Green_norm,
    fill = Samples
  )
) +
  geom_boxplot(alpha = 0.5) +
  geom_point(mapping = aes(colour = Samples)) +
  theme_bw() +
  rotate_x_text(angle = 45)
I obtained the plot as follows:
Secondly, I would like to group the replicates per sample type, for example Mab1 and its replicate Mab1rep, and so on.
You can use regular expression substitution to remove any occurrences of "rep" or "_rep" in the Samples column, and then use your existing plotting code. I don't have your rotate_x_text function, so instead I'm doing the equivalent via theme. I've also modified the plotting code to use a different column name, rather than overwriting Samples.
library(tidyverse)
# Strip a trailing "rep" (optionally preceded by underscores) from the sample
# name so that each replicate shares a label with its parent sample. The
# pattern is anchored at the end of the string, so at most one match exists.
data_new <- mutate(Data, Samples_grouped = sub("_*rep$", "", Samples))

# Same box plot as before, but grouped by the cleaned sample name.
ggplot(
  data = data_new,
  mapping = aes(
    x = reorder(Samples_grouped, -Green_norm),
    y = Green_norm,
    fill = Samples_grouped
  )
) +
  geom_boxplot(alpha = 0.5) +
  geom_point(mapping = aes(colour = Samples_grouped)) +
  theme_bw() +
  # Tilt the x-axis labels by 45 degrees.
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

How to fit exponential model in R and print correct y=ab^(x) equation

I'm trying to fit an exponential model to this dataset.
# Sample data: paired observations of the response y and the predictor x.
y <- c(0.04, 0.04, 0.03, 0.03, 0.04, 0.03, 0.02, 0.03, 0.03, 0.02, 0.08, 0.04, 0.04, 0.07, 0.04, 0.05, 0.12, 0.05, 0.13, 0.11, 0.11, 0.33, 0.03, 0.08)
x <- c(3.75, 4.25, 1.77, 4.24, 2.99, 3.82, 1.85, 3.17, 2.64, 2.10, 4.23, 3.81, 3.55, 3.73, 3.85, 4.31, 4.35, 3.80, 7.26, 5.91, 8.15, 8.56, 7.49, 8.12)
df <- data.frame(x, y)
# Scatter plot with a fitted smoother and two annotations (equation label and
# R^2 label), each placed via label.x / label.y.
ggplot(data = df, aes(x=x,y=y)) +
geom_point(size = 3) +
# With method = "lm", formula = y ~ exp(x) is a linear model in the
# transformed predictor exp(x), i.e. y = b0 + b1 * exp(x).
stat_smooth(method = "lm", formula = y ~ exp(x))+
# Equation label produced by stat_poly_eq() for the same formula.
stat_poly_eq(label.x=0.1, label.y=0.85,
aes(x=x,y=y,label = paste(..eq.label..)), formula = y ~ exp(x),
parse = TRUE, size = 3.5)+
# R^2 label, placed slightly below the equation label.
stat_poly_eq(label.x=0.1, label.y=0.8,
aes(x=x,y=y,label = paste(..rr.label..)), formula = y ~ exp(x),
parse = TRUE, size = 3.5)+
theme_classic()
I also need to plot it. So far I have been able to fit a proper smooth, together with (I think) a correct R². However, I can't seem to print the correct exponential function on the plot, at least not with the stat_poly_eq() function.
It only seems to be able to print the function in a linear form, although I specify the formula = y ~ exp(x) argument.
Does anyone know how I could have the right exp function on the plot?
Thank you!
Here is a solution.
define a format string, eq_fmt, to make the plot code easier to read;
use the coefficients names b_0 and b_1 like below. This will not write the equation as a*b^x, the base is the base of natural logarithms;
and set output.type = "numeric".
library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#> annotate
# Sample data: paired observations of the response y and the predictor x.
y <- c(0.04, 0.04, 0.03, 0.03, 0.04, 0.03, 0.02, 0.03, 0.03, 0.02, 0.08, 0.04, 0.04, 0.07, 0.04, 0.05, 0.12, 0.05, 0.13, 0.11, 0.11, 0.33, 0.03, 0.08)
x <- c(3.75, 4.25, 1.77, 4.24, 2.99, 3.82, 1.85, 3.17, 2.64, 2.10, 4.23, 3.81, 3.55, 3.73, 3.85, 4.31, 4.35, 3.80, 7.26, 5.91, 8.15, 8.56, 7.49, 8.12)
df <- data.frame(x, y)

# plotmath template for the fitted equation. `formula = y ~ exp(x)` is an
# ordinary linear model in the transformed predictor exp(x), i.e.
#   y = b_0 + b_1 * e^x
# so the label must show the intercept *added to* the scaled exponential, not
# b_0 * e^(b_1 * x). "%+.3g" prints the slope with an explicit sign, which
# keeps the expression parseable for negative estimates as well.
eq_fmt <- "`y`~`=`~%.3g~%+.3g~italic(e)^{`x`}"

ggplot(data = df, aes(x = x, y = y)) +
  geom_point(size = 3) +
  stat_smooth(method = "lm", formula = y ~ exp(x)) +
  # output.type = "numeric" exposes the estimates as the computed variables
  # b_0 (intercept) and b_1 (coefficient of exp(x)), so the label can be
  # assembled by hand with sprintf().
  stat_poly_eq(
    mapping = aes(
      x = x, y = y,
      label = sprintf(eq_fmt, after_stat(b_0), after_stat(b_1))
    ),
    label.x = 0.1, label.y = 0.85,
    formula = y ~ exp(x),
    output.type = "numeric",
    parse = TRUE
  ) +
  # R^2 label; after_stat() replaces the deprecated ..rr.label.. notation and
  # matches the layer above.
  stat_poly_eq(
    mapping = aes(x = x, y = y, label = after_stat(rr.label)),
    label.x = 0.1, label.y = 0.8,
    formula = y ~ exp(x),
    parse = TRUE, size = 3.5
  ) +
  theme_classic()
Created on 2022-09-22 with reprex v2.0.2

Forecasting Using Group and Regressors in Prophet

I'm trying to use prophet library to predict y using Group and Regressors. My code and the errors received are below.
In Model1:
I've received this error: Error in setup_dataframe(object, df) :
Regressor "x1" missing from dataframe
In Model2:
Model2 runs, but I can't figure out how to add the regressors x1
and x2 to it.
library(prophet)
library(dplyr)
df <- data.frame(ds = rep(c("2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04", "2020-01-05",
"2020-01-06", "2020-01-07", "2020-01-08", "2020-01-09", "2020-01-10", "2020-01-11", "2020-01-12",
"2020-01-13", "2020-01-14", "2020-01-15"), 2),
group = rep(c("A", "B"), each = 15),
y = c(8.15, 1.74, 2.97, 2.36, 0.94, 1.84, 3.17, 12.51, 0.63, 6.92, 5.51,
7.50, -2.47, 4.38, 6.28, 7.69, 2.89, 3.77, 7.27, -1.19, 4.64, 9.49, 5.43, 0.36, 14.12,
8.77, -3.05, -0.72, 10.99, 10.33),
x1 = c(3.11, 2.16, 0.91, 2.78, 0.06, 1.12, 1.73, 3.95, 1.43, 3.40, 2.37, 1.80, 0.95,
1.66, 3.06, -0.23, 3.11, 3.07, -0.39, 0.13, 4.38, 2.15, 1.61, 1.54, 5.50, 2.21,
0.89, 3.24, 4.27, 2.55),
x2 = c(2.52, -0.21, 1.03, -0.21, 0.44, 0.36 , 0.72, 4.28, -0.40, 1.76, 1.57,
2.85, -1.71, 1.36, 1.61, 3.96, -0.11 , 0.35, 3.83, -0.66, 0.13, 3.67, 1.91, -0.59, 4.31,
3.28, -1.97, -1.98, 3.36, 3.89))
df$ds <- as.Date(df$ds)
# Model 1
# Fit a prophet model with multiplicative seasonality and the extra
# regressors x1 and x2 to `df`, then predict over a data frame produced by
# make_future_dataframe() with 5 daily periods.
#
# df: data frame passed to fit.prophet(); the regressors are added under the
#     names 'x1' and 'x2'.
# Returns the result of predict(m, future).
#
# NOTE(review): the question reports `Regressor "x1" missing from dataframe`
# for this function. `future` is built by make_future_dataframe() and x1/x2
# are never added to it here, which is presumably what predict() complains
# about -- confirm against the prophet documentation.
Model1 <- function(df) {
m <- prophet(seasonality.mode = 'multiplicative')
m <- add_regressor(m, 'x1')
m <- add_regressor(m, 'x2')
m <- fit.prophet(m, df)
future <- make_future_dataframe(m, periods = 5, freq = 'day')
mod1 <- predict(m, future)
return(mod1)
}
# Run Model1 once per `group` and keep only the date, group and prediction.
mod1 <-df %>%
group_by(group) %>%
do(Model1(.)) %>%
dplyr::select(ds, group, yhat)
# Model 2
library(prophet)
library(dplyr)
library(purrr)
library(tidyr)
# Fit one default prophet model per group and forecast 5 periods ahead.
# Result: one row per group with list-columns `data` (the group's rows),
# `m` (fitted model), `future` (future data frame) and `forecast`.
Model2 <- df %>%
  # Name the nested column explicitly; the unnamed form nest(-group) is
  # deprecated in tidyr.
  nest(data = -group) %>%
  mutate(m = map(data, prophet)) %>%
  # Spell out `periods`; `period =` only worked through partial matching.
  mutate(future = map(m, make_future_dataframe, periods = 5)) %>%
  mutate(forecast = map2(m, future, predict))

Removing NAs from ggplot x-axis in ggplot2

I would like to get rid of the whole NA block (highlighted here).
I tried na.omit and na.rm = TRUE, unsuccessfully.
Here is the code I used :
library(readxl)
data <- read_excel("Documents/TFB/xlsx_geochimie/solfatara_maj.xlsx")
View(data)
data <- gather(data,FeO:`Fe2O3(T)`,key = "Element",value="Pourcentage")
library(ggplot2)
# Desired left-to-right order of the elements on the x-axis. Values of
# data$Element that are not listed in `levels` become NA in the factor.
level_order <- factor(
  data$Element,
  levels = c("SiO2", "TiO2", "Al2O3", "Fe2O3", "FeO", "MgO", "CaO", "Na2O", "K2O")
)
# Points and lines per sample (Ech) on a log10 y-axis. The ggplot() call is
# now closed after aes(); previously its closing parenthesis was missing, so
# the whole statement did not parse.
ggplot(
  data = data,
  mapping = aes(x = level_order, y = data$Pourcentage, colour = data$Ech)
) +
  geom_point() +
  geom_line(group = data$Ech) +
  scale_y_log10()
And here is my original file
https://drive.google.com/file/d/1bZi7fPWebbpodD1LFScoEcWt5Bs-cqhb/view?usp=sharing
If I run your code and look at data that goes into ggplot:
table(data$Element)
Al2O3 CaO Fe2O3 Fe2O3(T) FeO K2O LOI LOI2 MgO MnO
12 12 12 12 12 12 12 12 12 12
Na2O P2O5 SiO2 SO4 TiO2 Total Total 2 Total N Total S
12 12 12 12 12 12 12 12 12
You have included the "Total" columns (and several others) in the melted data frame, which I guess is not intended. When you then call factor() on these values, the ones that are not listed in the levels (such as the "Total ..." entries) become NA.
So we can do it from scratch:
data <- read_excel("solfatara_maj.xlsx")
The data:
structure(list(Ech = c("AGN 1A", "AGN 2A", "AGN 3B", "SOL 4B",
"SOL 8Ag", "SOL 8Ab", "SOL 16A", "SOL 16B", "SOL 16C", "SOL 22 A",
"SOL 22D", "SOL 25B"), FeO = c(0.2, 0.8, 1.7, 0.3, 1.7, NA, 0.2,
NA, 0.1, 0.7, 1.3, 2), `Total S` = c(5.96, 45.3, 0.22, 17.3,
NA, NA, NA, NA, NA, NA, 2.37, 0.36), SO4 = c(NA, 6.72, NA, 4.08,
0.06, 0.16, 42.2, 35.2, 37.8, 0.32, 6.57, NA), `Total N` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 15.2, NA, NA), SiO2 = c(50.2,
31.05, 56.47, 62.14, 61.36, 75.66, 8.41, 21.74, 17.44, 13.52,
19.62, 56.35), Al2O3 = c(15.53, 7.7, 17.56, 4.44, 17.75, 10.92,
31.92, 26.38, 27.66, 0.64, 3.85, 17.28), Fe2O3 = c(0.49, 0.63,
2.06, NA, 1.76, 0.11, 0.64, 0.88, 1.71, NA, 1.32, 2.67), MnO = c(0.01,
0.01, 0.13, 0.01, 0.09, 0.01, 0.01, 0.01, 0.01, 0.005, 0.04,
0.12), MgO = c(0.06, 0.07, 0.88, 0.03, 0.97, 0.05, 0.04, 0.07,
0.03, 0.02, 1.85, 1.63), CaO = c(0.2, 0.09, 3.34, 0.09, 2.58,
0.57, 0.2, 0.26, 0.15, 0.06, 35.66, 4.79), Na2O = c(0.15, 0.14,
3.23, 0.13, 3.18, 2.04, 0.68, 0.68, 0.55, 0.05, 0.45, 3.11),
K2O = c(4.39, 1.98, 8, 1.26, 8.59, 5.94, 8.2, 6.97, 8.04,
0.2, 0.89, 7.65), TiO2 = c(0.42, 0.27, 0.46, 0.79, 0.55,
0.16, 0.09, 0.22, 0.16, 0.222, 0.34, 0.53), P2O5 = c(0.11,
0.09, 0.18, 0.08, 0.07, 0.07, 0.85, 0.68, 0.62, NA, 0.14,
0.28), LOI = c(27.77, 57.06, 6.13, 29.03, 1.38, 4.92, 42.58,
37.58, 38.76, NA, 26.99, 3.92), LOI2 = c(27.79, 57.15, 6.32,
29.06, 1.57, 4.93, 42.6, 37.59, 38.77, 0.08, 27.13, 4.15),
Total = c(99.52, 99.88, 100.2, 98.25, 99.99, 100.5, 93.81,
95.57, 95.23, 15.25, 92.45, 100.3), `Total 2` = c(99.54,
99.96, 100.3, 98.28, 100.2, 100.6, 93.83, 95.58, 95.24, 15.33,
92.59, 100.6), `Fe2O3(T)` = c(0.71, 1.52, 3.95, 0.27, 3.65,
0.22, 0.87, 0.99, 1.82, 0.61, 2.76, 4.9)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
First we set the plotting level like you did:
plotlvls = c("SiO2","TiO2","Al2O3","Fe2O3","FeO","MgO","CaO","Na2O","K2O")
Then we select only these columns, and also Ech, note I use pivot_longer() because gather() will supposedly be deprecated, and then we do the factoring too:
# Long-format plotting data: one row per (Ech, Element) pair, restricted to
# the elements listed in `plotlvls`.
plotdf <- data %>%
  # all_of() makes it explicit that `plotlvls` is an external character
  # vector of column names rather than a column of `data`.
  select(all_of(c(plotlvls, "Ech"))) %>%
  pivot_longer(-Ech, names_to = "Element", values_to = "Pourcentage") %>%
  # Fix the axis order with the same vector used for the selection (the
  # original referenced an undefined `toplot`).
  mutate(Element = factor(Element, levels = plotlvls))
Finally we plot, and there are no NAs:
# One coloured point-and-line series per sample (Ech), elements on the x-axis
# in factor-level order, percentages on a log10 y-axis.
ggplot(plotdf, aes(Element, Pourcentage, colour = Ech)) +
  geom_point() +
  geom_line(mapping = aes(group = Ech)) +
  scale_y_log10()
1.Create reproducible minimal data
# Minimal example: nine named elements plus one missing value, a running
# percentage 1..10, and two sample names alternating down the rows.
data <- data.frame(
  Element = c(
    "SiO2", "TiO2", "Al2O3", "Fe2O3", "FeO", "MgO", "CaO", "Na2O", "K2O",
    NA
  ),
  Pourcentage = seq_len(10L),
  # Written out explicitly instead of relying on data.frame() recycling.
  Ech = rep(c("AGN 1A", "SOL 16"), times = 5L)
)
2.Set factor levels for variable 'Element'
data$Element <- factor(data$Element,levels = c("SiO2","TiO2","Al2O3","Fe2O3","FeO","MgO","CaO","Na2O","K2O"))
3.Remove rows containing NA in the variable 'Element'
data <- data[!is.na(data$Element), ]
4.Plot data using ggplot2 (ggplot2 syntax uses NSE (non-standard evaluation), which means you don't have to pass the variable names as strings or use the $ notation):
# Points joined by one line per sample (Ech) on a log10 y-axis.
ggplot(data, aes(Element, Pourcentage, colour = Ech)) +
  geom_point() +
  geom_line(mapping = aes(group = Ech)) +
  scale_y_log10()

How to calculate the average of a comma separated string of numbers in R

I have following file :
file 1
structure(list(Total_Gene_Symbol = c("5S_rRNA", "7SK", "A1BG-AS1"
), Test = c("1.02, 1.12, 1.11, 1.18, 1.12, 1.19, 1.25, 1.24, 1.24, 1.02",
"1.97, 2.27, 2.14, 1.15", "1.3, 1.01, 1.36, 1.42, 1.38, 1.01, 1.31, 1.34,
1.29, 1.34, 2.02, 1.12, 1.01, 1.31, 1.22"
)), .Names = c("Total_Gene_Symbol", "Test"), row.names = c(NA,
3L), class = "data.frame")
In file 1, the Test column holds numbers separated by ",".
I tried
# Split each comma-separated string into one matrix row (shorter rows are
# padded), convert the cells to numbers and average across each row.
mat <- stri_split_fixed(Down_FC, ",", simplify = TRUE)
# Convert to numeric in place; storage.mode<- keeps the matrix dimensions.
storage.mode(mat) <- "double"
# TRUE instead of the reassignable shorthand T, and left-assignment.
M <- rowMeans(mat, na.rm = TRUE)
View(M)
but the above code is averaging entire data.
I want output same like below file 2
file 2
structure(list(Total_Gene_Symbol = c("5S_rRNA", "7SK", "A1BG-AS1"
), Test = c("1.02, 1.12, 1.11, 1.18, 1.12, 1.19, 1.25, 1.24, 1.24, 1.02",
"1.97, 2.27, 2.14, 1.15", "1.3, 1.01, 1.36, 1.42, 1.38, 1.01, 1.31, 1.34,
1.29, 1.34, 2.02, 1.12, 1.01, 1.31, 1.22"
), Average = c(11.49, 7.53, 19.44)), .Names = c("Total_Gene_Symbol",
"Test", "Average"), row.names = c(NA, 3L), class = "data.frame")
What you want is the sum, not the average! ("Average" refers to a measure of central tendency such as the mean, median or mode.)
library(magrittr)
# Row-wise sum of the comma-separated numbers in df1$Test.
# Base R only: the original called stringr's str_split() with only magrittr
# loaded. The pattern ",\\s*" also splits on commas that have no whitespace
# after them, and vapply() guarantees a numeric result with one value per row.
df1$total_sum <- vapply(
  strsplit(df1$Test, ",\\s*"),
  function(values) sum(as.numeric(values), na.rm = TRUE),
  numeric(1)
)
Using apply
# Row-wise sum of the comma-separated numbers in d1$Test.
# Works on the Test column directly instead of apply() over the whole data
# frame, which would first coerce every column into a character matrix.
d1$sum <- vapply(
  strsplit(as.character(d1$Test), ",", fixed = TRUE),
  function(values) sum(as.numeric(values), na.rm = TRUE),
  numeric(1)
)
You can use scan :
# Row-wise sum and mean of the comma-separated numbers in df$Test.
# scan(text = ...) parses each string into a numeric vector; quiet = TRUE
# suppresses the "Read N items" message it would otherwise print per row.
# vapply() fixes the result type, and USE.NAMES = FALSE keeps the (long)
# input strings from being attached as element names.
df$sum <- vapply(
  df$Test,
  function(x) {
    sum(scan(text = x, what = numeric(), sep = ",", quiet = TRUE), na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)
df$average <- vapply(
  df$Test,
  function(x) {
    mean(scan(text = x, what = numeric(), sep = ",", quiet = TRUE), na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)
# Total_Gene_Symbol Test sum average
# 1 5S_rRNA 1.02, 1.12, 1.11, 1.18, 1.12, 1.19, 1.25, 1.24, 1.24, 1.02 11.49 1.1490
# 2 7SK 1.97, 2.27, 2.14, 1.15 7.53 1.8825
# 3 A1BG-AS1 1.3, 1.01, 1.36, 1.42, 1.38, 1.01, 1.31, 1.34, \n 1.29, 1.34, 2.02, 1.12, 1.01, 1.31, 1.22 19.44 1.2960

Resources