How to add environmental variable to DCA plot using ggplot2 - r

I would like to plot an environmental variable on a ggplot2 version of a DCA plot.
I have some code where I extract species and data scores from vegan and then plot them up in ggplot2. I am having trouble trying to work out how I can get my environmental variable SWLI to plot as an arrow - something like this RDA's plots with ggvegan: How can I change text position for arrows text? (or see PCA example here https://www.rpubs.com/an-bui/vegan-cheat-sheet)
Can anybody help?
#DCA Plot
library(plyr)
library(vegan)
library(ggplot2)
library(cluster)
library(ggfortify)
library(factoextra)
#read in csv and remove variables you don't want to go through analysis
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
swli<-read_csv("DCAenv.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
regforamcountsall$Sample = NULL
regforamcountsall$Site=NULL
regforamcountsall$SWLI=NULL
#check csv
regforamcountsall
#run ordination
ord<-decorana(regforamcountsall)
#get species scores
summary(ord)
#get DCA values of environmental variable
ord.fit <- envfit(ord ~ SWLI, data=swli, perm=999)
ord.fit
plot(ord, dis="site")
plot(ord.fit)
#use this summary code to get species scores for DCA1 and DCA2
#put species scores values in from ord plot summary stats
species.scores<-read.csv("speciescores.csv")
species.scores$species <- row.names(species.scores)
#Using the scores function from vegan to extract the sample scores and convert to a data.frame
data.scores <- as.data.frame(scores(ord))
# create a column of groupings/clusters, from the rownames of data.scores
data.scores$endgroup <- as.factor(pam(regforamcountsall, 3)$clustering)
#getting the convex hull of each unique point set
find_hull <- function(df) df[chull(data.scores$DCA1, data.scores$DCA2), ]
hulls <- NULL
for(i in 1:length(unique(data.scores$endgroup))){
endgroup_coords <- data.scores[data.scores$endgroup == i,]
hull_coords <- data.frame(
endgroup_coords[chull(endgroup_coords[endgroup_coords$endgroup == i,]$DCA1,
endgroup_coords[endgroup_coords$endgroup == i,]$DCA2),])
hulls <- rbind(hulls,hull_coords)
}
data.scores$numbers <- 1:length(data.scores$endgroup)
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
data.scores$Site<-regforamcountsall$Site
data.scores$SWLI<-regforamcountsall$SWLI
data.scores
#DCA with species
data.scores$Site <- as.character(data.scores$Site)
library(scico)
dca <- ggplot() +
# add the point markers
geom_point(data=data.scores,aes(x=DCA1,y=DCA2,colour=SWLI,pch=Site),size=4) + geom_point(data=species.scores,aes(x=DCA1,y=DCA2),size=3,pch=3,alpha=0.8,colour="grey22") +
# add the hulls and labels - numbers position labels
geom_polygon(data = hulls,aes(x=DCA1,y=DCA2,fill=endgroup), alpha = 0.25) +
#geom_text(data=data.scores,aes(x=DCA1-0.03,y=DCA2,colour=endgroup, label = numbers))+
geom_text(data=species.scores,aes(x=DCA1+0.1,y=DCA2+0.1, label = species))+
#look this up
geom_segment(data=ord.fit,aes(x = 0, y = 0, xend=DCA1,yend=DCA2), arrow = arrow(length = unit(0.3, "cm")))+
theme_classic()+
scale_color_scico(palette = "lapaz")+
coord_fixed()
dca
#regforamcountsall data
structure(list(Sample = c("T3LB7.008", "T3LB7.18", "T3LB7.303",
"WAP 0 ST-2", "T3LB7.5", "LG120"), T.salsa = c(86.63793102, 68.5897436,
70.39274924, 5.199999999, 79.15057916, 44.40000001), H.wilberti = c(0,
0, 0, 0, 0.386100386, 9.399999998), Textularia = c(0, 0, 0, 0,
0, 0.4), T.irregularis = c(2.155172414, 10.25641026, 7.854984897,
0, 2.702702703, 0), P.ipohalina = c(0, 0, 0, 0, 0, 0), J.macrescens = c(4.741379311,
5.769230769, 4.833836859, 5.800000001, 8.108108107, 5.400000001
), T.inflata = c(6.465517244, 15.38461538, 16.918429, 83.2, 5.791505794,
40.4), S.lobata = c(0, 0, 0, 2.300000001, 0, 0), M.fusca = c(0,
0, 0, 3.499999999, 3.861003862, 0), A.agglutinans = c(0, 0, 0,
0, 0, 0), A.exiguus = c(0, 0, 0, 0, 0, 0), A.subcatenulatus = c(0,
0, 0, 0, 0, 0), P.hyperhalina = c(0, 0, 0, 0, 0, 0), SWLI = c(200,
197.799175, 194.497937, 192.034776, 191.746905, 190.397351),
Site = c("LSP", "LSP", "LSP", "WAP", "LSP", "LG")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#data.scores
structure(list(DCA1 = c(-1.88587476921648, -1.58550534382589,
-1.59816311314591, -0.0851161831632892, -1.69080448670088, -1.14488987340879
), DCA2 = c(0.320139736602921, 0.226662031865046, 0.230912045301637,
-0.0531232712001122, 0.272143119753744, 0.0696939776869396),
DCA3 = c(-0.755595015095353, -0.721144380683279, -0.675071834919103,
0.402339366526422, -0.731006052784081, 0.00474996849420783
), DCA4 = c(-1.10780013276303, -0.924265835490466, -0.957711953532202,
-0.434438970032073, -0.957873836258657, -0.508347000558056
), endgroup = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), numbers = 1:6, Site = c("LSP",
"LSP", "LSP", "WAP", "LSP", "LG"), SWLI = c(200, 197.799175,
194.497937, 192.034776, 191.746905, 190.397351)), row.names = c(NA,
6L), class = "data.frame")
#species.scores
structure(list(species = c("1", "2", "3", "4", "5", "6"), DCA1 = c(-2.13,
-1.6996, -2.0172, -0.9689, 1.0372, -0.3224), DCA2 = c(0.342,
-0.8114, 0.3467, -0.3454, 2.0007, 0.9147)), row.names = c(NA,
6L), class = "data.frame")

Related

plot pearson correlation for the two datasets

I want to plot a correlation plot between two datasets. y axis is frequency of genes and x axis is correlation value or R2 value.
gvds_counts = read.delim("GVDS_normalized_counts_new.txt", header = TRUE, sep="\t",row.names = 1)
gvds_elastic = (read.delim("GVDS_PrediXcan_Test_2021_new.txt", header = TRUE, sep="\t",row.names = 1))
gvds_elastic= gvds_elastic[,-1]
gvds_elastic1=t(gvds_elastic)
This is how my two datasets look like:
dput(gvds_elastic1[1:5,1:5])
structure(c(0.117057915232398, 0, 0.16739607781207, -0.00582814885591799,
-0.00522232324665859, 0.196331738115648, 0, -0.00712521269403638,
0, -0.00522232324665859, 0.196331738115648, -0.021754984214482,
-0.00356260634701819, -0.0210845683319449, -0.00522232324665859,
0.177439691941073, -0.021754984214482, 0, -0.0210845683319449,
-0.00522232324665859, 0.158547645766499, -0.0435099684289639,
0, -0.00582814885591799, -0.00522232324665859), .Dim = c(5L,
5L), .Dimnames = list(c("ENSG00000107959.15", "ENSG00000057608.16",
"ENSG00000281186.1", "ENSG00000151632.17", "ENSG00000134452.19"
), c("GTEX-1117F", "GTEX-111CU", "GTEX-111FC", "GTEX-111VG",
"GTEX-111YS")))
dput(gvds_counts[1:5,1:5])
structure(list(GTEX.1117F.3226.SM.5N9CT = c(1.07050611836238,
319.010823271988, 0, 0, 0), GTEX.111FC.3126.SM.5GZZ2 = c(0, 137.627503211918,
0.81921132864237, 1.63842265728474, 0), GTEX.1128S.2726.SM.5H12C = c(0.93125973749023,
98.7135321739644, 0, 0.93125973749023, 0.93125973749023), GTEX.117XS.3026.SM.5N9CA = c(0,
140.966661860149, 0, 0.766123162283421, 0.766123162283421), GTEX.1192X.3126.SM.5N9BY = c(0.937426181007344,
139.676500970094, 0, 0.937426181007344, 0)), row.names = c("ENSG00000223972.5",
"ENSG00000227232.5", "ENSG00000278267.1", "ENSG00000243485.5",
"ENSG00000237613.2"), class = "data.frame")
I want to first rename the ID in gvds_counts similar to gvds_elastic1 which is to edit it from
GTEX.1128S.2726.SM.5H12C to GTEX-1128S
and then plot a correlation between these two datasets with x axis as R2 value and y axis as frequency of genes.

Error while creating workflow from a recipie using linear models in R

I am training a linear regression model predicting salary from company size (company_size_number) and country (country) using the StackOverflow data.
What I perform is:
Read the data. Split the data into a training set (75%) and a test set (25%).
Create a recipe that converts company_size_number into a factor variable and then transforms the two predictors into dummy variables.
Create the model specification.
Create a workflow object and add the recipe and model specification to it, then fit the model on the training set.
Calculate R² on the test set.
This is my code
library(tidyverse)
library(tidymodels)
so <- read_rds("stackoverflow.rds")
set.seed(123)
init_split <- initial_split(so)
so_training <- training(init_split)
so_testing <- testing(init_split)
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_num2factor(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
But not able to proceed due to an error:
Error: Please provide a character vector of appropriate length for `levels`.
I presume I am messing up something here in the spec_*()
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_novel(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
But not sure if this correct. Any inputs would be helpful.
> dput(head(so))
structure(list(country = structure(c(5L, 5L, 4L, 4L, 5L, 5L), .Label = c("Canada",
"Germany", "India", "United Kingdom", "United States"), class = "factor"),
salary = c(63750, 93000, 40625, 45000, 1e+05, 170000), years_coded_job = c(4L,
9L, 8L, 3L, 8L, 12L), open_source = c(0, 1, 1, 1, 0, 1),
hobby = c(1, 1, 1, 0, 1, 1), company_size_number = c(20,
1000, 10000, 1, 10, 100), remote = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Remote", "Not remote"), class = "factor"),
career_satisfaction = c(8L, 8L, 5L, 10L, 8L, 10L), data_scientist = c(0,
0, 1, 0, 0, 0), database_administrator = c(1, 0, 1, 0, 0,
0), desktop_applications_developer = c(1, 0, 1, 0, 0, 0),
developer_with_stats_math_background = c(0, 0, 0, 0, 0, 0
), dev_ops = c(0, 0, 0, 0, 0, 1), embedded_developer = c(0,
0, 0, 0, 0, 0), graphic_designer = c(0, 0, 0, 0, 0, 0), graphics_programming = c(0,
0, 0, 0, 0, 0), machine_learning_specialist = c(0, 0, 0,
0, 0, 0), mobile_developer = c(0, 1, 0, 0, 1, 0), quality_assurance_engineer = c(0,
0, 0, 0, 0, 0), systems_administrator = c(1, 0, 1, 0, 0,
1), web_developer = c(0, 0, 0, 1, 1, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I have a couple of recommendations for adjustments in what you are doing.
The first is to do the selecting of variables before splitting, so that when you use a formula like salary ~ ., you and/or the functions don't get confused about what is there.
The second is to not use step_num2factor() in the way you have; it would take a lot to get it to work correctly and I think you're better served converting it to a factor before you split. Take a look at this step's documentation to see a more appropriate use for this recipe step, and notice that you have to give it levels. This is the reason you saw the error you did, but honestly I wouldn't try to find the right levels and input them there; I'd do it before splitting.
library(tidyverse)
library(tidymodels)
data("stackoverflow", package = "modeldata")
so <- janitor::clean_names(stackoverflow)
set.seed(123)
init_split <- so %>%
select(salary, company_size_number, country) %>%
mutate(company_size_number = factor(company_size_number)) %>%
initial_split()
so_training <- training(init_split)
so_testing <- testing(init_split)
rec <- recipe(salary ~ ., data = so_training) %>%
step_dummy(country, company_size_number)
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 27822.
Created on 2021-05-25 by the reprex package (v2.0.0)

Create faceted xy scatters using vectors of column names in R

I have two character vectors of equal length; where position one in vector.x matches position one in vector.y and so on. The elements refer to column names in a data frame (wide format). I would like to somehow loop through these vectors to produce xy scatter graphs for each pair in the vector, preferably in a faceted plot. Here is a (hopefully) reproducible example. To be clear, with this example, I would end up with 10 scatter graphs.
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
structure(list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176,
0.0108639420589757), Marinobacter = c(0, 0.00219023779724656,
0, 0.00137867647058824, 0.00310398344542162), Neptuniibacter = c(0.00945829750644884,
0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393
), Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856
), Pseudomonas = c(0.00466773123694878, 0.00782227784730914,
0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856
), Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529,
0.147697878944646), unclassified_GpIIa = c(0, 0.000730079265748853,
0, 0.003125, 0.00103466114847387), unclassified_Porticoccus = c(0,
0, 0, 0.00119485294117647, 0.00569063631660631), Aplanochytrium = c(0,
0, 0, 0.000700770847932726, 0.0315839846865529), Bathycoccus = c(0.000388802488335925,
0, 0, 0.0227750525578136, 0.00526399744775881), Brockmanniella = c(0,
0.00383141762452107, 0, 0.000875963559915907, 0), Caecitellus_paraparvulus = c(0,
0, 0, 0.000875963559915907, 0.00797575370872547)), row.names = c("B11",
"B13", "B22", "DI5", "FF6"), class = "data.frame")
As Rui Barradas shows, it's possible to get a very nice plot from ggplot and gridExta. If you wanted to stick to base R, here's how you'd do that (assuming your data set is called df1):
# set plot sizes
par(mfcol = c(floor(sqrt(length(vector.x))), ceiling(sqrt(length(vector.x)))))
# loop through plots
for (i in 1:length(vector.x)) {
plot(df1[[vector.x[i]]], df1[[vector.y[i]]], xlab = vector.x[i], ylab = vector.y[i])
}
# reset plot size
par(mfcol = c(1,1))
This is a bit long and convoluted but it works.
library(tidyverse)
library(gridExtra)
df_list <- apply(data.frame(vector.x, vector.y), 1, function(x){
DF <- df1[which(names(df1) %in% x)]
i <- which(names(DF) %in% vector.x)
if(i == 2) DF[2:1] else DF
})
gg_list <- lapply(df_list, function(DF){
ggplot(DF, aes(x = get(names(DF)[1]), y = get(names(DF)[2]))) +
geom_point() +
xlab(label = names(DF)[1]) +
ylab(label = names(DF)[2])
})
g <- do.call(grid.arrange, gg_list)
g
Not too elegant, but should get you going:
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
df1 = structure(
list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176, 0.0108639420589757),
Marinobacter = c(0, 0.00219023779724656, 0, 0.00137867647058824, 0.00310398344542162),
Neptuniibacter = c(0.00945829750644884, 0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393),
Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856),
Pseudomonas = c(0.00466773123694878, 0.00782227784730914, 0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856),
Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529, 0.147697878944646),
unclassified_GpIIa = c(0, 0.000730079265748853, 0, 0.003125, 0.00103466114847387),
unclassified_Porticoccus = c(0, 0, 0, 0.00119485294117647, 0.00569063631660631),
Aplanochytrium = c(0, 0, 0, 0.000700770847932726, 0.0315839846865529),
Bathycoccus = c(0.000388802488335925, 0, 0, 0.0227750525578136, 0.00526399744775881),
Brockmanniella = c(0, 0.00383141762452107, 0, 0.000875963559915907, 0),
Caecitellus_paraparvulus = c(0, 0, 0, 0.000875963559915907, 0.00797575370872547)),
row.names = c("B11", "B13", "B22", "DI5", "FF6"),
class = "data.frame"
)
df2 = NULL
for(i in 1:10) {
df.tmp = data.frame(
plot = paste0(vector.x[i], ":", vector.y[i]),
x = df1[[vector.x[i]]],
y = df1[[vector.y[i]]]
)
if(is.null(df2)) df2=df.tmp else df2 = rbind(df2, df.tmp)
}
ggplot(data=df2, aes(x, y)) +
geom_point() +
facet_grid(cols = vars(plot))

How to properly index list items to return rows, not columns, inside a for loop

I'm trying to write a for loop within another for loop. The first loop grabs the ith vcov matrix from a list of variously sized matrices (vcmats below) and grabs a frame of 24 predictor models of appropriate dimension to multiply with the current vcov matrix from a list of frames (jacobians below) for the different models. The second loop should pull the jth record (row) from the selected predictor frame, correctly format it, then run the calculation with the vcov matrix and output an indicator variable and calculated result needed for post processing to the holding table (holdtab).
When I run the code below I get the following error: Error in jjacob[, 1:4] : incorrect number of dimensions because R is returning the column of 1s (i.e. the intercept column of jacobs), not the complete first record (i.e. jjacob = jacobs[1,]). I've substantially simplified the example but left enough complexity to demonstrate the problem. I would appreciate any help in resolving this issue.
vcmats <- list(structure(c(0.67553, -0.1932, -0.00878, -0.00295, -0.00262,
-0.00637, -0.1932, 0.19988, 0.00331, -0.00159, 0.00149, 2e-05,
-0.00878, 0.00331, 0.00047, -6e-05, 3e-05, 3e-05, -0.00295, -0.00159,
-6e-05, 0.00013, -2e-05, 6e-05, -0.00262, 0.00149, 3e-05, -2e-05,
2e-05, 0, -0.00637, 2e-05, 3e-05, 6e-05, 0, 0.00026), .Dim = c(6L,
6L)), structure(c(0.38399, -0.03572, -0.00543, -0.00453, -0.00634,
-0.03572, 0.10912, 0.00118, -0.00044, 0.00016, -0.00543, 0.00118,
0.00042, -3e-05, 4e-05, -0.00453, -0.00044, -3e-05, 0.00011,
5e-05, -0.00634, 0.00016, 4e-05, 5e-05, 0.00025), .Dim = c(5L,
5L)))
jacobians <- list(structure(list(intcpt = c(1, 1, 1, 1), species = c(1, 1,
0, 0), nage = c(6, 6, 6, 6), T = c(12, 50, 12, 50), hgt = c(90,
90, 90, 90), moon = c(7, 7, 7, 7), hXm = c(0, 0, 0, 0), covr = c(0,
0, 0, 0), het = c(0, 0, 0, 0)), .Names = c("intcpt", "species",
"nage", "T", "hgt", "moon", "hXm", "covr", "het"), row.names = c("1",
"1.4", "1.12", "1.16"), class = "data.frame"), structure(list(
intcpt = c(1, 1, 1, 1), species = c(1, 1, 0, 0), nage = c(6,
6, 6, 6), T = c(12, 50, 12, 50), hgt = c(0, 0, 0, 0), moon = c(7,
7, 7, 7), hXm = c(0, 0, 0, 0), covr = c(0, 0, 0, 0), het = c(0,
0, 0, 0)), .Names = c("intcpt", "species", "nage", "T", "hgt",
"moon", "hXm", "covr", "het"), row.names = c("2", "2.4", "2.12",
"2.16"), class = "data.frame"))
holdtab <- structure(list(model = structure(c(4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L), .Label = c("M.1.BaseCov", "M.2.Height", "M.5.Height.X.LastNewMoon",
"M.6.Height.plus.LastNew", "M.7.LastNewMoon", "M.G.Global"), class = "factor"),
aicc = c(341.317, 341.317, 341.317, 341.317, 342.1412, 342.1412,
342.1412, 342.1412), species = c(NA, NA, NA, NA, NA, NA,
NA, NA), condVar = c(NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("model",
"aicc", "species", "condVar"), row.names = c(1L, 2L, 3L, 4L,
25L, 26L, 27L, 28L), class = "data.frame")
jloop <- 1
for (imat in vcmats) { # Call the outside loop of vcov matrices
jacobs = jacobians[[jloop]] # Set tempvar jacobs as the jth member of the jacobians frame (n/24)
for (jjacob in jacobs) { # Call inside loop of lines in jacob (each individual set of predictor levels)
# I need to reduce the vector length to match my vcov matrix so
pt1 = jjacob[,1:4] # Separate Core columns from variable columns (because I don't want to drop species when ==0)
pt2 = jjacob[,5:9] # Pull out variable columns for next step
pt2 = pt2[,!apply(pt2 == 0, 2, all)] # Drop any variable columns that ==0
jjacob = cbind(pt1, pt2) # Reconstruct the record now of correct dimensions for the relevant vcov matrix
jjacob = as.matrix(jjacob) # Explicitly convert jjmod - I was having trouble with this previously
tj = (t(jjacob)) # Transpose the vector
condvar = jjacob %*% imat %*% tj # run the calculation
condVarTab[record,3] = jjacob[2] # Write species 0 or 1 to the output table
condVarTab[record,4] = condvar # Write the conditional variance to the table
record = record+1 # Iterate the record number for the next output run
}
jloop = jloop+1 # Once all 24 models in a frame are calculated iterate to the next frame of models which will be associated with a new vcv matrix
}

Create new column with percentages in data frame

I have the following dataframe:
dput(df1)
structure(list(month = c(1, 1, 2, 2, 3, 4), transaction_type = c("AAA",
"BBB", "BBB", "CCC",
"DDD", "AAA"), max_wt_per_month = c(54.9,
51.6833333333333, 52.3333333333333, 49.4666666666667, 49.85,
48.5833333333333), min_wt_per_month = c(0, 0, 0, 0, 0, 0), avg_wt_per_month = c(8.41701333107861,
7.65211141060198, 6.44184012508551, 7.74798927613941, 7.4360566888844,
7.50611319574734), prop = c(Inf, Inf, Inf, Inf, Inf, Inf)), .Names = c("month",
"transaction_type", "max_wt_per_month", "min_wt_per_month", "avg_wt_per_month",
"prop"), row.names = c(NA, -6L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = list(month), drop = TRUE, indices = list(
0:5), group_sizes = 6L, biggest_group_size = 6L, labels = structure(list(
month = 1), row.names = c(NA, -1L), class = "data.frame", vars = list(
month), drop = TRUE, .Names = "month"))
I want to create column prop that would contain the percentage of maximum waiting time with respect to each month. If I run this code, then I get Inf values in most of the rows... (especially it is evident in the real dataset):
my_fun=function(vec){
100*as.numeric(vec[3]) /
sum(with(data_merged_transactions, ifelse(month == vec[1], max_wt_per_month, 0))) }
data_merged_transactions$prop=apply(data_merged_transactions , 1 , my_fun)
I then finally need to create the filled area chart so that each area would be a percentage out of 100%:
ggplot(data_merged_transactions, aes(x=month, y=prop, fill=transaction_type)) +
geom_area(alpha=0.6 , size=1, colour="black")
Why do I get Inf if the sum is not equal to 0?
Moreover, is it possible to create filled area chart with months being factors (Jan, Feb,etc.), not numbers? I tried to substitute month id's by month names, but then I got very thin bars instead of a filled area.
Is this what you were looking for?
library(tidyverse)
df1_tidy <- df1 %>%
group_by(month) %>%
summarise(SUM = sum(max_wt_per_month)) %>%
full_join(df1) %>%
mutate(prop = max_wt_per_month / SUM)
ggplot(data = df1_tidy,
aes(x = month,
y = prop,
fill = transaction_type)) +
geom_area(alpha = 0.6,
size = 1,
colour = "black") +
scale_x_continuous(labels = c("Jan", "Feb", "Mar", "Apr"))

Resources