How to extract features with different percentile from the density plot? - r

I am trying to extract features-compounds lying in different percentile (25th, 50th, 75th) from the density plot. Then save these features in the new data.frame. I will then use these new features and map with the original data.frame. Identification of these features would help in further analysis and in-depth exploration. I have provided example data and density/boxplot (screenshot below).
dput(Delta)
structure(list(`PC1-PC2` = c(0.0161933528045602, 0.766612235998576,
-0.237724873642335, -0.0733015604900428, 0.400545815637124, 0.414481719044214,
0.208303811501068, 0.392408339922047, 0.336514581021898, -0.320322998122561,
0.36615463065484, -0.263557666645363, 0.180272570114807, 0.255255831254277,
0.0138502697450574, 0.23798933387042, -0.296936870921566, 0.206190306805568,
0.141038353337885, 0.167942308239497, 0.147174778368622, -0.0111611567646942,
-0.141468109519736, 0.11179112137823, 0.114216799808335, 0.0185917572079534,
0.0147028493400293), Gene_Symbols = structure(c(15L, 13L, 21L,
9L, 2L, 7L, 1L, 19L, 14L, 5L, 17L, 24L, 18L, 8L, 27L, 20L, 12L,
26L, 4L, 23L, 3L, 6L, 16L, 22L, 11L, 25L, 10L), .Label = c("Feature_1_Compound_2",
"Feature_1_Compound_3", "Feature_10_Compound_1", "Feature_10_Compound_2",
"Feature_10_Compound_3", "Feature_2_Compound_2", "Feature_2_Compound_3",
"Feature_3_Compound_1", "Feature_3_Compound_2", "Feature_4_Compound_1",
"Feature_4_Compound_2", "Feature_4_Compound_3", "Feature_5_Compound_1",
"Feature_5_Compound_2", "Feature_5_Compound_3", "Feature_6_Compound_1",
"Feature_6_Compound_2", "Feature_6_Compound_3", "Feature_7_Compound_1",
"Feature_7_Compound_2", "Feature_7_Compound_3", "Feature_8_Compound_1",
"Feature_8_Compound_2", "Feature_8_Compound_3", "Feature_9_Compound_1",
"Feature_9_Compound_2", "Feature_9_Compound_3"), class = "factor")), row.names = c("Feature_5_Compound_3",
"Feature_5_Compound_1", "Feature_7_Compound_3", "Feature_3_Compound_2",
"Feature_1_Compound_3", "Feature_2_Compound_3", "Feature_1_Compound_2",
"Feature_7_Compound_1", "Feature_5_Compound_2", "Feature_10_Compound_3",
"Feature_6_Compound_2", "Feature_8_Compound_3", "Feature_6_Compound_3",
"Feature_3_Compound_1", "Feature_9_Compound_3", "Feature_7_Compound_2",
"Feature_4_Compound_3", "Feature_9_Compound_2", "Feature_10_Compound_2",
"Feature_8_Compound_2", "Feature_10_Compound_1", "Feature_2_Compound_2",
"Feature_6_Compound_1", "Feature_8_Compound_1", "Feature_4_Compound_2",
"Feature_9_Compound_1", "Feature_4_Compound_1"), class = "data.frame")
#> PC1-PC2 Gene_Symbols
#> Feature_5_Compound_3 0.01619335 Feature_5_Compound_3
#> Feature_5_Compound_1 0.76661224 Feature_5_Compound_1
#> Feature_7_Compound_3 -0.23772487 Feature_7_Compound_3
#> Feature_3_Compound_2 -0.07330156 Feature_3_Compound_2
#> Feature_1_Compound_3 0.40054582 Feature_1_Compound_3
#> Feature_2_Compound_3 0.41448172 Feature_2_Compound_3
#> Feature_1_Compound_2 0.20830381 Feature_1_Compound_2
#> Feature_7_Compound_1 0.39240834 Feature_7_Compound_1
#> Feature_5_Compound_2 0.33651458 Feature_5_Compound_2
#> Feature_10_Compound_3 -0.32032300 Feature_10_Compound_3
#> Feature_6_Compound_2 0.36615463 Feature_6_Compound_2
#> Feature_8_Compound_3 -0.26355767 Feature_8_Compound_3
#> Feature_6_Compound_3 0.18027257 Feature_6_Compound_3
#> Feature_3_Compound_1 0.25525583 Feature_3_Compound_1
#> Feature_9_Compound_3 0.01385027 Feature_9_Compound_3
#> Feature_7_Compound_2 0.23798933 Feature_7_Compound_2
#> Feature_4_Compound_3 -0.29693687 Feature_4_Compound_3
#> Feature_9_Compound_2 0.20619031 Feature_9_Compound_2
#> Feature_10_Compound_2 0.14103835 Feature_10_Compound_2
#> Feature_8_Compound_2 0.16794231 Feature_8_Compound_2
#> Feature_10_Compound_1 0.14717478 Feature_10_Compound_1
#> Feature_2_Compound_2 -0.01116116 Feature_2_Compound_2
#> Feature_6_Compound_1 -0.14146811 Feature_6_Compound_1
#> Feature_8_Compound_1 0.11179112 Feature_8_Compound_1
#> Feature_4_Compound_2 0.11421680 Feature_4_Compound_2
#> Feature_9_Compound_1 0.01859176 Feature_9_Compound_1
#> Feature_4_Compound_1 0.01470285 Feature_4_Compound_1
# Density distribution
plt2 <- ggdensity(Delta, x = "PC1-PC2", y = "..count..",
xlab = "Delta (PC1-PC2)",
ylab = "Number of genes",
fill = "lightgray", color = "black",
label = "Gene_Symbols", repel = TRUE,
font.label = list(color= "PC1-PC2"),
xticks.by = 20, # Break x ticks by 20
gradient.cols = c("blue", "red"),
legend = c(0.7, 0.6),
legend.title = "" # Hide legend title
)
#
library(dplyr)
library(ggplot2)
plt1 <- Delta %>% select(`PC1-PC2`) %>%
ggplot(aes(x="", y = `PC1-PC2`)) +
geom_boxplot(fill = "lightblue", color = "black") +
coord_flip() +
theme_classic() +
xlab("") +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
# install.packages("egg", dependencies = TRUE)
egg::ggarrange(plt2, plt1, heights = 2:1)
Thank You,
Toufiq

Extract feature between 25th and 75th percentile of PC1-PC2:
Delta %>% filter(`PC1-PC2` >= quantile(Delta$`PC1-PC2`, .25) &
`PC1-PC2` <= quantile(Delta$`PC1-PC2`, .75) )
PC1-PC2 Gene_Symbols
Feature_5_Compound_3 0.01619335 Feature_5_Compound_3
Feature_1_Compound_2 0.20830381 Feature_1_Compound_2
Feature_6_Compound_3 0.18027257 Feature_6_Compound_3
Feature_9_Compound_3 0.01385027 Feature_9_Compound_3
Feature_7_Compound_2 0.23798933 Feature_7_Compound_2
Feature_9_Compound_2 0.20619031 Feature_9_Compound_2
Feature_10_Compound_2 0.14103835 Feature_10_Compound_2
Feature_8_Compound_2 0.16794231 Feature_8_Compound_2
Feature_10_Compound_1 0.14717478 Feature_10_Compound_1
Feature_8_Compound_1 0.11179112 Feature_8_Compound_1
Feature_4_Compound_2 0.11421680 Feature_4_Compound_2
Feature_9_Compound_1 0.01859176 Feature_9_Compound_1
Feature_4_Compound_1 0.01470285 Feature_4_Compound_1

Related

Order Y axis labels based on X increasing values

I have this data like this
z <- structure(list(Description = c("Neurotransmitter receptors and postsynaptic signal transmission",
"Muscle contraction", "Class A/1 (Rhodopsin-like receptors)",
"Signaling by Rho GTPases", "Metabolism of carbohydrates", "Extracellular matrix organization",
"Transmission across Chemical Synapses", "G alpha (i) signalling events",
"GPCR ligand binding", "Neuronal System"), p.adjust = c(0.563732077253957,
0.563732077253957, 0.774251160588198, 0.797669099976286, 0.655931854998983,
0.655931854998983, 0.563732077253957, 0.774251160588198, 0.774251160588198,
0.655931854998983), Count = c(9L, 9L, 9L, 9L, 10L, 10L, 11L,
11L, 12L, 13L)), row.names = c("R-HSA-112314", "R-HSA-397014",
"R-HSA-373076", "R-HSA-194315", "R-HSA-71387", "R-HSA-1474244",
"R-HSA-112315", "R-HSA-418594", "R-HSA-500792", "R-HSA-112316"
), class = "data.frame")
I would like to plot labels of y axis based on values of x axis, so from the smallest one to the largest one. Now it plots me in alphabetic order. How to do this?
ggplot(z, aes(Count, Description, size=Count, color=p.adjust))+
geom_point()
Somethine like this
With forcats::fct_reorder(Description, Count) you can change the order of y values.
library(ggplot2)
library(forcats)
ggplot(z, aes(Count, fct_reorder(Description, Count), size=Count, color=p.adjust))+
geom_point()
Created on 2022-02-01 by the reprex package (v2.0.1)

Grouped bar chart in R for multiple filter and select

Following is my dataset:
Result
course1
course2
course3
pass
15
17
18
pass
12
14
19
Fail
9
13
3
Fail
3
2
0
pass
14
11
20
Fail
5
0
7
I want to plot a grouped bar graph. I am able to plot following graphs but I want both the results in same graph.
par(mfrow=c(1,1))
options(scipen=999)
coul <- brewer.pal(3, "Set2")
# Bar graph for passed courses
result_pass <-data %>% filter(Result=='Pass') %>% summarize(c1_tot=sum(course1),
c2_tot = sum(course2), c3_tot = sum(course3) )
col_sum <- colSums(result_pass[,1:3])
barplot(colSums(result_pass[,1:3]), xlab = "Courses", ylab = "Total Marks", col = coul, ylim=range(pretty(c(0, col_sum))), main = "Passed courses ")
# Bar graph for Failed courses
result_fail <-data %>% filter(Result=='Fail') %>% summarize(c1_tot=sum(course1),
c2_tot = sum(course2), c3_tot = sum(course3) )
col_sum <- colSums(result_fail[,1:3])
barplot(colSums(result_fail[,1:3]), xlab = "Courses", ylab = "Total Marks", col = coul, ylim=range(pretty(c(0, col_sum))), main = "Failed courses ")
Any suggestion for which I can merge both the above plots and create grouped bar graph for Pass and Fail courses.
It's probably easier than you think. Just put the data directly in aggregate and use as formula . ~ Result, where . means all other columns. Removing first column [-1] and coerce as.matrix (because barplot eats matrices) yields exactly the format we need for barplot.
This is the basic code:
barplot(as.matrix(aggregate(. ~ Result, data, sum)[-1]), beside=TRUE)
And here with some visual enhancements:
barplot(as.matrix(aggregate(. ~ Result, data, sum)[-1]), beside=TRUE, ylim=c(0, 70),
col=hcl.colors(2, palette='viridis'), legend.text=sort(unique(data$Result)),
names.arg=names(data)[-1], main='Here could be your title',
args.legend=list(x='topleft', cex=.9))
box()
Data:
data <- structure(list(Result = c("pass", "pass", "Fail", "Fail", "pass",
"Fail"), course1 = c(15L, 12L, 9L, 3L, 14L, 5L), course2 = c(17L,
14L, 13L, 2L, 11L, 0L), course3 = c(18L, 19L, 3L, 0L, 20L, 7L
)), class = "data.frame", row.names = c(NA, -6L))

Change order for factor in grouped-bar plot

I have the following code:
figg4 <- lala4 %>% gather(key, value, -Species_Name) %>%
mutate (Species_Name = factor(Species_Name,
levels=c('Dasyprocta punctata',
'Cuniculus paca','Large Rats',
'Heteromys unknown', 'Sciurus variegatoides',
'Sciurus granatensis','Dasypus novemcinctus',
'Didelphis marsupialis', 'Philander opossum',
'Metachirus nudicaudatus', 'Nasua narica',
'Procyon lotor', 'Eira barbara',
'Galictis vittata', 'Leopardus pardalis'))) %>%
ggplot(aes(x=Species_Name, y=value,
fill=key)) + coord_flip() + geom_col (position = "stack") +
theme(panel.background = element_blank()) + bbc_style() +
labs(title = "Species occupancy by Site Type")+
scale_fill_manual(values = c("#333333","#1380A1", "#FAAB18"))
I get bar graph which is listing the names in the reverse order, I want to make them appear in the order that I have written the levels in... how do I do so?
I tried using fct_reorder from forcats by adding the following code
mutate(name = fct_reorder(Species_Name, desc(value)))
But that did not change the order.
I am quite new to r and not sure of how to do this. Would be grateful for any help
Here is dput output for the source:
dput(lala4)
structure(list(Species_Name = structure(c(9L, 12L, 13L, 14L,
19L, 22L, 27L, 46L, 41L, 42L, 10L, 15L, 32L, 33L, 24L), .Label = c("Buteo platypterus",
"Canis latrans", "Cathartes aura", "Catharus unknown", "Catharus ustulatus",
"Cebus capucinus", "Chordeiles unknown", "Conepatus semistriatus",
"Crax rubra", "Crypturellus cinnamomeus", "Cuniculus paca", "Dasyprocta punctata",
"Dasypus novemcinctus", "Didelphis marsupialis", "Eira barbara",
"Galictis vittata", "Geotrygon montana", "Geotrygon violacea",
"Heteromys unknown", "Holcosus quadrilineatus", "Large Rats",
"Leopardus pardalis", "Leopardus wiedii", "Leptotila unknown",
"Melozone unknown", "Metachirus nudicaudatus", "Nasua narica",
"Odocoileus virginianus", "Panthera onca", "Parkesia noveboracensis",
"Pecari tajacu", "Penelopina nigra", "Philander opossum", "Piaya cayana",
"Procyon lotor", "Puma concolor", "Puma yagouaroundi", "Sciurus granatensis",
"Sciurus variegatoides", "Setophaga unknown", "Sylvilagus sp ",
"Tamandua mexicana", "Tapirus bairdii", "Tayassu pecari", "Tigrisoma fasciatum",
"Tinamus major"), class = "factor"), Forest Area (<5ha) = c(0.067307692,
0.134615385, 0.173076923, 0.144230769, 0.019230769, 0.086538462,
0.192307692, 0.009615385, 0.163461538, 0.038461538, 0, 0.019230769,
0, 0.163461538, 0.153846154), Forest Area (5-27ha) = c(0.067307692,
0.317307692, 0.269230769, 0.096153846, 0.038461538, 0.105769231,
0.192307692, 0.115384615, 0.134615385, 0.057692308, 0, 0.096153846,
0, 0.076923077, 0.173076923), Forest Area (>350ha) = c(0.163461538,
0.384615385, 0.278846154, 0.201923077, 0.105769231, 0.067307692,
0.144230769, 0.298076923, 0.028846154, 0.048076923, 0.086538462,
0.038461538, 0.019230769, 0.028846154, 0.125)), row.names = c(NA,
15L), class = "data.frame")
You need to redefine the factor as an ordered factor first.
Try just fixing the code where you define the factor by adding
ordered = TRUE
This should probably work:
figg4 <- lala4 %>% gather(key, value, -Species_Name) %>%
mutate (Species_Name = factor(Species_Name,
levels=c('Dasyprocta punctata',
'Cuniculus paca','Large Rats',
'Heteromys unknown', 'Sciurus variegatoides',
'Sciurus granatensis','Dasypus novemcinctus',
'Didelphis marsupialis', 'Philander opossum',
'Metachirus nudicaudatus', 'Nasua narica',
'Procyon lotor', 'Eira barbara',
'Galictis vittata', 'Leopardus pardalis'), ordered = TRUE)) %>%
ggplot(aes(x=Species_Name, y=value,
fill=key)) + coord_flip() + geom_col (position = "stack") +
theme(panel.background = element_blank()) + bbc_style() +
labs(title = "Species occupancy by Site Type")+
scale_fill_manual(values = c("#333333","#1380A1", "#FAAB18"))
I can't run it because I don't have the lala4 data to test though.

R code to plot a date range as a bar or line for a number of categorical variables

I am struggling to do this in R. I have a list of station names with two associated variables: Start Date and End Date. What I would like to do is plot a horizontal line or bar chart that ranges from the start and end date for each station name.
I have tried using ggplot, but I'll confess I am recent user to R.
If you have data looking like this (dput is at the end) with
start date
end date
task name or station name
and an optional group name
(I invented some data, as the OP does not provide data)
StartDate EndDate TaskName Group
1 2018-10-01 2018-11-02 KPI: high level definition KPI Definition
2 2018-11-05 2018-11-16 KPI: data translation KPI Definition
3 2019-02-18 2019-03-01 KPI: corroboration KPI Definition
4 2018-11-05 2018-11-16 KPI: Define Graphical Format KPI Definition
5 2018-10-22 2018-12-07 Data: Which data Define and Get Data
6 2018-10-08 2018-10-19 Data: Mail requesting data Define and Get Data
7 2018-12-07 2018-12-14 Data: Mail defining data Define and Get Data
8 2018-12-17 2018-12-28 Data: Test data dump Define and Get Data
9 2018-12-17 2018-12-28 Data: CSV temporary Define and Get Data
10 2018-12-31 2019-01-25 Data: Quality inspection of Data Dump Define and Get Data
11 2018-12-31 2019-01-25 Data: Create graphs Define and Get Data
12 2019-01-28 2019-02-15 Data: Correct data comparison with KPI defs Define and Get Data
13 2019-02-04 2019-03-01 Data: Create and publish ppt format Define and Get Data
14 2018-11-19 2018-12-14 Storage: Where Storage
15 2018-11-19 2018-12-14 Storage: How much Storage
You will need to put it in long format (a separate line for start and end)
library(ggplot2)
library(reshape2) # for melt to get the data in long format
m_planning_data2 <- melt(planning_data2, measure.vars = c("StartDate", "EndDate"))
Then plot it using ggplot:
ggplot(m_planning_data2, aes(value, TaskName)) +
geom_line(size=4) +
xlab(NULL) +
ylab(NULL) +
ggtitle("Example Assignment Planning 1") +
theme_minimal() +
theme(aspect.ratio = 0.4, axis.text = element_text(size = 7))
... yielding this simple plot:
Or plot it with grouping and an annotation for "today"
ggplot(m_planning_data2, aes(value, TaskName, col = Group)) +
geom_line(size=4) +
xlab(NULL) +
ylab(NULL) +
ggtitle("Example Assignment Planning 2") +
geom_vline(xintercept = as.POSIXct(as.Date(Sys.time())) , linetype = 1, size=1.5, colour = "purple", alpha= .5) +
annotate("text", x = as.POSIXct(as.Date(Sys.time())) + 86400*1.5, y = 3,
label = as.Date(Sys.time()), colour = "purple", angle=90, size= 3) +
theme_minimal() +
theme(aspect.ratio = 0.4, axis.text = element_text(size = 7))
... yielding the following plot:
Please, let me know whether this is what you were after.
DATA
structure(list(StartDate = structure(c(1538344800, 1541372400,
1550444400, 1541372400, 1540159200, 1538949600, 1544137200, 1545001200,
1545001200, 1546210800, 1546210800, 1548630000, 1549234800, 1542582000,
1542582000), class = c("POSIXct", "POSIXt"), tzone = ""), EndDate = structure(c(1541113200,
1542322800, 1551394800, 1542322800, 1544137200, 1539900000, 1544742000,
1545951600, 1545951600, 1548370800, 1548370800, 1550185200, 1551394800,
1544742000, 1544742000), class = c("POSIXct", "POSIXt"), tzone = ""),
TaskName = structure(c(13L, 11L, 10L, 12L, 9L, 6L, 5L, 8L,
4L, 7L, 3L, 1L, 2L, 15L, 14L), .Label = c("Data: Correct data comparison with KPI defs",
"Data: Create and publish ppt format", "Data: Create graphs",
"Data: CSV temporary", "Data: Mail defining data", "Data: Mail requesting data",
"Data: Quality inspection of Data Dump", "Data: Test data dump",
"Data: Which data", "KPI: corroboration", "KPI: data translation",
"KPI: Define Graphical Format", "KPI: high level definition",
"Storage: How much", "Storage: Where"), class = "factor"),
Group = structure(c(2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L), .Label = c("Define and Get Data", "KPI Definition",
"Storage"), class = "factor")), .Names = c("StartDate", "EndDate",
"TaskName", "Group"), row.names = c(NA, -15L), class = "data.frame")

plotting points within a date range specified with shiny slider bar

From a larger dataset, I want to only plot points that are within a min and max date that is specified with a shiny slider bar containing a date range. This post builds from a related post linked here. Data are contained at the bottom using dput.
The code/app below sequentially plots points as the date is increased with the slider bar. When I move the 2nd slider bar I want points no longer in the date range to be removed, which currenlty does not happen.
How do I subset the data so that only points (and paths) >= the min date and <= the max date are shown? It is not clear to me how to reference the two dates on the slider bar.
Thanks in advance.
library(ggplot2)
library(shiny)
ui <- fluidPage(
titlePanel("GPS Data Summary"),
sliderInput(inputId = "Order",
label = "Sequance of Observations",
min = as.Date(min(dat$PosiGMT)), max = as.Date(max(dat$PosiGMT)),
value = c(as.Date(min(dat$PosiGMT)), as.Date(min(dat$PosiGMT)))),
plotOutput("PointPlot")
)
server <- function(input, output) {
output$PointPlot <- renderPlot({
p <- ggplot(dat[as.Date(dat$PosiGMT) <= input$Order ,], (aes(x = GPSUTMEasting , y = GPSUTMNorthing ))) +
geom_point() + geom_path() +
xlim( min(dat$GPSUTMEasting), max(dat$GPSUTMEasting))+
ylim( min(dat$GPSUTMNorthing), max(dat$GPSUTMNorthing))
print(p)
})
}
shinyApp(ui = ui, server = server)
Data below
dat <- structure(list(GPSUTMNorthing =
c(4947787L, 4947945L, 4947957L,
4947954L, 4947797L, 4947835L, 4947825L, 4947784L, 4947842L, 4947839L,
4947789L, 4947807L, 4947839L, 4947845L, 4947779L, 4947824L, 4947824L,
4947772L, 4947824L, 4947821L, 4947816L, 4947809L, 4947840L, 4947829L,
4947820L),
GPSUTMEasting = c(600201L, 600910L, 600911L, 600907L,
601052L, 601038L, 601031L, 601066L, 600998L, 600995L, 601058L,
601038L, 600987L, 601071L, 601016L, 601002L, 601003L, 601003L,
600917L, 600916L, 600918L, 600923L, 600985L, 600980L, 600914L),
PosiGMT = structure(c(1360393200, 1360414800, 1360479600,
1360501200, 1360544400, 1360566000, 1360587600, 1360630800, 1360652400,
1360674000, 1360695600, 1360717200, 1360738800, 1360803600, 1360825200,
1360846800, 1360868400, 1360890000, 1360911600, 1360933200, 1360954800,
1360976400, 1360998000, 1361019600, 1361041200),
class = c("POSIXct", "POSIXt"), tzone = "") ),
.Names = c("GPSUTMNorthing", "GPSUTMEasting", "PosiGMT"),
row.names = c(1L, 2L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 14L, 15L,
16L, 17L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L),
class = "data.frame")
Hi input$Order is a vector of length 2, so input$Order[1] is the min and input$Order[2] the max, you can do something like this :
library(ggplot2)
library(shiny)
ui <- fluidPage(
titlePanel("GPS Data Summary"),
sliderInput(inputId = "Order",
label = "Sequance of Observations",
min = as.Date(min(dat$PosiGMT)), max = as.Date(max(dat$PosiGMT)),
value = c(as.Date(min(dat$PosiGMT)), as.Date(min(dat$PosiGMT)))),
plotOutput("PointPlot")
)
server <- function(input, output) {
output$PointPlot <- renderPlot({
### Filter by date
dat <- dat[as.Date(dat$PosiGMT) >= input$Order[1] & as.Date(dat$PosiGMT) <= input$Order[2] ,]
###
p <- ggplot(dat, (aes(x = GPSUTMEasting , y = GPSUTMNorthing ))) +
geom_point() + geom_path() +
xlim( min(dat$GPSUTMEasting), max(dat$GPSUTMEasting))+
ylim( min(dat$GPSUTMNorthing), max(dat$GPSUTMNorthing))
print(p)
})
}
shinyApp(ui = ui, server = server)

Resources