Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
alpha_diversity<-estimate_richness(ps.Prymnesiophyceae,measure=c("Shannon","Observed"))
H<-alpha_diversity$Shannon
S1<-alpha_diversity$Observed
S<-log(S1)
evenness<-H/S
alpha<-cbind(Shannon=H,Richness=S1,Evenness=evenness,sample_data(ps.Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
The suggestion by @GTM works well until the last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2))
c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq")

> dput(head(alpha, n=2))
structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765, 2.53358345702604),
    taxon = c("Prymnesiophyceae", "Prymnesiophyceae"),
    sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"),
    S = c(3.43398720448515, 3.2188758248682),
    evenness = c(0.827562817437384, 0.787101955736294)),
    row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"),
    class = "data.frame")

> dput(head(smpl_data, n=1))
new("sample_data", .Data = list("001_DCM", 125L,
    structure(1L, .Label = "DCM", class = "factor"),
    structure(1L, .Label = "Transect", class = "factor"),
    structure(1L, .Label = "STZ", class = "factor"),
    structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
    16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
    1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
    0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3),
    names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water",
    "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity",
    "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL",
    "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL",
    "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"),
    row.names = "2-1-DCM_S21_L001_R1_001.fastq", .S3Class = "data.frame")
You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Calculate alpha diversity measures for a specific taxon at a specified rank.
# You can pass any parameters that you normally pass to `estimate_richness`
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
# Subset to taxon of interest
tax_tbl <- as.data.frame(tax_table(ps))
keep <- tax_tbl[,tax_rank] == taxon_name
keep[is.na(keep)] <- FALSE
ps_phylum <- prune_taxa(keep, ps)
# Calculate alpha diversity and generate a table
alpha_diversity <- estimate_richness(ps_phylum, ...)
alpha_diversity$taxon <- taxon_name
alpha_diversity$sample_id <- row.names(alpha_diversity)
return(alpha_diversity)
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns

# Estimate alpha diversity for each phylum
phyla <- get_taxa_unique(ps, taxonomic.rank = "Phylum")
phyla <- phyla[!is.na(phyla)]

alpha <- data.frame()
for (phylum in phyla){
  a <- estimate_diversity_for_taxon(ps = ps,
                                    taxon_name = phylum,
                                    measure = c("Shannon", "Observed"))
  alpha <- rbind(alpha, a)
}

# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon / alpha$S

# Add sample data
smpl_data <- as.data.frame(sample_data(ps))
alpha <- left_join(alpha,
                   smpl_data,
                   by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
smpl_data <- as.data.frame(sample_data(ps))
smpl_data$sample_id <- row.names(smpl_data)
alpha <- left_join(alpha,
                   smpl_data,
                   by = c("sample_id" = "sample_id"))
I have already searched the forum for hours (really) and am starting to get the faint feeling that I am slowly going crazy, especially as it appears to me to be a really easily solvable problem.
What do I want to do?
Basically, I want to simulate clinical data. Specifically, for each patient (column 1: id) I want to generate an arbitrary score (column 3: score) that depends on the assigned treatment group (column 2: group).
set.seed(123)
# Number of subjects in study
n_patients = 1000
# Score: Mean and SDs
mean_verum = 70
sd_verum = 20
mean_placebo = 40
sd_placebo = 20
# Allocating to Treatment groups:
data = data.frame(id = as.character(1:n_patients))
data$group[1:(n_patients/2)] <- "placebo"
data$group[(n_patients/2+1):n_patients] <- "verum"
# Attach score for each treatment group
data$score <- ifelse(data$group == "verum",
                     rnorm(n = n_patients, mean = mean_verum, sd = sd_verum),
                     rnorm(n = n_patients, mean = mean_placebo, sd = sd_placebo))
So far so easy. Now, I wish to 1) calculate the probability of an event happening (logit function) depending on the score, and then 2) actually assign an event, depending on that probability (rbinom).
I want to do this for n different probabilities/events. This is the code I've used so far:
Calculate probabilities:
a = -1
b = 0.01
p1 = 1-exp(a+b*data$score)/(1+exp(a+b*data$score))
data$p_AE1 <- p1
a = -0.5
b = 0.01
p1 = 1-exp(a+b*data$score)/(1+exp(a+b*data$score))
data$p_AE2 <- p1
…
Assign Events:
data$Abbruch_AE1 <- rbinom(n_patients, 1, data$p_AE1)
data$Abbruch_AE2 <- rbinom(n_patients, 1, data$p_AE2)
…
Obviously, this is really inefficient, as I would like to easily scale this up or down depending on how many probabilities/events I want to simulate.
The problem is that I simply cannot work out how to simultaneously a) generate a new column in the dataframe for each event, b) apply the function that assigns the probabilities/events, and c) do this for n different formulas, each with its specific a and b.
I am sure the solution to this problem is a simple one; what I haven't managed is to do all these things at once, which is where I would like to end up eventually. I have played around with for loops, all to no avail.
Any help would be greatly appreciated!
This is how my dataframe looks:
structure(list(id = structure(1:3, .Label = c("1", "2", "3"), class = "factor"),
group = c("placebo", "placebo", "placebo"), score = c(25.791868726014,
45.1376741831306, 35.0661624307525), p_AE1 = c(0.677450814266315,
0.633816117436442, 0.656861351663365), p_AE2 = c(0.560226492151216,
0.512153420188678, 0.537265362130761), p_AE3 = c(0.435875409622676,
0.389033483248856, 0.413221988111604), p_AE4 = c(0.319098312196655,
0.278608032377073, 0.299294085148527), p_AE5 = c(0.221332386680766,
0.189789774534235, 0.205762225373345), p_AE6 = c(0.147051201194953,
0.124403316086538, 0.135795233451071), p_AE7 = c(0.0946686004658072,
0.0793379289917946, 0.0870131973838217), p_AE8 = c(0.0596409872667201,
0.0496714832182721, 0.0546471270895262), AbbruchAE1 = c(1L,
1L, 1L), AbbruchAE2 = c(1L, 1L, 0L), AbbruchAE3 = c(0L, 0L,
0L), AbbruchAE4 = c(0L, 1L, 0L), AbbruchAE5 = c(1L, 0L, 0L
), AbbruchAE6 = c(1L, 0L, 0L), AbbruchAE7 = c(0L, 0L, 0L),
AbbruchAE8 = c(0L, 0L, 0L)), .Names = c("id", "group", "score", "p_AE1", "p_AE2", "p_AE3", "p_AE4", "p_AE5", "p_AE6", "p_AE7", "p_AE8", "AbbruchAE1", "AbbruchAE2", "AbbruchAE3", "AbbruchAE4", "AbbruchAE5", "AbbruchAE6", "AbbruchAE7", "AbbruchAE8"), row.names = c(NA, 3L), class = "data.frame")
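A minimal sketch of one way to scale this up, assuming the a/b pairs are collected in a small parameter table (only the two pairs shown above are included; the column names follow the p_AE*/AbbruchAE* pattern of the dput):
# One row per event: its intercept a and slope b
params <- data.frame(a = c(-1, -0.5), b = c(0.01, 0.01))
for (i in seq_len(nrow(params))) {
  # Same probability formula as above, with the i-th pair of coefficients
  p <- 1 - exp(params$a[i] + params$b[i] * data$score) /
    (1 + exp(params$a[i] + params$b[i] * data$score))
  data[[paste0("p_AE", i)]] <- p
  # Draw the corresponding event indicator
  data[[paste0("AbbruchAE", i)]] <- rbinom(n_patients, 1, p)
}
Adding more events is then just a matter of adding rows to params.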
I am creating a Flexdashboard in R. I want the dashboard to contain both a table and a series of visualizations, which would be filtered through inputs.
As I need to deliver a dashboard locally (without a server running in the background), I am unable to use Shiny, hence I rely on crosstalk.
I know that the crosstalk package provides limited functionality in the front-end. For instance, the documentation says that you can't aggregate the SharedData object.
Nonetheless, it is not clear to me whether I can use the same inputs to filter two different dataframes.
For example, let's say I have:
Dataframe One: Contains original data
df1 <- structure(list(owner = structure(c(1L, 2L, 2L, 2L, 2L), .Label = c("John",
"Mark"), class = "factor"), hp = c(250, 120, 250, 100, 110),
car = structure(c(2L, 2L, 2L, 1L, 1L), .Label = c("benz",
"bmw"), class = "factor"), id = structure(1:5, .Label = c("car1",
"car2", "car3", "car4", "car5"), class = "factor")), .Names = c("owner",
"hp", "car", "id"), row.names = c(NA, -5L), class = "data.frame")
Dataframe Two: Contains aggregated data
df2 <- structure(list(car = structure(c(1L, 2L, 1L, 2L), .Label = c("benz",
"bmw"), class = "factor"), owner = structure(c(1L, 1L, 2L, 2L
), .Label = c("John", "Mark"), class = "factor"), freq = c(0L,
1L, 2L, 2L)), .Names = c("car", "owner", "freq"), row.names = c(NA,
-4L), class = "data.frame")
These two dataframes contain columns with identical values (car and owner), as well as additional columns of their own.
I could create two different objects:
library(crosstalk)
shared_df1 <- SharedData$new(df1)
shared_df2 <- SharedData$new(df2)
and then:
filter_select("owner", "Car owner:", shared_df1, ~ owner)
filter_select("owner", "Car owner:", shared_df2, ~ owner)
However, that would mean the user needs to fill in two inputs that are essentially identical. Also, if the table is large, this would double the memory needed to use the dashboard.
Is it possible to work around this problem in crosstalk?
Ah, I recently ran into this too; there is another argument to SharedData$new(..., group = )! The group argument seems to do the trick. I found out by accident when I had two dataframes and used group =.
If you make a SharedData object, it will include
a dataframe
a key to select rows by (preferably unique, but not necessarily)
a group name
What I think happens is that crosstalk filters the SharedData by the key, for all SharedData objects in the same group! So as long as two dataframes use the same key, you should be able to filter them together in one group.
This should work for your example.
---
title: "blabla"
output:
flexdashboard::flex_dashboard:
orientation: rows
social: menu
source_code: embed
theme: cerulean
---
```{r}
library(plotly)
library(crosstalk)
library(tidyverse)
```
```{r Make dataset}
df1 <- structure(list(owner = structure(c(1L, 2L, 2L, 2L, 2L), .Label = c("John", "Mark"), class = "factor"), hp = c(250, 120, 250, 100, 110), car = structure(c(2L, 2L, 2L, 1L, 1L), .Label = c("benz", "bmw"), class = "factor"), id = structure(1:5, .Label = c("car1", "car2", "car3", "car4", "car5"), class = "factor")), .Names = c("owner", "hp", "car", "id"), row.names = c(NA, -5L), class = "data.frame")
df2 <- structure(list(car = structure(c(1L, 2L, 1L, 2L), .Label = c("benz",
"bmw"), class = "factor"), owner = structure(c(1L, 1L, 2L, 2L
), .Label = c("John", "Mark"), class = "factor"), freq = c(0L,
1L, 2L, 2L)), .Names = c("car", "owner", "freq"), row.names = c(NA,
-4L), class = "data.frame")
```
#
##
### Filters
```{r}
library(crosstalk)
# Notice the 'group = ' argument - this does the trick!
shared_df1 <- SharedData$new(df1, ~owner, group = "Choose owner")
shared_df2 <- SharedData$new(df2, ~owner, group = "Choose owner")
filter_select("owner", "Car owner:", shared_df1, ~owner)
# You don't need this second filter now
# filter_select("owner", "Car owner:", shared_df2, ~ owner)
```
### Plot1 with plotly
```{r}
plot_ly(shared_df1, x = ~id, y = ~hp, color = ~owner) %>% add_markers() %>% highlight("plotly_click")
```
### Plots with plotly
```{r}
plot_ly(shared_df2, x = ~owner, y = ~freq, color = ~car) %>% group_by(owner) %>% add_bars()
```
##
### Dataframe 1
```{r}
DT::datatable(shared_df1)
```
### Dataframe 2
```{r}
DT::datatable(shared_df2)
```
I spent some time trying to extract data from plot_ly() using plotly_data(), without luck, until I figured out the answer. That's why there are some very simple plots with plotly.
Recently, I've also wanted to use one filter to filter 2 visualizations.
Brief description of my situation
I've wanted to use one filter to filter a boxplot and a table.
The source data has been a data frame. I've wanted to use some of the variables for the boxplot and also calculate some statistics (like mean, standard deviation, mode, and number of records).
Functions I've needed to use to display results: plotly::plot_ly(), DT::datatable(), crosstalk::bscols().
I've found out that there are three key pieces of information needed to solve this situation.
Key 1) It's necessary to create the shared data correctly.
In my case, I've had to use crosstalk::SharedData$new() twice.
Shared data that can serve as the source for the visualizations can only be created correctly once keys 2 and 3 are fulfilled.
Key 2) When creating shared data, use the same group argument, as "Lodewic Van Twillert" explained on 16 Mar 2018.
Key 3) Ensure that all SharedData instances refer conceptually to the same data points and share the same keys.
Start by ensuring that the data frame has row names, even if the row names are just a character vector of numbers (like "1", "2", ...).
Literature used for key 3: https://rstudio.github.io/crosstalk/using.html (I suggest mainly reading the section "Grouping").
Summary of the steps I've used to fulfill the key information above
Key 3) This one can be tricky, as the relevant conditions of key 3 above must be fulfilled.
The approach I've chosen creates one table containing all the data, and this table (data frame) is used to create both shared data objects.
I've applied data manipulations to the original data frame (risk_scores_df), so this data now has a new column.
I've created a new data frame with statistics.
I've joined both data frames using
risk_scores_df <- dplyr::left_join... so the original data frame now contains all the prepared data.
I've run print(rownames(risk_scores_df)) to ensure that my updated data frame has row names.
Now I have one data frame containing all the data (needed for both visualizations) that fulfills the conditions of key 3 above.
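For illustration, the preparation described in these steps might look roughly like this (stats_df and the join column "Account" are assumptions used only for this sketch):
# stats_df is assumed to hold the per-group statistics created in the step above
risk_scores_df <- dplyr::left_join(risk_scores_df, stats_df, by = "Account")
# Confirm the joined data frame still carries row names (key 3)
print(rownames(risk_scores_df))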
Key 2) I've simply added group = "sd1" in both crosstalk::SharedData$new() calls.
Key 1) This one can also be tricky if the wrong approach is chosen.
Here, the key to creating correct shared data instances is to use that one table with all the data and select only the rows and columns needed for each shared data object.
Example: in my case, I've run the code in Option 1 to create the two shared data instances, but Option 2 is also possible.
Option 1 (selecting only the needed rows and columns directly inside crosstalk::SharedData$new())
rs_df_sd1 <- crosstalk::SharedData$new(
  risk_scores_df[, c(1, 2, 5)],
  group = "sd1"
)
rs_df_sd1a <- crosstalk::SharedData$new(
  risk_scores_df[risk_scores_df$NumRecords > 0 &
                   is.na(risk_scores_df$NumRecords) == F,
                 c(1, 6:11)],
  group = "sd1"
)
Option 2 (selecting only the needed rows and columns in intermediate variables)
sd1 <- risk_scores_df[, c(1, 2, 5)]
sd1a <- risk_scores_df[risk_scores_df$NumRecords > 0 &
                         is.na(risk_scores_df$NumRecords) == F,
                       c(1, 6:11)]
rs_df_sd1 <- crosstalk::SharedData$new(sd1, group = "sd1")
rs_df_sd1a <- crosstalk::SharedData$new(sd1a, group = "sd1")
Completing the solution
At this point I've created shared data instances rs_df_sd1 and rs_df_sd1a that can be used as main sources for visualizations that will be filtered using crosstalk::bscols().
Brief example:
box_n_jitter_chart1 <- plotly::plot_ly(rs_df_sd1) %>% add_trace(...
DT_table1 <- DT::datatable(rs_df_sd1a)
crosstalk::bscols(
widths = c(6, 12, NA),
crosstalk::filter_select(
id = "idAvgRisk",
label = "Account",
sharedData = rs_df_sd1,
group = ~Account,
multiple = F
),
box_n_jitter_chart1,
DT_table1
)
Note: it is also possible to build the table from rs_df_sd1a$data() together with cells = list(values = base::rbind(... (see more about using cells e.g. at https://plotly.com/r/reference/table/), but because the data() method is used (see more e.g. at https://rdrr.io/cran/crosstalk/man/SharedData.html#method-data), the result will not be linked to the filter through crosstalk::bscols.
I am learning the use of the ifelse function from Zuur et al. (2009), A Beginner's Guide to R. In one exercise, there is a data frame called Owls which contains data about 27 nests and two nights of observations.
structure(list(Nest = structure(c(1L, 1L, 1L, 1L), .Label = "AutavauxTV", class = "factor"),
FoodTreatment = structure(c(1L, 2L, 1L, 1L), .Label = c("Deprived",
"Satiated"), class = "factor"), SexParent = structure(c(1L,
1L, 1L, 1L), .Label = "Male", class = "factor"), ArrivalTime = c(22.25,
22.38, 22.53, 22.56), SiblingNegotiation = c(4L, 0L, 2L,
2L), BroodSize = c(5L, 5L, 5L, 5L), NegPerChick = c(0.8,
0, 0.4, 0.4)), .Names = c("Nest", "FoodTreatment", "SexParent",
"ArrivalTime", "SiblingNegotiation", "BroodSize", "NegPerChick"
), row.names = c(NA, 4L), class = "data.frame")
The two nights differed in the feeding regime (satiated or deprived), which is indicated in the FoodTreatment variable. The task is to use the ifelse and paste functions to make a new categorical variable that identifies observations from a single night at a particular nest.
In the solutions the following code is suggested:
Owls <- read.table(file = "Owls.txt", header = TRUE, dec = ".")
ifelse(Owls$FoodTreatment == "Satiated",
       Owls$NestNight <- paste(Owls$Nest, "1", sep = "_"),
       Owls$NestNight <- paste(Owls$Nest, "2", sep = "_"))
and apparently it creates a new variable whose values end in either "_1" or "_2".
However, when I then look at the dataframe, all the "_1" endings in the NestNight variable disappear and are turned into "_2".
Why does this happen? Did the authors miss something in the code, or is it me who is not getting it?
Many thanks
EDIT: Sorry, I wanted to give a reproducible example by copying my data using dput but it did not work. If you can let me know how I can correct it so that it appears properly, I'd be grateful too!
Solution
If you do the assignment outside the ifelse structure, it works:
Owls$NestNight <- ifelse(Owls$FoodTreatment == "Satiated",
paste(Owls$Nest, "1",sep = ""),
paste(Owls$Nest, "2",sep = ""))
Explanation
What happens in your case is essentially as if you executed the following two lines:
Owls$NestNight <- paste(Owls$Nest, "1",sep = "")
Owls$NestNight <- paste(Owls$Nest, "2",sep = "")
You first assign paste(Owls$Nest, "1", sep = "") to Owls$NestNight and then you reassign paste(Owls$Nest, "2", sep = "") to it. The ifelse itself still returns the correctly mixed result, but you never assign that result to any variable.
Maybe it is more clear if you test this simple code:
c(a <- 1:5, a <- 6:10) #c is your ifelse, a is your Owls$NestNight
a #[1] 6 7 8 9 10
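Applied to the four dput rows from the question (keeping the original sep = "_"), the corrected pattern produces a mix of both endings, which is what the exercise is after:
Owls$NestNight <- ifelse(Owls$FoodTreatment == "Satiated",
                         paste(Owls$Nest, "1", sep = "_"),
                         paste(Owls$Nest, "2", sep = "_"))
Owls$NestNight
#[1] "AutavauxTV_2" "AutavauxTV_1" "AutavauxTV_2" "AutavauxTV_2"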
I have a simple trellis scatterplot with two panels, male/female. ID is a unique number for each participant. Var1 is a total test time. mean.values is a vector of two numbers (the means for each gender).
There is no point including a best-fit line, so what I want is to plot a trend line at the mean in each panel. The two panels have different means, say male = 1 minute, female = 2 minutes.
xyplot(var1 ~ ID | Gender, data = DF,
       group = Gender,
       panel = function(...) {
         panel.xyplot(...)
         panel.abline(h = mean.values)
       })
At the moment the graph comes out with both trend lines appearing in each panel. I want only one trend line in each.
Does anyone know a way to do this?
I have tried a number of different ways, including the long addLine function code, which just doesn't work for me. I just want to define which panel I'm looking at; I've looked at ?panel.number but I'm not sure how that works, as it reports that I don't have a current row (current.row(prefix)).
There must be a simple way of doing this?
[EDIT - Here's the actual data I'm using]
I've tried to simplify the DF:
library(lattice)
dput(head(DF))
structure(list(ID = 1:6, Var1 = c(2333858, 4220644,
2941774, 2368496, 3165740, 3630300), mean = c(2412976, 2412976,
2412976, 2412976, 2412976, 2412976), Gender = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), .Names = c("ID",
"Var1", "mean", "Gender"), row.names = c(NA, 6L), class = "data.frame")
dput(tail(DF))
structure(list(ID = 161:166, Var1= c(2825246, 3552170,
3688882, 2487760, 3849108, 3085342), mean = c(3689805, 3689805,
3689805, 3689805, 3689805, 3689805), Gender = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")), .Names = c("ID",
"Var1", "mean", "Gender"), row.names = 109:114, class = "data.frame")
The plot I'm using:
xyplot((Var1/1000) ~ ID | Gender, data = DF,
       group = Gender, scales = list(x = list(at = NULL)),
       panel = function(...) {
         panel.xyplot(...)
         panel.abline(h = mean.values)
       })
This produces two lines in each panel.
[EDIT - This is the addLine function code that appears in many posts but doesn't seem to work for me]
addLine <- function(a = NULL, b = NULL, v = NULL, h = NULL, ..., once = F) {
  tcL <- trellis.currentLayout()
  k <- 0
  for (i in 1:nrow(tcL))
    for (j in 1:ncol(tcL))
      if (tcL[i, j] > 0) {
        k <- k + 1
        trellis.focus("panel", j, i, highlight = FALSE)
        if (once)
          panel.abline(a = a[k], b = b[k], v = v[k], h = h[k], ...)
        else
          panel.abline(a = a, b = b, v = v, h = h, ...)
        trellis.unfocus()
      }
}
then, after the trellis plot, calling the following (mean.values being a vector of two numbers: the mean for females and the mean for males):
addLine(v=(mean.values), once=TRUE)
Update: I managed to do it in ggplot2.
Make the ggplot using facet_wrap, then:
hline.data <- data.frame(z = c(2413, 3690), Gender = c("Female","Male"))
This creates a 2x2 data frame of the two means and the corresponding Gender.
myplot <- myplot + geom_hline(aes(yintercept = z), hline.data)
This adds the lines to the ggplot.
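For completeness, a self-contained sketch of that ggplot2 approach might look like this, using the DF from the dput above (note that the Gender values in hline.data have to match the facet levels, which are "1" and "2" in the dput rather than "Female"/"Male"):
library(ggplot2)
# Means of Var1/1000 for each Gender level, as in the update above
hline.data <- data.frame(z = c(2413, 3690), Gender = c("1", "2"))
myplot <- ggplot(DF, aes(x = ID, y = Var1 / 1000)) +
  geom_point() +
  facet_wrap(~ Gender) +
  geom_hline(aes(yintercept = z), data = hline.data)
myplot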
If you just want to plot the mean of the values you are already drawing on the plot, you can skip the mean.values variable and just do
xyplot(Var1 ~ ID | Gender, data = DF,
       group = Gender,
       panel = function(x, y, ...) {
         panel.xyplot(x, y, ...)
         panel.abline(h = mean(y))
       })
With the sample data
DF<-data.frame(
ID=1:10,
Gender=rep(c("M","F"), each=5),
Var1=c(5,6,7,6,5,8,9,10,8,9)
)
this produces a plot with a single horizontal line at the group mean in each panel.
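If you do want to keep a precomputed mean.values vector as in the question, another option is to index it with panel.number() inside the panel function (a sketch, assuming mean.values is ordered the same way as the Gender panels):
xyplot(Var1 ~ ID | Gender, data = DF,
       panel = function(x, y, ...) {
         panel.xyplot(x, y, ...)
         # panel.number() gives the index of the panel currently being drawn
         panel.abline(h = mean.values[panel.number()])
       })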
I believe lattice has a specific panel function for this, panel.average().
Try replacing panel.abline(h=mean.values) with panel.average(...).
If that doesn't solve the problem, we might need more information; try using dput() on your data (e.g., dput(DF), or some representative subset).