multivariate k-means cluster - r

I'm trying to do a multivariate k-means cluster plot in r. I have 3 variables, and 10 columns of data, plus the context (like species for Iris) so 11 variables. And my x is PeruReady, obviously
Following a tutorial online I got this far:
PeruReady.km <- kmeans(PeruReady[, -1], 3, iter.max=1000)
tbl <- table(PeruReady[, 1], PeruReady.km$cluster)
PeruReady.dist <- dist(PeruReady[, -1])
PeruReady.mds <- cmdscale(PeruReady.dist)
c.chars <- c("*", "o", "+")[as.integer(PeruReady$Context)]
a.cols <- rainbow(3)[PeruReady$cluster]
plot(PeruReady.mds, col=a.cols, pch=c.chars, xlab="X", ylab="Y")
But my plot is coming up completely empty, what am I doing wrong?

With a small data set (demand.sm), your code worked just fine. Have you normalized all your numeric columns?
dput(demand.sm)
structure(list(Demand = c("rify la", "p quasi", "rify LD", "ventive",
"ekeeper", " de min", " risk g", " approv", "uest te", "", "al trai",
"cation", "ely inv", "rge tim", "get of ", "vey pro", "ent ONA",
"ble sel", "cipline", "tus rep", "ced-ran"), normalized = structure(c(-1.15780226157481,
-0.319393727330983, -1.15780226157481, -1.15780226157481, -0.319393727330983,
-0.319393727330983, -0.319393727330983, -0.319393727330983, 0.519014806912847,
0.519014806912847, 0.519014806912847, -0.738597994452898, -0.738597994452898,
2.19583187540051, 2.19583187540051, -1.15780226157481, -0.319393727330983,
-0.319393727330983, 0.519014806912847, 1.35742334115668, 0.519014806912847
), .Dim = c(21L, 1L), "`scaled:center`" = 3.76190476190476, "`scaled:scale`" = 2.38547190100328)), .Names = c("Demand",
"normalized"), row.names = c(NA, -21L), class = "data.frame")
clusters <- kmeans(demand.sm[ , "normalized"], 5)
demand.dist <- dist(demand.sm[ , "normalized"])
demand.mds <- cmdscale(demand.dist) # multidimensional scaling of data matrix, aka principal coordinates analysis
c.chars <- c("*", "o", "+")[as.integer(clusters$Context)]
a.cols <- rainbow(3)[clusters$cluster]
plot(demand.mds, col=a.cols, pch=c.chars, xlab="X", ylab="Y")

Related

Estimate_richness for all phyla in phyloseq

Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
alpha_diversity<-estimate_richness(ps.Prymnesiophyceae,measure=c("Shannon","Observed"))
H<-alpha_diversity$Shannon
S1<-alpha_diversity$Observed
S<-log(S1)
evenness<-H/S
alpha<-cbind(Shannon=H,Richness=S1,Evenness=evenness,sample_data(Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
suggestion by #GTM works well until last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765,
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384,
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"),
structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq",
.S3Class = "data.frame")
You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Calculate alpha diversity measures for a specific taxon at a specified rank.
# You can pass any parameters that you normally pass to `estimate_richness`
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
# Subset to taxon of interest
tax_tbl <- as.data.frame(tax_table(ps))
keep <- tax_tbl[,tax_rank] == taxon_name
keep[is.na(keep)] <- FALSE
ps_phylum <- prune_taxa(keep, ps)
# Calculate alpha diversity and generate a table
alpha_diversity <- estimate_richness(ps_phylum, ...)
alpha_diversity$taxon <- taxon_name
alpha_diversity$sample_id <- row.names(alpha_diversity)
return(alpha_diversity)
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns
# Estimate alpha diversity for each phylum
phyla <- get_taxa_unique(ps,
taxonomic.rank = 'Phylum')
phyla <- phyla[!is.na(phyla)]
alpha <- data.frame()
for (phylum in phyla){
a <- estimate_diversity_for_taxon(ps = ps,
taxon_name = phylum,
measure = c("Shannon", "Observed"))
alpha <- rbind(alpha, a)
}
# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon/alpha$S
# Add sample data
smpl_data <- as.data.frame(sample_data(ps))
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
smpl_data <- as.data.frame(sample_data(ps))
smpl_data$sample_id < row.names(smpl_data)
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "sample_id"))

Creating distribution curves with specific moments

Is there a way to create a distribution curve given the 1st through 4th moments (mean, variance or standard deviation, skewness, and kurtosis)? Here is a small table of the descriptive statistics. The fifth variable has stronger positive skew and larger kurtosis than the rest, and leads me to believe a non-normal distribution may need to be used.
dput(summarystats_factors)
structure(list(ERVALUEY = c(1.21178722715092, 8.4400515531338,
0.226004674926861, 3.89328347004421), ERVOLY = c(0.590757887612924,
7.48697754999463, 0.295973723450469, 3.31326615805655), ERQUALY = c(1.59367031426668,
4.57371901763411, 0.601172123904339, 3.89080479205755), ERMOMTY = c(3.09719686678745,
7.01446175391253, 0.260638252621096, 3.28326189430607), ERSIZEY = c(1.69935727981412,
6.1917295410928, 1.24021163316834, 6.23493767854042), Moment = structure(c("Mean",
"Standard Deviation", "Skewness", "Kurtosis"), .Dim = c(4L, 1L
))), row.names = c(NA, -4L), class = "data.frame")
We could use curve with PearsonDS::dpearson. Note, that the moments= argument expects exactly the order mean, variance, skewness, kurtosis, so that the rows of the data must be ordered correspondingly (as is the case in your example data).
FUN <- function(d, xlim, ylim, lab=colnames(d), main='Theoretical Distributions') {
s <- seq(d)
lapply(s, \(i) {
curve(PearsonDS::dpearson(x, moments=d[, i]), col=i + 1, xlim=xlim, ylim=ylim,
add=ifelse(i == 1, FALSE, TRUE), ylab='y', main=main)
})
legend('topright', col=s + 1, lty=1, legend=lab, cex=.8, bty='n')
}
FUN(dat[-6], xlim=c(-2, 10), ylim=c(-.01, .2))
Data:
dat <- structure(list(ERVALUEY = c(1.21178722715092, 8.4400515531338,
0.226004674926861, 3.89328347004421), ERVOLY = c(0.590757887612924,
7.48697754999463, 0.295973723450469, 3.31326615805655), ERQUALY = c(1.59367031426668,
4.57371901763411, 0.601172123904339, 3.89080479205755), ERMOMTY = c(3.09719686678745,
7.01446175391253, 0.260638252621096, 3.28326189430607), ERSIZEY = c(1.69935727981412,
6.1917295410928, 1.24021163316834, 6.23493767854042), Moment = structure(c("Mean",
"Standard Deviation", "Skewness", "Kurtosis"), .Dim = c(4L, 1L
))), row.names = c(NA, -4L), class = "data.frame")
Use the PearsonDS package, the pearson0 family creates "normal" distributions matching specified moments, but other options are available.

plot(var()) displays two different plots, how do I merge them into one? Also having two y axis

> dput(head(inputData))
structure(list(Date = c("2018:07:00", "2018:06:00", "2018:05:00",
"2018:04:00", "2018:03:00", "2018:02:00"), IIP = c(125.8, 127.5,
129.7, 122.6, 140.3, 127.4), CPI = c(139.8, 138.5, 137.8, 137.1,
136.5, 136.4), `Term Spread` = c(1.580025, 1.89438, 2.020112,
1.899074, 1.470544, 1.776862), RealMoney = c(142713.9916, 140728.6495,
140032.2762, 139845.5215, 139816.4682, 139625.865), NSE50 = c(10991.15682,
10742.97381, 10664.44773, 10472.93333, 10232.61842, 10533.10526
), CallMoneyRate = c(6.161175, 6.10112, 5.912088, 5.902226, 5.949956,
5.925538), STCreditSpread = c(-0.4977, -0.3619, 0.4923, 0.1592,
0.3819, -0.1363)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
I want to make my autoregressive plot like this plot:
#------> importing all libraries
library(readr)
install.packages("lubridtae")
library("lubridate")
install.packages("forecast")
library('ggplot2')
library('fpp')
library('forecast')
library('tseries')
#--------->reading data
inputData <- read_csv("C:/Users/sanat/Downloads/exercise_1.csv")
#--------->calculating the lag=1 for NSE50
diff_NSE50<-(diff(inputData$NSE50, lag = 1, differences = 1)/lag(inputData$NSE50))
diff_RealM2<-(diff(inputData$RealMoney, lag = 1, differences = 1)/lag(inputData$RealMoney))
plot.ts(diff_NSE50)
#--------->
lm_fit = dynlm(IIP ~ CallMoneyRate + STCreditSpread + diff_NSE50 + diff_RealM2, data = inputData)
summary(lm_fit)
#--------->
inputData_ts = ts(inputData, frequency = 12, start = 2012)
#--------->area of my doubt is here
VAR_data <- window(ts.union(ts(inputData$IIP), ts(inputData$CallMoneyRate)))
VAR_est <- VAR(y = VAR_data, p = 12)
plot(VAR_est)
I want to my plots to get plotted together in same plot. How do I serparate the var() plots to two separate ones.
Current plot:
My dataset :
dataset
Okay, so this still needs some work, but it should set the right framework for you. I would look more into working with the ggplot2 for future.
Few extra packages needed, namely library(vars) and library(dynlm).
Starting from,
VAR_est <- VAR(y = VAR_data, p = 12)
Now we extract the values we want from the VAR_est object.
y <- as.numeric(VAR_est$y[,1])
z <- as.numeric(VAR_est$y[,2])
x <- 1:length(y)
## second data set on a very different scale
par(mar = c(5, 4, 4, 4) + 0.3) # Leave space for z axis
plot(x, y, type = "l") # first plot
par(new = TRUE)
plot(x, z, type = "l", axes = FALSE, bty = "n", xlab = "", ylab = "")
axis(side=4, at = pretty(range(z)))
mtext("z", side=4, line=3)
I will leave you to add the dotted lines on etc...
Hint: Decompose the VAR_est object, for example, VAR_est$datamat, then see which bit of data corresponds to the part of the plot you want.
Used some of this

R Bass Diffusion Using nls() with predictors

How do researchers include additional predictors/independent variables when estimating the bass curve? I've read that Bass diffusion does not accept additional predictors, but yet I see papers claiming that they do. With that said, is this possible with the nls() function in R? I have created some toy data and an example without the extra predictor, advert.
TIA
library(data.table)
options(scipen=999)
rm(list=ls())
df <- data.table(
year = seq(1979, 1988, by=1),
T = 1:10,
sales = c(840, 1470, 2110, 4000, 7590, 10950, 10530, 9470, 7790, 5890),
advert = c(100,100,100,100,100,75,75,50,50,25))
df[, sales_cumulative := cumsum(sales)]
est_bass <- nls(sales ~ M*(((P+Q)^2/P)*exp(-(P+Q)*T))/(1+(Q/P)*exp(-(P+Q)*T))^2,
# add P and Q below
start=c(list(M = 60630, P = 0.03, Q = 0.38)),
data = df)
coef(est_bass)
# P, Q, M parameters
m = coef(est_bass)[1]
p = coef(est_bass)[2]
q = coef(est_bass)[3]
Tdelt <- (1:100)/10
ngete <- exp(-(p+q)*Tdelt)
# plot pdf
par(mfrow=c(1,2))
Bpdf <- m*((p+q)^2/p)*ngete/(1+(q/p)*ngete)^2
plot(Tdelt, Bpdf, xlab = "Year from 1979", ylab = "Sales per year", type = "l")
points(df$T, df$sales) # compare to original
# plot cdf
Bcdf <- m*(1-ngete)/(1+(q/p)*ngete)
plot(Tdelt, Bcdf, xlab = "Year from 1979", ylab = "Cumulative sales", type = "l")
points(df$T, df$sales_cumulative)

Riverplot package in R - output plot covered in gridlines or outlines

I've made a Sankey diagram in R Riverplot (v0.5), the output looks OK small in RStudio, but when exported or zoomed in it the colours have dark outlines or gridlines.
I think it may be because the outlines of the shapes are not matching the transparency I want to use for the fill?
I possibly need to find a way to get rid of outlines altogether (rather than make them semi-transparent), as I think they're also the reason why flows with a value of zero still show up as thin lines.
my code is here:
#loading packages
library(readr)
library("riverplot", lib.loc="C:/Program Files/R/R-3.3.2/library")
library(RColorBrewer)
#loaing data
Cambs_flows <- read_csv("~/RProjects/Cambs_flows4.csv")
#defining the edges
edges = rep(Cambs_flows, col.names = c("N1","N2","Value"))
edges <- data.frame(edges)
edges$ID <- 1:25
#defining the nodes
nodes <- data.frame(ID = c("Cambridge","S Cambs","Rest of E","Rest of UK","Abroad","to Cambridge","to S Cambs","to Rest of E","to Rest of UK","to Abroad"))
nodes$x = c(1,1,1,1,1,2,2,2,2,2)
nodes$y = c(1,2,3,4,5,1,2,3,4,5)
#picking colours
palette = paste0(brewer.pal(5, "Set1"), "90")
#plot styles
styles = lapply(nodes$y, function(n) {
list(col = palette[n], lty = 0, textcol = "black")
})
#matching nodes to names
names(styles) = nodes$ID
#defining the river
r <- makeRiver( nodes, edges,
node_labels = c("Cambridge","S Cambs","Rest of E","Rest of UK","Abroad","to Cambridge","to S Cambs","to Rest of E","to Rest of UK","to Abroad"),
node_styles = styles)
#Plotting
plot( r, plot_area = 0.9)
And my data is here
dput(Cambs_flows)
structure(list(N1 = c("Cambridge", "Cambridge", "Cambridge",
"Cambridge", "Cambridge", "S Cambs", "S Cambs", "S Cambs", "S Cambs",
"S Cambs", "Rest of E", "Rest of E", "Rest of E", "Rest of E",
"Rest of E", "Rest of UK", "Rest of UK", "Rest of UK", "Rest of UK",
"Rest of UK", "Abroad", "Abroad", "Abroad", "Abroad", "Abroad"
), N2 = c("to Cambridge", "to S Cambs", "to Rest of E", "to Rest of UK",
"to Abroad", "to Cambridge", "to S Cambs", "to Rest of E", "to Rest of UK",
"to Abroad", "to Cambridge", "to S Cambs", "to Rest of E", "to Rest of UK",
"to Abroad", "to Cambridge", "to S Cambs", "to Rest of E", "to Rest of UK",
"to Abroad", "to Cambridge", "to S Cambs", "to Rest of E", "to Rest of UK",
"to Abroad"), Value = c(0L, 1616L, 2779L, 13500L, 5670L, 2593L,
0L, 2975L, 4742L, 1641L, 2555L, 3433L, 0L, 0L, 0L, 6981L, 3802L,
0L, 0L, 0L, 5670L, 1641L, 0L, 0L, 0L)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -25L), .Names = c("N1", "N2",
"Value"), spec = structure(list(cols = structure(list(N1 = structure(list(), class = c("collector_character",
"collector")), N2 = structure(list(), class = c("collector_character",
"collector")), Value = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("N1", "N2", "Value")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
The culprit is a line in riverplot::curveseg. We can hack this function to fix it, or there is also a very simple workaround that does not require hacking the function. In fact, the simple solution is probably preferably in many cases, but first I explain how to hack the function, so we understand why the workaround also works. Scroll to the end of this answer if you only want the simple solution:
UPDATE: The change suggested below has now been implemented in riverplot version 0.6
To edit the function, you can use
trace(curveseg, edit=T)
Then find the line near the end of the function that reads
polygon(c(xx[i], xx[i + 1], xx[i + 1], xx[i]), c(yy[i],
yy[i + 1], yy[i + 1] + w, yy[i] + w), col = grad[i],
border = grad[i])
We can see here that the package authors chose not to pass the lty parameter to polygon (UPDATE: see this answer for an explanation of why the package author did it this way). Change this line by adding lty = 0 (or, if you prefer, border = NA) and it works as intended for OPs case. (But note that this may not work well if you wish to render a pdf - see here)
polygon(c(xx[i], xx[i + 1], xx[i + 1], xx[i]), c(yy[i],
yy[i + 1], yy[i + 1] + w, yy[i] + w), col = grad[i],
border = grad[i], lty=0)
As a side note, this also explains the somewhat odd reported behaviour in the comments that "if you run it twice, the second time the plot looks OK, although export it and the lines come back". When lty is not specified in a call to polygon, the default value it uses is lty = par("lty"). Initially, the default par("lty") is a solid line, but after running the riverplot function once, par("lty") gets set to 0 during a call to riverplot:::draw.nodes thus, suppressing the lines when riverplot is run a 2nd time. But if you then try to export the image, opening a new device resets par("lty") to its default value.
An alternative way to update the function with this edit is to use assignInNamespace to overwrite the package function with your own version. Like this:
curveseg.new = function (x0, x1, y0, y1, width = 1, nsteps = 50, col = "#ffcc0066",
grad = NULL, lty = 1, form = c("sin", "line"))
{
w <- width
if (!is.null(grad)) {
grad <- colorRampPaletteAlpha(grad)(nsteps)
}
else {
grad <- rep(col, nsteps)
}
form <- match.arg(form, c("sin", "line"))
if (form == "sin") {
xx <- seq(-pi/2, pi/2, length.out = nsteps)
yy <- y0 + (y1 - y0) * (sin(xx) + 1)/2
xx <- seq(x0, x1, length.out = nsteps)
}
if (form == "line") {
xx <- seq(x0, x1, length.out = nsteps)
yy <- seq(y0, y1, length.out = nsteps)
}
for (i in 1:(nsteps - 1)) {
polygon(c(xx[i], xx[i + 1], xx[i + 1], xx[i]),
c(yy[i], yy[i + 1], yy[i + 1] + w, yy[i] + w),
col = grad[i], border = grad[i], lty=0)
lines(c(xx[i], xx[i + 1]), c(yy[i], yy[i + 1]), lty = lty)
lines(c(xx[i], xx[i + 1]), c(yy[i] + w, yy[i + 1] + w), lty = lty)
}
}
assignInNamespace('curveseg', curveseg.new, 'riverplot', pos = -1, envir = as.environment(pos))
Now for the simple solution, which does not require changes to the function:
Just add the line par(lty=0) before you plot!!!
Here is the author of the package. I am now struggling for a satisfactory solution to be included in the next version of the package.
The problem is with how R renders PDFs as compared to bitmaps. In the original version of the package, indeed I passed on lty=0 to polygon() (you can still see it in the commented source code). However, polygon w/o borders looks good only on the png graphics. In the pdf output, thin white lines appear between the polygons. Take a look:
cc <- "#E41A1C90"
plot.new()
rect(0.2, 0.2, 0.4, 0.4, col=cc, border=NA)
rect(0.4, 0.2, 0.6, 0.4, col=cc, border=NA)
dev.copy2pdf(file="riverplot.pdf")
In X or on png, the output is correct. However, if rendered as PDF, you will see a thin white line between the recangles:
When you render a riverplot graphics as PDF like the one above, this looks really bad:
I therefore forced adding borders, however forgot about checking transparency. When no transparency is used, this looks OK -- the borders overlap with the polygons as well as which each other, but you cannot see it. The PDF is now acceptable. However, it messes up the figure if you have transparency.
EDIT:
I have now uploaded version 0.6 of riverplot to CRAN. Besides some new stuff (you can now add riverplot to any part of an existing drawing), by default it uses lty=0 again. However, there is now an option called "fix.pdf" which you can set to TRUE in order to draw the borders around the segments again.
Bottom line, and solutions for now:
Use riverplot 0.6`
If you want to render a PDF, don't use transparency and use fix.pdf=TRUE
If you want to use both transparency and PDF, help me solving the issue.

Resources