Link to data as txt file here
I'm having trouble with this PCA. PC1 results appear binary, and I can't figure out why as none of my variables are binary.
df = bees
pca_dat_condition <- bees %>% ungroup() %>%
    select(Length.1:Length.25, OBJECTID, Local, Elevation, Longitude,
           Latitude, Cubital.Index) %>%
    na.omit()

pca_dat_first <- pca_dat_condition %>% # remove the final non-numerical information
    select(-Local, -OBJECTID, -Elevation, -Longitude, -Latitude)

pca <- pca_dat_first %>%
    scale() %>%
    prcomp()

# add identifying information back into the PCA data
pca_data <- data.frame(pca$x,
                       Local = pca_dat_condition$Local,
                       ID = pca_dat_condition$OBJECTID,
                       elevation = pca_dat_condition$Elevation,
                       Longitude = pca_dat_condition$Longitude,
                       Latitude = pca_dat_condition$Latitude)

ggplot(pca_data, aes(x = PC1, y = PC2, color = Latitude)) +
    geom_point() +
    ggtitle("PC1 vs PC2: All Individuals") +
    scale_colour_gradient(low = "blue", high = "red")
I'm not getting any error messages with the code, and when I look at the data frame nothing looks out of place. Should I be using a different function for the PCA? Any insight into why my graph may look like this?
Previously, I did the same PCA but for the average values for each Local (whereas this is each individual), and it came out as a normal PCA with no clear clustering. I don't understand why this problem would arise when looking at individual points. It's possible I merged some other data frames in a wonky way, but the structure of the dataset seems completely normal.
This is how the PCA looks.
The short answer: three offending rows contain extreme outliers, and they dominate PC1, splitting the scores into two clumps. This also explains why the per-Local averages looked fine: averaging washes the outliers out. summary() and log-scaled histograms of the scaled variables make the problem easy to see:
bees <- read.csv(paste0("https://gist.githubusercontent.com/AkselA/",
                        "08a4e78a6a29a918ed597e9a32adc228/raw/",
                        "6d0005fad4cb91830bcf7087176283b18683e9cd/bees.csv"),
                 header=TRUE)
# bees <- bees[bees[,1] < 10,] # This will remove the three offending rows
bees <- na.omit(bees)
bees.cond <- bees[, grep("Length|OBJ|Loc|Ele|Lon|Lat|Cubi", colnames(bees))]
bees.first <- bees[, grep("Length|Cubi", colnames(bees))]

summary(bees.first)

# log-count histograms of each scaled variable
par(mfrow=c(7, 4), mar=rep(1, 4))
q <- lapply(1:ncol(bees.first), function(x) {
    h <- hist(scale(bees.first[, x]), plot=FALSE)
    h$counts <- log1p(h$counts)
    plot(h, main="", axes=FALSE, ann=FALSE)
    legend("topright", legend=names(bees.first[x]),
           bty="n", cex=0.8, adj=c(0, -2), xpd=NA)
})

bees.pca <- prcomp(bees.first, scale.=TRUE)
biplot(bees.pca)
Before removing the outliers:
After:
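If you would rather locate the offending rows programmatically than hard-code a threshold on the first column, here is a minimal sketch (the 6-SD cutoff is an arbitrary choice of mine, not something taken from the data):
z <- scale(bees.first)
# rows where any variable sits more than 6 SDs from its mean
which(apply(abs(z) > 6, 1, any))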
I can only get the Q-Q plots one by one with different datasets.
library(fitdistrplus)

x1 <- c(1300, 541, 441, 35, 278, 167, 276, 159, 126, 60.8, 160,
        5000, 264.6, 379, 170, 251.3, 155.84, 187.01, 850)
x2 <- c(25, 500, 42, 100, 10, 8.2, 76, 2.2, 7.86, 50)
y1 <- log10(x1)
y2 <- log10(x2)

x1.logis <- fitdist(y1, "logis", method="mle")
x2.logis <- fitdist(y2, "logis", method="mle")

ppcomp(x1.logis, addlegend=FALSE)
ppcomp(x2.logis, addlegend=FALSE)
How can I place the two Q-Q plots in the same coordinate system?
Use ggplot2. You need to extract the fitted values from each fitdist object and put them in a new data frame. Then use ggplot2 layers to add the fitted values from the two data sets, and finish with an abline.
library(ggplot2)

fittedx1 <- data.frame(x = sort(plogis(x1.logis$data,
                                       location = x1.logis$estimate[1],
                                       scale = x1.logis$estimate[2])),
                       p = (1:length(x1.logis$data))/length(x1.logis$data))

fittedx2 <- data.frame(x = sort(plogis(x2.logis$data,
                                       location = x2.logis$estimate[1],
                                       scale = x2.logis$estimate[2])),
                       p = (1:length(x2.logis$data))/length(x2.logis$data))

# combine the two data sets
fitted <- rbind(fittedx1, fittedx2)

# add a variable that identifies which data set the values belong to;
# you can then use the col aesthetic to give each data set its own colour
fitted$set <- c(rep("1", nrow(fittedx1)), rep("2", nrow(fittedx2)))

# now plot
ggplot(fitted) +
    geom_point(aes(p, x, col=set), shape=1, size=3) +
    geom_abline(intercept=0, slope=1)
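Strictly speaking, ppcomp() draws a P-P plot (fitted vs. empirical probabilities), which is what the code above reproduces. If you literally want Q-Q plots in one coordinate system, here is a sketch along the same lines, using ppoints() for the plotting positions (the object names qq1/qq2 are mine):
# theoretical logistic quantiles vs sorted (log-transformed) data
qq1 <- data.frame(theoretical = qlogis(ppoints(length(y1)),
                                       location = x1.logis$estimate[1],
                                       scale = x1.logis$estimate[2]),
                  observed = sort(y1), set = "1")
qq2 <- data.frame(theoretical = qlogis(ppoints(length(y2)),
                                       location = x2.logis$estimate[1],
                                       scale = x2.logis$estimate[2]),
                  observed = sort(y2), set = "2")
ggplot(rbind(qq1, qq2), aes(theoretical, observed, colour = set)) +
    geom_point(shape = 1, size = 3) +
    geom_abline(intercept = 0, slope = 1)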
I would like to create a map showing local spatial clusters of a phenomenon, preferably using Local Moran's I (LISA).
In the reproducible example below, I calculate the local Moran's I using spdep, but I would like to know if there is a simple way to map the clusters, preferably using ggplot2. Any help?
library(UScensus2000tract)
library(ggplot2)
library(spdep)

# load data
data("oregon.tract")

# plot Census Tract map
plot(oregon.tract)

# create Queens contiguity matrix
spatmatrix <- poly2nb(oregon.tract)

# calculate the local Moran's I of the distribution of the black population
lmoran <- localmoran(oregon.tract@data$black, nb2listw(spatmatrix))
Now to make this example more similar to my real dataset, I have some NA values in my shape file, which represent holes in the polygon, so these areas shouldn't be used in the calculation.
oregon.tract@data$black[3:5] <- NA
Here is a strategy:
library(UScensus2000tract)
library(spdep)
library(ggplot2)
library(dplyr)
# load data
data("oregon.tract")
# plot Census Tract map
plot(oregon.tract)
# create Queens contiguity matrix
spatmatrix <- poly2nb(oregon.tract)
# create a neighbours list with spatial weights
listw <- nb2listw(spatmatrix)
# calculate the local moran of the distribution of white population
lmoran <- localmoran(oregon.tract$white, listw)
summary(lmoran)
# standardize the variable and save it to a new column
oregon.tract$s_white <- scale(oregon.tract$white) %>% as.vector()
# create a spatially lagged variable and save it to a new column
oregon.tract$lag_s_white <- lag.listw(listw, oregon.tract$s_white)
# summary of variables, to inform the analysis
summary(oregon.tract$s_white)
summary(oregon.tract$lag_s_white)
# moran scatterplot, in basic graphics (with identification of influential observations)
x <- oregon.tract$s_white
y <- oregon.tract$lag_s_white %>% as.vector()
xx <- data.frame(x, y)
moran.plot(x, listw)
# Moran scatterplot, in ggplot
# (without identification of influential observations - which is possible but requires more effort)
ggplot(xx, aes(x, y)) +
    geom_point() +
    geom_smooth(method = 'lm', se = FALSE) +
    geom_hline(yintercept = 0, linetype = 'dashed') +
    geom_vline(xintercept = 0, linetype = 'dashed')
# create a new variable identifying the moran plot quadrant for each observation, dismissing the non-significant ones
oregon.tract$quad_sig <- NA

# high-high quadrant
oregon.tract[(oregon.tract$s_white >= 0 &
              oregon.tract$lag_s_white >= 0) &
             (lmoran[, 5] <= 0.05), "quad_sig"] <- "high-high"

# low-low quadrant
oregon.tract[(oregon.tract$s_white <= 0 &
              oregon.tract$lag_s_white <= 0) &
             (lmoran[, 5] <= 0.05), "quad_sig"] <- "low-low"

# high-low quadrant
oregon.tract[(oregon.tract$s_white >= 0 &
              oregon.tract$lag_s_white <= 0) &
             (lmoran[, 5] <= 0.05), "quad_sig"] <- "high-low"

# low-high quadrant
oregon.tract@data[(oregon.tract$s_white <= 0 &
                   oregon.tract$lag_s_white >= 0) &
                  (lmoran[, 5] <= 0.05), "quad_sig"] <- "low-high"

# non-significant observations (column 5 of the localmoran() output holds the p-values)
oregon.tract@data[(lmoran[, 5] > 0.05), "quad_sig"] <- "not signif."

oregon.tract$quad_sig <- as.factor(oregon.tract$quad_sig)
oregon.tract@data$id <- rownames(oregon.tract@data)
# plotting the map
df <- fortify(oregon.tract, region="id")
df <- left_join(df, oregon.tract@data, by = "id")

df %>%
    ggplot(aes(long, lat, group = group, fill = quad_sig)) +
    geom_polygon(color = "white", size = .05) +
    coord_equal() +
    theme_void() +
    scale_fill_brewer(palette = "Set1")
This answer was based on this page, suggested by Eli Knaap on Twitter, and also borrows from the answer by @timelyportfolio to this question.
I used the variable white instead of black because black gave less clear-cut results.
Concerning NAs, localmoran() includes the argument na.action, about which the documentation says:
na.action: a function (default na.fail), can also be na.omit or na.exclude - in these cases the weights list will be subsetted to remove NAs in the data. It may be necessary to set zero.policy to TRUE because this subsetting may create no-neighbour observations. Note that only weights lists created without using the glist argument to nb2listw may be subsetted. If na.pass is used, zero is substituted for NA values in calculating the spatial lag.
I tried:
oregon.tract@data$white[3:5] <- NA
lmoran <- localmoran(oregon.tract@data$white, listw, zero.policy = TRUE,
                     na.action = na.exclude)
But I ran into problems in lag.listw and did not have time to look into it. Sorry.
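A workaround I have not fully tested: drop the NA observations before building the weights, then run localmoran() on the complete cases only (this is essentially what the rgeoda answer below does with sf objects). A minimal sketch:
# subset to complete cases, rebuild the neighbours list, then run localmoran
oregon.sub <- oregon.tract[!is.na(oregon.tract@data$white), ]
listw.sub <- nb2listw(poly2nb(oregon.sub), zero.policy = TRUE)
lmoran.sub <- localmoran(oregon.sub@data$white, listw.sub, zero.policy = TRUE)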
This is obviously very late, but I came across the post while working on something similar. This uses the rgeoda package, which wasn't out when the question was posted, but it's developed by the GeoDa folks to port some of the functionality of that software to R. sf has also really taken off in the meantime, which makes manipulating spatial data very easy; the rgeoda functions generally expect sf objects.
Like another poster, I'm using the white population instead of Black because the clusters show up better. I converted the original data, with those few observations missing, to sf. rgeoda::local_moran doesn't work when there's missing data, but if you make a copy with the missing observations removed, you can run the analysis and join them back together by ID. Use a right join so you retain all the IDs from the original data, including missing values.
Because this mimics GeoDa, the same colors and labels are stored in the LISA object that local_moran returns. Extract those and use them as the color palette. Because the palette is named, and those names don't include "NA", you can add an NA value to the palette vector, or manually specify a color for NA values to make sure those shapes still get drawn. I made it green just so it would be visible (top left corner).
library(UScensus2000tract)
library(ggplot2)
library(dplyr)
library(sf)
library(rgeoda)

# load data
data("oregon.tract")
oregon.tract@data$white[3:5] <- NA

ore_sf <- st_as_sf(oregon.tract) %>%
    tibble::rownames_to_column("id")

to_clust <- ore_sf %>%
    filter(!is.na(white))

queen_wts <- queen_weights(to_clust)
moran <- local_moran(queen_wts, st_drop_geometry(to_clust["white"]))

moran_lbls <- lisa_labels(moran)
moran_colors <- setNames(lisa_colors(moran), moran_lbls)

ore_clustered <- to_clust %>%
    st_drop_geometry() %>%
    select(id) %>%
    mutate(cluster_num = lisa_clusters(moran) + 1, # add 1 because clusters are zero-indexed
           cluster = factor(moran_lbls[cluster_num], levels = moran_lbls)) %>%
    right_join(ore_sf, by = "id") %>%
    st_as_sf()

ggplot(ore_clustered, aes(fill = cluster)) +
    geom_sf(color = "white", size = 0) +
    scale_fill_manual(values = moran_colors, na.value = "green") +
    theme_dark()
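If you want a stricter (or looser) significance cutoff than the default labels use, the LISA object also exposes the raw statistics. A small sketch, assuming the default permutation settings:
# pseudo p-values and local Moran's I values behind the labels
pvals <- lisa_pvalues(moran)
ii <- lisa_values(moran)

# recompute the cluster assignments with a 0.01 cutoff instead of 0.05
stricter <- lisa_clusters(moran, cut_off = 0.01)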
I don't think this answer is worthy of a bounty, but perhaps it will get you closer to an answer. Since I don't know anything about localmoran, I just guessed at a fill.
library(UScensus2000tract)
library(ggplot2)
library(spdep)

# load data
data("oregon.tract")

# plot Census Tract map
plot(oregon.tract)

# create Queens contiguity matrix
spatmatrix <- poly2nb(oregon.tract)

# calculate the local Moran's I of the distribution of the black population
lmoran <- localmoran(oregon.tract@data$black, nb2listw(spatmatrix))

# get an id from the rownames in a data.frame
oregon.tract@data$id <- rownames(oregon.tract@data)
oregon.tract@data$lmoran_ii <- lmoran[, 1]

oregon_df <- merge(
    # convert to a data.frame
    fortify(oregon.tract, region="id"),
    oregon.tract@data,
    by="id"
)

ggplot(data=oregon_df, aes(x=long, y=lat, group=group)) +
    geom_polygon(fill=scales::col_numeric("Blues", domain=c(-1, 5))(oregon_df$lmoran_ii)) +
    geom_path(color="white")
I have data that describe several measurements taken from several individuals (each individual is represented by several measurements taken at several different time points).
I want to present the data as a scatter plot of measurements vs. individuals. Since for each individual I have several measurements, it means that I'll have a stack of points at each x-axis point.
Here's some example code to generate random data of this kind:
set.seed(1)
n.individuals <- 10
n.measurements <- 15
vars <- runif(n.individuals, 0.1, 1)
means <- runif(n.individuals, 1, 5)
negative.idx <- sample(n.individuals, n.individuals/2)
means[negative.idx] <- -1*means[negative.idx]
df <- data.frame(measurement=c(sapply(1:n.individuals, function(x) rnorm(n.measurements, means[x], sqrt(vars[x])))),
individual=c(sapply(1:n.individuals, function(x) rep(x, n.measurements))))
Here's how I'm presenting the data so far:
#add colors
cols <- rgb(runif(n.measurements),runif(n.measurements),runif(n.measurements))
df$col <- rep(cols, n.individuals)
#simple plot
plot(df$individual, df$measurement, col=df$col, lwd=2, xlab = "individual", ylab = "measurement")
abline(h=0,lty=2)
abline(v=seq(min(df$individual)-0.5, max(df$individual)+0.5, 1),lty=2)
I'm wondering if there's a more elegant way to present the data (perhaps a ggplot way?)
Note that the signal I'm looking for in the data (and this is how I generated them) is that the measurements for each individual are correlated with respect to their sign. If they were uncorrelated in sign, they would appear scattered on both sides of zero.
Firstly, I would jitter your individuals so that individual measurements do not overlap. Use this code:
plot(jitter(df$individual), df$measurement, col=df$col,
lwd=2, xlab = "individual", ylab = "measurement")
There are a million ways to plot it in ggplot. Here's a quick violin graph:
library(ggplot2)
p <- ggplot(df, aes(factor(individual), measurement))
p + geom_violin(aes(fill = factor(individual))) +
    geom_hline(aes(yintercept = 0)) +
    geom_jitter() +
    xlab("Individual")
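If you prefer points only, here is a minimal sketch of the ggplot analogue of the jittered base plot above (the jitter width is my choice):
ggplot(df, aes(factor(individual), measurement, colour = factor(individual))) +
    geom_jitter(width = 0.2, show.legend = FALSE) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    xlab("individual")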
My challenge is to plot several bar plots at once, one plot for each variable of different subsets. My goal is to compare regional differences for each variable. I would like to print all the resulting plots to an HTML file via R Markdown.
My main difficulty in making grouped bar charts automatically is that you need to tabulate the groups using table(data$Var[i], data$Region), but I don't know how to do this automatically. I would highly appreciate a hint on this.
Here is an example of what one of my subsets looks like:
# To Create this example of data:
b <- rep(matrix(c(1,2,3,2,1,3,1,1,1,1)), times=10)
data <- matrix(b, ncol=10)
colnames(data) <- paste("Var", 1:10, sep = "")
data <- as.data.frame(data)
reg_name <- c("North", "South")
Region <- rep(reg_name, 5)
data <- cbind(data,Region)
Using beside = TRUE, I was able to create one grouped bar plot (grouped by Region for Var1 from data):
tb <- table(data$Var1,data$Region)
barplot(tb, main="Var1", xlab="Values", legend=rownames(tb), beside=TRUE,
col=c("green", "darkblue", "red"))
I would like to loop this process to generate for example 10 plots for Var1 to Var10:
for(i in 1:10){
tb <- table(data[i], data$Region)
barplot(tb, main = i, xlab = "Values", legend = rownames(tb), beside = TRUE,
col=c("green", "darkblue", "red"))
}
R prefers the apply family of functions, so I tried to create a function to apply:
fct <- function(i) {
tb <- table(data[i], data$Region)
barplot(tb, main=i, xlab="Values", legend = rownames(tb), beside = TRUE,
col=c("green", "darkblue", "red"))
}
sapply(data, fct)
I have tried other ways, but I was never successful. Maybe lattice or ggplot2 would offer an easier way to do this. I am just starting in R, so I will gladly accept any tips and suggestions. Thank you!
(I run Windows, with the most recent R v3.1.2 "Pumpkin Helmet".)
Given that you say "My goal is to compare regional differences for each variable", I'm not sure you've chosen the optimal plotting strategy. But yes, it is possible to do what you are asking. (As an aside, sapply(data, fct) fails because sapply passes each column of data to fct, not a column index; inside fct, data[i] is then indexed by the column's values.)
Here's the default plot you get with your code above, for reference:
If you want a list with 10 plots for each variable, you can do the following (with ggplot)
many_plots <-
# for each column name in dat (except the last one)...
lapply(names(dat)[-ncol(dat)], function(x) {
this_dat <- dat[, c(x, 'Region')]
names(this_dat)[1] <- 'Var'
ggplot(this_dat, aes(x=Var, fill=factor(Var))) +
geom_bar(binwidth=1) + facet_grid(~Region) +
theme_classic()
})
Sample output, for many_plots[[1]]:
If you wanted all the plots in one image, you can do this (using reshape2 and data.table):
library(data.table)
library(reshape2)

dat2 <- data.table(melt(dat, id.var='Region'))[, .N, by=list(value, variable, Region)]

ggplot(dat2, aes(y=N, x=value, fill=factor(value))) +
    geom_bar(stat='identity') + facet_grid(variable~Region) +
    theme_classic()
...but that's not a great plot.
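For a closer ggplot analogue of the original beside=TRUE barplots, here is a sketch of a dodged version of the same summary, using dat2 from above (geom_col() is the newer spelling of geom_bar(stat='identity')):
ggplot(dat2, aes(x = Region, y = N, fill = factor(value))) +
    geom_col(position = "dodge") +
    facet_wrap(~ variable, ncol = 2) +
    theme_classic()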
Can anyone tell me how to create a plot featuring 3 different matrices of data? I have 3 matrices, each of dimension 1 x 1001, and I wish to plot all 3 on the same graph.
I have managed to plot one matrix, and to assemble the code that creates the other two matrices, but not to plot them. B[i,] is randomly generated data. What I would like to know is what code would get all 3 plots together on one graph.
Code for one matrix:
ntime <- 1000
average.price.at.each.timestep <- matrix(0, nrow=1, ncol=ntime+1)
for (i in 1:(ntime+1)) {
    average.price.at.each.timestep[i] <- mean(B[i,])
}
matplot(t, t(average.price.at.each.timestep), type="l", lty=1,
        main="MC Price of a Zero Coupon Bond", ylab="Price",
        xlab="Option Exercise Date")
Code for 3:
average.price.at.each.timestep <- matrix(0, nrow=1, ncol=ntime+1)
s.e.at.each.time <- matrix(0, nrow=1, ncol=ntime+1)
upper.c.l.at <- matrix(0, nrow=1, ncol=ntime+1)
lower.c.l.at <- matrix(0, nrow=1, ncol=ntime+1)

std <- function(x) sd(x)/sqrt(length(x))

for (i in 1:(ntime+1)) {
    average.price.at.each.timestep[i] <- mean(B[i,])
    s.e.at.each.time[i] <- std(B[i,])
    upper.c.l.at[i] <- average.price.at.each.timestep[i] + 1.96*s.e.at.each.time[i]
    lower.c.l.at[i] <- average.price.at.each.timestep[i] - 1.96*s.e.at.each.time[i]
}
I'm still struggling with this, as I cannot get the given solutions to match my data sets. I have now included the code below that generates the matrix B, as a working example so you can see the data I am dealing with. As you can see, it produces a plot of the different prices; what I would like is a plot of the average price with confidence intervals around that average.
# Define Bond Price Parameters
P <- 1                 # par value

# Define Vasicek Model Parameters
rev.rate <- 0.3        # speed of reversion
long.term.mean <- 0.1  # long-term level of the mean
sigma <- 0.05          # volatility
r0 <- 0.03             # spot interest rate
Strike <- 0.05

# Define Simulation Parameters
T <- 50                # time to expiry
ntime <- 1000          # number of timesteps
yearstep <- ntime/T    # steps per year
npaths <- 1000         # number of paths
dt <- T/ntime          # timestep

R <- matrix(0, nrow=ntime+1, ncol=npaths)  # matrix of Vasicek interest rate values
B <- matrix(0, nrow=ntime+1, ncol=npaths)  # matrix of bond prices
R[1,] <- r0  # all paths start at the specified spot rate
B[1,] <- P

# loop that fills matrix R with multiple paths of interest rates as they evolve over time,
# a stochastic process based on the standard normal distribution
for (j in 1:npaths) {
    for (i in 1:ntime) {
        dZ <- rnorm(1, mean=0, sd=1)*sqrt(dt)
        Rij <- R[i,j]
        Bij <- B[i,j]
        dr <- rev.rate*(long.term.mean - Rij)*dt + sigma*dZ
        R[i+1,j] <- Rij + dr
        B[i+1,j] <- Bij*exp(-R[i+1,j]*dt)
    }
}

t <- seq(0, T, dt)
par(mfcol=c(3,3))
matplot(t, B[, 1:pmin(20, npaths)], type="l", lty=1,
        main="Price of a Zero Coupon Bond", ylab="Price", xlab="Time to Expiry")
Your example isn't reproducible, so I created some fake data that I hope is structured similarly to yours. If this isn't what you were looking for, let me know and I'll update as needed.
# Fake data
ntime <- 100
mat1 <- matrix(rnorm(ntime+1, 10, 2), nrow=1, ncol=ntime+1)
mat2 <- matrix(rnorm(ntime+1, 20, 2), nrow=1, ncol=ntime+1)
mat3 <- matrix(rnorm(ntime+1, 30, 2), nrow=1, ncol=ntime+1)

matplot(1:(ntime+1), t(mat1), type="l", lty=1,
        ylim=c(0, max(c(mat1, mat2, mat3))),
        main="MC Price of a Zero Coupon Bond",
        ylab="Price", xlab="Option Exercise Date")

# Add lines for mat2 and mat3
lines(1:(ntime+1), mat2, col="red")
lines(1:(ntime+1), mat3, col="blue")
UPDATE: Is this what you're trying to do?
matplot(t, t(average.price.at.each.timestep), type="l", lty=1,
main="MC Price of a Zero Coupon Bond", ylab="Price",
xlab = "Option Exercise Date")
matlines(t, t(upper.c.l.at), lty=2, col="red")
matlines(t, t(lower.c.l.at), lty=2, col="green")
See plot below. If you have multiple columns that you want to plot (as in your updated example, where you plot 20 separate paths) and you want to add lower and upper CIs for all of them (though this would make the plot unreadable), just build matrices of upper and lower CI values corresponding to each path in average.price.at.each.timestep and use matlines to add them to your existing plot of the multiple paths.
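To make that concrete, a hypothetical sketch: upper.cis and lower.cis are assumed here to be (ntime+1) x 20 matrices, one column per plotted path; they are not computed anywhere in the question's code.
matplot(t, B[, 1:20], type="l", lty=1, ylab="Price", xlab="Time to Expiry")
matlines(t, upper.cis, lty=2, col="red")
matlines(t, lower.cis, lty=2, col="green")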
This is doable using ggplot2 and reshape2. The structures you have are a little awkward, which you could improve by using a data frame instead of a matrix.
#Dummy data
average.price.at.each.timestep <- rnorm(1000, sd=0.01)
s.e.at.each.time <- rnorm(1000, sd=0.0005, mean=1)
#CIs (note you can vectorise this):
upper.c.l.at <- average.price.at.each.timestep+1.96*s.e.at.each.time
lower.c.l.at <- average.price.at.each.timestep-1.96*s.e.at.each.time
# create a data frame:
prices <- data.frame(time = 1:length(average.price.at.each.timestep),
                     price = average.price.at.each.timestep,
                     upperCI = upper.c.l.at,
                     lowerCI = lower.c.l.at)
library(reshape2)
#turn the data frame into time, variable, value triplets
prices.t <- melt(prices, id.vars=c("time"))
#plot
library(ggplot2)
ggplot(prices.t, aes(time, value, colour=variable)) + geom_line()
This produces the following plot:
This can be improved somewhat by using geom_ribbon instead:
ggplot(prices, aes(time, price)) + geom_ribbon(aes(ymin=lowerCI, ymax=upperCI), alpha=0.1) + geom_line()
Which produces this plot:
Here's another, slightly different ggplot solution that does not require you to calculate the confidence limits first - ggplot does it for you.
# create sample dataset
set.seed(1)  # for reproducible example
B <- matrix(rnorm(1000, mean=rep(10 + 1:10/2, each=10)), nc=10)

library(ggplot2)
library(reshape2)  # for melt(...)

gg <- melt(data.frame(date=1:nrow(B), B), id="date")

# note: in ggplot2 >= 3.3 the fun.y argument is spelled fun
ggplot(gg, aes(x=date, y=value)) +
    stat_summary(fun.y = mean, geom="line") +
    stat_summary(fun.y = function(y) mean(y) - 1.96*sd(y)/sqrt(length(y)),
                 geom="line", linetype="dotted", color="blue") +
    stat_summary(fun.y = function(y) mean(y) + 1.96*sd(y)/sqrt(length(y)),
                 geom="line", linetype="dotted", color="blue") +
    theme_bw()
stat_summary(...) summarizes the y-values for a given value of x (the date). So the first call calculates the mean, the second the lower CL, and the third the upper CL.
You could also create a CL(...) function, and call that:
CL <- function(x, level=0.95, type=c("lower","upper")) {
    fact <- c(lower=-1, upper=1)
    mean(x) - fact[type]*qnorm((1-level)/2)*sd(x)/sqrt(length(x))
}

# extra arguments for the summary function go in fun.args (required in ggplot2 >= 2.0)
ggplot(gg, aes(x=date, y=value)) +
    stat_summary(fun.y = mean, geom="line") +
    stat_summary(fun.y = CL, fun.args = list(type="lower"),
                 geom="line", linetype="dotted", color="blue") +
    stat_summary(fun.y = CL, fun.args = list(type="upper"),
                 geom="line", linetype="dotted", color="blue") +
    theme_bw()
This produces a plot identical to the one above.