add clusters and nodes from SOMbrero package to training data - r

I am playing a bit with the SOMbrero package. I would like to attach the cluster numbers created like so (taken from here):
my.sc <- superClass(iris.som, k=3)
and X and Y coordinates of the SOM nodes to the training dataset.
In some code where I use the kohonen package, I create clusters like this:
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
ind <- sapply(SubsetData, is.numeric)
SubsetData[ind] <- lapply(SubsetData[ind], range01)
TrainingMatrix <- as.matrix(SubsetData)
GridDefinition <- somgrid(xdim = 4, ydim = 4, topo = "rectangular", toroidal = FALSE)
SomModel <- som(
data = TrainingMatrix,
grid = GridDefinition,
rlen = 10000,
alpha = c(0.05, 0.01),
keep.data = TRUE
)
nb <- table(SomModel$unit.classif)
groups = 5
tree.hc = cutree(hclust(d=dist(SomModel$codes[[1]]),method="ward.D2",members=nb),groups)
plot(SomModel, type="codes", bgcol=rainbow(groups)[tree.hc])
add.cluster.boundaries(SomModel, tree.hc)
result <- OrginalData
result$Cluster <- tree.hc[SomModel$unit.classif]
result$X <- SomModel$grid$pts[SomModel$unit.classif,"x"]
result$Y <- SomModel$grid$pts[SomModel$unit.classif,"y"]
write.table(result, file = "FinalData.csv", sep = ",", col.names = NA, quote = FALSE)
PS:
Some example code using the iris dataset can be found here.
PPS:
I played a bit with the iris code quoted above and think I have managed to extract the clusters, node ids and prototypes (see code below). What is still missing are the X and Y coordinates. I think they are in here:
iris.som$parameters$the.grid$coord
Code:
library(SOMbrero)
set.seed(100)
setwd("D:\\RProjects\Clustering")
#iris.som <- trainSOM(x.data=iris[,1:4],dimension=c(10,10), maxit=100000, scaling="unitvar", radius.type="gaussian")
iris.som <- trainSOM(x.data=iris[,1:4],dimension=c(3,3), maxit=100000, scaling="unitvar", radius.type="gaussian")
# perform a hierarchical clustering
## with 3 super clusters
iris.sc <- superClass(iris.som, k=3)
summary(iris.sc)
# compute the projection quality indicators
quality(iris.som)
iris1 <- iris
iris1$Cluster = iris.sc$cluster[iris.sc$som$clustering]
iris1$Node = iris.sc$som$clustering
iris1$Pt1Sepal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,1]
iris1$Pt2Sepal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,2]
iris1$Pt3Petal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,3]
iris1$Pt4Petal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,4]
write.table(iris1, file = "Iris.csv", sep = ",", col.names = NA, quote = FALSE)

I think I have figured it out using the iris example (please correct/improve code! - I am not fluent in R):
library(SOMbrero)
set.seed(100)
setwd("D:\\RProjects\\SomBreroClustering")
iris.som <- trainSOM(x.data=iris[,1:4],dimension=c(5,5), maxit=10000, scaling="unitvar", radius.type="letremy")
# perform a hierarchical clustering
# with 3 super clusters
iris.sc <- superClass(iris.som, k=3)
summary(iris.sc)
# compute the projection quality indicators
quality(iris.som)
iris1 <- iris
iris1$Cluster = iris.sc$cluster[iris.sc$som$clustering]
iris1$Node = iris.sc$som$clustering
iris1$Pt1Sepal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,1]
iris1$Pt2Sepal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,2]
iris1$Pt3Petal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,3]
iris1$Pt4Petal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,4]
iris1$X = iris.som$parameters$the.grid$coord[iris.sc$som$clustering,1]
iris1$Y = iris.som$parameters$the.grid$coord[iris.sc$som$clustering,2]
write.table(iris1, file = "Iris.csv", sep = ",", col.names = NA, quote = FALSE)

I am not sure that I got it right but:
iris.som$parameters$the.grid$coord contains the coordinates of the SOM nodes (it is a two-column array with the x and y coordinates in the mapping space),
so I think that what you want to do is
out.grid <- as.data.frame(iris.som$parameters$the.grid$coord)
out.grid$sc <- iris.sc$cluster
and export out.grid (a three-column array). iris.sc$som$prototypes contains the coordinates of the prototypes of the clusters, but in the original space (the four-dimensional space in which the iris dataset takes its values).
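If the goal is instead to carry these node-level values onto each row of the training data (as in the asker's code above), a minimal sketch, assuming the iris.som / iris.sc objects from the iris example:
node <- iris.sc$som$clustering                 # node id for each observation
coords <- iris.som$parameters$the.grid$coord   # one row of x/y per node
iris1 <- iris
iris1$Node <- node
iris1$Cluster <- iris.sc$cluster[node]         # super-cluster for each observation
iris1$X <- coords[node, 1]
iris1$Y <- coords[node, 2]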

I think my answer captures the requirements: adding the node ids, x and y coordinates, cluster and prototypes to the original data. Would you agree?
yes :)

Related

Using "lapply" in R to create multiple raster files from folder with lidar data

How can I read all the files in a folder, run a script on each, and create separate outputs whose names contain the original file name? I have a folder with .las files and I need to create corresponding .asc files from them. My script is below:
library(lidR)
# Path to data
LASfile <- ("path/1234.las")
# Sorting out points in point cloud data, keeping vegetation and ground point classes.
las <- readLAS(LASfile, filter="-keep_class 1 2") # Keep high vegetation and ground point classes
# Normalizing ground points to 0 elevation (idwinterpolation), instead of meters above sea level.
dtm <- grid_terrain(las, algorithm = knnidw(k = 8, p = 2))
las_normalized <- normalize_height(las, dtm)
# Create a filter to remove points above 95th percentile of height
lasfilternoise = function(las, sensitivity)
{
p95 <- grid_metrics(las, ~quantile(Z, probs = 0.95), 10)
las <- merge_spatial(las, p95, "p95")
las <- filter_poi(las, Z < p95*sensitivity)
las$p95 <- NULL
return(las)
}
# Generating a pit-free canopy height model without null values (Khosravipour et al., 2014)
las_denoised <- lasfilternoise(las_normalized, sensitivity = 1.2)
chm <- grid_canopy(las_denoised, 0.32, pitfree(c(0,2,5,10,15), c(3,1.5), subcircle = 0.2))
# Applying a median filter, 5x5 moving window to smooth the image and remove noise
ker <- matrix(1,3,3)
chms <- raster::focal(chm, w = ker, fun = median)
plot(chms)
library(raster)
# Writing output file
writeRaster(chms, filename="path/1234.asc", format="ascii", overwrite=TRUE) # Change to the relevant path for each run
citation("lidR")
I tried using lapply but I don't know how to use it in the right way.
It must be something like this to read all the files in the folder: list.files("path", pattern = "*.las", full.names = TRUE)
and something like this to write the output files: lapply(r, writeRaster, filename = paste0(f, ".asc"), format = "ascii")
But I cannot get it right.
An example of my LAZ to LAS+Index conversion:
convertLAZ <- function(lazfile, outdir = "") {
if(!dir.exists({{outdir}})) { dir.create({{outdir}}, recursive = TRUE)}
print(lazfile)
las <- lidR::readLAS(files = {{lazfile}}, filter = "-keep_class 2 9")
.file <- stringi::stri_replace_all_regex({{lazfile}}, "^.*/", "")
lidR::writeLAS(las, file = paste0({{outdir}}, "/", stringi::stri_replace_all_fixed(.file, "laz", "las")), index = TRUE)
}
f <- list.files("data/laz", pattern = "*.laz", full.names = TRUE)
lapply(f, convertLAZ, outdir = "data/las22")
You can expand it to rasterization, normalization, etc., and saving as .asc, as sketched below. But I would encourage you to have a look at https://r-lidar.github.io/lidRbook/engine.html. In short: process your LAZ/LAS files as a LAScatalog, and then tile the result raster and save to .asc.
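For the "save to .asc" part specifically, a minimal, untested sketch of the catalog route (the extension field of the SpatRaster driver is an assumption based on the engine documentation; paths are placeholders):
library(lidR)
ctg <- readLAScatalog("data/las")
opt_filter(ctg) <- "-keep_class 1 2"
opt_chunk_buffer(ctg) <- 30
# one output file per chunk, written by the catalog engine
opt_output_files(ctg) <- "data/dtm_asc/dtm_{XLEFT}_{YBOTTOM}"
# assumption: switching the driver's extension makes the engine write ESRI ASCII tiles
ctg@output_options$drivers$SpatRaster$extension <- ".asc"
dtm <- rasterize_terrain(ctg, res = 0.25, algorithm = tin())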
And here is an example of how to use parallel processing (3+1 processes in the example below). Please note that it can be memory hungry, so be careful with the number of workers and processing parameters like opt_chunk_buffer.
library(future)
options(parallelly.availableCores.methods = "mc.cores")
options(mc.cores = 3)
plan(multisession)
parallelly::availableWorkers()
library(lidR)
myPath <- "data/las"
ctg <- readLAScatalog(myPath)
crs(ctg) <- "EPSG:2180"
ctg@output_options$drivers$SpatRaster$param$overwrite <- TRUE
opt_output_files(ctg) <- "data/dtm2/barycz__{XLEFT}_{YBOTTOM}"
opt_chunk_size(ctg) <- 500
opt_chunk_buffer(ctg) <- 600
opt_filter(ctg) <- "-keep_class 2 9"
summary(ctg)
vr <- rasterize_terrain(ctg, 0.25, tin())
plot(vr)
Solved it now
.libPaths( c( "C:/Users/Public/R/win-library/4.2" , .libPaths() ) )
library(lidR)
createASCI <- function(lasfile, outdir = "") {
if(!dir.exists({{outdir}})) { dir.create({{outdir}}, recursive = TRUE)}
print(lasfile)
las <- lidR::readLAS(files = {{lasfile}}, filter = "-keep_class 1 2 3 4 5")
.file <- stringi::stri_replace_all_regex({{lasfile}}, "^.*/", "")
# Normalizing ground points to 0 elevation (idwinterpolation), instead of meters above sea level.
dtm <- grid_terrain(las, algorithm = knnidw(k = 8, p = 2))
las_normalized <- normalize_height(las, dtm)
# Create a filter to remove points above 95th percentile of height
lasfilternoise = function(las, sensitivity)
{
p95 <- grid_metrics(las, ~quantile(Z, probs = 0.95), 10)
las <- merge_spatial(las, p95, "p95")
las <- filter_poi(las, Z < p95*sensitivity)
las$p95 <- NULL
return(las)
}
# Generating a pit-free canopy height model without null values (Khosravipour et al., 2014)
las_denoised <- lasfilternoise(las_normalized, sensitivity = 1.2)
chm <- grid_canopy(las_denoised, 0.32, pitfree(c(0,2,5,10,15), c(3,1.5), subcircle = 0.2))
# Applying a median filter, 5x5 moving window to smooth the image and remove noise
ker <- matrix(1,3,3)
chms <- raster::focal(chm, w = ker, fun = median)
writeRaster(chms, filename = paste0({{outdir}}, "/", stringi::stri_replace_all_fixed(.file, ".las", ".asc")), format = "ascii")
}
f <- list.files("C:/Lasdata", pattern = "*.las", full.names = TRUE)
lapply(f, createASCI, outdir = "C:/Lasdata/nytt")

Aligning Multiple Files in R by Pairwise Alignment

I have 15 protein sequences in FASTA format in one file. I have to pairwise align them globally and locally, then generate a 15x15 distance score matrix to construct a dendrogram.
But when I do, a sequence is not aligned against itself and I get an NA result. Moreover, AxB gives a score of 12131 but BxA gives NA, so R cannot construct the phylogenetic tree.
What should I do?
I'm using this script for the loop, but it reads one way only:
for (i in 1:150) {
globalpwa<-pairwiseAlignment(toString(ProtDF[D[1,i],2])
,toString(ProtDF[D[2,i],2]),
substitutionMatrix = "BLOSUM62",
gapOpening = 0,
gapExtension = -2,
scoreOnly=FALSE,
type="global")
ScoreX[i]<-c(globalpwa@score)
nameSeq1[i]<-c(as.character(ProtDF[D[1,i],1]))
nameSeq2[i]<-c(as.character(ProtDF[D[2,i],1]))
}
I used an example FASTA file: protein sequences of RPS29 in fungi.
ProtDF <-
c(OQS54945.1 = "MINDLKVRKDVEKSKAHCHVKPFGKGSRACERCASHRGHNRKYGMNLCRRCLHTNAKILGFTSFD",
XP_031008245.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY80688.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVKHR",
TVY57447.1 = "LPFLKIRVEPARRDNLKPAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCVDAMGTLEPRASSPEL",
TVY47820.1 = "EPARRDNLKTTIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVK",
TVY37154.1 = "AISKLNSRPQRPISTTPRKAKHTKSLVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKHR",
TVY29458.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY14147.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGWIGTLEL",
`sp|Q6CPG3.1|RS29_KLULA` = "MAHENVWYSHPRKFGKGSRQCRISGSHSGLIRKYGLNIDRQSFREKANDIGFYKYR",
`sp|Q8SS73.1|RS29_ENCCU` = "MSFEPSGPHSHRKPFGKGSRSCVSCYTFRGIIRKLMMCRRCFREYAGDIGFAIYD",
`sp|O74329.3|RS29_SCHPO` = "MAHENVWFSHPRKYGKGSRQCAHTGRRLGLIRKYGLNISRQSFREYANDIGFVKYR",
TPX23066.1 = "MTHESVFYSRPRNYGKGSRQCRVCAHKAGLIRKYGLLVCRQCFREKSQDIGFVKYR",
`sp|Q6FWE3.1|RS29_CANGA` = "MAHENVWFSHPRRFGKGSRQCRVCSSHTGLIRKYDLNICRQCFRERASDIGFNKYR",
`sp|Q6BY86.1|RS29_DEBHA` = "MAHESVWFSHPRNFGKGSRQCRVCSSHSGLIRKYDLNICRQCFRERASDIGFNKFR",
XP_028490553.1 = "MSHESVWNSRPRSYGKGSRSCRVCKHSAGLIRKYDLNLCRQCFREKAKDIGFNKFR"
)
So you got it right to use combn. As you said, you need a distance score matrix for the dendrogram, so it is better to store the values in a matrix. See below: I simply named the matrix rows and columns after the names of the FASTA sequences and slotted in the pairwise values.
library(Biostrings)
# you can also read in your file
# like ProtDF = readAAStringSet("fasta")
ProtDF=AAStringSet(ProtDF)
# combination like you want
# here we just use the names
D = combn(names(ProtDF),2)
#make the pairwise matrix
mat = matrix(NA,ncol=length(ProtDF),nrow=length(ProtDF))
rownames(mat) = names(ProtDF)
colnames(mat) = names(ProtDF)
# loop through D
for(idx in 1:ncol(D)){
i <- D[1,idx]
j <- D[2,idx]
globalpwa<-pairwiseAlignment(ProtDF[i],
ProtDF[j],
substitutionMatrix = "BLOSUM62",
gapOpening = 0,
gapExtension = -2,
scoreOnly=FALSE,
type="global")
mat[i,j]<-globalpwa@score
mat[j,i]<-globalpwa@score
}
# if you need to make diagonal zero
diag(mat) <- 0
# plot
plot(hclust(as.dist(mat)))
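One caveat worth keeping in mind: pairwiseAlignment scores are similarities (higher means more alike), not distances, so for a distance-based hclust you may want to convert them first. A simple sketch (one option among several):
d <- max(mat) - mat   # turn similarities into non-negative dissimilarities
diag(d) <- 0
plot(hclust(as.dist(d)))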
An alternate method, if you're interested, using the same example as above:
library(DECIPHER)
ProtDF <- c(OQS54945.1 = "MINDLKVRKDVEKSKAHCHVKPFGKGSRACERCASHRGHNRKYGMNLCRRCLHTNAKILGFTSFD",
XP_031008245.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY80688.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVKHR",
TVY57447.1 = "LPFLKIRVEPARRDNLKPAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCVDAMGTLEPRASSPEL",
TVY47820.1 = "EPARRDNLKTTIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVK",
TVY37154.1 = "AISKLNSRPQRPISTTPRKAKHTKSLVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKHR",
TVY29458.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY14147.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGWIGTLEL",
`sp|Q6CPG3.1|RS29_KLULA` = "MAHENVWYSHPRKFGKGSRQCRISGSHSGLIRKYGLNIDRQSFREKANDIGFYKYR",
`sp|Q8SS73.1|RS29_ENCCU` = "MSFEPSGPHSHRKPFGKGSRSCVSCYTFRGIIRKLMMCRRCFREYAGDIGFAIYD",
`sp|O74329.3|RS29_SCHPO` = "MAHENVWFSHPRKYGKGSRQCAHTGRRLGLIRKYGLNISRQSFREYANDIGFVKYR",
TPX23066.1 = "MTHESVFYSRPRNYGKGSRQCRVCAHKAGLIRKYGLLVCRQCFREKSQDIGFVKYR",
`sp|Q6FWE3.1|RS29_CANGA` = "MAHENVWFSHPRRFGKGSRQCRVCSSHTGLIRKYDLNICRQCFRERASDIGFNKYR",
`sp|Q6BY86.1|RS29_DEBHA` = "MAHESVWFSHPRNFGKGSRQCRVCSSHSGLIRKYDLNICRQCFRERASDIGFNKFR",
XP_028490553.1 = "MSHESVWNSRPRSYGKGSRSCRVCKHSAGLIRKYDLNLCRQCFREKAKDIGFNKFR")
# All pairwise alignments:
# Convert characters to an AA String Set
ProtDF <- AAStringSet(ProtDF)
# Initialize some outputs
AliMat <- matrix(data = list(),
ncol = length(ProtDF),
nrow = length(ProtDF))
DistMat <- matrix(data = 0,
ncol = length(ProtDF),
nrow = length(ProtDF))
# loop through the upper triangle of your matrix
for (m1 in seq_len(length(ProtDF) - 1L)) {
for (m2 in (m1 + 1L):length(ProtDF)) {
# Align each pair
AliMat[[m1, m2]] <- AlignSeqs(myXStringSet = ProtDF[c(m1, m2)],
verbose = FALSE)
# Assign a distance to each alignment, fill both triangles of the matrix
DistMat[m1, m2] <- DistMat[m2, m1] <- DistanceMatrix(myXStringSet = AliMat[[m1, m2]],
type = "dist", # return a single value
includeTerminalGaps = TRUE, # return a global distance
verbose = FALSE)
}
}
dimnames(DistMat) <- list(names(ProtDF),
names(ProtDF))
Dend01 <- IdClusters(myDistMatrix = DistMat,
method = "NJ",
type = "dendrogram",
showPlot = FALSE,
verbose = FALSE)
# A single multiple alignment:
AllAli <- AlignSeqs(myXStringSet = ProtDF,
verbose = FALSE)
AllDist <- DistanceMatrix(myXStringSet = AllAli,
type = "matrix",
verbose = FALSE,
includeTerminalGaps = TRUE)
Dend02 <- IdClusters(myDistMatrix = AllDist,
method = "NJ",
type = "dendrogram",
showPlot = FALSE,
verbose = FALSE)
Dend01, from all the pairwise alignments:
Dend02, from a single multiple alignment:
Finally, DECIPHER has a function for loading up your alignment in your browser just to look at it, which, if your alignments are huge, can be a bit of a mistake, but in this case (and in cases up to a few hundred short sequences) is just fine:
BrowseSeqs(AllAli)
A side note about BrowseSeqs: for some reason it doesn't behave well with Safari, but it plays just fine with Chrome. Sequences are displayed in the order in which they exist in the aligned string set.
EDIT: BrowseSeqs DOES play fine with Safari directly, but it has a weird issue when incorporated into HTML documents knitted together with R Markdown. Weird, but not applicable here.
Another big aside: all of the functions I've used have a processors argument, which is set to 1 by default. If you'd like to get greedy with your cores, you can set it to NULL and it will just grab everything available. If you're aligning very large string sets this can be pretty useful; for trivially small things like this example, not so much.
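For example (a one-line sketch; processors = NULL asks DECIPHER to detect and use all available cores):
AllAli <- AlignSeqs(myXStringSet = ProtDF, processors = NULL, verbose = FALSE)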
combn is a great function and I use it all the time. However, for these really simple set-ups I like looping through the upper triangle, but that's just a personal preference.

Create longitudinal data from a list of igraph objects in R

I'm doing analysis on company networks in R and am trying to export my igraph results into a dataframe.
Here's a reproducible example:
library(igraph)
sample <- data.frame(ID = 1:8, org_ID = c(5,4,1,2,2,2,5,7), mon = c("199801", "199802","199802","199802","199904","199912","200001", "200012"))
create.graphs <- function(df){
g <- graph.data.frame(d = df, directed = TRUE)
g <- simplify(g, remove.multiple = FALSE, remove.loops = TRUE)
E(g)$weight <- count_multiple(g)
#calculate global values
g$centrality <- centralization.degree(g)
#calculate local values
g$indegree <- degree(g, mode = "in",
loops = FALSE, normalized = FALSE)
return(g)
}
df.list <- split(sample, sample$mon)
g <- lapply(df.list, create.graphs)
As you can see, I have graphs for multiple months. I want to export this to longitudinal data, where each row represents a month (per ID) and each column represents the corresponding network measures.
So far I've managed to create a data frame, but I don't know how to run it through the list of graphs and put it into a fitting format. An additional problem could be that the graphs have different numbers of nodes (some have around 25, others more than 40), but that should theoretically just be recognised as missing by my regression model.
output <- data.frame(Centrality = g$`199801`$centrality,
Indegree = g$`199801`$indegree)
output
summary(output)
I tried writing a function similar to the one above for this, but unfortunately to no avail.
Thanks in advance for reading this, any help is greatly appreciated
I wanted to share how I solved it (thanks to Dave2e's suggestion).
Note that ci$monat defines my time periods in the original data, so one row for each point in time.
sumarTable <- data.frame(time = unique(ci$monat))
sumarTable$indegree <- lapply(g, function(x){x$indegree})
sumarTable$outdegree <- lapply(g, function(x){x$outdegree})
sumarTable$constraint <- lapply(g, function(x){x$constraint})
etc
edit:
in order to export these values, I had to "flatten" the lists:
sumarTable$indegree <- vapply(sumarTable$indegree, paste, collapse = ", ", character(1L))
sumarTable$outdegree <- vapply(sumarTable$outdegree, paste, collapse = ", ", character(1L))
sumarTable$constraint <- vapply(sumarTable$constraint, paste, collapse = ", ", character(1L))
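If a fully long format (one row per vertex and month) is preferred over list columns, a sketch along these lines should also work, assuming each graph in g carries the per-vertex attributes computed in create.graphs (only indegree is shown here):
# degree() returns a named vector, so the vertex IDs can be recovered from its names
long <- do.call(rbind, lapply(names(g), function(mon) {
  data.frame(time = mon,
             ID = names(g[[mon]]$indegree),
             indegree = g[[mon]]$indegree,
             stringsAsFactors = FALSE,
             row.names = NULL)
}))
head(long)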

How to create new polygons by simplifying from two SpatialPolygonsDataFrame objects in R?

Say I have two sets of shapefiles that cover the same region and often, but not always, share borders, e.g. US counties and PUMAs. I'd like to define a new scale of polygon that uses both PUMAs and counties as atomic building blocks, i.e. neither can ever be split, but I'd still like as many units as possible. Here is a toy example:
library(sp)
# make fake data
# 1) counties:
Cty <- SpatialPolygons(list(
Polygons(list(Polygon(cbind(x=c(0,2,2,1,0,0),y=c(0,0,2,2,1,0)), hole=FALSE)),"county1"),
Polygons(list(Polygon(cbind(x=c(2,4,4,3,3,2,2),y=c(0,0,2,2,1,1,0)),hole=FALSE)),"county2"),
Polygons(list(Polygon(cbind(x=c(4,5,5,4,4),y=c(0,0,3,2,0)),hole=FALSE)),"county3"),
Polygons(list(Polygon(cbind(x=c(0,1,2,2,0,0),y=c(1,2,2,3,3,1)),hole=FALSE)),"county4"),
Polygons(list(Polygon(cbind(x=c(2,3,3,4,4,3,3,2,2),y=c(1,1,2,2,3,3,4,4,1)),hole=FALSE)),"county5"),
Polygons(list(Polygon(cbind(x=c(0,2,2,1,0,0),y=c(3,3,4,5,5,3)),hole=FALSE)),"county6"),
Polygons(list(Polygon(cbind(x=c(1,2,3,4,1),y=c(5,4,4,5,5)),hole=FALSE)),"county7"),
Polygons(list(Polygon(cbind(x=c(3,4,4,5,5,4,3,3),y=c(3,3,2,3,5,5,4,3)),hole=FALSE)),"county8")
))
counties <- SpatialPolygonsDataFrame(Cty, data = data.frame(ID=paste0("county",1:8),
row.names=paste0("county",1:8),
stringsAsFactors=FALSE)
)
# 2) PUMAs:
Pum <- SpatialPolygons(list(
Polygons(list(Polygon(cbind(x=c(0,4,4,3,3,2,2,1,0,0),y=c(0,0,2,2,1,1,2,2,1,0)), hole=FALSE)),"puma1"),
Polygons(list(Polygon(cbind(x=c(4,5,5,4,3,3,4,4),y=c(0,0,5,5,4,3,3,0)),hole=FALSE)),"puma2"),
Polygons(list(Polygon(cbind(x=c(0,1,2,2,3,3,2,0,0),y=c(1,2,2,1,1,2,3,3,1)),hole=FALSE)),"puma3"),
Polygons(list(Polygon(cbind(x=c(2,3,4,4,3,3,2,2),y=c(3,2,2,3,3,4,4,3)),hole=FALSE)),"puma4"),
Polygons(list(Polygon(cbind(x=c(0,1,1,3,4,0,0),y=c(3,3,4,4,5,5,3)),hole=FALSE)),"puma5"),
Polygons(list(Polygon(cbind(x=c(1,2,2,1,1),y=c(3,3,4,4,3)),hole=FALSE)),"puma6")
))
Pumas <- SpatialPolygonsDataFrame(Pum, data = data.frame(ID=paste0("puma",1:6),
row.names=paste0("puma",1:6),
stringsAsFactors=FALSE)
)
# desired result:
Cclust <- SpatialPolygons(list(
Polygons(list(Polygon(cbind(x=c(0,4,4,3,3,2,2,1,0,0),y=c(0,0,2,2,1,1,2,2,1,0)), hole=FALSE)),"ctyclust1"),
Polygons(list(Polygon(cbind(x=c(4,5,5,4,3,3,4,4),y=c(0,0,5,5,4,3,3,0)),hole=FALSE)),"ctyclust2"),
Polygons(list(Polygon(cbind(x=c(0,1,2,2,3,3,4,4,3,3,2,2,0,0),y=c(1,2,2,1,1,2,2,3,3,4,4,3,3,1)),hole=FALSE)),"ctyclust3"),
Polygons(list(Polygon(cbind(x=c(0,2,2,3,4,0,0),y=c(3,3,4,4,5,5,3)),hole=FALSE)),"ctyclust4")
))
CtyClusters <- SpatialPolygonsDataFrame(Cclust, data = data.frame(ID = paste0("ctyclust", 1:4),
row.names = paste0("ctyclust", 1:4),
stringsAsFactors=FALSE)
)
# take a look
par(mfrow = c(1, 2))
plot(counties, border = gray(.3), lwd = 4)
plot(Pumas, add = TRUE, border = "#EEBB00", lty = 2, lwd = 2)
legend(-.5, -.3, lty = c(1, 2), lwd = c(4, 2), col = c(gray(.3), "#EEBB00"),
legend = c("county line", "puma line"), xpd = TRUE, bty = "n")
text(coordinates(counties), counties@data$ID, col = gray(.3))
text(coordinates(Pumas), Pumas@data$ID, col = "#EEBB00", cex=1.5)
title("building blocks")
#desired result:
plot(CtyClusters)
title("desired result")
text(-.5, -.5, "maximum units possible,\nwithout breaking either PUMAs or counties",
xpd = TRUE, pos = 4)
I've naively tried many of the g* functions in the rgeos package to achieve this and have not made headway. Does anyone know of a nice function or awesome trick for this task? Thank you!
[I'm also open to suggestions on a better title]
Here's a relatively succinct solution which:
Uses rgeos::gRelate() to identify Pumas that overlap but don't completely encompass/cover each county. To understand what it does, run example(gRelate) and see this Wikipedia page. (h.t. to Tim Riffe)
Uses RBGL::connectedComp() to identify groups of Pumas that should thus be merged. (For pointers on installing the RBGL package, see my answer to this SO question.)
Uses rgeos::gUnionCascaded() to merge the indicated Pumas.
library(rgeos)
library(RBGL)
## Identify groups of Pumas that should be merged
x <- gRelate(counties, Pumas, byid=TRUE)
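## keep DE-9IM patterns of the form "2.2......": the two interiors intersect in
## an area and one interior also extends outside the other geometry, i.e. the
## pair overlaps without one completely covering the other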
x <- matrix(grepl("2.2......", x), ncol=ncol(x), dimnames=dimnames(x))
k <- x %*% t(x)
l <- connectedComp(as(k, "graphNEL"))
## Extend gUnionCascaded so that each SpatialPolygon gets its own ID.
gMerge <- function(ii) {
x <- gUnionCascaded(Pumas[ii,])
spChFIDs(x, paste(ii, collapse="_"))
}
## Merge Pumas as needed
res <- do.call(rbind, sapply(l, gMerge))
plot(res)
I think you could do this with a smart set of tests for containment. This gets two of your parts, the simple paired case where puma1 contains county1 and county2, and puma2 contains county8 and county3.
library(rgeos)
## pumas by counties
pbyc <- gContains(Pumas, counties, byid = TRUE)
## row / col pairs of where contains test is TRUE for Pumas
pbyc.pairs <- cbind(row(pbyc)[pbyc], col(pbyc)[pbyc])
par(mfrow = c(nrow(pbyc.pairs), 1))
for (i in 1:nrow(pbyc.pairs)) {
plot(counties, col = "white")
plot(gUnion(counties[pbyc.pairs[i,1], ], Pumas[pbyc.pairs[i,2], ]), col = "red", add = TRUE)
}
The plotting there is dumbly redundant, but I think it shows a start. You need to find which contains tests accumulate the most smaller parts, and then union those. Sorry I haven't put the effort in to finish but I think this would work.
After much trial and error, I've come up with the following inelegant solution, rather in keeping with the tip by @mdsummer, but adding more checks and removing redundant merged polygons. Yikes. If someone can take what I've done and make it cleaner, I'll accept that answer rather than this one, which I'd like to avoid if at all possible:
library(rgeos)
pbyc <- gCovers(Pumas, counties, byid = TRUE) |
gContains(Pumas, counties, byid = TRUE) |
gOverlaps(Pumas, counties, byid = TRUE) |
t(gCovers(counties, Pumas, byid = TRUE) |
gContains(counties, Pumas, byid = TRUE) |
gOverlaps(counties, Pumas, byid = TRUE))
## row / col pairs of where test is TRUE for Pumas or counties
pbyc.pairs <- cbind(row(pbyc)[pbyc], col(pbyc)[pbyc])
Potentials <- apply(pbyc.pairs, 1, function(x,counties,Pumas){
gUnion(counties[x[1], ], Pumas[x[2], ])
}, counties = counties, Pumas= Pumas)
for (i in 1:length(Potentials)){
Potentials[[i]]@polygons[[1]]@ID <- paste0("p",i)
}
Potentials <- do.call("rbind",Potentials)
# remove redundant polygons:
Redundants <- gEquals(Potentials, byid = TRUE)
Redundants <- row(Redundants)[Redundants & lower.tri(Redundants)]
Potentials <- Potentials[-c(Redundants),]
# for each Potential summary polygon, see which counties and Pumas are contained:
keep.i <- vector(length=length(Potentials))
for (i in 1:length(Potentials)){
ctyblocki <- gUnionCascaded(counties[c(gCovers(Potentials[i, ], counties, byid = TRUE)), ])
Pumablocki <- gUnionCascaded(Pumas[c(gCovers(Potentials[i, ], Pumas, byid = TRUE)), ])
keep.i[i] <- gEquals(ctyblocki, Potentials[i, ]) & gEquals(Pumablocki, Potentials[i, ])
}
# what do we have left?
NewUnits <- Potentials[keep.i, ]
plot(NewUnits)

Organizing data from physics experiments for ggplot2

I am currently trying to use ggplot2 to visualize results from simple current-voltage experiments. I managed to achieve good results for one set of data, of course.
However, I have a number of current-voltage datasets, which I read into R recursively to get the following organisation (see minimal code):
data.frame(cbind(batch(string list), sample(string list), dataset(data.frame list)))
Edit: My data are stored in text files named batchname_samplenumber.txt, with voltage and current columns. The code I use to import them is:
require(plyr)
require(ggplot2)
#VARIABLES
regex <- "([[:alnum:]_]+).([[:alpha:]]+)"
regex2 <- "G5_([[:alnum:]]+)_([[:alnum:]]+).([[:alpha:]]+)"
#FUNCTIONS
getJ <- function(list, k) llply(list, function(i) llply(i, function(i, indix) getElement(i,indix), indix = k))
#FILES
files <- list.files("Data/",full.names= T)
#NAMES FOR FILES
paths <- llply(llply(files, basename),function(i) regmatches(i,regexec(regex,i)))
paths2 <- llply(llply(files, basename),function(i) regmatches(i,regexec(regex2,i)))
names <- llply(llply(getJ(paths, 2)),unlist)
batches <- llply(llply(getJ(paths2, 2)),unlist)
samples <- llply(llply(getJ(paths2, 3)),unlist)
#SETS OF DATA, NAMED
sets <- llply(files,function(i) read.table(i,skip = 0, header = F))
names(sets) <- names
for (i in as.list(names)) names(sets[[i]]) <- c("voltage","current")
df<-data.frame(cbind(batches,samples,sets))
And a minimal data can be generated via :
require(plyr)
batch <- list("A","A","B","B")
sample <- list(1,2,1,2)
set <- list(data.frame(voltage = runif(10), current = runif(10)),data.frame(voltage = runif(10), current = runif(10)),data.frame(voltage = runif(10), current = runif(10)),data.frame(voltage = runif(10), current = runif(10)))
df<-data.frame(cbind(batch,sample,set))
My question is: is it possible to use the data as-is to plot using code similar to the following (which does not work)?
ggplot(data, aes(x = dataset$current, y = dataset$voltage, colour = sample)) + facet_wrap(~batch)
The more general version would be: is ggplot2 capable of handling raw physical data, as opposed to discrete statistical datasets (like diamonds or cars)?
With the newly-defined problem (two-column files named "batchname_samplenumber.txt"), I would suggest the following strategy:
read_custom <- function(f, ...) {
d <- read.table(f, ...)
names(d) <- c("V", "I")
## extract sample and batch from the base filename
ids <- strsplit(gsub(".txt", "", f), "_")
d$batch <- ids[[1]][1]
d$sample <- ids[[1]][2]
d
}
## list files to read
files <- list.files(pattern=".txt")
## read them all in a single data.frame
m <- ldply(files, read_custom)
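With everything in one long data.frame, the plot the asker was after becomes a one-liner (a sketch using the V/I column names assigned in read_custom above):
library(ggplot2)
ggplot(m, aes(x = I, y = V, colour = sample)) +
  geom_path() +
  facet_wrap(~ batch)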
It's not clear how the sample names are defined with respect to the dataset. The general idea for ggplot2 is that you should group all your data in the form of a melted (long format) data.frame.
library(ggplot2)
library(plyr)
library(reshape2)
l1 <- list(batch="b1", sample=paste("s", 1:4, sep=""),
dataset=data.frame(current=rnorm(10*4), voltage=rnorm(10*4)))
l2 <- list(batch="b2", sample=paste("s", 1:4, sep=""),
dataset=data.frame(current=rnorm(10*4), voltage=rnorm(10*4)))
l3 <- list(batch="b3", sample=paste("s", 1:4, sep=""),
dataset=data.frame(current=rnorm(10*4), voltage=rnorm(10*4)))
list_to_df <- function(l, n=10){
m <- l[["dataset"]]
m$batch <- l[["batch"]]
m$sample <- rep(l[["sample"]], each=n)
m
}
## list_to_df(l1)
m <- ldply(list(l1, l2, l3), list_to_df)
ggplot(m) + facet_wrap(~batch)+
geom_path(aes(current, voltage, colour=sample))
