Related
I'm analysing real-estate sales for some N. American cities and am using k-means clustering on the data. I have seven clusters and for each observation in the cluster I have the latitude, longitude, zipcode, and cluster_id. I'd like to plot this on a map to better visualize the clusters - I'm not sure what such a plot is called - Choropleth? Polygon?
Most of the examples are using geoJSON files but I only have a data.frame object from my k-means clustering.
Actual data:
https://www.kaggle.com/threnjen/portland-housing-prices-sales-jul-2020-jul-2021
Sample data:
> dput(dt[runif(n = 10,min = 1,max = 25000)])
structure(list(id = c(23126L, 15434L, 5035L, 19573L, NA, 24486L,
NA, 14507L, 3533L, 20192L), zipcode = c(97224L, 97211L, 97221L,
97027L, NA, 97078L, NA, 97215L, 97124L, 97045L), latitude = c(45.40525436,
45.55965805, 45.4983139, 45.39398956, NA, 45.47454071, NA, 45.50736618,
45.52812958, 45.34381485), longitude = c(-122.7599182, -122.6500015,
-122.7288742, -122.591217, NA, -122.8898392, NA, -122.6084061,
-122.91745, -122.5948334), lastSoldPrice = c(469900L, 599000L,
2280000L, 555000L, NA, 370000L, NA, 605000L, 474900L, 300000L
), lotSize = c(5227L, 4791L, 64904L, 9147L, NA, 2178L, NA, 4356L,
2613L, 6969L), livingArea = c(1832L, 2935L, 5785L, 2812L, NA,
1667L, NA, 2862L, 1844L, 742L), cluster_id = c(7, 7, 2, 7, NA,
4, NA, 7, 7, 4)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7faa8000fee0>)
I've followed the example on https://gist.github.com/josecarlosgonz/8565908 to try and create a geoJSON file to be able to plot this data but without success.
I'm not using markers because I have ~25,000 observations - it would be difficult to plot them all and the file would take forever to load.
EDIT:
observations by zipcode:
> dput(dat[, .N, by = .(`address/zipcode`)][(order(`address/zipcode`))])
structure(list(`address/zipcode` = c(7123L, 97003L, 97004L, 97005L,
97006L, 97007L, 97008L, 97009L, 97015L, 97019L, 97023L, 97024L,
97027L, 97030L, 97034L, 97035L, 97038L, 97045L, 97056L, 97060L,
97062L, 97068L, 97070L, 97078L, 97080L, 97086L, 97089L, 97113L,
97123L, 97124L, 97132L, 97140L, 97201L, 97202L, 97203L, 97204L,
97205L, 97206L, 97209L, 97210L, 97211L, 97212L, 97213L, 97214L,
97215L, 97216L, 97217L, 97218L, 97219L, 97220L, 97221L, 97222L,
97223L, 97224L, 97225L, 97227L, 97229L, 97230L, 97231L, 97232L,
97233L, 97236L, 97239L, 97266L, 97267L), N = c(1L, 352L, 9L,
252L, 421L, 1077L, 357L, 1L, 31L, 2L, 4L, 159L, 239L, 525L, 640L,
548L, 1L, 1064L, 5L, 353L, 471L, 736L, 6L, 403L, 866L, 913L,
8L, 5L, 1113L, 776L, 3L, 543L, 219L, 684L, 463L, 1L, 57L, 809L,
189L, 216L, 688L, 510L, 504L, 330L, 318L, 177L, 734L, 195L, 832L,
305L, 276L, 589L, 688L, 716L, 286L, 83L, 1307L, 475L, 77L, 150L,
382L, 444L, 290L, 423L, 430L)), row.names = c(NA, -65L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7f904781a6e0>)
I used the kaggle data on a simple laptop (i3 8th gen) to generate a ggplot2 object, with cluster IDs randomly sampled and transform this via the ggplotly() function ... the resulting plotly object seems OK to work with for analysis but I do not know your performance requirements:
library(dplyr)
library(ggplot2)
library(plotly)
library(rnaturalearth) # here we get the basic map data from
# read in data from zip, select minimal number of columns and sample cluster_id
df <- readr::read_csv(unzip("path_to_zip/portland_housing.csv.zip"))%>%
dplyr::select(az = `address/zipcode`, latitude, longitude) %>%
dplyr::mutate(cluster_id = sample(1:7, n(), replace = TRUE))
# get the map data
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")
# build the ggplot2 object (note that I use rings as shapes and alpha parameter to reduce the over plotting
plt <- ggplot2::ggplot(data = world) +
ggplot2::geom_sf() +
ggplot2::geom_point(data = df, aes(x = longitude, y = latitude, color = factor(cluster_id)), size = 1, shape = 21, alpha = .7) +
ggplot2::coord_sf(xlim = c(-124.5, -122), ylim = c(45, 46), expand = FALSE)
# plot it:
plt
# plotly auto transform from ggplot2 object
plotly::ggplotly(plt)
EDIT
To include a map you can use for example the ggmap package instead of the map data from rnaturalearth... I will only display the plotly result:
library(ggmap)
# https://stackoverflow.com/questions/23130604/plot-coordinates-on-map
sbbox <- ggmap::make_bbox(lon = c(-124.5, -122), lat = c(45, 46), f = .1)
myarea <- ggmap::get_map(location=sbbox, zoom=10, maptype="terrain")
myarea <- ggmap::ggmap(myarea)
plt2 <- myarea +
ggplot2::geom_point(data = df, mapping = aes(x = longitude, y = latitude, color = factor(cluster_id)), shape = 21, alpha = .7)
plotly::ggplotly(plt2)
There are many other approaches concerning the map data, like using the mapbox-api
I have counties.rds and houses1990.rds as data, and put them in the variables California_shp and Houses:
California_shp <- readRDS("counties.rds")
California_shp <- raster::aggregate(California_shp, by = "NAME")
Houses <- readRDS("houses1990.rds")
The information of California_shp and Houses are as follow:
dput(head(California_shp))
structure(list(NAME = c("Alameda", "Alpine", "Amador", "Butte",
"Calaveras", "Colusa")), row.names = c(NA, 6L), class = "data.frame")
dput(head(Houses))
structure(list(houseValue = c(452600L, 358500L, 352100L, 341300L,
342200L, 269700L), income = c(8.3252, 8.3014, 7.2574, 5.6431,
3.8462, 4.0368), houseAge = c(41L, 21L, 52L, 52L, 52L, 52L),
rooms = c(880L, 7099L, 1467L, 1274L, 1627L, 919L), bedrooms = c(129L,
1106L, 190L, 235L, 280L, 213L), population = c(322L, 2401L,
496L, 558L, 565L, 413L), households = c(126L, 1138L, 177L,
219L, 259L, 193L), latitude = c(37.88, 37.86, 37.85, 37.85,
37.85, 37.85), longitude = c(-122.23, -122.22, -122.24, -122.25,
-122.25, -122.25)), row.names = c(NA, 6L), class = "data.frame")
The proj4string of California_shp is also as follows:
California_shp#proj4string
CRS arguments:
+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0
How by employing the proj4string of California_shp I can convert Houses to a SpatialPointsDataFrame with the same name?
This should work
library(sf)
library(mapview)
houses.sf <- Houses %>%
sf::st_as_sf(coords = c("longitude", "latitude"), crs = 4269)
mapview::mapview(houses.sf)
gives
I have a quite "messy data". I have a model with a interaction between two factors. And I want to plot it. So:
f1 <- structure(list(tipo = c("digitables", "digitables", "digitables",
"digitables", "digitables", "digitables", "digitables", "digitables",
"payments", "payments", "payments", "payments", "payments", "payments",
"payments", "payments", "traditionals", "traditionals", "traditionals",
"traditionals", "traditionals", "traditionals", "traditionals",
"traditionals"), categoria = c("Advice", "Digital banks", "Exchange",
"FinTech", "Insurance", "Investments", "Lending", "Payments and transfers",
"Advice", "Digital banks", "Exchange", "FinTech", "Insurance",
"Investments", "Lending", "Payments and transfers", "Advice",
"Digital banks", "Exchange", "FinTech", "Insurance", "Investments",
"Lending", "Payments and transfers"), Total = c(63L, 450L, 279L,
63L, 36L, 108L, 567L, 549L, 63L, 450L, 279L, 63L, 36L, 108L,
567L, 549L, 35L, 250L, 155L, 35L, 20L, 60L, 315L, 305L), Frequencia = c(44L,
266L, 118L, 9L, 14L, 45L, 134L, 242L, 33L, 68L, 2L, 10L, 3L,
8L, 11L, 78L, 27L, 226L, 142L, 10L, 20L, 45L, 300L, 245L), Perc = c(69.84,
59.11, 42.29, 14.29, 38.89, 41.67, 23.63, 44.08, 52.38, 15.11,
0.72, 15.87, 8.33, 7.41, 1.94, 14.21, 77.14, 90.4, 91.61, 28.57,
100, 75, 95.24, 80.33), Failure = c(19L, 184L, 161L, 54L, 22L,
63L, 433L, 307L, 30L, 382L, 277L, 53L, 33L, 100L, 556L, 471L,
8L, 24L, 13L, 25L, 0L, 15L, 15L, 60L)), row.names = c(NA, -24L
), class = "data.frame")
# Packages
library(dplyr)
library(ggplot2)
library(emmeans) #version 1.4.8. or 1.5.1
# Works as expected
m1 <- glm(cbind(Frequencia, Failure) ~ tipo*categoria,
data = f1, family = binomial(link = "logit"))
l1 <- emmeans(m1, ~categoria|tipo)
plot(l1, type = "response",
comparison = T,
by = "categoria")
Using by="tipo" results:
# Doesn't work:
plot(l1, type = "response",
comparison = T,
by = "tipo")
Error: Aborted -- Some comparison arrows have negative length!
In addition: Warning message:
Comparison discrepancy in group digitables, Advice - Insurance:
Target overlap = -0.0241, overlap on graph = 0.0073
If I use comparison = F as suggested by explanation supplement vignette, it works. However, it does not show me the arrows, which are very important.
Q1 - Is there a work around for it? (Or is it impossible due to my data?)
As we can see from the last plot, there is a category with probability = 1 (categoria=Insurance and tipo=traditionals). So, I delete only this row of my data frame, and I try to redo the plotting, and results to me:
f1 <- f1 %>%
filter(!Perc ==100)
m1 <- glm(cbind(Frequencia, Failure) ~ tipo*categoria,
data = f1, family = binomial(link = "logit"))
l1 <- emmeans(m1, ~categoria|tipo)
plot(l1, type = "response",
comparison = T,
by = "categoria")
Error in if (dif[i] > 0) lmat[i, id1[i]] = rmat[i, id2[i]] = wgt * v1[i] else rmat[i, :
missing value where TRUE/FALSE needed
Q2 - How to plot my results even when I have a missing level of one variable (with respect to another variable?). I would expect that the Insurance facet would have only have the payments and digitables levels (while the others remain the same).
First, please don't ever re-use the same variable names for more than one thing; that makes things not reproducible. If you modify a dataset, or a model, or whatever, give it a new name so it can be distinguished.
Q1
As documented, comparison arrows cannot always be computed. This is such an example. I suggest displaying the results some other way, e.g. using pwpp() or pwpm()
Q2
There was a bug in handling missing cases. This has been fixed in the GitHub version:
f2 <- f1 %>%
filter(!Perc ==100)
m2 <- glm(cbind(Frequencia, Failure) ~ tipo*categoria,
data = f2, family = binomial(link = "logit"))
l2 <- emmeans(m2, ~categoria|tipo)
plot(l2, type = "response",
comparison = TRUE,
by = "categoria")
plot(l2, type = "response",
comparison = TRUE,
by = "tipo")
## Error: Aborted -- Some comparison arrows have negative length!
## (in group "payments")
Simple example with R on mydata:
l=structure(list(dat = structure(1:9, .Label = c("01.01.2016",
"02.01.2016", "03.01.2016", "04.01.2016", "05.01.2016", "06.01.2016",
"07.01.2016", "08.01.2016", "09.01.2016"), class = "factor"),
lpt = c(94L, 3L, 30L, 92L, 20L, 80L, 20L, 190L, 52L)), .Names = c("dat",
"lpt"), class = "data.frame", row.names = c(NA, -9L))
l=ts(l)
spectrum(l)
R returned plot with periodogram
On this periodogram we can see bursts of values (two bursts 0.23, 0.45).
How should the values of the x-axis and the у-axis be reduced to a dataframe, but only for those values on the x axis that are bursts?
Second question:
Can these values be displayed not in frequencies, but in absolute, original units(dat,lpt)?
I am very very new to R and stats in general, and am having trouble adding multiple confidence ellipses to a PCA plot.
My interest is in highlighting potential groupings/clusters in the PCA plot with 95% confidence ellipses. I have tried using the dataEllipse function in R, however I cannot figure out how to add multiple ellipses with different centers to the PCA plot (the centers would be at various points that appear to contain a cluster, in this case lithic sources and lithic tools likely made from that source).
Thanks for any help with this!
{
lithic_final <- LITHIC.DATASHEET.FOR.R.COMPLETE.FORMAT
lithic_final
pca1 <- princomp(lithic_final); pca1
lithic_source <- c("A1", "A1", "A1", "A1", "A2","A2", "A2", "A3","A3","A3","B","B","B","B","B","B","C","C","C","C","C","C","C","D","D","D","D","D","D","D","D","E","E","E","E","E","E","E","E","F","F","G","G","G","G","H","H","H","H","H","H","H","I1","I1","I1","I2","I2","I2","I2","I2","J1","J1","J2","J2","J2","J2","J2","J2","J2","J2","J2","K","K","K","K","K","K","K","L","L","L","L","L","L","L","L","L","L","L","L","L","L","BB1","BB1","BB1","FC","FC","FC","JRPP","JRPP","JRPP","BB2","BB2","BB2","BB2","MWP","MWP","MWP","MWP","RPO","RPO","RPO")
lithic_source
summary(pca1)
plot(pca1)
#Plotting the scores with the Lithic Source Info
round(pca1$scores[,1:2], 2)
pca_scores <-round(pca1$scores[,1:2], 2)
plot(pca1$scores[,1], pca1$scores[,2], type="n")
text(pca1$scores[,1], pca1$scores[,2],labels=abbreviate(lithic_source, minlength=3), cex=.45)
#Plotting PCA Scores of EACH SAMPLE for PCA 2 and 3 with Lithic Source Info
round(pca1$scores[,2:3], 2)
pca2_3_scores <-round(pca1$scores[,2:3], 2)
plot(pca1$scores[,2], pca1$scores[,3], type="n")
text(pca1$scores[,2], pca1$scores[,3], labels=abbreviate(lithic_source, minlength=3), cex=.45)
#Plotting PCA Scores of EACH SAMPLE for PCA 3 and 4 with Lithic Source Info
round(pca1$scores[,3:4], 2)
pca3_4_scores <-round(pca1$scores[,3:4], 2)
plot(pca1$scores[,3], pca1$scores[,4], type="n")
text(pca1$scores[,3], pca1$scores[,4], labels=abbreviate(lithic_source, minlength=3), cex=.45)
#Plotting PCA Scores of EACH SAMPLE for PCA 1 and 3 with Lithic Source Info
round(pca1$scores[,1:3], 2)
pca1_3_scores <-round(pca1$scores[,1:3], 2)
plot(pca1$scores[,1], pca1$scores[,3], type="n")
text(pca1$scores[,1], pca1$scores[,3], labels=abbreviate(lithic_source, minlength=3), cex=.45)
#Plotting PCA Scores of EACH SAMPLE for PCA 1 and 4 with Lithic Source Info
round(pca1$scores[,1:4], 2)
pca1_4_scores <-round(pca1$scores[,1:4], 2)
plot(pca1$scores[,1], pca1$scores[,4], type="n")
text(pca1$scores[,1], pca1$scores[,4], labels=abbreviate(lithic_source, minlength=3), cex=.45)
#TRYING TO GET ELLIPSES ADDED TO PCA 1 and 4 scores
dataEllipse(pca1$scores[,1], pca1$scores[,4],centers=12,add=TRUE,levels=0.9, plot.points=FALSE)
structure(list(Ca.K12 = c(418L, 392L, 341L, 251L, 297L, 238L,
258L, 5L, 2L, 37L), Cr.K12 = c(1L, 12L, 15L, 6L, 9L, 6L, 35L,
7L, 45L, 32L), Cu.K12 = c(89L, 96L, 81L, 63L, 88L, 103L, 104L,
118L, 121L, 90L), Fe.K12 = c(18627L, 18849L, 18413L, 12893L,
17757L, 17270L, 16198L, 2750L, 4026L, 3373L), K.K12 = c(20L,
23L, 28L, 0L, 34L, 17L, 45L, 102L, 150L, 147L), Mn.K12 = c(205L,
212L, 235L, 120L, 216L, 212L, 246L, 121L, 155L, 115L), Nb.K12 = c(139L,
119L, 154L, 91L, 122L, 137L, 137L, 428L, 414L, 428L), Rb.K12 = c(99L,
42L, 79L, 49L, 210L, 243L, 168L, 689L, 767L, 705L), Sr.K12 = c(3509L,
3766L, 3481L, 2715L, 2851L, 2668L, 2695L, 202L, 220L, 217L),
Ti.K12 = c(444L, 520L, 431L, 293L, 542L, 622L, 531L, 82L,
129L, 84L), Y.K12 = c(135L, 121L, 105L, 74L, 144L, 79L, 85L,
301L, 326L, 379L), Zn.K12 = c(131L, 133L, 108L, 78L, 124L,
111L, 114L, 81L, 78L, 59L), Zr.K12 = c(1348L, 1479L, 1333L,
964L, 1506L, 1257L, 1296L, 3967L, 4697L, 4427L)), .Names = c("Ca.K12",
"Cr.K12", "Cu.K12", "Fe.K12", "K.K12", "Mn.K12", "Nb.K12", "Rb.K12",
"Sr.K12", "Ti.K12", "Y.K12", "Zn.K12", "Zr.K12"), row.names = c(NA,
10L), class = "data.frame")
I think you would have received a speedier reply if you had focused on your question instead of all the extraneous stuff. You gave us your commands for plotting a bunch of principal components that had nothing to do with your question. The question is, how do you plot ellipses by group? Your sample data at 10 lines and three groups is not helpful because 3 points is not enough to plot data ellipses. You are using the dataEllipse function in package car which has the simplest answer to your question:
First, a reproducible example:
set.seed(42) # so you can get the same numbers I get
source_a <- data.frame(X1=rnorm(25, 50, 5), X2=rnorm(25, 40, 5))
source_b <- data.frame(X1=rnorm(25, 20, 5), X2=rnorm(25, 40, 5))
source_c <- data.frame(X1=rnorm(25, 35, 5), X2=rnorm(25, 25, 5))
lithic_dat <- rbind(source_a, source_b, source_c)
lithic_source <- c(rep("a", 25), rep("b", 25), rep("c", 25))
Plot ellipses with scatterplot() and add text:
scatterplot(X2~X1 | lithic_source, data=lithic_dat, pch="", smooth=FALSE,
reg.line=FALSE, ellipse=TRUE, levels=.9)
text(lithic_dat$X1, lithic_dat$X2, lithic_source, cex=.75)
Scatterplot can be tweaked to do everything you want, but it is also
possible to plot the ellipses without using it:
sources <- unique(lithic_source) # vector of the different sources
plot(lithic_dat$X1, lithic_dat$X1, type="n")
text(lithic_dat$X1, lithic_dat$X2, lithic_source, cex=.75)
for (i in sources) with(lithic_dat, dataEllipse(X1[lithic_source==i],
X2[lithic_source==i], levels=.9, plot.points=FALSE))
This will work for your principal components and any other data.
Here is a simple solution using a package called ggbiplot (available on github) with Iris data. I hope this is what you were looking for.
library(devtools);install_github('vqv/ggbiplot')
library(ggbiplot)
pca = prcomp(iris[,1:4])
ggbiplot(pca,groups = iris$Species,ellipse = T,ellipse.prob = .95)