Postcode distances using Google - R

I have two lists of postcodes (in R): one of children's addresses with their academic scores, and one of schools.
I would like to get the closest school for each child, so presumably a distance calculation between postcodes is needed, after converting them to latitude/longitude values?
And then I would like to plot all the children per school on a Google map, and see if the children who live closer to school get better grades; perhaps plotting schools in a different colour from the kids, and the kids with a colour gradient according to their score?
Perhaps something using the googleVis package?
So, for example, if we have the data for 3 kids and 2 schools:
student.data <- data.frame(post.codes = c("KA12 6QE", "SW1A 0AA", "WC1X 9NT"),
                           score = c(23, 58, 88),
                           stringsAsFactors = FALSE)  # a data.frame (not cbind), so $post.codes works below
school.postcodes <- c("SL4 6DW", "SW13 9JT")
(N.B. My actual data is obviously significantly larger than the example given, so scalability would be useful.)
What should be done with googleVis, or any other package for that matter, to accomplish the above?

I would start with something like this to get the lat/long:

library(XML)
# Geocode a single postcode with the (legacy, key-less) Google geocoding API
geocode <- function(str) {
  u <- paste0('http://maps.google.com/maps/api/geocode/xml?sensor=false&address=',
              URLencode(str, reserved = TRUE))
  doc <- xmlTreeParse(u, useInternal = TRUE)
  lat <- xpathApply(doc, '/GeocodeResponse/result/geometry/location/lat', xmlValue)[[1]]
  lng <- xpathApply(doc, '/GeocodeResponse/result/geometry/location/lng', xmlValue)[[1]]
  c(code = str, lat = lat, lng = lng)
}
# Get lat/long for each school
ll <- lapply(school.postcodes, geocode)
# Get lat/long for the students
ll.students <- lapply(student.data$post.codes, geocode)
ll <- do.call(rbind, ll)
ll.students <- do.call(rbind, ll.students)
ll.students
     code       lat          lng
[1,] "KA12 6QE" "55.6188429" "-4.6766226"
[2,] "SW1A 0AA" "51.5004864" "-0.1254664"
[3,] "WC1X 9NT" "51.5287992" "-0.1181098"
Get the distance matrix:
library(RJSONIO)
# One Distance Matrix API call per school: origin = school, destinations = all students
dist.list <- lapply(seq(nrow(ll)), function(id) {
  url <- paste("http://maps.googleapis.com/maps/api/distancematrix/json?origins=",
               ll[id, 2], ",", ll[id, 3],
               "&destinations=",
               paste(ll.students[, 2], ll.students[, 3], sep = ',', collapse = '|'),
               "&sensor=false", sep = '')
  res <- fromJSON(url)
  hh <- sapply(res$rows[[1]]$elements, function(dest) {
    c(distance = as.numeric(dest$distance$value),  # metres
      duration = dest$duration$text)
  })
  rbind(hh, destination = ll.students[, 1])
})
names(dist.list) <- ll[,1]
dist.list
$`SL4 6DW`
            [,1]              [,2]       [,3]
distance    "664698"          "36583"    "41967"
duration    "6 hours 30 mins" "43 mins"  "49 mins"
destination "KA12 6QE"        "SW1A 0AA" "WC1X 9NT"

$`SW13 9JT`
            [,1]              [,2]       [,3]
distance    "682210"          "9476"     "13125"
duration    "6 hours 39 mins" "22 mins"  "27 mins"
destination "KA12 6QE"        "SW1A 0AA" "WC1X 9NT"

Related

R cbind is very slow

There is a question with a very similar title (cbind runs very slow), but it does not help with my problem. I am retrieving 10+ JSON files with 100 variables each, and I am trying to create one big data.frame/data.table with 1000 columns. In practice I do not use the very same JSON file as in the example but different ones. Ideally only the problematic line cx <- cbind(cx, bx) would be sped up, as the other lines (unlist, as.data.table) work well for me and I would not know what else to use. I know "cbind is slow", but do I have any alternatives? Ideally with base R.
library(jsonlite)
library(data.table)
starttime <- Sys.time()
for (i in 1:10) {    # loop through all 10 json files
  zz <- Sys.time()   # measure the time for each loop
  urlx <- "http://mysafeinfo.com/api/data?list=englishmonarchs&format=json"
  jsnx <- fromJSON(urlx)
  if (i == 1) {
    ax <- unlist(jsnx)
    bx <- as.data.table(ax)
    cx <- bx
  }
  for (j in 1:100) { # loop through all 100 variables in each file
    ax <- unlist(jsnx)
    bx <- as.data.table(ax)
    cx <- cbind(cx, bx)  # <---- VERY SLOW ----
  }
  zz <- round(Sys.time() - zz, 1)
  print(sprintf("%1.1f", zz))
  flush.console()
}
endtime <- Sys.time()
endtime - starttime
This gets slower and slower with more files; here are my timings:
[1] "0.7"
[1] "1.3"
[1] "1.3"
[1] "1.6"
[1] "2.1"
[1] "2.2"
[1] "2.5"
[1] "3.2"
[1] "3.4"
[1] "3.5"

R - Isolate clusters with specific characteristics in hclust

I've used hclust to generate a cluster dendrogram of some data, but I need to isolate all the paired clusters, i.e. all the clusters that comprise just 2 pieces of data (the first ones to be clustered together), even if they might be clustered with other data on a "higher" branch. Does anyone know how I can do that?
I've highlighted the clusters I want to isolate in the attached image, hopefully that explains it better.
I'd like to be able to isolate all the paired data in those clusters in such a way as to be able to compare the clusters on their contents, for example to see which of them contain a particular type of data.
FWIW, you could extract the "forks" like this:
hc <- hclust(dist(USArrests), "ave")
plot(hc)
res <- list()
invisible(dendrapply(as.dendrogram(hc), function(x) {
  # a "fork" = an inner node with exactly two members, both of them leaves
  if (attr(x, "members") == 2)
    if (all(sapply(x[1:2], is.leaf)))
      res <<- c(res, list(c(attr(x[[1]], "label"), attr(x[[2]], "label"))))
  x
}))
head(do.call(rbind, res))
# [,1] [,2]
# [1,] "Florida" "North Carolina"
# [2,] "Arizona" "New Mexico"
# [3,] "Alabama" "Louisiana"
# [4,] "Illinois" "New York"
# [5,] "Michigan" "Nevada"
# [6,] "Mississippi" "South Carolina"
(just the first 6 rows of the result)
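To then compare the forks on their contents, e.g. to find the pairs containing a particular item, ordinary matrix subsetting works on the result above:

forks <- do.call(rbind, res)
forks[apply(forks == "New York", 1, any), , drop = FALSE]
#      [,1]       [,2]
# [1,] "Illinois" "New York"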

Create a list with named values by applying a function to each row of a data frame

I'm trying to get a list where each element has a name, by applying a function to each row of a data frame, but can't get the right output.
Assuming this is the function that I want to apply to each row:
format_setup_name <- function(m, v, s) {
  a <- list()
  a[[paste(m, "machines and", v, s, "GB volumes")]] <- paste(m, v, s, sep = "-")
  a
}
If this is the input data frame:
df <- data.frame(m=c(1,2,3), v=c(3,3,3), s=c(15,20,30))
What I want, but can't get, is a list that looks like:
$`1-3-15`
[1] "1 machines and 3 15 GB volumes"
$`2-3-20`
[1] "2 machines and 3 20 GB volumes"
$`3-3-30`
[1] "3 machines and 3 30 GB volumes"
Can someone give me hints how to do it?
Why do I need this? Well, I want to populate selectizeInput in shiny using values coming from the database. Since I'm combining several columns, I need a way to match the selected input with the values.
This is a good use case for setNames which can add the names() attribute to an object, in place. Also, if you use as.list, you can do this in just one line without any looping:
setNames(as.list(paste(df$m, ifelse(df$m == 1, "machine", "machines"),
                       "and", df$v, df$s, "GB volumes")),
         paste(df$m, df$v, df$s, sep = "-"))
# $`1-3-15`
# [1] "1 machine and 3 15 GB volumes"
#
# $`2-3-20`
# [1] "2 machines and 3 20 GB volumes"
#
# $`3-3-30`
# [1] "3 machines and 3 30 GB volumes"
Thomas has already found a pretty neat solution to your problem (and in one line, too!). But I'll just show you how you could have succeeded with the approach you first tried:
# We'll use the same data, this time called "dat" (I avoid calling
# objects `df` because `df` is also a function's name)
dat <- data.frame(m = c(1,2,3), v = c(3,3,3), s = c(15,20,30))
format_setup_name <- function(m, v, s) {
  a <- list()  # initialize the list, all is well up to here
  # But here we'll need a loop to assign in turn each element to the list
  for (i in seq_along(m)) {
    a[[paste(m[i], v[i], s[i], sep = "-")]] <-
      paste(m[i], "machines and", v[i], s[i], "GB volumes")
  }
  return(a)
}
Note that what goes inside the double brackets is the name of the element, while what's on the right side of the <- is the content to be assigned, not the other way around as your code suggested.
So let's try it:
my.setup <- format_setup_name(dat$m, dat$v, dat$s)
my.setup
# $`1-3-15`
# [1] "1 machines and 3 15 GB volumes"
#
# $`2-3-20`
# [1] "2 machines and 3 20 GB volumes"
#
# $`3-3-30`
# [1] "3 machines and 3 30 GB volumes"
Everything seems nice. Just one thing to note: with the $ operator, you'll need to use single or double quotes to access individual items by their names:
my.setup$"1-3-15" # my.setup$1-3-15 won't work
# [1] "1 machines and 3 15 GB volumes"
my.setup[['1-3-15']] # equivalent
# [1] "1 machines and 3 15 GB volumes"
Edit: lapply version
Since loops have really fallen out of favor, here's a version with lapply:
format_setup_name <- function(m, v, s) {
  a <- lapply(seq_along(m), function(i)
    paste(m[i], "machines and", v[i], s[i], "GB volumes"))
  names(a) <- paste(m, v, s, sep = "-")
  return(a)
}
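As a side note, since the function's arguments match the data frame's column names, you can feed it the whole data frame directly:

my.setup <- do.call(format_setup_name, dat)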

R - Plotting netcdf climate data

I have been trying to plot the following gridded netCDF file, "air.1999.nc", found at the following website:
http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.html
I have tried the code below, based on answers I have found here and elsewhere, but with no luck.
library(ncdf)
temp.nc <- open.ncdf("air.1999.nc")
temp <- get.var.ncdf(temp.nc, "air")
lon <- temp.nc$dim$lon$vals
lat <- temp.nc$dim$lat$vals
lat <- rev(lat)
temp <- temp[nrow(temp):1, ]
temp[temp == -32767] <- NA
temp <- t(temp)
image(lon, lat, temp)
library(maptools)
data(wrld_simpl)
plot(wrld_simpl, add = TRUE)
This code was modified from the one found here: The variable from a netcdf file comes out flipped
Does anyone have any ideas or experience with these types of netCDF files? Thanks
In the question you linked, the whole part from lat <- rev(lat) to temp <- t(temp) was very specific to that particular OP's dataset and has no universal value.
temp.nc <- open.ncdf("~/Downloads/air.1999.nc")
temp.nc
[1] "file ~/Downloads/air.1999.nc has 4 dimensions:"
[1] "lon Size: 144"
[1] "lat Size: 73"
[1] "level Size: 12"
[1] "time Size: 365"
[1] "------------------------"
[1] "file ~/Downloads/air.1999.nc has 2 variables:"
[1] "short air[lon,lat,level,time] Longname:Air temperature Missval:32767"
[1] "short head[level,time] Longname:Missing Missval:NA"
As you can see from this information, in your case missing values are represented by the value 32767, so the following should be your first step:
temp <- get.var.ncdf(temp.nc, "air")
temp[temp == 32767] <- NA
Additionally, in your case your data has 4 dimensions, not just 2: longitude, latitude, level (which I'm assuming represents the height) and time.
lon <- temp.nc$dim$lon$vals
lat <- temp.nc$dim$lat$vals
time <- temp.nc$dim$time$vals
lev <- temp.nc$dim$level$vals
If you have a look at lat you see that the values are in reverse order (which image will frown upon), so let's reverse them:
lat <- rev(lat)
temp <- temp[, ncol(temp):1, , ]  # lat being our dimension number 2
Then the longitude is expressed from 0 to 360, which is not standard; it should be from -180 to 180, so let's change that:
lon <- lon - 180
So now let's plot the data for a level of 1000 (i.e. the first one) and the first date:
temp11 <- temp[, , 1, 1]  # level is the third dimension and time the fourth
image(lon, lat, temp11)
And then let's superimpose a world map:
library(maptools)
data(wrld_simpl)
plot(wrld_simpl, add = TRUE)
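Worth noting: the ncdf package used above has since been retired from CRAN. With its successor ncdf4 the opening steps look like this (a sketch; the rest of the recipe is unchanged):

library(ncdf4)
temp.nc <- nc_open("~/Downloads/air.1999.nc")
temp <- ncvar_get(temp.nc, "air")  # missing values already replaced by NA
lon <- ncvar_get(temp.nc, "lon")
lat <- ncvar_get(temp.nc, "lat")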

Compare datasets in R

I have gathered a set of transactions in a CSV file of the format:
{Pierre, lait, oeuf, beurre, pain}
{Paul, mange du pain,jambon, lait}
{Jacques, oeuf, va chez la crémière, pain, voiture}
I plan to do a simple association rule analysis, but first I want to exclude items from each transactions which do not belong to ReferenceSet = {lait, oeuf, beurre, pain}.
Thus my resulting dataset would be, in my example:
{Pierre, lait, oeuf, beurre, pain}
{Paul, lait}
{Jacques, oeuf, pain}
I'm sure this is quite simple, but would love to read suggestions/answers to help me a bit.
Another answer references %in%, but in this case intersect is even handier (you may want to look at match too, but I think it's documented in the same place as %in%). With lapply and intersect we can make the answer a one-liner:
Data:
L <- list(pierre  = c("lait", "oeuf", "beurre", "pain"),
          paul    = c("mange du pain", "jambon", "lait"),
          jacques = c("oeuf", "va chez la crémière", "pain", "voiture"))
reference <- c("lait", "oeuf", "beurre", "pain")
Answer:
lapply(L, intersect, reference)
$pierre
[1] "lait" "oeuf" "beurre" "pain"
$paul
[1] "lait"
$jacques
[1] "oeuf" "pain"
One way is as follows (as I'm leaving the structure as a matrix, I've left NAs where data has been removed; these could be dropped if exporting back to CSV). I'm sure it's possible to do it without loops, which would make it faster but, IMHO, less readable (a loop-free sketch follows the output below), and I'm sure there's a more efficient way to do the logic too; I'd be interested in seeing someone else's view on this.
ref <- c("lait", "oeuf", "beurre", "pain")
input <- read.csv("info.csv", sep = ",", header = FALSE, strip.white = TRUE)
> input
V1 V2 V3 V4 V5
1 Pierre lait oeuf beurre pain
2 Paul mange du pain jambon lait
3 Jacques oeuf va chez la crémière pain voiture
input <- as.matrix(input)
output <- matrix(nrow = nrow(input), ncol = ncol(input))
for (i in 1:nrow(input)) {
  j <- 2
  output[i, 1] <- input[i, 1]
  for (k in 2:length(input[i, ])) {
    if (toString(input[i, k]) %in% ref) {
      output[i, j] <- toString(input[i, k])
      j <- j + 1
    }
  }
}
> output
[,1] [,2] [,3] [,4] [,5]
[1,] "Pierre" "lait" "oeuf" "beurre" "pain"
[2,] "Paul" "lait" NA NA NA
[3,] "Jacques" "oeuf" "pain" NA NA
The %in% operator will come in handy.
pierre <- c("lait","oeuf","beurre","pain")
paul <- c("mange du pain", "jambon", "lait")
jacques <- c("oeuf","va chez la crémière", "pain", "voiture")
reference <- c("lait", "oeuf", "beurre", "pain")
pierre_fixed <- pierre[pierre %in% reference]
paul_fixed <- paul[paul %in% reference]
jacques_fixed <- jacques[jacques %in% reference]
pierre_fixed
paul_fixed
jacques_fixed
