double loop does not match - r

I'm having trouble with a loop that I'm trying to write.
I have data for monthly temperature (web-scraped from TerraClimate). I set the latitude and longitude, and my code gives me one observation for the average temperature of each month.
I have many pairs of coordinates that I need to scrape, so I'm using a loop. The problem is that when I try to paste the name of each location into the results, it doesn't work.
Here is an example:
I have a list with the coordinates and a character vector with the names:
loc1 <- c(-70.6666667, -33.4500000)
loc2 <- c(-71.6163889,-33.0458333)
c=list(loc1,loc2)
x=c("Loc 1", "Loc 2")
And my code looks like this:
# required packages
library(ncdf4)
library(data.table)

# web scraping temperature data
var <- "tmax"
terra <- paste0("http://thredds.northwestknowledge.net:8080/thredds/dodsC/agg_terraclimate_",
                var, "_1958_CurrentYear_GLOBE.nc")
nc <- nc_open(terra)
lon <- ncvar_get(nc, "lon")
lat <- ncvar_get(nc, "lat")
# creating empty object to paste each set of data
DAT <- NULL
# loop that brings up the temperatures for each set of coordinates in c
for (i in c) {
  # names of locations
  for (n in x) {
    flon <- match(abs(lon - i[1]) < 1/48, 1)
    lonindex <- which(flon %in% 1)
    flat <- match(abs(lat - i[2]) < 1/48, 1)
    latindex <- which(flat %in% 1)
    start <- c(lonindex, latindex, 1)
    count <- c(1, 1, -1)
    data <- as.numeric(ncvar_get(nc, varid = var, start = start, count = count))
    data <- as.data.table(data)
    data$date <- seq(as.Date("1958-02-01"), length.out = 756, by = "month")
    data$ym_chr <- format(data$date, format = "%Y-%m")
    data[, date := NULL]
    data$LOCATION <- as.character(c[n])
    DAT <- rbind(DAT, data)
  }
}
The problem is that the inner part of the loop is not matching things up the way I would like. Each location has 756 observations, so in total I should have 1512 observations: the first time the loop runs, LOCATION should be "Loc 1", and the second time it runs, LOCATION should be "Loc 2".
But instead I'm getting 3024 observations.
This code is reproducible.
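For what it's worth, the extra rows come from nesting the loop over names inside the loop over coordinates, so every coordinate pair is combined with every name (2 × 2 × 756 = 3024 rows). Below is a minimal sketch of the intended pairing, using a single index so each coordinate pair keeps its own name (my illustration, reusing the objects above, untested against the live THREDDS server):
DAT <- NULL
for (k in seq_along(c)) {
  i <- c[[k]]
  flon <- match(abs(lon - i[1]) < 1/48, 1)
  lonindex <- which(flon %in% 1)
  flat <- match(abs(lat - i[2]) < 1/48, 1)
  latindex <- which(flat %in% 1)
  data <- as.numeric(ncvar_get(nc, varid = var,
                               start = c(lonindex, latindex, 1),
                               count = c(1, 1, -1)))
  data <- as.data.table(data)
  data$date <- seq(as.Date("1958-02-01"), length.out = 756, by = "month")
  data$ym_chr <- format(data$date, format = "%Y-%m")
  data[, date := NULL]
  data$LOCATION <- x[k]  # name taken from the same index as the coordinate pair
  DAT <- rbind(DAT, data)
}
This gives 756 rows per location and 1512 rows in total.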

Related

Time efficient alternative to for-loops for manual sampling from timelines in R

I have sampled a set of lakes at x time points throughout the year. I have also deployed loggers in the water, and I want to use daily averages from these loggers, from the time point of the visit back to x days/hours before. Sometimes I also just grab a single sample at the time point of the visit.
This is my solution; it works just fine, but since I experiment a lot with model assumptions and perform sensitivity analyses, it runs unsatisfactorily slowly.
I seem to solve most of my R problems with loops, and I often come across more efficient scripts, so I would be very interested in seeing more efficient alternatives to my code.
The code below just generates some dummy data.
library(dplyr)
library(lubridate)

# function for calculating the % oxygen saturation
do.pct.sat <- function(x, y, z) {
  t <- x
  do <- y
  p <- z
  atm <- (p * 100) / 101325
  do.sat <- atm * exp(-139.34411 + 157570.1 / (t + 273.15) - 66423080 / (t + 273.15)^2 +
                        12438000000 / (t + 273.15)^3 - 862194900000 / (t + 273.15)^4)
  do.pct.sat <- (do / do.sat) * 100
  return(do.pct.sat)
}
# here's some dummy data resembling real data
date.initial <- as.POSIXct("2022-06-01")  # deployment date
date.end <- as.POSIXct("2022-10-01")      # date of retrieval
id <- c("a", "b", "c")                    # lake id
lake <- list()                            # dataset list, one entry per lake
s <- list()                               # list of data frames for the samples from the lake logger timelines
# the loop below generates dummy data; this is not part of the real script that I want to improve
for (i in 1:3) {
  datetime <- seq(from = date.initial, to = date.end, by = 10 * 60)  # 10-minute intervals from deploy to retrieve
  l <- length(datetime)  # vector length of datetime
  # set dummy data
  do <- rnorm(l, mean = 10, sd = 3)            # O2 concentration
  pressure <- rnorm(l, mean = 980, sd = 50)    # barometric pressure
  temp <- rnorm(l, mean = 15, sd = 5)          # water temperature
  k.z <- rnorm(l, mean = 0.35, sd = 0.1)       # gas exchange coefficient / mixed layer depth
  dosat.pct <- do.pct.sat(temp, do, pressure)  # oxygen saturation in %
  iso <- as.data.frame(cbind(datetime, do, dosat.pct, temp, pressure, k.z))  # bind dummy data frame to resemble real data
  iso$datetime <- as.POSIXct(iso$datetime, origin = "1970-01-01")
  lake[[i]] <- iso  # save the data frame to the lake logger list
  samples <- as.POSIXct(sample((date.initial + 5 * 24 * 60 * 60):date.end, 7, replace = FALSE),
                        origin = "1970-01-01")  # randomize 7 time points
  s[[i]] <- as.data.frame(samples)  # save them in a data frame
  s[[i]]$lake <- id[i]
}
names(lake) <- id
samples <- bind_rows(s)
samples$samples <- round_date(samples$samples, unit = "10 minutes")  # round the random samples to the closest 10 minutes
Below is the function that I want to make more efficient (same libraries). I think it runs slowly because I process one date at a time before moving on to the next:
sample.lakes <- function(average = 3) {
  dts <- list()  # empty list
  for (i in 1:length(lake)) {
    print(id[i])
    data <- lake[[i]]
    y <- samples[grepl(id[i], samples$lake), ]
    dates <- y$samples
    # empty vectors to fill with values sampled in the loop
    avg.kz <- vector()
    sd.kz <- vector()
    do.mgl <- vector()
    dosat.pct <- vector()
    temp.c <- vector()
    for (k in 1:length(dates)) {
      print(k)
      # filter the logger data to the window from the sampling time point minus 'average' days up to the sampling time point
      prior.days <- filter(data, datetime > as.POSIXct(dates[k]) - (24 * 60 * 60) * average &
                             datetime < as.POSIXct(dates[k]))
      # fill the empty vectors with the values I want: mean and sd of k.z, plus point samples of the other variables
      avg.kz[k] <- mean(prior.days$k.z)
      sd.kz[k] <- sd(prior.days$k.z)
      temp.c[k] <- data[grepl(dates[k], data$datetime), ]$temp
      do.mgl[k] <- data[grepl(dates[k], data$datetime), ]$do
      dosat.pct[k] <- data[grepl(dates[k], data$datetime), ]$dosat.pct
    }
    sd.kz[is.na(sd.kz)] <- 0
    # add them to data frame y
    y$dosat.pct <- dosat.pct
    y$do.mgl <- do.mgl
    y$temp.c <- temp.c
    y$avg.kz <- avg.kz
    y$sd.kz <- sd.kz
    dts[[i]] <- y  # add this lake's data frame to the list
  }
  iso <- bind_rows(dts)  # make one complete data frame with the samples
  return(iso)
}
iso <- sample.lakes(average = 4)  # do not set average to > 5 in this example script
I would appreciate any suggestions a lot!
My guess is that this part using grepl:
data[grepl(dates[k],data$datetime),]
inside your inner for loop is slow.
Couldn't you instead try just seeing if the datetimes are the same with ==?
In addition, you only need to subset data once.
Try this as an alternative:
for (k in 1:length(dates)) {
  print(k)
  prior.days = filter(data, datetime > as.POSIXct(dates[k]) - (24 * 60 * 60) * average &
                        datetime < as.POSIXct(dates[k]))
  avg.kz[k] = mean(prior.days$k.z)
  sd.kz[k] = sd(prior.days$k.z)
  sub_data <- data[data$datetime == dates[k], ]
  temp.c[k] <- sub_data$temp
  do.mgl[k] <- sub_data$do
  dosat.pct[k] <- sub_data$dosat.pct
}
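Going a step further (my own sketch, not part of the answer above): because the sample times were rounded to the logger's 10-minute grid, the point samples can also be pulled with a single join per lake instead of any per-date subsetting. This assumes every rounded sample time actually exists in data$datetime, as it does in the dummy data:
# Sketch only: one join per lake replaces the per-date point lookups
# (the joined columns keep the logger names temp/do/dosat.pct rather than temp.c/do.mgl)
y <- dplyr::left_join(y, data[, c("datetime", "temp", "do", "dosat.pct")],
                      by = c("samples" = "datetime"))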

Looped raster extractions from a SpatialPointsDataFrame; each row needs to extract from a corresponding raster indicated in a column

I am attempting to make a loop that extracts values from NDVI rasters for multiple shapefiles. The shapefiles are animal GPS locations that include a date and a time. They also include random locations that have been generated from the population's range, so I assign each random location that didn't have a date a date from one of the GPS locations in a 5:1 ratio. I have a function that finds the closest NDVI date to the date of each GPS location or random point and stores it in the data frame as x$NDVIfile. The code all works up to this point, but I think the problem is here specifically:
for (j in length(nrow(x))) {
  a <- raster(paste0("E:/RSF_GIS/HabitatVariables/NDVI/",
                     x$NDVIfile[j], ".tif"))
  x$ndvi[j] <- raster::extract(a, x[j, ])  # extract each row based on the closest NDVI file
}
For each row in my data, I want to extract from the raster in the directory that corresponds to x$NDVIfile[j]. My current outcome looks like this: all the extracted NDVI values are the same within each data frame, and I suspect that the first raster referenced is the only raster actually being extracted from for each data frame:
[screenshot of data frame x]
Here is the entire code:
library(rgdal)
library(raster)
library(sp)

RSF_dir <- list.files("E:/RSF_GIS/RSF_files",
                      pattern = "*.shp",
                      full.names = TRUE)
ndvi_dir <- list.files("E:/RSF_GIS/HabitatVariables/NDVI",
                       pattern = "*.tif",
                       full.names = FALSE)
ndvi_dir.df <- tools::file_path_sans_ext(basename(ndvi_dir))
ndvi_dir.df <- as.Date(ndvi_dir.df)
ndvi_dir.df <- as.data.frame(ndvi_dir.df)

for (i in 1:length(RSF_dir)) {
  x <- rgdal::readOGR(RSF_dir[i])
  x <- as.data.frame(x)
  nona <- x$Acqst_T[!is.na(x$Acqst_T)]            # make a list of non-NA values
  nona <- rep.int(nona, times = 6)                # repeat the list 6 times (5:1 = random_location:GPS_fix)
  x$Acqst_T <- dplyr::coalesce(x$Acqst_T, nona)   # and assign to NAs
  x$date <- as.Date(x$Acqst_T, format = '%Y-%m-%d %H:%M:%S')
  min_distances <- as.numeric(x$date) -
    matrix(rep(as.numeric(ndvi_dir.df$ndvi_dir), nrow(x)),
           ncol = length(ndvi_dir.df$ndvi_dir), byrow = TRUE)
  min_distances <- as.data.frame(t(min_distances))
  closest <- sapply(min_distances, function(o) {  # find the closest NDVI date for each GPS fix
    w <- which(o == min(o[o > 0]))                # (MOD09Q1 collects imagery every 8 days)
    ifelse(length(w) == 0, NA, w)
  })
  x$NDVIfile <- as.Date(ndvi_dir.df$ndvi_dir[closest])
  x <- SpatialPointsDataFrame(data.frame(x$coords.x1, x$coords.x2), x,
                              proj4string = veg_INREV@proj4string)
  for (j in length(nrow(x))) {
    a <- raster(paste0("E:/RSF_GIS/HabitatVariables/NDVI/",
                       x$NDVIfile[j], ".tif"))
    x$ndvi[j] <- raster::extract(a, x[j, ])  # extract each row based on the closest NDVI file
  }
  writeOGR(obj = x, dsn = "E:/RSF_GIS/RSF_files/trial",
           layer = paste0(tools::file_path_sans_ext(basename(RSF_dir[i]))),
           driver = "ESRI Shapefile", overwrite_layer = TRUE)
}
Thanks!
Answering my own question: the solution was simply a change to the part of the code that sets the iterations of the loop:
for(j in length(nrow(x))){
...
}
should have been instead
for(j in 1:nrow(x)){
...
}
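As a side note (my addition, not part of the original answer): length(nrow(x)) is always 1, which is why only the first raster was ever used, and seq_len(nrow(x)) is a slightly safer alternative to 1:nrow(x) because it also handles the zero-row case. A small illustration:
nrow(data.frame(a = 1:5))          # 5
length(nrow(data.frame(a = 1:5)))  # 1  -> the buggy loop always runs exactly once
1:nrow(data.frame())               # 1 0 -> on an empty table, 1:nrow(x) still iterates
seq_len(nrow(data.frame()))        # integer(0) -> no iterations on an empty table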

Using loops to extract coordinates, match them and write to file

I am trying to use for loops (or an apply function as an alternative) to extract coordinates from a data.frame, search for the closest point within the E-OBS gridded dataset, extract the temperature data for time x1-x2, and write it to another Excel file.
While the code works for extracting single data points, I have not been able to wrap it in a loop, ideally adding the results next to the input coordinates.
library(sp)
library(raster)
library(ncdf4)
# Coordinates
df

  ID             site        E        N
1  1 site_place_date1 7.558758 47.81004
2  2 site_place_date2 7.582749 47.63411
3  3 site_place_date3 7.607968 48.01475
4  4 site_place_date4 7.644660 47.67139
...
Set coordinates of target point MANUALLY:
lon <- 7.558758  # longitude of location
lat <- 47.81004  # latitude of location

# Mean daily temperature
ncin <- nc_open("tg_0.25deg_reg_v17.0.nc")
print(ncin)
t <- ncvar_get(ncin, "time")
tunits <- ncatt_get(ncin, "time", "units")
nt <- dim(t)
nt
obsoutput <- ncvar_get(ncin,
                       start = c(which.min(abs(ncin$dim$longitude$vals - lon)),  # look for closest long
                                 which.min(abs(ncin$dim$latitude$vals - lat)),   # look for closest lat
                                 1),
                       count = c(1, 1, -1))
DataMeanT <- data.frame(DateN = t, MeanDailyT = obsoutput)
nc_close(ncin)
head(DataMeanT)
# check if there are NAs = 999
summary(DataMeanT)

Data <- DataMeanT
Data$Date <- as.Date(Data$DateN, origin = "2000-01-01")
Data$Year <- format(Data$Date, "%Y")
Data$Month <- format(Data$Date, "%m")
head(Data)
Data$YearMonth <- format(Data$Date, format = "%Y-%b")
Data_annual <- aggregate(("T_AnnualMean" = MeanDailyT) ~ Year, data = Data,
                         FUN = mean, na.action = na.pass)
names(Data_annual)[2] <- "AirT"
head(Data_annual)
# Export table
write.table(Data_annual, "Site_AirTemp.csv", row.names = FALSE, append = FALSE,
            col.names = TRUE, sep = ", ", quote = TRUE)
The aim is to run the script as part of a loop for all coordinates in df and to write the temperature data to a new data table with the site ID, or alternatively into additional columns of df.
Simply wrap your entire process in a defined function and use an apply-family function to pass in the lon/lat coordinates. One good candidate is mapply, or its wrapper Map, to iterate elementwise over the two vectors df$E and df$N. A third argument, df$site, is also passed into the function to build unique CSV names, since right now the same file is overwritten on every iteration.
Below, non-assignment lines such as head and summary are removed since they do nothing inside a function. Also, within and with are used to avoid repeating Data$ and streamline the data manipulation. The Map call writes each CSV to file AND builds a list of aggregated data frames for later use.
Function
my_function <- function(lon, lat, site) {
  # Mean daily temperature
  ncin <- nc_open("tg_0.25deg_reg_v17.0.nc")
  print(ncin)
  t <- ncvar_get(ncin, "time")
  tunits <- ncatt_get(ncin, "time", "units")
  nt <- dim(t)
  # look for closest lon and lat
  obsoutput <- ncvar_get(ncin,
                         start = c(which.min(abs(ncin$dim$longitude$vals - lon)),
                                   which.min(abs(ncin$dim$latitude$vals - lat)),
                                   1),
                         count = c(1, 1, -1))
  DataMeanT <- data.frame(DateN = t, MeanDailyT = obsoutput)
  nc_close(ncin)
  Data <- within(DataMeanT, {
    Date <- as.Date(DateN, origin = "2000-01-01")
    Year <- format(Date, "%Y")
    Month <- format(Date, "%m")
    YearMonth <- format(Date, format = "%Y-%b")
  })
  Data_annual <- with(Data, aggregate(list("AirT" = MeanDailyT), list(Year = Year),
                                      FUN = mean, na.action = na.pass))
  # Export table
  write.table(Data_annual, paste0("Site_AirTemp_", site, "_.csv"), row.names = FALSE,
              append = FALSE, col.names = TRUE, sep = ", ", quote = TRUE)
  # SAVE AGGREGATED DATA FRAME
  return(Data_annual)
}
Call
# ITERATE THROUGH EACH LON/LAT PAIR ELEMENTWISE
df_list <- Map(my_function, df$E, df$N, df$site)
# df_list <- mapply(my_function, df$E, df$N, df$site, SIMPLIFY=FALSE) # EQUIVALENT CALL
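If you also want one combined table with the site IDs attached, a short sketch (my addition, assuming dplyr is available):
# Sketch: bind the per-site annual tables into one data frame, tagged with the site name
combined <- dplyr::bind_rows(setNames(df_list, df$site), .id = "site")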
You can probably do:
library(raster)
b <- brick("tg_0.25deg_reg_v17.0.nc")
e <- extract(b, df[, c('E', 'N')])
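If that works with your NetCDF file, the extracted values (one column per time step in the brick) can then be combined back with the site table, for example (a sketch):
# Sketch: attach the extracted values to the site information, then reshape/aggregate as needed
out <- cbind(df[, c("ID", "site")], e)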

average gridded climate data for duplicated times in r

I have a gridded climate dataset, such as:
# generate time vector
time1 <- seq(14847.5,14974.5, by = 1)
time2 <- seq(14947.5,14974.5, by = 1)
time <- c(time1,time2)
time <- as.POSIXct(time*86400,origin='1970-01-01 00:00')
# generate lat and lon coordinates
lat <- seq(80,90, by = 1)
lon <- seq(20,30, by = 1)
# generate 3-dimensional array
dat <- array(runif(length(lat) * length(lon) * length(time)),
             dim = c(length(lon), length(lat), length(time)))
such that
> dim(dat)
[1] 11 11 156
the dimensions of the data are describing the variable at different longitude (dim = 1), latitude (dim = 2), and time (dim = 3).
The issue I have at the moment is that some of the times are repeated, which is something to do with overlapping sensors measuring the data. I was therefore wondering whether it is possible to keep only the unique times in dat but average the data within the grid for the duplicated times, i.e. if a day appears twice, take the average value in each latitude/longitude cell for that time.
I can find the unique times as:
# only select unique times
new_time <- unique(time)
unique_time <- unique(time)
The following code then loops through each grid cell (lat/lon) and averages all of the duplicated days.
# loop through lat/lon coordinates to generate new data
new_dat <- array(dim = c(length(lon), length(lat), length(new_time)))
for (i in 1:length(lon)) {
  for (ii in 1:length(lat)) {
    dat2 <- dat[i, ii, ]
    dat2b <- NA
    for (k in 1:length(unique_time)) {
      idx <- time == unique_time[k]
      dat2b[k] <- mean(dat2[idx], na.rm = TRUE)
    }
    new_dat[i, ii, ] <- dat2b
  }
}
I'm convinced that this gives the correct answer, but I'm certain there is a much cleaner way to achieve it.
I should also note that my data are quite large (i.e. k = 7000), so this last loop is not very efficient, to say the least.
My original answer:
This is a bit more concise and efficient by use of aggregate:
for (i in 1:length(lon)) {
  for (ii in 1:length(lat)) {
    new_dat[i, ii, ] <- as.numeric(aggregate(dat[i, ii, ], by = list(time), mean)$x)
  }
}
It still has 2 of the 3 loops, but it bypasses creating dat2, dat2b, and unique_time.
My improved answer:
f <- function(i, ii) {
  as.numeric(aggregate(dat[i, ii, ], by = list(time), mean)$x)
}
grid <- expand.grid(1:length(lon), 1:length(lat))
for (i in 1:nrow(grid)) {
  new_dat[grid[i, 1], grid[i, 2], ] <- f(grid[i, 1], grid[i, 2])
}
Got it down to just 1 loop. We could probably bypass that loop too with an apply.
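For example, here is a sketch of that idea (my addition, not the original answer), using apply over the lon/lat margins with the same aggregate call so no explicit loop remains:
# Sketch: average duplicated times for every lon/lat cell in one call.
# apply() returns the collapsed time dimension first, so permute it back into place.
new_dat <- aperm(apply(dat, c(1, 2),
                       function(v) as.numeric(aggregate(v, by = list(time), mean)$x)),
                 c(2, 3, 1))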

Improving performance when working with geodata in R

I wrote the following script to produce the raw data for plotting the above map. The problem is, for 550,000 data points, this takes about 2 hours to run on a relatively powerful machine. I'm new to R, however, and I'm wondering if there are any optimized functions I can take advantage of?
The basic idea is that, given a set of geospatial data, you split the set into 200 rows, and split each row into a bunch of squares. You then calculate the total of a value in each square in a row. The approach I've taken below is to take the "upper left" point of a square, calculate the latitude/longitude of the edges of the square, exclude all points not within those bounds, and then sum what remains. Is there a better way to do this without using a solution like PostGIS?
all.data <- read.csv("FrederictonPropertyTaxDiffCleanedv3.csv", header=TRUE,
stringsAsFactors=FALSE)
all.data$X <- as.numeric(all.data$X)
all.data$Y <- as.numeric(all.data$Y)
startEnd <- function(lats, lngs) {
  # Find the "upper left" (NW) and "bottom right" (SE) coordinates of a set of data.
  #
  # Args:
  #   lats: A list of latitude coordinates
  #   lngs: A list of longitude coordinates
  #
  # Returns:
  #   A list of values corresponding to the northwest-most and southeast-most coordinates
  # Convert to real number and remove NA values
  lats <- na.omit(as.numeric(lats))
  lngs <- na.omit(as.numeric(lngs))
  topLat <- max(lats)
  topLng <- min(lngs)
  botLat <- min(lats)
  botLng <- max(lngs)
  return(c(topLat, topLng, botLat, botLng))
}
startEndVals <- startEnd(all.data$Y, all.data$X)
startLat <- startEndVals[1]
endLat <- startEndVals[3]
startLng <- startEndVals[2]
endLng <- startEndVals[4]
num_intervals = 200.0
interval <- (startEndVals[1] - startEndVals[3]) / num_intervals
# testLng <- -66.6462379307115
# testLat <- 45.9581234392
# Prepare the data to be sent in
data <- all.data[,c("Y", "X", "levy2014_ha")]
sumInsideSquare <- function(pointLat, pointLng, interval, data) {
  # Sum all the values that fall within a square on a map given a point,
  # an interval of the map, and data that contains lat, lng and the values
  # of interest
  colnames(data) <- c("lat", "lng", "value")
  # Data east of point
  data <- data[data$lng > pointLng, ]
  # Data west of point + interval
  data <- data[data$lng < pointLng + interval, ]
  # Data north of point + interval (down)
  data <- data[data$lat > pointLat - interval, ]
  # Data south of point
  data <- data[data$lat < pointLat, ]
  # Clean remaining data
  data <- na.omit(data)
  return(sum(data$value))
}
# Debugging
# squareSumTemp <- sumInsideSquare(testLat, testLng, interval, data)
# Given a start longitude and an end longitude, calculate an array of values
# corresponding to the sums for that latitude
calcSumLat <- function(startLng, endLng, lat, interval, data) {
  row <- c()
  lng <- startLng
  while (lng < endLng) {
    row <- c(row, sumInsideSquare(lat, lng, interval, data))
    lng <- lng + interval
  }
  return(row)
}
# Debugging
# rowTemp <- calcSumLat(startLng, endLng, testLat, interval, data)
# write.csv(rowTemp, file = "Temp.csv", row.names = FALSE)
# Get each line of data to plot
lat <- startLat
rowCount <- 1
all.sums <- list()
while (lat > endLat) {
  col <- calcSumLat(startLng, endLng, lat, interval, data)
  all.sums[[as.character(rowCount)]] <- col
  lat <- lat - interval
  rowCount <- rowCount + 1
}
# Convert to data frame
all.sums.frame <- data.frame(all.sums)
# Save to disk so I don't have to run it again
write.csv(all.sums.frame, file = "Levy2014Sums200.csv", row.names = FALSE)
I ended up finding a solution to this myself. The key was using the foreach package with the doParallel package so the work could take advantage of all the cores on my computer. There is a great guide on it here: http://www.r-bloggers.com/a-brief-foray-into-parallel-processing-with-r/
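For reference, a minimal sketch of that idea (my own illustration, not the poster's final script), parallelizing the per-latitude rows with foreach/doParallel and reusing the helper functions defined above; note that each latitude becomes a row here rather than a column:
library(foreach)
library(doParallel)

# Sketch only: compute each latitude band's sums on a separate core.
# Assumes startLat, endLat, startLng, endLng, interval, data, calcSumLat()
# and sumInsideSquare() are defined as in the script above.
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)

lats <- seq(startLat, endLat, by = -interval)  # one latitude per output row
all.sums <- foreach(lat = lats, .combine = rbind,
                    .export = "sumInsideSquare") %dopar% {
  calcSumLat(startLng, endLng, lat, interval, data)
}

stopCluster(cl)
write.csv(as.data.frame(all.sums), file = "Levy2014Sums200.csv", row.names = FALSE)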
