Write a matrix of strings to a netCDF file in R

Is it possible to write a matrix of string variables to a netCDF file in R? When I run the following, the R session aborts.
library(ncdf4)
# define a small set of netCDF dimensions
lon3 <- seq(0, 2.0, 0.5)
nlon3 <- length(lon3)
lat3 <- seq(0, 0.5, 0.5)
nlat3 <- length(lat3)
# Make the matrix of strings
id_matrix <- matrix(c("top_extreme_left", "top_left", "top_centre", "top_right", "top_extreme_right", "bottom_extreme_left", "bottom_left", "bottom_centre", "bottom_right", "bottom_extreme_right"), nrow=2, byrow = TRUE)
# create and write NetCDF file
# path and file name, set dname
ncpath <- "C:/"
ncname <- "tiny_test_netcdf"
ncfname <- paste(ncpath, ncname, ".nc", sep="")
# create and write the netCDF file -- ncdf4 version
# define dimensions
londim <- ncdim_def("lon","degrees_east",as.double(lon3))
latdim <- ncdim_def("lat","degrees_north",as.double(lat3))
# dimension for each char in the strings (max string length of 25 characters)
dimchar <- ncdim_def("nchar", "", 1:25, create_dimvar = FALSE)
# define variables
dlname <- "test string matrix data"
id_def <- ncvar_def("op_id", "", list(londim, latdim, dimchar), longname = dlname, prec="char")
# create netCDF file and put arrays
ncout <- nc_create(ncfname, list(id_def), force_v4=TRUE)
# Put the matrix of id strings into the netCDF file.
ncvar_put(ncout, id_def, id_matrix, start=c(1,1,1), count = c(5, 2, 25), verbose = TRUE)
# Close the file to write the data
nc_close(ncout)
I'm attempting to extend the example given in the help for ncvar_put which shows the process for a vector of strings. Any idea how/if it's possible to do the same with a matrix?

I eventually found a way: format the strings to write into a vector first rather than a matrix. You just have to be mindful that the write places the strings into the correct X,Y locations (X varies fastest). Note also that in the working example below the character-length dimension (dimnchar) comes first in the variable's dimension list, whereas in my crashing code above it came last; the string-length dimension appears to need to be the first (fastest-varying) one.
#----------------------------------------------------------------------
# Illustrate creating a character type variable
#----------------------------------------------------------------------
cnames <- c("bottom_extreme_left", "bottom_left", "bottom_centre", "bottom_right", "bottom_extreme_right", "top_extreme_left", "top_left", "top_centre", "top_right", "top_extreme_right")
nstrings <- length(cnames)
max_length <- max(nchar(cnames))
dimX <- ncdim_def( "X", "lon", c(1,2,3,4,5))
dimY <- ncdim_def( "Y", "lat", c(1,2))
#--------------------------------------------------------------
# Make dimensions. Setting "dimnchar" to have a length equal to the max string length in the data
dimnchar <- ncdim_def("nchar", "", 1:max_length, create_dimvar=FALSE )
#------------------------------------------------------------------------
# NOTE in the following call that units is set to the empty string (""),
# which suppresses creation of a units attribute, and the missing value
# is entirely omitted, which suppresses creation of the missing value att
#------------------------------------------------------------------------
varlocations <- ncvar_def("locations", "", list(dimnchar, dimX, dimY),
prec="char" )
# path and file name
ncpath <- "C:/"
ncname <- "tiny_test_netcdf"
ncfname <- paste(ncpath, ncname, ".nc", sep="")
ncid <- nc_create(ncfname, list(varlocations) )
ncvar_put( ncid, "locations", cnames, verbose=TRUE )
nc_close( ncid )
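As a quick sanity check (my addition, a minimal sketch), the strings can be read back with ncvar_get, which collapses the character dimension and returns a 5 x 2 character matrix, so you can confirm each string landed in the intended X,Y cell:
ncid <- nc_open(ncfname)
locs <- ncvar_get(ncid, "locations") # 5 x 2 character matrix
locs[1, 1] # expected: "bottom_extreme_left"
locs[5, 2] # expected: "top_extreme_right"
nc_close(ncid)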

Related

Combining .nc files and extracting selected variables

I have a similar question to u/Ananas here: Sentinel3 OLCI (chl) Average of netcdf files on Python
I am running into similar problems, insomuch as I cannot seem to extract the necessary information from the .nc files and then merge them to create a time series. In my case, I am trying to do this in R. My current code, which I followed and customised from here: https://www.youtube.com/watch?v=jWRszWCVWLc&t=1504s , returns an error:
Error in `[<-.data.frame`(`*tmp*`, variable, value = c(0, 0, 0, 0, 0, :
replacement has 1927 rows, data has 2202561
Maybe I am going at it the wrong way from the start and R's capabilities with .nc files are not suited for this? Any suggestions are welcome.
Here is my code (these functions use the RNetCDF package):
library(RNetCDF)
extract_variable_from_netcdf <- function(nc, variable){
  tryCatch(
    {
      result <- var.get.nc(nc, variable)
      return(result)
    },
    error = function(cond){
      message(paste(variable, "attribute not found"))
      message("Here is the original error message")
      message(cond)
    }
  )
}
extract_global_attribute_from_netcdf <- function(nc, global_attribute){
  tryCatch(
    {
      result <- att.get.nc(nc, "NC_GLOBAL", global_attribute)
      return(result)
    },
    error = function(cond){
      message(paste(global_attribute, "attribute not found"))
      message("Here is the original error message")
      message(cond)
    }
  )
}
folder <- "path to folder"
files <- list.files(folder, pattern = ".nc", full.names = TRUE)
variables <- c("conc_chl", "iop_bpart", "lat", "lon") # variables I need to extract
global_attrs <- c("start_date", "stop_date")
headers <- c(global_attrs, variables)
df <- data.frame(matrix(ncol = length(headers), nrow = 0))
colnames(df) <- headers
for(file in files) {
  nc <- open.nc(file)
  chl <- var.get.nc(nc, "conc_chl")
  num_chl <- length(chl)
  newdf <- data.frame(matrix(ncol = length(headers), nrow = num_chl))
  colnames(newdf) <- headers
  for (global_attribute in global_attrs) {
    newdf[global_attribute] <- extract_global_attribute_from_netcdf(nc, global_attribute)
  }
  for (variable in variables) {
    newdf[variable] <- extract_variable_from_netcdf(nc, variable)
  }
  df <- merge(df, newdf, all = TRUE)
}
The way I have used .nc files with satellite data in R is to read them in with the raster library as raster objects.
library(raster)
r <- raster("your_file.nc")
plot(r) # quick plot to see if everything is as it should be
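Note (my addition): raster("your_file.nc") reads only the first variable it finds; for a multi-variable file such as the Sentinel-3 one, you can pick the variable explicitly, e.g.
r <- raster("your_file.nc", varname = "conc_chl") # choose one variable from the file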
The way I read in my time series was with a loop, and in addition I used a function found somewhere on this site to convert the raster into a sensible R data frame.
# Stack Overflow function to convert the loaded raster to a data frame
gplot_data <- function(x, maxpixels = 50000) {
  x <- raster::sampleRegular(x, maxpixels, asRaster = TRUE)
  coords <- raster::xyFromCell(x, seq_len(raster::ncell(x)))
  ## Extract values
  dat <- utils::stack(as.data.frame(raster::getValues(x)))
  names(dat) <- c('value', 'variable')
  dat <- dplyr::as.tbl(data.frame(coords, dat))
  if (!is.null(levels(x))) {
    dat <- dplyr::left_join(dat, levels(x)[[1]],
                            by = c("value" = "ID"))
  }
  dat
}
Read in one file at a time, convert with the function, and return a data frame:
files <- list.files(folder, pattern = ".nc", full.names = TRUE)
fun <- function(i) {
  # read in one file at a time
  r <- raster(files[i])
  # convert to a normal data frame
  temp <- gplot_data(r)
  temp # output
}
dat <- plyr::rbind.fill(lapply(1:length(files), fun)) # bind each iteration
Here is a plot using ggplot2 and ggforce.
ggplot() +
  geom_tile(data = dat,
            aes(x = x, y = y, fill = value))
Alternatively, if you do not know the contents of your file, the following, from the "ncdf4" package, will help you inspect it: https://towardsdatascience.com/how-to-crack-open-netcdf-files-in-r-and-extract-data-as-time-series-24107b70dcd
library(ncdf4)
our_nc_data <- nc_open("/your_file.nc")
print(our_nc_data)
# look for the variable names and assign them to vectors that can be bound together in dataframes
lat <- ncvar_get(our_nc_data, "lat") #names of latitude column
lon <- ncvar_get(our_nc_data, "lon") #name of longitude column
time <- ncvar_get(our_nc_data, "time") #the time was called time
tunits <- ncatt_get(our_nc_data, "time", "units")# check units
lswt_array <- ncvar_get(our_nc_data, "analysed_sst") #select the relevant variable, this is temperature named "analysed_sst"
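From here, one way to work towards the time-series goal (a sketch I'm adding, assuming the lon/lat/time dimensions and the "analysed_sst" variable read above) is to expand the coordinates against one time slice and bind the per-slice data frames together:
grid <- expand.grid(lon = lon, lat = lat) # lon varies fastest, matching the array layout
one_slice <- data.frame(grid,
                        sst = as.vector(lswt_array[, , 1]),
                        time = time[1])
# repeat per time step (or per file) and bind, e.g.:
# all_slices <- do.call(rbind, lapply(seq_along(time), function(i)
#   data.frame(grid, sst = as.vector(lswt_array[, , i]), time = time[i])))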

Downloading multiple jpegs into an array from urls in a data frame

Like the title says, I want to download multiple jpegs into an array from their url in a dataframe. The code I now have is as follows:
# for loop for train set
library(jpeg) # for readJPEG / writeJPEG
for (i in 1:274) {
  theurl <- paste(train[i, c("Icon.URL")], sep = "")
  z <- tempfile()
  download.file(theurl, z, mode = "wb") # download to the directory
  train_image <- readJPEG(z)
  writeJPEG(train_image, paste("imageTrain", i, ".jpg", sep = ""))
  file.remove(z)
}
The 'train_image' object is now a large array with 786432 elements. This means that ultimately only one of the jpegs ends up in it, as each jpeg's dimensions are 512 x 512 x 3 = 786432. What is going wrong in the code such that it overwrites itself each time the for loop runs?
Additional information:
'train' is the data frame containing the urls
I got this code from here:
https://community.rstudio.com/t/access-and-download-images-from-urls-which-are-values-of-a-variable/66534 and
https://community.rstudio.com/t/access-and-download-images-from-urls/94422
I have also tried to use append() in the following way:
for (i in 1:10) {
theurl <- paste(train[i,c("Icon.URL")], sep = "")
z <- tempfile()
download.file(theurl,z,mode = "wb") #Download to the directory
train_img <- readJPEG(z)
train_image <- append(train_image, train_img)
writeJPEG(train_img, paste("imageTrain", i, ".jpg", sep = ""))
file.remove(z)
}
In this way, the images are added to 'train_image', but the result is a large numeric vector instead of an array.
We can create an array with the desired dimensions and use an additional dimension for the picture number.
# example list of urls
urllist <- rep("http://upload.wikimedia.org/wikipedia/commons/9/95/Apollonian_spheres.jpg", 10)
# create an array with the desired dimensions (pics here are 492x492x3),
# including the length of urllist as the last dimension
library(jpeg)
arrays <- array(dim = c(492, 492, 3, length(urllist)))
# loop through urllist and assign each picture to the array
for(i in 1:length(urllist)){
  # download
  z <- tempfile()
  download.file(urllist[i], z, mode = "wb")
  # read
  pic <- readJPEG(z)
  # add to the array
  arrays[, , , i] <- pic
}
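If the image size is not known in advance, a variant (my addition, assuming all images share the first image's dimensions) derives the array shape from the first download:
z <- tempfile()
download.file(urllist[1], z, mode = "wb")
first <- readJPEG(z)
arrays <- array(dim = c(dim(first), length(urllist))) # e.g. 492 x 492 x 3 x n
arrays[, , , 1] <- first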

Read nested folder and file name, export to Excel file

So I am tasked with building an Excel spreadsheet cataloging a drive with various nested folders and files.
This SO question gets me somewhat there, but I am confused about how to get my desired output.
I know that there might be a command to get file info and I can break that into these columns.
Apart from splitting the directories into subdirectory columns, this adaptation of the function from the question's link (Stibu's answer) might be of help.
rfl <- function(path) {
  folders <- list.dirs(path, recursive = FALSE, full.names = FALSE)
  if (length(folders) == 0) {
    files <- list.files(path, full.names = TRUE)
    finfo <- file.info(files)
    Filename <- basename(files)
    FileType <- tools::file_ext(files)
    DateModified <- finfo$mtime
    FullFilePath <- dirname(files)
    size <- finfo$size
    data.frame(Filename, FileType, DateModified, FullFilePath, size)
  } else {
    sublist <- lapply(paste0(path, "/", folders), rfl)
    setNames(sublist, folders)
  }
}
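To flatten the nested list that rfl() returns into a single data frame ready for Excel export (a sketch I'm adding; the helper name and the "I:/Administration" path are my own), something like this should work:
flatten_rfl <- function(x) {
  # a data frame is a leaf; otherwise recurse one level and splice the results
  if (is.data.frame(x)) list(x)
  else unlist(lapply(x, flatten_rfl), recursive = FALSE)
}
catalog <- do.call(rbind, flatten_rfl(rfl("I:/Administration")))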
If you have the full path and file names then you can loop through that and parse it into these columns. You can get more file info with file.info:
files <- c("I:/Administration/Budget/2015-BUDGET DOCUMENT.xlsx",
           "I:/Administration/Budget/2014-2015 Budget/BUDGET DOCUMENT.xlsx")
# files <- list.files("I:", recursive = T, full.names = T) # this could take a while to run
file_info <- vector("list", length(files)) # preallocate a list with one element per file
for (i in seq_along(files)){
  fullpath <- dirname(files[i])
  fullname <- basename(files[i])
  file_ext <- unlist(strsplit(fullname, ".", fixed = T))
  file_meta <- file.info(files[i])[c("size", "mtime")]
  path <- unlist(strsplit(fullpath, "/", fixed = T))[-1]
  file_info[[i]] <- unlist(c(file_ext, file_meta, fullpath, path))
}
l <- lapply(file_info, `length<-`, max(lengths(file_info)))
df <- data.frame(do.call(rbind, l))
names(df) <- c("filename", "extension", "size", "modified", paste0("sub", 1:(ncol(df) - 4)))
rownames(df) <- NULL
df$modified <- as.POSIXct.numeric(as.numeric(df$modified), origin = "1970-01-01")
df$size <- as.numeric(df$size)
If you do not already have the file list, you can search the drive recursively using list.files() with recursive = T: list.files("I:", recursive = T, full.names = T)
Note:
l <- lapply(file_info, `length<-`, max(lengths(file_info))) sets the vector length of each list element to be the same. This is necessary because otherwise when the vectors are stacked with unequal lengths values get recycled. A simple example of this is: rbind(1:3, 1:5)
The output of unlist(c(file_ext, file_meta, fullpath, path)) is a vector and vectors in R are atomic, meaning all elements have to be the same class. That means everything gets converted to character in this case, which is why we have the lines df$modified <- ... and df$size <- ... at the end to convert them to their appropriate type.
If you want to output this data frame to excel check out xlsx::write.xlsx or openxlsx::write.xlsx. If you don't have those libraries installed you'll need to use install.packages() first.
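For example (a one-line sketch; the output file name is my own placeholder):
openxlsx::write.xlsx(df, "file_catalog.xlsx") # writes the catalog data frame to an .xlsx file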
Output
Because these files/locations don't actually exist on my computer there are NA values in the size and date modified fields:
filename extension size modified sub1 sub2 sub3 sub4
1 2015-BUDGET DOCUMENT xlsx NA <NA> I:/Administration/Budget Administration Budget <NA>
2 BUDGET DOCUMENT xlsx NA <NA> I:/Administration/Budget/2014-2015 Budget Administration Budget 2014-2015 Budget

Modify columns in a dataframe by using function

I'm trying to modify my data frame's columns and their positions. I finally found a solution for a single data set, but I want to do the whole process in a function, apply it to all data sets in the directory, and overwrite the real data.
kw <- as.data.frame(matrix(1:11400, ncol = 19)) # sample data (a data frame, so $ works below)
kw <- kw[, !(colnames(kw) %in% c("V18", "V19"))] # remove the last two columns
kw$V18 <- 0 # add new columns with all zero values
kw$V19 <- 0
kw$V1 <- kw$V1 * 1000 # modify the first column of the data frame
kw <- kw[, c(1, 18:19, 2:17)] # reorder the column positions
Let's say I have data sets in the directory:
kw <- read.table("5LSTT-test10.avgm", header = FALSE, fill = FALSE) # example which shows how I read a single data set
`5LSTT-test10.avgm`
...
`5LSTT-test10.avgm`
How can I apply this column-modification process to each data set separately and then overwrite it or make new data?
Edit: output of readLines("5LSTT-test10.avgm", n = 1)
You can see 19 columns; this data has 600 rows.
[1] " 9.0000E-02 0.0000E+00 2.3075E-03 -6.4467E-03 9.9866E-01 9.8648E-02 4.5981E-02 9.8004E-01 1.2359E-01 6.1175E-02 9.7701E-01 8.6662E-02 3.0034E-02 9.7884E-01 7.0891E-02 8.2247E-03 9.8564E-01 -8.7967E-11 4.3105E-02"
With "data.table" you would be able to do something like:
setcolorder(
  fread(yourfile)[, c("V1", "V18", "V19") := list(V1 * 1000, 0, 0)],
  c(1, 18:19, 2:17))
Thus, if you really needed a function, you can do something like:
myFun <- function(infile) {
  require(data.table)
  write.table(
    setcolorder(
      fread(infile)[
        , c("V1", "V18", "V19") := list(V1 * 1000, 0, 0)],
      c(1, 18:19, 2:17)),
    file = gsub("(.*)(\\..*)", "\\1_new\\2", infile),
    row.names = FALSE)
}
You can then use myFun within lapply over a vector of the files you want to read and process.
In other words:
lapply(myListOfFilePaths, myFun)
By default, this function renames (rather than overwrites) your file appending "_new" at the end, but before the extension.
Here is another way.
Read all the files and store them in a list like this:
# list all the files in the directory
files.new = list.files(directory.path, recursive = TRUE, pattern = ".avgm")
# read all the files and store them in a list
file.contents = lapply(paste(directory.path, files.new, sep = "/"), read.table, sep = '\t', header = TRUE)
names(file.contents) = files.new # keep the file names; write.files below relies on a named list
Next you can apply the modifications to each of the data sets in the list, something like this:
outlist = lapply(file.contents, function(x){
  # modifications
  kw <- x[, !(colnames(x) %in% c("V18", "V19"))]
  kw$V18 <- 0
  kw$V19 <- 0
  kw$V1 <- kw$V1 * 1000
  kw <- kw[, c(1, 18:19, 2:17)]
  kw # return the modified data frame
})
and write the modified data into new files using the function below
# function to write files from a list object
write.files = function(modified.list, path){
  outlist = modified.list[sapply(modified.list, function(x) length(x) > 1)]
  sapply(names(outlist), function(x)
    write.table(outlist[[x]], file = paste(path, x, sep = "/"),
                sep = "\t", row.names = FALSE))
}
Writing the data to files
write.files(outlist, "/directory/path")

Creating a list of raster bricks from a multivariate netCDF file

I've been working with the RCP (Representative Concentration Pathway) spatial data. It's a nice gridded dataset in netCDF format. How can I get a list of bricks where each element represents one variable from a multivariate netCDF file (by "variable" I don't mean lat, lon, time, depth, etc.)? Below is what I've tried. I can't post an example of the data, but I've set up the script below to be reproducible if you want to look into it. Questions are obviously welcome... I might not have expressed the language associated with the code smoothly. Cheers.
A: Package requirements
library(sp)
library(maptools)
library(raster)
library(ncdf)
library(rgdal)
library(rasterVis)
library(latticeExtra)
B: Gather data and look at the netCDF file structure
td <- tempdir()
tf <- tempfile(pattern = "fileZ")
download.file("http://tntcat.iiasa.ac.at:8787/RcpDb/download/R85_NOX.zip", tf , mode = 'wb' )
nc <- unzip( tf , exdir = td )
list.files(td)
## Take a look at the netCDF file structure, beyond this I don't use the ncdf package directly
ncFile <- open.ncdf(nc)
print(ncFile)
vars <- names(ncFile$var)[1:12] # I'll try to use these variable names later to make a list of bricks
C: Create a raster brick for one variable. Levels correspond to years
r85NOXene <- brick(nc, lvar = 3, varname = "emiss_ene")
NAvalue(r85NOXene) <- 0
dim(r85NOXene) # [1] 360 720 12
D: Names to faces
data(wrld_simpl) # in maptools
worldPolys <- SpatialPolygons(wrld_simpl@polygons)
cTheme <- rasterTheme(region = rev(heat.colors(20)))
levelplot(r85NOXene,layers = 4,zscaleLog = 10,main = "2020 NOx Emissions From Power Plants",
margin = FALSE, par.settings = cTheme) + layer(sp.polygons(worldPolys))
E: Summarize all grid cells for each year for one variable, "emiss_ene". I want to do this for each variable of the netCDF file I'm working with.
gVals <- getValues(r85NOXene)
dim(gVals)
r85NOXeneA <- sapply(1:12, function(x){
  mat <- matrix(gVals[, x], nrow = 360)
  matfun <- sum(mat, na.rm = TRUE) # other conversions are needed, but not for the question
  return(matfun)
})
F: Another meet and greet. Check out how E looks
library(ggplot2) # loaded here because of masking issues with latticeExtra
years <- c(2000,2005,seq(2010,2100,by=10))
usNOxDat <- data.frame(years=years,NOx=r85NOXeneA)
ggplot(data=usNOxDat,aes(x=years,y=(NOx))) + geom_line() # names to faces again
detach(package:ggplot2, unload=TRUE)
G: Attempt to create a list of bricks, i.e. a list of objects like the one created in part C
brickLst <- lapply(1:12, function(x){
  tmpBrk <- brick(nc, lvar = 3, varname = vars[x])
  NAvalue(tmpBrk) <- 0
  return(tmpBrk)
  # I thought a list of bricks would be a good structure for doing (E) for each netCDF variable.
  # This doesn't break, but it returns all variables in each element of the list.
  # I want one variable in each element of the list.
  # With brick() you can ask for one variable from a netCDF file, as I did in (C).
  # Why can't I loop through the variable names and return one variable for each list element?
})
H: Get rid of the junk you might have downloaded... Sorry
file.remove(dir(td, pattern = "^fileZ",full.names = TRUE))
file.remove(dir(td, pattern = "^R85",full.names = TRUE))
close(ncFile)
Your (E) step can be simplified using cellStats.
foo <- function(x){
  b <- brick(nc, lvar = 3, varname = x)
  NAvalue(b) <- 0
  cellStats(b, 'sum')
}
sumLayers <- sapply(vars, foo)
sumLayers is the result you are looking for, if I understood your question correctly.
Moreover, you may use the zoo package because you are dealing with time series.
library(zoo)
tt <- getZ(r85NOXene)
z <- zoo(sumLayers, tt)
xyplot(z)
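As for the list of bricks itself (step G), the same idea extends naturally (a sketch along the lines of foo above, assuming brick() resolves each name in vars just as it did for "emiss_ene" in step C):
getBrick <- function(x){
  b <- brick(nc, lvar = 3, varname = x)
  NAvalue(b) <- 0
  b
}
brickLst <- setNames(lapply(vars, getBrick), vars) # one brick per variable
sumLayers <- sapply(brickLst, cellStats, 'sum') # same per-variable sums as above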
