Automate "ncvar_get"-reading of different variable names in NetCDF files - r

I have a NetCDF dataset with two climate scenarios (rcp & hist), both of them containing 25 files. Each file either contains data for the variable "pr", "tas", "tasmax", or "tasmin". I wrote a for loop to iteratively read the files of hist and rcp, read them with nc_open, extract the variable with ncvar_get and finally make a calculation in form of mean(abs(hist - rcp) to obtain the mean absolute distance between each pair of hist and rcp. The problem: as ncvar_get requires the exact variable name of the current file I wrote an if else block (see below) that shall find the variable name of the current file and apply it for ncvar_get. Running the code I obtain the following error:
[1] "vobjtovarid4: error #F: I could not find the requsted var (or dimvar) in the file!"
[1] "var (or dimvar) name: tas"
[1] "file name: /data/historical/tasmax_ICHEC-EC-EARTH_DMI-HIRHAM5_r3i1p1.nc" Error in vobjtovarid4(nc, varid, verbose = verbose, allowdimvar = TRUE) : Variable not found
#Extract of the files in the hist list. Same file names in the rcp list, but different directory
> hist.files.cl <- list.files("/historical", full.names = TRUE)
> hist.files.cl
[1] "/historical/pr_CNRM-CERFACS-CNRM-CM5_ALADIN53_r1i1p1.nc"
[2] "/historical/pr_CNRM-CERFACS-CNRM-CM5_ALARO-0_r1i1p1.nc"
[3] "/historical/pr_ICHEC-EC-EARTH_HIRHAM5_r3i1p1.nc"
[4] "/historical/pr_ICHEC-EC-EARTH_RACMO22E_r12i1p1.nc"
[5] "/historical/pr_ICHEC-EC-EARTH_RCA4_r12i1p1.nc"
[6] "/historical/pr_MPI-M-MPI-ESM-LR_RCA4_r1i1p1.nc"
[7] "/historical/pr_MPI-M-MPI-ESM-LR_REMO2009_r1i1p1.nc"
[8] "/historical/pr_MPI-M-MPI-ESM-LR_REMO2009_r2i1p1.nc"
[9] "/historical/tas_CNRM-CERFACS-CNRM-CM5_CNRM-ALADIN53_r1i1p1.nc"
[10] "/historical/tas_CNRM-CERFACS-CNRM-CM5_RMIB-UGent-ALARO-0_r1i1p1.nc"
[11] "/historical/tas_ICHEC-EC-EARTH_DMI-HIRHAM5_r3i1p1.nc"
[12] "/historical/tas_ICHEC-EC-EARTH_KNMI-RACMO22E_r12i1p1.nc"
[13] "/historical/tas_ICHEC-EC-EARTH_SMHI-RCA4_r12i1p1.nc"
[14] "/historical/tas_MPI-M-MPI-ESM-LR_MPI-CSC-REMO2009_r1i1p1.nc"
[15] "/historical/tas_MPI-M-MPI-ESM-LR_MPI-CSC-REMO2009_r2i1p1.nc"
[16] "/historical/tasmax_ICHEC-EC-EARTH_DMI-HIRHAM5_r3i1p1.nc"
[17] "/historical/tasmax_ICHEC-EC-EARTH_KNMI-RACMO22E_r12i1p1.nc"
[18] "/historical/tasmax_ICHEC-EC-EARTH_SMHI-RCA4_r12i1p1.nc"
euc.distance <- list()
for(i in 1:length(hist.files.cl)) {
#Open ith file in list of hist files as well as in list of rcp files
hist.data <- nc_open(hist.files.cl[i])
rcp.data <- nc_open(rcp.files.cl[i])
if(grepl("pr", hist.data$filename)){
hist.var <- ncvar_get(hist.data, "pr")
rcp.var <- ncvar_get(rcp.data, "pr")
}else if (grepl("tas", hist.data$filename)){
hist.var <- ncvar_get(hist.data, "tas")
rcp.var <- ncvar_get(rcp.data, "tas")
}else if (grepl("tasmax", hist.data$filename)){
hist.var <- ncvar_get(hist.data, "tasmax")
rcp.var <- ncvar_get(rcp.data, "tasmax")
}else{
hist.var <- ncvar_get(hist.data, "tasmin")
rcp.var <- ncvar_get(rcp.data, "tasmin")
}
#Converting temperature variable from K to °C:
if(grepl("tas", hist.data$filename)){
hist.var <- hist.var-273.15
rcp.var <- rcp.var-273.15
}
#Find for the ith rcp file with dim=(1,1,360) in the ith hist file with dim=(385,373,360) the grid point with the best fitting distribution (each grid point consists of a distribution of 360 time steps).The calculation may contain errors...
euc.distance[[i]] <- apply(hist.var, c(1,2), function(x) mean(abs(rcp.var - x)))
min_values <- which(rank(euc.distance[[i]], ties.method='min') <= 10)
}
As cath highlighted the probable cause of the error, but the proposed approach to extract the part of interest (=variable name) from the filename does not work. I before tried to automate the extraction of the variable name by using stringr("filename",startposition, endposition) until I noticed that there is no sense in it, because each variable name (pr, tas, tasmax, tasmin) has another string length. What possibilities do you see for me?
Thank you a lot!

To complete a bit my comment, if you need to operate on each file, you could do it at once, putting everything in a list.
So, first get the "keypart" for each file:
keyparts <- sub("^([a-z]+)_.+", "\\1", basename(hist.files.cl))
keyparts
# [1] "pr" "pr" "pr" "pr" "pr" "pr" "pr" "pr"
# [9] "tas" "tas" "tas" "tas" "tas" "tas" "tas" "tasmax"
#[17] "tasmax" "tasmax"
Then you can use lapply to do what you need to do for every files at once:
my_res <- lapply(seq(keyparts),
function(i){
hist.data <- nc_open(hist.files.cl[i])
rcp.data <- nc_open(rcp.files.cl[i])
hist.var <- ncvar_get(hist.data, keyparts[i])
rcp.var <- ncvar_get(rcp.data, keyparts[i])
if(keyparts[i]=="tas"){
hist.var <- hist.var-273.15
rcp.var <- rcp.var-273.15
}
euc.distance <- apply(hist.var, c(1,2), function(x) mean(abs(rcp.var - x)))
min_values <- which(rank(euc.distance[[i]], ties.method='min') <= 10)
return(list(euc.distance=euc.distance, min.values=min.values))
})

Related

Basic for loop in R

I am trying to implement a for loop for a set of data-frames that I want to write to Excel
AU2<-intersect(A_CN_U$symbol,A_CP_U$symbol)
AD2<-intersect(A_CN_D$symbol,A_CP_D$symbol)
BU2<-intersect(B_CN_U$symbol,B_CP_U$symbol)
BD2<-intersect(B_CN_D$symbol,B_CP_D$symbol)
CU2<-intersect(C_CN_U$symbol,C_CP_U$symbol)
CD2<-intersect(C_CN_D$symbol,C_CP_D$symbol)
tot<- c(AU2, AD2, BU2, BD2, CU2, CD2)
for (i in tot){
print(i)
write_xlsx(i,"/Users/ABC/Desktop/Research/i.xlsx")
}
It returns this:
[1] "TMTC2"
Error in write_xlsx(i, "/Users/abkhan/Desktop/Research/Patel Meningioma/Smoker DEG/01RedoSubmissionMM/i.xlsx") :
Argument x must be a data frame or list of data frames
Outside my failed loop:
> AU2
[1] "TMTC2" "NPB" "GALNT6" "CDCA2" "ABTB1" "C12orf75" "GPR63" NA "ESR1" "NPAS2" "PLAGL1" "C11orf45" "SYNE1" "C16orf74"
[15] "S100A6" "LOXL4" "PLCL1" "KLHL29" "DTX4" "ITGB5" "BCAT1" "CDKN2B" "KANK4" "S1PR2" "AHR" "STAMBPL1" "TRPM3" "TMEM200A"
[29] "BASP1" "AQP5" "THBS2" "ADRA1B" "MGLL" "RIMBP2" "KCNN4" "PROCR" "MXRA5" "CAV1" "GALNT15" "RIMS1" "ELAVL4" "COL4A6"
[43] "FAM189A1" "AMH" "DPP4" "MEGF6" "JPH3" "POU5F1B" "EVA1A" "ABCC2" "PTGES" "CACNG8" "ALK" "VGLL3" "TGM2" "SLC9A2"
[57] "LVRN" "MEGF10" "LMO3" "PRPH" "ATP2B2" "SRPX2" "LUM" "SLC9A4" "CDKN2A" "LGR6" "ALPK2" "C6orf132" "FAP" "ANKRD1"
[71] "LTK" "ASPN" "SLC22A1" "PPL" "LYPD1" "GPR39" "DSC3" "SOX11" "NHLH2" "KRT14" "IGFL2" "GDF6"
Thanks!
Edit:
I realized that my original variable (Au2, AD2 etc) were not data frames
I converted them to dataframes and their names to reflect my mood.
Now I get:
> tot<- c(Atrash, Btrash, Ctrash)
> for (i in tot){
+ print(i)
+ write_xlsx(i,"/Users/abkhan/Desktop/Research/Patel Meningioma/Smoker DEG/01RedoSubmissionMM/i.xlsx")
+ }
[1] "TMTC2" "NPB" "GALNT6" "CDCA2" "ABTB1" "C12orf75" "GPR63" NA "ESR1" "NPAS2" "PLAGL1" "C11orf45" "SYNE1" "C16orf74"
[15] "S100A6" "LOXL4" "PLCL1" "KLHL29" "DTX4" "ITGB5" "BCAT1" "CDKN2B" "KANK4" "S1PR2" "AHR" "STAMBPL1" "TRPM3" "TMEM200A"
[29] "BASP1" "AQP5" "THBS2" "ADRA1B" "MGLL" "RIMBP2" "KCNN4" "PROCR" "MXRA5" "CAV1" "GALNT15" "RIMS1" "ELAVL4" "COL4A6"
[43] "FAM189A1" "AMH" "DPP4" "MEGF6" "JPH3" "POU5F1B" "EVA1A" "ABCC2" "PTGES" "CACNG8" "ALK" "VGLL3" "TGM2" "SLC9A2"
[57] "LVRN" "MEGF10" "LMO3" "PRPH" "ATP2B2" "SRPX2" "LUM" "SLC9A4" "CDKN2A" "LGR6" "ALPK2" "C6orf132" "FAP" "ANKRD1"
[71] "LTK" "ASPN" "SLC22A1" "PPL" "LYPD1" "GPR39" "DSC3" "SOX11" "NHLH2" "KRT14" "IGFL2" "GDF6"
Error in write_xlsx(i, "/Users/ABC/Desktop/Research/i.xlsx") :
Argument x must be a data frame or list of data frames
writexl take the first argument as a:
data frame or named list of data frames that will be sheets in the
xlsx
So you are passing another type of data. You can create a list of your data.frames and iterate within the list to loop each data.frame, here an example:
df1 <- data.frame(a = 1)
df2 <- data.frame(b = 2)
list_of_dfs <- list(df1,df2)
n <- length(list_of_dfs)
for (i in 1:n){
print(i)
write.csv(list_of_dfs[[i]],file = paste0(i,".csv"),row.names = FALSE)
}
list.files(pattern = "*.csv")
[1] "1.csv" "2.csv"

Extraction of data from HDF at different pressure level

I am trying to extract a variable named Data Fields/OzoneTropColumn at a point location (lon=40, lat=34) at different pressure level (825.40198, 681.29102, 464.16000, 316.22699 hPa) from multiple hdf files
library(raster)
library(ncdf4)
library(RNetCDF)
# read file
nc <- nc_open("E:/Ozone/test1.nc")
list_col1 <- as.list(list.files("E:/Ozone/", pattern = "*.hdf",
full.names = TRUE))
> attributes(nc$var) #using a single hdf file to check its variables
$names
[1] "Data Fields/Latitude" "Data Fields/Longitude"
[3] "Data Fields/O3" "Data Fields/O3DataCount"
[5] "Data Fields/O3Maximum" "Data Fields/O3Minimum"
[7] "Data Fields/O3StdDeviation" "Data Fields/OzoneTropColumn"
[9] "Data Fields/Pressure" "Data Fields/TotColDensDataCount"
[11] "Data Fields/TotColDensMaximum" "Data Fields/TotColDensMinimum"
[13] "Data Fields/TotColDensStdDeviation" "Data Fields/TotalColumnDensity"
[15] "HDFEOS INFORMATION/StructMetadata.0" "HDFEOS INFORMATION/coremetadata"
> pres <- ncvar_get(nc, "Data Fields/Pressure") #looking at pressure level from single file of hdf
> pres
[1] 825.40198 681.29102 464.16000 316.22699 215.44400 146.77901 100.00000 68.12950 46.41580 31.62290
[11] 21.54430 14.67800 10.00000 6.81291 4.64160
ncin <- raster::stack(list_col1,
varname = "Data Fields/OzoneTropColumn",
ncdf=TRUE)
#cannot extract using the following code
o3 <- ncvar_get(list_col1,attributes(list_col1$var)$names[9])
"Error in ncvar_get(list_col1, attributes(list_col1$var)$names[9]) :
first argument (nc) is not of class ncdf4!"
#tried to extract pressure levels
> prsr <- raster::stack(list_col1,varname = "Data Fields/Pressure",ncdf=TRUE)
"Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'stack': varname: Data Fields/Pressure does not exist in the file. Select one from:
Data Fields/O3, Data Fields/O3DataCount, Data Fields/O3Maximum, Data Fields/O3Minimum, Data Fields/O3StdDeviation, Data Fields/OzoneTropColumn, Data Fields/TotColDensDataCount, Data Fields/TotColDensMaximum, Data Fields/TotColDensMinimum, Data Fields/TotColDensStdDeviation, Data Fields/TotalColumnDensity"
#tried using index
#Point location can also be written as below 1 deg by 1 deg resolution
lonIdx <- which(lon >32 & lon <36)
latIdx <- which(lat >38 & lat <42)
presIdx <- which(pres >= 400 & pres <= 900)
#also tried
# Option 2 -- subset using array indexing
o3 <- ncvar_get(list_col1,'Data Fields/OzoneTropColumn')
"Error in ncvar_get(list_col1, "Data Fields/OzoneTropColumn") :
first argument (nc) is not of class ncdf4!"
extract2 <- o3[lonIdx, latIdx, presIdx, ]
How to I extract these values vertically at each pressure level ? (SM=Some value)
I would like the output in following way at location (lon=40, lat=34):
Pressure 1 2 3 4 5 .... 10
825.40198 SM1 SM2 SM3 SM4 SM5... SM10
681.29102 SM11 SM12
464.16000
316.22699 SM.. SM.. SM.. SM.. SM.. SM..
Appreciate any help.
Thank you
This might be an issue with how netcdf4 and raster name each of the layers in the file. And perhaps some confusion with trying to create a multilayer object from multiple ncdf at once.
I would do the following, using only raster: Load a single netCDF, using stack() or brick(). This will load the file as a multilayer object in R. Use names() to identify what is the name of the Ozone layer according to the raster package.
firstraster <- stack("E:/Ozone/test1.nc")
names(firstraster)
Once you find out the name, you can just execute a reading of all objects as stack(), extract the information on points of interest, without even assembling all layers in a single stack.
Ozonelayername <- "put name here"
files <- list.files("E:/Ozone/", pattern = "*.hdf", full.names = TRUE)
stacklist <- lapply(files, stack)
Ozonelayerlist <- lapply(stacklist, "[[", "Ozonelayername")
The line above will output a list of rasters objects (not stacks or bricks, just plain rasters), with only the layer you want.
Now we just need to execute an extract on each of these layers. sapply() will format that neatly in a matrix for us.
pointsofinterest <- expand.grid(32:36,38:42)
values <- sapply(Ozonelayerlist, extract, pointsofinterest)
I can test it, since I do not have the data, but I assume this would work.

stack files of various folders using R

I stacked certain tif files of a Landsat image, as is shown:
setwd("C:/Users/Landsat/L5__002072-09MAY-2006")
may2006<-list.files(".",pattern="*B[123457]\\.tif$", ignore.case=TRUE)
[1] "LT05_L1TP_002072_20060509_20161121_01_T1_B1.TIF"
[2] "LT05_L1TP_002072_20060509_20161121_01_T1_B2.TIF"
[3] "LT05_L1TP_002072_20060509_20161121_01_T1_B3.TIF"
[4] "LT05_L1TP_002072_20060509_20161121_01_T1_B4.TIF"
[5] "LT05_L1TP_002072_20060509_20161121_01_T1_B5.TIF"
[7] "LT05_L1TP_002072_20060509_20161121_01_T1_B7.TIF"
landsat_stack <- stack(may2006)
I want to do the same, but for all the images of the folder Landsat (each folder been a separate stack )
setwd("C:/Users/Landsat")
foldersList <- normalizePath(list.dirs(full.names = TRUE, recursive = FALSE))
[1] "C:\\Users\\Landsat\\L5__002072-09MAY-2006"
[2] "C:\\Users\\Landsat\\L5_001073_02MAY-2006"
[3] "C:\\Users\\Landsat\\L5_001073_14MAY-1987"
[4] "C:\\Users\\Landsat\\L8__002072-7MAY-2017"
Is it possible to do this simultaneously for all the images?
I thought in first do one list with all tif files(no matter the folder), and then with a loop stacks only the files that have a match in the name (condition 1), but finish with this pattern "B[123457]"(condition 2)
all_Landsat<-list.files(".",pattern="*B[123457]\\.tif$", ignore.case=TRUE, recursive= TRUE)
all_Landsat
[1] "L5__002072-09MAY-2006/LT05_L1TP_002072_20060509_20161121_01_T1_B1.TIF"
[2] "L5__002072-09MAY-2006/LT05_L1TP_002072_20060509_20161121_01_T1_B2.TIF"
[3] "L5__002072-09MAY-2006/LT05_L1TP_002072_20060509_20161121_01_T1_B3.TIF"
[4] "L5__002072-09MAY-2006/LT05_L1TP_002072_20060509_20161121_01_T1_B4.TIF"
[5] "L5__002072-09MAY-2006/LT05_L1TP_002072_20060509_20161121_01_T1_B5.TIF"
[6] "L5__002072-09MAY-2006/LT05_L1TP_002072_20060509_20161121_01_T1_B7.TIF"
[7] "L5_001073_02MAY-2006/LT05_L1TP_001073_20060502_20161122_01_T1_B1.TIF"
[8] "L5_001073_02MAY-2006/LT05_L1TP_001073_20060502_20161122_01_T1_B2.TIF"
[9] "L5_001073_02MAY-2006/LT05_L1TP_001073_20060502_20161122_01_T1_B3.TIF"
[10]"L5_001073_02MAY-2006/LT05_L1TP_001073_20060502_20161122_01_T1_B4.TIF"
[11]"L5_001073_02MAY-2006/LT05_L1TP_001073_20060502_20161122_01_T1_B5.TIF"
[12]"L5_001073_02MAY-2006/LT05_L1TP_001073_20060502_20161122_01_T1_B7.TIF"
[13]"L5_001073_14MAY-1987/LM50010731987134AAA03_B1.TIF"
[14]"L5_001073_14MAY-1987/LM50010731987134AAA03_B2.TIF"
[15]"L5_001073_14MAY-1987/LM50010731987134AAA03_B3.TIF"
[16]"L5_001073_14MAY-1987/LM50010731987134AAA03_B4.TIF"
[17]"L8__002072-7MAY-2017/LC08_L1TP_002072_20170507_20170515_01_T1_B1.TIF"
[18]"L8__002072-7MAY-2017/LC08_L1TP_002072_20170507_20170515_01_T1_B2.TIF"
[19]"L8__002072-7MAY-2017/LC08_L1TP_002072_20170507_20170515_01_T1_B3.TIF"
[20]"L8__002072-7MAY-2017/LC08_L1TP_002072_20170507_20170515_01_T1_B4.TIF"
[21]"L8__002072-7MAY-2017/LC08_L1TP_002072_20170507_20170515_01_T1_B5.TIF"
[22]"L8__002072-7MAY-2017/LC08_L1TP_002072_20170507_20170515_01_T1_B7.TIF"
But I can´t find the right code for the 2 conditions:
for (i in all_Landsat){
if (grep(pattern="+B[123457]\\.tif$", ignore.case=FALSE)){
stack(i)
}
}
I haven't checked this but hope this works:
setwd("C:/Users/Landsat")
a<-list.dirs(getwd(),recursive = FALSE )
flist <- list()
stackfile <- list()
for (i in 1:length(a)){
flist[[i]] <- list.files(a[i], recursive = TRUE, full.names = TRUE, pattern = "tif$")
stackfile[[i]] <- stack(flist[[i]])
}
Thanks #anup. I finally solved the problem with this code. It returns a list of the TIF images stacked by folder.
setwd("C:/Users/Landsat")
a<-list.dirs(getwd(),recursive = FALSE )
landsat<- apply(a,function (dir){
img<-stack(list.files(path=dir,ignore.case= TRUE,
pattern="*B[123457]\\.tif$", full.names= TRUE))
})

Changing the values in txt file using R r

I want to write a softcode that will change the values of required parameters of my Hydrological model input txt file. some parametres are fixed and some I will change to the observed values. For exampple HYDRUS_Version=4 is fixed and WaterFlow=1 and SoluteTransport=0 etc are the parametres that I want to change their values. I want to assign the values of waterFlow=5 or WaterFlow=3.1 and SoluteTransport=2 or =2.2
I tried this code for water Flow but the values in my txt files are not changed. I am just new learner of R.
lines <- readLines("G:/Rlearning/HYDRUS1D.txt")
library(gsubfn)
i1 <- grepl("Vertical Conductivity", lines)
lines[i1] <- gsubfn("[0-9.]+", ~format(as.numeric(x)*2,
scientific = FALSE), lines[i1])
Below is the complete txt file.
;
[Main]
HYDRUS_Version=4
WaterFlow=1
SoluteTransport=0
Unsatchem=0
Unsatchem=0
HP1=0
HeatTransport=0
EquilibriumAdsorption=1
MobileImmobile=0
RootWaterUptake=1
RootGrowth=0
MaterialNumbers=1
SubregionNumbers=1
SpaceUnit=cm
TimeUnit=days
PrintTimes=160
NumberOfSolutes=0
InitialCondition=1
;
[Profile]
NumberOfNodes=101
ProfileDepth=1.2E+02
ObservationNodes=5
GridVisible=1
SnapToGrid=1
ProfileWidth=80
LeftMargin=40
GridOrgX=0
GridOrgY=0
GridDX=5.E+00
GridDY=5.E+00
Create a custom function to search matching string and replace value:
replaceFn <- function(phrase, value, file){
line <- file[grep(phrase, file)]
value.pos <- as.numeric(gregexpr("=", line))
file[grep(phrase, file)] <- paste0(substring(line, 1, value.pos), value)
return(file)
}
dat <- replaceFn("WaterFlow", 3.1, dat)
dat <- replaceFn("SoluteTransport", 2.2, dat)
[1] ";" "[Main]" "HYDRUS_Version=4" "WaterFlow=3.1" "SoluteTransport=2.2" "Unsatchem=0"
[7] "Unsatchem=0" "HP1=0" "HeatTransport=0" "EquilibriumAdsorption=1" "MobileImmobile=0" "RootWaterUptake=1"
[13] "RootGrowth=0" "MaterialNumbers=1" "SubregionNumbers=1" "SpaceUnit=cm" "TimeUnit=days" "PrintTimes=160"
[19] "NumberOfSolutes=0" "InitialCondition=1" ";" "[Profile]" "NumberOfNodes=101" "ProfileDepth=1.2E+02"
[25] "ObservationNodes=5" "GridVisible=1" "SnapToGrid=1" "ProfileWidth=80" "LeftMargin=40" "GridOrgX=0"
[31] "GridOrgY=0" "GridDX=5.E+00" "GridDY=5.E+00"
Input data:
dat <- c(";", "[Main]", "HYDRUS_Version=4", "WaterFlow=1", "SoluteTransport=0",
"Unsatchem=0", "Unsatchem=0", "HP1=0", "HeatTransport=0", "EquilibriumAdsorption=1",
"MobileImmobile=0", "RootWaterUptake=1", "RootGrowth=0", "MaterialNumbers=1",
"SubregionNumbers=1", "SpaceUnit=cm", "TimeUnit=days", "PrintTimes=160",
"NumberOfSolutes=0", "InitialCondition=1", ";", "[Profile]",
"NumberOfNodes=101", "ProfileDepth=1.2E+02", "ObservationNodes=5",
"GridVisible=1", "SnapToGrid=1", "ProfileWidth=80", "LeftMargin=40",
"GridOrgX=0", "GridOrgY=0", "GridDX=5.E+00", "GridDY=5.E+00")

How to parse xml/sbml with R package xml?

I'm trying to parse information from the sbml/xml file below
https://dl.dropboxusercontent.com/u/10712588/file.xml
from this code
http://search.bioconductor.jp/codes/11172
It seems that I can import the file normally by
doc <- xmlTreeParse(filename,ignoreBlanks = TRUE)
but I can't recover node attributes by
atrr <- xpathApply(doc, "//species[#id]", xmlGetAttr, "id")
or
xpathApply(doc, "//species", function(n) xmlValue(n[[2]]))
A node of the file follows...
<species id="M_10fthf_m" initialConcentration="1" constant="false" hasOnly
SubstanceUnits="false" name="10-formyltetrahydrofolate(2-)" metaid="_metaM_10fth
f_m" boundaryCondition="false" sboTerm="SBO:0000247" compartment="m">
<notes>
<body xmlns="http://www.w3.org/1999/xhtml">
<p>FORMULA: C20H21N7O7</p>
<p>CHARGE: -2</p>
<p>INCHI: InChI=1S/C20H23N7O7/c21-20-25-16-15(18(32)26-20)23-11(7-22
-16)8-27(9-28)12-3-1-10(2-4-12)17(31)24-13(19(33)34)5-6-14(29)30/h1-4,9,11,13,23
H,5-8H2,(H,24,31)(H,29,30)(H,33,34)(H4,21,22,25,26,32)/p-2/t11-,13+/m1/s1</p>
<p>HEPATONET_1.0_ABBREVIATION: HC00212</p>
<p>EHMN_ABBREVIATION: C00234</p>
</body>
</notes>
<annotation>
...
I would like to retrieve all information inside species node, anyone know how to do that?
There exists an SBML parsing library libSBML (http://sbml.org/Software/libSBML).
This includes a binding to R that would allow access to the SBML objects directly within R using code similar to
document = readSBML(filename);
errors = SBMLErrorLog_getNumFailsWithSeverity(
SBMLDocument_getErrorLog(document),
enumToInteger("LIBSBML_SEV_ERROR", "_XMLErrorSeverity_t")
);
if (errors > 0) {
cat("Encountered the following SBML errors:\n");
SBMLDocument_printErrors(document);
q(status=1);
}
model = SBMLDocument_getModel(document);
if (is.null(model)) {
cat("No model present.\n");
q(status=1);
}
species = Model_getSpecies(model, index_of_species);
id = Species_getId(species);
conc = Species_getInitialConcentration(species)
There is a Species_get(NameOfAttribute) function for each possible attribute; together with Species_isSet(NameOfAttribute); Species_set(NameOfAttribute) and Species_unset(NameOfAttribute).
The API is similar for interacting with any SBML element.
The libSBML releases include R installers that are available from
http://sourceforge.net/projects/sbml/files/libsbml/5.8.0/stable
navigating to the R_interface subdirectory for the OS and architecture of your choice.
The source code distribution of libSBML contains an examples/r directory with many examples of using libSBML to interact with SBML in the R environment.
I guess it depends on what you mean when you say you want to "retrieve" all the information in the species nodes, because that retrieved data could be coerced to any number of different formats. The following assumes you want it all in a data frame, where each row is an species node from your XML file and the columns represent different pieces of information.
When just trying to extract information, I generally find it easier to work with lists than with XML.
doc <- xmlTreeParse(xml_file, ignoreBlanks = TRUE)
doc_list <- xmlToList(doc)
Once it's in a list, you can figure out where the species data is stored:
sapply(x, function(x)unique(names(x)))
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
[1] "species"
[[5]]
[1] "reaction"
[[6]]
[1] "metaid"
$.attrs
[1] "level" "version"
So you really only want the information in doc_list[[4]]. Take a look at just the first component of doc_list[[4]]:
str(doc_list[[4]][[1]])
List of 9
$ : chr "FORMULA: C20H21N7O7"
$ : chr "CHARGE: -2"
$ : chr "HEPATONET_1.0_ABBREVIATION: HC00212"
$ : chr "EHMN_ABBREVIATION: C00234"
$ : chr "http://identifiers.org/obo.chebi/CHEBI:57454"
$ : chr "http://identifiers.org/pubchem.compound/C00234"
$ : chr "http://identifiers.org/hmdb/HMDB00972"
$ : Named chr "#_metaM_10fthf_c"
..- attr(*, "names")= chr "about"
$ .attrs: Named chr [1:9] "M_10fthf_c" "1" "false" "false" ...
..- attr(*, "names")= chr [1:9] "id" "initialConcentration" "constant" "hasOnlySubstanceUnits" ...
So you have the information contained in the first eight lists, plus the information contained in the attributes.
Getting the attributes information is easy because it's already named. The following formats the attributes information into a data frame for each node:
doc_attrs <- lapply(doc_list[[4]], function(x) {
x <- unlist(x[names(x) == ".attrs"])
col_names <- gsub(".attrs.", "", names(x))
x <- data.frame(matrix(x, nrow = 1), stringsAsFactors = FALSE)
colnames(x) <- col_names
x
})
Some nodes didn't appear to have attributes information and so returned empty data frames. That caused problems later so I created data frames of NAs in their place:
doc_attrs_cols <- unique(unlist(sapply(doc_attrs, colnames)))
doc_attrs[sapply(doc_attrs, length) == 0] <-
lapply(doc_attrs[sapply(doc_attrs, length) == 0], function(x) {
df <- data.frame(matrix(rep(NA, length(doc_attrs_cols)), nrow = 1))
colnames(df) <- doc_attrs_cols
df
})
When it came to pulling non-attribute data, the names and values of the variables were generally contained within the same string. I originally tried to come up with a regular expression to extract the names, but they're all formatted so differently that I gave up and just identified all the possibilities in this particular data set:
flags <- c("FORMULA:", "CHARGE:", "HEPATONET_1.0_ABBREVIATION:",
"EHMN_ABBREVIATION:", "obo.chebi/CHEBI:", "pubchem.compound/", "hmdb/HMDB",
"INCHI: ", "kegg.compound/", "kegg.genes/", "uniprot/", "drugbank/")
Also, sometimes the non-attribute information was kept as just a list of values, as in the node I showed above, while other times it was contained in "notes" and "annotation" sublists, so I had to include an if else statement to make things more consistent.
doc_info <- lapply(doc_list[[4]], function(x) {
if(any(names(x) != ".attrs" & names(x) != "")) {
names(x)[names(x) != ".attrs"] <- ""
x <- unlist(do.call("c", as.list(x[names(x) != ".attrs"])))
} else {
x <- unlist(x[names(x) != ".attrs"])
}
x <- gsub("http://identifiers.org/", "", x)
need_names <- names(x) == ""
names(x)[need_names] <- gsub(paste0("(", paste0(flags, collapse = "|"), ").+"), "\\1", x[need_names], perl = TRUE)
#names(x) <- gsub("\\s+", "", names(x))
x[need_names] <- gsub(paste0("(", paste0(flags, collapse = "|"), ")(.+)"), "\\2", x[need_names], perl = TRUE)
col_names <- names(x)
x <- data.frame(matrix(x, nrow = 1), stringsAsFactors = FALSE)
colnames(x) <- col_names
x
})
To get everything together into a data frame, I suggest the plyr package's rbind.fill.
require(plyr)
doc_info <- do.call("rbind.fill", doc_info)
doc_attrs <- do.call("rbind.fill", doc_attrs)
doc_all <- cbind(doc_info, doc_attrs)
dim(doc_all)
[1] 3972 22
colnames(doc_all)
[1] "FORMULA:" "CHARGE:" "HEPATONET_1.0_ABBREVIATION:" "EHMN_ABBREVIATION:"
[5] "obo.chebi/CHEBI:" "pubchem.compound/" "hmdb/HMDB" "about"
[9] "INCHI: " "kegg.compound/" "kegg.genes/" "uniprot/"
[13] "drugbank/" "id" "initialConcentration" "constant"
[17] "hasOnlySubstanceUnits" "name" "metaid" "boundaryCondition"
[21] "sboTerm" "compartment"
As a partial answer, the document uses name spaces, and 'species' is part of the 'id' name space. So
> xpathSApply(doc, "//id:species", xmlGetAttr, "id", namespaces="id")
[1] "M_10fthf_c" "M_10fthf_m" "M_13dampp_c" "M_h2o_c" "M_o2_c"
[6] "M_bamppald_c" "M_h2o2_c" "M_nh4_c" "M_h_m" "M_nadph_m"
...
with id:species and namespaces="id" being different from what you illustrate above.

Resources