Generate a xml from a R list - r

I'm new to xml and processing it in R.
I've been able to read and retrieve info from xml files using the xml2 package, but creating xml files from R objects has proven to be more challenging.
In particular, I'd like to generate a xml file from a R list. Consider the example below:
library(reprex)
library(xml2)
r_list <- list(person1 = list(starts = letters[1:3], ends = letters[4:6]), person2 = list(starts = LETTERS[1:4], ends = LETTERS[5:8]))
str(r_list)
#> List of 2
#> $ person1:List of 2
#> ..$ starts: chr [1:3] "a" "b" "c"
#> ..$ ends : chr [1:3] "d" "e" "f"
#> $ person2:List of 2
#> ..$ starts: chr [1:4] "A" "B" "C" "D"
#> ..$ ends : chr [1:4] "E" "F" "G" "H"
test1 <- xml2::as_xml_document((r_list))
#> Error: Root nodes must be of length 1
new_xml <- xml_new_root(.value = "category", name = "personList")
for(person in names(r_list)){
xml_add_child(new_xml, as_xml_document(r_list[person]))
}
new_xml
#> {xml_document}
#> <category name="personList">
#> [1] <person1>ad</person1>
#> [2] <person2>AE</person2>
Created on 2021-11-25 by the reprex package (v2.0.1)
I tried to directly coerce the list to xml using the as_xml_document function, but I get the error Root nodes must be of length 1.
Following the idea on this question, I tried to create the xml document with a root node and xml_add_child() to this document, but I did not get the expected result (see code output). In that question, they transform from an R data frame and not a list.
I'd also like to have personalized tag names and add attributes to these tags. The wished output would be:
<category name="personList">
<pers name="person1">
<starts>
<value>a</value>
<value>b</value>
<value>c</value>
</starts>
<ends>
<value>d</value>
<value>e</value>
<value>f</value>
</ends>
</pers>
<pers name="person2">
<starts>
<value>A</value>
<value>B</value>
<value>C</value>
<value>D</value>
</starts>
<ends>
<value>E</value>
<value>F</value>
<value>G</value>
<value>H</value>
</ends>
</pers>
</category>
Thanks for your help and have a nice day

R list attributes can be mapped to XML attributes:
# Build the <category> document one <pers> node at a time with xml2.
library(xml2)
library(tidyverse)

r_list <- list(
  person1 = list(starts = letters[1:3], ends = letters[4:6]),
  person2 = list(starts = LETTERS[1:4], ends = LETTERS[5:8])
)
r_list

# Wrap each entry of a vector as list(value = list(entry)); as_xml_document()
# then emits one <value> child per entry.
as_value_nodes <- function(entries) {
  map(entries, ~ list(value = list(.x)))
}

new_xml <- xml_new_root(.value = "category", name = "personList")
for (person in names(r_list)) {
  fields <- r_list[[person]]
  pers_node <- list(
    starts = as_value_nodes(fields$starts),
    ends = as_value_nodes(fields$ends)
  )
  # The R attribute "name" on the list is written as the name="..." XML
  # attribute of <pers>.
  attr(pers_node, "name") <- person
  xml_add_child(new_xml, as_xml_document(list(pers = pers_node)))
}
write_xml(new_xml, "foo.xml")
output:
<?xml version="1.0" encoding="UTF-8"?>
<category name="personList">
<pers name="person1">
<starts>
<value>a</value>
<value>b</value>
<value>c</value>
</starts>
<ends>
<value>d</value>
<value>e</value>
<value>f</value>
</ends>
</pers>
<pers name="person2">
<starts>
<value>A</value>
<value>B</value>
<value>C</value>
<value>D</value>
</starts>
<ends>
<value>E</value>
<value>F</value>
<value>G</value>
<value>H</value>
</ends>
</pers>
</category>

Following the comment by @Limey (which pointed to this question), I could generate the desired output with the following code (posted as an answer just for completeness, as @danlooo's answer also produces the same output).
# Build the same document with the XML package by creating nodes explicitly.
library(XML)

r_list <- list(
  person1 = list(starts = letters[1:3], ends = letters[4:6]),
  person2 = list(starts = LETTERS[1:4], ends = LETTERS[5:8])
)
str(r_list)

# Root element: <category name="personList">
category <- newXMLNode("category", attrs = c(name = "personList"))

for (person in names(r_list)) {
  pers <- newXMLNode("pers", attrs = c(name = person), parent = category)
  # <starts> is created before <ends>, each holding one <value> per entry.
  for (field in c("starts", "ends")) {
    field_node <- newXMLNode(field, parent = pers)
    for (entry in r_list[[person]][[field]]) {
      newXMLNode("value", entry, parent = field_node)
    }
  }
}
category

Related

Namespace without prefix in XML in R

In the XML package in R, it is possible to create a new xmlTree object with a namespace, e.g. using:
library(XML)
d = xmlTree("foo", namespaces = list(prefix = "url"))
d$doc()
# <?xml version="1.0"?>
# <foo xmlns:prefix="url"/>
How do I create a default namespace, i.e. one without a prefix, such that it looks like the following?
# <?xml version="1.0"?>
# <foo xmlns="url"/>
The following does not produce what I expected.
library(XML)
d = xmlTree("foo", namespaces = list("url"))
d$doc()
# <?xml version="1.0"?>
# <url:foo xmlns:url="<dummy>"/>
There seems to be a difference between nameless lists and lists with an empty name in R.
1 - A nameless list:
list("url")
# [[1]]
# [1] "url"
names(list("url"))
# NULL
2 - A named list:
list(prefix = "url")
# $prefix
# [1] "url"
names(list(prefix = "url"))
# [1] "prefix"
3 - An incorrectly initialised empty-name list:
list("" = "url")
# Error: attempt to use zero-length variable name
4 - A hacky way to initialise an empty-name list:
setNames(list(prefix = "url"), "")
# [[1]]
# [1] "url"
names(setNames(list(prefix = "url"), ""))
# [1] ""
It would seem 1. and 4. are identical, however, in the package XML they produce different results. The first gives the incorrect XML as mentioned in the OP, whereas option 4. produces:
library(XML)
# Giving the namespace entry an empty-string name produces a default
# (prefix-less) xmlns declaration on the root node.
d <- xmlTree("foo", namespaces = setNames(list(prefix = "url"), ""))
d$doc()
# <?xml version="1.0"?>
# <foo xmlns="url"/>

pull all elements with specific name from a nested list

I have some archived Slack data from which I am trying to extract some key message properties. I had done this by naively flattening the entire list, getting a data.frame or tibble with lists nested in some cells. As this dataset gets bigger, I want to pick elements out of this list more smartly, so that when this cache becomes big it doesn't take forever to create the data.frame or tibble with the elements I want.
Example where I am trying to pull everything named "type" below into a vector or flat list that I can pull in as a dataframe variable. I named the folder and message level for convenience. Anyone have model code that can help?
library(tidyverse)
l <- list(folder_1 = list(
`msg_1-1` = list(type = "message",
subtype = "channel_join",
ts = "1585771048.000200",
user = "UFUNNF8MA",
text = "<#UFUNNF8MA> has joined the channel"),
`msg_1-2` = list(type = "message",
subtype = "channel_purpose",
ts = "1585771049.000300",
user = "UNFUNQ8MA",
text = "<#UNFUNQ8MA> set the channel purpose: Talk about xyz")),
folder_2 = list(
`msg_2-1` = list(type = "message",
subtype = "channel_join",
ts = "1585771120.000200",
user = "UQKUNF8MA",
text = "<#UQKUNF8MA> has joined the channel"))
)
# gets a specific element
print(l[[1]][[1]][["type"]])
# tried to get all elements named "type", but am not at the right list level to do so
print(purrr::map(l, "type"))
As OP mentioned, this can solve the issue:
#Code
# Flatten the nested list once, then keep the entries whose flattened name
# contains the literal text ".type" (fixed = TRUE disables regex matching).
flat <- unlist(l)
flat[grepl(".type", names(flat), fixed = TRUE)]
Output:
folder_1.msg_1-1.type folder_1.msg_1-2.type folder_2.msg_2-1.type
"message" "message" "message"
Other options are (many thanks and credit to @Abdessabour Mtk):
#Code1
purrr::map(l, ~ purrr::map(.x, "type"))
Depending on the desired output, I would probably use a simple recursive function here.
# Recursively search a nested list for entries named `element`.
#
# x:       a (possibly nested) list; anything that is not a list yields NULL.
# element: the name to look for.
#
# At each level, if `element` is among the names of `x`, that entry is
# returned and the search does not descend further on that branch; otherwise
# every child is searched and the results keep the list's shape.
get_elements <- function(x, element) {
  if (!is.list(x)) {
    # Mirrors the value of an `if` without `else`: an invisible NULL.
    return(invisible(NULL))
  }
  found_here <- element %in% names(x)
  if (found_here) {
    return(x[[element]])
  }
  lapply(x, get_elements, element = element)
}
This allows:
get_elements(l, "type")
#> $folder_1
#> $folder_1$`msg_1-1`
#> [1] "message"
#>
#> $folder_1$`msg_1-2`
#> [1] "message"
#>
#>
#> $folder_2
#> $folder_2$`msg_2-1`
#> [1] "message"
Or if you want to get all "users":
get_elements(l, "user")
#> $folder_1
#> $folder_1$`msg_1-1`
#> [1] "UFUNNF8MA"
#>
#> $folder_1$`msg_1-2`
#> [1] "UNFUNQ8MA"
#>
#>
#> $folder_2
#> $folder_2$`msg_2-1`
#> [1] "UQKUNF8MA"
You could obviously unlist the result if you prefer it flattened into a vector.
unlist(get_elements(l, "type"))
#> folder_1.msg_1-1 folder_1.msg_1-2 folder_2.msg_2-1
#> "message" "message" "message"
Another option is to use rrapply() in the rrapply-package:
library(rrapply)
## return unlisted vector
rrapply(l, condition = function(x, .xname) .xname == "type", how = "unlist")
#> folder_1.msg_1-1.type folder_1.msg_1-2.type folder_2.msg_2-1.type
#> "message" "message" "message"
## return melted data.frame
rrapply(l, condition = function(x, .xname) .xname == "type", how = "melt")
#> L1 L2 L3 value
#> 1 folder_1 msg_1-1 type message
#> 2 folder_1 msg_1-2 type message
#> 3 folder_2 msg_2-1 type message
Related to the answers provided by @Duck and @Abdessabour Mtk yesterday: purrr has a function map_depth() that lets you get a named element if you know its name and how deep it is in the hierarchy. This is REALLY useful when crawling these big nested lists, and it is a simpler solution than the nested map() calls above.
purrr::map_depth(l, 2, "type")

LIST to data.frame in XML file

I am working on XML files and I am trying to transform them into data.frame. However, during the transformation process the file is “LIST”, as seen below:
My Code:
require(tidyverse)
require(xml2)
page<-read_xml('<?xml version="1.0" encoding="ISO-8859-1" ?>
<test2:TASS xmlns="http://www.vvv.com/schemas"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.vvv.com/schemas http://www.vvv.com/schemas/testV2_02_03.xsd"
xmlns:test2="http://www.vvv.com/schemas" >
<test2:billing>
<test2:proceduresummary>
<test2:guidenumber>Z4088</test2:guidenumber>
<test2:diagnosis>
<test2:table>ICD-10</test2:table>
<test2:diagnosiscod>G93</test2:diagnosiscod>
<test2:description>DISORDER OF BRAIN, UNSPECIFIED</test2:description>
</test2:diagnosis>
<test2:procedure>
<test2:procedure>
<test2:description>HOSPITAL</test2:description>
</test2:procedure>
<test2:amount>15</test2:amount>
</test2:procedure>
</test2:proceduresummary>
</test2:billing>
</test2:TASS>')
t1<-if ("test2" %in% names(xml_ns(page))) {
ns<-xml_ns_rename(xml_ns(page), test2 = "test")
} else {
ns<- xml_ns(page)
}
MYFILE<- ifelse(names(xml_ns(page)) %in% "d1",
page %>% xml_find_all(".//d1:billing"),
page %>% xml_find_all(".//test:billing", ns))
MYFILE<-xml2::as_list(MYFILE) %>% jsonlite::toJSON() %>% jsonlite::fromJSON()
My "LIST"
**List of 1
$ :List of 2
..$ node:<externalptr>
..$ doc :<externalptr>
..- attr(*, "class")= chr "xml_node"**
I'm using the code below to transform it, but it's giving an error:
MYFILE <- xml2 :: as_list (MYFILE)%>% jsonlite :: toJSON ()%>% jsonlite :: fromJSON ()
This is the error.
Error in UseMethod("as_list") :
no applicable method for 'as_list' applied to an object of class "list"
How do I turn it into data.frame/tibble?
It looks like the ifelse statement is causing the file to be parsed three times. This is causing a problem. If you need this line try this instead ifelse("d1" %in% names(xml_ns(page)), . . .
This script works on the above sample. If there are more than 1 billing node then part of the below script will need modification. I highlighted that in the comments.
# Resolve the namespace map. When the document declares the "test2" prefix,
# rename it to "test" so the XPath below can use a fixed prefix.
ns <- xml_ns(page)
if ("test2" %in% names(ns)) {
  ns <- xml_ns_rename(ns, test2 = "test")
}
# The original script also stored this value in t1; keep it available.
t1 <- ns

# Choose the XPath with a single scalar test. ifelse() is vectorised over its
# condition, so testing every namespace name produced one result element per
# name, and the same billing node was then processed several times.
MYFILE <- if ("d1" %in% names(xml_ns(page))) {
  page %>% xml_find_all(".//d1:billing")
} else {
  page %>% xml_find_all(".//test:billing", ns)
}

# One wide-format row per billing node.
OUTPUT <- lapply(MYFILE, function(node) {
  # Convert all of the nodes to a named vector
  output <- as_list(node) %>% unlist()
  # Shorten the names: drop everything up to and including the first "."
  names(output) <- gsub("^(.+?\\.)", "", names(output))
  # Your next steps determine the desired output.
  # For a long-format data frame use instead:
  # data.frame(Name = names(output), output, row.names = NULL)
  # Wide-format data frame: one column per (shortened) name
  data.frame(t(output))
})
bind_rows(OUTPUT)

Accessing actual content of XML file in R?

I am working with a well-structured XML file with the following initial content:
<?xml version="1.0" encoding="UTF-8"?>
<drugbank xmlns="http://www.drugbank.ca" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.drugbank.ca http://www.drugbank.ca/docs/drugbank.xsd" version="5.0" exported-on="2017-07-06">
<drug type="biotech" created="2005-06-13" updated="2016-08-17">
<drugbank-id primary="true">DB00001</drugbank-id>
<drugbank-id>BTD00024</drugbank-id>
<drugbank-id>BIOD00024</drugbank-id>
<name>Lepirudin</name>
<description>Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.</description>
<cas-number>138068-37-8</cas-number>
<unii>Y43GF64R34</unii>
<state>liquid</state>
<groups>
<group>approved</group>
</groups>
...
This file consists of many nodes, each representing one drug. I am aiming to extract two specific fields from each node of this file: name and drugbank-id primary="true"
... and save these to a neatly formatted table (with one column for name and the second column for drugbank-id).
I have reviewed a number of tutorials and had success with accessing the higher levels of this XML table structure, but where the examples provide syntax to access the actual values (e.g. specific drug names), this code is not working for me.
This is my current code:
library(XML)
# Save the database file as a tree structure
xmldata = xmlRoot(xmlTreeParse("DrugBank_TruncatedDatabase_v3_Small.xml"))
# Number of nodes in the entire database file
NumNodes <- xmlSize(xmldata)
# Create array structure to hold DrugBank ID values
DB_ID <- array(1:NumNodes, dim=c(1,NumNodes,1))
# Create array structure to hold Drug Name values
DrugName <- array(1:NumNodes, dim=c(1,NumNodes,1))
# for each node (i.e. each drug) in the database
for (i in 1:NumNodes){
# Assign the Drug Names to easy-to-comprehend DrugName array
DrugName[i] <- xmldata[[i]][["name"]]
# Assign the DrugBank ID numbers to easy-to-comprehend DB_ID array
DB_ID[i] <- xmldata[[i]][["drugbank-id"]]
}
EdgeListTable = data.frame(DrugName, DB_ID)
write.table(EdgeListTable, file="Output1.txt", quote=F)
The output file contains the following text, which is a level higher than I want:
X.name. X.name..1 X.name..2 X.name..3 X.drugbank.id. X.drugbank.id..1 X.drugbank.id..2 X.drugbank.id..3
1 name name name name drugbank-id drugbank-id drugbank-id drugbank-id
If I try:
xmlSApply(xmldata, function(x) xmlSApply(x, xmlValue))
...my output looks like:
$drug
$drug$drugbank-id
[1] "DB00001"
$drug$drugbank-id
[1] "BTD00024"
$drug$drugbank-id
[1] "BIOD00024"
$drug$name
[1] "Lepirudin"
...
...but after experimentation, I'm not sure how to actually access the values needed.
I appreciate any advice regarding the best way to store the values in the two fields of interest as a table.
============================================================
Update: I am able to extract the desired values using the following code:
DrugBankData <- xmlSApply(xmldata, function(x) xmlSApply(x, xmlValue))
for (i in 1:NumNodes){
DB_ID[i] <- DrugBankData[[i]][[1]]
DrugName[i] <- DrugBankData[[i]][[4]]
}
EdgeListTable = data.frame(DrugName, DB_ID)
write.table(EdgeListTable, file="Output1.txt", quote=F)
The output file looks like this:
X1 X2 X3 X4 X1.1 X2.1 X3.1 X4.1
1 Lepirudin Cetuximab Dornase alfa Denileukin diftitox DB00001 DB00002 DB00003 DB00004
So I am just working on getting this correctly formatted into columns and removing the first line of text from this file, and the "1" at the beginning of the second line...
Thanks for your response, herbaman. I ended up resolving the formatting issues (mostly, except that the columns still aren't aligned...) using the following code:
DrugName_Matrix = matrix(DrugName,nrow=NumNodes,ncol=1)
DrugID_Matrix = matrix(DB_ID,nrow=NumNodes,ncol=1)
Composite_Matrix = cbind(DrugName_Matrix,DrugID_Matrix,Target)
write.table(Composite_Matrix, file="Output1.txt", sep='\t', row.names=F, quote=F)
There remain mysterious column header names ("V1" and "V2") that don't appear in the contents of these two matrices; my attempts to rename them have been unsuccessful using standard methods, e.g.
colnames(Composite_Matrix)[colnames(Composite_Matrix)=="V1"] <- "Drug Name"
colnames(Composite_Matrix)[colnames(Composite_Matrix)=="V2"] <- "Drug ID"
or
setnames(Composite_Matrix, old=c("V1","V2"), new=c("DrugName", "DrugID"))
I'm not sure where these V column headers are originating...
As requested, the contents of the two matrices of interest are:
> DrugName_Matrix
[,1]
[1,] "Lepirudin"
[2,] "Cetuximab"
[3,] "Dornase alfa"
[4,] "Denileukin diftitox"
> DrugID_Matrix
[,1]
[1,] "DB00001"
[2,] "DB00002"
[3,] "DB00003"
[4,] "DB00004"
...and the output table is:
V1 V2
Lepirudin DB00001
Cetuximab DB00002
Dornase alfa DB00003
Denileukin diftitox DB00004
To read drug bank nodes I created the following method:
# Convert one sub-node of a record into a data frame, optionally tagged with
# the record's key.
#
# rec:             the record to read from (presumably an XML node from the
#                  XML package - it is indexed by child name and passed to
#                  xmlValue()/xmlToDataFrame(); confirm against callers).
# main_node:       name of the child of `rec` to convert.
# seconadary_node: optional name of a child of `main_node` to convert instead.
# id:              name of the child whose text becomes `parent_key`;
#                  NULL skips the key.
# byValue:         if TRUE, take the text of the `main_node` children via
#                  xmlValue() instead of calling xmlToDataFrame().
#
# Returns the data frame; a `parent_key` column is added only when a key was
# read and the data frame has at least one row.
drug_sub_df <- function(rec, main_node, seconadary_node = NULL,
                        id = "drugbank-id", byValue = FALSE) {
  parent_key <- if (is.null(id)) NULL else xmlValue(rec[id][[1]])

  if (byValue) {
    df <- map_df(rec[main_node], xmlValue)
  } else {
    # Use the main node itself only when no secondary node was requested and
    # the main node exists; in every other case index into it.
    use_main <- is.null(seconadary_node) && !is.null(rec[[main_node]])
    target <- if (use_main) {
      rec[[main_node]]
    } else {
      rec[[main_node]][[seconadary_node]]
    }
    df <- xmlToDataFrame(target, stringsAsFactors = FALSE)
  }

  if (!is.null(parent_key) && nrow(df) > 0) {
    df$parent_key <- parent_key
  }
  df
}
Then I call the method like the following:
# Extract drug enzymes actions df
# Row-bind the "actions" data frame of every child of the record's "enzymes"
# node; each child's "id" value is used as the parent key.
get_enzymes_actions_df <- function(rec) {
  enzymes <- xmlChildren(rec[["enzymes"]])
  map_df(enzymes, function(enzyme) {
    drug_sub_df(enzyme, "actions", id = "id")
  })
}
# Extract drug articles df
# Row-bind the references/articles data frame of every child of the record's
# "enzymes" node; each child's "id" value is used as the parent key.
get_enzymes_articles_df <- function(rec) {
  enzymes <- xmlChildren(rec[["enzymes"]])
  map_df(enzymes, function(enzyme) {
    drug_sub_df(enzyme, "references", seconadary_node = "articles", id = "id")
  })
}
Of course, there are different situations that require different solutions, like the following:
# Build a one-row tibble describing a single enzyme node.
#
# r:        the enzyme node; its children are read by name with xmlValue().
# drug_key: value stored in the `parent_key` column.
#
# `position` comes from the node's "position" attribute and is NA when the
# attribute is absent.
get_enzyme_rec <- function(r, drug_key) {
  tibble(
    id = xmlValue(r[["id"]]),
    name = xmlValue(r[["name"]]),
    organism = xmlValue(r[["organism"]]),
    known_action = xmlValue(r[["known-action"]]),
    inhibition_strength = xmlValue(r[["inhibition-strength"]]),
    induction_strength = xmlValue(r[["induction-strength"]]),
    # xmlGetAttr() takes the fallback directly, so the attribute is read once
    # and no scalar ifelse() is needed.
    position = xmlGetAttr(r, name = "position", default = NA_character_),
    parent_key = drug_key
  )
}
# Row-bind one enzyme record per child of the record's "enzymes" node, keyed
# by the text of the record's first "drugbank-id" child.
get_enzymes_df <- function(rec) {
  enzymes <- xmlChildren(rec[["enzymes"]])
  map_df(enzymes, function(enzyme) {
    get_enzyme_rec(enzyme, xmlValue(rec["drugbank-id"][[1]]))
  })
}
or that one
# Build a one-row tibble for a single ATC code node.
#
# r:        the node; its own "code" attribute and the text and "code"
#           attribute of its first four children are read positionally.
# drug_key: value stored in the `parent_key` column.
get_atc_codes_rec <- function(r, drug_key) {
  level_1_node <- r[[1]]
  level_2_node <- r[[2]]
  level_3_node <- r[[3]]
  level_4_node <- r[[4]]

  tibble(
    atc_code = xmlGetAttr(r, name = "code"),
    level_1 = xmlValue(level_1_node),
    code_1 = xmlGetAttr(level_1_node, name = "code"),
    level_2 = xmlValue(level_2_node),
    code_2 = xmlGetAttr(level_2_node, name = "code"),
    level_3 = xmlValue(level_3_node),
    code_3 = xmlGetAttr(level_3_node, name = "code"),
    level_4 = xmlValue(level_4_node),
    code_4 = xmlGetAttr(level_4_node, name = "code"),
    parent_key = drug_key
  )
}
# Row-bind one ATC record per child of the record's "atc-codes" node, keyed
# by the text of the record's first "drugbank-id" child.
get_atc_codes_df <- function(rec) {
  atc_nodes <- xmlChildren(rec[["atc-codes"]])
  map_df(atc_nodes, function(atc_node) {
    get_atc_codes_rec(atc_node, xmlValue(rec["drugbank-id"][[1]]))
  })
}
You can find more examples of extracting the contents of a DrugBank XML database in R, in different structures, in this package:
https://github.com/Dainanahan/dbparser

How to parse xml/sbml with R package xml?

I'm trying to parse information from the sbml/xml file below
https://dl.dropboxusercontent.com/u/10712588/file.xml
from this code
http://search.bioconductor.jp/codes/11172
It seems that I can import the file normally by
doc <- xmlTreeParse(filename,ignoreBlanks = TRUE)
but I can't recover node attributes by
# Select every <species> element that carries an id attribute and read it.
atrr <- xpathApply(doc, "//species[@id]", xmlGetAttr, "id")
or
xpathApply(doc, "//species", function(n) xmlValue(n[[2]]))
A node of the file follows...
<species id="M_10fthf_m" initialConcentration="1" constant="false" hasOnly
SubstanceUnits="false" name="10-formyltetrahydrofolate(2-)" metaid="_metaM_10fth
f_m" boundaryCondition="false" sboTerm="SBO:0000247" compartment="m">
<notes>
<body xmlns="http://www.w3.org/1999/xhtml">
<p>FORMULA: C20H21N7O7</p>
<p>CHARGE: -2</p>
<p>INCHI: InChI=1S/C20H23N7O7/c21-20-25-16-15(18(32)26-20)23-11(7-22
-16)8-27(9-28)12-3-1-10(2-4-12)17(31)24-13(19(33)34)5-6-14(29)30/h1-4,9,11,13,23
H,5-8H2,(H,24,31)(H,29,30)(H,33,34)(H4,21,22,25,26,32)/p-2/t11-,13+/m1/s1</p>
<p>HEPATONET_1.0_ABBREVIATION: HC00212</p>
<p>EHMN_ABBREVIATION: C00234</p>
</body>
</notes>
<annotation>
...
I would like to retrieve all information inside species node, anyone know how to do that?
There exists an SBML parsing library libSBML (http://sbml.org/Software/libSBML).
This includes a binding to R that would allow access to the SBML objects directly within R using code similar to
document = readSBML(filename);
errors = SBMLErrorLog_getNumFailsWithSeverity(
SBMLDocument_getErrorLog(document),
enumToInteger("LIBSBML_SEV_ERROR", "_XMLErrorSeverity_t")
);
if (errors > 0) {
cat("Encountered the following SBML errors:\n");
SBMLDocument_printErrors(document);
q(status=1);
}
model = SBMLDocument_getModel(document);
if (is.null(model)) {
cat("No model present.\n");
q(status=1);
}
species = Model_getSpecies(model, index_of_species);
id = Species_getId(species);
conc = Species_getInitialConcentration(species)
There is a Species_get(NameOfAttribute) function for each possible attribute; together with Species_isSet(NameOfAttribute); Species_set(NameOfAttribute) and Species_unset(NameOfAttribute).
The API is similar for interacting with any SBML element.
The libSBML releases include R installers that are available from
http://sourceforge.net/projects/sbml/files/libsbml/5.8.0/stable
navigating to the R_interface subdirectory for the OS and architecture of your choice.
The source code distribution of libSBML contains an examples/r directory with many examples of using libSBML to interact with SBML in the R environment.
I guess it depends on what you mean when you say you want to "retrieve" all the information in the species nodes, because that retrieved data could be coerced to any number of different formats. The following assumes you want it all in a data frame, where each row is a species node from your XML file and the columns represent different pieces of information.
When just trying to extract information, I generally find it easier to work with lists than with XML.
doc <- xmlTreeParse(xml_file, ignoreBlanks = TRUE)
doc_list <- xmlToList(doc)
Once it's in a list, you can figure out where the species data is stored:
# List the distinct child names under each top-level element of doc_list.
lapply(doc_list, function(x) unique(names(x)))
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
[1] "species"
[[5]]
[1] "reaction"
[[6]]
[1] "metaid"
$.attrs
[1] "level" "version"
So you really only want the information in doc_list[[4]]. Take a look at just the first component of doc_list[[4]]:
str(doc_list[[4]][[1]])
List of 9
$ : chr "FORMULA: C20H21N7O7"
$ : chr "CHARGE: -2"
$ : chr "HEPATONET_1.0_ABBREVIATION: HC00212"
$ : chr "EHMN_ABBREVIATION: C00234"
$ : chr "http://identifiers.org/obo.chebi/CHEBI:57454"
$ : chr "http://identifiers.org/pubchem.compound/C00234"
$ : chr "http://identifiers.org/hmdb/HMDB00972"
$ : Named chr "#_metaM_10fthf_c"
..- attr(*, "names")= chr "about"
$ .attrs: Named chr [1:9] "M_10fthf_c" "1" "false" "false" ...
..- attr(*, "names")= chr [1:9] "id" "initialConcentration" "constant" "hasOnlySubstanceUnits" ...
So you have the information contained in the first eight lists, plus the information contained in the attributes.
Getting the attributes information is easy because it's already named. The following formats the attributes information into a data frame for each node:
# For every entry of doc_list[[4]], turn its ".attrs" element into a one-row
# data frame whose columns are the attribute names.
doc_attrs <- lapply(doc_list[[4]], function(node) {
  attr_values <- unlist(node[names(node) == ".attrs"])
  attr_df <- data.frame(matrix(attr_values, nrow = 1), stringsAsFactors = FALSE)
  # unlist() prefixes each name with ".attrs."; strip that prefix.
  colnames(attr_df) <- gsub(".attrs.", "", names(attr_values))
  attr_df
})
Some nodes didn't appear to have attributes information and so returned empty data frames. That caused problems later so I created data frames of NAs in their place:
# Every attribute column name seen across all entries.
doc_attrs_cols <- unique(unlist(sapply(doc_attrs, colnames)))

# Replace each zero-column data frame with a single row of NAs that has the
# full set of attribute columns.
has_no_attrs <- sapply(doc_attrs, length) == 0
doc_attrs[has_no_attrs] <- lapply(doc_attrs[has_no_attrs], function(x) {
  placeholder <- data.frame(
    matrix(rep(NA, length(doc_attrs_cols)), nrow = 1)
  )
  colnames(placeholder) <- doc_attrs_cols
  placeholder
})
When it came to pulling non-attribute data, the names and values of the variables were generally contained within the same string. I originally tried to come up with a regular expression to extract the names, but they're all formatted so differently that I gave up and just identified all the possibilities in this particular data set:
flags <- c("FORMULA:", "CHARGE:", "HEPATONET_1.0_ABBREVIATION:",
"EHMN_ABBREVIATION:", "obo.chebi/CHEBI:", "pubchem.compound/", "hmdb/HMDB",
"INCHI: ", "kegg.compound/", "kegg.genes/", "uniprot/", "drugbank/")
Also, sometimes the non-attribute information was kept as just a list of values, as in the node I showed above, while other times it was contained in "notes" and "annotation" sublists, so I had to include an if else statement to make things more consistent.
# For every entry of doc_list[[4]], collect the non-attribute values into a
# one-row data frame whose column names are derived from the `flags` prefixes.
doc_info <- lapply(doc_list[[4]], function(x) {
# Some entries carry named sublists besides ".attrs"; blank those names and
# concatenate the sublists so both layouts end up as one flat vector.
if(any(names(x) != ".attrs" & names(x) != "")) {
names(x)[names(x) != ".attrs"] <- ""
x <- unlist(do.call("c", as.list(x[names(x) != ".attrs"])))
} else {
# Already a plain list of values: drop ".attrs" and flatten.
x <- unlist(x[names(x) != ".attrs"])
}
# Remove the common URL prefix so the remaining text starts with a flag.
x <- gsub("http://identifiers.org/", "", x)
# Entries whose name is empty get their name from the value itself.
need_names <- names(x) == ""
# Name: replace the matched flag and everything after it with just the flag.
# NOTE(review): the flags are pasted into the pattern unescaped, so any "."
# inside a flag acts as a regex wildcard.
names(x)[need_names] <- gsub(paste0("(", paste0(flags, collapse = "|"), ").+"), "\\1", x[need_names], perl = TRUE)
#names(x) <- gsub("\\s+", "", names(x))
# Value: strip the matched flag, keeping the text that follows it.
x[need_names] <- gsub(paste0("(", paste0(flags, collapse = "|"), ")(.+)"), "\\2", x[need_names], perl = TRUE)
# Reshape the named vector into a one-row data frame with those names.
col_names <- names(x)
x <- data.frame(matrix(x, nrow = 1), stringsAsFactors = FALSE)
colnames(x) <- col_names
x
})
To get everything together into a data frame, I suggest the plyr package's rbind.fill.
# library() stops with an error when plyr is missing, whereas require() only
# returns FALSE and lets the rbind.fill calls fail later.
library(plyr)
# rbind.fill stacks the per-entry data frames, filling absent columns with NA.
doc_info <- do.call("rbind.fill", doc_info)
doc_attrs <- do.call("rbind.fill", doc_attrs)
# Place the value columns and the attribute columns side by side.
doc_all <- cbind(doc_info, doc_attrs)
dim(doc_all)
[1] 3972 22
colnames(doc_all)
[1] "FORMULA:" "CHARGE:" "HEPATONET_1.0_ABBREVIATION:" "EHMN_ABBREVIATION:"
[5] "obo.chebi/CHEBI:" "pubchem.compound/" "hmdb/HMDB" "about"
[9] "INCHI: " "kegg.compound/" "kegg.genes/" "uniprot/"
[13] "drugbank/" "id" "initialConcentration" "constant"
[17] "hasOnlySubstanceUnits" "name" "metaid" "boundaryCondition"
[21] "sboTerm" "compartment"
As a partial answer, the document uses name spaces, and 'species' is part of the 'id' name space. So
> xpathSApply(doc, "//id:species", xmlGetAttr, "id", namespaces="id")
[1] "M_10fthf_c" "M_10fthf_m" "M_13dampp_c" "M_h2o_c" "M_o2_c"
[6] "M_bamppald_c" "M_h2o2_c" "M_nh4_c" "M_h_m" "M_nadph_m"
...
with id:species and namespaces="id" being different from what you illustrate above.

Resources