BaseX XQuery issue with map:merge?

I want to convert CSV to XML using XQuery on BaseX (currently BaseX v9.4.1). With tab-separated columns, the CSV file Configuration.tsv (input) looks like this:
Datasets Autosar Fibex TSS0001 TSS0002 TSS0003
DS0001 AutosarFoo01.arxml FibexFoo01.xml x x
DS0002 AutosarFoo02.arxml FibexFoo02.xml x x
DS0003 AutosarFoo03.arxml FibexFoo03.xml x x
The script takes tssid as an input parameter. With tssid := TSS0001 the expected output XML is:
<database name="DS0001" autosar="AutosarFoo01.arxml" fibex="FibexFoo01.arxml"/>
<database name="DS0002" autosar="AutosarFoo02.arxml" fibex="FibexFoo02.arxml"/>
<database name="DS0003" autosar="AutosarFoo03.arxml" fibex="FibexFoo03.arxml"/>
and with tssid := TSS0003:
<database name="DS0002" autosar="AutosarFoo02.arxml" fibex="FibexFoo02.arxml"/>
<database name="DS0003" autosar="AutosarFoo03.arxml" fibex="FibexFoo03.arxml"/>
The script mostly works, but the third attribute value (fibex) is empty, so the output actually looks like this:
<database name="DS0001" autosar="AutosarFoo01.arxml" fibex=""/>
<database name="DS0002" autosar="AutosarFoo02.arxml" fibex=""/>
<database name="DS0003" autosar="AutosarFoo03.arxml" fibex=""/>
or with tssid := TSS0003:
<database name="DS0002" autosar="AutosarFoo02.arxml" fibex=""/>
<database name="DS0003" autosar="AutosarFoo03.arxml" fibex=""/>
I cannot find anything wrong with this code:
xquery version "3.1" encoding "utf-8";
declare namespace test="unittest";
declare namespace map="http://www.w3.org/2005/xpath-functions/map";
declare variable $test:tssid := 'TSS0003';
declare variable $test:testsuitesfile := '/Users/ms/Projekte/UnitTests/Configuration.tsv';
declare variable $test:options := map { 'header' : 'yes', 'format' : 'attributes', 'separator' : 'tab' };
declare variable $test:xml := csv:doc($test:testsuitesfile,$test:options);
(: get the positions :)
declare variable $test:positionDataId := 1 ;
declare variable $test:positionTssCheck := $test:xml/csv/record[position() = 1]/entry[text() = $test:tssid]/count(./preceding-sibling::entry) + 1 ;
declare variable $test:positionAutosar := $test:xml/csv/record[position() = 1]/entry[text() = 'Autosar']/count(./preceding-sibling::entry) + 1 ;
declare variable $test:positionFibex := $test:xml/csv/record[position() = 1]/entry[text() = 'Fibex']/count(./preceding-sibling::entry) + 1 ;
declare variable $test:dbsetbycsv as map(*) := map:merge(
  for $record in $test:xml/csv/record[entry[position() = $test:positionTssCheck and text() = 'x']]
  return map:entry(
    $record/entry[position() = $test:positionDataId]/text(),
    <database name="{$record/entry[position() = $test:positionDataId]/text()}"
              autosar="{$record/entry[position() = $test:positionAutosar]/text()}"
              fibex="{$record/entry[position() = $test:positionFibex]/text()}"/>
  )
);
declare variable $test:dbset := $test:dbsetbycsv;
declare function test:dump() {
  for-each(
    map:keys($test:dbset),
    function($k) {
      $test:dbset($k)
    }
  )
};
let $result := test:dump()
return($result)

It seems that
$xml/csv/record[entry[@name = $tssid and . = 'x']]
  ! <database name="{entry[@name = 'Datasets']}"
              autosar="{entry[@name = 'Autosar']}"
              fibex="{entry[@name = 'Fibex']}"/>
might suffice, given that the entry elements have name attributes.

It's a bug. Fixed now with BaseX 9.4.3 beta.

Related

Faster method for creating key-value pairs

I want to create a mapping from key (string) to value ([]string). I read a file:
gloveEmbeddings := make(map[string][]string)
f, _ := ioutil.ReadFile("./glove.840B.300d.txt")
The file is in the following format:
key0 val0_index0 val0_index1 val0_index2 val0_index3
key1 val1_index0 val1_index1 val1_index2 val1_index3
...
There are two separators: the newline and the space.
First I create a split of the new line:
newlineSplit := strings.Split(string(f), "\n")
Then I split each row with the space and put the first value in the resulting array as the key and the slice of the rest as the value:
for i := 0; i < len(newlineSplit); i++ {
    spaceSplit := strings.Split(newlineSplit[i], " ")
    gloveEmbeddings[spaceSplit[0]] = spaceSplit[1:]
}
fmt.Println(gloveEmbeddings)
The file is 5.5GB and this loop is taking more than 20 minutes. The goal is to be able to access the value fast given the key. Is there a better way to do this?
EDIT
I tried reading the file line by line.
gloveEmbeddings := make(map[string][]string)
f, _ := os.Open("./glove.840B.300d.txt")
scanner := bufio.NewScanner(f)
count := 0
for scanner.Scan() {
    spaceSplit := strings.Split(scanner.Text(), " ")
    gloveEmbeddings[spaceSplit[0]] = spaceSplit[1:]
    if count%10000 == 0 {
        fmt.Println(count)
    }
    count++
}
But the counts stop printing at 2.19 million.
The loop actually runs fast, even the original version that uses ioutil.ReadFile. What was taking long was the fmt.Println(gloveEmbeddings), which was there only for debugging (and the counts stop at 2.19 million simply because that is roughly the number of lines in the file). Reading the file all at once and one line at a time take about the same time, 42-44 s on my machine.

How to use the unittest module for a Python script which calls multiple functions and uses global variables

This is one of the functions in my Python script for which I am trying to write unit test cases. Since it uses global variables, as well as audit and BigQuery functions that live in separate utility scripts, I do not understand how to write @patch decorators and execute unit test cases for it.
How do I patch global variables?
How do I patch functions which don't return anything, e.g. audit_event_source_table? Can we ignore such functions during unit testing? If so, how?
How do I make assertions when I have no return value, only logger.info messages?
import logging
from datetime import datetime
from pathlib import Path
import sys
import __main__
from intient_research_rdm_common.utils.audit_utils import audit_event_source_table, audit_event_job_table, \
    get_job_id, get_source_object_id
from intient_research_rdm_kg_core.common_utils.utils.bigquery_utils import bigquery_data_read
from intient_research_rdm_kg_core.common_utils.utils.conf_read import read_args, read_source_config, read_env_config

global project_id, service_account, conn_ip, debug, node_table_list, edge_table_list, source_name

def edge_validation():
    global edge_table_list
    global source_name
    edge_table_na = []
    edge_table_list_rowcount_zero = []
    dataset_e = "prep_e_" + source_name
    row_count = 0
    edge_table = ""
    source_object_start_timestamp = datetime.now()
    source_object_id = get_source_object_id(source_name, source_object_start_timestamp)
    source_object_type = AUDIT_SOURCE_OBJECT_TYPE_BIGQUERY
    job_id = get_job_id(source_object_start_timestamp)
    source_object_name = dataset_e
    try:
        for edge_table in edge_table_list:
            sql_query = " SELECT * FROM " + "`" + project_id + "." + dataset_e + ".__TABLES__` WHERE table_id =" + "'" + edge_table + "'"
            data_read, col_names = bigquery_data_read(service_account, sql_query, project_id)
            for ind in data_read.index:
                row_count = (data_read['row_count'][ind])
            if len(data_read.index) == 0:
                edge_table_na.append(edge_table)
            elif row_count == 0:
                edge_table_list_rowcount_zero.append(edge_table)
        if len(edge_table_na) > 0:
            logging.info("Missing Edge tables in preprocessing layer {} ".format(edge_table_na))
        if len(edge_table_list_rowcount_zero) > 0:
            logging.info("Edge tables with row count as zero in Pre-processing layer {} ".format(edge_table_list_rowcount_zero))
        if len(edge_table_na) == 0 and len(edge_table_list_rowcount_zero) == 0:
            logging.info(
                "Edge list validation for the source {} has been successfully completed with no discrepancies".format(
                    source_name))
            audit_event_source_table(source_object_id, source_object_name, source_object_type, source_name,
                                     source_object_name,
                                     job_id, AUDIT_JOB_STATUS_PASS, source_object_start_timestamp,
                                     datetime.now(), 'NA', 'NA', project_id)
        if len(edge_table_na) > 0 or len(edge_table_list_rowcount_zero) > 0:
            audit_event_source_table(source_object_id, source_object_name, source_object_type, source_name,
                                     source_object_name,
                                     job_id, AUDIT_JOB_STATUS_PASS, source_object_start_timestamp,
                                     datetime.now(), 'NA', 'NA', project_id)
            sys.exit(1)
    except Exception as e:
        msg = '{} : Issue with the edge validation for {} is: \n{}\n'.format(datetime.now(), edge_table, e)
        logging.error(msg)
        audit_event_source_table(source_object_id, source_object_name, source_object_type, source_name,
                                 source_object_name,
                                 job_id, AUDIT_JOB_STATUS_FAIL, source_object_start_timestamp,
                                 datetime.now(), AUDIT_ERROR_TYPE_PREPROCESSING_KG_LAYER_VALIDATION, msg,
                                 project_id)
        raise Exception(msg)
Patch global variables - in the same way that you patch a method of a class, you patch the global variable. It's not clear from your code snippet where the global variables are defined (i.e. do you import these variables from another module, or do you assign to them at the top of your Python script?). Either way, you patch in the namespace where the function is being used. If you can confirm further details, I will be able to assist.
Personally, the way I patch and test functions with no return value is the same. For example, if I wanted to control the source_object_start_timestamp variable, I would patch the datetime name in the namespace of the module under test and give its now() a fixed return value such as "2020-08-16 20:36:06.578174" (patching datetime.datetime.utcnow directly fails, since datetime is a C-level class). For BigQuery functions, I would still patch them, and in your unit test use the call_count attribute of unittest.mock.Mock to test whether that function has been called.
Point 2 addresses your third query - use the call_count attribute to check how many times the mock has been called.
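To make that concrete, here is a minimal sketch of such a test. It assumes the function above lives in a module named edge_validation_script; that module name, the table and project values, and the patched return values are all placeholders, not part of the original question:
import unittest
from unittest.mock import patch

import pandas as pd

import edge_validation_script as evs  # hypothetical module name


class TestEdgeValidation(unittest.TestCase):
    # Decorators apply bottom-up: the bottom patch becomes the first mock argument.
    @patch('edge_validation_script.audit_event_source_table')
    @patch('edge_validation_script.bigquery_data_read')
    @patch('edge_validation_script.get_job_id', return_value='job-1')
    @patch('edge_validation_script.get_source_object_id', return_value='src-1')
    def test_success_path(self, mock_src_id, mock_job_id, mock_read, mock_audit):
        # Patch the globals (and assumed module-level constants) by assigning
        # directly on the imported module object.
        evs.project_id = 'demo-project'
        evs.service_account = 'demo-sa'
        evs.source_name = 'demo_source'
        evs.edge_table_list = ['edge_table_1']
        evs.AUDIT_SOURCE_OBJECT_TYPE_BIGQUERY = 'BIGQUERY'
        evs.AUDIT_JOB_STATUS_PASS = 'PASS'
        # The mocked BigQuery read returns one row with a non-zero row count,
        # matching the (data_frame, column_names) shape used above.
        mock_read.return_value = (pd.DataFrame({'row_count': [5]}), ['row_count'])
        # Assert on the logger output instead of a return value.
        with self.assertLogs(level='INFO') as captured:
            evs.edge_validation()
        self.assertTrue(any('successfully completed' in m for m in captured.output))
        # The audit function returns nothing, so check that it was called.
        self.assertEqual(mock_audit.call_count, 1)


if __name__ == '__main__':
    unittest.main()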

Sample Python code to replace a substring value in an xlsx cell

Sample code snippet tried:
for row in range(1, sheet.max_row + 1):
    for col in range(1, sheet.max_column + 1):
        temp = None
        cell_obj = sheet.cell(row=row, column=col)
        temp = re.search(r"requestor", str(cell_obj.value))
        if temp:
            if 'requestor' in cell_obj.value:
                cell_obj.value.replace('requestor', 'ABC')
I am trying to replace an xlsx cell containing the value "Customer name: requestor " with "Customer name: ABC". How can this be achieved easily?
I found my answer in this post: https://www.edureka.co/community/42935/python-string-replace-not-working
The replace function doesn't store the result in the same variable; strings are immutable, so it returns a new string. Hence the solution for the above:
mvar = None
for row in range(1, sheet.max_row + 1):
    for col in range(1, sheet.max_column + 1):
        temp = None
        cell_obj = sheet.cell(row=row, column=col)
        temp = re.search(r"requestor", str(cell_obj.value))
        if temp:
            if 'requestor' in cell_obj.value:
                mvar = cell_obj.value.replace('requestor', 'ABC')
                cell_obj.value = mvar
Just keep it simple. Instead of re and replace, search for the given value and overwrite the cell.
The example below also gives you the ability to change 'customer name' if needed:
wb = openpyxl.load_workbook("example.xlsx")
sheet = wb["Sheet1"]

customer_name = "requestor"
replace_with = "ABC"
search_string = f"Customer name: {customer_name}"
replace_string = f"Customer name: {replace_with}"

for row in range(1, sheet.max_row + 1):
    for col in range(1, sheet.max_column + 1):
        cell_obj = sheet.cell(row=row, column=col)
        if cell_obj.value == search_string:
            cell_obj.value = replace_string

wb.save("example_copy.xlsx")  # remember that you need to save the results to the file

XQuery Type of value does not match

declare variable $fb := doc("factbook.xml")/mondial;
for $c in $fb//country
where ($c/encompassed/@continent = 'f0_119') and ($c/@population < 100000)
return concat('Country: ', $c/name, ', Population: ', $c/@population);
It returns:
Type Error: Type of value '
()
' does not match sequence type: xs:anyAtomicType?
At characters 11681-11698
At File "q2_3.xq", line 4, characters 13-67
At File "q2_3.xq", line 4, characters 13-67
At File "q2_3.xq", line 4, characters 13-67
However, if I do not return a concat, just name or population, it works. And the strangest thing is that I have another program:
declare variable $fb := doc("factbook.xml")/mondial;
for $c in $fb//country
where $c/religions = 'Seventh-Day Adventist'
order by $c/name
return concat('Country: ', $c/name, ', Population: ', $c/@population);
The return syntax is exactly the same, yet it works. Why does this happen?
Without seeing an example of your data it's impossible to say for sure, but if $c/name returns more than one value, then your error would make sense: each argument of concat() must be a single atomic value (xs:anyAtomicType?). Do you have any results with more than one name element? If so, selecting just the first, e.g. $c/name[1], inside the concat would avoid the error.

Get json with pljson & plsql

I'm trying to chop out some lists from the following JSON using pljson:
my_json := json('{"Order":
{"no": 1, "batch": 2,"id": 3,"quantity": 10,
"sm_pack": [
{
"no": 10,
"id": 1010,
"quantity": 2
},
{
"no": 11,
"id": 1040,
"quantity": 8
}
],
"sm_size": [
{ ....etc etc
However, I can't get it to work.
I can print the data using this syntax:
v_myjson.path('Order.sm_pack').print;
v_myjson.path('Order.sm_pack[1].no').print;
But how can I assign all those different lists to variables for further processing? I tried different versions of v_list := json_list(my_json.get('Order.sm_pack')), my_json.get('sm_pack'), and so on; whatever I try, it's "NULL SELF", and I seem to have gone blind.
Printing JSON lists and objects differs from assigning them to variables for manipulation. I will try to answer your question through your example as follows:
DECLARE
    obj   json      := json();
    obj_1 json      := json();
    arr   json_list := json_list();
    val   NUMBER;
BEGIN
    /* Create your object */
    obj := json('{"Order":{"no":1,"batch":2,"id":3,"quantity":10,"sm_pack":[{"no":10,"id":1010,"quantity":2},{"no":11,"id":1040,"quantity":8}],"sm_size":[{"no":10,"id":1010,"quantity":2},{"no":11,"id":1040,"quantity":8}]}}');
    /* Assign object */
    obj_1 := json(obj.get('Order'));
    /* Assign list from within the object */
    arr := json_list(obj_1.get('sm_pack'));
    arr.print;
    -- or
    arr := json_list(json(obj.get('Order')).get('sm_pack'));
    arr.print;
    /* Get object value from within list */
    val := json_ext.get_number(json(arr.get(2)), 'id');
    DBMS_OUTPUT.PUT_LINE(val);
END;
/
Notice I used the get_number function since your values are without single quotes; otherwise, I would use get_string.
Hope that helps!
