Sorting multiple maps in marklogic 8 - xquery

This is more of an XQuery than MarkLogic. I have three map:map and each map has key-value pair of "id" and score. I would like to sort all the distinct ids based on the score from each maps.
For eg:
map1 : 1:2048, 5:2000
map2 : 2:5000, 1:1000, 4:3000
map3 : 6:100, 7:5000, 2:2000
In the above example, each map is id:score for key value (did not know how to represent here :))..
I want the sorted list of id from three maps based on score..
Is there a good way or better way of doing the sorting, or do I have to union the keys of the map and iterate the sequence of keys and sort them ?

This seems like a great use case for folding. It's part of the XQuery 3.0 spec.
Folding walks through a sequence of items, passing the result computed for each item on to the next step. In this example $combinedMaps is the result of the previous call and $mapToMerge is the item in the sequence it is currently processing.
Here is an example of what you would want to do.
(:~
 : Copies every key of $mapA into $newMap, storing the larger of the value
 : held by $mapA and the value held by $mapB for the same key.
 :
 : Keys that exist only in $mapB are not visited; callers invoke the function
 : a second time with the two maps swapped to cover them.
 :
 : @param $newMap map that receives the entries (mutated in place)
 : @param $mapA   map whose keys are iterated
 : @param $mapB   map consulted for a competing value
 : @return $newMap, after the entries were written
 :)
declare function local:sortMaps(
  $newMap as map:map,
  $mapA as map:map,
  $mapB as map:map
) as map:map {
  let $_ :=
    for $key in map:keys($mapA)
    let $value := map:get($mapA, $key)
    let $otherMapValue := map:get($mapB, $key)[1]
    return
      (: Compare only when $mapB really has the key. Defaulting a missing
         value to 0 would silently replace negative values with 0. :)
      if (fn:empty($otherMapValue) or $value gt $otherMapValue) then
        map:put($newMap, $key, $value)
      else
        map:put($newMap, $key, $otherMapValue)
  return $newMap
};
(: Sample input: three id -> score maps. :)
let $map1 :=
  map:new((
    map:entry("1", 2048),
    map:entry("5", 2000)
  ))
let $map2 :=
  map:new((
    map:entry("2", 5000),
    map:entry("1", 1000),
    map:entry("4", 3000)
  ))
let $map3 :=
  map:new((
    map:entry("6", 100),
    map:entry("7", 5000),
    map:entry("2", 2000)
  ))
let $maps := ($map1, $map2, $map3)
(: Fold all maps into one, keeping the highest score seen for each id.
   The fold starts from an empty map so the first map is not merged with
   itself and an empty $maps still yields a map. :)
let $merged :=
  fn:fold-left(
    function($combinedMaps, $mapToMerge) {
      let $newMap := map:map()
      (: Two passes with the arguments swapped so that keys present in
         only one of the two maps are copied as well. :)
      let $newMap := local:sortMaps($newMap, $combinedMaps, $mapToMerge)
      let $newMap := local:sortMaps($newMap, $mapToMerge, $combinedMaps)
      return $newMap
    },
    map:map(),
    $maps
  )
(: Emit the distinct ids ordered by score, highest first; ties are broken
   by id so the output is deterministic. Drop "descending" for lowest first. :)
for $id in map:keys($merged)
order by map:get($merged, $id) descending, $id
return $id

Related

xdmp:spawn-function() Alternative

We have around 20 million documents in the database. We have created batches of 10000 and use
xdmp:spawn-function() to query over these 20 million documents and perform delete operations according to some conditions. But when running it through Query Console, the query times out. Is there any alternative we can look at so that the query doesn't time out?
xquery version "1.0-ml";
declare variable $versionToMaintain := 10;
declare variable $batchSize := 10000;
(: For every tuple in $values, subtracts $versionToMaintain from $value[3]
   (the caller's tuples are built from uri, 'id' and 'version' references)
   and, when the result is positive, deletes each URI that cts:uris()
   returns for the query built below, using $value[2] as the 'id' value.
   NOTE(review): as posted, the cts:and-query(( ... )) expression is never
   closed before the "return", and xdmp:parseDateTime receives only a
   picture string. Presumably xdmp:parse-dateTime($picture, $value) was
   intended - confirm the date value and the parenthesisation. :)
declare function local:delete($values) {
for $value in $values
let $versionToDelete := $value[3] - $versionToMaintain
return
if ($versionToDelete > 0) then
let $query := cts:and-query((
cts:collection-query('collection name 2'),
cts:element-range-query(xs:QName('version'), '<=', xs:int($versionToDelete)),
cts:element-value-query(xs:QName('id'),$value[2]),
(: NOTE(review): this line leaves two parentheses unclosed :)
cts:element-range-query(xs:QName('c:created-on'), '<=', xs:dateTime(xdmp:parseDateTime('[Y0001]-[M01]-[D01]')
))
return (cts:uris((), (), $query) ! xdmp:document-delete(.))
else ()
};
(: Documents whose 'version' element is anything other than "1". :)
let $notFirstVersion :=
  cts:not-query(cts:element-value-query(xs:QName('version'), "1"))
let $tupleQuery :=
  cts:and-query((
    cts:collection-query("collection name 1"),
    $notFirstVersion
  ))
(: Each tuple carries the document URI, its 'id' and its 'version'. :)
let $references := (
  cts:uri-reference(),
  cts:element-reference(xs:QName('id')),
  cts:element-reference(xs:QName('version'))
)
let $docCount :=
  xdmp:estimate(
    cts:search(collection("collection name 1"), $notFirstVersion, "unfiltered")
  )
let $batchCount := fn:ceiling($docCount div $batchSize)
(: The tuples of every batch are fetched here, in the calling request,
   and only the deletion itself is handed to a spawned task. :)
for $batch in (1 to $batchCount)
let $offset := ($batch - 1) * $batchSize
let $tuples :=
  cts:value-tuples(
    $references,
    ("skip=" || $offset, "truncate=" || $batchSize),
    $tupleQuery
  )
return
  xdmp:spawn-function(function() {
    local:delete($tuples)
  })
Well, I think the reason why it is taking so long and potentially timing out is that you are doing a lot of cts:value-tuples() in the for loop iterating over the batches, rather than pushing that work out into the spawned function that gets called for each batch.
Move the paginated cts:value-tuples() call inside of the local:delete(), and pass in the $x batch value, instead of the tuples.
xquery version "1.0-ml";
declare variable $versionToMaintain := 10;
declare variable $batchSize := 10000;
(: Processes batch number $x: fetches up to $batchSize (uri, id, version)
   tuples from "collection name 1", skipping the ($x - 1) * $batchSize
   fragments that belong to earlier batches. For each tuple it subtracts
   $versionToMaintain from $value[3] and, when the result is positive,
   deletes every URI that cts:uris() returns for the query built below.
   NOTE(review): as posted, the second cts:and-query(( ... )) is never
   closed before the "return", and xdmp:parseDateTime receives only a
   picture string. Presumably xdmp:parse-dateTime($picture, $value) was
   intended - confirm the date value and the parenthesisation. :)
declare function local:delete($x) {
let $values :=
cts:value-tuples(
(
cts:uri-reference(),
cts:element-reference(xs:QName('id')),
cts:element-reference(xs:QName('version'))
),
("skip=" || ($x - 1) * $batchSize, "truncate=" || $batchSize),
cts:and-query((
cts:collection-query("collection name 1"),
cts:not-query(cts:element-value-query(xs:QName('version'), "1"))
))
)
for $value in $values
let $versionToDelete := $value[3] - $versionToMaintain
return
if ($versionToDelete > 0) then
let $query := cts:and-query((
cts:collection-query('collection name 2'),
cts:element-range-query(xs:QName('version'), '<=', xs:int($versionToDelete)),
cts:element-value-query(xs:QName('id'),$value[2]),
(: NOTE(review): the enclosing cts:and-query(( is still open after this line :)
cts:element-range-query(xs:QName('c:created-on'), '<=', xs:dateTime(xdmp:parseDateTime('[Y0001]-[M01]-[D01]')))
return (cts:uris((), (), $query) ! xdmp:document-delete(.))
else ()
};
(: Estimate how many documents qualify; this fixes the number of batches. :)
let $docCount :=
  xdmp:estimate(
    cts:search(
      collection("collection name 1"),
      cts:not-query(cts:element-value-query(xs:QName('version'), "1")),
      "unfiltered"
    )
  )
let $batchCount := fn:ceiling($docCount div $batchSize)
(: Only the batch number travels to the spawned task; the task looks up
   its own tuples inside local:delete. :)
let $spawnBatch :=
  function($batch) {
    xdmp:spawn-function(function() {
      local:delete($batch)
    })
  }
return (1 to $batchCount) ! $spawnBatch(.)
Deleting bulk content from MarkLogic database (with a bucket assignment) is always a challenge. Like Mads suggested, you should consider using CoRB. It is easier to tune the performance with different options available.
Secondly, you can consider using a tiered storage approach - like a range partition or query partition (provided the license requirements are met) where you can archive the documents that match the required criteria to a group of forests. You can then use the forest-clear() to do the job for you.

XQuery: map to array of sequences

I have the following map:
let $input := map { 'a-key': 'a-value', 'b-key': ['b-value-1', 'b-value-2'] }
(the length of the b-key array can vary, or be absent; a-key can be present or absent)
I need to create the following array:
[ ('a', 'b', 'b'), 'a-value', 'b-value-1', 'b-value-2' ]
The number of bs in the first sequence should correspond to the number of b-values.
I've tried just about every combination of iterating/map:for-each, etc., and the array either ends up with too much nesting or completely flat...
(n.b. the array is to be passed to a function -- not mine!-- using fn:apply, so needs to be in this form)
It seems hard to build an array containing a sequence that is constructed dynamically, I think you first need to construct the sequence into a variable and then use the square bracket array constructor [$seq] to construct the array with the sequence as the single item. Then you can array:join the other values:
let $input := map { 'a-key': 'a-value', 'b-key': ['b-value-1', 'b-value-2'] }
(: Both lookups yield the empty sequence when the key is absent, so neither
   key is required. Calling array:size() on a missing b-key would raise a
   type error instead. :)
let $a-values := $input?a-key
let $b-values := $input?b-key?*
(: One 'a' per a-value and one 'b' per b-value. :)
let $seq := ($a-values ! 'a', $b-values ! 'b')
return
  (: [$seq] keeps the whole sequence as a single array member. :)
  array:join(([$seq], array { $a-values, $b-values }))
https://xqueryfiddle.liberty-development.net/nbUY4ku/2
If you are comfortable with the functional-programming side of XQuery, you can create the whole output in two nested "loops" (i.e. folds), one over the keys and the other one over those values that are arrays:
(: the input :)
let $input := map { 'a-key': 'a-value', 'b-key': ['b-value-1', 'b-value-2'] }
(: utility function that adds a single key/value pair to the output array:
   the key joins the sequence held in member 1, the value becomes a new member :)
let $update-arr :=
  function($arr, $k, $v) {
    array:append(array:put($arr, 1, ($arr(1), $k)), $v)
  }
(: nested folds over keys and values; map:keys() returns the keys in an
   implementation-dependent order, so they are sorted to make the result
   deterministic, with the "a" entries ahead of the "b" entries :)
return fold-left(sort(map:keys($input)), [()], function($arr, $key) {
  let $k := substring-before($key, '-key')
  let $val := $input($key)
  return typeswitch($val)
    case array(*)
      (: one key/value pair per array member :)
      return array:fold-left($val, $arr, function($arr2, $v) {
        $update-arr($arr2, $k, $v)
      })
    default
      return $update-arr($arr, $k, $val)
})
You can even abbreviate the array(*) case as return array:fold-left($val, $arr, $update-arr(?, $k, ?)) if you want.
The result is [("a", "b", "b"), "a-value", "b-value-1", "b-value-2"] as expected.

Inserting a document and reading it in same transaction in MarkLogic

Below is the code snippet I am using for one of the functionality
(:~
 : Estimates how many /count documents carry the given Id attribute on
 : <count> and the given MatchStatus attribute on <child>.
 :
 : @param $Id     value matched exactly against count/@Id
 : @param $status value matched exactly against child/@MatchStatus
 : @return the unfiltered estimate of matching documents
 :)
declare function local:matchCounts($Id as xs:string, $status as xs:string) as xs:int {
  let $idQuery :=
    cts:element-attribute-value-query(xs:QName("count"), xs:QName("Id"), $Id, "exact")
  let $statusQuery :=
    cts:element-attribute-value-query(xs:QName("child"), xs:QName("MatchStatus"), $status, "exact")
  return
    xdmp:estimate(
      cts:search(/count, cts:and-query(($idQuery, $statusQuery)), "unfiltered")
    )
};
(:~
 : Returns the "even" count followed by the "odd" count for the given Id.
 :
 : @param $Id identifier passed on to local:matchCounts
 : @return a two-item sequence: (even count, odd count)
 :)
declare function local:saveCountsMatchC($Id as xs:string) {
  ("even", "odd") ! local:matchCounts($Id, .)
};
(: Spawns one task per batch of $batch-size positions, for
   fn:ceiling($total-records div $batch-size) batches. Each task inserts one
   <count Id="..."> document per position under a fresh UUID-based URI,
   giving the <child> MatchStatus 'even' or 'odd' depending on the position
   within the batch. After the spawns it returns
   local:saveCountsMatchC($Id). :)
declare function local:matchingProcess($Id as xs:string) {
let $total-records := 1000
let $batch-size := 50
(: running offset; advanced with xdmp:set at the end of every iteration :)
let $pagination := 0
let $bs :=
for $records in 1 to fn:ceiling($total-records div $batch-size )
let $start := fn:sum($pagination + 1)
let $end := fn:sum($batch-size + $pagination)
(: move the offset forward so the next batch starts right after this one :)
let $_ := xdmp:set($pagination, $end)
return
xdmp:spawn-function
(
function() {
(: $pos counts from 1 within the batch, so it alternates odd/even :)
for $each at $pos in ($start to $end)
let $id := sem:uuid-string()
let $xml := if(($pos mod 2) eq 0) then <count Id='{$Id}'><child MatchStatus='even'></child></count>
else <count Id='{$Id}'><child MatchStatus='odd'></child></count>
return xdmp:document-insert(concat("/", $id, ".xml"), $xml)
},
<options xmlns="xdmp:eval"><result>{fn:true()}</result><commit>auto</commit><update>true</update></options>
)
(: binds the spawn results before the counts are read :)
let $_ := $bs
return local:saveCountsMatchC($Id)
};
(: entry point: run the process for Id "1" :)
local:matchingProcess("1")
The requirement over here is to iterate 1000 documents using batch size of 50, so basically I am using spawn function to create 20 batches of size 50 which inserts 1000 documents in my database.
Once those documents are inserted, I need to read those documents in same transaction. Here 500 documents have MatchStatus='odd' and 500 documents have MatchStatus='even'
The query should return (500,500) as output; Instead it returns (0,0)
I am using the <result>{fn:true()}</result> option so that my next statement waits for all spawned tasks to complete, but that is not happening.
Can anybody help me with the requirement?
Note: Need to insert 1000 documents and then read them in same function call only
Your code that executes the spawns does not perform updates itself, so will run in so-called query mode. In query mode only updates from before the start of the code are visible.
You could try running in update mode (declare option xdmp:transaction-mode "update";), but usually it is easier to just spawn or eval the counting/reading of your updates as well. E.g. wrap the xdmp:estimate in an xdmp:spawn-function with result true as well.
HTH!

I need some help on an XQuery sequence merge that preserves order

I am working on a function to merge a set of sequences that will preserve the order of all of the sequences as best as possible. Doing a distinct-values($sequences) on all of the sequences does not preserve the order.
I have the following MarkLogic XQuery code:
xquery version "1.0-ml";
(:~
 : Records, for every item of $list except the last, the item that follows
 : it: $map[item] collects the distinct successors seen so far.
 :
 : @param $map  mutable map that is updated in place
 : @param $list ordered values to register
 :)
declare function local:map-sequence($map, $list as xs:string*) {
  for $key at $pos in $list[fn:position() lt fn:last()]
  let $successor := $list[$pos + 1]
  let $children :=
    if (map:contains($map, $key))
    then fn:distinct-values((map:get($map, $key), $successor))
    else $successor
  return map:put($map, $key, $children)
};
(:~
 : Returns the distinct keys of $map that never occur as a value of any
 : key, i.e. the entries nothing points to.
 :)
declare function local:first($map) {
  let $keys := map:keys($map)
  let $children := $keys ! map:get($map, .)
  return
    fn:distinct-values(
      for $key in $keys
      where fn:not($key = $children)
      return $key
    )
};
(: Picks the value(s) that should follow $key.
   - $key not in $map: returns the empty sequence.
   - exactly one value stored for $key: returns that value.
   - several values: returns every value $next for which at least one of
     the other stored values appears among local:descendents($map, $next). :)
declare function local:next($map, $key as xs:string) {
if (map:contains($map, $key))
then if (fn:count(map:get($map, $key)) eq 1)
then map:get($map, $key)
else
let $children := map:get($map, $key)
return
for $next in $children
(: the siblings of $next, i.e. the remaining values stored for $key :)
let $others := $children[fn:not(.=$next)]
let $descedents := local:descendents($map, $next)
(: keep $next only when some sibling is reachable from it :)
return if ($descedents[.=$others])
then $next
else ()
else ()
};
(:~
 : Depth-first walk: returns every value stored for $key, each one followed
 : immediately by its own descendants.
 :)
declare function local:descendents($map, $key as xs:string) {
  map:get($map, $key) ! (., local:descendents($map, .))
};
(:~
 : Follows the chain that starts after $key. A single successor is emitted
 : and the chain continues from it; several successors are emitted as they
 : are and the chain stops; no successor yields the empty sequence.
 :)
declare function local:sequence($map, $key as xs:string) {
  let $candidates := local:next($map, $key)
  return
    if (fn:empty($candidates)) then ()
    else if (fn:exists($candidates[2])) then $candidates
    else ($candidates, local:sequence($map, $candidates))
};
(: Register the three input sequences in one shared map, then print the map
   followed by every starting value and the chain that hangs off it. :)
let $map := map:map()
let $_ := local:map-sequence($map, ('fred', 'barney', 'pebbles'))
let $_ := local:map-sequence($map, ('fred', 'wilma', 'betty', 'pebbles'))
let $_ := local:map-sequence($map, ('barney', 'wilma', 'betty'))
let $starts := local:first($map)
return (
  $map,
  $starts ! (., local:sequence($map, .))
)
it returns
{"barney":["pebbles", "wilma"], "fred":["barney", "wilma"], "wilma":"betty", "betty":"pebbles"}
fred
barney
wilma
betty
pebbles
It still needs work. If you add:
let $seq4 := local:map-sequence($map, ('fred', 'bambam'))
bambam does not show up. I am still working on it, but if others have suggestions, then I would like to hear them.
Thanks,
Loren
As far as I understand your problem, each sequence represents a hierarchy of values, so from the sequence ("foo", "bar", "baz") we can follow that "foo" < "bar", "foo" < "baz" and "bar" < "baz" should preferably hold in the resulting ordering.
From your expected output it seems that you want the values to be sorted from the one with the smallest number of (transitive) predecessors ("fred" in your case) to that with the most ones ("pebbles" with four predecessors: ("barney", "fred", "betty", "wilma")).
I do not have access to MarkLogic and its proprietary maps, so I'll use standard XQuery 3.0 maps instead. The underlying algorithms should be easy to translate.
As a first step we build a map of all immediate predecessors of each unique value found in at least one of the input sequences. Because XQuery 3.0 maps cannot be modified in-place, we use fn:fold-left(...) to build one up incrementally. Note also that even the first element of each list is added to the map with an empty sequence of predecessors.
(:~
 : Returns a copy of $map0 in which every item of $list is a key whose value
 : holds the distinct immediate predecessors recorded so far. The first item
 : of $list gains no predecessor from this call.
 :
 : @param $map0 predecessor map built so far
 : @param $list ordered values to register
 :)
declare function local:add-preds($map0, $list as xs:string*) {
  let $record-pred :=
    function($acc, $pos) {
      let $value := $list[$pos]
      let $known := map:get($acc, $value)
      (: $list[0] is empty, so the first item adds no predecessor :)
      let $pred := $list[$pos - 1]
      return map:put($acc, $value, fn:distinct-values(($known, $pred)))
    }
  return fn:fold-left(1 to fn:count($list), $map0, $record-pred)
};
Next we need the transitive closure of this map of predecessors, so we need to gather all values that can be reached from a given key by a chain of predecessors. We can do this using a simple depth-first search:
(:~
 : Builds the transitive closure of a predecessor map: each key is mapped to
 : everything reachable from it through local:all-predecessors, without the
 : key itself.
 :)
declare function local:transitive($preds) {
  map:merge(
    fn:for-each(map:keys($preds), function($key) {
      let $reachable := local:all-predecessors($preds, $key, $key)
      return map:entry($key, $reachable[fn:not(. = $key)])
    })
  )
};
(:~
 : Depth-first search over the map $succ starting at $key.
 :
 : @param $succ  map from a value to the values directly linked to it
 : @param $key   value whose links are explored
 : @param $seen0 values already visited
 : @return $seen0 extended by every newly reached value, in visiting order
 :)
declare function local:all-predecessors($succ, $key, $seen0) {
  let $visit :=
    function($seen, $next) {
      (: an already visited value is skipped, which also stops cycles :)
      if ($seen = $next) then $seen
      else local:all-predecessors($succ, $next, ($seen, $next))
    }
  return fn:fold-left(map:get($succ, $key), $seen0, $visit)
};
This takes your example's initial predecessor map
map {
"bambam": "fred",
"pebbles": ("barney", "betty"),
"fred": (),
"wilma": ("fred", "barney"),
"barney": "fred",
"betty": "wilma"
}
and transforms it into
map {
"bambam": "fred",
"pebbles": ("barney", "fred", "betty", "wilma"),
"fred": (),
"wilma": ("fred", "barney"),
"barney": "fred",
"betty": ("wilma", "fred", "barney")
}
With that map your sorting now becomes very easy: Just take all keys in the map, order them by the number of their predecessors, and output them:
(: Every array member is one input sequence. :)
let $lists := [
  ('fred', 'barney', 'pebbles'),
  ('fred', 'wilma', 'betty', 'pebbles'),
  ('barney', 'wilma', 'betty'),
  ('fred', 'bambam')
]
(: Feed the sequences one after another into the predecessor map. :)
let $preds := array:fold-left($lists, map {}, local:add-preds#2)
let $trans := local:transitive($preds)
(: Fewest transitive predecessors first. :)
for $key in map:keys($trans)
order by count($trans($key))
return $key
This returns your desired result: "fred", "bambam", "barney", "wilma", "betty", "pebbles"

Ordering by a sequence of values in XQuery

I've got some XML data that takes this form:
<products>
<product version="1.2.3"/>
<product version="1.10.0"/>
<product version="2.1.6"/>
</products>
...And so on. I want to order these in XQuery by version number. The trouble is, if I just do order by $thing/@version, it does a lexicographic comparison that puts 1.10.0 before 1.2.3, which is wrong.
What I really want to do is something like:
order by tokenize($thing/@version, '\.') ! number(.)
Unfortunately this doesn't work because XQuery doesn't let you use an entire sequence as an ordering key. How can I get something like this?
A solution that doesn't rely on all the version numbers having the same number of dots would be preferable, but I'll take what I can get.
All you can do is normalize the version numbers so you can apply lexical ordering.
Determine maximum string length in a version step
Pad it with 0's (or space if you prefer, but you will have to change the code for this)
Tokenize each version, pad each version step, rejoin them
Compare based on padded version
I didn't clean up the code, and I pulled two functions from functx, but it works and should be fine for embedding as needed. The code is also able to deal with single letters; if necessary, you could replace all occurrences of "alpha", ... with, for example, "a", ...
declare namespace functx = "http://www.functx.com";
(:~
 : Concatenates $count copies of $stringToRepeat.
 :
 : @param $stringToRepeat the string to repeat
 : @param $count          how many copies to produce
 : @return the repeated string; zero-length when there is nothing to repeat
 :)
declare function functx:repeat-string
  ( $stringToRepeat as xs:string? ,
    $count as xs:integer ) as xs:string {
  string-join((1 to $count) ! $stringToRepeat, '')
};
(:~
 : Left-pads the string form of $integerToPad with '0' up to $length
 : characters.
 :
 : @param $integerToPad the value to pad
 : @param $length       the desired total length
 : @return the zero-padded string
 : @error functx:Integer_Longer_Than_Length when the value is already longer
 :        than $length
 :)
declare function functx:pad-integer-to-length
  ( $integerToPad as xs:anyAtomicType? ,
    $length as xs:integer ) as xs:string {
  let $digits := string($integerToPad)
  let $missing := $length - string-length($digits)
  return
    if ($missing < 0)
    then error(xs:QName('functx:Integer_Longer_Than_Length'))
    else concat(functx:repeat-string('0', $missing), $digits)
};
(:~
 : Normalizes a dotted version string for lexical ordering: every step is
 : zero-padded to $max-length and the steps are joined with '.' again.
 :
 : @param $a          the version string, e.g. "1.42.1"
 : @param $max-length the width each step is padded to
 :)
declare function local:version-compare($a as xs:string, $max-length as xs:integer)
as xs:string*
{
  let $padded-steps :=
    for $step in tokenize($a, '\.')
    return functx:pad-integer-to-length($step, $max-length)
  return string-join($padded-steps, '.')
};
let $bs := ("1.42", "1.5", "1", "1.42.1", "1.43", "2")
(: Width of the longest single step across all versions. :)
let $max-length := max($bs ! tokenize(., '\.') ! string-length(.))
(: Order the original strings by their zero-padded form. :)
for $b in $bs
order by local:version-compare($b, $max-length)
return $b
Returns:
1 1.5 1.42 1.42.1 1.43 2
Order by doesn't accept a sequence, but you can explicitly tokenize the versions and add them to the order by, separated by commas (note the exclusion of parens).
let $products :=
  <products>
    <product version="1.2.3"/>
    <product version="1.10.0"/>
    <product version="2.1.6"/>
  </products>
for $p in $products/product
(: Split the version attribute into its dot-separated steps. The attribute
   axis is written "@version"; "#version" is not valid XPath. :)
let $toks := tokenize($p/@version, '\.')
(: Convert each step to an integer so that 10 sorts after 2. :)
let $main := xs:integer($toks[1])
let $point := xs:integer($toks[2])
let $sub := xs:integer($toks[3])
(: Several comma-separated keys: later keys only break ties of earlier ones. :)
order by $main, $point, $sub
return $p
Update: for a variable number of tokens, you could make the order by more robust:
(: Order-by clause with one key per version step; a key is the empty
   sequence when the version has too few tokens for that step. :)
order by
if (count($toks) gt 0) then $main else (),
if (count($toks) gt 1) then $point else (),
if (count($toks) gt 2) then $sub else ()
I did something similar to Jens's answer:
let $products := //product
(: Longest version string; every step is padded to this width.
   The attribute axis is written "@version"; "#version" is not valid XPath. :)
let $max-length := max($products/@version ! string-length(.))
for $product in $products
(: Sort key: each step right-aligned with leading spaces, then all steps
   concatenated, so a plain string comparison follows the numeric order. :)
order by string-join(
  for $part in tokenize($product/@version, '\.')
  return string-join((
    for $_ in 1 to $max-length - string-length($part) return ' ',
    $part)))
return $product
Here's a version that will handle an arbitrary number of segments, as long as they're numeric and all version strings have the same number of segments. It also assumes no one component ever exceeds 999.
This simply combines each numeric segment into a single big number and sorts by that.
(:~
 : Collapses a dotted numeric version into one number by treating every
 : segment as a base-1000 digit, the leftmost segment being the most
 : significant.
 :
 : @param $version dot-separated numeric version, e.g. "1.10.0"
 : @return the combined value, e.g. 1010000 for "1.10.0"
 :)
declare function local:version-order ($version as xs:string) as xs:double
{
  let $segments := fn:tokenize ($version, "\.")
  let $segment-count := fn:count ($segments)
  (: fn:position() is the 1-based index of the segment being weighted :)
  let $weighted :=
    $segments ! (xs:double (.) * math:pow (1000, $segment-count - fn:position ()))
  return fn:sum ($weighted)
};
let $products :=
  <products>
    <product version="1.10.0"/>
    <product version="2.1.6"/>
    <product version="1.2.3"/>
  </products>
for $p in $products/product
(: The attribute axis is written "@version"; "#version" is not valid XPath. :)
order by local:version-order ($p/@version)
return $p

Resources