xdmp:spawn-function() Alternative - xquery

We have around 20 million documents in the database. We process them in batches of 10,000 and use
xdmp:spawn-function() to query over these 20 million documents and delete those that match certain conditions. However, when we run this through Query Console, the query times out. Is there an alternative approach we can use so that the query doesn't time out?
xquery version "1.0-ml";
declare variable $versionToMaintain := 10;
declare variable $batchSize := 10000;
(:~
 : Deletes superseded documents for every tuple in a batch.
 :
 : For each tuple, position 3 is read as a version number and position 2 as
 : the value matched against the `id` element. When
 : (version - $versionToMaintain) is positive, every document in
 : 'collection name 2' that has the same id, a `version` at or below that
 : threshold, and a `c:created-on` at or before the cut-off is deleted.
 :
 : @param $values the tuples of one batch
 :)
declare function local:delete($values) {
  for $value in $values
  let $versionToDelete := $value[3] - $versionToMaintain
  return
    if ($versionToDelete > 0) then
      let $query := cts:and-query((
        cts:collection-query('collection name 2'),
        cts:element-range-query(xs:QName('version'), '<=', xs:int($versionToDelete)),
        cts:element-value-query(xs:QName('id'), $value[2]),
        (: NOTE(review): xdmp:parseDateTime is called with a picture string only
           and the `c` prefix is not declared in this snippet. MarkLogic documents
           xdmp:parse-dateTime($picture, $value) - confirm the intended cut-off
           value and namespace before running. :)
        cts:element-range-query(xs:QName('c:created-on'), '<=',
          xs:dateTime(xdmp:parseDateTime('[Y0001]-[M01]-[D01]')))
      ))
      return (cts:uris((), (), $query) ! xdmp:document-delete(.))
    else ()
};
(: Driver: estimate how many documents in "collection name 1" are not version 1,
   then, per batch, read one page of (URI, id, version) tuples in this request
   and hand that page to a spawned task for deletion. :)
let $notVersionOne := cts:not-query(cts:element-value-query(xs:QName('version'), "1"))
let $candidateCount :=
  xdmp:estimate(
    cts:search(collection("collection name 1"), $notVersionOne, "unfiltered"))
let $batchCount := fn:ceiling($candidateCount div $batchSize)
for $batchNumber in (1 to $batchCount)
let $offset := ($batchNumber - 1) * $batchSize
let $page :=
  cts:value-tuples(
    (
      cts:uri-reference(),
      cts:element-reference(xs:QName('id')),
      cts:element-reference(xs:QName('version'))
    ),
    ("skip=" || $offset, "truncate=" || $batchSize),
    cts:and-query((
      cts:collection-query("collection name 1"),
      $notVersionOne
    ))
  )
return
  xdmp:spawn-function(function() { local:delete($page) })

Well, I think the reason why it is taking so long and potentially timing out is that you are doing a lot of cts:value-tuples() in the for loop iterating over the batches, rather than pushing that work out into the spawned function that gets called for each batch.
Move the paginated cts:value-tuples() call inside of the local:delete(), and pass in the $x batch value, instead of the tuples.
xquery version "1.0-ml";
declare variable $versionToMaintain := 10;
declare variable $batchSize := 10000;
(:~
 : Processes one batch: reads the batch's page of tuples and deletes the
 : superseded documents for each tuple.
 :
 : The page is selected with skip = ($x - 1) * $batchSize and
 : truncate = $batchSize over the (URI, id, version) tuples of documents in
 : "collection name 1" whose `version` element is not "1". For each tuple,
 : when (version - $versionToMaintain) is positive, documents in
 : 'collection name 2' with the same id, a `version` at or below that
 : threshold, and a `c:created-on` at or before the cut-off are deleted.
 :
 : @param $x 1-based batch number
 :)
declare function local:delete($x) {
  let $values :=
    cts:value-tuples(
      (
        cts:uri-reference(),
        cts:element-reference(xs:QName('id')),
        cts:element-reference(xs:QName('version'))
      ),
      ("skip=" || ($x - 1) * $batchSize, "truncate=" || $batchSize),
      cts:and-query((
        cts:collection-query("collection name 1"),
        cts:not-query(cts:element-value-query(xs:QName('version'), "1"))
      ))
    )
  for $value in $values
  let $versionToDelete := $value[3] - $versionToMaintain
  return
    if ($versionToDelete > 0) then
      let $query := cts:and-query((
        cts:collection-query('collection name 2'),
        cts:element-range-query(xs:QName('version'), '<=', xs:int($versionToDelete)),
        cts:element-value-query(xs:QName('id'), $value[2]),
        (: NOTE(review): xdmp:parseDateTime is called with a picture string only
           and the `c` prefix is not declared in this snippet. MarkLogic documents
           xdmp:parse-dateTime($picture, $value) - confirm the intended cut-off
           value and namespace before running. :)
        cts:element-range-query(xs:QName('c:created-on'), '<=',
          xs:dateTime(xdmp:parseDateTime('[Y0001]-[M01]-[D01]')))
      ))
      return (cts:uris((), (), $query) ! xdmp:document-delete(.))
    else ()
};
(: Driver: estimate how many documents in "collection name 1" are not version 1,
   derive the number of batches, and spawn one task per batch number. The
   tuple lookup itself happens inside local:delete. :)
let $candidateCount :=
  xdmp:estimate(
    cts:search(
      collection("collection name 1"),
      cts:not-query(cts:element-value-query(xs:QName('version'), "1")),
      "unfiltered"))
let $batchCount := fn:ceiling($candidateCount div $batchSize)
for $batchNumber in (1 to $batchCount)
return
  xdmp:spawn-function(function() { local:delete($batchNumber) })

Deleting bulk content from MarkLogic database (with a bucket assignment) is always a challenge. Like Mads suggested, you should consider using CoRB. It is easier to tune the performance with different options available.
Secondly, you can consider using a tiered storage approach - like a range partition or query partition (provided the license requirements are met) where you can archive the documents that match the required criteria to a group of forests. You can then use the forest-clear() to do the job for you.

Related

XQuery - wrong indexes in substring after reverse-string function use

I'm trying to implement base64 encoding in a very simple way. In my approach (let's put aside for a moment whether it's appropriate or not) I need to reverse strings and then concatenate them. After that, the concatenated string is passed to the substring function. The strings are joined properly, but when I use substring, BaseX seems to lose track of it.
The funny thing is that substring works well for all lengths starting at 8. So substring($string, 1, 8) and higher gives correct output, but everything below that is messed up, starting with one character disappearing: substring($string, 1, 7) (and below) results in a string of length 6.
Moreover substring can start only with 1st or 0 index. Anything greater results in empty return.
declare variable $array := [];
(:~
 : Entry point: converts $input to its codepoints and passes them to
 : bs:integer-to-binary with an empty accumulator string and the
 : module-level $array.
 :
 : @param $input the string to encode
 :)
declare function bs:encode($input as xs:string) {
  let $codepoints := string-to-codepoints($input)
  return bs:integer-to-binary($codepoints, "", $array)
};
(: Builds a string of binary digits for each integer in $input by recursion.
   On each step the digit for ($i mod 2) is appended to $string and the
   function recurses on xs:integer($i div 2), so digits are accumulated
   least-significant first. When $i reaches 0 the accumulated $string is
   appended to $array with array:append.
   The collected results are passed to bs:check-if-eight before returning, and
   this happens at every recursion level, not only at the outermost call.
   $input  - integers to convert
   $string - digits accumulated so far
   $array  - array the finished string is appended to :)
declare function bs:integer-to-binary
( $input as xs:integer*, $string as xs:string, $array as array(xs:string) ) {
let $strings :=
for $i in $input
return
if ($i != 0)
(: even value: append 0; odd value: append 1 :)
then if ($i mod 2 = 0)
then bs:integer-to-binary(xs:integer($i div 2), concat($string, 0), $array)
else bs:integer-to-binary(xs:integer($i div 2), concat($string, 1), $array)
(: NOTE(review): this inner `if` has no `else` branch - confirm the target
   processor accepts that :)
else if ($i <= 0)
then array:append($array, $string)
return bs:check-if-eight($strings)
} ;
(: For each item in $strings: if its string length is below 8, recurse with a
   0 appended; otherwise keep the item. The resulting sequence is then passed
   to bs:concat-strings.
   Note that the recursive call itself ends in bs:concat-strings, so the value
   it yields for a short item is the result of bs:concat-strings, not the
   padded item.
   $strings - one or more items to pad :)
declare function bs:check-if-eight
( $strings as item()+ ) {
let $fullBinary :=
for $string in $strings
return if (string-length($string) < 8)
then bs:check-if-eight(concat($string, 0))
else $string (: add as private below :)
return bs:concat-strings($fullBinary)
} ;
(:~
 : Reverses the first three items of $strings with functx:reverse-string,
 : concatenates them in order, and passes the result to bs:divide-into-six.
 :
 : @param $strings one or more items; only positions 1 to 3 are read
 :)
declare function bs:concat-strings($strings as item()+) {
  (: per the original author, this yields the correct binary string for "Cat" :)
  bs:divide-into-six(
    concat(
      functx:reverse-string($strings[1]),
      functx:reverse-string($strings[2]),
      functx:reverse-string($strings[3])
    )
  )
};
(:~
 : Returns the first six characters of $binaryString.
 :
 : Author's observation when run inside the full module: 010000 was expected
 : but 000100 came back, a value that does not occur in $binaryString.
 :
 : @param $binaryString string of binary digits
 :)
declare function bs:divide-into-six($binaryString as xs:string) {
  substring($binaryString, 1, 6)
};
bs:encode("Cat")
I expect first six letters from string (010000) instead I get some random sequence I guess (00100). The whole module is meant to encode strings into base64 format but for now (the part i uploaded) should just throw first six bits for 'C'
Alright so I figured it out I guess.
First of all in function concat-strings I changed concat to fn:string-join. It allowed me to pass as an argument symbol that separates joined strings.
(: Debugging variant of bs:concat-strings: the three reversed strings are
   joined with fn:string-join and an 'X' separator instead of concat, so the
   boundaries between the joined pieces are visible in the output. :)
declare function bs:concat-strings ( $strings as item()+ ) {
  let $firstStringToConcat := xs:string(functx:reverse-string($strings[position() = 1]))
  let $secondStringToConcat := xs:string(functx:reverse-string($strings[position() = 2]))
  let $thirdStringToConcat := xs:string(functx:reverse-string($strings[position() = 3]))
  let $concat :=
    (: changed from concat(...) to fn:string-join(..., 'X') :)
    fn:string-join(
      ($firstStringToConcat,
       $secondStringToConcat,
       $thirdStringToConcat), 'X')
  return bs:divide-into-six($concat) } ;
I saw that my input looked like this:
XXXXXXXX01000011XXXXXXXXXXXXXXXXX01100001XXXXXXXXXXXXXXXXX01110100XXXXXXXX
Obviously it had to be looping somewhere without an explicit for loop, and as I am a novice in XQuery I must have missed that. And indeed, I found it in the check-if-eight function:
(: Excerpt of bs:check-if-eight; the two lines marked below are the ones the
   author singled out as the cause of the repeated output. :)
declare function bs:check-if-eight ( $strings as item()+ ) {
  let $fullBinary :=   (: <-- marked by the author :)
    for $string in $strings
    return if (string-length($string) < 8)
      then bs:check-if-eight(concat($string, 0))
      else $string (: add as private below :)
  return bs:concat-strings($fullBinary)   (: <-- marked by the author :)
} ;
Despite being declared above the for keyword, the $fullBinary variable was effectively evaluated in a loop and produced empty strings(?), which was clearly visible once I used X as a separator.
DISCLAIMER: I thought about this before and used functx:trim but for some reason it doesnt work like I expected. So it might not for you too if having similar issue.
At this point it was clear that let $fullBinary cannot be bound in the FLWOR expression this way, or at least that it must not trigger the concat-strings function. I changed it so that it now produces only the string, and I'm now trying to figure out a new order for running the whole module, but I think the main problem here is solved.

Inserting a document and reading it in same transaction in MarkLogic

Below is the code snippet I am using for one of the functionality
(:~
 : Estimates, unfiltered, how many /count matches have the given Id attribute
 : on `count` and the given MatchStatus attribute on `child`.
 :
 : @param $Id     value matched exactly against count/@Id
 : @param $status value matched exactly against child/@MatchStatus
 : @return the estimate from xdmp:estimate
 :)
declare function local:matchCounts($Id as xs:string, $status as xs:string) as xs:int {
  let $criteria :=
    cts:and-query((
      cts:element-attribute-value-query(xs:QName("count"), xs:QName("Id"), $Id, "exact"),
      cts:element-attribute-value-query(xs:QName("child"), xs:QName("MatchStatus"), $status, "exact")
    ))
  return xdmp:estimate(cts:search(/count, $criteria, "unfiltered"))
};
(:~
 : Returns the match counts for $Id as a two-item sequence: the "even" count
 : followed by the "odd" count.
 :
 : @param $Id the Id passed through to local:matchCounts
 :)
declare function local:saveCountsMatchC($Id as xs:string) {
  for $status in ("even", "odd")
  return local:matchCounts($Id, $status)
};
(: Spawns one insert task per batch and then returns the even/odd counts for $Id.
   With $total-records = 1000 and $batch-size = 50 the loop runs
   fn:ceiling(1000 div 50) = 20 times. Each spawned task inserts one document
   per value in ($start to $end) under a URI built from a fresh sem:uuid-string().
   $Id - value written to the Id attribute of every inserted <count> element and
         used for the final count lookup :)
declare function local:matchingProcess($Id as xs:string) {
let $total-records := 1000
let $batch-size := 50
(: running offset; advanced with xdmp:set at the end of every iteration :)
let $pagination := 0
let $bs :=
for $records in 1 to fn:ceiling($total-records div $batch-size )
let $start := fn:sum($pagination + 1)
let $end := fn:sum($batch-size + $pagination)
let $_ := xdmp:set($pagination, $end)
return
xdmp:spawn-function
(
function() {
(: $pos is the position within ($start to $end), so the even/odd choice is
   made on the position inside the batch rather than on $each :)
for $each at $pos in ($start to $end)
let $id := sem:uuid-string()
let $xml := if(($pos mod 2) eq 0) then <count Id='{$Id}'><child MatchStatus='even'></child></count>
else <count Id='{$Id}'><child MatchStatus='odd'></child></count>
return xdmp:document-insert(concat("/", $id, ".xml"), $xml)
},
<options xmlns="xdmp:eval"><result>{fn:true()}</result><commit>auto</commit><update>true</update></options>
)
(: NOTE(review): presumably intended to force the spawned results to be
   evaluated before counting - confirm :)
let $_ := $bs
return local:saveCountsMatchC($Id)
};
local:matchingProcess("1")
The requirement over here is to iterate 1000 documents using batch size of 50, so basically I am using spawn function to create 20 batches of size 50 which inserts 1000 documents in my database.
Once those documents are inserted, I need to read those documents in same transaction. Here 500 documents have MatchStatus='odd' and 500 documents have MatchStatus='even'
The query should return (500,500) as output; Instead it returns (0,0)
I am using the <result>{fn:true()}</result> option so that my next statement waits for all spawned tasks to complete, but that is not happening.
Can anybody help me with the requirement?
Note: Need to insert 1000 documents and then read them in same function call only
Your code that executes the spawns does not perform updates itself, so will run in so-called query mode. In query mode only updates from before the start of the code are visible.
You could try running in update mode (declare option xdmp:transaction-mode "update";), but usually it is easier to just spawn or eval the counting/reading of your updates as well. E.g. wrap the xdmp:estimate in an xdmp:spawn-function with result true as well.
HTH!

Sorting multiple maps in marklogic 8

This is more of an XQuery than MarkLogic. I have three map:map and each map has key-value pair of "id" and score. I would like to sort all the distinct ids based on the score from each maps.
For eg:
map1 : 1:2048, 5:2000
map2 : 2:5000, 1:1000, 4:3000
map3 : 6:100, 7:5000, 2:2000
In the above example, each map is id:score for key value (did not know how to represent here :))..
I want the sorted list of id from three maps based on score..
Is there a good way or better way of doing the sorting, or do I have to union the keys of the map and iterate the sequence of keys and sort them ?
This seems like a great use case for folding. Its part of Xquery 3.0 spec.
Folding can go through a sequence of items and gets the result for each item as it goes through. In this example $combinedMaps is the result of the last call and $mapToMerge is the item in the sequence it is currently going through.
Here an example of what you would want to do.
(:~
 : For every key of $mapA, stores in $newMap the larger of $mapA's value and
 : $mapB's value for that key; a key missing from $mapB counts as 0.
 : Keys that occur only in $mapB are not visited.
 :
 : @param $newMap map that receives the entries (updated in place)
 : @param $mapA   map whose keys are iterated
 : @param $mapB   map whose values are compared against
 : @return $newMap
 :)
declare function local:sortMaps(
  $newMap as map:map,
  $mapA as map:map,
  $mapB as map:map
) as map:map {
  let $stored :=
    for $key in map:keys($mapA)
    let $fromA := map:get($mapA, $key)
    let $fromB := (map:get($mapB, $key), 0)[1]
    let $winner := if ($fromA gt $fromB) then $fromA else $fromB
    return map:put($newMap, $key, $winner)
  return $newMap
};
(: Sample data: three id -> score maps. :)
let $first :=
  map:new((map:entry("1", 2048), map:entry("5", 2000)))
let $second :=
  map:new((map:entry("2", 5000), map:entry("1", 1000), map:entry("4", 3000)))
let $third :=
  map:new((map:entry("6", 100), map:entry("7", 5000), map:entry("2", 2000)))
let $maps := ($first, $second, $third)
(: Merge step: run local:sortMaps in both directions so that keys from either
   side end up in a fresh map. :)
let $merge :=
  function($combinedMaps, $mapToMerge) {
    local:sortMaps(
      local:sortMaps(map:map(), $combinedMaps, $mapToMerge),
      $mapToMerge,
      $combinedMaps)
  }
return
  fn:fold-left($merge, $maps[1], $maps)

marklogic 8 - How to search collection wise

I have tried this below mentioned Xquery in my query console, but I need search the data from collections wise?
(: Finds the URIs of documents containing a P element whose name attribute is
   $value1 and whose value attribute is $value2, then returns the value
   attribute of every PS/P element named "volume" in those documents. :)
let $value1 := "antony"
let $value2 := "cse"
for $uri1 in cts:uris((), (), (
  cts:element-query(xs:QName("P"),
    cts:and-query((
      cts:element-attribute-value-query(xs:QName("P"), xs:QName("name"), $value1),
      cts:element-attribute-value-query(xs:QName("P"), xs:QName("value"), $value2)
    ))
  )
))
let $xml := doc($uri1)
(: attribute axis is written with @ :)
return $xml//PS/P[@name eq "volume"]/@value
Please suggest me how to add the collection in above mentioned XQuery?
First I would say that since you ultimately want documents rather than URIs, it would be more effective to use cts:search directly. You can add the "unfiltered" option if you want to avoid the cost of filtering, e.g.:
(: Unfiltered search over all documents for a P element with the given name
   and value attributes; from the matches, returns the value attribute of
   every PS/P element named "volume". :)
let $value1 := "antony"
let $value2 := "cse"
return
  cts:search( doc(),
    cts:element-query(xs:QName("P"),
      cts:and-query((
        cts:element-attribute-value-query(xs:QName("P"), xs:QName("name"), $value1),
        cts:element-attribute-value-query(xs:QName("P"), xs:QName("value"), $value2)
      ))
    ), "unfiltered"
  )//PS/P[@name eq "volume"]/@value
or, since you only care about /PS/P elements:
(: Same search, but scoped to PS/P elements, so the predicate and attribute
   step apply directly to the returned P elements. :)
cts:search( doc()//PS/P,
  cts:element-query(xs:QName("P"),
    cts:and-query((
      cts:element-attribute-value-query(xs:QName("P"), xs:QName("name"), "antony"),
      cts:element-attribute-value-query(xs:QName("P"), xs:QName("value"), "cse")
    ))
  ), "unfiltered"
)[@name eq "volume"]/@value
To search within a collection, replace the doc() with collection("yourcollection"):
(: Same search restricted to one collection: the searchable expression starts
   from collection("yourcollection") instead of doc(). :)
cts:search( collection("yourcollection")//PS/P,
  cts:element-query(xs:QName("P"),
    cts:and-query((
      cts:element-attribute-value-query(xs:QName("P"), xs:QName("name"), "antony"),
      cts:element-attribute-value-query(xs:QName("P"), xs:QName("value"), "cse")
    ))
  ), "unfiltered"
)[@name eq "volume"]/@value

Ordering by a sequence of values in XQuery

I've got some XML data that takes this form:
<products>
<product version="1.2.3"/>
<product version="1.10.0"/>
<product version="2.1.6"/>
</products>
...And so on. I want to order these in XQuery by version number. Trouble is, if I just do order by $thing/@version, it does a lexicographic comparison that puts 1.10.0 before 1.2.3, which is wrong.
What I really want to do is something like:
order by tokenize($thing/@version, '\.') ! number(.)
Unfortunately this doesn't work because XQuery doesn't let you use an entire sequence as an ordering key. How can I get something like this?
A solution that doesn't rely on all the version numbers having the same number of dots would be preferable, but I'll take what I can get.
All you can do is normalize the version numbers so you can apply lexical ordering.
Determine maximum string length in a version step
Pad it with 0's (or space if you prefer, but you will have to change the code for this)
Tokenize each version, pad each version step, rejoin them
Compare based on padded version
I didn't clean up the code and pulled two functions from functx, but it works and should be fine for embedding as needed. The code can also deal with single letters; if necessary, you could replace all occurrences of "alpha", ... by, for example, "a", ...
declare namespace functx = "http://www.functx.com";
(:~
 : Concatenates $count copies of $stringToRepeat.
 :
 : @param $stringToRepeat the string to repeat
 : @param $count          number of copies
 : @return the repeated string; a zero-length string when there is nothing to join
 :)
declare function functx:repeat-string(
  $stringToRepeat as xs:string?,
  $count as xs:integer
) as xs:string {
  string-join((1 to $count) ! $stringToRepeat, '')
};
(:~
 : Left-pads the string form of $integerToPad with '0' up to $length characters.
 :
 : @param $integerToPad the value to pad
 : @param $length       the target length
 : @return the padded string
 : @error functx:Integer_Longer_Than_Length when the string form is longer than $length
 :)
declare function functx:pad-integer-to-length(
  $integerToPad as xs:anyAtomicType?,
  $length as xs:integer
) as xs:string {
  let $digits := string($integerToPad)
  let $missing := $length - string-length($digits)
  return
    if ($missing < 0)
    then error(xs:QName('functx:Integer_Longer_Than_Length'))
    else concat(functx:repeat-string('0', $missing), $digits)
};
(:~
 : Normalises a dotted version string: splits it on '.', pads every step to
 : $max-length with functx:pad-integer-to-length, and joins the steps with '.'.
 :
 : @param $a          the version string
 : @param $max-length the length each step is padded to
 : @return the normalised version string
 :)
declare function local:version-compare($a as xs:string, $max-length as xs:integer)
as xs:string*
{
  let $padded-steps :=
    for $step in tokenize($a, '\.')
    return functx:pad-integer-to-length($step, $max-length)
  return string-join($padded-steps, '.')
};
(: Sorts the sample versions by their zero-padded form. The padding width is
   the longest single step found in any of the versions. :)
let $versions := ("1.42", "1.5", "1", "1.42.1", "1.43", "2")
let $widest-step :=
  max($versions ! (tokenize(., '\.') ! string-length(.)))
for $version in $versions
order by local:version-compare($version, $widest-step)
return $version
Returns:
1 1.5 1.42 1.42.1 1.43 2
Order by doesn't accept a sequence, but you can explicitly tokenize the versions and add them to the order by, separated by commas (note the exclusion of parens).
(: Orders products by version: the version attribute is split on '.', the
   first three tokens are cast to integers, and they are used as three
   separate ordering keys. :)
let $products :=
  <products>
    <product version="1.2.3"/>
    <product version="1.10.0"/>
    <product version="2.1.6"/>
  </products>
for $p in $products/product
(: attribute axis is written with @ :)
let $toks := tokenize($p/@version, '\.')
let $main := xs:integer($toks[1])
let $point := xs:integer($toks[2])
let $sub := xs:integer($toks[3])
order by $main, $point, $sub
return $p
Update: for a variable number of tokens, you could make the order by more robust:
(: Each ordering key is used only when the corresponding token exists;
   otherwise the key is the empty sequence. :)
order by
if (count($toks) gt 0) then $main else (),
if (count($toks) gt 1) then $point else (),
if (count($toks) gt 2) then $sub else ()
I did something similar to Jens's answer:
(: Orders products by a space-padded form of the version: every dot-separated
   part is left-padded with spaces to $max-length, which is the length of the
   longest whole version string, and the padded parts are joined into one
   ordering key. :)
let $products := //product
let $max-length := max($products/@version ! string-length(.))
for $product in $products
order by string-join(
  for $part in tokenize($product/@version, '\.')
  return string-join((
    for $_ in 1 to $max-length - string-length($part) return ' ',
    $part)))
return $product
Here's a version that will handle an arbitrary number of segments, as long as they're numeric and all version strings have the same number of segments. It also assumes no one component ever exceeds 999.
This simply combines each numeric segment into a single big number and sorts by that.
(:~
 : Collapses a dotted version string into one number: each segment is cast to
 : xs:double and weighted by 1000 raised to (segment count - segment position),
 : and the weighted segments are summed.
 :
 : @param $version dotted version string with numeric segments
 : @return the combined ordering number
 :)
declare function local:version-order ($version as xs:string) as xs:double
{
  let $segments := fn:tokenize($version, "\.")
  let $total := fn:count($segments)
  return
    fn:sum(
      for $position in 1 to $total
      return xs:double($segments[$position]) * math:pow(1000, $total - $position)
    )
};
(: Orders the sample products by the number local:version-order computes
   from each version attribute. :)
let $products :=
  <products>
    <product version="1.10.0"/>
    <product version="2.1.6"/>
    <product version="1.2.3"/>
  </products>
for $p in $products/product
(: attribute axis is written with @ :)
order by local:version-order ($p/@version)
return $p

Resources