Count number of word occurrences working slow BaseX xquery - xquery

I want to count the occurrences of words in an XML document. The query returns the correct counts, but it runs slowly.
There are only two XML files (236 KB and 155 KB), and it takes 17 seconds to produce the result.
Below is the query:
(: Counts how often each pair of adjacent tokens occurs in the opened document. :)
let $doc := db:open('test','/ieee/test.xml')
(: Split every text node into tokens after collapsing runs of whitespace. :)
let $tokens := $doc//text()/fn:tokenize(fn:normalize-space(.),'\s')
(: For every token except the last, join it with its successor into one two-token string. :)
let $stringtoken := for $x at $pos in $tokens[position() = 1 to fn:last()-1]
let $y := string-join($tokens[position() = $pos to $pos + 1],' ')
return $y
return
<results>
{
for $result in distinct-values($stringtoken)
(: This predicate filters the complete $stringtoken sequence once for every distinct value. :)
let $count := count($stringtoken[. = $result])
return
<term word="{$result}" count="{$count}"></term>
}
</results>
In the above query, the line let $count := count($stringtoken[. = $result]) is taking too much time.
Any suggestions to improve the performance of the code would be much appreciated.

The group by statement will speed up your query a lot:
return
  <results>{
    (: Bucket identical strings together, then report the size of each bucket. :)
    for $occurrence in $stringtoken
    group by $token := $occurrence
    return <term word="{ $token }" count="{ count($occurrence) }"/>
  }</results>

Related

I don't know the syntax error in this xquery code

(: Attempt as posted: list countries with more than 25 airports.
   This query does not parse; the processor's error message is quoted below it. :)
let $d := doc('mondial.xml')
let $airports := $d/mondial/airport
let $countries := $d/mondial/country
for $data1 in $countries
let $count :=xs:integer("0")
let $name :=$data1/name
let $car_code :=$data1/@car_code
for $data2 in $airports
where $car_code = $data2/@country
$count:= $count+ 1
where xs:integer($count)>25
return
<country>
<name>{data($name)}</name>
<count>{data($count)}</count>
</country>
This is my code. When I run it, I get a syntax error:
error: syntax error, unexpected $[err:XPST0003]
$count:=$count+1
Perhaps
(: Lists every country that has more than 25 airports, together with that count. :)
let $d := doc('mondial.xml')
let $airports := $d/mondial/airport
let $countries := $d/mondial/country
for $country in $countries
(: Attributes are addressed with "@": an airport belongs to the country whose car_code it references. :)
let $airport-count := count($airports[@country = $country/@car_code])
where $airport-count > 25
return
<country>
{
$country/name,
<count>{$airport-count}</count>
}
</country>
Actually I found the answer
(: Lists every country that has more than 25 airports, together with that count. :)
let $d := doc('mondial.xml')
let $airports := $d/mondial/airport
let $countries := $d/mondial/country
for $data1 in $countries
let $name :=$data1/name
let $car_code :=$data1/@car_code
(: Count instead of incrementing: XQuery variables cannot be reassigned.
   Attributes are addressed with "@". :)
let $count := count($airports[@country eq $car_code])
where $count>25
return
<country>
<name>{data($name)}</name>
<count>{data($count)}</count>
</country>

MarkLogic Xquery: How to sort string values in a for loop

I've got a sequence that I need to sort from earliest year to latest year. Some unusual values in the year element make the sort a little more complicated. Is there any way to achieve the following?
(: Plain sort of the sample values; the prefixed entries are compared like any other string. :)
for $entry in ('1982', '2019', '2095', 'pre-1982', 'post-2095')
order by $entry
return $entry
The text of the dates element is usually just the year, but outlier cases have a pre- or post- prefix attached. Is there a minimal way to achieve this?
I am not sure if this is minimal, but it works:
(: Sorts year strings chronologically; "pre-YYYY" sorts just before YYYY and "post-YYYY" just after it. :)
let $dates := ('1982', '2019', '2095', 'pre-1982', 'post-2095')
return
  for $date in $dates
  (: Strip a "pre-"/"post-" prefix, if any, to obtain the bare year. :)
  let $year :=
    if (fn:contains($date, "-"))
    then fn:substring-after($date, "-")
    else $date
  (: Tie-breaker among entries that share the same year. :)
  let $prepost :=
    if (fn:starts-with($date, "pre"))
    then -1
    else if (fn:starts-with($date, "post"))
    then 1
    else 0
  (: Compare years as numbers, not strings, so that years with a different
     number of digits (e.g. '900' vs '1982') still order correctly. :)
  order by fn:number($year), $prepost
  return $date
Just FYI: Definitely not minimal, but I wanted to know what fn:sort does when a sequence is returned. Turns out it does the right thing.
xquery version "3.1";

(: Multipliers applied to the sort key to choose the sort direction. :)
declare variable $local:ascending := 1;
declare variable $local:descending := -1;

(:~
 : Builds the sort key for a year string that may carry a "pre-" or "post-" prefix.
 :
 : @param $y     the year string, e.g. "1982", "pre-1982" or "post-2095"
 : @param $order 1 for ascending, -1 for descending
 : @return a two-item sequence: the year multiplied by $order, followed by a
 :         tie-breaker (-1 for "pre", 1 for "post", each multiplied by $order; otherwise 0)
 :)
declare function local:sort-prefixed-years ($y, $order) {
  if (fn:not(fn:contains($y, "-")))
  then (xs:integer($y) * $order, 0)
  else
    let $parts := fn:tokenize($y, "-")
    let $prefix := $parts[1]
    let $tie-breaker :=
      if ($prefix eq "pre") then -1 * $order
      else if ($prefix eq "post") then 1 * $order
      else 0
    return (xs:integer($parts[2]) * $order, $tie-breaker)
};

(: Sort key for ascending order. :)
declare function local:sort-prefixed-years-ascending ($prefixed-year) {
  $prefixed-year => local:sort-prefixed-years($local:ascending)
};

(: Sort key for descending order. :)
declare function local:sort-prefixed-years-descending ($prefixed-year) {
  $prefixed-year => local:sort-prefixed-years($local:descending)
};

('1982', '2019', '2095', 'pre-1982', 'post-2095')
  => sort((), local:sort-prefixed-years-descending#1)

Trouble with making xquery function recursive

I'm having some difficulty with making a function I've written recursive. I need to be able to turn this xml:
<entry ref="22">
<headword>abaishen</headword>
<part_of_speech> v. </part_of_speech>
<variant>abeishen</variant>
<variant>abaissen</variant>
<variant>abeisen</variant>
<variant>abashen</variant>
<variant>abasshen</variant>
<variant>abassen</variant>
<variant>abeeshen</variant>
<variant>abesen</variant>
<variant>abessen</variant>
<variant>abaished</variant>
<variant>-et</variant>
<variant>-it</variant>
<variant>abaisht</variant>
<variant>abaist</variant>
<variant>abasht</variant>
<variant>abast</variant>
</entry>
Into this XML -- essentially replacing the ending of any entry that begins with an "-" with the stem of the last complete entry:
<entry ref="22">
<headword>abaishen</headword>
<variant>abeishen</variant>
<variant>abaissen</variant>
<variant>abeisen</variant>
<variant>abashen</variant>
<variant>abasshen</variant>
<variant>abassen</variant>
<variant>abeeshen</variant>
<variant>abesen</variant>
<variant>abessen</variant>
<variant>abaished</variant>
<variant>abaishet</variant>
<variant>abaishit</variant>
<variant>abaisht</variant>
<variant>abaist</variant>
<variant>abasht</variant>
<variant>abast</variant>
<part_of_speech> v. </part_of_speech>
</entry>
The issue I'm running into is that second entry, the -it one, returns "abaishet" with the code I currently have:
(: Expands an element whose text is a hyphen-prefixed ending (e.g. "-et") into a
   full form built from its immediately preceding sibling; any other element is
   returned unchanged. :)
declare function local:hyphen-replace($f) {
let $j :=
(: Only elements whose text starts with "-" are rewritten. :)
if (substring($f/text(), 1, 1) = "-") then
let $ending := substring-after($f/text(),"-")
let $ending-length := string-length($ending)
let $previous := $f/preceding-sibling::*[1]
let $previous-length := string-length($previous)
return
if (substring($previous/text(), 1, 1) = "-") then
(: NOTE(review): this branch returns the expansion of $previous itself, so the
   ending carried by $f is never applied when two hyphenated entries follow each other. :)
local:hyphen-replace($previous)
else
(: Drop as many trailing characters from the previous form as the ending has, then append the ending. :)
element {name($f)} {concat(substring($previous,1,($previous-length - $ending-length)),$ending)}
else
$f
return $j
};
(:~
 : For every node whose part_of_speech text equals " v. ", returns the siblings of
 : that part_of_speech element after passing each through local:hyphen-replace;
 : every other node contributes an <empty/> placeholder. The part_of_speech
 : element(s) of $nodes are appended after those results.
 :
 : @param $nodes the nodes to process
 : @return the processed siblings (or placeholders), followed by the part_of_speech element(s)
 :)
declare function local:verbCheck($nodes as node()*) as node()* {
  let $expanded :=
    for $node in $nodes
    let $pos := $node/part_of_speech
    return
      if ($pos/text() = " v. ") then
        (: Siblings are only looked up for verbs; the original also bound an
           unused, shadowed copy of this sequence outside the loop. :)
        for $sibling in functx:siblings($pos)
        return local:hyphen-replace($sibling)
      else
        <empty/>
  return ($expanded, $nodes/part_of_speech)
};
<list>
{
(: NOTE(review): '?select=*.xml' is presumably a collection-URI filter understood
   by the processor in use; confirm. :)
let $collection := concat($collection, '?select=*.xml')
let $q := collection($collection)
let $v := local:buildNodes($q)
let $entries :=
for $n in $v
(: Copy the ref attribute (addressed with "@") onto the rebuilt entry. :)
return <entry ref="{$n/@ref}">{local:verbCheck($n)}</entry>
return local:remove-empty-elements($entries)
}
</list>
It's obvious to me that my problem is with this piece of code in local:hyphen-replace:
if (substring($previous/text(), 1, 1) = "-") then
local:hyphen-replace($previous)
because it's calling the function on the immediately previous item and replacing the "-it" node with its information. But I don't know how to rewrite it so that the recursion works properly. Any suggestions would be appreciated. Thank you.

How can I format a decimal in xquery?

I'm trying to format decimals in XQuery. The decimals are currency, so the format should be ,###.##.
For example:
5573652.23 should be 5,573,652.23
and
352769 should be 352,769 (or 352,769.00 if it's easier/cleaner)
Right now I'm using this function from http://www.xqueryhacker.com/2009/09/format-number-in-xquery/, but I can't use decimals with it:
(:~
 : Formats an integer with a comma between every group of three digits.
 :
 : @param $i the integer to format
 : @return the digits of $i grouped in threes from the right, with a leading "-" when $i is negative
 :)
declare function local:format-int($i as xs:int) as xs:string
{
  (: Work on the digits only; the sign is re-attached at the end. :)
  let $unsigned :=
    if ($i lt 0) then fn:substring(fn:string($i), 2)
    else fn:string($i)
  let $digits := fn:string-to-codepoints($unsigned)
  let $total := fn:count($digits)
  let $grouped :=
    for $digit at $k in $digits
    return (
      (: A comma precedes every digit that starts a group of three, except the very first digit. :)
      if ($k gt 1 and ($total - $k + 1) mod 3 eq 0)
      then fn:string-to-codepoints(',') else (),
      $digit
    )
  return fn:concat(
    if ($i lt 0) then '-' else '',
    fn:codepoints-to-string($grouped)
  )
};
I'm using Saxon 9HE for my processor.
Any help would be greatly appreciated.
----- UPDATE -----
Based on Dimitre's answer, I modified the function to save the decimal portion and add it to the end of the return string.
New Function
(:~
 : Formats a decimal with a comma between every group of three integer digits,
 : keeping at most the first two fractional digits (truncated, not rounded).
 :
 : @param $i the decimal to format
 : @return the formatted number; the fractional part is omitted when $i has none
 :)
declare function local:format-dec($i as xs:decimal) as xs:string
{
  (: Integer digits of the absolute value: everything before the decimal point. :)
  let $whole := replace(string(abs($i)), '\..*$', '')
  (: Up to two digits after the decimal point; empty when there is no fraction. :)
  let $fraction := substring(substring-after(string($i), '.'), 1, 2)
  let $digits := string-to-codepoints($whole)
  let $total := count($digits)
  let $grouped :=
    for $digit at $k in $digits
    return (
      (: A comma precedes every digit that starts a group of three, except the very first digit. :)
      if ($k gt 1 and ($total - $k + 1) mod 3 eq 0)
      then string-to-codepoints(',') else (),
      $digit
    )
  return concat(
    if ($i lt 0) then '-' else '',
    codepoints-to-string($grouped),
    if ($fraction != '') then concat('.', $fraction) else ''
  )
};
Use:
(: Formats a non-negative decimal by grouping its integer part with
   local:format-int and re-attaching the fractional digits. :)
let $n := 5573652.23
(: string(0.23) is "0.23", so the fractional digits start at the third character;
   for a whole number this yields the empty string. :)
let $fraction := substring(string($n - floor($n)), 3)
return
  concat(local:format-int(xs:int(floor($n))),
         (: Only emit the decimal point when there are digits to follow it. :)
         if ($fraction ne '') then concat('.', $fraction) else '')
This produces exactly the wanted, correct result:
5,573,652.23
This doesn't work for you?:
format-number(5573652.23,",###.##")
You can play with this here. I am pretty sure that saxon supports this function.
Edit: This function is not supported in saxon (see comments below).
With XQuery 3.0 and Saxon-HE 9.7 Parser you can do the following:
(: Two named decimal formats: one with "," as decimal separator and "." as grouping
   separator, and one with the roles swapped. :)
declare decimal-format local:de decimal-separator = "," grouping-separator = ".";
declare decimal-format local:en decimal-separator = "." grouping-separator = ",";

(: Format each sample number once per named format. :)
for $i in (1234.567, 789, 1234567.765)
return (
  format-number($i, "#.###,##", "local:de"),
  format-number($i, "#,###.##", "local:en")
)
The output is:
<?xml version="1.0" encoding="UTF-8"?>1.234,57 1,234.57 789,0 789.0 1.234.567,76
1,234,567.76

XQuery wrap result in computed node

I am trying to do some simple pagination in XQuery. I would like my root element of the returned XML to have (as attributes) various properties about the pagination (current page etc).
However I can't seem to find a way to add these dynamic attributes to my root element.
I've tried playing with the
element name {expr} and attribute name {expr}
functions, but can't seem to get them to work.
<result>{
(: Collect every item that is to be paginated. :)
let $results :=
for $item in doc('mydoc')/root/item
return $item
let $requested-page-nbr := 2
let $items-per-page := 10
let $count := count($results)
let $last-page-nbr := fn:ceiling($count div $items-per-page)
(: Clamp the requested page to the last available page. :)
let $actual-page-nbr := if ($requested-page-nbr gt $last-page-nbr) then $last-page-nbr else $requested-page-nbr
(: Index of the first item on the page, counting from 1. :)
let $start-item := $items-per-page * $actual-page-nbr - ( $items-per-page - 1 )
(: Index at which a full page would end, then capped to the number of items available. :)
let $natural-end-item := $actual-page-nbr * $items-per-page
let $actual-end-item := if ($count ge $natural-end-item) then $natural-end-item else $count
for $j in ($start-item to $actual-end-item )
let $current := item-at($results, $j)
return
(: One document-summary is emitted per item on the page, each repeating the pagination values as attributes. :)
<document-summary
requested-page-nbr="{$requested-page-nbr}"
items-per-page="{$items-per-page}"
count="{$count}"
last-page-nbr="{$last-page-nbr}"
actual-page-nbr="{$actual-page-nbr}"
start-item="{$start-item}"
natural-end-item="{$natural-end-item}"
actual-end-item="{$actual-end-item}">
{($current)}
</document-summary>
}</result>
to add an attribute to the root:
<result>{attribute page {3}}</result>
in your case you probably want to do something like: (?)
...
return (
attribute page {$actual-page-nbr},
for $j in ($start-item to $actual-end-item )
let $current := item-at($results, $j)
return
<document-summary
requested-page-nbr="{$requested-page-nbr}"
items-per-page="{$items-per-page}"
count="{$count}"
last-page-nbr="{$last-page-nbr}"
actual-page-nbr="{$actual-page-nbr}"
start-item="{$start-item}"
natural-end-item="{$natural-end-item}"
actual-end-item="{$actual-end-item}">
{($current)}
</document-summary>)
...
does that answer your question?
I don't think that is the proper XQuery way...
This XQuery:
(: Returns one page of /root/item elements wrapped in a document-summary element
   whose attributes describe the pagination state. :)
declare variable $requested-page-nbr external;
declare variable $items-per-page external;
declare variable $items := /root/item;
(: The first item of every page. Written as (position() - 1) mod N = 0 so that a
   page size of 1 also works; "position() mod 1" is always 0 and would never equal 1. :)
declare variable $firsties := $items[(position() - 1) mod $items-per-page = 0];
(: Take the page number from the positional variable. fn:index-of compares atomized
   values, so it would report several positions when two page-leading items have equal content. :)
for $first at $actual-page-nbr in $firsties
let $group := $first|
              $first/following-sibling::item[position() < $items-per-page]
(: Number of items on all earlier pages. :)
let $previous := ($actual-page-nbr - 1) * $items-per-page
where $actual-page-nbr = $requested-page-nbr
return
<result>
<document-summary requested-page-nbr="{$requested-page-nbr}"
items-per-page="{$items-per-page}"
count="{count($items)}"
last-page-nbr="{count($firsties)}"
actual-page-nbr="{$actual-page-nbr}"
start-item="{$previous + 1}"
natural-end-item="{$previous + $items-per-page}"
actual-end-item="{$previous + count($group)}">{
$group
}</document-summary>
</result>
With this input:
<root>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
<item>5</item>
<item>6</item>
<item>7</item>
<item>8</item>
<item>9</item>
<item>10</item>
<item>11</item>
<item>12</item>
<item>13</item>
</root>
With $requested-page-nbr set to 2 and $items-per-page set to 3, output:
<result>
<document-summary requested-page-nbr="2"
items-per-page="3"
count="13"
last-page-nbr="5"
actual-page-nbr="2"
start-item="4"
natural-end-item="6"
actual-end-item="6">
<item>4</item>
<item>5</item>
<item>6</item>
</document-summary>
</result>
With $requested-page-nbr set to 4 and $items-per-page set to 4, output:
<result>
<document-summary requested-page-nbr="4"
items-per-page="4"
count="13"
last-page-nbr="4"
actual-page-nbr="4"
start-item="13"
natural-end-item="16"
actual-end-item="13">
<item>13</item>
</document-summary>
</result>
As for returning a "page" of results, where you are using
{($current)}
we are using something like the following
{ subsequence($results, $start-item, $items-per-page) }

Resources