I'm trying to write an HTTP parser in C++ using ragel, but finding that the parser generated is not greedy.
The first step is to parse a URL, and here is the ragel grammar translated from RFC-3986:
#include <string>
#include <iostream>
#include <string_view>
%%{
machine http_uri_parser;
action mark_scheme {
std::printf("mark scheme, p:%s\n", p);
this->scheme.data = p;
}
action store_scheme {
std::printf("store scheme, p:%s\n", p);
this->scheme.len = p - this->scheme.data;
}
action mark_authority {
std::printf("mark authority, p:%s\n", p);
this->authority.data = p;
}
action store_authority {
std::printf("store authority, p:%s\n", p);
this->authority.len = p - this->authority.data;
}
action mark_userinfo {
std::printf("mark userinfo, p:%s\n", p);
this->userinfo.data = p;
}
action store_userinfo {
std::printf("store userinfo, p:%s\n", p);
this->userinfo.len = p - this->userinfo.data;
}
action mark_host {
std::printf("mark host, p:%s\n", p);
this->host.data = p;
}
action store_host {
std::printf("store host, p:%s\n", p);
this->host.len = p - this->host.data;
}
action mark_port {
std::printf("mark port, p:%s\n", p);
this->port.data = p;
}
action store_port {
std::printf("store port, p:%s\n", p);
this->port.len = p - this->port.data;
}
action mark_path {
std::printf("mark path, p:%s\n", p);
this->path.data = p;
}
action store_path {
std::printf("store path, p:%s\n", p);
this->path.len = p - this->path.data;
}
action mark_query {
std::printf("mark query, p:%s\n", p);
this->query.data = p;
}
action store_query {
std::printf("store query, p:%s\n", p);
this->query.len = p - this->query.data;
}
action mark_fragment {
std::printf("mark fragment, p:%s\n", p);
this->fragment.data = p;
}
action store_fragment {
std::printf("store fragment, p:%s\n", p);
this->fragment.len = p - this->fragment.data;
}
action done {
std::printf("parser done, p:%s\n", p);
this->_done = 1;
fbreak;
}
###############################################################################
# Characters
###############################################################################
crlf = '\r\n';
gen_delims = ( ':' | '/' | '?' | '#' | '[' | ']' | '#' );
sub_delims = ( '!' | '$' | '&' | "'" | '(' | ')'
| '*' | '+' | ',' | ';' | '=' );
reserved = ( gen_delims | sub_delims );
unreserved = ( alpha | digit | '-' | '.' | '_' | '~' );
pct_encoded = ( '%' xdigit xdigit );
###############################################################################
# Scheme
###############################################################################
scheme = ( alpha ( alpha | digit | '+' | '-' | '.' )* )
>mark_scheme %store_scheme;
###############################################################################
# Authority
###############################################################################
dec_octet = ( ( digit ) # 0-9
| ( ( '1'..'9' ) digit ) # 10-99
| ( '1' digit{2} ) # 100-199
| ( '2' ( '0'..'4' ) digit ) # 200-249
| ( '25' ( '0'..'5' ) ) # 250-255
);
IPv4_address = ( dec_octet '.' dec_octet '.' dec_octet '.' dec_octet );
h16 = ( xdigit{1,4} );
# 16 bits of address represented in hexadecimal
ls32 = ( ( h16 ':' h16 ) | IPv4_address );
# least-significant 32 bits of address
IPv6_address = ( ( ( h16 ':' ){6} ls32 )
| ( '::' ( h16 ':' ){5} ls32 )
| ( ( h16 )? '::' ( h16 ':' ){4} ls32 )
| ( ( ( h16 ':' ){,1} h16 )? '::' ( h16 ':' ){3} ls32 )
| ( ( ( h16 ':' ){,2} h16 )? '::' ( h16 ':' ){2} ls32 )
| ( ( ( h16 ':' ){,3} h16 )? '::' ( h16 ':' ){1} ls32 )
| ( ( ( h16 ':' ){,4} h16 )? '::' ls32 )
| ( ( ( h16 ':' ){,5} h16 )? '::' h16 )
| ( ( ( h16 ':' ){,6} h16 )? '::' )
);
IPv_future = ( 'v' ( ( xdigit+ ) '.' ) ( unreserved | sub_delims | ':' )+ );
IP_literal = ( '[' ( IPv6_address | IPv_future ) ']' );
reg_name = ( ( unreserved | pct_encoded | sub_delims )* )
> { std::printf("mark reg_name, p:%s\n", p); }
% { std::printf("store reg_name, p:%s\n", p); };
port = ( digit* )
>mark_port %store_port;
host = ( IP_literal | IPv4_address | reg_name )
>mark_host %store_host;
userinfo = ( ( unreserved | pct_encoded | sub_delims | ':' )* )
>mark_userinfo %store_userinfo;
authority = ( ( userinfo '#' )? host ( ':' port )? )
>mark_authority %store_authority;
###############################################################################
# Path
###############################################################################
pchar = ( unreserved | pct_encoded | sub_delims | ':' | '#' );
segment = ( pchar* );
segment_nz = ( pchar+ );
# non-zero-length
segment_nz_nc = ( ( unreserved | pct_encoded | sub_delims | '#' )+ );
# non-zero-length segment without any colon ':'
path_abempty = ( ( '/' segment )* )
>mark_path %store_path;
path_absolute = ( '/' ( segment_nz ( '/' segment )* )? )
>mark_path %store_path;
path_noscheme = ( segment_nz_nc ( '/' segment )* )
>mark_path %store_path;
path_rootless = ( segment_nz ( '/' segment )* )
>mark_path %store_path;
path_empty = ( zlen )
>mark_path %store_path;
path = ( path_abempty # begins with '/' or is empty
| path_absolute # begins with '/' but not '//'
| path_noscheme # begins with a non-colon segment
| path_rootless # begins with a segment
| path_empty # zero characters
);
###############################################################################
# Query
###############################################################################
query = ( ( pchar | '/' | '?' )* )
>mark_query %store_query;
###############################################################################
# Fragment
###############################################################################
fragment = ( ( pchar | '/' | '?' )* )
>mark_fragment %store_fragment;
###############################################################################
# URI
###############################################################################
hier_part = ( ( '//' authority path_abempty )
| ( path_absolute )
| ( path_rootless )
| ( path_empty )
);
relative_part = ( ( '//' authority path_abempty )
| ( path_absolute )
| ( path_noscheme )
| ( path_empty )
);
absolute_URI = ( scheme ':' hier_part ( '?' query )? );
relative_ref = ( relative_part ( '?' query )? ( '#' fragment )? );
URI = ( scheme ':' hier_part ( '?' query )? ( '#' fragment )? );
URI_reference = ( URI | relative_ref );
###############################################################################
# main rule
###############################################################################
main := URI #done;
}%%
%% write data;
struct slice {
size_t len{};
const char* data{};
};
struct http_parser {
http_parser() = default;
~http_parser() = default;
void reset();
void execute();
int state = 0;
std::string uri;
/* parsed result */
slice scheme{};
slice authority{};
slice userinfo{};
slice host{};
slice port{};
slice path{};
slice query{};
slice fragment{};
/* parse status */
bool _eof{};
bool _done{};
bool _failed{};
};
void http_parser::reset() {
int cs = 0;
%% write init;
this->state = cs;
this->_eof = false;
this->_done = false;
this->_failed = false;
this->scheme = slice{};
this->authority = slice{};
this->userinfo = slice{};
this->host = slice{};
this->port = slice{};
this->path = slice{};
this->query = slice{};
this->fragment = slice{};
}
void http_parser::execute() {
const char* p = &this->uri.front();
const char* pe = &this->uri.back() + 1;
const char* eof = pe;
int cs = this->state;
%% write exec;
if (!this->_failed) {
this->state = cs;
}
std::printf(
"eof:%d, done:%d, failed:%d, state:%d, p:%p, pe:%p, diff:%ld, rest:%s\n",
this->_eof, this->_done, this->_failed, this->state, p, pe, pe - p, p);
#define print_parser_component(fld) \
if (this->fld.len) { \
std::printf(#fld ": %.*s\n", (int)this->fld.len, this->fld.data); \
}
print_parser_component(scheme);
print_parser_component(authority);
print_parser_component(userinfo);
print_parser_component(host);
print_parser_component(port);
print_parser_component(path);
print_parser_component(query);
print_parser_component(fragment);
#undef print_parser_component
}
Here I set the main rule to URL rather than URI_reference in order to test absolute URL first.
And here is the test code:
int main(int argc, char** argv) {
auto parser = std::make_unique<http_parser>();
parser->uri =
"https://chenjianyong.com/blog/2022/01/"
"seastar_fpc_1.html?hello=world#preface";
parser->reset();
parser->execute();
return 0;
}
Run the program and it prints:
mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
parser done, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
eof:0, done:1, failed:0, state:171, p:0x6000016f4006, pe:0x6000016f404c, diff:70, rest://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
scheme: https
It seems that the parser stops after parsing the scheme https://, its so weird! Why doesn't it greedily consume to the last byte?
After changing the main rule to main := (crlf #done);, append a crlf to the test URL, regenerate the parser and this time the parser can consume to the end and the print shows that all URL components are parsed successfully:
mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark path, p://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark authority, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark userinfo, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark host, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark reg_name, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store reg_name, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store host, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store authority, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark path, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store path, p:?hello=world#preface
mark query, p:hello=world#preface
store query, p:#preface
mark fragment, p:preface
store fragment, p:
parser done, p:
eof:0, done:1, failed:0, state:188, p:0x60000140c04e, pe:0x60000140c04e, diff:0, rest:
scheme: https
authority: chenjianyong.com
host: chenjianyong.com
path: /blog/2022/01/seastar_fpc_1.html
query: hello=world
fragment: preface
So, why isn't my ragel parser greedy?
Why doesn't it greedily consume to the last byte?
As you have already noticed, your done action is executed right after the colon (with fbreak inside) and we can confirm it by rendering your (quite big one!) FSM (ragel -o f.dot -Vp source.c++ && dot -Tpng -o f.png f.dot):
It's executed because main specification says #done which, according to the documentation
embeds an action into any transitions that move the machine
into a final state
And as you can see, 171 is one of the possible final states of your machine. If you're to change main specification to URI %done it's gonna be a bit different:
done is no longer executed, it's going to be only after
the transitions that go out of a machine via a final state
Which I think is a bit more appropriate for your case.
Now you probably wonder why 171 is one of the final states and that's because hier_part among other things can be path_empty which is a zlen, so it's OK for machine to finish in this state (for scheme: input) -> it's one of the final states -> #done is executed when transitioning to this state.
I have a json file as this,
# cat input.json
{
"foo":[
"key1",
"key2"
],
"bar": "key3"
}
And I want to join all those strings ("key1", "key2", key3") into one string, and I define the filter as this,
# cat filter.jq
[.[] | . as { foo: $names, bar: $name} | {
name1: [ $names | range(0;length) as $i | {
key1: ($names[$i])
}],
name2: {
key2: $name
}
} | {
values: (.name1 | map(.key1) | join(" ") + .name2.key2)
}]
But this doesn't work,
# <input.json jq --slurp --from-file filter.jq > output.json
jq: error (at <stdin>:8): Cannot index array with string "name2"
What's wrong going here?
btw. concat with a string seems works.
values: (.name1 | map(.key1) | join(" ") + "key3")
The problem is underspecified, but you might wish to consider:
[.. | strings] | join(" ")
or similar.
Here my documents:
[
{
"id":"f3b8c257-9950-45e7-9e79-ace19ec8905e",
"identifier":[
{
"system":{
"value":"urn:oid:2.16.724.4.9.10.2"
},
"value":{
"value":"10839812"
}
}
]
},
{
"id":"f0a1e3ae-826f-4a03-b29e-10ef3bc86ea0",
"identifier":null
}
]
Currently, I'm aplying this jq filter:
map(
{
id,
dni: .identifier[] | (select(.system.value == "urn:oid:1.3.6.1.4.1.19126.3") | .value.value)
}
)
However, I'm getting this message:
jq: error (at practitioner-mongoexport.json:146715): Cannot iterate over null (null)
As you can guess, problem arises processing second object where .identifier: null.
I've tried with that:
map(
{
id,
dni: select(.identifier) | .identifier[] | (select(.system.value == "urn:oid:1.3.6.1.4.1.19126.3") | .value.value)
}
)
I've also tried with:
map(
select(.identifier) |
{
id,
dni: .identifier[] | (select(.system.value == "urn:oid:1.3.6.1.4.1.19126.3") | .value.value)
}
)
Then result is [].
I don't quite figure out what I'm doing wrong.
My desired output would be:
[
{
"id":"f3b8c257-9950-45e7-9e79-ace19ec8905e",
"dni": "10839812"
}
]
Any ideas?
You need to deal with the case if .identifier is null. Here, the ? operator may help.
without selection
jq 'map({id, dni: (.identifier[]?).value.value})'
Demo
with selection
jq 'map({id, dni: (.identifier[]? | select(.system.value == "urn:oid:2.16.724.4.9.10.2")).value.value})'
Demo
[
{
"id": "f3b8c257-9950-45e7-9e79-ace19ec8905e",
"dni": "10839812"
}
]
map(
select(.identifier?[]?.system.value == "urn:oid:2.16.724.4.9.10.2")
| { id, "dni": .identifier[].value.value }
)
Will generate:
[
{
"id": "f3b8c257-9950-45e7-9e79-ace19ec8905e",
"dni": "10839812"
}
]
The trick here is .identifier?[]?.system.value where the ? will ignore .identifier if it's null
JqPlay Demo
I have an json array logged as
[
{
"Key": "key0",
"Value": 0
},
{
"Key": "key1",
"Value": 2
}
]
How do I get Value for Key with value key0, so 0.
I have been using this kluge.
...
| extend jsonarray = parse_json(...)
| extend value = toint(case(
jsonarray[0].Key == 'key0', jsonarray[0].Value,
jsonarray[1].Key == 'key0', jsonarray[1].Value,
"<out-of-range>"))
Update:
Using mv-apply:
| extend jsonarray = parse_json(...)
| mv-apply jsonarray on (
where jsonarray.Key == 'key0'
| project value = toint(jsonarray.Value)
)
you could use mv-expand or mv-apply:
print d = dynamic([
{
"Key": "key0",
"Value": 0
},
{
"Key": "key1",
"Value": 2
}
])
| mv-apply d on (
where d.Key == "key0"
| project d.Value
)
https://learn.microsoft.com/en-us/azure/kusto/query/mvexpandoperator
https://learn.microsoft.com/en-us/azure/kusto/query/mv-applyoperator
I have some code like:
declare #Condition nvarchar(50), --value 'and user_name in ('deep')
select * from table where id = 10
After 10 I want to put #Condition value. How to assign and put it?
Use SET #local_variable
SET { { #local_variable = expression }
| { #cursor_variable = { #cursor_variable | cursor_name
| { CURSOR [ FORWARD_ONLY | SCROLL ]
[ STATIC | KEYSET | DYNAMIC | FAST_FORWARD ]
[ READ_ONLY | SCROLL_LOCKS | OPTIMISTIC ]
[ TYPE_WARNING ]
FOR select_statement
[ FOR { READ ONLY | UPDATE [ OF column_name [ ,...n ] ] }
]
}
} }
}
To use a string that contains SQL as part of a query you need the EXECUTE statement.
DECLARE #Condition NVARCHAR(50) = 'AND user_name IN (''deep'')';
EXECUTE( 'SELECT * FROM table WHERE id = 10 ' + #Condition );