jq: filter nested array objects - jq

Here are my documents:
[
{
"id": "3e67b455-8cdb-4bc0-a5e1-f90253870fc9",
"identifier": [
{
"system": {
"value": "urn:oid:2.16.724.4.9.20.91-INVENTAT"
},
"value": {
"value": "04374"
}
},
{
"system": {
"value": "urn:oid:2.16.724.4.9.20.2-INVENTAT"
},
"value": {
"value": "INFP3"
}
},
{
"system": {
"value": "urn:oid:INVENTAT"
},
"value": {
"value": "CBOU035"
}
}
]
},
{
"id": "0f22e5ff-70bc-457f-bdaf-7afe86d478de",
"identifier": [
{
"system": {
"value": "urn:oid:2.16.724.4.9.20.91-INVENTAT"
},
"value": {
"value": "04376"
}
},
{
"system": {
"value": "urn:oid:2.16.724.4.9.20.2-INVENTAT"
},
"value": {
"value": "INF07"
}
},
{
"system": {
"value": "urn:oid:INVENTAT"
},
"value": {
"value": "S527918"
}
}
]
},
{
"id": "a1ea574c-438b-443c-ad87-d31d09d581f0",
"identifier": [
{
"system": {
"value": "urn:oid:2.16.724.4.9.20.91-INVENTAT"
},
"value": {
"value": "08096"
}
},
{
"system": {
"value": "urn:oid:2.16.724.4.9.20.2-INVENTAT"
},
"value": {
"value": "INF04"
}
},
{
"system": {
"value": "urn:oid:INVENTAT"
},
"value": {
"value": "5635132"
}
}
]
}
]
I need to filter .identifier where system.value="urn:oid:2.16.724.4.9.20.91-INVENTAT" or system.value="urn:oid:2.16.724.4.9.20.2-INVENTAT" and pick .value.value.
Desired output:
[
{
"id": "3e67b455-8cdb-4bc0-a5e1-f90253870fc9",
"oid1": "04374",
"oid2": "INFP3"
},
{
"id": "0f22e5ff-70bc-457f-bdaf-7afe86d478de",
"oid1": "04376",
"oid2": "INF07"
},
{
"id": "a1ea574c-438b-443c-ad87-d31d09d581f0",
"oid1": "08096",
"oid2": "INF04"
}
]
I've tried:
map(
{
id,
oid1: select(.identifier?[]?.system.value == "urn:oid:2.16.724.4.9.20.91-INVENTAT") | .identifier[].value.value,
oid2: select(.identifier?[]?.system.value == "urn:oid:2.16.724.4.9.20.2-INVENTAT") | .identifier[].value.value
}
)
But output is not what I need: you can find it on this jqplay.
Any ideas?

This uses IN to check for your query strings, and with_entries on an array to generate the indices for the oid keys.
jq '
# For each document: keep id, select the identifiers whose .system.value is
# one of the two listed strings, and collect their .value.value into an array.
# with_entries then turns that array into an object; array keys are 0-based
# indices, so adding 1 yields the keys oid1, oid2, ...
map({id} + (.identifier | map(select(IN(.system.value;
"urn:oid:2.16.724.4.9.20.91-INVENTAT",
"urn:oid:2.16.724.4.9.20.2-INVENTAT"
)).value.value) | with_entries(.key |= "oid\(. + 1)")))
'
[
{
"id": "3e67b455-8cdb-4bc0-a5e1-f90253870fc9",
"oid1": "04374",
"oid2": "INFP3"
},
{
"id": "0f22e5ff-70bc-457f-bdaf-7afe86d478de",
"oid1": "04376",
"oid2": "INF07"
},
{
"id": "a1ea574c-438b-443c-ad87-d31d09d581f0",
"oid1": "08096",
"oid2": "INF04"
}
]
Demo

Here is a Ruby script to do that:
ruby -r json -e '
# Build one flat hash per document: the document id plus one entry for each
# identifier whose system value is listed in filt.
#
# x    - array of document hashes, each with "id" and "identifier" keys
# filt - collection of system values to keep (must respond to include?)
#
# Returns an array of hashes. Kept identifiers are numbered from 1 in their
# original order; odd positions are stored under "ub", even ones under "uab",
# and each value is converted to a string.
def walk(x, filt)
  labels = ["uab", "ub"]
  x.map do |doc|
    kept = doc["identifier"].select { |ident| filt.include?(ident["system"]["value"]) }
    pairs = kept.each_with_index.map do |ident, idx|
      # idx is 0-based; shift by one so the first kept identifier maps to labels[1]
      [labels[(idx + 1) % 2], ident["value"]["value"].to_s]
    end
    {"id" => doc["id"]}.merge(pairs.to_h)
  end
end
data=JSON.parse($<.read)
puts walk(data, ["urn:oid:2.16.724.4.9.20.91-INVENTAT", "urn:oid:2.16.724.4.9.20.2-INVENTAT"]).to_json
' file
Prints:
[{"id":"3e67b455-8cdb-4bc0-a5e1-f90253870fc9","ub":"04374","uab":"INFP3"},{"id":"0f22e5ff-70bc-457f-bdaf-7afe86d478de","ub":"04376","uab":"INF07"},{"id":"a1ea574c-438b-443c-ad87-d31d09d581f0","ub":"08096","uab":"INF04"}]

Related

How to project values from a Gremlin traversal with nested and()/or() steps

I have the graph model below which represents the sub-pattern I'd like to traverse or fetch. The nodes and their properties are shown below as well.
The expected response to my query would look something like this:
where 's', 'c', 'aid', 'qid', 'p', 'r1', 'r2' are the nodes that make up the subpattern or subgraph.
[
{
"s": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "severity",
"type": "vertex",
"properties": {
"severity": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "High"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
},
"c": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "cve",
"type": "vertex",
"properties": {
"cve_id": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "CVE-xxxx-xxxx"
}
],
"publishedOn": [
{
"id": "fc5dde4d-c027-4c19-9b16-b3314b2b10c6",
"value": "xxx"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
},
"aid": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "aid",
"type": "vertex",
"properties": {
"aid": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "xxxx-xxxx"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
},
"qid": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "qid",
"type": "vertex",
"properties": {
"qid": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "xxxx-xxxx"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
},
"p": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "package",
"type": "vertex",
"properties": {
"name": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "xxxxx"
}
],
"version": [
{
"id": "fc5dde4d-c027-4c19-9b16-b3314b2b10c6",
"value": "xxx"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
},
"r1": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "release",
"type": "vertex",
"properties": {
"source": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "xxxx-xxxx"
}
],
"status": [
{
"id": "fc5dde4d-c027-4c19-9b16-b3314b2b10c6",
"value": "xxx"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
},
"r2": {
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4",
"label": "release",
"type": "vertex",
"properties": {
"source": [
{
"id": "a6a9e38f-0802-48b6-ac37-490f45e824e9",
"value": "xxxx-xxxx"
}
],
"status": [
{
"id": "fc5dde4d-c027-4c19-9b16-b3314b2b10c6",
"value": "xxx"
}
],
"pk": [
{
"id": "345fbdad-9c67-47bb-9f3b-cf50c8cdbee4|pk",
"value": "pk"
}
]
}
}
},
{
....
....
},
{
....
..
}
]
My question is how do I build my traversal query to achieve this end result?
What I have so far is this, but the project() step is not working as expected
g.V().hasLabel('cve').as('c').and(
__.in('severity').as('s'),
__.out('cve_to_aid').as('aid').and(
__.out('has_qid').as('qid'),
__.in('package_to_aid').as('p'),
or(
__.in('r1_to_aid').has('status', 'Patched').as('r1'),
__.in('r2_to_aid').has('status', 'Patched').as('r2')
)
)
).project('c', 's', 'aid', 'qid', 'p', 'r1', 'r2').
by(('c').values('cve_id')).
by(('s').values('severity')).
by(('aid').values('aid')).
by(('qid').values('qid')).
by(('p').values()).
by(('r1').values()).
by(('r2').values()).
I am doing this on CosmosDB, so please only provide answers using supported steps found here: https://learn.microsoft.com/en-us/azure/cosmos-db/gremlin/support
It is possible to nest project() steps, e.g. on the TinkerGraph:
gremlin> g = TinkerFactory.createModern().traversal()
==>graphtraversalsource[tinkergraph[vertices:6 edges:6], standard]
gremlin> g.V(1).as('x').project('x').by(
select('x').project('id', 'label','properties').by(id).by(label).by(
project('name').by(properties())
)
)
==>[x:[id:1,label:person,properties:[name:vp[name->marko]]]]
gremlin>
but then you end up coding your entire data model into your query.
In full TinkerPop you could turn your result into a subGraph() and write it to graphSon with the io() step. In Cosmos you can add the returned vertices to a TinkerGraph instance clientside and again use the io() step to serialize the TinkerGraph to graphSon.

Merge all objects inside an array that share the same key

I'm trying to deduplicate all objects inside the array results that share the same key id, and merge their path arrays.
JSON input:
[
{
"type": "apple",
"results": [
{
"id": "apple1",
"name": "appleName1",
"path": "/some/path/a"
},
{
"id": "apple1",
"name": "appleName1",
"path": "/some/path/b"
},
{
"id": "apple2",
"name": "appleName2",
"path": "/some/path/c"
}
]
},
{
"type": "orange",
"results": [
{
"id": "orange1",
"name": "orangeName1",
"path": "/some/path/a"
},
{
"id": "orange1",
"name": "orangeName1",
"path": "/some/path/b"
},
{
"id": "orange2",
"name": "orangeName2",
"path": "/some/path/c"
}
]
}
]
Expected output:
[
{
"type": "apple",
"results": [
{
"id": "apple1",
"name": "appleName1",
"path": [
"/some/path/a",
"/some/path/b"
]
},
{
"id": "apple2",
"name": "appleName2",
"path": [
"/some/path/c"
]
}
]
},
{
"type": "orange",
"results": [
{
"id": "orange1",
"name": "orangeName1",
"path": [
"/some/path/a",
"/some/path/b"
]
},
{
"id": "orange2",
"name": "orangeName2",
"path": [
"/some/path/c"
]
}
]
}
]
I've managed to get an approximate solution using:
jq '[{type: .[].type, results: .[].results | group_by(.id) | map({id: .[0].id, name: .[0].name, path: (map(.path))})}]'
But my solution produces two additional elements that aren't supposed to be there.
I know there are some similar questions already answered but I didn't manage to get them to work with this example. Any help is appreciated!
You could group_by the .id field, then for each group take the first item and replace its .path field with a map on the .path fields of all group members:
jq 'map(.results |= (group_by(.id) | map(first + {path: map(.path)})))'
[
{
"type": "apple",
"results": [
{
"id": "apple1",
"name": "appleName1",
"path": [
"/some/path/a",
"/some/path/b"
]
},
{
"id": "apple2",
"name": "appleName2",
"path": [
"/some/path/c"
]
}
]
},
{
"type": "orange",
"results": [
{
"id": "orange1",
"name": "orangeName1",
"path": [
"/some/path/a",
"/some/path/b"
]
},
{
"id": "orange2",
"name": "orangeName2",
"path": [
"/some/path/c"
]
}
]
}
]
Demo

Get the value after group by in gremlin?

g.V('JobDefinition1').out("JobDefinitionToJobHistory").has("Timestamp", between("2022-02-01T00:00:00Z", "2022-02-03T00:00:00Z")).group().by("ttl").by(limit(1))
I ran the Gremlin query above and got the result below.
[
{
"776": [
{
"id": "JobHistory-2-1-2022 12:19:15 AM",
"label": "JobHistory",
"type": "vertex",
"properties": {
"Timestamp": [
{
"id": "6d187ccf-160d-4d87-a360-48526b7a1461",
"value": "2022-02-01T00:00:00Z"
}
],
"ttl": [
{
"id": "JobHistory-2-1-2022 12:19:15 AM|ttl",
"value": "776"
}
]
}
}
],
"888": [
{
"id": "JobHistory-2-1-2022 12:19:15 AM",
"label": "JobHistory",
"type": "vertex",
"properties": {
"Timestamp": [
{
"id": "6d187ccf-160d-4d87-a360-48526b7a1461",
"value": "2022-02-01T00:00:00Z"
}
],
"ttl": [
{
"id": "JobHistory-2-1-2022 12:19:15 AM|ttl",
"value": "888"
}
]
}
}
]
}
]
But I only want the values of the result after the group by; the expected result is shown below. I want the group-by result values without the keys (as you can see, the expected result doesn't have key info such as "776" and "888"). Is there any Gremlin method that can help me achieve this? I hope you can give me some help. Thanks!
[
{
"id": "JobHistory-2-1-2022 12:19:15 AM",
"label": "JobHistory",
"type": "vertex",
"properties": {
"Timestamp": [
{
"id": "6d187ccf-160d-4d87-a360-48526b7a1461",
"value": "2022-02-01T00:00:00Z"
}
],
"ttl": [
{
"id": "JobHistory-2-1-2022 12:19:15 AM|ttl",
"value": "776"
}
]
}
}
,
{
"id": "JobHistory-2-1-2022 12:19:15 AM",
"label": "JobHistory",
"type": "vertex",
"properties": {
"Timestamp": [
{
"id": "6d187ccf-160d-4d87-a360-48526b7a1461",
"value": "2022-02-01T00:00:00Z"
}
],
"ttl": [
{
"id": "JobHistory-2-1-2022 12:19:15 AM|ttl",
"value": "888"
}
]
}
}
]
You can get values from a Map with select(values):
gremlin> g.V().groupCount().by(label)
==>[software:2,person:4]
gremlin> g.V().groupCount().by(label).select(values)
==>[2,4]

MongoDB - Document Structure to create matrix from multiple value pairs

I am new to NoSQL and MongoDB, so please don't bash. I have used SQL databases in the past, but am now looking to leverage the scalability of NoSQL. One application that comes to mind is the collection of experimental results, where they are serialized in some manner with a start date, end date, part number, serial number, etc. Along with each experiment, there are many "measurements" collected, but the list of measurements may be unique in each experiment.
I am looking for ideas on how to structure the document to achieve the following tasks:
1) Query based on date ranges, part numbers, serial numbers
2) See resulting table in a "spreadsheet" table
3) Perform statistical calculations, perhaps with R, on the different "measurements"
An example might look like:
[
{
"_id": {
"$oid": "5e680d6063cb144f9d1be261"
},
"StartDate": {
"$date": {
"$numberLong": "1583841600000"
}
},
"EndDate": {
"$date": {
"$numberLong": "1583842007000"
}
},
"PartNumber": "1Z45NP7X",
"SerialNumber": "U84A3102",
"Status": "Acceptable",
"Results": [
{
"Sensor": "Pressure",
"Value": "14.68453",
"Units": "PSIA",
"Flag": "1"
},
{
"Sensor": "Temperature",
"Value": {
"$numberDouble": "68.43"
},
"Units": "DegF",
"Flag": {
"$numberInt": "1"
}
},
{
"Sensor": "Velocity",
"Value": {
"$numberDouble": "12.4"
},
"Units": "ft/s",
"Flag": {
"$numberInt": "1"
}
}
]
},
{
"_id": {
"$oid": "5e68114763cb144f9d1be263"
},
"StartDate": {
"$date": {
"$numberLong": "1583842033000"
}
},
"EndDate": {
"$date": {
"$numberLong": "1583842434000"
}
},
"PartNumber": "1Z45NP7X",
"SerialNumber": "U84A3103",
"Status": "Acceptable",
"Results": [
{
"Sensor": "Pressure",
"Value": "14.70153",
"Units": "PSIA",
"Flag": "1"
},
{
"Sensor": "Temperature",
"Value": {
"$numberDouble": "68.55"
},
"Units": "DegF",
"Flag": {
"$numberInt": "1"
}
},
{
"Sensor": "Velocity",
"Value": {
"$numberDouble": "12.7"
},
"Units": "ft/s",
"Flag": {
"$numberInt": "1"
}
}
]
},
{
"_id": {
"$oid": "5e68115f63cb144f9d1be264"
},
"StartDate": {
"$date": {
"$numberLong": "1583842464000"
}
},
"EndDate": {
"$date": {
"$numberLong": "1583842434000"
}
},
"PartNumber": "1Z45NP7X",
"SerialNumber": "U84A3104",
"Status": "Acceptable",
"Results": [
{
"Sensor": "Pressure",
"Value": "14.59243",
"Units": "PSIA",
"Flag": "1"
},
{
"Sensor": "Weight",
"Value": {
"$numberDouble": "67.93"
},
"Units": "lbf",
"Flag": {
"$numberInt": "1"
}
},
{
"Sensor": "Torque",
"Value": {
"$numberDouble": "122.33"
},
"Units": "ft-lbf",
"Flag": {
"$numberInt": "1"
}
}
]
}
]
Another approach might be:
[
{
"_id": {
"$oid": "5e680d6063cb144f9d1be261"
},
"StartDate": {
"$date": {
"$numberLong": "1583841600000"
}
},
"EndDate": {
"$date": {
"$numberLong": "1583842007000"
}
},
"PartNumber": "1Z45NP7X",
"SerialNumber": "U84A3102",
"Status": "Acceptable",
"Pressure (PSIA)" : "14.68453",
"Pressure - Flag": "1",
"Temperature (degF)": "68.43",
"Temperature - Flag": "1",
"Velocity (ft/s)": "12.4",
"Velocity Flag": "1"
},
{
"_id": {
"$oid": "5e68114763cb144f9d1be263"
},
"StartDate": {
"$date": {
"$numberLong": "1583842033000"
}
},
"EndDate": {
"$date": {
"$numberLong": "1583842434000"
}
},
"PartNumber": "1Z45NP7X",
"SerialNumber": "U84A3103",
"Status": "Acceptable",
"Pressure (PSIA)" : "14.70153",
"Pressure - Flag": "1",
"Temperature (degF)": "68.55",
"Temperature - Flag": "1",
"Velocity (ft/s)": "12.7",
"Velocity Flag": "1"
},
{
"_id": {
"$oid": "5e68115f63cb144f9d1be264"
},
"StartDate": {
"$date": {
"$numberLong": "1583842464000"
}
},
"EndDate": {
"$date": {
"$numberLong": "1583842434000"
}
},
"PartNumber": "1Z45NP7X",
"SerialNumber": "U84A3104",
"Status": "Acceptable",
"Pressure (PSIA)" : "14.59243",
"Pressure - Flag": "1",
"Weight (lbf)": "67.93",
"Weight - Flag": "1",
"Torque (ft-lbf)": "122.33",
"Torque - Flag": "1"
}
]
An example table might look like (probably with correct spacing):
StartDate EndDate PartNumber SerialNumber Pressure 'Pressure - Flag' Temperature 'Temperature - Flag' Velocity 'Velocity - Flag' Torque 'Torque - Flag' Weight 'Weight - Flag'
2020-03-10T12:00:00Z 2020-03-10T12:06:47Z 1Z45NP7X U84A3102 14.68453 1 68.43 1 12.4 1 N/A N/A N/A
N/A
2020-03-10T12:07:13Z 2020-03-10T12:13:54Z 1Z45NP7X U84A3103 14.70153 1 68.55 1 12.7 1 N/A N/A N/A
N/A
2020-03-10T12:07:13Z 2020-03-10T12:13:54Z 1Z45NP7X U84A3104 14.59243 1 N/A N/A N/A N/A 67.93 1 122.33
1
Any thoughts on the best structure? In reality, there might be 200+ "sensor values".
Thanks,
DG

Different results between date histogram and date range on Elastic Search

I would like to analyse my log data with Elasticsearch/Kibana and count unique customers by month.
Results are different when I use a date histogram aggregation and date range aggregation.
Here is the date histogram query :
"query": {
"query_string": {
"query": "_type:logs AND created_at:[2015-04-01 TO now]",
"analyze_wildcard": true
}
},
"size": 0,
"aggs": {
"2": {
"date_histogram": {
"field": "created_at",
"interval": "1M",
"min_doc_count": 1
},
"aggs": {
"1": {
"cardinality": {
"field": "customer.id"
}
}
}
}
}
And results :
"aggregations": {
"2": {
"buckets": [
{
"1": {
"value": 595805
},
"key_as_string": "2015-04-01T00:00:00.000Z",
"key": 1427839200000,
"doc_count": 6410438
},
{
"1": {
"value": 647788
},
"key_as_string": "2015-05-01T00:00:00.000Z",
"key": 1430431200000,
"doc_count": 6669555
},...
Here is the date range query :
"query": {
"query_string": {
"query": "_type:logs AND created_at:[2015-04-01 TO now]",
"analyze_wildcard": true
}
},
"size": 0,
"aggs": {
"2": {
"date_range": {
"field": "created_at",
"ranges": [
{
"from": "2015-04-01",
"to": "2015-05-01"
},
{
"from": "2015-05-01",
"to": "2015-06-01"
}
]
},
"aggs": {
"1": {
"cardinality": {
"field": "customer.id"
}
}
}
}
}
And the response :
"aggregations": {
"2": {
"buckets": [
{
"1": {
"value": 592179
},
"key": "2015-04-01T00:00:00.000Z-2015-05-01T00:00:00.000Z",
"from": 1427846400000,
"from_as_string": "2015-04-01T00:00:00.000Z",
"to": 1430438400000,
"to_as_string": "2015-05-01T00:00:00.000Z",
"doc_count": 6411884
},
{
"1": {
"value": 616995
},
"key": "2015-05-01T00:00:00.000Z-2015-06-01T00:00:00.000Z",
"from": 1430438400000,
"from_as_string": "2015-05-01T00:00:00.000Z",
"to": 1433116800000,
"to_as_string": "2015-06-01T00:00:00.000Z",
"doc_count": 6668060
}
]
}
}
In the first case, I have 595,805 for April and 647,788 for May
In the second case, I have 592,179 for April and 616,995 for May
Could someone explain why I get these differences between the two cases?
Thank you
I update my first post to add another example
I add another example with fewer data (on 1 day) but with the same issue. Here is the first request with date histogram :
{
"size": 0,
"query": {
"query_string": {
"query": "_type:logs AND logs.created_at:[2015-04-01 TO 2015-04-01]",
"analyze_wildcard": true
}
},
"aggs": {
"2": {
"date_histogram": {
"field": "created_at",
"interval": "1h",
"pre_zone": "00:00",
"pre_zone_adjust_large_interval": true,
"min_doc_count": 1
},
"aggs": {
"1": {
"cardinality": {
"field": "customer.id"
}
}
}
}
}
}
And we can see 660 unique count with 1717 doc count for the first hour :
{
"hits":{
"total":203961,
"max_score":0,
"hits":[
]
},
"aggregations":{
"2":{
"buckets":[
{
"1":{
"value":660
},
"key_as_string":"2015-04-01T00:00:00.000Z",
"key":1427846400000,
"doc_count":1717
},
{
"1":{
"value":324
},
"key_as_string":"2015-04-01T01:00:00.000Z",
"key":1427850000000,
"doc_count":776
},
{
"1":{
"value":190
},
"key_as_string":"2015-04-01T02:00:00.000Z",
"key":1427853600000,
"doc_count":481
}
]
}
}
}
But on the second request with the date range :
{
"size": 0,
"query": {
"query_string": {
"query": "_type:logs AND logs.created_at:[2015-04-01 TO 2015-04-01]",
"analyze_wildcard": true
}
},
"aggs": {
"2": {
"date_range": {
"field": "created_at",
"ranges": [
{
"from": "2015-04-01T00:00:00",
"to": "2015-04-01T01:00:00"
},
{
"from": "2015-04-01T01:00:00",
"to": "2015-04-01T02:00:00"
}
]
},
"aggs": {
"1": {
"cardinality": {
"field": "customer.id"
}
}
}
}
}
}
We can see only 633 unique count with 1717 doc count :
{
"hits":{
"total":203961,
"max_score":0,
"hits":[
]
},
"aggregations":{
"2":{
"buckets":[
{
"1":{
"value":633
},
"key":"2015-04-01T00:00:00.000Z-2015-04-01T01:00:00.000Z",
"from":1427846400000,
"from_as_string":"2015-04-01T00:00:00.000Z",
"to":1427850000000,
"to_as_string":"2015-04-01T01:00:00.000Z",
"doc_count":1717
},
{
"1":{
"value":328
},
"key":"2015-04-01T01:00:00.000Z-2015-04-01T02:00:00.000Z",
"from":1427850000000,
"from_as_string":"2015-04-01T01:00:00.000Z",
"to":1427853600000,
"to_as_string":"2015-04-01T02:00:00.000Z",
"doc_count":776
}
]
}
}
}
Could someone please tell me why? Thank you
When using the date_histogram aggregation you need to take into account the timezone, which date_range doesn't as it's always using the GMT timezone.
If you look at the long millisecond values in your results, you'll see the following:
For your date histogram, key: 1427839200000 is actually equal to 2015-03-31T22:00:00.000Z, which differs from the key_as_string value (i.e. 2015-04-01T00:00:00.000Z) that is formatted according to the GMT timezone.
In your first aggregation, try explicitly specifying the time_zone parameter to be your current timezone (apparently GMT+2) and you should get the same results:
"date_histogram": {
"field": "created_at",
"interval": "1M",
"min_doc_count": 1,
"time_zone": -2
},

Resources