Ok, I need someone to explain this to me
I have this index: def_type
:
CREATE INDEX `def_type` ON `data`(`type`)
… and then I have docs. with this structure:
{
"areakey": "1",
"ispublic": true,
"name": "Kattegat Nord",
"type": "SeaLocalArea",
"key": "1",
"points": [
{
"lon": 10.1735065284,
"lat": 56.7035020083
},
{
"lon": 10.366207197,
"lat": 57.5101324097
},
{
"lon": 10.3695737543,
"lat": 57.5999336965
},
:
:
{
"lon": 12.0185944208,
"lat": 57.3016435524
}
]
}
Now, if I just write a query to find all of the points in total (over a handful of “types”):
SELECT sum(ARRAY_COUNT(points)) AS size FROM data WHERE type IN ["Lake","Stream","CoastLocalArea","SeaLocalArea","PutTakeLake"] and points is valued
I get this result:
[
{
"size": 42223
}
]
All is good - except this query takes 1.5 second. The Explain
looks like this:
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan2",
"index": "def_type",
"index_id": "aac6039ac62cc592",
"index_projection": {
"primary_key": true
},
"keyspace": "data",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"CoastLocalArea\"",
"inclusion": 3,
"low": "\"CoastLocalArea\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"Lake\"",
"inclusion": 3,
"low": "\"Lake\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"PutTakeLake\"",
"inclusion": 3,
"low": "\"PutTakeLake\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"SeaLocalArea\"",
"inclusion": 3,
"low": "\"SeaLocalArea\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"Stream\"",
"inclusion": 3,
"low": "\"Stream\""
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"keyspace": "data",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(((`data`.`type`) in [\"Lake\", \"Stream\", \"CoastLocalArea\", \"SeaLocalArea\", \"PutTakeLake\"]) and ((`data`.`points`) is valued))"
},
{
"#operator": "InitialGroup",
"aggregates": [
"sum(array_count((`data`.`points`)))"
],
"group_keys": []
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"sum(array_count((`data`.`points`)))"
],
"group_keys": []
},
{
"#operator": "FinalGroup",
"aggregates": [
"sum(array_count((`data`.`points`)))"
],
"group_keys": []
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"as": "size",
"expr": "sum(array_count((`data`.`points`)))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "SELECT sum(ARRAY_COUNT(points)) AS size FROM data WHERE type IN [\"Lake\",\"Stream\",\"CoastLocalArea\",\"SeaLocalArea\",\"PutTakeLake\"] and points is valued"
}
So I decided to create an index for this:
CREATE INDEX `def_points` ON `data`(`type`,`points`) where type IN ["Lake","Stream","CoastLocalArea","SeaLocalArea","PutTakeLake"] and points is valued
When I run the same query as above I get this:
[
{
"size": 40101
}
]
And the Explain
looks like this (which I think looks Ok):
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "DistinctScan",
"scan": {
"#operator": "IndexScan2",
"covers": [
"cover ((`data`.`type`))",
"cover ((`data`.`points`))",
"cover ((meta(`data`).`id`))"
],
"filter_covers": {
"cover (((`data`.`points`) is valued))": true,
"cover (((`data`.`type`) in [\"Lake\", \"Stream\", \"CoastLocalArea\", \"SeaLocalArea\", \"PutTakeLake\"]))": true
},
"index": "def_points",
"index_id": "e11843af240694d8",
"keyspace": "data",
"namespace": "default",
"spans": [
{
"range": [
{
"high": "\"CoastLocalArea\"",
"inclusion": 3,
"low": "\"CoastLocalArea\""
},
{
"inclusion": 0,
"low": "null"
}
]
},
{
"range": [
{
"high": "\"Lake\"",
"inclusion": 3,
"low": "\"Lake\""
},
{
"inclusion": 0,
"low": "null"
}
]
},
{
"range": [
{
"high": "\"PutTakeLake\"",
"inclusion": 3,
"low": "\"PutTakeLake\""
},
{
"inclusion": 0,
"low": "null"
}
]
},
{
"range": [
{
"high": "\"SeaLocalArea\"",
"inclusion": 3,
"low": "\"SeaLocalArea\""
},
{
"inclusion": 0,
"low": "null"
}
]
},
{
"range": [
{
"high": "\"Stream\"",
"inclusion": 3,
"low": "\"Stream\""
},
{
"inclusion": 0,
"low": "null"
}
]
}
],
"using": "gsi"
}
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(cover (((`data`.`type`) in [\"Lake\", \"Stream\", \"CoastLocalArea\", \"SeaLocalArea\", \"PutTakeLake\"])) and cover (((`data`.`points`) is valued)))"
},
{
"#operator": "InitialGroup",
"aggregates": [
"sum(array_count(cover ((`data`.`points`))))"
],
"group_keys": []
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"sum(array_count(cover ((`data`.`points`))))"
],
"group_keys": []
},
{
"#operator": "FinalGroup",
"aggregates": [
"sum(array_count(cover ((`data`.`points`))))"
],
"group_keys": []
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"as": "size",
"expr": "sum(array_count(cover ((`data`.`points`))))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "SELECT sum(ARRAY_COUNT(points)) AS size FROM data WHERE type IN [\"Lake\",\"Stream\",\"CoastLocalArea\",\"SeaLocalArea\",\"PutTakeLake\"] and points is valued"
}
Why does the query not return the same nuber?
If I use this query instead (without specifying that the points should be valued):
SELECT sum(ARRAY_COUNT(points)) AS size FROM data WHERE type IN ["Lake","Stream","CoastLocalArea","SeaLocalArea","PutTakeLake"]
Then it returns (as expected):
[
{
"size": 42223
}
]
But this is probably due to it using the def_type
index instead (and it takes 1.5-2 seconds):
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan2",
"index": "def_type",
"index_id": "aac6039ac62cc592",
"index_projection": {
"primary_key": true
},
"keyspace": "data",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"CoastLocalArea\"",
"inclusion": 3,
"low": "\"CoastLocalArea\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"Lake\"",
"inclusion": 3,
"low": "\"Lake\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"PutTakeLake\"",
"inclusion": 3,
"low": "\"PutTakeLake\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"SeaLocalArea\"",
"inclusion": 3,
"low": "\"SeaLocalArea\""
}
]
},
{
"exact": true,
"range": [
{
"high": "\"Stream\"",
"inclusion": 3,
"low": "\"Stream\""
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"keyspace": "data",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "((`data`.`type`) in [\"Lake\", \"Stream\", \"CoastLocalArea\", \"SeaLocalArea\", \"PutTakeLake\"])"
},
{
"#operator": "InitialGroup",
"aggregates": [
"sum(array_count((`data`.`points`)))"
],
"group_keys": []
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"sum(array_count((`data`.`points`)))"
],
"group_keys": []
},
{
"#operator": "FinalGroup",
"aggregates": [
"sum(array_count((`data`.`points`)))"
],
"group_keys": []
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"as": "size",
"expr": "sum(array_count((`data`.`points`)))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "SELECT sum(ARRAY_COUNT(points)) AS size FROM data WHERE type IN [\"Lake\",\"Stream\",\"CoastLocalArea\",\"SeaLocalArea\",\"PutTakeLake\"]"
}
Note:
I have narrowed it down to 3 docs. not being included in the query with a total number of points making up the difference. But I have absolutely no clue why? There is nothing “special” about theses three docs - they are just missing