Hi @vsr1, I eventually had to go with the first query that you suggested. I am using two queries here. One for Count, and the other for selecting specific list of attributes.
When I created a covering index for the select query, (which automatically covers count query as well) and executed it on two different machines with same Hardware configuration (16GB RAM and 4 Core CPU) but different versions of couchbase server, I found a huge difference in the query performance.
Machine 1 is using CB 5.0.1 and Machine-2 is using CB Analytics DP4. Machine 1 is executing this query in 10 seconds, and machine 2 is executing it within 1.5 seconds. the reporting bucket on both machine is allocated with 2 GB of RAM and and index service has 512 MB RAM allocated.
Also, the slower machine has 5 more buckets other than reporting (with 1 GB RAM each) and the machine which is showing better performance has only one bucket on which I am working.
I compared the explain plan for both machines as well and didn’t find any difference in it.
Index:
CREATE INDEX IX_reporting_Person ON reporting((distinct (array n for n in [lower(reservationNumber), lower(firstName), lower(lastName)] end)),verificationStatus,lastPendingDocumentDate,startDate,isInternational,isOcrDirty,endDate,firstName,middleName,lastName,personId,securityPhotoMediaItemId,cardNumber,isReviewLater,verifiedBy) WHERE (type = "Person")
Data query:
Select case When r.verificationStatus = 'PENDING' and (date_diff_str(clock_local() , r.lastPendingDocumentDate, 'day') >= 4 OR date_diff_str(embarkDate, r.lastPendingDocumentDate, 'day') <= 5) Then 'OVERDUE' ELSE r.verificationStatus end as status, r.startDate,r.isInternational,r.isOcrDirty,r.endDate,r.firstName, r.middleName, r.lastName, r.personId, r.securityPhotoMediaItemId,r.cardNumber,r.isReviewLater, r.verifiedBy From reporting r where r.type = 'Person' AND meta(r).id NOT LIKE "_sync%" AND (ANY x IN [LOWER(r.cardNumber),LOWER(r.firstName), LOWER(r.lastName)] SATISFIES x like LOWER("b%") END) order by meta(r).id limit 20 offset 0;
Time taken on both machines:
On Machine-1 greater than 10 s (with approx. 0.2 M records in the bucket)
On Machine-2 within 1.5 s (with 0.3 M documents in the bucket)
Explain Plan:
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "DistinctScan",
"scan": {
"#operator": "IndexScan2",
"covers": [
"cover ((distinct (array `n` for `n` in [lower((`r`.`cardNumber`)), lower((`r`.`firstName`)), lower((`r`.`lastName`))] end)))",
"cover ((`r`.`verificationStatus`))",
"cover ((`r`.`lastPendingDocumentDate`))",
"cover ((`r`.`startDate`))",
"cover ((`r`.`isInternational`))",
"cover ((`r`.`isOcrDirty`))",
"cover ((`r`.`endDate`))",
"cover ((`r`.`firstName`))",
"cover ((`r`.`middleName`))",
"cover ((`r`.`lastName`))",
"cover ((`r`.`personId`))",
"cover ((`r`.`securityPhotoMediaItemId`))",
"cover ((`r`.`cardNumber`))",
"cover ((`r`.`isReviewLater`))",
"cover ((`r`.`verifiedBy`))",
"cover ((meta(`r`).`id`))"
],
"filter_covers": {
"cover ((`r`.`type`))": "Person",
"cover (any `x` in [lower((`r`.`cardNumber`)), lower((`r`.`firstName`)), lower((`r`.`lastName`))] satisfies ((\"b\" <= `x`) and (`x` < \"c\")) end)": true,
"cover (any `x` in [lower((`r`.`cardNumber`)), lower((`r`.`firstName`)), lower((`r`.`lastName`))] satisfies (`x` like lower(\"b%\")) end)": true
},
"index": "IX_reporting_Person",
"index_id": "2b071909296788d4",
"index_projection": {
"entry_keys": [
1,
3,
4,
5,
6,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17
],
"primary_key": true
},
"keyspace": "reporting",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"c\"",
"inclusion": 1,
"low": "\"b\""
}
]
}
],
"using": "gsi"
}
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(((cover ((`r`.`type`)) = \"Person\") and (not (cover ((meta(`r`).`id`)) like \"_sync%\"))) and cover (any `x` in [lower((`r`.`cardNumber`)), lower((`r`.`firstName`)), lower((`r`.`lastName`))] satisfies (`x` like lower(\"b%\")) end))"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"as": "status",
"expr": "case when ((cover ((`r`.`verificationStatus`)) = \"PENDING\") and ((4 <= date_diff_str(clock_str(), cover ((`r`.`lastPendingDocumentDate`)), \"day\")) or (date_diff_str(cover ((`r`.`startDate`)), cover ((`r`.`lastPendingDocumentDate`)), \"day\") <= 5))) then \"OVERDUE\" else cover ((`r`.`verificationStatus`)) end"
},
{
"expr": "cover ((`r`.`startDate`))"
},
{
"expr": "cover ((`r`.`isInternational`))"
},
{
"expr": "cover ((`r`.`isOcrDirty`))"
},
{
"expr": "cover ((`r`.`endDate`))"
},
{
"expr": "cover ((`r`.`firstName`))"
},
{
"expr": "cover ((`r`.`middleName`))"
},
{
"expr": "cover ((`r`.`lastName`))"
},
{
"expr": "cover ((`r`.`personId`))"
},
{
"expr": "cover ((`r`.`securityPhotoMediaItemId`))"
},
{
"expr": "cover ((`r`.`cardNumber`))"
},
{
"expr": "cover ((`r`.`isReviewLater`))"
},
{
"expr": "cover ((`r`.`verifiedBy`))"
}
]
}
]
}
}
]
},
{
"#operator": "Order",
"limit": "20",
"sort_terms": [
{
"expr": "cover ((meta(`r`).`id`))"
}
]
},
{
"#operator": "Limit",
"expr": "20"
},
{
"#operator": "FinalProject"
}
]
},
"text": "Select case When r.verificationStatus = 'PENDING' and (date_diff_str(clock_local() , r.lastPendingDocumentDate, 'day') >= 4 OR date_diff_str(startDate , r.lastPendingDocumentDate, 'day') <= 5) Then 'OVERDUE' ELSE r.verificationStatus end as status,\nr.startDate,r.isInternational,r.isOcrDirty,r.endDate,r.firstName, r.middleName, r.lastName, r.personId, r.securityPhotoMediaItemId, r.cardNumber,r.isReviewLater, r.verifiedBy \nFrom reporting r\nwhere r.type = 'Person'\nAND meta(r).id NOT LIKE \"_sync%\" \nAND (ANY x IN [LOWER(r.cardNumber),LOWER(r.firstName), LOWER(r.lastName)] SATISFIES x like LOWER(\"b%\") END)\norder by meta(r).id limit 20 offset 0;"
}
Is DP4 really that faster than Non-Analytics version of couchbase Server? or it might be something else that I am missing. Please let me know.