Full Text Search: Improving Query Performance

Hi Community,

Description:

I have a data set of around 4–5 million documents, for which I need to configure full-text search with the lowest possible response time.
I configured the FTS index as shown below.

{
 "name": "full_text_index",
 "type": "fulltext-index",
 "params": {
  "mapping": {
   "types": {
    "_default.native": {
     "enabled": true,
     "dynamic": true,
     "default_analyzer": "standard",
     "properties": {
      "text": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "text",
         "type": "text",
         "analyzer": "simple",
         "store": false,
         "index": true,
         "include_term_vectors": true,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      },
      "tenant": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "tenant",
         "type": "text",
         "analyzer": "keyword",
         "store": false,
         "index": true,
         "include_term_vectors": false,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      },
      "status": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "status",
         "type": "text",
         "analyzer": "keyword",
         "store": false,
         "index": true,
         "include_term_vectors": false,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      },
      "locale": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "locale",
         "type": "text",
         "analyzer": "keyword",
         "store": false,
         "index": true,
         "include_term_vectors": false,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      },
      "lastUpdateTime": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "lastUpdateTime",
         "type": "number",
         "store": false,
         "index": true,
         "include_term_vectors": false,
         "include_in_all": false,
         "docvalues": true
        }
       ]
      },
      "productIds": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "productIds",
         "type": "text",
         "analyzer": "keyword",
         "store": false,
         "index": true,
         "include_term_vectors": false,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      },
      "id": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "id",
         "type": "text",
         "analyzer": "keyword",
         "store": false,
         "index": true,
         "include_term_vectors": false,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      },
      "summary": {
       "enabled": true,
       "dynamic": false,
       "fields": [
        {
         "name": "summary",
         "type": "text",
         "analyzer": "simple",
         "store": false,
         "index": true,
         "include_term_vectors": true,
         "include_in_all": false,
         "docvalues": false
        }
       ]
      }
     }
    }
   },
   "default_mapping": {
    "enabled": false,
    "dynamic": true
   },
   "default_type": "_default",
   "default_analyzer": "standard",
   "default_datetime_parser": "dateTimeOptional",
   "default_field": "",
   "store_dynamic": false,
   "index_dynamic": false,
   "docvalues_dynamic": false
  },
  "store": {
   "indexType": "scorch",
   "kvStoreName": ""
  },
  "doc_config": {
   "docid_prefix_delim": "",
   "docid_regexp": "",
   "mode": "scope.collection.type_field",
   "type_field": "type"
  }
 },
 "sourceType": "couchbase",
 "sourceName": "Sample",
 "sourceUUID": "be04daad7edfa09f20ecf781c0817483",
 "sourceParams": {},
 "planParams": {
  "maxPartitionsPerPIndex": 1024,
  "numReplicas": 0,
  "indexPartitions": 12
 },
 "uuid": ""
}

Document Description:
tenant, status, and locale are string attributes that need an exact match, hence the keyword analyzer.
productIds is a list of IDs that need an exact match, hence the keyword analyzer.
lastUpdateTime is a long value that I need to query by range and sort in descending order.
id is a string that I need to query for either an exact match or a partial match using a wildcard, such as a suffix match (e.g. *documentId).
text and summary are text attributes where I need to match phrases or individual words.

I created the index as shown in the definition above, with 12 index partitions and without using any custom analyzer or filter.

Search Query:

{
    "query": {
        "conjuncts": [
            {
                "disjuncts": [
                    {
                        "wildcard": "*{{searchText}}",
                        "field": "id"
                    },
                    {
                        "match_phrase": "{{searchText}}",
                        "field": "text"
                    },
                    {
                        "match_phrase": "{{searchText}}",
                        "field": "summary"
                    },
                    {
                        "match": "{{searchText}}",
                        "field": "prod"
                    }
                ]
            },
            {
                "term": "abc-123",
                "field": "tenant"
            },
            {
                "disjuncts": [
                    {
                        "term": "en",
                        "field": "locale"
                    }
                ]
            },
            {
                "disjuncts": [
                    {
                        "term": "Approved",
                        "field": "status"
                    },
                    {
                        "term": "Rejected",
                        "field": "status"
                    }
                ]
            },
            {
                "field": "lastUpdateTime",
                "min": 1603799414000,
                "max": 1730029814000,
                "inclusive_min": true,
                "inclusive_max": true
            }
        ]
    },
    "sort": [
        "-lastUpdateTime"
    ],
    "size": 10,
    "from": 0
}

My query looks like the above. The {{searchText}} placeholder is replaced with dynamic input from the UI, and the other query attributes are filled in based on the user type and filter parameters.

Problem:
Currently, with the above index configuration and 4–5 million documents, queries return in 400–500 ms, even though I do not store any field data in the index for retrieval (storing it would increase the index size on disk).
I need to get a response within 50 ms. Is it possible to achieve such low latency? If so, can anyone help me make this query return faster?

Hello @mohithraj_kulal ,

Your query generally looks good for the index.

A wildcard query could be the slow part here because it runs in two phases: first the candidate terms matching the wildcard are determined, and then a disjunction runs over all of those candidate terms. You can look into moving this query-time computation to index time by devising a custom analyzer that uses, say, a whitespace or unicode tokenizer and, importantly, an ngram token filter with a low enough min (smallest word size) and a reasonable max (maybe the max word length).

With this your index size would slightly increase (to hold more tokens for the data), but you get to replace your wildcard query with a term query instead …

{
  "query": {
    "conjuncts": [
      {
        "disjuncts": [
          {"field": "id", "term": "{{searchText}}"},
          {"match_phrase": "{{searchText}}", "field": "text"},
          {"match_phrase": "{{searchText}}", "field": "summary"},
          {"match": "{{searchText}}", "field": "prod"}
        ]
      },
      {"term": "abc-123", "field": "tenant"},
      {"term": "en", "field": "locale"},
      {
        "disjuncts": [
          {"term": "Approved", "field": "status"},
          {"term": "Rejected", "field": "status"}
        ]
      },
      {
        "field": "lastUpdateTime",
        "min": 1603799414000,
        "max": 1730029814000,
        "inclusive_min": true,
        "inclusive_max": true
      }
    ]
  },
  "sort": [
    "-lastUpdateTime"
  ],
  "size": 10,
  "from": 0
}

Another optimization also comes to mind here …

  • find a common analyzer for fields - id, text, summary and prod
  • set include_in_all to true for all these fields
  • set the default_analyzer at the top level of your index definition to this common analyzer, or maybe something better
  • now you can replace the top 4 disjuncts with a single match query (uses the default_analyzer) that would search within the _all field which will be a composite of the 4 fields above …
{
  "query": {
    "conjuncts": [
      {"match": "{{searchText}}"},
      {"term": "abc-123", "field": "tenant"},
      {"term": "en", "field": "locale"},
      {
        "disjuncts": [
          {"term": "Approved", "field": "status"},
          {"term": "Rejected", "field": "status"}
        ]
      },
      {
        "field": "lastUpdateTime",
        "min": 1603799414000,
        "max": 1730029814000,
        "inclusive_min": true,
        "inclusive_max": true
      }
    ]
  },
  "sort": [
    "-lastUpdateTime"
  ],
  "size": 10,
  "from": 0
}