getIndexStatus failed

I was able to add SERVER5 back into the cluster. Unfortunately the server was then auto-failovered.
Here is the log:

Node ('ns_1@SERVER5') was automatically failovered.
[{last_heard,{1476,217471,476683}},
{stale_slow_status,{1476,217391,461051}},
{now,{1476,217461,477799}},
{active_buckets,["adw_dmp_helper","adw_dmp","adw_dmp_cache"]},
{ready_buckets,[]},
{status_latency,4050},
{outgoing_replications_safeness_level,
[{"adw_dmp_cache",green},{"adw_dmp_helper",green},{"adw_dmp",green}]},
{incoming_replications_conf_hashes,
[{"adw_dmp_cache",[]},
{"adw_dmp_helper",
[{'ns_1@SERVER1',51588383},
{'ns_1@SERVER2',31434318},
{'ns_1@SERVER3',5142064},
{'ns_1@SERVER4',47698658}]},
{"adw_dmp",
[{'ns_1@SERVER1',51588383},
{'ns_1@SERVER2',31434318},
{'ns_1@SERVER3',5142064},
{'ns_1@SERVER4',47698658}]}]},
{local_tasks,
[[{pid,<<"<0.32116.629>">>},
{bucket,<<"adw_dmp">>},
{original_target,{[{type,bucket}]}},
{progress,15},
{started_on,1476215735},
{total_vbuckets,408},
{trigger_type,scheduled},
{type,bucket_compaction},
{updated_on,1476217295},
{vbuckets_done,62}]]},
{memory,
[{total,543092384},
{processes,209005272},
{processes_used,208909576},
{system,334087112},
{atom,594537},
{atom_used,582551},
{binary,61681632},
{code,14338494},
{ets,244106688}]},
{system_memory_data,
[{system_total_memory,203143090176},
{free_swap,0},
{total_swap,0},
{cached_memory,76768473088},
{buffered_memory,45056},
{free_memory,5950480384},
{total_memory,203143090176}]},
{node_storage_conf,
[{db_path,"/opt/couchbase/var/lib/couchbase/data"},
{index_path,"/couchbase_indexes"}]},
{statistics,
[{wall_clock,{3367033655,5000}},
{context_switches,{683992261,0}},
{garbage_collection,{184362848,763910679225,0}},
{io,{{input,489634681301},{output,1442443299899}}},
{reductions,{152576132077,721049}},
{run_queue,0},
{runtime,{192841640,630}},
{run_queues,
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}}]},
{system_stats,
[{cpu_utilization_rate,9.868007542426147},
{swap_total,0},
{swap_used,0},
{mem_total,203143090176},
{mem_free,82714206208}]},
{interesting_stats,
[{cmd_get,572.0},
{couch_docs_actual_disk_size,216535439645},
{couch_docs_data_size,156232156762},
{couch_spatial_data_size,0},
{couch_spatial_disk_size,0},
{couch_views_actual_disk_size,19698307868},
{couch_views_data_size,12829843192},
{curr_items,133287322},
{curr_items_tot,266242972},
{ep_bg_fetched,1.0},
{get_hits,399.0},
{mem_used,101170367517},
{ops,1552.0},
{vb_replica_curr_items,133129261}]},
{per_bucket_interesting_stats,
[{"adw_dmp",
[{cmd_get,317.0},
{couch_docs_actual_disk_size,210534646167},
{couch_docs_data_size,150718175657},
{couch_spatial_data_size,0},
{couch_spatial_disk_size,0},
{couch_views_actual_disk_size,19255124078},
{couch_views_data_size,12457328414},
{curr_items,129453593},
{curr_items_tot,258918489},
{ep_bg_fetched,1.0},
{get_hits,258.0},
{mem_used,92826191640},
{ops,669.0},
{vb_replica_curr_items,129464896}]},
{"adw_dmp_helper",
[{cmd_get,16.0},
{couch_docs_actual_disk_size,6000793478},
{couch_docs_data_size,5513981105},
{couch_spatial_data_size,0},
{couch_spatial_disk_size,0},
{couch_views_actual_disk_size,443183790},
{couch_views_data_size,372514778},
{curr_items,3660118},
{curr_items_tot,7324483},
{ep_bg_fetched,0.0},
{get_hits,2.0},
{mem_used,8299293448},
{ops,18.0},
{vb_replica_curr_items,3664365}]},
{"adw_dmp_cache",
[{cmd_get,239.0},
{couch_docs_data_size,0},
{curr_items,173611},
{curr_items_tot,0},
{ep_bg_fetched,0.0},
{get_hits,139.0},
{mem_used,44882429},
{ops,865.0},
{vb_replica_curr_items,0}]}]},
{processes_stats,
[{<<"godu/46123/ppid">>,46062},
{<<"godu/46123/mem_size">>,5971968},
{<<"godu/46123/mem_resident">>,1331200},
{<<"godu/46123/cpu_utilization">>,0},
{<<"godu/46123/major_faults_raw">>,61},
{<<"cbq-engine/46399/ppid">>,46395},
{<<"cbq-engine/46399/mem_size">>,579633152},
{<<"cbq-engine/46399/mem_resident">>,7364608},
{<<"cbq-engine/46399/cpu_utilization">>,0},
{<<"cbq-engine/46399/major_faults_raw">>,160},
{<<"indexer/46385/ppid">>,46383},
{<<"indexer/46385/mem_size">>,27131867136},
{<<"indexer/46385/mem_resident">>,1122639872},
{<<"indexer/46385/cpu_utilization">>,0},
{<<"indexer/46385/major_faults_raw">>,148},
{<<"projector/46380/ppid">>,46378},
{<<"projector/46380/mem_size">>,580124672},
{<<"projector/46380/mem_resident">>,4452352},
{<<"projector/46380/cpu_utilization">>,0},
{<<"projector/46380/major_faults_raw">>,89},
{<<"goxdcr/46136/ppid">>,46134},
{<<"goxdcr/46136/mem_size">>,363720704},
{<<"goxdcr/46136/mem_resident">>,4628480},
{<<"goxdcr/46136/cpu_utilization">>,0},
{<<"goxdcr/46136/major_faults_raw">>,71},
{<<"sigar_port/46133/ppid">>,2469},
{<<"sigar_port/46133/mem_size">>,11972608},
{<<"sigar_port/46133/mem_resident">>,5087232},
{<<"sigar_port/46133/cpu_utilization">>,0},
{<<"sigar_port/46133/major_faults_raw">>,0},
{<<"godu/46125/ppid">>,2469},
{<<"godu/46125/mem_size">>,4853760},
{<<"godu/46125/mem_resident">>,884736},
{<<"godu/46125/cpu_utilization">>,0},
{<<"godu/46125/major_faults_raw">>,0},
{<<"beam.smp/46062/ppid">>,2469},
{<<"beam.smp/46062/mem_size">>,5799075840},
{<<"beam.smp/46062/mem_resident">>,1942224896},
{<<"beam.smp/46062/cpu_utilization">>,223},
{<<"beam.smp/46062/major_faults_raw">>,35},
{<<"memcached/81712/ppid">>,2379},
{<<"memcached/81712/mem_size">>,117734207488},
{<<"memcached/81712/mem_resident">>,113022160896},
{<<"memcached/81712/cpu_utilization">>,71},
{<<"memcached/81712/major_faults_raw">>,63},
{<<"saslauthd-port/80484/ppid">>,2379},
{<<"saslauthd-port/80484/mem_size">>,107495424},
{<<"saslauthd-port/80484/mem_resident">>,1802240},
{<<"saslauthd-port/80484/cpu_utilization">>,0},
{<<"saslauthd-port/80484/major_faults_raw">>,23},
{<<"goport/46395/ppid">>,2379},
{<<"goport/46395/mem_size">>,5124096},
{<<"goport/46395/mem_resident">>,417792},
{<<"goport/46395/cpu_utilization">>,0},
{<<"goport/46395/major_faults_raw">>,3},
{<<"goport/46383/ppid">>,2379},
{<<"goport/46383/mem_size">>,5124096},
{<<"goport/46383/mem_resident">>,425984},
{<<"goport/46383/cpu_utilization">>,0},
{<<"goport/46383/major_faults_raw">>,0},
{<<"goport/46378/ppid">>,2379},
{<<"goport/46378/mem_size">>,5124096},
{<<"goport/46378/mem_resident">>,434176},
{<<"goport/46378/cpu_utilization">>,0},
{<<"goport/46378/major_faults_raw">>,0},
{<<"goport/46134/ppid">>,2379},
{<<"goport/46134/mem_size">>,5124096},
{<<"goport/46134/mem_resident">>,434176},
{<<"goport/46134/cpu_utilization">>,0},
{<<"goport/46134/major_faults_raw">>,0},
{<<"beam.smp/2469/ppid">>,2379},
{<<"beam.smp/2469/mem_size">>,3318468608},
{<<"beam.smp/2469/mem_resident">>,796700672},
{<<"beam.smp/2469/cpu_utilization">>,14},
{<<"beam.smp/2469/major_faults_raw">>,20},
{<<"beam.smp/2379/ppid">>,1},
{<<"beam.smp/2379/mem_size">>,1593823232},
{<<"beam.smp/2379/mem_resident">>,45670400},
{<<"beam.smp/2379/cpu_utilization">>,1},
{<<"beam.smp/2379/major_faults_raw">>,1}]},
{index_status,[{num_connections,0}]},
{cluster_compatibility_version,262144},
{version,
[{lhttpc,"1.3.0"},
{os_mon,"2.2.14"},
{public_key,"0.21"},
{asn1,"2.0.4"},
{kernel,"2.16.4"},
{ale,"4.0.0-4051-community"},
{inets,"5.9.8"},
{ns_server,"4.0.0-4051-community"},
{crypto,"3.2"},
{ssl,"5.3.3"},
{sasl,"2.3.4"},
{stdlib,"1.19.4"}]},
{supported_compat_version,[4,0]},
{advertised_version,[4,0,0]},
{system_arch,"x86_64-unknown-linux-gnu"},
{wall_clock,3367033},
{memory_data,{203143090176,197231075328,{<19866.146.0>,41050832}}},
{disk_data,
[{"/dev",10240,0},
{"/run",39676388,11},
{"/",3904036336,6},
{"/dev/shm",99190960,0},
{"/run/lock",5120,0},
{"/sys/fs/cgroup",99190960,0},
{"/etc/machine-id",39676388,11},
{"/couchbase_indexes",937233624,3},
{"/boot",499656,8}]},
{meminfo,
<<"MemTotal: 198381924 kB\nMemFree: 5811140 kB\nMemAvailable: 78621668 kB\nBuffers: 44 kB\nCached: 74969204 kB\nSwapCached: 0 kB\nActive: 152109116 kB\nInactive: 37268700 kB\nActive(anon): 116936648 kB\nInactive(anon): 1525788 kB\nActive(file): 35172468 kB\nInactive(file): 35742912 kB\nUnevictable: 14344 kB\nMlocked: 14344 kB\nSwapTotal: 0 kB\nSwapFree: 0 kB\nDirty: 127316 kB\nWriteback: 49240 kB\nAnonPages: 114425100 kB\nMapped: 99992 kB\nShmem: 4048904 kB\nSlab: 2370436 kB\nSReclaimable: 2233008 kB\nSUnreclaim: 137428 kB\nKernelStack: 14784 kB\nPageTables: 263400 kB\nNFS_Unstable: 0 kB\nBounce: 0 kB\nWritebackTmp: 0 kB\nCommitLimit: 99190960 kB\nCommitted_AS: 149483864 kB\nVmallocTotal: 34359738367 kB\nVmallocUsed: 716036 kB\nVmallocChunk: 34258220536 kB\nHardwareCorrupted: 0 kB\nAnonHugePages: 0 kB\nHugePages_Total: 0\nHugePages_Free: 0\nHugePages_Rsvd: 0\nHugePages_Surp: 0\nHugepagesize: 2048 kB\nDirectMap4k: 119548 kB\nDirectMap2M: 3979264 kB\nDirectMap1G: 199229440 kB\n">>}]

Here is the entry from the error.log that gets repeated over and over:

[ns_server:error,2016-10-11T22:25:43.625+02:00,ns_1@SERVER5:timeout_diag_logger<0.129.0>:timeout_diag_logger:do_diag:107]
{<0.0.0>,
 [{registered_name,init},
  {status,waiting},
  {initial_call,{otp_ring0,start,2}},
  {backtrace,[<<"Program counter: 0x00007feddf04a4c8 (init:boot_loop/2 + 64)">>,
              <<"CP: 0x0000000000000000 (invalid)">>,<<"arity = 0">>,<<>>,
              <<"0x00007fed863feaa0 Return addr 0x0000000000874708 (<terminate process normally>)">>,
              <<"y(0)     []">>,
              <<"(1)     {state,[{'-root',[<<25 bytes>>]},{'-progname',[<<3 bytes>>]},{'-home',[<<14 bytes>>]},{'-smp',[<<6 bytes>>]},{'-">>,
              <<"y(2)     <0.2.0>">>,<<>>]},
  {error_handler,error_handler},
  {garbage_collection,[{min_bin_vheap_size,46422},
                       {min_heap_size,233},
                       {fullsweep_after,512},
                       {minor_gcs,396}]},
  {heap_size,2586},
  {total_heap_size,6771},
  {links,[<0.3.0>,<0.6.0>,<0.7.0>,<0.2.0>]},
  {monitors,[]},
  {monitored_by,[]},
  {memory,55024},
  {message_queue_len,0},
  {reductions,399930},
  {trap_exit,true},
  {current_location,{init,boot_loop,2,[]}}]}

@mathias, the logs you uploaded are not at Debug log level. Three out of the four nodes have their last log line from four months back and nothing after that. In case you already tried changing the log level from the UI, I suspect the indexer processes might be in a hung state. In that case, can you restart Couchbase on all nodes, one by one?
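
Something along these lines can drive a rolling restart; this is only a rough sketch, and the hostnames, SSH access, service name, and credentials are placeholders you would need to adapt:

# Rough sketch of a rolling restart: restart couchbase-server on one node,
# wait until ns_server reports it healthy again, then move to the next.
# Hostnames, SSH access and credentials below are placeholders.
import subprocess
import time
import requests

NODES = ["adwcb01.adwebster.com", "adwcb02.adwebster.com",
         "adwcb03.adwebster.com", "adwcb04.adwebster.com"]
AUTH = ("Administrator", "password")

def node_healthy(host):
    # /pools/default lists every cluster node together with its status field.
    r = requests.get("http://%s:8091/pools/default" % host, auth=AUTH, timeout=10)
    r.raise_for_status()
    return any(host in n["hostname"] and n["status"] == "healthy"
               for n in r.json()["nodes"])

for host in NODES:
    print("Restarting Couchbase on %s" % host)
    subprocess.check_call(["ssh", host, "sudo systemctl restart couchbase-server"])
    while not node_healthy(host):
        time.sleep(10)
    print("%s is healthy again" % host)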

@deepkaran.salooja I uploaded the logs again to the same location.
Hopefully they are informative enough now.

@mathias, there seem to be some communication issues in your cluster. The indexer nodes appear to be having trouble talking to each other, which leads to the errors you are seeing:

2016-10-18T08:57:57.49Z+02:00 [Debug] RequestHandler::handleIndexStatusRequest: failed nodes [adwcb03.adwebster.com:9102]
2016-10-18T08:58:57.392Z+02:00 [Debug] RequestHandler::handleIndexStatusRequest: failed nodes [adwcb01.adwebster.com:9102]
2016-10-18T08:59:17.408Z+02:00 [Debug] RequestHandler::handleIndexStatusRequest: failed nodes [adwcb02.adwebster.com:9102]

[stats:warn,2016-10-18T11:55:40.384+02:00,ns_1@adwcb01.adwebster.com:system_stats_collector<0.334.0>:base_stats_collector:latest_tick:69](Collector: system_stats_collector) Dropped 1 ticks
[stats:warn,2016-10-18T11:55:40.385+02:00,ns_1@adwcb01.adwebster.com:query_stats_collector<0.6385.6989>:base_stats_collector:latest_tick:69](Collector: query_stats_collector) Dropped 1 ticks
[stats:warn,2016-10-18T11:55:40.390+02:00,ns_1@adwcb01.adwebster.com:<0.3161.4>:base_stats_collector:latest_tick:69](Collector: stats_collector) Dropped 1 ticks

The indexer process is running fine on all nodes. I cannot tell the exact problem from the logs.
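
One way to narrow this down is to query each node's indexer admin port (9102, as in the failed-nodes lines above) directly and see which ones answer. A rough sketch, assuming the getIndexStatus endpoint is reachable with the cluster credentials:

# Sketch: probe getIndexStatus on the indexer admin port (9102) of every
# node and report which ones answer and which ones time out or refuse.
# Hostnames are taken from the log excerpt above; credentials are placeholders.
import requests

NODES = ["adwcb01.adwebster.com", "adwcb02.adwebster.com", "adwcb03.adwebster.com"]
AUTH = ("Administrator", "password")

for host in NODES:
    url = "http://%s:9102/getIndexStatus" % host
    try:
        r = requests.get(url, auth=AUTH, timeout=5)
        print("%s -> HTTP %d, %d bytes" % (host, r.status_code, len(r.content)))
    except requests.RequestException as exc:
        print("%s -> FAILED: %s" % (host, exc))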

@deepkaran.salooja, sorry, I was on holiday…

I checked whether all the required ports are open and tested the bandwidth with iperf. The bandwidth test showed around 800 Mbit/s with a ping of 1 ms.
Do you have any other suggestions for what I could test to find the communication issue?
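
For anyone checking the same thing, a plain TCP connect test along these lines covers the relevant ports; the port list is an assumption based on the default Couchbase 4.x cluster, view, data, and index service ports:

# Sketch: plain TCP connectivity check from this machine to every node.
# The port list is an assumption (8091/8092 cluster and views, 11210 data,
# 9100-9105 index service); adjust it for your setup.
import socket

NODES = ["adwcb01.adwebster.com", "adwcb02.adwebster.com", "adwcb03.adwebster.com"]
PORTS = [8091, 8092, 11210, 9100, 9101, 9102, 9103, 9104, 9105]

for host in NODES:
    for port in PORTS:
        try:
            socket.create_connection((host, port), timeout=3).close()
            print("%s:%d open" % (host, port))
        except OSError as exc:
            print("%s:%d NOT reachable (%s)" % (host, port, exc))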

@deepkaran.salooja, I have now analyzed the traffic on the interface used for cluster communication, and I think I found the issue…

Every time the error appears in error.log, I also see a huge view_merge request, and while that request is being answered the servers send new view_merge requests, which could be causing the timeouts. I'll try this week to reduce the size of our biggest view to see whether things improve.
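
To compare before and after, a quick timing of the view query should show how heavy the responses are. A sketch: the bucket name comes from the stats above, but the design document and view names are placeholders, and 8092 is the default view port:

# Sketch: time a view query and report the response size, to compare
# before and after shrinking the view. The ddoc/view names are placeholders.
import time
import requests

AUTH = ("Administrator", "password")
URL = ("http://adwcb01.adwebster.com:8092/adw_dmp/_design/my_ddoc/_view/my_view"
       "?stale=ok&limit=100")

start = time.time()
r = requests.get(URL, auth=AUTH, timeout=300)
print("HTTP %d, %d bytes in %.1f s" % (r.status_code, len(r.content), time.time() - start))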

So I was able to resolve / work around this issue.

With a bit of force I was able to remove the two affected nodes from the cluster and reinstall them. Everything works like a charm now.
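
For reference, forcing a node out usually amounts to a hard failover followed by a rebalance, which can be driven over the REST API roughly like this; a sketch only, with placeholder credentials, and otpNode names following the ns_1@<hostname> pattern from the logs:

# Sketch: hard-failover one node and rebalance it out via the REST API.
# Credentials and the node name are placeholders; otpNode names use the
# ns_1@<hostname> form seen in the logs above.
import requests

CLUSTER = "http://adwcb01.adwebster.com:8091"
AUTH = ("Administrator", "password")
BAD_NODE = "ns_1@adwcb03.adwebster.com"

# 1. Hard failover of the affected node.
requests.post(CLUSTER + "/controller/failOver",
              auth=AUTH, data={"otpNode": BAD_NODE}).raise_for_status()

# 2. Rebalance it out: knownNodes is the full comma-separated otpNode list,
#    ejectedNodes lists the nodes to remove.
nodes = requests.get(CLUSTER + "/pools/default", auth=AUTH).json()["nodes"]
known = ",".join(n["otpNode"] for n in nodes)
requests.post(CLUSTER + "/controller/rebalance",
              auth=AUTH,
              data={"knownNodes": known, "ejectedNodes": BAD_NODE}).raise_for_status()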

One thing I would like to see changed in future releases is the log messages, because they are quite confusing.

In the admin interface the log entry tells me that one node, let’s call it node1, was not able to connect to the master node.

But in the indexer log files of all the other nodes in the cluster, I see that this particular node1 was marked as a failed node.

From my point of view, the log message in the admin interface should show which node sees which other nodes as failed. This would generate more log entries, but it would help a lot in finding out which nodes are misbehaving.

The reason for this request is that I was not 100% sure whether all the other nodes were misbehaving or only node1.
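
Until then, that information can be gathered by hand: run the status request from every node against every other node and see who fails to reach whom. A sketch, assuming SSH access to the nodes, curl installed on them, and placeholder credentials:

# Sketch: build a "who can reach whom" matrix for the indexer admin port
# by running curl from every node against every other node over SSH.
# Assumes passwordless SSH and curl on the nodes; credentials are placeholders.
import subprocess

NODES = ["adwcb01.adwebster.com", "adwcb02.adwebster.com", "adwcb03.adwebster.com"]

for source in NODES:
    for target in NODES:
        if source == target:
            continue
        cmd = ("curl -s -o /dev/null -m 5 -w '%%{http_code}' "
               "-u Administrator:password http://%s:9102/getIndexStatus" % target)
        result = subprocess.run(["ssh", source, cmd], capture_output=True, text=True)
        print("%s -> %s : %s" % (source, target, result.stdout.strip() or "no response"))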

Thanks to @deepkaran.salooja for the help