Diffstat (limited to 'tests/cluster')
-rw-r--r--  tests/cluster/cluster.tcl | 130
-rw-r--r--  tests/cluster/run.tcl | 28
-rw-r--r--  tests/cluster/tests/00-base.tcl | 59
-rw-r--r--  tests/cluster/tests/01-faildet.tcl | 38
-rw-r--r--  tests/cluster/tests/02-failover.tcl | 65
-rw-r--r--  tests/cluster/tests/03-failover-loop.tcl | 115
-rw-r--r--  tests/cluster/tests/04-resharding.tcl | 172
-rw-r--r--  tests/cluster/tests/05-slave-selection.tcl | 94
-rw-r--r--  tests/cluster/tests/06-slave-stop-cond.tcl | 73
-rw-r--r--  tests/cluster/tests/07-replica-migration.tcl | 103
-rw-r--r--  tests/cluster/tests/08-update-msg.tcl | 90
-rw-r--r--  tests/cluster/tests/09-pubsub.tcl | 40
-rw-r--r--  tests/cluster/tests/10-manual-failover.tcl | 192
-rw-r--r--  tests/cluster/tests/11-manual-takeover.tcl | 59
-rw-r--r--  tests/cluster/tests/12-replica-migration-2.tcl | 64
-rw-r--r--  tests/cluster/tests/helpers/onlydots.tcl | 16
-rw-r--r--  tests/cluster/tests/includes/init-tests.tcl | 70
-rw-r--r--  tests/cluster/tmp/.gitignore | 2
18 files changed, 1410 insertions, 0 deletions
diff --git a/tests/cluster/cluster.tcl b/tests/cluster/cluster.tcl
new file mode 100644
index 0000000..0647914
--- /dev/null
+++ b/tests/cluster/cluster.tcl
@@ -0,0 +1,130 @@
+# Cluster-specific test functions.
+#
+# Copyright (C) 2014 Salvatore Sanfilippo [email protected]
+# This software is released under the BSD License. See the COPYING file for
+# more information.
+
+# Returns a parsed CLUSTER NODES output as a list of dictionaries.
+proc get_cluster_nodes id {
+ set lines [split [R $id cluster nodes] "\r\n"]
+ set nodes {}
+ foreach l $lines {
+ set l [string trim $l]
+ if {$l eq {}} continue
+ set args [split $l]
+ set node [dict create \
+ id [lindex $args 0] \
+ addr [lindex $args 1] \
+ flags [split [lindex $args 2] ,] \
+ slaveof [lindex $args 3] \
+ ping_sent [lindex $args 4] \
+ pong_recv [lindex $args 5] \
+ config_epoch [lindex $args 6] \
+ linkstate [lindex $args 7] \
+ slots [lrange $args 8 end] \
+ ]
+ lappend nodes $node
+ }
+ return $nodes
+}
+
+# Test if the specified node has the given flag.
+proc has_flag {node flag} {
+ expr {[lsearch -exact [dict get $node flags] $flag] != -1}
+}
+
+# Returns the parsed myself node entry as a dictionary.
+proc get_myself id {
+ set nodes [get_cluster_nodes $id]
+ foreach n $nodes {
+ if {[has_flag $n myself]} {return $n}
+ }
+ return {}
+}
+
+# Return the value of the specified CLUSTER INFO field.
+proc CI {n field} {
+ get_info_field [R $n cluster info] $field
+}
+
+# Assuming the nodes are reset, this function performs slot allocation.
+# Only the first 'n' nodes are used.
+proc cluster_allocate_slots {n} {
+ set slot 16383
+ while {$slot >= 0} {
+ # Allocate successive slots to random nodes.
+ set node [randomInt $n]
+ lappend slots_$node $slot
+ incr slot -1
+ }
+ for {set j 0} {$j < $n} {incr j} {
+ R $j cluster addslots {*}[set slots_${j}]
+ }
+}
+
+# Check that cluster nodes agree about "state", or raise an error.
+proc assert_cluster_state {state} {
+ foreach_redis_id id {
+ if {[instance_is_killed redis $id]} continue
+ wait_for_condition 1000 50 {
+ [CI $id cluster_state] eq $state
+ } else {
+ fail "Cluster node $id cluster_state:[CI $id cluster_state]"
+ }
+ }
+}
+
+# Search the first node starting from ID $first that is not
+# already configured as a slave.
+proc cluster_find_available_slave {first} {
+ foreach_redis_id id {
+ if {$id < $first} continue
+ if {[instance_is_killed redis $id]} continue
+ set me [get_myself $id]
+ if {[dict get $me slaveof] eq {-}} {return $id}
+ }
+ fail "No available slaves"
+}
+
+# Add 'slaves' slaves to a cluster composed of 'masters' masters.
+# It assumes that masters are allocated sequentially from instance ID 0
+# to N-1.
+proc cluster_allocate_slaves {masters slaves} {
+ for {set j 0} {$j < $slaves} {incr j} {
+ set master_id [expr {$j % $masters}]
+ set slave_id [cluster_find_available_slave $masters]
+ set master_myself [get_myself $master_id]
+ R $slave_id cluster replicate [dict get $master_myself id]
+ }
+}
+
+# Create a cluster composed of the specified number of masters and slaves.
+proc create_cluster {masters slaves} {
+ cluster_allocate_slots $masters
+ if {$slaves} {
+ cluster_allocate_slaves $masters $slaves
+ }
+ assert_cluster_state ok
+}
+
+# Set the cluster node-timeout on all the reachable nodes.
+proc set_cluster_node_timeout {to} {
+ foreach_redis_id id {
+ catch {R $id CONFIG SET cluster-node-timeout $to}
+ }
+}
+
+# Check if the cluster is writable and readable. Use node "id"
+# as a starting point to talk with the cluster.
+proc cluster_write_test {id} {
+ set prefix [randstring 20 20 alpha]
+ set port [get_instance_attrib redis $id port]
+ set cluster [redis_cluster 127.0.0.1:$port]
+ for {set j 0} {$j < 100} {incr j} {
+ $cluster set key.$j $prefix.$j
+ }
+ for {set j 0} {$j < 100} {incr j} {
+ assert {[$cluster get key.$j] eq "$prefix.$j"}
+ }
+ $cluster close
+}
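The helpers above are what every unit below builds on, and get_cluster_nodes assumes the standard CLUSTER NODES line layout. As a reference, here is a small, self-contained sketch of how one such line maps onto the dictionary fields the tests query (the node ID, address and slot range are illustrative values, not output from a real run):

    # One line per node, space separated:
    # <id> <ip:port> <flags> <master-id|-> <ping-sent> <pong-recv> <config-epoch> <link-state> <slots...>
    set l "3fc5a9d1 127.0.0.1:30001 myself,master - 0 0 1 connected 0-5460"
    set args [split $l]
    set node [dict create \
        id [lindex $args 0] \
        addr [lindex $args 1] \
        flags [split [lindex $args 2] ,] \
        slaveof [lindex $args 3] \
        config_epoch [lindex $args 6] \
        linkstate [lindex $args 7] \
        slots [lrange $args 8 end]]
    puts [dict get $node flags]   ;# -> myself master
    puts [dict get $node slots]   ;# -> 0-5460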
diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl
new file mode 100644
index 0000000..93603dd
--- /dev/null
+++ b/tests/cluster/run.tcl
@@ -0,0 +1,28 @@
+# Cluster test suite. Copyright (C) 2014 Salvatore Sanfilippo [email protected]
+# This software is released under the BSD License. See the COPYING file for
+# more information.
+
+cd tests/cluster
+source cluster.tcl
+source ../instances.tcl
+source ../../support/cluster.tcl ; # Redis Cluster client.
+
+set ::instances_count 20 ; # How many instances we use at max.
+
+proc main {} {
+ parse_options
+ spawn_instance redis $::redis_base_port $::instances_count {
+ "cluster-enabled yes"
+ "appendonly yes"
+ }
+ run_tests
+ cleanup
+ end_tests
+}
+
+if {[catch main e]} {
+ puts $::errorInfo
+ if {$::pause_on_error} pause_on_error
+ cleanup
+ exit 1
+}
diff --git a/tests/cluster/tests/00-base.tcl b/tests/cluster/tests/00-base.tcl
new file mode 100644
index 0000000..280befb
--- /dev/null
+++ b/tests/cluster/tests/00-base.tcl
@@ -0,0 +1,59 @@
+# Check basic cluster functionality: node IDs, slot allocation, config epochs and writes.
+
+source "../tests/includes/init-tests.tcl"
+
+if {$::simulate_error} {
+ test "This test will fail" {
+ fail "Simulated error"
+ }
+}
+
+test "Different nodes have different IDs" {
+ set ids {}
+ set numnodes 0
+ foreach_redis_id id {
+ incr numnodes
+ # Every node should just know itself.
+ set nodeid [dict get [get_myself $id] id]
+ assert {$nodeid ne {}}
+ lappend ids $nodeid
+ }
+ set numids [llength [lsort -unique $ids]]
+ assert {$numids == $numnodes}
+}
+
+test "It is possible to perform slot allocation" {
+ cluster_allocate_slots 5
+}
+
+test "After the join, every node gets a different config epoch" {
+ set trynum 60
+ while {[incr trynum -1] != 0} {
+ # We check that this condition is true for *all* the nodes.
+ set ok 1 ; # Will be set to 0 every time a node is not ok.
+ foreach_redis_id id {
+ set epochs {}
+ foreach n [get_cluster_nodes $id] {
+ lappend epochs [dict get $n config_epoch]
+ }
+ if {[lsort $epochs] != [lsort -unique $epochs]} {
+ set ok 0 ; # At least one collision!
+ }
+ }
+ if {$ok} break
+ after 1000
+ puts -nonewline .
+ flush stdout
+ }
+ if {$trynum == 0} {
+ fail "Config epoch conflict resolution is not working."
+ }
+}
+
+test "Nodes should report cluster_state is ok now" {
+ assert_cluster_state ok
+}
+
+test "It is possible to write and read from the cluster" {
+ cluster_write_test 0
+}
diff --git a/tests/cluster/tests/01-faildet.tcl b/tests/cluster/tests/01-faildet.tcl
new file mode 100644
index 0000000..8fe87c9
--- /dev/null
+++ b/tests/cluster/tests/01-faildet.tcl
@@ -0,0 +1,38 @@
+# Check the basic monitoring and failover capabilities.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster should start ok" {
+ assert_cluster_state ok
+}
+
+test "Killing two slave nodes" {
+ kill_instance redis 5
+ kill_instance redis 6
+}
+
+test "Cluster should be still up" {
+ assert_cluster_state ok
+}
+
+test "Killing one master node" {
+ kill_instance redis 0
+}
+
+# Note: the only slave of instance 0 is already down, so no failover is
+# possible; a failover would change the state back to ok.
+test "Cluster should be down now" {
+ assert_cluster_state fail
+}
+
+test "Restarting master node" {
+ restart_instance redis 0
+}
+
+test "Cluster should be up again" {
+ assert_cluster_state ok
+}
diff --git a/tests/cluster/tests/02-failover.tcl b/tests/cluster/tests/02-failover.tcl
new file mode 100644
index 0000000..6b2fd09
--- /dev/null
+++ b/tests/cluster/tests/02-failover.tcl
@@ -0,0 +1,65 @@
+# Check the basic monitoring and failover capabilities.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 0
+}
+
+test "Instance #5 is a slave" {
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Instance #5 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 master link status is not up"
+ }
+}
+
+set current_epoch [CI 1 cluster_current_epoch]
+
+test "Killing one master node" {
+ kill_instance redis 0
+}
+
+test "Wait for failover" {
+ wait_for_condition 1000 50 {
+ [CI 1 cluster_current_epoch] > $current_epoch
+ } else {
+ fail "No failover detected"
+ }
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 1
+}
+
+test "Instance #5 is now a master" {
+ assert {[RI 5 role] eq {master}}
+}
+
+test "Restarting the previously killed master node" {
+ restart_instance redis 0
+}
+
+test "Instance #0 gets converted into a slave" {
+ wait_for_condition 1000 50 {
+ [RI 0 role] eq {slave}
+ } else {
+ fail "Old master was not converted into slave"
+ }
+}
diff --git a/tests/cluster/tests/03-failover-loop.tcl b/tests/cluster/tests/03-failover-loop.tcl
new file mode 100644
index 0000000..8e1bcd6
--- /dev/null
+++ b/tests/cluster/tests/03-failover-loop.tcl
@@ -0,0 +1,115 @@
+# Failover stress test.
+# In this test a different node is killed in a loop for N
+# iterations. The test checks that certain properties
+# are preserved across iterations.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+set iterations 20
+set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]
+
+while {[incr iterations -1]} {
+ set tokill [randomInt 10]
+ set other [expr {($tokill+1)%10}] ; # Some other instance.
+ set key [randstring 20 20 alpha]
+ set val [randstring 20 20 alpha]
+ set role [RI $tokill role]
+ if {$role eq {master}} {
+ set slave {}
+ set myid [dict get [get_myself $tokill] id]
+ foreach_redis_id id {
+ if {$id == $tokill} continue
+ if {[dict get [get_myself $id] slaveof] eq $myid} {
+ set slave $id
+ }
+ }
+ if {$slave eq {}} {
+ fail "Unable to retrieve slave's ID for master #$tokill"
+ }
+ }
+
+ puts "--- Iteration $iterations ---"
+
+ if {$role eq {master}} {
+ test "Wait for slave of #$tokill to sync" {
+ wait_for_condition 1000 50 {
+ [string match {*state=online*} [RI $tokill slave0]]
+ } else {
+ fail "Slave of node #$tokill is not ok"
+ }
+ }
+ set slave_config_epoch [CI $slave cluster_my_epoch]
+ }
+
+ test "Cluster is writable before failover" {
+ for {set i 0} {$i < 100} {incr i} {
+ catch {$cluster set $key:$i $val:$i} err
+ assert {$err eq {OK}}
+ }
+ # Wait for the write to propagate to the slave if we
+ # are going to kill a master.
+ if {$role eq {master}} {
+ R $tokill wait 1 20000
+ }
+ }
+
+ test "Killing node #$tokill" {
+ kill_instance redis $tokill
+ }
+
+ if {$role eq {master}} {
+ test "Wait failover by #$slave with old epoch $slave_config_epoch" {
+ wait_for_condition 1000 50 {
+ [CI $slave cluster_my_epoch] > $slave_config_epoch
+ } else {
+ fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
+ }
+ }
+ }
+
+ test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+ }
+
+ test "Cluster is writable again" {
+ for {set i 0} {$i < 100} {incr i} {
+ catch {$cluster set $key:$i:2 $val:$i:2} err
+ assert {$err eq {OK}}
+ }
+ }
+
+ test "Restarting node #$tokill" {
+ restart_instance redis $tokill
+ }
+
+ test "Instance #$tokill is now a slave" {
+ wait_for_condition 1000 50 {
+ [RI $tokill role] eq {slave}
+ } else {
+ fail "Restarted instance is not a slave"
+ }
+ }
+
+ test "We can read back the value we set before" {
+ for {set i 0} {$i < 100} {incr i} {
+ catch {$cluster get $key:$i} err
+ assert {$err eq "$val:$i"}
+ catch {$cluster get $key:$i:2} err
+ assert {$err eq "$val:$i:2"}
+ }
+ }
+}
+
+test "Post condition: current_epoch >= my_epoch everywhere" {
+ foreach_redis_id id {
+ assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
+ }
+}
diff --git a/tests/cluster/tests/04-resharding.tcl b/tests/cluster/tests/04-resharding.tcl
new file mode 100644
index 0000000..0ccbf71
--- /dev/null
+++ b/tests/cluster/tests/04-resharding.tcl
@@ -0,0 +1,172 @@
+# Live resharding test.
+# In this test a live resharding is performed while the cluster is
+# receiving writes. The test checks that the keyspace content is
+# preserved across the resharding.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Enable AOF in all the instances" {
+ foreach_redis_id id {
+ R $id config set appendonly yes
+ # We use "appendfsync no" because it's fast but also guarantees that
+ # write(2) is performed before replying to client.
+ R $id config set appendfsync no
+ }
+
+ foreach_redis_id id {
+ wait_for_condition 1000 500 {
+ [RI $id aof_rewrite_in_progress] == 0 &&
+ [RI $id aof_enabled] == 1
+ } else {
+ fail "Failed to enable AOF on instance #$id"
+ }
+ }
+}
+
+# Return non-zero if the specified PID corresponds to a process still in
+# execution, otherwise 0 is returned.
+proc process_is_running {pid} {
+ # PS should return with an error if the PID does not exist, in which
+ # case catch will return non-zero. We want to return non-zero if the
+ # PID exists, so we invert the return value with the ! (not) operator.
+ expr {![catch {exec ps -p $pid}]}
+}
+
+# Our resharding test performs the following actions:
+#
+# - N commands are sent to the cluster in the course of the test.
+# - Every command selects a random key from key:0 to key:MAX-1.
+# - The operation RPUSH key <randomvalue> is performed.
+# - Tcl remembers into an array all the values pushed to each list.
+# - After N/2 commands, the resharding process is started in background.
+# - The test continues while the resharding is in progress.
+# - At the end of the test, we wait for the resharding process to stop.
+# - Finally the keys are checked to see if they contain the value they should.
+
+set numkeys 50000
+set numops 200000
+set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]
+catch {unset content}
+array set content {}
+set tribpid {}
+
+test "Cluster consistency during live resharding" {
+ set ele 0
+ for {set j 0} {$j < $numops} {incr j} {
+ # Trigger the resharding once we execute half the ops.
+ if {$tribpid ne {} &&
+ ($j % 10000) == 0 &&
+ ![process_is_running $tribpid]} {
+ set tribpid {}
+ }
+
+ if {$j >= $numops/2 && $tribpid eq {}} {
+ puts -nonewline "...Starting resharding..."
+ flush stdout
+ set target [dict get [get_myself [randomInt 5]] id]
+ set tribpid [lindex [exec \
+ ../../../src/redis-trib.rb reshard \
+ --from all \
+ --to $target \
+ --slots 100 \
+ --yes \
+ 127.0.0.1:[get_instance_attrib redis 0 port] \
+ | [info nameofexecutable] \
+ ../tests/helpers/onlydots.tcl \
+ &] 0]
+ }
+
+ # Write random data to random list.
+ set listid [randomInt $numkeys]
+ set key "key:$listid"
+ incr ele
+ # We write both with Lua scripts and with plain commands.
+ # This way we are able to stress Lua -> Redis command invocation
+ # as well, which has checks to prevent Lua from writing into the
+ # wrong hash slot.
+ if {$listid % 2} {
+ $cluster rpush $key $ele
+ } else {
+ $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
+ }
+ lappend content($key) $ele
+
+ if {($j % 1000) == 0} {
+ puts -nonewline W; flush stdout
+ }
+ }
+
+ # Wait for the resharding process to end
+ wait_for_condition 1000 500 {
+ [process_is_running $tribpid] == 0
+ } else {
+ fail "Resharding is not terminating after some time."
+ }
+
+}
+
+test "Verify $numkeys keys for consistency with logical content" {
+ # Check that the Redis Cluster content matches our logical content.
+ foreach {key value} [array get content] {
+ if {[$cluster lrange $key 0 -1] ne $value} {
+ fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
+ }
+ }
+}
+
+test "Crash and restart all the instances" {
+ foreach_redis_id id {
+ kill_instance redis $id
+ restart_instance redis $id
+ }
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Verify $numkeys keys after the crash & restart" {
+ # Check that the Redis Cluster content matches our logical content.
+ foreach {key value} [array get content] {
+ if {[$cluster lrange $key 0 -1] ne $value} {
+ fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
+ }
+ }
+}
+
+test "Disable AOF in all the instances" {
+ foreach_redis_id id {
+ R $id config set appendonly no
+ }
+}
+
+test "Verify slaves consistency" {
+ set verified_masters 0
+ foreach_redis_id id {
+ set role [R $id role]
+ lassign $role myrole myoffset slaves
+ if {$myrole eq {slave}} continue
+ set masterport [get_instance_attrib redis $id port]
+ set masterdigest [R $id debug digest]
+ foreach_redis_id sid {
+ set srole [R $sid role]
+ if {[lindex $srole 0] eq {master}} continue
+ if {[lindex $srole 2] != $masterport} continue
+ wait_for_condition 1000 500 {
+ [R $sid debug digest] eq $masterdigest
+ } else {
+ fail "Master and slave data digest are different"
+ }
+ incr verified_masters
+ }
+ }
+ assert {$verified_masters >= 5}
+}
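One detail worth spelling out about the block above: when a pipeline is put in the background, Tcl's exec returns the list of process IDs of every process in the pipeline, which is why the test keeps only element 0 (the redis-trib process) and later polls it with process_is_running. A tiny illustration with a throwaway pipeline (sleep and cat are just placeholders, not part of the test suite):

    # exec ... & returns one PID per process in the backgrounded pipeline.
    set pids [exec sleep 3 | cat &]
    set tribpid [lindex $pids 0]   ;# analogous to the redis-trib PID above
    puts "pipeline PIDs: $pids (polling $tribpid)"
    # Same liveness check the test uses.
    puts [expr {![catch {exec ps -p $tribpid}]}]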
diff --git a/tests/cluster/tests/05-slave-selection.tcl b/tests/cluster/tests/05-slave-selection.tcl
new file mode 100644
index 0000000..6efedce
--- /dev/null
+++ b/tests/cluster/tests/05-slave-selection.tcl
@@ -0,0 +1,94 @@
+# Slave selection test
+# Check the algorithm trying to pick the slave with the most complete history.
+
+source "../tests/includes/init-tests.tcl"
+
+# Create a cluster with 5 masters and 10 slaves, so that we have 2
+# slaves for each master.
+test "Create a 5 nodes cluster" {
+ create_cluster 5 10
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "The first master has actually two slaves" {
+ assert {[llength [lindex [R 0 role] 2]] == 2}
+}
+
+test {Slaves of #0 are instances #5 and #10 as expected} {
+ set port0 [get_instance_attrib redis 0 port]
+ assert {[lindex [R 5 role] 2] == $port0}
+ assert {[lindex [R 10 role] 2] == $port0}
+}
+
+test "Instance #5 and #10 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up} &&
+ [RI 10 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 or #10 master link status is not up"
+ }
+}
+
+set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]
+
+test "Slaves are both able to receive and acknowledge writes" {
+ for {set j 0} {$j < 100} {incr j} {
+ $cluster set $j $j
+ }
+ assert {[R 0 wait 2 60000] == 2}
+}
+
+test "Write data while slave #10 is paused and can't receive it" {
+ # Stop the slave with a multi/exec transaction so that its link with
+ # the master will be killed as soon as it can process commands again.
+ R 10 multi
+ R 10 debug sleep 10
+ R 10 client kill 127.0.0.1:$port0
+ R 10 deferred 1
+ R 10 exec
+
+ # Write some data the slave can't receive.
+ for {set j 0} {$j < 100} {incr j} {
+ $cluster set $j $j
+ }
+
+ # Prevent the master from accepting new slaves.
+ # Use a large pause value since we'll kill it anyway.
+ R 0 CLIENT PAUSE 60000
+
+ # Wait for the slave to return available again
+ R 10 deferred 0
+ assert {[R 10 read] eq {OK OK}}
+
+ # Kill the master so that a reconnection will not be possible.
+ kill_instance redis 0
+}
+
+test "Wait for instance #5 (and not #10) to turn into a master" {
+ wait_for_condition 1000 50 {
+ [RI 5 role] eq {master}
+ } else {
+ fail "No failover detected"
+ }
+}
+
+test "Wait for the node #10 to return alive before ending the test" {
+ R 10 ping
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Node #10 should eventually replicate node #5" {
+ set port5 [get_instance_attrib redis 5 port]
+ wait_for_condition 1000 50 {
+ ([lindex [R 10 role] 2] == $port5) &&
+ ([lindex [R 10 role] 3] eq {connected})
+ } else {
+ fail "#10 didn't became slave of #5"
+ }
+}
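The assertions in this unit index directly into the reply of the ROLE command, a pattern the replica-migration units reuse as well. For reference, a sketch of the reply shapes being assumed, with illustrative values:

    # ROLE reply of a master: {master <repl-offset> {{ip port offset} ...}}
    set master_role {master 3129659 {{127.0.0.1 30005 3129242} {127.0.0.1 30010 3129543}}}
    puts [llength [lindex $master_role 2]]   ;# -> 2 (number of connected slaves)

    # ROLE reply of a slave: {slave <master-ip> <master-port> <link-state> <offset>}
    set slave_role {slave 127.0.0.1 30000 connected 3167038}
    puts [lindex $slave_role 2]              ;# -> 30000 (master port)
    puts [lindex $slave_role 3]              ;# -> connected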
diff --git a/tests/cluster/tests/06-slave-stop-cond.tcl b/tests/cluster/tests/06-slave-stop-cond.tcl
new file mode 100644
index 0000000..f2e6705
--- /dev/null
+++ b/tests/cluster/tests/06-slave-stop-cond.tcl
@@ -0,0 +1,73 @@
+# Slave stop condition test
+# Check that if there is a disconnection time limit, the slave will not try
+# to failover its master.
+
+source "../tests/includes/init-tests.tcl"
+
+# Create a cluster with 5 masters and 5 slaves.
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "The first master has actually one slave" {
+ assert {[llength [lindex [R 0 role] 2]] == 1}
+}
+
+test {Slave of #0 is instance #5 as expected} {
+ set port0 [get_instance_attrib redis 0 port]
+ assert {[lindex [R 5 role] 2] == $port0}
+}
+
+test "Instance #5 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 master link status is not up"
+ }
+}
+
+test "Lower the slave validity factor of #5 to the value of 2" {
+ assert {[R 5 config set cluster-slave-validity-factor 2] eq {OK}}
+}
+
+test "Break master-slave link and prevent further reconnections" {
+ # Stop the slave with a multi/exec transaction so that its link with
+ # the master will be killed as soon as it can process commands again.
+ R 5 multi
+ R 5 client kill 127.0.0.1:$port0
+ # Here we should sleep 6 or more seconds (node_timeout * slave_validity),
+ # but the effective validity time is also incremented by the
+ # repl-ping-slave-period value, which is 10 seconds by default, so we
+ # need to wait more than 16 seconds.
+ R 5 debug sleep 20
+ R 5 deferred 1
+ R 5 exec
+
+ # Prevent the master from accepting new slaves.
+ # Use a large pause value since we'll kill it anyway.
+ R 0 CLIENT PAUSE 60000
+
+ # Wait for the slave to return available again
+ R 5 deferred 0
+ assert {[R 5 read] eq {OK OK}}
+
+ # Kill the master so that a reconnection will not be possible.
+ kill_instance redis 0
+}
+
+test "Slave #5 is reachable and alive" {
+ assert {[R 5 ping] eq {PONG}}
+}
+
+test "Slave #5 should not be able to failover" {
+ after 10000
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Cluster should be down" {
+ assert_cluster_state fail
+}
diff --git a/tests/cluster/tests/07-replica-migration.tcl b/tests/cluster/tests/07-replica-migration.tcl
new file mode 100644
index 0000000..68231cd
--- /dev/null
+++ b/tests/cluster/tests/07-replica-migration.tcl
@@ -0,0 +1,103 @@
+# Replica migration test.
+# Check that orphaned masters are joined by replicas of masters having
+# multiple replicas attached, according to the migration barrier settings.
+
+source "../tests/includes/init-tests.tcl"
+
+# Create a cluster with 5 masters and 10 slaves, so that we have 2
+# slaves for each master.
+test "Create a 5 nodes cluster" {
+ create_cluster 5 10
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Each master should have two replicas attached" {
+ foreach_redis_id id {
+ if {$id < 5} {
+ wait_for_condition 1000 50 {
+ [llength [lindex [R $id role] 2]] == 2
+ } else {
+ fail "Master #$id does not have 2 slaves as expected"
+ }
+ }
+ }
+}
+
+test "Killing all the slaves of master #0 and #1" {
+ kill_instance redis 5
+ kill_instance redis 10
+ kill_instance redis 6
+ kill_instance redis 11
+ after 4000
+}
+
+foreach_redis_id id {
+ if {$id < 5} {
+ test "Master #$id should have at least one replica" {
+ wait_for_condition 1000 50 {
+ [llength [lindex [R $id role] 2]] >= 1
+ } else {
+ fail "Master #$id has no replicas"
+ }
+ }
+ }
+}
+
+# Now test the migration to a master which used to be a slave, after
+# a failover.
+
+source "../tests/includes/init-tests.tcl"
+
+# Create a cluster with 5 masters and 10 slaves, so that we have 2
+# slaves for each master.
+test "Create a 5 nodes cluster" {
+ create_cluster 5 10
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Kill slave #7 of master #2. Only slave left is #12 now" {
+ kill_instance redis 7
+}
+
+set current_epoch [CI 1 cluster_current_epoch]
+
+test "Killing master node #2, #12 should failover" {
+ kill_instance redis 2
+}
+
+test "Wait for failover" {
+ wait_for_condition 1000 50 {
+ [CI 1 cluster_current_epoch] > $current_epoch
+ } else {
+ fail "No failover detected"
+ }
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 1
+}
+
+test "Instance 12 is now a master without slaves" {
+ assert {[RI 12 role] eq {master}}
+}
+
+# The remaining instance is now without slaves. Some other slave
+# should migrate to it.
+
+test "Master #12 should get at least one migrated replica" {
+ wait_for_condition 1000 50 {
+ [llength [lindex [R 12 role] 2]] >= 1
+ } else {
+ fail "Master #12 has no replicas"
+ }
+}
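Replica migration is governed by the cluster-migration-barrier option: a slave migrates to an orphaned master only if its own master would still be left with at least that many other replicas. The unit above relies on the server's default value; a test that wanted to pin the barrier explicitly could do something along these lines (a sketch, not part of this diff):

    foreach_redis_id id {
        R $id config set cluster-migration-barrier 1
    }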
diff --git a/tests/cluster/tests/08-update-msg.tcl b/tests/cluster/tests/08-update-msg.tcl
new file mode 100644
index 0000000..6f9661d
--- /dev/null
+++ b/tests/cluster/tests/08-update-msg.tcl
@@ -0,0 +1,90 @@
+# Test UPDATE messages sent by other nodes when the currently authoritative
+# master is unavailable. The test is performed in the following steps:
+#
+# 1) Master goes down.
+# 2) Slave fails over and becomes the new master.
+# 3) New master is partitioned away.
+# 4) Old master returns.
+# 5) At this point we expect the old master to turn into a slave ASAP because
+# of the UPDATE messages it will receive from the other nodes when its
+# configuration is found to be outdated.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 0
+}
+
+test "Instance #5 is a slave" {
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Instance #5 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 master link status is not up"
+ }
+}
+
+set current_epoch [CI 1 cluster_current_epoch]
+
+test "Killing one master node" {
+ kill_instance redis 0
+}
+
+test "Wait for failover" {
+ wait_for_condition 1000 50 {
+ [CI 1 cluster_current_epoch] > $current_epoch
+ } else {
+ fail "No failover detected"
+ }
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 1
+}
+
+test "Instance #5 is now a master" {
+ assert {[RI 5 role] eq {master}}
+}
+
+test "Killing the new master #5" {
+ kill_instance redis 5
+}
+
+test "Cluster should be down now" {
+ assert_cluster_state fail
+}
+
+test "Restarting the old master node" {
+ restart_instance redis 0
+}
+
+test "Instance #0 gets converted into a slave" {
+ wait_for_condition 1000 50 {
+ [RI 0 role] eq {slave}
+ } else {
+ fail "Old master was not converted into slave"
+ }
+}
+
+test "Restarting the new master node" {
+ restart_instance redis 5
+}
+
+test "Cluster is up again" {
+ assert_cluster_state ok
+}
diff --git a/tests/cluster/tests/09-pubsub.tcl b/tests/cluster/tests/09-pubsub.tcl
new file mode 100644
index 0000000..e62b91c
--- /dev/null
+++ b/tests/cluster/tests/09-pubsub.tcl
@@ -0,0 +1,40 @@
+# Test PUBLISH propagation across the cluster.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+proc test_cluster_publish {instance instances} {
+ # Subscribe all the instances but the one we use to send.
+ for {set j 0} {$j < $instances} {incr j} {
+ if {$j != $instance} {
+ R $j deferred 1
+ R $j subscribe testchannel
+ R $j read; # Read the subscribe reply
+ }
+ }
+
+ set data [randomValue]
+ R $instance PUBLISH testchannel $data
+
+ # Read the message back from all the nodes.
+ for {set j 0} {$j < $instances} {incr j} {
+ if {$j != $instance} {
+ set msg [R $j read]
+ assert {$data eq [lindex $msg 2]}
+ R $j unsubscribe testchannel
+ R $j read; # Read the unsubscribe reply
+ R $j deferred 0
+ }
+ }
+}
+
+test "Test publishing to master" {
+ test_cluster_publish 0 10
+}
+
+test "Test publishing to slave" {
+ test_cluster_publish 5 10
+}
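This unit leans on the deferred mode of the per-instance R wrapper, the same pattern used by the slave-selection, stop-condition and manual-failover units: with deferred set to 1 the wrapper sends commands without reading the replies, which must then be fetched with explicit read calls. A compact sketch of the call sequence:

    R 1 deferred 1              ;# stop reading replies automatically
    R 1 subscribe testchannel   ;# command is sent, reply left pending
    set reply [R 1 read]        ;# fetch the pending subscribe reply
    R 1 unsubscribe testchannel
    R 1 read                    ;# fetch the unsubscribe reply
    R 1 deferred 0              ;# back to normal request/reply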
diff --git a/tests/cluster/tests/10-manual-failover.tcl b/tests/cluster/tests/10-manual-failover.tcl
new file mode 100644
index 0000000..5441b79
--- /dev/null
+++ b/tests/cluster/tests/10-manual-failover.tcl
@@ -0,0 +1,192 @@
+# Check the manual failover
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 0
+}
+
+test "Instance #5 is a slave" {
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Instance #5 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 master link status is not up"
+ }
+}
+
+set current_epoch [CI 1 cluster_current_epoch]
+
+set numkeys 50000
+set numops 10000
+set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]
+catch {unset content}
+array set content {}
+
+test "Send CLUSTER FAILOVER to #5, during load" {
+ for {set j 0} {$j < $numops} {incr j} {
+ # Write random data to random list.
+ set listid [randomInt $numkeys]
+ set key "key:$listid"
+ set ele [randomValue]
+ # We write both with Lua scripts and with plain commands.
+ # This way we are able to stress Lua -> Redis command invocation
+ # as well, which has checks to prevent Lua from writing into the
+ # wrong hash slot.
+ if {$listid % 2} {
+ $cluster rpush $key $ele
+ } else {
+ $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
+ }
+ lappend content($key) $ele
+
+ if {($j % 1000) == 0} {
+ puts -nonewline W; flush stdout
+ }
+
+ if {$j == $numops/2} {R 5 cluster failover}
+ }
+}
+
+test "Wait for failover" {
+ wait_for_condition 1000 50 {
+ [CI 1 cluster_current_epoch] > $current_epoch
+ } else {
+ fail "No failover detected"
+ }
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 1
+}
+
+test "Instance #5 is now a master" {
+ assert {[RI 5 role] eq {master}}
+}
+
+test "Verify $numkeys keys for consistency with logical content" {
+ # Check that the Redis Cluster content matches our logical content.
+ foreach {key value} [array get content] {
+ assert {[$cluster lrange $key 0 -1] eq $value}
+ }
+}
+
+test "Instance #0 gets converted into a slave" {
+ wait_for_condition 1000 50 {
+ [RI 0 role] eq {slave}
+ } else {
+ fail "Old master was not converted into slave"
+ }
+}
+
+## Check that manual failover does not happen if we can't talk with the master.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 0
+}
+
+test "Instance #5 is a slave" {
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Instance #5 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 master link status is not up"
+ }
+}
+
+test "Make instance #0 unreachable without killing it" {
+ R 0 deferred 1
+ R 0 DEBUG SLEEP 10
+}
+
+test "Send CLUSTER FAILOVER to instance #5" {
+ R 5 cluster failover
+}
+
+test "Instance #5 is still a slave after some time (no failover)" {
+ after 5000
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Wait for instance #0 to return back alive" {
+ R 0 deferred 0
+ assert {[R 0 read] eq {OK}}
+}
+
+## Check that with the "force" option the failover happens anyway.
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 0
+}
+
+test "Instance #5 is a slave" {
+ assert {[RI 5 role] eq {slave}}
+}
+
+test "Instance #5 synced with the master" {
+ wait_for_condition 1000 50 {
+ [RI 5 master_link_status] eq {up}
+ } else {
+ fail "Instance #5 master link status is not up"
+ }
+}
+
+test "Make instance #0 unreachable without killing it" {
+ R 0 deferred 1
+ R 0 DEBUG SLEEP 10
+}
+
+test "Send CLUSTER FAILOVER to instance #5" {
+ R 5 cluster failover force
+}
+
+test "Instance #5 is a master after some time" {
+ wait_for_condition 1000 50 {
+ [RI 5 role] eq {master}
+ } else {
+ fail "Instance #5 is not a master after some time regardless of FORCE"
+ }
+}
+
+test "Wait for instance #0 to return back alive" {
+ R 0 deferred 0
+ assert {[R 0 read] eq {OK}}
+}
diff --git a/tests/cluster/tests/11-manual-takeover.tcl b/tests/cluster/tests/11-manual-takeover.tcl
new file mode 100644
index 0000000..f567c69
--- /dev/null
+++ b/tests/cluster/tests/11-manual-takeover.tcl
@@ -0,0 +1,59 @@
+# Manual takeover test
+
+source "../tests/includes/init-tests.tcl"
+
+test "Create a 5 nodes cluster" {
+ create_cluster 5 5
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 0
+}
+
+test "Killing majority of master nodes" {
+ kill_instance redis 0
+ kill_instance redis 1
+ kill_instance redis 2
+}
+
+test "Cluster should eventually be down" {
+ assert_cluster_state fail
+}
+
+test "Use takeover to bring slaves back" {
+ R 5 cluster failover takeover
+ R 6 cluster failover takeover
+ R 7 cluster failover takeover
+}
+
+test "Cluster should eventually be up again" {
+ assert_cluster_state ok
+}
+
+test "Cluster is writable" {
+ cluster_write_test 4
+}
+
+test "Instance #5, #6, #7 are now masters" {
+ assert {[RI 5 role] eq {master}}
+ assert {[RI 6 role] eq {master}}
+ assert {[RI 7 role] eq {master}}
+}
+
+test "Restarting the previously killed master nodes" {
+ restart_instance redis 0
+ restart_instance redis 1
+ restart_instance redis 2
+}
+
+test "Instance #0, #1, #2 gets converted into a slaves" {
+ wait_for_condition 1000 50 {
+ [RI 0 role] eq {slave} && [RI 1 role] eq {slave} && [RI 2 role] eq {slave}
+ } else {
+ fail "Old masters not converted into slaves"
+ }
+}
diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl
new file mode 100644
index 0000000..48ecd1d
--- /dev/null
+++ b/tests/cluster/tests/12-replica-migration-2.tcl
@@ -0,0 +1,64 @@
+# Replica migration test #2.
+#
+# Check that a master regains its status as a valid target for replica
+# migration after slots are assigned back to it, in a cluster where the
+# other masters have slaves.
+
+source "../tests/includes/init-tests.tcl"
+
+# Create a cluster with 5 masters and 15 slaves, to make sure there are no
+# empty masters and make rebalancing simpler to handle during the test.
+test "Create a 5 nodes cluster" {
+ create_cluster 5 15
+}
+
+test "Cluster is up" {
+ assert_cluster_state ok
+}
+
+test "Each master should have at least two replicas attached" {
+ foreach_redis_id id {
+ if {$id < 5} {
+ wait_for_condition 1000 50 {
+ [llength [lindex [R $id role] 2]] >= 2
+ } else {
+ fail "Master #$id does not have at least 2 slaves as expected"
+ }
+ }
+ }
+}
+
+set master0_id [dict get [get_myself 0] id]
+test "Resharding all the master #0 slots away from it" {
+ set output [exec \
+ ../../../src/redis-trib.rb rebalance \
+ --weight ${master0_id}=0 \
+ 127.0.0.1:[get_instance_attrib redis 0 port] >@ stdout]
+}
+
+test "Master #0 should lose its replicas" {
+ wait_for_condition 1000 50 {
+ [llength [lindex [R 0 role] 2]] == 0
+ } else {
+ fail "Master #0 still has replicas"
+ }
+}
+
+test "Resharding back some slot to master #0" {
+ # Wait for the cluster config to propagate before attempting a
+ # new resharding.
+ after 10000
+ set output [exec \
+ ../../../src/redis-trib.rb rebalance \
+ --weight ${master0_id}=.01 \
+ --use-empty-masters \
+ 127.0.0.1:[get_instance_attrib redis 0 port] >@ stdout]
+}
+
+test "Master #0 should re-acquire one or more replicas" {
+ wait_for_condition 1000 50 {
+ [llength [lindex [R 0 role] 2]] >= 1
+ } else {
+ fail "Master #0 has no has replicas"
+ }
+}
diff --git a/tests/cluster/tests/helpers/onlydots.tcl b/tests/cluster/tests/helpers/onlydots.tcl
new file mode 100644
index 0000000..4a6d1ae
--- /dev/null
+++ b/tests/cluster/tests/helpers/onlydots.tcl
@@ -0,0 +1,16 @@
+# Read the standard input and only show dots in the output, filtering out
+# all the other characters. Designed to avoid buffering, so that when we
+# pipe the output of redis-trib through this script to display just the
+# dots, they appear as soon as redis-trib emits them.
+
+fconfigure stdin -buffering none
+
+while 1 {
+ set c [read stdin 1]
+ if {$c eq {}} {
+ exit 0; # EOF
+ } elseif {$c eq {.}} {
+ puts -nonewline .
+ flush stdout
+ }
+}
diff --git a/tests/cluster/tests/includes/init-tests.tcl b/tests/cluster/tests/includes/init-tests.tcl
new file mode 100644
index 0000000..466ab8f
--- /dev/null
+++ b/tests/cluster/tests/includes/init-tests.tcl
@@ -0,0 +1,70 @@
+# Initialization tests -- most units will start by including this file.
+
+test "(init) Restart killed instances" {
+ foreach type {redis} {
+ foreach_${type}_id id {
+ if {[get_instance_attrib $type $id pid] == -1} {
+ puts -nonewline "$type/$id "
+ flush stdout
+ restart_instance $type $id
+ }
+ }
+ }
+}
+
+test "Cluster nodes are reachable" {
+ foreach_redis_id id {
+ # Every node should be reachable.
+ wait_for_condition 1000 50 {
+ ([catch {R $id ping} ping_reply] == 0) &&
+ ($ping_reply eq {PONG})
+ } else {
+ catch {R $id ping} err
+ fail "Node #$id keeps replying '$err' to PING."
+ }
+ }
+}
+
+test "Cluster nodes hard reset" {
+ foreach_redis_id id {
+ if {$::valgrind} {
+ set node_timeout 10000
+ } else {
+ set node_timeout 3000
+ }
+ catch {R $id flushall} ; # May fail for readonly slaves.
+ R $id MULTI
+ R $id cluster reset hard
+ R $id cluster set-config-epoch [expr {$id+1}]
+ R $id EXEC
+ R $id config set cluster-node-timeout $node_timeout
+ R $id config set cluster-slave-validity-factor 10
+ R $id config rewrite
+ }
+}
+
+test "Cluster Join and auto-discovery test" {
+ # Join node 0 with 1, 1 with 2, ... and so forth.
+ # If auto-discovery works all nodes will know every other node
+ # eventually.
+ set ids {}
+ foreach_redis_id id {lappend ids $id}
+ for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} {
+ set a [lindex $ids $j]
+ set b [lindex $ids [expr $j+1]]
+ set b_port [get_instance_attrib redis $b port]
+ R $a cluster meet 127.0.0.1 $b_port
+ }
+
+ foreach_redis_id id {
+ wait_for_condition 1000 50 {
+ [llength [get_cluster_nodes $id]] == [llength $ids]
+ } else {
+ fail "Cluster failed to join into a full mesh."
+ }
+ }
+}
+
+test "Before slots allocation, all nodes report cluster failure" {
+ assert_cluster_state fail
+}
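Every unit above, including this one, depends on wait_for_condition, which comes from the shared framework sourced via ../instances.tcl and is not part of this diff. Assuming it behaves like the usual retry helper, its semantics are roughly the following sketch: evaluate an expression up to maxtries times, sleeping delay milliseconds between attempts, and run the else branch if the condition never becomes true.

    # Rough sketch of the assumed semantics, not the framework's actual code.
    proc wait_for_condition {maxtries delay e _else_ elsescript} {
        while {[incr maxtries -1] >= 0} {
            if {[uplevel 1 [list expr $e]]} return
            after $delay
        }
        uplevel 1 $elsescript
    }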
diff --git a/tests/cluster/tmp/.gitignore b/tests/cluster/tmp/.gitignore
new file mode 100644
index 0000000..f581f73
--- /dev/null
+++ b/tests/cluster/tmp/.gitignore
@@ -0,0 +1,2 @@
+redis_*
+sentinel_*