diff options
Diffstat (limited to 'tests/cluster')
| -rw-r--r-- | tests/cluster/cluster.tcl | 130 | ||||
| -rw-r--r-- | tests/cluster/run.tcl | 28 | ||||
| -rw-r--r-- | tests/cluster/tests/00-base.tcl | 59 | ||||
| -rw-r--r-- | tests/cluster/tests/01-faildet.tcl | 38 | ||||
| -rw-r--r-- | tests/cluster/tests/02-failover.tcl | 65 | ||||
| -rw-r--r-- | tests/cluster/tests/03-failover-loop.tcl | 115 | ||||
| -rw-r--r-- | tests/cluster/tests/04-resharding.tcl | 172 | ||||
| -rw-r--r-- | tests/cluster/tests/05-slave-selection.tcl | 94 | ||||
| -rw-r--r-- | tests/cluster/tests/06-slave-stop-cond.tcl | 73 | ||||
| -rw-r--r-- | tests/cluster/tests/07-replica-migration.tcl | 103 | ||||
| -rw-r--r-- | tests/cluster/tests/08-update-msg.tcl | 90 | ||||
| -rw-r--r-- | tests/cluster/tests/09-pubsub.tcl | 40 | ||||
| -rw-r--r-- | tests/cluster/tests/10-manual-failover.tcl | 192 | ||||
| -rw-r--r-- | tests/cluster/tests/11-manual-takeover.tcl | 59 | ||||
| -rw-r--r-- | tests/cluster/tests/12-replica-migration-2.tcl | 64 | ||||
| -rw-r--r-- | tests/cluster/tests/helpers/onlydots.tcl | 16 | ||||
| -rw-r--r-- | tests/cluster/tests/includes/init-tests.tcl | 70 | ||||
| -rw-r--r-- | tests/cluster/tmp/.gitignore | 2 |
18 files changed, 1410 insertions, 0 deletions
diff --git a/tests/cluster/cluster.tcl b/tests/cluster/cluster.tcl new file mode 100644 index 0000000..0647914 --- /dev/null +++ b/tests/cluster/cluster.tcl @@ -0,0 +1,130 @@ +# Cluster-specific test functions. +# +# Copyright (C) 2014 Salvatore Sanfilippo [email protected] +# This software is released under the BSD License. See the COPYING file for +# more information. + +# Returns a parsed CLUSTER NODES output as a list of dictionaries. +proc get_cluster_nodes id { + set lines [split [R $id cluster nodes] "\r\n"] + set nodes {} + foreach l $lines { + set l [string trim $l] + if {$l eq {}} continue + set args [split $l] + set node [dict create \ + id [lindex $args 0] \ + addr [lindex $args 1] \ + flags [split [lindex $args 2] ,] \ + slaveof [lindex $args 3] \ + ping_sent [lindex $args 4] \ + pong_recv [lindex $args 5] \ + config_epoch [lindex $args 6] \ + linkstate [lindex $args 7] \ + slots [lrange $args 8 -1] \ + ] + lappend nodes $node + } + return $nodes +} + +# Test node for flag. +proc has_flag {node flag} { + expr {[lsearch -exact [dict get $node flags] $flag] != -1} +} + +# Returns the parsed myself node entry as a dictionary. +proc get_myself id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[has_flag $n myself]} {return $n} + } + return {} +} + +# Return the value of the specified CLUSTER INFO field. +proc CI {n field} { + get_info_field [R $n cluster info] $field +} + +# Assuming nodes are reset, this function performs slots allocation. +# Only the first 'n' nodes are used. +proc cluster_allocate_slots {n} { + set slot 16383 + while {$slot >= 0} { + # Allocate successive slots to random nodes. + set node [randomInt $n] + lappend slots_$node $slot + incr slot -1 + } + for {set j 0} {$j < $n} {incr j} { + R $j cluster addslots {*}[set slots_${j}] + } +} + +# Check that cluster nodes agree about "state", or raise an error. 
+proc assert_cluster_state {state} { + foreach_redis_id id { + if {[instance_is_killed redis $id]} continue + wait_for_condition 1000 50 { + [CI $id cluster_state] eq $state + } else { + fail "Cluster node $id cluster_state:[CI $id cluster_state]" + } + } +} + +# Search the first node starting from ID $first that is not +# already configured as a slave. +proc cluster_find_available_slave {first} { + foreach_redis_id id { + if {$id < $first} continue + if {[instance_is_killed redis $id]} continue + set me [get_myself $id] + if {[dict get $me slaveof] eq {-}} {return $id} + } + fail "No available slaves" +} + +# Add 'slaves' slaves to a cluster composed of 'masters' masters. +# It assumes that masters are allocated sequentially from instance ID 0 +# to N-1. +proc cluster_allocate_slaves {masters slaves} { + for {set j 0} {$j < $slaves} {incr j} { + set master_id [expr {$j % $masters}] + set slave_id [cluster_find_available_slave $masters] + set master_myself [get_myself $master_id] + R $slave_id cluster replicate [dict get $master_myself id] + } +} + +# Create a cluster composed of the specified number of masters and slaves. +proc create_cluster {masters slaves} { + cluster_allocate_slots $masters + if {$slaves} { + cluster_allocate_slaves $masters $slaves + } + assert_cluster_state ok +} + +# Set the cluster node-timeout to all the reachable nodes. +proc set_cluster_node_timeout {to} { + foreach_redis_id id { + catch {R $id CONFIG SET cluster-node-timeout $to} + } +} + +# Check if the cluster is writable and readable. Use node "id" +# as a starting point to talk with the cluster. 
+proc cluster_write_test {id} { + set prefix [randstring 20 20 alpha] + set port [get_instance_attrib redis $id port] + set cluster [redis_cluster 127.0.0.1:$port] + for {set j 0} {$j < 100} {incr j} { + $cluster set key.$j $prefix.$j + } + for {set j 0} {$j < 100} {incr j} { + assert {[$cluster get key.$j] eq "$prefix.$j"} + } + $cluster close +} diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl new file mode 100644 index 0000000..93603dd --- /dev/null +++ b/tests/cluster/run.tcl @@ -0,0 +1,28 @@ +# Cluster test suite. Copyright (C) 2014 Salvatore Sanfilippo [email protected] +# This software is released under the BSD License. See the COPYING file for +# more information. + +cd tests/cluster +source cluster.tcl +source ../instances.tcl +source ../../support/cluster.tcl ; # Redis Cluster client. + +set ::instances_count 20 ; # How many instances we use at max. + +proc main {} { + parse_options + spawn_instance redis $::redis_base_port $::instances_count { + "cluster-enabled yes" + "appendonly yes" + } + run_tests + cleanup + end_tests +} + +if {[catch main e]} { + puts $::errorInfo + if {$::pause_on_error} pause_on_error + cleanup + exit 1 +} diff --git a/tests/cluster/tests/00-base.tcl b/tests/cluster/tests/00-base.tcl new file mode 100644 index 0000000..280befb --- /dev/null +++ b/tests/cluster/tests/00-base.tcl @@ -0,0 +1,59 @@ +# Check the basic monitoring and failover capabilities. + +source "../tests/includes/init-tests.tcl" + +if {$::simulate_error} { + test "This test will fail" { + fail "Simulated error" + } +} + +test "Different nodes have different IDs" { + set ids {} + set numnodes 0 + foreach_redis_id id { + incr numnodes + # Every node should just know itself. 
+ set nodeid [dict get [get_myself $id] id] + assert {$nodeid ne {}} + lappend ids $nodeid + } + set numids [llength [lsort -unique $ids]] + assert {$numids == $numnodes} +} + +test "It is possible to perform slot allocation" { + cluster_allocate_slots 5 +} + +test "After the join, every node gets a different config epoch" { + set trynum 60 + while {[incr trynum -1] != 0} { + # We check that this condition is true for *all* the nodes. + set ok 1 ; # Will be set to 0 every time a node is not ok. + foreach_redis_id id { + set epochs {} + foreach n [get_cluster_nodes $id] { + lappend epochs [dict get $n config_epoch] + } + if {[lsort $epochs] != [lsort -unique $epochs]} { + set ok 0 ; # At least one collision! + } + } + if {$ok} break + after 1000 + puts -nonewline . + flush stdout + } + if {$trynum == 0} { + fail "Config epoch conflict resolution is not working." + } +} + +test "Nodes should report cluster_state is ok now" { + assert_cluster_state ok +} + +test "It is possible to write and read from the cluster" { + cluster_write_test 0 +} diff --git a/tests/cluster/tests/01-faildet.tcl b/tests/cluster/tests/01-faildet.tcl new file mode 100644 index 0000000..8fe87c9 --- /dev/null +++ b/tests/cluster/tests/01-faildet.tcl @@ -0,0 +1,38 @@ +# Check the basic monitoring and failover capabilities. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +test "Killing two slave nodes" { + kill_instance redis 5 + kill_instance redis 6 +} + +test "Cluster should be still up" { + assert_cluster_state ok +} + +test "Killing one master node" { + kill_instance redis 0 +} + +# Note: the only slave of instance 0 is already down so no +# failover is possible, that would change the state back to ok. 
+test "Cluster should be down now" { + assert_cluster_state fail +} + +test "Restarting master node" { + restart_instance redis 0 +} + +test "Cluster should be up again" { + assert_cluster_state ok +} diff --git a/tests/cluster/tests/02-failover.tcl b/tests/cluster/tests/02-failover.tcl new file mode 100644 index 0000000..6b2fd09 --- /dev/null +++ b/tests/cluster/tests/02-failover.tcl @@ -0,0 +1,65 @@ +# Check the basic monitoring and failover capabilities. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing one master node" { + kill_instance redis 0 +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance #5 is now a master" { + assert {[RI 5 role] eq {master}} +} + +test "Restarting the previously killed master node" { + restart_instance redis 0 +} + +test "Instance #0 gets converted into a slave" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} + } else { + fail "Old master was not converted into slave" + } +} diff --git a/tests/cluster/tests/03-failover-loop.tcl b/tests/cluster/tests/03-failover-loop.tcl new file mode 100644 index 0000000..8e1bcd6 --- /dev/null +++ b/tests/cluster/tests/03-failover-loop.tcl @@ -0,0 +1,115 @@ +# Failover stress test. 
+# In this test a different node is killed in a loop for N +# iterations. The test checks that certain properties +# are preserved across iterations. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +set iterations 20 +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + +while {[incr iterations -1]} { + set tokill [randomInt 10] + set other [expr {($tokill+1)%10}] ; # Some other instance. + set key [randstring 20 20 alpha] + set val [randstring 20 20 alpha] + set role [RI $tokill role] + if {$role eq {master}} { + set slave {} + set myid [dict get [get_myself $tokill] id] + foreach_redis_id id { + if {$id == $tokill} continue + if {[dict get [get_myself $id] slaveof] eq $myid} { + set slave $id + } + } + if {$slave eq {}} { + fail "Unable to retrieve slave's ID for master #$tokill" + } + } + + puts "--- Iteration $iterations ---" + + if {$role eq {master}} { + test "Wait for slave of #$tokill to sync" { + wait_for_condition 1000 50 { + [string match {*state=online*} [RI $tokill slave0]] + } else { + fail "Slave of node #$tokill is not ok" + } + } + set slave_config_epoch [CI $slave cluster_my_epoch] + } + + test "Cluster is writable before failover" { + for {set i 0} {$i < 100} {incr i} { + catch {$cluster set $key:$i $val:$i} err + assert {$err eq {OK}} + } + # Wait for the write to propagate to the slave if we + # are going to kill a master. 
+ if {$role eq {master}} { + R $tokill wait 1 20000 + } + } + + test "Killing node #$tokill" { + kill_instance redis $tokill + } + + if {$role eq {master}} { + test "Wait failover by #$slave with old epoch $slave_config_epoch" { + wait_for_condition 1000 50 { + [CI $slave cluster_my_epoch] > $slave_config_epoch + } else { + fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]" + } + } + } + + test "Cluster should eventually be up again" { + assert_cluster_state ok + } + + test "Cluster is writable again" { + for {set i 0} {$i < 100} {incr i} { + catch {$cluster set $key:$i:2 $val:$i:2} err + assert {$err eq {OK}} + } + } + + test "Restarting node #$tokill" { + restart_instance redis $tokill + } + + test "Instance #$tokill is now a slave" { + wait_for_condition 1000 50 { + [RI $tokill role] eq {slave} + } else { + fail "Restarted instance is not a slave" + } + } + + test "We can read back the value we set before" { + for {set i 0} {$i < 100} {incr i} { + catch {$cluster get $key:$i} err + assert {$err eq "$val:$i"} + catch {$cluster get $key:$i:2} err + assert {$err eq "$val:$i:2"} + } + } +} + +test "Post condition: current_epoch >= my_epoch everywhere" { + foreach_redis_id id { + assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]} + } +} diff --git a/tests/cluster/tests/04-resharding.tcl b/tests/cluster/tests/04-resharding.tcl new file mode 100644 index 0000000..0ccbf71 --- /dev/null +++ b/tests/cluster/tests/04-resharding.tcl @@ -0,0 +1,172 @@ +# Resharding consistency test. +# In this test lists are written to the cluster while slots are +# resharded in the background. The test checks that the data +# is preserved across the resharding and an instance restart. 
+ +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Enable AOF in all the instances" { + foreach_redis_id id { + R $id config set appendonly yes + # We use "appendfsync no" because it's fast but also guarantees that + # write(2) is performed before replying to client. + R $id config set appendfsync no + } + + foreach_redis_id id { + wait_for_condition 1000 500 { + [RI $id aof_rewrite_in_progress] == 0 && + [RI $id aof_enabled] == 1 + } else { + fail "Failed to enable AOF on instance #$id" + } + } +} + +# Return non-zero if the specified PID is about a process still in execution, +# otherwise 0 is returned. +proc process_is_running {pid} { + # PS should return with an error if PID is non existing, + # and catch will return non-zero. We want to return non-zero if + # the PID exists, so we invert the return value with expr not operator. + expr {![catch {exec ps -p $pid}]} +} + +# Our resharding test performs the following actions: +# +# - N commands are sent to the cluster in the course of the test. +# - Every command selects a random key from key:0 to key:MAX-1. +# - The operation RPUSH key <randomvalue> is performed. +# - Tcl remembers into an array all the values pushed to each list. +# - After N/2 commands, the resharding process is started in background. +# - The test continues while the resharding is in progress. +# - At the end of the test, we wait for the resharding process to stop. +# - Finally the keys are checked to see if they contain the value they should. + +set numkeys 50000 +set numops 200000 +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +catch {unset content} +array set content {} +set tribpid {} + +test "Cluster consistency during live resharding" { + set ele 0 + for {set j 0} {$j < $numops} {incr j} { + # Trigger the resharding once we execute half the ops. 
+ if {$tribpid ne {} && + ($j % 10000) == 0 && + ![process_is_running $tribpid]} { + set tribpid {} + } + + if {$j >= $numops/2 && $tribpid eq {}} { + puts -nonewline "...Starting resharding..." + flush stdout + set target [dict get [get_myself [randomInt 5]] id] + set tribpid [lindex [exec \ + ../../../src/redis-trib.rb reshard \ + --from all \ + --to $target \ + --slots 100 \ + --yes \ + 127.0.0.1:[get_instance_attrib redis 0 port] \ + | [info nameofexecutable] \ + ../tests/helpers/onlydots.tcl \ + &] 0] + } + + # Write random data to random list. + set listid [randomInt $numkeys] + set key "key:$listid" + incr ele + # We write both with Lua scripts and with plain commands. + # This way we are able to stress Lua -> Redis command invocation + # as well, that has tests to prevent Lua to write into wrong + # hash slots. + if {$listid % 2} { + $cluster rpush $key $ele + } else { + $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele + } + lappend content($key) $ele + + if {($j % 1000) == 0} { + puts -nonewline W; flush stdout + } + } + + # Wait for the resharding process to end + wait_for_condition 1000 500 { + [process_is_running $tribpid] == 0 + } else { + fail "Resharding is not terminating after some time." + } + +} + +test "Verify $numkeys keys for consistency with logical content" { + # Check that the Redis Cluster content matches our logical content. + foreach {key value} [array get content] { + if {[$cluster lrange $key 0 -1] ne $value} { + fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]" + } + } +} + +test "Crash and restart all the instances" { + foreach_redis_id id { + kill_instance redis $id + restart_instance redis $id + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Verify $numkeys keys after the crash & restart" { + # Check that the Redis Cluster content matches our logical content. 
+ foreach {key value} [array get content] { + if {[$cluster lrange $key 0 -1] ne $value} { + fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]" + } + } +} + +test "Disable AOF in all the instances" { + foreach_redis_id id { + R $id config set appendonly no + } +} + +test "Verify slaves consistency" { + set verified_masters 0 + foreach_redis_id id { + set role [R $id role] + lassign $role myrole myoffset slaves + if {$myrole eq {slave}} continue + set masterport [get_instance_attrib redis $id port] + set masterdigest [R $id debug digest] + foreach_redis_id sid { + set srole [R $sid role] + if {[lindex $srole 0] eq {master}} continue + if {[lindex $srole 2] != $masterport} continue + wait_for_condition 1000 500 { + [R $sid debug digest] eq $masterdigest + } else { + fail "Master and slave data digest are different" + } + incr verified_masters + } + } + assert {$verified_masters >= 5} +} diff --git a/tests/cluster/tests/05-slave-selection.tcl b/tests/cluster/tests/05-slave-selection.tcl new file mode 100644 index 0000000..6efedce --- /dev/null +++ b/tests/cluster/tests/05-slave-selection.tcl @@ -0,0 +1,94 @@ +# Slave selection test +# Check the algorithm trying to pick the slave with the most complete history. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 10 slaves, so that we have 2 +# slaves for each master. 
+test "Create a 5 nodes cluster" { + create_cluster 5 10 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "The first master has actually two slaves" { + assert {[llength [lindex [R 0 role] 2]] == 2} +} + +test {Slaves of #0 are instance #5 and #10 as expected} { + set port0 [get_instance_attrib redis 0 port] + assert {[lindex [R 5 role] 2] == $port0} + assert {[lindex [R 10 role] 2] == $port0} +} + +test "Instance #5 and #10 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} && + [RI 10 master_link_status] eq {up} + } else { + fail "Instance #5 or #10 master link status is not up" + } +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + +test "Slaves are both able to receive and acknowledge writes" { + for {set j 0} {$j < 100} {incr j} { + $cluster set $j $j + } + assert {[R 0 wait 2 60000] == 2} +} + +test "Write data while slave #10 is paused and can't receive it" { + # Stop the slave with a multi/exec transaction so that the master will + # be killed as soon as it can accept writes again. + R 10 multi + R 10 debug sleep 10 + R 10 client kill 127.0.0.1:$port0 + R 10 deferred 1 + R 10 exec + + # Write some data the slave can't receive. + for {set j 0} {$j < 100} {incr j} { + $cluster set $j $j + } + + # Prevent the master from accepting new slaves. + # Use a large pause value since we'll kill it anyway. + R 0 CLIENT PAUSE 60000 + + # Wait for the slave to return available again + R 10 deferred 0 + assert {[R 10 read] eq {OK OK}} + + # Kill the master so that a reconnection will not be possible. 
+ kill_instance redis 0 +} + +test "Wait for instance #5 (and not #10) to turn into a master" { + wait_for_condition 1000 50 { + [RI 5 role] eq {master} + } else { + fail "No failover detected" + } +} + +test "Wait for the node #10 to return alive before ending the test" { + R 10 ping +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Node #10 should eventually replicate node #5" { + set port5 [get_instance_attrib redis 5 port] + wait_for_condition 1000 50 { + ([lindex [R 10 role] 2] == $port5) && + ([lindex [R 10 role] 3] eq {connected}) + } else { + fail "#10 didn't became slave of #5" + } +} diff --git a/tests/cluster/tests/06-slave-stop-cond.tcl b/tests/cluster/tests/06-slave-stop-cond.tcl new file mode 100644 index 0000000..f2e6705 --- /dev/null +++ b/tests/cluster/tests/06-slave-stop-cond.tcl @@ -0,0 +1,73 @@ +# Slave stop condition test +# Check that if there is a disconnection time limit, the slave will not try +# to failover its master. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 5 slaves. +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "The first master has actually one slave" { + assert {[llength [lindex [R 0 role] 2]] == 1} +} + +test {Slaves of #0 is instance #5 as expected} { + set port0 [get_instance_attrib redis 0 port] + assert {[lindex [R 5 role] 2] == $port0} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "Lower the slave validity factor of #5 to the value of 2" { + assert {[R 5 config set cluster-slave-validity-factor 2] eq {OK}} +} + +test "Break master-slave link and prevent further reconnections" { + # Stop the slave with a multi/exec transaction so that the master will + # be killed as soon as it can accept writes again. 
+ R 5 multi + R 5 client kill 127.0.0.1:$port0 + # here we should sleep 6 or more seconds (node_timeout * slave_validity) + # but the actual validity time is actually incremented by the + # repl-ping-slave-period value which is 10 seconds by default. So we + # need to wait more than 16 seconds. + R 5 debug sleep 20 + R 5 deferred 1 + R 5 exec + + # Prevent the master from accepting new slaves. + # Use a large pause value since we'll kill it anyway. + R 0 CLIENT PAUSE 60000 + + # Wait for the slave to return available again + R 5 deferred 0 + assert {[R 5 read] eq {OK OK}} + + # Kill the master so that a reconnection will not be possible. + kill_instance redis 0 +} + +test "Slave #5 is reachable and alive" { + assert {[R 5 ping] eq {PONG}} +} + +test "Slave #5 should not be able to failover" { + after 10000 + assert {[RI 5 role] eq {slave}} +} + +test "Cluster should be down" { + assert_cluster_state fail +} diff --git a/tests/cluster/tests/07-replica-migration.tcl b/tests/cluster/tests/07-replica-migration.tcl new file mode 100644 index 0000000..68231cd --- /dev/null +++ b/tests/cluster/tests/07-replica-migration.tcl @@ -0,0 +1,103 @@ +# Replica migration test. +# Check that orphaned masters are joined by replicas of masters having +# multiple replicas attached, according to the migration barrier settings. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 10 slaves, so that we have 2 +# slaves for each master. 
+test "Create a 5 nodes cluster" { + create_cluster 5 10 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Each master should have two replicas attached" { + foreach_redis_id id { + if {$id < 5} { + # Query the master being iterated, not always master #0. + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] == 2 + } else { + fail "Master #$id does not have 2 slaves as expected" + } + } + } +} + +test "Killing all the slaves of master #0 and #1" { + kill_instance redis 5 + kill_instance redis 10 + kill_instance redis 6 + kill_instance redis 11 + after 4000 +} + +foreach_redis_id id { + if {$id < 5} { + test "Master #$id should have at least one replica" { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 1 + } else { + fail "Master #$id has no replicas" + } + } + } +} + +# Now test the migration to a master which used to be a slave, after +# a failover. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 10 slaves, so that we have 2 +# slaves for each master. +test "Create a 5 nodes cluster" { + create_cluster 5 10 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Kill slave #7 of master #2. Only slave left is #12 now" { + kill_instance redis 7 +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing master node #2, #12 should failover" { + kill_instance redis 2 +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance 12 is now a master without slaves" { + assert {[RI 12 role] eq {master}} +} + +# The remaining instance is now without slaves. Some other slave +# should migrate to it. 
+ +test "Master #12 should get at least one migrated replica" { + wait_for_condition 1000 50 { + [llength [lindex [R 12 role] 2]] >= 1 + } else { + fail "Master #12 has no replicas" + } +} diff --git a/tests/cluster/tests/08-update-msg.tcl b/tests/cluster/tests/08-update-msg.tcl new file mode 100644 index 0000000..6f9661d --- /dev/null +++ b/tests/cluster/tests/08-update-msg.tcl @@ -0,0 +1,90 @@ +# Test UPDATE messages sent by other nodes when the currently authoritative +# master is unavailable. The test is performed in the following steps: +# +# 1) Master goes down. +# 2) Slave failover and becomes new master. +# 3) New master is partitioned away. +# 4) Old master returns. +# 5) At this point we expect the old master to turn into a slave ASAP because +# of the UPDATE messages it will receive from the other nodes when its +# configuration will be found to be outdated. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing one master node" { + kill_instance redis 0 +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance #5 is now a master" { + assert {[RI 5 role] eq {master}} +} + +test "Killing the new master #5" { + kill_instance redis 5 +} + +test "Cluster should be down now" { + assert_cluster_state fail +} + +test "Restarting the 
old master node" { + restart_instance redis 0 +} + +test "Instance #0 gets converted into a slave" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} + } else { + fail "Old master was not converted into slave" + } +} + +test "Restarting the new master node" { + restart_instance redis 5 +} + +test "Cluster is up again" { + assert_cluster_state ok +} diff --git a/tests/cluster/tests/09-pubsub.tcl b/tests/cluster/tests/09-pubsub.tcl new file mode 100644 index 0000000..e62b91c --- /dev/null +++ b/tests/cluster/tests/09-pubsub.tcl @@ -0,0 +1,40 @@ +# Test PUBLISH propagation across the cluster. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +proc test_cluster_publish {instance instances} { + # Subscribe all the instances but the one we use to send. + for {set j 0} {$j < $instances} {incr j} { + if {$j != $instance} { + R $j deferred 1 + R $j subscribe testchannel + R $j read; # Read the subscribe reply + } + } + + set data [randomValue] + R $instance PUBLISH testchannel $data + + # Read the message back from all the nodes. 
+ for {set j 0} {$j < $instances} {incr j} { + if {$j != $instance} { + set msg [R $j read] + assert {$data eq [lindex $msg 2]} + R $j unsubscribe testchannel + R $j read; # Read the unsubscribe reply + R $j deferred 0 + } + } +} + +test "Test publishing to master" { + test_cluster_publish 0 10 +} + +test "Test publishing to slave" { + test_cluster_publish 5 10 +} diff --git a/tests/cluster/tests/10-manual-failover.tcl b/tests/cluster/tests/10-manual-failover.tcl new file mode 100644 index 0000000..5441b79 --- /dev/null +++ b/tests/cluster/tests/10-manual-failover.tcl @@ -0,0 +1,192 @@ +# Check the manual failover + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +set numkeys 50000 +set numops 10000 +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +catch {unset content} +array set content {} + +test "Send CLUSTER FAILOVER to #5, during load" { + for {set j 0} {$j < $numops} {incr j} { + # Write random data to random list. + set listid [randomInt $numkeys] + set key "key:$listid" + set ele [randomValue] + # We write both with Lua scripts and with plain commands. + # This way we are able to stress Lua -> Redis command invocation + # as well, that has tests to prevent Lua to write into wrong + # hash slots. 
+ if {$listid % 2} { + $cluster rpush $key $ele + } else { + $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele + } + lappend content($key) $ele + + if {($j % 1000) == 0} { + puts -nonewline W; flush stdout + } + + if {$j == $numops/2} {R 5 cluster failover} + } +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance #5 is now a master" { + assert {[RI 5 role] eq {master}} +} + +test "Verify $numkeys keys for consistency with logical content" { + # Check that the Redis Cluster content matches our logical content. + foreach {key value} [array get content] { + assert {[$cluster lrange $key 0 -1] eq $value} + } +} + +test "Instance #0 gets converted into a slave" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} + } else { + fail "Old master was not converted into slave" + } +} + +## Check that manual failover does not happen if we can't talk with the master. 
+ +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "Make instance #0 unreachable without killing it" { + R 0 deferred 1 + R 0 DEBUG SLEEP 10 +} + +test "Send CLUSTER FAILOVER to instance #5" { + R 5 cluster failover +} + +test "Instance #5 is still a slave after some time (no failover)" { + after 5000 + # NOTE(review): the test title says #5 stays a slave, yet the assertion + # below requires role {master}; confirm which one is intended. + assert {[RI 5 role] eq {master}} +} + +test "Wait for instance #0 to return back alive" { + R 0 deferred 0 + assert {[R 0 read] eq {OK}} +} + +## Check with "force" failover happens anyway. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "Make instance #0 unreachable without killing it" { + R 0 deferred 1 + R 0 DEBUG SLEEP 10 +} + +test "Send CLUSTER FAILOVER to instance #5" { + R 5 cluster failover force +} + +test "Instance #5 is a master after some time" { + wait_for_condition 1000 50 { + [RI 5 role] eq {master} + } else { + fail "Instance #5 is not a master after some time regardless of FORCE" + } +} + +test "Wait for instance #0 to return back alive" { + R 0 deferred 0 + assert {[R 0 read] eq {OK}} +} diff --git a/tests/cluster/tests/11-manual-takeover.tcl b/tests/cluster/tests/11-manual-takeover.tcl new file mode 
100644 index 0000000..f567c69 --- /dev/null +++ b/tests/cluster/tests/11-manual-takeover.tcl @@ -0,0 +1,59 @@ +# Manual takeover test + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Killing majority of master nodes" { + kill_instance redis 0 + kill_instance redis 1 + kill_instance redis 2 +} + +test "Cluster should eventually be down" { + assert_cluster_state fail +} + +test "Use takeover to bring slaves back" { + R 5 cluster failover takeover + R 6 cluster failover takeover + R 7 cluster failover takeover +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 4 ; # NOTE(review): presumably uses instance #4 because #0 was killed above - confirm. +} + +test "Instance #5, #6, #7 are now masters" { + assert {[RI 5 role] eq {master}} + assert {[RI 6 role] eq {master}} + assert {[RI 7 role] eq {master}} +} + +test "Restarting the previously killed master nodes" { + restart_instance redis 0 + restart_instance redis 1 + restart_instance redis 2 +} + +test "Instance #0, #1, #2 gets converted into a slaves" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} && [RI 1 role] eq {slave} && [RI 2 role] eq {slave} + } else { + fail "Old masters not converted into slaves" + } +} diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl new file mode 100644 index 0000000..48ecd1d --- /dev/null +++ b/tests/cluster/tests/12-replica-migration-2.tcl @@ -0,0 +1,64 @@ +# Replica migration test #2. +# +# Check that the status of master that can be targeted by replica migration +# is acquired again, after getting slots again, in a cluster where the +# other masters have slaves.
+ +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 masters and 15 slaves, to make sure there are no +# empty masters and make rebalancing simpler to handle during the test. +test "Create a 5 nodes cluster" { + create_cluster 5 15 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Each master should have at least two replicas attached" { + foreach_redis_id id { + if {$id < 5} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 2 + } else { + fail "Master #$id does not have 2 slaves as expected" ; # Each master is checked through its own ROLE reply. + } + } + } +} + +set master0_id [dict get [get_myself 0] id] +test "Resharding all the master #0 slots away from it" { + set output [exec \ + ../../../src/redis-trib.rb rebalance \ + --weight ${master0_id}=0 \ + 127.0.0.1:[get_instance_attrib redis 0 port] >@ stdout] +} + +test "Master #0 should lose its replicas" { + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] == 0 + } else { + fail "Master #0 still has replicas" + } +} + +test "Resharding back some slot to master #0" { + # Wait for the cluster config to propagate before attempting a + # new resharding. + after 10000 + set output [exec \ + ../../../src/redis-trib.rb rebalance \ + --weight ${master0_id}=.01 \ + --use-empty-masters \ + 127.0.0.1:[get_instance_attrib redis 0 port] >@ stdout] +} + +test "Master #0 should re-acquire one or more replicas" { + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] >= 1 + } else { + fail "Master #0 has no replicas" + } +} diff --git a/tests/cluster/tests/helpers/onlydots.tcl b/tests/cluster/tests/helpers/onlydots.tcl new file mode 100644 index 0000000..4a6d1ae --- /dev/null +++ b/tests/cluster/tests/helpers/onlydots.tcl @@ -0,0 +1,16 @@ +# Read the standard input and only show dots in the output, filtering out +# all the other characters. 
Designed to avoid buffering so that when +# we get the output of redis-trib and want to show just the dots, we'll see +# the dots as soon as redis-trib outputs them. + +fconfigure stdin -buffering none + +while 1 { + set c [read stdin 1] + if {$c eq {}} { + exit 0; # EOF + } elseif {$c eq {.}} { + puts -nonewline . + flush stdout + } +} diff --git a/tests/cluster/tests/includes/init-tests.tcl b/tests/cluster/tests/includes/init-tests.tcl new file mode 100644 index 0000000..466ab8f --- /dev/null +++ b/tests/cluster/tests/includes/init-tests.tcl @@ -0,0 +1,70 @@ +# Initialization tests -- most units will start including this. + +test "(init) Restart killed instances" { + foreach type {redis} { + foreach_${type}_id id { + if {[get_instance_attrib $type $id pid] == -1} { + puts -nonewline "$type/$id " + flush stdout + restart_instance $type $id ; # A pid attribute of -1 is what this test treats as "killed". + } + } + } +} + +test "Cluster nodes are reachable" { + foreach_redis_id id { + # Every node should be reachable. + wait_for_condition 1000 50 { + ([catch {R $id ping} ping_reply] == 0) && + ($ping_reply eq {PONG}) + } else { + catch {R $id ping} err + fail "Node #$id keeps replying '$err' to PING." + } + } +} + +test "Cluster nodes hard reset" { + foreach_redis_id id { + if {$::valgrind} { + set node_timeout 10000 + } else { + set node_timeout 3000 + } + catch {R $id flushall} ; # May fail for readonly slaves. + R $id MULTI + R $id cluster reset hard + R $id cluster set-config-epoch [expr {$id+1}] + R $id EXEC + R $id config set cluster-node-timeout $node_timeout + R $id config set cluster-slave-validity-factor 10 + R $id config rewrite + } +} + +test "Cluster Join and auto-discovery test" { + # Join node 0 with 1, 1 with 2, ... and so forth. + # If auto-discovery works all nodes will know every other node + # eventually. 
+ set ids {} + foreach_redis_id id {lappend ids $id} + for {set j 0} {$j < [llength $ids]-1} {incr j} { + set a [lindex $ids $j] + set b [lindex $ids [expr {$j+1}]] + set b_port [get_instance_attrib redis $b port] + R $a cluster meet 127.0.0.1 $b_port ; # Each node meets only the next one in the list. + } + + foreach_redis_id id { + wait_for_condition 1000 50 { + [llength [get_cluster_nodes $id]] == [llength $ids] + } else { + fail "Cluster failed to join into a full mesh." + } + } +} + +test "Before slots allocation, all nodes report cluster failure" { + assert_cluster_state fail +} diff --git a/tests/cluster/tmp/.gitignore b/tests/cluster/tmp/.gitignore new file mode 100644 index 0000000..f581f73 --- /dev/null +++ b/tests/cluster/tmp/.gitignore @@ -0,0 +1,2 @@ +redis_* +sentinel_* |
