Commit c3fc3cb (parent 5885160), committed by lpinne on May 20, 2024
SAP-convergent-mediation-ha-setup-sle15.adoc, SAPNotes-convergent-mediation.adoc: tests
Showing 2 changed files with 136 additions and 84 deletions.
202 changes: 127 additions & 75 deletions adoc/SAP-convergent-mediation-ha-setup-sle15.adoc
@@ -172,6 +172,13 @@ and UI, together with related IP address.
NOTE: Neither installation of the basic {sleha} cluster, nor installation of the
CM ControlZone software is covered in the document at hand.

Please consult the {sleha} product documentation for installation instructions
(https://documentation.suse.com/sle-ha/15-SP4/single-html/SLE-HA-administration/#part-install).
For Convergent Mediation installation instructions, please refer to the respective
product documentation
(https://infozone.atlassian.net/wiki/spaces/MD9/pages/4849683/Installation+Instructions).
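Although installation is out of scope here, for orientation the {sleha} software pattern can typically be installed as sketched below. This is an illustrative example only; the pattern name `ha_sles` is an assumption and may differ between products and service packs.

[subs="specialchars,attributes"]
----
# zypper install --type pattern ha_sles
----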


[[sec.prerequisites]]
=== Prerequisites

@@ -1079,7 +1086,8 @@ by the HA cluster.
- <<sec.test-split-brain>>

This is not a complete list. Please define additional test cases according to your
needs. Some examples are listed in <<sec.test-additional>>.
Do not forget to perform every test on each node.

NOTE: Tests for the basic HA cluster as well as tests for the bare CM ControlZone
components are not covered in this document. Please refer to the respective product
@@ -1104,22 +1112,22 @@ actions are pending.
.{testProc}
. Check the ControlZone resources and cluster.
. Stop the ControlZone resources.
. Check the ControlZone resources.
. Start the ControlZone resources.
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Stop the ControlZone resources.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm resource stop grp_cz_{mySid}
# cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Check the ControlZone resources.
+
[subs="specialchars,attributes"]
----
# su - {mySapAdm} -c "mzsh status"
@@ -1129,12 +1137,21 @@ actions are pending.
# df -h /usr/sap/{mySid}
...
----
+
. Start the ControlZone resources.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm resource start grp_cz_{mySid}
# cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
----
.{testExpect}
. The cluster stops all resources gracefully.
@@ -1154,23 +1171,30 @@ actions are pending.
.{testProc}
. Check the ControlZone resources and cluster.
. Migrate the ControlZone resources.
. Remove migration constraint.
. Check the ControlZone resources.
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Migrate the ControlZone resources.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm resource move grp_cz_{mySid} force
# cs_wait_for_idle -s 5; crm_mon -1r
# cs_wait_for_idle -s 5; crm resource clear grp_cz_{mySid}
----
+
. Remove migration constraint.
+
[subs="specialchars,attributes"]
----
# crm resource clear grp_cz_{mySid}
# crm configure show | grep cli-
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
@@ -1194,23 +1218,31 @@ actions are pending.
.{testProc}
. Check the ControlZone resources and cluster.
. Manually kill the ControlZone UI (e.g. on `{mynode1}`).
. Check the ControlZone resources.
. Cleanup failcount.
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Manually kill the ControlZone UI (e.g. on `{mynode1}`).
+
[subs="specialchars,attributes"]
----
# ssh root@{mynode1} "su - {mySapAdm} -c \"mzsh kill ui\""
# cs_wait_for_idle -s 5; crm_mon -1r
# cs_wait_for_idle -s 5; crm resource cleanup grp_cz_{mySid}
----
+
. Cleanup failcount.
+
[subs="specialchars,attributes"]
----
# crm resource cleanup grp_cz_{mySid}
# cibadmin -Q | grep fail-count
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
@@ -1234,23 +1266,30 @@ actions are pending.
.{testProc}
. Check the ControlZone resources and cluster.
. Manually kill the ControlZone platform (e.g. on `{mynode1}`).
. Check the ControlZone resources.
. Cleanup failcount.
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Manually kill the ControlZone platform (e.g. on `{mynode1}`).
+
[subs="specialchars,attributes"]
----
# ssh root@{mynode1} "su - {mySapAdm} -c \"mzsh kill platform\""
# cs_wait_for_idle -s 5; crm_mon -1r
# cs_wait_for_idle -s 5; crm resource cleanup grp_cz_{mySid}
----
+
. Cleanup failcount.
+
[subs="specialchars,attributes"]
----
# crm resource cleanup grp_cz_{mySid}
# cibadmin -Q | grep fail-count
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
@@ -1274,31 +1313,31 @@ actions are pending.
.{testProc}
. Check the ControlZone resources and cluster.
. Manually kill the cluster node where resources are running (e.g. `{mynode1}`).
. Check the ControlZone resources and cluster.
. Re-join fenced node (e.g. `{mynode1}`) to cluster.
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Manually kill the cluster node where resources are running (e.g. `{mynode1}`).
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # ssh root@{mynode1} "systemctl reboot --force"
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
Once the node has been rebooted, do the following:
+
. Re-join fenced node (e.g. `{mynode1}`) to cluster.
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_show_sbd_devices | grep reset
{mynode2}:~ # cs_clear_sbd_devices --all
{mynode2}:~ # crm cluster start --all
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
@@ -1323,34 +1362,39 @@ Once node has been rebooted, do:
.{testProc}
. Check the ControlZone resources and cluster.
. Manually block the NFS port on the node where resources are running (e.g. `{mynode1}`).
. Check the ControlZone resources and cluster.
. Re-join fenced node (e.g. `{mynode1}`) to cluster.
. Check the ControlZone resources and cluster.
// TODO PRIO1: more test details
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Manually block the NFS port on the node where resources are running (e.g. `{mynode1}`).
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # ssh root@{mynode1} "iptables -I INPUT -p tcp -m multiport --ports 2049 -j DROP"
{mynode2}:~ # ssh root@{mynode1} "iptables -L | grep 2049"
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Re-join fenced node (e.g. `{mynode1}`) to cluster.
+
[subs="specialchars,attributes"]
----
...
{mynode2}:~ # cs_show_sbd_devices | grep reset
{mynode2}:~ # cs_clear_sbd_devices --all
{mynode2}:~ # crm cluster start --all
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
// TODO PRIO1: more test details
.{testExpect}
. The cluster detects failed NFS.
. The cluster fences node.
@@ -1370,35 +1414,40 @@ Once node has been rebooted, do:
.{testProc}
. Check the ControlZone resources and cluster.
. Manually block ports for corosync.
. Check the ControlZone resources and cluster.
. Re-join fenced node (e.g. `{mynode1}`) to cluster.
. Check the ControlZone resources and cluster.
// TODO PRIO1: more test details
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Manually block ports for corosync.
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # grep mcastport /etc/corosync/corosync.conf
{mynode2}:~ # ssh root@{mynode1} "iptables -I INPUT -p udp -m multiport --ports 5405,5407 -j DROP"
{mynode2}:~ # ssh root@{mynode1} "iptables -L | grep -e 5405 -e 5407"
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
+
. Re-join fenced node (e.g. `{mynode1}`) to cluster.
+
[subs="specialchars,attributes"]
----
...
{mynode2}:~ # cs_show_sbd_devices | grep reset
{mynode2}:~ # cs_clear_sbd_devices --all
{mynode2}:~ # crm cluster start --all
----
+
. Check the ControlZone resources and cluster.
+
[subs="specialchars,attributes"]
----
{mynode2}:~ # cs_wait_for_idle -s 5; crm_mon -1r
----
// TODO PRIO1: more test details
.{testExpect}
. The cluster detects failed corosync.
. The cluster fences node.
@@ -1407,18 +1456,21 @@ Once node has been rebooted, do:
. No resource failure.
==========

////
==== Additional tests
// TODO PRIO3: add basic tests
Remove IP address.
Stop the complete cluster.
Parallel start of all cluster nodes.
Isolate the SBD.
Simulate a maintenance procedure with cluster continuously running.
Simulate a maintenance procedure with cluster restart.
Kill the corosync process of one cluster node.
See manual page crm(8) for cluster crash_test.
////
[[sec.test-additional]]
=== Additional tests

Please define additional test cases according to your needs. Some cases you might
want to test are listed below.

- Remove virtual IP address.
- Stop and re-start passive node.
- Stop and parallel re-start of all cluster nodes.
- Isolate the SBD.
- Simulate a maintenance procedure with cluster continuously running.
- Simulate a maintenance procedure with cluster restart.
- Kill the corosync process of one cluster node.

See also manual page crm(8) for cluster crash_test.
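As an illustration, the first case above, removing the virtual IP address, could be outlined as follows. This is a sketch only: the primitive name `rsc_ip_{mySid}`, the address and the network device are assumptions, so adapt them to the resources actually configured in your cluster. The cluster is expected to detect the missing address on the next monitor operation and restore it.

[subs="specialchars,attributes"]
----
# cs_wait_for_idle -s 5; crm_mon -1r
# ip address del <virtual-IP>/<prefix> dev <device>
# cs_wait_for_idle -s 5; crm_mon -1r
# crm resource cleanup rsc_ip_{mySid}
----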



@@ -1551,7 +1603,7 @@ cluster are finally checked for clean and idle state.
# cs_clusterstate -i
# crm resource maintenance grp_cz_{mySid}
# echo -e "\e[0;32m PLEASE DO MAINTENANCE NOW"
# echo "PLEASE DO MAINTENANCE NOW"
# crm resource refresh grp_cz_{mySid}
# cs_wait_for_idle -s 5; crm_mon -1r
