diff --git a/DC-SLES4SAP-hana-angi-perfopt-15 b/DC-SLES4SAP-hana-angi-perfopt-15 new file mode 100644 index 00000000..7b62cecf --- /dev/null +++ b/DC-SLES4SAP-hana-angi-perfopt-15 @@ -0,0 +1,18 @@ +MAIN="SLES4SAP-hana-angi-perfopt-15.adoc" + +ADOC_TYPE="article" + +ADOC_POST="yes" + +ADOC_ATTRIBUTES="--attribute docdate=2024-05-24" + +STYLEROOT=/usr/share/xml/docbook/stylesheet/sbp +FALLBACK_STYLEROOT=/usr/share/xml/docbook/stylesheet/suse2022-ns + +XSLTPARAM="--stringparam publishing.series=sbp" + +DRAFT=no +ROLE="sbp" +#PROFROLE="sbp" + +DOCBOOK5_RNG_URI="http://docbook.org/xml/5.2/rng/docbookxi.rnc" diff --git a/adoc/SAP-S4HA10-setup-simplemount-sle15.adoc b/adoc/SAP-S4HA10-setup-simplemount-sle15.adoc index 0a779aff..e48f4ca9 100644 --- a/adoc/SAP-S4HA10-setup-simplemount-sle15.adoc +++ b/adoc/SAP-S4HA10-setup-simplemount-sle15.adoc @@ -996,7 +996,7 @@ the ASCS vs. resilience against sporadic temporary NFS issues. You may slightly increase it to fit your infrastructure. Consult your storage or NFS server documentation for appropriate timeout values. Make sure the SAPStartSrv resource has *NO* monitor operation configured. -See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPAddr2(7) ocf_suse_SAPStartSrv(7) +See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPaddr2(7) ocf_suse_SAPStartSrv(7) and nfs(5). .ASCS group @@ -1052,7 +1052,7 @@ the ERS vs. resilience against sporadic temporary NFS issues. You may slightly increase it to fit your infrastructure. Consult your storage or NFS server documentation for appropriate timeout values. Make sure the SAPStartSrv resource has *NO* monitor operation configured. -See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPAddr2(7) ocf_suse_SAPStartSrv(7) +See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPaddr2(7) ocf_suse_SAPStartSrv(7) and nfs(5). .ERS group diff --git a/adoc/SAP-S4HA10-setupguide-sle15.adoc b/adoc/SAP-S4HA10-setupguide-sle15.adoc index 5660ec51..bdefa9ed 100644 --- a/adoc/SAP-S4HA10-setupguide-sle15.adoc +++ b/adoc/SAP-S4HA10-setupguide-sle15.adoc @@ -1022,7 +1022,7 @@ primitive rsc_sap_{mySID}_{myInstAscs} SAPInstance \ AUTOMATIC_RECOVER=false \ meta resource-stickiness=5000 ---- -See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPAddr2(7) and ocf_heartbeat_Filesystem(7). +See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPaddr2(7) and ocf_heartbeat_Filesystem(7). ================================================ .ASCS group @@ -1079,7 +1079,7 @@ primitive rsc_sap_{mySID}_{myInstErs} SAPInstance \ START_PROFILE="/usr/sap/{mySid}/SYS/profile/{mySid}_{myInstErs}_{myVipNErs}" \ AUTOMATIC_RECOVER=false IS_ERS=true ---- -See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPAddr2(7) and ocf_heartbeat_Filesystem(7). +See also manual pages ocf_heartbeat_SAPInstance(7), ocf_heartbeat_IPaddr2(7) and ocf_heartbeat_Filesystem(7). 
================================================ .ERS group diff --git a/adoc/SAPNotes_HANA20_15.adoc b/adoc/SAPNotes_HANA20_15.adoc index e7213140..a4ec3270 100644 --- a/adoc/SAPNotes_HANA20_15.adoc +++ b/adoc/SAPNotes_HANA20_15.adoc @@ -40,7 +40,7 @@ https://www.suse.com/releasenotes/x86_64/SUSE-SLES/15-SP4/index.html#file-system // https://www.suse.com/products/server/ //// XFS file system:: - https://www.suse.com/communities/conversations/xfs-the-file-system-of-choice/ + https://www.suse.com/c/xfs-the-file-system-of-choice/ {SUSE} YES certified hardware database:: https://www.suse.com/yessearch/ {suma} Product Page:: @@ -68,13 +68,12 @@ XFS file system:: - cs_wait_for_idle(8) - ha_related_sap_notes(7) - ha_related_suse_tids(7) -- ocf_heartbeat_IPAddr2(7) +- ocf_heartbeat_IPaddr2(7) - ocf_heartbeat_SAPInstance(7) - ocf_suse_SAPHana(7) - ocf_suse_SAPHanaController(7) - ocf_suse_SAPHanaTopology(7) - SAHanaSR(7) -- SAPHanaSR-ScaleOut(7) - SAPHanaSR-filter(8) - SAPHanaSR_basic_cluster(7) - SAPHanaSR-hookHelper(8) @@ -133,6 +132,30 @@ Troubleshooting the SAPHanaSR python hook:: {tidNotes}000019865 Entry "CALLING CRM: ... rc=256" in HANA trace after upgrading SAPHanaSR-ScaleOut:: {tidNotes}000020599 +SAP HANA monitors timed out after 5 seconds:: + {tidNotes}000020626 +HA cluster takeover takes too long on HANA indexserver failure:: + {tidNotes}000020845 +Cluster node fence as SAPHanaTopology fails with error code 1 (OCF_ERR_GENERIC) during a normal cluster stop:: + {tidNotes}000020964 +SUSE HA for HANA cluster node fenced at shutdown, despite of systemd integration:: + {tidNotes}000021046 +SAP HANA scale-out - pacemaker.service: "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!":: + {tidNotes}000021062 +SAPHanaSR-showAttr fails with error "Error: NIECONN_REFUSED ...":: + {tidNotes}000020548 +Protect HANA against manually caused dual-primary situation in SUSE HA cluster:: + {tidNotes}000021044 +Address space monitoring and HANA DB performance:: + {tidNotes}000020746 +HANA nodes end up having the same LPT values:: + {tidNotes}000020690 +HANA DB resource failed to start:: + {tidNotes}000020948 +SAPHanaController monitor timeout leads to database restart:: + {tidNotes}000021249 +HANA Database Planning Engine crashes in __strncmp_avx2_rtm+0x1b3:: + {tidNotes}000021026 Basic health check for two-node SAP HANA performance based model:: {tidNotes}7022984 How to re-enable replication in a two-node SAP performance based model:: diff --git a/adoc/SAPNotes_HANA20_angi_15.adoc b/adoc/SAPNotes_HANA20_angi_15.adoc new file mode 100644 index 00000000..8fe303fd --- /dev/null +++ b/adoc/SAPNotes_HANA20_angi_15.adoc @@ -0,0 +1,327 @@ +// TODO: unify with HANA and ENSA setup guides + += SUSE Product Documentation + +// TODO PRIO2: use variables, e.g. {sles4sapDocs15} +Best Practices for {sap} on SUSE Linux Enterprise:: + {reslibrary} +SUSE product manuals and documentation:: + https://documentation.suse.com/ +Release notes:: + https://www.suse.com/releasenotes/ +Online documentation of {sles4sapa}:: + https://documentation.suse.com/sles-sap/15-SP6/ +Online documentation of {sleha}:: + {haAdminGuide15} +Deployment guide for {sls}:: + {deploymentGuide15} +Tuning guide for {sls}:: + {tuningGuide15} +Storage administration guide for {sls}:: + {storageGuide15} +{sls} Persistent Memory Guide:: + {persMemDoc} + +//// +// TODO PRIO2: still relevant? 
+// {SUSE} partners with {SAP} and IBM on Persistent Memory:: +// https://www.suse.com/c/suse-partners-with-intel-and-sap-to-accelerate-it-transformation-with-persistent-memory-in-the-data-center/ +// Persistent Memory on Power9:: +// https://www.suse.com/c/using-ibm-power9-powervm-virtual-persistent-memory-for-sap-hana-with-suse-linux/ +//// + +SUSE Linux Enterprise kernel specs:: +https://www.suse.com/releasenotes/x86_64/SUSE-SLES/15-SP4/index.html#kernel-limits +SUSE Linux Enterprise file system specs:: +https://www.suse.com/releasenotes/x86_64/SUSE-SLES/15-SP4/index.html#file-system-comparison +//// +// TODO PRIO2: fix tech info URL +// {slsa} technical information:: +// https://www.suse.com/products/server/ +//// +XFS file system:: + https://www.suse.com/c/xfs-the-file-system-of-choice/ +{SUSE} YES certified hardware database:: + https://www.suse.com/yessearch/ +{suma} Product Page:: + {sumalandingpage} +{suma} Documentation:: + {sumadoc} +{RMT} = {rmtool} documentation:: + {rmtGuide15} +{scc} Frequently Asked Questions:: + {sccfaq} + + += Related Manual Pages + +- chronyc(8) +- corosync.conf(8) +- corosync_overview(8) +- crm(8) +- crm_mon(8) +- crm_simulate(8) +- cs_clusterstate(8) +- cs_man2pdf(8) +- cs_show_hana_info(8) +- cs_show_sbd_devices(8) +- cs_wait_for_idle(8) +- ha_related_sap_notes(7) +- ha_related_suse_tids(7) +- ocf_heartbeat_IPaddr2(7) +- ocf_heartbeat_SAPInstance(7) +- ocf_suse_SAPHanaController(7) +- ocf_suse_SAPHanaTopology(7) +- ocf_suse_SAPHanaFilesystem(7) +- SAPHanaSR(7) +- SAPHanaSR-angi(7) +- SAPHanaSR_basic_cluster(7) +- SAPHanaSR-hookHelper(8) +- SAPHanaSR_maintenance_examples(7) +- SAPHanaSR-manageAttr(8) +- SAPHanaSR-manageProvider(8) +- SAPHanaSR-monitor(8) +- SAPHanaSR-replay-archive(8) +- SAPHanaSR-ScaleOut(7) +- SAPHanaSR-ScaleOut_basic_cluster(7) +- SAPHanaSR-showAttr(8) +- SAPHanaSR-show-hadr-runtimes(8) +- saptune(8) +- sbd(8) +- stonith_sbd(7) +- sudo(8) +- sudoers(5) +- supportconfig(8) +- susChkSrv.py(7) +- susCostOpt.py(7) +- susHanaSR.py(7) +- susTkOver.py(7) +- systemctl(8) +- systemd-cgls(8) +- votequorum(5) +- zypper(8) + + += Related SUSE TIDs + +// TODO PRIO1: check if still relevant +SAP HANA SR Performance Optimized Scenario - Setup Guide - Errata:: + {tidNotes}7023882 +Estimate correct multipath timeout:: + {tidNotes}7016305 +Can't open watchdog device: /dev/watchdog: Device or resource busy:: + {tidNotes}7008216 +Systemd-udev-settle timing out:: + {tidNotes}7022681 +Configuring Persistent Memory Devices (PMEM) results in booting to the recovery shell:: + {tidNotes}000019517 +Slow boot initialization on machines with Intel Optane DC Memory causing auto-mount to fail:: + {tidNotes}000019462 +How to load the correct watchdog kernel module:: + {tidNotes}7016880 +XFS metadata corruption and invalid checksum on SAP Hana servers:: + {tidNotes}7022921 +Overcommit Memory in SLES:: + {tidNotes}7002775 +Recommended SUSE SLES 4 SAP Settings:: + {tidNotes}7024082 +SAPHanaController running in timeout when starting SAP Hana:: + {tidNotes}000019899 +Troubleshooting the SAPHanaSR python hook:: + {tidNotes}000019865 +Entry "CALLING CRM: ...
rc=256" in HANA trace after upgrading SAPHanaSR-ScaleOut:: + {tidNotes}000020599 +SAP HANA monitors timed out after 5 seconds:: + {tidNotes}000020626 +HA cluster takeover takes too long on HANA indexserver failure:: + {tidNotes}000020845 +Cluster node fence as SAPHanaTopology fails with error code 1 (OCF_ERR_GENERIC) during a normal cluster stop:: + {tidNotes}000020964 +SUSE HA for HANA cluster node fenced at shutdown, despite of systemd integration:: + {tidNotes}000021046 +SAP HANA scale-out - pacemaker.service: "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!":: + {tidNotes}000021062 +SAPHanaSR-showAttr fails with error "Error: NIECONN_REFUSED ...":: + {tidNotes}000020548 +Protect HANA against manually caused dual-primary situation in SUSE HA cluster:: + {tidNotes}000021044 +Address space monitoring and HANA DB performance:: + {tidNotes}000020746 +HANA nodes end up having the same LPT values:: + {tidNotes}000020690 +HANA DB resource failed to start:: + {tidNotes}000020948 +SAPHanaController monitor timeout leads to database restart:: + {tidNotes}000021249 +HANA Database Planning Engine crashes in __strncmp_avx2_rtm+0x1b3:: + {tidNotes}000021026 +Basic health check for two-node SAP HANA performance based model:: + {tidNotes}7022984 +How to re-enable replication in a two-node SAP performance based model:: + {tidNotes}7023127 +Showing SOK Status in Cluster Monitoring Tools Workaround:: + {tidNotes}7023526 +HANA SystemReplication doesn't provide SiteName to Corosync Cluster:: + {tidNotes}000019754 +SUSE Cluster Support for SAP HANA System Replication Active / Active Read Enabled Feature:: + {tidNotes}7023884 +SAP Generating 'Database host operating system is not supported' alerts:: + {tidNotes}7023744 +sapstartsrv does not respawn after a forceful kill of the master nameserver:: + {tidNotes}7024291 +SAPHanaSR HANA system replication automation without layer 2 network:: + {tidNotes}000020333 +The vIP cluster resource does not follow the SAP HANA master ...:: + {tidNotes}000019769 +Handling failed NFS share in SUSE HA cluster for HANA system replication:: + {tidNotes}000019904 +SAP Instances failed stop on shutdown (PACEMAKER, SYSTEMD, SAP):: + {tidNotes}7022671 +SAP on SLES shows Error: NIECONN_REFUSED in the logs:: + {tidNotes}7023236 +Indepth HANA Cluster Debug Data Collection (PACEMAKER, SAP):: + {tidNotes}7022702 +How to prevent certain values in limits.conf from being changed by saptune:: + {tidNotes}7023104 +Disabling fstrim - under which conditions?:: + {tidNotes}7023805 +saptune: WARNING saptune.io.go:66: 'noop' is not a valid scheduler for device:: + {tidNotes}000019572 +How to patch a SAP Application Pacemaker Cluster:: + {tidNotes}000020268 + + += Related SUSE blogs + +Emergency Braking for SAP HANA Dying Indexserver:: +https://www.suse.com/c/emergency-braking-for-sap-hana-dying-indexserver/ +SAP HANA Cockpit with SUSE HA integration greatly improves data integrity:: +https://www.suse.com/c/sap-hana-cockpit-with-suse-ha-integration-greatly-improves-data-integrity/ +Handover for the Next Round – SAP on SUSE Cluster and systemd Native Integration:: + https://www.suse.com/c/handover-for-the-next-round-sap-on-suse-cluster-and-systemd-native-integration/ +SAPHanaSR-ScaleOut for Multi-Target Architecture and Principles:: + https://www.suse.com/c/saphanasr-scaleout-multi-target/ +SAP HANA Scale-Out System Replication for large ERP Systems:: + https://www.suse.com/c/sap-hana-scale-out-system-replication-for-large-erp-systems/ +SAP HANA Cost-optimized 
– An alternative Route is available:: + https://www.suse.com/c/sap-hana-cost-optimized-an-alternative-route-is-available/ +Let’s flip the flags! Is my SAP HANA database in sync or not?:: + https://www.suse.com/c/lets-flip-the-flags-is-my-sap-hana-database-in-sync-or-not/ +Entry to blog series #towardsZeroDowntime:: + https://www.suse.com/c/tag/towardszerodowntime/ +Fail-Safe Operation of {SAPHANA}: {SUSE} Extends Its High-Availability Solution:: + http://scn.sap.com/community/hana-in-memory/blog/2014/04/04/fail-safe-operation-of-sap-hana-suse-extends-its-high-availability-solution + + += Related SAP Documentation + +{sap} Product Availability Matrix:: + https://support.sap.com/en/release-upgrade-maintenance.html#section_1969201630 +{SAPHANA} Installation and Update Guide:: + https://help.sap.com/doc/e9702d76c3284623b02de196c0e79e49/2.0.05/en-US/SAP_HANA_Server_Installation_Guide_en.pdf +{SAPHANA} Administration Guide:: + https://help.sap.com/doc/eb75509ab0fd1014a2c6ba9b6d252832/2.0.05/en-US/SAP_HANA_Administration_Guide_en.pdf +{SAPHANA} Documentation Entry Page:: + https://help.sap.com/viewer/product/SAP_HANA_PLATFORM/2.0.05/en-US +{SAPHANA} Tailored Data Center Integration - FAQ:: + https://www.sap.com/documents/2016/05/e8705aae-717c-0010-82c7-eda71af511fa.html +{SAPHANA} and Persistent Memory:: + https://blogs.sap.com/2020/01/30/sap-hana-and-persistent-memory/ +{SAPHANA} HA/DR Provider Hook Methods:: + https://help.sap.com/viewer/6b94445c94ae495c83a19646e7c3fd56/2.0.05/en-US/5df2e766549a405e95de4c5d7f2efc2d.html + + += Related SAP Notes + +[[sap-notes]] +// TODO PRIO1: check if still relevant +611361 - Hostnames of SAP servers:: + {launchPadNotes}611361 +768727 - Automatic restart functions in sapstart for processes:: + {launchPadNotes}768727 +927637 - Web service authentication in sapstartsrv as of Release 7.00:: + {launchPadNotes}927637 +1092448 - IBM XL C/C++ runtime environment for Linux on system p:: + {launchPadNotes}1092448 +1514967 - SAP HANA: Central Note:: + {launchPadNotes}1514967 +1552925 - Linux: High Availability Cluster Solutions:: + {launchPadNotes}1552925 +1763512 - Support details for SUSE Linux Enterprise for SAP Applications:: + {launchPadNotes}1763512 +1846872 - "No space left on device" error reported from HANA:: +{launchPadNotes}1846872 +1876398 - Network configuration for System Replication in HANA SP6:: + {launchPadNotes}1876398 +1888072 - SAP HANA DB: Indexserver crash in strcmp sse42:: + {launchPadNotes}1888072 +2021789 - SAP HANA Revison and Maintenance Strategy:: + {launchPadNotes}2021789 +2196941 - SAP HANA Software Replication Takeover Hook Changes:: + {launchPadNotes}2196941 +2235581 - SAP HANA: Supported Operating Systems:: + {launchPadNotes}2235581 +2369981 - Required configuration steps for authentication with HANA System Replication:: + {launchPadNotes}2369981 +2369910 - SAP Software on Linux: General information:: + {launchPadNotes}2369910 +2380229 - SAP HANA Platform 2.0 - Central Note:: + {launchPadNotes}2380229 +2434562 - System Replication Hanging in Status "SYNCING" or "ERROR" With Status Detail "Missing Log" or "Invalid backup size":: + {launchPadNotes}2434562 +2578899 - SUSE Linux Enterprise Server 15: Installation Note:: + {launchPadNotes}2578899 +2647673 - HANA Installation Failure:: + {launchPadNotes}2647673 +2684254 - SAP HANA DB: Recommended OS settings for SLES 15 / SLES for SAP Applications 15:: + {launchPadNotes}2684254 +2733483 - Host Auto-Failover Not Occur when Indexserver Crash on Worker Node:: + {launchPadNotes}2733483 +2750199 - 
Incorrect Alert Regarding Unsupported Operating System Version:: + {launchPadNotes}2750199 +2844322 - SAP HANA Platform 2.0 SPS 05 Release Note:: + {launchPadNotes}2844322 +2945239 - SAP HANA Platform 2.0 SPS 06 Release Note:: + {launchPadNotes}2945239 +3007062 - FAQ: SAP HANA & Third Party Cluster Solutions:: + {launchPadNotes}3007062 +3014176 - Applying System Size Dependent Resource Limits During Installation or Upgrade:: + {launchPadNotes}3014176 +3043459 - SAP HANA 2 SPS05 Revision 056.00:: + {launchPadNotes}3043459 +3072590 - Python 3 Support for Non-Productive SAP HANA Systems:: + {launchPadNotes}3072590 +3070359 - Python 3 Migration Guide For SAP HANA:: + {launchPadNotes}3070359 +3084229 - SAP HANA Python Support Scripts Fail due to Incompatibility With Python 3:: + {launchPadNotes}3084229 +3091152 - sapstartsrv - improved deregistration for UNIX/Linux:: + {launchPadNotes}3091152 +3093542 - Transition to Python 3 of the Python Distribution Delivered With SAP HANA 2.0 Server:: + {launchPadNotes}3093542 +3139184 - Linux: systemd integration for sapstartsrv and SAP Hostagent:: + {launchPadNotes}3139184 +3145200 - SAP Host Agent 7.22 PL57:: + {launchPadNotes}3145200 + +// TODO PRIO3: check whether still relevant: +//// +1056161 - SUSE Priority Support for SAP applications:: +{launchPadNotes}1056161 +1275776 - Preparing SLES for Sap Environments:: +{launchPadNotes}1275776 +1514967 - SAP HANA: Central Note:: +{launchPadNotes}1514967 +1501701 - Single Computing Unit Performance and Sizing:: +{launchPadNotes}1501701 +1944799 - SAP HANA Guidelines for SLES Operating System Installation:: +{launchPadNotes}1944799 +//// + + +// +// REVISION 1.0 2022/02 +// - copied from SAPNotes_s4_1809.adoc +// REVISION 1.1 2022/03 +// - include SAPHanaSR-ScaleOut 15, CostOpt 15 +// diff --git a/adoc/SAPNotes_ha740.adoc b/adoc/SAPNotes_ha740.adoc index b5f2bc0e..179fed32 100644 --- a/adoc/SAPNotes_ha740.adoc +++ b/adoc/SAPNotes_ha740.adoc @@ -8,7 +8,7 @@ - cs_clusterstate(8) - cs_man2pdf(8) - mount.nfs(8) -- ocf_heartbeat_IPAddr2(7) +- ocf_heartbeat_IPaddr2(7) - ocf_heartbeat_SAPInstance(7) - ocf_suse_SAPStartSrv(7) - sapping(7) diff --git a/adoc/SAPNotes_s4_2101.adoc b/adoc/SAPNotes_s4_2101.adoc index d1ad03c6..57dff3bc 100644 --- a/adoc/SAPNotes_s4_2101.adoc +++ b/adoc/SAPNotes_s4_2101.adoc @@ -16,7 +16,7 @@ - ha_related_sap_notes(7) - ha_related_suse_tids(7) - mount.nfs(8) -- ocf_heartbeat_IPAddr2(7) +- ocf_heartbeat_IPaddr2(7) - ocf_heartbeat_SAPInstance(7) - ocf_suse_SAPStartSrv(7) - sapping(7) diff --git a/adoc/SLES4SAP-hana-angi-perfopt-15-docinfo.xml b/adoc/SLES4SAP-hana-angi-perfopt-15-docinfo.xml new file mode 100644 index 00000000..82ee47e7 --- /dev/null +++ b/adoc/SLES4SAP-hana-angi-perfopt-15-docinfo.xml @@ -0,0 +1,96 @@ + + + https://github.com/SUSE/suse-best-practices/issues/new + {SAPHANA} System Replication Scale-Up - Performance Optimized Scenario SLES15 + + + + + + + SUSE Linux Enterprise Server for SAP Applications + 15 + +SUSE Best Practices + + + SAP + + + High Availability + Clustering + Installation + +SAP HANA System Replication Scale-Up - Performance Optimized Scenario +How to install and customize SLES for SAP Applications + for SAP HANA system replication in the scale-up performance-optimized scenario + + SLES for SAP + +2024-04-02 + +SUSE Linux Enterprise Server for SAP Applications 15 + + + + + Fabian + Herschel + + + Distinguished Architect SAP + SUSE + + + + + Lars + Pinne + + + System Engineer + SUSE + + + + + + + + + + + + + + + + + + + + + + SUSE® Linux Enterprise Server 
for SAP Applications is optimized in various ways for SAP* + applications. This guide provides detailed information about installing and customizing SUSE + Linux Enterprise Server for SAP Applications for SAP HANA system replication in the performance + optimized scenario. The document focuses on the steps to integrate an already installed and + working SAP HANA with system replication. It is based on SUSE Linux Enterprise Server for SAP + Applications 15 SP6. The concept however can also be used with SUSE Linux Enterprise Server for + SAP Applications 15 SP4 or newer. + + Disclaimer: + Documents published as part of the SUSE Best Practices series have been contributed voluntarily + by SUSE employees and third parties. They are meant to serve as examples of how particular + actions can be performed. They have been compiled with utmost attention to detail. + However, this does not guarantee complete accuracy. SUSE cannot verify that actions described + in these documents do what is claimed or whether actions described have unintended consequences. + SUSE LLC, its affiliates, the authors, and the translators may not be held liable for possible errors + or the consequences thereof. + + diff --git a/adoc/SLES4SAP-hana-angi-perfopt-15.adoc b/adoc/SLES4SAP-hana-angi-perfopt-15.adoc new file mode 100644 index 00000000..88072684 --- /dev/null +++ b/adoc/SLES4SAP-hana-angi-perfopt-15.adoc @@ -0,0 +1,3490 @@ +:docinfo: + +// Load document variables +include::Var_SLES4SAP-hana-angi-perfopt-15.txt[] +include::Var_SLES4SAP-hana-angi-perfopt-15-param.txt[] +// +// Start of the document +// + += {SAPHANA} System Replication Scale-Up - Performance Optimized Scenario + +[[pre.hana-sr]] +== About this guide + +//// + ____ __ __ __ _ + / _/___ / /__________ ____/ /_ _______/ /_(_)___ ____ + / // __ \/ __/ ___/ __ \/ __ / / / / ___/ __/ / __ \/ __ \ + _/ // / / / /_/ / / /_/ / /_/ / /_/ / /__/ /_/ / /_/ / / / / +/___/_/ /_/\__/_/ \____/\__,_/\__,_/\___/\__/_/\____/_/ /_/ +//// + +=== Introduction + +{sles4sapreg} is optimized in various ways for {SAPreg} applications. +This guide provides detailed information about installing and customizing +*{sles4sap} for {HANA} system replication in the {usecase} scenario*. + +“{SAP} customers invest in {HANA}” is the conclusion reached by a recent +market study carried out by Pierre Audoin Consultants (PAC). In Germany, +half of the companies expect {HANA} to become the dominant +database platform in the {SAP} environment. Often the “{SAP} +Business Suite* powered by {HANA}*” scenario is already being discussed +in concrete terms. + +{SUSE} is accommodating this development by offering {sles4sap}, +the recommended and supported operating system for {HANA}. In close +collaboration with {SAP}, cloud service and hardware partners, {SUSE} provides two +resource agents for customers to ensure the high availability of {HANA} system +replications. + +==== Abstract + +This guide describes planning, setup, and basic testing of {sles4sap} based on +the high availability solution scenario "{SAPHANA} Scale-Up System Replication Performance Optimized". 
+ +From the application perspective, the following variants are covered: + +- plain system replication + +- system replication with secondary site read-enabled + +- multi-tier (chained) system replication + +- multi-target system replication + +- multi-tenant database containers for all above + +From the infrastructure perspective, the following variants are covered: + +- 2-node cluster with disk-based SBD + +- 3-node cluster with diskless SBD + +- On-premises deployment on physical and virtual machines + +- Public cloud deployment (usually needs additional documentation focusing on the cloud-specific implementation details) + +Deployment automation simplifies roll-out. There are several options available, +particularly on public cloud platforms (for example https://www.suse.com/c/automating-the-sap-hana-high-availability-cluster-deployment-for-microsoft-azure/). +Ask your public cloud provider or your SUSE contact for details. + +See <<cha.hana-sr.scenario>> for details. + +==== Scale-up versus scale-out + +// TODO PRIO2: add stonith resource to the graphic + +The first set of scenarios includes the architecture and development of _scale-up_ solutions. + +.{HANA} System Replication Scale-Up in the Cluster +image::hana_sr_in_cluster.svg[scaledwidth=70.0%] + +For these scenarios, {SUSE} has developed the scale-up +resource agent package `{SAPHanaSR}`. System replication helps to +replicate the database data from one computer to another to compensate for database failures (single-box replication). + +//.{HANA} System Replication Scale-Up in the Cluster +//image::hana_sr_in_cluster.svg[scaledwidth=100.0%] + +The second set of scenarios includes the architecture and development of +_scale-out_ solutions (multi-box replication). For these scenarios, {SUSE} +has developed the scale-out resource agent package `{SAPHanaSR}-ScaleOut`. + +.{HANA} System Replication Scale-Out in the Cluster +image::SAPHanaSR-ScaleOut-Cluster.svg[scaledwidth=70.0%] + +With this mode of operation, internal {HANA} high availability (HA) +mechanisms and the resource agent must work together or be coordinated +with each other. {HANA} system replication automation for scale-out +is described in a separate document available on our documentation Web page +at {reslibrary}. The document for scale-out is named _"{docScaleOut}"_. + +==== Scale-up scenarios and resource agents + +[[scenarios]] + +{SUSE} has implemented the scale-up scenario with the `{SAPHanaRA}` resource +agent (RA), which performs the actual check of the {HANA} database +instances. This RA is configured as a multi-state resource. In the +scale-up scenario, the promoted RA instance assumes responsibility for the {HANA} +databases running in primary mode. The non-promoted RA instance is responsible for +instances that are operated in synchronous (secondary) status. + +To make configuring the cluster as simple as possible, SUSE has +developed the `{SAPHanaTopRA}` resource agent. This RA runs on all nodes +of a {sles4sap} cluster and gathers information about the +statuses and configurations of {HANA} system replications. It is +designed as a normal (stateless) clone. + +{HANA} system replication for scale-up is supported in the following +scenarios or use cases: + +* *Performance optimized* (_A => B_).
This scenario and setup *is described +in this document.* ++ +.{HANA} System Replication Scale-Up in the Cluster - performance optimized +image::SAPHanaSR-ScaleUP-perfOpt.svg[scaledwidth=100.0%] ++ +In the performance optimized scenario an {HANA} RDBMS site A is synchronizing +with an {HANA} RDBMS site B on a second node. As the {HANA} RDBMS on the second node +is configured to pre-load the tables, the takeover time is typically very +short. ++ +One big advantage of the performance optimized scenario of {HANA} is the +possibility to allow read access on the secondary database site. To support +this *read enabled* scenario, a second virtual IP address is added to the cluster +and bound to the secondary role of the system replication. +* *Cost optimized* (_A => B, Q_). This scenario and setup is described +in another document available from the documentation Web page ({reslibrary}). +The document for _cost optimized_ is named _"{docCostOpt}"_. ++ +.{HANA} System Replication Scale-Up in the Cluster - cost optimized +image::SAPHanaSR-ScaleUP-costOpt2.svg[scaledwidth=100.0%] ++ +In the cost optimized scenario, the second node is also used for a +stand-alone non-replicated {HANA} RDBMS system (like QAS or TST). Whenever a takeover +is needed, the non-replicated system must be stopped first. As the +productive secondary system on this node must be limited in using system +resources, the table preload must be switched off. A possible +takeover takes longer than in the performance optimized use case. ++ +In the cost optimized scenario, the secondary needs to be running in a reduced +memory consumption configuration. This is why _read enabled_ must not be used in this +scenario. ++ +As already explained, the secondary {HANA} database must run with memory resource +restrictions. The HA/DR provider needs to remove these memory restrictions when a +takeover occurs. This is why multi SID (also MCOS) must not be used in this scenario. + +* *Multi-tier* (_A => B -> C_) and *Multi-target* (_B <= A => C_). ++ +.{HANA} System Replication Scale-Up in the Cluster - performance optimized chain +image::SAPHanaSR-ScaleUP-Chain.svg[scaledwidth=100.0%] ++ +A _multi-tier_ system replication has an additional target. In the past, this third +site had to be connected to the secondary (chain topology). With current {HANA} +versions, the _multiple target topology_ is allowed by {sap}. +Have a look at the scenarios and prerequisites section below or consult the +manual pages SAPHanaSR(7) and susHanaSR.py(7) for details. +// TODO PRIO3: link to section ++ +.{HANA} System Replication Scale-Up in the Cluster - performance optimized multi-target +image::SAPHanaSR-ScaleUP-MultiTarget.svg[scaledwidth=100.0%] ++ +Multi-tier and multi-target systems are implemented as described in this document. +Only the first replication pair (A and B) is handled by the cluster itself. + +* *Multi-tenancy* or MDC. ++ +Multi-tenancy is supported for all of the above scenarios and use cases. This scenario is supported since {HANA} SPS09. +The setup and configuration from a cluster point of view is the same for multi-tenancy and single container. +Thus you can use the above documents for both kinds of scenarios. + +// TODO PRIO1: Add new restrictions here (think about multi-target, handshake, ...) +// TODO PRIO1: add link to overview on supported scenarios + +==== The concept of the performance optimized scenario + +In case of failure of the primary {HANA} on node 1 (node or database +instance), the cluster first tries to start the takeover process.
This +allows using the data already loaded at the secondary site. Typically +the takeover is much faster than the local restart. + +To automate this resource handling process, you must +use the {HANA} resource agents included in {SAPHanaSR}. System +replication of the productive database is automated with {SAPHanaRA} and +{SAPHanaTopRA}. + +The cluster only allows a takeover to the secondary site if the {HANA} +system replication was in sync until the point when the service of the primary +got lost. This ensures that the last commits processed on the primary site are +already available at the secondary site. + +{SAP} has improved the interfaces between {HANA} and external software, such as +cluster frameworks. These improvements also include the implementation of {HANA} +call-outs in case of special events, such as status changes for services or system replication +channels. These call-outs are also called HA/DR providers. These interfaces can be used by +implementing {HANA} hooks written in Python. {SUSE} has enhanced the SAPHanaSR package +to include such {HANA} hooks to optimize the cluster interface. Using the +{HANA} hooks described in this document makes it possible to inform the cluster immediately +if the {HANA} system replication is broken. In addition to the {HANA} hook status, +the cluster continues to poll the system replication status on a regular basis. + +You can adjust the level of automation by setting the parameter `AUTOMATED_REGISTER`. +If automated registration is activated, the cluster will automatically register +a former failed primary to become the new secondary. Refer to the manual pages SAPHanaSR(7) and ocf_suse_SAPHanaController(7) for details on all supported parameters and features. + +IMPORTANT: The solution is not designed to manually 'migrate' the primary or +secondary instance using HAWK or any other cluster client commands. In the +_Administration_ section of this document, we describe how to 'migrate' the primary +to the secondary site using {sap} and cluster commands. + +//// + ______ __ + / ____/________ _______ _______/ /____ ____ ___ + / __/ / ___/ __ \/ ___/ / / / ___/ __/ _ \/ __ `__ \ + / /___/ /__/ /_/ (__ ) /_/ (__ ) /_/ __/ / / / / / +/_____/\___/\____/____/\__, /____/\__/\___/_/ /_/ /_/ + /____/ +//// + +=== Ecosystem of the document + +==== Additional documentation and resources + +Chapters in this manual contain links to additional documentation +resources that are either available on the system or on the Internet. + +For the latest documentation updates, see https://documentation.suse.com/. + +You can also find numerous white papers, best practices, setup guides, and +other resources at the {sles4sap} best practices Web page: +{reslibrary}. +In particular, there is an overview of all {suse} high availability solutions for +{saphana} and {s4hana} workloads. + +SUSE also publishes blog articles about {sap} and high availability. +Join us by using the hashtag #TowardsZeroDowntime. Use the following link: +https://www.suse.com/c/tag/TowardsZeroDowntime/. + +Overview of the high availability solutions supported by {sles4sap}: +https://documentation.suse.com/sles-sap/sap-ha-support/html/sap-ha-support/article-sap-ha-support.html + +Lastly, there are manual pages shipped with the product.
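+For example, to see which manual pages are shipped with the {saphanasr} resource agent package used in this guide, and to open its overview page, you can run commands like the following: + +[subs="quotes"] +---- +# rpm -ql SAPHanaSR-angi | grep /man/ +# man SAPHanaSR-angi +----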
+ +==== Errata + +To deliver urgent smaller fixes and important information in a timely manner, +the Technical Information Document (TID) for this setup guide +will be updated, maintained and published at a higher frequency: + +- SAP HANA SR Performance Optimized Scenario - Setup Guide - Errata +{tidNotes}7023882 + +- Showing SOK Status in Cluster Monitoring Tools Workaround +{tidNotes}7023526 - +see also the blog article https://www.suse.com/c/lets-flip-the-flags-is-my-sap-hana-database-in-sync-or-not/ + +// TODO PRIO2: replace below with correct TID +// In addition to this guide, check the SUSE SAP Best Practice Guide Errata for +// other solutions +// {tidNotes}7023713. + +// Standard SUSE includes +==== Feedback +include::common_intro_feedback.adoc[] + + +// DO NOT CHANGE SECTION ID: refer to Trento checks +[[cha.hana-sr.scenario]] +== Supported scenarios and prerequisites + +For the `{saphanasr}` package configured as described in the document at hand, +we limit the support to scale-up (single-box to single-box) system replication +with the following configurations and parameters: + +* Two-node clusters are standard. Three-node clusters are fine if you install +the resource agents also on that third node. But define in the cluster that +{HANA} resources must never run on that third node. In this case, the third +node is an additional majority maker in case of cluster separation. +* The cluster must include a valid STONITH method. +** Any STONITH mechanism supported for production use by {sleha} {prodnr} (like +SBD, IPMI) is supported with {saphanasr}. +** This guide focuses on the SBD fencing method, as it is hardware-independent. +** If you use disk-based SBD as the fencing mechanism, you need one or more shared +drives. For productive environments, we recommend more than one SBD device. For +details on disk-based SBD, read the product documentation for {sleha} and the +manual pages sbd(8) and stonith_sbd(7). +** For diskless SBD, you need at least three cluster nodes. The diskless SBD +mechanism has the benefit that you do not need a shared drive for fencing. +Since diskless SBD is based on self-fencing, reliable detection of lost quorum +is absolutely crucial. +** Priority fencing is an optional improvement for two nodes, but does not work +for three nodes. +* Both nodes are in the same network segment (layer 2). Similar methods provided +by cloud environments such as overlay IP addresses and load balancer functionality +are also fine. Follow the cloud-specific guides to set up your {sles4sap} cluster. +* Technical users and groups, such as _{refsidadm}_, are defined +locally in the Linux system. If that is not possible, additional measures are needed +to ensure reliable resolution of users, groups and permissions at any time. +This might include caching. +* Name resolution of the cluster nodes and the virtual IP address must be done +locally on all cluster nodes. If that is not possible, additional measures are +needed to ensure reliable resolution of host names at any time. +* Time synchronization between the cluster nodes, such as NTP, is required. +* Both {HANA} instances of the system replication pair (primary and secondary) +have the same SAP Identifier (SID) and instance number. +* If the cluster nodes are installed in different data centers or data center +areas, the environment must match the requirements of the {sleha} cluster product. +Of particular concern are the network latency and recommended maximum distance +between the nodes.
Review the product documentation for {sleha} about those +recommendations. +* The prerequisites for automated registration of a failed primary after takeover need +to be defined. +** As a good starting configuration for projects, we recommend switching off the +automated registration of a failed primary. The setting `AUTOMATED_REGISTER="false"` +is the default. In this case, you need to manually register a failed primary after a +takeover. For re-registration, use precisely the site names that are already known +by the cluster. Use {SAP} tools like {HANA} cockpit or _hdbnsutil_. +** For optimal automation, we recommend setting `AUTOMATED_REGISTER="true"`. +* Automated start of {HANA} instances during system boot must be switched off. +* Multi-tenancy (MDC) databases are supported. +** Multi-tenancy databases can be used in combination with any other setup +(performance-optimized, cost-optimized, multi-tier, multi-target and read-enabled). +** In MDC configurations, the {HANA} RDBMS is treated as a single system including +all database containers. Therefore, cluster takeover decisions are based on the +complete RDBMS status independent of the status of individual database containers. +** Tests on multi-tenancy databases can require a different test procedure if you +are using strong separation of the tenants. +As an example, killing the complete {HANA} instance using _HDB kill_ does not work, +because the tenants are running with different Linux user UIDs. +_{refsidadm}_ is not allowed to terminate the processes of the other tenant users. +* Only one system replication is allowed between the two {HANA} databases in the Linux +cluster. A maximum of one system replication to an {HANA} database outside the Linux +cluster is allowed. +** Once an {HANA} system replication site is known to the Linux cluster, that exact +site name needs to be used whenever the site is registered manually. +** If a third {HANA} site is connected by system replication, that {HANA} must not be +controlled by another Linux cluster. If that third site should work as part of a +fall-back HA cluster in a DR case, that HA cluster needs to be in standby. +** The replication mode is either sync or syncmem for the controlled replication. +Replication mode async is not supported. The operation modes delta_datashipping, +logreplay and logreplay_readaccess are supported. +** See also the dedicated section on requirements for susHanaSR.py. +* The current resource agent supports {HANA} in system replication beginning with +{HANA} version 2.0 SPS05 revision 59.04. +Even in {HANA} multi-target environments, the current resource agent manages only +two sites. Thus only two {HANA} sites are part of the Linux cluster. +* Besides {HANA}, you need the {SAP} hostagent installed and started on your system. +** For SystemV style, the sapinit script needs to be active. +** For systemd style, the service SAP<SID>_<InstNr> can stay enabled. +The systemd-enabled saphostagent and the instance's sapstartsrv are supported. +Refer to the OS documentation for the systemd version. +{HANA} comes with native systemd integration by default starting with version 2.0 SPS07. +Refer to {SAP} documentation for information on other {HANA} versions. +** Combining systemd style hostagent with SystemV style instance is allowed. +However, all nodes in one Linux cluster need to use the same style. +* The RA's monitoring operations need to be active. +* Using the HA/DR provider hook for srConnectionChanged() by enabling susHanaSR.py is +mandatory.
+* The RA and HA/DR provider hook scripts' runtime almost completely depends on call-outs +to controlled resources, OS and Linux cluster. The infrastructure needs to allow +these call-outs to return in time. +* Colocation constraints between the SAPHanaController RA and other resources are +allowed only if they do not affect the RA's scoring. The location scoring finally +depends on system replication status and must not be overruled by additional +constraints. Thus it is not allowed to define rules forcing an SAPHanaController +promoted clone to follow another resource. +* Reliable access to the /hana/shared/ file system is crucial for {HANA} and the +Linux cluster. +// TODO PRIO2: point to SAPHanaFilesystem RA +* The {HANA} feature Secondary Time Travel is not supported. +* The {HANA} Fast Restart feature on RAM-tmpfs and {HANA} on persistent +memory can be used, as long as they are transparent to Linux HA. + +// TODO PRIO3: align with manual pages SAPHanaSR(7) and susHanaSR.py(7) +For the HA/DR provider hook scripts susHanaSR.py and susTkOver.py, the following +requirements apply: + +* {HANA} 2.0 SPS05 revision 059.04 and later provides Python 3 as well as the HA/DR +provider hook method srConnectionChanged() with multi-target aware parameters. +Python 3 and multi-target aware parameters are needed for the {saphanasr} package. +* {HANA} 2.0 SPS05 and later provides the HA/DR provider hook method preTakeover(). +// TODO PRIO1: check above version +* The user _{refsidadm}_ needs execution permission as user root for the command +crm_attribute. +* The hook provider needs to be added to the {HANA} global configuration, in memory +and on disk (in persistence). + + +For the HA/DR provider hook script susChkSrv.py, the following requirements apply: + +* {HANA} 2.0 SPS05 or later provides the HA/DR provider hook method srServiceStateChanged() +with the needed parameters. +* No other HA/DR provider hook script should be configured for the srServiceStateChanged() +method. Hook scripts for other methods, provided in SAPHanaSR, can be used in +parallel to susChkSrv.py, unless documented otherwise. +* The user {refsidadm} needs execution permission as user root for the command +SAPHanaSR-hookHelper. +* The hook provider needs to be added to the {HANA} global configuration, in +memory and on disk (in persistence). +* The hook script runs in the {HANA} name server. It runs on the node where the +event srServiceStateChanged() occurs. +* If susChkSrv.py parameter action_on_lost=stop is set and the RA SAPHanaController +parameter AUTOMATED_REGISTER=true is set, it depends on {HANA} releasing all OS +resources prior to the registration attempt. +* If the hook provider should be pre-compiled, the particular Python version that +comes with {HANA} needs to be used. + +See also manual pages SAPHanaSR(7), susHanaSR.py(7), susTkOver.py(7) and +susChkSrv.py(7) for more details and requirements. + +IMPORTANT: Without a valid STONITH method, the complete cluster is unsupported +and will not work properly. + +If you need to implement a different scenario, we strongly recommend defining +a Proof of Concept (PoC) with {SUSE}. This PoC will focus on testing the existing +solution in your scenario. Most of the above-mentioned limitations are set +because careful testing is needed. + +// DO NOT CHANGE SECTION ID: refer to Trento check +[[cha.hana-sr.scope]] +== Scope of this document + +This document describes how to set up the cluster to control {HANA} in +System Replication scenarios.
The document focuses on the steps to integrate +an already installed and working {HANA} with System Replication. +In this document {sles4sap} {prodNr} {prodSP} is used. This concept can also be +used with {sles4sap} {prodNr} SP4 or newer. + +The described example setup builds an {HANA} HA cluster in two data centers in +{saplocation1} ({sapsite1}) and in {saplocation2} ({sapsite2}), installed on two +{sles4sapa} {prodnr} {prodsp} systems. + +.Cluster with {HANA} SR - performance optimized +image::hana_sr_scaleup_perfopt.svg[scaledwidth=100.0%] + +You can either set up the cluster using the YaST wizard, doing it manually or +using your own automation. + +If you prefer to use the YaST wizard, you can use the shortcut _yast sap_ha_ to +start the module. The procedure to set up {saphanasr} using YaST is described in +the product documentation of {sles4sap} in section _Setting Up an {HANA} Cluster_ +at {sles4sapGuide15}cha-cluster.html. + +.Scenario Selection for {HANA} in the YaST Module sap_ha +image::Yast_SAP_HA.png[scaledwidth=100.0%] + +This guide focuses on the manual setup of the cluster to explain the details and +to give you the possibility to create your own automation. + +The seven main setup steps are: + +// Do _not_ use the attributes in <>. This would not compile when converting to DocBook + +:stepPlanning: Planning the installation +:stepOS: Setting up the operating system +:stepHANA: Installing the {HANA} Databases on both cluster nodes +:stepHSR: Setting up {HANA} System Replication +:stepHook: Setting up {HANA} HA/DR providers +:stepCluster: Configuring the cluster +:stepTest: Testing the cluster + +image::SAPHanaSR-ScaleOut-Plan-Phase0.svg[scaledwidth="100%"] + +- Planning (see <>) +- OS installation (see <>) +- Database installation (see <>) +- {HANA} system replication setup (see <> +- {HANA} HA/DR provider hooks (see <>) +- Cluster configuration (see <>) +- Testing (see <>) + +// DO NOT CHANGE SECTION ID: refer to Trento check +[[cha.s4s.hana-planning]] +== {stepPlanning} + +image::SAPHanaSR-ScaleOut-Plan-Phase1.svg[scaledwidth="100%"] + +Planning the installation is essential for a successful {HANA} cluster setup. + +Before you start, you need the following: + +- Software from {SUSE}: {sles4sap} installation media, a valid subscription, and +access to update channels +- Software from {SAP}: {HANA} installation media +- Physical or virtual systems including disks +- Filled parameter sheet (see below <>) + + +=== Minimum lab requirements and prerequisites + +NOTE: The minimum lab requirements mentioned here are by no means {SAP} sizing +information. These data are provided only to rebuild the described cluster in a +lab for test purposes. +Even for tests the requirements can increase, depending on your test scenario. +For productive systems ask your hardware vendor or use the official {SAP} sizing +tools and services. + +NOTE: Refer to {HANA} TDI documentation for allowed storage +configuration and file systems. 
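+As a quick plausibility check of the prepared storage, you can, for example, list the block devices and the {HANA} file systems on both nodes. The mount points below are only the usual defaults and need to be adapted to your environment: + +[subs="quotes"] +---- +# lsblk +# df -h /hana/shared /hana/data /hana/log +----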
+ +Requirements with 1 {SAP} system replication instance per site (1 : 1) - without +a majority maker (2 node cluster): + +- 2 VMs with each 32GB RAM, 50GB disk space for the system +- 1 shared disk for SBD with 10 MB disk space +- 2 data disks (one per site) with a capacity of each 96GB for {HANA} +- 1 additional IP address for takeover +- 1 optional IP address for the read-enabled setup +- 1 optional IP address for HAWK Administration GUI + +Requirements with 1 SAP instance per site (1 : 1) - with a +majority maker (3 node cluster): + +- 2 VMs with each 32GB RAM, 50GB disk space for the system +- 1 VM with 2GB RAM, 50GB disk space for the system +- 2 data disks (one per site) with a capacity of each 96GB for {HANA} +- 1 additional IP address for takeover +- 1 optional IP address for the read-enabled setup +- 1 optional IP address for HAWK Administration GUI + +// pools size is calculated like: +// shared = 1 * MEM +// data = NODES * 1 * MEM +// log = NODES * 1/2 * MEM +// pool = shared + data + log + +=== Parameter sheet + +[[parameter-sheet]] +Even if the setup of the cluster organizing two {HANA} sites is quite simple, +the installation should be planned properly. You should have all needed parameters +like SID, IP addresses and much more in place. It is good practice to first fill +out the parameter sheet and then begin with the installation. + +.Parameter Sheet for Planning +[width="100%",cols="20%,20%,60%",options="header",] +|======================================================================= +|Parameter |Value |Role +|Node 1 ||Cluster node name and IP address. + +|Node 2 ||Cluster node name and IP address. + +|Site A ||Site name of the primary replicating {HANA} database + +|Site B ||Site name of the secondary replicating and the non-replicating {HANA} database + + +|SID ||SAP System Identifier + +|Instance Number ||Number of the {HANA} database. For system replication also Instance Number+1 is blocked. + +|Network mask || + +|vIP primary || Virtual IP address to be assigned to +the primary {HANA} site + +|vIP secondary || Virtual IP address to be assigned to +the read-enabled secondary {HANA} site (optional) + +|Storage | |Storage for HDB data and log files is connected “locally” (per node; not shared) + +|SBD ||STONITH device (two for production) or diskless SBD + +|HAWK Port |`7630` | + +|NTP Server ||Address or name of your time server +|======================================================================= + +.Parameter Sheet with Values used in this Document +[width="100%",cols="20%,20%,60%",options="header",] +|======================================================================= +|Parameter |Value |Role +|Node 1 |`{sapnode1}`, `{sapip1node1}` |Cluster node name and IP address. + +|Node 2 |`{sapnode2}`, `{sapip1node2}` |Cluster node name and IP address. + +|SID |`{sapsid}` |SAP System Identifier + +|Instance Number |`{sapino}` |Instance number of the {HANA} database. For +system replication also Instance Number+1 is blocked. 
+ +|Network mask |`255.255.255.0` | + +|vIP primary |`{sapip1srv1}` | + +|vIP secondary |`{sapip1srv2}` | (optional) + +|Storage | |Storage for HDB data and log files is connected “locally” +(per node; not shared) + +|SBD |`{sapsbd1}` |STONITH device (two for production) or diskless + +|HAWK Port |`7630` | + +|NTP Server |pool pool.ntp.org|Address or name of your time server +|======================================================================= + +[[cha.s4s.os-install]] +== {stepOS} + +image::SAPHanaSR-ScaleOut-Plan-Phase2.svg[scaledwidth="100%"] + +This section contains information you should consider during the installation of +the operating system. + +For the scope of this document, first {sles4sap} is installed and configured. +Then the {HANA} database including the system replication is set up. Finally +the automation with the cluster is set up and configured. + +=== Installing {sles4sap} + +Multiple installation guides already exist, for different purposes and with +different reasons to set up the server in a certain way. Below it is outlined +where this information can be found. In addition, you will find important details +you should consider to get a well-working system in place. + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-installing-the-base-operating-system +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento check +==== Installing the base operating system + +Depending on your infrastructure and the hardware used, you need to adapt the +installation. All supported installation methods and minimum requirement are +described in the _Deployment Guide_ for {SLES} ({deploymentGuide15}). +In case of automated installations you can find further information in the +_AutoYaST Guide_ ({autoYastGuide15}). The main installation guides for {sles4sap} +that fit all requirements for {HANA} are available from the SAP notes: + +// SUSE and SAP are kept literal here not by the reference, because its a quote of an external title +* {sapnote15} SUSE Linux Enterprise Server 15: Installation Note +* {sapnoteset15} SAP HANA DB: Recommended OS settings for SLES 15 / SLES for SAP Applications 15 + +// TODO PRIO2: refer to saptune + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-installing-additional-software +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento checks +==== Installing additional software +With {sles4sap}, {SUSE} delivers special resource agents for {HANA}. With the +pattern _sap-hana_, the old-style resource agent package SAPHanaSR is installed. +This package needs to be replaced by the new {saphanasr} package. +Follow the instructions +below on each node if you have installed the systems based on SAP note {sapnote15}. +The pattern _High Availability_ summarizes all tools recommended to be installed on +*all* nodes, including the _majority maker_. + +// TODO PRIO2: check pattern and package names/dependencies +.Installing additional software for the HA cluster +==== +. Install the `High Availability` pattern. Do this on all nodes. ++ +[subs="quotes,attributes"] +---- +{sapnode1}:~ # zypper in --type pattern ha_sles +---- + +. De-install the old-style package and install the new {saphanasr} resource agents. +Do this on all nodes. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~ # rpm -e --nodeps SAPHanaSR SAPHanaSR-doc +{sapnode1}:~ # zypper in SAPHanaSR-angi +---- + +==== + +NOTE: Do not replace the package SAPHanaSR by SAPHanaSR-angi in an already running cluster. 
+Upgrading from SAPHanaSR to {saphanasr} requires a certain procedure. See manual page +SAPHanaSR_upgrade_to_angi(7) for details. + +Installing the packages supportutils-plugin-ha-sap and ClusterTools2 is highly +recommended. The first helps collecting data for support requests, the second +simplifies common administrative tasks. + +For more information, see section _Installation and Basic Setup_ of the {uarr} +{sleha} guide. + + +// DO NOT CHANGE SECTION ID: refer to Trento check +[[cha.s4s.hana-install]] +== {stepHANA} + +image::SAPHanaSR-ScaleOut-Plan-Phase3.svg[scaledwidth="100%"] + +Even though this document focuses on the integration of an installed {HANA} with +system replication already set up into the Linux cluster, this chapter summarizes +the test environment. Always use the official documentation from {SAP} to install +{HANA} and to set up the system replication. + +This guide shows {HANA} and saphostagent with native systemd integration. +An example for legacy SystemV is outlined in the appendix +<>. + +.Procedure + +. Install the {HANA} databases. + +. Check if the {SAP} hostagent is installed on all cluster nodes. +If this SAP service is not installed, install it now. + +. Verify that both databases are up and running. + +=== Installing the {HANA} databases + +* Read the SAP Installation and Setup Manuals available at the SAP Marketplace. + +* Download the {HANA} Software from SAP Marketplace. + +* Install the {HANA} database as described in the {HANA} Server Installation Guide. +The {HANA} database client will be installed together with the server by default. + +=== Checking if the SAP hostagent is installed on all cluster nodes + +Check if the native `systemd`-enabled SAP hostagent and instance `sapstartsrv` +are installed on all cluster nodes. If not, install and enable them now. + +As Linux user _root_, use the command `systemctl` and `systemd-cgls` to check +the SAP hostagent and instance services: + +[subs="attributes,quotes"] +---- +# systemctl list-unit-files | grep sap +saphostagent.service enabled +sapinit.service generated +saprouter.service disabled +saptune.service enabled +---- +The mandatory `saphostagent` service is enabled. This is the installation default. +Some more {sap} related services might be enabled, for example the recommended `saptune`. + +[subs="attributes,quotes"] +---- +# systemctl list-unit-files | grep SAP +SAP{sapsid}_{sapino}.service enabled +---- +The instance service is indeed enabled, as required. 
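+If you want to inspect these services in more detail, for example to see whether they are currently active, you can additionally check their status (service names as shown above): + +[subs="attributes,quotes"] +---- +# systemctl status saphostagent.service +# systemctl status SAP{sapsid}_{sapino}.service +----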
+ +=== Verifying both databases are up and running + +[subs="attributes,quotes"] +---- +# systemd-cgls -u SAP.slice +Unit SAP.slice (/SAP.slice): +├─saphostagent.service +│ ├─2630 /usr/sap/hostctrl/exe/saphostexec pf=/usr/sap/hostctrl/exe/host_profile -systemd +│ ├─2671 /usr/sap/hostctrl/exe/sapstartsrv pf=/usr/sap/hostctrl/exe/host_profile -D +│ └─3591 /usr/sap/hostctrl/exe/saposcol -l -w60 pf=/usr/sap/hostctrl/exe/host_profile +└─SAP{sapsid}_{sapino}.service + ├─ 1257 hdbcompileserver + ├─ 1274 hdbpreprocessor + ├─ 1353 hdbindexserver -port 3{sapino}03 + ├─ 1356 hdbxsengine -port 3{sapino}07 + ├─ 2077 hdbwebdispatcher + ├─ 2300 hdbrsutil --start --port 3{sapino}03 --volume 3 --volumesuffix mnt00001/hdb00003.00003 --identifier 1644426276 + ├─28462 /usr/sap/{sapsid}/HDB{sapino}/exe/sapstartsrv pf=/usr/sap/{sapsid}/SYS/profile/{sapsid}_HDB{sapino}_{sapnode1} + ├─31314 sapstart pf=/usr/sap/{sapsid}/SYS/profile/{sapsid}_HDB{sapino}_{sapnode1} + ├─31372 /usr/sap/{sapsid}/HDB{sapino}/{sapnode1}/trace/hdb.sap{sapsid}_HDB{sapino} -d -nw -f /usr/sap/{sapsid}/HDB{sapino}/{sapnode1}/daemon.ini pf=/usr/sap/{sapsid}/SYS/profile/{sapsid}_HDB{sapino}_{sapnode1} + ├─31479 hdbnameserver + └─32201 hdbrsutil --start --port 3{sapino}01 --volume 1 --volumesuffix mnt00001/hdb00001 --identifier 1644426203 +---- +The SAP hostagent `saphostagent.service` and the instance´s `sapstartsrv` `SAP{sapsid}_{sapino}.service` +are running in the `SAP.slice`. +See also manual pages systemctl(8) and systemd-cgls(8) for details. + + +[[cha.s4s.hana-sys-replication]] +== {stepHSR} + +image::SAPHanaSR-ScaleOut-Plan-Phase4.svg[scaledwidth="100%"] + +For more information read the section _Setting Up System Replication_ of the +{HANA} Administration Guide. + +**Procedure** + +. Back up the primary database. +. Enable primary database. +. Register and start the secondary database. +. Verify the system replication. + +=== Backing up the primary database + +Back up the primary database as described in the {HANA} Administration Guide, +section _{HANA} Database Backup and Recovery_. We provide an example with SQL +commands. You need to adapt these backup commands to match your backup infrastructure. + +.Simple backup for the system database and all tenants with one single backup call +========================= +As user _{refsidadm}_ enter the following command: + +[subs="attributes,quotes"] +---- +~> hdbsql -i {sapino} -u SYSTEM -d SYSTEMDB \ + "BACKUP DATA FOR FULL SYSTEM USING FILE ('backup')" +---- + +You will get a command output similar to the following: + +---- +0 rows affected (overall time 15.352069 sec; server time 15.347745 sec) +---- +========================= + +.Simple backup for a single container (non MDC) database +========================= +Enter the following command as user _{refsidadm}_: + +[subs="specialchars,attributes"] +---- +~> hdbsql -i {refInst} -u \ + "BACKUP DATA USING FILE ('backup')" +---- +========================= + +IMPORTANT: Without a valid backup, you cannot bring {HANA} into a system +replication configuration. + +=== Enabling the primary node + +As Linux user _{refsidadm}_, enable the system replication at the primary node. You +need to define a site name (like {sapsite1}). This site name must be unique for +all {HANA} databases which are connected via system replication. This means the +secondary must have a different site name. The site names must not be changed +later when the cluster has been activated. + +NOTE: Do not use strings like "primary" and "secondary" as site names. 
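+Optionally, before enabling the system replication, you can verify that a data backup really exists, for example by querying the backup catalog of the system database as user _{refsidadm}_. Adapt user and credentials to your environment: + +[subs="attributes,quotes"] +---- +~> hdbsql -i {sapino} -u SYSTEM -d SYSTEMDB \ + "SELECT TOP 3 ENTRY_TYPE_NAME, STATE_NAME, SYS_START_TIME FROM M_BACKUP_CATALOG ORDER BY SYS_START_TIME DESC" +----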
+ +.Enable the Primary +========== +Enable the primary using the `-sr_enable` option. + +[subs="attributes,quotes"] +---- +{sapnode1}:~> hdbnsutil -sr_enable --name={sapsite1} +checking local nameserver: +checking for active nameserver ... +nameserver is running, proceeding ... +configuring ini files ... +successfully enabled system as primary site ... +done. +---- +========== + +.Check SR Configuration on the Primary +========== +// the command output is still valid as of HANA 2.0 SPS5 +Check the primary using the command `hdbnsutil -sr_stateConfiguration`. + +[subs="specialchars,attributes,quotes"] +---- +{sapnode1}:~> hdbnsutil -sr_stateConfiguration --sapcontrol=1 +SAPCONTROL-OK: +mode=primary +site id=1 +site name={sapsite1} +SAPCONTROL-OK: +done. +---- +========== + +The mode has changed from “none” to “primary”. The site now has a site name and a site ID. + +=== Registering the secondary node + +The {HANA} database instance on the secondary side must be stopped +before the instance can be registered for the system replication. You +can use your preferred method to stop the instance (like `HDB` or +`sapcontrol`). After the database instance has been stopped +successfully, you can register the instance using `hdbnsutil`. Again, +use the Linux user _{refsidadm}_: + +.Stop the Secondary +========== +To stop the secondary, you can use the command line tool _HDB_. + +[subs="attributes,quotes"] +---- +{sapnode2}:~> HDB stop +---- +========== + +.Copy the KEY and KEY-DATA file from the primary to the secondary site +========== +Beginning with {HANA} 2.0, the system replication is running encrypted. +The key files need to be copied-over from the primary to the secondary site. + +[subs="specialchars,attributes,quotes"] +---- +~> cd /usr/sap/{refSID}/SYS/global/security/rsecssfs +~> rsync -va {,:}$PWD/data/SSFS_{refSID}.DAT +~> rsync -va {,:}$PWD/key/SSFS_{refSID}.KEY +---- +========== + +.Register the Secondary +========== +The registration of the secondary is triggered by calling _hdbnsutil -sr_register ..._. + +[subs="attributes,quotes"] +---- +... +{sapnode2}:~> hdbnsutil -sr_register --name={sapsite2} \ + --remoteHost={sapnode1} --remoteInstance={sapino} \ + --replicationMode=sync --operationMode=logreplay +adding site ... +checking for inactive nameserver ... +nameserver {sapnode2}:30001 not responding. +collecting information ... +updating local ini files ... +done. +---- +========== + +The _remoteHost_ is the primary node in our case, the _remoteInstance_ is the +database instance number (here {sapino}). + +Now start the database instance again and verify the system replication status. +On the secondary node, the mode should be one of "SYNC" or "SYNCMEM". "ASYNC" is +*not supported with automated cluster takeover*. The mode depends on the +*replicationMode* option defined during the registration of the secondary. + +.Start Secondary and Check SR Configuration +========== +To start the new secondary, use the command line tool _HDB_. Then check the +SR configuration using `hdbnsutil -sr_stateConfiguration`. + +[subs="specialchars,attributes,quotes"] +---- +{sapnode2}:~> HDB start +... +{sapnode2}:~> hdbnsutil -sr_stateConfiguration --sapcontrol=1 +SAPCONTROL-OK: +mode=sync +site id=2 +site name={sapsite2} +active primary site=1 +primary masters={sapnode1} +SAPCONTROL-OK: +done. 
+---- +========== + +To view the replication state of the whole {HANA} cluster, use the +following command as _{refsidadm}_ user on the primary node: + +.Checking System Replication Status Details +========== +The python script _systemReplicationStatus.py_ provides details about the current +system replication. + +[subs="attributes,quotes"] +---- +{sapnode1}:~> HDBSettings.sh systemReplicationStatus.py --sapcontrol=1 +... +site/2/SITE_NAME={sapsite2} +site/2/SOURCE_SITE_ID=1 +site/2/REPLICATION_MODE=SYNC +site/2/REPLICATION_STATUS=ACTIVE +overall_replication_status=ACTIVE +site/1/REPLICATION_MODE=PRIMARY +site/1/SITE_NAME={sapsite1} +local_site_id=1 +... +---- +========== + +=== Manually testing the {HANA} SR takeover +[[cha.s4s.testtakeover]] + +Before you integrate your {HANA} system replication into the HA cluster, it is +mandatory to do a manual takeover. Testing without the cluster helps to make +sure that basic operation (takeover and registration) is working as expected. + +* Stop {HANA} on node 1 + +* Takeover {HANA} to node 2 + +* Register node 1 as secondary + +* Start {HANA} on node 1 + +* Wait until sync state is active + +=== Optional: Manually re-establishing {HANA} SR to original state + +Bring the systems back to the original state: + +* Stop {HANA} on node 2 + +* Take over {HANA} to node 1 + +* Register node 2 as secondary + +* Start {HANA} on node2 + +* Wait until sync state is active + +// DO NOT CHANGE SECTION ID: refer to Trento check +[[cha.s4s.hana-hook]] +== {stepHook} + +image::SAPHanaSR-ScaleOut-Plan-Phase5.svg[scaledwidth="100%"] + +This step is mandatory to inform the cluster immediately if the secondary gets +out of sync. The hook is called by {HANA} using the HA/DR provider interface +in point-of-time when the secondary gets out of sync. This is typically the case +when the first commit pending is released. The hook is called by {HANA} again +when the system replication is back. This HA/DR provider method is +`srConnectionChanged()`, the related {SUSE} hook script is `susHanaSR.py`. +The hook script susHanaSR.py is defacto mandatory. + +Another hook is called by {HANA} before an SR takeover is processed. This +method can be used to block a manual takeover during normal cluster operation. +This HA/DR provider method is `preTakeover()`, the related {SUSE} hook script is +`susTkOver.py`. + +A third hook is called by {HANA} when a service status changes. This method +can be used to speed up the takeover in case the indexserver process fails. +This HA/DR provider method is `srServiceStateChanged()`, the related {SUSE} hook +script is `susChkSrv.py`. + +**Procedure** + +. Implement the python hook script susHanaSR.py on both sites. +. Implement the python hook script susTkOver.py on both sites. +. Implement the python hook script susChkSrv.py on both sites. +. Configure the system replication operation mode. +. Allow _{refsidadm}_ to access the cluster. +. Start {HANA}. +. Test the hook integration. + +// TODO PRIO2: Steps "Start" and "Test" are incomplete + +This will implement three {HANA} HA/DR provider hook scripts. +The hook script susHanaSR.py is needs no config parameters. +The configuration for susTkOver.py normally does not need to be adapted. +The default for parameter sustkover_timeout is set to 30 seconds and is good +for most environments. +The configuration shown for susChkSrv.py is a good starting point. Any tuning +should be aligned with the SAP experts. + +NOTE: All hook scripts should be used directly from the SAPHanaSR package. 
+If the scripts are moved or copied, regular {SUSE} package updates will not work. + +{HANA} must be stopped to change the global.ini and allow {HANA} to integrate +the HA/DR hook scripts during start. Alternatively, `SAPHanaSR-manageProvider` +might be used for adapting the global.ini. See manual page SAPHanaSR-manageProvider(8) +for details. + + +=== Implementing susHanaSR hook for srConnectionChanged + +Use the hook from the {saphanasr} package /usr/share/SAPHanaSR-angi/susHanaSR.py. +The hook must be configured on all {HANA} cluster nodes. +In global.ini, the section `[ha_dr_provider_sushanasr]` needs to be created. +The section `[trace]` might be adapted. +Refer to the manual page susHanaSR.py(7) for details on this HA/DR provider hook +script, see also SAPHanaSR-manageProvider(8). + +.Stop {HANA} +========== +Stop {HANA} either with _HDB_ or using _sapcontrol_. + +[subs="specialchars,attributes"] +---- +~> sapcontrol -nr {refInst} -function StopSystem +---- +========== + +.Adding SAPHanaSR via global.ini +========== +Best is to use the {HANA} tools for changing global.ini. Alternatively you may use +SAPHanaSR-manageProvider, see manual page SAPHanaSR-manageProvider(8). + +---- +[ha_dr_provider_sushanasr] +provider = susHanaSR +path = /usr/share/SAPHanaSR-angi/ +execution_order = 1 + +[trace] +ha_dr_sushanasr = info +---- +========== + + +=== Implementing susTkOver hook for preTakeover + +Use the hook from the {saphanasr} package /usr/share/SAPHanaSR-angi/susTkOver.py. +The hook must be configured on all {HANA} cluster nodes. +In global.ini, the section `[ha_dr_provider_sustkover]` needs to be created. +The section `[trace]` might be adapted. +Refer to the manual page susTkOver.py(7) for details on this HA/DR provider hook +script, see also SAPHanaSR-manageProvider(8). + +.Stop {HANA} +========== +Stop {HANA} either with _HDB_ or using _sapcontrol_. + +[subs="specialchars,attributes"] +---- +{sapssid}adm@{sapnode2}:/usr/sap/{sapsid}/HDB{sapino}> sapcontrol -nr {refInst} -function StopSystem +---- +========== + +.Adding susTkOver via global.ini +========== +Best is to use the {HANA} tools for changing global.ini. Alternatively you may use +SAPHanaSR-manageProvider, see manual page SAPHanaSR-manageProvider(8). + +---- +[ha_dr_provider_sustkover] +provider = susTkOver +path = /usr/share/SAPHanaSR-angi/ +execution_order = 2 + +[trace] +ha_dr_sustkover = info +... +---- +========== + +=== Implementing susChkSrv hook for srServiceStateChanged + +Use the hook from the {saphanasr} package /usr/share/SAPHanaSR-angi/susChkSrv.py. +The hook must be configured on all {HANA} cluster nodes. +In global.ini, the section `[ha_dr_provider_suschksrv]` needs to be created. +The section `[trace]` might be adapted. +Refer to the manual page susChkSrv.py(7) for details on this HA/DR provider hook +script, see also SAPHanaSR-manageProvider(8). + +.Stop {HANA} +========== +Stop {HANA} either with _HDB_ or using _sapcontrol_. + +[subs="specialchars,attributes"] +---- +{sapssid}adm@{sapnode2}:/usr/sap/{sapsid}/HDB{sapino}> sapcontrol -nr {refInst} -function StopSystem +---- +========== + +.Adding susChkSrv via global.ini +========== +Best is to use the {HANA} tools for changing global.ini. Alternatively you may use +SAPHanaSR-manageProvider, see manual page SAPHanaSR-manageProvider(8). + +---- +[ha_dr_provider_suschksrv] +provider = susChkSrv +path = /usr/share/SAPHanaSR-angi/ +execution_order = 3 +action_on_lost=stop + +[trace] +ha_dr_suschksrv = info +... 
+---- +========== + + +=== Configuring system replication operation mode + +When your system is connected as {HANA} system replication target, you can find an +entry in the _global.ini_ +which defines the operation mode. Up to now there are the following modes available: + +* _delta_datashipping_ +* _logreplay_ +* _logreplay_readaccess_ + +Until a takeover and re-registration in the opposite direction, the entry for the +operation mode is missing on your primary site. The first operation mode which was +available was _delta_datashipping_. Today the preferred modes for HA are _logreplay_ +or _logreplay_readaccess_. Using the operation mode _logreplay_ makes your secondary +site in the {HANA} system replication a hot standby system. +For more details regarding all operation modes, check the available SAP documentation +such as "How To Perform System Replication for SAP HANA ". + +.Checking the Operation Mode +========== +Check both _global.ini_ files and add the operation mode if needed. Check the section +´system_replication´ for entry ´operation_mode = logreplay´. + +Path for the _global.ini_: /hana/shared//global/hdb/custom/config/ +---- +[system_replication] +operation_mode = logreplay +---- +========== + +=== Allowing {refsidadm} to access the cluster + +The current version of the susHanaSR python hook uses the command `sudo` to allow +the _{refsidadm}_ user to access the cluster attributes. + +//In Linux you can use `visudo` +//to start the vi editor for the _/etc/sudoers_ configuration file. + +The user _{refsidadm}_ must be able to set the cluster attributes +hana_{refsidLC}_site_srHook_*. The {HANA} system replication hook needs +password free access. The following example limits the sudo access to exactly +setting the needed attribute. +The entries can be added to a new file _/etc/sudoers.d/SAPHanaSR_ so that the +original _/etc/sudoers_ file does not need to be edited. +See manual page sudoers(5) for details. + +Replace the {refsidLC} by the *lowercase* SAP system ID (like `{sapssid}`). + +.Entry in sudo permissions /etc/sudoers.d/SAPHanaSR file +========== +Basic sudoers entry to allow {refsidadm} to use the hooks SAPHanaSR and susTkOver. + +// command to be allowed used in the hook: +// "sudo /usr/sbin/crm_attribute -n hana_%s_site_srHook_%s -v %s -t crm_config -s SAPHanaSR" +// "sudo /usr/bin/SAPHanaSR-hookHelper *" + +[subs="specialchars,attributes"] +---- +# SAPHanaSR-ScaleUp entries for writing srHook cluster attribute and SAPHanaSR-hookHelper +{refsidadm} ALL=(ALL) NOPASSWD: /usr/sbin/crm_attribute -n hana_{refsidLC}_* +{refsidadm} ALL=(ALL) NOPASSWD: /usr/bin/SAPHanaSR-hookHelper --sid={refSID} * +---- + +More specific sudoers entries to meet a high security level. + +All Cmnd_Alias entries must be each defined as a single line entry. +In our example, we have five separate lines with Cmnd_Alias entries and one line +for the _{refsidadm}_ user permitting the Cmnd_Aliases. In the document at +hand, however, the separate lines of the example might include a line-break +forced by document formatting. +The alias identifier (for example SOK_SITEA) needs to be in capitals. + +Replace the {refsidLC} by the *lowercase* SAP system ID (like `{sapssid}`). +Replace the {refsid} by the *uppercase* SAP system ID. 
+ +[subs="specialchars,attributes"] +---- +# SAPHanaSR-ScaleUp entries for writing srHook cluster attribute +Cmnd_Alias SOK_SITEA = /usr/bin/crm_attribute -n hana_{refsidLC}_site_srHook_{refSiteA} -v SOK -t crm_config -s SAPHanaSR +Cmnd_Alias SFAIL_SITEA = /usr/bin/crm_attribute -n hana_{refsidLC}_site_srHook_{refSiteA} -v SFAIL -t crm_config -s SAPHanaSR +Cmnd_Alias SOK_SITEB = /usr/bin/crm_attribute -n hana_{refsidLC}_site_srHook_{refSiteB} -v SOK -t crm_config -s SAPHanaSR +Cmnd_Alias SFAIL_SITEB = /usr/bin/crm_attribute -n hana_{refsidLC}_site_srHook_{refSiteB} -v SFAIL -t crm_config -s SAPHanaSR +Cmnd_Alias HOOK_HELPER = /usr/bin/SAPHanaSR-hookHelper --sid={refsid} --case=checkTakeover + +{refsidadm} ALL=(ALL) NOPASSWD: SOK_SITEA, SFAIL_SITEA, SOK_SITEB, SFAIL_SITEB, HOOK_HELPER +---- +========== + + +=== Starting {HANA} + +Start {HANA} as user {refsidadm}. + +[subs="specialchars,attributes,quotes"] +---- +~> HDB start +---- + +=== Testing the hook integration + +==== Check the load of the hook scripts + +As user {refsidadm} check the {HANA} tracefiles, if the HA/DR provider scripts are +loaded and called successfully during an {HANA} event. + +Check for script susHanaSR.py, if it is loaded and initialized correctly: + +---- + ~> cdtrace + ~> grep HADR.*load.*susHanaSR nameserver_*.trc + ~> grep susHanaSR.init nameserver_*.trc +---- + +Check for script susTkOver.py, if it is loaded and initialized correctly: + +---- + ~> cdtrace + ~> grep HADR.*load.*susTkOver nameserver_*.trc + ~> grep susTkOver.init nameserver_*.trc +---- + +Check for script susChkSrv.py, if it is loaded and initialized correctly: + +---- + ~> cdtrace + ~> grep HADR.*load.*susChkSrv nameserver_*.trc + ~> grep susChkSrv.init nameserver_*.trc + ~> egrep '(LOST:|STOP:|START:|DOWN:|init|load|fail)' nameserver_suschksrv.trc +---- + + +==== Check an srConnectionChanged event + +After an event has been processed by the HA/DR provider script, check for the correct behaviour. +// TODO PRIO2: point to test procedure + +As user {refsidadm} check the {HANA} tracefiles, if susHanaSR.py did successfully +interact with the cluster or created a fallback file. + +---- + ~> cdtrace + ~> grep susHanaSR.srConnection.*CRM nameserver_*.trc + ~> grep susHanaSR.srConnection.*fallback nameserver_*.trc +---- + +==== Check a preTakeover event + +After an event has been processed by the HA/DR provider script, check for the correct behaviour. + +To test script susTkOver.py with stopped cluster, procedure <> +can be used here again. While the cluster is not set up at this moment the takeover +will not be blocked. + +As user root check the system messages, if the sudo permissions for susTkOver.py +calling SAPHanaSR-hookHelper are set successfully. + +[subs="specialchars,attributes,quotes"] +---- +# grep "sudo.*SAPHanaSR-hookHelper" /var/log/messages +---- + +As user {refsidadm} check the {HANA} tracefiles, if susTkOver.py did successfully +blocked or permitted the takeover request. +First check for permitted takeover requests, then check for blocked takeover requests. + +---- +~> cdtrace +~> grep susTkOver.preTakeover.*permit nameserver_*.trc +~> grep susTkOver.preTakeover.*failed.*50277 nameserver_*.trc +---- + +[[cha.s4s.configure-cluster]] +== {stepCluster} + +image::SAPHanaSR-ScaleOut-Plan-Phase6.svg[scaledwidth="100%"] + +This chapter describes the configuration of the cluster software {sleha}, which +is part of {sles4sap}, and the {saphana} database integration. + +.Actions +. Basic cluster configuration + +. 
Configuration of cluster properties and resources
+
+. Testing the HA/DR provider hook integration
+
+// TODO PRIO2: Describe the hook test (for example restart secondary) to trigger the write of cluster attributes
+
+=== Configuring the basic cluster
+
+The first step is to set up the basic cluster framework. For convenience, use YaST
+or the {slehainit} script. It is strongly recommended to add a second corosync ring,
+change it to UCAST communication and adjust the timeout values to fit your environment.
+
+==== Setting up watchdog for "storage-based fencing"
+
+// TODO PRIO2: Should we add some more information here?
+
+If you use the storage-based fencing (SBD) mechanism (diskless or disk-based),
+you must also configure a watchdog. The watchdog is needed to reset a node if
+the system can no longer access the SBD (diskless or disk-based).
+It is mandatory to configure the Linux system for loading a watchdog driver.
+It is strongly recommended to use a watchdog with hardware assistance (as is
+available on most modern systems), such as hpwdt, iTCO_wdt, or others. As a fallback,
+you can use the softdog module.
+
+.Setup for Watchdog
+====
+IMPORTANT: Access to the watchdog timer:
+No other software must access the watchdog timer; it can only be accessed by one
+process at any time. Some hardware vendors ship systems management software that
+uses the watchdog for system resets (for example the HP ASR daemon). Such software must
+be disabled if the watchdog is to be used by SBD.
+
+Determine the right watchdog module. Alternatively, you can find a list of installed
+drivers with your kernel version.
+
+----
+# ls -l /lib/modules/$(uname -r)/kernel/drivers/watchdog
+----
+
+Check if any watchdog module is already loaded.
+
+----
+# lsmod | egrep "(wd|dog|i6|iT|ibm)"
+----
+
+If you get a result, the system already has a loaded watchdog module. If it
+does not match your watchdog device, you need to unload the module.
+
+To safely unload the module, first check whether an application is using the watchdog device.
+
+----
+# lsof /dev/watchdog
+# rmmod
+----
+
+Enable your watchdog module and make it persistent. For the example below, _softdog_
+has been used. However, _softdog_ has some restrictions and should not be used as
+the first option.
+
+----
+# echo softdog > /etc/modules-load.d/watchdog.conf
+# systemctl restart systemd-modules-load
+----
+
+Check if the watchdog module is loaded correctly.
+
+----
+# lsmod | grep dog
+# ls -l /dev/watchdog
+----
+
+Testing the watchdog can be done with a simple action. Make sure to shut down your {HANA}
+first, because the watchdog will force an unclean reset or shutdown of your system.
+
+If a hardware watchdog is used, a predefined action is triggered once the watchdog
+timeout has been reached. If your watchdog module is loaded and not controlled by any other application, do the following:
+
+// TODO PRIO2: It seems a command is missing here??
+
+IMPORTANT: Triggering the watchdog without continuously updating the watchdog
+resets/switches off the system. This is the intended mechanism. The following
+commands will force your system to be reset/switched off.
+
+In case the softdog module is used, the following action can be performed:
+
+----
+# sync; cat /dev/watchdog & while date; do sleep 10; done
+----
+// # touch /dev/watchdog
+// # echo 1 > /dev/watchdog
+
+After your test was successful, you must implement the watchdog on all cluster members.
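+
+Optionally, you can double-check which watchdog driver is active and which timeout it
+uses with the util-linux tool `wdctl`. This is a minimal, optional check; note that the
+device information may not be fully readable while another process, such as SBD, holds
+the watchdog exclusively.
+
+----
+# wdctl /dev/watchdog
+----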
+ +==== + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-setting-up-the-initial-cluster-using-ha-cluster-init +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento check +==== Setting up the initial cluster using `{slehainit}` + +For more detailed information about setting up a cluster, refer to the sections +_Setting Up the First Node_ and _Adding the Second Node_ of the {dochaquickstart} +for {sleha} {pn15} {psp15} at {haquickstart15}. + +This setup uses unicast (UCAST) for corosync communication (_-u_ option). Refer +to the {haAdminGuide15} on detailed explanations of the terms unicast/multicast. + +Create an initial setup, using the `{slehainit}` command, and follow the +dialogs. Do this only on the first cluster node. Answer "no" to "Do you wish to +configure a virtual IP address" and to "Do you want to configure QDevice". + +To use two corosync rings make sure you have two interfaces configured and run: + +[subs="specialchars,attributes,quotes"] +---- +{sapnode1}:~ # {slehainit} -u -M -s {sapsbd1} -s {sapsbd2} +---- + +To use only one corosync ring leave out the _-M_ option (not recommended): + +[subs="specialchars,attributes,quotes"] +---- +{sapnode1}:~ # {slehainit} -u -s {sapsbd1} -s {sapsbd2} +---- + +// TODO PRIO2: for disk less SBD: {slehainit} -u -S + +This command configures the basic cluster framework including: + +* SSH keys +* csync2 to transfer configuration files +* SBD (at least one device, in this guide two) +* corosync (at least one ring, better two rings) +* HAWK Web interface + +IMPORTANT: As requested by `{slehainit}`, change the password of the user hacluster. + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-checking-and-adapting-the-corosync-and-sbd-configuration +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento checks +==== Checking and adapting the corosync and SBD configuration + +//It is recommended to add a second corosync ring. If you did not start {slehainit} +//with the _-u_ option, you need to change corosync to use UCAST communication. +//To change to UCAST, stop the already running cluster by using `{clusterstop}`. +//After the setup of the corosync configuration and the SBD parameters, start the cluster again. +===== Checking the corosync configuration + +Check the following blocks in the file _/etc/corosync/corosync.conf_. +The important parts are _udpu_ and the correct ring/IP configuration. + +See also the example at the end of this document and refer to the manual pages +corosync.conf(5), votequorum(5) and corosync_overview(8) for details on parameters +and features. + +[subs="attributes,quotes"] +---- +totem { + ... + + interface { + ringnumber: 0 + mcastport: 5405 + ttl: 1 + } + + interface { + ringnumber: 1 + mcastport: 5407 + ttl: 1 + } + + rrp_mode: passive + transport: udpu + + ... + +} + + ... + +nodelist { + node { + ring0_addr: {sapIp1Node1} + ring1_addr: {sapIp2Node1} + nodeid: 1 + } + + node { + ring0_addr: {sapIp1Node2} + ring1_addr: {sapIp2Node2} + nodeid: 2 + } +} + ... +---- + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-adapting-the-sbd-configuration +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento checks +===== Adapting the SBD configuration + +// TODO PRIO2: Should we add some information about SBD timing tunables? + +You can skip this section if you do not have any SBD devices, but be sure to +implement another supported fencing mechanism. 
+ +See the manual pages sbd(8) and stonith_sbd(7) for details. + +.SBD Options in File /etc/sysconfig/SBD +[cols=",",options="header",] +|======================================================================= +|Parameter |Description +|SBD_WATCHDOG_DEV| +Define the watchdog device. It is mandatory to use a watchdog. SBD does not work +reliable without watchdog. Refer to the {slsa} manual and SUSE +TID 7016880 for setting up a watchdog. + +|SBD_WATCHDOG_TIMEOUT a| +This parameter is used with diskless SBD. +It defines the timeout, in seconds, the watchdog will wait before panicking the +node if noone tickles it. +If you set CIB parameter stonith-watchdog-timeout to a negative value, +Pacemaker will automatically calculate this timeout and set it to twice the +value of SBD_WATCHDOG_TIMEOUT starting with {sleha} 15. + +|SBD_STARTMODE| +Start mode. If set to `clean`, sbd will only start if the node was +previously shut down cleanly or if the slot is empty. + +|SBD_PACEMAKER| +Check Pacemaker quorum and node health. + +|======================================================================= + +In the following, replace {sapsbd1} and {sapsbd2} by your real sbd +device names. As an example, the `SBD_WATCHDOG_TIMEOUT` is set to 20s to be less +aggressive than the formerly used 5s. + +[subs="attributes,quotes"] +---- +# egrep -v "(\^#|^$)" /etc/sysconfig/sbd +SBD_PACEMAKER=yes +SBD_STARTMODE="clean" +SBD_WATCHDOG_DEV="/dev/watchdog" +SBD_WATCHDOG_TIMEOUT="20" +SBD_TIMEOUT_ACTION="flush,reboot" +SBD_MOVE_TO_ROOT_CGROUP="auto" +SBD_OPTS="" +SBD_DEVICE="{sapsbd1};{sapsbd2}" +---- + +IMPORTANT: Also read the SUSE product documentation about calculation of timeouts for more details: +https://documentation.suse.com/sle-ha/15-SP1/single-html/SLE-HA-guide/#sec-ha-storage-protect-watchdog-timings + + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-verifying-the-sbd-device +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento check +===== Verifying the SBD device + +You can skip this section if you do not have any SBD devices, but make sure to +implement a supported fencing mechanism. + +It is a good practice to check if the SBD device can be accessed from both nodes +and does contain valid records. Check this for all devices configured in +_/etc/sysconfig/sbd_. You can do so, for example, by calling `cs_show_sbd_devices`. + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # sbd -d {sapsbd1} -d {sapsbd2} dump +==Dumping header on disk {sapsbd1} +Header version : 2.1 +UUID : 0f4ea13e-fab8-4147-b9b2-3cdcfff07f86 +Number of slots : 255 +Sector size : 512 +Timeout (watchdog) : 20 +Timeout (allocate) : 2 +Timeout (loop) : 1 +Timeout (msgwait) : 40 +==Header on disk {sapsbd1} is dumped +==Dumping header on disk {sapsbd2} +Header version : 2.1 +UUID : 23c423df-675d-4937-a48c-5eb869fe0bb7 +Number of slots : 255 +Sector size : 512 +Timeout (watchdog) : 20 +Timeout (allocate) : 2 +Timeout (loop) : 1 +Timeout (msgwait) : 40 +==Header on disk {sapsbd2} is dumped +---- + +// TODO PRIO3: Check if we need to explain the timeout parameters of the sbd device (watchdog, allocate, loop, msgwait) + +IMPORTANT: The timeout values in our example are only start values. +It is a requirement that they are tuned to your environment. Refer to the +TIDs 7011346 and 7023689 for more information. + +To check the current SBD entries for the various cluster nodes, you can +use `sbd list`. If all entries are `clear`, no fencing task is marked in +the SBD device. 
+ +[subs="attributes,quotes"] +---- +{sapnode1}:~ # sbd -d {sapsbd1} -d {sapsbd2} list +0 {sapnode1} clear +0 {sapnode1} clear +---- + +For more information on SBD configuration parameters, read the +section _Storage-based Fencing_, {uarr}SUSE Linux Enterprise High +Availability Extension and TIDs 7016880 and 7008216. + +Now it is time to restart the cluster at the first node again (`{clusterstart}`). + +==== Configuring the cluster on the second node + +The second node of the two nodes cluster can be integrated by starting the command +`{slehajoin}`. This command asks for the IP address or name of the first cluster +node. With this command, all needed configuration files are copied over. As a result, +the cluster is started on both nodes. + +[subs="specialchars,attributes,quotes"] +---- +# {slehajoin} -c {refHost1} +---- + +Press _RETURN_ to acknowledge the IP address. + +==== Checking the cluster for the first time + +Now it is time to check and optionally start the cluster for the first +time on both nodes. + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # {clusterstatus} +{sapnode1}:~ # {sbdstatus} +{sapnode2}:~ # {clusterstatus} +{sapnode2}:~ # {sbdstatus} +{sapnode1}:~ # {clusterstart} +{sapnode2}:~ # {clusterstart} +---- + +Check the cluster status with crm_mon. We use the option "-r" to also +see resources, which are configured but stopped. + +[subs="attributes,quotes"] +---- +# crm_mon -r1 +---- + +The command will show the "empty" cluster and will print something similar to the +screen output below. The most interesting pieces of information for now are that +there are two nodes in the status "online", the message "partition with quorum", +and a running SBD resource. + +//---- +//Stack: corosync +//Current DC: {sapnode1} (version 1.1.19+20180928.0d2680780-1.8-1.1.19+20180928.0d2680780) - partition with quorum +//Last updated: Fri Nov 29 15:41:16 2019 +//Last change: Fri Nov 29 15:40:22 2019 by root via crm_attribute on {sapnode2} +//2 nodes configured +//1 resource configured +//Online: [ {sapnode1} {sapnode2} ] +//Full list of resources: +// stonith-sbd (stonith:external/sbd): Started {sapnode1} +//---- +[subs="attributes,quotes"] +---- +Cluster Summary: + * Stack: corosync + * Current DC: {sapnode1} (version 2.0.5+20201202.ba59be712-150300.4.16.1-2.0.5+20201202.ba59be712) - partition with quorum + * Last updated: Thu Jun 10 08:32:58 2022 + * Last change: Thu Jun 10 08:29:41 2022 by hacluster via crmd on {sapnode1} + * 2 nodes configured + * 1 resource instance configured + +Node List: + * Online: [ {sapnode1} {sapnode2} ] + +Full List of Resources: + * stonith-sbd (stonith:external/sbd): Started {sapnode1} +---- + +=== Configuring cluster properties and resources + +This section describes how to configure constraints, resources, bootstrap, and STONITH, +using the `crm configure` shell command as described in section _Configuring and Managing Cluster Resources (Command Line)_ +of the {uarr}SUSE Linux Enterprise High Availability Extension documentation. + +Use the command `crm` to add the objects to the cluster information base (CIB). 
Copy the following +examples to a local file, edit the file and then load the configuration to the CIB: + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # vi crm-fileXX +{sapnode1}:~ # crm configure load update crm-fileXX +---- + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-cluster-bootstrap-and-more +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento checks +==== Cluster bootstrap and more + +The first example defines the cluster bootstrap options, the resource +and operation defaults. The stonith-timeout should be greater than 1.2 +times the SBD on-disk msgwait timeout. The priority-fencing-delay should be +at least 2 times the SBD CIB pcmk_delay_max. + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # vi crm-bs.txt +# enter the following to crm-bs.txt +property cib-bootstrap-options: \ + stonith-enabled="true" \ + stonith-action="reboot" \ + stonith-timeout="150" \ + priority-fencing-delay="30" +rsc_defaults rsc-options: \ + resource-stickiness="1000" \ + migration-threshold="5000" +op_defaults op-options: \ + timeout="600" \ + record-pending=true +---- + +Now add the configuration to the cluster. + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # crm configure load update crm-bs.txt +---- + +==== STONITH device + +Skip this section if you are using diskless SBD. + +The next configuration part defines an SBD disk STONITH resource. + +[subs="attributes,quotes"] +---- +# vi crm-sbd.txt +# enter the following to crm-sbd.txt +primitive stonith-sbd stonith:external/sbd \ + params pcmk_delay_max="15" +---- + +Again we add the configuration to the cluster. + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # crm configure load update crm-sbd.txt +---- + +For fencing with IPMI/ILO see section <>. + +==== Using IPMI as fencing mechanism + +For details about IPMI/ILO fencing see our cluster product documentation ({haAdminGuide15}). +An example for an IPMI STONITH resource can be found in section <> +of this document. + +To use IPMI, the remote management boards must be compatible with the IPMI standard. + +For the IPMI-based fencing, configure a primitive per-cluster +node. Each resource is responsible to fence exactly one cluster node. +Adapt the IP addresses and login user / password of the +remote management boards to the STONITH resource agent. We recommend to +create a special STONITH user instead of providing root access to the +management board. Location rules must guarantee that a host should never +run its own STONITH resource. + +==== Using other fencing mechanisms + +This section is only relevant if the recommended disk-based or diskless SBD fencing is not used. + +We recommend to use SBD (best practice) or IPMI (second choice) as STONITH mechanism. The {sleha} product +also supports additional fencing mechanism not covered here. + +For further information about fencing, read the Administration Guide for SUSE Linux Enterprise High +Availability Extension at {haAdminGuide15}. For public cloud environements, refer to your cloud provider's documentation on supported fencing mechanisms. + +==== SAPHanaTopology + +This step is to define the resources needed, to analyze the {HANA} topology for the replicated pair. 
+Prepare the changes in a text file, for example _crm-saphanatop.txt_, and load it with the command: + +`crm configure load update crm-saphanatop.txt` + +// TODO PRIO3: Discuss naming scheme SAPHanaTop SAPHanaCon vs old scheme + +[subs="attributes,quotes"] +---- +# vi crm-saphanatop.txt +# enter the following to crm-saphanatop.txt +primitive rsc_SAPHanaTop_{sapsid}_HDB{sapino} ocf:suse:SAPHanaTopology \ + op start interval=0 timeout=600 \ + op stop interval=0 timeout=300 \ + op monitor interval=50 timeout=600 \ + params SID={sapsid} InstanceNumber={sapino} +clone cln_SAPHanaTop_{sapsid}_HDB{sapino} rsc_SAPHanaTop_{sapsid}_HDB{sapino} \ + meta clone-node-max=1 interleave=true +---- + +Additional information about all parameters can be found with the command: + +`man ocf_suse_SAPHanaTopology` + +Again, add the configuration to the cluster. + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # crm configure load update crm-saphanatop.txt +---- + +The most important parameters here are SID and InstanceNumber, which are +quite self explaining in the SAP context. Beside these parameters, typical +tuneables are the timeout values or the operations (start, monitor, stop). + +==== SAPHanaController + +This step is to define the resource needed, to control the replicated {HANA} pair. +Edit the changes in a text file, for example _crm-saphanacon.txt_, and load it +with the following command: + +`crm configure load update crm-saphanacon.txt` + +.Typical Resource Agent parameter settings for different scenarios +[width="99%",cols="52%,16%,16%,16%",options="header",] +|============================================================ +|Parameter |Performance Optimized |Cost Optimized |Multi-Tier +|PREFER_SITE_TAKEOVER |true |false |false / true +|AUTOMATED_REGISTER |false / true |false / true |false +|DUPLICATE_PRIMARY_TIMEOUT |7200 |7200 |7200 +|============================================================ + +// TODO PRIO1: Check if all parameters in special DUPLICATE_PRIMARY_TIMEOUT +// are explained well + +.Description of important Resource Agent parameters +[width="100%",cols="42%,58%",options="header",] +|======================================================================= +|Parameter |Description +|PREFER_SITE_TAKEOVER |Defines whether RA should prefer to take over to +the secondary instance instead of restarting the failed primary locally. + +|AUTOMATED_REGISTER a| +Defines whether a former primary should be automatically registered to +be secondary of the new primary. With this parameter you can adapt the +level of system replication automation. + +If set to `false`, the former primary must be manually registered. The +cluster will not start this SAP HANA RDBMS until it is registered, to avoid +double primary up situations. + +|DUPLICATE_PRIMARY_TIMEOUT |Time difference needed between two primary +time stamps if a dual-primary situation occurs. If the time difference +is less than the time gap, the cluster holds one or both instances +in a "WAITING" status. This is to give an administrator the chance to react on a +failover. If the complete node of the former primary crashed, the former +primary will be registered after the time difference is passed. If +"only" the SAP HANA RDBMS has crashed, the former primary will be +registered immediately. After this registration to the new primary, all +data will be overwritten by the system replication. 
+|=======================================================================
+
+Additional information about all parameters of the SAPHanaController RA can be
+found with the following command:
+
+`man ocf_suse_SAPHanaController`
+
+[subs="attributes,quotes"]
+----
+# vi crm-saphanacon.txt
+# enter the following to crm-saphanacon.txt
+primitive rsc_SAPHanaCon_{sapsid}_HDB{sapino} ocf:suse:SAPHanaController \
+ op start interval=0 timeout=3600 \
+ op stop interval=0 timeout=3600 \
+ op promote interval=0 timeout=900 \
+ op demote interval=0 timeout=320 \
+ op monitor interval=60 role=Promoted timeout=700 \
+ op monitor interval=61 role=Unpromoted timeout=700 \
+ params SID={sapsid} InstanceNumber={sapino} PREFER_SITE_TAKEOVER=true \
+ DUPLICATE_PRIMARY_TIMEOUT=7200 AUTOMATED_REGISTER=false \
+ meta priority=100
+clone mst_SAPHanaCon_{sapsid}_HDB{sapino} rsc_SAPHanaCon_{sapsid}_HDB{sapino} \
+ meta clone-node-max=1 promotable=true interleave=true maintenance=true
+----
+
+Now add the configuration to the cluster.
+
+[subs="attributes,quotes"]
+----
+{sapnode1}:~ # crm configure load update crm-saphanacon.txt
+----
+
+The most important parameters here are again SID and InstanceNumber.
+Besides these parameters, the timeout values for the operations (start,
+promote, monitor, stop) are typical tunables.
+
+// TODO PRIO1: SAPHanaFilesystem RA
+
+==== Adding a virtual IP address for the primary site
+
+The last resource to be added covers the virtual IP address.
+For details, see manual page ocf_heartbeat_IPaddr2(7).
+
+[subs="attributes,quotes"]
+----
+# vi crm-vip.txt
+# enter the following to crm-vip.txt
+
+primitive rsc_ip_{sapsid}_HDB{sapino} ocf:heartbeat:IPaddr2 \
+ op monitor interval=10 timeout=20 \
+ params ip={sapip1srv1}
+----
+
+Load the file to the cluster.
+
+[subs="attributes,quotes"]
+----
+{sapnode1}:~ # crm configure load update crm-vip.txt
+----
+
+In most on-premise installations, only the parameter _ip_ needs to be set to the
+virtual IP address to be presented to the client systems.
+Public cloud environments often need specific settings.
+
+==== Constraints for SAPHanaSR-angi
+
+Two constraints organize the correct placement of the virtual IP
+address for the client database access and the start order between the
+two resource agents SAPHanaController and SAPHanaTopology.
+
+[subs="attributes,quotes"]
+----
+# vi crm-cs.txt
+# enter the following to crm-cs.txt
+colocation col_saphana_ip_{sapsid}_HDB{sapino} 2000: rsc_ip_{sapsid}_HDB{sapino}:Started \
+ mst_SAPHanaCon_{sapsid}_HDB{sapino}:Promoted
+order ord_saphana_{sapsid}_HDB{sapino} Optional: cln_SAPHanaTop_{sapsid}_HDB{sapino} \
+ mst_SAPHanaCon_{sapsid}_HDB{sapino}
+----
+
+Load the file to the cluster.
+
+[subs="attributes,quotes"]
+----
+{sapnode1}:~ # crm configure load update crm-cs.txt
+----
+
+==== Activating multi-state resource for cluster operation
+
+The multi-state resource was added to the cluster in maintenance mode. To get
+the resource operated by the cluster, end the maintenance mode with the following commands:
+
+[subs="attributes,quotes"]
+----
+# crm resource refresh mst_SAPHanaCon_{sapsid}_HDB{sapino}
+# cs_wait_for_idle -s 5
+# crm resource maintenance mst_SAPHanaCon_{sapsid}_HDB{sapino} off
+----
+
+The command `cs_wait_for_idle` is part of the package `ClusterTools2`.
+For more details, see manual pages cs_wait_for_idle(8), crm(8),
+SAPHanaSR_maintenance_examples(7).
+
+==== Active/active read-enabled scenario
+
+This step is optional.
If you have an active/active {HANA} system +replication with a read-enabled secondary, it is possible to integrate the +needed second virtual IP address into the cluster. This is been done by adding +a second virtual IP address resource and a location constraint binding the +address to the secondary site. + +[subs="attributes,quotes"] +---- +# vi crm-re.txt +# enter the following to crm-re.txt + +primitive rsc_ip_{sapsid}_HDB{sapino}_readenabled ocf:heartbeat:IPaddr2 \ + op monitor interval=10 timeout=20 \ + params ip={sapip1srv2} +colocation col_saphana_ip_{sapsid}_HDB{sapino}_readenabled 2000: \ + rsc_ip_{sapsid}_HDB{sapino}_readenabled:Started mst_SAPHanaCon_{sapsid}_HDB{sapino}:Unpromoted +---- + + +//// + ______ __ + /_ __/__ _____/ /______ + / / / _ \/ ___/ __/ ___/ + / / / __(__ ) /_(__ ) +/_/ \___/____/\__/____/ +//// + +[[cha.s4s.test-cluster]] +== {stepTest} + +// TODO PRIO2: Adding second IP Address to the cluster tests +// TODO PRIO1: Check, if we need to mention the srHook-Attribute in the cluster tests +// TODO PRIO2: Check, if it makes sense to provide query for cluster attributes to +// make it more easy to review srHook and RA attributes (like SOK or SFAIL) + +image::SAPHanaSR-ScaleOut-Plan-Phase7.svg[scaledwidth="100%"] + +The lists of tests will be further enhanced in one of the next updates of this document. + +As with any cluster testing is crucial. Make sure that all test +cases derived from customer expectations are implemented and passed +fully. Otherwise the project is likely to fail in production. + +The test prerequisite, if not described differently, is always that both +nodes are booted, normal members of the cluster and the HANA RDBMS is +running. The system replication is in sync (SOK). + + +=== Test cases for semi-automation + +In the following test descriptions we assume +`PREFER_SITE_TAKEOVER="true"` and `AUTOMATED_REGISTER="false".` + +NOTE: The following tests are designed to be run in sequence and depend +on the exit state of the preceding tests. + +==== Test: Stop primary database on site A (node 1) + +.Test STOP_PRIMARY_SITE_A +========== +.{testComp} + - Primary Database + +.{testDescr} +- The primary {HANA} database is stopped during normal cluster operation. + +.{testProc} +. Stop the primary {HANA} database gracefully as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~> HDB stop +---- + +.{testRecover} +. Manually register the old primary (on node 1) with the new primary after takeover (on node 2) as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~> hdbnsutil -sr_register --remoteHost={sapnode2} --remoteInstance=10 \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite1} +---- ++ +. Restart the {HANA} database (now secondary) on node 1 as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode1} +---- + +.{testExpect} +. The cluster detects the stopped primary {HANA} database (on node 1) +and marks the resource failed. +. The cluster promotes the secondary {HANA} database (on node 2) to +take over as primary. +. The cluster migrates the IP address to the new primary (on node 2). +. After some time the cluster shows the sync_state of the stopped +primary (on node 1) as SFAIL. +. Because AUTOMATED_REGISTER="false" the cluster does not restart +the failed {HANA} database or register it against the new primary. +. After the manual register and resource refresh the system +replication pair is marked as in sync (SOK). +. 
The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Stop primary database on site B (node 2) + +.Test STOP_PRIMARY_DB_SITE_B +========== + +{testComp}:: + Primary Database + +{testDescr}:: + The primary {HANA} database is stopped during normal cluster operation. + +.{testProc} +. Stop the database gracefully as {refsidadm}. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> HDB stop +---- + +.{testRecover} +. Manually register the old primary (on node 2) with the new primary + after takeover (on node 1) as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> hdbnsutil -sr_register --remoteHost={sapnode1} --remoteInstance=10 \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite2} +---- ++ +. Restart the {HANA} database (now secondary) on node 2 as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode2} +---- + +.{testExpect} +. The cluster detects the stopped primary {HANA} database (on node 2) +and marks the resource failed. +. The cluster promotes the secondary {HANA} database (on node 1) to +take over as primary. +. The cluster migrates the IP address to the new primary (on node +1). +. After some time the cluster shows the sync_state of the stopped +primary (on node 2) as SFAIL. +. Because AUTOMATED_REGISTER="false" the cluster does not restart +the failed {HANA} database or register it against the new primary. +. After the manual register and resource refresh the system +replication pair is marked as in sync (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Crash primary database on site A (node 1) + +.Test CRASH_PRIMARY_DB_SITE_A +========== + +{testComp}:: + Primary Database + +{testDescr}:: + Simulate a complete break-down of the primary database system. + +.{testProc} +. Kill the primary database system using signals as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~> HDB kill-9 +---- + +.{testRecover} +. Manually register the old primary (on node 1) with the new primary + after takeover (on node 2) as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~> hdbnsutil -sr_register --remoteHost={sapnode2} --remoteInstance=10 \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite1} +---- ++ +. Restart the {HANA} database (now secondary) on node 1 as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode1} +---- + +.{testExpect} +. The cluster detects the stopped primary {HANA} database (on node 1) +and marks the resource failed. +. The cluster promotes the secondary {HANA} database (on node 2) to +take over as primary. +. The cluster migrates the IP address to the new primary (on node +2). +. After some time the cluster shows the sync_state of the stopped +primary (on node 1) as SFAIL. +. Because AUTOMATED_REGISTER="false" the cluster does not restart +the failed {HANA} database or register it against the new primary. +. After the manual register and resource refresh the system +replication pair is marked as in sync (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Crash primary database on site B (node 2) + +.Test CRASH_PRIMARY_DB_SITE_B +========== + +{testComp}:: + Primary Database +{testDescr}:: + Simulate a complete break-down of the primary database system. + +.{testProc} +. 
Kill the primary database system using signals as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> HDB kill-9 +---- + +.{testRecover} +. Manually register the old primary (on node 2) with the new primary + after takeover (on node 1) as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> hdbnsutil -sr_register --remoteHost={sapnode1} --remoteInstance=10 \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite2} +---- ++ +. Restart the {HANA} database (now secondary) on node 2 as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode2} +---- + +.{testExpect} +. The cluster detects the stopped primary {HANA} database (on node 2) +and marks the resource failed. +. The cluster promotes the secondary {HANA} database (on node 1) to +take over as primary. +. The cluster migrates the IP address to the new primary (on node +1). +. After some time the cluster shows the sync_state of the stopped +primary (on node 2) as SFAIL. +. Because AUTOMATED_REGISTER="false" the cluster does not restart +the failed {HANA} database or register it against the new primary. +. After the manual register and resource refresh the system +replication pair is marked as in sync (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Crash primary node on site A (node 1) + +.Test CRASH_PRIMARY_NODE_SITE_A +========== +{testComp}:: + Cluster node of primary site +{testDescr}:: + Simulate a crash of the primary site node running the primary {HANA} + database. + +.{testProc} +. Crash the primary node by sending a 'fast-reboot' system request. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~ # sync; echo b > /proc/sysrq-trigger +---- + +.{testRecover} +. If SBD fencing is used, pacemaker will not automatically restart after being fenced. In this case clear the fencing flag on all SBD devices and subsequently start pacemaker. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~ # sbd -d {sapsbd1} message {sapnode1} clear +{sapnode1}:~ # sbd -d {sapsbd2} message {sapnode1} clear +... +---- ++ +. Start the cluster framework ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~ # {clusterstart} +---- ++ +. Manually register the old primary (on node 1) with the new primary after takeover (on node 2) as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode1}:~> hdbnsutil -sr_register --remoteHost={sapnode2} --remoteInstance=10 \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite1} +---- ++ +. Restart the {HANA} database (now secondary) on node 1 as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode1} +---- + +.{testExpect} +. The cluster detects the failed node (node 1) and declares it +UNCLEAN and sets the secondary node (node 2) to status "partition +with quorum". +. The cluster fences the failed node (node 1). +. The cluster declares the failed node (node 1) OFFLINE. +. The cluster promotes the secondary {HANA} database (on node 2) to +take over as primary. +. The cluster migrates the IP address to the new primary (on node +2). +. After some time the cluster shows the sync_state of the stopped +primary (on node 2) as SFAIL. +. If SBD fencing is used, then the manual recovery procedure will be +used to clear the fencing flag and restart pacemaker on the node. +. 
Because AUTOMATED_REGISTER="false" the cluster does not restart +the failed {HANA} database or register it against the new primary. +. After the manual register and resource refresh the system +replication pair is marked as in sync (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Crash primary node on site B (node 2) + +.Test CRASH_PRIMARY_NODE_SITE_B +========== +{testComp}:: + Cluster node of secondary site +{testDescr}:: + Simulate a crash of the secondary site node running the primary {HANA} + database. + +.{testProc} +. Crash the secondary node by sending a 'fast-reboot' system request. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~ # sync; echo b > /proc/sysrq-trigger +---- + +.{testRecover} +. If SBD fencing is used, pacemaker will not automatically restart + after being fenced. In this case clear the fencing flag on all SBD + devices and subsequently start pacemaker. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~ # sbd -d {sapsbd1} message {sapnode2} clear +{sapnode2}:~ # sbd -d {sapsbd2} message {sapnode2} clear +... +---- ++ +. Start the cluster Framework ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~ # {clusterstart} +---- ++ +. Manually register the old primary (on node 2) with the new primary + after takeover (on node 1) as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> hdbnsutil -sr_register --remoteHost={sapnode1} --remoteInstance=10 \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite2} +---- ++ +. Restart the {HANA} database (now secondary) on node 2 as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode2} +---- + +.{testExpect} +. The cluster detects the failed secondary node (node 2) and +declares it UNCLEAN and sets the primary node (node 1) to status +"partition with quorum". +. The cluster fences the failed secondary node (node 2). +. The cluster declares the failed secondary node (node 2) OFFLINE. +. The cluster promotes the secondary {HANA} database (on node 1) to +take over as primary. +. The cluster migrates the IP address to the new primary (on node +1). +. After some time the cluster shows the sync_state of the stopped +secondary (on node 2) as SFAIL. +. If SBD fencing is used, then the manual recovery procedure will be +used to clear the fencing flag and restart pacemaker on the node. +. Because AUTOMATED_REGISTER="false" the cluster does not restart +the failed {HANA} database or register it against the new primary. +. After the manual register and resource refresh the system +replication pair is marked as in sync (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Stop the secondary database on site B (node 2) + +.Test STOP_SECONDARY_DB_SITE_B +========== +{testComp}:: + Secondary {HANA} database + +{testDescr}:: + The secondary {HANA} database is stopped during normal cluster + operation. + +.{testProc} +. Stop the secondary {HANA} database gracefully as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> HDB stop +---- + +.{testRecover} +. Refresh the failed resource status of the secondary {HANA} database (on + node 2) as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode2} +---- + +.{testExpect} +. The cluster detects the stopped secondary database (on node 2) and +marks the resource failed. +. 
The cluster detects the broken system replication and marks it as +failed (SFAIL). +. The cluster restarts the secondary {HANA} database on the same node +(node 2). +. The cluster detects that the system replication is in sync again +and marks it as ok (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Crash the secondary database on site B (node 2) + +.Test CRASH_SECONDARY_DB_SITE_B +========== +{testComp}:: + Secondary {HANA} database +{testDescr}:: + Simulate a complete break-down of the secondary database system. + +.{testProc} +. Kill the secondary database system using signals as _{refsidadm}_. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~> HDB kill-9 +---- + +.{testRecover} +. Clean up the failed resource status of the secondary {HANA} database (on node 2) as root. ++ +[subs="attributes,quotes"] +---- +# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode2} +---- + +.{testExpect} +. The cluster detects the stopped secondary database (on node 2) and +marks the resource failed. +. The cluster detects the broken system replication and marks it as +failed (SFAIL). +. The cluster restarts the secondary {HANA} database on the same node +(node 2). +. The cluster detects that the system replication is in sync again +and marks it as ok (SOK). +. The cluster "failed actions" are cleaned up after following the +recovery procedure. +========== + +==== Test: Crash the secondary node on site B (node2) + +.Test CRASH_SECONDARY_NODE_SITE_B +========== +{testComp}:: + Cluster node of secondary site +{testDescr}:: + Simulate a crash of the secondary site node running the secondary {HANA} + database. + +.{testProc} +. Crash the secondary node by sending a 'fast-reboot' system request. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~ # sync; echo b > /proc/sysrq-trigger +---- + +.{testRecover} +. If SBD fencing is used, pacemaker will not automatically restart + after being fenced. In this case clear the fencing flag on *all* SBD + devices and subsequently start pacemaker. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~ # sbd -d {sapsbd1} message {sapnode2} clear +{sapnode2}:~ # sbd -d {sapsbd2} message {sapnode2} clear +... +---- ++ +. Start the cluster framework. ++ +[subs="attributes,quotes"] +---- +{sapnode2}:~ # {clusterstart} +---- + +.{testExpect} +. The cluster detects the failed secondary node (node 2) and +declares it UNCLEAN and sets the primary node (node 1) to status +"partition with quorum". +. The cluster fences the failed secondary node (node 2). +. The cluster declares the failed secondary node (node 2) OFFLINE. +. After some time the cluster shows the sync_state of the stopped +secondary (on node 2) as SFAIL. +. If SBD fencing is used, then the manual recovery procedure will be +used to clear the fencing flag and restart pacemaker on the node. +. When the fenced node (node 2) rejoins the cluster the former +secondary {HANA} database is started automatically. +. The cluster detects that the system replication is in sync again +and marks it as ok (SOK). +========== + +//// +// TODO PRIO2: complete this test for node1 +// TODO PRIO2: describe same test for node2 + +==== Test: Crash the primary database indexserver on site A (node1) + +.Test CRASH_PRIMARY_INDEXSRV_SITE_A +========== +{testComp}:: + Primary database +{testDescr}:: + Simulate a crash of the primary {HANA} database indexserver. + +.{testProc} +. Crash the primary indexserver by killing the hdbindexserver process. 
++
+[subs="attributes,quotes"]
+----
+{sapnode1}:~ # killall -9 hdbindexserver
+----
+
+.{testRecover}
+. TODO register former primary as new secondary
++
+[subs="attributes,quotes"]
+----
+{sapnode1}:~ # TODO
+...
+----
+
+.{testExpect}
+. The hook script stops the primary database.
+. The cluster detects the failed primary database (node 1) and initiates a
+takeover.
+==========
+////
+
+==== Test: Failure of replication LAN
+
+.Test FAIL_NETWORK_SR
+==========
+
+{testComp}::
+  Replication LAN
+{testDescr}::
+  Loss of replication LAN connectivity between the primary and secondary
+  node.
+
+.{testProc}
+. Break the connection between the cluster nodes on the replication LAN.
+
+.{testRecover}
+. Re-establish the connection between the cluster nodes on the
+  replication LAN.
+
+.{testExpect}
+. After some time the cluster shows the sync_state of the secondary
+(on node 2) as SFAIL.
+. The primary {HANA} database (node 1) "HDBSettings.sh
+systemReplicationStatus.py" shows "CONNECTION TIMEOUT" and the
+secondary {HANA} database (node 2) is not able to reach the primary
+database (node 1).
+. The primary {HANA} database continues to operate normally, but no
+system replication takes place. The secondary is therefore no longer a
+valid takeover destination.
+. When the LAN connection is re-established, HDB automatically
+detects connectivity between the {HANA} databases and restarts the
+system replication process.
+. The cluster detects that the system replication is in sync again
+and marks it as ok (SOK).
+==========
+
+=== Test cases for full automation
+
+In the following test descriptions, we assume
+*PREFER_SITE_TAKEOVER="true"* and *AUTOMATED_REGISTER="true"*.
+
+NOTE: The following tests are designed to be run in sequence and depend
+on the exit state of the preceding tests.
+
+==== Test: Stop the primary database on site A
+
+.Test STOP_PRIMARY_DB_SITE_A
+==========
+.{testComp}
+- Primary database
+
+.{testDescr}
+- The primary {HANA} database is stopped during normal cluster operation.
+
+.{testProc}
+- Stop the primary {HANA} database gracefully as _{refsidadm}_.
+
+[subs="specialchars,attributes,quotes"]
+----
+{sapnode1}:~> HDB stop
+----
+
+.{testRecover}
+. Not needed; everything is automated.
+. Refresh the cluster resources on node 1 as root.
+
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode1}
+----
+
+.{testExpect}
+. The cluster detects the stopped primary {HANA} database (on node 1)
+and marks the resource failed.
+. The cluster promotes the secondary {HANA} database (on node 2) to
+take over as primary.
+. The cluster migrates the IP address to the new primary (on node 2).
+. After some time the cluster shows the sync_state of the stopped
+primary (on node 1) as SFAIL.
+. Because AUTOMATED_REGISTER="true", the cluster restarts
+the failed {HANA} database and registers it against the new primary.
+. After the automated registration and resource refresh, the system
+replication pair is marked as in sync (SOK).
+. The cluster "failed actions" are cleaned up after following the
+recovery procedure.
+
+==========
+
+==== Test: Crash the primary node on site B (node 2)
+
+.Test CRASH_PRIMARY_NODE_SITE_B
+==========
+.{testComp}
+- Cluster node of site B
+
+.{testDescr}
+- Simulate a crash of the site B node running the primary {HANA} database.
+
+.{testProc}
+- Crash the site B node (node 2) by sending a 'fast-reboot' system request.
+
+[subs="specialchars,attributes,quotes"]
+----
+{sapnode2}:~ # sync; echo b > /proc/sysrq-trigger
+----
+
+.{testRecover}
+* If SBD fencing is used, pacemaker will not automatically restart
+after being fenced. In this case clear the fencing flag on *all* SBD
+devices and subsequently start pacemaker.
+
+[subs="specialchars,attributes,quotes"]
+----
+{sapnode2}:~ # sbd -d {sapsbd1} message {sapnode2} clear
+{sapnode2}:~ # sbd -d {sapsbd2} message {sapnode2} clear
+...
+----
+
+* Start the cluster framework.
+
+[subs="specialchars,attributes,quotes"]
+----
+{sapnode2}:~ # {clusterstart}
+----
+
+* Refresh the cluster resources on node 2 as root.
+
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource refresh rsc_SAPHanaCon_{sapsid}_HDB{sapino} {sapnode2}
+----
+
+.{testExpect}
+. The cluster detects the failed primary node (node 2) and
+declares it UNCLEAN and sets the remaining node (node 1) to status
+"partition with quorum".
+. The cluster fences the failed primary node (node 2).
+. The cluster declares the failed primary node (node 2) OFFLINE.
+. The cluster promotes the secondary {HANA} database (on node 1) to
+take over as primary.
+. The cluster migrates the IP address to the new primary (on node 1).
+. After some time the cluster shows the sync_state of the stopped
+secondary (on node 2) as SFAIL.
+. If SBD fencing is used, then the manual recovery procedure will be
+used to clear the fencing flag and restart pacemaker on the node.
+. When the fenced node (node 2) rejoins the cluster, the former primary becomes a secondary.
+. Because AUTOMATED_REGISTER="true", the cluster restarts
+the failed {HANA} database and registers it against the new primary.
+. The cluster detects that the system replication is in sync again
+and marks it as ok (SOK).
+
+==========
+
+[[cha.hana-sr.administrate]]
+== Administration
+
+=== Dos and don'ts
+
+In your project, you should:
+
+* Define STONITH before adding other resources to the cluster.
+* Do intensive testing.
+* Tune the timeouts of the SAPHana and SAPHanaTopology operations.
+* Start with the parameter values PREFER_SITE_TAKEOVER="true", AUTOMATED_REGISTER="false" and
+DUPLICATE_PRIMARY_TIMEOUT="7200".
+* Always wait for pending cluster actions to finish before proceeding.
+* Set up a test cluster for testing configuration changes and administrative
+procedures before applying them on the production cluster.
+
+In your project, avoid:
+
+* Rapidly changing a cluster configuration and changing it back, such as setting
+nodes to standby and online again or stopping/starting the multi-state
+resource.
+* Creating a cluster without proper time synchronization, or with unstable name
+resolution for hosts, users and groups.
+* Using site names other than the ones already known by the cluster when
+manually re-registering a site.
+* Adding location rules for the clone, multi-state or IP resource. Only location
+rules mentioned in this setup guide are allowed. For public cloud deployments,
+refer to the cloud-specific documentation.
+* Using {SAP} tools to attempt start/stop/takeover actions on a database
+while the cluster is in charge of managing that database.
+
+IMPORTANT: As "migrating" or "moving" resources in the crm shell, HAWK or other
+tools would add client-prefer location rules, support is limited to the maintenance
+procedures described in this document. See <> and
+<> for proven procedures.
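+
+To check whether such client-prefer rules have been left behind, you can list
+the configured location constraints. The following is only a minimal sketch; it
+assumes the crm shell is available on a cluster node and that leftover rules
+have IDs starting with cli-prefer or cli-ban:
+
+[subs="attributes,quotes"]
+----
+# crm configure show | grep -E "^location"
+----
+
+Remove such leftover rules only by following the documented maintenance
+procedures, for example with `crm resource clear`.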
+ +=== Monitoring and tools + +You can use the High Availability Web Console (HAWK), {HANA} Cockpit, {HANA} Studio and different +command line tools for cluster status requests. + +==== HAWK – cluster status and more + +You can use a Web browser to check the cluster status. + +.Cluster Status in HAWK +image::SAPHanaSR-ScaleUp-HAWK-Status-SLE12.png[scaledwidth=100%] +// TODO PRIO3: SAPHanaSR-ScaleUp-HAWK-Status-SLE15.png + +If you set up the cluster using {slehainit} and you have installed all packages +as described above, your system will provide a very useful Web interface. You +can use this graphical Web interface to get an overview of the complete cluster +status, perform administrative tasks or configure resources and cluster bootstrap +parameters. Read the product manuals for a complete documentation of this user +interface. For the {HANA} system replication {usecase} scenario the use of HAWK +should follow the guidance given in this guide. + +==== {HANA} Cockpit + +Database-specific administration and checks can be done with {HANA} Cockpit. +Before trying start/stop/takeover for the database, make sure the cluster is not +in charge of managing the respective resource. See also <>. + +.{HANA} Cockpit – database directory +//image::hana_studio_landscape.png[scaledwidth=100%] +//image::hana_studio_landscape_2021.png[scaledwidth=100%] +image::saphana-cockpit01.png[scaledwidth=100%] + +==== Cluster command line tools + +A simple overview can be obtained by calling `crm_mon`. Using option +`-r` shows also stopped but already configured resources. Option `-1` +tells `crm_mon` to output the status once instead of periodically. + +[subs="attributes,quotes"] +---- +# crm_mon -1r +Stack: corosync +Current DC: {sapnode1} (version 2.0.1+20190417.13d370ca9-3.6.1-2.0.1+20190417.13d370ca9) - partition with quorum +Last updated: Thu Feb 6 12:20:03 2020 +Last change: Thu Feb 6 12:19:43 2020 by root via crm_attribute on {sapnode1} + +2 nodes configured +6 resources configured + +Online: [ {sapnode1} {sapnode2} ] + +Full list of resources: + + stonith-sbd (stonith:external/sbd): Started {sapnode1} + Clone Set: cln_SAPHanaTop_{sapsid}_HDB{sapino} [rsc_SAPHanaTop_{sapsid}_HDB{sapino}] + Started: [ {sapnode1} {sapnode2} ] + Clone Set: mst_SAPHanaCon_{sapsid}_HDB{sapino} [rsc_SAPHanaCon_{sapsid}_HDB{sapino}] (promotable) + Masters: [ {sapnode1} ] + Slaves: [ {sapnode2} ] + rsc_ip_{sapsid}_HDB{sapino} (ocf::heartbeat:IPaddr2): Started {sapnode1} +---- + +See the manual page crm_mon(8) for details. + +==== SAPHanaSR command line tools + +To show some SAPHana or SAPHanaTopology resource agent internal +values, you can call the program `SAPHanaSR-showAttr`. The internal +values, the storage location and their parameter names may change in the next +versions. The command `SAPHanaSR-showAttr` will always fetch the values +from the correct storage location. + +Do not use cluster commands like `crm_attribute` to fetch the values +directly from the cluster. If you use such commands, your methods will be +broken when you need to move an attribute to a different storage place +or even out of the cluster. At first _SAPHanaSR-showAttr_ is a test +program only and should not be used for automated system monitoring. + +// TODO PRIO1: adapt sample output +[subs="attributes,quotes"] +---- + {sapnode1}:~ # SAPHanaSR-showAttr + Host \ Attr clone_state remoteHost roles ... site srmode sync_state ... 
+ --------------------------------------------------------------------------------- + {sapnode1} PROMOTED {sapnode2} 4:P:master1:... {sapsite1} sync PRIM ... + {sapnode2} DEMOTED {sapnode1} 4:S:master1:... {sapsite2} sync SOK ... +---- + +`SAPHanaSR-showAttr` also supports other output formats such as *script*. The script +format is intended to allow running filters. Based on `SAPHanaSR-showAttr` output +format script you can define effective queries: + +// TODO PRIO1: adapt sample output +[subs="attributes,quotes"] +---- +{sapnode1}:~ # SAPHanaSR-showAttr --format=script | grep "/remoteHost=" +Thu Feb 6 12:28:10 2020; Hosts/{sapnode1}/remoteHost={sapnode2} +Thu Feb 6 12:28:10 2020; Hosts/{sapnode2}/remoteHost={sapnode1} +---- + +`SAPHanaSR-replay-archive` can help to analyze the SAPHanaSR attribute values from +`crm_report` archives. This allows post mortem analyzes. + +In our example, the administrator killed the primary {HANA} instance using the command +`HDB kill-9`. This happened around 9:10 pm. + +// TODO PRIO1: adapt sample output +[subs="attributes,quotes"] +---- +{sapnode1}:~ # hb_report -f 19:00 +INFO: {sapnode1}# The report is saved in ./hb_report-1-11-11-2019.tar.bz2 +INFO: {sapnode1}# Report timespan: 11/11/19 19:00:00 - 11/11/19 21:05:33 +INFO: {sapnode1}# Thank you for taking time to create this report. +{sapnode1}:~ # SAPHanaSR-replay-archive --format=script \ + ./hb_report-1-11-11-2019.tar.bz2 | grep "/roles=" +Mon Nov 11 20:38:01 2019; Hosts/{sapnode1}/roles=4:P:master1:master:worker:master +Mon Nov 11 20:38:01 2019; Hosts/{sapnode2}/roles=4:S:master1:master:worker:master +Mon Nov 11 21:11:37 2019; Hosts/{sapnode1}/roles=1:P:master1::worker: +Mon Nov 11 21:12:43 2019; Hosts/{sapnode2}/roles=4:P:master1:master:worker:master +---- + +In the above example the attributes indicate that at the beginning {sapnode1} +was running primary (4:P) and {sapnode2} was running secondary (4:S). + +At 21:11 (CET) suddenly the primary on {sapnode1} died - it was falling down to 1:P. + +The cluster did jump-in and initiated a takeover. At 21:12 (CET) the former secondary +was detected as new running master (changing from 4:S to 4:P). + + +==== {HANA} LandscapeHostConfiguration + +To check the status of an SAPHana database and to find out if the +cluster should react, you can use the script *landscapeHostConfiguration.py* +to be called as Linux user _{refsidadm}_. + +[subs="attributes,quotes"] +---- +{sapnode1}:~> HDBSettings.sh landscapeHostConfiguration.py +| Host | Host | ... NameServer | NameServer | IndexServer | IndexServer | +| | Active | ... Config Role | Actual Role | Config Role | Actual Role | +| ------ | ------ | ... ------------ | ----------- | ----------- | ----------- | +| {sapnode1} | yes | ... master 1 | master | worker | master | + +overall host status: ok +---- + +Following the {SAP} HA guideline, the {SAPHana} resource agent interprets the return +codes in the following way: + +.Interpretation of Return Codes +[width="100%",cols="15%,85%",options="header",] +|======================================================================= +|Return Code |Interpretation +|4 |{HANA} database is up and OK. The cluster does interpret this as a +correctly running database. + +|3 |{HANA} database is up and in status info. The cluster does +interpret this as a correctly running database. + +|2 |{HANA} database is up and in status warning. The cluster does +interpret this as a correctly running database. + +|1 |{HANA} database is down. 
If the database should be up and was not
+stopped intentionally, this could trigger a takeover.
+
+|0 |Internal Script Error – to be ignored.
+|=======================================================================
+// TODO PRIO2: replace "to be ignored"
+
+[[sec-maintenance]]
+=== Maintenance
+
+To receive updates for the operating system or the {sleha},
+it is recommended to register your systems either to a local {suma}, to {rmtool} ({rmt}),
+or remotely with {scc}.
+For more information, visit the respective Web pages:
+{sumalandingpage}
+{rmtGuide15}
+{sccfaq}
+Examples for maintenance tasks are also given in the manual page SAPHanaSR_maintenance_examples(7).
+
+==== Updating the operating system and cluster
+
+For an update of {sles4sap} packages including cluster software, follow the
+rolling update procedure defined in the product documentation
+of the {sleha} Administration Guide, chapter _Upgrading Your Cluster and Updating Software Packages_ at
+{haAdminGuide15}#cha-ha-migration.
+
+==== Updating {HANA} - seamless {HANA} maintenance
+
+For updating {HANA} database systems in system replication, you need to follow
+the defined SAP processes. This section describes the steps required before and
+after the update procedure to get the system replication automated again.
+
+{SUSE} has optimized the {HANA} maintenance process in the cluster.
+The improved procedure only sets the multi-state resource to maintenance and
+keeps the rest of the cluster (SAPHanaTopology clones and IPaddr2 vIP resource)
+still active. Using the updated procedure allows seamless {HANA} maintenance
+in the cluster, as the virtual IP address can automatically follow the running
+primary.
+
+Prepare the cluster not to react to the maintenance work to be done on
+the {HANA} database systems. Set the multi-state resource to `maintenance`.
+
+.Main {HANA} Update procedure
+==============================
+Pre-Update Tasks:: For the multi-state resource, set the maintenance mode as follows:
++
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource maintenance <multi-state-resource>
+----
++
+The `<multi-state-resource>` in the guide at hand is `mst_SAPHanaCon_{sapsid}_HDB{sapino}`.
+
+Update:: Process the {SAP} update for both {HANA} database systems. This
+procedure is described by {SAP}.
+
+Post-Update Tasks:: Expect the primary/secondary roles to be exchanged after the
+maintenance. Therefore, tell the cluster to forget about these states and to
+reprobe the updated {HANA} database systems.
++
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource refresh <multi-state-resource>
+----
++
+After the {HANA} update is complete on both sites, tell the cluster about the
+end of the maintenance process. This allows the cluster to actively control and
+monitor the {sap} system again.
++
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource maintenance <multi-state-resource> off
+----
+
+Optionally, you could completely remove the maintenance attribute from the resource.
+For more details, see the manual pages SAPHanaSR_maintenance_examples(7) and crm(8).
+
+==============================
+
+==== Migrating an {HANA} primary
+
+In the following procedures, we assume the primary runs on node 1 and the secondary
+on node 2. The goal is to "exchange" the roles of the nodes: the primary should
+then run on node 2 and the secondary should run on node 1.
+
+There are different methods to exchange the roles. The
+following procedure shows how to tell the cluster to "accept" a role change via
+native {HANA} commands.
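+
+Regardless of the method used, do not start a migration while the system
+replication is not in sync. A minimal pre-check, based on the
+`SAPHanaSR-showAttr` script format shown earlier in this guide (the exact
+attribute names may differ between package versions):
+
+[subs="attributes,quotes"]
+----
+# SAPHanaSR-showAttr --format=script | grep "/sync_state="
+----
+
+Proceed only if the sync_state of the secondary is SOK.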
+
+// TODO PRIO2: Check for takeover with handshake
+
+.Migrating an {HANA} primary using {SAP} toolset
+==============================
+Pre-Migration Tasks:: Set the multi-state resource to `maintenance`. This can be done on any
+cluster node.
++
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource maintenance <multi-state-resource>
+----
+
+// TODO PRIO2: Check -sr_takeover with 'handshake'
+
+Manual Takeover Process::
+* Stop the primary {HANA} database system. Enter the command in our
+example on node 1 as user _{refsidadm}_.
++
+[subs="specialchars,attributes,quotes"]
+----
+~> HDB stop
+----
++
+* Before proceeding, make sure the primary {HANA} database is stopped.
+* Start the takeover process on the secondary {HANA} database system.
+Enter the command in our example on node 2 as user _{refsidadm}_.
++
+[subs="specialchars,attributes,quotes"]
+----
+~> hdbnsutil -sr_takeover
+----
++
+* Register the former primary to become the new secondary. Enter the
+command in our example on node 1 as user _{refsidadm}_.
++
+[subs="specialchars,attributes,quotes"]
+----
+~> hdbnsutil -sr_register --remoteHost={sapnode2} --remoteInstance={sapino} \
+   --replicationMode=sync --name={sapsite1} \
+   --operationMode=logreplay
+----
++
+* Start the new secondary {HANA} database system. Enter the command in
+our example on node 1 as user _{refsidadm}_.
++
+[subs="specialchars,attributes,quotes"]
+----
+~> HDB start
+----
+
+Post-Migration Tasks::
+* Wait some time until `SAPHanaSR-showAttr` shows both {HANA} database
+systems to be up again (field roles must start with the digit 4). The new
+secondary should have role "S" (for secondary).
++
+* Tell the cluster to forget about the former multi-state roles and to
+re-monitor the failed master. The command can be submitted on any
+cluster node as user root.
++
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource refresh <multi-state-resource>
+----
++
+* Set the multi-state resource to the status managed again. The command
+can be submitted on any cluster node as user root.
++
+[subs="specialchars,attributes,quotes"]
+----
+# crm resource maintenance <multi-state-resource> off
+----
+==============================
+
+The following paragraphs explain how to use the cluster to partially automate the
+migration.
+
+.Moving an {HANA} primary using the Cluster Toolset
+==============================
+* Create a "move away from this node" rule by using the *force* option.
++
+[subs="attributes,specialchars,quotes"]
+----
+# crm resource move <multi-state-resource> *force*
+----
++
+Because of the "move away" (*force*) rule, the cluster will *stop* the
+current primary. After that, the cluster runs a *promote* on the secondary site if the system
+replication was in sync before. You should not migrate the primary if
+the status of the system replication is not in sync (SFAIL).
++
+IMPORTANT: Migration without the *force* option will cause a takeover without the
+former primary being stopped. Only the migration with the *force* option is supported.
++
+* Wait until the secondary has completely taken over the primary role. You
+  see this using the command line tool `SAPHanaSR-showAttr`. Now check the
+  "roles" attribute of the new primary. It must start with "*4:P*".
++ +// TODO PRIO1: adapt sample output +[subs="specialchars,attributes,quotes"] +---- +{sapnode1}:~ # SAPHanaSR-showAttr --format=script | grep "/roles=" +Mon Jun 21 19:38:50 2021; Hosts/{sapnode1}/roles=*1:P*:master1::worker: +Mon Jun 21 19:38:50 2021; Hosts/{sapnode2}/roles=*4:P*:master1:master:worker:master +---- + +* If you have set up the parameter value `AUTOMATED_REGISTER="true"`, you can skip this step. In +other cases you now need to register the old primary. Enter the command +in our example on node1 as user _{refsidadm}_. ++ +[subs="specialchars,attributes,quotes"] +---- +~> hdbnsutil -sr_register --remoteHost={sapnode2} --remoteInstance={sapino} \ + --replicationMode=sync --operationMode=logreplay \ + --name={sapsite1} +---- + +* Clear the ban rules of the resource to allow the cluster to start the new secondary. ++ +[subs="specialchars,attributes,quotes"] +---- +# crm resource clear +---- ++ + +* Wait until the new secondary has started. You + see this using the command line tool `SAPHanaSR-showAttr` and check for the + attributes "roles" for the new primary. It must start with "*4:S*". ++ +// TODO PRIO1: adapt sample output +[subs="specialchars,attributes,quotes"] +---- +{sapnode1}:~ # SAPHanaSR-showAttr --format=script | grep "/roles=" +Mon Jun 21 19:38:50 2021; Hosts/{sapnode1}/roles=*4:S*:master1::worker: +Mon Jun 21 19:38:50 2021; Hosts/{sapnode2}/roles=*4:P*:master1:master:worker:master +---- + +============================== + +//// +* Set the primary node to be standby. + +`crm node standby {sapnode1}` + +The cluster will stop the primary {HANA} database and, if the system +replication was in sync, process the takeover on the secondary site. + +Wait until the former secondary has completely taken over to be the new +primary. + +If you have set up `AUTOMATED_REGISTER="true"`, you can skip this step. In +other cases you now need to register the old primary. Enter the command +in our example on node1 as user _{refsidadm}_. + +`hdbnsutil -sr_register --remoteHost={sapnode2} --remoteInstance={sapino} --replicationMode=sync --name={sapsite1} --operationMode=logreplay` + +Set the standby node to be online again. + +`crm node online {sapnode1}` +//// + + +[[app.hana-sr.example]] +== Examples + +=== Example `{slehainit}` configuration + +[subs="attributes,quotes"] +---- +{sapnode1}:~ # ha-cluster-init -u + Generating SSH key + Configuring csync2 + Generating csync2 shared key (this may take a while)...done + csync2 checking files...done + +Configure Corosync (unicast): + This will configure the cluster messaging layer. You will need + to specify a network address over which to communicate (default + is eth0's network, but you can use the network address of any + active interface). + + Address for ring0 [{sapip1node1}] + Port for ring0 [5405] + +Configure SBD: + If you have shared storage, for example a SAN or iSCSI target, + you can use it avoid split-brain scenarios by configuring SBD. + This requires a 1 MB partition, accessible to all nodes in the + cluster. The device path must be persistent and consistent + across all nodes in the cluster, so /dev/disk/by-id/* devices + are a good choice. Note that all data on the partition you + specify here will be destroyed. + +Do you wish to use SBD (y/n)? y + Path to storage device (e.g. /dev/disk/by-id/...), or "none" []{sapsbd1} +WARNING: All data on {sapsbd1} will be destroyed! +Are you sure you wish to use this device (y/n)? y + Initializing SBD......done + Hawk cluster interface is now running. 
To see cluster status, open: + https://{sapip1node1}:7630/ + Log in with username 'hacluster', password 'linux' +You should change the hacluster password to something more secure! + Waiting for cluster........done + Loading initial cluster configuration + +Configure Administration IP Address: + Optionally configure an administration virtual IP + address. The purpose of this IP address is to + provide a single IP that can be used to interact + with the cluster, rather than using the IP address + of any specific cluster node. + +Do you wish to configure a virtual IP address (y/n)? n + Done (log saved to /var/log/ha-cluster-bootstrap.log) +---- + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-example-cluster-configuration +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento check +=== Example cluster configuration + +The following complete crm configuration is for a two-node cluster +({sapnode1}, {sapnode2}) and an {HANA} database with SID {sapsid} +and instance number {sapino}. +Priority fencing prefers the {HANA} primary in case of split-brain. +The virtual IP address in the example is {sapip1srv1}. + +// TODO PRIO1: shorter stop timeout +[subs="attributes,quotes"] +---- +node {sapnode1} +node {sapnode2} + +primitive rsc_SAPHanaTop_{sapsid}_HDB{sapino} ocf:suse:SAPHanaTopology \ + op start interval=0 timeout=600 \ + op stop interval=0 timeout=600 \ + op monitor interval=50 timeout=600 \ + params SID={sapsid} InstanceNumber={sapino} + +primitive rsc_SAPHanaCon_{sapsid}_HDB{sapino} ocf:suse:SAPHana \ + op start interval=0 timeout=3600 \ + op stop interval=0 timeout=3600 \ + op promote interval=0 timeout=900 \ + op demote interval=0 timeout=320 \ + op monitor interval=60 role=Promoted timeout=700 \ + op monitor interval=61 role=Unpromoted timeout=700 \ + params SID={sapsid} InstanceNumber={sapino} PREFER_SITE_TAKEOVER=true \ + DUPLICATE_PRIMARY_TIMEOUT=7200 AUTOMATED_REGISTER=false \ + meta priority=100 + +primitive rsc_ip_{sapsid}_HDB{sapino} ocf:heartbeat:IPaddr2 \ + op monitor interval=10 timeout=20 \ + params ip={sapip1srv1} + +primitive stonith-sbd stonith:external/sbd \ + params pcmk_delay_max=15 + +clone mst_SAPHanaCon_{sapsid}_HDB{sapino} rsc_SAPHanaCon_{sapsid}_HDB{sapino} \ + meta clone-node-max=1 promotable=true interleave=true + +clone cln_SAPHanaTop_{sapsid}_HDB{sapino} rsc_SAPHanaTop_{sapsid}_HDB{sapino} \ + meta clone-node-max=1 interleave=true + +colocation col_saphana_ip_{sapsid}_HDB{sapino} 2000: \ + rsc_ip_{sapsid}_HDB{sapino}:Started mst_SAPHanaCon_{sapsid}_HDB{sapino}:Promoted +order ord_saphana_{sapsid}_HDB{sapino} Optional: \ + cln_SAPHanaTop_{sapsid}_HDB{sapino} mst_SAPHanaCon_{sapsid}_HDB{sapino} + +property cib-bootstrap-options: \ + cluster-infrastructure=corosync \ + stonith-enabled=true \ + stonith-action=reboot \ + stonith-timeout=150 \ + priority-fencing-delay=30 + +rsc_defaults rsc-options: \ + resource-stickiness=1000 \ + migration-threshold=5000 + +op_defaults op-options \ + timeout=600 \ + record-pending=true +---- + +// DO NOT CHANGE AUTOGENERATED SECTION ID: id-example-for-etccorosynccorosync-conf +// DO NOT CHANGE SECTION TITLE as this would change autogenerated section ID +// Refer to Trento checks +// Refer to Trento checks +=== Example for _/etc/corosync/corosync.conf_ + +The following file shows a typical corosync configuration with two rings. +Review the SUSE product documentation about details. +See also manual pages corosync.conf(5) and votequorum(5). 
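+
+After corosync has been started on both nodes with a configuration like the one
+below, you can check the ring status locally on each node. This is only a brief
+sketch using the standard corosync tooling; each ring should be reported as
+active with no faults:
+
+[subs="attributes,quotes"]
+----
+# corosync-cfgtool -s
+----
+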
+// TODO PRIO3: get corosync.conf from common include file + +[subs="attributes,quotes"] +---- +# Read the corosync.conf.5 manual page +totem { + version: 2 + secauth: on + crypto_hash: sha1 + crypto_cipher: aes256 + cluster_name: suse-ha + clear_node_high_bit: yes + token: 5000 + token_retransmits_before_loss_const: 10 + join: 60 + consensus: 6000 + max_messages: 20 + interface { + ringnumber: 0 + mcastport: 5405 + ttl: 1 + } + interface { + ringnumber: 1 + mcastport: 5407 + ttl: 1 + } + rrp_mode: passive + transport: udpu +} + +logging { + fileline: off + to_stderr: no + to_logfile: no + logfile: /var/log/cluster/corosync.log + to_syslog: yes + debug: off + timestamp: on + logger_subsys { + subsys: QUORUM + debug: off + } +} + +nodelist { + node { + ring0_addr: {sapIp1Node1} + ring1_addr: {sapIp2Node1} + nodeid: 1 + } + node { + ring0_addr: {sapIp1Node2} + ring1_addr: {sapIp2Node2} + nodeid: 2 + } +} + +quorum { + provider: corosync_votequorum + expected_votes: 2 + two_node: 1 +} +---- + +=== Examples for alternate STONITH methods + +==== Example for deterministic SBD STONITH + +These SBD resources make sure that node {sapnode1} will win in case of split-brain. + +[subs="attributes,quotes"] +---- +primitive rsc_sbd_{sapnode1} stonith:external/sbd \ + params pcmk_host_list={sapnode2} pcmk_delay_base=0 + +primitive rsc_sbd_{sapnode2} stonith:external/sbd \ + params pcmk_host_list={sapnode1} pcmk_delay_base=30 +---- + +[[cha.hanasr-example-ipmi]] +==== Example for the IPMI STONITH method + +[subs="attributes,quotes"] +---- +primitive rsc_{sapnode1}_stonith stonith:external/ipmi \ + params hostname="{sapnode1}" ipaddr="{sapip1ipmi1}" userid="stonith" \ + passwd="k1llm3" interface="lanplus" \ + op monitor interval="1800" timeout="30" + ... +primitive rsc_{sapnode2}_stonith stonith:external/ipmi \ + params hostname="{sapnode2}" ipaddr="{sapip1ipmi2}" userid="stonith" \ + passwd="k1llm3" interface="lanplus" \ + op monitor interval="1800" timeout="30" + ... +location loc_{sapnode1}_stonith rsc_{sapnode1}_stonith -inf: {sapnode1} +location loc_{sapnode2}_stonith rsc_{sapnode2}_stonith -inf: {sapnode2} +---- + +[[cha.hanasr-example-systemv]] +=== Example for checking legacy SystemV integration + +include::SLES4SAP-hana-systemv-appendix.adoc[] + +++++ + +++++ + + +[[app.hana-sr.information]] +== References + +For more detailed information, have a look at the documents listed below. + +// SUSE docu, manual pages, TIDs and blogs. SAP guides and notes. +:leveloffset: 2 +include::SAPNotes_HANA20_angi_15.adoc[] +:leveloffset: 0 + +=== Pacemaker + +Pacemaker Project Documentation:: + https://clusterlabs.org/pacemaker/doc/ + +++++ + +++++ + + +// Standard SUSE Best Practices includes +== Legal notice +include::common_sbp_legal_notice.adoc[] + +++++ + +++++ + +// Standard SUSE Best Practices includes +include::common_gfdl1.2_i.adoc[] + +// +// REVISION 0.1 2024/04 +// - Initial version +// diff --git a/adoc/SLES4SAP-hana-scaleOut-PerfOpt-12.adoc b/adoc/SLES4SAP-hana-scaleOut-PerfOpt-12.adoc index 056414d4..c6ffc12a 100644 --- a/adoc/SLES4SAP-hana-scaleOut-PerfOpt-12.adoc +++ b/adoc/SLES4SAP-hana-scaleOut-PerfOpt-12.adoc @@ -2009,7 +2009,7 @@ Load the file to the cluster. In most installations, only the parameter *ip* needs to be set to the virtual IP address to be presented to the client systems. -Use the command `man ocf_heartbeat_IPAddr2` for details on additional parameters. +Use the command `man ocf_heartbeat_IPaddr2` for details on additional parameters. 
// NOTE: DONE PRIO1: remove nc stuff diff --git a/adoc/SLES4SAP-hana-scaleOut-PerfOpt-15.adoc b/adoc/SLES4SAP-hana-scaleOut-PerfOpt-15.adoc index 7826c9d8..ba4fa936 100644 --- a/adoc/SLES4SAP-hana-scaleOut-PerfOpt-15.adoc +++ b/adoc/SLES4SAP-hana-scaleOut-PerfOpt-15.adoc @@ -2213,7 +2213,7 @@ Load the file to the cluster. In most installations, only the parameter *ip* needs to be set to the virtual IP address to be presented to the client systems. -Use the command `man ocf_heartbeat_IPAddr2` for details on additional parameters. +Use the command `man ocf_heartbeat_IPaddr2` for details on additional parameters. ==== Constraints diff --git a/adoc/SLES4SAP-hana-scaleout-multitarget-perfopt-15.adoc b/adoc/SLES4SAP-hana-scaleout-multitarget-perfopt-15.adoc index 27fff8fe..b2de751b 100644 --- a/adoc/SLES4SAP-hana-scaleout-multitarget-perfopt-15.adoc +++ b/adoc/SLES4SAP-hana-scaleout-multitarget-perfopt-15.adoc @@ -2315,7 +2315,7 @@ Load the file to the cluster. In most installations, only the parameter *ip* needs to be set to the virtual IP address to be presented to the client systems. -Use the command `man ocf_heartbeat_IPAddr2` for details on additional parameters. +Use the command `man ocf_heartbeat_IPaddr2` for details on additional parameters. ==== Constraints @@ -2428,7 +2428,7 @@ Load the file to the cluster. In most installations, only the parameter *ip* needs to be set to the virtual IP address to be presented to the client systems. -Use the command `man ocf_heartbeat_IPAddr2` for details on additional parameters. +Use the command `man ocf_heartbeat_IPaddr2` for details on additional parameters. See also manual page SAPHanaSR-ScaleOut_basic_cluster(7). diff --git a/adoc/SLES4SAP-hana-sr-guide-PerfOpt-15.adoc b/adoc/SLES4SAP-hana-sr-guide-PerfOpt-15.adoc index 735d1785..f9acf2a1 100644 --- a/adoc/SLES4SAP-hana-sr-guide-PerfOpt-15.adoc +++ b/adoc/SLES4SAP-hana-sr-guide-PerfOpt-15.adoc @@ -53,6 +53,15 @@ collaboration with {SAP}, cloud service and hardware partners, {SUSE} provides t resource agents for customers to ensure the high availability of {HANA} system replications. +The here described HA setup using the SAPHanaSR package has been used in the past. +It still is supported for existing clusters. +For deploying new HA clusters, we recommend to use the SAPHanaSR-angi package +following +"{SAPHANA} System Replication Scale-Up - Performance Optimized Scenario". +// TODO PRIO1: add URL once published +// (https://documentation.suse.com/sbp/sap/html/SLES4SAP-hana-angi-perfopt-15). + + ==== Abstract This guide describes planning, setup, and basic testing of {sles4sap} based on @@ -3061,7 +3070,7 @@ cluster node. + [subs="specialchars,attributes,quotes"] ---- -# crm resource maintenance +# crm resource maintenance ---- // TODO PRIO2: Check -sr_takeover with 'handshake' @@ -3113,7 +3122,7 @@ cluster node as user root. + [subs="specialchars,attributes,quotes"] ---- -# crm resource refresh multi-state-resource-name +# crm resource refresh ---- + * Set the multi-state resource to the status managed again. The command @@ -3121,7 +3130,7 @@ can be submitted on any cluster node as user root. + [subs="specialchars,attributes,quotes"] ---- -# crm resource maintenance off +# crm resource maintenance off ---- ============================== @@ -3135,7 +3144,7 @@ migration. 
For the described attribute query using `SAPHanaSR-showAttr` and + [subs="attributes,specialchars,quotes"] ---- -# crm resource move *force* +# crm resource move *force* ---- + Because of the "move away" (*force*) rule, the cluster will *stop* the @@ -3176,7 +3185,7 @@ in our example on node1 as user _{refsidadm}_. + [subs="specialchars,attributes,quotes"] ---- -# crm resource clear +# crm resource clear ---- + NOTE: The crm resource command *clear* was previously named *unmigrate*. The *unmigrate* @@ -3512,10 +3521,12 @@ include::common_gfdl1.2_i.adoc[] // - corosync example with two rings // REVISION 1.6 2022/08 // - susChkSrv.py -// - updated examples, reference +// - updated examples, references // REVISION 1.6a 2023/04 // - SAP native systemd support is default for HANA 2.0 SPS07 // REVISION 1.6b 2024/02 // - HANA 2.0 SPS05 rev.059 Python 3 needed +// REVISION 1.6c 2024/03 +// - updated references +// - pointer to SAPHanaSR-angi // - diff --git a/adoc/SLES4SAP-hana-sr-guide-costopt-15.adoc b/adoc/SLES4SAP-hana-sr-guide-costopt-15.adoc index 453897c2..2adad4c1 100644 --- a/adoc/SLES4SAP-hana-sr-guide-costopt-15.adoc +++ b/adoc/SLES4SAP-hana-sr-guide-costopt-15.adoc @@ -3337,7 +3337,7 @@ cluster node. + [subs="specialchars,attributes,quotes"] ---- -# crm resource maintenance +# crm resource maintenance ---- // TODO PRIO2: Check -sr_takeover with 'handshake' @@ -3389,7 +3389,7 @@ cluster node as user root. + [subs="specialchars,attributes,quotes"] ---- -# crm resource refresh +# crm resource refresh ---- + * Set the multi-state resource to the status managed again. The command @@ -3397,7 +3397,7 @@ can be submitted on any cluster node as user root. + [subs="specialchars,attributes,quotes"] ---- -# crm resource maintenance off +# crm resource maintenance off ---- ============================== @@ -3411,7 +3411,7 @@ migration. For the described attribute query using `SAPHanaSR-showAttr` and + [subs="attributes,specialchars,quotes"] ---- -# crm resource move *force* +# crm resource move *force* ---- + Because of the "move away" (*force*) rule, the cluster will *stop* the @@ -3452,7 +3452,7 @@ in our example on node1 as user _{refsidadm}_. + [subs="specialchars,attributes,quotes"] ---- -# crm resource clear +# crm resource clear ---- + NOTE: The crm resource command *clear* was previously named *unmigrate*. The *unmigrate* @@ -3518,7 +3518,7 @@ preload_column_tables = false + [subs="specialchars,attributes,quotes"] ---- -# crm resource move force +# crm resource move force ---- * Wait until the cluster has finished the transition and is idle. @@ -3526,7 +3526,7 @@ Then remove the migration constraint from CIB. 
+ [subs="specialchars,attributes,quotes"] ---- -# crm resource clear +# crm resource clear ---- ============================== diff --git a/adoc/Var_SLES4SAP-hana-angi-perfopt-15-param.txt b/adoc/Var_SLES4SAP-hana-angi-perfopt-15-param.txt new file mode 100644 index 00000000..dba86716 --- /dev/null +++ b/adoc/Var_SLES4SAP-hana-angi-perfopt-15-param.txt @@ -0,0 +1,35 @@ +:sapsid : HA1 +:sapssid : ha1 +:sapino : 10 +:sapsite1 : WDF +:sapsite2 : ROT +:sapnode1 : suse01 +:sapnode2 : suse02 +:saplocation1 : Walldorf +:saplocation2 : Rot +:sapnpsid : QAS +:sapsnpsid : qas +:sapnpino : 20 +:sapip1node1 : 192.168.1.11 +:sapip1node2 : 192.168.1.12 +:sapip1srv1 : 192.168.1.20 +:sapip1srv2 : 192.168.1.21 +:sapip1net1 : 192.168.1.0 +:sapip1ipmi1 : 192.168.1.101 +:sapip1ipmi2 : 192.168.1.102 +:sapip1mcast1 : 238.50.0.1 +:sapip2node1 : 192.168.2.11 +:sapip2node2 : 192.168.2.12 +:sapip2srv1 : 192.168.2.20 +:sapip2srv2 : 192.168.2.21 +:sapip2net1 : 192.168.2.0 +:sapsbd1 : /dev/disk/by-id/SBDA +:sapsbd2 : /dev/disk/by-id/SBDB +:sapsbd3 : /dev/disk/by-id/SBDC +:deploy : Deployment Guide +:slehainit: ha-cluster-init +:slehajoin: ha-cluster-join +:clusterstart: crm cluster start +:clusterstop: crm cluster stop +:clusterstatus: systemctl status pacemaker +:sbdstatus: systemctl status sbd diff --git a/adoc/Var_SLES4SAP-hana-angi-perfopt-15.txt b/adoc/Var_SLES4SAP-hana-angi-perfopt-15.txt new file mode 100644 index 00000000..649a69d2 --- /dev/null +++ b/adoc/Var_SLES4SAP-hana-angi-perfopt-15.txt @@ -0,0 +1,207 @@ +:usecase: performance optimized +:prodNr: 15 +:prodSP: SP6 +:testName: Test +:testComp: Component: +:testDescr: Description: +:testExpect: Expected: +:testProc: Test Procedure: +:testRecover: Recovery Procedure: +:exampleuser: tux +:exampleuserII: wilber +:examplegroup: users +:grub: GRUB +:lilo: LILO +:sax: SaX2 +:atomic-update: Atomic Update +:suse: SUSE +:novell: Novell +:suseconfig: SuSEconfig +:suselinux: {suse} Linux +:suseonsite: {suse} Studio Onsite +:reg: (R) +:suseonsitereg: {suseonsite}{reg} +:yast: YaST +:x86: x86 +:amd64: AMD64 +:s390: S/390 +:zseries: System z +:ipf: Itanium +:ipseries: POWER +:ppc: POWER +:em64t: Intel EM64T +:intel64: Intel 64 +:x86-64: {amd64}/{intel64} +:opensuse: openSUSE +:opensusereg: openSUSE{reg} +:sle: SUSE Linux Enterprise +:slea: SLE +:sas: {sle} Advanced Server +:cpr: (C) +:trmk: (TM) +:trade: {trmk} +:uarr: +:slereg: SUSE{reg} Linux Enterprise +:slert: {sle} Real Time +:slerte: {slert} Extension +:slertreg: {slereg} Real Time +:slertereg: {slereg} Real Time Extension +:sle: SUSE Linux Enterprise +:sls: SUSE Linux Enterprise Server +:sles: {sls} +:slsa: SLES +:slsreg: SUSE{reg} Linux Enterprise Server +:sles4sapa: {slsa} for SAP +:sles4sap: {sls} for SAP Applications +:sles4sapreg: {slsreg} for SAP Applications +:sleha: {sle} High Availability Extension +:sdk: SUSE Software Development Kit +:slreg: SUSE{reg} Linux +:hasi: High Availability Extension +:sleha: {sle} {hasi} +:sletcreg: {slereg} Thin Client +:sletc: {sle} Thin Client +:sletca: SLETC +:tc: Thin Client +:nu: NU +:scc: {suse} Customer Center +:suma: {suse} Manager +:yup: yup +:imgcreat: Image Creator +:admserv: Administration Server +:branchserv: Branch Server +:imgserv: Image Building Server +:posbranchserv: POSBranch Server +:pos: Point of Service +:wy: WebYaST +:wyclient: {yast}2 Webclient +:wyservice: {yast}2 Webservice +:slms: SUSE Lifecycle Management Server +:slmsreg: SUSE{reg} Lifecycle Management Server +:obs: Open Build Service +:obsa: OBS +:oes: Open Enterprise Server +:rhel: RedHat 
Enterprise Linux +:musicplayer: Banshee +:musicplayerreg: Banshee{trade} +:zenup: Software Updater +:updater: openSUSE Updater +:gupdater: Update Applet +:kupdater: Software Updater +:aa: AppArmor +:aareg: AppArmor{reg} +:naa: {aa} +:naareg: {aareg} +:hb: Heartbeat +:hbvs: Heartbeat 2 +:hbgui: Linux HA Management Client +:smtool: Subscription Management Tool +:smt: SMT +:rmtool: Repository Mirroring Tool +:rmt: RMT +:lxc: LXC +:xen: Xen +:xenreg: Xen{reg} +:kvm: KVM +:vmhost: VM Host Server +:vmguest: VM Guest +:dom0: Domain0 +:vmm: Virtual Machine Manager +:pk: PolicyKit +:ha: High Availability +:ais: OpenAIS +:stonith: STONITH +:susefirewall: SuSEFirewall2 +:pciback: PCI Pass-Through +:usbback: USB Pass-Through +:vgaback: VGA Pass-Through +:lvs: Linux Virtual Server +:krandr: KRandRTray +:amarok: Amarok +:digikam: digiKam +:postgresql: PostgreSQL +:mysql: MySQL +:jeos: JeOS +:stap: SystemTap +:oprof: OProfile +:cpufreq: CPUfreq +:powertop: powerTOP +:thrdmrk: * +:kexec: Kexec +:kdump: Kdump +:ycc_runlevel: System Services (Runlevel) +:kiwi: KIWI +:cdcreator: CD Creator +:addoncreator: Add-on Creator +:imgcreator: Image Creator +:productcreator: Product Creator +:nomad: Nomad +:cpufreq: CPUfreq +:qemu: QEMU +:wypublic: /srv/www/yast/public/ +:euro: € +:sapwiz: SAP Installation Wizard +:netweaver: SAP NetWeaver +:sap: SAP +:sapreg: SAP* +:saphana: SAP HANA +:hana: SAP HANA +:s4hana: SAP S/4HANA +:saphanasr: SAPHanaSR-angi +:saphanara: SAPHanaController +:saphanatopra: SAPHanaTopology +:saphanafilra: SAPHanaFilesystem +:b1: SAP BusinessOne +:instmaster: Installation Master +:instmedia: Installation Media +:mediaset: Media Set +:supmedia: Supplementary Media +:sapin: SAP Installer +:sapina: SAPinst +:thirdmedia: Third-Party Media +:pn15: {prodNr} +:pn12: 12 +:pn11: 11 +:psp15: {prodSP} +:psp12: SP5 +:psp11: SP4 +:file1: hanasr_appendix_docupdates.xml +:sapnote15: 2578899 +:sapnote12: 1984787 +:sapnote11: 1310037 +:sapnoteset15: 2684254 +:sapnoteset12: 2205917 +:sapnoteset11: 1954788 +:deploymentGuide15: https://documentation.suse.com/sles/15-{psp15}/single-html/SLES-deployment/ +:autoYastGuide15: https://documentation.suse.com/sles/15-{psp15}/html/SLES-all/book-autoyast.html +:haDocs15: https://documentation.suse.com/sle-ha/15-{psp15}/ +:haquickstart15: https://documentation.suse.com/sle-ha/15-{psp15}/single-html/SLE-HA-installation/ +:haAdminGuide15: https://documentation.suse.com/sle-ha/15-{psp15}/single-html/SLE-HA-administration/ +:sles4sapGuide15: https://documentation.suse.com/sles-sap/15-{psp15}/html/SLES-SAP-guide/ +:sles4sapDocs15: https://documentation.suse.com/sles-sap/15-{psp15}/ +:rmtGuide15: https://documentation.suse.com/sles/15-{psp15}/html/SLES-all/book-rmt.html +:tuningGuide15: https://documentation.suse.com/sbp/all/single-html/SBP-performance-tuning/ +:storageGuide15: https://documentation.suse.com/sles/15-{psp15}/single-html/SLES-storage/ +:persMemDoc: https://documentation.suse.com/sles/15-{psp15}/html/SLES-all/cha-nvdimm.html +:launchPadNotes: https://launchpad.support.sap.com/#/notes/ +:tidNotes: https://www.suse.com/support/kb/doc/?id= +:susedoc: https://documentation.suse.com/ +:reslibrary: https://documentation.suse.com/sbp/sap/ +:sumalandingpage: https://www.suse.com/products/suse-manager/ +:sumadoc: https://documentation.suse.com/external-tree/en-us/suma/4.1/suse-manager/index.html +:scclandingpage: https://scc.suse.com/ +:sccfaq: {scclandingpage}docs/help +:refsidadm: adm +:refHost1: +:refInst: +:refSidLc: +:refSiteA: +:refSiteB: +:refSid: +:docScaleOut: SAP HANA 
System Replication Scale-Out - Performance Optimized Scenario +:docCostOpt: Setting up a SAP HANA SR Cost Optimized Infrastructure +:docPerfOpt: SAP HANA System Replication Scale-Up - Performance Optimized Scenario +:dochaquickstart: Installation and Setup Quick Start +:sapHanaSrMinVers: 1.2 +:haDrCostOptMem: srCostOptMemConfig +:haDrCostOptMemPy: {haDrCostOptMem}.py