From 578826341399705e7deee63f3ee0f237b01ecd19 Mon Sep 17 00:00:00 2001
From: Ryan Emerson <remerson@redhat.com>
Date: Wed, 29 May 2024 11:33:13 +0100
Subject: [PATCH] Document Failover Lambda for Active/Passive deployments

Closes #29787

Signed-off-by: Ryan Emerson <remerson@redhat.com>
Signed-off-by: Alexander Schwartz <aschwart@redhat.com>
Co-authored-by: Alexander Schwartz <aschwart@redhat.com>
Co-authored-by: Alexander Schwartz <alexander.schwartz@gmx.net>
Co-authored-by: andymunro <48995441+andymunro@users.noreply.github.com>
---
 .../release_notes/topics/25_0_0.adoc          |   4 +
 .../bblocks-active-passive-sync.adoc          |   2 +-
 .../deploy-aws-route53-failover-lambda.adoc   | 252 ++++++++++++++++++
 .../deploy-aws-route53-loadbalancer.adoc      |  13 +-
 .../high-availability/introduction.adoc       |   1 +
 .../high-availability/operate-failover.adoc   |  11 +-
 .../operate-switch-back.adoc                  |   2 +-
 .../operate-switch-over.adoc                  |   3 +-
 docs/guides/high-availability/pinned-guides   |   1 +
 9 files changed, 280 insertions(+), 9 deletions(-)
 create mode 100644 docs/guides/high-availability/deploy-aws-route53-failover-lambda.adoc

diff --git a/docs/documentation/release_notes/topics/25_0_0.adoc b/docs/documentation/release_notes/topics/25_0_0.adoc
index 4699fd33e2..979826b9d5 100644
--- a/docs/documentation/release_notes/topics/25_0_0.adoc
+++ b/docs/documentation/release_notes/topics/25_0_0.adoc
@@ -164,6 +164,10 @@ This eliminates the need to execute the build phase and rebuild your image due t
 
 For more details, see the link:{upgradingguide_link}[{upgradingguide_name}].
 
+= High availability guide enhanced
+
+The high availability guide now contains a {section} on how to configure an AWS Lambda to prevent an unintended automatic failback from the Backup site to the Primary site.
+
 = Removing deprecated methods from `AccessToken`, `IDToken`, and `JsonWebToken` classes
 
 In this release, we are finally removing deprecated methods from the following classes:
diff --git a/docs/guides/high-availability/bblocks-active-passive-sync.adoc b/docs/guides/high-availability/bblocks-active-passive-sync.adoc
index 81e4075ff3..a499f6a61b 100644
--- a/docs/guides/high-availability/bblocks-active-passive-sync.adoc
+++ b/docs/guides/high-availability/bblocks-active-passive-sync.adoc
@@ -65,7 +65,7 @@ A clustered deployment of {project_name} in each site, connected to an external
 
 A load balancer which checks the `/lb-check` URL of the {project_name} deployment in each site.
 
-*Blueprint:* <@links.ha id="deploy-aws-route53-loadbalancer"/>.
+*Blueprint:* <@links.ha id="deploy-aws-route53-loadbalancer"/>, optionally enhanced with <@links.ha id="deploy-aws-route53-failover-lambda"/>
 
 *Not considered:* AWS Global Accelerator as it supports only weighted traffic routing and not active-passive failover.
 To support active-passive failover, additional logic using, for example, AWS CloudWatch and AWS Lambda would be necessary to simulate the active-passive handling by adjusting the weights when the probes fail.
diff --git a/docs/guides/high-availability/deploy-aws-route53-failover-lambda.adoc b/docs/guides/high-availability/deploy-aws-route53-failover-lambda.adoc
new file mode 100644
index 0000000000..5bb091149b
--- /dev/null
+++ b/docs/guides/high-availability/deploy-aws-route53-failover-lambda.adoc
@@ -0,0 +1,252 @@
+<#import "/templates/guide.adoc" as tmpl>
+<#import "/templates/links.adoc" as links>
+
+<@tmpl.guide
+title="Deploy an AWS Route 53 Failover Lambda"
+summary="Building block for loadbalancer resilience"
+tileVisible="false" >
+
+After a Primary cluster has failed over to a Backup cluster due to a health check failure, the Primary must only serve requests
+again after the SRE team has synchronized the two sites first as outlined in the <@links.ha id="operate-switch-back" /> {section}.
+
+If the Primary site were marked as healthy by the Route 53 Health Check before the sites are synchronized, the Primary site would start serving requests with outdated session and realm data.
+
+This {section} shows how an automatic fallback to a not-yet synchronized Primary site can be prevented with the help of AWS CloudWatch, SNS, and Lambda.
+
+== Architecture
+
+In the event of a Primary cluster failure, an https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html[AWS CloudWatch]
+alarm sends a message to an https://aws.amazon.com/sns[AWS SNS] topic, which then triggers an https://aws.amazon.com/lambda/[AWS Lambda] function.
+The Lambda function updates the Route53 health check of the Primary cluster so that it points to a non-existent path
+`/lb-check-failed-over`, thus ensuring that it is impossible for the Primary to be marked as healthy until the path is
+manually changed back to `/lb-check`.
+
+== Prerequisites
+
+* Deployment of {project_name} as described in the <@links.ha id="deploy-keycloak-kubernetes" /> {section} on a ROSA cluster running OpenShift 4.14 or later in two AWS availability zones in one AWS region.
+* A Route53 configuration as described in the <@links.ha id="deploy-aws-route53-loadbalancer" /> {section}.
+
+== Procedure
+
+. Create an SNS topic to trigger a Lambda.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+PRIMARY_HEALTH_ID=233e180f-f023-45a3-954e-415303f21eab #<1>
+ALARM_NAME=${PRIMARY_HEALTH_ID}
+TOPIC_NAME=${PRIMARY_HEALTH_ID}
+FUNCTION_NAME=${PRIMARY_HEALTH_ID}
+TOPIC_ARN=$(aws sns create-topic --name ${TOPIC_NAME} \
+  --query "TopicArn" \
+  --tags "Key=HealthCheckId,Value=${PRIMARY_HEALTH_ID}" \
+  --region us-east-1 \
+  --output text
+)
+</#noparse>
+----
+<1> Replace this with the ID of the xref:create-health-checks[Health Check] associated with your Primary cluster
++
+. Create a CloudWatch alarm to send a message to the SNS topic.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+aws cloudwatch put-metric-alarm \
+  --alarm-actions ${TOPIC_ARN} \
+  --actions-enabled \
+  --alarm-name ${ALARM_NAME} \
+  --dimensions "Name=HealthCheckId,Value=${PRIMARY_HEALTH_ID}" \
+  --comparison-operator LessThanThreshold \
+  --evaluation-periods 1 \
+  --metric-name HealthCheckStatus \
+  --namespace AWS/Route53 \
+  --period 60 \
+  --statistic Minimum \
+  --threshold 1.0 \
+  --treat-missing-data notBreaching \
+  --region us-east-1
+</#noparse>
+----
++
+. Create the Role used to execute the Lambda.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+ROLE_ARN=$(aws iam create-role \
+  --role-name ${FUNCTION_NAME} \
+  --assume-role-policy-document \
+  '{
+    "Version": "2012-10-17",
+    "Statement": [
+      {
+        "Effect": "Allow",
+        "Principal": {
+          "Service": "lambda.amazonaws.com"
+        },
+        "Action": "sts:AssumeRole"
+      }
+    ]
+  }' \
+  --query 'Role.Arn' \
+  --region us-east-1 \
+  --output text
+)
+</#noparse>
+----
++
+. Create a policy with the permissions required by the Lambda.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+POLICY_ARN=$(aws iam create-policy \
+  --policy-name ${FUNCTION_NAME} \
+  --policy-document \
+  '{
+      "Version": "2012-10-17",
+      "Statement": [
+          {
+              "Effect": "Allow",
+              "Action": [
+                  "route53:UpdateHealthCheck"
+              ],
+              "Resource": "*"
+          }
+      ]
+  }' \
+  --query 'Policy.Arn' \
+  --region us-east-1 \
+  --output text
+)
+</#noparse>
+----
++
+. Attach the custom policy to the Lambda role.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+aws iam attach-role-policy \
+  --role-name ${FUNCTION_NAME} \
+  --policy-arn ${POLICY_ARN} \
+  --region us-east-1
+</#noparse>
+----
++
+. Attach the `AWSLambdaBasicExecutionRole` policy so that the Lambda logs can be written to CloudWatch.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+aws iam attach-role-policy \
+  --role-name ${FUNCTION_NAME} \
+  --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole \
+  --region us-east-1
+</#noparse>
+----
++
+. Create a Lambda ZIP file.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+LAMBDA_ZIP=/tmp/lambda.zip
+cat << EOF > /tmp/lambda.py
+import boto3
+import json
+
+
+def handler(event, context):
+    print(json.dumps(event, indent=4))
+    # The CloudWatch alarm is delivered as a JSON string inside the SNS record.
+    msg = json.loads(event['Records'][0]['Sns']['Message'])
+    healthCheckId = msg['Trigger']['Dimensions'][0]['value']
+    # Point the health check at a non-existent path so the Primary stays unhealthy.
+    r53Client = boto3.client("route53")
+    response = r53Client.update_health_check(
+        HealthCheckId=healthCheckId,
+        ResourcePath="/lb-check-failed-over"
+    )
+    # default=str stringifies response values that json cannot serialize natively.
+    print(json.dumps(response, indent=4, default=str))
+    statusCode = response['ResponseMetadata']['HTTPStatusCode']
+    if statusCode != 200:
+        raise Exception("Route 53 Unexpected status code %d" % statusCode)
+
+EOF
+zip -FS --junk-paths ${LAMBDA_ZIP} /tmp/lambda.py
+</#noparse>
+----
++
+. Create the Lambda function.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+FUNCTION_ARN=$(aws lambda create-function \
+  --function-name ${FUNCTION_NAME} \
+  --zip-file fileb://${LAMBDA_ZIP} \
+  --handler lambda.handler \
+  --runtime python3.11 \
+  --role ${ROLE_ARN} \
+  --query 'FunctionArn' \
+  --output text \
+  --region eu-west-1 #<1>
+)
+</#noparse>
+----
+<1> Replace with the AWS region hosting your ROSA cluster
+
+. Allow the SNS to trigger the Lambda.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+aws lambda add-permission \
+  --function-name ${FUNCTION_NAME} \
+  --statement-id function-with-sns \
+  --action 'lambda:InvokeFunction' \
+  --principal 'sns.amazonaws.com' \
+  --source-arn ${TOPIC_ARN} \
+  --region eu-west-1 #<1>
+</#noparse>
+----
+<1> Replace with the AWS region hosting your ROSA cluster
+
+. Invoke the Lambda when the SNS message is received.
++
+.Command:
+[source,bash]
+----
+<#noparse>
+aws sns subscribe --protocol lambda \
+  --topic-arn ${TOPIC_ARN} \
+  --notification-endpoint ${FUNCTION_ARN} \
+  --region us-east-1
+</#noparse>
+----
+
+== Verify
+
+To test the Lambda is triggered as expected, log in to the Primary cluster and scale the {project_name} deployment to zero Pods.
+Scaling will cause the Primary's health checks to fail and the following should occur:
+
+* Route53 should start routing traffic to the {project_name} Pods on the Backup cluster.
+* The Route53 health check for the Primary cluster should have `ResourcePath=/lb-check-failed-over`
+
+To direct traffic back to the Primary site, scale up the {project_name} deployment and manually revert the changes to the Route53 health check the Lambda has performed.
+
+For more information, see the <@links.ha id="operate-switch-back" /> {section}.
+
+</@tmpl.guide>
\ No newline at end of file
diff --git a/docs/guides/high-availability/deploy-aws-route53-loadbalancer.adoc b/docs/guides/high-availability/deploy-aws-route53-loadbalancer.adoc
index 013c9ae15d..ebb927aebb 100644
--- a/docs/guides/high-availability/deploy-aws-route53-loadbalancer.adoc
+++ b/docs/guides/high-availability/deploy-aws-route53-loadbalancer.adoc
@@ -73,7 +73,7 @@ aws elbv2 describe-load-balancers \
 +
 NOTE: ROSA clusters running OpenShift 4.13 and earlier use classic load balancers instead of application load balancers. Use the `aws elb describe-load-balancers` command and an updated query string instead.
 
-. Create Route53 health checks
+. [[create-health-checks]]Create Route53 health checks
 +
 .Command:
 [source,bash]
@@ -87,6 +87,7 @@ function createHealthCheck() {
   --query "HealthCheck.Id" \
   --no-cli-pager \
   --output text \
+  --region us-east-1 \
   --health-check-config '
   {
     "Type": "HTTPS",
@@ -136,6 +137,7 @@ BACKUP_HEALTH_ID=799e2cbb-43ae-4848-9b72-0d9173f04912
 aws route53 change-resource-record-sets \
   --hosted-zone-id Z09084361B6LKQQRCVBEY \
   --query "ChangeInfo.Id" \
+  --region us-east-1 \
   --output text \
   --change-batch '
   {
@@ -208,7 +210,7 @@ aws route53 change-resource-record-sets \
 .Command:
 [source,bash]
 ----
-aws route53 wait resource-record-sets-changed --id /change/C053410633T95FR9WN3YI
+aws route53 wait resource-record-sets-changed --id /change/C053410633T95FR9WN3YI --region us-east-1
 ----
 +
 . Update or create the {project_name} deployment
@@ -271,4 +273,9 @@ Navigate to the chosen CLIENT_DOMAIN in your local browser and log in to the {pr
 To test failover works as expected, log in to the Primary cluster and scale the {project_name} deployment to zero Pods.
 Scaling will cause the Primary's health checks to fail and Route53 should start routing traffic to the {project_name} Pods on the Backup cluster.
 
-</@tmpl.guide>
+== Optional: Failover Lambda
+
+To prevent a failed Primary cluster from becoming active without SRE input, follow the steps outlined in the
+guide <@links.ha id="deploy-aws-route53-failover-lambda" />
+
+</@tmpl.guide>
\ No newline at end of file
diff --git a/docs/guides/high-availability/introduction.adoc b/docs/guides/high-availability/introduction.adoc
index 8a9fbce7b2..6bf9300d1a 100644
--- a/docs/guides/high-availability/introduction.adoc
+++ b/docs/guides/high-availability/introduction.adoc
@@ -33,6 +33,7 @@ Additional performance tuning and security hardening are still recommended when
 * <@links.ha id="deploy-infinispan-kubernetes-crossdc" />
 * <@links.ha id="connect-keycloak-to-external-infinispan" />
 * <@links.ha id="deploy-aws-route53-loadbalancer" />
+* <@links.ha id="deploy-aws-route53-failover-lambda" />
 
 == Operational procedures
 
diff --git a/docs/guides/high-availability/operate-failover.adoc b/docs/guides/high-availability/operate-failover.adoc
index 527e86d03c..cedebec91d 100644
--- a/docs/guides/high-availability/operate-failover.adoc
+++ b/docs/guides/high-availability/operate-failover.adoc
@@ -13,7 +13,7 @@ A failover from the primary site to the secondary site will happen automatically
 
 When the primary site loses its state in {jdgserver_name} or a network partition occurs that prevents the synchronization, manual procedures are necessary to recover the primary site before it can handle traffic again, see the <@links.ha id="operate-switch-back" /> {section}.
 
-To prevent an automatic fallback to the primary site before those manual steps have been performed, configure the loadbalancer as described following to prevent this from happening automatically.
+To prevent fallback to the primary site before those manual steps have been performed, follow the procedure outlined in this guide.
 
 For a graceful switch to the secondary site, follow the instructions in the <@links.ha id="operate-switch-over" /> {section}.
 
@@ -21,11 +21,16 @@ See the <@links.ha id="introduction" /> {section} for different operational proc
 
 == Procedure
 
-Follow these steps to manually force a failover.
+Follow these steps to prevent an automatic failover back to the Primary site or to manually force a failover.
 
 === Route53
 
-To force Route53 to mark the primary site as permanently not available and prevent an automatic fallback, edit the health check in AWS to point to a non-existent route (`health/down`).
+To force Route53 to mark the primary site as permanently not available and prevent an automatic fallback, edit the health check in AWS to point to a non-existent route (`/lb-check-failed-over`).
+
+== Optional: Failover Lambda
+
+To prevent a failed Primary cluster from becoming active without SRE input, follow the steps outlined in the
+guide <@links.ha id="deploy-aws-route53-failover-lambda" />
 
 </@tmpl.guide>
 
diff --git a/docs/guides/high-availability/operate-switch-back.adoc b/docs/guides/high-availability/operate-switch-back.adoc
index bfb498d34e..4d520abb5e 100644
--- a/docs/guides/high-availability/operate-switch-back.adoc
+++ b/docs/guides/high-availability/operate-switch-back.adoc
@@ -74,7 +74,7 @@ include::partials/aurora/aurora-failover.adoc[]
 
 === Route53
 
-If switching over to the secondary site has been triggered by changing the health endpoint, edit the health check in AWS to point to a correct endpoint (`health/live`).
+If switching over to the secondary site has been triggered by changing the health endpoint, edit the health check in AWS to point to a correct endpoint (`/lb-check`).
 After some minutes, the clients will notice the change and traffic will gradually move over to the secondary site.
 
 == Further reading
diff --git a/docs/guides/high-availability/operate-switch-over.adoc b/docs/guides/high-availability/operate-switch-over.adoc
index 8e8a7882fe..e510fb3636 100644
--- a/docs/guides/high-availability/operate-switch-over.adoc
+++ b/docs/guides/high-availability/operate-switch-over.adoc
@@ -85,7 +85,8 @@ No action required.
 
 === Route53
 
-To force Route53 to mark the primary site as not available, edit the health check in AWS to point to a non-existent route (`health/down`).  After some minutes, the clients will notice the change and traffic will gradually move over to the secondary site.
+To force Route53 to mark the primary site as not available, edit the health check in AWS to point to a non-existent route (`/lb-check-switched-over`).
+After some minutes, the clients will notice the change and traffic will gradually move over to the secondary site.
 
 == Further reading
 
diff --git a/docs/guides/high-availability/pinned-guides b/docs/guides/high-availability/pinned-guides
index 3590a3f318..2ad53d1e3a 100644
--- a/docs/guides/high-availability/pinned-guides
+++ b/docs/guides/high-availability/pinned-guides
@@ -6,6 +6,7 @@ deploy-keycloak-kubernetes
 deploy-infinispan-kubernetes-crossdc
 connect-keycloak-to-external-infinispan
 deploy-aws-route53-loadbalancer
+deploy-aws-route53-failover-lambda
 operate-failover
 operate-switch-over
 operate-network-partition-recovery