# restore_playbook.yaml
---
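# High-level flow of this playbook (summarized from the tasks below): check the
# EtcdRestore CR status and exit if a restore already completed; validate the S3
# parameters; pause the EtcdCluster so its operator stops reconciling; delete the
# StatefulSet and member pods; run one restore pod per member that downloads the
# backup and runs `etcdctl snapshot restore` into the member's PVC; then delete
# the restore pods, unpause the cluster, and report Complete or Failed on the CR.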
- hosts: localhost
  tasks:
    - debug: msg="Running Etcd Restore Ansible Operator Playbook"
    - name: Set restore defaults
      set_fact:
        restore_s3: true
        etcd_secure_client: "absent"
        etcd_secure_peer: "absent"
    - name: Look up EtcdRestore CR status
      set_fact:
        cr: "{{ q('k8s',
                  api_version='etcd.database.coreos.com/v1beta2',
                  kind='EtcdRestore',
                  namespace=meta.namespace,
                  resource_name=meta.name)
              }}"
    - meta: end_play
      when: cr[0].get("status", {}).get("phase", "") == "Complete"
    - debug: msg="phase is not Complete, treating this as the first run"
    - name: Update EtcdRestore CR status to Restoring
      k8s_status:
        api_version: "etcd.database.coreos.com/v1beta2"
        kind: "EtcdRestore"
        name: "{{ meta.name }}"
        namespace: "{{ meta.namespace }}"
        status:
          phase: Restoring
          reason: "Restoring EtcdCluster {{ meta.namespace }}/{{ meta.name }}"
    - name: Set fail=true if S3 path is undefined
      set_fact:
        fail_reconciliation: true
        fail_reason: "S3 path is undefined"
      when: s_3.path is undefined
    - name: Set fail=true if AWS secret is undefined
      set_fact:
        fail_reconciliation: true
        fail_reason: "AWS Secret is undefined"
      when: s_3.aws_secret is undefined
    - name: Get EtcdCluster CR and save the spec
      set_fact:
        cluster_spec: "{{ q('k8s',
                            api_version='etcd.database.coreos.com/v1beta2',
                            kind='EtcdCluster',
                            namespace=meta.namespace,
                            resource_name=meta.name)
                        }}"
    - name: Set fail=true if EtcdCluster CR not found
      set_fact:
        fail_reconciliation: true
        fail_reason: "EtcdCluster CR {{ meta.namespace }}/{{ meta.name }} not found"
      when: cluster_spec|length == 0
    - name: Determine if secure client
      set_fact:
        etcd_secure_client: "present"
      when:
        - cluster_spec[0].spec.TLS is defined
        - cluster_spec[0].spec.TLS.static is defined
        - cluster_spec[0].spec.TLS.static.operatorSecret is defined
        - cluster_spec[0].spec.TLS.static.operatorSecret|length > 0
    - name: Determine if secure peer
      set_fact:
        etcd_secure_peer: "present"
      when:
        - cluster_spec[0].spec.TLS is defined
        - cluster_spec[0].spec.TLS.static is defined
        - cluster_spec[0].spec.TLS.static.member is defined
        - cluster_spec[0].spec.TLS.static.member.peerSecret is defined
        - cluster_spec[0].spec.TLS.static.member.peerSecret|length > 0
    # - name: call create certs on secure client
    #   vars:
    #     tls_secret_name: "{{ _tls.static.operator_secret }}"
    #   include_tasks: create_certs.yaml
    #   when: etcd_secure_client == "present"
    - block:
        - name: Set EtcdRestore status to Failed when validation fails
          k8s_status:
            api_version: "etcd.database.coreos.com/v1beta2"
            kind: "EtcdRestore"
            name: "{{ meta.name }}"
            namespace: "{{ meta.namespace }}"
            status:
              phase: Failed
              reason: "{{ fail_reason }}"
        - debug: msg="{{ fail_reason }}"
        - meta: end_play
      when: fail_reconciliation is defined
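    # Pause the EtcdCluster CR so the etcd operator stops reconciling while the
    # StatefulSet and member pods are replaced by restore pods.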
    - name: Pause EtcdCluster CR
      k8s:
        state: present
        definition:
          apiVersion: "etcd.database.coreos.com/v1beta2"
          kind: "EtcdCluster"
          metadata:
            name: "{{ meta.name }}"
            namespace: "{{ meta.namespace }}"
          spec:
            pause: true
    # The EtcdCluster operator may be in the middle of a blocking reconciliation
    # call when paused, so it might not relinquish control immediately.
    - name: Wait for EtcdCluster Operator to relinquish control
      k8s_facts:
        kind: EtcdCluster
        api_version: etcd.database.coreos.com/v1beta2
        namespace: "{{ meta.namespace }}"
        name: "{{ meta.name }}"
      register: etcd_cr
      until: etcd_cr.get("resources", []) and etcd_cr.resources[0].get("status", {}).get("controlPaused", "") == true
      retries: 30
      delay: 5
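    # With the operator paused, remove the cluster's StatefulSet and pods so the
    # per-member PVCs can be repopulated from the backup.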
    - name: Delete EtcdCluster StatefulSet
      k8s:
        state: absent
        definition:
          apiVersion: "apps/v1"
          kind: "StatefulSet"
          metadata:
            name: "{{ meta.name }}"
            namespace: "{{ meta.namespace }}"
    - name: Delete EtcdCluster member/restore pods
      k8s:
        state: absent
        api_version: "v1"
        kind: "Pod"
        namespace: "{{ meta.namespace }}"
        name: "{{ meta.name }}-{{ item|string }}"
      with_sequence: start=0 end={{ cluster_spec[0].spec.size|int - 1 }}
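    # set_fact combined with until/retries acts as a polling loop here: the `pods`
    # var re-runs the k8s lookup on every retry, so the task only succeeds once no
    # pods carrying the etcd_cluster/app labels remain.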
    - name: Wait while all pods terminate
      vars:
        pods: "{{ q('k8s', api_version='v1', kind='Pod', namespace=meta.namespace,
                    label_selector='etcd_cluster=' + meta.name + ',app=etcd') }}"
      set_fact:
        c_pods: "{{ pods }}"
      until: pods|length == 0
      retries: 50
      delay: 10
    - name: Wait while all restore pods terminate
      vars:
        pods: "{{ q('k8s', api_version='v1', kind='Pod', namespace=meta.namespace,
                    label_selector='etcd_cluster=' + meta.name + ',app=etcd') }}"
      set_fact:
        c_pods: "{{ pods }}"
      until: pods|length == 0
      retries: 50
      delay: 10
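    # fetch_args is passed to the fetch-backup init container below, which runs
    # /tmp/ansible/get_file.yaml inside the etcd-ansible-restore image to download
    # the backup into /var/run/etcd (the member's data volume).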
    - name: set variables for etcd restore
      set_fact:
        fetch_args:
          - "/tmp/ansible/get_file.yaml"
          - "-e"
          - "s_3_path={{ s_3.path }}"
          - "-e"
          - "s_3_aws_secret={{ s_3.aws_secret }}"
          - "-e"
          - "storage_type={{ backup_storage_type }}"
          - "-e"
          - "etcd_namespace={{ meta.namespace }}"
          - "-e"
          - "file_path=/var/run/etcd"
        initial_cluster: []
        etcd_token: "{{ meta.name }}"
        image: "{{ 'quay.io/coreos/etcd:v' + cluster_spec[0].spec.version | string }}"
        etcd_volume_mounts:
          - mountPath: "/var/run/etcd"
            name: "data"
    - block:
        - name: generate initial cluster variable
          set_fact:
            initial_cluster: "{{ initial_cluster + [meta.name + '-' + (item|string) + '=http://' + meta.name + '-' + (item|string) + '.' + meta.name + '.' + meta.namespace + '.svc.cluster.local:2380'] }}"
          with_sequence: start=0 end={{ cluster_spec[0].spec.size|int - 1 }}
        - name: Create Restore Pods
          vars:
            etcd_volumes:
              - name: data
                persistentVolumeClaim:
                  claimName: "data-{{ meta.name }}-{{ item|string }}"
            pod_name: "{{ meta.name }}-{{ item|string }}"
            etcd_peer_url: "http://{{ meta.name }}-{{ item|string }}.{{ meta.name }}.{{ meta.namespace }}.svc.cluster.local:2380"
            init_containers:
              - name: "fetch-backup"
                image: "docker.io/alaypatel07/etcd-ansible-restore"
                command: ["ansible-playbook"]
                args: "{{ fetch_args }}"
                imagePullPolicy: Always
                volumeMounts: "{{ etcd_volume_mounts }}"
              - name: "delete-datadir"
                image: "{{ image }}"
                command:
                  - "/bin/sh"
                args:
                  - "-ec"
                  - "if [ -d '/var/run/etcd/default.etcd' ]; then rm -rf /var/run/etcd/default.etcd; fi && if [ -f '/var/run/etcd/member_id' ]; then rm -f /var/run/etcd/member_id; fi"
                volumeMounts: "{{ etcd_volume_mounts }}"
          k8s:
            state: present
            definition:
              apiVersion: "v1"
              kind: Pod
              metadata:
                name: "{{ pod_name }}"
                namespace: "{{ meta.namespace }}"
                labels:
                  etcd_cluster: "{{ meta.name }}"
                  app: "etcd"
              spec:
                serviceAccountName: "etcd-ansible-operator"
                initContainers: "{{ init_containers }}"
                hostname: "{{ pod_name }}"
                subdomain: "{{ meta.name }}"
                containers:
                  - name: "restore-datadir"
                    image: "{{ image }}"
                    command:
                      - "/bin/sh"
                    args:
                      - "-ec"
                      - "while true; do
                          echo 'Waiting for {{ pod_name }} to come up';
                          ping -W 1 -c 1 {{ pod_name }}.{{ meta.name }}.{{ meta.namespace }}.svc.cluster.local > /dev/null && break;
                          sleep 1s;
                        done &&
                        ETCDCTL_API=3 etcdctl snapshot restore /var/run/etcd/latest.backup
                        --name {{ pod_name }}
                        --initial-cluster {{ initial_cluster | join(',') }}
                        --initial-cluster-token {{ etcd_token }}
                        --initial-advertise-peer-urls {{ etcd_peer_url }}
                        --data-dir /var/run/etcd/default.etcd 2>/dev/termination-log"
                    volumeMounts: "{{ etcd_volume_mounts }}"
                  - name: restore-check
                    image: "{{ image }}"
                    command:
                      - "/bin/sh"
                    args:
                      - "-ec"
                      - "while [ ! -d '/var/run/etcd/default.etcd' ]; do sleep 1; done &&
                        exit 0;"
                    volumeMounts: "{{ etcd_volume_mounts }}"
                restartPolicy: OnFailure
                volumes: "{{ etcd_volumes }}"
          with_sequence: start=0 end={{ cluster_spec[0].spec.size|int - 1 }}
      when: etcd_secure_peer == "absent"
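    # For illustration, on a 3-member cluster named "example" in namespace "default",
    # the restore-datadir container of pod 0 would effectively run (a rendering of the
    # template above; the names are placeholders):
    #   ETCDCTL_API=3 etcdctl snapshot restore /var/run/etcd/latest.backup \
    #     --name example-0 \
    #     --initial-cluster example-0=http://example-0.example.default.svc.cluster.local:2380,example-1=...,example-2=... \
    #     --initial-cluster-token example \
    #     --initial-advertise-peer-urls http://example-0.example.default.svc.cluster.local:2380 \
    #     --data-dir /var/run/etcd/default.etcd
    # The block below is the TLS variant: identical restore pods, but with https://
    # peer URLs when both peer and client TLS are enabled on the EtcdCluster.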
    - block:
        - name: generate initial cluster variable
          set_fact:
            initial_cluster: "{{ initial_cluster + [meta.name + '-' + (item|string) + '=https://' + meta.name + '-' + (item|string) + '.' + meta.name + '.' + meta.namespace + '.svc.cluster.local:2380'] }}"
          with_sequence: start=0 end={{ cluster_spec[0].spec.size|int - 1 }}
        - name: Create Restore Pods
          vars:
            etcd_volumes:
              - name: data
                persistentVolumeClaim:
                  claimName: "data-{{ meta.name }}-{{ item|string }}"
            pod_name: "{{ meta.name }}-{{ item|string }}"
            etcd_peer_url: "https://{{ meta.name }}-{{ item|string }}.{{ meta.name }}.{{ meta.namespace }}.svc.cluster.local:2380"
            init_containers:
              - name: "fetch-backup"
                image: "docker.io/alaypatel07/etcd-ansible-restore"
                command: ["ansible-playbook"]
                args: "{{ fetch_args }}"
                imagePullPolicy: Always
                volumeMounts: "{{ etcd_volume_mounts }}"
              - name: "delete-datadir"
                image: "{{ image }}"
                command:
                  - "/bin/sh"
                args:
                  - "-ec"
                  - "if [ -d '/var/run/etcd/default.etcd' ]; then rm -rf /var/run/etcd/default.etcd; fi && if [ -f '/var/run/etcd/member_id' ]; then rm -f /var/run/etcd/member_id; fi"
                volumeMounts: "{{ etcd_volume_mounts }}"
          k8s:
            state: present
            definition:
              apiVersion: "v1"
              kind: Pod
              metadata:
                name: "{{ pod_name }}"
                namespace: "{{ meta.namespace }}"
                labels:
                  etcd_cluster: "{{ meta.name }}"
                  app: "etcd"
              spec:
                serviceAccountName: "etcd-ansible-operator"
                initContainers: "{{ init_containers }}"
                hostname: "{{ pod_name }}"
                subdomain: "{{ meta.name }}"
                containers:
                  - name: "restore-datadir"
                    image: "{{ image }}"
                    command:
                      - "/bin/sh"
                    args:
                      - "-ec"
                      - "while true; do
                          echo 'Waiting for {{ pod_name }} to come up';
                          ping -W 1 -c 1 {{ pod_name }}.{{ meta.name }}.{{ meta.namespace }}.svc.cluster.local > /dev/null && break;
                          sleep 1s;
                        done &&
                        ETCDCTL_API=3 etcdctl snapshot restore /var/run/etcd/latest.backup
                        --name {{ pod_name }}
                        --initial-cluster {{ initial_cluster | join(',') }}
                        --initial-cluster-token {{ etcd_token }}
                        --initial-advertise-peer-urls {{ etcd_peer_url }}
                        --data-dir /var/run/etcd/default.etcd 2>/dev/termination-log"
                    volumeMounts: "{{ etcd_volume_mounts }}"
                  - name: restore-check
                    image: "{{ image }}"
                    command:
                      - "/bin/sh"
                    args:
                      - "-ec"
                      - "while [ ! -d '/var/run/etcd/default.etcd' ]; do sleep 1; done &&
                        exit 0;"
                    volumeMounts: "{{ etcd_volume_mounts }}"
                restartPolicy: OnFailure
                volumes: "{{ etcd_volumes }}"
          with_sequence: start=0 end={{ cluster_spec[0].spec.size|int - 1 }}
      when: etcd_secure_peer == "present" and etcd_secure_client == "present"
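    # Once every restore pod reports Succeeded, the pods are deleted and the
    # EtcdCluster is unpaused so the etcd operator can rebuild the cluster on top
    # of the restored data directories.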
    - name: Wait until all restore pods have completed
      vars:
        pods: "{{ q('k8s', api_version='v1', kind='Pod', namespace=meta.namespace,
                    label_selector='etcd_cluster=' + meta.name + ',app=etcd') }}"
      set_fact:
        c_pods: "{{ pods }}"
      until: pods|length == cluster_spec[0].spec.size and (pods | map(attribute='status.phase') | select('equalto', 'Succeeded') | list | length == pods|length)
      retries: 30
      delay: 10
    - name: Delete Restore Pods
      vars:
        pod_name: "{{ meta.name }}-{{ item|string }}"
      k8s:
        state: absent
        definition:
          apiVersion: "v1"
          kind: Pod
          metadata:
            name: "{{ pod_name }}"
            namespace: "{{ meta.namespace }}"
      with_sequence: start=0 end={{ cluster_spec[0].spec.size|int - 1 }}
    - name: Unpause EtcdCluster
      k8s:
        state: present
        definition:
          apiVersion: "etcd.database.coreos.com/v1beta2"
          kind: "EtcdCluster"
          metadata:
            name: "{{ meta.name }}"
            namespace: "{{ meta.namespace }}"
          spec:
            pause: false
    - name: Wait for cluster to be available
      k8s_facts:
        kind: EtcdCluster
        api_version: etcd.database.coreos.com/v1beta2
        namespace: "{{ meta.namespace }}"
        name: "{{ meta.name }}"
      register: sts
      until: sts.get("resources", []) and sts.resources[0].get("status", {}).get("phase", "") == "Available"
      retries: 30
      delay: 10
      ignore_errors: yes
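    # Report the final phase on the EtcdRestore CR: Failed if the cluster did not
    # become Available within the retry window, Complete otherwise.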
    - name: Update EtcdRestore CR status to Failed
      k8s_status:
        api_version: "etcd.database.coreos.com/v1beta2"
        kind: "EtcdRestore"
        name: "{{ meta.name }}"
        namespace: "{{ meta.namespace }}"
        status:
          phase: Failed
          reason: "EtcdCluster is not Available"
      when: sts.get("resources", []) and sts.resources[0].get("status", {}).get("phase", "") != "Available"
    - name: Update EtcdRestore CR status to Complete
      k8s_status:
        api_version: "etcd.database.coreos.com/v1beta2"
        kind: "EtcdRestore"
        name: "{{ meta.name }}"
        namespace: "{{ meta.namespace }}"
        status:
          phase: Complete
          reason: "EtcdCluster restored successfully"
      when: sts.get("resources", []) and sts.resources[0].get("status", {}).get("phase", "") == "Available"