Update NS1 CNAME record by service status / 09 Nov 2018 / Author: Haim Ari

    Estimated read time: 10 minutes

    This is a simple Docker container built with Python, used to fail over between XtraDB nodes by updating an NS1 CNAME record.

    On the XtraDB nodes

    /etc/xinetd.d/mysqlchk
    # default: on 
    # description: mysqlchk 
    service mysqlchk 
    { 
    # this is a config for xinetd, place it in /etc/xinetd.d/
            disable = no
            flags           = REUSE 
            socket_type     = stream 
            type            = UNLISTED
            port            = 9200 
            wait            = no
            user            = nobody 
            server          = /usr/bin/clustercheck
            log_on_failure  += USERID 
            only_from       = 0.0.0.0/0
            #
            # Passing arguments to clustercheck
            # <user> <pass> <available_when_donor=0|1> <log_file> <available_when_readonly=0|1> <defaults_extra_file>
            # Recommended: server_args   = user pass 1 /var/log/log-file 0 /etc/my.cnf.local
            # Compatibility: server_args = user pass 1 /var/log/log-file 1 /etc/my.cnf.local
            # 55-to-56 upgrade: server_args = user pass 1 /var/log/log-file 0 /etc/my.cnf.extra
            #
            # For security, it is recommended to restrict "only_from"
            # above to the IPs that actually need to connect
            per_source      = UNLIMITED 
    }
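
    To sanity-check the health endpoint from another host, a single HTTP probe of port 9200 should be enough. A minimal sketch (Python 2, to match the failover script below; the node address is a placeholder):

    import urllib2

    # Hypothetical node address; replace with a real XtraDB host.
    url = "http://i-xtradb1.your-other-domain.com:9200"
    try:
        response = urllib2.urlopen(url, timeout=5)
        # clustercheck answers 200 when the node is synced.
        print("HTTP %s: %s" % (response.getcode(), response.read().strip()))
        response.close()
    except urllib2.HTTPError as e:
        # clustercheck answers 503 when the node is not synced.
        print("HTTP %s: node not synced" % e.code)
    except urllib2.URLError as e:
        print("connection failed: %s" % e)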
    
    
    clustercheck

    On each XtraDB node there is a clustercheck script which responds to HTTP requests on port 9200. It returns “200 OK” if the node is in sync. The failover script checks this status every 30 seconds, and if a node is out of sync it updates the NS1 records accordingly. The same applies to connection errors. If the checks fail on both nodes, the script will not update the DNS records.

    /usr/bin/clustercheck
    
    #!/bin/bash
    #
    # Script to make a proxy (ie HAProxy) capable of monitoring Percona XtraDB Cluster nodes properly
    #
    # Author: Olaf van Zandwijk 
    # Documentation and download: https://github.com/olafz/percona-clustercheck
    #
    # Based on the original script from Unai Rodriguez
    # Modified by Brad Baker 5/7/2013
    #
    # This cluster check script is provided by the percona packages under
    # /usr/bin/clustercheck. I've made a copy of it to /our-custom-location because I had
    # to customize it to get it to work reliably  and I don't want YUM overwriting
    # our customized version.
    #
    # For some reason the percona provided version of this script will
    # intermittently fail when accessed remotely using curl or our load balancer
    # health check. To test this for yourself remotely run the following command
    # for i in {1..1000}; do curl http://your-server:9200; sleep 2; date;  done
    #
    # After extensive debugging one of the Percona devs had me add sleep statements.  
    # After doing so the intermittent issue stopped - WHY?! I have no idea. 
    # But with those in place it works reliably. 
    if [[ $1 == '-h' || $1 == '--help' ]];then
        echo "Usage: $0    "
        exit
    fi
    MYSQL_USERNAME="${1:-clustercheckuser}"
    MYSQL_PASSWORD="${2:-clustercheckpassword!}"
    AVAILABLE_WHEN_DONOR=${3:-0}
    ERR_FILE="${4:-/dev/null}"
    #Timeout exists for instances where mysqld may be hung
    TIMEOUT=10
    #
    # Perform the query to check the wsrep_local_state
    #
    WSREP_STATUS=`mysql -nNE --connect-timeout=$TIMEOUT --user=${MYSQL_USERNAME} --password=${MYSQL_PASSWORD} \
    -e "SHOW STATUS LIKE 'wsrep_local_state';" 2>${ERR_FILE} | tail -1 2>>${ERR_FILE}`
    if [[ "${WSREP_STATUS}" == "4" ]] || [[ "${WSREP_STATUS}" == "2" && ${AVAILABLE_WHEN_DONOR} == 1 ]]
    then
        # Percona XtraDB Cluster node local state is 'Synced' => return HTTP 200
        # Shell return-code is 0
        echo -en "HTTP/1.1 200 OK\r\n"
        sleep 0.1
        echo -en "Content-Type: text/plain\r\n"
        sleep 0.2
        echo -en "Connection: close\r\n"
        sleep 0.3
        echo -en "Content-Length: 40\r\n"
        sleep 0.2
        echo -en "\r\n"
        sleep 0.1
        echo -en "Percona XtraDB Cluster Node is synced.\r\n"
        sleep 0.1
        exit 0
    else
        # Percona XtraDB Cluster node local state is not 'Synced' => return HTTP 503
        # Shell return-code is 1
        echo -en "HTTP/1.1 503 Service Unavailable\r\n"
        sleep 0.1
        echo -en "Content-Type: text/plain\r\n"
        sleep 0.2
        echo -en "Connection: close\r\n"
        sleep 0.3
        echo -en "Content-Length: 44\r\n"
        sleep 0.2
        echo -en "\r\n"
        sleep 0.1
        echo -en "Percona XtraDB Cluster Node is not synced.\r\n"
        exit 1
    fi
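
    The curl loop mentioned in the comments above can also be reproduced in Python, to check for the intermittent failures described there. A sketch, assuming the same placeholder hostname:

    import time
    import urllib2

    # Poll the health check repeatedly, like the curl loop in the comments.
    url = "http://your-server:9200"
    for i in range(1000):
        try:
            response = urllib2.urlopen(url, timeout=5)
            print("%d: HTTP %s" % (i, response.getcode()))
            response.close()
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print("%d: failed: %s" % (i, e))
        time.sleep(2)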
    
    

    On Docker

    NS1
    • xtradb.yourdomain.com
    • i-xtradb.yourdomain.com

    The above records are managed by the script and point by CNAME to the active node.

    Dyn
    • xtradb.your-other-domain.com
    • i-xtradb.your-other-domain.com

    The above are CNAMEs pointing to the NS1 records managed by the script.
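
    To verify which node a record currently points at, the CNAME can be resolved directly. A small sketch using dnspython (an assumption for illustration only; it is not part of this project's requirements.txt):

    import dns.resolver  # pip install dnspython (illustrative)

    # Print the CNAME target of a failover-managed record.
    for rr in dns.resolver.query("i-xtradb.yourdomain.com", "CNAME"):
        print(rr.target)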

    Dockerfile
    FROM python:2
    
    RUN mkdir /usr/src/app
    WORKDIR /usr/src/app
    
    COPY requirements.txt ./
    
    RUN pip install --no-cache-dir -r requirements.txt
    
    COPY xtradb-failover.py  /usr/src/app/xtradb-failover.py
    
    CMD ["python", "/usr/src/app/xtradb-failover.py"]
    
    
    requirements.txt
    ns1-python
    pymsteams
    
    xtradb-failover.py
    import urllib2
    import socket
    import os
    from ns1 import NS1
    import time
    import logging
    import sys
    import pymsteams
    
    
    __author__ = "Haim Ari"
    __license__ = "GPL"
    __version__ = "0.0.1"
    
    logger = logging.getLogger('xtradb-failover-check')
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.__stdout__)
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(lineno)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    
    teams_webhook_url = os.getenv("teams_url")
    
    zone = 'yourdomain.com'
    ttl = '360'
    cluster = ["i-xtradb1.your-other-domain.com", "i-xtradb2.your-other-domain.com"]
    cnames = {"i-xtradb1.your-other-domain.com": "xtradb-1.your-other-domain.com",
              "i-xtradb2.your-other-domain.com": "xtradb-2.your-other-domain.com"}
    
    port = "9200"
    synced_node = []
    not_synced_node = []
    ns1_key = os.getenv("NS1_KEY")
    sleep = 600
    records = ("xtradb", "i-xtradb")
    retry = 30
    
    
    def send_teams_notification(msg, action):
        logger.info("Sending Teams notification")
        myTeamsMessage = pymsteams.connectorcard(teams_webhook_url)
        myTeamsMessage.title("xtradb-checker: " + str(action))
        myTeamsMessage.text(msg)
        myTeamsMessage.send()
    
    
    def update_zone_records(internal_node, external_node):
        """Update the CNAME records of the NS1 zone.

        The NS1 API key should be set as an environment variable.
        """
        # Todo: don't run if the synced node is already updated.
        api = NS1(apiKey=ns1_key)
        current_zone = api.loadZone(zone)

        # Delete the current node records from DNS.
        for name in records:
            rec = current_zone.loadRecord(str(name), 'CNAME')
            logger.info("Current records: " + str(rec))
            logger.warning("Removing records")
            rec.delete()
        logger.info("Done")
    
        # Create CNAMEs pointing to the active node.
        logger.warning("Creating updated CNAME records")
        logger.warning("creating: " + records[0] + " -> CNAME -> " + external_node)
        current_zone.add_CNAME(records[0], external_node)
        logger.warning("creating: " + records[1] + " -> CNAME -> " + internal_node)
        current_zone.add_CNAME(records[1], internal_node)
        logger.info("Done")
        logger.info("Sleeping for " + str(sleep) + " seconds")
        send_teams_notification(msg="Updated DNS record with CNAMES: " + external_node + " & " + internal_node,
                                action="DNS update")
        time.sleep(sleep)
    
    
    def cluster_check():
        count = 0
        for node in cluster:
            url = ("http://" + node + ":" + port)
            try:
                connection = urllib2.urlopen(url)
                logger.info("node: " + str(node) + " response: " + str(connection.getcode()))
                connection.close()
            except (urllib2.HTTPError, urllib2.URLError, socket.error) as e:
                try:
                    logger.error("node: " + str(node) + " response: " + str(e))
                    logger.info("Will retry in " + str(retry) + " seconds")
                    time.sleep(retry)
                    connection = urllib2.urlopen(url)
                    logger.info("node: " + str(node) + " response: " + str(connection.getcode()))
                    connection.close()
                    # The node recovered on the retry, so count it as synced.
                    synced_node.append(str(node))
                except (urllib2.HTTPError, urllib2.URLError, socket.error) as e:
                    not_synced_node.append(str(node))
                    logger.error("node: " + str(node) + " response: " + str(e))
                    count += 1
                    send_teams_notification(msg=("node: " + str(node) + " response: " + str(e)), action="Check Error")
            else:
                synced_node.append(str(node))
        if count == 1:
            logger.warning("preparing to update zone...")
            logger.warning("internal: " + str(synced_node[0]) + " external: " + str(cnames[synced_node[0]]))
            update_zone_records(internal_node=str(synced_node[0]), external_node=str(cnames[synced_node[0]]))
        elif count == 2:
            logger.error("both nodes are unavailable, will not update NS1 record.")
        del synced_node[:]
        del not_synced_node[:]
    
    
    if __name__ == "__main__":
        logger.info("Started.")
        while True:
            cluster_check()
            time.sleep(30)
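
    The decision logic can be exercised without live nodes and without touching NS1 by stubbing out the network calls. A hedged sketch (the file path and the simulated failure are illustrative, and it assumes the packages from requirements.txt are installed):

    import imp
    import urllib2

    # Load the script despite the hyphen in its filename (path is illustrative).
    failover = imp.load_source("failover", "xtradb-failover.py")

    # Stub the side effects so nothing reaches NS1 or Teams.
    failover.send_teams_notification = lambda msg, action: None
    failover.update_zone_records = lambda internal_node, external_node: (
        failover.logger.info("would CNAME " + internal_node + " / " + external_node))

    def fake_urlopen(url, *args, **kwargs):
        # Simulate node 1 down and node 2 healthy.
        if "xtradb1" in url:
            raise urllib2.URLError("connection refused")

        class FakeResponse(object):
            def getcode(self):
                return 200

            def close(self):
                pass

        return FakeResponse()

    failover.urllib2.urlopen = fake_urlopen
    failover.retry = 1          # shorten the retry wait for the test
    failover.cluster_check()    # logs the CNAME update it would perform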
    
    
    The following environment variables should be exported during CI.
    Export them manually if you are not using CI, or omit them if not needed.
    • update your domain(s)
    • update your Microsoft Teams URL
    
    export teams_url="Your Teams URL Here" 
    
    • update your Slack token
    export slack_token="Your Slack Token Here" 
    
    • In your CI, export your NS1 API key as:
    export NS1_KEY="Your NS1 API Key"
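
    Since the script only reads NS1_KEY at startup, it can be worth failing fast when the variable is missing instead of discovering it on the first failover. A small optional sketch (not part of the script above):

    import os
    import sys

    # Abort early if the NS1 key was not exported.
    if not os.getenv("NS1_KEY"):
        sys.stderr.write("NS1_KEY is not set; refusing to start.\n")
        sys.exit(1)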
    
    Build the Docker image using GitLab and deploy it to Kubernetes.
    .gitlab-ci.yml
    image: docker:git
    services:
      - docker:dind
    
    stages:
      - build
      - deploy
    
    variables:
      IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
      IMAGE_NAME: $CI_REGISTRY_IMAGE
      DOCKER_DRIVER: overlay2
    
    
    build:
      stage: build
      script:
        - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
        - docker build -t $IMAGE_TAG .
        - docker tag $IMAGE_TAG $IMAGE_NAME
        - docker push $IMAGE_TAG
        - docker push $IMAGE_NAME
      tags:
        - generic
    
    
    deploy:
      image: lachlanevenson/k8s-kubectl:latest
      stage: deploy
      environment:
        name: monitoring
      script:
        - kubectl version
        - cd manifests/
        - sed -i "s/__APP__/${CI_PROJECT_NAME}/" deployment.yml
        - sed -i "s/__VERSION__/${CI_COMMIT_SHA}/" deployment.yml
        - kubectl apply -f deployment.yml
        # ToDo migrate these secrets to "kubernetes config/secrets"
        - kubectl set env Deployment/xtradb-failover
                      NS1_KEY=${NS1_KEY}
                      slack_token=${slack_token}
                      teams_url=${teams_url} -n monitoring
    
        - kubectl rollout status -f deployment.yml
        - kubectl get pods,svc -n monitoring -o wide
      tags:
        - kubernetes
    
    
    manifests/deployment.yml
    apiVersion: extensions/v1beta1
    kind: Deployment
    metadata:
      name: __APP__
      namespace: monitoring
    spec:
      template:
        metadata:
          labels:
            app: __APP__
        spec:
          containers:
            - name: __APP__
              image: registry.your-domain.com/monitoring/__APP__:__VERSION__
              resources:
                requests:
                  cpu: "0.5"
                  memory: 50Mi
                limits:
                  cpu: "1"
                  memory: 100Mi
          dnsPolicy: "None"
          dnsConfig:
            nameservers:
              - 8.8.8.8
            searches:
              - your-domain.com
          imagePullSecrets:
            - name: reg
    
    
