Contents

AWS Spot interruption stats for CI/CD

Contents

Amazon EC2 provides access to spare EC2 compute capacity in the AWS Cloud through Spot Instances at savings of up to 90% compared to On-Demand prices. The only difference between On-Demand Instances and Spot Instances is that Spot Instances can be interrupted by Amazon EC2, with two minutes of notice, if Amazon EC2 needs to reclaim the capacity.

Spot interruption frequency stats

The services below show the average interruption frequency and the savings over On-Demand rates during the last 30 days for various instance pools. While the average frequency of interruption across all Regions and instance types has historically been <5%, the actual interruption rate for your workloads will depend on point-in-time available capacity.

Show rates in build log

Sometimes it is convenient to show the EC2 Spot interruption rate, along with other runner specs, in the GitLab build log, as shown below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
============ RUNNER SPECS ========================
INSTANCE TYPE:      m6a.4xlarge
INSTANCE LIFECYCLE: spot
SPOT INTERRUPTION:  10-15%
INSTANCE NAME:      ip-12-34-56-78.us-west-2.compute.internal
ZONE:               us-west-2c
CPU model:          AMD EPYC 7R13 Processor
CPU CORES:          16
MEMORY TOTAL:       61.4605 Gb
UPTIME:             11:53:16 up 2 min,  0 users,  load average: 4.60, 1.56, 0.58
OS:                 Linux 5.10.192-183.736.amzn2.x86_64 x86_64
INSTANCE TAGS:      Unknown
---- Volumes ---- 
ami
root
---- Partitions ---- 
Filesystem      Size  Used Avail Use% Mounted on
overlay         150G   31G  120G  21% /
tmpfs            64M     0   64M   0% /dev
tmpfs            31G     0   31G   0% /sys/fs/cgroup
/dev/nvme0n1p1  150G   31G  120G  21% /builds
shm              64M     0   64M   0% /dev/shm
tmpfs            59G   12K   59G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs            31G     0   31G   0% /proc/acpi
tmpfs            31G     0   31G   0% /sys/firmware
==========================================================

Grafana Dashboard: <LINK_TO_DASHBOARD>

============ END OF SPECS ========================

This can be achieved with a reusable before_script snippet such as the one below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Reusable GitLab CI script fragment: prints a "RUNNER SPECS" banner (EC2
# metadata, spot interruption range, CPU / memory / disk details and a Grafana
# link) into the job log. Every probe is best effort and falls back to
# "Unknown" (or to a local command) when its source is unavailable.
.display_node_specs: &display_node_specs
  - |
    # The whole report lives in one { ...; } group whose last command always
    # succeeds, so the "fails" branch at the bottom is only a safety net.
    # NOTE(review): chaining the group with && / || presumably also keeps a
    # failing probe from aborting the job when the shell runs with errexit --
    # confirm against the runner's shell settings.
    {
      # Fetch one value from the EC2 instance metadata service.
      #   $1 - path relative to /latest/meta-data/
      # Prints the value; returns non-zero when the endpoint is unreachable or
      # answers with an HTTP error (-f), so callers can chain a fallback.
      _imds_get() {
        curl -H "X-aws-ec2-metadata-token: $TOKEN" --connect-timeout 10 -sf "http://169.254.169.254/latest/meta-data/$1"
      }

      # Workaround if uptime binary is not installed
      # Default to 0 when /proc/uptime is missing so the arithmetic below stays valid.
      uptime_sec="$(awk '{print int($1)}' /proc/uptime 2> /dev/null)"
      uptime_sec="${uptime_sec:-0}"
      # -u renders the duration in UTC; without it the local timezone offset
      # would be added to the hours. Days are computed separately because
      # %H wraps every 24 hours.
      uptime_linux=$(date -u -d "@${uptime_sec}" "+$((uptime_sec / 86400)) days and %H hours %M minutes %S seconds" 2> /dev/null || echo "Unknown")

      # AWS metadata
      # Session token request: bounded by --connect-timeout so a host without
      # the metadata endpoint does not stall the job, and -f so an HTTP error
      # page is never used as the token.
      TOKEN=$(curl -sf --connect-timeout 10 -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
      INSTANCE_NAME=$(_imds_get hostname || hostname -f)
      AWS_INSTANCE_TYPE=$(_imds_get instance-type || sysctl -n machdep.cpu.brand_string 2> /dev/null || echo "Unknown")
      AWS_REGION=$(_imds_get placement/region || echo "Unknown")
      AWS_INSTANCE_LIFECYCLE=$(_imds_get instance-life-cycle || echo "Unknown")

      # Spot instances interruption percentage
      if [[ "${AWS_REGION}" != "Unknown" && "${AWS_INSTANCE_LIFECYCLE}" == "spot" ]]; then
        AWS_INSTANCE_OS="$(uname)"
        # Private temp file instead of a fixed, shared /tmp name (falls back to
        # the fixed path only when mktemp itself is unavailable).
        spot_data="$(mktemp 2> /dev/null || echo "/tmp/data.json")"
        # Get spot-advisor-data; only parse it when the download succeeded so a
        # stale or partial file is never reported.
        if curl --connect-timeout 10 -sf https://spot-bid-advisor.s3.amazonaws.com/spot-advisor-data.json -o "${spot_data}"; then
          # Get interruption percentage for specific instance:
          # spot_advisor[region][os][type]["r"] is matched against the "index"
          # of an entry in "ranges" and that entry's "label" is printed.
          # Prints an empty line when there is no match so the ":-Unknown"
          # default below applies instead of the literal string "None".
          # The leading backslash-newline keeps Python's first logical line at
          # column 0 regardless of the YAML/shell indentation.
          AWS_SPOT_INTERRUPTION=$(python3 -c '\
          import sys, json; d=json.load(open(sys.argv[1], "r")); \
          r=d.get("spot_advisor", {}).get(sys.argv[2], {}).get(sys.argv[3], {}).get(sys.argv[4], {}).get("r", None); \
          p=next((item["label"] for item in d.get("ranges", []) if item["index"]==r), None); \
          print(p if p is not None else "")' "${spot_data}" "${AWS_REGION}" "${AWS_INSTANCE_OS}" "${AWS_INSTANCE_TYPE}" 2> /dev/null)
        fi
        rm -f -- "${spot_data}"
      fi

      # Collect the remaining values up front; each one has its own fallback
      # chain that does not depend on the shell's pipefail setting.
      aws_zone=$(_imds_get placement/availability-zone || echo "Unknown")
      # First "model name" entry of /proc/cpuinfo; awk exits non-zero when the
      # file is missing or has no such entry, which triggers the sysctl fallback.
      cpu_model=$(awk '/model name/ {sub(/.*: /, ""); print; found = 1; exit} END {exit !found}' /proc/cpuinfo 2> /dev/null \
        || sysctl -n machdep.cpu.brand_string 2> /dev/null || echo "Unknown")
      cpu_cores=$(sysctl -n hw.ncpu 2> /dev/null || nproc 2> /dev/null || echo "Unknown")
      # hw.memsize is divided by 2^30, MemTotal from /proc/meminfo by 2^20.
      if mem_bytes=$(sysctl -n hw.memsize 2> /dev/null); then
        memory_total=$(awk -v b="${mem_bytes}" 'BEGIN {print b/2^30, "Gb"}')
      else
        memory_total=$(awk '/MemTotal/ {print $2/2^20, "Gb"; found = 1} END {exit !found}' /proc/meminfo 2> /dev/null || echo "Unknown")
      fi
      uptime_text=$(uptime 2> /dev/null || echo "${uptime_linux}")
      ec2_tags=$(_imds_get tags/instance || echo "Unknown")
      volumes=$(_imds_get block-device-mapping || echo "Unknown")

      echo ""
      echo ""
      echo "============ RUNNER SPECS ========================"
      echo "INSTANCE TYPE:      ${AWS_INSTANCE_TYPE}"
      echo "INSTANCE LIFECYCLE: ${AWS_INSTANCE_LIFECYCLE}"
      echo "SPOT INTERRUPTION:  ${AWS_SPOT_INTERRUPTION:-Unknown}"
      echo "INSTANCE NAME:      ${INSTANCE_NAME}"
      echo "AZ:                  ${aws_zone}"
      echo "CPU model:          ${cpu_model}"
      echo "CPU CORES:          ${cpu_cores}"
      echo "MEMORY TOTAL:       ${memory_total}"
      echo "UPTIME:            ${uptime_text}"
      echo "OS:                 $(uname -mrs)"
      echo "EC2 TAGS    :      ${ec2_tags}"
      echo -e "---- Volumes ---- \n${volumes}"
      echo -e "---- Partitions ---- \n$(df -h)"
      echo "=========================================================="
      echo ""
      # Epoch in milliseconds; the link covers 10 minutes before to 3 hours after now.
      START_DATE=$(date +%s000)
      echo "Node Stats: https://grafana.mycompany.com/<DASHBOARD>?orgId=1&var-DataSource=<DATASOURCE>&var-server=${INSTANCE_NAME}&var-job=${HOSTNAME}&from=$((${START_DATE}-600000))&to=$((${START_DATE}+10800000))&refresh=5m" || echo "Unknown"
      echo ""
    } && echo "============ END OF SPECS ========================" || echo "fails"

Afterwards, it can be attached to all jobs:

1
2
3
4
# Run for all jobs: the `default` section sets a before_script that expands
# the display_node_specs YAML anchor through an alias, so the anchor must be
# defined in the same YAML document.
# NOTE(review): a job that declares its own before_script presumably replaces
# this default -- confirm against the GitLab CI reference.
default:
  before_script:
    - *display_node_specs