You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
modules/.github/scripts/check.sh

190 lines
6.1 KiB
Bash

#!/usr/bin/env bash
set -o pipefail
set -u
VERBOSE="${VERBOSE:-0}"
if [[ "${VERBOSE}" -ne "0" ]]; then
set -x
fi
# List of required environment variables
required_vars=(
"INSTATUS_API_KEY"
"INSTATUS_PAGE_ID"
"INSTATUS_COMPONENT_ID"
"VERCEL_API_KEY"
)
# Check if each required variable is set
for var in "${required_vars[@]}"; do
if [[ -z "${!var:-}" ]]; then
echo "Error: Environment variable '$var' is not set."
exit 1
fi
done
LATEST_REDEPLOY_FAILED="${LATEST_REDEPLOY_FAILED:-0}"
if [[ "${LATEST_REDEPLOY_FAILED}" -ne "0" ]] ;then
echo "Trying to re-run job when previous re-deploy failed"
return 1
fi
REGISTRY_BASE_URL="${REGISTRY_BASE_URL:-https://registry.coder.com}"
status=0
declare -a modules=()
declare -a failures=()
# Collect all module directories containing a main.tf file
for path in $(find . -maxdepth 2 -not -path '*/.*' -type f -name main.tf | cut -d '/' -f 2 | sort -u); do
modules+=("${path}")
done
echo "Checking modules: ${modules[*]}"
# Function to update the component status on Instatus
update_component_status() {
local component_status=$1
# see https://instatus.com/help/api/components
(curl -X PUT "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/components/$INSTATUS_COMPONENT_ID" \
-H "Authorization: Bearer $INSTATUS_API_KEY" \
-H "Content-Type: application/json" \
-d "{\"status\": \"$component_status\"}")
}
# Function to create an incident
create_incident() {
local incident_name="Testing Instatus"
local message="The following modules are experiencing issues:\n"
for i in "${!failures[@]}"; do
message+="$((i + 1)). ${failures[$i]}\n"
done
component_status="PARTIALOUTAGE"
if (( ${#failures[@]} == ${#modules[@]} )); then
component_status="MAJOROUTAGE"
fi
# see https://instatus.com/help/api/incidents
response=$(curl -s -X POST "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
-H "Authorization: Bearer $INSTATUS_API_KEY" \
-H "Content-Type: application/json" \
-d "{
\"name\": \"$incident_name\",
\"message\": \"$message\",
\"components\": [\"$INSTATUS_COMPONENT_ID\"],
\"status\": \"INVESTIGATING\",
\"notify\": true,
\"statuses\": [
{
\"id\": \"$INSTATUS_COMPONENT_ID\",
\"status\": \"PARTIALOUTAGE\"
}
]
}")
incident_id=$(echo "$response" | jq -r '.id')
echo "$incident_id"
}
force_redeploy_registry () {
# These are not secret values; safe to just expose directly in script
local VERCEL_TEAM_SLUG="codercom"
local VERCEL_TEAM_ID="team_tGkWfhEGGelkkqUUm9nXq17r"
local VERCEL_APP="registry"
local latest_res
latest_res=$(curl "https://api.vercel.com/v6/deployments?app=$VERCEL_APP&limit=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID" \
--fail \
--silent \
-H "Authorization: Bearer $VERCEL_API_KEY" \
-H "Content-Type: application/json"
)
# If we have zero deployments, something is VERY wrong. Make the whole
# script exit with a non-zero status code
local latest_id
latest_id=$(echo "${latest_res}" | jq '.deployments[0].uid')
if [[ "${latest_id}" = "null" ]]; then
echo "Unable to pull any previous deployments for redeployment"
return 1
fi
local latest_date_ts_seconds
latest_date_ts_seconds=$(echo "${latest_res}" | jq '.deployments[0].createdAt/1000|floor)')
local current_date_ts_seconds
current_date_ts_seconds="$(date +%s)"
local max_redeploy_interval_seconds=7200 # 2 hours
if (( current_date_ts_seconds - latest_date_ts_seconds < max_redeploy_interval_seconds )); then
echo "Last deployment was less than 2 hours ago. Skipping redeployment."
return 1
fi
local redeploy_res
redeploy_res=$(curl -X POST "https://api.vercel.com/v13/deployments?forceNew=1&skipAutoDetectionConfirmation=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID" \
--fail \
--silent \
--output "/dev/null" \
-H "Authorization: Bearer $VERCEL_API_KEY" \
-H "Content-Type: application/json" \
-d "{ \"deploymentId\": \"${latest_id}\" }"
)
echo "${redeploy_res}"
}
# Check each module's accessibility
for module in "${modules[@]}"; do
# Trim leading/trailing whitespace from module name
module=$(echo "${module}" | xargs)
url="${REGISTRY_BASE_URL}/modules/${module}"
printf "=== Checking module %s at %s\n" "${module}" "${url}"
status_code=$(curl --output /dev/null --head --silent --fail --location "${url}" --retry 3 --write-out "%{http_code}")
# shellcheck disable=SC2181
if (( status_code != 200 )); then
printf "==> FAIL(%s)\n" "${status_code}"
status=1
failures+=("${module}")
else
printf "==> OK(%s)\n" "${status_code}"
fi
done
# Determine overall status and update Instatus component
if (( status == 0 )); then
echo "All modules are operational."
# set to
update_component_status "OPERATIONAL"
echo "LATEST_REDEPLOY_FAILED=0" >> "${GITHUB_ENV}"
else
echo "The following modules have issues: ${failures[*]}"
# check if all modules are down
if (( ${#failures[@]} == ${#modules[@]} )); then
update_component_status "MAJOROUTAGE"
else
update_component_status "PARTIALOUTAGE"
fi
# Create a new incident
incident_id=$(create_incident)
echo "Created incident with ID: $incident_id"
# If a module is down, force a reployment to try getting things back online
# ASAP
status_code=$(force_redeploy_registry)
# shellcheck disable=SC2181
if (( status_code == 200 )); then
echo "Reployment successful"
else
echo "Unable to redeploy automatically"
fi
# Update environment variable so that if automatic re-deployment fails, we
# don't keep running the script over and over again. Note that even if a
# re-deployment succeeds, that doesn't necessarily mean that everything is
# fully operational
echo "LATEST_REDEPLOY_FAILED=1" >> "${GITHUB_ENV}"
fi
exit "${status}"