Compare commits

...

6 Commits

Author SHA1 Message Date
defelmnq
66aa7bf80a add validation rules to monitoring variables 2024-12-11 00:06:06 +01:00
defelmnq
1a101ecc10 improve versions set 2024-12-10 15:42:02 +01:00
defelmnq
9c049bd555 improve desription and default values 2024-12-10 15:39:53 +01:00
defelmnq
84bf63705f Work on monitoring module 2024-12-10 15:21:20 +01:00
defelmnq
e4af2a9887 Work on monitoring module 2024-12-10 15:17:20 +01:00
Muhammad Atif Ali
cbd06b1135 Improve incident management in Instatus check script (#346) 2024-12-05 14:24:45 +05:00
3 changed files with 188 additions and 8 deletions

View File

@@ -48,7 +48,7 @@ update_component_status() {
# Function to create an incident
create_incident() {
local incident_name="Testing Instatus"
local incident_name="Degraded Service"
local message="The following modules are experiencing issues:\n"
for i in "${!failures[@]}"; do
message+="$((i + 1)). ${failures[$i]}\n"
@@ -59,7 +59,7 @@ create_incident() {
component_status="MAJOROUTAGE"
fi
# see https://instatus.com/help/api/incidents
response=$(curl -s -X POST "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
incident_id=$(curl -s -X POST "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
-H "Authorization: Bearer $INSTATUS_API_KEY" \
-H "Content-Type: application/json" \
-d "{
@@ -74,10 +74,25 @@ create_incident() {
\"status\": \"PARTIALOUTAGE\"
}
]
}")
}" | jq -r '.id')
incident_id=$(echo "$response" | jq -r '.id')
echo "$incident_id"
echo "Created incident with ID: $incident_id"
}
# Function to check for existing unresolved incidents
# Globals:   INSTATUS_PAGE_ID, INSTATUS_API_KEY (read)
# Outputs:   one status line on stdout; a warning on stderr if the lookup fails
# Returns:   0 if an incident whose status is not "RESOLVED" exists, 1 otherwise
check_existing_incident() {
  # Declared apart from the assignments: `local var=$(cmd)` reports the exit
  # status of `local` itself and would hide a failing curl/jq.
  local response
  local unresolved_incidents=""

  # Fetch the incidents, then keep the IDs of those with status not equal to "RESOLVED".
  # curl and jq are checked separately because, without pipefail, a pipeline
  # only reports the status of its last stage.
  if response=$(curl -s -X GET "https://api.instatus.com/v1/$INSTATUS_PAGE_ID/incidents" \
    -H "Authorization: Bearer $INSTATUS_API_KEY" \
    -H "Content-Type: application/json"); then
    unresolved_incidents=$(printf '%s' "$response" \
      | jq -r '.incidents[] | select(.status != "RESOLVED") | .id') \
      || echo "Warning: could not parse the Instatus incidents response." >&2
  else
    echo "Warning: could not fetch incidents from Instatus." >&2
  fi

  if [[ -n "$unresolved_incidents" ]]; then
    echo "Unresolved incidents found: $unresolved_incidents"
    return 0 # Indicate that there are unresolved incidents
  fi

  # A failed lookup also lands here, so callers keep the previous best-effort
  # behaviour of treating "unknown" the same as "none".
  echo "No unresolved incidents found."
  return 1 # Indicate that no unresolved incidents exist
}
force_redeploy_registry () {
@@ -174,9 +189,10 @@ else
update_component_status "PARTIALOUTAGE"
fi
# Create a new incident
incident_id=$(create_incident)
echo "Created incident with ID: $incident_id"
# Check if there is an existing incident before creating a new one
if ! check_existing_incident; then
create_incident
fi
# If a module is down, force a redeployment to try getting things back online
# ASAP

72
monitoring/README.md Normal file
View File

@@ -0,0 +1,72 @@
---
display_name: Monitoring
description: Monitoring of workspace resources
maintainer_github: coder
verified: true
tags: [monitoring]
---
# Monitoring
This module adds monitoring of workspace resources.
```tf
module "monitoring" {
source = "registry.coder.com/modules/monitoring/coder"
version = "1.0.0"
agent_id = coder_agent.dev.id
}
```
## Examples
```tf
module "monitoring" {
source = "registry.coder.com/modules/monitoring/coder"
version = "1.0.0"
agent_id = coder_agent.dev.id
}
```
### Enable/Disable
You can turn monitoring as a whole, memory monitoring, or disk monitoring on or off by setting the `enabled`, `memory_enabled`, and `disk_enabled` variables respectively (each defaults to `true`).
```tf
module "monitoring" {
source = "registry.coder.com/modules/monitoring/coder"
version = "1.0.0"
agent_id = coder_agent.dev.id
enabled = false
memory_enabled = true
disk_enabled = false
}
```
### Customize Thresholds
You can customize the thresholds, expressed as percentages between 0 and 100, by setting the `threshold`, `memory_threshold`, and `disk_threshold` variables (each defaults to `90`).
```tf
module "monitoring" {
source = "registry.coder.com/modules/monitoring/coder"
version = "1.0.0"
agent_id = coder_agent.dev.id
threshold = 90
memory_threshold = 95
disk_threshold = 90
}
```
### Customize Disks
You can choose which disks are monitored by setting the `disks` variable to a list of paths such as `/` or `/home` (defaults to `["/"]`).
```tf
module "monitoring" {
source = "registry.coder.com/modules/monitoring/coder"
version = "1.0.0"
agent_id = coder_agent.dev.id
disks = ["/"]
}
```

92
monitoring/main.tf Normal file
View File

@@ -0,0 +1,92 @@
# Module requirements: minimum Terraform CLI version and the Coder provider
# (coder/coder) with its minimum version.
terraform {
required_version = ">= 1.0.25"
required_providers {
coder = {
source = "coder/coder"
version = ">= 2.0.2"
}
}
}
# General monitoring threshold, as a percentage (default 90).
# NOTE(review): the description says the *_threshold variables override this
# value, but this file passes all three through unchanged — presumably the
# override is applied by the data source; confirm.
variable "threshold" {
type = number
description = "The threshold for the monitoring, used for all resources unless overridden by *_threshold - expressed as a percentage."
default = 90
# Only values in the inclusive range 0-100 are accepted.
validation {
condition = var.threshold >= 0 && var.threshold <= 100
error_message = "The threshold must be between 0 and 100."
}
}
# Threshold for memory monitoring, as a percentage (default 90).
variable "memory_threshold" {
type = number
description = "The threshold for the memory monitoring - expressed as a percentage."
default = 90
# Only values in the inclusive range 0-100 are accepted.
validation {
condition = var.memory_threshold >= 0 && var.memory_threshold <= 100
error_message = "The memory_threshold must be between 0 and 100."
}
}
# Threshold for disk monitoring, as a percentage (default 90).
variable "disk_threshold" {
type = number
description = "The threshold for the disk monitoring - expressed as a percentage."
default = 90
# Only values in the inclusive range 0-100 are accepted.
validation {
condition = var.disk_threshold >= 0 && var.disk_threshold <= 100
error_message = "The disk_threshold must be between 0 and 100."
}
}
# List of disks to monitor, given as path strings; defaults to the root path only.
# No validation is applied to the entries.
variable "disks" {
type = list(string)
description = "The disks to monitor. e.g. ['/', '/home']"
default = ["/"]
}
# Switch for the monitoring as a whole (default true).
# NOTE(review): how this interacts with memory_enabled / disk_enabled is not
# visible in this file — all three are passed through to the data source.
variable "enabled" {
type = bool
description = "Whether the monitoring is enabled."
default = true
# NOTE(review): `type = bool` already restricts the value to a boolean, so this
# check looks redundant; it may only matter for a null value — confirm intent.
validation {
condition = var.enabled == true || var.enabled == false
error_message = "The enabled must be true or false."
}
}
# Switch for memory monitoring (default true).
variable "memory_enabled" {
type = bool
description = "Whether the memory monitoring is enabled."
default = true
# NOTE(review): `type = bool` already restricts the value to a boolean, so this
# check looks redundant; it may only matter for a null value — confirm intent.
validation {
condition = var.memory_enabled == true || var.memory_enabled == false
error_message = "The memory_enabled must be true or false."
}
}
# Switch for disk monitoring (default true).
variable "disk_enabled" {
type = bool
description = "Whether the disk monitoring is enabled."
default = true
# NOTE(review): `type = bool` already restricts the value to a boolean, so this
# check looks redundant; it may only matter for a null value — confirm intent.
validation {
condition = var.disk_enabled == true || var.disk_enabled == false
error_message = "The disk_enabled must be true or false."
}
}
# ID of the agent to monitor. Required: no default is declared.
variable "agent_id" {
type = string
description = "The ID of the agent to monitor."
}
# Passes every module variable, unchanged, to the `coder_monitoring` data source.
# NOTE(review): the data source's schema is not visible here — confirm that the
# required Coder provider version exposes `coder_monitoring` with these arguments.
data "coder_monitoring" "monitoring" {
threshold = var.threshold
memory_threshold = var.memory_threshold
disk_threshold = var.disk_threshold
disks = var.disks
enabled = var.enabled
memory_enabled = var.memory_enabled
disk_enabled = var.disk_enabled
agent_id = var.agent_id
}