diff --git a/util/diagnostics/README.md b/util/diagnostics/README.md
new file mode 100644
index 0000000000..6e86f0b659
--- /dev/null
+++ b/util/diagnostics/README.md
@@ -0,0 +1,40 @@
+# ParallelCluster Diagnostics
+
+A collection of scripts to diagnose common ParallelCluster issues.
+The diagnostics suite is meant to be executed within the cluster head node.
+
+## Requirements
+
+The following tools must be installed on your **local machine** to deploy the diagnostics suite to your cluster.
+
+| Tool | Purpose |
+|---|---|
+| `pcluster` | AWS ParallelCluster CLI, used to retrieve head node connection info |
+| `ssh` | Used to connect to the head node and install dependencies |
+| `rsync` | Used to upload the diagnostics folder to the head node |
+
+## Available Scripts
+
+| Script | Description |
+|---|---|
+| `diagnose-slurm-accounting.py` | Diagnoses SLURM accounting setup |
+
+## Usage
+
+### 1. Deploy to the head node
+
+Run `deploy.sh` from your local machine. It uploads the diagnostics folder to the head node and installs dependencies.
+
+```bash
+bash deploy.sh --cluster-name <cluster-name> --region <region> --ssh-key <path-to-ssh-key>
+```
+
+At the end it prints the SSH command to log directly into the diagnostics folder on the head node.
+
+### 2. Run a diagnostic script (example)
+
+Once logged into the head node:
+
+```bash
+./diagnose-slurm-accounting.py --help
+```
diff --git a/util/diagnostics/common.py b/util/diagnostics/common.py
new file mode 100755
index 0000000000..2f871b57e2
--- /dev/null
+++ b/util/diagnostics/common.py
@@ -0,0 +1,112 @@
+import json
+import logging
+import re
+
+# nosec B404: The subprocess module is used intentionally to run trusted system commands.
+import subprocess  # nosec B404
+
+import boto3
+import yaml
+from botocore.exceptions import ClientError
+
+CHEF_DNA_JSON_FILE = "/etc/chef/dna.json"
+LOCAL_CLUSTER_CONFIG_FILE = "/opt/parallelcluster/shared/cluster-config.yaml"
+
+
+def setup_logging():
+    """Set up common logging configuration for all diagnosis scripts.
+
+    Returns:
+        The logger named after this module.
+    """
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
+    return logging.getLogger(__name__)
+
+
+def print_success(message):
+    """Print the outcome of a passed check in green, prefixed with a check mark."""
+    print(f"\033[32m[✓] {message}\033[0m")
+
+
+def print_failure(message):
+    """Print the outcome of a failed check in red, prefixed with a cross."""
+    print(f"\033[31m[✗] {message}\033[0m")
+
+
+def print_skipped(message):
+    """Print the outcome of a skipped check in yellow, prefixed with a tilde."""
+    print(f"\033[33m[~] {message}\033[0m")
+
+
+def read_dna_json():
+    """Load the Chef dna.json file and return its parsed content.
+
+    Raises:
+        RuntimeError: if the file cannot be opened or is not valid JSON.
+    """
+    try:
+        with open(CHEF_DNA_JSON_FILE, "r") as f:
+            return json.load(f)
+    except Exception as e:
+        raise RuntimeError(f"Failed to read {CHEF_DNA_JSON_FILE}: {str(e)}") from e
+
+
+def parse_db_uri(uri):
+    """Split a database URI of the form ``host[:port]`` into a ``(host, port)`` tuple.
+
+    The port defaults to 3306 when the URI does not carry one.
+
+    Raises:
+        RuntimeError: if the port is not an integer. RuntimeError is the error type raised
+            by the other helpers of this module, so callers need a single handler.
+    """
+    if ":" not in uri:
+        return uri, 3306
+    endpoint, port_str = uri.split(":", 1)
+    try:
+        return endpoint, int(port_str)
+    except ValueError as e:
+        raise RuntimeError(f"Invalid port '{port_str}' in database URI '{uri}'") from e
+
+
+def get_cluster_config_from_s3(region=None):
+    """Download the cluster configuration from S3 and return it parsed from YAML.
+
+    Bucket, key and object version are read from the "cluster" section of dna.json.
+
+    Args:
+        region: region used to build the S3 client; when falsy, boto3 resolves it on its own.
+
+    Raises:
+        RuntimeError: if dna.json cannot be read, or the object cannot be downloaded or parsed.
+    """
+    try:
+        dna = read_dna_json()
+        bucket = dna["cluster"]["cluster_s3_bucket"]
+        key = dna["cluster"]["cluster_config_s3_key"]
+        version = dna["cluster"]["cluster_config_version"]
+
+        s3 = boto3.client("s3", region_name=region) if region else boto3.client("s3")
+
+        response = s3.get_object(Bucket=bucket, Key=key, VersionId=version)
+        config = yaml.safe_load(response["Body"])
+
+        print_success("Downloaded cluster configuration from S3")
+        return config
+    except RuntimeError:
+        # Already carries a descriptive message (raised by read_dna_json): propagate untouched.
+        raise
+    except Exception as e:
+        raise RuntimeError(f"Failed to get config from S3: {str(e)}") from e
+
+
+def read_yaml(path):
+    """Read a YAML file and return its parsed content.
+
+    Raises:
+        RuntimeError: if the file cannot be opened or parsed.
+    """
+    try:
+        with open(path, "r") as f:
+            return yaml.safe_load(f)
+    except Exception as e:
+        raise RuntimeError(f"Failed to read YAML file {path}: {str(e)}") from e
+
+
+def get_cluster_config_local():
+    """Return the cluster configuration stored locally on the node.
+
+    Raises:
+        RuntimeError: propagated from read_yaml when the file cannot be read or parsed.
+    """
+    return read_yaml(LOCAL_CLUSTER_CONFIG_FILE)
+
+
+def get_slurm_config_value(conf_file, property_name):
+    """Return the value assigned to ``property_name`` in a configuration file read via sudo.
+
+    Only lines that begin (after optional indentation) with ``property_name=`` are considered,
+    so commented-out entries and longer keys that merely end with the same name are ignored.
+
+    Returns:
+        The value with surrounding blanks stripped, or None when the file cannot be read or
+        the property is not set. A failure message is printed in both cases.
+    """
+    try:
+        # A nosec comment is appended to the following line in order to disable the B603 and B607 checks.
+        # The command is constructed from a trusted, hardcoded path (conf_file) and a fixed executable (sudo, cat).
+        result = subprocess.run(
+            ["sudo", "cat", conf_file], capture_output=True, text=True, check=True
+        )  # nosec B603 B607 nosemgrep
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        print_failure(f"Failed to read configuration file {conf_file}: {str(e)}")
+        return None
+
+    # Anchor the key at the start of a line and escape it, so that it is matched literally.
+    pattern = rf"^[ \t]*{re.escape(property_name)}[ \t]*=[ \t]*(.*?)[ \t]*$"
+    match = re.search(pattern, result.stdout, re.MULTILINE)
+    if match:
+        return match.group(1)
+    print_failure(f"{property_name} not found in configuration file")
+    return None
+
+
+def get_secret(secret_arn, region=None):
+    """Return the string value of a secret stored in AWS Secrets Manager.
+
+    Args:
+        secret_arn: identifier of the secret to retrieve.
+        region: region used to build the Secrets Manager client.
+
+    Returns:
+        The secret string, or None (after printing a failure) when the service returns an error.
+    """
+    try:
+        session = boto3.session.Session()
+        client = session.client(service_name="secretsmanager", region_name=region)
+        response = client.get_secret_value(SecretId=secret_arn)
+        return response["SecretString"]
+    except ClientError as e:
+        print_failure(f"Failed to retrieve secret from AWS Secrets Manager: {str(e)}")
+        return None
diff --git a/util/diagnostics/deploy.sh b/util/diagnostics/deploy.sh
new file mode 100644
index 0000000000..375d1bdf4a
--- /dev/null
+++ b/util/diagnostics/deploy.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# Uploads the diagnostics folder to the head node of a ParallelCluster and installs dependencies.
+#
+# Usage:
+#   bash deploy.sh --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-ssh-key>]
+#
+# Example:
+#   bash deploy.sh --cluster-name my-cluster --region us-east-1 --ssh-key ~/.ssh/my-key.pem
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+usage() {
+    echo "Usage: $0 --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-ssh-key>]"
+    echo ""
+    echo "Upload the diagnostics folder to the head node of a ParallelCluster."
+    echo ""
+    echo "Options:"
+    echo "  --cluster-name, -n   Name of the cluster"
+    echo "  --region, -r         AWS region"
+    echo "  --ssh-key, -i        Path to the SSH private key"
+    exit "${1:-1}"
+}
+
+CLUSTER_NAME=""
+REGION=""
+SSH_KEY=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --cluster-name|-n) CLUSTER_NAME="${2:?missing value for $1}"; shift 2 ;;
+        --region|-r)       REGION="${2:?missing value for $1}"; shift 2 ;;
+        --ssh-key|-i)      SSH_KEY="${2:?missing value for $1}"; shift 2 ;;
+        --help|-h)         usage 0 ;;
+        *) echo "[ERROR] Unknown option: $1"; usage ;;
+    esac
+done
+
+if [[ -z "$CLUSTER_NAME" || -z "$REGION" ]]; then
+    usage
+fi
+
+if [[ -n "$SSH_KEY" && ! -f "$SSH_KEY" ]]; then
+    echo "[ERROR] SSH key file not found: $SSH_KEY"
+    exit 1
+fi
+
+echo "[INFO] Retrieving head node connection info for cluster '${CLUSTER_NAME}' in region '${REGION}'..."
+
+# Check pcluster is available
+if ! command -v pcluster &>/dev/null; then
+    echo "[ERROR] 'pcluster' command not found. Please install the AWS ParallelCluster CLI."
+    exit 1
+fi
+
+# Run pcluster ssh dryrun; on failure, surface the CLI error directly
+PCLUSTER_OUTPUT=$(pcluster ssh -n "$CLUSTER_NAME" -r "$REGION" --dryrun true 2>&1) || {
+    echo "[ERROR] pcluster command failed: ${PCLUSTER_OUTPUT}"
+    exit 1
+}
+
+SSH_CMD=$(echo "$PCLUSTER_OUTPUT" | python3 -c "import sys, json; print(json.load(sys.stdin)['command'])" 2>/dev/null) || true
+
+if [[ -z "$SSH_CMD" ]]; then
+    echo "[ERROR] Could not parse pcluster ssh output: '${PCLUSTER_OUTPUT}'"
+    exit 1
+fi
+
+# Extract user and IP from "ssh <user>@<ip>"
+USER_AT_IP=$(echo "$SSH_CMD" | awk '{print $2}')
+DEFAULT_USER="${USER_AT_IP%%@*}"
+HEAD_NODE_IP="${USER_AT_IP##*@}"
+
+if [[ -z "$DEFAULT_USER" || -z "$HEAD_NODE_IP" ]]; then
+    echo "[ERROR] Could not parse user and IP from pcluster ssh output: '${SSH_CMD}'"
+    exit 1
+fi
+
+echo "[INFO] Head node IP: ${HEAD_NODE_IP}"
+echo "[INFO] Default user: ${DEFAULT_USER}"
+echo "[INFO] Uploading ${SCRIPT_DIR} to ${DEFAULT_USER}@${HEAD_NODE_IP}:~/"
+
+REMOTE_DIR="$(basename "$SCRIPT_DIR")"
+
+# Build rsync and ssh args as arrays; the key path is single-quoted inside -e because rsync splits that string itself
+RSYNC_ARGS=(-av --exclude="README.md" --exclude="deploy.sh" --exclude="__pycache__")
+SSH_ARGS=()
+if [[ -n "$SSH_KEY" ]]; then
+    RSYNC_ARGS+=(-e "ssh -i '${SSH_KEY}'")
+    SSH_ARGS+=(-i "${SSH_KEY}")
+fi
+
+rsync "${RSYNC_ARGS[@]}" "$SCRIPT_DIR" "${DEFAULT_USER}@${HEAD_NODE_IP}:~/"
+
+echo "[INFO] Done. Files uploaded to /home/${DEFAULT_USER}/${REMOTE_DIR}/"
+
+echo "[INFO] Installing requirements on head node..."
+
+ssh ${SSH_ARGS[@]+"${SSH_ARGS[@]}"} "${DEFAULT_USER}@${HEAD_NODE_IP}" "pip install -r ~/${REMOTE_DIR}/requirements.txt"
+
+echo "[INFO] Requirements installed successfully."
+echo "[INFO] Next steps: log into the head node and run the diagnostics scripts from ~/${REMOTE_DIR}/"
+SSH_LOGIN_CMD="ssh"
+[[ -n "$SSH_KEY" ]] && SSH_LOGIN_CMD+=" -i ${SSH_KEY}"
+SSH_LOGIN_CMD+=" ${DEFAULT_USER}@${HEAD_NODE_IP} -t 'cd ~/${REMOTE_DIR} && bash -l'"
+echo "[INFO] ${SSH_LOGIN_CMD}"
diff --git a/util/diagnostics/diagnose-slurm-accounting.py b/util/diagnostics/diagnose-slurm-accounting.py
new file mode 100755
index 0000000000..96543abd21
--- /dev/null
+++ b/util/diagnostics/diagnose-slurm-accounting.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+Diagnose SLURM accounting configuration and connectivity.
+
+This script performs comprehensive checks on SLURM accounting setup including database
+connectivity, configuration validation, and MySQL permissions. It's designed to be run
+on the head node and can automatically retrieve cluster configuration if parameters are omitted.
+
+Usage:
+    python3 diagnose-slurm-accounting.py
+    python3 diagnose-slurm-accounting.py \
+        --db-endpoint mydb.cluster-xyz.us-east-1.rds.amazonaws.com \
+        --db-user slurm \
+        --region us-east-1
+"""
+
+import json
+import logging
+import socket
+
+# nosec B404: The subprocess module is used intentionally to run trusted system commands.
+import subprocess  # nosec B404
+
+import click
+import pymysql
+from common import (
+    get_cluster_config_from_s3,
+    get_cluster_config_local,
+    get_secret,
+    get_slurm_config_value,
+    parse_db_uri,
+    print_failure,
+    print_skipped,
+    print_success,
+    read_dna_json,
+    setup_logging,
+)
+
+SLURMDBD_CONF_FILE = "/opt/slurm/etc/slurm_parallelcluster_slurmdbd.conf"
+
+
+def check_db_reachable(host, port):
+    """Report whether a TCP connection to ``host:port`` can be opened within 5 seconds."""
+    try:
+        # The context manager closes the probe socket as soon as the connection is established.
+        with socket.create_connection((host, port), timeout=5):
+            print_success("Database endpoint reachability check")
+    except socket.error as e:
+        print_failure(f"Database endpoint not reachable: {str(e)}")
+
+
+def check_config_db_endpoint(conf_file, db_endpoint):
+    """Report whether StorageHost in ``conf_file`` equals ``db_endpoint``.
+
+    Nothing more is printed when the value cannot be read: get_slurm_config_value
+    has already reported that failure.
+    """
+    conf_host = get_slurm_config_value(conf_file, "StorageHost")
+    if conf_host is None:
+        return
+    if conf_host == db_endpoint:
+        print_success("Database endpoint matches configuration")
+    else:
+        print_failure(f"Database endpoint does not match configuration (expected: {db_endpoint}, found: {conf_host})")
+
+
+def check_secret_format(secret):
+    """Report whether the secret is a plain-text password rather than a JSON document.
+
+    Only JSON objects and arrays are flagged: a plain password such as ``12345678`` or ``true``
+    also parses as a JSON scalar and must not be reported as being in JSON format.
+    """
+    if secret is None:
+        print_skipped("Secret format check skipped: secret is None")
+        return
+    try:
+        parsed = json.loads(secret)
+    except json.JSONDecodeError:
+        parsed = None
+    if isinstance(parsed, (dict, list)):
+        print_failure("Secret is in JSON format, expected plain text password")
+    else:
+        print_success("Secret is plain text password")
+
+
+def check_secret_retrievable(secret_arn, region):
+    """Retrieve the secret and report success or failure. Returns the secret value or None."""
+    secret = get_secret(secret_arn, region)
+    if secret is None:
+        print_failure(f"Secret could not be retrieved from Secrets Manager: {secret_arn}")
+    else:
+        print_success(f"Secret retrieved from Secrets Manager: {secret_arn}")
+    return secret
+
+
+def check_config_db_password(conf_file, secret):
+    """Report whether StoragePass in ``conf_file`` equals ``secret``; skipped when ``secret`` is None."""
+    if secret is None:
+        print_skipped("Database password vs secret check skipped: secret is None")
+        return
+    conf_pass = get_slurm_config_value(conf_file, "StoragePass")
+    if conf_pass is None:
+        return
+    if conf_pass == secret:
+        print_success("Database password matches secret")
+    else:
+        # The two values are deliberately left out of the message: they are credentials.
+        print_failure("Database password does not match secret")
+
+
+def check_config_db_user(conf_file, db_user):
+    """Report whether StorageUser in ``conf_file`` equals ``db_user``."""
+    conf_user = get_slurm_config_value(conf_file, "StorageUser")
+    if conf_user is None:
+        return
+    if conf_user == db_user:
+        print_success("Database user matches configuration")
+    else:
+        print_failure(f"Database user does not match configuration (expected: {db_user}, found: {conf_user})")
+
+
+def check_mysql_connection(host: str, port: int, user: str, password: str):
+    """Report whether a MySQL connection can be opened with the given credentials."""
+    try:
+        conn = pymysql.connect(
+            host=host, port=port, user=user, password=password, connect_timeout=5, ssl={"ssl_disabled": False}
+        )
+        conn.close()
+        print_success("MySQL connection test")
+    except pymysql.Error as e:
+        print_failure(f"MySQL connection test failed: {str(e)}")
+
+
+def check_mysql_user_permissions(host: str, port: int, user: str, password: str):
+    """Connect as ``user`` and print the grants of the account that was authenticated.
+
+    The grants are listed with ``SHOW GRANTS FOR CURRENT_USER()``: it needs no privilege on the
+    mysql system schema, it targets exactly the user@host account matched by the server, and it
+    does not require interpolating any value into the SQL statement.
+    The grants are only displayed; they are not validated against an expected set.
+    """
+    connection = None
+    cursor = None
+    try:
+        # Same connect timeout as check_mysql_connection, so an unreachable server cannot hang the script.
+        connection = pymysql.connect(
+            host=host, port=port, user=user, password=password, connect_timeout=5, ssl={"ssl_disabled": False}
+        )
+        cursor = connection.cursor()
+        cursor.execute("SHOW GRANTS FOR CURRENT_USER()")
+
+        grants = [grant[0] for grant in cursor.fetchall()]
+        print_success(f"Retrieved MySQL grants for user '{user}'")
+        logging.getLogger(__name__).info(f"Grants for user '{user}':")
+        for grant in grants:
+            print(f"  {grant}")
+    except pymysql.Error as e:
+        print_failure(f"Failed to retrieve MySQL permissions for user '{user}': {str(e)}")
+    finally:
+        if cursor:
+            cursor.close()
+        if connection:
+            connection.close()
+
+
+def check_mysql_errors_in_system_messages():
+    """Scan the slurmdbd journal of the current boot for known MySQL-related error messages."""
+    error_patterns = [
+        "mysql_real_connect failed",
+        "Access denied for user",
+        "The database must be up when starting the MYSQL plugin",
+    ]
+    try:
+        # A nosec comment is appended to the following line in order to disable the B603 and B607 checks.
+        # The command uses a fixed, hardcoded executable (sudo, journalctl) with no user-controlled input.
+        result = subprocess.run(
+            ["sudo", "journalctl", "-u", "slurmdbd", "--no-pager", "--boot"], capture_output=True, text=True, check=True
+        )  # nosec B603 B607 nosemgrep
+        log_output = result.stdout
+        found = [p for p in error_patterns if p in log_output]
+        if found:
+            print_failure(f"Found MySQL-related errors in slurmdbd logs: {found}")
+        else:
+            print_success("No errors related to MySQL in slurmdbd logs")
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        print_failure(f"Failed to read slurmdbd logs: {str(e)}")
+
+
+def check_s3_config_matches_local(s3_config):
+    """Check that the cluster config fetched from S3 matches the one stored locally on the head node."""
+    if s3_config is None:
+        print_skipped("S3 vs local config check skipped: S3 config could not be retrieved")
+        return
+    try:
+        local_config = get_cluster_config_local()
+    except RuntimeError as e:
+        print_failure(f"Could not read local cluster config: {str(e)}")
+        return
+    if s3_config == local_config:
+        print_success("S3 cluster config matches local cluster config")
+    else:
+        print_failure("S3 cluster config does not match local cluster config")
+
+
+def _get_db_config(cluster_config):
+    """Return the Scheduling/SlurmSettings/Database section of the cluster config, or {} when absent."""
+    section = cluster_config
+    for key in ("Scheduling", "SlurmSettings", "Database"):
+        if not isinstance(section, dict):
+            return {}
+        section = section.get(key)
+    return section if isinstance(section, dict) else {}
+
+
+@click.command(help="Diagnose SLURM accounting setup.", context_settings={"help_option_names": ["-h", "--help"]})
+@click.option(
+    "--db-endpoint", help="Database endpoint. If not specified, determined from the cluster configuration in S3."
+)
+@click.option(
+    "--db-port", type=int, help="Database port. If not specified, determined from the cluster configuration in S3."
+)
+@click.option("--db-user", help="Database user. If not specified, determined from the cluster configuration in S3.")
+@click.option(
+    "--secret-arn",
+    help="Secret ARN for the database password. If not specified, determined from the cluster configuration in S3.",
+)
+@click.option("--region", help="AWS region. If not specified, determined from the local /etc/chef/dna.json file.")
+def main(db_endpoint, db_port, db_user, secret_arn, region):
+    """Diagnose SLURM accounting.
+
+    This script is meant to be run on the head node.
+    Every parameter that is not specified is retrieved from the cluster configuration; the script
+    exits with status 1 when a parameter is neither specified nor available there.
+    """
+    logger = setup_logging()
+
+    # Always fetch S3 config — needed for auto-resolving missing params and for the config comparison check
+    s3_config = None
+    try:
+        s3_config = get_cluster_config_from_s3(region)
+    except RuntimeError as e:
+        logger.warning(f"Could not fetch S3 config: {str(e)}")
+
+    # Resolve missing parameters from the S3 config. The Database section is optional here, so that
+    # the script still works with explicit parameters when the S3 config is unavailable or incomplete.
+    try:
+        db_config = _get_db_config(s3_config)
+        db_uri = db_config.get("Uri")
+        db_endpoint_config, db_port_config = parse_db_uri(db_uri) if db_uri else (None, None)
+        db_endpoint = db_endpoint or db_endpoint_config
+        db_port = db_port or db_port_config
+        db_user = db_user or db_config.get("UserName")
+        secret_arn = secret_arn or db_config.get("PasswordSecretArn")
+        region = region or read_dna_json()["cluster"]["region"]
+    except (RuntimeError, ValueError) as e:
+        logger.error(str(e))
+        raise SystemExit(1)
+    except (KeyError, TypeError):
+        logger.error("Region not specified and not found in the local dna.json file")
+        raise SystemExit(1)
+
+    missing_options = [
+        option
+        for option, value in (
+            ("--db-endpoint", db_endpoint),
+            ("--db-port", db_port),
+            ("--db-user", db_user),
+            ("--secret-arn", secret_arn),
+        )
+        if not value
+    ]
+    if missing_options:
+        logger.error(
+            f"Could not determine {', '.join(missing_options)} from the cluster configuration: specify them explicitly"
+        )
+        raise SystemExit(1)
+
+    check_s3_config_matches_local(s3_config)
+
+    logger.info("Parameters used for the checks")
+    logger.info(f"Database Endpoint: {db_endpoint}")
+    logger.info(f"Database Port: {db_port}")
+    logger.info(f"Database User: {db_user}")
+    logger.info(f"Secret ARN: {secret_arn}")
+    logger.info(f"Region: {region}")
+
+    check_db_reachable(db_endpoint, db_port)
+    check_config_db_endpoint(SLURMDBD_CONF_FILE, db_endpoint)
+    secret = check_secret_retrievable(secret_arn, region)
+    check_secret_format(secret)
+    check_config_db_user(SLURMDBD_CONF_FILE, db_user)
+    check_config_db_password(SLURMDBD_CONF_FILE, secret)
+    # The MySQL checks use the password slurmdbd is actually configured with, not the secret value.
+    password = get_slurm_config_value(SLURMDBD_CONF_FILE, "StoragePass")
+    check_mysql_connection(db_endpoint, db_port, db_user, password)
+    check_mysql_user_permissions(db_endpoint, db_port, db_user, password)
+    check_mysql_errors_in_system_messages()
+
+    logger.info("All checks completed!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/util/diagnostics/requirements.txt b/util/diagnostics/requirements.txt
new file mode 100644
index 0000000000..2ab86672ce
--- /dev/null
+++ b/util/diagnostics/requirements.txt
@@ -0,0 +1,6 @@
+pyyaml
+boto3
+pymysql
+click
+# This constraint on urllib3 is needed to support OSes shipping old OpenSSL versions, such as AL2.
+urllib3<2.0
\ No newline at end of file