# [Diagnostics] Add diagnostics suite with first tool to diagnose SLURM accounting setup (#7336)
Changes from all commits: f474247, 2af84d7, 6d72b65, a17a18f
**`README.md`** (new file, +40 lines)

# ParallelCluster Diagnostics

A collection of scripts to diagnose common ParallelCluster issues. The diagnostics suite is meant to be executed on the cluster head node.

## Requirements

The following tools must be installed on your **local machine** to deploy the diagnostics suite to your cluster.

| Tool | Purpose |
|---|---|
| `pcluster` | AWS ParallelCluster CLI, used to retrieve head node connection info |
| `ssh` | Used to connect to the head node and install dependencies |
| `rsync` | Used to upload the diagnostics folder to the head node |

## Available Scripts

| Script | Description |
|---|---|
| `diagnose-slurm-accounting.py` | Diagnoses SLURM accounting setup |

## Usage

### 1. Deploy to the head node

Run `deploy.sh` from your local machine. It uploads the diagnostics folder to the head node and installs dependencies.

```bash
bash deploy.sh --cluster-name <cluster-name> --region <region> --ssh-key <path-to-key>
```
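For example, matching the sample invocation documented in the header of `deploy.sh` below:

```bash
bash deploy.sh --cluster-name my-cluster --region us-east-1 --ssh-key ~/.ssh/my-key.pem
```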
At the end, it prints the SSH command to log directly into the diagnostics folder on the head node.
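The printed command has the following shape; the user, key path, and folder name are illustrative placeholders, derived at runtime from the `pcluster` output, the `--ssh-key` argument, and the local folder name:

```bash
ssh -i ~/.ssh/my-key.pem <user>@<head-node-ip> -t 'cd ~/<diagnostics-folder> && bash -l'
```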
### 2. Run a diagnostic script (example)

Once logged into the head node:

```bash
./diagnose-slurm-accounting.py --help
```
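The diagnostic scripts report each check with the colored status markers defined in the shared utilities module below: `[✓]` for a passed check, `[✗]` for a failed one, and `[~]` for a skipped one. Output like the following is illustrative, not verbatim:

```
[✓] Downloaded cluster configuration from S3
[✗] AccountingStorageHost not found in configuration file
[~] Database connectivity check skipped
```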
**Shared utilities module** (new file, +112 lines)

```python
import json
import logging
import re

# nosec B404: The subprocess module is used intentionally to run trusted system commands.
import subprocess  # nosec B404

import boto3
import yaml
from botocore.exceptions import ClientError

CHEF_DNA_JSON_FILE = "/etc/chef/dna.json"
LOCAL_CLUSTER_CONFIG_FILE = "/opt/parallelcluster/shared/cluster-config.yaml"


def setup_logging():
    """Set up common logging configuration for all diagnosis scripts."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
    return logging.getLogger(__name__)


def print_success(message):
    """Print the message with a green [✓] marker."""
    print(f"\033[32m[✓] {message}\033[0m")


def print_failure(message):
    """Print the message with a red [✗] marker."""
    print(f"\033[31m[✗] {message}\033[0m")


def print_skipped(message):
    """Print the message with a yellow [~] marker."""
    print(f"\033[33m[~] {message}\033[0m")


def read_dna_json():
    """Read the Chef dna.json file written on the head node."""
    try:
        with open(CHEF_DNA_JSON_FILE, "r") as f:
            return json.load(f)
    except Exception as e:
        raise RuntimeError(f"Failed to read {CHEF_DNA_JSON_FILE}: {str(e)}") from e


def parse_db_uri(uri):
    """Split an 'endpoint:port' database URI; default to MySQL port 3306 if no port is given."""
    if ":" in uri:
        endpoint, port_str = uri.split(":", 1)
        return endpoint, int(port_str)
    else:
        return uri, 3306


def get_cluster_config_from_s3(region=None):
    """Download the cluster configuration from S3, as referenced by dna.json."""
    try:
        dna = read_dna_json()
        bucket = dna["cluster"]["cluster_s3_bucket"]
        key = dna["cluster"]["cluster_config_s3_key"]
        version = dna["cluster"]["cluster_config_version"]

        s3 = boto3.client("s3", region_name=region) if region else boto3.client("s3")

        response = s3.get_object(Bucket=bucket, Key=key, VersionId=version)
        config = yaml.safe_load(response["Body"])

        print_success("Downloaded cluster configuration from S3")
        return config
    except RuntimeError:
        raise
    except Exception as e:
        raise RuntimeError(f"Failed to get config from S3: {str(e)}") from e


def read_yaml(path):
    """Read a YAML file and return its contents as a dictionary."""
    try:
        with open(path, "r") as f:
            return yaml.safe_load(f)
    except Exception as e:
        raise RuntimeError(f"Failed to read YAML file {path}: {str(e)}") from e


def get_cluster_config_local():
    """Read the cluster configuration cached on the head node."""
    try:
        return read_yaml(LOCAL_CLUSTER_CONFIG_FILE)
    except RuntimeError:
        raise


def get_slurm_config_value(conf_file, property_name):
    """Return the value of a 'property=value' entry in a SLURM configuration file, or None."""
    try:
        # A nosec comment is appended to the following line in order to disable the B603 and B607 checks.
        # The command is constructed from a trusted, hardcoded path (conf_file) and a fixed executable (sudo, cat).
        result = subprocess.run(
            ["sudo", "cat", conf_file], capture_output=True, text=True, check=True
        )  # nosec B603 B607 nosemgrep
        content = result.stdout
        match = re.search(f"{property_name}=(.*?)(?:\n|$)", content)
        if match:
            return match.group(1)
        print_failure(f"{property_name} not found in configuration file")
        return None
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print_failure(f"Failed to read configuration file {conf_file}: {str(e)}")
        return None


def get_secret(secret_arn, region=None):
    """Retrieve a secret string from AWS Secrets Manager."""
    try:
        session = boto3.session.Session()
        client = session.client(service_name="secretsmanager", region_name=region)
        response = client.get_secret_value(SecretId=secret_arn)
        return response["SecretString"]
    except ClientError as e:
        print_failure(f"Failed to retrieve secret from AWS Secrets Manager: {str(e)}")
        return None
```
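As a sketch of how a diagnosis script might compose these helpers, assuming it imports them from this module; the cluster-config keys (`Scheduling/SlurmSettings/Database`) and the `/opt/slurm/etc/slurm.conf` path are assumptions for illustration, not taken from this PR:

```python
# Hypothetical diagnosis flow built on the helpers above.
logger = setup_logging()

config = get_cluster_config_local()
database = config.get("Scheduling", {}).get("SlurmSettings", {}).get("Database", {})

if not database:
    print_skipped("SLURM accounting is not configured for this cluster")
else:
    # The database URI may omit the port; parse_db_uri falls back to 3306.
    endpoint, port = parse_db_uri(database["Uri"])
    logger.info("Checking accounting database at %s:%s", endpoint, port)

    # Report what slurmctld actually sees in slurm.conf (path is an assumption).
    storage_host = get_slurm_config_value("/opt/slurm/etc/slurm.conf", "AccountingStorageHost")
    if storage_host:
        print_success(f"slurm.conf AccountingStorageHost: {storage_host}")

    # Retrieve the database password from Secrets Manager, if configured.
    secret_arn = database.get("PasswordSecretArn")
    if secret_arn and get_secret(secret_arn) is not None:
        print_success("Retrieved database credentials from Secrets Manager")
```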
**`deploy.sh`** (new file, +106 lines)

```bash
#!/bin/bash
# Uploads the diagnostics folder to the head node of a ParallelCluster and installs dependencies.
#
# Usage:
#   bash deploy.sh --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-key>]
#
# Example:
#   bash deploy.sh --cluster-name my-cluster --region us-east-1 --ssh-key ~/.ssh/my-key.pem
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

usage() {
  echo "Usage: $0 --cluster-name <cluster-name> --region <region> [--ssh-key <path>]"
  echo ""
  echo "Upload the diagnostics folder to the head node of a ParallelCluster."
  echo ""
  echo "Options:"
  echo "  --cluster-name, -n   Name of the cluster"
  echo "  --region, -r         AWS region"
  echo "  --ssh-key, -i        Path to the SSH private key"
  exit "${1:-1}"
}

CLUSTER_NAME=""
REGION=""
SSH_KEY=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --cluster-name|-n) CLUSTER_NAME="$2"; shift 2 ;;
    --region|-r) REGION="$2"; shift 2 ;;
    --ssh-key|-i) SSH_KEY="$2"; shift 2 ;;
    --help|-h) usage 0 ;;
    *) echo "[ERROR] Unknown option: $1"; usage ;;
  esac
done

if [[ -z "$CLUSTER_NAME" || -z "$REGION" ]]; then
  usage
fi

if [[ -n "$SSH_KEY" && ! -f "$SSH_KEY" ]]; then
  echo "[ERROR] SSH key file not found: $SSH_KEY"
  exit 1
fi

echo "[INFO] Retrieving head node connection info for cluster '${CLUSTER_NAME}' in region '${REGION}'..."

# Check pcluster is available
if ! command -v pcluster &>/dev/null; then
  echo "[ERROR] 'pcluster' command not found. Please install the AWS ParallelCluster CLI."
  exit 1
fi

# Run pcluster ssh dryrun; on failure, surface the CLI error directly
PCLUSTER_OUTPUT=$(pcluster ssh -n "$CLUSTER_NAME" -r "$REGION" --dryrun true 2>&1) || {
  echo "[ERROR] pcluster command failed: ${PCLUSTER_OUTPUT}"
  exit 1
}

SSH_CMD=$(echo "$PCLUSTER_OUTPUT" | python3 -c "import sys, json; print(json.load(sys.stdin)['command'])" 2>/dev/null)
```
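The `python3 -c` one-liner expects the dryrun output to be a JSON document whose `command` field holds the full SSH command line, along the lines of the following (illustrative, not verbatim CLI output):

```json
{"command": "ssh ec2-user@192.0.2.10 -i ~/.ssh/my-key.pem"}
```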
```bash
if [[ -z "$SSH_CMD" ]]; then
  echo "[ERROR] Could not parse pcluster ssh output: '${PCLUSTER_OUTPUT}'"
  exit 1
fi

# Extract user and IP from "ssh <user>@<ip>"
USER_AT_IP=$(echo "$SSH_CMD" | awk '{print $2}')
DEFAULT_USER="${USER_AT_IP%%@*}"
HEAD_NODE_IP="${USER_AT_IP##*@}"

if [[ -z "$DEFAULT_USER" || -z "$HEAD_NODE_IP" ]]; then
  echo "[ERROR] Could not parse user and IP from pcluster ssh output: '${SSH_CMD}'"
  exit 1
fi

echo "[INFO] Head node IP: ${HEAD_NODE_IP}"
echo "[INFO] Default user: ${DEFAULT_USER}"
echo "[INFO] Uploading ${SCRIPT_DIR} to ${DEFAULT_USER}@${HEAD_NODE_IP}:~/"

REMOTE_DIR="$(basename "$SCRIPT_DIR")"

# Build rsync and ssh args as arrays to safely handle paths with spaces
RSYNC_ARGS=(-av --exclude="README.md" --exclude="deploy.sh" --exclude="__pycache__")
SSH_ARGS=()
if [[ -n "$SSH_KEY" ]]; then
  RSYNC_ARGS+=(-e "ssh -i ${SSH_KEY}")
  SSH_ARGS+=(-i "${SSH_KEY}")
fi

rsync "${RSYNC_ARGS[@]}" "$SCRIPT_DIR" "${DEFAULT_USER}@${HEAD_NODE_IP}:~/"

echo "[INFO] Done. Files uploaded to /home/${DEFAULT_USER}/${REMOTE_DIR}/"

echo "[INFO] Installing requirements on head node..."

ssh "${SSH_ARGS[@]}" "${DEFAULT_USER}@${HEAD_NODE_IP}" "pip install -r ~/${REMOTE_DIR}/requirements.txt"
```
> **Reviewer (Contributor):** [Non-blocking] We should create a virtual environment so that we do not install packages which could be related to CVEs and could be picked up during a scan, especially when we are not baking them into the AMI.
>
> **Author (Contributor):** Agree, will do in follow-up PR.
```bash
echo "[INFO] Requirements installed successfully."
echo "[INFO] Next steps: log into the head node and run the diagnostics scripts from ~/${REMOTE_DIR}/"
SSH_LOGIN_CMD="ssh"
[[ -n "$SSH_KEY" ]] && SSH_LOGIN_CMD+=" -i ${SSH_KEY}"
SSH_LOGIN_CMD+=" ${DEFAULT_USER}@${HEAD_NODE_IP} -t 'cd ~/${REMOTE_DIR} && bash -l'"
echo "[INFO] ${SSH_LOGIN_CMD}"
```
There was an error while loading. Please reload this page.