Skip to content

Commit 6934533

Browse files
authored
Merge pull request #3708 from abhishek-sa1/pub/k8s_telemetry
Rename LDMS store theme to slurm-cluster, update LDMS port configuration, and add OME Kafka topic creation
2 parents fa835bc + e1e654b commit 6934533

26 files changed

+618
-291
lines changed

common/library/module_utils/input_validation/schema/telemetry_config.json

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,24 @@
2020
},
2121
"ldms_agg_port": {
2222
"type": "integer",
23-
"const": 6001,
24-
"description": "LDMS aggregator starting port (FIXED - cannot be changed)"
23+
"minimum": 6001,
24+
"maximum": 6100,
25+
"default": 6001,
26+
"description": "LDMS Aggregator port on service k8s cluster. Valid range: 6001-6100. Default: 6001"
2527
},
2628
"ldms_store_port": {
2729
"type": "integer",
28-
"const": 6001,
29-
"description": "LDMS store daemon port (FIXED - cannot be changed)"
30+
"minimum": 6001,
31+
"maximum": 6100,
32+
"default": 6001,
33+
"description": "LDMS store daemon port on service k8s cluster. Valid range: 6001-6100. Can be the same as ldms_agg_port (isolated by pod). Default: 6001"
3034
},
3135
"ldms_sampler_port": {
3236
"type": "integer",
33-
"const": 10001,
34-
"description": "LDMS sampler port on compute nodes (FIXED - cannot be changed)"
37+
"minimum": 10001,
38+
"maximum": 10100,
39+
"default": 10001,
40+
"description": "LDMS sampler port on compute nodes. Valid range: 10001-10100. Default: 10001"
3541
},
3642
"ldms_sampler_configurations": {
3743
"type": "array",
@@ -49,8 +55,9 @@
4955
},
5056
"activation_parameters": {
5157
"type": "string",
52-
"description": "Activation parameters as a string (e.g., 'interval=1000000 offset=0')",
53-
"pattern": "^(?=.*\\binterval=[1-9][0-9]*\\b)(?:.*\\boffset=[0-9]+\\b)?$"
58+
"description": "Activation parameters as a string (e.g., 'interval=1000000 offset=0'). Format: 'interval=<microseconds>' with optional 'offset=<microseconds>' separated by space.",
59+
"pattern": "^interval=[1-9][0-9]*(?:\\s+offset=[0-9]+)?$",
60+
"errorMessage": "Must be in format 'interval=<non-zero-number>' or 'interval=<non-zero-number> offset=<number>'. Example: 'interval=1000000' or 'interval=1000000 offset=0'"
5461
}
5562
},
5663
"required": ["plugin_name", "activation_parameters"],
@@ -114,8 +121,43 @@
114121
"type": "integer"
115122
},
116123
"topic_partitions": {
117-
"type": "integer",
118-
"minimum": 1
124+
"type": "array",
125+
"minItems": 1,
126+
"maxItems": 3,
127+
"items": {
128+
"type": "object",
129+
"properties": {
130+
"name": {
131+
"type": "string",
132+
"enum": ["idrac", "ldms", "ome"],
133+
"description": "CONSTANT: Fixed topic names that cannot be changed. Only 'idrac', 'ldms', and 'ome' are allowed.",
134+
"errorMessage": {
135+
"enum": "Invalid topic name. Only 'idrac', 'ldms', and 'ome' are allowed as Kafka topic names. Custom topic names are not supported."
136+
}
137+
},
138+
"partitions": {
139+
"type": "integer",
140+
"minimum": 1,
141+
"maximum": 100,
142+
"description": "Number of partitions for the topic (1-100). This is the only configurable parameter."
143+
}
144+
},
145+
"required": ["name", "partitions"],
146+
"additionalProperties": false,
147+
"errorMessage": {
148+
"required": {
149+
"name": "Topic 'name' is required and must be one of: 'idrac', 'ldms', 'ome'",
150+
"partitions": "Topic 'partitions' is required and must be between 1-100"
151+
}
152+
}
153+
},
154+
"uniqueItems": true,
155+
"description": "IMPORTANT: At least one Kafka topic must be defined. Topic names 'idrac', 'ldms', and 'ome' are CONSTANTS. 'idrac' is required if idrac_telemetry_support is true and kafka is in idrac_telemetry_collection_type. 'ldms' is required if LDMS software is configured in software_config.json (automatic detection). 'ome' is optional. Only partition counts can be changed.",
156+
"errorMessage": {
157+
"minItems": "At least 1 Kafka topic must be defined. Configure based on enabled features.",
158+
"maxItems": "Maximum 3 topics allowed: 'idrac', 'ldms', and 'ome'",
159+
"uniqueItems": "Each topic (idrac, ldms, ome) must appear only once"
160+
}
119161
}
120162
},
121163
"required": [

common/library/module_utils/input_validation/validation_flows/common_validation.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1137,6 +1137,124 @@ def validate_telemetry_config(
11371137
en_us_validation_msg.TELEMETRY_SERVICE_CLUSTER_ENTRY_MISSING_ROLES_CONFIG_MSG
11381138
)
11391139
)
1140+
1141+
# Determine LDMS support from software_config.json
1142+
# software_config.json is in the same directory as telemetry_config.yml
1143+
ldms_support_from_software_config = False
1144+
input_dir = os.path.dirname(input_file_path)
1145+
software_config_file_path = os.path.join(input_dir, "software_config.json")
1146+
1147+
logger.info(f"Checking for LDMS software in: {software_config_file_path}")
1148+
1149+
if os.path.exists(software_config_file_path):
1150+
try:
1151+
with open(software_config_file_path, 'r') as f:
1152+
software_config = json.load(f)
1153+
softwares = software_config.get("softwares", [])
1154+
ldms_support_from_software_config = any(
1155+
software.get("name") == "ldms" for software in softwares
1156+
)
1157+
logger.info(f"LDMS software detected in software_config.json: {ldms_support_from_software_config}")
1158+
if ldms_support_from_software_config:
1159+
logger.info("LDMS software found - 'ldms' topic will be required in kafka_configurations.topic_partitions")
1160+
except (json.JSONDecodeError, IOError) as e:
1161+
logger.warn(f"Could not load software_config.json: {e}")
1162+
else:
1163+
logger.info(f"software_config.json not found at: {software_config_file_path}")
1164+
1165+
# Validate topic_partitions configuration
1166+
kafka_config = data.get("kafka_configurations", {})
1167+
topic_partitions = kafka_config.get("topic_partitions", [])
1168+
idrac_telemetry_collection_type = data.get("idrac_telemetry_collection_type", "")
1169+
1170+
# Check if LDMS software is configured but kafka_configurations is missing entirely
1171+
if ldms_support_from_software_config and not kafka_config:
1172+
errors.append(create_error_msg(
1173+
"kafka_configurations",
1174+
"not defined",
1175+
"LDMS software is configured in software_config.json, but kafka_configurations section is missing in telemetry_config.yml. "
1176+
"Please define kafka_configurations with at least the 'ldms' topic in topic_partitions."
1177+
))
1178+
1179+
# Check if LDMS software is configured but no topics are defined
1180+
if ldms_support_from_software_config and kafka_config and not topic_partitions:
1181+
errors.append(create_error_msg(
1182+
"kafka_configurations.topic_partitions",
1183+
"not defined",
1184+
"LDMS software is configured in software_config.json, but kafka_configurations.topic_partitions is not defined. "
1185+
"Please define at least the 'ldms' topic in topic_partitions."
1186+
))
1187+
1188+
if topic_partitions:
1189+
# Ensure at least one topic is defined
1190+
if len(topic_partitions) < 1:
1191+
errors.append(create_error_msg(
1192+
"kafka_configurations.topic_partitions",
1193+
"is empty",
1194+
"At least one Kafka topic must be defined"
1195+
))
1196+
1197+
# Collect topic names and validate each one
1198+
topic_names = []
1199+
allowed_topics = {"idrac", "ldms", "ome"}
1200+
1201+
for idx, topic in enumerate(topic_partitions):
1202+
if "name" not in topic:
1203+
errors.append(create_error_msg(
1204+
f"kafka_configurations.topic_partitions[{idx}]",
1205+
"missing 'name' field",
1206+
"Each topic must have a 'name' field"
1207+
))
1208+
continue
1209+
1210+
topic_name = topic.get("name")
1211+
topic_names.append(topic_name)
1212+
1213+
# Validate each topic name individually
1214+
if topic_name not in allowed_topics:
1215+
errors.append(create_error_msg(
1216+
f"kafka_configurations.topic_partitions[{idx}].name",
1217+
topic_name,
1218+
f"Invalid topic name '{topic_name}'. Only 'idrac', 'ldms', and 'ome' are allowed as Kafka topic names. Custom topic names are not supported."
1219+
))
1220+
1221+
present_topics = set(topic_names)
1222+
1223+
# Debug logging
1224+
logger.info(f"Telemetry validation - Present topics: {present_topics}")
1225+
logger.info(f"Telemetry validation - Allowed topics: {allowed_topics}")
1226+
1227+
# Validate required topics based on feature flags
1228+
# If iDRAC telemetry is enabled with Kafka, idrac topic is required
1229+
if idrac_telemetry_support and 'kafka' in idrac_telemetry_collection_type.split(','):
1230+
if 'idrac' not in present_topics:
1231+
errors.append(create_error_msg(
1232+
"kafka_configurations.topic_partitions",
1233+
"missing 'idrac' topic",
1234+
"idrac topic is required when idrac_telemetry_support is true and 'kafka' is in idrac_telemetry_collection_type"
1235+
))
1236+
1237+
# If LDMS software is configured in software_config.json, ldms topic is required
1238+
logger.info(f"Checking LDMS topic requirement - ldms_support_from_software_config: {ldms_support_from_software_config}")
1239+
if ldms_support_from_software_config and 'ldms' not in present_topics:
1240+
logger.error(f"LDMS topic validation FAILED - 'ldms' topic is missing from present_topics: {present_topics}")
1241+
errors.append(create_error_msg(
1242+
"kafka_configurations.topic_partitions",
1243+
"missing 'ldms' topic",
1244+
"ldms topic is required when LDMS software is configured in software_config.json"
1245+
))
1246+
elif ldms_support_from_software_config:
1247+
logger.info(f"LDMS topic validation PASSED - 'ldms' found in present_topics: {present_topics}")
1248+
1249+
# Check for duplicate topic names
1250+
if len(topic_names) != len(set(topic_names)):
1251+
duplicates = [name for name in topic_names if topic_names.count(name) > 1]
1252+
errors.append(create_error_msg(
1253+
"kafka_configurations.topic_partitions",
1254+
f"duplicate topics: {', '.join(set(duplicates))}",
1255+
"Each topic must be defined only once"
1256+
))
1257+
11401258
return errors
11411259

11421260
def validate_additional_software(

discovery/discovery.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@
3636
- name: Set dynamic run tags including 'provision'
3737
when: not config_file_status | default(false) | bool
3838
ansible.builtin.set_fact:
39-
omnia_run_tags: "{{ ((ansible_run_tags | default([]))
40-
+ ['provision', 'slurm', 'slurm_custom', 'security', 'service_k8s', 'csi_driver_powerscale']) | unique }}" # noqa: yaml[line-length]
39+
# Parenthesise the whole concatenation: Jinja2 filters bind tighter than '+', so an unparenthesised trailing '| unique' would only apply to the last list.
omnia_run_tags: "{{ ((ansible_run_tags | default([])) + ['provision', 'slurm', 'slurm_custom', 'security', 'service_k8s', 'csi_driver_powerscale', 'telemetry']) | unique }}" # noqa: yaml[line-length]
4140
cacheable: true
4241

4342
- name: Invoke validate_config.yml to perform L1 and L2 validations

discovery/roles/configure_ochami/templates/ldms/ldms_conf.sh.j2

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,22 @@ else
1414
echo "Environment file /opt/ovis-ldms/etc/profile.d/set-ovis-variables.sh not found. Continuing..." | tee -a "$LOG_FILE"
1515
fi
1616

17-
# --- Verify environment file existence ---
18-
echo "Checking for LDMS sampler environment file..." | tee -a "$LOG_FILE"
17+
# --- Copy LDMS sampler environment file from NFS share ---
18+
echo "Copying LDMS sampler environment file from NFS share..." | tee -a "$LOG_FILE"
19+
if [ -f {{ client_mount_path }}/ldms/samplers/ldmsd.sampler.env ]; then
20+
sudo cp {{ client_mount_path }}/ldms/samplers/ldmsd.sampler.env /opt/ovis-ldms/etc/ldms/ldmsd.sampler.env
21+
echo "✓ Copied ldmsd.sampler.env" | tee -a "$LOG_FILE"
22+
else
23+
echo "Warning: ldmsd.sampler.env not found in NFS share" | tee -a "$LOG_FILE"
24+
fi
25+
26+
# --- Source environment file to get port ---
1927
if [ -f /opt/ovis-ldms/etc/ldms/ldmsd.sampler.env ]; then
20-
echo "Found ldmsd.sampler.env" | tee -a "$LOG_FILE"
28+
source /opt/ovis-ldms/etc/ldms/ldmsd.sampler.env
# Fall back to the default sampler port when the env file does not define LDMSD_PORT,
# so the firewall rule and ldms_ls checks below never receive an empty port.
LDMSD_PORT="${LDMSD_PORT:-10001}"
29+
echo "✓ Sourced ldmsd.sampler.env (Port: $LDMSD_PORT)" | tee -a "$LOG_FILE"
2130
else
22-
echo "Warning: ldmsd.sampler.env not found" | tee -a "$LOG_FILE"
31+
LDMSD_PORT=10001
32+
echo "Warning: Using default port $LDMSD_PORT" | tee -a "$LOG_FILE"
2333
fi
2434

2535
# --- Configure and enable LDMS service ---
@@ -36,13 +46,13 @@ fi
3646

3747
# Configure firewall safely
3848

39-
echo "Configuring firewall for LDMS port 10001..." | tee -a "$LOG_FILE"
40-
sudo firewall-cmd --permanent --add-port=10001/tcp
49+
echo "Configuring firewall for LDMS port $LDMSD_PORT..." | tee -a "$LOG_FILE"
50+
sudo firewall-cmd --permanent --add-port=$LDMSD_PORT/tcp
4151
sudo firewall-cmd --reload
4252

4353
# --- Verify LDMS connection and metrics ---
4454
echo "Verifying LDMS connection and metrics..." | tee -a "$LOG_FILE"
45-
/opt/ovis-ldms/sbin/ldms_ls -a ovis -A conf=/opt/ovis-ldms/etc/ldms/ldmsauth.conf -p 10001 -h localhost | tee -a "$LOG_FILE"
46-
/opt/ovis-ldms/sbin/ldms_ls -l -a ovis -A conf=/opt/ovis-ldms/etc/ldms/ldmsauth.conf -p 10001 -h localhost > /tmp/metrics
55+
/opt/ovis-ldms/sbin/ldms_ls -a ovis -A conf=/opt/ovis-ldms/etc/ldms/ldmsauth.conf -p $LDMSD_PORT -h localhost | tee -a "$LOG_FILE"
56+
/opt/ovis-ldms/sbin/ldms_ls -l -a ovis -A conf=/opt/ovis-ldms/etc/ldms/ldmsauth.conf -p $LDMSD_PORT -h localhost > /tmp/metrics
4757

4858
echo "===== LDMS setup completed =====" | tee -a "$LOG_FILE"

discovery/roles/telemetry/files/nersc-ldms-aggr/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ For csm: make_host_map.csm.py:
6161
3. Write to out_dir/host_map.json
6262

6363
For dell: make_host_map.dell.py:
64-
1. Copy host_map.r7525.json to out_dir/host_map.json
64+
1. Copy host_map.slurm-cluster.json to out_dir/host_map.json
6565

6666
Create ldms config and prepare chart (nersc_ldms_make_ldms_config.py)
6767
1. Create `ldmsd` config and environment variable files for each `ldmsd` to distribute the producers across daemons and enable daemons to find each other.
@@ -256,11 +256,11 @@ helm install -n telemetry nersc-ldms-aggr nersc-ldms-aggr --values values.yaml
256256
257257
# Let it startup
258258
kubectl -n telemetry top pods --containers |grep ldms
259-
nersc-ldms-aggr-0 r7525-0 2m 16Mi
259+
nersc-ldms-aggr-0 slurm-cluster-0 2m 16Mi
260260
nersc-ldms-exporter-0 exporter 1m 27Mi
261261
nersc-ldms-exporter-1 exporter 1m 24Mi
262262
nersc-ldms-exporter-2 exporter 1m 24Mi
263-
nersc-ldms-store-r7525-0 store 2m 10Mi
263+
nersc-ldms-store-slurm-cluster-0 store 2m 10Mi
264264
nersc-ldms-stream-0 stream 1m 13Mi
265265
266266
```
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#!/bin/bash
22

33
if [ ! -d "$
4-
cp host_map.r7525.json out_dir
4+
cp host_map.slurm-cluster.json out_dir

discovery/roles/telemetry/files/nersc-ldms-aggr/make_host_map.dell.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def main(self):
4747
os.makedirs(self.out_dir, exist_ok=True)
4848

4949
# PLACE HOLDER: just copy the example file for now
50-
shutil.copy("host_map.r7525.json", self.out_dir)
50+
shutil.copy("host_map.slurm-cluster.json", self.out_dir)
5151

5252
def main():
5353
parser = argparse.ArgumentParser()

discovery/roles/telemetry/files/nersc-ldms-aggr/nersc_ldms_make_ldms_config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def make_agg_configs(self): # pylint: disable=too-many-locals
166166
def make_store_configs(self): # pylint: disable=too-many-locals
167167
"""Generate store configuration files."""
168168
logging.info("Make Store Configs")
169+
169170
for ldmsd_name, ldmsd_conf in self.config['node_types'].items():
170171
# grab auth data
171172
auth_type = ldmsd_conf.get('auth_type')
@@ -537,7 +538,7 @@ def make_config_store(self, ldmsd_name, ldmsd_agg_name, ldmsd_agg_port, # pylin
537538
cfg.append("prdcr_start_regex regex=.*")
538539
cfg.extend([
539540
"# Store in kafka - port 9092 (plaintext, no TLS, no auth)",
540-
"# NOTE: store_avro_kafka plugin cannot configure TLS/SSL",
541+
"# NOTE: store_avro_kafka plugin using plaintext for anonymous access",
541542
"# Using plaintext listener on port 9092 (internal cluster only)",
542543
"load name=store_avro_kafka",
543544
"config name=store_avro_kafka encoding=json topic=ldms",

0 commit comments

Comments
 (0)