Skip to content

Commit 16b9e08

Browse files
CR2-22 CR2-49 CR2-48 add metrics list to sdk (#328)
* feat(metrics): add metrics list to sdk CR-22 * refactor(metrics): remove charts from ListMetrics * feat(metrics): sdk and cli have metrics list functionality for jobs experiments and deployments * fix(listMetrics): remove condition for localhost port :8080 * fix(deployments): list metrics, not get metrics * Revert "fix(deployments): list metrics, not get metrics" This reverts commit dc9a1b9. * fix(deployments): list metrics, not get metrics * feat(notebooks): cli notebooks list metrics option * Update gradient/api_sdk/repositories/experiments.py Co-authored-by: Colin <[email protected]> * fix(metrics): correct metrics_api_url while in local development * fix(config): revert config to DEFAULT_CONFIG_HOST * chore(metrics): added port :8080 in db so removing logic from code Co-authored-by: Colin <[email protected]>
1 parent 6b6ca6d commit 16b9e08

File tree

18 files changed

+444
-16
lines changed

18 files changed

+444
-16
lines changed

gradient/api_sdk/clients/deployment_client.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,27 @@ def get_metrics(self, deployment_id, start=None, end=None, interval="30s", built
296296
)
297297
return metrics
298298

299+
300+
def list_metrics(self, deployment_id, start=None, end=None, interval="30s"):
    """List the metrics available for a model deployment

    Builds a ``ListDeploymentMetrics`` repository and returns the result of
    its ``get`` call unchanged.

    :param str deployment_id: ID of deployment
    :param datetime.datetime|str start:
    :param datetime.datetime|str end:
    :param str interval:
    :returns: whatever the repository returns
        (NOTE(review): presumably the metric names reported by the metrics API - confirm)
    """
    query = {
        "id": deployment_id,
        "start": start,
        "end": end,
        "interval": interval,
    }
    return self.build_repository(repositories.ListDeploymentMetrics).get(**query)
319+
299320
def stream_metrics(self, deployment_id, interval="30s", built_in_metrics=None):
300321
"""Stream live model deployment metrics
301322

gradient/api_sdk/clients/experiment_client.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,27 @@ def get_metrics(self, experiment_id, start=None, end=None, interval="30s", built
831831
)
832832
return metrics
833833

834+
def list_metrics(self, experiment_id, start=None, end=None, interval="30s"):
    """List the metrics available for an experiment

    Builds a ``ListExperimentMetrics`` repository and returns the result of
    its ``get`` call unchanged.

    :param str experiment_id: ID of experiment
    :param datetime.datetime|str start:
    :param datetime.datetime|str end:
    :param str interval:
    :returns: whatever the repository returns
        (NOTE(review): presumably the metric names reported by the metrics API - confirm)
    """
    repository = self.build_repository(repositories.ListExperimentMetrics)
    return repository.get(
        id=experiment_id,
        start=start,
        end=end,
        interval=interval,
    )
853+
854+
834855
def stream_metrics(self, experiment_id, interval="30s", built_in_metrics=None):
835856
"""Stream live experiment metrics
836857

gradient/api_sdk/clients/job_client.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from .base_client import BaseClient, TagsSupportMixin
77
from ..models import Artifact, Job
88
from ..repositories.jobs import ListJobs, ListJobLogs, ListJobArtifacts, CreateJob, DeleteJob, StopJob, \
9-
DeleteJobArtifacts, GetJobArtifacts, GetJobMetrics, StreamJobMetrics
9+
DeleteJobArtifacts, GetJobArtifacts, GetJobMetrics, ListJobMetrics, StreamJobMetrics
1010

1111

1212
class JobsClient(TagsSupportMixin, BaseClient):
@@ -396,6 +396,26 @@ def get_metrics(self, job_id, start=None, end=None, interval="30s", built_in_met
396396
)
397397
return metrics
398398

399+
def list_metrics(self, job_id, start=None, end=None, interval="30s"):
    """List the metrics available for a job

    Builds a ``ListJobMetrics`` repository and returns the result of its
    ``get`` call unchanged.

    :param str job_id: ID of a job
    :param datetime.datetime|str start:
    :param datetime.datetime|str end:
    :param str interval:
    :returns: whatever the repository returns
        (NOTE(review): presumably the metric names reported by the metrics API - confirm)
    """
    query = {
        "id": job_id,
        "start": start,
        "end": end,
        "interval": interval,
    }
    return self.build_repository(ListJobMetrics).get(**query)
418+
399419
def stream_metrics(self, job_id, interval="30s", built_in_metrics=None):
400420
"""Stream live job metrics
401421

gradient/api_sdk/clients/notebook_client.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,26 @@ def get_metrics(self, notebook_id, start=None, end=None, interval="30s", built_i
191191
)
192192
return metrics
193193

194+
def list_metrics(self, notebook_id, start=None, end=None, interval="30s"):
    """List the metrics available for a notebook

    Builds a ``ListNotebookMetrics`` repository and returns the result of
    its ``get`` call unchanged.

    :param str notebook_id: notebook ID
    :param datetime.datetime|str start:
    :param datetime.datetime|str end:
    :param str interval:
    :returns: whatever the repository returns
        (NOTE(review): presumably the metric names reported by the metrics API - confirm)
    """
    query = {
        "id": notebook_id,
        "start": start,
        "end": end,
        "interval": interval,
    }
    return self.build_repository(repositories.ListNotebookMetrics).get(**query)
213+
194214
def stream_metrics(self, notebook_id, interval="30s", built_in_metrics=None):
195215
"""Stream live notebook metrics
196216

gradient/api_sdk/repositories/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,19 @@
44
from .dataset_versions import ListDatasetVersions, CreateDatasetVersion, DeleteDatasetVersion, \
55
GenerateDatasetVersionPreSignedS3Urls, GetDatasetVersion, UpdateDatasetVersion
66
from .deployments import ListDeployments, CreateDeployment, StartDeployment, StopDeployment, DeleteDeployment, \
7-
UpdateDeployment, GetDeployment, GetDeploymentMetrics, StreamDeploymentMetrics, ListDeploymentLogs
7+
UpdateDeployment, GetDeployment, GetDeploymentMetrics, ListDeploymentMetrics, StreamDeploymentMetrics, ListDeploymentLogs
88
from .experiments import ListExperiments, GetExperiment, ListExperimentLogs, StartExperiment, StopExperiment, \
99
CreateSingleNodeExperiment, CreateMultiNodeExperiment, RunSingleNodeExperiment, RunMultiNodeExperiment, \
10-
CreateMpiMultiNodeExperiment, RunMpiMultiNodeExperiment, DeleteExperiment, GetExperimentMetrics, \
10+
CreateMpiMultiNodeExperiment, RunMpiMultiNodeExperiment, DeleteExperiment, GetExperimentMetrics, ListExperimentMetrics, \
1111
StreamExperimentMetrics
1212
from .hyperparameter import CreateHyperparameterJob, CreateAndStartHyperparameterJob, ListHyperparameterJobs, \
1313
GetHyperparameterTuningJob, StartHyperparameterTuningJob
14-
from .jobs import ListJobs, ListResources, ListJobArtifacts, ListJobLogs, GetJob, GetJobMetrics, StreamJobMetrics
14+
from .jobs import ListJobs, ListResources, ListJobArtifacts, ListJobLogs, GetJob, GetJobMetrics, ListJobMetrics, StreamJobMetrics
1515
from .machine_types import ListMachineTypes
1616
from .machines import CheckMachineAvailability, CreateMachine, CreateResource, StartMachine, StopMachine, \
1717
RestartMachine, GetMachine, UpdateMachine, GetMachineUtilization
1818
from .models import DeleteModel, ListModels, UploadModel, GetModel, ListModelFiles
19-
from .notebooks import CreateNotebook, DeleteNotebook, GetNotebook, ListNotebooks, GetNotebookMetrics, \
19+
from .notebooks import CreateNotebook, DeleteNotebook, GetNotebook, ListNotebooks, GetNotebookMetrics, ListNotebookMetrics, \
2020
StreamNotebookMetrics, StopNotebook, StartNotebook, ForkNotebook, ListNotebookArtifacts, ListNotebookLogs
2121
from .projects import CreateProject, ListProjects, DeleteProject, GetProject
2222
from .secrets import ListSecrets, SetSecret, DeleteSecret

gradient/api_sdk/repositories/common.py

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,84 @@ def _format_datetime(self, some_datetime):
377377
datetime_str = some_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
378378
return datetime_str
379379

380+
@six.add_metaclass(abc.ABCMeta)
class ListMetrics(GetResource):
    """Base repository that queries the ``metrics/api/v1/list`` endpoint.

    Subclasses set ``OBJECT_TYPE`` and implement ``_get_instance_by_id`` to
    fetch the resource whose ``metrics_url``, ``dt_started`` and
    ``dt_created`` attributes are used to build the request.
    """

    # Sent to the metrics API as the "objecttype" request parameter
    OBJECT_TYPE = None

    # Used when the caller passes no (or a falsy) "interval"
    DEFAULT_INTERVAL = "30s"

    @abc.abstractmethod
    def _get_instance_by_id(self, instance_id, **kwargs):
        """Return the resource object identified by ``instance_id``"""
        pass

    def _get_metrics_api_url(self, instance, protocol="https"):
        """Build the metrics API base url from ``instance.metrics_url``

        :raises GradientSdkError: if the instance has no metrics url
        """
        if not instance.metrics_url:
            raise GradientSdkError("Metrics API url not found")

        return concatenate_urls(protocol + "://", instance.metrics_url)

    def _get(self, **kwargs):
        """Translate the caller's kwargs into request kwargs and delegate to the parent ``_get``"""
        request_kwargs = self._get_kwargs(kwargs)
        return super(ListMetrics, self)._get(**request_kwargs)

    def _get_kwargs(self, kwargs):
        """Build the kwargs used for the request

        :param dict kwargs: must contain "id"; may contain "start", "end" and "interval"
        :returns: dict with "start", "interval", "objecttype", "handle",
            "metrics_api_url" and, only when an end date was given, "end"
        :rtype: dict
        """
        instance_id = kwargs["id"]
        instance = self._get_instance_by_id(instance_id)
        # Dates are resolved before the url so that a subclass' "not started yet"
        # error is raised ahead of the "Metrics API url not found" one
        start = self._get_start_date(instance, kwargs)
        end = self._get_end_date(instance, kwargs)
        interval = kwargs.get("interval") or self.DEFAULT_INTERVAL
        metrics_api_url = self._get_metrics_api_url(instance)

        new_kwargs = {
            "start": start,
            "interval": interval,
            "objecttype": self.OBJECT_TYPE,
            "handle": instance_id,
            "metrics_api_url": metrics_api_url,
        }
        if end:
            new_kwargs["end"] = end

        return new_kwargs

    def get_request_url(self, **kwargs):
        """Return the path of the metrics listing endpoint"""
        return "metrics/api/v1/list"

    def _get_api_url(self, **kwargs):
        """Return the base url computed earlier by ``_get_kwargs``"""
        return kwargs["metrics_api_url"]

    def _get_start_date(self, instance, kwargs):
        """Return the formatted start date, or None if none can be determined

        The first truthy value of the caller's "start", ``instance.dt_started``
        and ``instance.dt_created`` is used.
        """
        start = kwargs.get("start") or instance.dt_started or instance.dt_created
        if not start:
            return None

        return self._format_datetime(start)

    def _get_end_date(self, instance, kwargs):
        """Return the formatted "end" date from kwargs, or None if it was not given"""
        end = kwargs.get("end")
        if not end:
            return None

        return self._format_datetime(end)

    def _get_request_params(self, kwargs):
        """Return a copy of kwargs without "metrics_api_url", which is not a request parameter"""
        params = kwargs.copy()
        params.pop("metrics_api_url", None)
        return params

    def _parse_object(self, instance_dict, **kwargs):
        """Return the "chart_names" entry of the response dict"""
        return instance_dict["chart_names"]

    def _format_datetime(self, some_datetime):
        """Format a datetime (or a parseable date string) as ``YYYY-MM-DDTHH:MM:SSZ``

        :param datetime.datetime|str some_datetime:
        :rtype: str
        """
        if not isinstance(some_datetime, datetime.datetime):
            some_datetime = dateutil.parser.parse(some_datetime)

        # NOTE(review): the "Z" suffix is appended without converting the value
        # to UTC, so callers presumably have to pass UTC datetimes - confirm
        return some_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
380458

381459
@six.add_metaclass(abc.ABCMeta)
382460
class StreamMetrics(BaseRepository):
@@ -456,7 +534,6 @@ def _send_chart_descriptor(self, connection, kwargs):
456534
def _get_stream_generator(self, connection):
457535
return connection
458536

459-
460537
class ListLogs(ListResources):
461538
@abc.abstractmethod
462539
def _get_request_params(self, kwargs):

gradient/api_sdk/repositories/deployments.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .common import ListResources, CreateResource, StartResource, StopResource, DeleteResource, AlterResource, \
2-
GetResource, GetMetrics, StreamMetrics, ListLogs
2+
GetResource, GetMetrics, ListMetrics, StreamMetrics, ListLogs
33
from .. import serializers, config, sdk_exceptions
44
from ..sdk_exceptions import ResourceFetchingError, MalformedResponseError
55

@@ -175,6 +175,20 @@ def _get_start_date(self, instance, kwargs):
175175

176176
return rv
177177

178+
class ListDeploymentMetrics(ListMetrics):
    """Lists metrics of a model deployment (object type "modelDeployment")"""

    OBJECT_TYPE = "modelDeployment"

    def _get_instance_by_id(self, instance_id, **kwargs):
        """Fetch the deployment through a ``GetDeployment`` repository sharing this repository's credentials"""
        deployment_repository = GetDeployment(
            self.api_key,
            logger=self.logger,
            ps_client_name=self.ps_client_name,
        )
        return deployment_repository.get(deployment_id=instance_id)

    def _get_start_date(self, instance, kwargs):
        """Return the start date resolved by the parent class

        :raises sdk_exceptions.GradientSdkError: if the parent class resolves no start date
        """
        start_date = super(ListDeploymentMetrics, self)._get_start_date(instance, kwargs)
        if start_date is not None:
            return start_date

        raise sdk_exceptions.GradientSdkError("Deployment job has not started yet")
178192

179193
class StreamDeploymentMetrics(StreamMetrics):
180194
OBJECT_TYPE = "modelDeployment"

gradient/api_sdk/repositories/experiments.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import six
44
import websocket
55

6-
from .common import ListResources, CreateResource, StartResource, StopResource, DeleteResource, GetResource, GetMetrics, \
6+
from .common import ListResources, CreateResource, StartResource, StopResource, DeleteResource, GetResource, GetMetrics, ListMetrics, \
77
StreamMetrics, ListLogs
88
from .. import config, serializers, sdk_exceptions
99
from ..repositories.jobs import ListJobs
@@ -208,6 +208,32 @@ def _get_instance(self, response, **kwargs):
208208

209209
return rv
210210

211+
class ListExperimentMetrics(GetExperimentMetricsApiUrlMixin, ListMetrics):
    """Lists metrics of an experiment (object type "experiment")"""

    OBJECT_TYPE = "experiment"

    def _get_instance_by_id(self, instance_id, **kwargs):
        """Fetch the experiment through a ``GetExperiment`` repository sharing this repository's credentials"""
        experiment_repository = GetExperiment(
            self.api_key,
            logger=self.logger,
            ps_client_name=self.ps_client_name,
        )
        return experiment_repository.get(experiment_id=instance_id)

    def _get_start_date(self, instance, kwargs):
        """Return the start date resolved by the parent class

        :raises sdk_exceptions.GradientSdkError: if the parent class resolves no start date
        """
        start_date = super(ListExperimentMetrics, self)._get_start_date(instance, kwargs)
        if start_date is not None:
            return start_date

        raise sdk_exceptions.GradientSdkError("Experiment has not started yet")

    def _get_instance(self, response, **kwargs):
        """Delegate to the parent ``_get_instance``, translating one specific fetching error

        A ``ResourceFetchingError`` whose text contains ``{"version":`` is replaced
        with a ``GradientSdkError``; any other fetching error is re-raised as is.
        """
        try:
            return super(ListExperimentMetrics, self)._get_instance(response, **kwargs)
        except sdk_exceptions.ResourceFetchingError as e:
            if '{"version":' not in str(e):
                raise

            # TODO: metrics are not working for v1 experiments at the moment
            raise sdk_exceptions.GradientSdkError("Custom metrics are available for private clusters only")
211237

212238
class StreamExperimentMetrics(GetExperimentMetricsApiUrlMixin, StreamMetrics):
213239
OBJECT_TYPE = "experiment"

gradient/api_sdk/repositories/jobs.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22

33
import gradient.api_sdk.config
4-
from .common import ListResources, CreateResource, GetResource, DeleteResource, StopResource, GetMetrics, StreamMetrics, \
4+
from .common import ListResources, CreateResource, GetResource, DeleteResource, StopResource, GetMetrics, ListMetrics, StreamMetrics, \
55
ListLogs
66
from .. import serializers, sdk_exceptions
77
from ..clients import http_client
@@ -209,6 +209,20 @@ def _get_start_date(self, instance, kwargs):
209209

210210
return rv
211211

212+
class ListJobMetrics(ListMetrics):
    """Lists metrics of a job (object type "mljob")"""

    OBJECT_TYPE = "mljob"

    def _get_instance_by_id(self, instance_id, **kwargs):
        """Fetch the job through a ``GetJob`` repository sharing this repository's credentials"""
        job_repository = GetJob(
            self.api_key,
            logger=self.logger,
            ps_client_name=self.ps_client_name,
        )
        return job_repository.get(job_id=instance_id)

    def _get_start_date(self, instance, kwargs):
        """Return the start date resolved by the parent class

        :raises sdk_exceptions.GradientSdkError: if the parent class resolves no start date
        """
        start_date = super(ListJobMetrics, self)._get_start_date(instance, kwargs)
        if start_date is not None:
            return start_date

        raise sdk_exceptions.GradientSdkError("Job has not started yet")
212226

213227
class StreamJobMetrics(StreamMetrics):
214228
OBJECT_TYPE = "mljob"

gradient/api_sdk/repositories/notebooks.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from ..clients import http_client
44
from ..sdk_exceptions import ResourceCreatingError
55
from .common import CreateResource, DeleteResource, ListResources, GetResource, \
6-
StopResource, GetMetrics, StreamMetrics, BaseRepository, ListLogs
6+
StopResource, GetMetrics, ListMetrics, StreamMetrics, BaseRepository, ListLogs
77
from .. import config
88
from .. import serializers, sdk_exceptions
99

@@ -174,6 +174,20 @@ def _get_start_date(self, instance, kwargs):
174174

175175
return rv
176176

177+
class ListNotebookMetrics(ListMetrics):
    """Lists metrics of a notebook (object type "notebook")"""

    OBJECT_TYPE = "notebook"

    def _get_instance_by_id(self, instance_id, **kwargs):
        """Fetch the notebook through a ``GetNotebook`` repository sharing this repository's credentials"""
        notebook_repository = GetNotebook(
            self.api_key,
            logger=self.logger,
            ps_client_name=self.ps_client_name,
        )
        return notebook_repository.get(id=instance_id)

    def _get_start_date(self, instance, kwargs):
        """Return the start date resolved by the parent class

        :raises sdk_exceptions.GradientSdkError: if the parent class resolves no start date
        """
        start_date = super(ListNotebookMetrics, self)._get_start_date(instance, kwargs)
        if start_date is not None:
            return start_date

        raise sdk_exceptions.GradientSdkError("Notebook has not started yet")
177191

178192
class StreamNotebookMetrics(StreamMetrics):
179193
OBJECT_TYPE = "notebook"
@@ -220,4 +234,3 @@ def _get_request_params(self, kwargs):
220234
"limit": kwargs["limit"]
221235
}
222236
return params
223-

0 commit comments

Comments
 (0)