SDK API Reference

Complete API documentation for the DeepFix SDK client library.

DeepFixClient

Main client for interacting with the DeepFix server.

This client provides a high-level interface for diagnosing ML datasets, ingesting data with quality checks, and leveraging AI-powered recommendations to improve your ML workflows.
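
A minimal end-to-end sketch of the typical workflow: wrap a pandas DataFrame, then ingest and diagnose it in one call. The DeepFixClient import path is assumed from the source location shown below (deepfix_sdk.client); the TabularDataset import follows the examples later on this page. The analysis call requires the DEEPFIX_API_KEY environment variable to be set.

import os
import pandas as pd

from deepfix_sdk.client import DeepFixClient  # assumed import path (client.py shown below)
from deepfix_sdk.data import TabularDataset

os.environ.setdefault("DEEPFIX_API_KEY", "<your-api-key>")  # required by the analysis endpoint

# Wrap a DataFrame so DeepFix knows the label and categorical columns.
df = pd.read_csv("train.csv")
train_dataset = TabularDataset(
    dataset_name="my-dataset",
    dataset=df,
    label="target",
    cat_features=["cat_feature1", "cat_feature2"],
)

# Point the client at a running DeepFix server, then ingest and diagnose in one step.
client = DeepFixClient(api_url="http://localhost:8844", timeout=120)
response = client.get_diagnosis(train_data=train_dataset, batch_size=16)
print(response.to_text())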

Attributes:

    mlflow_config (MLflowConfig): Configuration for MLflow integration.
    api_url (str): Base URL of the DeepFix server.
    timeout (int): Request timeout in seconds.

Source code in deepfix-sdk/src/deepfix_sdk/client.py
class DeepFixClient:
    """Main client for interacting with the DeepFix server.

    This client provides a high-level interface for diagnosing ML datasets,
    ingesting data with quality checks, and leveraging AI-powered recommendations
    to improve your ML workflows.

    Attributes:
        mlflow_config (MLflowConfig): Configuration for MLflow integration.
        api_url (str): Base URL of the DeepFix server.
        timeout (int): Request timeout in seconds.
    """

    def __init__(
        self,
        api_url: str = "http://localhost:8844",
        mlflow_config: Optional[MLflowConfig] = None,
        artifact_config: Optional[ArtifactConfig] = None,
        timeout: int = 30,
    ):
        """Initialize the DeepFixClient.

        Args:
            api_url (str, optional): URL of the DeepFix server. Defaults to "http://localhost:8844".
            mlflow_config (MLflowConfig, optional): MLflow configuration for experiment tracking.
                If not provided, a default MLflowConfig is created. Defaults to None.
            artifact_config (ArtifactConfig, optional): Artifact cache configuration used to discover
                stored datasets/models. Defaults to None.
            timeout (int, optional): Request timeout in seconds. Defaults to 30.

        Example:
            >>> client = DeepFixClient(
            ...     api_url="http://localhost:8844",
            ...     timeout=120
            ... )
        """
        self.mlflow_config = mlflow_config or MLflowConfig()
        self.artifact_config = artifact_config or ArtifactConfig()
        self.api_url = api_url
        self.timeout = timeout

        self._analyze_endpoint = f"{self.api_url}/api/v1/analyse"
        self._artifact_repo: Optional[ArtifactRepository] = None

    def _get_artifact_repository(self) -> ArtifactRepository:
        if self._artifact_repo is None:
            self._artifact_repo = ArtifactRepository(
                sqlite_path=self.artifact_config.sqlite_path
            )
        return self._artifact_repo

    def list_datasets(
        self, status: Optional[Union[str, ArtifactStatus]] = None
    ) -> list[dict[str, Any]]:
        """List datasets that have been ingested and are available for diagnosis.

        Args:
            status (ArtifactStatus | str | None): Optional filter by artifact status.

        Returns:
            List of dictionaries describing available datasets. Each record contains:
                - dataset_name: Registered run/dataset name.
                - status: Artifact registration status.
                - mlflow_run_id: Associated MLflow run, if any.
                - local_path: Path to cached artifact on disk, if downloaded.
                - updated_at / created_at: ISO8601 timestamps for auditing.
        """
        repo = self._get_artifact_repository()
        status_enum: Optional[ArtifactStatus] = None
        if status is not None:
            status_enum = (
                status if isinstance(status, ArtifactStatus) else ArtifactStatus(status)
            )
        records = repo.list_records(
            artifact_key=ArtifactPath.DATASET.value, status=status_enum
        )
        datasets = []
        for record in records:
            datasets.append(
                {
                    "dataset_name": record.run_id,
                    "status": record.status.value if record.status else None,
                    "mlflow_run_id": record.mlflow_run_id,
                    "local_path": record.local_path,
                    "created_at": record.created_at.isoformat()
                    if record.created_at
                    else None,
                    "updated_at": record.updated_at.isoformat()
                    if record.updated_at
                    else None,
                }
            )
        datasets.sort(key=lambda item: item["updated_at"] or "", reverse=True)
        return datasets

    def get_dataset_names(
        self, status: Optional[Union[str, ArtifactStatus]] = None
    ) -> list[str]:
        """Convenience method returning only dataset names for UI dropdowns."""
        return [entry["dataset_name"] for entry in self.list_datasets(status=status)]

    def get_diagnosis(
        self,
        train_data: BaseDataset,
        test_data: Optional[BaseDataset] = None,
        model: Any = None,
        model_name: Optional[str] = None,
        batch_size: int = 8,
        language: str = "english",
    ) -> APIResponse:
        """Ingest and diagnose a model in a single operation.

        This convenience method combines ingestion and diagnosis into a single call.
        It first ingests the dataset and model (if provided), then immediately runs
        diagnosis on them to get analysis results and recommendations.

        Args:
            train_data (BaseDataset): Training dataset to ingest. Must be an instance
                of an appropriate dataset class (e.g., ImageClassificationDataset,
                TabularDataset, NLPDataset).
            test_data (BaseDataset, optional): Test/validation dataset. If provided,
                enables cross-dataset validation checks. Defaults to None.
            model (Any, optional): Model to ingest. Must be an instance of a model class.
                Defaults to None.
            model_name (str, optional): Name of the model. Defaults to None.
            batch_size (int, optional): Batch size for processing the dataset.
                Defaults to 8.
            language (str, optional): Language for analysis output. Defaults to "english".

        Returns:
            APIResponse: Response object containing:
                - Analysis results and findings
                - Actionable recommendations

        Raises:
            ValueError: If dataset with the same name exists and overwrite=False, or
                if dataset artifacts cannot be found after ingestion.
            Exception: If ingestion fails, or if the analysis request fails (non-200 status code).

        Example:
            >>> from deepfix_sdk.data import TabularDataset
            >>> import pandas as pd
            >>> df = pd.read_csv("train.csv")
            >>> label = "target"
            >>> cat_features = ["cat_feature1", "cat_feature2"]
            >>> dataset_name = "my-dataset"
            >>> train_dataset = TabularDataset(dataset=df, dataset_name=dataset_name, label=label, cat_features=cat_features)
            >>> response = client.get_diagnosis(
            ...     model_name="my-model",
            ...     train_data=train_dataset,
            ...     batch_size=16
            ... )
            >>> print(response.to_text())
        """
        assert isinstance(train_data, BaseDataset), (
            "train_data must be an instance of BaseDataset"
        )
        assert test_data is None or isinstance(test_data, BaseDataset), (
            "test_data must be an instance of BaseDataset"
        )

        dataset_name = self.get_dataset_name(train_data, test_data)

        # First, ingest the dataset and model
        self.ingest(
            train_data=train_data,
            test_data=test_data,
            model=model,
            model_name=model_name,
            batch_size=batch_size,
            overwrite=True,
        )
        # Then, diagnose the ingested dataset/model
        return self.diagnose(
            dataset_name=dataset_name,
            model_name=model_name,
            language=language,
        )

    def diagnose(
        self,
        dataset_name: str,
        language: str = "english",
        model_name: Optional[str] = None,
    ) -> APIResponse:
        """Analyze a run and return diagnostic results with recommendations.

        This method performs a comprehensive analysis of the specified run to identify
        potential issues, quality problems, and provides AI-powered recommendations for
        improvement.

        Args:
            dataset_name (str): Name of the dataset to analyze. Must match a dataset
                that has been previously ingested.
            language (str, optional): Language for analysis output. Defaults to "english".
            model_name (str, optional): Name of the model. Defaults to None.
        Returns:
            APIResponse: Response object containing:
                - Analysis results and findings
                - Quality metrics
                - Actionable recommendations
                - Dataset statistics

        Raises:
            ValueError: If dataset artifacts cannot be found for the specified dataset.
            Exception: If the analysis request fails (non-200 status code).

        Example:
            >>> response = client.diagnose(dataset_name="my-dataset")
            >>> print(response.to_text())
        """
        request = self._create_request(
            dataset_name=dataset_name,
            model_name=model_name,
            language=language,
        )
        response = self._send_request(request)
        return response

    def _load_artifacts(self, dataset_name: str, model_name: str) -> dict:
        from .pipelines import ArtifactLoadingPipeline

        artifact_config = self.artifact_config.model_copy()
        artifact_config.load_dataset_metadata = True
        artifact_config.load_checks = True
        artifact_config.load_model_checkpoint = True
        artifact_config.load_training = False
        return ArtifactLoadingPipeline(
            mlflow_config=self.mlflow_config,
            artifact_config=artifact_config,
            dataset_name=dataset_name,
            model_name=model_name,
        ).run()

    def ingest(
        self,
        train_data: BaseDataset,
        test_data: Optional[BaseDataset] = None,
        model: Any = None,
        model_name: Optional[str] = None,
        batch_size: int = 8,
        overwrite: bool = False,
    ) -> None:
        """Ingest a dataset with optional quality validation.

        This method uploads a dataset to the DeepFix server and optionally performs
        validation checks on the data. Supports multiple data types including images,
        tabular data, NLP text, and general vision datasets.

        Args:
            train_data (BaseDataset): Training dataset to ingest. Must be an instance
                of an appropriate dataset class (e.g., ImageClassificationDataset,
                TabularDataset, NLPDataset). The dataset name is extracted from the
                dataset_name attribute of this object.
            test_data (BaseDataset, optional): Test/validation dataset. If provided,
                enables cross-dataset validation checks. Defaults to None.
            model (Any, optional): Model to ingest. Must be an instance of a model class.
                Defaults to None.
            model_name (str, optional): Name of the model. Defaults to None.
            batch_size (int, optional): Batch size for processing the dataset.
                Defaults to 8.
            overwrite (bool, optional): If True, overwrite existing dataset with the
                same name. If False, raise an error if dataset exists. Defaults to False.

        Raises:
            ValueError: If dataset with the same name exists and overwrite=False.
            Exception: If data validation fails or ingestion fails.

        Example:
            >>> from deepfix_sdk.data.datasets import TabularDataset
            >>> import pandas as pd
            >>> df = pd.read_csv("train.csv")
            >>> train_dataset = TabularDataset(
            ...     dataset_name="my-dataset",
            ...     dataset=df,
            ...     label="target"
            ... )
            >>> client.ingest(
            ...     train_data=train_dataset,
            ...     batch_size=16
            ... )
        """
        from .pipelines import IngestionPipeline

        data_type = self._get_data_type(train_data, test_data)
        dataset_name = self.get_dataset_name(train_data, test_data)

        dataset_logging_pipeline = IngestionPipeline(
            dataset_name=dataset_name,
            data_type=data_type,
            mlflow_tracking_uri=self.mlflow_config.tracking_uri,
            train_test_validation=test_data is not None,
            data_integrity=True,
            model_evaluation=model is not None,
            batch_size=batch_size,
            overwrite=overwrite,
            model_name=model_name,
        )
        dataset_logging_pipeline.run(
            train_data=train_data, test_data=test_data, model=model
        )

    def _create_request(
        self,
        dataset_name: str,
        model_name: str,
        language: str = "english",
    ):
        """Create an API request for analysis.

        Internal method that loads dataset artifacts and constructs an APIRequest
        object for sending to the DeepFix server.

        Args:
            dataset_name (str): Name of the dataset.
            model_name (str): Name of the model.
            language (str, optional): Language for analysis. Defaults to "english".

        Returns:
            APIRequest: Request object configured with dataset artifacts and language.

        Raises:
            ValueError: If dataset artifacts are not found or have unexpected format.
        """
        loaded_artifacts = self._load_artifacts(
            dataset_name=dataset_name, model_name=model_name
        )

        cfg = {
            "dataset_name": dataset_name,
            "language": language,
            "model_name": model_name,
        }
        request = APIRequest(**cfg)
        dataset_artifacts = loaded_artifacts.get(ArtifactPath.DATASET.value, None)
        if dataset_artifacts is not None:
            request.dataset_artifacts = dataset_artifacts.to_dict()

        request.deepchecks_artifacts = loaded_artifacts.get(
            ArtifactPath.DEEPCHECKS.value, None
        )
        request.model_checkpoint_artifacts = loaded_artifacts.get(
            ArtifactPath.MODEL_CHECKPOINT.value, None
        )
        return request

    def _send_request(self, request: APIRequest) -> APIResponse:
        """Send an analysis request to the DeepFix server.

        Internal method that sends the API request to the server and handles the response.
        Displays a progress spinner during the request and returns the parsed response.

        Args:
            request (APIRequest): The API request object to send to the server.

        Returns:
            APIResponse: Parsed response object from the server containing analysis results.

        Raises:
            Exception: If the server returns a non-200 status code or if the request times out.

        Note:
            Requires the DEEPFIX_API_KEY environment variable to be set for authentication.
        """
        with Live(
            Spinner("dots", text="[cyan]Running analysis...[/cyan]", style="cyan"),
            console=console,
            refresh_per_second=10,
        ):
            payload = request.model_dump()
            # headers = {"X-API-Key": os.getenv("DEEPFIX_API_KEY")}
            headers = {"Authorization": f"Bearer {os.getenv('DEEPFIX_API_KEY')}"}
            response = requests.post(
                self._analyze_endpoint,
                json=payload,
                timeout=self.timeout,
                headers=headers,
            )

            if response.status_code != 200:
                console.print("[red]✗[/red] Analysis failed", style="bold red")
                raise RuntimeError(
                    f"Error during analysis: status code: {response.status_code} \nand message: {response.text}"
                )
            out = APIResponse(**response.json())

        if isinstance(out.error_messages, dict) and any(out.error_messages.values()):
            console.print("[red]✗[/red] Analysis failed", style="bold red")
            raise RuntimeError(f"Error during analysis: {out.error_messages}")

        console.print("[green]✓[/green] Analysis complete!", style="bold green")
        return out

    def _get_data_type(
        self, train_data: BaseDataset, test_data: Optional[BaseDataset] = None
    ) -> DataType:
        data_type = train_data.data_type
        if test_data is not None:
            test_data_type = test_data.data_type
            if test_data_type != data_type:
                raise ValueError(
                    f"Test data type {test_data_type} does not match train data type {data_type}"
                )
        return data_type

    def get_dataset_name(
        self, train_data: BaseDataset, test_data: Optional[BaseDataset] = None
    ) -> str:
        dataset_name = train_data.name
        if test_data is not None:
            if test_data.name != dataset_name:
                dataset_name = f"{dataset_name}_vs_{test_data.name}"
        return dataset_name

__init__(api_url='http://localhost:8844', mlflow_config=None, artifact_config=None, timeout=30)

Initialize the DeepFixClient.

Parameters:

    api_url (str, optional): URL of the DeepFix server. Defaults to "http://localhost:8844".
    mlflow_config (MLflowConfig, optional): MLflow configuration for experiment tracking. If not provided, a default MLflowConfig is created. Defaults to None.
    artifact_config (ArtifactConfig, optional): Artifact cache configuration used to discover stored datasets/models. Defaults to None.
    timeout (int, optional): Request timeout in seconds. Defaults to 30.

Example

    >>> client = DeepFixClient(
    ...     api_url="http://localhost:8844",
    ...     timeout=120
    ... )

Source code in deepfix-sdk/src/deepfix_sdk/client.py
def __init__(
    self,
    api_url: str = "http://localhost:8844",
    mlflow_config: Optional[MLflowConfig] = None,
    artifact_config: Optional[ArtifactConfig] = None,
    timeout: int = 30,
):
    """Initialize the DeepFixClient.

    Args:
        api_url (str, optional): URL of the DeepFix server. Defaults to "http://localhost:8844".
        mlflow_config (MLflowConfig, optional): MLflow configuration for experiment tracking.
            If not provided, a default MLflowConfig is created. Defaults to None.
        artifact_config (ArtifactConfig, optional): Artifact cache configuration used to discover
            stored datasets/models. Defaults to None.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.

    Example:
        >>> client = DeepFixClient(
        ...     api_url="http://localhost:8844",
        ...     timeout=120
        ... )
    """
    self.mlflow_config = mlflow_config or MLflowConfig()
    self.artifact_config = artifact_config or ArtifactConfig()
    self.api_url = api_url
    self.timeout = timeout

    self._analyze_endpoint = f"{self.api_url}/api/v1/analyse"
    self._artifact_repo: Optional[ArtifactRepository] = None

diagnose(dataset_name, language='english', model_name=None)

Analyze a run and return diagnostic results with recommendations.

This method performs a comprehensive analysis of the specified run to identify potential issues, quality problems, and provides AI-powered recommendations for improvement.

Parameters:

    dataset_name (str): Name of the dataset to analyze. Must match a dataset that has been previously ingested.
    language (str, optional): Language for analysis output. Defaults to "english".
    model_name (str, optional): Name of the model. Defaults to None.

Returns:

    APIResponse: Response object containing:
        - Analysis results and findings
        - Quality metrics
        - Actionable recommendations
        - Dataset statistics

Raises:

    ValueError: If dataset artifacts cannot be found for the specified dataset.
    Exception: If the analysis request fails (non-200 status code).

Example

    >>> response = client.diagnose(dataset_name="my-dataset")
    >>> print(response.to_text())

Source code in deepfix-sdk/src/deepfix_sdk/client.py
def diagnose(
    self,
    dataset_name: str,
    language: str = "english",
    model_name: Optional[str] = None,
) -> APIResponse:
    """Analyze a run and return diagnostic results with recommendations.

    This method performs a comprehensive analysis of the specified run to identify
    potential issues, quality problems, and provides AI-powered recommendations for
    improvement.

    Args:
        dataset_name (str): Name of the dataset to analyze. Must match a dataset
            that has been previously ingested.
        language (str, optional): Language for analysis output. Defaults to "english".
        model_name (str, optional): Name of the model. Defaults to None.
    Returns:
        APIResponse: Response object containing:
            - Analysis results and findings
            - Quality metrics
            - Actionable recommendations
            - Dataset statistics

    Raises:
        ValueError: If dataset artifacts cannot be found for the specified dataset.
        Exception: If the analysis request fails (non-200 status code).

    Example:
        >>> response = client.diagnose(dataset_name="my-dataset")
        >>> print(response.to_text())
    """
    request = self._create_request(
        dataset_name=dataset_name,
        model_name=model_name,
        language=language,
    )
    response = self._send_request(request)
    return response

get_dataset_names(status=None)

Convenience method returning only dataset names for UI dropdowns.

Source code in deepfix-sdk/src/deepfix_sdk/client.py
def get_dataset_names(
    self, status: Optional[Union[str, ArtifactStatus]] = None
) -> list[str]:
    """Convenience method returning only dataset names for UI dropdowns."""
    return [entry["dataset_name"] for entry in self.list_datasets(status=status)]

get_diagnosis(train_data, test_data=None, model=None, model_name=None, batch_size=8, language='english')

Ingest and diagnose a model in a single operation.

This convenience method combines ingestion and diagnosis into a single call. It first ingests the dataset and model (if provided), then immediately runs diagnosis on them to get analysis results and recommendations.

Parameters:

    train_data (BaseDataset): Training dataset to ingest. Must be an instance of an appropriate dataset class (e.g., ImageClassificationDataset, TabularDataset, NLPDataset).
    test_data (BaseDataset, optional): Test/validation dataset. If provided, enables cross-dataset validation checks. Defaults to None.
    model (Any, optional): Model to ingest. Must be an instance of a model class. Defaults to None.
    model_name (str, optional): Name of the model. Defaults to None.
    batch_size (int, optional): Batch size for processing the dataset. Defaults to 8.
    language (str, optional): Language for analysis output. Defaults to "english".

Returns:

    APIResponse: Response object containing:
        - Analysis results and findings
        - Actionable recommendations

Raises:

    ValueError: If a dataset with the same name exists and overwrite=False, or if dataset artifacts cannot be found after ingestion.
    Exception: If ingestion fails, or if the analysis request fails (non-200 status code).

Example

    >>> from deepfix_sdk.data import TabularDataset
    >>> import pandas as pd
    >>> df = pd.read_csv("train.csv")
    >>> label = "target"
    >>> cat_features = ["cat_feature1", "cat_feature2"]
    >>> dataset_name = "my-dataset"
    >>> train_dataset = TabularDataset(dataset=df, dataset_name=dataset_name, label=label, cat_features=cat_features)
    >>> response = client.get_diagnosis(
    ...     model_name="my-model",
    ...     train_data=train_dataset,
    ...     batch_size=16
    ... )
    >>> print(response.to_text())

Source code in deepfix-sdk/src/deepfix_sdk/client.py
def get_diagnosis(
    self,
    train_data: BaseDataset,
    test_data: Optional[BaseDataset] = None,
    model: Any = None,
    model_name: Optional[str] = None,
    batch_size: int = 8,
    language: str = "english",
) -> APIResponse:
    """Ingest and diagnose a model in a single operation.

    This convenience method combines ingestion and diagnosis into a single call.
    It first ingests the dataset and model (if provided), then immediately runs
    diagnosis on them to get analysis results and recommendations.

    Args:
        train_data (BaseDataset): Training dataset to ingest. Must be an instance
            of an appropriate dataset class (e.g., ImageClassificationDataset,
            TabularDataset, NLPDataset).
        test_data (BaseDataset, optional): Test/validation dataset. If provided,
            enables cross-dataset validation checks. Defaults to None.
        model (Any, optional): Model to ingest. Must be an instance of a model class.
            Defaults to None.
        model_name (str, optional): Name of the model. Defaults to None.
        batch_size (int, optional): Batch size for processing the dataset.
            Defaults to 8.
        language (str, optional): Language for analysis output. Defaults to "english".

    Returns:
        APIResponse: Response object containing:
            - Analysis results and findings
            - Actionable recommendations

    Raises:
        ValueError: If dataset with the same name exists and overwrite=False, or
            if dataset artifacts cannot be found after ingestion.
        Exception: If ingestion fails, or if the analysis request fails (non-200 status code).

    Example:
        >>> from deepfix_sdk.data import TabularDataset
        >>> import pandas as pd
        >>> df = pd.read_csv("train.csv")
        >>> label = "target"
        >>> cat_features = ["cat_feature1", "cat_feature2"]
        >>> dataset_name = "my-dataset"
        >>> train_dataset = TabularDataset(dataset=df, dataset_name=dataset_name, label=label, cat_features=cat_features)
        >>> response = client.get_diagnosis(
        ...     model_name="my-model",
        ...     train_data=train_dataset,
        ...     batch_size=16
        ... )
        >>> print(response.to_text())
    """
    assert isinstance(train_data, BaseDataset), (
        "train_data must be an instance of BaseDataset"
    )
    assert test_data is None or isinstance(test_data, BaseDataset), (
        "test_data must be an instance of BaseDataset"
    )

    dataset_name = self.get_dataset_name(train_data, test_data)

    # First, ingest the dataset and model
    self.ingest(
        train_data=train_data,
        test_data=test_data,
        model=model,
        model_name=model_name,
        batch_size=batch_size,
        overwrite=True,
    )
    # Then, diagnose the ingested dataset/model
    return self.diagnose(
        dataset_name=dataset_name,
        model_name=model_name,
        language=language,
    )

ingest(train_data, test_data=None, model=None, model_name=None, batch_size=8, overwrite=False)

Ingest a dataset with optional quality validation.

This method uploads a dataset to the DeepFix server and optionally performs validation checks on the data. Supports multiple data types including images, tabular data, NLP text, and general vision datasets.

Parameters:

    train_data (BaseDataset): Training dataset to ingest. Must be an instance of an appropriate dataset class (e.g., ImageClassificationDataset, TabularDataset, NLPDataset). The dataset name is extracted from the dataset_name attribute of this object.
    test_data (BaseDataset, optional): Test/validation dataset. If provided, enables cross-dataset validation checks. Defaults to None.
    model (Any, optional): Model to ingest. Must be an instance of a model class. Defaults to None.
    model_name (str, optional): Name of the model. Defaults to None.
    batch_size (int, optional): Batch size for processing the dataset. Defaults to 8.
    overwrite (bool, optional): If True, overwrite an existing dataset with the same name. If False, raise an error if the dataset exists. Defaults to False.

Raises:

    ValueError: If a dataset with the same name exists and overwrite=False.
    Exception: If data validation fails or ingestion fails.

Example

    >>> from deepfix_sdk.data.datasets import TabularDataset
    >>> import pandas as pd
    >>> df = pd.read_csv("train.csv")
    >>> train_dataset = TabularDataset(
    ...     dataset_name="my-dataset",
    ...     dataset=df,
    ...     label="target"
    ... )
    >>> client.ingest(
    ...     train_data=train_dataset,
    ...     batch_size=16
    ... )

Source code in deepfix-sdk/src/deepfix_sdk/client.py
def ingest(
    self,
    train_data: BaseDataset,
    test_data: Optional[BaseDataset] = None,
    model: Any = None,
    model_name: Optional[str] = None,
    batch_size: int = 8,
    overwrite: bool = False,
) -> None:
    """Ingest a dataset with optional quality validation.

    This method uploads a dataset to the DeepFix server and optionally performs
    validation checks on the data. Supports multiple data types including images,
    tabular data, NLP text, and general vision datasets.

    Args:
        train_data (BaseDataset): Training dataset to ingest. Must be an instance
            of an appropriate dataset class (e.g., ImageClassificationDataset,
            TabularDataset, NLPDataset). The dataset name is extracted from the
            dataset_name attribute of this object.
        test_data (BaseDataset, optional): Test/validation dataset. If provided,
            enables cross-dataset validation checks. Defaults to None.
        model (Any, optional): Model to ingest. Must be an instance of a model class.
            Defaults to None.
        model_name (str, optional): Name of the model. Defaults to None.
        batch_size (int, optional): Batch size for processing the dataset.
            Defaults to 8.
        overwrite (bool, optional): If True, overwrite existing dataset with the
            same name. If False, raise an error if dataset exists. Defaults to False.

    Raises:
        ValueError: If dataset with the same name exists and overwrite=False.
        Exception: If data validation fails or ingestion fails.

    Example:
        >>> from deepfix_sdk.data.datasets import TabularDataset
        >>> import pandas as pd
        >>> df = pd.read_csv("train.csv")
        >>> train_dataset = TabularDataset(
        ...     dataset_name="my-dataset",
        ...     dataset=df,
        ...     label="target"
        ... )
        >>> client.ingest(
        ...     train_data=train_dataset,
        ...     batch_size=16
        ... )
    """
    from .pipelines import IngestionPipeline

    data_type = self._get_data_type(train_data, test_data)
    dataset_name = self.get_dataset_name(train_data, test_data)

    dataset_logging_pipeline = IngestionPipeline(
        dataset_name=dataset_name,
        data_type=data_type,
        mlflow_tracking_uri=self.mlflow_config.tracking_uri,
        train_test_validation=test_data is not None,
        data_integrity=True,
        model_evaluation=model is not None,
        batch_size=batch_size,
        overwrite=overwrite,
        model_name=model_name,
    )
    dataset_logging_pipeline.run(
        train_data=train_data, test_data=test_data, model=model
    )

list_datasets(status=None)

List datasets that have been ingested and are available for diagnosis.

Parameters:

    status (ArtifactStatus | str | None): Optional filter by artifact status. Defaults to None.

Returns:

    list[dict[str, Any]]: List of dictionaries describing available datasets. Each record contains:
        - dataset_name: Registered run/dataset name.
        - status: Artifact registration status.
        - mlflow_run_id: Associated MLflow run, if any.
        - local_path: Path to cached artifact on disk, if downloaded.
        - updated_at / created_at: ISO8601 timestamps for auditing.

Source code in deepfix-sdk/src/deepfix_sdk/client.py
def list_datasets(
    self, status: Optional[Union[str, ArtifactStatus]] = None
) -> list[dict[str, Any]]:
    """List datasets that have been ingested and are available for diagnosis.

    Args:
        status (ArtifactStatus | str | None): Optional filter by artifact status.

    Returns:
        List of dictionaries describing available datasets. Each record contains:
            - dataset_name: Registered run/dataset name.
            - status: Artifact registration status.
            - mlflow_run_id: Associated MLflow run, if any.
            - local_path: Path to cached artifact on disk, if downloaded.
            - updated_at / created_at: ISO8601 timestamps for auditing.
    """
    repo = self._get_artifact_repository()
    status_enum: Optional[ArtifactStatus] = None
    if status is not None:
        status_enum = (
            status if isinstance(status, ArtifactStatus) else ArtifactStatus(status)
        )
    records = repo.list_records(
        artifact_key=ArtifactPath.DATASET.value, status=status_enum
    )
    datasets = []
    for record in records:
        datasets.append(
            {
                "dataset_name": record.run_id,
                "status": record.status.value if record.status else None,
                "mlflow_run_id": record.mlflow_run_id,
                "local_path": record.local_path,
                "created_at": record.created_at.isoformat()
                if record.created_at
                else None,
                "updated_at": record.updated_at.isoformat()
                if record.updated_at
                else None,
            }
        )
    datasets.sort(key=lambda item: item["updated_at"] or "", reverse=True)
    return datasets
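
To browse what is already cached locally, list_datasets and get_dataset_names can be combined as in this short sketch; it assumes `client` is an initialized DeepFixClient and at least one dataset has been ingested. The concrete ArtifactStatus values are not listed in this reference, so no status filter is passed here.

# Inspect all cached dataset records, most recently updated first.
for entry in client.list_datasets():
    print(entry["dataset_name"], entry["status"], entry["updated_at"])

# Only the names, e.g. to populate a UI dropdown.
names = client.get_dataset_names()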

Configuration

MLflowConfig

Bases: BaseModel

Configuration for MLflow integration.

Attributes:

    tracking_uri (str): MLflow tracking server URI. Must start with http://, https://, or file://.
    run_id (Optional[str]): Optional MLflow run ID to analyze.
    download_dir (str): Local directory for downloading artifacts.
    create_run_if_not_exists (bool): Whether to create the run if it doesn't exist. Defaults to False.
    experiment_name (str): MLflow experiment name for deepfix.
    trace_dspy (bool): Whether to trace dspy requests. Defaults to True.

Source code in deepfix-sdk/src/deepfix_sdk/config.py
class MLflowConfig(BaseModel):
    """Configuration for MLflow integration.

    Attributes:
        tracking_uri: MLflow tracking server URI. Must start with http://,
            https://, or file://.
        run_id: Optional MLflow run ID to analyze.
        download_dir: Local directory for downloading artifacts.
        create_run_if_not_exists: Whether to create the run if it doesn't exist.
            Defaults to False.
        experiment_name: MLflow experiment name for deepfix.
        trace_dspy: Whether to trace dspy requests. Defaults to True.
    """

    tracking_uri: str = Field(
        default=DefaultPaths.MLFLOW_TRACKING_URI.value,
        description="MLflow tracking server URI",
    )
    run_id: Optional[str] = Field(default=None, description="MLflow run ID to analyze")
    download_dir: str = Field(
        default=DefaultPaths.MLFLOW_DOWNLOADS.value,
        description="Local directory for downloading artifacts",
    )
    create_run_if_not_exists: bool = Field(
        default=False,
        description="Whether to create the run if it doesn't exist",
    )
    experiment_name: str = Field(
        default=DefaultPaths.EXPERIMENT_NAME.value,
        description="MLflow experiment name for deepfix",
    )
    trace_dspy: bool = Field(
        default=True,
        description="Whether to trace dspy requests",
    )

    @field_validator("tracking_uri")
    @classmethod
    def validate_tracking_uri(cls, v: str) -> str:
        """Validate tracking URI format.

        Args:
            v: Tracking URI string to validate.

        Returns:
            Validated tracking URI.

        Raises:
            ValueError: If URI doesn't start with http://, https://, or file://.
        """
        if not v.startswith(
            (
                "http://",
                "https://",
                "file://",
            )
        ):
            raise ValueError(
                "tracking_uri must start with http://, https://, or file://"
            )
        return v

validate_tracking_uri(v) classmethod

Validate tracking URI format.

Parameters:

    v (str): Tracking URI string to validate.

Returns:

    str: Validated tracking URI.

Raises:

    ValueError: If URI doesn't start with http://, https://, or file://.

Source code in deepfix-sdk/src/deepfix_sdk/config.py
@field_validator("tracking_uri")
@classmethod
def validate_tracking_uri(cls, v: str) -> str:
    """Validate tracking URI format.

    Args:
        v: Tracking URI string to validate.

    Returns:
        Validated tracking URI.

    Raises:
        ValueError: If URI doesn't start with http://, https://, or file://.
    """
    if not v.startswith(
        (
            "http://",
            "https://",
            "file://",
        )
    ):
        raise ValueError(
            "tracking_uri must start with http://, https://, or file://"
        )
    return v
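
A sketch of pointing the client at a remote MLflow tracking server. The deepfix_sdk.config import path is assumed from the source location shown above (config.py); the tracking server URL is a placeholder.

from deepfix_sdk.client import DeepFixClient  # assumed import path
from deepfix_sdk.config import MLflowConfig   # assumed import path (config.py shown above)

mlflow_config = MLflowConfig(
    tracking_uri="http://mlflow.internal:5000",  # must start with http://, https://, or file://
    experiment_name="deepfix-experiments",
    trace_dspy=False,  # disable dspy request tracing
)

client = DeepFixClient(mlflow_config=mlflow_config)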

ArtifactConfig

Bases: BaseModel

Configuration for artifact management.

Attributes:

    load_training (bool): Whether to load training artifacts. Defaults to False.
    load_checks (bool): Whether to load Deepchecks artifacts. Defaults to True.
    load_dataset_metadata (bool): Whether to load dataset metadata. Defaults to True.
    load_model_checkpoint (bool): Whether to load model checkpoint. Defaults to True.
    download_if_missing (bool): Whether to download artifacts if not locally cached. Defaults to True.
    cache_enabled (bool): Whether to enable local caching. Defaults to True.
    sqlite_path (str): Path to SQLite database for artifact caching.

Source code in deepfix-sdk/src/deepfix_sdk/config.py
class ArtifactConfig(BaseModel):
    """Configuration for artifact management.

    Attributes:
        load_training: Whether to load training artifacts. Defaults to False.
        load_checks: Whether to load Deepchecks artifacts. Defaults to True.
        load_dataset_metadata: Whether to load dataset metadata. Defaults to True.
        load_model_checkpoint: Whether to load model checkpoint. Defaults to True.
        download_if_missing: Whether to download artifacts if not locally cached.
            Defaults to True.
        cache_enabled: Whether to enable local caching. Defaults to True.
        sqlite_path: Path to SQLite database for artifact caching.
    """

    load_training: bool = Field(
        default=False, description="Whether to load training artifacts"
    )
    load_checks: bool = Field(
        default=True, description="Whether to load Deepchecks artifacts"
    )
    load_dataset_metadata: bool = Field(
        default=True, description="Whether to load dataset metadata"
    )
    load_model_checkpoint: bool = Field(
        default=True, description="Whether to load model checkpoint"
    )
    download_if_missing: bool = Field(
        default=True, description="Whether to download artifacts if not locally cached"
    )
    cache_enabled: bool = Field(
        default=True, description="Whether to enable local caching"
    )
    sqlite_path: str = Field(
        default=DefaultPaths.ARTIFACTS_SQLITE_PATH.value,
        description="Path to SQLite database for artifact caching",
    )
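
A sketch of relocating the local artifact cache, assuming the same deepfix_sdk.config import path as above; the sqlite_path value is a hypothetical location.

from deepfix_sdk.client import DeepFixClient   # assumed import path
from deepfix_sdk.config import ArtifactConfig  # assumed import path (config.py shown above)

artifact_config = ArtifactConfig(
    sqlite_path="/data/deepfix/artifacts.sqlite",  # hypothetical cache location
    download_if_missing=True,
)

client = DeepFixClient(artifact_config=artifact_config)
client.list_datasets()  # reads the artifact cache at the configured sqlite_path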

Datasets

BaseDataset

Bases: Protocol

Source code in deepfix-sdk/src/deepfix_sdk/data/datasets.py
@runtime_checkable
class BaseDataset(Protocol):
    def to_loader(self, model: Optional[Callable] = None, batch_size: int = 8) -> Any:
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def data_type(self) -> DataType:
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def name(self) -> str:
        raise NotImplementedError("Subclasses must implement this method")
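
Because BaseDataset is a runtime-checkable Protocol, any object exposing to_loader, data_type, and name satisfies it without inheriting from it. A minimal structural implementation, assuming DataType is importable from the same module (the exact import path for DataType is not shown in this reference):

from typing import Any, Callable, Optional

from deepfix_sdk.data.datasets import BaseDataset, DataType  # DataType import path is an assumption


class MyDataset:
    """Structural (duck-typed) implementation of the BaseDataset protocol."""

    def __init__(self, name: str):
        self._name = name

    def to_loader(self, model: Optional[Callable] = None, batch_size: int = 8) -> Any:
        return self  # return whatever the downstream checks expect for this data type

    @property
    def data_type(self) -> DataType:
        return DataType.TABULAR

    @property
    def name(self) -> str:
        return self._name


assert isinstance(MyDataset("demo"), BaseDataset)  # passes thanks to @runtime_checkable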

ImageClassificationDataset

Bases: VisionDataset

Source code in deepfix-sdk/src/deepfix_sdk/data/datasets.py
class ImageClassificationDataset(VisionDataset):
    def __init__(self, dataset_name: str, dataset: Dataset):
        super().__init__(dataset_name=dataset_name, dataset=dataset)

    def to_loader(
        self, model: Optional[Callable] = None, batch_size: int = 8
    ) -> ClassificationVisionDataLoader:
        return ClassificationVisionDataLoader.load_from_dataset(
            self.dataset,
            batch_size=batch_size,
            model=model,
        )

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        return dict(image=image, label=label)
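
A sketch of wrapping a torchvision classification dataset. torchvision is an assumption here; any torch Dataset whose items are (image, label) pairs fits the __getitem__ shown above.

from torchvision import datasets, transforms  # torchvision is assumed, not a DeepFix dependency

from deepfix_sdk.data.datasets import ImageClassificationDataset

# CIFAR-10 items are (image, label) pairs, matching the wrapper's __getitem__.
cifar_train = datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transforms.ToTensor()
)

train_dataset = ImageClassificationDataset(dataset_name="cifar10-train", dataset=cifar_train)
sample = train_dataset[0]                       # {"image": ..., "label": ...}
loader = train_dataset.to_loader(batch_size=16)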

TabularDataset

Bases: BaseDataset

Source code in deepfix-sdk/src/deepfix_sdk/data/datasets.py
class TabularDataset(BaseDataset):
    def __init__(
        self,
        dataset_name: str,
        dataset: pd.DataFrame,
        label: Optional[str] = None,
        cat_features: Optional[List[str]] = None,
    ):
        if isinstance(dataset, pd.DataFrame):
            assert label is not None, "Label column is required"
            self.dataset = DeepchecksTabularDataset(
                dataset, label=label, cat_features=cat_features or []
            )

        else:
            raise ValueError(f"Invalid dataset type: {type(dataset)}")

        self.dataset_name = dataset_name

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset.data.iloc[idx], self.dataset.label_col.iloc[idx]

    def __iter__(self):
        return iter(self.dataset)

    def get_data(self) -> pd.DataFrame:
        return self.dataset.data.copy()

    @property
    def data_type(self) -> DataType:
        return DataType.TABULAR

    @property
    def data(self) -> pd.DataFrame:
        return self.get_data()

    @property
    def name(self) -> str:
        return self.dataset_name

    @property
    def X(self) -> pd.DataFrame:
        x = self.get_data().drop(columns=[self.dataset.label_name])
        x[self.cat_features] = x[self.cat_features].astype("category")
        return x

    @property
    def y(self) -> pd.Series:
        return self.dataset.label_col.copy()

    @property
    def cat_features(self) -> List[str]:
        return self.dataset.cat_features

    @property
    def num_features(self) -> List[str]:
        return self.dataset.numerical_features

    def to_loader(self, *args, **kwargs) -> "TabularDataset":
        return self
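
A short sketch of wrapping a pandas DataFrame. The label column is required; cat_features defaults to an empty list when omitted.

import pandas as pd

from deepfix_sdk.data.datasets import TabularDataset

df = pd.DataFrame(
    {
        "age": [25, 32, 47, 51],
        "city": ["paris", "lyon", "paris", "nice"],
        "target": [0, 1, 1, 0],
    }
)

dataset = TabularDataset(
    dataset_name="customers",
    dataset=df,
    label="target",
    cat_features=["city"],
)

print(len(dataset))        # number of rows
print(dataset.X.dtypes)    # "city" is cast to the pandas category dtype
print(dataset.y.tolist())  # label column values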

NLPDataset

Bases: BaseDataset

Source code in deepfix-sdk/src/deepfix_sdk/data/datasets.py
class NLPDataset(BaseDataset):
    def __init__(self, dataset_name: str, dataset: TextData):
        self.dataset = dataset
        self.dataset_name = dataset_name

    def to_loader(self, *args, **kwargs) -> TextData:
        return self.dataset

    def __len__(self):
        return len(self.dataset)

    @property
    def data_type(self) -> DataType:
        return DataType.NLP

    @property
    def data(self) -> TextData:
        return self.dataset

    @property
    def embeddings(self) -> np.ndarray:
        return self.dataset.embeddings

    @property
    def X(self) -> Sequence[str]:
        return self.dataset.text

    @property
    def y(self) -> TTextLabel:
        return self.dataset.label

    @property
    def name(self) -> str:
        return self.dataset_name
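
A sketch of wrapping a deepchecks TextData object. NLPDataset simply holds the TextData instance; the TextData constructor arguments shown here follow the deepchecks NLP API and should be treated as an assumption in this context.

from deepchecks.nlp import TextData

from deepfix_sdk.data.datasets import NLPDataset

text_data = TextData(
    raw_text=["great product", "terrible support", "works as expected"],
    label=[1, 0, 1],
    task_type="text_classification",
)

dataset = NLPDataset(dataset_name="support-reviews", dataset=text_data)
print(len(dataset))   # number of text samples
print(dataset.X[:2])  # first two raw texts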

Pipelines

ArtifactLoadingPipeline

Bases: Pipeline

Source code in deepfix-sdk/src/deepfix_sdk/pipelines/factory.py
class ArtifactLoadingPipeline(Pipeline):
    def __init__(
        self,
        dataset_name: str,
        model_name: Optional[str] = None,
        mlflow_config: Optional[MLflowConfig] = None,
        artifact_config: Optional[ArtifactConfig] = None,
    ):
        self.run_name = create_run_name(dataset_name, model_name=model_name)
        self.mlflow_config = mlflow_config or MLflowConfig()
        self.artifact_config = artifact_config or ArtifactConfig()
        mlflow_manager = MLflowManager.from_config(
            self.mlflow_config, run_name=self.run_name
        )
        self.artifact_mgr = ArtifactsManager(
            mlflow_manager=mlflow_manager, sqlite_path=self.artifact_config.sqlite_path
        )

        super().__init__(steps=self._load_steps())

    def _load_steps(self) -> list[Step]:
        """Build steps based on configuration. Supports loading multiple artifact types."""
        steps = []
        cfg = dict(
            artifact_mgr=self.artifact_mgr,
            run_name=self.run_name,
        )

        # Load dataset metadata if configured
        if self.artifact_config.load_dataset_metadata:
            LOGGER.info("Loading dataset metadata")
            steps.append(LoadDatasetArtifact(**cfg))

        # Load deepchecks artifacts if configured
        if self.artifact_config.load_checks:
            LOGGER.info("Loading deepchecks artifacts")
            steps.append(LoadDeepchecksArtifacts(**cfg))

        # Load model checkpoint if configured
        if self.artifact_config.load_model_checkpoint:
            LOGGER.info("Loading model checkpoint")
            steps.append(LoadModelCheckpoint(**cfg))

        # Load training artifacts if configured
        if self.artifact_config.load_training:
            LOGGER.info("Loading training artifacts")
            steps.append(LoadTrainingArtifact(**cfg))

        # Ensure at least one artifact type is configured to load
        if not steps:
            raise ValueError(
                "No artifacts to load. Please enable at least one of: "
                "load_dataset_metadata, load_checks, load_model_checkpoint, load_training"
            )

        return steps
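
A sketch of loading previously ingested artifacts directly, which is the same path DeepFixClient follows internally before building an analysis request. The deepfix_sdk.pipelines import is the one used by the client; the deepfix_sdk.config import path is assumed. run() returns a dict of loaded artifacts keyed by artifact type.

from deepfix_sdk.pipelines import ArtifactLoadingPipeline
from deepfix_sdk.config import ArtifactConfig, MLflowConfig  # assumed import path (config.py)

pipeline = ArtifactLoadingPipeline(
    dataset_name="my-dataset",
    model_name="my-model",
    mlflow_config=MLflowConfig(),
    artifact_config=ArtifactConfig(load_training=False),
)

# Runs the configured loading steps (dataset metadata, deepchecks results,
# model checkpoint, ...) and returns the loaded artifacts.
loaded_artifacts = pipeline.run()
print(list(loaded_artifacts.keys()))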

Examples

See the Quickstart Guide for usage examples.