MLOps ZoomCamp Module 2: MLflow for ML Experiment Tracking

Collapse
X
 
  • Time
  • Show
Clear All
new posts
  • MyrinNew
    Senior Member
    • Feb 2024
    • 5175

    #1

    MLOps ZoomCamp Module 2: MLflow for ML Experiment Tracking

    📚 Introduction

    MLflow is an open-source platform designed to manage the complete machine learning lifecycle. It helps data scientists track experiments, reproduce results, deploy models, and create a centralized model registry.


    Why Experiment Tracking Matters

    🔄 Reproducibility: Recreate results and understand past decisions
    🗂️ Organization: Keep track of work across many iterations and experiments
    ⚙️ Optimization: Easily compare different approaches and parameter configurations
    👥 Collaboration: Share and communicate results with team members


    Key Terminology

    • ML experiment: The entire process of building a machine learning model
    • Experiment run: A single trial within an ML experiment
    • Run artifact: Files associated with a run (models, visualizations, datasets)
    • Experiment metadata: Data describing the experiment (parameters, metrics, etc.)


    🧩 MLflow Components

    MLflow consists of four main modules:

    1. Tracking: Record and query experiments (parameters, metrics, code versions, etc.)
    2. Models: Package ML models in a standard format for deployment
    3. Model Registry: Store, annotate, and manage models in a central repository
    4. Projects: Package code in a reusable and reproducible form (not covered in this guide)


    🚀 Getting Started

    Installation





    # Using pip
    pip install mlflow

    # Using conda
    conda install -c conda-forge mlflow







    Setting Up the MLflow Server





    # Basic usage
    mlflow ui

    # With SQLite backend (recommended)
    mlflow ui --backend-store-uri sqlite:///mlflow.db

    # With specific artifact location
    mlflow ui --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns







    After running any one of the commands above (they are alternatives, not sequential steps), access the UI at: http://localhost:5000


    💻 Working with MLflow

    Initializing MLflow in Your Code





    import mlflow

    # Set the tracking server URI
    mlflow.set_tracking_uri("sqlite:///mlflow.db")

    # Set the experiment name
    mlflow.set_experiment("my-classification-project")







    Basic Experiment Tracking





    with mlflow.start_run(run_name="basic-model"):
        # 1. Log parameters
        mlflow.log_param("algorithm", "RandomForest")
        mlflow.log_param("n_estimators", 100)

        # 2. Log metrics
        mlflow.log_metric("accuracy", 0.85)
        mlflow.log_metric("f1_score", 0.82)

        # 3. Log model
        mlflow.sklearn.log_model(model, "model")

        # 4. Log artifacts (e.g., feature importance plot)
        mlflow.log_artifact("feature_importance.png")







    Using the MLflow Client API





    from mlflow.tracking import MlflowClient

    # Initialize the client
    client = MlflowClient(tracking_uri="sqlite:///mlflow.db")

    # Create a new experiment
    experiment_id = client.create_experiment("customer-churn-prediction")

    # Get experiment by name
    experiment = client.get_experiment_by_name("customer-churn-prediction")







    📊 Advanced Tracking Features

    Nested Runs





    # Parent run
    with mlflow.start_run(run_name="parent") as parent_run:
        mlflow.log_param("parent_param", "parent_value")

        # Child run
        with mlflow.start_run(run_name="child", nested=True) as child_run:
            mlflow.log_param("child_param", "child_value")
            mlflow.log_metric("child_metric", 1.0)







    Hyperparameter Optimization with MLflow





    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

    def objective(params):
        with mlflow.start_run(nested=True):
            # Log hyperparameters
            mlflow.log_params(params)

            # Your model training code here
            # ...

            # Log results
            mlflow.log_metric("rmse", rmse)

        return {"loss": rmse, "status": STATUS_OK}

    # Define search space
    search_space = {
    "learning_rate": hp.loguniform("learning_rate", -5, 0),
    "max_depth": hp.randint("max_depth", 3, 10),
    "min_child_weight": hp.randint("min_child_weight", 1, 10),
    "subsample": hp.uniform("subsample", 0.5, 1.0)
    }

    # Run optimization
    with mlflow.start_run(run_name="hyperopt_tuning"):
        best_params = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=Trials()
        )







    Autologging

    MLflow can automatically log parameters, metrics, and models without manual logging statements:






    # Enable autologging for all supported frameworks
    mlflow.autolog()

    # Or enable for specific framework
    mlflow.sklearn.autolog()
    mlflow.xgboost.autolog()
    mlflow.pytorch.autolog()
    mlflow.tensorflow.autolog()







    What gets autologged:

    • Model parameters
    • Evaluation metrics
    • Model artifacts
    • Model signatures
    • Input examples
    • Package dependencies


    📦 Managing Models

    Saving and Loading Models





    # Saving a model
    with mlflow.start_run():
    # Train your model
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # Log model
    mlflow.sklearn.log_model(
    sk_model=model,
    artifact_path="random_forest_model",
    registered_model_name="rf_classification"
    )

    # Loading a model (replace <run_id> with the ID of the run that logged it)
    model_uri = "runs:/<run_id>/random_forest_model"
    loaded_model = mlflow.sklearn.load_model(model_uri)

    # Make predictions
    predictions = loaded_model.predict(X_test)







    Model Signatures and Input Examples





    import pandas as pd
    from mlflow.models.signature import infer_signature

    # Generate model signature
    X_sample = X_train.iloc[:5]
    y_sample = model.predict(X_sample)
    signature = infer_signature(X_sample, y_sample)

    # Log model with signature and input example
    with mlflow.start_run():
    mlflow.sklearn.log_model(
    model,
    "model",
    signature=signature,
    input_example=X_sample
    )







    📋 Model Registry

    The Model Registry provides a centralized repository for managing the full lifecycle of MLflow Models.


    Model Registry Workflow

    1. Register a model from a run
    2. Transition models between stages (Staging, Production, Archived)
    3. Version models automatically
    4. Annotate models with descriptions and tags
    5. Deploy models to various serving platforms


    Registering Models





    # From UI: Click "Register Model" on the run page

    # From code
    with mlflow.start_run():
    mlflow.sklearn.log_model(
    model,
    "model",
    registered_model_name="customer-churn-predictor"
    )

    # Or using existing run
    run_id = "abcdef123456"
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, "customer-churn-predictor")







    Managing Model Versions





    from mlflow.tracking import MlflowClient

    client = MlflowClient()
    model_name = "customer-churn-predictor"

    # Get the latest version of the model in each stage (not every version)
    versions = client.get_latest_versions(model_name)
    for v in versions:
    print(f"Version: {v.version}, Stage: {v.current_stage}")

    # Transition a model to production
    client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage="Production",
    archive_existing_versions=True # Archive any existing production versions
    )

    # Add description to model version
    client.update_model_version(
    name=model_name,
    version=2,
    description="This model was trained on dataset v2 with improved features"
    )







    Loading Models from Registry





    # Load the latest Production model
    production_model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/Production"
    )

    # Load a specific version
    specific_model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/2"
    )







    📝 Best Practices

    Project Structure





    my-ml-project/
    ├── data/ # Data files
    ├── notebooks/ # Jupyter notebooks for exploration
    ├── src/ # Source code
    │ ├── train.py # Training script
    │ ├── predict.py # Prediction script
    │ └── utils.py # Utility functions
    ├── mlruns/ # MLflow tracking data (if using local storage)
    ├── mlflow.db # MLflow SQLite database
    ├── README.md # Project documentation
    └── requirements.txt # Project dependencies







    Tips for Effective MLflow Usage

    1. Use descriptive run names to easily identify experiments
    2. Create separate experiments for different problems or approaches
    3. Log all parameters that affect your model's performance
    4. Use tags to add searchable metadata to runs
    5. Always version your data and log data paths/versions
    6. Set up a dedicated tracking server for team collaboration
    7. Integrate with CI/CD pipelines for automated model deployment
    8. Create standardized model training workflows with common logging patterns


    🔍 Comparison with Other Tools

    Experiment Tracking
    Model Registry
    Model Packaging
    Open Source
    UI Visualization Basic Advanced Advanced Advanced
    Team Collaboration Basic Limited Advanced Advanced
    Learning Curve Easy Moderate Easy Easy


    🔧 Complete Example: NYC Taxi Fare Prediction





    import mlflow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score

    # Set up MLflow
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    mlflow.set_experiment("nyc-taxi-fare-prediction")

    # Load data
    df = pd.read_parquet("data/green_tripdata_2021-01.parquet")

    # Data preprocessing
    # ... (feature engineering code here)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
    df.drop('fare_amount', axis=1), df['fare_amount'], test_size=0.2, random_state=42
    )

    # Model training with MLflow tracking
    with mlflow.start_run(run_name="random-forest-baseline"):
    # Set parameters
    n_estimators = 100
    max_depth = 10

    # Log parameters
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("train_data_shape", X_train.shape)
    mlflow.log_param("test_data_shape", X_test.shape)

    # Train model
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate model
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Log feature importance as a figure
    import matplotlib.pyplot as plt
    feature_importance = pd.DataFrame(
    model.feature_importances_,
    index=X_train.columns,
    columns=['importance']
    ).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 8))
    feature_importance[:10].plot(kind='barh')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig("feature_importance.png")
    mlflow.log_artifact("feature_importance.png")

    # Log model
    mlflow.sklearn.log_model(
    model,
    "random_forest_model",
    registered_model_name="nyc-taxi-fare-predictor"
    )

    print(f"Model trained with RMSE: {rmse:.4f}, R²: {r2:.4f}")
    print(f"Run ID: {mlflow.active_run().info.run_id}")









    More...
Working...