In [1]:
# Let's use MLflow to deploy and test a model trained on the classic Iris dataset. We will use the k-nearest neighbors (KNN) algorithm.
# This dataset is the "Hello World" of ML.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets

from warnings import simplefilter
# Ignore FutureWarning messages for the rest of the session.
simplefilter('ignore', FutureWarning)

# Load the Iris dataset through sklearn's datasets module.
iris = datasets.load_iris()

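# Start the MLflow server from the Conda environment before running the MLflow cells, e.g.:
# mlflow server --backend-store-uri file:///Users/coool/Documents/MLFlow_Git/mlflow-database
# (the backend-store path above is machine-specific; point it at a directory on your own machine)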
In [2]:
## Load dataset into a dataframe
#
# One row per flower: the feature columns named by iris.feature_names plus a
# readable `species` label. The numeric class ids in iris.target are mapped
# straight to their names, so no temporary `target` column has to be added
# and then dropped in place.

iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    species=iris.target_names[iris.target]
)
In [3]:
# Peek at the first five rows to confirm the frame was assembled as expected

iris_df.head(n=5)
Out[3]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [4]:
## EDA - Exploratory Data Analysis
# Pairwise plots of the columns of iris_df, coloured by species, with KDE curves on the diagonal.

sns.pairplot(data=iris_df, diag_kind='kde', hue='species')
plt.show()
In [5]:
## Train/test split and model fit

# Separate the measurement columns (features) from the species label (target).
X = iris_df.drop(columns=['species'])
y = iris_df['species']

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows that later cells use to validate its predictions.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
Out[5]:
KNeighborsClassifier(n_neighbors=9)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=9)
In [6]:
# Hyper parameter tuning. Let's keep it simple
#
# Fit one KNN classifier per candidate k on the training split and record
# its accuracy on the test split.

k_range = [*range(1, 10)]
scores = []
for k in k_range:
    knn_hp = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_hp.predict(X_test)
    scores += [metrics.accuracy_score(y_test, y_pred)]

# Accuracy as a function of k
plt.plot(k_range, scores)
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.show()
In [7]:
## Sample data to pick for Testing
# Show every row of the test split next to its true species, matched on the shared row index.

X_test.join(y_test, how='inner')
Out[7]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species
82 5.8 2.7 3.9 1.2 versicolor
134 6.1 2.6 5.6 1.4 virginica
114 5.8 2.8 5.1 2.4 virginica
42 4.4 3.2 1.3 0.2 setosa
109 7.2 3.6 6.1 2.5 virginica
57 4.9 2.4 3.3 1.0 versicolor
1 4.9 3.0 1.4 0.2 setosa
70 5.9 3.2 4.8 1.8 versicolor
25 5.0 3.0 1.6 0.2 setosa
84 5.4 3.0 4.5 1.5 versicolor
66 5.6 3.0 4.5 1.5 versicolor
133 6.3 2.8 5.1 1.5 virginica
102 7.1 3.0 5.9 2.1 virginica
107 7.3 2.9 6.3 1.8 virginica
26 5.0 3.4 1.6 0.4 setosa
23 5.1 3.3 1.7 0.5 setosa
123 6.3 2.7 4.9 1.8 virginica
130 7.4 2.8 6.1 1.9 virginica
21 5.1 3.7 1.5 0.4 setosa
12 4.8 3.0 1.4 0.1 setosa
71 6.1 2.8 4.0 1.3 versicolor
128 6.4 2.8 5.6 2.1 virginica
48 5.3 3.7 1.5 0.2 setosa
72 6.3 2.5 4.9 1.5 versicolor
88 5.6 3.0 4.1 1.3 versicolor
148 6.2 3.4 5.4 2.3 virginica
74 6.4 2.9 4.3 1.3 versicolor
96 5.7 2.9 4.2 1.3 versicolor
63 6.1 2.9 4.7 1.4 versicolor
132 6.4 2.8 5.6 2.2 virginica
In [8]:
# Row labels (index values of X_test) used as the test samples.
# The same list is reused to check predictions throughout this notebook.
# Any index value shown in the table above can be used here.

test_samples = [
    1, 134, 84,
    21, 128, 48,
]
In [9]:
# Simple test of predictions to actual values. - No MLFlow So far
# Predict with the in-notebook model and line the results up against the true labels.

results_df = pd.DataFrame(
    {
        "Predictions": knn.predict(X_test.loc[test_samples]),     ## Prediction
        "Actuals": y_test.loc[test_samples].values,
    }
)
results_df
Out[9]:
Predictions Actuals
0 setosa setosa
1 virginica virginica
2 versicolor versicolor
3 setosa setosa
4 virginica virginica
5 setosa setosa

MLFlow starts from here¶

  • The model has an accuracy score of 0.98, which is high, so we expect its predictions to be accurate
  • Model has been built in the previous section and now we are ready to move from Dev => Staging => Prod
  • Get predictions using REST API calls
  • test_samples is a list of indices used to validate predictions
In [10]:
import pickle
from urllib.parse import urlparse

import mlflow
from mlflow.models import infer_signature
# Read the tracking URI once, keep its URL scheme, and report the URI.
tracking_uri = mlflow.get_tracking_uri()
tracking_url_type_store = urlparse(tracking_uri).scheme

print("MLFlow Tracking DB:", tracking_uri)
MLFlow Tracking DB: http://localhost:5000
In [11]:
# Make "Sreenivas-KNN-Iris" the active experiment.
mlflow.set_experiment("Sreenivas-KNN-Iris")
Out[11]:
<Experiment: artifact_location='mlflow-artifacts:/627570918674180859', creation_time=1694915262298, experiment_id='627570918674180859', last_update_time=1694915262298, lifecycle_stage='active', name='Sreenivas-KNN-Iris', tags={}>
In [12]:
# Print MLflow's diagnostic report (system, versions, URIs, dependencies).
# NOTE(review): presumably wrapped in a run so the report also lists the active
# run details seen in the captured output; this starts a separate run that logs nothing else.
with mlflow.start_run():
    mlflow.doctor()
System information: Windows 10.0.22621
Python version: 3.10.11
MLflow version: 2.7.0
MLflow module location: C:\Users\coool\anaconda3\envs\mflow27_1\lib\site-packages\mlflow\__init__.py
Tracking URI: http://localhost:5000
Registry URI: http://localhost:5000
Active experiment ID: 627570918674180859
Active run ID: 3c6ec1fe03a946098a7c3fe1fcc0b4e4
Active run artifact URI: mlflow-artifacts:/627570918674180859/3c6ec1fe03a946098a7c3fe1fcc0b4e4/artifacts
MLflow environment variables: 
  MLFLOW_TRACKING_URI: http://localhost:5000
MLflow dependencies: 
  Flask: 2.3.3
  Jinja2: 3.1.2
  alembic: 1.12.0
  click: 8.1.7
  cloudpickle: 2.2.1
  databricks-cli: 0.17.7
  docker: 6.1.3
  entrypoints: 0.4
  gitpython: 3.1.36
  importlib-metadata: 6.8.0
  markdown: 3.4.4
  matplotlib: 3.8.0
  numpy: 1.24.3
  packaging: 23.1
  pandas: 2.1.0
  protobuf: 4.24.3
  psutil: 5.9.0
  pyarrow: 13.0.0
  pytz: 2023.3.post1
  pyyaml: 6.0.1
  querystring-parser: 1.2.4
  requests: 2.31.0
  scikit-learn: 1.3.0
  scipy: 1.11.2
  sqlalchemy: 2.0.20
  sqlparse: 0.4.4
  virtualenv: 20.24.5
  waitress: 2.1.2
In [13]:
# Register the model in ML Flow. We are NOT building the model. We are using the model built in this notebook

with mlflow.start_run() as knn_iris_run:

    # Score of the in-notebook model, logged as the run's "score" metric.
    # NOTE(review): this is measured over the full X/y, not only the held-out split.
    score = knn.score(X, y)
    print(f"Score: {score}")
    mlflow.log_metric("score", score)

    # Infer the signature from a matching input/output pair: the predictions
    # come from the same rows that are passed as the input sample.
    predictions = knn.predict(X_test)
    signature = infer_signature(X_test, predictions)

    model_info = mlflow.sklearn.log_model(
        knn, "model", registered_model_name="KNNIrisModel", signature=signature
    )  ## Log model in MLFlow

    # run_id replaces the deprecated run_uuid; the run handle bound by the
    # `with` statement is the active run, so no second lookup is needed.
    print(f"Model saved in run: {knn_iris_run.info.run_id}, run_name: {knn_iris_run.info.run_name}")
Score: 0.98
Registered model 'KNNIrisModel' already exists. Creating a new version of this model...
2023/09/17 17:05:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: KNNIrisModel, version 25
Model saved in run: f846fd2633324456a73f53aef1fd6103, run_name: funny-sow-770
Created version '25' of model 'KNNIrisModel'.
In [14]:
## The model should now be visible in the MLflow UI.
## Display the model URI recorded in model_info by the logging call above.

model_info.model_uri
Out[14]:
'runs:/f846fd2633324456a73f53aef1fd6103/model'
In [15]:
## Make predictions on Model using runId

# Load the logged model back as a PyFuncModel via its run-based URI.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

## Predict and display
results_logged_df = pd.DataFrame(
    {
        "Predictions": loaded_model.predict(X_test.loc[test_samples]),   ## Prediction
        "Actuals": y_test.loc[test_samples].values,
    }
)
results_logged_df
Out[15]:
Predictions Actuals
0 setosa setosa
1 virginica virginica
2 versicolor versicolor
3 setosa setosa
4 virginica virginica
5 setosa setosa

Model is tested and logged. Let's move it to Staging and Test¶

In [16]:
## Capture the version number using model_info, so that we can move the newly created version into Staging
## Get Curr(ent) version on the model

from mlflow import MlflowClient

client = MlflowClient()

# Search the registry for versions created by this run and take the first match.
filter_string = "run_id='{}'".format(model_info.run_id)
results = client.search_model_versions(filter_string)
first_match = results[0]
curr_version = first_match.version

print(f"Model URI: {model_info.model_uri}, Model version: {curr_version}")
Model URI: runs:/f846fd2633324456a73f53aef1fd6103/model, Model version: 25
In [17]:
# Promote the model to Staging
# (positional arguments: registered model name, version, target stage)

client.transition_model_version_stage("KNNIrisModel", curr_version, "Staging")
Out[17]:
<ModelVersion: aliases=[], creation_timestamp=1694988328701, current_stage='Staging', description='', last_updated_timestamp=1694988334019, name='KNNIrisModel', run_id='f846fd2633324456a73f53aef1fd6103', run_link='', source='mlflow-artifacts:/627570918674180859/f846fd2633324456a73f53aef1fd6103/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='25'>
In [18]:
# Test the model in Staging

staged_model = mlflow.pyfunc.load_model(model_uri="models:/KNNIrisModel/Staging")  ## Get Staged Model

## Predict and display
results_staging_df = pd.DataFrame(
    {
        "Predictions": staged_model.predict(X_test.loc[test_samples]),   ## Prediction
        "Actuals": y_test.loc[test_samples].values,
    }
)
results_staging_df
Out[18]:
Predictions Actuals
0 setosa setosa
1 virginica virginica
2 versicolor versicolor
3 setosa setosa
4 virginica virginica
5 setosa setosa

Use UI to promote the model into Production and then run the next cell¶

UI can be accessed at http://localhost:5000¶

In [19]:
# Load whichever version of the registered model is currently in the Production stage.
prod_model = mlflow.pyfunc.load_model(model_uri="models:/KNNIrisModel/Production")  ## Get Prod Model

## Predict and display
results_prod_df = pd.DataFrame(
    {
        "Predictions": prod_model.predict(X_test.loc[test_samples]),  ## Prediction
        "Actuals": y_test.loc[test_samples].values,
    }
)
results_prod_df
Out[19]:
Predictions Actuals
0 setosa setosa
1 virginica virginica
2 versicolor versicolor
3 setosa setosa
4 virginica virginica
5 setosa setosa

Finally let's use REST Service to invoke the model and get prediction¶

In [20]:
## Serve the model: run this command in the Conda environment. Once the model is being served, run this cell
#    mlflow models serve -m "models:/KNNIrisModel/Production" --port 5002
## The model is available on port 5002. It looks like we will need a separate port for each model

import requests
import json

# Request payload in the "dataframe_split" layout: the column names plus one
# list of values per selected test row.
message_body = {
    "dataframe_split": {
        "columns": X_test.columns.tolist(),
        "data": X_test.loc[test_samples].values.tolist(),
    }
}

json_object = json.dumps(message_body)

headers = {'Content-Type': 'application/json'}
# The timeout keeps the cell from hanging indefinitely when the model server
# is not running or stops responding.
r = requests.post('http://localhost:5002/invocations',
                  headers=headers,
                  data=json_object,
                  timeout=30)                                                 ## Prediction

print(f"Status code: {r.status_code},  Response: {r.text}")

# Surface HTTP errors here, after the diagnostic print, instead of letting the
# next cell fail while trying to parse an error body.
r.raise_for_status()
Status code: 200,  Response: {"predictions": ["setosa", "virginica", "versicolor", "setosa", "virginica", "setosa"]}
In [21]:
## Display predictions from REST API calls

# Decode the JSON body with the response object itself and read the
# "predictions" list. Passing a literal JSON string to pd.read_json is
# deprecated in pandas 2.1 and only went unnoticed because FutureWarning is
# silenced in this notebook.
results_rest_df = pd.DataFrame(
    {
        "Predictions": r.json()["predictions"],
        "Actuals": y_test.loc[test_samples].values,
    }
)
results_rest_df
Out[21]:
Predictions Actuals
0 setosa setosa
1 virginica virginica
2 versicolor versicolor
3 setosa setosa
4 virginica virginica
5 setosa setosa

Finally consolidate all predictions for display¶

  • test_samples is used to select rows from X_test. This is unseen data by the model
In [22]:
# Consolidate the per-stage results side by side.
# `keys` becomes the top level of a column MultiIndex (the stage), with each
# frame's own Predictions/Actuals columns underneath.
stage_labels = ['Dev', 'Staging', 'Prod', 'REST API']
stage_frames = [results_logged_df, results_staging_df, results_prod_df, results_rest_df]

results_final = pd.concat(stage_frames, axis=1, keys=stage_labels)

# Report the score computed when the model was logged instead of a hard-coded
# number that silently goes stale when the model or data changes.
print(f"\nPredictions vs Actuals at each stage. Note: Model score is {score}")
print("-"*80)
results_final
Predictions vs Actuals at each stage. Note: Model score is 0.98
--------------------------------------------------------------------------------
Out[22]:
Dev Staging Prod REST API
Predictions Actuals Predictions Actuals Predictions Actuals Predictions Actuals
0 setosa setosa setosa setosa setosa setosa setosa setosa
1 virginica virginica virginica virginica virginica virginica virginica virginica
2 versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor
3 setosa setosa setosa setosa setosa setosa setosa setosa
4 virginica virginica virginica virginica virginica virginica virginica virginica
5 setosa setosa setosa setosa setosa setosa setosa setosa
In [ ]:
 
In [ ]:
 
In [23]:
# Print the version string of the Python interpreter running this notebook.
import sys
print(sys.version)
3.10.11 | packaged by Anaconda, Inc. | (main, May 16 2023, 00:55:32) [MSC v.1916 64 bit (AMD64)]