# Let's use MLFlow to deploy and test a model for the classic Iris dataset. We will use the k-nearest neighbors (KNN) algorithm.
# This dataset is the "Hello World" of ML
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
# Fetch the Iris dataset that ships with scikit-learn
iris = datasets.load_iris()
# The MLFlow server has to be started separately, from the Conda environment:
# mlflow server --backend-store-uri file:///Users/coool/Documents/MLFlow_Git/mlflow-database
## Build a dataframe with one column per feature plus the species name
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Map the integer class codes directly to their species names; the numeric
# 'target' column is never kept, so it is not created in the first place.
iris_df['species'] = iris.target_names[iris.target]
# Peek at the first rows
iris_df.head()
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
## EDA - Exploratory Data Analysis
# Pairwise plots of all feature columns, coloured by species, with KDE curves on the diagonal
pairplot_options = {'hue': 'species', 'diag_kind': 'kde'}
sns.pairplot(iris_df, **pairplot_options)
plt.show()
## Train/test split and model fit
# Features are every column except the label; the label is the species name.
X = iris_df.drop(['species'], axis=1)
y = iris_df['species']
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
knn = KNeighborsClassifier(n_neighbors=9)
# Fit on the training split only. Fitting on the full (X, y) would let the model
# see the held-out rows, making every later comparison against X_test/y_test
# a check on data it was trained on.
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=9)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=9)
# Hyperparameter tuning. Let's keep it simple:
# train one classifier per candidate k and record its accuracy on the held-out split.
# Alternative coarse grid: k_range = [3, 5, 10, 50]
k_range = list(range(1, 10))
scores = []
for k in k_range:
    # A fresh model per k, trained on the training split only
    knn_hp = KNeighborsClassifier(n_neighbors=k)
    knn_hp.fit(X_train, y_train)
    y_pred = knn_hp.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
# Accuracy as a function of k
plt.plot(k_range, scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
plt.show()
## Held-out rows (features next to their species) from which test samples can be picked
X_test.merge(y_test, left_index=True, right_index=True)
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | species | |
---|---|---|---|---|---|
82 | 5.8 | 2.7 | 3.9 | 1.2 | versicolor |
134 | 6.1 | 2.6 | 5.6 | 1.4 | virginica |
114 | 5.8 | 2.8 | 5.1 | 2.4 | virginica |
42 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
109 | 7.2 | 3.6 | 6.1 | 2.5 | virginica |
57 | 4.9 | 2.4 | 3.3 | 1.0 | versicolor |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
70 | 5.9 | 3.2 | 4.8 | 1.8 | versicolor |
25 | 5.0 | 3.0 | 1.6 | 0.2 | setosa |
84 | 5.4 | 3.0 | 4.5 | 1.5 | versicolor |
66 | 5.6 | 3.0 | 4.5 | 1.5 | versicolor |
133 | 6.3 | 2.8 | 5.1 | 1.5 | virginica |
102 | 7.1 | 3.0 | 5.9 | 2.1 | virginica |
107 | 7.3 | 2.9 | 6.3 | 1.8 | virginica |
26 | 5.0 | 3.4 | 1.6 | 0.4 | setosa |
23 | 5.1 | 3.3 | 1.7 | 0.5 | setosa |
123 | 6.3 | 2.7 | 4.9 | 1.8 | virginica |
130 | 7.4 | 2.8 | 6.1 | 1.9 | virginica |
21 | 5.1 | 3.7 | 1.5 | 0.4 | setosa |
12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
71 | 6.1 | 2.8 | 4.0 | 1.3 | versicolor |
128 | 6.4 | 2.8 | 5.6 | 2.1 | virginica |
48 | 5.3 | 3.7 | 1.5 | 0.2 | setosa |
72 | 6.3 | 2.5 | 4.9 | 1.5 | versicolor |
88 | 5.6 | 3.0 | 4.1 | 1.3 | versicolor |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
74 | 6.4 | 2.9 | 4.3 | 1.3 | versicolor |
96 | 5.7 | 2.9 | 4.2 | 1.3 | versicolor |
63 | 6.1 | 2.9 | 4.7 | 1.4 | versicolor |
132 | 6.4 | 2.8 | 5.6 | 2.2 | virginica |
# Test samples list: index labels of rows in X_test.
# This list is used to test predictions throughout this notebook.
# Any index values from the table above can be picked.
test_samples = [1, 134, 84, 21, 128, 48]
# Simple comparison of predictions to actual values - no MLFlow so far
sample_features = X_test.loc[test_samples]
results_df = pd.DataFrame({
    "Predictions": knn.predict(sample_features),  ## Prediction
    "Actuals": y_test.loc[test_samples].values,
})
results_df
Predictions | Actuals | |
---|---|---|
0 | setosa | setosa |
1 | virginica | virginica |
2 | versicolor | versicolor |
3 | setosa | setosa |
4 | virginica | virginica |
5 | setosa | setosa |
import pickle
from urllib.parse import urlparse
import mlflow
from mlflow.models import infer_signature
# Ask MLflow for the tracking URI once, then reuse it for both the scheme and the printout
tracking_uri = mlflow.get_tracking_uri()
tracking_url_type_store = urlparse(tracking_uri).scheme
print("MLFlow Tracking DB:", tracking_uri)
MLFlow Tracking DB: http://localhost:5000
# Select the "Sreenivas-KNN-Iris" experiment by name for the MLflow runs started below.
mlflow.set_experiment(experiment_name="Sreenivas-KNN-Iris")
<Experiment: artifact_location='mlflow-artifacts:/627570918674180859', creation_time=1694915262298, experiment_id='627570918674180859', last_update_time=1694915262298, lifecycle_stage='active', name='Sreenivas-KNN-Iris', tags={}>
# Print MLflow's diagnostic report (versions, tracking/registry URIs, dependencies).
# It is called inside a run, presumably so the report also lists an active run ID
# and artifact URI.
with mlflow.start_run():
    mlflow.doctor()
System information: Windows 10.0.22621 Python version: 3.10.11 MLflow version: 2.7.0 MLflow module location: C:\Users\coool\anaconda3\envs\mflow27_1\lib\site-packages\mlflow\__init__.py Tracking URI: http://localhost:5000 Registry URI: http://localhost:5000 Active experiment ID: 627570918674180859 Active run ID: 3c6ec1fe03a946098a7c3fe1fcc0b4e4 Active run artifact URI: mlflow-artifacts:/627570918674180859/3c6ec1fe03a946098a7c3fe1fcc0b4e4/artifacts MLflow environment variables: MLFLOW_TRACKING_URI: http://localhost:5000 MLflow dependencies: Flask: 2.3.3 Jinja2: 3.1.2 alembic: 1.12.0 click: 8.1.7 cloudpickle: 2.2.1 databricks-cli: 0.17.7 docker: 6.1.3 entrypoints: 0.4 gitpython: 3.1.36 importlib-metadata: 6.8.0 markdown: 3.4.4 matplotlib: 3.8.0 numpy: 1.24.3 packaging: 23.1 pandas: 2.1.0 protobuf: 4.24.3 psutil: 5.9.0 pyarrow: 13.0.0 pytz: 2023.3.post1 pyyaml: 6.0.1 querystring-parser: 1.2.4 requests: 2.31.0 scikit-learn: 1.3.0 scipy: 1.11.2 sqlalchemy: 2.0.20 sqlparse: 0.4.4 virtualenv: 20.24.5 waitress: 2.1.2
# Register the model in MLFlow. We are NOT building the model here; we log the
# `knn` classifier that was already fitted earlier in this notebook.
with mlflow.start_run() as knn_iris_run:
    # NOTE(review): X contains the rows the model was trained on, so this score is
    # not a held-out estimate.
    score = knn.score(X, y)
    print(f"Score: {score}")
    mlflow.log_metric("score", score)
    predictions = knn.predict(X)
    # Infer the signature from inputs and outputs that belong together
    # (the predictions above were made on X, not on X_test).
    signature = infer_signature(X, predictions)
    model_info = mlflow.sklearn.log_model(
        knn, "model", registered_model_name="KNNIrisModel", signature=signature
    )  ## Log model in MLFlow
    # run_id replaces the deprecated run_uuid attribute
    print(f"Model saved in run: {knn_iris_run.info.run_id}, run_name: {mlflow.active_run().info.run_name}")
Score: 0.98
Registered model 'KNNIrisModel' already exists. Creating a new version of this model... 2023/09/17 17:05:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: KNNIrisModel, version 25
Model saved in run: f846fd2633324456a73f53aef1fd6103, run_name: funny-sow-770
Created version '25' of model 'KNNIrisModel'.
## The model should now be visible in the MLFlow UI.
## Display the URI under which the model was logged.
model_info.model_uri
'runs:/f846fd2633324456a73f53aef1fd6103/model'
## Make predictions with the model addressed by its run URI
# Load the logged model back as a generic PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
## Predict and display
logged_inputs = X_test.loc[test_samples]
results_logged_df = pd.DataFrame({
    "Predictions": loaded_model.predict(logged_inputs),  ## Prediction
    "Actuals": y_test.loc[test_samples].values,
})
results_logged_df
Predictions | Actuals | |
---|---|---|
0 | setosa | setosa |
1 | virginica | virginica |
2 | versicolor | versicolor |
3 | setosa | setosa |
4 | virginica | virginica |
5 | setosa | setosa |
## Capture the version number via model_info, so that the newly created version can be moved into Staging
## Get the current version of the model
from mlflow import MlflowClient
client = MlflowClient()
# Look up the registered model version(s) created by the run that logged the model
filter_string = f"run_id='{model_info.run_id}'"
results = client.search_model_versions(filter_string)
if not results:
    # Fail with a clear message instead of an opaque IndexError on results[0]
    raise RuntimeError(f"No registered model version found for run {model_info.run_id}")
curr_version = results[0].version
print(f"Model URI: {model_info.model_uri}, Model version: {curr_version}")
Model URI: runs:/f846fd2633324456a73f53aef1fd6103/model, Model version: 25
# Move the version captured in curr_version of "KNNIrisModel" into the Staging stage
client.transition_model_version_stage(
    name="KNNIrisModel",
    version=curr_version,
    stage="Staging",
)
<ModelVersion: aliases=[], creation_timestamp=1694988328701, current_stage='Staging', description='', last_updated_timestamp=1694988334019, name='KNNIrisModel', run_id='f846fd2633324456a73f53aef1fd6103', run_link='', source='mlflow-artifacts:/627570918674180859/f846fd2633324456a73f53aef1fd6103/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='25'>
# Test the model in Staging
# Load whichever version of KNNIrisModel is addressed by the Staging stage URI
staged_model = mlflow.pyfunc.load_model(model_uri="models:/KNNIrisModel/Staging")  ## Get Staged Model
## Predict and display
staging_inputs = X_test.loc[test_samples]
results_staging_df = pd.DataFrame({
    "Predictions": staged_model.predict(staging_inputs),  ## Prediction
    "Actuals": y_test.loc[test_samples].values,
})
results_staging_df
Predictions | Actuals | |
---|---|---|
0 | setosa | setosa |
1 | virginica | virginica |
2 | versicolor | versicolor |
3 | setosa | setosa |
4 | virginica | virginica |
5 | setosa | setosa |
# Test the model in Production
# NOTE(review): no visible cell promotes a version to Production - presumably this
# was done outside the notebook (e.g. in the MLflow UI); confirm before rerunning.
prod_model = mlflow.pyfunc.load_model(model_uri="models:/KNNIrisModel/Production")  ## Get Prod Model
## Predict and display
prod_inputs = X_test.loc[test_samples]
results_prod_df = pd.DataFrame({
    "Predictions": prod_model.predict(prod_inputs),  ## Prediction
    "Actuals": y_test.loc[test_samples].values,
})
results_prod_df
Predictions | Actuals | |
---|---|---|
0 | setosa | setosa |
1 | virginica | virginica |
2 | versicolor | versicolor |
3 | setosa | setosa |
4 | virginica | virginica |
5 | setosa | setosa |
## Serve the model by running this command in the conda environment. Once the model is being served, run this cell
# mlflow models serve -m "models:/KNNIrisModel/Production" --port 5002
## The model is then available on port 5002. It looks like we will need a separate port for each model
import requests
import json
# Request payload in the "dataframe_split" layout: the column names plus one
# list of feature values per selected test sample.
message_body = {
    "dataframe_split": {
        "columns": list(X_test),
        "data": X_test.loc[test_samples].values.tolist(),
    }
}
json_object = json.dumps(message_body)
headers = {'Content-Type': 'application/json'}
r = requests.post(
    'http://localhost:5002/invocations',
    headers=headers,
    data=json_object,
    timeout=30,  # fail instead of hanging forever when the model server is not reachable
)  ## Prediction
print(f"Status code: {r.status_code}, Response: {r.text}")
Status code: 200, Response: {"predictions": ["setosa", "virginica", "versicolor", "setosa", "virginica", "setosa"]}
## Display predictions from REST API calls
results_rest_df = pd.DataFrame()
# The response body is a JSON object of the form {"predictions": [...]}; decode it
# and take the list directly. Passing the raw JSON text to pd.read_json is
# deprecated (pandas 2.1+) and produced a DataFrame rather than a plain column.
results_rest_df["Predictions"] = r.json()["predictions"]
results_rest_df["Actuals"] = y_test.loc[test_samples].values
results_rest_df
Predictions | Actuals | |
---|---|---|
0 | setosa | setosa |
1 | virginica | virginica |
2 | versicolor | versicolor |
3 | setosa | setosa |
4 | virginica | virginica |
5 | setosa | setosa |
# Side-by-side comparison of predictions vs actuals for every way the model was invoked.
# Keys become the top level of the column multi-index; each frame supplies the
# 'Predictions' and 'Actuals' sub-columns.
stage_results = {
    'Dev': results_logged_df,
    'Staging': results_staging_df,
    'Prod': results_prod_df,
    'REST API': results_rest_df,
}
# Create a multi-index for columns: (stage, 'Predictions') and (stage, 'Actuals') per stage
columns = pd.MultiIndex.from_tuples([
    (stage, field)
    for stage in stage_results
    for field in ('Predictions', 'Actuals')
])
results_final = pd.DataFrame(columns=columns)
for stage, stage_frame in stage_results.items():
    results_final[stage] = stage_frame
# Report the score computed when the model was logged instead of a hard-coded number,
# so the note cannot drift out of sync with the model.
print(f"\nPredictions vs Actuals at each stage. Note: Model score is {score:.2f}")
print("-"*80)
results_final
Predictions vs Actuals at each stage. Note: Model score is 0.98 --------------------------------------------------------------------------------
Dev | Staging | Prod | REST API | |||||
---|---|---|---|---|---|---|---|---|
Predictions | Actuals | Predictions | Actuals | Predictions | Actuals | Predictions | Actuals | |
0 | setosa | setosa | setosa | setosa | setosa | setosa | setosa | setosa |
1 | virginica | virginica | virginica | virginica | virginica | virginica | virginica | virginica |
2 | versicolor | versicolor | versicolor | versicolor | versicolor | versicolor | versicolor | versicolor |
3 | setosa | setosa | setosa | setosa | setosa | setosa | setosa | setosa |
4 | virginica | virginica | virginica | virginica | virginica | virginica | virginica | virginica |
5 | setosa | setosa | setosa | setosa | setosa | setosa | setosa | setosa |
# Record the Python interpreter version this notebook was run with.
import sys
print(sys.version)
3.10.11 | packaged by Anaconda, Inc. | (main, May 16 2023, 00:55:32) [MSC v.1916 64 bit (AMD64)]