from metaflow import FlowSpec, step, Flow, current
class MyFlow(FlowSpec):
    """Train a review model and compare it with an all-ones baseline.

    The flow branches after ``start`` into ``baseline`` and ``train``,
    merges both branches in ``join`` (where the trained model is scored on
    a holdout set and smoke-tested), and finishes in ``end``.
    """

    @step
    def start(self):
        """Read the training data and fan out to the two branches."""
        import pandas as pd

        self.df = pd.read_parquet('train.parquet')
        print(f'num of rows: {self.df.shape[0]}')
        self.next(self.baseline, self.train)

    @step
    def baseline(self):
        """Compute accuracy and ROC AUC for a constant prediction of 1."""
        from sklearn.metrics import accuracy_score, roc_auc_score

        # One constant prediction (1) per training row.
        baseline_predictions = [1] * self.df.shape[0]
        self.base_acc = accuracy_score(self.df.labels, baseline_predictions)
        self.base_rocauc = roc_auc_score(self.df.labels, baseline_predictions)
        self.next(self.join)

    @step
    def train(self):
        """Vectorize the reviews and fit the model returned by ``get_model``."""
        # NOTE(review): `tf`, `accuracy_score` and `roc_auc_score` are not
        # referenced by name in this step; the imports are kept as they were
        # in case their import side effects matter - confirm before removing.
        import tensorflow as tf
        from tensorflow.keras.utils import set_random_seed
        from sklearn.metrics import accuracy_score, roc_auc_score
        from sklearn.feature_extraction.text import CountVectorizer
        from model import get_model

        # Fix the random seed before any model is built.
        set_random_seed(2022)

        self.cv = CountVectorizer(
            min_df=.005,
            max_df=.75,
            stop_words='english',
            strip_accents='ascii',
        )
        res = self.cv.fit_transform(self.df['review'])

        # The model is sized by the fitted vocabulary length.
        self.model = get_model(len(self.cv.vocabulary_))
        self.model.fit(
            x=res.toarray(),
            y=self.df['labels'],
            batch_size=32,
            epochs=10,
            validation_split=.2,
        )
        self.next(self.join)

    @step
    def join(self, inputs):
        """Compare the model results with the baseline.

        Scores the trained model on ``holdout.parquet``, prints the baseline
        and model metrics, runs a two-example smoke test, and tags the
        current run ``deployment_candidate`` when the model both beats the
        baseline ROC AUC and passes the smoke test.

        Args:
            inputs: The joined branches; ``inputs.train`` and
                ``inputs.baseline`` expose the artifacts those steps stored.
        """
        # NOTE(review): `tf`, `layers`, `optimizers`, `regularizers` and
        # `CountVectorizer` are not referenced by name in this step;
        # presumably they are imported so the libraries are loaded before the
        # stored model/vectorizer artifacts are read - confirm before removing.
        import tensorflow as tf
        from tensorflow.keras import layers, optimizers, regularizers
        from sklearn.metrics import accuracy_score, roc_auc_score
        from sklearn.feature_extraction.text import CountVectorizer
        import pandas as pd

        # Carry the training branch's artifacts forward past the join.
        self.model = inputs.train.model
        self.cv = inputs.train.cv
        self.train_df = inputs.train.df

        self.holdout_df = pd.read_parquet('holdout.parquet')
        self.predictions = self.model.predict(
            self.cv.transform(self.holdout_df['review']).toarray()
        )
        labels = self.holdout_df['labels']

        # Accuracy uses predictions thresholded at 0.5; AUC uses raw scores.
        self.model_acc = accuracy_score(labels, self.predictions > .5)
        self.model_rocauc = roc_auc_score(labels, self.predictions)

        print(f'Baseline Acccuracy: {inputs.baseline.base_acc:.2%}')
        print(f'Baseline AUC: {inputs.baseline.base_rocauc:.2}')
        print(f'Model Acccuracy: {self.model_acc:.2%}')
        print(f'Model AUC: {self.model_rocauc:.2}')

        self.beats_baseline = self.model_rocauc > inputs.baseline.base_rocauc
        print(f'Model beats baseline (T/F): {self.beats_baseline}')

        # Smoke test to make sure the model is doing the right thing on
        # obvious examples: the first review must score below 0.5 and the
        # second above 0.5.
        _tst_reviews = [
            "poor fit its baggy in places where it isn't supposed to be.",
            "love it, very high quality and great value",
        ]
        _tst_preds = self.model.predict(
            self.cv.transform(_tst_reviews).toarray()
        )
        self.passed_smoke_test = (
            _tst_preds[0][0] < .5 and _tst_preds[1][0] > .5
        )
        print(f'Model passed smoke test (T/F): {self.passed_smoke_test}')

        if self.beats_baseline and self.passed_smoke_test:
            # Tag this run so it can be found later as a deployment candidate.
            run = Flow(current.flow_name)[current.run_id]
            run.add_tag('deployment_candidate')

        # The transition is unconditional; only the tagging is gated.
        self.next(self.end)

    @step
    def end(self):
        """Terminal step; performs no work."""
# Script entry point: instantiate the flow only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    MyFlow()
# Notebook cell output (not part of the program): Overwriting flow.py