데이터분석/머신러닝
머신러닝 데이터 분류모델 훈련 시키기
code cleaner
2023. 4. 8. 14:24
반응형
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from modeling.model_evaluation import evaluation
from joblib import dump
import numpy as np
import pandas as pd
from tqdm import tqdm
class BestMLModel:
    """Train, persist and evaluate a fixed set of classifiers for one dataset.

    Every fitted estimator is written with ``joblib.dump`` to
    ``./files/<data_div_nm>_<model name>.joblib``.
    """

    def __init__(self, data_div_nm) -> None:
        """Remember the dataset tag used in the dumped file names.

        Args:
            data_div_nm: Text interpolated into every saved model's file name.
        """
        # Passed as ``random_state`` to each estimator that accepts one.
        self.random_seed_num = 0
        self.data_div_nm = data_div_nm

    # The misspelled method name is kept as-is because callers use it.
    def trainsSpervisedMLModel(self, x_train, x_test, y_train, y_test):
        """Fit six supervised classifiers and gather their evaluation results.

        Each model is fitted on the training split, dumped to disk, used to
        predict both splits, and scored with ``evaluation``.

        Args:
            x_train: Training features.
            x_test: Test features.
            y_train: Training labels.
            y_test: Test labels.

        Returns:
            The per-model results from ``evaluation`` concatenated into one
            DataFrame, in the order the models are trained.
        """
        seed = self.random_seed_num
        clf_decision = DecisionTreeClassifier(random_state=seed, max_depth=5)
        clf_kneighbors = KNeighborsClassifier(
            n_neighbors=3, weights='distance', leaf_size=50
        )
        clf_logistic = LogisticRegression(max_iter=3000, random_state=seed)
        clf_mlp = MLPClassifier(
            solver='lbfgs', alpha=0.05, random_state=seed, max_iter=2000
        )
        clf_xgb = XGBClassifier(random_state=seed, max_depth=5)
        clf_ensemble = VotingClassifier(
            estimators=[
                ('xgboost', clf_xgb),
                ('kneibors', clf_kneighbors),
                ('mlp', clf_mlp),
            ],
            voting='soft',
        )
        mlmodels = {
            'decision': clf_decision,
            'kneighbors': clf_kneighbors,
            'logistic': clf_logistic,
            'mlp': clf_mlp,
            'xgboost': clf_xgb,
            'ensemble': clf_ensemble,
        }

        # DataFrame.append was removed in pandas 2.0; collect the pieces and
        # concatenate once instead of growing a frame inside the loop.
        results = []
        for model_nm, model_obj in tqdm(mlmodels.items(), desc='training ML Model'):
            model_obj.fit(x_train, y_train)
            dump(model_obj, f'./files/{self.data_div_nm}_{model_nm}.joblib')
            y_train_pred = model_obj.predict(x_train)
            y_test_pred = model_obj.predict(x_test)
            results.append(
                evaluation(y_train, y_train_pred, y_test, y_test_pred, model_nm=model_nm)
            )
        return pd.concat(results)

    def trainSemisupervisedMLModel(self, x_train, x_test, y_train, y_test):
        """Fit a self-training k-NN classifier and evaluate it.

        The fitted model is dumped to
        ``./files/<data_div_nm>_semi_supervised.joblib``.

        Args:
            x_train: Training features.
            x_test: Test features.
            y_train: Training labels.
            y_test: Test labels.

        Returns:
            Whatever ``evaluation`` returns for the model name
            ``'semi_supervised'``.
        """
        # NOTE(review): SelfTrainingClassifier presumably expects unlabeled
        # rows to be marked in y_train; the same y_train is also used as the
        # ground truth for scoring below -- confirm that is intended.
        clf_kneighbors = KNeighborsClassifier(
            n_neighbors=3, weights='distance', leaf_size=50
        )
        self_train_model = SelfTrainingClassifier(clf_kneighbors)
        self_train_model.fit(x_train, y_train)
        dump(self_train_model, f'./files/{self.data_div_nm}_semi_supervised.joblib')
        y_train_pred = self_train_model.predict(x_train)
        y_test_pred = self_train_model.predict(x_test)
        return evaluation(
            y_train, y_train_pred, y_test, y_test_pred, model_nm='semi_supervised'
        )
if __name__ == "__main__":
    # Example driver: train both model families on one pre-split dataset.
    # NOTE(review): x_train_scl, x_test_scl, y_train and y_test are not
    # defined anywhere in the visible file -- they must be created before this
    # block can run; confirm where they are meant to come from.
    cls_mlmodel = BestMLModel('test')
    split_data = (x_train_scl, x_test_scl, y_train, y_test)
    model_train_rs_sup = cls_mlmodel.trainsSpervisedMLModel(*split_data)
    model_train_rs_semi_sup = cls_mlmodel.trainSemisupervisedMLModel(*split_data)
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import pandas as pd
def evaluation(y_train_true, y_train_pred, y_test_true, y_test_pred, model_nm):
    """Summarise train/test classification metrics as a one-row DataFrame.

    Prints a ``classification_report`` for each split, then collects per-class
    precision, recall, F-score and support together with the binary
    specificity and sensitivity of each split.

    Args:
        y_train_true: True labels of the training split (any 1-D array-like).
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels for the test split.
        model_nm: Used as the index label of the returned row.

    Returns:
        A single-row DataFrame indexed by ``model_nm``. Per-class columns are
        named ``"<i>_<split>_<metric>"`` where ``i`` is the position of the
        class among the sorted training labels; they are followed by
        ``train_specificity``, ``train_sensitivity``, ``test_specificity`` and
        ``test_sensitivity``.

    Raises:
        ValueError: If the training labels do not hold exactly two classes,
            since specificity/sensitivity need a 2x2 confusion matrix.
    """
    print(classification_report(y_train_true, y_train_pred))
    print(classification_report(y_test_true, y_test_pred))

    # Fix the label set once and hand it to every metric call so all metric
    # arrays have the same length and order as the generated column names,
    # even when a split or its predictions lack one of the classes.
    labels = sorted(pd.Series(y_train_true).unique())
    if len(labels) != 2:
        raise ValueError(
            f"evaluation expects exactly 2 classes in y_train_true, got {len(labels)}"
        )

    def _split_metrics(y_true, y_pred):
        # Return (flat per-class metric list, specificity, sensitivity).
        precision, recall, f_score, support = precision_recall_fscore_support(
            y_true, y_pred, labels=labels
        )
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=labels).ravel()
        per_class = [*precision, *recall, *f_score, *support]
        return per_class, tn / (tn + fp), tp / (tp + fn)

    train_values, train_specificity, train_sensitivity = _split_metrics(
        y_train_true, y_train_pred
    )
    test_values, test_specificity, test_sensitivity = _split_metrics(
        y_test_true, y_test_pred
    )

    # Same ordering as the value list: split -> metric -> class position.
    label_cols = [
        f"{i}_{split}_{metric}"
        for split in ('train', 'test')
        for metric in ('precision', 'recall', 'f_score', 'support')
        for i in range(len(labels))
    ]

    model_train_rs_df = pd.DataFrame(data={model_nm: train_values + test_values}).T
    model_train_rs_df.columns = label_cols

    binary_rates = {
        'train_specificity': train_specificity,
        'train_sensitivity': train_sensitivity,
        'test_specificity': test_specificity,
        'test_sensitivity': test_sensitivity,
    }
    for col_name, rate in binary_rates.items():
        model_train_rs_df[col_name] = rate
    return model_train_rs_df
반응형