본문 바로가기
데이터분석/머신러닝

머신러닝 데이터 분류 모델 훈련시키기

by code cleaner 2023. 4. 8.
반응형
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier

from sklearn.semi_supervised import SelfTrainingClassifier

from modeling.model_evaluation import evaluation

from joblib import dump

import numpy as np
import pandas as pd

from tqdm import tqdm

class BestMLModel:
    """Train a fixed set of classifiers, persist them, and collect their metrics.

    Every fitted model is written with ``joblib.dump`` to
    ``./files/{data_div_nm}_{model_name}.joblib`` (relative to the current
    working directory), and scored with the project's ``evaluation`` helper.
    """

    def __init__(self, data_div_nm) -> None:
        """Store the dataset tag used in output file names.

        Args:
            data_div_nm: Label embedded in every saved model's file name.
        """
        # Shared seed so the seeded estimators are reproducible across runs.
        self.random_seed_num = 0
        self.data_div_nm = data_div_nm

    # NOTE: the method name keeps its original spelling ("trainsSpervised")
    # because existing callers reference it.
    def trainsSpervisedMLModel(self, x_train, x_test, y_train, y_test):
        """Fit, save and evaluate the supervised classifiers.

        Trains a decision tree, k-nearest neighbours, logistic regression,
        MLP, XGBoost, and a soft-voting ensemble of XGBoost + kNN + MLP.

        Args:
            x_train: Training features.
            x_test: Test features.
            y_train: Training labels.
            y_test: Test labels.

        Returns:
            pandas.DataFrame made by stacking the frame that ``evaluation``
            returns for each model, in training order.
        """
        clf_decision = DecisionTreeClassifier(random_state=self.random_seed_num, max_depth=5)
        clf_kneighbors = KNeighborsClassifier(n_neighbors=3, weights='distance', leaf_size=50)
        clf_logistic = LogisticRegression(max_iter=3000, random_state=self.random_seed_num)
        clf_mlp = MLPClassifier(solver='lbfgs', alpha=0.05, random_state=self.random_seed_num, max_iter=2000)
        clf_xgb = XGBClassifier(random_state=self.random_seed_num, max_depth=5)
        clf_ensemble = VotingClassifier(
            estimators=[('xgboost', clf_xgb), ('kneibors', clf_kneighbors), ('mlp', clf_mlp)],
            voting='soft',
        )

        mlmodels = {
            'decision': clf_decision,
            'kneighbors': clf_kneighbors,
            'logistic': clf_logistic,
            'mlp': clf_mlp,
            'xgboost': clf_xgb,
            'ensemble': clf_ensemble,
        }

        # Collect the per-model result frames and concatenate once at the end.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement and avoids re-copying the frame every iteration.
        model_results = []

        for model_nm, model_obj in tqdm(mlmodels.items(), desc='training ML Model'):
            model_obj.fit(x_train, y_train)

            dump(model_obj, f'./files/{self.data_div_nm}_{model_nm}.joblib')

            y_train_pred = model_obj.predict(x_train)
            y_test_pred = model_obj.predict(x_test)
            model_results.append(
                evaluation(y_train, y_train_pred, y_test, y_test_pred, model_nm=model_nm)
            )

        return pd.concat(model_results)

    def trainSemisupervisedMLModel(self, x_train, x_test, y_train, y_test):
        """Fit, save and evaluate a self-training k-nearest-neighbours model.

        Args:
            x_train: Training features.
            x_test: Test features.
            y_train: Training labels.
                NOTE(review): scikit-learn's SelfTrainingClassifier documents
                -1 as the marker for unlabeled rows; if y_train carries -1,
                the binary-only ``evaluation`` helper will likely reject it —
                confirm how callers prepare the labels.
            y_test: Test labels.

        Returns:
            The frame returned by ``evaluation`` for the 'semi_supervised' model.
        """
        clf_kneighbors = KNeighborsClassifier(n_neighbors=3, weights='distance', leaf_size=50)
        self_train_model = SelfTrainingClassifier(clf_kneighbors)
        self_train_model.fit(x_train, y_train)
        dump(self_train_model, f'./files/{self.data_div_nm}_semi_supervised.joblib')

        y_train_pred = self_train_model.predict(x_train)
        y_test_pred = self_train_model.predict(x_test)

        model_train_rs_df = evaluation(y_train, y_train_pred, y_test, y_test_pred, model_nm='semi_supervised')

        return model_train_rs_df



# Script entry point: trains the supervised and the semi-supervised models
# under the dataset tag 'test' and keeps both result frames.
# NOTE(review): x_train_scl, x_test_scl, y_train and y_test are not defined in
# the visible source — presumably scaled train/test splits prepared elsewhere.
# Unless they are defined before this point, running this raises NameError.
if __name__=="__main__":
    cls_mlmodel = BestMLModel('test')
    model_train_rs_sup = cls_mlmodel.trainsSpervisedMLModel(x_train_scl, x_test_scl, y_train, y_test)
    model_train_rs_semi_sup = cls_mlmodel.trainSemisupervisedMLModel(x_train_scl, x_test_scl, y_train, y_test)

 

from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

import pandas as pd

def evaluation(y_train_true, y_train_pred, y_test_true, y_test_pred, model_nm):
    """Build a one-row metrics summary for a binary classifier.

    Prints scikit-learn's classification report for the train and test
    splits, then returns per-class precision / recall / F-score / support
    plus specificity and sensitivity for both splits.

    Args:
        y_train_true: True labels of the training split (Series, array or list).
        y_train_pred: Predicted labels of the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels of the test split.
        model_nm: Model name, used as the index of the returned row.

    Returns:
        pandas.DataFrame with a single row indexed by ``model_nm``. Per-class
        columns are named ``"{i}_{metric}"`` where ``i`` is the position of the
        class in sorted label order; the last four columns are
        ``train_specificity``, ``train_sensitivity``, ``test_specificity`` and
        ``test_sensitivity``.

    Raises:
        ValueError: If the true labels do not contain exactly two classes
            (specificity/sensitivity are read from a 2x2 confusion matrix).
    """
    print(classification_report(y_train_true, y_train_pred))
    print(classification_report(y_test_true, y_test_pred))

    # Fix one label set up front and pass it to every metric call. Otherwise
    # each call infers labels from its own inputs, and a class missing from
    # one split makes the metric arrays and the column names disagree.
    # Wrapping in pd.Series lets plain arrays and lists work as well.
    labels = sorted(
        set(pd.Series(y_train_true).unique()) | set(pd.Series(y_test_true).unique())
    )
    if len(labels) != 2:
        raise ValueError(
            f"evaluation supports binary classification only; got labels {labels}"
        )

    splits = (
        ('train', y_train_true, y_train_pred),
        ('test', y_test_true, y_test_pred),
    )

    model_train_rs_list = []
    rate_values = {}
    for split_nm, y_true, y_pred in splits:
        precision, recall, f_score, support = precision_recall_fscore_support(
            y_true, y_pred, labels=labels
        )
        model_train_rs_list += list(precision) + list(recall) + list(f_score) + list(support)

        # With two labels the flattened matrix is ordered tn, fp, fn, tp,
        # treating the second (larger) label as the positive class.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=labels).ravel()
        rate_values[f'{split_nm}_specificity'] = tn / (tn + fp)
        rate_values[f'{split_nm}_sensitivity'] = tp / (tp + fn)

    cols = ['train_precision', 'train_recall', 'train_f_score', 'train_support', 'test_precision', 'test_recall', 'test_f_score', 'test_support']
    # Column order mirrors model_train_rs_list: for each metric, one entry per class.
    label_cols = [f"{i}_{col_unit}" for col_unit in cols for i in range(len(labels))]

    model_train_rs_df = pd.DataFrame(data={model_nm: model_train_rs_list}).T
    model_train_rs_df.columns = label_cols
    # Assign the scalar rates one column at a time; this does not depend on
    # how a given pandas version broadcasts a tuple over several new columns.
    for rate_nm, rate_val in rate_values.items():
        model_train_rs_df[rate_nm] = rate_val

    return model_train_rs_df

 

반응형