반응형
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from modeling.model_evaluation import evaluation
from joblib import dump
import numpy as np
import pandas as pd
from tqdm import tqdm
class BestMLModel:
    """Train a fixed set of classifiers, persist each one, and collect evaluation rows.

    Every fitted estimator is dumped with joblib to
    ``./files/{data_div_nm}_{model name}.joblib``; the ``./files`` directory is
    not created here, so it has to exist before training starts.
    """

    def __init__(self, data_div_nm) -> None:
        """Store the dataset tag used as the prefix of the dumped model files.

        Args:
            data_div_nm: Label identifying the dataset split/division; it is
                interpolated into every output file name.
        """
        # Single seed handed to every estimator that accepts ``random_state``.
        self.random_seed_num = 0
        self.data_div_nm = data_div_nm

    def trainsSpervisedMLModel(self, x_train, x_test, y_train, y_test):
        """Fit, persist and evaluate each supervised classifier.

        The (misspelled) method name is kept as-is because callers use it.

        Args:
            x_train: Training features.
            x_test: Test features.
            y_train: Training labels.
            y_test: Test labels.

        Returns:
            A DataFrame made of whatever ``evaluation`` returns for each model,
            stacked in training order (decision, kneighbors, logistic, mlp,
            xgboost, ensemble).
        """
        clf_decision = DecisionTreeClassifier(random_state=self.random_seed_num, max_depth=5)
        clf_kneighbors = KNeighborsClassifier(n_neighbors=3, weights='distance', leaf_size=50)
        clf_logistic = LogisticRegression(max_iter=3000, random_state=self.random_seed_num)
        clf_mlp = MLPClassifier(solver='lbfgs', alpha=0.05, random_state=self.random_seed_num, max_iter=2000)
        clf_xgb = XGBClassifier(random_state=self.random_seed_num, max_depth=5)
        # Soft voting averages the predicted class probabilities of its members.
        clf_ensemble = VotingClassifier(
            estimators=[('xgboost', clf_xgb), ('kneibors', clf_kneighbors), ('mlp', clf_mlp)],
            voting='soft',
        )
        mlmodels = {
            'decision': clf_decision,
            'kneighbors': clf_kneighbors,
            'logistic': clf_logistic,
            'mlp': clf_mlp,
            'xgboost': clf_xgb,
            'ensemble': clf_ensemble,
        }

        # DataFrame.append was removed in pandas 2.0; gather the per-model
        # frames and concatenate once (this also avoids re-copying the
        # accumulated frame on every iteration).
        model_results = []
        for model_nm, model_obj in tqdm(mlmodels.items(), desc='training ML Model'):
            model_obj.fit(x_train, y_train)
            dump(model_obj, f'./files/{self.data_div_nm}_{model_nm}.joblib')
            y_train_pred = model_obj.predict(x_train)
            y_test_pred = model_obj.predict(x_test)
            model_results.append(
                evaluation(y_train, y_train_pred, y_test, y_test_pred, model_nm=model_nm)
            )
        return pd.concat(model_results)

    def trainSemisupervisedMLModel(self, x_train, x_test, y_train, y_test):
        """Fit, persist and evaluate a self-training k-nearest-neighbours model.

        Args:
            x_train: Training features.
            x_test: Test features.
            y_train: Training labels.
            y_test: Test labels.

        Returns:
            The result of ``evaluation`` for the model named
            ``'semi_supervised'``.
        """
        clf_kneighbors = KNeighborsClassifier(n_neighbors=3, weights='distance', leaf_size=50)
        # NOTE(review): scikit-learn's SelfTrainingClassifier treats samples
        # labelled -1 as unlabeled; confirm that y_train is encoded that way,
        # otherwise this is plain supervised k-NN.
        self_train_model = SelfTrainingClassifier(clf_kneighbors)
        self_train_model.fit(x_train, y_train)
        dump(self_train_model, f'./files/{self.data_div_nm}_semi_supervised.joblib')
        y_train_pred = self_train_model.predict(x_train)
        y_test_pred = self_train_model.predict(x_test)
        model_train_rs_df = evaluation(y_train, y_train_pred, y_test, y_test_pred, model_nm='semi_supervised')
        return model_train_rs_df
if __name__ == "__main__":
    # Script entry point: train and evaluate both model families on one split.
    # NOTE(review): x_train_scl, x_test_scl, y_train and y_test are not defined
    # in the visible part of this file; they must be prepared (presumably a
    # scaled train/test split) before this block can run.
    cls_mlmodel = BestMLModel('test')
    split_args = (x_train_scl, x_test_scl, y_train, y_test)
    model_train_rs_sup = cls_mlmodel.trainsSpervisedMLModel(*split_args)
    model_train_rs_semi_sup = cls_mlmodel.trainSemisupervisedMLModel(*split_args)
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import pandas as pd
def _specificity_sensitivity(y_true, y_pred, labels):
    """Return ``(specificity, sensitivity)`` for a two-label problem, else NaNs.

    The second entry of ``labels`` is treated as the positive class, which is
    how the 2x2 confusion matrix unravels into ``tn, fp, fn, tp``.
    """
    undefined = float("nan")
    if len(labels) != 2:
        # tn/fp/fn/tp are only defined for a 2x2 confusion matrix.
        return undefined, undefined
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=labels).ravel()
    negatives = tn + fp
    positives = tp + fn
    # A split may contain no samples of one class; report NaN instead of
    # dividing by zero.
    specificity = tn / negatives if negatives else undefined
    sensitivity = tp / positives if positives else undefined
    return specificity, sensitivity


def evaluation(y_train_true, y_train_pred, y_test_true, y_test_pred, model_nm):
    """Print classification reports and return the metrics as a one-row DataFrame.

    Args:
        y_train_true: True labels of the training split (any 1-D iterable).
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels for the test split.
        model_nm: Name of the model; used as the index of the returned row.

    Returns:
        A DataFrame with a single row indexed by ``model_nm``. For every metric
        in (train/test) x (precision, recall, f_score, support) there is one
        column per label, named ``"{i}_{metric}"`` where ``i`` is the position
        of the label in sorted order, followed by ``train_specificity``,
        ``train_sensitivity``, ``test_specificity`` and ``test_sensitivity``
        (NaN when the problem is not binary or the rate is undefined).
    """
    print(classification_report(y_train_true, y_train_pred))
    print(classification_report(y_test_true, y_test_pred))

    # Use one shared, sorted label set for every sklearn call so that the
    # train and test metric arrays always have the same length and order,
    # even if a class is missing from one of the splits.
    observed = pd.concat(
        [
            pd.Series(list(values))
            for values in (y_train_true, y_train_pred, y_test_true, y_test_pred)
        ],
        ignore_index=True,
    )
    labels = sorted(observed.unique())

    # Flattened in the order: train precision/recall/f-score/support, then the
    # same four for test -- this must match ``cols`` below.
    model_train_rs_list = []
    for y_true, y_pred in ((y_train_true, y_train_pred), (y_test_true, y_test_pred)):
        for per_label_metric in precision_recall_fscore_support(y_true, y_pred, labels=labels):
            model_train_rs_list.extend(per_label_metric)

    cols = ['train_precision', 'train_recall', 'train_f_score', 'train_support', 'test_precision', 'test_recall', 'test_f_score', 'test_support']
    label_cols = [f"{i}_{col_unit}" for col_unit in cols for i in range(len(labels))]

    # dtype=float keeps the support counts as floats, as they were when the
    # row was built by transposing a single mixed column.
    model_train_rs_df = pd.DataFrame(
        [model_train_rs_list], index=[model_nm], columns=label_cols, dtype=float
    )

    train_specificity, train_sensitivity = _specificity_sensitivity(y_train_true, y_train_pred, labels)
    test_specificity, test_sensitivity = _specificity_sensitivity(y_test_true, y_test_pred, labels)
    model_train_rs_df['train_specificity'] = train_specificity
    model_train_rs_df['train_sensitivity'] = train_sensitivity
    model_train_rs_df['test_specificity'] = test_specificity
    model_train_rs_df['test_sensitivity'] = test_sensitivity
    return model_train_rs_df
반응형
'데이터분석 > 머신러닝' 카테고리의 다른 글
[sklearn] train_test_split 사용하는 방법 및 유의사항 (0) | 2021.05.09 |
---|---|
[회귀분석] 회귀분석 모델 한 번에 돌려서 가장 좋은 성능 모델 값 뽑기 (0) | 2020.02.24 |
머신러닝/딥러닝 데이터셋 얻을 수 있는 링크 모음 (0) | 2019.05.07 |
[모델 선택하기] 머신러닝(지도학습,비지도학습,강화학습)/딥러닝 (0) | 2018.07.19 |
[기초개념] 데이터 분석 관점에서 한줄로 정리한 '머신러닝 딥러닝 데이터 분석을 하기 위해 꼭 알아야할 기본 개념' (0) | 2018.05.28 |