본문 바로가기
데이터분석/인공지능

딥러닝 불균형 데이터 분류 모델 만들기

by code cleaner 2023. 4. 8.
반응형

loss에 라벨별로 각각의 가중치를 주면 됨 (클래스 불균형 보정)

 

 

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Bidirectional, LSTM, GRU, SimpleRNN
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras import Input, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow import random as tf_random, constant_initializer
from tensorflow import convert_to_tensor
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision, Recall, TrueNegatives, TruePositives

from modeling.model_evaluation import evaluation

import random
import numpy as np
import pandas as pd

from datetime import datetime

class BestDLModel:
    """Build and evaluate Keras classifiers for imbalanced data.

    Class imbalance is handled by passing inverse-frequency class weights to
    ``model.fit`` (``class_weight``), so the loss penalizes mistakes on rare
    classes more heavily. Each training method returns the DataFrame produced
    by ``evaluation`` (project helper; schema not visible here).
    """

    def __init__(self, x_train, x_test, y_train, y_test, data_div_nm) -> None:
        # Train/test feature matrices and integer label vectors.
        # Assumes x_* are 2-D (samples, features) and y_* are integer class
        # ids starting at 0 — TODO confirm against caller.
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        # Dataset name; used in checkpoint filenames and metric selection.
        self.data_div_nm = data_div_nm

    def reset_random(self):
        """Seed TensorFlow, NumPy and stdlib RNGs for reproducible runs."""
        random_seed_num = 0
        tf_random.set_seed(random_seed_num)
        np.random.seed(random_seed_num)
        random.seed(random_seed_num)
        # BUGFIX: the original also called constant_initializer() here, which
        # constructs an initializer object and immediately discards it — a
        # no-op with no effect on seeding; removed.

    def _class_weights(self):
        """Return ``{class_id: 1 / count}`` from training-label frequencies.

        Generalized from the original, which hard-coded classes 0 and 1 and
        would raise IndexError for datasets with more classes.
        """
        counts = np.bincount(self.y_train)
        return {label: 1.0 / count for label, count in enumerate(counts)}

    def BasicCLF(self, y_clf_num=2, weight_add=True):
        """Train a fully-connected classifier and return its evaluation DataFrame.

        Parameters
        ----------
        y_clf_num : int
            Number of output units. 1 -> sigmoid output + binary
            cross-entropy on raw labels; otherwise softmax + categorical
            cross-entropy on one-hot labels.
        weight_add : bool
            If True, pass inverse-frequency class weights to ``fit``.
        """
        self.reset_random()
        y_train_label = to_categorical(self.y_train) if y_clf_num != 1 else self.y_train
        final_activation = 'sigmoid' if y_clf_num == 1 else 'softmax'
        loss_select = BinaryCrossentropy() if y_clf_num == 1 else CategoricalCrossentropy()

        model = Sequential([
            Dense(units=256, activation='relu', input_shape=(self.x_train.shape[1],)),
            Dropout(0.1),
            Dense(units=1024, activation='relu'),
            Dropout(0.1),
            Dense(units=512, activation='relu'),
            Dense(units=128, activation='relu'),
            Dense(units=8, activation='relu'),
            Dense(units=y_clf_num, activation=final_activation)])

        # BUGFIX: the original compiled the marketing/binary case with
        # BinaryCrossentropy(from_logits=True) although the final layer
        # already applies a sigmoid; from_logits must be False when the model
        # outputs probabilities, otherwise sigmoid is applied twice inside
        # the loss. The only remaining marketing-specific difference is the
        # extra TrueNegatives metric.
        metrics = [Precision(name='precision'), Recall(name='recall'), 'accuracy']
        if self.data_div_nm == 'marketing' and y_clf_num == 1:
            metrics = [TrueNegatives(thresholds=0)] + metrics
        model.compile(optimizer=Adam(learning_rate=0.01), loss=loss_select, metrics=metrics)

        model.summary()

        es = EarlyStopping(patience=3, monitor='loss')
        mc = ModelCheckpoint(f'./files/{self.data_div_nm}_{y_clf_num}_basicDense_model.h5', save_best_only=True)

        fit_kwargs = dict(epochs=10, batch_size=128, validation_split=0.2,
                          callbacks=[es, mc], verbose=2)
        if weight_add:
            fit_kwargs['class_weight'] = self._class_weights()
        model.fit(self.x_train, y_train_label, **fit_kwargs)

        y_train_pred = model.predict(self.x_train)
        y_test_pred = model.predict(self.x_test)

        if y_clf_num == 1:
            # Threshold sigmoid probabilities at 0.5 (vectorized; replaces
            # the per-element pandas .apply of the original).
            y_train_pred = (y_train_pred.flatten() > 0.5).astype(int).tolist()
            y_test_pred = (y_test_pred.flatten() > 0.5).astype(int).tolist()
        else:
            # argmax replaces the original argsort()[:, ::-1][:, 0] idiom —
            # same "index of largest probability" result, stated directly.
            y_train_pred = np.argmax(y_train_pred, axis=1).tolist()
            y_test_pred = np.argmax(y_test_pred, axis=1).tolist()

        return evaluation(self.y_train, y_train_pred, self.y_test, y_test_pred,
                          model_nm=f'basic_dl_{y_clf_num}')

    def LSTMCLF(self, weight_add=True):
        """Train a Conv1D + bidirectional recurrent binary classifier.

        Features are reshaped to (samples, timesteps, 1) so each scalar
        feature becomes one timestep. Returns the ``evaluation`` DataFrame.

        Parameters
        ----------
        weight_add : bool
            If True, pass inverse-frequency class weights to ``fit``.
        """
        start_time = datetime.now()
        self.reset_random()

        x_train_3d = np.reshape(self.x_train, (self.x_train.shape[0], self.x_train.shape[1], -1))
        x_test_3d = np.reshape(self.x_test, (self.x_test.shape[0], self.x_test.shape[1], -1))

        model = Sequential([
            Conv1D(64, kernel_size=3, input_shape=(x_train_3d.shape[1], x_train_3d.shape[2]), activation='relu'),
            Bidirectional(LSTM(128, return_sequences=True)),
            Bidirectional(SimpleRNN(4, return_sequences=False)),
            Dense(1, activation='sigmoid'),
        ])

        model.compile(optimizer=Adam(learning_rate=0.01), loss=BinaryCrossentropy(),
                      metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])

        es = EarlyStopping(patience=2, monitor='loss')
        mc = ModelCheckpoint(f'./files/{self.data_div_nm}_LSTM_model.h5', save_best_only=True)

        fit_kwargs = dict(epochs=3, batch_size=128, validation_split=0.3,
                          callbacks=[es, mc], verbose=2)
        if weight_add:
            # Unified with BasicCLF via _class_weights (was duplicated inline).
            fit_kwargs['class_weight'] = self._class_weights()
        history = model.fit(x_train_3d, self.y_train, **fit_kwargs)
        print(history.history)
        print(f"모델 훈련 소요 시간 : {datetime.now() - start_time}")

        # Threshold sigmoid probabilities at 0.5.
        y_train_pred = (model.predict(x_train_3d).flatten() > 0.5).astype(int).tolist()
        y_test_pred = (model.predict(x_test_3d).flatten() > 0.5).astype(int).tolist()

        return evaluation(self.y_train, y_train_pred, self.y_test, y_test_pred, model_nm='LSTM')

    def AutoEncoder(self):
        """Train an encoder-bottleneck-decoder shaped dense binary classifier.

        Despite the name this is not a reconstruction autoencoder: it is
        supervised end-to-end against the binary labels, with a single-unit
        head on top of the decoder. Returns the ``evaluation`` DataFrame.
        """
        self.reset_random()

        n_inputs = self.x_train.shape[1]
        # Renamed from `input`, which shadowed the builtin.
        inputs = Input(shape=n_inputs)

        # Encoder: widen then compress back to n_inputs.
        e = Dense(n_inputs * 2, activation='relu')(inputs)
        e = BatchNormalization()(e)
        e = LeakyReLU()(e)
        e = Dense(n_inputs)(e)
        e = BatchNormalization()(e)
        e = LeakyReLU()(e)

        bottleneck = Dense(n_inputs)(e)

        # Decoder: mirror of the encoder.
        d = Dense(n_inputs)(bottleneck)
        d = BatchNormalization()(d)
        d = LeakyReLU()(d)
        d = Dense(n_inputs * 2)(d)
        d = BatchNormalization()(d)
        d = LeakyReLU()(d)

        # BUGFIX: the original used activation='relu' here with a binary
        # cross-entropy loss; BCE expects probabilities in [0, 1], while relu
        # is unbounded above — sigmoid matches the loss and the 0.5 threshold
        # applied below.
        output = Dense(1, activation='sigmoid')(d)

        model = Model(inputs=inputs, outputs=output)

        model.compile(optimizer=Adam(learning_rate=0.01), loss=BinaryCrossentropy(),
                      metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])

        # BUGFIX: mode='max' added — Keras's 'auto' mode minimizes metrics it
        # does not recognize by name, so early stopping on val_precision would
        # otherwise stop when precision *drops*.
        es = EarlyStopping(patience=2, monitor='val_precision', mode='max')
        mc = ModelCheckpoint(f'./files/{self.data_div_nm}_AutoEncoder_model.h5', save_best_only=True)
        model.fit(self.x_train, self.y_train, epochs=3, batch_size=128,
                  validation_split=0.3, callbacks=[es, mc], verbose=2)

        # Threshold sigmoid probabilities at 0.5.
        y_train_pred = (model.predict(self.x_train).flatten() > 0.5).astype(int).tolist()
        y_test_pred = (model.predict(self.x_test).flatten() > 0.5).astype(int).tolist()

        return evaluation(self.y_train, y_train_pred, self.y_test, y_test_pred, model_nm='AUTO-ENCODER')

       

반응형