
## Keras Car Fuel Efficiency (MPG) Prediction Model

- Source : https://www.tensorflow.org/tutorials/keras/regression

### Module imports



import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Model
from tensorflow.keras.utils import get_file, plot_model

sns.set(style='white')
plt.style.use('seaborn-white')

### Loading the data

dataset_path = get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

### Inspecting the data

raw_dataset = pd.read_csv(dataset_path, names=column_names,
                          na_values='?', comment='\t',
                          sep=" ", skipinitialspace=True)

dataset = raw_dataset.copy()
dataset

### Data preprocessing


- Some values in this dataset are missing

dataset.isna().sum()  # count missing (NA) values per column

- Drop the rows with missing values

dataset = dataset.dropna()

- "Origin" 범주형 데이터
  - 원-핫 인코딩(one-hot encoding) 진행
dataset["Origin"].unique()

origin = dataset.pop('Origin')

dataset['USA'] = (origin == 1) * 1.0
dataset['Europe'] = (origin == 2) * 1.0
dataset['Japan'] = (origin == 3) * 1.0
dataset
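The same one-hot columns can also be produced directly with pandas. A minimal sketch (not used in the rest of this notebook); the variable `dataset_alt` is illustrative only and starts from a fresh, NA-dropped copy of `raw_dataset`:

# Hypothetical alternative: build the one-hot columns with pd.get_dummies
# instead of the manual indicator columns above.
dataset_alt = raw_dataset.dropna().copy()
dataset_alt['Origin'] = dataset_alt['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset_alt = pd.get_dummies(dataset_alt, columns=['Origin'], prefix='', prefix_sep='')
dataset_alt.head()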

#### Creating the training and test datasets

train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
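A quick sanity check on the 80/20 split (optional; the exact counts depend on how many rows were dropped above):

# Confirm the roughly 80/20 split between training and test rows
print(len(train_dataset), 'train examples /', len(test_dataset), 'test examples')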

#### Exploring the data

sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Horsepower", "Weight"]], diag_kind="kde");

train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_stats

#### Separating features and labels

train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

#### Normalizing the data

def normalization(x):
    return (x - train_stats['mean']) / train_stats['std']

# Note: the test data is normalized with the statistics of the training set
normed_train_data = normalization(train_dataset)
normed_test_data = normalization(test_dataset)
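As a quick, purely illustrative check, the normalized training features should now have a mean close to 0 and a standard deviation close to 1:

# Sanity check: normalized features should have mean ~0 and std ~1
normed_train_data.describe().transpose()[['mean', 'std']]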


### Building the model

def build_model():
    # Input layer: one feature column per key in train_dataset
    inputs = Input(shape=(len(train_dataset.keys()),), name='input')
    hidden1 = Dense(64, activation='relu', name='dense1')(inputs)
    hidden2 = Dense(64, activation='relu', name='dense2')(hidden1)
    output = Dense(1, name='output')(hidden2)

    model = Model(inputs=[inputs], outputs=output)

    model.compile(loss='mse',
                  optimizer=RMSprop(learning_rate=0.001),
                  metrics=['mae', 'mse'])

    return model

model = build_model()

model.summary()

plot_model(model)
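The same architecture can also be expressed with the Sequential API; a minimal equivalent sketch (not used below):

# Hypothetical equivalent of build_model() using the Sequential API
from tensorflow.keras.models import Sequential

def build_model_sequential():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(len(train_dataset.keys()),)),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(loss='mse',
                  optimizer=RMSprop(learning_rate=0.001),
                  metrics=['mae', 'mse'])
    return model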

### Checking the model on a sample batch

sample_batch = normed_train_data[:10]
sample_result = model.predict(sample_batch)
sample_result

### Training the model

epochs = 1000

history = model.fit(normed_train_data, train_labels,
                    epochs=epochs, validation_split=0.2)
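Training for 1000 epochs prints a long per-epoch log. One common option (a sketch only, not what was run above) is to silence the log and print a single progress dot per epoch with a small callback:

# Hypothetical quieter training run: suppress per-epoch logs, print one dot per epoch
class PrintDot(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:
            print('')
        print('.', end='')

# history = model.fit(normed_train_data, train_labels, epochs=epochs,
#                     validation_split=0.2, verbose=0, callbacks=[PrintDot()])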

### Visualizing the training history

history.history.keys()

hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist

def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure(figsize=(12,6))

    plt.subplot(1,2,1)
    plt.xlabel('Epochs')
    plt.ylabel('Mean Absolute Error')
    plt.plot(hist['epoch'], hist['mae'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mae'], label='Val Error')
    plt.ylim([0,5])
    plt.legend()

    plt.subplot(1,2,2)
    plt.xlabel('Epochs')
    plt.ylabel('Mean Squared Error')
    plt.plot(hist['epoch'], hist['mse'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mse'], label='Val Error')
    plt.ylim([0,20])
    plt.legend()

    plt.show()

plot_history(history)

### Regularization with EarlyStopping

from tensorflow.keras.callbacks import EarlyStopping

model = build_model()

early_stop = EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(normed_train_data, train_labels, epochs=epochs,
                    validation_split=0.2, callbacks=[early_stop])

plot_history(history)
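EarlyStopping halts training once `val_loss` has not improved for `patience` epochs, which is why the curves above stop well before 1000 epochs. If the weights from the best epoch (rather than the last one) are wanted, the callback can also restore them; a sketch of that variant, not what was run above:

# Hypothetical variant: also roll the model back to the best-validation-loss weights
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)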

### Evaluating the model

loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)

print("Test set Mean Absolute Error: {:.2f} MPG".format(mae))

### Making predictions with the trained model

test_pred = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_pred)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.axis('equal')
plt.axis('square')
plt.grid()
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
plt.plot([-100,100], [-100,100])

error = test_pred - test_labels
plt.hist(error, bins=30)
plt.xlabel('Prediction Error')
plt.grid()
plt.ylabel('Count')