## Keras Auto Fuel Efficiency (MPG) Regression Model
- Source: https://www.tensorflow.org/tutorials/keras/regression
### Module Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Model
from tensorflow.keras.utils import get_file, plot_model
sns.set(style='white')
plt.style.use('seaborn-white')
### Loading the Data
dataset_path = get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']
### Inspecting the Data
raw_dataset = pd.read_csv(dataset_path, names=column_names,
na_values='?', comment='\t',
sep=" ", skipinitialspace=True)
dataset=raw_dataset.copy()
dataset
### Data Preprocessing
- The dataset contains some missing values
dataset.isna().sum()  # count missing (not available) values per column
- Drop the rows with missing values
dataset = dataset.dropna()
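- As an alternative to dropping rows (not part of the original tutorial), the missing values could be imputed. A minimal sketch on a fresh copy, filling Horsepower with its column mean:
imputed = raw_dataset.copy()
imputed['Horsepower'] = imputed['Horsepower'].fillna(imputed['Horsepower'].mean())
imputed.isna().sum()  # Horsepower should now report 0 missing values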
- "Origin" 범주형 데이터
- 원-핫 인코딩(one-hot encoding) 진행
dataset["Origin"].unique()
origin = dataset.pop('Origin')
dataset['USA'] = (origin==1)*1.0
dataset['Europe'] = (origin==2)*1.0
dataset['Japan'] = (origin==3)*1.0
dataset
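- For reference, the same one-hot columns can be produced directly with pandas. A minimal sketch (not in the original notebook), rebuilt from the raw data since 'Origin' has already been popped from dataset:
encoded = raw_dataset.dropna().copy()
encoded['Origin'] = encoded['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
pd.get_dummies(encoded, columns=['Origin'], prefix='', prefix_sep='').head()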
#### Splitting the Data into Training and Test Sets
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
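- Quick sanity check (an added sketch, not in the original): the two splits should be disjoint and together cover the whole dataset.
print(len(train_dataset), len(test_dataset), len(dataset))
assert train_dataset.index.intersection(test_dataset.index).empty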
#### Exploring the Data
sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Horsepower", "Weight"]], diag_kind="kde");
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_stats
#### Separating Features and Labels
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
#### Data Normalization
def normalization(x):
    return (x - train_stats['mean']) / train_stats['std']
normed_train_data = normalization(train_dataset)
normed_test_data = normalization(test_dataset)
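- Sanity check (an added sketch): after normalization the training features should have mean close to 0 and standard deviation close to 1.
normed_train_data.describe().loc[['mean', 'std']].round(3)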
### Building the Model
def build_model():
    # shape must be a tuple, hence the trailing comma
    input = Input(shape=(len(train_dataset.keys()),), name='input')
    hidden1 = Dense(64, activation='relu', name='dense1')(input)
    hidden2 = Dense(64, activation='relu', name='dense2')(hidden1)
    output = Dense(1, name='output')(hidden2)
    model = Model(inputs=[input], outputs=output)
    model.compile(loss='mse',
                  optimizer=RMSprop(learning_rate=0.001),
                  metrics=['mae', 'mse'])
    return model
model = build_model()
model.summary()
plot_model(model)
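- The original TensorFlow tutorial builds the same network with the Sequential API; a minimal equivalent sketch for comparison:
from tensorflow.keras.models import Sequential
def build_model_sequential():
    model = Sequential([
        Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(loss='mse', optimizer=RMSprop(learning_rate=0.001), metrics=['mae', 'mse'])
    return model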
### Checking Sample Data
sample_batch = normed_train_data[:10]
sample_result = model.predict(sample_batch)
sample_result
### Training the Model
epochs = 1000
history = model.fit(normed_train_data, train_labels,
epochs=epochs, validation_split=0.2)
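- With 1000 epochs the default per-epoch logging gets very long. The original tutorial uses a small PrintDot callback together with verbose=0; a sketch of that approach:
from tensorflow.keras.callbacks import Callback

class PrintDot(Callback):
    # print one dot per epoch and start a new line every 100 epochs
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:
            print('')
        print('.', end='')

# usage: model.fit(..., verbose=0, callbacks=[PrintDot()])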
### Visualizing the Training History
history.history.keys()
hist = pd.DataFrame(history.history)
hist['epoch']=history.epoch
hist
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.xlabel('Epochs')
    plt.ylabel('Mean Absolute Error')
    plt.plot(hist['epoch'], hist['mae'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mae'], label='Val Error')
    plt.ylim([0, 5])
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.xlabel('Epochs')
    plt.ylabel('Mean Squared Error')
    plt.plot(hist['epoch'], hist['mse'], label='Train Error')
    plt.plot(hist['epoch'], hist['val_mse'], label='Val Error')
    plt.ylim([0, 20])
    plt.legend()

    plt.show()
plot_history(history)
### Regularization with EarlyStopping
from tensorflow.keras.callbacks import EarlyStopping
model = build_model()
early_stop = EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(normed_train_data, train_labels, epochs=epochs,
validation_split=0.2, callbacks=[early_stop])
plot_history(history)
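- EarlyStopping can also roll the model back to the weights of the best epoch via restore_best_weights (a standard Keras option, shown here as a sketch for a rerun):
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)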
### Evaluating the Model
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)
print(mae)
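- Since MAE is in the same units as the label, it can be reported directly in MPG (formatting sketch):
print("Test set Mean Abs Error: {:5.2f} MPG".format(mae))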
### Predictions with the Trained Model
test_pred = model.predict(normed_test_data).flatten()
plt.scatter(test_labels, test_pred)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.grid()
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
plt.plot([-100,100], [-100,100])
error = test_pred - test_labels
plt.hist(error, bins=30)
plt.xlabel('Prediction Error [MPG]')
plt.grid()
plt.ylabel('Count')
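- A rough error summary (an added sketch): the mean error should be close to zero if the model is unbiased.
print("error mean: {:.3f} MPG, std: {:.3f} MPG".format(error.mean(), error.std()))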