In [20]:
import pandas as pd
# Location of the wine CSV inside the PinkWink/ML_tutorial GitHub repository.
wine_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/wine.csv"
# The first CSV column is consumed as the row index, not as a data column.
wine = pd.read_csv(
    filepath_or_buffer=wine_url,
    index_col=0,
)
wine.head()
| | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | color |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 1 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 1 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 1 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 1 |
Plain text view
fixed acidity volatile acidity citric acid residual sugar chlorides \ 0 7.4 0.70 0.00 1.9 0.076 1 7.8 0.88 0.00 2.6 0.098 2 7.8 0.76 0.04 2.3 0.092 3 11.2 0.28 0.56 1.9 0.075 4 7.4 0.70 0.00 1.9 0.076 free sulfur dioxide total sulfur dioxide density pH sulphates \ 0 11.0 34.0 0.9978 3.51 0.56 1 25.0 67.0 0.9968 3.20 0.68 2 15.0 54.0 0.9970 3.26 0.65 3 17.0 60.0 0.9980 3.16 0.58 4 11.0 34.0 0.9978 3.51 0.56 alcohol quality color 0 9.4 5 1 1 9.8 5 1 2 9.8 5 1 3 9.8 6 1 4 9.4 5 1
맛 등급 추가
In [21]:
# Binary target: 1.0 when quality is above 5, otherwise 0.0.
# The vectorized comparison always produces a float64 column; a Python list that
# mixes `1.` and `0` would let the resulting dtype depend on the data it sees.
wine['taste'] = (wine['quality'] > 5).astype(float)
# Remove the target and the column it was derived from, so the label cannot
# leak into the feature matrix.
X = wine.drop(columns=['taste', 'quality'])
y = wine['taste']
데이터 분리
In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
logistic regression 적용
In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Two-step pipeline: standardize the features, then fit a liblinear logistic
# regression. Keeping the scaler inside the pipeline means it is fitted on the
# training split only and merely applied when predicting.
estimators = [
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='liblinear', random_state=42)),
]
pipe = Pipeline(steps=estimators).fit(X_train, y_train)

# Hard class predictions for both splits, in (train, test) order.
y_pred_tr, y_pred_test = (pipe.predict(split) for split in (X_train, X_test))
print("Train Accuracy: {:.3f}".format(accuracy_score(y_true=y_train, y_pred=y_pred_tr)))
print("Test Accuracy: {:.3f}".format(accuracy_score(y_test, y_pred_test)))
Train Accuracy: 0.742
Test Accuracy: 0.739
decision tree 적용
In [24]:
from sklearn.tree import DecisionTreeClassifier
# Depth-2 decision tree with a fixed seed, fitted on the raw training features
# (no scaling step is applied to this model).
dt = DecisionTreeClassifier(random_state=42, max_depth=2).fit(X_train, y_train)
models = {
'Logistic Regression': pipe,
'Decision Tree': dt
}
ROC AUC 그래프를 이용한 모델간 비교
In [25]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# Draw the ROC curve of every model in `models` on the held-out test split.
plt.figure(figsize=(10, 8))
# Diagonal reference line: the curve of a classifier whose TPR equals its FPR.
plt.plot([0, 1], [0, 1], linestyle='--', label='Random guess')
for model_name, model in models.items():
    # Column 1 of predict_proba is the probability of the positive class (taste == 1).
    pred = model.predict_proba(X_test)[:, 1]
    # roc_curve also returns the thresholds; only FPR and TPR are plotted.
    fpr, tpr = roc_curve(y_test, pred)[:2]
    plt.plot(fpr, tpr, label=model_name)
# Label the axes and title so the figure can be read on its own.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves on the test set')
plt.grid()
plt.legend()
plt.show()
현재 상황에서 두 모델을 비교했을 때, logistic regression 모델이 decision tree 모델보다 낫다는 것을 알 수 있다.