import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

# データ読み込み
data = pd.read_csv("/home/share/temp/20230301-20240331_site_zaiko_suii.csv", encoding="cp932")

# 特徴量と目的変数の選択
X = data.drop(["SKU", "原価", "商品名", "ブランド名", "メーカー名"], axis=1)
y = X.iloc[:, -1].apply(lambda x: 0 if x < 0 else 1)
X = X.iloc[:, :-1]

# 欠損値を補完するImputerを定義
imputer = SimpleImputer(strategy='mean')

# 欠損値を補完
X = imputer.fit_transform(X)

# オーバーサンプリングとアンダーサンプリングを組み合わせる
smt = SMOTETomek(random_state=42)
X, y = smt.fit_resample(X, y)

# データをトレーニングセットとテストセットに分割（70%をトレーニング、30%をテスト）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ランダムフォレストモデルの定義とトレーニング（トレーニングデータを使用）
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# テストデータで予測
y_pred_test = rf_classifier.predict(X_test)

# テストデータの性能評価
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)
print("\nTest Classification Report:")
print(classification_report(y_test, y_pred_test))

# AUCを計算
# テストデータでの予測確率を取得
y_pred_proba_test = rf_classifier.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba_test)
print("AUC Score:", auc_score)

# 評価指標の定義（Precision, Recall, F1-score）
scoring = {'precision': make_scorer(precision_score), 'recall': make_scorer(recall_score), 'f1_score': make_scorer(f1_score)}

# 層化k分割交差検証を実行
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(rf_classifier, X, y, cv=cv, scoring=scoring)

# 結果の表示
print("Precision:", cv_results['test_precision'])
print("Recall:", cv_results['test_recall'])
print("F1-score:", cv_results['test_f1_score'])

import numpy as np

# 交差検証の結果からPrecision、Recall、F1-scoreの標準偏差を計算
precision_std = np.std(cv_results['test_precision'])
recall_std = np.std(cv_results['test_recall'])
f1_score_std = np.std(cv_results['test_f1_score'])

# 結果の表示
print("Precision Standard Deviation:", precision_std)
print("Recall Standard Deviation:", recall_std)
print("F1-score Standard Deviation:", f1_score_std)

これでLightGBMもくれ

2024年6月8日

LightGBMを使ったコードを以下に示します。LightGBMは高速でありながら精度の高い勾配ブースティングフレームワークであり、大規模なデータセットにも適しています。

python
import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import train_test_split


# Evaluation script for a LightGBM binary classifier.
#
# The original version ran SMOTETomek (and fitted the imputer) on the whole
# dataset before splitting, so synthetic samples derived from held-out rows
# leaked into training and the test/validation sets were themselves resampled.
# Here the data is split first and imputation + resampling live inside an
# imbalanced-learn Pipeline, which applies them to training rows only.

# Load the source CSV (read with the cp932 codec).
data = pd.read_csv("/home/share/temp/20230301-20240331_site_zaiko_suii.csv", encoding="cp932")

# Drop the columns that are not used as features; of what remains, the last
# column is the raw target and every column before it is a feature.
remaining = data.drop(["SKU", "原価", "商品名", "ブランド名", "メーカー名"], axis=1)
# Binary label: 0 when the raw target is negative, 1 otherwise
# (a missing target compares False against `< 0` and therefore maps to 1).
y = remaining.iloc[:, -1].apply(lambda value: 0 if value < 0 else 1)
X = remaining.iloc[:, :-1]

# Hold out 30% for testing BEFORE any fitting or resampling.  Stratifying keeps
# the class ratio of the untouched test set equal to that of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Mean-imputation -> SMOTETomek -> LightGBM, chained so that every step is
# fitted on training data only.  The imblearn Pipeline runs the sampler during
# fit and skips it during predict, so test rows are never resampled.
imputer = SimpleImputer(strategy='mean')
smt = SMOTETomek(random_state=42)
lgb_classifier = LGBMClassifier(random_state=42)
model = Pipeline(
    steps=[
        ("imputer", imputer),
        ("resampler", smt),
        ("classifier", lgb_classifier),
    ]
)
model.fit(X_train, y_train)

# Predict on the untouched hold-out set.
y_pred_test = model.predict(X_test)

# Hold-out performance.
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)
print("\nTest Classification Report:")
print(classification_report(y_test, y_pred_test))

# AUC from the predicted probability of the positive class (label 1).
y_pred_proba_test = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba_test)
print("AUC Score:", auc_score)

# Metrics collected during cross-validation (Precision, Recall, F1-score).
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}

# Stratified 5-fold cross-validation on the original (un-resampled) data.
# cross_validate clones the pipeline per fold, so imputation and resampling
# are re-fitted on each fold's training part and validation rows stay real.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

# Per-fold scores.
print("Precision:", cv_results['test_precision'])
print("Recall:", cv_results['test_recall'])
print("F1-score:", cv_results['test_f1_score'])

# Spread of each metric across the folds.
precision_std = np.std(cv_results['test_precision'])
recall_std = np.std(cv_results['test_recall'])
f1_score_std = np.std(cv_results['test_f1_score'])

print("Precision Standard Deviation:", precision_std)
print("Recall Standard Deviation:", recall_std)
print("F1-score Standard Deviation:", f1_score_std)

このコードでは、LightGBMのLGBMClassifierを使用しています。また、AUCスコアを計算するためにroc_auc_scoreを使用しています。それ以外の部分は、ランダムフォレストの場合とほぼ同じですが、適切にデータを前処理してモデルをトレーニングすることが重要です。

未分類

Posted by ぼっち

スケーリングの観点からみてどう？

LightGBM

コメント一覧

まだ、コメントがありません