-
Notifications
You must be signed in to change notification settings - Fork 0
KR_Scikit
somaz edited this page Mar 11, 2025
·
4 revisions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd
# Prepare the data: a synthetic 100-sample, 4-feature matrix with binary labels.
# A seeded generator keeps the example reproducible, in line with the
# random_state=42 used for the split below.
rng = np.random.default_rng(42)
X = rng.random((100, 4))  # features
y = rng.integers(0, 2, 100)  # labels, 0 or 1 (upper bound is exclusive)

# Split the data: hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess: fit the scaler on the training split only, then reuse that
# fitted scaler on the test split so both are transformed the same way.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ✅ Features:
- 데이터 분할
- 스케일링
- 전처리
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Each model below is fitted on the scaled training split and then used to
# predict labels for the scaled test split.

# Logistic regression
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

# Decision tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)
dt_pred = dt_model.predict(X_test_scaled)

# Random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# SVM with an RBF kernel
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)

# ✅ Features:
- 다양한 모델
- 모델 학습
- 예측
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardize the full feature matrix. This section is unsupervised, so it
# works on all of X rather than on the train/test split. X_scaled was used
# below without ever being defined.
X_scaled = StandardScaler().fit_transform(X)

# K-means clustering: assign every sample to one of three clusters.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# PCA: project the scaled features onto two components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualize the reduced data, colouring each point by its cluster label.
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters)
plt.title('PCA로 축소된 클러스터')
plt.show()

# ✅ Features:
- 클러스터링
- 차원 축소
- 시각화
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
# Cross-validation: score the random forest with 5-fold CV on the scaled
# training split and report the mean score with a two-standard-deviation band.
cv_scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5)
print(f"교차 검증 점수: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Grid search: try every combination of the values below (3 x 4 = 12
# candidates), each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)
print(f"최적 파라미터: {grid_search.best_params_}")

# ✅ Features:
- 교차 검증
- 하이퍼파라미터 튜닝
- 성능 평가
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Preprocessing + modelling pipeline: mean imputation, standardization, then
# a random forest. random_state is fixed so the run is reproducible, matching
# the other estimators in this page.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Run the pipeline. It receives the unscaled splits because scaling is one
# of its own steps.
pipeline.fit(X_train, y_train)
pipeline_pred = pipeline.predict(X_test)

# Evaluate the pipeline's predictions against the held-out labels.
print(classification_report(y_test, pipeline_pred))

# ✅ Features:
- 전처리 자동화
- 모델 연결
- 평가 보고서
✅ 모범 사례:
- 데이터 전처리 중요성
- 교차 검증 활용
- 하이퍼파라미터 튜닝
- 파이프라인 구축
- 모델 평가 지표 선택
- 과적합 방지
- 특성 선택과 엔지니어링
- 불균형 데이터 처리
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
class TextClassifier:
    """Text classifier that chains TF-IDF vectorization with multinomial naive Bayes."""

    def __init__(self):
        # Both steps live in one Pipeline, so training and prediction go
        # through the same fitted vectorizer.
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('classifier', MultinomialNB()),
        ])

    def train(self, texts, labels):
        """Fit the pipeline on ``texts`` and their ``labels``."""
        self.pipeline.fit(texts, labels)

    def predict(self, texts):
        """Return the pipeline's predictions for ``texts``."""
        return self.pipeline.predict(texts)

    def evaluate(self, texts, true_labels):
        """Predict ``texts`` and return a classification report against ``true_labels``.

        Relies on ``classification_report`` imported from ``sklearn.metrics``
        at the top of the file.
        """
        pred_labels = self.predict(texts)
        return classification_report(true_labels, pred_labels)


from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
class AnomalyDetector:
    """Anomaly detector that standardizes data and feeds it to an isolation forest."""

    def __init__(self, contamination=0.1):
        """Create the scaler and the detector.

        Args:
            contamination: Passed straight through to ``IsolationForest``.
        """
        self.scaler = StandardScaler()
        self.detector = IsolationForest(
            contamination=contamination,
            random_state=42,
        )

    def fit(self, data):
        """Fit the scaler on ``data`` and the detector on the scaled result.

        Returns:
            This instance, so calls can be chained.
        """
        scaled_data = self.scaler.fit_transform(data)
        self.detector.fit(scaled_data)
        return self

    def predict(self, data):
        """Return a boolean result that is True where the detector predicts -1.

        ``data`` is transformed with the already-fitted scaler first.
        """
        scaled_data = self.scaler.transform(data)
        predictions = self.detector.predict(scaled_data)
        return predictions == -1  # True for anomalies

    def get_anomaly_scores(self, data):
        """Return the negated ``score_samples`` output for the scaled ``data``."""
        scaled_data = self.scaler.transform(data)
        # Negate so that a larger value means a more anomalous sample.
        return -self.detector.score_samples(scaled_data)


# - Importance of data preprocessing
- 교차 검증 활용
- 하이퍼파라미터 튜닝
- 파이프라인 구축
- 모델 평가 지표 선택
- 과적합 방지
- 특성 선택과 엔지니어링
- 불균형 데이터 처리
- 모델 저장과 로드
- 확장성 고려