Learn Python in 30 Days: 22. Introduction to Machine Learning in Python
22.1 Machine Learning Overview
22.1.1 Categories of Machine Learning
Machine learning methods are usually grouped into supervised learning (learning from labeled examples), unsupervised learning (finding structure in unlabeled data), and reinforcement learning (learning from interaction and reward). This chapter focuses on the first two, which are the ones scikit-learn covers.
22.1.2 Typical Application Scenarios

| Learning Type | Example Application | Common Algorithms |
| --- | --- | --- |
| Classification | Spam filtering | Decision Tree / SVM |
| Regression | House price prediction | Linear Regression |
| Clustering | Customer segmentation | K-Means |
| Dimensionality Reduction | Data visualization | PCA |
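Each algorithm named in the table maps onto a scikit-learn estimator with the same fit/predict (or fit/transform) interface. The sketch below only lists those classes as a quick reference; it trains nothing and assumes scikit-learn is installed.

# Quick reference: scikit-learn estimators for the algorithms in the table above
from sklearn.tree import DecisionTreeClassifier    # classification: decision tree
from sklearn.svm import SVC                        # classification: support vector machine
from sklearn.linear_model import LinearRegression  # regression: linear regression
from sklearn.cluster import KMeans                 # clustering: K-Means
from sklearn.decomposition import PCA              # dimensionality reduction: PCA
# Every estimator follows the same pattern: construct, then fit(), then predict() or transform()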
22.2 The scikit-learn Workflow
22.2.1 Basic Workflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Data preparation (features and labels are the feature matrix and target vector loaded earlier)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# 2. Feature engineering: fit the scaler on the training set only, then reuse it on the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model training
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_scaled, y_train)

# 4. Model evaluation
y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
22.2.2 Cross-Validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    model,
    X_train_scaled,
    y_train,
    cv=5,                 # 5-fold cross-validation
    scoring='accuracy'
)
print(f"Mean cross-validation accuracy: {scores.mean():.2f} (±{scores.std():.2f})")
22.3 Supervised Learning Algorithms
22.3.1 Linear Models
from sklearn.linear_model import LinearRegression, LogisticRegression

# Linear regression (for continuous targets)
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Logistic regression (a classifier, despite the name)
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
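Linear models are popular partly because their learned parameters are directly interpretable. A short sketch of inspecting the fitted models (all attributes shown are standard scikit-learn):

# One weight per feature plus an intercept for the linear regression
print("Coefficients:", lin_reg.coef_)
print("Intercept:", lin_reg.intercept_)

# Logistic regression can output class probabilities as well as hard labels
print("Class probabilities for the first 3 test samples:")
print(log_reg.predict_proba(X_test)[:3])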
22.3.2 Decision Trees and Ensemble Methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Decision tree (max_depth limits overfitting)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, y_train)

# Gradient boosted trees
gbdt = GradientBoostingClassifier(n_estimators=100)
gbdt.fit(X_train, y_train)
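Tree-based models also report how much each feature contributed to their splits, which is a quick sanity check on the data. A minimal sketch using the fitted gradient boosting model:

import numpy as np

# Rank features from most to least important according to the boosted trees
importances = gbdt.feature_importances_
for idx in np.argsort(importances)[::-1]:
    print(f"feature {idx}: importance {importances[idx]:.3f}")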
22.4 Unsupervised Learning
22.4.1 Cluster Analysis
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# K-Means clustering (X_scaled is the standardized feature matrix prepared earlier)
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Silhouette score: closer to 1 means better-separated clusters
score = silhouette_score(X_scaled, clusters)
print(f"Silhouette score: {score:.2f}")
22.4.2 Dimensionality Reduction
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# PCA: project the data onto its two main directions of variance
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualize the projection, colored by the labels y
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
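It is worth checking how much of the original variance the two components actually retain; PCA reports this directly:

# Fraction of total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print(f"Total variance kept: {pca.explained_variance_ratio_.sum():.1%}")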
22.5 Model Evaluation and Optimization
22.5.1 Classification Metrics
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_test, y_pred))            # precision, recall and F1 per class
# ROC AUC needs probabilities; for a binary task use the positive-class column
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
print(f"AUC score: {roc_auc_score(y_test, y_pred_proba):.2f}")
22.5.2 Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, None]
}

# Exhaustively evaluate every combination with 5-fold cross-validation
grid_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2f}")
22.6 Worked Examples
Example 1: Iris Classification
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Load the data
iris = load_iris()
X, y = iris.data, iris.target

# Build a pipeline: standardize the features, then classify with an RBF-kernel SVM
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', probability=True)
)

# Train and evaluate
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model.fit(X_train, y_train)
print(f"Test accuracy: {model.score(X_test, y_test):.2f}")

# Visualize the decision boundary
def plot_decision_boundary(model, X, y):
    # Use only the first two features so the boundary can be drawn in 2D,
    # and fit a fresh copy so the original 4-feature model is not overwritten
    model = clone(model)
    X = X[:, :2]
    model.fit(X, y)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k')
    plt.title('Decision Boundary')
    plt.show()

plot_decision_boundary(model, X_train, y_train)
Example 2: Handwritten Digit Recognition
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Load the data: 8x8 grayscale digit images flattened into 64 features
digits = load_digits()
X, y = digits.data, digits.target

# Visualize a few samples
fig, axes = plt.subplots(4, 4, figsize=(6, 6))
for i, ax in enumerate(axes.flat):
    ax.imshow(X[i].reshape(8, 8), cmap='gray')
    ax.set_title(f"Label: {y[i]}")
    ax.axis('off')
plt.show()

# Train an MLP classifier with two hidden layers
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000)
mlp.fit(X_train, y_train)

# Evaluate the model
print(f"Test accuracy: {mlp.score(X_test, y_test):.2f}")

# Confusion matrix
cm = confusion_matrix(y_test, mlp.predict(X_test))
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title('Confusion Matrix')
plt.show()
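Beyond the confusion matrix, looking at the actual misclassified images often reveals what the network confuses. A short sketch that displays up to four of them:

import numpy as np

# Show a few test digits the MLP got wrong, with true and predicted labels
y_pred_mlp = mlp.predict(X_test)
wrong = np.where(y_pred_mlp != y_test)[0][:4]
fig, axes = plt.subplots(1, 4, figsize=(6, 2))
for ax, idx in zip(axes.flat, wrong):
    ax.imshow(X_test[idx].reshape(8, 8), cmap='gray')
    ax.set_title(f"true {y_test[idx]} / pred {y_pred_mlp[idx]}")
    ax.axis('off')
plt.show()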
22.7 Learning Path
22.8 Summary
Key takeaways:
- Understand the end-to-end machine learning workflow
- Know which algorithms fit which kinds of problems
- Use the scikit-learn toolkit fluently
- Be able to evaluate model performance
This Python programming learning log and its tips are updated regularly; stay tuned!
#Programming# #Learning# #python# #RecordingMy2025OnToutiao#