Python Classification

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from matplotlib.colors import ListedColormap

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn import tree

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.svm import SVC

from scipy.stats import pearsonr

from minepy import MINE

# Read the dataset

#data = pd.read_csv("Absenteeism_at_work.csv", header=0, index_col=0)

data = pd.read_csv("Absenteeism_at_work.csv")

#data = pd.read_csv("Absenteeism_at_work_org.csv", sep=';', header=0, index_col=0)

data.head()

data.info()

X = data.iloc[:, 0:20]  # the 20 feature columns

y = data.iloc[:, 20]    # the target column

# Correlation analysis

# Use the Pearson correlation coefficient to test each feature against the target

ax = np.array(X)

for ine in range(0, 20):
    x = ax[:, ine]
    #print(x)
    r, p = pearsonr(x, y)
    print(ine, ':', r, p)

# Keep the features with the highest correlation coefficients; as it turned out, this gave worse classification results later on.

axs = np.column_stack((ax[:,1],ax[:,3],ax[:,11],ax[:,13],ax[:,18]))
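
The hard-coded column indices above can also be picked automatically with a threshold on |r|. A minimal sketch; the 0.1 cutoff is an illustrative assumption, not a value from these experiments:

# Keep every feature whose absolute Pearson correlation with the target
# exceeds the (assumed) 0.1 threshold.
selected = [i for i in range(ax.shape[1]) if abs(pearsonr(ax[:, i], y)[0]) > 0.1]
axs_auto = ax[:, selected]
print("selected columns:", selected)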

# MIC: a non-parametric estimate based on maximal information

print("MIC result:")

for ine in range(0, 20):
    x = ax[:, ine]
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mine.compute_score(x, y)
    mic = mine.mic()
    print(ine, ':', mic)

axm = np.column_stack((ax[:,0], ax[:,1],ax[:,5],ax[:,6],ax[:,7],ax[:,11],ax[:,17],ax[:,19]))
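
If minepy is unavailable, scikit-learn's mutual_info_classif gives a related information-theoretic ranking (a different estimator than MIC, so the scores will not match exactly). A minimal sketch; keeping k=8 mirrors the eight MIC-selected columns above:

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Rank features by estimated mutual information with the target and keep the top 8.
selector = SelectKBest(mutual_info_classif, k=8)
axm_sk = selector.fit_transform(X, y)
print("selected columns:", np.flatnonzero(selector.get_support()))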

# Split into training and test sets. On this dataset, an 80% training share worked well; the results clearly depend on how the data is split.

# Experiments showed that using the full feature set works better, while the features selected by Pearson correlation classify much worse.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  # full feature set

#X_train, X_test, y_train, y_test = train_test_split(axs, y, test_size=0.2, random_state=0)  # features selected by Pearson correlation

#X_train, X_test, y_train, y_test = train_test_split(axm, y, test_size=0.2, random_state=0)  # features selected by MIC; better than the Pearson features
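
Because the comments above note that the results depend noticeably on how the data is split, k-fold cross-validation gives a less split-sensitive estimate. A minimal sketch with scikit-learn's cross_val_score; the 5-fold setting and the random-forest model are illustrative choices:

from sklearn.model_selection import cross_val_score

# Average accuracy over 5 different splits instead of relying on a single one.
scores = cross_val_score(RandomForestClassifier(n_estimators=8), X, y, cv=5)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))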

# Standardization: preprocess each feature dimension

# Experiments showed that standardization helps logistic regression, k-NN, and SVM to some extent

sc = StandardScaler()

sc.fit(X_train)

X_train_std = sc.transform(X_train)

X_test_std = sc.transform(X_test)

#X_train_std = X_train

#X_test_std = X_test
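
Since the scaler above is fit on the training set only (avoiding leakage of test-set statistics), a Pipeline can bundle scaling and classification so the scaler is refit correctly on every retrain. A sketch, not part of the original experiment:

from sklearn.pipeline import make_pipeline

# The pipeline standardizes inside fit/score, so the raw X_train/X_test are passed.
pipe = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2'))
pipe.fit(X_train, y_train)
print("Pipeline Logistic:", pipe.score(X_test, y_test))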

# Try several classification algorithms on the data

clf = LogisticRegression(penalty='l2')

clf.fit(X_train_std, y_train)

ret = clf.score(X_test_std, y_test)

print("Logistic:", ret)

clf = KNeighborsClassifier()

clf.fit(X_train_std, y_train)

ret = clf.score(X_test_std, y_test)

print("KNeighbors:", ret)

clf = RandomForestClassifier(n_estimators=8)

clf.fit(X_train_std, y_train)

ret = clf.score(X_test_std, y_test)

print("Random Forest:", ret)

clf = tree.DecisionTreeClassifier()

clf.fit(X_train_std, y_train)

ret = clf.score(X_test_std, y_test)

print("Decision Tree:", ret)

clf = GradientBoostingClassifier(n_estimators=200)

clf.fit(X_train_std, y_train)

ret = clf.score(X_test_std, y_test)

print("GradientBoosting:", ret)

clf = SVC(kernel='rbf', probability=True)

clf.fit(X_train_std, y_train)

ret = clf.score(X_test_std, y_test)

print("SVM:", ret)
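
The six fit/score blocks above repeat the same pattern; an equivalent, more compact form loops over a name-to-model mapping:

models = {
    "Logistic": LogisticRegression(penalty='l2'),
    "KNeighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=8),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200),
    "SVM": SVC(kernel='rbf', probability=True),
}
for name, clf in models.items():
    clf.fit(X_train_std, y_train)
    print(name, ":", clf.score(X_test_std, y_test))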

This is an original article. Please credit the source when reposting!