import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from scipy.stats import pearsonr
from minepy import MINE
# Read the dataset
#data = pd.read_csv("Absenteeism_at_work.csv", header=0, index_col=0)
data = pd.read_csv("Absenteeism_at_work.csv")
#data = pd.read_csv("Absenteeism_at_work_org.csv", sep=';', header=0, index_col=0)
data.head()
data.info()
X = data.iloc[:, 0:20]  # first 20 columns are features (0:21 would leak the target into X)
y = data.iloc[:, 20]    # target: "Absenteeism time in hours"
# Correlation analysis
# Compute the Pearson correlation coefficient between each feature and the target
ax = np.array(X)
for ine in range(0, 20):
    x = ax[:, ine]
    #print(x)
    r, p = pearsonr(x, y)
    print(ine, ':', r, ' ', p)
# Keep the features with the higher correlation coefficients; however, this later
# turned out to give worse classification results.
axs = np.column_stack((ax[:, 1], ax[:, 3], ax[:, 11], ax[:, 13], ax[:, 18]))
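# An alternative sketch (my addition, not in the original post): select feature
# columns whose absolute Pearson r exceeds a threshold instead of hard-coding
# the indices. The 0.1 cutoff is an illustrative assumption.
pearson_idx = [i for i in range(20) if abs(pearsonr(ax[:, i], y)[0]) > 0.1]
axs_auto = ax[:, pearson_idx]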
# MIC: a nonparametric estimate based on the maximal information coefficient
print("MIC result:")
for ine in range(0, 20):
    x = ax[:, ine]
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mine.compute_score(x, y)
    mic = mine.mic()
    print(ine, ':', mic)
axm = np.column_stack((ax[:, 0], ax[:, 1], ax[:, 5], ax[:, 6], ax[:, 7], ax[:, 11], ax[:, 17], ax[:, 19]))  # features selected by MIC
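# A hedged sketch (my addition, not in the original post): select the top-k
# features by MIC score automatically instead of hard-coding indices; k=8
# mirrors the manual choice above and is an assumption.
mic_scores = []
for i in range(20):
    m = MINE(alpha=0.6, c=15, est="mic_approx")
    m.compute_score(ax[:, i], y)
    mic_scores.append(m.mic())
top_k = np.argsort(mic_scores)[::-1][:8]
axm_auto = ax[:, np.sort(top_k)]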
# Split into training and test sets. For this dataset, an 80% training split
# worked better; the scores depend noticeably on which data are used.
# Experiments show that using the full feature set gives the best results, while
# features selected by Pearson correlation give much worse classification results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)  # full feature set
#X_train, X_test, y_train, y_test = train_test_split(axs, y, test_size=0.2, random_state=0)  # Pearson-selected features
#X_train, X_test, y_train, y_test = train_test_split(axm, y, test_size=0.2, random_state=0)  # MIC-selected features; better than the Pearson set
# Standardize each feature dimension
# Experiments show that standardization helps logistic regression, k-NN, and SVM
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
#X_train_std = X_train
#X_test_std = X_test
# Try several classification algorithms on the data
clf = LogisticRegression(penalty='l2')
clf.fit(X_train_std, y_train)
ret = clf.score(X_test_std, y_test)
print("Logistic:", ret)
clf = KNeighborsClassifier()
clf.fit(X_train_std, y_train)
ret = clf.score(X_test_std, y_test)
print("KNeighbors:", ret)
clf = RandomForestClassifier(n_estimators=8)
clf.fit(X_train_std, y_train)
ret = clf.score(X_test_std, y_test)
print("Random Forest:", ret)
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_std, y_train)
ret = clf.score(X_test_std, y_test)
print("Decision Tree:", ret)
clf = GradientBoostingClassifier(n_estimators=200)
clf.fit(X_train_std, y_train)
ret = clf.score(X_test_std, y_test)
print("GradientBoosting:", ret)
clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train_std, y_train)
ret = clf.score(X_test_std, y_test)
print("SVM:", ret)
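# Since the comments above note that the scores depend noticeably on the
# train/test split, a cross-validated estimate is more robust. This is a sketch
# of my own, not part of the original experiment; 5 folds is an assumed choice.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = cross_val_score(make_pipeline(StandardScaler(), SVC(kernel='rbf')), X, y, cv=cv)
print("SVM 5-fold CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))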