1. 原理:贝叶斯公式
已知先验概率及类条件概率,我们可以通过贝叶斯公式很方便地计算后验概率,并通过后验概率来进行决策。
在本次实验设计贝叶斯分类器时,假设数据属性服从高斯(正态)分布。
2. 数据集
本次实验的数据集为《模式识别》班级中学生提交的身高、体重、鞋码等数据。
为了导入方便,将性别列中的数据进行更改(男替换为0,女替换为1)
3. 代码
3.1 导入数据
import csv
#导入csv数据
def loadCsv(filename):
    """Load a purely numeric CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV line; each entry is a list holding
        that line's fields converted to ``float``.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the file even when a conversion fails;
    # newline="" is the mode the csv module recommends for reader input.
    with open(filename, "r", newline="") as handle:
        return [[float(x) for x in row] for row in csv.reader(handle)]
3.2 划分训练集与测试集
def splitDataset(dataset, splitRatio):
    """Randomly split a dataset into a training set and a test set.

    Args:
        dataset: Sequence of rows. It is copied, not modified.
        splitRatio: Fraction of the rows that goes into the training set;
            the training size is ``int(len(dataset) * splitRatio)``.

    Returns:
        ``[trainSet, testSet]``. Rows are drawn without replacement, so
        every input row ends up in exactly one of the two lists.

    Raises:
        ValueError: If more training rows are requested than the dataset
            holds (``splitRatio`` large enough that the pool runs empty).
    """
    # Imported here because no visible part of the file imports ``random``,
    # which made the original function fail with NameError.
    import random

    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    testSet = list(dataset)  # shallow copy; rows left here form the test set
    while len(trainSet) < trainSize:
        # Move one randomly chosen row from the test pool to the training set.
        index = random.randrange(len(testSet))
        trainSet.append(testSet.pop(index))
    return [trainSet, testSet]
3.3 按类别划分数据
def separateByClass(dataset):
    """Group the rows of a dataset by class label.

    The class label is the last element of each row.

    Args:
        dataset: Sequence of rows (indexable sequences).

    Returns:
        A dict mapping each label to the list of rows carrying it, in
        their original order.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups
3.4 计算正态分布的均值和标准差
import math
def mean(numbers):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
def stdev(numbers):
    """Return the sample standard deviation of a sequence of numbers.

    The squared deviations from the mean are divided by ``n - 1``, so the
    sequence needs at least two values (one value divides by zero).
    """
    center = mean(numbers)
    squared_deviations = sum((value - center) ** 2 for value in numbers)
    return math.sqrt(squared_deviations / float(len(numbers) - 1))
3.5 提取数据集的特征
def summarize(dataset):
    """Compute (mean, standard deviation) for every attribute column.

    Args:
        dataset: Sequence of equally long rows whose last element is the
            class label.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column, with the entry
        for the last (label) column removed.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # The final column is the class label, not an attribute.
    stats.pop()
    return stats
3.6 按类别提取属性特征
def summarizeByClass(dataset):
    """Summarize the attributes of a dataset separately for each class.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the ``summarize`` result for
        the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }
3.7 计算正态分布的概率密度函数
已知每个属性和类值的属性特征,在给定类值的条件下,可以得到给定属性值的条件概率。
import math
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the normal distribution.
        stdev: Standard deviation of the normal distribution; a value of
            zero divides by zero.

    Returns:
        The density of N(mean, stdev^2) at ``x``.
    """
    squared_distance = math.pow(x - mean, 2)
    doubled_variance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_distance / doubled_variance))
    coefficient = 1 / (math.sqrt(2 * math.pi) * stdev)
    return coefficient * exponent
3.8 计算所属某个类别的概率
给定一个数据样本,它所属每个类别的概率,可以通过将其属性概率相乘得到。
def calculateClassProbabilities(summaries, inputVector):
    """Compute the per-class likelihood of one sample.

    For every class, the Gaussian densities of the sample's attributes are
    multiplied together (the attributes are treated as independent).

    Args:
        summaries: Dict mapping a class label to a list of
            ``(mean, stdev)`` tuples, one per attribute.
        inputVector: Sample whose i-th element is the i-th attribute.

    Returns:
        A dict mapping each class label to the product of the attribute
        densities for that class.
    """
    probabilities = {}
    for label, stats in summaries.items():
        likelihood = 1
        for index, (mu, sigma) in enumerate(stats):
            likelihood *= calculateProbability(inputVector[index], mu, sigma)
        probabilities[label] = likelihood
    return probabilities
3.9 结果预测
def predict(summaries, inputVector):
    """Return the class label with the largest likelihood for a sample.

    Args:
        summaries: Per-class attribute summaries, as accepted by
            ``calculateClassProbabilities``.
        inputVector: Sample to classify.

    Returns:
        The label whose likelihood is highest; on ties the label seen
        first wins. ``None`` when ``summaries`` has no classes.
    """
    winner = None
    top_score = -1
    scores = calculateClassProbabilities(summaries, inputVector)
    for label, score in scores.items():
        # Skip candidates that do not strictly beat the current best.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner
def getPredictions(summaries, testSet):
    """Classify every row of a test set.

    Args:
        summaries: Per-class attribute summaries, as accepted by ``predict``.
        testSet: Sequence of samples to classify.

    Returns:
        A list with the predicted label of each sample, in input order.
    """
    return [predict(summaries, sample) for sample in testSet]
4. 精度评估
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    Args:
        testSet: Non-empty sequence of rows whose last element is the true
            class label.
        predictions: Predicted labels, indexed like ``testSet``.

    Returns:
        The accuracy as a float between 0.0 and 100.0.
    """
    hits = sum(
        1
        for index, row in enumerate(testSet)
        if row[-1] == predictions[index]
    )
    return (hits / float(len(testSet))) * 100.0
5. 运行结果
# End-to-end run of the Gaussian naive Bayes classifier: load the CSV,
# split it randomly, fit per-class attribute summaries on the training
# part and report the accuracy on the held-out part.
filename = "2021冬模式识别数据收集 - Sheet1.csv"
splitRatio = 0.67  # fraction of the rows used for training
dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
# Fit the model: (mean, stdev) of every attribute, per class.
summaries = summarizeByClass(trainingSet)
# Evaluate the model on the rows that were not used for fitting.
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}%'.format(accuracy))
6. 决策面绘制
本次实验的数据集为《模式识别》班级中学生提交的身高、体重、鞋码等数据。
为了导入方便,将性别列中的数据进行更改(男替换为0,女替换为1)
选择学生身高和体重两项属性关于性别进行分类
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing as process
from sklearn.metrics import roc_curve, auc
def newLoadCsv(filename):
    """Load the student CSV, rescale three columns and split rows by gender.

    Columns 0, 1 and 3 (named height, weight and size in the original
    code) are each rescaled to the range [-1, 1]. Column 4 holds the
    gender code; per the accompanying text, 0 means male and 1 female.

    Args:
        filename: Path of a header-less CSV file.

    Returns:
        A tuple ``(boy, girl)`` of NumPy arrays holding the rows (all
        columns) whose gender code is 0 and 1 respectively.
    """
    x = pd.read_csv(filename, header=None)
    # The file imports sklearn.preprocessing under the alias ``process``;
    # the original referenced an undefined name ``preprocess``.
    scaler = process.MinMaxScaler(feature_range=(-1, 1))
    for column in (0, 1, 3):
        values = x[column].to_numpy().reshape(-1, 1)
        # fit_transform returns an (n, 1) array; flatten it for the column.
        x[column] = scaler.fit_transform(values).ravel()
    # Split on the gender column. The original compared the (already
    # rescaled) height column with 0 and 1, which selects almost no rows.
    boy = x[x[4] == 0].values
    girl = x[x[4] == 1].values
    return boy, girl
# Quadratic discriminant for each gender on (height, weight):
#     g_i(x) = x^T W_i x + w_i^T x + w_i0
#     W_i  = -1/2 * inv(Cov_i)
#     w_i  = inv(Cov_i) . mean_i
#     w_i0 = -1/2 * mean_i^T inv(Cov_i) mean_i - 1/2 * ln|Cov_i| + ln P(w_i)
# The decision boundary is the solution set of g_1(x) = g_2(x).
#
# Fixes relative to the original script: matrix/vector products use np.dot
# (``*`` on ndarrays is elementwise), W_i carries the -1/2 factor, and the
# log-determinant is taken of Cov_i rather than of its inverse. This matches
# the coefficients computed by result_3d in the same file.
#
# NOTE(review): Symbol and solve are used below but are not imported in the
# visible source; they look like sympy's (from sympy import Symbol, solve).
# Confirm and add the import where this file keeps its dependencies.
filename = "链接中的数据"
boy, girl = newLoadCsv(filename)

# ---- class 1: boys ----
Pw1 = 0.5  # prior probability
# NOTE(review): only the first 12 boy rows are used here while all girl
# rows are used below; presumably to balance the class sizes. Confirm.
Cov1 = np.cov(boy[:12, 0:2].T)  # covariance matrix of height and weight
_Cov1 = np.linalg.inv(Cov1)  # inverse covariance matrix
mean1 = np.transpose(boy[:12, 0:2].mean(axis=0))  # mean of height and weight
W1 = -0.5 * _Cov1
w1 = np.dot(_Cov1, mean1)
w10 = -0.5 * np.dot(np.dot(mean1, _Cov1), mean1) - 0.5 * np.log(np.linalg.det(Cov1)) + np.log(Pw1)

# ---- class 2: girls ----
Pw2 = 0.5  # prior probability
Cov2 = np.cov(girl[:, 0:2].T)
_Cov2 = np.linalg.inv(Cov2)
mean2 = np.transpose(girl[:, 0:2].mean(axis=0))
W2 = -0.5 * _Cov2
w2 = np.dot(_Cov2, mean2)
w20 = -0.5 * np.dot(np.dot(mean2, _Cov2), mean2) - 0.5 * np.log(np.linalg.det(Cov2)) + np.log(Pw2)

# ---- symbolic boundary ----
x1 = Symbol("x1")
x2 = Symbol("x2")
# np.matrix makes ``*`` a matrix product, so each g_i(x) below is a 1x1 matrix.
g1x = np.matrix([[x1], [x2]]).T * W1 * np.matrix([[x1], [x2]]) + w1.T * np.matrix([[x1], [x2]]) + w10  # discriminant of class 1
g2x = np.matrix([[x1], [x2]]).T * W2 * np.matrix([[x1], [x2]]) + w2.T * np.matrix([[x1], [x2]]) + w20  # discriminant of class 2
a = solve(g1x[0, 0] - g2x[0, 0], x2)  # express x2 as a function of x1
print("x2=%s" % a)  # print the closed form of the decision boundary
结果为
x2=[1.01052973055899*x1 - 1.43139420533296*sqrt(-0.0137691080679818*x1**2 + x1 + 0.829053167702432) - 0.73893290231258, 1.01052973055899*x1 + 1.43139420533296*sqrt(-0.0137691080679818*x1**2 + x1 + 0.829053167702432) - 0.73893290231258]
#定义函数用上面解析式,用于画图
def x2(x1):
    """Evaluate the hard-coded decision-boundary curve at ``x1``.

    Args:
        x1: Abscissa (a scalar or a NumPy array).

    Returns:
        The ordinate of the curve at ``x1``. The absolute value under the
        square root keeps the result real for every input.
    """
    # NOTE(review): these coefficients differ from the solution printed in
    # the article; presumably they come from another run. Confirm.
    # np.sqrt replaces the bare ``sqrt``, which is not defined anywhere in
    # the visible file, and also accepts arrays.
    return 1.05197634138595*x1 - 1.66886500443616*np.sqrt(abs(-0.437192683690852*x1**2 - x1 - 0.034770097514185)) + 1.11993705115552
# Sample the boundary curve at x1 = -0.10, -0.11, ..., -0.99.
x = []
y = []
for i in range(10, 100):
    x.append(-i / 100)
    y.append(x2(x[-1]))
x = np.array(x)
y = np.array(y)

# Draw the sampled curve as a green line.
plt.plot(x, y, c='g')
plt.xlabel('height')
plt.ylabel('weight')
plt.title('decision function')

# Height (column 0) and weight (column 1) of both groups.
x11 = boy[:, 0]
x21 = boy[:, 1]
x12 = girl[:, 0]
x22 = girl[:, 1]

# Marker colours.
colors1 = '#00CEF1'
colors2 = '#DC141C'

# Random marker areas (radius between 0 and 15 points); boys are drawn
# first so the random numbers are consumed in the same order as before.
area1 = np.pi * (15 * np.random.rand(len(x11)))**2
area2 = np.pi * (15 * np.random.rand(len(x12)))**2

# Scatter both groups over the curve.
plt.scatter(x11, x21, s=area1, c=colors1, alpha=0.4, label='boy')
plt.scatter(x12, x22, s=area2, c=colors2, alpha=0.4, label='girl')
结果:
7. 三维决策面绘制
def loadCsv(filename):
    """Load a CSV file into a list of rows of NumPy scalars.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV line; each entry is a list in which
        every field is wrapped with ``np.array``. The fields stay strings,
        because no numeric conversion is applied.
    """
    # The context manager closes the file, which the original left open.
    with open(filename, "r", newline="") as handle:
        return [[np.array(x) for x in row] for row in csv.reader(handle)]
def calculation(boy, girl):
    """Compute covariances, means and priors for the two groups.

    Only the first three columns of each array are used as features.

    Args:
        boy: 2-D array with one row per boy.
        girl: 2-D array with one row per girl.

    Returns:
        A tuple ``(boy_cov, girl_cov, cov_avg, mean_boy, mean_girl,
        p_boy, p_girl, p)``: the two covariance matrices, their average,
        the two feature means, the priors (each group's share of all
        rows) and ``p = ln(p_girl / p_boy)``.
    """
    boy_features = boy[:, 0:3]
    girl_features = girl[:, 0:3]

    # Second-order statistics.
    boy_cov = np.cov(boy_features.T)
    girl_cov = np.cov(girl_features.T)
    cov_avg = (boy_cov + girl_cov) / 2.0

    # First-order statistics.
    mean_boy = boy_features.mean(axis=0).T
    mean_girl = girl_features.mean(axis=0).T

    # Priors from the relative group sizes.
    boy_count = len(boy[:, 0])
    girl_count = len(girl[:, 0])
    total = float(boy_count + girl_count)
    p_boy = float(boy_count) / total
    p_girl = float(girl_count) / total
    log_prior_ratio = np.log(p_girl / p_boy)

    return (boy_cov, girl_cov, cov_avg, mean_boy, mean_girl,
            p_boy, p_girl, log_prior_ratio)
def result_3d(boy, girl, boy_cov, girl_cov, u_boy, u_girl, P_boy, P_girl):
    """Compute the 3-D quadratic decision surface and score every sample.

    With v = (v0, v1, v2) the first three columns of a sample, the
    function works with the difference of the two class discriminants

        g(v) = g_boy(v) - g_girl(v) = v^T W v + w^T v + wi_0,

    where, per class, W_i = -1/2 inv(Cov_i), w_i = inv(Cov_i) u_i and
    w_i0 = -1/2 u_i^T inv(Cov_i) u_i - 1/2 ln|Cov_i| + ln P_i.

    Args:
        boy: 2-D array of boy samples; columns 0..2 are the features.
        girl: 2-D array of girl samples; columns 0..2 are the features.
        boy_cov: 3x3 covariance matrix of the boy features.
        girl_cov: 3x3 covariance matrix of the girl features.
        u_boy: Mean vector (length 3) of the boy features.
        u_girl: Mean vector (length 3) of the girl features.
        P_boy: Prior probability of the boy class.
        P_girl: Prior probability of the girl class.

    Returns:
        A tuple ``(x, y, z1, z2, boy_result, girl_result)``. ``x`` and
        ``y`` are mesh grids over [-1, 1) with step 0.01 for features 0
        and 1; ``z1`` and ``z2`` are the two roots in feature 2 of
        g(x, y, z) = 0 (NaN where no real root exists); ``boy_result``
        and ``girl_result`` are lists with g evaluated at every sample.
    """
    boy_inv = np.linalg.inv(boy_cov)
    girl_inv = np.linalg.inv(girl_cov)

    # Coefficients of g = g_boy - g_girl.
    W = np.asarray(np.subtract(-0.5 * boy_inv, -0.5 * girl_inv))
    w = np.subtract(np.dot(boy_inv, u_boy.T), np.dot(girl_inv, u_girl.T))
    w_boy_0 = (-0.5 * np.dot(np.dot(u_boy, boy_inv), u_boy.T)
               - 0.5 * np.log(np.linalg.det(boy_cov)) + np.log(P_boy))
    w_girl_0 = (-0.5 * np.dot(np.dot(u_girl, girl_inv), u_girl.T)
                - 0.5 * np.log(np.linalg.det(girl_cov)) + np.log(P_girl))
    wi_0 = w_boy_0 - w_girl_0

    # For fixed (x, y), g is the quadratic a*z^2 + b*z + c in z.
    x1 = np.arange(-1, 1, 0.01)
    y1 = np.arange(-1, 1, 0.01)
    x, y = np.meshgrid(x1, y1)
    a = W[2, 2]
    # Feature 0 (x) pairs with the W[0, 2] cross term and feature 1 (y)
    # with W[1, 2]; the original had these two coefficients swapped.
    b = (W[0, 2] + W[2, 0]) * x + (W[1, 2] + W[2, 1]) * y + w[2]
    c = (W[0, 0] * x * x + W[1, 1] * y * y + (W[0, 1] + W[1, 0]) * x * y
         + w[0] * x + w[1] * y) + wi_0
    # NOTE(review): a == 0 (equal [2, 2] entries in both inverse
    # covariances) makes the surface linear in z and divides by zero here.
    root = np.sqrt(b ** 2 - 4 * a * c)
    z1 = (-b - root) / (2 * a)
    z2 = (-b + root) / (2 * a)

    def _score(samples):
        """Evaluate g at every row of ``samples`` (first three columns)."""
        v = np.asarray(samples)[:, 0:3].astype(float)
        values = np.sum(np.dot(v, W) * v, axis=1) + np.dot(v, w) + wi_0
        return list(values)

    boy_result = _score(boy)
    girl_result = _score(girl)
    return x, y, z1, z2, boy_result, girl_result
运行
# Load the data, compute the 3-D decision surface and plot it together
# with the samples.
filename = "链接中的文件"
boy, girl = newLoadCsv(filename)
boy_cov, girl_cov, cov_avg, u_boy, u_girl, P_boy, P_girl, P = calculation(boy, girl)
x, y, z1, z2, boy_result, girl_result = result_3d(boy, girl, boy_cov, girl_cov, u_boy, u_girl, P_boy, P_girl)
# Discriminant difference at every sample, as returned by result_3d.
print(boy_result)
print(girl_result)

fig = plt.figure()
# Request the 3-D axes from the figure. Constructing Axes3D(fig) directly
# no longer attaches the axes to the figure in recent Matplotlib releases,
# which leaves the plot empty.
ax1 = fig.add_subplot(111, projection='3d')
ax1.scatter(boy[:, 0], boy[:, 1], boy[:, 2], color='b')
ax1.scatter(girl[:, 0], girl[:, 1], girl[:, 2], color='r')
# The first root sheet is always drawn; the second is skipped only when
# z2 is a plain int placeholder instead of an array.
ax1.plot_surface(x, y, z1, color='w')
if not isinstance(z2, int):
    ax1.plot_surface(x, y, z2, color='y')
plt.show()
8. 参考链接
https://blog.csdn.net/caihaihua0572/article/details/117258485
https://cloud.tencent.com/developer/article/1164686
https://blog.csdn.net/qq_25948717/article/details/81744277
https://blog.csdn.net/weixin_46523923/article/details/120789524