目录
- KNN实现手写数字(自己的)识别
-
- 项目文件
- 代码展示
KNN实现手写数字(自己的)识别
项目文件
代码展示
import numpy as npy
import os
from collections import Counter
import operator as opt
from matplotlib import pyplot as plt
#欧氏距离
def knn(k, test_data, train_data, labels):
    """Classify one sample with brute-force KNN using Euclidean distance.

    Args:
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.
        test_data: Feature vector (list or array) of the sample to classify.
        train_data: 2-D array holding one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of ``train_data[i]``.

    Returns:
        The label with the most votes among the k nearest training samples.
        On a tie, the label that received its first vote from the closest
        neighbour wins.

    Raises:
        IndexError: If nobody can vote (``k < 1`` or no training samples).
    """
    # Broadcasting subtracts the query from every row without the explicit
    # copy that npy.tile would create.
    dif = npy.asarray(train_data) - npy.asarray(test_data)
    distance = (dif ** 2).sum(axis=1) ** 0.5
    # A stable sort keeps equally distant samples in training order, so the
    # outcome is deterministic; slicing clamps k to the available samples.
    nearest = distance.argsort(kind="stable")[:max(k, 0)]
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]
#曼哈顿距离
def Knn_MANHD(k, test_data, train_data, labels):
    """Classify one sample with brute-force KNN using Manhattan (L1) distance.

    Args:
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.
        test_data: Feature vector (list or array) of the sample to classify.
        train_data: 2-D array holding one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of ``train_data[i]``.

    Returns:
        The label with the most votes among the k nearest training samples.
        On a tie, the label that received its first vote from the closest
        neighbour wins.

    Raises:
        IndexError: If nobody can vote (``k < 1`` or no training samples).
    """
    # Broadcasting replaces npy.tile: the query is subtracted from every row.
    dif = npy.abs(npy.asarray(train_data) - npy.asarray(test_data))
    distance = dif.sum(axis=1)
    # Stable sort => deterministic order among equally distant samples;
    # slicing clamps k to the available samples.
    nearest = distance.argsort(kind="stable")[:max(k, 0)]
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]
#闵氏距离p=3
def Knn_MD(k, test_data, train_data, labels, p=3):
    """Classify one sample with brute-force KNN using Minkowski distance.

    Args:
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.
        test_data: Feature vector (list or array) of the sample to classify.
        train_data: 2-D array holding one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of ``train_data[i]``.
        p: Order of the Minkowski distance. Defaults to 3; ``p=1`` gives the
            Manhattan distance and ``p=2`` the Euclidean distance.

    Returns:
        The label with the most votes among the k nearest training samples.
        On a tie, the label that received its first vote from the closest
        neighbour wins.

    Raises:
        IndexError: If nobody can vote (``k < 1`` or no training samples).
    """
    # Broadcasting replaces npy.tile: the query is subtracted from every row.
    dif = npy.abs(npy.asarray(train_data) - npy.asarray(test_data))
    # Sum of p-th powers followed by the p-th root.
    distance = (dif ** p).sum(axis=1) ** (1.0 / p)
    # Stable sort => deterministic order among equally distant samples;
    # slicing clamps k to the available samples.
    nearest = distance.argsort(kind="stable")[:max(k, 0)]
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]
#余弦相似度
def cosine_similarity(x, y, norm=False):
    """Return the cosine similarity of two equally long vectors.

    Args:
        x: First vector (list or 1-D array).
        y: Second vector (list or 1-D array) of the same length as ``x``.
        norm: If true, map the result from [-1, 1] to [0, 1] via
            ``0.5 * cos + 0.5``.

    Returns:
        The cosine of the angle between ``x`` and ``y``. When either vector
        has zero length the cosine is undefined; 0.0 is used in that case
        instead of dividing by zero.
    """
    x = npy.asarray(x, dtype=float)
    y = npy.asarray(y, dtype=float)
    denom = npy.sqrt(npy.dot(x, x)) * npy.sqrt(npy.dot(y, y))
    # Guard the 0/0 case, which would otherwise yield NaN and a warning.
    cos = npy.dot(x, y) / denom if denom > 0 else 0.0
    return 0.5 * cos + 0.5 if norm else cos
def Knn_COS(k, test_data, train_data, labels):
    """Classify one sample with brute-force KNN using cosine similarity.

    Args:
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.
        test_data: Feature vector (list or array) of the sample to classify.
        train_data: 2-D array holding one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of ``train_data[i]``.

    Returns:
        The label with the most votes among the k most similar training
        samples. On a tie, the label that received its first vote from the
        most similar neighbour wins.

    Raises:
        IndexError: If nobody can vote (``k < 1`` or no training samples).
    """
    train = npy.asarray(train_data, dtype=float)
    query = npy.asarray(test_data, dtype=float).ravel()

    # Cosine similarity of the query against every training row at once.
    denom = npy.sqrt((train ** 2).sum(axis=1)) * npy.sqrt((query ** 2).sum())
    similarity = npy.zeros(train.shape[0])
    defined = denom > 0  # zero-length vectors have no angle; keep them at 0
    similarity[defined] = (train @ query)[defined] / denom[defined]

    # Larger similarity means closer, so rank in DESCENDING order. Sorting the
    # similarities ascending would select the k least similar samples.
    nearest = npy.argsort(-similarity, kind="stable")[:max(k, 0)]
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]
#权值优化
def ED_weight_optimize(k, test_data, train_data, labels):
    """Classify one sample with rank-weighted KNN using Euclidean distance.

    The i-th nearest neighbour (i = 0 for the closest) votes with weight
    ``k - i``, so closer neighbours count more than distant ones.

    Args:
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.
        test_data: Feature vector (list or array) of the sample to classify.
        train_data: 2-D array holding one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of ``train_data[i]``.

    Returns:
        The label with the largest total weight. On a tie, the label that
        received its first vote from the closest neighbour wins.

    Raises:
        IndexError: If nobody can vote (``k < 1`` or no training samples).
    """
    # Broadcasting replaces npy.tile: the query is subtracted from every row.
    dif = npy.asarray(train_data) - npy.asarray(test_data)
    distance = (dif ** 2).sum(axis=1) ** 0.5
    # Stable sort => deterministic order among equally distant samples;
    # slicing clamps k to the available samples.
    nearest = distance.argsort(kind="stable")[:max(k, 0)]

    # Integer rank weights are used on purpose: dividing every weight by the
    # constant 1 + 2 + ... + k cannot change the winner, while rounding the
    # normalised weights to two decimals distorts them (and turns most of
    # them into 0.0 once k is large).
    scores = Counter()
    for rank, idx in enumerate(nearest):
        scores[labels[idx]] += k - rank
    return scores.most_common(1)[0][0]
def datatoarray(fname):
    """Read a 32x32 text bitmap and flatten it into a list of 1024 ints.

    Args:
        fname: Path of a text file whose first 32 lines each start with at
            least 32 digit characters.

    Returns:
        A list of 1024 ints in row-major order (line by line).

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character in the 32x32 area is not a digit.
    """
    # The context manager closes the file even when parsing fails.
    with open(fname) as fh:
        rows = [fh.readline() for _ in range(32)]
    return [int(row[col]) for row in rows for col in range(32)]
def seplabel(fname):
    """Extract the integer class label from a sample file name.

    The label is the part of the file name before the first ``_`` (and before
    the first ``.``), e.g. ``"3_12.txt"`` -> ``3``.

    Args:
        fname: File name, optionally preceded by a directory path.

    Returns:
        The label as an int.

    Raises:
        ValueError: If the leading part of the file name is not an integer.
    """
    # Strip any directory part first so a path such as "./testdata/3_1.txt"
    # is not cut at the dot of "./".
    base = os.path.basename(fname)
    stem = base.split(".")[0]
    return int(stem.split("_")[0])
def traindata(path="./traindata"):
    """Load every training sample found in a directory.

    Each file is parsed with ``datatoarray`` (32 * 32 = 1024 pixels) and its
    label is taken from the file name with ``seplabel``.

    Args:
        path: Directory containing the training files. Defaults to
            ``"./traindata"``.

    Returns:
        A tuple ``(trainarr, labels)`` where ``trainarr`` is a
        ``(file count, 1024)`` array with one sample per row and ``labels`` is
        the list of the corresponding integer labels.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order (and therefore tie-breaking between neighbours) reproducible.
    trainfile = sorted(os.listdir(path))
    trainarr = npy.zeros((len(trainfile), 1024))
    labels = []
    for row, thisfname in enumerate(trainfile):
        labels.append(seplabel(thisfname))
        trainarr[row] = datatoarray(os.path.join(path, thisfname))
    return trainarr, labels
def datatest():
    """Measure and plot the accuracy of each KNN variant for k = 1..16.

    Loads the training set via ``traindata()`` and every file in
    ``./testdata``, classifies each test sample with every enabled classifier
    for each k, prints the accuracy of every (classifier, k) pair, prints the
    k grid and the accuracy table, and shows one accuracy curve per
    classifier.
    """
    max_k = 16
    # Row of the accuracy table -> (legend name, classifier). Row 3 is
    # reserved for Knn_COS, which is disabled; it is skipped entirely so that
    # no stale prediction from another classifier is scored in its place.
    classifiers = {
        0: ("knn", knn),
        1: ("Knn_MANHD", Knn_MANHD),
        2: ("Knn_MD", Knn_MD),
        4: ("ED_weight_optimize", ED_weight_optimize),
    }

    plt.subplot(111)
    plt.ylim(0.85, 1)
    # Every row of x holds the k values 1..max_k used as the x axis.
    x = npy.tile(npy.arange(1, max_k + 1, dtype=float), (5, 1))
    correct = npy.zeros((5, max_k))

    # The data does not depend on the classifier or on k, so load it once
    # instead of re-reading every file inside the nested loops.
    trainarr, labels = traindata()
    testlist = os.listdir("./testdata")
    tnum = len(testlist)
    samples = [
        (seplabel(thisname), datatoarray("./testdata/" + thisname))
        for thisname in testlist
    ]

    for k, (title, classify) in classifiers.items():
        for j in range(1, max_k + 1):
            num = 0
            for thislabel, testarr in samples:
                rknn = classify(k=j, test_data=testarr,
                                train_data=trainarr, labels=labels)
                if thislabel == rknn:
                    num += 1
            print(k, " ", j, " 正确率 ", num, "/", tnum, " ", num / tnum)
            correct[k][j - 1] = num / tnum
        plt.plot(x[k], correct[k], label=title)

    print(x)
    print(correct)
    plt.legend(loc='upper right')
    plt.show()
datatest()  # Script entry point: run the full evaluation and show the plot.
本文地址:https://blog.csdn.net/Steel_Knife/article/details/110440931