python实现K-means算法
k-means 算法接受参数 k;然后将事先输入的 n 个数据对象划分为 k 个聚类,以便使得所获得的聚类满足:同一聚类中的对象相似度较高,而不同聚类中的对象相似度较小。聚类相似度是利用各聚类中对象的均值所获得的一个“中心对象”(引力中心)来进行计算的。算法先随机选取 k 个聚类中心,计算所有点到各中心的距离,把每个点归入距离最近的一类;再以每个簇为单位,求簇中各点的均值作为新的聚类中心,如此反复迭代直到结果不再明显变化。
操作方法
- 01
# Step 1: compute the Euclidean distance and sample the initial centroids
# (k is the total number of clusters).
import random

import numpy as np


def calculate_distance(vector1, vector2):
    """Return the Euclidean distance between two points.

    Args:
        vector1: Array-like of real numbers (list, tuple or ndarray).
        vector2: Array-like of real numbers, broadcastable against vector1.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    # Converting to float arrays lets plain lists be passed and keeps
    # unsigned-integer inputs from wrapping around on subtraction.
    difference = np.asarray(vector1, dtype=float) - np.asarray(vector2, dtype=float)
    return np.sqrt(np.sum(np.square(difference)))


def initialize_centroids(data, k):
    """Pick k distinct rows of data at random to act as starting centroids.

    Args:
        data: Sequence or ndarray of points; each element is one point.
        k: Number of centroids to draw.

    Returns:
        A list of k elements of data, chosen without replacement.

    Raises:
        ValueError: Propagated from random.sample when k is negative or
            larger than the number of points.
    """
    # random.sample only accepts real sequences on Python 3; an ndarray is
    # rejected with TypeError, so materialise the rows as a list first.
    return random.sample(list(data), k)
- 02
# Step 2: assign every point to its nearest centroid to form the new
# clusters, then recompute each cluster's centroid.


def minimun_distance(data, centroidlist):
    """Group the points of data by the index of their nearest centroid.

    Distances are Euclidean, the same metric as calculate_distance.

    Args:
        data: Iterable of points (each array-like of real numbers).
        centroidlist: Sequence or ndarray of centroids, one per cluster.

    Returns:
        A dict mapping centroid index (int) to the list of points assigned
        to it. Indices that attract no point are absent from the dict.

    Raises:
        ValueError: If centroidlist is empty.
    """
    centroids = np.asarray(centroidlist, dtype=float)
    if centroids.size == 0:
        raise ValueError("centroidlist must contain at least one centroid")
    # One row per centroid, so scalar (1-D) points are handled as well.
    centroids = centroids.reshape(len(centroids), -1)

    clusterdictionary = {}
    for point in data:
        coordinates = np.asarray(point, dtype=float).ravel()
        distances = np.sqrt(np.sum(np.square(centroids - coordinates), axis=1))
        # argmin returns the first minimum, matching a strict "<" scan.
        marker = int(np.argmin(distances))
        clusterdictionary.setdefault(marker, []).append(point)
    return clusterdictionary


def getcentroids(clusterdictionary):
    """Return the mean point of every cluster as an ndarray.

    Args:
        clusterdictionary: Dict mapping cluster label to a list of points.

    Returns:
        An ndarray with one row per cluster, ordered by ascending cluster
        label, so that row i belongs to label i whenever the labels are
        exactly 0..n-1.
    """
    # Sorting the labels makes the row order independent of the order in
    # which clusters happened to be inserted into the dict.
    return np.array([
        np.mean(np.array(clusterdictionary[key]), axis=0)
        for key in sorted(clusterdictionary)
    ])
- 03
# Step 3: load the data and iterate, leaving the loop once the summed
# point-to-centroid distance changes by less than a small threshold.
import re  # kept from the original listing; nothing shown here uses it


def getmsd(clusterdictionary, centroidlist):
    """Return the summed Euclidean distance from each point to its centroid.

    NOTE(review): despite the name ("mean squared deviation") the value is
    neither squared nor averaged; it is a plain sum of distances and is
    only used as a convergence measure.

    Args:
        clusterdictionary: Dict mapping cluster label to a list of points.
        centroidlist: Sequence indexable by cluster label, giving that
            cluster's centroid.

    Returns:
        The total distance as a float (0.0 for an empty dict).
    """
    total = 0.0
    for key in clusterdictionary:
        members = clusterdictionary[key]
        if len(members) == 0:
            continue
        centroid = np.asarray(centroidlist[key], dtype=float).ravel()
        points = np.asarray(members, dtype=float)
        points = points.reshape(len(points), -1)
        total += np.sqrt(np.sum(np.square(points - centroid), axis=1)).sum()
    return total


def showresult(clusterdictionary, centroidlist):
    """Plot every cluster's points and centroid with matplotlib.

    Only the first two coordinates of each point and centroid are drawn.
    Four marker colours are available; labels beyond that reuse them.

    Args:
        clusterdictionary: Dict mapping integer cluster label to points.
        centroidlist: Sequence indexable by cluster label.
    """
    import matplotlib.pyplot as plt

    colormark = ['or', 'ob', 'og', 'ok']
    centroidmark = ['dr', 'db', 'dg', 'dk']
    for key in clusterdictionary:
        # Wrap around instead of raising IndexError for labels >= 4.
        style = key % len(colormark)
        plt.plot(centroidlist[key][0], centroidlist[key][1],
                 centroidmark[style], markersize=12)
        for point in clusterdictionary[key]:
            plt.plot(point[0], point[1], colormark[style])
    # The original referenced plt.show without calling it, so no window
    # was ever opened.
    plt.show()


def _load_data(path):
    """Read a tab-separated file of numbers into a 2-D float ndarray."""
    rows = []
    with open(path, 'r') as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would make float('') fail.
                continue
            rows.append([float(field) for field in stripped.split('\t')])
    return np.array(rows)


def _run_kmeans(data, k, tolerance=0.00001):
    """Iterate assignment and centroid update until the distance sum settles.

    Returns the final (clusterdictionary, centroidlist) pair.
    """
    centroidlist = initialize_centroids(data, k)
    clusterdictionary = minimun_distance(data, centroidlist)
    new_msd = getmsd(clusterdictionary, centroidlist)
    # Small negative seed so the first comparison sees a difference.
    old_msd = -0.000001
    while abs(new_msd - old_msd) >= tolerance:
        centroidlist = getcentroids(clusterdictionary)
        clusterdictionary = minimun_distance(data, centroidlist)
        old_msd = new_msd
        new_msd = getmsd(clusterdictionary, centroidlist)
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(new_msd - old_msd)
    return clusterdictionary, centroidlist


if __name__ == "__main__":
    path = 'C:\\Users\\jyjh\\Desktop\\data.txt'
    data = _load_data(path)
    clusterdictionary, centroidlist = _run_kmeans(data, 4)
    showresult(clusterdictionary, centroidlist)