K-Means - 基于tensorflow实现程序

本文只介绍编写程序涉及到的API和程序的完整实现,具体算法原理请查看之前所写的K-Means算法介绍

一、基础准备

1、python 基础

2、numpy 基础

3、tensorflow 基础

initialized_value
作用:
返回变量初始化后的值。当需要用某个变量的值去初始化其他变量时,应使用这个函数,而不是直接使用该变量本身。

# Source variable: a 784x200 matrix sampled from a normal distribution
# with standard deviation 0.35.
weights = tf.Variable(
    tf.random_normal([784, 200], stddev=0.35),
    name="weights",
)
# A second variable that starts from the initialized value of `weights`.
w2 = tf.Variable(weights.initialized_value(), name="w2")
# The initialized value can also be scaled before it seeds another variable.
scaled_init = weights.initialized_value() * 0.2
w_twice = tf.Variable(scaled_init, name="w_twice")

tf.slice:
切割数组

y = np.arange(36).reshape([3, 3, 4])

sess = tf.Session()

# Start index of the slice along each of the three dimensions.
begin_y = [1, 1, 0]
# Number of elements to take along each dimension: 2 along the first,
# 2 along the second and 3 along the third.
size_y = [2, 2, 3]

# Elements selected by the slice, written as begin + offset:
# y[1+0, 1+0, 0+0], y[1+0, 1+0, 0+1], y[1+0, 1+0, 0+2] =  [16 17 18]
# y[1+0, 1+1, 0+0], y[1+0, 1+1, 0+1], y[1+0, 1+1, 0+2] =  [20 21 22]
#
# y[1+1, 1+0, 0+0], y[1+1, 1+0, 0+1], y[1+1, 1+0, 0+2] =  [28 29 30]
# y[1+1, 1+1, 0+0], y[1+1, 1+1, 0+1], y[1+1, 1+1, 0+2] =  [32 33 34]
print("y")
print(y)
out = tf.slice(y, begin_y, size_y)
print(sess.run(out))  # result: [[[16 17 18][20 21 22]][[28 29 30][32 33 34]]]
print("---------------")

tf.tile
复制数组
tf.tile(input, multiples, name = None)

sess = tf.Session()
data = tf.constant([[1, 2, 3, 4], [9, 8, 7, 6]])
# Repeat the 2x4 tensor twice along each of its two dimensions.
multiples = [2, 2]
d = tf.tile(data, multiples)
tiled = sess.run(d)
print(tiled)
----------
[[1 2 3 4 1 2 3 4]
 [9 8 7 6 9 8 7 6]
 [1 2 3 4 1 2 3 4]
 [9 8 7 6 9 8 7 6]]
 
sess = tf.Session()
data = tf.constant([1, 2, 3, 4])
# [2] repeats the 1-D tensor twice along its only dimension.
d = tf.tile(data, [2])
print(sess.run(d))
----------
[1 2 3 4 1 2 3 4]

tf.reduce_sum
沿指定维度对tensor求和:reduction_indices=0时沿第一维(按列)求和,reduction_indices=1时沿第二维(按行)求和。

inputs = [[1, 0, 2], [3, 2, 4]]

# Sum over the first dimension: the two rows are added element-wise.
B = tf.reduce_sum(inputs, reduction_indices=0)
with tf.Session() as sess:
    column_sums = sess.run(B)
    print(column_sums)
# >> [4 2 6]

# Sum over the second dimension: the entries of each row are added up.
B = tf.reduce_sum(inputs, reduction_indices=1)
with tf.Session() as sess:
    row_sums = sess.run(B)
    print(row_sums)
# >> [3 9]

tf.arg_min
求数组最小值的下标:axis=0表示沿第一维度查找,axis=1表示沿第二维度查找

data = tf.constant([[8, 1, 2], [2, 3, 4]])
sess = tf.Session()
# Index of the minimum along dimension 0: one result per column.
min_index_per_column = tf.arg_min(data, 0)
print(sess.run(min_index_per_column))
# >> [1 0 0]
# Index of the minimum along dimension 1: one result per row.
min_index_per_row = tf.arg_min(data, 1)
print(sess.run(min_index_per_row))
# >>[1 0]

tf.reduce_any
计算tensor中各个元素的逻辑或(or运算)

inputs = np.array([[True, False], [True, False]])
with tf.Session() as sess:
    # Logical OR over dimension 0: combines the two rows element-wise.
    A = tf.reduce_any(inputs, 0)
    print(sess.run(A))
    # >>[ True False]
    # Logical OR over dimension 1: combines the entries of each row.
    A = tf.reduce_any(inputs, 1)
    print(sess.run(A))
    # >>[ True  True]

tf.unsorted_segment_sum
根据segment_ids的分段计算各个片段的和,
函数原型为 tf.unsorted_segment_sum(data, segment_ids, num_segments, name=None),与tf.segment_sum函数类似,
不同在于segment_ids中id顺序可以是无序的

t1 = tf.constant([[1, 2, 3, 4], [-1, -2, -3, -4], [-1, -2, -8, -4]])
# Rows 0 and 2 carry segment id 0, row 1 carries segment id 1;
# the last argument is the number of segments.
segment_ids = tf.constant([0, 1, 0])
t2 = tf.unsorted_segment_sum(t1, segment_ids, 2)
with tf.Session() as sess1:
    segment_totals = sess1.run(t2)
    print(segment_totals)
实际上就是把segment_ids中下标相同的行相加。
如[0, 1, 0],则第0段为 [1,2,3,4] + [-1,-2,-8,-4] = [0,0,-5,0],第1段为 [-1,-2,-3,-4]

assign
tf.assign是用来更新模型中变量的值的。ref是待赋值的变量,value是要更新的值。即效果等同于 ref = value


sess = tf.Session()
a = tf.Variable(0.0)
# Scalar placeholder that supplies the value to be stored in `a`.
b = tf.placeholder(dtype=tf.float32, shape=[])
op = tf.assign(a, b)

# tf.initialize_all_variables() is deprecated; this is its replacement.
sess.run(tf.global_variables_initializer())
print(sess.run(a))
# 0.0
# Running the assign op with b fed as 5.0 overwrites the value of `a`.
sess.run(op, feed_dict={b: 5.})
print(sess.run(a))
# 5.0

二、完整程序

# -*- coding: utf-8 -*-
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

K = 4  # number of clusters
MAX_ITERS = 1000  # upper bound on the number of update iterations
N = 200  # nominal sample count; replaced by the real count after loading

# Reference cluster centres. NOTE: this name is rebound to a tf.Variable
# further down when the graph is built, so this list is not used there.
centers = [[-2, -2], [-2, 1.5], [1.5, -2], [2, 1.5]]

print("1、加载数据")
dataSet = []
# A forward slash works on every platform (including Windows), and the
# `with` block guarantees the file is closed even if parsing fails.
with open('data/testData.txt') as fileIn:
    for line in fileIn:
        # split() without arguments tolerates tabs and repeated spaces.
        lineArr = line.split()
        if not lineArr:
            continue  # skip blank lines instead of crashing on float('')
        dataSet.append([float(lineArr[0]), float(lineArr[1])])

N = len(dataSet)

# print("2、数据归一化")
# print(dataSet[0])
# # dataSet = np.mat(dataSet)
# # print(dataSet[0])

# Plot the clustered samples
def showCluster(dataSet, k, clusterAssment):
    """Plot 2-D samples, choosing each sample's marker from its cluster index.

    Args:
        dataSet: samples, convertible to an array of shape (numSamples, 2).
        k: number of clusters; must not exceed the number of marker styles.
        clusterAssment: cluster index of each sample, indexed like dataSet.

    Returns:
        1 when nothing could be drawn (the data is not an (n, 2) array or
        k exceeds the available marker styles); otherwise None, after the
        figure has been shown.
    """
    dataSet = np.array(dataSet)

    # Empty or 1-D input has no second axis; report it the same way as a
    # wrong sample dimension instead of failing with an IndexError.
    if dataSet.ndim != 2 or dataSet.shape[1] != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large! Only %d clusters can be drawn." % len(mark))
        return 1

    # Draw every sample with the marker that belongs to its cluster.
    for i, sample in enumerate(dataSet):
        markIndex = int(clusterAssment[i])
        plt.plot(sample[0], sample[1], mark[markIndex])

    plt.show()

# Per-cluster mean
def clusterMean(data, id, num):
    """Return the mean of the entries of `data` grouped by cluster label.

    Args:
        data: tensor holding the samples.
        id: cluster label of each sample.
        num: number of clusters.

    Returns:
        The per-cluster sum of `data` divided by the per-cluster count.
        A cluster that receives no sample has both sum and count equal to
        zero, so its entries are the result of 0/0.
    """
    # Sum of the samples that share a label.
    member_sums = tf.unsorted_segment_sum(data, id, num)
    # Summing ones with the same grouping counts the members of each cluster.
    ones = tf.ones_like(data)
    member_counts = tf.unsorted_segment_sum(ones, id, num)
    return member_sums / member_counts

# Build the graph
points = tf.Variable(dataSet)
cluster = tf.Variable(tf.zeros([N], dtype=tf.int64))
# Use the first K samples as the initial cluster centres
centers = tf.Variable(tf.slice(points.initialized_value(), [0, 0], [K, 2]))

# Replicate centres and points to a common [N, K, 2] shape so that all
# point-to-centre distances are computed in one batched operation
repCenters = tf.reshape(tf.tile(centers, [N, 1]), [N, K, 2])
repPoints = tf.reshape(tf.tile(points, [1, K]), [N, K, 2])
# Squared distance between every point and every centre
# (`axis` replaces the deprecated `reduction_indices` argument)
sumSqure = tf.reduce_sum(tf.square(repCenters - repPoints), axis=2)
# Index of the nearest centre for each point
bestCenter = tf.argmin(sumSqure, axis=1)
# True while at least one point differs from its stored assignment
change = tf.reduce_any(tf.not_equal(bestCenter, cluster))
# Mean of the points assigned to each centre
means = clusterMean(points, bestCenter, K)
# Make the per-cluster means the new centres and store the new assignment.
# The control dependency forces `change` to be evaluated before the
# variables it compares against are overwritten.
with tf.control_dependencies([change]):
    update = tf.group(centers.assign(means), cluster.assign(bestCenter))

with tf.Session() as sess:
    # tf.initialize_all_variables() is deprecated; this is its replacement.
    sess.run(tf.global_variables_initializer())
    changed = True
    iterNum = 0
    # Iterate until no assignment changes or the iteration budget is spent.
    while changed and iterNum < MAX_ITERS:
        iterNum += 1
        # One step: evaluate `change` and apply the update
        changed, _ = sess.run([change, update])

    # Read the final centres and assignments once, after the loop; this
    # also keeps both names defined when the loop body never executes.
    centersArr, clusterArr = sess.run([centers, cluster])

    print(clusterArr)
    print(centersArr)
    showCluster(dataSet, K, clusterArr)
        # # 显示图像
        # fig, ax = plt.subplots()
        # ax.scatter(dataSet.transpose()[0], dataSet.transpose()[1], marker='o', s=100, c=clusterArr)
        # plt.plot()
        # plt.show()

推荐阅读更多精彩内容