# TensorFlow 学习指南 三、学习

## 聚类和 KMeans

### 生成样本

``````import tensorflow as tf
import numpy as np

def create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed):
np.random.seed(seed)
slices = []
centroids = []
# 为每个簇创建样本
for i in range(n_clusters):
samples = tf.random_normal((n_samples_per_cluster, n_features),
mean=0.0, stddev=5.0, dtype=tf.float32, seed=seed, name="cluster_{}".format(i))
current_centroid = (np.random.random((1, n_features)) * embiggen_factor) - (embiggen_factor/2)
centroids.append(current_centroid)
samples += current_centroid
slices.append(samples)
# 创建一个大的“样本”数据集
samples = tf.concat(slices, 0, name='samples')
centroids = tf.concat(centroids, 0, name='centroids')
return centroids, samples
``````

`create_samples`方法保存在名为`functions.py`的文件中，允许我们为这个（以及下一个！）课程，将这些方法导入到我们的脚本中。 创建一个名为`generate_samples.py`的新文件，其中包含以下代码：

``````import tensorflow as tf
import numpy as np

from functions import create_samples

n_features = 2
n_clusters = 3
n_samples_per_cluster = 500
seed = 700
embiggen_factor = 70

np.random.seed(seed)

centroids, samples = create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed)

model = tf.global_variables_initializer()
with tf.Session() as session:
sample_values = session.run(samples)
centroid_values = session.run(centroids)
``````

``````def plot_clusters(all_samples, centroids, n_samples_per_cluster):
import matplotlib.pyplot as plt
# 绘制出不同的簇
# 为每个簇选择不同的颜色
colour = plt.cm.rainbow(np.linspace(0,1,len(centroids)))
for i, centroid in enumerate(centroids):
# 为给定簇抓取样本，并用新颜色绘制出来
samples = all_samples[i*n_samples_per_cluster:(i+1)*n_samples_per_cluster]
plt.scatter(samples[:,0], samples[:,1], c=colour[i])
# 还绘制质心
plt.plot(centroid[0], centroid[1], markersize=35, marker="x", color='k', mew=10)
plt.plot(centroid[0], centroid[1], markersize=30, marker="x", color='m', mew=5)
plt.show()
``````

``````
plot_clusters(sample_values, centroid_values, n_samples_per_cluster)
``````

image

### 初始化

k-means 算法从初始质心的选择开始，初始质心只是数据中实际质心的随机猜测。 以下函数将从数据集中随机选择多个样本作为此初始猜测：

``````def choose_random_centroids(samples, n_clusters):
# 第 0 步：初始化：选择 n_clusters 个随机点
n_samples = tf.shape(samples)[0]
random_indices = tf.random_shuffle(tf.range(0, n_samples))
begin = [0,]
size = [n_clusters,]
size[0] = n_clusters
centroid_indices = tf.slice(random_indices, begin, size)
initial_centroids = tf.gather(samples, centroid_indices)
return initial_centroids
``````

``````import tensorflow as tf
import numpy as np

from functions import create_samples, choose_random_centroids, plot_clusters

n_features = 2
n_clusters = 3
n_samples_per_cluster = 500
seed = 700
embiggen_factor = 70

centroids, samples = create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed)
initial_centroids = choose_random_centroids(samples, n_clusters)

model = tf.global_variables_initializer()
with tf.Session() as session:
sample_values = session.run(samples)
updated_centroid_value = session.run(initial_centroids)

plot_clusters(sample_values, updated_centroid_value, n_samples_per_cluster)
``````

### 更新质心

``````def assign_to_nearest(samples, centroids):
# 为每个样本查找最近的质心

expanded_vectors = tf.expand_dims(samples, 0)
expanded_centroids = tf.expand_dims(centroids, 1)
distances = tf.reduce_sum( tf.square(
tf.subtract(expanded_vectors, expanded_centroids)), 2)
mins = tf.argmin(distances, 0)
nearest_indices = mins
return nearest_indices
``````

``````def update_centroids(samples, nearest_indices, n_clusters):
# 将质心更新为与其相关的所有样本的平均值。
nearest_indices = tf.to_int32(nearest_indices)
partitions = tf.dynamic_partition(samples, nearest_indices, n_clusters)
new_centroids = tf.concat([tf.expand_dims(tf.reduce_mean(partition, 0), 0) for partition in partitions], 0)
return new_centroids
``````

``````import tensorflow as tf
import numpy as np

from functions import *

n_features = 2
n_clusters = 3
n_samples_per_cluster = 500
seed = 700
embiggen_factor = 70

data_centroids, samples = create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed)
initial_centroids = choose_random_centroids(samples, n_clusters)
nearest_indices = assign_to_nearest(samples, initial_centroids)
updated_centroids = update_centroids(samples, nearest_indices, n_clusters)

model = tf.global_variables_initializer()
with tf.Session() as session:
sample_values = session.run(samples)
updated_centroid_value = session.run(updated_centroids)
print(updated_centroid_value)

plot_clusters(sample_values, updated_centroid_value, n_samples_per_cluster)
``````

• 从初始质心生成样本
• 随机选择初始质心
• 关联每个样本和最近的质心
• 将每个质心更新为与关联的样本的平均值

1）传递给`generate_samples`的种子选项可确保每次运行脚本时，“随机”生成的样本都是一致的。 我们没有将种子传递给`choose_random_centroids`函数，这意味着每次运行脚本时这些初始质心都不同。 更新脚本来为随机质心包含新的种子。

2）迭代地执行 k 均值算法，其中来自之前迭代的更新的质心用于分配簇，然后用于更新质心，等等。 换句话说，算法交替调用`assign_to_nearest`和`update_centroids`。 在停止之前，更新代码来执行此迭代 10 次。 你会发现，随着 k-means 的更多迭代，得到的质心平均上更接近。 （对于那些对 k-means 有经验的人，未来的教程将研究收敛函数和其他停止标准。）

## 训练和收敛

``````import tensorflow as tf

x = tf.Variable(0, name='x')

model = tf.global_variables_initializer()

with tf.Session() as session:
for i in range(5):
session.run(model)
x = x + 1
print(session.run(x))
``````

``````import tensorflow as tf

x = tf.Variable(0., name='x')
threshold = tf.constant(5.)

model = tf.global_variables_initializer()

with tf.Session() as session:
session.run(model)
while session.run(tf.less(x, threshold)):
x = x + 1
x_value = session.run(x)
print(x_value)
``````

### 梯度下降

``````import tensorflow as tf
import numpy as np

# x 和 y 是我们的训练数据的占位符
x = tf.placeholder("float")
y = tf.placeholder("float")
# w 是存储我们的值的变量。 它使用“猜测”来初始化
# w[0] 是我们方程中的“a”，w[1] 是“b”
w = tf.Variable([1.0, 2.0], name="w")
# 我们的模型是 y = a*x + b
y_model = tf.multiply(x, w[0]) + w[1]

# 我们的误差定义为差异的平方
error = tf.square(y - y_model)

# TensorFlow 常规 - 初始化值，创建会话并运行模型
model = tf.global_variables_initializer()

with tf.Session() as session:
session.run(model)
for i in range(1000):
x_value = np.random.rand()
y_value = x_value * 2 + 6
session.run(train_op, feed_dict={x: x_value, y: y_value})

w_value = session.run(w)
print("Predicted model: {a:.3f}x + {b:.3f}".format(a=w_value[0], b=w_value[1]))
``````

### 其它优化器

TensorFlow 有一整套优化器，并且你也可以定义自己的优化器（如果你对这类事情感兴趣）。 如何使用它们的 API，请参阅此页面。 列表如下：

• `GradientDescentOptimizer`
• `AdagradOptimizer`
• `MomentumOptimizer`
• `AdamOptimizer`
• `FtrlOptimizer`
• `RMSPropOptimizer`

### 绘制误差

image

``````errors = []
with tf.Session() as session:
session.run(model)
for i in range(1000):
x_train = tf.random_normal((1,), mean=5, stddev=2.0)
y_train = x_train * 2 + 6
x_value, y_value = session.run([x_train, y_train])
_, error_value = session.run([train_op, error], feed_dict={x: x_value, y: y_value})
errors.append(error_value)
w_value = session.run(w)
print("Predicted model: {a:.3f}x + {b:.3f}".format(a=w_value[0], b=w_value[1]))

import matplotlib.pyplot as plt
plt.plot([np.mean(errors[i-50:i]) for i in range(len(errors))])
plt.show()
plt.savefig("errors.png")
``````

1）创建第 6 课中的 k-means 示例的收敛函数，如果旧质心与新质心之间的距离小于给定的`epsilon`值，则停止训练。

2）尝试从梯度下降示例（`w`）中分离`a`和`b`值。

3）我们的例子一次只训练一个示例，这是低效的。 扩展它来一次使用多个（例如 50 个）训练样本来学习。

## TFLearn

image

### Scikit-Learn API

Scikit-learn 是一个用于数据挖掘和分析的 Python 包，它非常受欢迎。 这是因为它广泛支持不同的算法，令人惊叹的文档，以及庞大而活跃的社区。 其他一个因素是它的一致接口，它的 API，允许人们构建可以使用 scikit-learn 辅助函数训练的模型，并允许人们非常容易地测试不同的模型。

``````from sklearn.datasets import load_digits
from matplotlib import pyplot as plt

``````

``````
fig = plt.figure(figsize=(3, 3))

plt.imshow(digits['images'][66], cmap="gray", interpolation='none')

plt.show()
``````

``````
from sklearn import svm

classifier = svm.SVC(gamma=0.001)
classifier.fit(digits.data, digits.target)
predicted = classifier.predict(digits.data)
``````

``````
import numpy as np
print(np.mean(digits.target == predicted))
``````

``````from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)
``````

### TFLearn

TensorFlow Learn 接口距离 scikit-learn 的接口只有一小步之遥：

``````from tensorflow.contrib import learn
n_classes = len(set(y_train))
classifier = learn.LinearClassifier(feature_columns=[tf.contrib.layers.real_valued_column("", dimension=X_train.shape[1])],
n_classes=n_classes)
classifier.fit(X_train, y_train, steps=10)

y_pred = classifier.predict(X_test)
``````

``````
from sklearn import metrics
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))
``````

1）将分类器更改为`DNNClassifier`并重新运行。随意告诉所有朋友你现在使用深度学习来做数据分析。

2）`DNNClassifier`的默认参数是好的，但不完美。尝试更改参数来获得更高的分数。

3）从 TFLearn 的文档中查看此示例并下载 CIFAR 10 数据集。构建一个使用卷积神经网络预测图像的分类器。你可以使用此代码加载数据：

``````def load_cifar(file):
import pickle
import numpy as np
with open(file, 'rb') as inf:
data = cifar['data'].reshape((10000, 3, 32, 32))
data = np.rollaxis(data, 3, 1)
data = np.rollaxis(data, 3, 1)
y = np.array(cifar['labels'])

# 最开始只需 2 和 9
# 如果要构建大型模型，请删除这些行
mask = (y == 2) | (y == 9)