# Training loop: after every gradient step, run the weight-clipping op so the
# max-norm constraint is re-applied to the freshly updated weights.
# NOTE(review): assumes `training_op`, `X`, `y`, `X_batches`, `y_batches`,
# `n_epochs` and `clip_weights` are defined earlier in the file.
with tf.Session() as sess:
    for epoch in range(n_epochs):
        for X_batch, y_batch in zip(X_batches, y_batches):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            # After each iteration, run the weight-clipping op.
            clip_weights.eval()
更简洁的写法如下:
def max_norm_regularizer(threshold, axes=1, name="max_norm",
                         collection="max_norm"):
    """Build a max-norm "regularizer" for use with TF 1.x layer factories.

    Unlike a true regularizer it adds no loss term; instead it creates a
    `clip_weights` assignment op that re-scales the weights to have norm at
    most `threshold`, and stores that op in a graph collection so it can be
    fetched and run after each training step.

    Args:
        threshold: maximum allowed norm of the weights.
        axes: axes argument forwarded to `tf.clip_by_norm` (default 1:
            clip each row/unit's incoming weight vector independently).
        name: name given to the clipping assignment op.
        collection: graph collection the clipping op is added to.

    Returns:
        A function `max_norm(weights)` suitable as a `kernel_regularizer`;
        it always returns None (no regularization loss).
    """
    def max_norm(weights):
        clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
        clip_weights = tf.assign(weights, clipped, name=name)
        # Add the clip_weights op to the collection so the training loop
        # can retrieve and run it after every step.
        tf.add_to_collection(collection, clip_weights)
        return None  # there is no regularization loss term
    return max_norm
# Fetch the clip_weights ops stored in the "max_norm" collection.
clip_all_weights = tf.get_collection("max_norm")

# Training loop: run one sess.run over the whole list of clipping ops after
# each gradient step, instead of evaluating each op individually.
# NOTE(review): assumes `training_op`, `X`, `y`, `X_batches`, `y_batches`
# and `n_epochs` are defined earlier in the file.
with tf.Session() as sess:
    for epoch in range(n_epochs):
        for X_batch, y_batch in zip(X_batches, y_batches):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            sess.run(clip_all_weights)
each $s_i$ accumulates the squares of the partial
derivatives of the cost function with regard to parameter $\theta_i$. If the cost
function is steep along the $i$-th dimension, then $s_i$ will get larger and larger
at each iteration.
In fact, since Adam is an adaptive learning rate
algorithm (like AdaGrad and RMSProp), it requires less tuning of the learning
rate hyperparameter $\eta$. You can often use the default value $\eta = 0.001$.