By : iSeriesveteran
Date : November 24 2020, 09:00 AM
Here's a sketch implementation using plain Theano; it can be integrated into Lasagne easily enough. You need to create a custom operation that acts as an identity in the forward pass but reverses the gradient in the backward pass. code :
class ReverseGradient(theano.gof.Op):
    """Identity in the forward pass; multiplies the gradient by -hp_lambda in the backward pass."""
    view_map = {0: [0]}

    __props__ = ('hp_lambda',)

    def __init__(self, hp_lambda):
        super(ReverseGradient, self).__init__()
        self.hp_lambda = hp_lambda

    def make_node(self, x):
        return theano.gof.graph.Apply(self, [x], [x.type.make_variable()])

    def perform(self, node, inputs, output_storage):
        # forward pass: pass the input through unchanged
        xin, = inputs
        xout, = output_storage
        xout[0] = xin

    def grad(self, input, output_gradients):
        # backward pass: reverse (and scale) the incoming gradient
        return [-self.hp_lambda * output_gradients[0]]
The op can then be used like this: code :
import numpy
import theano
import theano.tensor as tt


def g_f(z, theta_f):
    # feature extractor
    for w_f, b_f in theta_f:
        z = tt.tanh(theano.dot(z, w_f) + b_f)
    return z


def g_y(z, theta_y):
    # label predictor
    for w_y, b_y in theta_y[:-1]:
        z = tt.tanh(theano.dot(z, w_y) + b_y)
    w_y, b_y = theta_y[-1]
    z = tt.nnet.softmax(theano.dot(z, w_y) + b_y)
    return z


def g_d(z, theta_d):
    # domain classifier
    for w_d, b_d in theta_d[:-1]:
        z = tt.tanh(theano.dot(z, w_d) + b_d)
    w_d, b_d = theta_d[-1]
    z = tt.nnet.sigmoid(theano.dot(z, w_d) + b_d)
    return z


def l_y(z, y):
    return tt.nnet.categorical_crossentropy(z, y).mean()


def l_d(z, d):
    return tt.nnet.binary_crossentropy(z, d).mean()


def mlp_parameters(input_size, layer_sizes):
    # one (weight, bias) shared-variable pair per layer
    parameters = []
    previous_size = input_size
    for layer_size in layer_sizes:
        parameters.append((theano.shared(numpy.random.randn(previous_size, layer_size).astype(theano.config.floatX)),
                           theano.shared(numpy.zeros(layer_size, dtype=theano.config.floatX))))
        previous_size = layer_size
    return parameters, previous_size
def compile(input_size, f_layer_sizes, y_layer_sizes, d_layer_sizes, hp_lambda, hp_mu):
    r = ReverseGradient(hp_lambda)

    theta_f, f_size = mlp_parameters(input_size, f_layer_sizes)
    theta_y, _ = mlp_parameters(f_size, y_layer_sizes)
    theta_d, _ = mlp_parameters(f_size, d_layer_sizes)

    xs = tt.matrix('xs')
    xs.tag.test_value = numpy.random.randn(9, input_size).astype(theano.config.floatX)
    xt = tt.matrix('xt')
    xt.tag.test_value = numpy.random.randn(10, input_size).astype(theano.config.floatX)
    ys = tt.ivector('ys')
    ys.tag.test_value = numpy.random.randint(y_layer_sizes[-1], size=9).astype(numpy.int32)

    fs = g_f(xs, theta_f)
    # label loss on the source batch, plus domain losses on source (d=0) and target (d=1)
    # features; the domain-classifier inputs pass through the gradient-reversal op r
    e = l_y(g_y(fs, theta_y), ys) + l_d(g_d(r(fs), theta_d), 0) + l_d(g_d(r(g_f(xt, theta_f)), theta_d), 1)

    updates = [(p, p - hp_mu * theano.grad(e, p)) for theta in theta_f + theta_y + theta_d for p in theta]
    train = theano.function([xs, xt, ys], outputs=e, updates=updates)

    return train


def main():
    theano.config.compute_test_value = 'raise'
    numpy.random.seed(1)
    compile(input_size=2, f_layer_sizes=[3, 4], y_layer_sizes=[7, 8], d_layer_sizes=[5, 6], hp_lambda=.5, hp_mu=.01)


main()
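A quick usage note (not from the original answer): compile returns the compiled train function, so a hypothetical training loop, with made-up source and target batches, could look like this: code :
train = compile(input_size=2, f_layer_sizes=[3, 4], y_layer_sizes=[7, 8],
                d_layer_sizes=[5, 6], hp_lambda=.5, hp_mu=.01)

# hypothetical labelled source batch and unlabelled target batch
xs_batch = numpy.random.randn(9, 2).astype(theano.config.floatX)
ys_batch = numpy.random.randint(8, size=9).astype(numpy.int32)
xt_batch = numpy.random.randn(10, 2).astype(theano.config.floatX)

for epoch in range(100):
    cost = train(xs_batch, xt_batch, ys_batch)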
To integrate with Lasagne, wrap the op in a custom layer: code :
import lasagne


class ReverseGradientLayer(lasagne.layers.Layer):
    def __init__(self, incoming, hp_lambda, **kwargs):
        super(ReverseGradientLayer, self).__init__(incoming, **kwargs)
        self.op = ReverseGradient(hp_lambda)

    def get_output_for(self, input, **kwargs):
        # apply the gradient-reversal op to the layer's input
        return self.op(input)
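For example (a sketch only; the layer sizes below are arbitrary), the layer slots into a Lasagne network like any other layer, with the domain-classifier branch placed behind it: code :
l_in = lasagne.layers.InputLayer(shape=(None, 2))
l_feat = lasagne.layers.DenseLayer(l_in, num_units=4,
                                   nonlinearity=lasagne.nonlinearities.tanh)
# gradients flowing back from the domain branch are reversed here
l_rev = ReverseGradientLayer(l_feat, hp_lambda=0.5)
l_domain = lasagne.layers.DenseLayer(l_rev, num_units=1,
                                     nonlinearity=lasagne.nonlinearities.sigmoid)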
How efficient/intelligent is Theano in computing gradients?
By : FaithJava
Date : March 29 2020, 07:55 AM
It turns out Theano does not reuse previously computed gradients when computing the gradients of lower layers in a computational graph. Here's a dummy example of a neural network with three hidden layers and an output layer. However, this is not a big deal in practice, because symbolic differentiation is a one-time operation: Theano returns the derivatives as a symbolic computational graph, which you compile into a function once and then use to compute numerical values and update the weights. code :
import time

import numpy as np
import theano
import theano.tensor as T
from theano import shared


class neuralNet(object):
    def __init__(self, examples):
        # layer sizes are hard-coded: 16384 -> 5000 -> 3000 -> 512 -> 40
        self.w = shared(np.random.random((16384, 5000)).astype(theano.config.floatX), borrow=True, name='w')
        self.w2 = shared(np.random.random((5000, 3000)).astype(theano.config.floatX), borrow=True, name='w2')
        self.w3 = shared(np.random.random((3000, 512)).astype(theano.config.floatX), borrow=True, name='w3')
        self.w4 = shared(np.random.random((512, 40)).astype(theano.config.floatX), borrow=True, name='w4')
        self.b = shared(np.ones(5000, dtype=theano.config.floatX), borrow=True, name='b')
        self.b2 = shared(np.ones(3000, dtype=theano.config.floatX), borrow=True, name='b2')
        self.b3 = shared(np.ones(512, dtype=theano.config.floatX), borrow=True, name='b3')
        self.b4 = shared(np.ones(40, dtype=theano.config.floatX), borrow=True, name='b4')

        self.x = examples

        L1 = T.nnet.sigmoid(T.dot(self.x, self.w) + self.b)
        L2 = T.nnet.sigmoid(T.dot(L1, self.w2) + self.b2)
        L3 = T.nnet.sigmoid(T.dot(L2, self.w3) + self.b3)
        L4 = T.dot(L3, self.w4) + self.b4
        self.forwardProp = T.nnet.softmax(L4)
        self.predict = T.argmax(self.forwardProp, axis=1)

    def loss(self, y):
        # negative log-likelihood of the correct classes
        return -T.mean(T.log(self.forwardProp)[T.arange(y.shape[0]), y])
x = T.matrix('x')
y = T.ivector('y')

nnet = neuralNet(x)
loss = nnet.loss(y)

diffrentiationTime = []
for i in range(100):
    t1 = time.time()
    # one call, asking for all gradients at once
    gw, gw2, gw3, gw4, gb, gb2, gb3, gb4 = T.grad(loss, [nnet.w, nnet.w2, nnet.w3, nnet.w4, nnet.b, nnet.b2, nnet.b3, nnet.b4])
    diffrentiationTime.append(time.time() - t1)
print('Efficient Method: Took %f seconds with std %f' % (np.mean(diffrentiationTime), np.std(diffrentiationTime)))

diffrentiationTime = []
for i in range(100):
    t1 = time.time()
    # eight separate calls, one per parameter
    gw = T.grad(loss, [nnet.w])
    gw2 = T.grad(loss, [nnet.w2])
    gw3 = T.grad(loss, [nnet.w3])
    gw4 = T.grad(loss, [nnet.w4])
    gb = T.grad(loss, [nnet.b])
    gb2 = T.grad(loss, [nnet.b2])
    gb3 = T.grad(loss, [nnet.b3])
    gb4 = T.grad(loss, [nnet.b4])
    diffrentiationTime.append(time.time() - t1)
print('Inefficient Method: Took %f seconds with std %f' % (np.mean(diffrentiationTime), np.std(diffrentiationTime)))
Efficient Method: Took 0.061056 seconds with std 0.013217
Inefficient Method: Took 0.305081 seconds with std 0.026024
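To illustrate the point about reuse (this sketch is not part of the original post; the learning rate is arbitrary), the symbolic gradients are compiled into a function once, and every subsequent training step only evaluates that compiled graph: code :
params = [nnet.w, nnet.w2, nnet.w3, nnet.w4, nnet.b, nnet.b2, nnet.b3, nnet.b4]
grads = T.grad(loss, params)   # symbolic differentiation happens once, here

lr = 0.01
train = theano.function([x, y], loss,
                        updates=[(p, p - lr * g) for p, g in zip(params, grads)])

# from here on, each call only computes numerical values and updates the weights:
# loss_value = train(x_batch, y_batch)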
How to monitor gradients of theano shared variables
By : SeekingSparrow
Date : March 29 2020, 07:55 AM
The question: how can one get the values of the gradients of Theano shared variables? Just don't pass W as an input argument: code :
gradf = theano.function([x, target], TT.grad(...))
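A self-contained sketch (the toy model and names are assumptions, not the original poster's code): W is a shared variable, so it is baked into the graph rather than passed as an input, and the compiled function returns the numerical gradient values directly: code :
import numpy as np
import theano
import theano.tensor as TT

x = TT.matrix('x')
target = TT.vector('target')
W = theano.shared(np.zeros((3, 1), dtype=theano.config.floatX), name='W')

cost = TT.mean((TT.dot(x, W).flatten() - target) ** 2)

# W is shared, so it is *not* listed as an input;
# the function returns the numerical value of dcost/dW
gradf = theano.function([x, target], TT.grad(cost, W))

g = gradf(np.random.randn(5, 3).astype(theano.config.floatX),
          np.random.randn(5).astype(theano.config.floatX))
print(g.shape)   # (3, 1)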
Theano -- Mean of squared gradients
By : Kesavan Selvaraj
Date : March 29 2020, 07:55 AM
Your function g_square happens to have complexity O(batch_size**2) instead of the expected O(batch_size), which makes it appear incredibly slow for larger batch sizes. The reason is that in every iteration the forward and backward pass is computed over the whole batch, even though only cost[i] for a single data point is needed. Assuming the input to the cost computation graph, x, is a tensor whose first dimension has size batch_size, Theano has no means to automatically slice this tensor along that dimension, so the computation is always done over the whole batch.
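For illustration only (this is not the original answer's fix, and the single-softmax-layer model is an assumption), one way to get O(batch_size) behaviour is to let theano.scan run the per-example forward and backward pass, so each step only touches one data point: code :
import theano
import theano.tensor as T

x = T.matrix('x')       # (batch_size, n_in)
y = T.ivector('y')      # (batch_size,)
w = T.matrix('w')       # (n_in, n_out)
b = T.vector('b')       # (n_out,)

def step(x_i, y_i, w, b):
    # forward and backward pass for a single example only
    p = T.nnet.softmax(T.dot(x_i.dimshuffle('x', 0), w) + b)
    c = T.nnet.categorical_crossentropy(p, y_i.dimshuffle('x'))[0]
    return (T.grad(c, w) ** 2).sum()

# scan iterates over examples, so the work grows linearly with batch_size
sq_grads, _ = theano.scan(step, sequences=[x, y], non_sequences=[w, b])
g_square = sq_grads.mean()
f = theano.function([x, y, w, b], g_square)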
Checking backpropagation gradients
By : Crazykiddo
Date : March 29 2020, 07:55 AM
The question asks about adapting a reinforcement learning script coded in pure Python to TensorFlow. One way to check the result is to print the values of the backpropagation gradients:
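As an illustration only (not the original answer's code; the toy model and names are made up), a minimal TF1-style sketch of evaluating and printing gradients with tf.gradients and a session: code :
import numpy as np
import tensorflow as tf

# hypothetical toy model: a single linear layer with a squared-error loss
x = tf.placeholder(tf.float32, [None, 4])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.Variable(tf.zeros([4, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

# symbolic gradients of the loss with respect to all trainable variables
grads = tf.gradients(loss, tf.trainable_variables())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    grad_values = sess.run(grads, feed_dict={x: np.random.randn(8, 4),
                                             y: np.random.randn(8, 1)})
    for var, g in zip(tf.trainable_variables(), grad_values):
        print(var.name, g)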
Periodically log gradients without requiring two functions (or slowdown) in Theano
By : Dubravko Gacina
Date : March 29 2020, 07:55 AM
Your first function executes a training step and updates all of your parameters; the second function must return the gradients of your parameters.
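As an illustration (not the original answer's code; the toy model and learning rate are assumptions), one way to avoid compiling a second function is to have the single training function return the gradients alongside the cost, and only look at them every N iterations: code :
import numpy as np
import theano
import theano.tensor as T

# hypothetical one-parameter model: least squares on a single weight vector
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
x = T.matrix('x')
y = T.vector('y')
cost = T.mean((T.dot(x, w) - y) ** 2)
grad_w = T.grad(cost, w)

# one compiled function: applies the update and also returns the gradient
train = theano.function([x, y],
                        outputs=[cost, grad_w],
                        updates=[(w, w - 0.01 * grad_w)])

for i in range(1000):
    c, g = train(np.random.randn(16, 3).astype(theano.config.floatX),
                 np.random.randn(16).astype(theano.config.floatX))
    if i % 100 == 0:          # log gradients periodically
        print(i, c, np.abs(g).mean())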