I am trying to implement word-level prediction, adapting the character-level RNN from http://karpathy.github.io/2015/05/21/rnn-effectiveness/. When I implement it in pure Python, training is fast. I recently started learning TensorFlow and tried to implement the same model with eager execution, but there the training is extremely slow. Have I done something wrong? Please advise. I am running everything on a CPU-only Mac.
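For context, this is roughly how I drive the class (with Shakespear.txt in the working directory; a hidden size of 100 is just the value I happened to pick for these tests):

model = RNN_LSTM(100)
model.Bonga()  # endless training loop; periodically prints generated samples and the smoothed loss

Here is the full code.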
import tensorflow as tf
tf.enable_eager_execution()
import tensorflow.contrib.eager as tfe
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from IPython import display
plt.style.use('seaborn-white')
class RNN_LSTM(object):
    def __init__(self,hidden_size):
        data=open('Shakespear.txt', 'r').read()
        self.data = data.split()
        vocab_size=len(list(set(self.data)))
        self.words =list(set(self.data))
        self.hidden_size=hidden_size
        self.input_size=vocab_size+hidden_size
        self.vocab_size=vocab_size
        self.W1=tf.Variable(tf.random.normal((self.hidden_size,self.input_size))*0.1,dtype=tf.dtypes.float32,name="W1")
        self.b1=tf.Variable(tf.random.normal((self.hidden_size,1))*0.1,dtype=tf.dtypes.float32,name="b1")
        self.W2=tf.Variable(tf.random.normal((self.hidden_size,self.input_size))*0.1,dtype=tf.dtypes.float32,name="W2")
        self.b2=tf.Variable(tf.random.normal((self.hidden_size,1))*0.1,dtype=tf.dtypes.float32,name="b2")
        self.W3=tf.Variable(tf.random.normal((self.hidden_size,self.input_size))*0.1,dtype=tf.dtypes.float32,name="W3")
        self.b3=tf.Variable(tf.random.normal((self.hidden_size,1))*0.1,dtype=tf.dtypes.float32,name="b3")
        self.W4=tf.Variable(tf.random.normal((hidden_size,self.input_size))*0.1,dtype=tf.dtypes.float32,name="W4")
        self.b4=tf.Variable(tf.random.normal((self.hidden_size,1))*0.1,dtype=tf.dtypes.float32,name="b4")
        self.W5=tf.Variable(tf.random.normal((self.vocab_size,self.hidden_size))*0.1,dtype=tf.dtypes.float32,name="W5")
        self.b5=tf.Variable(tf.random.normal((self.vocab_size,1))*0.1,dtype=tf.dtypes.float32,name="b5")
        self.learning_rate=0.1
        self.sequence_length=25
        self.selection_sample_np=[i for i in range(self.vocab_size)]
        self.selection_sample_tf=tf.convert_to_tensor(self.selection_sample_np)
        self.char_to_ix={ch:ix for ix,ch in enumerate(self.words)}
        self.ix_to_char={ix:ch for ix,ch in enumerate(self.words)}
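    # feedforward: one hand-rolled LSTM step per word in the slice (gates from W1-W4,
    # output logits from W5), accumulating the cross-entropy loss over the whole sequence.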
    def feedforward(self,X,Y,hprev,p_s):
        losses=0
        for x,y in zip(X,Y):
            M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")
            h=tf.one_hot(x,self.vocab_size)
            gh=tf.reshape(h,(self.vocab_size,1))
            M_c=tf.concat((hprev,gh),axis=0)
            ft=tf.nn.sigmoid(tf.matmul(self.W1,M_c)+self.b1)
            it=tf.nn.sigmoid(tf.matmul(self.W2,M_c)+self.b2)
            gt=tf.math.tanh(tf.matmul(self.W3,M_c)+self.b3)
            cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
            ot=tf.nn.sigmoid(tf.matmul(self.W4,M_c)+self.b4)
            ht=tf.multiply(ot,tf.math.tanh(cs))
            output=tf.matmul(self.W5,ht)+self.b5
            yt = tf.nn.softmax(output,axis=0)
            losses=losses+(-tf.math.log(yt[y]))
            hprev=ht
            p_s=cs
        return ht,cs,output,losses
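    # process_data: turns a 25-word slice of the corpus into (input, target) index pairs
    # wrapped in a batched tf.data.Dataset.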
    def process_data(self,p):
        inputs=[self.char_to_ix[ch] for ch in self.data[p:p+self.sequence_length]]
        targets=[self.char_to_ix[ch] for ch in self.data[p+1:p+self.sequence_length+1]]
        features=tf.data.Dataset.from_tensor_slices(inputs)
        labels=tf.data.Dataset.from_tensor_slices(targets)
        training_data=tf.data.Dataset.zip((features,labels)).batch(25)
        return training_data
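    # sample_text: generates n words by repeatedly sampling from the softmax output and
    # feeding the sampled word back in as the next input.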
    def sample_text(self,hprev,begin,p_s,n):
        selected_letters=[]
        for i in range(n):
            M=tf.Variable(tf.zeros((self.input_size,1)),name="M")
            h=tf.one_hot(begin,self.vocab_size)
            gh=tf.reshape(h,(self.vocab_size,1))
            M=tf.concat((hprev,gh),axis=0)
            ft=tf.nn.sigmoid(tf.matmul(self.W1,M)+self.b1)
            it=tf.nn.sigmoid(tf.matmul(self.W2,M)+self.b2)
            gt=tf.math.tanh(tf.matmul(self.W3,M)+self.b3)
            cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
            ot=tf.sigmoid(tf.matmul(self.W4,M)+self.b4)
            ht=tf.multiply(ot,tf.math.tanh(cs))
            output=tf.matmul(self.W5,ht)+self.b5
            p=tf.nn.softmax(output,axis=0)
            p=tf.reshape(p.numpy(),(1,self.vocab_size))
            samples =tf.random.categorical(tf.log(p.numpy()), 1)
            sample_selected=samples[0][0]
            selected_next_letter=self.selection_sample_tf[sample_selected]
            hprev=ht
            p_s=cs
            begin=selected_next_letter
            selected_letters.append(selected_next_letter)
        return selected_letters
    def lossFun(self,x):
        # not used by the training loop below
        loss=tf.Variable(0.0)
        return loss.assign_add(x)
    def softmax(self,z):
        return tf.math.exp(z-tf.reduce_max(z))/tf.math.reduce_sum(tf.math.exp(z-tf.reduce_max(z)))
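    # Bonga: the training loop. Records the forward pass on a GradientTape, applies
    # gradient-clipped RMSProp updates, and prints a 200-word sample every 100 iterations.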
    def Bonga(self):
        p=0
        i=0
        smooth_loss =-np.log(1.0/self.vocab_size)*self.sequence_length
        while(1):
            with tf.GradientTape() as g:
                g.watch([self.W1,self.W2,self.W3,self.W4,self.W5,self.b1,self.b2,self.b3,self.b4,self.b5])
                if p+self.sequence_length+1>= len(self.data) or i == 0:
                    hprev=tf.Variable(np.zeros((self.hidden_size,1)),dtype=tf.float32)
                    p_s=tf.Variable(tf.zeros((self.hidden_size,1)))
                    p=0
                training_dataset=self.process_data(p)
                for batch in tfe.Iterator(training_dataset):
                    begin=batch[0][0]
                    hprev,p_s,output,loss=self.feedforward(batch[0],batch[1],hprev,p_s)
            training_dataset=self.process_data(p)
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self.learning_rate,decay=0.9,momentum=0.01, epsilon=1e-8,use_locking=False,centered=False,
                name='RMSProp')
            grad = g.gradient(loss,[self.W1,self.W2,self.W3,self.W4,self.W5,self.b1,self.b2,self.b3,self.b4,self.b5])
            capped_gvs = [(tf.clip_by_value(g, -1.0, 1.0)) for g in grad]
            optimizer.apply_gradients(zip(capped_gvs,[self.W1,self.W2,self.W3,self.W4,self.W5,self.b1,self.b2,self.b3,self.b4,self.b5]))
            if i % 100 == 0:
                sample_ix = self.sample_text(hprev,begin,p_s,200)
                text=[self.ix_to_char[ix.numpy()] for ix in sample_ix]
                txt = tf.strings.join(text,separator=" ")
                print ('----\n %s \n----' % (txt.numpy(), ))
            smooth_loss = smooth_loss * 0.999 + loss * 0.001
            print ('iter %d, loss: %f' % (i, smooth_loss))
            i=i+1
            p=p+self.sequence_length