labs-commits mailing list archives

From: tomm...@apache.org
Subject: svn commit: r1769529 - in /labs/yay/trunk/core/src: main/java/org/apache/yay/RNN.java main/java/org/apache/yay/StackedRNN.java test/resources/word2vec/abstracts.txt
Date: Sun, 13 Nov 2016 17:54:46 GMT
Author: tommaso
Date: Sun Nov 13 17:54:46 2016
New Revision: 1769529

URL: http://svn.apache.org/viewvc?rev=1769529&view=rev
Log:
minor perf improvements

Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java Sun Nov 13 17:54:46 2016
@@ -100,8 +100,8 @@ public class RNN {
     wxh = Nd4j.randn(hiddenLayerSize, vocabSize).mul(0.01);
     whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     why = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
-    bh = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    by = Nd4j.zeros(vocabSize, 1).mul(0.01);
+    bh = Nd4j.zeros(hiddenLayerSize, 1);
+    by = Nd4j.zeros(vocabSize, 1);
   }
 
   private String[] toStrings(char[] chars) {
@@ -248,7 +248,7 @@ public class RNN {
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {
-      INDArray dy = ps.getRow(t).dup();
+      INDArray dy = ps.getRow(t);
       dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
       INDArray hst = hs.getRow(t);
       dWhy.addi(dy.mmul(hst.transpose())); // derivative of hy layer
@@ -263,11 +263,11 @@ public class RNN {
     }
 
     // clip exploding gradients
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhy, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWxh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWhy, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dbh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dby, -5, 5));
 
     return loss;
   }
@@ -292,11 +292,9 @@ public class RNN {
     int sampleSize = 2 * seqLength;
     INDArray ixes = Nd4j.create(sampleSize);
 
-    INDArray h = hPrev.dup();
-
     for (int t = 0; t < sampleSize; t++) {
-      h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
-      INDArray y = (why.mmul(h)).add(by);
+      hPrev = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(hPrev)).add(bh)));
+      INDArray y = (why.mmul(hPrev)).add(by);
       INDArray pm = Nd4j.getExecutioner().execAndReturn(new SoftMax(y)).ravel();
 
       List<Pair<Integer, Double>> d = new LinkedList<>();
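
Not part of the commit: a minimal, self-contained sketch of the sampling recurrence that the hunk above rewrites, assuming the same 2016-era Nd4j API the class already uses (Nd4j.randn, Transforms.tanh, mmul). It illustrates why dropping the dup() of hPrev saves an allocation per sampled character: the hidden state can simply be reassigned each step. The class name, the sizes, the seed index, and the inline softmax are illustrative assumptions, not the project's API.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

// Illustrative only: mirrors the recurrence in RNN.sample(), with the hidden
// state reassigned in place instead of being copied with dup() every call.
public class SamplingSketch {
  public static void main(String[] args) {
    int vocabSize = 10;
    int hiddenLayerSize = 8;
    INDArray wxh = Nd4j.randn(hiddenLayerSize, vocabSize).mul(0.01);
    INDArray whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
    INDArray why = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
    INDArray bh = Nd4j.zeros(hiddenLayerSize, 1);
    INDArray by = Nd4j.zeros(vocabSize, 1);

    INDArray x = Nd4j.zeros(vocabSize, 1);
    x.putScalar(0, 1);                            // 1-of-k encoding of the seed char
    INDArray hPrev = Nd4j.zeros(hiddenLayerSize, 1);

    for (int t = 0; t < 5; t++) {
      // h_t = tanh(Wxh * x_t + Whh * h_{t-1} + bh), reusing hPrev directly
      hPrev = Transforms.tanh(wxh.mmul(x).add(whh.mmul(hPrev)).add(bh));
      // unnormalized log probabilities, then a numerically stable softmax
      INDArray y = why.mmul(hPrev).add(by);
      INDArray p = Transforms.exp(y.sub(y.maxNumber().doubleValue()));
      p.divi(p.sumNumber().doubleValue());
      System.out.println(p.ravel());
    }
  }
}

In equation form, one step is h_t = tanh(Wxh * x_t + Whh * h_{t-1} + bh), y_t = Why * h_t + by, p_t = softmax(y_t); the inline exp/normalize above stands in for the SoftMax op used in the class.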

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java Sun Nov 13 17:54:46 2016
@@ -70,9 +70,9 @@ public class StackedRNN extends RNN {
     whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     whh2 = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     wh2y = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
-    bh = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    bh2 = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    by = Nd4j.zeros(vocabSize, 1).mul(0.01);
+    bh = Nd4j.zeros(hiddenLayerSize, 1);
+    bh2 = Nd4j.zeros(hiddenLayerSize, 1);
+    by = Nd4j.zeros(vocabSize, 1);
   }
 
   public void learn() {
@@ -112,7 +112,7 @@ public class StackedRNN extends RNN {
       INDArray targets = getSequence(p + 1);
 
       // sample from the model every now and then
-      if (n % 100 == 0 && n > 0) {
+      if (n % 1000 == 0 && n > 0) {
         String txt = sample(inputs.getInt(0));
         System.out.printf("\n---\n %s \n----\n", txt);
       }
@@ -172,69 +172,63 @@ public class StackedRNN extends RNN {
   private double lossFun(INDArray inputs, INDArray targets, INDArray dWxh, INDArray dWhh, INDArray dWhh2, INDArray dWh2y,
                          INDArray dbh, INDArray dbh2, INDArray dby) {
 
-    INDArray xs = Nd4j.zeros(inputs.length(), vocabSize);
+    INDArray xs = Nd4j.zeros(seqLength, vocabSize);
     INDArray hs = null;
     INDArray hs2 = null;
     INDArray ys = null;
     INDArray ps = null;
 
-    INDArray hs1 = hPrev.dup();
-    INDArray hs12 = hPrev2.dup();
-
     double loss = 0;
 
     // forward pass
-    for (int t = 0; t < inputs.length(); t++) {
+    for (int t = 0; t < seqLength; t++) {
       int tIndex = inputs.getScalar(t).getInt(0);
       xs.putScalar(t, tIndex, 1); // encode in 1-of-k representation
 
-      INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
+      INDArray hsRow = t == 0 ? hPrev : hs.getRow(t - 1);
       INDArray xst = xs.getRow(t);
       INDArray hst = Transforms.tanh((wxh.mmul(xst.transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
       if (hs == null) {
-        hs = init(inputs.length(), hst);
+        hs = init(seqLength, hst);
       }
       hs.putRow(t, hst);
 
-      INDArray hs2Row = t == 0 ? hs12 : hs2.getRow(t - 1);
+      INDArray hs2Row = t == 0 ? hPrev2 : hs2.getRow(t - 1);
       INDArray hst2 = Transforms.tanh((whh.mmul(hst)).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
       if (hs2 == null) {
-        hs2 = init(inputs.length(), hst2);
+        hs2 = init(seqLength, hst2);
       }
       hs2.putRow(t, hst2);
 
       INDArray yst = (wh2y.mmul(hst2)).add(by); // unnormalized log probabilities for next chars
       if (ys == null) {
-        ys = init(inputs.length(), yst);
+        ys = init(seqLength, yst);
       }
       ys.putRow(t, yst);
 
       INDArray pst = Nd4j.getExecutioner().execAndReturn(new SoftMax(yst)); // probabilities for next chars
       if (ps == null) {
-        ps = init(inputs.length(), pst);
+        ps = init(seqLength, pst);
       }
       ps.putRow(t, pst);
 
-      int targetsInt = targets.getInt(t);
-      loss += -Math.log(pst.getDouble(targetsInt)); // softmax (cross-entropy loss)
+      loss += -Math.log(pst.getDouble(targets.getInt(t))); // softmax (cross-entropy loss)
     }
 
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     INDArray dh2Next = Nd4j.zerosLike(hs2.getRow(0));
-    for (int t = inputs.length() - 1; t >= 0; t--) {
-
-      INDArray dy = ps.getRow(t).dup();
-      dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
+    for (int t = seqLength - 1; t >= 0; t--) {
+      INDArray dy = ps.getRow(t);
+      dy.getRow(targets.getInt(t)).subi(1); // backprop into y
 
       INDArray hs2t = hs2.getRow(t);
-      INDArray hs2tm1 = t == 0 ? hs12 : hs2.getRow(t - 1);
+      INDArray hs2tm1 = t == 0 ? hPrev2 : hs2.getRow(t - 1);
 
       dWh2y.addi(dy.mmul(hs2t.transpose()));
       dby.addi(dy);
 
       INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // backprop into h2
-
       INDArray dhraw2 = (Nd4j.ones(hs2t.shape()).sub(hs2t.mul(hs2t))).mul(dh2); //  backprop through tanh nonlinearity
       dbh2.addi(dhraw2);
       INDArray hst = hs.getRow(t);
@@ -242,29 +236,28 @@ public class StackedRNN extends RNN {
       dWhh2.addi(dhraw2.mmul(hs2tm1.transpose()));
       dh2Next = whh2.transpose().mmul(dhraw2);
 
-      INDArray dh = whh2.transpose().mmul(dhraw2).add(dhNext); // backprop into h
+      INDArray dh = dh2Next.add(dhNext); // backprop into h
       INDArray dhraw = (Nd4j.ones(hst.shape()).sub(hst.mul(hst))).mul(dh); // backprop through tanh nonlinearity
       dbh.addi(dhraw);
-
       dWxh.addi(dhraw.mmul(xs.getRow(t)));
-      INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
+      INDArray hsRow = t == 0 ? hPrev : hs.getRow(t - 1);
       dWhh.addi(dhraw.mmul(hsRow.transpose()));
       dhNext = whh.transpose().mmul(dhraw);
 
     }
 
-    this.hPrev = hs.getRow(inputs.length() - 1);
-    this.hPrev2 = hs2.getRow(inputs.length() - 1);
+    this.hPrev = hs.getRow(seqLength - 1);
+    this.hPrev2 = hs2.getRow(seqLength - 1);
 
     // clip exploding gradients
     int clip = 5;
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWxh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh2, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWh2y, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dbh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dbh2, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dby, -clip, clip));
 
     return loss;
   }
@@ -280,8 +273,8 @@ public class StackedRNN extends RNN {
     int sampleSize = seqLength * 2;
     INDArray ixes = Nd4j.create(sampleSize);
 
-    INDArray h = hPrev.dup();
-    INDArray h2 = hPrev2.dup();
+    INDArray h = hPrev;
+    INDArray h2 = hPrev2;
 
     for (int t = 0; t < sampleSize; t++) {
       h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
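
Again not part of the commit: a hedged sketch of a single forward step of the stacked recurrence that lossFun() above iterates over seqLength, built only from calls that already appear in the diff (Transforms.tanh, mmul, add). The matrix and bias names mirror the StackedRNN fields; the sizes, the input and target indices, and the inline softmax are assumptions made for illustration.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

// Illustrative only: one time step of the two-layer forward pass plus the
// cross-entropy term that lossFun() accumulates per character.
public class StackedStepSketch {
  public static void main(String[] args) {
    int vocabSize = 10;
    int hiddenLayerSize = 8;
    INDArray wxh  = Nd4j.randn(hiddenLayerSize, vocabSize).mul(0.01);
    INDArray whh  = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
    INDArray whh2 = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
    INDArray wh2y = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
    INDArray bh  = Nd4j.zeros(hiddenLayerSize, 1);
    INDArray bh2 = Nd4j.zeros(hiddenLayerSize, 1);
    INDArray by  = Nd4j.zeros(vocabSize, 1);

    INDArray xst = Nd4j.zeros(vocabSize, 1);
    xst.putScalar(3, 1);                               // 1-of-k encoded input char (index assumed)
    INDArray hPrev  = Nd4j.zeros(hiddenLayerSize, 1);  // first-layer state from the previous step
    INDArray hPrev2 = Nd4j.zeros(hiddenLayerSize, 1);  // second-layer state from the previous step

    // first hidden layer:  h1_t = tanh(Wxh * x_t + Whh * h1_{t-1} + bh)
    INDArray hst  = Transforms.tanh(wxh.mmul(xst).add(whh.mmul(hPrev)).add(bh));
    // second hidden layer: h2_t = tanh(Whh * h1_t + Whh2 * h2_{t-1} + bh2)
    INDArray hst2 = Transforms.tanh(whh.mmul(hst).add(whh2.mmul(hPrev2)).add(bh2));
    // output layer: y_t = Wh2y * h2_t + by, then softmax and -log p[target]
    INDArray yst = wh2y.mmul(hst2).add(by);
    INDArray pst = Transforms.exp(yst.sub(yst.maxNumber().doubleValue()));
    pst.divi(pst.sumNumber().doubleValue());
    int target = 4;                                    // assumed target char index
    double loss = -Math.log(pst.getDouble(target));
    System.out.println("per-char loss: " + loss);
  }
}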

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Sun Nov 13 17:54:46 2016
@@ -137,4 +137,8 @@ We employ a character-level convolutiona
 Our character-to-character model outperforms a recently proposed baseline with a subwordlevel encoder on WMT’15 DE-EN and CSEN , and gives comparable performance on FIEN and RU-EN .
 We then demonstrate that it is possible to share a single characterlevel encoder across multiple languages by training a model on a many-to-one translation task .
 In this multilingual setting , the character-level encoder significantly outperforms the subword-level encoder on all the language pairs .
-We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
\ No newline at end of file
+We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
+The Teacher Forcing algorithm trains recurrent networks by supplying observed sequence values as inputs during training and using the network’s own one-step- ahead predictions to do multi-step sampling .
+We introduce the Professor Forcing algorithm , which uses adversarial domain adaptation to encourage the dynamics of the recurrent network to be the same when training the network and when sampling from the network over multiple time steps .
+We apply Professor Forcing to language modeling , vocal synthesis on raw waveforms , handwriting generation , and image generation .
+Empirically we find that Professor Forcing acts as a regularizer , improving test likelihood on character level Penn Treebank and sequential MNIST. We also find that the model qualitatively improves samples, especially when sam- pling for a large number of time steps. This is supported by human evaluation of sample quality. Trade-offs between Professor Forcing and Scheduled Sampling are discussed. We produce T-SNEs showing that Professor Forcing successfully makes the dynamics of the network during training and sampling more similar.
\ No newline at end of file



