labs-commits mailing list archives

From tomm...@apache.org
Subject svn commit: r1768715 - in /labs/yay/trunk/core/src: main/java/org/apache/yay/ test/java/org/apache/yay/ test/resources/word2vec/
Date Tue, 08 Nov 2016 15:33:36 GMT
Author: tommaso
Date: Tue Nov  8 15:33:35 2016
New Revision: 1768715

URL: http://svn.apache.org/viewvc?rev=1768715&view=rev
Log:
cross-entropy (ce) loss fixes, StackedRNN BPTT fix

Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java Tue Nov  8 15:33:35 2016
@@ -58,19 +58,19 @@ public class NNRunner {
             } catch (IOException e) {
               throw new RuntimeException("could not read from path " + path);
             }
-          }
+          } // use chars or words
           if (args.length > 2 && args[2] != null) {
             useChars = Boolean.valueOf(args[2]);
-          }
+          } // no. of epochs
           if (args.length > 3 && args[3] != null) {
             epochs = Integer.valueOf(args[3]);
-          }
+          } // hidden layer size
           if (args.length > 4 && args[4] != null) {
             hiddenLayerSize = Integer.valueOf(args[4]);
-          }
+          } // unrolled sequence length
           if (args.length > 5 && args[5] != null) {
             seqLength = Integer.valueOf(args[5]);
-          }
+          } // learning rate
           if (args.length > 6 && args[6] != null) {
             learningRate = Float.valueOf(args[6]);
           }
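
The comments added above document the runner's optional positional arguments. For quick reference, a hedged usage sketch of the argument order, based only on what this hunk shows (args[0] and args[1] are not visible here, and the example values are arbitrary, not project defaults):

    // Positional arguments documented by the comments in this hunk (indices 2..6 only):
    //   args[2]  use chars or words           e.g. "true"
    //   args[3]  number of epochs             e.g. "10"
    //   args[4]  hidden layer size            e.g. "100"
    //   args[5]  unrolled sequence length     e.g. "50"
    //   args[6]  learning rate                e.g. "0.1"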

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java Tue Nov  8 15:33:35 2016
@@ -240,7 +240,7 @@ public class RNN {
         ps = init(inputs.length(), pst);
       }
       ps.putRow(t, pst);
-      loss += -Transforms.log(ps.getRow(t).getRow(targets.getInt(t)), true).sumNumber().doubleValue(); // softmax (cross-entropy loss)
+      loss += -Math.log(pst.getDouble(targets.getInt(t))); // softmax (cross-entropy loss)
     }
 
     this.hPrev = hs.getRow(inputs.length() - 1);
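
The change above replaces a row-slice log reduction with the direct per-step cross-entropy term -log(p_t[target_t]), read from the softmax output pst. A minimal, self-contained sketch of that computation on plain double[] arrays (illustrative only; the project code operates on ND4J INDArrays and the names here are hypothetical):

    // Sketch: softmax followed by the cross-entropy term for a single time step.
    public final class CrossEntropySketch {
      static double crossEntropyStep(double[] logits, int target) {
        double max = Double.NEGATIVE_INFINITY;
        for (double v : logits) max = Math.max(max, v); // stabilize the exponentials
        double sum = 0.0;
        double[] probs = new double[logits.length];
        for (int i = 0; i < logits.length; i++) {
          probs[i] = Math.exp(logits[i] - max);
          sum += probs[i];
        }
        for (int i = 0; i < probs.length; i++) {
          probs[i] /= sum;
        }
        // same quantity as: loss += -Math.log(pst.getDouble(targets.getInt(t)))
        return -Math.log(probs[target]);
      }
    }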

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java Tue Nov  8 15:33:35 2016
@@ -54,6 +54,8 @@ public class StackedRNN extends RNN {
   private final INDArray bh2; // hidden2 bias
   private final INDArray by; // output bias
 
+  private final double reg = 1e-8;
+
   private INDArray hPrev = null; // memory state
   private INDArray hPrev2 = null; // memory state
 
@@ -137,25 +139,25 @@ public class StackedRNN extends RNN {
 
       // perform parameter update with Adagrad
       mWxh.addi(dWxh.mul(dWxh));
-      wxh.subi((dWxh.mul(learningRate)).div(Transforms.sqrt(mWxh.add(1e-8))));
+      wxh.subi((dWxh.mul(learningRate)).div(Transforms.sqrt(mWxh.add(reg))));
 
       mWhh.addi(dWhh.mul(dWhh));
-      whh.subi(dWhh.mul(learningRate).div(Transforms.sqrt(mWhh.add(1e-8))));
+      whh.subi(dWhh.mul(learningRate).div(Transforms.sqrt(mWhh.add(reg))));
 
       mWhh2.addi(dWhh2.mul(dWhh2));
-      whh2.subi(dWhh2.mul(learningRate).div(Transforms.sqrt(mWhh2.add(1e-8))));
+      whh2.subi(dWhh2.mul(learningRate).div(Transforms.sqrt(mWhh2.add(reg))));
 
       mbh2.addi(dbh2.mul(dbh2));
-      bh2.subi(dbh2.mul(learningRate).div(Transforms.sqrt(mbh2.add(1e-8))));
+      bh2.subi(dbh2.mul(learningRate).div(Transforms.sqrt(mbh2.add(reg))));
 
       mWh2y.addi(dWh2y.mul(dWh2y));
-      wh2y.subi(dWh2y.mul(learningRate).div(Transforms.sqrt(mWh2y.add(1e-8))));
+      wh2y.subi(dWh2y.mul(learningRate).div(Transforms.sqrt(mWh2y.add(reg))));
 
       mbh.addi(dbh.mul(dbh));
-      bh.subi(dbh.mul(learningRate).div(Transforms.sqrt(mbh.add(1e-8))));
+      bh.subi(dbh.mul(learningRate).div(Transforms.sqrt(mbh.add(reg))));
 
       mby.addi(dby.mul(dby));
-      by.subi(dby.mul(learningRate).div(Transforms.sqrt(mby.add(1e-8))));
+      by.subi(dby.mul(learningRate).div(Transforms.sqrt(mby.add(reg))));
 
       p += seqLength; // move data pointer
       n++; // iteration counter
@@ -176,11 +178,8 @@ public class StackedRNN extends RNN {
     INDArray ys = null;
     INDArray ps = null;
 
-    INDArray hs1 = Nd4j.create(hPrev.shape());
-    Nd4j.copy(hPrev, hs1);
-
-    INDArray hs12 = Nd4j.create(hPrev2.shape());
-    Nd4j.copy(hPrev2, hs12);
+    INDArray hs1 = hPrev.dup();
+    INDArray hs12 = hPrev2.dup();
 
     double loss = 0;
 
@@ -192,15 +191,13 @@ public class StackedRNN extends RNN {
       INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
       INDArray xst = xs.getRow(t);
       INDArray hst = Transforms.tanh((wxh.mmul(xst.transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
-//      INDArray hst = Transforms.relu((wxh.mmul(xs.getRow(t).transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
       if (hs == null) {
         hs = init(inputs.length(), hst);
       }
       hs.putRow(t, hst);
 
       INDArray hs2Row = t == 0 ? hs12 : hs2.getRow(t - 1);
-      INDArray hst2 = Transforms.tanh((whh.mmul(hs.getRow(t))).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
-//      INDArray hst2 = Transforms.relu((whh.mmul(hs.getRow(t))).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
+      INDArray hst2 = Transforms.tanh((whh.mmul(hst)).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
       if (hs2 == null) {
         hs2 = init(inputs.length(), hst2);
       }
@@ -217,7 +214,9 @@ public class StackedRNN extends RNN {
         ps = init(inputs.length(), pst);
       }
       ps.putRow(t, pst);
-      loss += -Transforms.log(ps.getRow(t).getRow(targets.getInt(t)), true).sumNumber().doubleValue(); // softmax (cross-entropy loss)
+
+      int targetsInt = targets.getInt(t);
+      loss += -Math.log(pst.getDouble(targetsInt)); // softmax (cross-entropy loss)
     }
 
     // backward pass: compute gradients going backwards
@@ -225,36 +224,32 @@ public class StackedRNN extends RNN {
     INDArray dh2Next = Nd4j.zerosLike(hs2.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {
 
-      INDArray dy = ps.getRow(t).dup(); // dy = np.copy(ps[t])
-      int targetsInt = targets.getInt(t);
-      INDArray dyRow = dy.getRow(targetsInt);
-      dy.putRow(targetsInt, dyRow.sub(1)); // backprop into y
+      INDArray dy = ps.getRow(t).dup();
+      dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
 
       INDArray hs2t = hs2.getRow(t);
       INDArray hs2tm1 = t == 0 ? hs12 : hs2.getRow(t - 1);
 
-      dWh2y.addi(dy.mmul(hs2t.transpose())); // dWhy += np.dot(dy, hs[t].T)
-      dby.addi(dy); // dby += dy
+      dWh2y.addi(dy.mmul(hs2t.transpose()));
+      dby.addi(dy);
 
-      INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // dh = np.dot(Why.T, dy) + dhnext # backprop into h2
+      INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // backprop into h2
 
       INDArray dhraw2 = (Nd4j.ones(hs2t.shape()).sub(hs2t.mul(hs2t))).mul(dh2); //  backprop through tanh nonlinearity
-//      INDArray dhraw2 = Nd4j.getExecutioner().execAndReturn(new SetRange(hst2, 0, Double.MAX_VALUE)).mul(dh2); // backprop through relu nonlinearity
-      dbh2.addi(dhraw2); // dbh += dhraw
+      dbh2.addi(dhraw2);
       INDArray hst = hs.getRow(t);
-      dWhh.addi(dhraw2.mmul(hst.transpose())); // dWxh += np.dot(dhraw, xs[t].T)
-      dWhh2.addi(dhraw2.mmul(hs2tm1.transpose())); // dWhh += np.dot(dhraw, hs[t-1].T)
-      dh2Next = whh2.transpose().mmul(dhraw2); // dhnext = np.dot(Whh.T, dhraw)
+      dWhh.addi(dhraw2.mmul(hst.transpose()));
+      dWhh2.addi(dhraw2.mmul(hs2tm1.transpose()));
+      dh2Next = whh2.transpose().mmul(dhraw2);
 
-      INDArray dh = whh2.transpose().mmul(dh2).add(dhNext); // backprop into h
+      INDArray dh = whh2.transpose().mmul(dhraw2).add(dhNext); // backprop into h
       INDArray dhraw = (Nd4j.ones(hst.shape()).sub(hst.mul(hst))).mul(dh); // backprop through tanh nonlinearity
-//      INDArray dhraw = Nd4j.getExecutioner().execAndReturn(new SetRange(hst, 0, Double.MAX_VALUE)).mul(dh); // backprop through relu nonlinearity
       dbh.addi(dhraw);
 
-      dWxh.addi(dhraw.mmul(xs.getRow(t))); // dWxh += np.dot(dhraw, xs[t].T)
+      dWxh.addi(dhraw.mmul(xs.getRow(t)));
       INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
-      dWhh.addi(dhraw.mmul(hsRow.transpose())); // dWhh += np.dot(dhraw, hs[t-1].T)
-      dhNext = whh.transpose().mmul(dhraw); // dhnext = np.dot(Whh.T, dhraw)
+      dWhh.addi(dhraw.mmul(hsRow.transpose()));
+      dhNext = whh.transpose().mmul(dhraw);
 
     }
 
@@ -262,13 +257,14 @@ public class StackedRNN extends RNN {
     this.hPrev2 = hs2.getRow(inputs.length() - 1);
 
     // clip exploding gradients
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -5, 5));
+    int clip = 5;
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -clip, clip));
 
     return loss;
   }
@@ -289,9 +285,7 @@ public class StackedRNN extends RNN {
 
     for (int t = 0; t < sampleSize; t++) {
       h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
-//      INDArray h = Transforms.relu((wxh.mmul(x)).add((whh.mmul(hPrev)).add(bh)));
       h2 = Transforms.tanh((whh.mmul(h)).add((whh2.mmul(h2)).add(bh2)));
-//      INDArray h2 = Transforms.relu((whh.mmul(h)).add((whh2.mmul(hPrev2)).add(bh2)));
       INDArray y = (wh2y.mmul(h2)).add(by);
       INDArray pm = Nd4j.getExecutioner().execAndReturn(new SoftMax(y)).ravel();
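
Two of the changes in this file are easy to mis-read in diff form: the Adagrad update divides by sqrt(cache + eps), with the epsilon now held in the reg field, and gradients are clipped element-wise to [-clip, clip] before the update is applied. A plain-Java sketch of both steps on flat double[] arrays (illustrative only; names are hypothetical and the real code uses ND4J INDArrays):

    // Sketch: element-wise gradient clipping followed by a per-parameter Adagrad update.
    public final class AdagradSketch {
      static void adagradStep(double[] param, double[] grad, double[] cache,
                              double learningRate, double clip, double eps) {
        for (int i = 0; i < param.length; i++) {
          double g = Math.max(-clip, Math.min(clip, grad[i]));      // clip exploding gradients
          cache[i] += g * g;                                        // e.g. mWxh.addi(dWxh.mul(dWxh))
          param[i] -= learningRate * g / Math.sqrt(cache[i] + eps); // e.g. wxh.subi(dWxh.mul(lr).div(sqrt(mWxh.add(reg))))
        }
      }
    }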
 

Modified: labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java (original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java Tue Nov  8 15:33:35 2016
@@ -41,7 +41,7 @@ public class RNNCrossValidationTest {
   private int hiddenLayerSize;
   private Random r = new Random();
   private String text;
-  private final int epochs = 5;
+  private final int epochs = 10;
   private List<String> words;
 
   public RNNCrossValidationTest(float learningRate, int seqLength, int hiddenLayerSize) {
@@ -61,12 +61,11 @@ public class RNNCrossValidationTest {
   @Parameterized.Parameters
   public static Collection<Object[]> data() {
     return Arrays.asList(new Object[][]{
-            {1e-1f, 50, 5},
-            {1e-1f, 50, 10},
             {1e-1f, 50, 15},
             {1e-1f, 50, 25},
             {1e-1f, 50, 50},
             {1e-1f, 50, 100},
+            {1e-1f, 50, 150},
     });
   }
 

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Tue Nov  8 15:33:35 2016
@@ -95,4 +95,46 @@ In this paper , we propose a novel neura
 One RNN encodes a sequence of symbols into a fixedlength vector representation , and the other decodes the representation into another sequence of symbols .
 The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence .
 The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder–Decoder as an additional feature in the existing log-linear model .
-Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases .
\ No newline at end of file
+Qualitatively , we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases .
+Time series often have a temporal hierarchy , with information that is spread out over multiple time scales .
+Common recurrent neural networks , however, do not explicitly accommodate such a hierarchy , and most research on them has been focusing on training algorithms rather than on their basic architecture .
+In this paper we study the effect of a hierarchy of recurrent neural networks on processing time series .
+Here , each layer is a recurrent network which receives the hidden state of the previous layer as input .
+This architecture allows us to perform hierarchical processing on difficult temporal tasks , and more naturally capture the structure of time series .
+We show that they reach state-of-the-art performance for recurrent networks in character-level language modeling when trained with simple stochastic gradient descent .
+We also offer an analysis of the different emergent time scales .
+In this paper , we explore different ways to extend a recurrent neural network (RNN) to a deep RNN .
+We start by arguing that the concept of depth in an RNN is not as clear as it is in feedforward neural networks .
+By carefully analyzing and understanding the architecture of an RNN , however , we find three points of an RNN which may be made deeper ; (1) input-to-hidden function , (2) hidden-tohidden transition and (3) hidden-to-output function .
+Based on this observation , we propose two novel architectures of a deep RNN which are orthogonal to an earlier attempt of stacking multiple recurrent layers to build a deep RNN (Schmidhuber , 1992; El Hihi and Bengio , 1996) .
+We provide an alternative interpretation of these deep RNNs using a novel framework based on neural operators .
+The proposed deep RNNs are empirically evaluated on the tasks of polyphonic music prediction and language modeling .
+The experimental result supports our claim that the proposed deep RNNs benefit from the depth and outperform the conventional , shallow RNNs.
+Reasoning and inference are central to human and artificial intelligence .
+Modeling inference in human language is notoriously challenging but is fundamental to natural language understanding and many applications .
+With the availability of large annotated data , neural network models have recently advanced the field significantly .
+In this paper , we present a new state-of-the-art result , achieving the accuracy of 88.3% on the standard benchmark , the Stanford Natural Language Inference dataset .
+This result is achieved first through our enhanced sequential encoding model , which outperforms the previous best model that employs more complicated network architectures , suggesting that the potential of sequential LSTM-based models have not been fully explored yet in previous work .
+We further show that by explicitly considering recursive architectures , we achieve additional improvement .
+Particularly , incorporating syntactic parse information contributes to our best result ; it improves the performance even when the parse information is added to an already very strong system .
+We present a neural architecture for sequence processing .
+The ByteNet is a stack of two dilated convolutional neural networks , one to encode the source sequence and one to decode the target sequence , where the target network unfolds dynamically to generate variable length outputs .
+The ByteNet has two core properties : it runs in time that is linear in the length of the sequences and it preserves the sequences’ temporal resolution .
+The ByteNet decoder attains state-of-the-art performance on character-level language modelling and outperforms the previous best results obtained with recurrent neural networks .
+The ByteNet also achieves a performance on raw character-level machine translation that approaches that of the best neural translation models that run in quadratic time .
+The implicit structure learnt by the ByteNet mirrors the expected alignments between the sequences .
+The Teacher Forcing algorithm trains recurrent networks by supplying observed sequence values as inputs during training and using the network’s own one-stepahead predictions to do multi-step sampling .
+We introduce the Professor Forcing algorithm , which uses adversarial domain adaptation to encourage the dynamics of the recurrent network to be the same when training the network and when sampling from the network over multiple time steps .
+We apply Professor Forcing to language modeling , vocal synthesis on raw waveforms , handwriting generation , and image generation .
+Empirically we find that Professor Forcing acts as a regularizer , improving test likelihood on character level Penn Treebank and sequential MNIST .
+We also find that the model qualitatively improves samples , especially when sampling for a large number of time steps .
+This is supported by human evaluation of sample quality .
+Trade-offs between Professor Forcing and Scheduled Sampling are discussed .
+We produce T-SNEs showing that Professor Forcing successfully makes the dynamics of the network during training and sampling more similar .
+Most existing machine translation systems operate at the level of words , relying on explicit segmentation to extract tokens .
+We introduce a neural machine translation (NMT) model that maps a source character sequence to a target character sequence without any segmentation .
+We employ a character-level convolutional network with max-pooling at the encoder to reduce the length of source representation , allowing the model to be trained at a speed comparable to subword-level models while capturing local regularities .
+Our character-to-character model outperforms a recently proposed baseline with a subwordlevel encoder on WMT’15 DE-EN and CSEN , and gives comparable performance on FIEN and RU-EN .
+We then demonstrate that it is possible to share a single characterlevel encoder across multiple languages by training a model on a many-to-one translation task .
+In this multilingual setting , the character-level encoder significantly outperforms the subword-level encoder on all the language pairs .
+We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
\ No newline at end of file



