singa-commits mailing list archives

From wan...@apache.org
Subject [1/3] incubator-singa git commit: SINGA-36 Clean ModelProto, ClusterProto, JobProto and driver program
Date Thu, 23 Jul 2015 06:49:29 GMT
Repository: incubator-singa
Updated Branches:
  refs/heads/master 29de86337 -> c3a248a4b


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
new file mode 100644
index 0000000..7c462d2
--- /dev/null
+++ b/src/proto/job.proto
@@ -0,0 +1,461 @@
+package singa;
+
+message JobProto {
+  required ClusterProto cluster = 1;
+  required ModelProto model = 2;
+}
+
+message ClusterProto {
+  optional int32 nworker_groups = 1;
+  optional int32 nserver_groups = 2;
+  optional int32 nworkers_per_group = 3 [default = 1];
+  optional int32 nservers_per_group = 4 [default = 1];
+  optional int32 nworkers_per_procs = 5 [default = 1];
+  optional int32 nservers_per_procs = 6 [default = 1];
+
+  // servers and workers in different processes?
+  optional bool server_worker_separate = 11 [default = false];
+
+  // port number is used by ZeroMQ
+  optional int32 start_port = 13 [default = 6723];
+  // local workspace, train/val/test shards, checkpoint files
+  optional string workspace = 14 [default = "workspace"];
+
+  // conduct updates at server side; otherwise do it at worker side
+  optional bool server_update = 40 [default = true];
+  // share memory space between worker groups within one process
+  optional bool share_memory = 41 [default = true];
+
+  // Ethernet bandwidth in bytes per second; the default is 1 Gbps
+  optional int32 bandwidth = 50 [default = 134217728];
+  // poll time in milliseconds
+  optional int32 poll_time = 51 [default = 100];
+}
+
+
+enum Phase {
+  kTrain = 0;
+  kValidation = 1;
+  kTest = 2;
+  // positive phase for contrastive divergence algorithm
+  kPositive = 3;
+  // negative phase for contrastive divergence algorithm
+  kNegative = 4;
+  kForward = 5;
+  kBackward = 6;
+}
+
+message ModelProto {
+  // model name, e.g., "cifar10-dcnn", "mnist-mlp"
+  required string name = 1;
+  // frequency of displaying training info
+  required int32 display_frequency = 3;
+  // total num of steps for training
+  required int32 train_steps = 5;
+  // configuration of SGD updater, including learning rate, etc.
+  required UpdaterProto updater = 7;
+  enum GradCalcAlg {
+    // BP algorithm for feed-forward models, e.g., CNN, MLP, RNN
+    kBackPropagation = 1;
+    // CD algorithm for models such as RBM and DBM
+    kContrastiveDivergence = 2;
+  }
+  // gradient calculation algorithm
+  required GradCalcAlg alg = 8 [default = kBackPropagation];
+  required NetProto neuralnet = 9;
+
+  // total num of steps for validation
+  optional int32 validation_steps = 30 [default = 0];
+  // total num of steps for test
+  optional int32 test_steps = 31 [default = 0];
+  // frequency of validation
+  optional int32 validation_frequency = 32;
+  // frequency of test
+  optional int32 test_frequency = 33 [default = 0];
+  // frequency of checkpoint
+  optional int32 checkpoint_frequency = 34 [default = 0];
+  // send parameters to servers after training for this num of steps
+  optional int32 warmup_steps = 35 [default = 0];
+  // whether to resume training from checkpoint files
+  optional bool resume = 36 [default = false];
+
+  // start display after this num steps
+  optional int32 display_after = 60 [default = 0];
+  // start checkpoint after this num steps
+  optional int32 checkpoint_after = 61 [default = 0];
+  // start test after this num steps
+  optional int32 test_after = 62 [default = 0];
+  // start validation after this num steps
+  optional int32 validation_after = 63 [default = 0];
+  // last snapshot step
+  optional int32 step = 64 [default = 0];
+  // display debug info
+  optional bool debug = 65 [default = false];
+  // checkpoint files
+  repeated string checkpoint = 66;
+  // reset the version of params loaded from checkpoint file to step
+  optional bool reset_param_version = 67 [default = false];
+}
+
+message NetProto {
+  repeated LayerProto layer = 1;
+  // partitioning type for parallelism
+  optional int32 partition_dim = 2 [default = 0];
+}
+
+// weight matrix should be defined before bias vector
+message ParamProto {
+  enum InitMethod {
+    // fix the values of all parameters to the constant in the value field
+    kConstant = 0;
+    // sample gaussian with std and mean
+    kGaussian = 1;
+    // uniform sampling between low and high
+    kUniform = 2;
+    // copy the content and history from a previous training run
+    kPretrained = 3;
+    // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
+    // Gaussian distribution
+    kGaussainSqrtFanIn = 4;
+    // from Toronto Convnet, rectified linear activation, let
+    // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
+    // the program will multiply it.
+    kUniformSqrtFanIn = 5;
+    // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)); for tanh
+    // activation, range is [-a, +a]; for sigmoid activation, range is
+    // [-4a, +4a]; put the scale factor in the value field.
+    // see http://deeplearning.net/tutorial/mlp.html (Theano MLP)
+    kUniformSqrtFanInOut = 6;
+  }
+  optional InitMethod init_method = 1 [default = kGaussian];
+  // constant init
+  optional float value = 5 [default = 1];
+  // for uniform sampling
+  optional float low = 6 [default = -1];
+  optional float high = 7 [default = 1];
+  // for gaussian sampling
+  optional float mean = 8 [default = 0];
+  optional float std = 9 [default = 1];
+  // multiplied with the global learning rate
+  optional float learning_rate_multiplier = 15 [default = 1];
+  // multiplied with the global weight decay
+  optional float weight_decay_multiplier = 16 [default = 1];
+  // partition dimension, -1 for no partition
+  optional int32 partition_dim = 30;
+  // usually, the program will infer the param shape
+  repeated int32 shape = 31;
+  // identifies the same params across models; also used in debug info display
+  optional string name = 61 [default = ""];
+  // name of the owner param from which this param shares the values
+  optional string share_from = 62;
+  // used internally
+  optional int32 id = 63;
+  // parameter slice limit (Google Protobuf also has size limit)
+  optional int32 split_threshold = 64 [default = 5000000];
+  // used internally
+  optional int32 owner = 65 [default = -1];
+}
+
+enum PartitionType {
+  kDataPartition = 0;
+  kLayerPartition = 1;
+  kNone = 2;
+}
+
+message LayerProto {
+  // the layer name used for identification
+  required string name = 1;
+  enum LayerType {
+    kBridgeSrc = 15;
+    kBridgeDst = 16;
+    kConvolution = 1;
+    kConcate = 2;
+    kShardData = 3;
+    kDropout = 4;
+    kInnerProduct = 5;
+    kLabel = 18;
+    kLMDBData = 17;
+    kLRN = 6;
+    kMnist = 7;
+    kPooling = 8;
+    kPrefetch = 19;
+    kReLU = 9;
+    kRGBImage = 10;
+    kSoftmaxLoss = 11;
+    kSlice = 12;
+    kSplit = 13;
+    kTanh = 14;
+  }
+  // source layer names
+  repeated string srclayers = 3;
+  // parameters, e.g., weight matrix or bias vector
+  repeated ParamProto param = 12;
+  // all layers are included in the net structure for the training phase by
+  // default; layers not used in training, e.g., a data layer that loads test
+  // data, should be removed by setting the exclude field
+  repeated Phase exclude = 15;
+  // the layer type from the enum above
+  required LayerType type = 20;
+  // configuration for convolution layer
+  optional ConvolutionProto convolution_conf = 30;
+  // configuration for concatenation layer
+  optional ConcateProto concate_conf = 31;
+  // configuration for dropout layer
+  optional DropoutProto dropout_conf = 33;
+  // configuration for inner product layer
+  optional InnerProductProto innerproduct_conf = 34;
+  // configuration for lmdb data layer
+  optional DataProto lmdbdata_conf = 35;
+  // configuration for local response normalization layer
+  optional LRNProto lrn_conf = 45;
+  // configuration for mnist parser layer
+  optional MnistProto mnist_conf = 36;
+  // configuration for pooling layer
+  optional PoolingProto pooling_conf = 37;
+  // configuration for prefetch layer
+  optional PrefetchProto prefetch_conf = 44;
+  // configuration for rectified linear unit layer
+  optional ReLUProto relu_conf = 38;
+  // configuration for rgb image parser layer
+  optional RGBImageProto rgbimage_conf = 39;
+  // configuration for data layer
+  optional DataProto sharddata_conf = 32;
+  // configuration for slice layer
+  optional SliceProto slice_conf = 41;
+  // configuration for softmax loss layer
+  optional SoftmaxLossProto softmaxloss_conf = 40;
+  // configuration for split layer
+  optional SplitProto split_conf = 42;
+  // configuration for tanh layer
+  optional TanhProto tanh_conf = 43;
+
+
+  // overrides the partition dimension for neural net
+  optional int32 partition_dim = 59 [default = -1];
+  optional string datablob = 58 [default = "unknown"];
+
+  // names of parameters shared from other layers
+  repeated string share_param = 60;
+  optional int32 partition_id = 62 [default = 0];
+}
+
+message RGBImageProto {
+  // scale factor for each pixel
+  optional float scale = 1 [default = 1.0];
+  // size after cropping
+  optional int32 cropsize = 2 [default = 0];
+  // mirror the image
+  optional bool mirror = 3 [default = false];
+  // meanfile path
+  optional string meanfile = 4 [default = ""];
+}
+
+message PrefetchProto {
+  repeated LayerProto sublayers = 1;
+}
+
+message SplitProto {
+  optional int32 num_splits = 1 [default = 1];
+}
+
+// scaled tanh: A*tanh(B*x)
+message TanhProto {
+  // A of A*tanh(B*x)
+  optional float outer_scale = 1 [default = 1.0];
+  // B of A*tanh(B*x)
+  optional float inner_scale = 2 [default = 1.0];
+}
+
+message SoftmaxLossProto {
+  // compute accuracy over the top-k results
+  optional int32 topk = 1 [default = 1];
+  // loss scale factor
+  optional float scale = 30 [default = 1];
+}
+
+message ConvolutionProto {
+  // The number of outputs for the layer
+  required int32 num_filters = 1;
+  // the kernel height/width
+  required int32 kernel = 2;
+
+  // The padding height/width
+  optional int32 pad = 30 [default = 0];
+  // the stride
+  optional int32 stride = 31 [default = 1];
+  // whether to have bias terms
+  optional bool bias_term = 32 [default = true];
+}
+
+message ConcateProto {
+  // on which dimension, starts from 0
+  required int32 concate_dim = 1;
+}
+
+message DataProto {
+  // path to the data file/folder, absolute or relative to the workspace
+  required string path = 2;
+  // batch size.
+  required int32 batchsize = 4;
+  // skip [0,random_skip] records
+  optional int32 random_skip = 30 [default = 0];
+}
+
+message MnistProto {
+  // normalization: x / norm_a
+  required float norm_a = 1 [default = 1];
+  // normalization: x - norm_b
+  required float norm_b = 2 [default = 0];
+
+  // elastic distortion
+  optional int32 kernel = 30 [default = 0];
+  optional float sigma = 31 [default = 0];
+  optional float alpha = 32 [default = 0];
+  // rotation or horizontal shearing
+  optional float beta = 33 [default = 0];
+  // scaling
+  optional float gamma = 34 [default = 0];
+  // scale to this size as input for deformation
+  optional int32 resize = 35 [default = 0];
+  optional int32 elastic_freq = 36 [default = 0];
+}
+
+// Message that stores parameters used by DropoutLayer
+message DropoutProto {
+  // dropout ratio
+  optional float dropout_ratio = 30 [default = 0.5];
+}
+
+// Message that stores parameters used by InnerProductLayer
+message InnerProductProto {
+  // number of outputs for the layer
+  required int32 num_output = 1;
+  // use bias vector or not
+  optional bool bias_term = 30 [default = true];
+}
+
+message LRNProto {
+  // local response size
+  required int32 local_size = 1 [default = 5];
+  // scale factor
+  optional float alpha = 31 [default = 1.0];
+  // exponent
+  optional float beta = 32 [default = 0.75];
+  enum NormRegion {
+    // across channels, e.g., r,g,b
+    ACROSS_CHANNELS = 0;
+    // within channel, e.g., r, g and b are concatenated into one channel
+    WITHIN_CHANNEL = 1;
+  }
+  // normalization objective
+  optional NormRegion norm_region = 33 [default = ACROSS_CHANNELS];
+  // offset
+  optional float knorm = 34 [default = 1.0];
+}
+
+message PoolingProto {
+  // The kernel size (square)
+  required int32 kernel = 1;
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+  }
+  // The pooling method
+  optional PoolMethod pool = 30 [default = MAX];
+  // The padding size
+  optional uint32 pad = 31 [default = 0];
+  // The stride
+  optional uint32 stride = 32 [default = 1];
+}
+
+message SliceProto {
+  required int32 slice_dim = 1;
+}
+
+message ReLUProto {
+  // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
+  // Rectifier nonlinearities improve neural network acoustic models.
+  // In ICML Workshop on Deep Learning for Audio, Speech, and Language Processing.
+  optional float negative_slope = 1 [default = 0];
+}
+
+message UpdaterProto {
+  enum UpdaterType {
+    // normal SGD with momentum and weight decay
+    kSGD = 1;
+    // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
+    kAdaGrad = 2;
+    // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+    kRMSProp = 3;
+    // Nesterov first optimal gradient method
+    kNesterov = 4;
+  }
+  // updater type
+  required UpdaterType type = 1 [default = kSGD];
+  // configuration for RMSProp algorithm
+  optional RMSPropProto rmsprop_conf = 50;
+
+  enum ChangeMethod {
+    kFixed = 0;
+    kInverseT = 1;
+    kInverse = 2;
+    kExponential = 3;
+    kLinear = 4;
+    kStep = 5;
+    kFixedStep = 6;
+  }
+  // change method for learning rate
+  required ChangeMethod lr_change = 2 [default = kFixed];
+
+  optional FixedStepProto fixedstep_conf = 40;
+  optional StepProto step_conf = 41;
+  optional LinearProto linear_conf = 42;
+  optional ExponentialProto exponential_conf = 43;
+  optional InverseProto inverse_conf = 44;
+  optional InverseTProto inverset_conf = 45;
+
+  optional float momentum = 31 [default = 0];
+  optional float weight_decay = 32 [default = 0];
+  // base learning rate
+  optional float base_lr = 34 [default = 0];
+  // used to avoid divide by 0, i.e. x/(y+delta)
+  optional float delta = 35 [default = 0.00000001];
+}
+
+message RMSPropProto {
+  // history = history * rho_ + (1 - rho_) * (grad * grad_scale)
+  required float rho = 1;
+}
+
+message FixedStepProto {
+  repeated int32 step = 28;
+  // lr = step_lr[i] if current step >= step[i]
+  repeated float step_lr = 29;
+}
+
+message StepProto {
+  // lr = base_lr * gamma^(step/change_freq)
+  required float gamma = 35 [default = 1];
+  // lr = base_lr * gamma^(step/change_freq)
+  required int32 change_freq = 40;
+}
+message LinearProto {
+  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
+  required int32 change_freq = 40;
+  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
+  required float final_lr = 39;
+}
+message ExponentialProto {
+  // lr = base_lr / 2^(step/change_freq)
+  required int32 change_freq = 40;
+}
+message InverseTProto {
+  // lr = base_lr / (1+step/final_lr)
+  required float final_lr = 39;
+}
+message InverseProto {
+  // lr = base_lr*(1+gamma*step)^(-pow)
+  required float gamma = 1 [default = 1];
+  // lr = base_lr*(1+gamma*step)^(-pow)
+  required float pow = 2 [default = 0];
+}
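For orientation, here is what a complete job configuration in protobuf text format could look like once these messages are compiled. This is a minimal sketch assembled from the fields above; every layer name, path, and value is illustrative, not taken from this commit:

    cluster {
      nworkers_per_group: 2
      nservers_per_group: 1
      workspace: "examples/cifar10"
    }
    model {
      name: "cifar10-dcnn"
      train_steps: 1000
      display_frequency: 50
      updater {
        type: kSGD
        lr_change: kStep
        base_lr: 0.01
        step_conf { gamma: 0.5 change_freq: 100 }
      }
      neuralnet {
        layer {
          name: "data"
          type: kShardData
          sharddata_conf { path: "train_shard" batchsize: 64 }
        }
      }
    }

With this updater, StepProto's documented rule lr = base_lr * gamma^(step/change_freq) halves the learning rate every 100 steps, starting from 0.01.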

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
deleted file mode 100644
index f3b8dfe..0000000
--- a/src/proto/model.proto
+++ /dev/null
@@ -1,427 +0,0 @@
-package singa;
-enum Phase {
-  kTrain = 0;
-  kValidation = 1;
-  kTest= 2;
-  // postivie phase for contrastive divergence algorithm
-  kPositive = 3;
-  // negative phase for contrastive divergence algorithm
-  kNegative = 4;
-  kForward = 5;
-  kBackward = 6;
-}
-
-message ModelProto {
-  // model name, e.g., "cifar10-dcnn", "mnist-mlp"
-  required string name = 1;
-  // frequency of displaying training info
-  required int32 display_frequency = 3 ;
-  // total num of steps for training
-  required int32 train_steps = 5;
-  // configuration of SGD updater, including learning rate, etc.
-  required UpdaterProto updater = 7;
-  enum GradCalcAlg {
-    // BP algorithm for feed-forward models, e.g., CNN, MLP, RNN
-    kBackPropagation = 1;
-    // CD algorithm for RBM, DBM etc., models
-    kContrastiveDivergence = 2;
-  }
- // gradient calculation algorithm
-  required GradCalcAlg alg = 8 [default = kBackPropagation];
-  required NetProto neuralnet = 9;
-
-  // total num of steps for validation
-  optional int32 validation_steps = 30 [default = 0];
-  // total num of steps for test
-  optional int32 test_steps = 31 [default = 0];
-  // frequency of validation
-  optional int32 validation_frequency = 32;
-  // frequency of test
-  optional int32 test_frequency = 33 [default = 0];
-  // frequency of checkpoint
-  optional int32 checkpoint_frequency = 34 [default = 0];
-  // send parameters to servers after training for this num of steps
-  optional int32 warmup_steps = 35 [default = 0];
-  // checkpoint path
-  optional bool resume = 36 [default = false];
-
-   // start display after this num steps
-  optional int32 display_after =  60[default = 0];
-  // start checkpoint after this num steps
-  optional int32 checkpoint_after = 61 [default = 0];
-  // start test after this num steps
-  optional int32 test_after = 62 [default = 0];
-// start validation after this num steps
-  optional int32 validation_after = 63 [default = 0];
-  // last snapshot step
-  optional int32 step = 64 [default = 0];
-  // display debug info
-  optional bool debug = 65 [default = false];
-  // checkpoint files
-  repeated string checkpoint = 66;
-  // reset the version of params loaded from checkpoint file to step
-  optional bool reset_param_version = 67 [default = false];
-}
-
-message NetProto {
-  repeated LayerProto layer = 1;
-  // partitioning type for parallelism
-  optional int32 partition_dim = 2 [default = 0];
-}
-
-// weight matrix should be defined before bias vector
-message ParamProto {
-  enum InitMethod {
-    // fix the values of all parameters  a constant in the value field
-    kConstant = 0;
-    // sample gaussian with std and mean
-    kGaussian = 1;
-    // uniform sampling between low and high
-    kUniform = 2;
-    // copy the content and history which are from previous training
-    kPretrained = 3;
-    // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
-    // Gaussian distribution
-    kGaussainSqrtFanIn = 4;
-    // from Toronto Convnet, rectified linear activation, let
-    // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
-    // the program will multiply it.
-    kUniformSqrtFanIn = 5;
-    // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
-    // activation, range is [-a, +a], for sigmoid activation, range is
-    // [-4a, +4a], put the scale factor to value field.
-    // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
-    kUniformSqrtFanInOut = 6;
-  }
-  optional InitMethod init_method = 1 [default = kGaussian];
-  // constant init
-  optional float value = 5 [default = 1];
-  // for uniform sampling
-  optional float low = 6 [default = -1];
-  optional float high = 7 [default = 1];
-  // for gaussian sampling
-  optional float mean = 8 [default = 0];
-  optional float std = 9 [default = 1];
-  // multiplied on the global learning rate.
-  optional float learning_rate_multiplier = 15 [default = 1];
-  // multiplied on the global weight decay.
-  optional float weight_decay_multiplier = 16 [default = 1];
-  // partition dimension, -1 for no partition
-  optional int32 partition_dim = 30;
-  // usually, the program will infer the param shape
-  repeated int32 shape = 31;
-  // used for identifying the same params from diff models and display deug info
-  optional string name =  61 [default = ""];
-  // name of the owner param from which this param shares the values
-  optional string share_from = 62;
-  // used interally
-  optional int32 id = 63;
-  // parameter slice limit (Google Protobuf also has size limit)
-  optional int32 split_threshold = 64 [default = 5000000];
-  // used internally
-  optional int32 owner = 65 [default = -1];
-}
-
-enum PartitionType{
-  kDataPartition=0;
-  kLayerPartition=1;
-  kNone=2;
-}
-
-message LayerProto {
-  // the layer name used for identification
-  required string name = 1;
-  enum LayerType{
-    kBridgeSrc = 15;
-    kBridgeDst = 16;
-    kConvolution = 1;
-    kConcate = 2;
-    kShardData = 3;
-    kDropout = 4;
-    kInnerProduct = 5;
-    kLabel = 18;
-    kLMDBData = 17;
-    kLRN = 6;
-    kMnist = 7;
-    kPooling = 8;
-    kPrefetch = 19;
-    kReLU = 9;
-    kRGBImage = 10;
-    kSoftmaxLoss = 11;
-    kSlice = 12;
-    kSplit = 13;
-    kTanh = 14;
-  }
-  // source layer names
-  repeated string srclayers = 3;
-  // parameters, e.g., weight matrix or bias vector
-  repeated ParamProto param = 12;
-  // all layers are included in the net structure for training phase by default.
-  // some layers like data layer for loading test data are not used by training
-  // phase should be removed by setting the exclude field.
-  repeated Phase exclude = 15;
-  // the layer type from the enum above
-  required LayerType type = 20;
-  // configuration for convolution layer
-  optional ConvolutionProto convolution_conf = 30;
-  // configuration for concatenation layer
-  optional ConcateProto concate_conf = 31;
-  // configuration for dropout layer
-  optional DropoutProto dropout_conf = 33;
-  // configuration for inner product layer
-  optional InnerProductProto innerproduct_conf = 34;
-  // configuration for local response normalization layer
-  optional DataProto lmdbdata_conf = 35;
-  // configuration for local response normalization layer
-  optional LRNProto lrn_conf = 45;
-  // configuration for mnist parser layer
-  optional MnistProto mnist_conf= 36;
-  // configuration for pooling layer
-  optional PoolingProto pooling_conf = 37;
-  // configuration for prefetch layer
-  optional PrefetchProto prefetch_conf = 44;
-  // configuration for rectified linear unit layer
-  optional ReLUProto relu_conf = 38;
-  // configuration for rgb image parser layer
-  optional RGBImageProto rgbimage_conf = 39;
-  // configuration for data layer
-  optional DataProto sharddata_conf = 32;
- // configuration for slice layer
-  optional SliceProto slice_conf = 41;
-  // configuration for softmax loss layer
-  optional SoftmaxLossProto softmaxloss_conf = 40;
-  // configuration for split layer
-  optional SplitProto split_conf = 42;
-  // configuration for tanh layer
-  optional TanhProto tanh_conf = 43;
-
-
-  // overrides the partition dimension for neural net
-  optional int32 partition_dim =59 [default = -1];
-  optional string datablob = 58 [default = "unknow"];
-
-  // names of parameters shared from other layers
-  repeated string share_param = 60;
-  optional int32 partition_id = 62 [default = 0];
-}
-
-message RGBImageProto {
-  // scale factor for each pixel
-  optional float scale = 1 [default = 1.0];
-  // size after cropping
-  optional int32 cropsize = 2 [default = 0];
-  // mirror the image
-  optional bool mirror = 3 [default = false];
-  // meanfile path
-  optional string meanfile = 4 [default = ""];
-}
-
-message PrefetchProto {
-  repeated LayerProto sublayers = 1;
-}
-
-message SplitProto {
-  optional int32 num_splits = 1 [default =1];
-}
-
-// scaled tan: A*tan(B*x)
-message TanhProto {
-  // A of A*tan(B*x)
-  optional float outer_scale = 1 [default = 1.0];
-  // B of A*tan(B*x)
-  optional float inner_scale = 2 [default = 1.0];
-}
-
-message SoftmaxLossProto {
-  // computing accuracy against topk results
-  optional int32 topk = 1 [default = 1];
-  // loss scale factor
-  optional float scale= 30 [default = 1];
-}
-
-message ConvolutionProto {
-  // The number of outputs for the layer
-  required int32 num_filters = 1;
-  // the kernel height/width
-  required int32 kernel= 2;
-
-  // The padding height/width
-  optional int32 pad = 30 [default = 0];
-  // the stride
-  optional int32 stride = 31 [default = 1];
-  // whether to have bias terms
-  optional bool bias_term = 32 [default = true];
-}
-
-message ConcateProto {
-  // on which dimension, starts from 0
-  required int32 concate_dim = 1;
-}
-
-message DataProto {
-  // path to the data file/folder, absolute or relative to the workspace
-  required string path = 2;
-  // batch size.
-  required int32 batchsize = 4;
-  // skip [0,random_skip] records
-  optional int32 random_skip = 30 [default = 0];
-}
-
-message MnistProto {
-  // normalization x/norm_a
-  required float norm_a = 1 [default = 1];
-  // normalization x-norm_b
-  required float norm_b = 2 [default = 0];
-
-  // elastic distortion
-  optional int32 kernel = 30 [default = 0];
-  optional float sigma = 31 [default = 0];
-  optional float alpha = 32 [default = 0];
-  // rotation or horizontal shearing
-  optional float beta = 33 [default = 0];
-  // scaling
-  optional float gamma = 34 [default = 0];
-  // scale to this size as input for deformation
-  optional int32 resize = 35 [default = 0] ;
-  optional int32 elastic_freq = 36 [default = 0];
-}
-
-// Message that stores parameters used by DropoutLayer
-message DropoutProto {
-  // dropout ratio
-  optional float dropout_ratio = 30 [default = 0.5];
-}
-
-// Message that stores parameters used by InnerProductLayer
-message InnerProductProto {
-  // number of outputs for the layer
-  required int32 num_output = 1;
-  // use bias vector or not
-  optional bool bias_term = 30 [default = true];
-}
-
-message LRNProto {
-  // local response size
-  required int32 local_size = 1 [default = 5];
-  // scale factor
-  optional float alpha = 31 [default = 1.0];
-  // exponential number
-  optional float beta = 32 [default = 0.75];
-  enum NormRegion {
-    // across channels, e.g., r,g,b
-    ACROSS_CHANNELS = 0;
-    // within channel, e.g., r, g and b are concatenated into one channel
-    WITHIN_CHANNEL = 1;
-  }
-  // normalization objective
-  optional NormRegion norm_region = 33 [default = ACROSS_CHANNELS];
-  // offset
-  optional float knorm =34 [default = 1.0];
-}
-
-message PoolingProto {
-  // The kernel size (square)
-  required int32 kernel= 1;
-  enum PoolMethod {
-    MAX = 0;
-    AVE = 1;
-  }
-  // The pooling method
-  optional PoolMethod pool = 30 [default = MAX];
-  // The padding size
-  optional uint32 pad = 31 [default = 0];
-  // The stride
-  optional uint32 stride = 32 [default = 1];
-}
-
-message SliceProto{
-  required int32 slice_dim = 1;
-}
-
-message ReLUProto {
-  // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
-  // Rectifier nonlinearities improve neural network acoustic models.
-  // In ICML Workshop on Deep Learning for Audio, Speech, and Language Processing.
-  optional float negative_slope = 1 [default = 0];
-}
-
-message UpdaterProto {
-  enum UpdaterType{
-    // noraml SGD with momentum and weight decay
-    kSGD = 1;
-    // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
-    kAdaGrad = 2;
-    // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-    kRMSProp = 3;
-    // Nesterov first optimal gradient method
-    kNesterov = 4;
-  }
-  // updater type
-  required UpdaterType type = 1 [default=kSGD];
-  // configuration for RMSProp algorithm
-  optional RMSPropProto rmsprop_conf = 50;
-
- enum ChangeMethod {
-    kFixed = 0;
-    kInverseT = 1;
-    kInverse = 2;
-    kExponential = 3;
-    kLinear = 4;
-    kStep = 5;
-    kFixedStep = 6;
-  }
-  // change method for learning rate
-  required ChangeMethod lr_change= 2 [default = kFixed];
-
-  optional FixedStepProto fixedstep_conf=40;
-  optional StepProto step_conf=41;
-  optional LinearProto linear_conf=42;
-  optional ExponentialProto exponential_conf=43;
-  optional InverseProto inverse_conf=44;
-  optional InverseTProto inverset_conf=45;
-
-  optional float momentum = 31 [default = 0];
-  optional float weight_decay = 32 [default = 0];
-  // base learning rate
-  optional float base_lr = 34 [default = 0];
-  // used to avoid divide by 0, i.e. x/(y+delta)
-  optional float delta = 35 [default = 0.00000001];
-}
-
-message RMSPropProto{
-  // history=history*rho_+(1-rho_)*(grad*grad_scale);
-  required float rho = 1;
-}
-
-message FixedStepProto{
-  repeated int32 step = 28;
-  // lr = step_lr[i] if current step >= step[i]
-  repeated float step_lr = 29;
-}
-
-message StepProto{
-  // lr = base_lr * gamma^(step/change_freq)
-  required float gamma = 35 [default = 1];
-  // lr = base_lr * gamma^(step/change_freq)
-  required int32 change_freq= 40;
-}
-message LinearProto{
-  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
-  required int32 change_freq= 40;
-  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
-  required float final_lr = 39;
-}
-message ExponentialProto{
-  // lr = base / 2^(step/change_freq)
-  required int32 change_freq= 40;
-}
-message InverseTProto{
-  // lr = base_lr / (1+step/final_lr)
-  required float final_lr = 39;
-}
-message InverseProto{
-  // lr = base_lr*(1+gamma*step)^(-pow)
-  required float gamma = 1 [default = 1];
-  // lr = base_lr*(1+gamma*step)^(-pow)
-  required float pow = 2 [default = 0];
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/proto/singa.proto
----------------------------------------------------------------------
diff --git a/src/proto/singa.proto b/src/proto/singa.proto
new file mode 100644
index 0000000..94af58d
--- /dev/null
+++ b/src/proto/singa.proto
@@ -0,0 +1,8 @@
+package singa;
+
+message SingaProto {
+  // ip/hostname:port[,ip/hostname:port]
+  required string zookeeper_host = 1;
+  // if not set, use the default dir of glog
+  optional string log_dir = 2;
+}
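The matching global configuration is a two-line text file; the host and directory below are placeholders:

    zookeeper_host: "localhost:2181"
    log_dir: "/tmp/singa-log"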

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/test/test_cluster.cc
----------------------------------------------------------------------
diff --git a/src/test/test_cluster.cc b/src/test/test_cluster.cc
index c34dd0f..a51126d 100644
--- a/src/test/test_cluster.cc
+++ b/src/test/test_cluster.cc
@@ -1,5 +1,4 @@
 #include "gtest/gtest.h"
-#include "proto/cluster.pb.h"
 #include "utils/cluster.h"
 
 using namespace singa;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index 78ec49f..4a0a47a 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -11,6 +11,7 @@
 #include "trainer/trainer.h"
 #include "mshadow/tensor.h"
 
+
 namespace singa {
 using std::vector;
 using std::map;
@@ -193,7 +194,7 @@ vector<Worker*> Trainer::CreateWorkers(int nthreads, const ModelProto& mconf){
   return workers;
 }
 
-void Trainer::Resume(ModelProto& mconf) {
+void Trainer::Resume(ModelProto* modelConf) {
   tinydir_dir dir;
   string folder = Cluster::Get()->checkpoint_folder();
   tinydir_open(&dir, folder.c_str());
@@ -223,34 +224,34 @@ void Trainer::Resume(ModelProto& mconf) {
   }
 
   if (latest_step > 0) {
-    mconf.set_step(latest_step);
+    modelConf->set_step(latest_step);
     for (auto ck_file : ck_files)
-      mconf.add_checkpoint(folder + "/" +string(ck_file));
+      modelConf->add_checkpoint(folder + "/" +string(ck_file));
   }
   tinydir_close(&dir);
 }
 
-void Trainer::Start(ModelProto& mconf, const GlobalProto& gconf,
-                    const ClusterProto& cconf, int job, bool resume){
+void Trainer::Start(int job, bool resume,
+    const JobProto& jobConf, const SingaProto& singaConf) {
   // register job to zookeeper at the beginning
-  auto cluster=Cluster::Get(gconf, cconf, job);
-
-  RegisterDefaultClasses(mconf);
+  auto cluster = Cluster::Get(job, singaConf, jobConf.cluster());
+  ModelProto model = jobConf.model();
+  RegisterDefaultClasses(model);
   if (resume)
-    Resume(mconf);
+    Resume(&model);
 
   router_ = new Router();
   router_->Bind(kInprocRouterEndpoint);
   const string hostip = cluster->hostip();
   int port = router_->Bind("tcp://" + hostip + ":*");
   // register endpoint to zookeeper
-  cluster->Register(hostip + ":" + std::to_string(port), getpid());
+  cluster->Register(getpid(), hostip + ":" + std::to_string(port));
 
   int nthreads = 1;
-  const vector<Worker*> workers = CreateWorkers(nthreads, mconf);
+  const vector<Worker*> workers = CreateWorkers(nthreads, model);
   nthreads += workers.size();
-  const vector<Server*> servers = CreateServers(nthreads, mconf);
-  SetupWorkerServer(mconf, workers, servers);
+  const vector<Server*> servers = CreateServers(nthreads, model);
+  SetupWorkerServer(model, workers, servers);
 
 #ifdef USE_MPI
   for (int i = 0; i < nthreads; i++)
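The reworked Start signature suggests a driver program along the following lines. This is only a sketch under stated assumptions: the command-line handling and job id are placeholders, and ReadProtoFromTextFile is assumed to come from utils/common.h, as used in tool.cc below:

    #include <cstdlib>
    #include "trainer/trainer.h"
    #include "utils/common.h"
    #include "proto/job.pb.h"
    #include "proto/singa.pb.h"

    int main(int argc, char** argv) {
      // argv[1]: job.conf, argv[2]: singa.conf, argv[3]: job id (all placeholders)
      singa::JobProto jobConf;
      singa::SingaProto singaConf;
      singa::ReadProtoFromTextFile(argv[1], &jobConf);
      singa::ReadProtoFromTextFile(argv[2], &singaConf);
      singa::Trainer trainer;
      trainer.Start(std::atoi(argv[3]), /*resume=*/false, jobConf, singaConf);
      return 0;
    }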

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index 7d779ad..87d251d 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -6,7 +6,6 @@
 #include "utils/cluster.h"
 #include "utils/factory.h"
 #include "trainer/worker.h"
-#include "proto/model.pb.h"
 
 namespace singa {
 using std::thread;
@@ -173,6 +172,9 @@ void Worker::Run() {
     step_++;
   }
 
+  // save the model
+  Checkpoint(step_, train_net_);
+
   // clean up
   if(updater_ == nullptr) {
     int svr_grp = grp_id_ / cluster->nworker_groups_per_server_group();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/cluster.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc
index 791332d..6dad2a8 100644
--- a/src/utils/cluster.cc
+++ b/src/utils/cluster.cc
@@ -3,18 +3,17 @@
 #include <unistd.h>
 #include <fstream>
 #include "utils/cluster.h"
-#include "proto/cluster.pb.h"
 #include "proto/common.pb.h"
 #include <sys/stat.h>
 #include <sys/types.h>
 namespace singa {
 
 std::shared_ptr<Cluster> Cluster::instance_;
-Cluster::Cluster(const GlobalProto & global, const ClusterProto &cluster,
-                 int job_id) {
-  cluster_ = cluster;
-  global_ = global;
-  SetupFolders(cluster);
+Cluster::Cluster(
+    int job, const SingaProto& singaConf, const ClusterProto& clusterConf) {
+  cluster_ = clusterConf;
+  singa_ = singaConf;
+  SetupFolders(clusterConf);
   if(server_worker_separate())
     nprocs_=nworker_procs()+nserver_procs();
   else
@@ -38,14 +37,14 @@ Cluster::Cluster(const GlobalProto & global, const ClusterProto &cluster,
     }
   }
 
-  auto rt = new ZKClusterRT(global_.zookeeper_host(), job_id);
+  auto rt = new ZKClusterRT(singa_.zookeeper_host(), job);
   rt->Init();
   cluster_rt_=shared_ptr<ClusterRuntime>(static_cast<ClusterRuntime*>(rt));
 
   hostip_=GetHostIP();
 }
 
-void Cluster::Register(const string& endpoint, int pid) {
+void Cluster::Register(int pid, const string& endpoint) {
   procs_id_=cluster_rt_->RegistProc(endpoint, pid);
   CHECK_GE(procs_id_,0);
   CHECK_LT(procs_id_,nprocs());
@@ -69,9 +68,9 @@ void Cluster::SetupFolders(const ClusterProto &cluster){
   mkdir(checkpoint_folder().c_str(),  S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
 }
 
-shared_ptr<Cluster> Cluster::Get(const GlobalProto& global,
-                                 const ClusterProto& cluster, int job_id){
-  instance_.reset(new Cluster(global, cluster, job_id));
+shared_ptr<Cluster> Cluster::Get(
+    int job, const SingaProto& singaConf, const ClusterProto& clusterConf) {
+  instance_.reset(new Cluster(job, singaConf, clusterConf));
   return instance_;
 }
 
@@ -82,7 +81,7 @@ shared_ptr<Cluster> Cluster::Get() {
   }
   return instance_;
 }
-int Cluster::Hash(int gid, int id, int flag){
+int Cluster::Hash(int gid, int id, int flag) {
   int ret=-1;
   if(flag==kServer){
     ret=(flag*cluster_.nserver_groups()+gid)*cluster_.nservers_per_group() + id;
@@ -91,7 +90,7 @@ int Cluster::Hash(int gid, int id, int flag){
   }
   return ret;
 }
-int Cluster::ProcsIDOf(int group_id, int id, int flag){
+int Cluster::ProcsIDOf(int group_id, int id, int flag) {
   return procs_ids_.at(Hash(group_id, id, flag));
 }
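To make the hash concrete: assuming kServer has flag value 1 (its numeric value is not shown in this hunk), a cluster with nserver_groups=2 and nservers_per_group=4 maps the server with gid=1, id=2 to (1*2+1)*4+2 = 14, and ProcsIDOf then resolves that key through procs_ids_.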
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 69e3b09..5541acc 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -3,7 +3,7 @@
 #include <chrono>
 #include <random>
 #include "utils/param.h"
-#include "proto/cluster.pb.h"
+#include "proto/job.pb.h"
 #include "mshadow/tensor.h"
 #include "utils/singleton.h"
 using namespace mshadow;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/tool.cc
----------------------------------------------------------------------
diff --git a/src/utils/tool.cc b/src/utils/tool.cc
index 267d266..3ffd0e8 100644
--- a/src/utils/tool.cc
+++ b/src/utils/tool.cc
@@ -2,8 +2,8 @@
 #include <glog/logging.h>
 #include <iostream>
 #include <fstream>
-#include "proto/cluster.pb.h"
 #include "utils/cluster_rt.h"
+#include "proto/singa.pb.h"
 #include "utils/common.h"
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
@@ -15,7 +15,7 @@ int main(int argc, char **argv) {
   google::InitGoogleLogging(argv[0]);
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  singa::GlobalProto global;
+  singa::SingaProto global;
   singa::ReadProtoFromTextFile(FLAGS_global.c_str(), &global);
   singa::SetupLog(global.log_dir(), "SingaTool");
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 8e949ef..18e53ce 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -2,7 +2,7 @@
 #include "utils/updater.h"
 #include "mshadow/tensor.h"
 #include "mshadow/cxxnet_op.h"
-#include "proto/model.pb.h"
+#include "proto/job.pb.h"
 using namespace mshadow;
 using namespace mshadow::expr;
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/0478e8cf/tool/gen_hosts.py
----------------------------------------------------------------------
diff --git a/tool/gen_hosts.py b/tool/gen_hosts.py
index a3bec47..e38c8bf 100644
--- a/tool/gen_hosts.py
+++ b/tool/gen_hosts.py
@@ -4,19 +4,20 @@ import argparse
 import os
 import sys
 from google.protobuf import text_format
-from pb2.cluster_pb2 import ClusterProto
+from pb2.job_pb2 import JobProto
 
 # parse command line
 parser = argparse.ArgumentParser(description='Generate host list from host file for a SINGA job')
-parser.add_argument('-conf', dest='conf', metavar='CONF_FILE', required=True, help='cluster.conf file')
+parser.add_argument('-conf', dest='conf', metavar='CONF_FILE', required=True, help='job.conf file')
 parser.add_argument('-hosts', dest='hosts', metavar='HOST_FILE', required=True, help='global host file')
 parser.add_argument('-output', dest='output', metavar='OUTPUT_FILE', required=True, help='generated list')
 args = parser.parse_args();
 
 # read from .conf file
 fd_conf = open(args.conf, 'r')
-cluster = ClusterProto()
-text_format.Merge(str(fd_conf.read()), cluster)
+job = JobProto()
+text_format.Merge(str(fd_conf.read()), job)
+cluster = job.cluster
 nworker_procs = cluster.nworker_groups * cluster.nworkers_per_group / cluster.nworkers_per_procs
 nserver_procs = cluster.nserver_groups * cluster.nservers_per_group / cluster.nservers_per_procs
 nprocs = 0
@@ -39,10 +40,10 @@ fd_hosts.close()
 # write to output file
 num_hosts = len(hosts)
 if (num_hosts == 0):
-  print "contains no valid host %s" % args.hosts
+  print "Contains no valid host %s" % args.hosts
   sys.exit(1)
 fd_output = open(args.output, 'w')
 for i in range(nprocs):
   fd_output.write(hosts[i % num_hosts] + '\n')
 fd_output.close()
-print 'generate host list at %s' % args.output
+print 'Generated host list at %s' % args.output
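
Since the tool now parses a JobProto, a hypothetical invocation passes the job configuration instead of the old cluster.conf (all paths are illustrative):

    python tool/gen_hosts.py -conf examples/cifar10/job.conf -hosts conf/hostfile -output examples/cifar10/hostlist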


