ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1758973 - in /ctakes/trunk/ctakes-temporal/scripts/nn: ./ classify.sh cleartk_io.py dataset.py dataset.pyc et_cleartk_io.py et_cleartk_io.pyc nn_models.py nn_models.pyc predict.py reqs.txt train.sh train_and_package.py
Date Fri, 02 Sep 2016 15:21:50 GMT
Author: dligach
Date: Fri Sep  2 15:21:49 2016
New Revision: 1758973

URL: http://svn.apache.org/viewvc?rev=1758973&view=rev
Log:
scripts to train dima's keras models; initial checkin

Added:
    ctakes/trunk/ctakes-temporal/scripts/nn/
    ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py
    ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py
    ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
    ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt
    ctakes/trunk/ctakes-temporal/scripts/nn/train.sh   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py

Added: ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh Fri Sep  2 15:21:49 2016
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Run predict.py inside the virtualenv that sits next to this script,
+# forwarding every command-line argument and returning Python's exit status.
+
+# Quoted so that an install path containing spaces still resolves.
+script_dir="$(dirname "$0")"
+
+source "$script_dir/env/bin/activate"
+# "$@" keeps each argument intact; an unquoted $* re-splits them on spaces.
+python "$script_dir/predict.py" "$@"
+ret=$?
+deactivate
+exit $ret

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py Fri Sep  2 15:21:49 2016
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import os, os.path
+import subprocess
+
+
+def string_label_to_label_vector(label_string, outcome_maps):
+    """Translate a composite label string into per-task class indices.
+
+    ``label_string`` has the form ``task1=val1#task2=val2``.  Each value is
+    looked up in ``outcome_maps[task]`` and the indices are returned in the
+    order the tasks appear in the string.
+    """
+    task_value_pairs = (piece.split('=') for piece in label_string.split('#'))
+    return [outcome_maps[task][value] for task, value in task_value_pairs]
+    
+def get_data_dimensions(data_file):
+    """Scan a liblinear-format file and return ``(num_rows, num_features)``.
+
+    Each line is expected to look like ``<label> <index>:<value> ...`` with
+    1-based feature indices in ascending order, so only the last token of a
+    line is inspected to find that line's largest index.
+
+    Rows are counted in the same pass instead of shelling out to ``wc``:
+    this keeps the function portable and also counts a final line that has
+    no trailing newline, which ``wc`` would leave out.
+    """
+    file_len = 0
+    num_feats = 0
+    with open(data_file) as data:
+        for line in data:
+            file_len += 1
+            last_token = line.rstrip().split(' ')[-1]
+            ## A line holding only a label has no "index:value" token.
+            if ':' not in last_token:
+                continue
+            num_feats = max(num_feats, int(last_token.split(':')[0]))
+
+    return (file_len, num_feats)
+
+def flatten_outputs(Y):
+    """Expand a matrix of integer-coded labels into network output columns.
+
+    Every column of ``Y`` is one label variable.  A column whose maximum is 1
+    is treated as binary and keeps a single output column holding the raw
+    value; a column whose maximum is ``k > 1`` is one-hot encoded across
+    ``k + 1`` output columns.
+
+    Returns ``(Y_adj, indices)`` where ``indices[i]`` is the first output
+    column belonging to label ``i`` and the last entry is the total width.
+
+    Raises ``Exception`` if a column's maximum is neither 1 nor above 1.
+    """
+    col_maxes = Y.max(0)
+    offsets = [0]
+    width = 0
+
+    ## Work out how many output columns each label needs and where it starts.
+    for col_max in col_maxes:
+        if col_max == 1:
+            width += 1
+        elif col_max > 1:
+            width += int(col_max) + 1
+        else:
+            raise Exception("There is a column with all zeros!")
+        offsets.append(width)
+
+    num_rows, num_labels = Y.shape[0], Y.shape[1]
+    Y_adj = np.zeros((num_rows, width))
+    for label_ind in range(num_labels):
+        start = int(offsets[label_ind])
+        is_binary = col_maxes[label_ind] == 1
+        for row_ind in range(num_rows):
+            value = Y[row_ind][label_ind]
+            if is_binary:
+                ## Binary label: copy the value into its single column.
+                Y_adj[row_ind][start] = value
+            else:
+                ## n-ary label: the value picks the column that is set to 1.
+                Y_adj[row_ind][start + int(value)] = 1
+
+    return Y_adj, offsets
+
+def read_outcome_maps(dirname):
+    """Parse ``outcome-lookup.txt`` in ``dirname`` into label lookup tables.
+
+    Each line of the file is ``<index> <label>`` where the label is a
+    ``#``-joined list of ``task=value`` pairs.
+
+    Returns a tuple ``(raw_outcomes, derived_maps, lookup_map)``:
+      * ``raw_outcomes`` -- the label strings in file order, preceded by a
+        ``None`` placeholder so the first label sits at position 1.
+      * ``derived_maps`` -- ``{task: {value: index}}`` with indices assigned
+        in order of first appearance.
+      * ``lookup_map`` -- ``{task: [value, ...]}``, the inverse of the above.
+    """
+    raw_outcomes = [None]
+    derived_maps = {}
+    lookup_map = {}
+
+    ## The context manager closes the file even if a line fails to parse.
+    with open(os.path.join(dirname, 'outcome-lookup.txt')) as outcome_file:
+        for line in outcome_file:
+            (index, label) = line.rstrip().split(' ')
+            raw_outcomes.append(label)
+
+            for task_label in label.split('#'):
+                (task, val) = task_label.rstrip().split("=")
+                cur_map = derived_maps.setdefault(task, {})
+                lookup = lookup_map.setdefault(task, [])
+                if val not in cur_map:
+                    cur_map[val] = len(cur_map)
+                    lookup.append(val)
+
+    return raw_outcomes, derived_maps, lookup_map
+
+def outcome_list(raw_outcomes):
+    """Return the task names, in order, found in ``raw_outcomes[1]``.
+
+    That string is split on ``#`` and the part before the first ``=`` of
+    each piece is kept.
+    """
+    first_outcome = raw_outcomes[1]
+    return [pair.split("=")[0] for pair in first_outcome.split("#")]
+    
+def read_multitask_liblinear(dirname):
+    """Load multi-task training data from ``dirname`` as dense matrices.
+
+    Reads ``outcome-lookup.txt`` (through ``read_outcome_maps``) and
+    ``training-data.liblinear``.  The integer label at the start of each data
+    line selects a composite label string, which is expanded into one class
+    index per task.
+
+    Returns ``(label_matrix, feat_matrix)``: one row per example, one label
+    column per task, and one feature column per feature index.  Every
+    feature after the label is kept; no bias column is dropped here.
+    """
+    raw_outcomes, derived_maps, _ = read_outcome_maps(dirname)
+
+    data_file = os.path.join(dirname, 'training-data.liblinear')
+    (data_points, feat_dims) = get_data_dimensions(data_file)
+
+    ## One label column per task found in the outcome lookup file.
+    label_dims = len(derived_maps)
+
+    label_matrix = np.zeros( (data_points, label_dims) )
+    feat_matrix = np.zeros( (data_points, feat_dims) )
+
+    with open( data_file ) as data:
+        for line_ind, line in enumerate(data):
+            label_and_feats = line.rstrip().split(' ')
+            string_label = raw_outcomes[int(label_and_feats[0])]
+            label_vec = string_label_to_label_vector(string_label, derived_maps)
+
+            for ind, val in enumerate(label_vec):
+                label_matrix[line_ind, ind] = val
+
+            ## Everything after the label is an index:value feature.
+            feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+    return label_matrix, feat_matrix
+
+def convert_multi_output_to_string(outcomes, outcome_list, lookup_map, raw_outcomes):
+    """Build the composite label string for a vector of per-task predictions.
+
+    For the task name at position ``i`` of ``outcome_list`` the class index
+    ``outcomes[i]`` is mapped back to its value through ``lookup_map[task]``;
+    the resulting ``task=value`` pairs are joined with ``#``.  An empty
+    ``outcome_list`` gives an empty string.
+
+    ``raw_outcomes`` is not used; it stays in the signature so existing
+    callers keep working.
+    """
+    return "#".join(
+        "%s=%s" % (task, lookup_map[task][outcomes[ind]])
+        for ind, task in enumerate(outcome_list)
+    )
+    
+def feature_string_to_list( feat_string, length=-1 ):
+    """Split a space-separated ``index:value`` string into tokens and convert
+    them with ``feature_array_to_list``, passing ``length`` through."""
+    tokens = feat_string.split(' ')
+    return feature_array_to_list( tokens, length )
+
+def feature_array_to_list( feats, length=-1 ):
+    """Convert sparse ``index:value`` tokens into a dense list of numbers.
+
+    feats  -- sequence of strings such as ``"3:0.5"``; indices are 1-based.
+    length -- size of the returned list.  The default of -1 falls back to
+              ``len(feats)``, which is only large enough when no index
+              exceeds the number of tokens.
+
+    Positions that are not mentioned stay 0.  Values are converted to
+    ``float`` so the result is numeric throughout instead of a mix of ints
+    and strings.
+
+    Raises ``Exception`` when an index falls outside the vector.
+    """
+    if length == -1:
+        length = len(feats)
+
+    f = [0] * length
+
+    for feat in feats:
+        (ind, val) = feat.split(':')
+        ind = int(ind) - 1    ## feature indices are 1-based
+        if ind < 0:
+            ## Without this check an index of 0 would silently land on f[-1].
+            raise Exception("Feature index %d is smaller than 1 -- feature indices are 1-based." % (ind + 1))
+        if ind >= len(f):
+            raise Exception("Feature index %d is larger than feature vector length %d -- "
+                            "you may need to specify the expected length of the vector." % (ind, len(f)))
+        f[ind] = float(val)
+
+    return f
+    
+if __name__ == "__main__":
+    ## Manual check: load the data under a hard-coded directory and print
+    ## a single feature value from the first row.
+    data_dir = 'data_testing/multitask_assertion/train_and_test/'
+    label_matrix, feature_matrix = read_multitask_liblinear(data_dir)
+    print("train[0][100] = %f" % feature_matrix[0][100])

Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py Fri Sep  2 15:21:49 2016
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+import numpy as np
+
+import sys
+sys.dont_write_bytecode = True
+
+import ConfigParser
+
+import glob, string, collections, operator
+
+from fnmatch import fnmatch
+
+# Relation label -> integer class id; a label's position in the tuple is its id.
+label2int = dict(
+  (name, class_id)
+  for class_id, name in enumerate(('none', 'contains', 'contains-1'))
+  )
+
+# will have to do this eventually
+# label2int = {
+#   'none': 0,
+#   'contains': 1,
+#   'contains-1': 2,
+#   'before': 3,
+#   'before-1': 4,
+#   'begins-on': 5,
+#   'begins-on-1': 6,
+#   'ends-on': 7,
+#   'ends-on-1': 8,
+#   'overlap': 9,
+#   'overlap-1': 10,
+# }
+
+class DatasetProvider:
+  """THYME relation data.
+
+  Every input line is expected to be ``label|text``.  The text is split on
+  whitespace and each token is mapped to an integer id via ``self.alphabet``.
+  """
+
+  # Alphabet entry (id 0) that stands in for tokens __init__ never saw.
+  OOV_KEY = 'oov_word'
+
+  def __init__(self, file_names):
+    """Index words by frequency in a list of files"""
+
+    self.alphabet = {} # words indexed by frequency
+
+    unigram_counts = collections.Counter()
+    for file_name in file_names:
+      for label, text in self._read_lines(file_name):
+        unigram_counts.update(text.split())
+
+    self.alphabet[self.OOV_KEY] = 0 # zero used to encode unknown words
+    ranked = unigram_counts.most_common()
+    for index, (unigram, count) in enumerate(ranked, 1):
+      self.alphabet[unigram] = index
+
+  def _read_lines(self, path):
+    """Yield (label, text) for each line of path; the file is closed when done"""
+
+    with open(path) as lines:
+      for line in lines:
+        # split on the first bar only, so a '|' inside the text is kept
+        label, text = line.strip().split('|', 1)
+        yield label, text
+
+  def load(self, path):
+    """Convert sentences (examples) into lists of indices
+
+    Raises KeyError for a token that is missing from the alphabet or a
+    label that is missing from label2int.
+    """
+
+    examples = []
+    labels = []
+    for label, text in self._read_lines(path):
+      examples.append([self.alphabet[unigram] for unigram in text.split()])
+      labels.append(label2int[label])
+
+    return examples, labels
+
+  def load_if_oov(self, path):
+    """Like load(), but tokens missing from the alphabet get the OOV index"""
+
+    # The unknown-word entry is stored under OOV_KEY; looking up "none"
+    # only worked when that word happened to occur in the corpus.
+    oov_index = self.alphabet.get(self.OOV_KEY, 0)
+
+    examples = []
+    labels = []
+    for label, text in self._read_lines(path):
+      examples.append([self.alphabet.get(unigram, oov_index)
+                       for unigram in text.split()])
+      labels.append(label2int[label])
+
+    return examples, labels
+
+  def load_by_region(self, path):
+    """Load examples split into the five regions produced by processText
+
+    Returns six parallel lists: pres, arg1s, conts, arg2s, posts, labels.
+    """
+
+    pres = []
+    arg1s = []
+    conts = []
+    arg2s = []
+    posts = []
+    labels = []
+    for label, text in self._read_lines(path):
+      pre, arg1, cont, arg2, post = self.processText(text)
+      pres.append(pre)
+      arg1s.append(arg1)
+      conts.append(cont)
+      arg2s.append(arg2)
+      posts.append(post)
+      labels.append(label2int[label])
+
+    return pres, arg1s, conts, arg2s, posts, labels
+
+  def processText(self, text):
+    """Split a tagged sentence into five lists of token indices
+
+    Tokens matching '<*>' act as delimiters and are not emitted.  Tokens
+    seen before the first delimiter go to pre, after the first to arg1,
+    then cont, arg2 and post; tokens after a fifth delimiter are dropped.
+    Every token, delimiters included, must be in the alphabet (KeyError
+    otherwise).
+    """
+
+    regions = ([], [], [], [], []) # pre, arg1, cont, arg2, post
+
+    tag = 0 # number of delimiter tokens seen so far
+    for unigram in text.split():
+      idx = self.alphabet[unigram]
+      if fnmatch(unigram, '<*>'):
+        tag = tag + 1
+        continue
+      if tag < len(regions):
+        regions[tag].append(idx)
+
+    pre, arg1, cont, arg2, post = regions
+    return pre, arg1, cont, arg2, post
+
+
+
+if __name__ == "__main__":
+
+  # Print summary statistics for the data files named in settings.ini.
+  config = ConfigParser.ConfigParser()
+  config.read('settings.ini')
+  train_path = config.get('data', 'train')
+  test_path = config.get('data', 'test')
+
+  provider = DatasetProvider([train_path, test_path])
+  print 'alphabet size:', len(provider.alphabet)
+
+  examples, targets = provider.load(test_path)
+  label_counts = collections.Counter(targets)
+  total = float(len(targets))
+
+  print 'max seq len:', max(len(seq) for seq in examples)
+  print 'number of examples:', len(examples)
+  print 'number of labels:', len(set(targets))
+  print 'label counts:', label_counts
+  print 'first 10 examples:', examples[:10]
+  print 'class proportions:'
+  for label in label_counts:
+    # share of the examples, followed by its reciprocal
+    print label, label_counts[label] / total, total / label_counts[label]

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
------------------------------------------------------------------------------
    svn:executable = *

Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc?rev=1758973&view=auto
==============================================================================
Binary file - no diff available.

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py Fri Sep  2 15:21:49 2016
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import numpy as np
+import os, os.path
+import subprocess
+
+
+def string_label_to_label_vector(label_string, outcome_maps):
+    """Translate a composite label string into per-task class indices.
+
+    ``label_string`` has the form ``task1=val1#task2=val2``.  Each value is
+    looked up in ``outcome_maps[task]`` and the indices are returned in the
+    order the tasks appear in the string.
+    """
+    task_value_pairs = (piece.split('=') for piece in label_string.split('#'))
+    return [outcome_maps[task][value] for task, value in task_value_pairs]
+    
+def get_data_dimensions(data_file):
+    """Scan a liblinear-format file and return ``(num_rows, num_features)``.
+
+    Each line is expected to look like ``<label> <index>:<value> ...`` with
+    1-based feature indices in ascending order, so only the last token of a
+    line is inspected to find that line's largest index.
+
+    Rows are counted in the same pass instead of shelling out to ``wc``:
+    this keeps the function portable and also counts a final line that has
+    no trailing newline, which ``wc`` would leave out.
+    """
+    file_len = 0
+    num_feats = 0
+    with open(data_file) as data:
+        for line in data:
+            file_len += 1
+            last_token = line.rstrip().split(' ')[-1]
+            ## A line holding only a label has no "index:value" token.
+            if ':' not in last_token:
+                continue
+            num_feats = max(num_feats, int(last_token.split(':')[0]))
+
+    return (file_len, num_feats)
+
+def flatten_outputs(Y):
+    """Expand a matrix of integer-coded labels into network output columns.
+
+    Every column of ``Y`` is one label variable.  A column whose maximum is 1
+    is treated as binary and keeps a single output column holding the raw
+    value; a column whose maximum is ``k > 1`` is one-hot encoded across
+    ``k + 1`` output columns.
+
+    Returns ``(Y_adj, indices)`` where ``indices[i]`` is the first output
+    column belonging to label ``i`` and the last entry is the total width.
+
+    Raises ``Exception`` if a column's maximum is neither 1 nor above 1.
+    """
+    col_maxes = Y.max(0)
+    offsets = [0]
+    width = 0
+
+    ## Work out how many output columns each label needs and where it starts.
+    for col_max in col_maxes:
+        if col_max == 1:
+            width += 1
+        elif col_max > 1:
+            width += int(col_max) + 1
+        else:
+            raise Exception("There is a column with all zeros!")
+        offsets.append(width)
+
+    num_rows, num_labels = Y.shape[0], Y.shape[1]
+    Y_adj = np.zeros((num_rows, width))
+    for label_ind in range(num_labels):
+        start = int(offsets[label_ind])
+        is_binary = col_maxes[label_ind] == 1
+        for row_ind in range(num_rows):
+            value = Y[row_ind][label_ind]
+            if is_binary:
+                ## Binary label: copy the value into its single column.
+                Y_adj[row_ind][start] = value
+            else:
+                ## n-ary label: the value picks the column that is set to 1.
+                Y_adj[row_ind][start + int(value)] = 1
+
+    return Y_adj, offsets
+
+def read_liblinear(dirname):
+    """Load ``training-data.liblinear`` from ``dirname`` with one-hot labels.
+
+    The label at the start of each line is 1-based.  Returns
+    ``(label_matrix, feat_matrix)``: ``label_matrix`` has one row per example
+    with a single 1 in column ``label - 1`` (its width is the largest label
+    seen), and ``feat_matrix`` is the dense feature matrix.
+    """
+    data_file = os.path.join(dirname, 'training-data.liblinear')
+
+    (data_points, feat_dims) = get_data_dimensions(data_file)
+
+    ## Builtin int: the np.int alias is deprecated and gone from recent NumPy.
+    label_array = np.zeros( data_points, dtype=int )
+    feat_matrix = np.zeros( (data_points, feat_dims) )
+
+    with open( data_file ) as data:
+        for line_ind, line in enumerate(data):
+            label_and_feats = line.rstrip().split(' ')
+
+            ## Shift the 1-based file label to a 0-based class index.
+            label_array[line_ind] = float(label_and_feats[0]) - 1
+
+            ## Everything after the label is an index:value feature.
+            feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+    label_matrix = np.zeros( (data_points, label_array.max()+1) )
+
+    ## Set exactly one column per row.  Indexing with the (row, 0) tuples
+    ## from ndenumerate was a fancy index that also marked row 0 every time.
+    label_matrix[np.arange(data_points), label_array] = 1
+
+    return label_matrix, feat_matrix
+
+def read_outcome_maps(dirname):
+    """Read the label strings listed in ``outcome-lookup.txt`` in ``dirname``.
+
+    Each line of the file is ``<index> <label>``.  The returned list starts
+    with a ``None`` placeholder followed by the label of every line in file
+    order, so the first label sits at position 1.
+    """
+    raw_outcomes = [None]
+
+    ## The context manager closes the file even if a line fails to parse.
+    with open(os.path.join(dirname, 'outcome-lookup.txt')) as outcome_file:
+        for line in outcome_file:
+            (index, label) = line.rstrip().split(' ')
+            raw_outcomes.append(label)
+
+    return raw_outcomes
+
+def outcome_list(raw_outcomes):
+    """Return the task names, in order, found in ``raw_outcomes[1]``.
+
+    That string is split on ``#`` and the part before the first ``=`` of
+    each piece is kept.
+    """
+    first_outcome = raw_outcomes[1]
+    return [pair.split("=")[0] for pair in first_outcome.split("#")]
+    
+def read_multitask_liblinear(dirname):
+    """Load ``training-data.liblinear`` from ``dirname`` as dense matrices.
+
+    Returns ``(label_matrix, feat_matrix)``.  ``label_matrix`` has a single
+    column holding the numeric label found at the start of each line;
+    ``feat_matrix`` has one column per feature index.  Every feature after
+    the label is kept; no bias column is dropped here.
+    """
+    data_file = os.path.join(dirname, 'training-data.liblinear')
+
+    (data_points, feat_dims) = get_data_dimensions(data_file)
+
+    ## Single-task variant: the raw label is stored in one column.
+    label_dims = 1
+
+    label_matrix = np.zeros( (data_points, label_dims) )
+    feat_matrix = np.zeros( (data_points, feat_dims) )
+
+    with open( data_file ) as data:
+        for line_ind, line in enumerate(data):
+            label_and_feats = line.rstrip().split(' ')
+            label_matrix[line_ind, 0] = float(label_and_feats[0])
+
+            ## Everything after the label is an index:value feature.
+            feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+    return label_matrix, feat_matrix
+
+def convert_multi_output_to_string(outcomes, outcome_list, lookup_map, raw_outcomes):
+    """Build the composite label string for a vector of per-task predictions.
+
+    For the task name at position ``i`` of ``outcome_list`` the class index
+    ``outcomes[i]`` is mapped back to its value through ``lookup_map[task]``;
+    the resulting ``task=value`` pairs are joined with ``#``.  An empty
+    ``outcome_list`` gives an empty string.
+
+    ``raw_outcomes`` is not used; it stays in the signature so existing
+    callers keep working.
+    """
+    return "#".join(
+        "%s=%s" % (task, lookup_map[task][outcomes[ind]])
+        for ind, task in enumerate(outcome_list)
+    )
+
+def get_outcome_array(working_dir):
+    """Return the labels listed in ``outcome-lookup.txt`` in ``working_dir``.
+
+    Each line of the file is ``<index> <label>``; the labels are returned in
+    file order, so the first label is at position 0.
+    """
+    labels = []
+
+    ## The context manager closes the file even if a line fails to parse.
+    with open(os.path.join(working_dir, "outcome-lookup.txt")) as outcome_file:
+        for line in outcome_file:
+            (ind, val) = line.rstrip().split(" ")
+            labels.append(val)
+
+    return labels
+    
+def feature_string_to_list( feat_string, length=-1 ):
+    """Split a space-separated ``index:value`` string into tokens and convert
+    them with ``feature_array_to_list``, passing ``length`` through."""
+    tokens = feat_string.split(' ')
+    return feature_array_to_list( tokens, length )
+
+def feature_array_to_list( feats, length=-1 ):
+    """Convert sparse ``index:value`` tokens into a dense list of numbers.
+
+    feats  -- sequence of strings such as ``"3:0.5"``; indices are 1-based.
+    length -- size of the returned list.  The default of -1 falls back to
+              ``len(feats)``, which is only large enough when no index
+              exceeds the number of tokens.
+
+    Positions that are not mentioned stay 0.  Values are converted to
+    ``float`` so the result is numeric throughout instead of a mix of ints
+    and strings.
+
+    Raises ``Exception`` when an index falls outside the vector.
+    """
+    if length == -1:
+        length = len(feats)
+
+    f = [0] * length
+
+    for feat in feats:
+        (ind, val) = feat.split(':')
+        ind = int(ind) - 1    ## feature indices are 1-based
+        if ind < 0:
+            ## Without this check an index of 0 would silently land on f[-1].
+            raise Exception("Feature index %d is smaller than 1 -- feature indices are 1-based." % (ind + 1))
+        if ind >= len(f):
+            raise Exception("Feature index %d is larger than feature vector length %d -- "
+                            "you may need to specify the expected length of the vector." % (ind, len(f)))
+        f[ind] = float(val)
+
+    return f
+    
+if __name__ == "__main__":
+    ## Manual check: load the data under a hard-coded directory and print
+    ## a single feature value from the first row.
+    data_dir = 'target/eval/thyme/train_and_test/event-time/'
+    label_matrix, feature_matrix = read_multitask_liblinear(data_dir)
+    print("train[0][100] = %f" % feature_matrix[0][100])

Added: ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc?rev=1758973&view=auto
==============================================================================
Binary file - no diff available.

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py Fri Sep  2 15:21:49 2016
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Convolution1D, MaxPooling1D, Lambda,
Flatten, Merge
+from keras.optimizers import SGD
+from keras import backend as K
+from keras.optimizers import RMSprop
+
+def get_mlp_model(dimension, num_outputs, layers=(64, 256, 64) ):
+    """Build and compile a feed-forward classifier with two hidden layers.
+
+    dimension   -- length of each input vector.
+    num_outputs -- 1 gives a single sigmoid unit trained with binary
+                   cross-entropy; any other value gives a softmax layer of
+                   that width trained with categorical cross-entropy.
+    layers      -- hidden layer widths; only the first two entries are used.
+    """
+    model = Sequential()
+    optimizer = get_mlp_optimizer()
+    hidden_dropout = 0.5
+
+    ## Only the first layer has to be told the size of the input.
+    model.add(Dense(layers[0], input_dim=dimension, init='uniform'))
+    model.add(Activation('relu'))
+    model.add(Dropout(hidden_dropout))
+
+    model.add(Dense(layers[1], init='uniform'))
+    model.add(Activation('relu'))
+    model.add(Dropout(hidden_dropout))
+
+    ## Choose the output layer and the matching loss in one place.
+    if num_outputs == 1:
+        output_units, output_activation, loss = 1, 'sigmoid', 'binary_crossentropy'
+    else:
+        output_units, output_activation, loss = num_outputs, 'softmax', 'categorical_crossentropy'
+
+    model.add(Dense(output_units, init='uniform'))
+    model.add(Activation(output_activation))
+    model.compile(loss=loss,
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
+
+    return model
+
+def get_mlp_optimizer():
+    """Return a new SGD optimizer: learning rate 0.1, decay 1e-6 and
+    Nesterov momentum of 0.9."""
+    sgd_settings = dict(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
+    return SGD(**sgd_settings)
+
+def get_cnn_model(dimension, num_outputs, nb_filter = 200, layers=(64, 64, 256) ):
+    """Build and compile a classifier with a single 1-D convolution layer.
+
+    The input shape is fixed at (6, 200); ``dimension`` is accepted but not
+    used.  The convolution (``nb_filter`` filters of width 3) is reduced by a
+    maximum over axis 1 and followed by one relu layer of ``layers[1]`` units
+    with dropout 0.5.
+
+    num_outputs -- 1 gives a single sigmoid unit trained with binary
+                   cross-entropy; any other value gives a softmax layer of
+                   that width trained with categorical cross-entropy.
+    """
+    model = Sequential()
+    optimizer = get_mlp_optimizer()
+
+    ## Convolution followed by a max over axis 1 of its output.
+    model.add(Convolution1D(nb_filter, 3, input_shape=(6,200)))
+
+    def max_1d(X):
+        return K.max(X, axis=1)
+
+    model.add(Lambda(max_1d, output_shape=(nb_filter,)))
+
+    model.add(Dense(layers[1], init='uniform'))
+    model.add(Activation('relu'))
+    model.add(Dropout(0.5))
+
+    ## Choose the output layer and the matching loss in one place.
+    if num_outputs == 1:
+        output_units, output_activation, loss = 1, 'sigmoid', 'binary_crossentropy'
+    else:
+        output_units, output_activation, loss = num_outputs, 'softmax', 'categorical_crossentropy'
+
+    model.add(Dense(output_units, init='uniform'))
+    model.add(Activation(output_activation))
+    model.compile(loss=loss,
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
+
+    return model
+
+def get_dima_cnn_model(dimension, num_outputs):
+    """Build and compile a CNN that merges three convolutional branches.
+
+    One branch is created for each filter length 3, 4 and 5; every branch
+    takes input of shape (6, 200), and ``dimension`` is accepted but not
+    used.  The branch outputs are concatenated and fed through a 250-unit
+    layer into a softmax over ``num_outputs`` classes.
+
+    Returns ``(model, branches)``.
+    """
+    branches = [] # models to be merged
+    for filter_length in (3, 4, 5):
+        conv_branch = Sequential()
+        conv_branch.add(Convolution1D(nb_filter=200,
+                                      filter_length=filter_length,
+                                      border_mode='valid',
+                                      activation='relu',
+                                      subsample_length=1,
+                                      input_shape=(6,200)))
+        conv_branch.add(MaxPooling1D(pool_length=2))
+        conv_branch.add(Flatten())
+        branches.append(conv_branch)
+
+    dropout = 0.25
+    model = Sequential()
+    model.add(Merge(branches, mode='concat'))
+
+    model.add(Dense(250))
+    model.add(Dropout(dropout))
+    model.add(Activation('relu'))
+
+    model.add(Dropout(dropout))
+    model.add(Dense(num_outputs))
+    model.add(Activation('softmax'))
+
+    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=rmsprop,
+                  metrics=['accuracy'])
+
+    return model, branches
\ No newline at end of file

Added: ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc?rev=1758973&view=auto
==============================================================================
Binary file - no diff available.

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/predict.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/predict.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/predict.py Fri Sep  2 15:21:49 2016
@@ -0,0 +1,76 @@
+#!python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+
+def main(args):
+    """Classify lines read from stdin with the model stored in a directory.
+
+    args[0] is the model directory; ``maxlen.p``, ``alphabet.p``,
+    ``model_0.json`` and ``model_0.h5`` are read from it.  Every input line
+    is treated as a whitespace-separated token sequence and one label is
+    printed (and flushed) per line.  An empty line, end of input or Ctrl-C
+    ends the loop, after which the process exits with status 0.  Exits with
+    status -1 when no directory is given.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <model directory>\n")
+        sys.exit(-1)
+
+    ## Use the directory named on the command line, not a path that only
+    ## exists on one developer's machine.
+    model_dir = args[0]
+
+    int2label = {
+        0:'none',
+        1:'CONTAINS',
+        2:'CONTAINS-1'
+    }
+
+    ## NOTE(review): pickle can run arbitrary code while loading -- only point
+    ## this script at model directories you trust.
+    with open(os.path.join(model_dir, "maxlen.p"), "rb") as maxlen_file:
+        maxlen = pickle.load(maxlen_file)
+    with open(os.path.join(model_dir, "alphabet.p"), "rb") as alphabet_file:
+        alphabet = pickle.load(alphabet_file)
+    with open(os.path.join(model_dir, "model_0.json")) as json_file:
+        model = model_from_json(json_file.read())
+    model.load_weights(os.path.join(model_dir, "model_0.h5"))
+
+    ## Id for tokens missing from the alphabet.  dataset.DatasetProvider
+    ## stores it under 'oov_word' (id 0); the key "none" is only present if
+    ## that word occurred in the training text.
+    oov_index = alphabet.get('oov_word', 0)
+
+    while True:
+        try:
+            line = sys.stdin.readline().rstrip()
+            if not line:
+                break
+
+            ## Convert the line of Strings to a list of indices, keeping at
+            ## most maxlen of them.
+            feats = [alphabet.get(unigram, oov_index) for unigram in line.split()]
+            feats = feats[:maxlen]
+            test_x = pad_sequences([feats], maxlen=maxlen)
+
+            ## The same sequence is passed three times, presumably once per
+            ## branch of the merged model built in train_and_package.py.
+            out = model.predict([test_x, test_x, test_x], batch_size=50)[0]
+        except KeyboardInterrupt:
+            sys.stderr.write("Caught keyboard interrupt\n")
+            break
+
+        print(int2label[out.argmax()])
+        sys.stdout.flush()
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])

Added: ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt Fri Sep  2 15:21:49 2016
@@ -0,0 +1,10 @@
+h5py==2.6.0
+Keras==1.0.4
+numpy==1.11.0
+PyYAML==3.11
+scipy==0.17.1
+scikit-learn==0.17.1
+six==1.10.0
+sklearn==0.0
+Theano==0.8.2
+wheel==0.29.0

Added: ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train.sh?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train.sh (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train.sh Fri Sep  2 15:21:49 2016
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Run train_and_package.py inside the virtualenv that sits next to this
+# script, forwarding every argument and returning Python's exit status.
+
+# Quoted so that an install path containing spaces still resolves.
+script_dir="$(dirname "$0")"
+
+source "$script_dir/env/bin/activate"
+# "$@" keeps each argument intact; an unquoted $* re-splits them on spaces.
+python "$script_dir/train_and_package.py" "$@"
+ret=$?
+deactivate
+exit $ret

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py Fri Sep  2 15:21:49 2016
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+
+import numpy as np
+np.random.seed(1337)
+
+import et_cleartk_io as ctk_io
+import nn_models
+
+import sys
+import os.path
+
+import dataset
+
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+
+import pickle
+
+def main(args):
+    """Train the three-branch CNN classifier and save it to disk.
+
+    args[0] is the working directory.  Examples are read from
+    ``training-data.liblinear`` inside it, and ``maxlen.p``, ``alphabet.p``,
+    ``model_0.json`` and ``model_0.h5`` are written back to the same
+    directory.  Exits with status 0 when done, or -1 when no directory is
+    given.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <data directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+    data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+    # learn the alphabet from the training data, then load examples and labels
+    dataset1 = dataset.DatasetProvider([data_file])
+    train_x, train_y = dataset1.load(data_file)
+
+    init_vectors = None #used for pre-trained embeddings
+
+    maxlen = max([len(seq) for seq in train_x])
+    # Size the output layer by the largest label id rather than by the number
+    # of distinct labels: to_categorical uses the label id as a column index,
+    # so a training set with a missing class would otherwise overflow.
+    classes = max(train_y) + 1
+
+    # turn x and y into numpy arrays
+    train_x = pad_sequences(train_x, maxlen=maxlen)
+    train_y = to_categorical(np.array(train_y), classes)
+
+    # predict.py loads these two files to encode its input the same way
+    with open(os.path.join(working_dir, 'maxlen.p'), "wb") as maxlen_file:
+        pickle.dump(maxlen, maxlen_file)
+    with open(os.path.join(working_dir, 'alphabet.p'), "wb") as alphabet_file:
+        pickle.dump(dataset1.alphabet, alphabet_file)
+
+    print('train_x shape: %s' % (train_x.shape,))
+    print('train_y shape: %s' % (train_y.shape,))
+
+    branches = [] # models to be merged
+    train_xs = [] # train x for each branch
+
+    filtlens = "3,4,5"
+    for filter_len in filtlens.split(','):
+        branch = Sequential()
+        branch.add(Embedding(len(dataset1.alphabet),
+                         300,
+                         input_length=maxlen,
+                         weights=init_vectors))
+        branch.add(Convolution1D(nb_filter=200,
+                             filter_length=int(filter_len),
+                             border_mode='valid',
+                             activation='relu',
+                             subsample_length=1))
+        branch.add(MaxPooling1D(pool_length=2))
+        branch.add(Flatten())
+
+        branches.append(branch)
+        # every branch is fed the same padded sequences
+        train_xs.append(train_x)
+
+    model = Sequential()
+    model.add(Merge(branches, mode='concat'))
+
+    model.add(Dense(250))
+    model.add(Dropout(0.25))
+    model.add(Activation('relu'))
+
+    model.add(Dropout(0.25))
+    model.add(Dense(classes))
+    model.add(Activation('softmax'))
+
+    optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
+    model.compile(loss='categorical_crossentropy',
+                optimizer=optimizer,
+                metrics=['accuracy'])
+    model.fit(train_xs,
+            train_y,
+            nb_epoch=3,
+            batch_size=50,
+            verbose=1,
+            validation_split=0.1,
+            class_weight=None)
+
+    model.summary()
+
+    with open(os.path.join(working_dir, 'model_0.json'), 'w') as json_file:
+        json_file.write(model.to_json())
+    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
\ No newline at end of file



Mime
View raw message