In [ ]:
from lasagne import layers
from lasagne import nonlinearities
from lasagne import objectives
import theano
import theano.tensor as T
from lasagne import updates
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from collections import OrderedDict
import os
import os.path
import gzip
import numpy as np
import sys
import pickle
In [ ]:
#### Creating output directories

tissue = "MuscleSkeletal"
directory = tissue + "_Stage1_classB_peaBrain"
if not os.path.exists(directory):
    os.makedirs(directory)
In [ ]:
#### Load and save functions

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
In [ ]:
chromosome_lengths = {'chr1': 249250621,
                      'chr2': 243199373,
                      'chr3': 198022430,
                      'chr4': 191154276,
                      'chr5': 180915260,
                      'chr6': 171115067,
                      'chr7': 159138663,
                      'chr8': 146364022,
                      'chr9': 141213431,
                      'chr10': 135534747,
                      'chr11': 135006516,
                      'chr12': 133851895,
                      'chr13': 115169878,
                      'chr14': 107349540,
                      'chr15': 102531392,
                      'chr16': 90354753,
                      'chr17': 81195210,
                      'chr18': 78077248,
                      'chr19': 59128983,
                      'chr20': 63025520,
                      'chr21': 48129895,
                      'chr22': 51304566}
In [ ]:
#### Reading reference fasta

reference = {}
f = open("/gpfs1/well/got2d/ddn/moustafa/hg19/chromosome.fasta")
for line in f:
    t = line.strip().split()
        if "X" not in t[0] and "Y" not in t[0]:
            chr = int(t[0].replace("chr", ''))
            reference[chr] = t[1].upper()
            assert len(reference[chr]) == chromosome_lengths[t[0]]
f.close()
In [ ]:
#### Reading gene locations
#### Use this if you want to build Class A models

positions = {}
f = open("/gpfs1/well/got2d/ddn/moustafa/hg19/grch37_hg19_ensGene")
for line in f:
    if "#" not in line:
        t = line.strip().split()
        if t[2] in chromosome_lengths:
            gene = t[12]
            chr = int(t[2].replace("chr", ''))
            strand = t[3]
            txStart = int(t[4])
            txEnd = int(t[5])
            if gene not in positions:
                positions[gene] = {}
                positions[gene]['txStart'] = txStart #end position for minus strand item
                positions[gene]['txEnd'] = txEnd #start position for minus strand item
                positions[gene]['Strand'] = strand
                positions[gene]['chr'] = chr
            elif chr in chromosome_lengths:
                assert positions[gene]['Strand'] == strand
                if positions[gene]['txStart'] > txStart: 
                    positions[gene]['txStart'] = txStart
                if positions[gene]['txEnd'] < txEnd: 
                    positions[gene]['txEnd'] = txEnd
f.close()
In [ ]:
#### Loading expression data

source =  "rn.mean." + tissue + ".matrix"

expression = {}
f = open(source)
for line in f:
    if "mean" not in line:
        t = line.strip().split()
        if t[0] in positions:
            expression[t[0]] = float(t[1])
f.close()
In [ ]:
#### Loading Class B input data

ALL_SEQUENCES = load_obj("input_sequences")
In [ ]:
################## BUILDING THE MODEL ###################

def buildModel():
    net = OrderedDict()
    net['Input'] = layers.InputLayer(shape=(None, 32, 4000))
    net['C1'] = layers.Conv1DLayer(net['Input'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P1'] = layers.MaxPool1DLayer(net['C1'], pool_size=5, pad=1)
    net['C2'] = layers.Conv1DLayer(net['P1'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P2'] = layers.MaxPool1DLayer(net['C2'], pool_size=5, pad=1)
    net['C3'] = layers.Conv1DLayer(net['P2'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P3'] = layers.MaxPool1DLayer(net['C3'], pool_size=5, pad=1)
    net['noise'] = layers.DropoutLayer(net['P3'], p = 0.5)
    net['encoding'] = layers.DenseLayer(net['noise'], num_units=1001, nonlinearity=None)
    net['out'] = layers.DenseLayer(net['encoding'], num_units=1, nonlinearity=None)
    return net
In [ ]:
###################### TRAINING #########################
print "START TRAINING..."

all_genes = np.array(expression.keys())
kf = KFold(n_splits=10)
cv_fold = 0
In [ ]:
for train_index, test_index in kf.split(all_genes):
    NET = buildModel()
    
    #################### LOSS FUNCTION ######################
    
    def calc_loss(prediction, targets):
        l = T.mean(objectives.squared_error(prediction, targets))
        return l

    #theano variable for the class targets
    #this is the output vector the net should predict
    
    targets = T.col('targets', dtype=theano.config.floatX)
    X = T.tensor3()
    
    #get the network output
    prediction = layers.get_output(NET['out'], X)
 
    #calculate the loss
    loss = calc_loss(prediction, targets)
 
    ####################### UPDATES ######################## 
    #get all trainable parameters (weights) of our net
    params = layers.get_all_params(NET['out'], trainable=True)
 
    #we use the adam update
    #it changes params based on our loss function with the learning rate
    param_updates = updates.adam(loss, params)

    #################### TRAIN FUNCTION ######################
    #the theano train functions takes images and class targets as input
    #it updates the parameters of the net and returns the current loss as float value
    #compiling theano functions may take a while...
    print "COMPILING THEANO TRAIN FUNCTION...",
    train_net = theano.function([X, targets], loss, updates=param_updates)
    print "DONE!"

    ################# PREDICTION FUNCTION ####################
    #we need the prediction function to calculate the validation error
    #this way we can test the net after training
    #first we need to get the net output
    net_output = layers.get_output(NET['out'], X, deterministic=True)
    net_encoding = layers.get_output(NET['encoding'], X, deterministic=True)

    #now we compile another theano function; this may take a while, too
    print "COMPILING THEANO TEST FUNCTION...",
    test_net = theano.function([X, targets], [net_output, loss])
    encode = theano.function([X], [net_encoding])
    print "DONE!"


    val_train, genes_test = all_genes[train_index], all_genes[test_index]

    np.random.shuffle(val_train)
    genes_train, genes_val = val_train[:int(0.9*len(val_train))], val_train[int(0.9*len(val_train)):]
    
    np.savez(directory + "/" + str(cv_fold) + "_train_set" + '.npz', genes_train)
    np.savez(directory + "/" + str(cv_fold) + "_val_set" + '.npz', genes_val)
    np.savez(directory + "/" + str(cv_fold) + "_test_set" + '.npz', genes_test)

    train_loss = []
    val_loss = []

    f = open(directory + "/" + str(cv_fold) + ".training", 'w')
    
    for epoch in range(0, 1000):

        #iterate over train split batches and calculate mean loss for epoch
        t_l = 0.0
        t_counter = 0.0
        v_l = 0.0
        v_counter = 0.0

        np.random.shuffle(genes_train)
        np.random.shuffle(genes_val)

        for training_slice in zip(*(iter(genes_train),) * 199):
            X = np.array([ALL_SEQUENCES[gene] for gene in training_slice]).astype(np.float32)
            y = np.array([expression[gene] for gene in training_slice]).astype(np.float32)
            l = train_net(X, y.reshape(-1,1))
            t_l += l
            t_counter += 1
            print(t_l/t_counter, t_counter)

        actual_array = []
        predicted_array = []

        for validation_slice in zip(*(iter(genes_val),) * 199):
            X = np.array([ALL_SEQUENCES[gene] for gene in validation_slice]).astype(np.float32)
            y = np.array([expression[gene] for gene in validation_slice]).astype(np.float32)
            prediction_batch, l = test_net(X, y.reshape(-1,1))
            actual_array.extend(y)
            predicted_array.extend(prediction_batch.reshape(1,-1)[0])
            v_l += l
            v_counter += 1

        if len(val_loss) == 0 or all((v_l/v_counter) < val_loss):
            print((v_l/v_counter), epoch, "best")
            np.savez(directory + "/" + str(cv_fold) + "_model" + '.npz', *layers.get_all_param_values(NET['out']))

        train_loss.append(t_l/t_counter)
        val_loss.append(v_l/v_counter)

        print "Done"
        print "EPOCH:", epoch,
        print "TRAIN LOSS:", train_loss[-1],
        print "VAL LOSS:", val_loss[-1],
        
        r2 = r2_score(actual_array, predicted_array)

        f.write(str(epoch) + "\t" + str(train_loss[-1]) + "\t" + str(val_loss[-1]) + "\t" + str(r2) + "\n")
        f.flush()

        ################# EXIT CRITERIA ####################
        if len(val_loss) > 100 and val_loss[-1] >= val_loss[-2] >= val_loss[-3]:
            break


    f.close()

    ################# MODEL PERFORMANCE ON TEST SET ####################
    with np.load(directory + "/" + str(cv_fold) + "_model" + '.npz') as g:
        param_values = [g['arr_%d' % i] for i in range(len(g.files))]
        layers.set_all_param_values(NET['out'], param_values)

    test_r2 = 0.0
    predicted_array = []
    actual_array = []
    
    for test_slice in zip(*(iter(genes_test),) * 199):
        X = np.array([ALL_SEQUENCES[gene] for gene in test_slice]).astype(np.float32)
        y = np.array([expression[gene] for gene in test_slice]).astype(np.float32)
        prediction_test, l = test_net(X, y.reshape(-1,1))
        actual_array.extend(y)
        predicted_array.extend(prediction_test.reshape(1,-1)[0])


    test_r2 = r2_score(actual_array, predicted_array)
    f = open(directory + "/" + str(cv_fold) + ".test", 'w')
    f.write(str(test_r2))
    f.close()

    ################# GENERATE EMBEDDINGS ####################
    embedding = {}
    for g in ALL_SEQUENCES:
        embedding[g] = encode([ALL_SEQUENCES[g]])[0]
    save_obj(embedding, directory + "/" + str(cv_fold) + "_embedding")
    cv_fold += 1