In [ ]:
import lasagne
from lasagne import layers
from lasagne import updates
from lasagne import objectives
from lasagne import nonlinearities
import theano
import theano.tensor as T
from theano import config, shared, sandbox, Out
from collections import OrderedDict
import os
import sys
import math
import random
import pickle
import gzip, cPickle	
import numpy as np
from joblib import Parallel, delayed
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, ShuffleSplit
In [ ]:
#### Function to load sequence

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
In [ ]:
################## BUILDING THE MODEL ###################
def buildModel():
    net = OrderedDict()

    net['in'] = layers.InputLayer(shape=(None, 4, 1000000))

    net['slice_upstream'] = layers.SliceLayer(net['in'], indices=slice(0, 498000), axis=2)
    net['slice_centre'] = layers.SliceLayer(net['in'], indices=slice(498000, 502000), axis=2)
    net['slice_downstream'] = layers.SliceLayer(net['in'], indices=slice(502000, None), axis=2)

    net['C1_upstream'] = layers.Conv1DLayer(net['slice_upstream'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P1_upstream'] = layers.MaxPool1DLayer(net['C1_upstream'], pool_size=100, pad=1)
    net['C2_upstream'] = layers.Conv1DLayer(net['P1_upstream'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P2_upstream'] = layers.MaxPool1DLayer(net['C2_upstream'], pool_size=50, pad=1)
    net['C3_upstream'] = layers.Conv1DLayer(net['P2_upstream'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P3_upstream'] = layers.MaxPool1DLayer(net['C3_upstream'], pool_size=10, pad=1)
    net['noise_upstream'] = layers.DropoutLayer(net['P3_upstream'], p = 0.5)
    net['dense_upstream'] = layers.DenseLayer(net['noise_upstream'], num_units=50, nonlinearity=None)

    net['C1_centre'] = layers.Conv1DLayer(net['slice_centre'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P1_centre'] = layers.MaxPool1DLayer(net['C1_centre'], pool_size=5, pad=1)
    net['C2_centre'] = layers.Conv1DLayer(net['P1_centre'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P2_centre'] = layers.MaxPool1DLayer(net['C2_centre'], pool_size=5, pad=1)
    net['C3_centre'] = layers.Conv1DLayer(net['P2_centre'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P3_centre'] = layers.MaxPool1DLayer(net['C3_centre'], pool_size=5, pad=1)
    net['noise_centre'] = layers.DropoutLayer(net['P3_centre'], p = 0.5)
    net['dense_centre'] = layers.DenseLayer(net['noise_centre'], num_units=50, nonlinearity=None)

    net['C1_downstream'] = layers.Conv1DLayer(net['slice_downstream'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P1_downstream'] = layers.MaxPool1DLayer(net['C1_downstream'], pool_size=100, pad=1)
    net['C2_downstream'] = layers.Conv1DLayer(net['P1_downstream'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P2_downstream'] = layers.MaxPool1DLayer(net['C2_downstream'], pool_size=50, pad=1)
    net['C3_downstream'] = layers.Conv1DLayer(net['P2_downstream'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
    net['P3_downstream'] = layers.MaxPool1DLayer(net['C3_downstream'], pool_size=10, pad=1)
    net['noise_downstream'] = layers.DropoutLayer(net['P3_downstream'], p = 0.5)
    net['dense_downstream'] = layers.DenseLayer(net['noise_downstream'], num_units=50, nonlinearity=None)

    net['concat']= layers.ConcatLayer([net['dense_upstream'], net['dense_downstream'], net['dense_centre']])
    net['dense'] = layers.DenseLayer(net['concat'], num_units=50, nonlinearity=None)
    net['out'] = layers.DenseLayer(net['dense'], num_units=1, nonlinearity=None)

    return net
In [ ]:
#################### LOSS FUNCTION ######################
def calc_loss(prediction, targets):
    l = T.mean(objectives.squared_error(prediction, targets))
    return l
In [ ]:
############################ Reading pasta
def read_pasta(subject):
    dic = {}
    f = gzip.open("/path/to/individual/" + subject + ".gz")
    for line in f:
        t = line.strip().split()
        dic[t[0]] = t[1]
    f.close()
    return dic
In [ ]:
############################ Reference TSS locations
def load_TSS_positions(subject):
    positions = {}
    f = open("/path/to/individual/level/wgs/data/" + subject + ".genes")
    for line in f:
        gene, label, chr, strand, tss_1, tss_2 = line.strip().split()
        if gene not in positions:
            positions[gene] = {}
        positions[gene][label] = (float(tss_1), float(tss_2))
        positions[gene]["chr"] = chr
        positions[gene]["strand"] = strand
    f.close()
    return positions
In [ ]:
############################ Load Expression
def load_expression(tissue):
    expression = {}
    samples = []
    f = open("/path/to/individual/level/expression/data/" + tissue + ".matrix")
    for line in f:
        t = line.strip().split()
        if "GTEX" in line:
            samples = ["-".join(x.split(".")[:2]) for x in t]
            for s in samples:
                expression[s] = {}
        else:
            gene = t[0]
            expr = t[1:]
            for i in range(len(samples)):
                expression[samples[i]][gene] = float(expr[i])
    f.close()
    return expression
In [ ]:
not_related = set()
f = open("/exclude/related/individuals/file")
for line in f:
    t = line.strip().split()
    not_related.add(t[0])
f.close()

all_genomes = set([x.split(".")[0] for x in os.listdir("/path/to/individual/level/wgs/data/")])

heritable = {}
f = open("/genes/with/significant/non-zero/heritability/file")
for line in f:
    t = line.strip().split()
    heritable[t[0]] = t[1:]

f.close()
In [ ]:
###################### TRAINING #########################
print "START TRAINING..."

all_genes = list(heritable.keys())
all_genes = sorted(all_genes)

#### input into file

SELECTED_INDEX = int(sys.argv[1]) - 1
SELECTED_FOLD = int(sys.argv[2])

for gene in [all_genes[SELECTED_INDEX]]:
    print(gene)

    tissue = 'CellsEBVtransformedlymphocytes' ### selected tissue
    if True:

        expression = load_expression(tissue)
        all_subjects = np.array([x for x in all_genomes if x in expression.keys() and x in not_related])

        directory = "/path/to/output/directory/" + tissue + "/" + gene

        if not os.path.exists(directory):
            os.makedirs(directory)

        ALL_SEQUENCES = {}
        for subject in all_subjects:
            print(gene, subject)
            fp=gzip.open("/gpfs1/well/got2d/ddn/moustafa/hg19/joint_embedding/embed/1mbembed/pkls/" + subject + "/" + gene,'rb')
            ALL_SEQUENCES[subject] = cPickle.load(fp)
            fp.close()
	  
        cv_fold = SELECTED_FOLD

        for COUNTER_FOLD in range(1):

            np.random.shuffle(all_subjects)

            train_index = np.array(range(0, int(len(all_subjects)*0.95)))
            test_index = np.array(range(int(len(all_subjects)*0.95), len(all_subjects)))

            NET = buildModel()
            print("DONE NET")
            #theano variable for the class targets
            #this is the output vector the net should predict
            targets = T.col('targets', dtype=theano.config.floatX)
            X = T.tensor3()
            #get the network output
            prediction = layers.get_output(NET['out'], X)
 
            #calculate the loss
            loss = calc_loss(prediction, targets)
 
            ####################### UPDATES #########################
            #get all trainable parameters (weights) of our net
            params = layers.get_all_params(NET['out'], trainable=True)
 
            #we use the adam update
            #it changes params based on our loss function with the learning rate
            param_updates = updates.nesterov_momentum(loss, params, learning_rate=0.00001, momentum=0.9)
            print("DONE UPDATES")

            #################### TRAIN FUNCTION ######################
            #the theano train functions takes images and class targets as input
            #it updates the parameters of the net and returns the current loss as float value
            #compiling theano functions may take a while...
            print "COMPILING THEANO TRAIN FUNCTION...",
            train_net = theano.function([X, targets], loss, updates=param_updates)
            train_net.trust_input = True
            print "DONE!"

            ################# PREDICTION FUNCTION ####################
            #we need the prediction function to calculate the validation error
            #this way we can test the net after training
            #first we need to get the net output
            net_output = layers.get_output(NET['out'], X, deterministic=True)
 
            #now we compile another theano function; this may take a while, too
            print "COMPILING THEANO TEST FUNCTION...",
            test_net = theano.function([X, targets], [net_output, loss])
            test_net.trust_input = True
            print "DONE!"

            val_train, subjects_test = all_subjects[train_index], all_subjects[test_index]

            np.random.shuffle(val_train)
            subjects_train, subjects_val = val_train[:int(0.90*len(val_train))], val_train[int(0.90*len(val_train)):]

            np.savez(directory + "/" + str(cv_fold) + "_train_set" + '.npz', subjects_train)
            np.savez(directory + "/" + str(cv_fold) + "_val_set" + '.npz', subjects_val)
            np.savez(directory + "/" + str(cv_fold) + "_test_set" + '.npz', subjects_test)

            train_loss = []
            val_loss = []
            r2_loss = []

            f = open(directory + "/" + str(cv_fold) + ".training", 'w')

            for epoch in range(0, 1000):
                #iterate over train split batches and calculate mean loss for epoch
                t_l = 0.0
                t_counter = 0.0
                v_l = 0.0
                v_counter = 0.0
                np.random.shuffle(subjects_train)
                #np.random.shuffle(subjects_val)

                for SUJREF in subjects_train[:int(len(subjects_train)/2.0)]:
                    subjects_train2 = subjects_train.copy()
                    np.random.shuffle(subjects_train2)
                    for training_slice in zip(*(iter(subjects_train2),) * int(0.45 * len(subjects_train))):
                        X = np.array([ALL_SEQUENCES[subject] - ALL_SEQUENCES[SUJREF] for subject in training_slice]).astype(np.float32)
                        y = np.array([expression[subject][gene] - expression[SUJREF][gene] for subject in training_slice]).astype(np.float32)
                        l = train_net(X, y.reshape(-1,1))
                        t_l += l
                        t_counter += 1
                        print(tissue, t_l/t_counter, t_counter)
                        X = np.array([ALL_SEQUENCES[SUJREF] - ALL_SEQUENCES[subject] for subject in training_slice]).astype(np.float32)
                        y = np.array([expression[SUJREF][gene] - expression[subject][gene] for subject in training_slice]).astype(np.float32)
                        l = train_net(X, y.reshape(-1,1))
                        t_l += l
                        t_counter += 1
                        print(gene, tissue, t_l/t_counter, t_counter)

                predicted_array = []
                actual_array = []

                for SUJREF in subjects_val[:int(len(subjects_val)/2.0)]:
                    for validation_slice in zip(*(iter(subjects_val),) * int(0.45 * len(subjects_val))):
                        X = np.array([ALL_SEQUENCES[subject] - ALL_SEQUENCES[SUJREF] for subject in validation_slice]).astype(np.float32)					
                        y = np.array([expression[subject][gene] - expression[SUJREF][gene]  for subject in validation_slice]).astype(np.float32)
                        prediction_batch, l = test_net(X, y.reshape(-1,1))
                        actual_array.extend(y)
                        predicted_array.extend(prediction_batch.reshape(1,-1)[0])
                        v_l += l
                        v_counter += 1 
                        X = np.array([ALL_SEQUENCES[SUJREF] - ALL_SEQUENCES[subject] for subject in validation_slice]).astype(np.float32)					
                        y = np.array([expression[SUJREF][gene] - expression[subject][gene]  for subject in validation_slice]).astype(np.float32)
                        prediction_batch, l = test_net(X, y.reshape(-1,1))
                        actual_array.extend(y)
                        predicted_array.extend(prediction_batch.reshape(1,-1)[0])
                        v_l += l
                        v_counter += 1 

                if len(val_loss) == 0 or all((v_l/v_counter) < val_loss):
                    print((v_l/v_counter), epoch, "best")
                    np.savez(directory + "/" + str(cv_fold) + "_model" + '.npz', *layers.get_all_param_values(NET['out']))

                train_loss.append(t_l/t_counter)
                val_loss.append(v_l/v_counter)
                r2 = r2_score(actual_array, predicted_array)
                r2_loss.append(r2)

                print "Done"
                print "EPOCH:", epoch, gene, tissue, 
                print "TRAIN LOSS:", train_loss[-1],
                print "VAL LOSS:", val_loss[-1], r2

                f.write(str(epoch) + "\t" + str(train_loss[-1]) + "\t" + str(val_loss[-1]) + "\t" + str(r2) + "\n")
                f.flush()

                if len(val_loss) > 90 and val_loss[-1] >= val_loss[-2] >= val_loss[-3]:
                    break
                    
            f.close()

            #################### TEST MODEL ######################
            with np.load(directory + "/" + str(cv_fold) + "_model" + '.npz') as g:
                param_values = [g['arr_%d' % i] for i in range(len(g.files))]
                layers.set_all_param_values(NET['out'], param_values)

            predicted_array = []
            actual_array = []

            for SUJREF in subjects_test[:int(len(subjects_test)/2.0)]:
                for test_slice in zip(*(iter(subjects_test),) * int(0.45 * len(subjects_test))):
                    X = np.array([ALL_SEQUENCES[subject] - ALL_SEQUENCES[SUJREF] for subject in test_slice]).astype(np.float32)					
                    y = np.array([expression[subject][gene] - expression[SUJREF][gene] for subject in test_slice]).astype(np.float32)
                    prediction_test, l = test_net(X, y.reshape(-1,1))
                    actual_array.extend(y)
                    predicted_array.extend(prediction_test.reshape(1,-1)[0])
                    X = np.array([ALL_SEQUENCES[SUJREF] - ALL_SEQUENCES[subject] for subject in test_slice]).astype(np.float32)					
                    y = np.array([expression[SUJREF][gene] - expression[subject][gene] for subject in test_slice]).astype(np.float32)
                    prediction_test, l = test_net(X, y.reshape(-1,1))
                    actual_array.extend(y)
                    predicted_array.extend(prediction_test.reshape(1,-1)[0])


            test_r2 = r2_score(actual_array, predicted_array)
            f = open(directory + "/" + str(cv_fold) + ".test", 'w')
            f.write(str(test_r2))
            f.close()