from lasagne import layers
from lasagne import nonlinearities
from lasagne import objectives
import theano
import theano.tensor as T
from lasagne import updates
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from collections import OrderedDict
import os
import os.path
import gzip
import numpy as np
import sys
import pickle
#### Creating output directories
tissue = "MuscleSkeletal"
directory = tissue + "_Stage1_classB_peaBrain"
if not os.path.exists(directory):
os.makedirs(directory)
#### Load and save functions
def save_obj(obj, name):
with open(name + '.pkl', 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)
chromosome_lengths = {'chr1': 249250621,
'chr2': 243199373,
'chr3': 198022430,
'chr4': 191154276,
'chr5': 180915260,
'chr6': 171115067,
'chr7': 159138663,
'chr8': 146364022,
'chr9': 141213431,
'chr10': 135534747,
'chr11': 135006516,
'chr12': 133851895,
'chr13': 115169878,
'chr14': 107349540,
'chr15': 102531392,
'chr16': 90354753,
'chr17': 81195210,
'chr18': 78077248,
'chr19': 59128983,
'chr20': 63025520,
'chr21': 48129895,
'chr22': 51304566}
#### Reading reference fasta
reference = {}
f = open("/gpfs1/well/got2d/ddn/moustafa/hg19/chromosome.fasta")
for line in f:
t = line.strip().split()
if "X" not in t[0] and "Y" not in t[0]:
chr = int(t[0].replace("chr", ''))
reference[chr] = t[1].upper()
assert len(reference[chr]) == chromosome_lengths[t[0]]
f.close()
#### Reading gene locations
#### Use this if you want to build Class A models
positions = {}
f = open("/gpfs1/well/got2d/ddn/moustafa/hg19/grch37_hg19_ensGene")
for line in f:
if "#" not in line:
t = line.strip().split()
if t[2] in chromosome_lengths:
gene = t[12]
chr = int(t[2].replace("chr", ''))
strand = t[3]
txStart = int(t[4])
txEnd = int(t[5])
if gene not in positions:
positions[gene] = {}
positions[gene]['txStart'] = txStart #end position for minus strand item
positions[gene]['txEnd'] = txEnd #start position for minus strand item
positions[gene]['Strand'] = strand
positions[gene]['chr'] = chr
elif chr in chromosome_lengths:
assert positions[gene]['Strand'] == strand
if positions[gene]['txStart'] > txStart:
positions[gene]['txStart'] = txStart
if positions[gene]['txEnd'] < txEnd:
positions[gene]['txEnd'] = txEnd
f.close()
#### Loading expression data
source = "rn.mean." + tissue + ".matrix"
expression = {}
f = open(source)
for line in f:
if "mean" not in line:
t = line.strip().split()
if t[0] in positions:
expression[t[0]] = float(t[1])
f.close()
#### Loading Class B input data
ALL_SEQUENCES = load_obj("input_sequences")
################## BUILDING THE MODEL ###################
def buildModel():
net = OrderedDict()
net['Input'] = layers.InputLayer(shape=(None, 32, 4000))
net['C1'] = layers.Conv1DLayer(net['Input'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
net['P1'] = layers.MaxPool1DLayer(net['C1'], pool_size=5, pad=1)
net['C2'] = layers.Conv1DLayer(net['P1'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
net['P2'] = layers.MaxPool1DLayer(net['C2'], pool_size=5, pad=1)
net['C3'] = layers.Conv1DLayer(net['P2'], num_filters=11, filter_size=5, stride = 1, pad=2, nonlinearity=nonlinearities.leaky_rectify)
net['P3'] = layers.MaxPool1DLayer(net['C3'], pool_size=5, pad=1)
net['noise'] = layers.DropoutLayer(net['P3'], p = 0.5)
net['encoding'] = layers.DenseLayer(net['noise'], num_units=1001, nonlinearity=None)
net['out'] = layers.DenseLayer(net['encoding'], num_units=1, nonlinearity=None)
return net
###################### TRAINING #########################
print "START TRAINING..."
all_genes = np.array(expression.keys())
kf = KFold(n_splits=10)
cv_fold = 0
for train_index, test_index in kf.split(all_genes):
NET = buildModel()
#################### LOSS FUNCTION ######################
def calc_loss(prediction, targets):
l = T.mean(objectives.squared_error(prediction, targets))
return l
#theano variable for the class targets
#this is the output vector the net should predict
targets = T.col('targets', dtype=theano.config.floatX)
X = T.tensor3()
#get the network output
prediction = layers.get_output(NET['out'], X)
#calculate the loss
loss = calc_loss(prediction, targets)
####################### UPDATES ########################
#get all trainable parameters (weights) of our net
params = layers.get_all_params(NET['out'], trainable=True)
#we use the adam update
#it changes params based on our loss function with the learning rate
param_updates = updates.adam(loss, params)
#################### TRAIN FUNCTION ######################
#the theano train functions takes images and class targets as input
#it updates the parameters of the net and returns the current loss as float value
#compiling theano functions may take a while...
print "COMPILING THEANO TRAIN FUNCTION...",
train_net = theano.function([X, targets], loss, updates=param_updates)
print "DONE!"
################# PREDICTION FUNCTION ####################
#we need the prediction function to calculate the validation error
#this way we can test the net after training
#first we need to get the net output
net_output = layers.get_output(NET['out'], X, deterministic=True)
net_encoding = layers.get_output(NET['encoding'], X, deterministic=True)
#now we compile another theano function; this may take a while, too
print "COMPILING THEANO TEST FUNCTION...",
test_net = theano.function([X, targets], [net_output, loss])
encode = theano.function([X], [net_encoding])
print "DONE!"
val_train, genes_test = all_genes[train_index], all_genes[test_index]
np.random.shuffle(val_train)
genes_train, genes_val = val_train[:int(0.9*len(val_train))], val_train[int(0.9*len(val_train)):]
np.savez(directory + "/" + str(cv_fold) + "_train_set" + '.npz', genes_train)
np.savez(directory + "/" + str(cv_fold) + "_val_set" + '.npz', genes_val)
np.savez(directory + "/" + str(cv_fold) + "_test_set" + '.npz', genes_test)
train_loss = []
val_loss = []
f = open(directory + "/" + str(cv_fold) + ".training", 'w')
for epoch in range(0, 1000):
#iterate over train split batches and calculate mean loss for epoch
t_l = 0.0
t_counter = 0.0
v_l = 0.0
v_counter = 0.0
np.random.shuffle(genes_train)
np.random.shuffle(genes_val)
for training_slice in zip(*(iter(genes_train),) * 199):
X = np.array([ALL_SEQUENCES[gene] for gene in training_slice]).astype(np.float32)
y = np.array([expression[gene] for gene in training_slice]).astype(np.float32)
l = train_net(X, y.reshape(-1,1))
t_l += l
t_counter += 1
print(t_l/t_counter, t_counter)
actual_array = []
predicted_array = []
for validation_slice in zip(*(iter(genes_val),) * 199):
X = np.array([ALL_SEQUENCES[gene] for gene in validation_slice]).astype(np.float32)
y = np.array([expression[gene] for gene in validation_slice]).astype(np.float32)
prediction_batch, l = test_net(X, y.reshape(-1,1))
actual_array.extend(y)
predicted_array.extend(prediction_batch.reshape(1,-1)[0])
v_l += l
v_counter += 1
if len(val_loss) == 0 or all((v_l/v_counter) < val_loss):
print((v_l/v_counter), epoch, "best")
np.savez(directory + "/" + str(cv_fold) + "_model" + '.npz', *layers.get_all_param_values(NET['out']))
train_loss.append(t_l/t_counter)
val_loss.append(v_l/v_counter)
print "Done"
print "EPOCH:", epoch,
print "TRAIN LOSS:", train_loss[-1],
print "VAL LOSS:", val_loss[-1],
r2 = r2_score(actual_array, predicted_array)
f.write(str(epoch) + "\t" + str(train_loss[-1]) + "\t" + str(val_loss[-1]) + "\t" + str(r2) + "\n")
f.flush()
################# EXIT CRITERIA ####################
if len(val_loss) > 100 and val_loss[-1] >= val_loss[-2] >= val_loss[-3]:
break
f.close()
################# MODEL PERFORMANCE ON TEST SET ####################
with np.load(directory + "/" + str(cv_fold) + "_model" + '.npz') as g:
param_values = [g['arr_%d' % i] for i in range(len(g.files))]
layers.set_all_param_values(NET['out'], param_values)
test_r2 = 0.0
predicted_array = []
actual_array = []
for test_slice in zip(*(iter(genes_test),) * 199):
X = np.array([ALL_SEQUENCES[gene] for gene in test_slice]).astype(np.float32)
y = np.array([expression[gene] for gene in test_slice]).astype(np.float32)
prediction_test, l = test_net(X, y.reshape(-1,1))
actual_array.extend(y)
predicted_array.extend(prediction_test.reshape(1,-1)[0])
test_r2 = r2_score(actual_array, predicted_array)
f = open(directory + "/" + str(cv_fold) + ".test", 'w')
f.write(str(test_r2))
f.close()
################# GENERATE EMBEDDINGS ####################
embedding = {}
for g in ALL_SEQUENCES:
embedding[g] = encode([ALL_SEQUENCES[g]])[0]
save_obj(embedding, directory + "/" + str(cv_fold) + "_embedding")
cv_fold += 1