library(mva) ndims <- 10 distfilename <- 'newNFkB.dist' datfilename <- 'newNFkB.txt' bindpcfile <- function( distfile, ndim ) { # read in the symmetric distance matrix for the differences between the oligos # format is a table with the first row being the oligo names, the first column the oligo names ane the other entries the corresponding distances d <- read.table(distfile,header=TRUE) # perform principal coordinates analysis bindmds( d, ndim ) } bindpc <- function( seqs, ndim ) { # principal coordinates analysis from an array of sequences d <- seqdistances( seqs ) bindmds( d, ndim ) } bindmds <- function ( d, ndim ) { # core function to perfrom princ coord analysis from a distance matrix # perform square-root transformation ds <- sqrt(d) # metric scaling, with ndim dimensions c <-cmdscale( ds, k=ndim, eig=TRUE) print(c$eig) print(c$eig/c$eig[1]) c$ndim = ndim colnames(c$points) <- paste( sep='', "PC", 1:ndim ) c$seqs <- colnames(d) c } seqdistances <- function( sequences, sequences2=NULL ) { # construct a distance matrix from a array of sequences seqs <- sort( unique(sequences)) revs <- revcomp( seqs ) s <- strsplit( seqs, '' ) r <- strsplit( revs, '' ) len <- length(seqs) if ( is.null(sequences2) ) { sequences2 <- sequences seqs2 <- seqs s2 <- s len2 <- len symmetric <- TRUE } else { seqs2 <- sort( unique(sequences2)) s2 <- strsplit( seqs2, '') len2 <- length(seqs2) symmetric <- FALSE } dist <- matrix(nrow=len,ncol=len2) rownames(dist) <- levels(seqs) colnames(dist) <- levels(seqs2) for( i in 1:len ) { seq1 <- seqs[[i]] as.character(seq1) si <- s[[i]] ri <- r[[i]] if ( symmetric ) { dist[i,i] <- 0 if ( i > 1 ) { for( j in 1:(i-1) ) { seq2 <- seqs[[j]] as.character(seq2) if ( seq2 < seq1 ) { x <- ifelse ( s[[i]] == s[[j]], 0, 1 ) forward <- sum (x) x <- ifelse ( r[[i]] == s[[j]], 0, 1 ) reverse <- sum (x) dist[i,j] <- ifelse ( forward < reverse , forward , reverse ) dist[j,i] <- dist[i,j] } } } } else { for( j in 1:len2 ) { seq2 <- seqs2[[j]] as.character(seq2) x <- ifelse ( s[[i]] == s2[[j]], 0, 1 ) forward <- sum (x) x <- ifelse ( r[[i]] == s2[[j]], 0, 1 ) reverse <- sum (x) dist[i,j] <- ifelse ( forward < reverse , forward , reverse ) } } } dist } bindgetseqs <- function ( datfile ) { b <- read.table(datfile, header=TRUE) oligos <- b$oligo oligos <- unique(sort(oligos)) as.character(oligos) oligos } ################################################## bindfit <- function( c, datfile, bindmodel=NULL ) { ################################################## # read in the binding data # input file must be a table with three columns: # oligo slide signal # ( a header line must be included ) # the signals should be raw (ie not log-transformed) # replicate observations for an oligo and a slide are permitted, represented as additional lines b <- read.table(datfile, header=TRUE) bdim <- dim(b) # the oligos oligos <- b$oligo reverse <- revcomp(oligos) if ( is.na(c) ) { warning("error - c is not set ") return( NA) } ndim <- c$ndim # construct a temporary table with the ndim principal coordinates for each row in b i <- 1 tmp <- matrix( nrow = bdim[1], ncol = ndim ) for ( ol in 1:length(oligos) ) { ind <- match(oligos[ol],rownames(c$points ),nomatch=0) if ( ind == 0 ) { rev <- reverse[ol] ind <- match(rev,rownames(c$points ),nomatch=0) if ( ind == 0 ) { print(paste("error ", oligos[ol], "not found")) return } } tmp[i,] <- c$points[ind,] i <- i+1 } # combine the data into one matrix combined <- cbind( b, tmp ) # create column names for the combined matrix, labelling the principal coordinates PC1, PC2 etc cn <- paste( sep='', "PC", 1:ndim ) # create default formula formula1 <- paste( "signal ~ slide + ", paste( collapse = " + ", cn )) formula2 <- paste( "rsignal ~ ", paste( collapse = " + ", cn )) print(formula1) print(bindmodel) print(length(bindmodel)) if ( is.null(bindmodel) ) { bindmodel <- formula1 } cn <- c( colnames(b), cn ) colnames(combined) <- cn print(colnames(combined)) # log-transform the signals combined$signal <- log(combined$signal) # fit the model print(bindmodel) fit <- lm ( as.formula(bindmodel), data = combined ) print(summary.lm(fit)) print(anova(fit)) print(fit) c$fit <- fit c } revcomp <- function ( seq ) { dnas <- seq as.character(dnas) splat <- strsplit( dnas, '' ) len <- length(dnas) rc = c() for( i in 1:len ) { splat1 = splat[[i]] len1 <- length(splat1) rev <- splat1 for( j in 1:len1 ) { x <- splat1[[j]] y <- 'N' if ( x == 'A' ) {y <- 'T'} if ( x == 'C' ) {y <- 'G'} if ( x == 'G' ) {y <- 'C'} if ( x == 'T' ) {y <- 'A'} rev[[len1-j+1]] <- y } rc <- c( rc, paste(rev,sep='',collapse='') ) } } bindpredict <- function ( c, predict, factor=10, relative=NULL ) { # result of bindfit ( regression model fitted to oligos, plus pca analysis ) # predict = array of seqs for prediction predict <- unique(sort(predict)) train <- c$seqs dist <- seqdistances( predict, train ) # extract the PC coefficients cm <- grep("^PC", names(c$fit$coefficients), value=TRUE ) # the PC terms in the model ci <- grep("^PC", names(c$fit$coefficients), value=FALSE ) # indices of the PC terms in the model co <- c$fit$coefficients[ci] # extract the PC coordinates predmat <- c$points[,cm] colnames(predmat) <- cm rownames(predmat) <- rownames(c$points) # make the prediction w <- exp(-log(factor)*dist) w <- w / apply( w, 1, sum ) bind <- w %*% predmat %*% co print(bind[relative,]) if ( ! is.null(relative) ) { bind <- bind - bind[relative,] } bind <- cbind( bind, rank(bind)/dim(bind)[1] ) colnames(bind) <- c( 'affinity', 'rank' ) print(bind) } binder <- function ( filename, ndim, formula=NULL ) { seqs <- bindgetseqs( filename ) c <- bindpc( seqs, ndim ) c <- bindfit( c, filename, formula ) c } pwm <- function( filename ) { d <- read.table(filename,header=T) m <- matrix( nrow=nrow(d), byrow=TRUE, unlist(strsplit(as.character(d$oligo),'')) ) nc <- ncol(m) nc <- paste( 'pos', c(1:nc), sep="") dd <- data.frame(cbind( d$slide,m ) ) names(dd) <- c( 'slide', nc ) names(dd) good <- c() for( n in names(dd) ) { if ( nlevels(dd[,n]) > 1 ) { good <- c( good, n ) } } f <- paste( good, sep= " + ", collapse=" + ") f <- paste( 'log(d$signal) ~ ' , f ) form <- as.formula( f ) fit <- lm ( form, data=dd ); print(anova(fit)) print(fit) print(summary.lm(fit)) return( fit ) } pwm.predict<- function( fit, seqs ) { m <- matrix( nrow=length(seqs), byrow=TRUE, unlist(strsplit(as.character(seqs),'')) ) nc <- ncol(m) nc <- paste( 'pos', c(1:nc), sep="") z <- matrix( ncol=1, nrow=length(seqs), 'slide2') m <- cbind( z, m ) dd <- data.frame( m ) names(dd) <- c( 'slide', nc ) for ( n in names(dd) ) { dd[,n] <- as.factor(dd[,n]) } result <- predict.lm( fit, dd ) }