#!/usr/local/bin/perl use lib "."; use CGI; use FileHandle; use strict; use Dict; use Globals; my $DEBUG; # NB: netscape's file-upload is broken! if you access a CGI script # that performs a file-upload without parameters, then pressing the # "back" button will display the results of the upload submission. # the simple solution is to add a bogus "GET" parameter on the script # when posting. my $ACTION = "genie?run"; # set the environment variable for the C pre-processor to blank # to ensure that it doesn't run. it causes problems with the web interface. $ENV{CPPE} = ""; ### MAIN: my $q = new CGI; if (!$q->param()) { print $q->header(); writeForm($q); } else { $DEBUG = $q->param('D'); print $q->header(); print $q->p('Sorry, this server is currently unavailable.'); exit; print $q->dump() if $DEBUG; my ($label, $seq) = getSeq($q); if (length($seq) > 10000) { Die("Sorry, but in order to allow for fair distribution of ". "computational resources, sequences are limited to a length ". "of 10,000."); } my $multiFlag = ($q->param('multi') =~ /single/i); my $bothStrands = ($q->param('strand') =~ /both/i); my ($results) = processSeq($label, $seq, $multiFlag, $bothStrands); ErrorCheck($q,$results); displayResults($q, $label, $seq, $results); } ### SUBROUTINES: # Displays submission form and description of Genie sub writeForm { my ($q) = @_; print $q->start_html(-title => 'Interactive Genie', -BGCOLOR => 'white'), $q->h1('Genie Interactive Submission Form'); my ($form) = ($q->start_multipart_form("POST",$ACTION). $q->em("Enter a DNA sequence: "). $q->br(). $q->textarea(-name => 'seq', -rows => 12, -cols => 40). $q->p(). $q->em($q->b("Or")." Upload a FASTA or Genbank file: "). $q->br(). $q->filefield(-name=>'seq_file', -size=>30). $q->p(). $q->radio_group(-name=>'multi', '-values' => ['Single Gene','Multigene']). $q->p(). $q->radio_group(-name=>'strand', '-values' => ['Forward Strand Only','Both Strands']). $q->hidden(-name => 'D'). $q->p(). $q->submit(). $q->end_form()); my $image; if ($LOGO) { $image = $q->img({-src => $LOGO}); } my $overview = ("Genie is a gene identification tool originally developed ". "at the ". $q->a({-href => "http://www.cse.ucsc.edu/research/compbio/"}, "University of California, Santa Cruz").". The program ". "uses a Hidden Markov Model (HMM) as a high-level grammar representing ". "gene structure. It uses statistical methods and neural networks ". "to model different gene structures, e.g. codons, splice site ". "boundaries, etc. A candidate sequence is \"threaded\" through ". "the HMM using a min-cost path depth-first search algorithm and ". "the system reports this \"optimal\" path as the predicted gene ". "structure.". $q->br(). "For more information, there are several ". $q->a({-href => "http://www.cse.ucsc.edu/research/compbio/research.html"}, "papers"). " describing the system.". $q->p(). $q->a({-href => $DEMO_RESULT},"Sample Results") ); my $authors = ("Genie has been exclusively licensed from the University of ". "California for commercial ". "distribution and continues to be developed at ". $q->a({-href => "http://www.neomorphic.com/"},"Neomorphic Software"). ". Original development and server equipment were made possible ". "in part by DOE grant DE-FG03-95ER62112, NSF grant DBI-9408579 and ". "a gift from ". $q->a({-href => "http://www.dec.com/"}, "Digital Equipment Corporation"). "."); my $limits = "There are known ".$q->a({-href => "$LIMITS"}, "limitations")."."; my $banner = $image.$q->p().$overview; print $q->table({-border => '0',-cellpadding=>'10'}, $q->TR({-valign => 'TOP'}, $q->td({-width => "30%"}, $q->font({-size=>"-1"},$banner)), $q->td($form))); print $q->font({-size=>"-1"}, $q->hr().$authors.$q->p().$limits); print $q->p(). $q->font({-size=>"-1"}, $q->address($q->a({-href=>"mailto:$MAILTO"}, $MAILTO_NAME))); print $q->end_html(); exit; } # reads the genbank file in $seqfile. Writes annotation to a special # file "$label.gbannot". returns the label and DNA. sub parseGB { my (@lines) = split(/\n/,$_[0]); my ($label) = ($lines[0] =~ /LOCUS\s+(\w+)/); DbgMsg("LABEL is $label"); my @annot; while (@lines) { my $line = shift(@lines); last if ($line =~ /^BASE COUNT/); push(@annot,$line); } my $line = shift(@lines); if ($line !~ /^ORIGIN/) { DbgMsg("BAD GENBANK FILE"); Die("Expected ORIGIN tag"); } DbgMsg("ATTEMPTING TO SAVE GENBANK HEADER TO $SAVEDIR/$label.gbannot"); open(GBANNOT,">$SAVEDIR/$label.gbannot") || die $!; foreach (@annot) { print GBANNOT $_."\n"; } close(GBANNOT); my $seq; while (@lines) { my $line = shift(@lines); last if ($line =~ /\/\//); $line =~ s/^.{10}//; # chop first 10 columns $seq .= $line; } # upcase, just so it looks nice $seq =~ tr/a-z/A-Z/; # remove all white space $seq =~ s/\s//gm; # remove any non-alphabetic chars $seq =~ s/[^A-Z]//gm; DbgMsg("FINAL INPUT SEQUENCE FOLLOWS:\n$seq\n\n"); return ($label,$seq); } # reads the FASTA or plain sequence file. returns the label and DNA sub parseFASTA { my ($seq) = @_; # remove FASTA header line, if it exists ($seq =~ s/^>(\w*).*//); my $label = $1; if (!$label) { $label = "WWW-SEQ-$$"; } DbgMsg("OPTIONAL FASTA HEADER OR TMP GENERATED LABEL IS '$label'\n"); DbgMsg("UPPERCASING, REMOVING SPACES, AND REMOVING NON-ALPHABETIC CHARACTERS\n"); # remove any hash comments $seq =~ s/^\#.*//gm; # upcase, just so it looks nice $seq =~ tr/a-z/A-Z/; # remove all white space $seq =~ s/\s//gm; # remove any non-alphabetic chars $seq =~ s/[^A-Z]//gm; DbgMsg("FINAL INPUT SEQUENCE FOLLOWS:\n$seq\n\n"); return ($label,$seq); } # reads the CGI query and returns the FASTA or genbank label (if any) and # the DNA sequence that was submitted sub getSeq { my ($q) = @_; my $seq; # the DNA sequence my $fh; # a filehandle to an uploaded file DbgMsg("RETRIEVING SEQUENCE\n"); # put sequence in $seq, either from file or parameter if (!($seq = $q->param('seq'))) { DbgMsg("DID NOT FIND STRING LABELLED 'SEQ'\nCHECKING FOR UPLOAD FILE\n"); if ($fh = $q->param('seq_file')) { my $tmpfilename = $fh; if ($ENV{REQUEST_METHOD}) { $tmpfilename = $q->tmpFileName($fh); } DbgMsg("OPENING UPLOADED TMP FILE $tmpfilename\n"); open(TMP,$tmpfilename); while () { $seq .= $_; } close(TMP); } else { DbgMsg("NO LABEL 'SEQ' NOR 'SEQ_FILE', RESENDING FORM\n"); writeForm($q); } } DbgMsg("RAW INPUT SEQUENCE FOLLOWS:\n---BEGIN---\n$seq\n---END---\n"); # assume genbank if starts with "LOCUS" if ($seq =~ /^LOCUS/) { return parseGB($seq); } else { return parseFASTA($seq); } } sub buildCmd { my ($prog, $host, $in, $err, $out, $multi, $args) = @_; DbgMsg("prog = $prog\nhost=$host\nin=$in\nerr=$err\nout=$out\nmulti=$multi\nargs=$args\n"); my $cmd = "$prog "; if ($multi) { $cmd .= "$SINGLE_CONFIG "; } else { $cmd .= "$MULTI_CONFIG "; } $cmd .= $args . " "; my $rcmd; if ($host !~ /^localhost$/i) { $rcmd = "cat $in | rsh $host 'sh -c \"$cmd 2>$err | cat - $err ; rm $err\"'"; } else { $rcmd = "$cmd 2>$err <$in | cat - $err; rm $err"; } $rcmd = "($rcmd) >> $out"; } sub writeFasta { my ($filename, $label, $seq) = @_; unlink($filename); open(SEQ,"> $filename"); print SEQ ">$label\n"; print SEQ "$seq\n"; close(SEQ); DbgMsg("WROTE FASTA SEQUENCE TO '$filename' AS FOLLOWS:\n---BEGIN---\n"); DbgMsg(`cat $filename`); DbgMsg("---END---\n\n"); } # Runs Genie as an external executable, returning the results # in a hash reference. sub processSeq { my ($name,$seq,$multi,$both) = @_; my $infile = "$SEQPREFIX.$$"; my $err = "$ERRPREFIX.$$"; my $outfile = "$OUTPREFIX.$$"; unlink($outfile); # write the sequence to a temporary file writeFasta($infile, $name, $seq); my $cmd = buildCmd($GENIE, $HOST, $infile, $err, $outfile, $multi,"-s"); DbgMsg("EXECUTING COMMAND:\t$cmd\n"); system($cmd); if ($both) { # write the sequence to a temporary file writeFasta($infile, "R-".$name, $seq); my $cmd = buildCmd($GENIE, $HOST, $infile, $err, $outfile, $multi, '-s -dREVERSE_STRAND'); DbgMsg("EXECUTING COMMAND:\t$cmd\n"); system($cmd); } my $genie = new FileHandle($outfile); DbgMsg("READING RESULTS INTO DICT\n"); my $dict = new Dict($genie); DbgMsg("REMOVING '$infile'\n"); unlink($infile); DbgMsg("REMOVING '$outfile'\n"); unlink($outfile); return $dict; } # sends the results back to the browser sub displayResults { my ($q, $label, $seq, $results) = @_; # contains "+" where coding is predicted my $annotFwd = ""; # contains "-" where reverse coding is predicted my $annotRev = ""; my $sequences = $results->Find("SEQUENCES"); # print "
\n";
    #  $results->Dump();
    #  print "
\n"; print $q->start_html(-title => 'Interactive Genie', -BGCOLOR => 'white'); print $q->h1($q->a({-href=>"$TOGB?$label"},$label)); my @EXONS; my $found = 0; # set to 1 if 1 or more exons found while ($sequences) { # assume there is only one sequence my $id = $sequences->GetKey(0); my $seqid = $sequences->Find($id); my $parse = $seqid->Find("PREDICTED_PARSE"); my $revFlag = ($id =~ /^R-/); # iterate through exon contents # placing each exon in @EXONS as a pre-formatted row in an HTML table # we also annotate while we're at it my $i; for ($i=0; $i < $parse->GetSize(); $i++) { my $component = $parse->GetKey($i); if ($component =~ /EXON/) { my $content = $parse->Get($i); my $bounds = $content->Find("Bounds"); $found = 1; push(@EXONS,$q->td([$component,@$bounds])); my ($start,$stop) = @$bounds; my $len = abs($stop - $start) + 1; if ($revFlag) { if (length($annotRev) < $start) { $annotRev .= " " x ($start - length($annotRev)); } substr($annotRev,$stop,$len) = "-" x $len; } else { if (length($annotFwd) < $stop) { $annotFwd .= " " x ($stop - length($annotFwd)); } substr($annotFwd,$start,$len) = "+" x $len; } } } $sequences = $results->FindNext(); } DbgMsg("CHECKING FOR $SAVEDIR"); if (!-d $SAVEDIR) { DbgMsg("$SAVEDIR MISSING. ATTEMPTING TO CREATE"); system("mkdir $SAVEDIR"); if ($? >> 8) { DbgMsg("CREATING $SAVEDIR FAILED: $!"); die($!); } } DbgMsg("SAVING PARSE AND SEQUENCE to $SAVEDIR/$label\n"); # save the parse to disk my $savefh = new FileHandle(">$SAVEDIR/$label"); $results->Dump(0,$savefh); $savefh->close(); if ($found) { print $q->table({-border => '1'}, $q->Tr({ -align => "CENTER", -valign => "TOP"}, [ $q->th(['','Start','Stop']), @EXONS ]) ); } else { print $q->b("No genes found."); } print $q->p(); $seq =~ s/(.{50})/$1\n/g; $annotFwd =~ s/(.{50})/$1\n/g; $annotRev =~ s/(.{50})/$1\n/g; my $outLines; my @seqLines = split(/\n/,$seq); my @annotFwdLines = split(/\n/,$annotFwd); my @annotRevLines = split(/\n/,$annotRev); while (@seqLines) { my $sLine = shift(@seqLines); my $aLine = shift(@annotFwdLines); my $rLine = shift(@annotRevLines); $outLines .= $sLine."\n"; $outLines .= $aLine."\n" if ($aLine !~ /^\s*$/); $outLines .= $rLine."\n" if ($rLine !~ /^\s*$/); $outLines .= "\n"; } print $q->pre($outLines); Globals::addAnnoSeq($q,"$TOGB?$label") if $found; print $q->end_html(); } # checks the dict for errors. if found, returns an error string to # print. similar to Dict::ErrorCheck, but doesn't print to STDERR and exit sub ErrorCheck { my ($q,$dict) = @_; my $err = $dict->Find("Error"); if ($err) { DbgMsg("ERROR FOUND OUTPUT\n"); my $errstr = "Error: \"$err\"\n"; $errstr .= "Program: \"".$dict->Find("Program")."\"\n" if $dict->Find("Program"); $errstr .= "Build: \"".$dict->Find("Build")."\"\n" if $dict->Find("Build"); $errstr .= "Diagnostic: \"".$dict->Find("Diagnostic")."\"\n" if $dict->Find("Diagnostic"); print $q->start_html(-title => 'Interactive Genie Error'); print $q->em($q->b("An error has occurred.")); print $q->p(); print $q->pre("Please be so kind as to email dkulp\@cse.ucsc.edu with the\n". "following error report, DNA input, and a brief description\n". "on how this error could be reproduced.\n\n".$errstr); print $q->end_html(); DbgMsg("DUMPING DICTIONARY\n"); $dict->Dump(); exit; } } sub DbgMsg { return if !$DEBUG; my (@msg) = @_; foreach (@msg) { s//\>/g; } print "
",@msg,"
\n"; } sub Die { my ($msg) = @_; print "

$msg

\n"; exit; }