1. This site uses cookies. By continuing to use this site, you are agreeing to our use of cookies. Learn More.

longest sequence

Discussion in 'Perl' started by skunk, May 17, 2008.

  1. skunk

    skunk New Member

    Joined:
    May 1, 2008
    Messages:
    2
    Likes Received:
    0
    Trophy Points:
    0
    hello,

    could some one please help me find the longest protein seqence only after ruuning the following code?
    Code:
    #!/usr/local/bin/perl
    #7_transorfs.pl
    #Author: Paul Stothard, Genome Canada Bioinformatics Help Desk
    #This script reads a single DNA sequence (in FASTA format) from a file and 
    #then generates the protein translations of ORFs found in any of the six
    #reading frames.
    
    use warnings;
    use strict;
    
    #Declare a variable to hold the minimum ORF size you want to be returned
    #(in bases).
    my $MINORF = 102;
    
    #Declare a hash table for translating DNA into protein.
    my %translationHash = 
        (gca => "A", gcg => "A", gct => "A", gcc => "A", gcn => "A",
         tgc => "C", tgt => "C",
         gat => "D", gac => "D",
         gaa => "E", gag => "E",
         ttt => "F", ttc => "F",
         gga => "G", ggg => "G", ggc => "G", ggt => "G", ggn => "G",
         cat => "H", cac => "H",
         ata => "I", att => "I", atc => "I",
         aaa => "K", aag => "K",
         cta => "L", ctg => "L", ctt => "L", ctc => "L", ctn => "L", tta => "L", ttg => "L",
         atg => "M",
         aat => "N", aac => "N",
         cca => "P", cct => "P", ccg => "P", ccc => "P", ccn => "P",
         caa => "Q", cag => "Q",
         cga => "R", cgg => "R", cgc => "R", cgt => "R", cgn => "R",
         aga => "R", agg => "R",
         tca => "S", tcg => "S", tcc => "S", tct => "S", tcn => "S",
         agc => "S", agt => "S",
         aca => "T", acg => "T", acc => "T", act => "T", acn => "T",
         gta => "V", gtg => "V", gtc => "V", gtt => "V", gtn => "V",
         tgg => "W",
         tat => "Y", tac => "Y",
         tag => "*", taa => "*", tga => "*");
    
    #Obtain the DNA sequence from a file.
    print "Enter the name of the sequence file (try mediumseq.txt) and then press Enter:\n";
    my $fileToRead = <STDIN>;
    chomp($fileToRead);
    open (DNAFILE, $fileToRead) or die( "Cannot open file : $!" );
    $/ = undef;
    my $directStrand = <DNAFILE>;
    my $sequenceTitle = "";
    if ($directStrand =~ m/(>[^\n]+)/){
       $sequenceTitle = $1;
       $sequenceTitle =~ s/>//;
    }
    else {
        die( "A FASTA sequence title was not found." );
    }
    
    #Once you are done with a file, close the filehandle.
    close (DNAFILE) or die( "Cannot close file : $!");
    
    #Filter the DNA sequence.
    $directStrand =~ s/>[^\n]+//;
    $directStrand =~ tr/GATCN/gatcn/;
    $directStrand =~ s/[^gatcn]//g;
    
    #Determine the reverse-complement of the sequence. The reverse-complement
    #will be used when looking for ORFs on the opposite strand.
    my $reverseComplement = $directStrand;
    $reverseComplement =~ tr/gatcn/ctagn/;
    my @arrayOfBases = split(/\B/, $reverseComplement);
    my @reversedArray = reverse(@arrayOfBases);
    $reverseComplement = join("",@reversedArray);
    
    #Declare an array to hold the ranges of ORFs found in the sequence.
    my @arrayOfORFs = ();
    
    #Declare an array to hold the translations of ORFs.
    my @arrayOfTranslations = ();
    
    #Below is the ORF finding code from the previous example, with some
    #differences. The first difference is that an outer for loop has
    #been added to the ORF finding code, so that it executes twice--
    #once for the direct strand, and once for the reverse-complement.
    #The other change is that the codon range description includes
    #a variable that is set to "+" for the first iteration of the
    #for loop, and "-" for the second iteration.
    for (my $i = 0; $i < 2; $i = $i + 1) {
        my $sequenceEntry = "";
        my $strand = "";
        if ($i == 0) {
    	#the first time the ORF finding code executes we will search
    	#for ORFs on the direct strand.
    	$sequenceEntry = $directStrand;
    	$strand = "+";
        }
        else {
    	#the second time the ORF finding code executes we will search
    	#for ORFs on the reverse-complement.
    	$sequenceEntry = $reverseComplement;
    	$strand = "-";
        }
    
        my @startsRF1 =();
        my @startsRF2 =();
        my @startsRF3 =();
        my @stopsRF1 = ();
        my @stopsRF2 = ();
        my @stopsRF3 = ();
    
        while ($sequenceEntry =~ m/atg/gi){
    	my $matchPosition = pos($sequenceEntry) - 3;
    	if (($matchPosition % 3) == 0) {
    	    push (@startsRF1, $matchPosition);
    	}
    	elsif ((($matchPosition + 2) % 3) == 0) {
    	    push (@startsRF2, $matchPosition);
    	}
    	else {
    	    push (@startsRF3, $matchPosition);
    	}
        }
    
        while ($sequenceEntry =~ m/tag|taa|tga/gi){
    	my $matchPosition = pos($sequenceEntry);
    	if (($matchPosition % 3) == 0) {
    	    push (@stopsRF1, $matchPosition);
    	}
    	elsif ((($matchPosition + 2) % 3) == 0) {
    	    push (@stopsRF2, $matchPosition);
    	}
    	else {
    	    push (@stopsRF3, $matchPosition);
    	}
        }
    
        my $codonRange = "";
        my $startPosition = 0;
        my $stopPosition = 0;
    
        @startsRF1 = reverse(@startsRF1);
        @stopsRF1 = reverse(@stopsRF1);
        while (scalar(@startsRF1) > 0) {
    	$codonRange = "";
    	$startPosition = pop(@startsRF1);
    	if ($startPosition < $stopPosition) {
    	    next;
    	}
    	while (scalar(@stopsRF1) > 0) {
    	    $stopPosition = pop(@stopsRF1);
    	    if ($stopPosition > $startPosition) {
    		last;
    	    }
    	}
    	if ($stopPosition <= $startPosition) {
    	    $stopPosition = length($sequenceEntry) - (length($sequenceEntry) % 3);
    	    $codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition;
    	    push (@arrayOfORFs, $codonRange);
    	    last;
    	}
    	else {
    	    $codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition;
    	    push (@arrayOfORFs, $codonRange);
    	}
        }
    
        $stopPosition = 0;
        @startsRF2 = reverse(@startsRF2);
        @stopsRF2 = reverse(@stopsRF2);
        while (scalar(@startsRF2) > 0) {
    	$codonRange = "";
    	$startPosition = pop(@startsRF2);
    	if ($startPosition < $stopPosition) {
    	    next;
    	}
    	while (scalar(@stopsRF2) > 0) {
    	    $stopPosition = pop(@stopsRF2);
    	    if ($stopPosition > $startPosition) {
    		last;
    	    }
    	}
    	if ($stopPosition <= $startPosition) {
    	    $stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 2) % 3);
    	    $codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition;
    	    push (@arrayOfORFs, $codonRange);
    	    last;
    	}
    	else {
    	    $codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition;
    	    push (@arrayOfORFs, $codonRange);
    	}
        }
    
        $stopPosition = 0;
        @startsRF3 = reverse(@startsRF3);
        @stopsRF3 = reverse(@stopsRF3);
        while (scalar(@startsRF3) > 0) {
    	$codonRange = "";
    	$startPosition = pop(@startsRF3);
    	if ($startPosition < $stopPosition) {
    	    next;
    	}
    	while (scalar(@stopsRF3) > 0) {
    	    $stopPosition = pop(@stopsRF3);
    	    if ($stopPosition > $startPosition) {
    		last;
    	    }
    	}
    	if ($stopPosition <= $startPosition) {
    	    $stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 1) % 3);
    	    $codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition;
    	    push (@arrayOfORFs, $codonRange);
    	    last;
    	}
    	else {
    	    $codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition;
    	    push (@arrayOfORFs, $codonRange);
    	}
        }
    }
    
    #Use a loop to examine each ORF description in @arrayOfORFs. Translate ORFs that are
    #longer than $MINORF, and add those translations to @arrayOfTranslations
    foreach(@arrayOfORFs) {
        
        #Use the matching operator to copy parts of the ORF range into variables.
        #The strand will be stored in $1, the reading frame in $2, the start of
        #the ORF in $3, and the end of the ORF in $4.
        $_ =~ m/([\+\-])(\d)\s(\d+)\.\.(\d+)/;
    
        #Skip ORFs that are smaller than $MINORF
        if (($4 - $3) < $MINORF) {
    	next;
        }
    
        #Declare a variable to hold the DNA sequence of the ORF.
        my $ORFsequence = "";
    
        #If the ORF is on the forward strand, use "substr" to obtain the sequence
        #of the ORF from $directStrand. Otherwise, obtain the sequence from 
        #$reverseComplement.
        if ($1 eq "+") {
    	$ORFsequence = substr($directStrand, $3, $4 - $3);
        }
        else {
    	$ORFsequence = substr($reverseComplement, $3, $4 - $3);
        }
    
        #Now use a for loop to translate the ORF sequence codon by codon.
        #The amino acids are added as elements to the array @growingProtein.
        my @growingProtein = ();
        for (my $i = 0; $i <= (length($ORFsequence) - 3); $i = $i + 3) {
    	my $codon = substr($ORFsequence, $i, 3);
    	if (exists( $translationHash{$codon} )){
    	    push (@growingProtein, $translationHash{$codon});
    	}
    	else {
    	    push (@growingProtein, "X");
    	}
        }
    
        #Now convert the @growingProtein array to a string.
        my $joinedAminoAcids = join("",@growingProtein);
    
        #Add a title to the protein sequence before adding it to @arrayOfTranslations.
        #For ORFs on the reverse strand adjust the position numbers so that they refer
        #to the direct strand.
        if ($1 eq "+") {
    	$joinedAminoAcids = ">" . "ORF rf " . $1 . $2 . ", from " . ($3 + 1) .
    	    " to " . $4 . ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids;
        }
        else {
    	$joinedAminoAcids = ">" . "ORF rf " . $1 . $2 . ", from " . 
    	(length($directStrand) - $4 + 1) . " to " . (length($directStrand) - $3) . 
    	    ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids;
        }
        #The text contained in $joinedAminoAcids can now be added on to our array
        #of protein translations.
        push (@arrayOfTranslations, $joinedAminoAcids);
    }
    
    #Now display the results.
    print "Translated ORFs for $sequenceTitle, length = " . length($directStrand) . " bp.\n";
    print "minimum ORF size kept = " . $MINORF . " bases.\n\n";
    foreach(@arrayOfTranslations) {
        print "$_ \n\n";
    }
    
    
    #Once your script is working using mediumseq.txt, try using sars.txt
    #as the input.
    
    
    
    the following is the mediumseq.txt:

    >random sequence
    ttatgaatcggggcctaagctgctattgtacattgggaggacgtaccaatagtgtcttgg
    ctttcgacgactaccattgttctgggtttcaggagtaaacccactatacgaagttagaga
    tggttgagaaaacttcgcgaaatgatacaagaggatgttcatattcaactggccgcgcca
    tataacgcgatcgttgttcaaacaaaagtccaagaagactttccacgtccacagctgccg
    agatcgagtcgggatgtaagctgggactaagtccacaacaggccgatgatggttccgaca
    ctgcaggggagatacctaaacgccgactcactctgatcgaccaaccatcgatacggcaac
    tggacgtacaaaagcttggcctggcggcgctgccttcatgcagaaagaccaacgaacgag
    gattaaccagtgtggaatccgaagaaaactctccgcatcgataggtcagctaccaacaca
    gagaaagagtaccgtcgtttctcgctctatctcatatatcagtgacatcttgacggattc
    gacagcagagtaacttaccaatctactttcacggcatgagttaagaaacgctaggctcgc
    aaagtagactaggaatttatagaggtgacggatatccccgtgactaataatcttgcactc
    ccagaaaaaacaaaggattaagcttaagtaaatccaaagaccgcgttccaagatgcaatt
    agcacctgagtattttatagatgcccgtccatttggcaaccctttccttaagacgagctt
    atctagtgccattttcaaaatgcgaaccctttgccatagcctcagtttaggactactaac
    ctatcccacgcgctaggagggtgtggcaacactgacacacatgctgcttgctaaaagcag
    tcggtacaaccgacagaattactttaaccacgcctgaaccaagtgtttaaacagatgact
    cgcaatgatggtgtaaatgctaaatgtttatatgcgtccg
     
    Last edited by a moderator: May 18, 2008
  2. pradeep

    pradeep Team Leader

    Joined:
    Apr 4, 2005
    Messages:
    1,646
    Likes Received:
    86
    Trophy Points:
    0
    Occupation:
    Programmer
    Location:
    Kolkata, India
    Home Page:
    Hi skunk, unfortunately none of us here is a genetic expert, so it'd be nice if you could tell us, what's wrong with your code!
     
  3. gprunescape22

    gprunescape22 New Member

    Joined:
    Jul 7, 2008
    Messages:
    1
    Likes Received:
    0
    Trophy Points:
    0
    Please sell your wares elsewhere
     
    Last edited by a moderator: Jul 9, 2008
  4. XXxxImmortalxxXX

    XXxxImmortalxxXX New Member

    Joined:
    Jun 27, 2007
    Messages:
    569
    Likes Received:
    18
    Trophy Points:
    0
    Stop Spamming This Crap Dude God
     
  5. XXxxImmortalxxXX

    XXxxImmortalxxXX New Member

    Joined:
    Jun 27, 2007
    Messages:
    569
    Likes Received:
    18
    Trophy Points:
    0
    Bann Gprunescape
     

Share This Page