Go4Expert

Go4Expert (http://www.go4expert.com/)
-   Perl (http://www.go4expert.com/forums/perl/)
-   -   longest sequence (http://www.go4expert.com/forums/longest-sequence-t10663/)

skunk 17May2008 20:44

longest sequence
 
hello,

could some one please help me find the longest protein seqence only after ruuning the following code?
Code:

#!/usr/local/bin/perl
#7_transorfs.pl
#Author: Paul Stothard, Genome Canada Bioinformatics Help Desk
#This script reads a single DNA sequence (in FASTA format) from a file and
#then generates the protein translations of ORFs found in any of the six
#reading frames.

use warnings;
use strict;

#Declare a variable to hold the minimum ORF size you want to be returned
#(in bases).
my $MINORF = 102;

#Declare a hash table for translating DNA into protein.
my %translationHash =
    (gca => "A", gcg => "A", gct => "A", gcc => "A", gcn => "A",
    tgc => "C", tgt => "C",
    gat => "D", gac => "D",
    gaa => "E", gag => "E",
    ttt => "F", ttc => "F",
    gga => "G", ggg => "G", ggc => "G", ggt => "G", ggn => "G",
    cat => "H", cac => "H",
    ata => "I", att => "I", atc => "I",
    aaa => "K", aag => "K",
    cta => "L", ctg => "L", ctt => "L", ctc => "L", ctn => "L", tta => "L", ttg => "L",
    atg => "M",
    aat => "N", aac => "N",
    cca => "P", cct => "P", ccg => "P", ccc => "P", ccn => "P",
    caa => "Q", cag => "Q",
    cga => "R", cgg => "R", cgc => "R", cgt => "R", cgn => "R",
    aga => "R", agg => "R",
    tca => "S", tcg => "S", tcc => "S", tct => "S", tcn => "S",
    agc => "S", agt => "S",
    aca => "T", acg => "T", acc => "T", act => "T", acn => "T",
    gta => "V", gtg => "V", gtc => "V", gtt => "V", gtn => "V",
    tgg => "W",
    tat => "Y", tac => "Y",
    tag => "*", taa => "*", tga => "*");

#Obtain the DNA sequence from a file.
print "Enter the name of the sequence file (try mediumseq.txt) and then press Enter:\n";
my $fileToRead = <STDIN>;
chomp($fileToRead);
open (DNAFILE, $fileToRead) or die( "Cannot open file : $!" );
$/ = undef;
my $directStrand = <DNAFILE>;
my $sequenceTitle = "";
if ($directStrand =~ m/(>[^\n]+)/){
  $sequenceTitle = $1;
  $sequenceTitle =~ s/>//;
}
else {
    die( "A FASTA sequence title was not found." );
}

#Once you are done with a file, close the filehandle.
close (DNAFILE) or die( "Cannot close file : $!");

#Filter the DNA sequence.
$directStrand =~ s/>[^\n]+//;
$directStrand =~ tr/GATCN/gatcn/;
$directStrand =~ s/[^gatcn]//g;

#Determine the reverse-complement of the sequence. The reverse-complement
#will be used when looking for ORFs on the opposite strand.
my $reverseComplement = $directStrand;
$reverseComplement =~ tr/gatcn/ctagn/;
my @arrayOfBases = split(/\B/, $reverseComplement);
my @reversedArray = reverse(@arrayOfBases);
$reverseComplement = join("",@reversedArray);

#Declare an array to hold the ranges of ORFs found in the sequence.
my @arrayOfORFs = ();

#Declare an array to hold the translations of ORFs.
my @arrayOfTranslations = ();

#Below is the ORF finding code from the previous example, with some
#differences. The first difference is that an outer for loop has
#been added to the ORF finding code, so that it executes twice--
#once for the direct strand, and once for the reverse-complement.
#The other change is that the codon range description includes
#a variable that is set to "+" for the first iteration of the
#for loop, and "-" for the second iteration.
for (my $i = 0; $i < 2; $i = $i + 1) {
    my $sequenceEntry = "";
    my $strand = "";
    if ($i == 0) {
        #the first time the ORF finding code executes we will search
        #for ORFs on the direct strand.
        $sequenceEntry = $directStrand;
        $strand = "+";
    }
    else {
        #the second time the ORF finding code executes we will search
        #for ORFs on the reverse-complement.
        $sequenceEntry = $reverseComplement;
        $strand = "-";
    }

    my @startsRF1 =();
    my @startsRF2 =();
    my @startsRF3 =();
    my @stopsRF1 = ();
    my @stopsRF2 = ();
    my @stopsRF3 = ();

    while ($sequenceEntry =~ m/atg/gi){
        my $matchPosition = pos($sequenceEntry) - 3;
        if (($matchPosition % 3) == 0) {
            push (@startsRF1, $matchPosition);
        }
        elsif ((($matchPosition + 2) % 3) == 0) {
            push (@startsRF2, $matchPosition);
        }
        else {
            push (@startsRF3, $matchPosition);
        }
    }

    while ($sequenceEntry =~ m/tag|taa|tga/gi){
        my $matchPosition = pos($sequenceEntry);
        if (($matchPosition % 3) == 0) {
            push (@stopsRF1, $matchPosition);
        }
        elsif ((($matchPosition + 2) % 3) == 0) {
            push (@stopsRF2, $matchPosition);
        }
        else {
            push (@stopsRF3, $matchPosition);
        }
    }

    my $codonRange = "";
    my $startPosition = 0;
    my $stopPosition = 0;

    @startsRF1 = reverse(@startsRF1);
    @stopsRF1 = reverse(@stopsRF1);
    while (scalar(@startsRF1) > 0) {
        $codonRange = "";
        $startPosition = pop(@startsRF1);
        if ($startPosition < $stopPosition) {
            next;
        }
        while (scalar(@stopsRF1) > 0) {
            $stopPosition = pop(@stopsRF1);
            if ($stopPosition > $startPosition) {
                last;
            }
        }
        if ($stopPosition <= $startPosition) {
            $stopPosition = length($sequenceEntry) - (length($sequenceEntry) % 3);
            $codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition;
            push (@arrayOfORFs, $codonRange);
            last;
        }
        else {
            $codonRange = $strand . "1 " . $startPosition . ".." . $stopPosition;
            push (@arrayOfORFs, $codonRange);
        }
    }

    $stopPosition = 0;
    @startsRF2 = reverse(@startsRF2);
    @stopsRF2 = reverse(@stopsRF2);
    while (scalar(@startsRF2) > 0) {
        $codonRange = "";
        $startPosition = pop(@startsRF2);
        if ($startPosition < $stopPosition) {
            next;
        }
        while (scalar(@stopsRF2) > 0) {
            $stopPosition = pop(@stopsRF2);
            if ($stopPosition > $startPosition) {
                last;
            }
        }
        if ($stopPosition <= $startPosition) {
            $stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 2) % 3);
            $codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition;
            push (@arrayOfORFs, $codonRange);
            last;
        }
        else {
            $codonRange = $strand . "2 " . $startPosition . ".." . $stopPosition;
            push (@arrayOfORFs, $codonRange);
        }
    }

    $stopPosition = 0;
    @startsRF3 = reverse(@startsRF3);
    @stopsRF3 = reverse(@stopsRF3);
    while (scalar(@startsRF3) > 0) {
        $codonRange = "";
        $startPosition = pop(@startsRF3);
        if ($startPosition < $stopPosition) {
            next;
        }
        while (scalar(@stopsRF3) > 0) {
            $stopPosition = pop(@stopsRF3);
            if ($stopPosition > $startPosition) {
                last;
            }
        }
        if ($stopPosition <= $startPosition) {
            $stopPosition = length($sequenceEntry) - ((length($sequenceEntry) + 1) % 3);
            $codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition;
            push (@arrayOfORFs, $codonRange);
            last;
        }
        else {
            $codonRange = $strand . "3 " . $startPosition . ".." . $stopPosition;
            push (@arrayOfORFs, $codonRange);
        }
    }
}

#Use a loop to examine each ORF description in @arrayOfORFs. Translate ORFs that are
#longer than $MINORF, and add those translations to @arrayOfTranslations
foreach(@arrayOfORFs) {
   
    #Use the matching operator to copy parts of the ORF range into variables.
    #The strand will be stored in $1, the reading frame in $2, the start of
    #the ORF in $3, and the end of the ORF in $4.
    $_ =~ m/([\+\-])(\d)\s(\d+)\.\.(\d+)/;

    #Skip ORFs that are smaller than $MINORF
    if (($4 - $3) < $MINORF) {
        next;
    }

    #Declare a variable to hold the DNA sequence of the ORF.
    my $ORFsequence = "";

    #If the ORF is on the forward strand, use "substr" to obtain the sequence
    #of the ORF from $directStrand. Otherwise, obtain the sequence from
    #$reverseComplement.
    if ($1 eq "+") {
        $ORFsequence = substr($directStrand, $3, $4 - $3);
    }
    else {
        $ORFsequence = substr($reverseComplement, $3, $4 - $3);
    }

    #Now use a for loop to translate the ORF sequence codon by codon.
    #The amino acids are added as elements to the array @growingProtein.
    my @growingProtein = ();
    for (my $i = 0; $i <= (length($ORFsequence) - 3); $i = $i + 3) {
        my $codon = substr($ORFsequence, $i, 3);
        if (exists( $translationHash{$codon} )){
            push (@growingProtein, $translationHash{$codon});
        }
        else {
            push (@growingProtein, "X");
        }
    }

    #Now convert the @growingProtein array to a string.
    my $joinedAminoAcids = join("",@growingProtein);

    #Add a title to the protein sequence before adding it to @arrayOfTranslations.
    #For ORFs on the reverse strand adjust the position numbers so that they refer
    #to the direct strand.
    if ($1 eq "+") {
        $joinedAminoAcids = ">" . "ORF rf " . $1 . $2 . ", from " . ($3 + 1) .
            " to " . $4 . ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids;
    }
    else {
        $joinedAminoAcids = ">" . "ORF rf " . $1 . $2 . ", from " .
        (length($directStrand) - $4 + 1) . " to " . (length($directStrand) - $3) .
            ", " . ($4 - $3) . " bases.\n" . $joinedAminoAcids;
    }
    #The text contained in $joinedAminoAcids can now be added on to our array
    #of protein translations.
    push (@arrayOfTranslations, $joinedAminoAcids);
}

#Now display the results.
print "Translated ORFs for $sequenceTitle, length = " . length($directStrand) . " bp.\n";
print "minimum ORF size kept = " . $MINORF . " bases.\n\n";
foreach(@arrayOfTranslations) {
    print "$_ \n\n";
}


#Once your script is working using mediumseq.txt, try using sars.txt
#as the input.

the following is the mediumseq.txt:

>random sequence
ttatgaatcggggcctaagctgctattgtacattgggaggacgtaccaat agtgtcttgg
ctttcgacgactaccattgttctgggtttcaggagtaaacccactatacg aagttagaga
tggttgagaaaacttcgcgaaatgatacaagaggatgttcatattcaact ggccgcgcca
tataacgcgatcgttgttcaaacaaaagtccaagaagactttccacgtcc acagctgccg
agatcgagtcgggatgtaagctgggactaagtccacaacaggccgatgat ggttccgaca
ctgcaggggagatacctaaacgccgactcactctgatcgaccaaccatcg atacggcaac
tggacgtacaaaagcttggcctggcggcgctgccttcatgcagaaagacc aacgaacgag
gattaaccagtgtggaatccgaagaaaactctccgcatcgataggtcagc taccaacaca
gagaaagagtaccgtcgtttctcgctctatctcatatatcagtgacatct tgacggattc
gacagcagagtaacttaccaatctactttcacggcatgagttaagaaacg ctaggctcgc
aaagtagactaggaatttatagaggtgacggatatccccgtgactaataa tcttgcactc
ccagaaaaaacaaaggattaagcttaagtaaatccaaagaccgcgttcca agatgcaatt
agcacctgagtattttatagatgcccgtccatttggcaaccctttcctta agacgagctt
atctagtgccattttcaaaatgcgaaccctttgccatagcctcagtttag gactactaac
ctatcccacgcgctaggagggtgtggcaacactgacacacatgctgcttg ctaaaagcag
tcggtacaaccgacagaattactttaaccacgcctgaaccaagtgtttaa acagatgact
cgcaatgatggtgtaaatgctaaatgtttatatgcgtccg

pradeep 20May2008 09:59

Re: longest sequence
 
Hi skunk, unfortunately none of us here is a genetic expert, so it'd be nice if you could tell us, what's wrong with your code!

gprunescape22 7Jul2008 15:56

Re: longest sequence
 
Please sell your wares elsewhere

XXxxImmortalxxXX 7Jul2008 16:12

Re: longest sequence
 
Stop Spamming This Crap Dude God

XXxxImmortalxxXX 7Jul2008 16:13

Re: longest sequence
 
Bann Gprunescape


All times are GMT +5.5. The time now is 06:16.