Archive Ensembl HomeArchive Ensembl Home
DumpMemberSequencesIntoFasta.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::DumpMemberSequencesIntoFasta
00011 
00012 This runnable dumps all members from given source_names into one big FASTA file.
00013 
00014 =cut
00015 
00016 package Bio::EnsEMBL::Compara::RunnableDB::DumpMemberSequencesIntoFasta;
00017 
00018 use strict;
00019 
00020 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00021 
# Default values for the parameters read by run(). Any of them can be
# overridden by the job/analysis configuration.
#
# Returns a reference to a freshly built hash on every call.
sub param_defaults {
    my %defaults = (
        # Output file that run() opens for writing - you should definitely change it.
        'fasta_name'   => 'metazoa.pep',

        # Wrap sequence lines at this many characters for readability
        # (set to 0 to disable wrapping).
        'split_width'  => 72,

        # When true, every FASTA name is prefixed with 'seq_id_<sequence_id>_'
        # (for faster mapping).
        'idprefixed'   => 1,

        # When set to a number N, sequences containing N or more consecutive
        # X-es are left out of the dump; undef (the default) disables this filter.
        'removeXed'    => undef,

        # Only members whose source_name is in this list are dumped.
        'source_names' => [
            'ENSEMBLPEP',
            'Uniprot/SWISSPROT',
            'Uniprot/SPTREMBL',
            'EXTERNALPEP',
        ],
    );

    return \%defaults;
}
00031 
# Dump the sequences of all members that belong to the configured
# source_names into one FASTA file.
#
# Parameters read through $self->param():
#   fasta_name   - path of the output file (created or truncated)
#   split_width  - wrap sequence lines at this many characters;
#                  a false value disables wrapping
#   idprefixed   - when true, record names are prefixed with
#                  'seq_id_<sequence_id>_'
#   removeXed    - when true, sequences containing at least that many
#                  consecutive X-es are skipped
#   source_names - array reference of member.source_name values to include
#
# Sequences consisting solely of X are never dumped; each one is reported
# on STDERR. Dies if the output file cannot be opened, written to or closed.
sub run {
    my $self = shift @_;

    my $fasta_name  = $self->param('fasta_name');
    my $split_width = $self->param('split_width');
    my $idprefixed  = $self->param('idprefixed');
    my $removeXed   = $self->param('removeXed');

    my $source_names = join(', ', map { "'$_'" } @{ $self->param('source_names') } );

    my $sql = "SELECT m.sequence_id, m.stable_id, m.description, s.sequence " .
                " FROM member m, sequence s " .
                " WHERE m.source_name in ( $source_names ) ".
                " AND m.sequence_id=s.sequence_id ".
                " GROUP BY m.sequence_id ".
                " ORDER BY m.sequence_id, m.stable_id";

    # Three-argument open with a lexical handle: the file name can never be
    # mistaken for an open mode, and the handle does not leak into a
    # package-global bareword.
    open(my $fasta_fh, '>', $fasta_name)
        or die "Could not open $fasta_name for output: $!\n";

    print("writing fasta to file '$fasta_name'\n");

    my $sth = $self->compara_dba()->dbc->prepare( $sql );
    $sth->execute();

    my ($sequence_id, $stable_id, $description, $sequence);
    $sth->bind_columns( \$sequence_id, \$stable_id, \$description, \$sequence );

    while( $sth->fetch() ) {
        if ($sequence =~ /^X+$/) {
            print STDERR "$stable_id is all X not dumped\n";
            next;
        }

        # Optional filter on long runs of X (disabled while removeXed is false).
        next if $removeXed and $sequence =~ /X{$removeXed,}/;

        $sequence =~ s/(.{$split_width})/$1\n/g if($split_width);
        # Wrapping leaves a trailing newline when the length is an exact
        # multiple of split_width; drop it so no empty line is written.
        chomp $sequence;

        my $nameprefix = $idprefixed ? ('seq_id_'.$sequence_id.'_') : '';
        print {$fasta_fh} ">${nameprefix}${stable_id} $description\n$sequence\n"
            or die "Could not write to $fasta_name: $!\n";
    }
    $sth->finish();

    # Buffered write errors (e.g. a full disk) only surface at close time,
    # so a failing close must not go unnoticed.
    close($fasta_fh)
        or die "Could not close $fasta_name: $!\n";
}
00076 
00077 1;
00078