Archive Ensembl HomeArchive Ensembl Home
BlastpWithReuse.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::BlastpWithReuse
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db      = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $blast = Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::BlastpWithReuse->new
00018  (
00019   -db      => $db,
00020   -input_id   => $input_id
00021   -analysis   => $analysis );
00022 $blast->fetch_input(); #reads from DB
00023 $blast->run();
00024 $blast->output();
00025 $blast->write_output(); #writes to DB
00026 
00027 =cut
00028 
00029 =head1 DESCRIPTION
00030 
00031 This object wraps Bio::EnsEMBL::Analysis::Runnable::Blast to add
00032 functionality to read and write to databases.
00033 The appropriate Bio::EnsEMBL::Analysis object must be passed for
00034 extraction of appropriate parameters. A Bio::EnsEMBL::Analysis::DBSQL::Obj is
00035 required for databse access.
00036 
00037 =cut
00038 
00039 =head1 CONTACT
00040 
00041   Contact Albert Vilella on module implemetation/design detail: avilella@ebi.ac.uk
00042   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00043 
00044 =cut
00045 
00046 =head1 APPENDIX
00047 
00048 The rest of the documentation details each of the object methods. 
00049 Internal methods are usually preceded with a _
00050 
00051 =cut
00052 
00053 package Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::BlastpWithReuse;
00054 
00055 use strict;
00056 use warnings;
00057 
00058 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00059 use Bio::EnsEMBL::Analysis::Runnable::Blast;
00060 use Bio::EnsEMBL::Analysis::Tools::BPliteWrapper;
00061 use Bio::EnsEMBL::Analysis::Tools::FilterBPlite;
00062 use Bio::EnsEMBL::Compara::PeptideAlignFeature;   # Blast_reuse
00063 
00064 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00065 
00066 
00067 =head2 fetch_input
00068 
00069     Title   :   fetch_input
00070     Usage   :   $self->fetch_input
00071     Function:   Fetches input data for repeatmasker from the database
00072     Returns :   none
00073     Args    :   none
00074 
00075 =cut
00076 
00077 sub fetch_input {
00078     my $self = shift @_;
00079 
00080     $self->compara_dba->dbc->disconnect_when_inactive(0);
00081 
00082 
00083     my $reuse_ss_id = $self->param('reuse_ss_id')
00084                     or die "'reuse_ss_id' is an obligatory parameter dynamically set in 'meta' table by the pipeline - please investigate";
00085 
00086     my $reuse_ss = $self->compara_dba()->get_SpeciesSetAdaptor->fetch_by_dbID($reuse_ss_id);    # this method cannot fail at the moment, but in future it may
00087 
00088     my $reuse_ss_hash = { map { $_->dbID() => 1 } @{ $reuse_ss->genome_dbs() } };
00089     $self->param('reuse_ss_hash', $reuse_ss_hash );
00090 
00091 
00092     my $member_id = $self->param('member_id')
00093                         or die "'member_id' is an obligatory parameter";
00094     my $member    = $self->compara_dba->get_MemberAdaptor->fetch_by_dbID($member_id)
00095                         or die "Could not fetch member with member_id='$member_id'";
00096     my $query     = $member->bioseq()
00097                         or die "Could not fetch bioseq for member with member_id='$member_id'";
00098 
00099     if ($query->length < 10) {
00100         $self->input_job->incomplete(0);    # to say "the execution completed successfully, but please record the thown message"
00101         die "Peptide is too short for BLAST";
00102     }
00103 
00104     $self->param('member', $member);
00105     $self->param('query',  $query);
00106 
00107       # We get the list of genome_dbs to execute, then go one by one with this member
00108 
00109     my $mlss_id         = $self->param('mlss_id') or die "'mlss_id' is an obligatory parameter";
00110     my $mlss            = $self->compara_dba()->get_MethodLinkSpeciesSetAdaptor->fetch_by_dbID($mlss_id) or die "Could not fetch mlss with dbID=$mlss_id";
00111     my $species_set     = $mlss->species_set;
00112     my $genome_db_list  = (ref($species_set) eq 'ARRAY') ? $species_set : $species_set->genome_dbs();
00113 
00114     print STDERR "Found ", scalar(@$genome_db_list), " genomes to blast this member against.\n" if ($self->debug);
00115     $self->param('genome_db_list', $genome_db_list);
00116 }
00117 
00118 
00119 =head2 run
00120 
00121   Arg[1]     : -none-
00122   Example    : $self->run;
00123   Function   : Runs the runnable set in fetch_input
00124   Returns    : 1 on successful completion
00125   Exceptions : dies if runnable throws an unexpected error
00126 
00127 =cut
00128 
00129 sub run {
00130     my $self = shift @_;
00131 
00132     my $member_id   = $self->param('member_id');
00133     my $member      = $self->param('member');
00134     my $query       = $self->param('query');
00135 
00136     my $reuse_db          = $self->param('reuse_db');   # if this parameter is an empty string, there will be no reuse
00137 
00138     my $reuse_ss_hash     = $self->param('reuse_ss_hash');
00139     my $reuse_this_member = $reuse_ss_hash->{$member->genome_db_id};
00140 
00141     my $fasta_dir         = $self->param('fasta_dir') or die "'fasta_dir' is an obligatory parameter";
00142 
00143     my $wublastp_exe      = $self->param('wublastp_exe') or die "'wublastp_exe' is an obligatory parameter";
00144     die "Cannot execute '$wublastp_exe'" unless(-x $wublastp_exe);
00145 
00146     my $blast_tmp_dir     = $self->param('blast_tmp_dir');
00147 
00148     my %cross_pafs = ();
00149 
00150   foreach my $genome_db (@{$self->param('genome_db_list')}) {
00151     my $fastafile = $genome_db->name() . '_' . $genome_db->assembly() . '.fasta';
00152     $fastafile =~ s/\s+/_/g;    # replace whitespace with '_' characters
00153     $fastafile =~ s/\/\//\//g;  # converts any // in path to /
00154     my $cross_genome_dbfile = $fasta_dir . '/' . $fastafile;   # we are always interested in the 'foreign' genome's fasta file, not the member's
00155 
00156         # Here we can look at a previous build and try to reuse the blast
00157         # results for this query peptide against this hit genome.
00158         # Only run if the blasts are not being reused:
00159     unless($reuse_db and $reuse_ss_hash->{$genome_db->dbID} and $reuse_this_member) {
00160 
00161           ## Define the filter from the parameters:
00162       my $thr_type = $self->param('-threshold_type');
00163       my $thr      = $self->param('-threshold');
00164       unless($thr_type and $thr) {
00165         ($thr_type, $thr) = ('PVALUE', 1e-10);
00166       }
00167       my $blast_options  = $self->param('blast_options') || '';
00168 
00169       ## Create a parser object. This Bio::EnsEMBL::Analysis::Tools::FilterBPlite
00170       ## object wraps the Bio::EnsEMBL::Analysis::Tools::BPliteWrapper which in
00171       ## turn wraps the Bio::EnsEMBL::Analysis::Tools::BPlite (a port of Ian
00172       ## Korf's BPlite from bioperl 0.7 into ensembl). This parser also filter
00173       ## the results according to threshold_type and threshold.
00174 
00175       my $regex = $self->param('regex') || '^(\S+)\s*';
00176 
00177       my $parser = Bio::EnsEMBL::Analysis::Tools::FilterBPlite->new(
00178         -regex          => $regex,
00179         -query_type     => 'pep',
00180         -input_type     => 'pep',
00181         -threshold_type => $thr_type,
00182         -threshold      => $thr,
00183       );
00184 
00185           ## Create the runnable with the previous parser. The filter is not required:
00186       my $runnable = Bio::EnsEMBL::Analysis::Runnable::Blast->new(
00187          -query     => $query,
00188          -database  => $cross_genome_dbfile,
00189          -program   => $wublastp_exe,
00190          -analysis  => $self->analysis,
00191          -options   => $blast_options,
00192          -parser    => $parser,
00193          -filter    => undef,
00194          ( $blast_tmp_dir ? (-workdir => $blast_tmp_dir) : () ),
00195       );
00196 
00197       $self->compara_dba->dbc->disconnect_when_inactive(1);
00198 
00199       ## call runnable run method in eval block
00200       eval { $runnable->run(); };
00201       ## Catch errors if any
00202       if ($@) {
00203         printf(STDERR ref($self->runnable)." threw exception:\n$@$_");
00204         if($@ =~ /"VOID"/) {
00205           print STDERR "this is OK: member_id='$member_id' doesn't have sufficient structure for a search\n";
00206         } else {
00207           die("$@$_");
00208         }
00209       }
00210       $self->compara_dba->dbc->disconnect_when_inactive(0);
00211           #since the Blast runnable takes in analysis parameters rather than an
00212           #analysis object, it creates new Analysis objects internally
00213           #(a new one for EACH FeaturePair generated)
00214           #which are a shadow of the real analysis object ($self->analysis)
00215           #The returned FeaturePair objects thus need to be reset to the real analysis object
00216       foreach my $feature (@{$runnable->output}) {
00217         if($feature->isa('Bio::EnsEMBL::FeaturePair')) {
00218           $feature->analysis($self->analysis);
00219           $feature->{null_cigar} = 1 if ($self->param('null_cigar'));
00220         }
00221         push @{$cross_pafs{$genome_db->dbID}}, $feature;
00222       }
00223 
00224     } # unless it is reuse-against-reuse
00225 
00226   } # for each genome_db
00227 
00228   $self->param('cross_pafs', \%cross_pafs);
00229 }
00230 
00231 
00232 sub write_output {
00233     my $self = shift @_;
00234 
00235     print STDERR "Inserting PAFs...\n" if ($self->debug);
00236 
00237     my $cross_pafs = $self->param('cross_pafs');
00238     foreach my $genome_db_id (keys %$cross_pafs) {
00239         $self->compara_dba->get_PeptideAlignFeatureAdaptor->store(@{$cross_pafs->{$genome_db_id}});
00240     }
00241 }
00242 
00243 
00244 1;