Archive Ensembl HomeArchive Ensembl Home
BlastAndParsePAF.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::MercatorPecan::BlastAndParsePAF 
00022 
00023 =head1 SYNOPSIS
00024 
00025 
00026 =head1 DESCRIPTION
00027 
00028 Create fasta file containing batch_size number of sequences. Run ncbi_blastp and parse the output into
00029 PeptideAlignFeature objects. Store PeptideAlignFeature objects in the compara database
00030 Supported keys:
00031     'blast_param' => <string>
00032         ncbi blastp parameters
00033 eg "-num_alignments 20 -seg 'yes' -best_hit_overhang 0.2 -best_hit_score_edge 0.1 -use_sw_tback"
00034     'fasta_dir' => <directory path>
00035         Path to fasta files
00036     'mlss_id' => <number>
00037         Method link species set id for Pecan. Obligatory
00038     'genome_db_id' => <number>
00039         Species genome db id.
00040     'offset' => <number>
00041         Offset into ordered array of member_ids. Obligatory
00042     'subset_id' => <number>
00043         Subset id of members to select
00044     'start_member_id' => <number>
00045         Member id of member at 'offset' in order array of member ids. Obligatory
00046     'batch_size' => <number>
00047         Number of members to write to fasta file
00048     'reuse_ss_id' => <number>
00049         Reuse species set id. Normally stored in the meta table. Obligatory.
00050     'do_transactions' => <0|1>
00051         Whether to do transactions. Default is yes.
00052 
00053 
00054 =cut
00055 
00056 package Bio::EnsEMBL::Compara::RunnableDB::MercatorPecan::BlastAndParsePAF;
00057 
00058 use strict;
00059 use FileHandle;
00060 
00061 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00062 use Bio::EnsEMBL::Analysis::Tools::FeatureFactory;
00063 use Bio::EnsEMBL::Utils::Exception qw(throw warning info);
00064 use Bio::EnsEMBL::Utils::SqlHelper;
00065 
00066 #
00067 # Fetch members and sequences from the database. 
00068 # Return a sorted list based on member_id, starting at $offset and with $batch_size items
00069 #
00070 sub load_members_from_db{
00071     my ($self, $start_member_id, $offset, $batch_size) = @_;
00072 
00073     my $idprefixed              = $self->param('idprefixed')  || 0;
00074     my $debug                   = $self->debug() || $self->param('debug') || 0;
00075     my $subset_id               = $self->param('subset_id');
00076 
00077     #Get list of members and sequences
00078     my $sql = "SELECT member_id, sequence_id, stable_id, sequence FROM member JOIN sequence USING (sequence_id) JOIN subset_member USING (member_id) WHERE  subset_id=?";
00079     
00080     my $sth = $self->compara_dba->dbc->prepare( $sql );
00081     $sth->execute($subset_id);
00082 
00083     my $member_list;
00084     while( my ($member_id, $seq_id, $stable_id, $seq) = $sth->fetchrow() ) {
00085         $seq=~ s/(.{72})/$1\n/g;
00086         chomp $seq;
00087     my $fasta_line = ($idprefixed
00088                                 ? ">seq_id_${seq_id}_${stable_id}\n$seq\n"
00089                                 : ">$stable_id sequence_id=$seq_id member_id=$member_id\n$seq\n") ;
00090     my $member_sequence;
00091     %$member_sequence = ( member_id => $member_id,
00092                   fasta_line => $fasta_line);
00093     push @$member_list, $member_sequence;
00094     }
00095     $sth->finish();
00096     $self->compara_dba->dbc->disconnect_when_inactive(1);
00097 
00098     #Sort on member_id
00099     my $sorted_list;
00100     @$sorted_list = sort {$a->{member_id} <=> $b->{member_id}} @$member_list;
00101 
00102     my $fasta_list;
00103     #Check start_member_id is the same as the item at offset
00104     if ($self->param('start_member_id') ne $sorted_list->[$offset]->{member_id}) {
00105     throw("start_member_id " . $self->param('start_member_id') . " is not the same as offset " . $sorted_list->[$offset]->{member_id});
00106     }
00107     for (my $i = $offset; $i < ($offset+$batch_size); $i++) {
00108     my $member_id = $sorted_list->[$i]->{member_id};
00109     my $fasta_line = $sorted_list->[$i]->{fasta_line};
00110     push @$fasta_list, $fasta_line;
00111     }
00112     return $fasta_list;
00113 }
00114 
00115 sub load_name2index_mapping_from_db {
00116     my ($self) = @_;
00117 
00118     my $sql = qq {
00119         SELECT sequence_id, stable_id
00120           FROM member
00121          WHERE sequence_id
00122       GROUP BY sequence_id
00123     };
00124 
00125     my $sth = $self->compara_dba->dbc->prepare( $sql );
00126     $sth->execute();
00127 
00128     my %name2index = ();
00129     while( my ($seq_id, $stable_id) = $sth->fetchrow() ) {
00130         $name2index{$stable_id} = $seq_id;
00131     }
00132     $sth->finish();
00133     $self->compara_dba->dbc->disconnect_when_inactive(1);
00134 
00135     return \%name2index;
00136 }
00137 
00138 #
00139 # Load stable_id name to member_id mappings from the database
00140 #
00141 sub load_name2member_mapping_from_db {
00142     my ($self) = @_;
00143 
00144     my $sql = qq {
00145         SELECT member_id, stable_id
00146           FROM member
00147     };
00148 
00149     my $sth = $self->compara_dba->dbc->prepare( $sql );
00150     $sth->execute();
00151 
00152     my %name2index = ();
00153     while( my ($member_id, $stable_id) = $sth->fetchrow() ) {
00154         $name2index{$stable_id} = $member_id;
00155     }
00156     $sth->finish();
00157     $self->compara_dba->dbc->disconnect_when_inactive(1);
00158 
00159     return \%name2index;
00160 }
00161 
00162 sub load_name2index_mapping_from_file {
00163     my ($self, $filename) = @_;
00164 
00165     my %name2index = ();
00166     open(MAPPING, "<$filename") || die "Could not open name2index mapping file '$filename'";
00167     while(my $line = <MAPPING>) {
00168         chomp $line;
00169         my ($idx, $stable_id) = split(/\s+/,$line);
00170         $name2index{$stable_id} = $idx;
00171     }
00172     close MAPPING;
00173 
00174     return \%name2index;
00175 }
00176 
00177 sub name2index { # can load the name2index mapping from db/file if necessary
00178     my ($self, $name) = @_;
00179 
00180     if($name=~/^seq_id_(\d+)_/) {
00181         return $1;
00182     } else {
00183         my $name2index;
00184         unless($name2index = $self->param('name2index')) {
00185             my $tabfile                 = $self->param('tabfile');
00186 
00187             $name2index = $self->param('name2index', $tabfile
00188                 ? $self->load_name2index_mapping_from_file($tabfile)
00189                 : $self->load_name2index_mapping_from_db()
00190             );
00191         }
00192         return $name2index->{$name} || "UNKNOWN($name)";
00193     }
00194 }
00195 
00196 #
00197 # Convert stable_id name to member_id.
00198 #
00199 sub name2member { 
00200     my ($self, $name) = @_;
00201 
00202     if($name=~/^member_id_(\d+)_/) {
00203         return $1;
00204     } else {
00205         my $name2member;
00206         unless($name2member = $self->param('name2member')) {
00207             my $tabfile                 = $self->param('tabfile');
00208 
00209             $name2member = $self->param('name2member', $tabfile
00210                 ? $self->load_name2member_mapping_from_file($tabfile)
00211                 : $self->load_name2member_mapping_from_db()
00212             );
00213         }
00214         return $name2member->{$name} || "UNKNOWN($name)";
00215     }
00216 }
00217 
00218 sub fetch_input {
00219     my $self = shift @_;
00220 
00221     my $start_member_id = $self->param('start_member_id') || die "'start_member_id' is an obligatory parameter, please set it in the input_id hashref";
00222     my $offset          = $self->param('offset');
00223     die "'offset' is an obligatory parameter" if (!defined $offset);
00224 
00225     my $batch_size      = $self->param('batch_size') || 1000;
00226     my $debug           = $self->debug() || $self->param('debug') || 0;
00227 
00228     my $fasta_list      = $self->load_members_from_db($start_member_id, $offset, $batch_size);
00229 
00230     if (!defined $self->param('genome_db_id')) {
00231     die "'genome_db_id' is an obligatory parameter";
00232     }
00233 
00234     #set default to do transactions
00235     if (!defined $self->param('do_transactions')) {
00236     $self->param('do_transactions', 1);
00237     }
00238 
00239     if($debug) {
00240         print "Loaded ".scalar(@$fasta_list)." sequences\n";
00241     }
00242 
00243     $self->param('fasta_list', $fasta_list);
00244 
00245     my $reuse_ss_id = $self->param('reuse_ss_id')
00246                     or die "'reuse_ss_id' is an obligatory parameter dynamically set in 'meta' table by the pipeline - please investigate";
00247 
00248     my $reuse_ss = $self->compara_dba()->get_SpeciesSetAdaptor->fetch_by_dbID($reuse_ss_id);    # this method cannot fail at the moment, but in future it may
00249 
00250     my $reuse_ss_hash = { map { $_->dbID() => 1 } @{ $reuse_ss->genome_dbs() } };
00251     $self->param('reuse_ss_hash', $reuse_ss_hash );
00252 
00253      # We get the list of genome_dbs to execute, then go one by one with this member
00254 
00255     my $mlss_id         = $self->param('mlss_id') or die "'mlss_id' is an obligatory parameter";
00256     my $mlss            = $self->compara_dba()->get_MethodLinkSpeciesSetAdaptor->fetch_by_dbID($mlss_id) or die "Could not fetch mlss with dbID=$mlss_id";
00257     my $species_set     = $mlss->species_set;
00258 
00259     my $genome_db_list;
00260 
00261     #If reusing this genome_db, only need to blast against the 'fresh' genome_dbs
00262     if ($reuse_ss_hash->{$self->param('genome_db_id')}) {
00263     foreach my $gdb (@$species_set) {
00264         if (!$reuse_ss_hash->{$gdb->dbID}) {
00265         push @$genome_db_list, $gdb;
00266         }
00267     }
00268     } else {
00269     #Using 'fresh' genome_db therefore must blast against everything
00270     $genome_db_list  = (ref($species_set) eq 'ARRAY') ? $species_set : $species_set->genome_dbs();
00271     }
00272 
00273     print STDERR "Found ", scalar(@$genome_db_list), " genomes to blast this member against.\n" if ($self->debug);
00274     $self->param('genome_db_list', $genome_db_list);
00275 
00276 }
00277 
00278 sub parse_blast_table_into_paf {
00279     my ($self, $filename, $min_self_dist, $qgenome_db_id, $hgenome_db_id) = @_;
00280 
00281     my $debug                   = $self->debug() || $self->param('debug') || 0;
00282 
00283     my $features;
00284     my $roundto    = $self->param('roundto') || 0.0001;
00285 
00286     my $curr_name    = '';
00287     my $curr_index   = 0;
00288 
00289     open(BLASTTABLE, "<$filename") || die "Could not open the blast table file '$filename'";
00290     
00291     print "blast $qgenome_db_id $hgenome_db_id $filename\n" if $debug;
00292 
00293     while(my $line = <BLASTTABLE>) {
00294 
00295         unless ($line =~ /^#/) {
00296             my ($qname, $hname, $evalue, $score, $nident,$pident, $qstart, $qend, $hstart,$hend, $length, $positive, $ppos ) = split(/\s+/, $line);
00297 
00298         my $source_name = "ENSEMBLEXON";
00299 
00300         $hname =~ s/[$source_name]*://; #Need to remove "$source_name:" from name. Need to check if this is a general problem or just for
00301                          #my old test database
00302             my $qmember_id = $self->name2member($qname);
00303             my $hmember_id = $self->name2member($hname);
00304         my $analysis_id = $self->analysis->dbID;
00305 
00306         my $feature;
00307         %$feature = (analysis_id       => $analysis_id,
00308              qmember_id        => $qmember_id,
00309              hmember_id        => $hmember_id,
00310              qstable_id        => $qname,
00311              hstable_id        => $hname,
00312              qgenome_db_id     => $qgenome_db_id,
00313              hgenome_db_id     => $hgenome_db_id,
00314              perc_ident        => $pident,
00315              score             => $score,
00316              evalue            => $evalue,
00317              qstart            => $qstart,
00318              qend              => $qend,
00319              hstart            => $hstart,
00320              hend              => $hend,
00321              length            => $length,
00322              perc_ident        => $pident,
00323              identical_matches => $nident,
00324              positive          => $positive,
00325              perc_pos          => $ppos);
00326               
00327         print "feature query $qgenome_db_id $qmember_id hit $hgenome_db_id $hmember_id $hname $qstart $qend $hstart $hend $length $nident $positive\n" if $debug;
00328 #       push @$features, $feature;
00329         push @{$features->{$qmember_id}}, $feature;
00330             #$matrix_hash{$curr_index}{$hit_index} = $distance;
00331         }
00332     }
00333     close BLASTTABLE;
00334     if (!defined $features) {
00335     return $features;
00336     }
00337 
00338     #group together by qmember_id and rank the hits
00339     foreach my $qmember_id (keys %$features) {
00340     my $qfeatures = $features->{$qmember_id};
00341     @$qfeatures = sort sort_by_score_evalue_and_pid @$qfeatures;
00342     my $rank=1;
00343     my $prevPaf = undef;
00344     foreach my $paf (@$qfeatures) {
00345         $rank++ if($prevPaf and !pafs_equal($prevPaf, $paf));
00346         $paf->{hit_rank} = $rank;
00347         $prevPaf = $paf;
00348     }
00349     }
00350     return $features;
00351 }
00352 
00353 sub sort_by_score_evalue_and_pid {
00354   $b->{score} <=> $a->{score} ||
00355     $a->{evalue} <=> $b->{evalue} ||
00356       $b->{perc_ident} <=> $a->{perc_ident} ||
00357         $b->{perc_pos} <=> $a->{perc_pos};
00358 }
00359 
00360 sub pafs_equal {
00361   my ($paf1, $paf2) = @_;
00362   return 0 unless($paf1 and $paf2);
00363   return 1 if(($paf1->{score} == $paf2->{score}) and
00364               ($paf1->{evalue} == $paf2->{evalue}) and
00365               ($paf1->{perc_ident} == $paf2->{perc_ident}) and
00366               ($paf1->{perc_pos} == $paf2->{perc_pos}));
00367   return 0;
00368 }
00369 
00370 sub run {
00371     my $self = shift @_;
00372 
00373     my $fasta_list              = $self->param('fasta_list'); # set by fetch_input()
00374     my $debug                   = $self->debug() || $self->param('debug') || 0;
00375 
00376     unless(scalar(@$fasta_list)) { # if we have no more work to do just exit gracefully
00377         if($debug) {
00378             warn "No work to do, exiting\n";
00379         }
00380         return;
00381     }
00382     my $reuse_db          = $self->param('reuse_db');   # if this parameter is an empty string, there will be no reuse
00383 
00384     my $reuse_ss_hash     = $self->param('reuse_ss_hash');
00385     my $reuse_this_member = $reuse_ss_hash->{$self->param('genome_db_id')};
00386 
00387     my $blastdb_dir             = $self->param('fasta_dir');
00388     my $minibatch               = $self->param('minibatch')     || 1;
00389 
00390     my $blast_bin_dir           = $self->param('blast_bin_dir') || ( '/software/ensembl/compara/ncbi-blast-2.2.23+/bin' );
00391     my $blast_params            = $self->param('blast_params')  || '';  # no parameters to C++ binary means having composition stats on and -seg masking off
00392     my $evalue_limit            = $self->param('evalue_limit')  || 0.00001;
00393     my $tophits                 = $self->param('tophits')       || 20;
00394 
00395 
00396     my $blast_infile  = '/tmp/mercator_blast.in.'.$$;     # only for debugging
00397     my $blast_outfile = '/tmp/mercator_blast.out.'.$$;    # looks like inevitable evil (tried many hairy alternatives and failed)
00398 
00399     if($debug) {
00400         open(FASTA, ">$blast_infile") || die "Could not open '$blast_infile' for writing";
00401         print FASTA @$fasta_list;
00402         close FASTA;
00403     }
00404 
00405     $self->compara_dba->dbc->disconnect_when_inactive(1); 
00406 
00407     my $cross_pafs;
00408 
00409     foreach my $genome_db (@{$self->param('genome_db_list')}) {
00410     my $fastafile = $genome_db->name() . '_' . $genome_db->assembly() . '.fasta';
00411     $fastafile =~ s/\s+/_/g;    # replace whitespace with '_' characters
00412     $fastafile =~ s/\/\//\//g;  # converts any // in path to /
00413     my $cross_genome_dbfile = $blastdb_dir . '/' . $fastafile;   # we are always interested in the 'foreign' genome's fasta file, not the member's
00414 
00415     #Don't blast against self
00416     unless ($genome_db->dbID == $self->param('genome_db_id')) {
00417     
00418         #Hard code for now
00419         #Moved to analysis table
00420         #$blast_params = "-num_alignments 20 -seg 'no'";
00421         #$blast_params = "-num_alignments 20 -seg 'yes'";
00422         #$blast_params = "-num_alignments 20 -seg 'yes' -best_hit_overhang 0.2 -best_hit_score_edge 0.1 -use_sw_tback";
00423 
00424         #Run blastp
00425         my $cmd = "${blast_bin_dir}/blastp -db $cross_genome_dbfile $blast_params -evalue $evalue_limit -num_descriptions $tophits -out $blast_outfile -outfmt '7 qacc sacc evalue score nident pident qstart qend sstart send length positive ppos'";
00426         if($debug) {
00427         warn "CMD:\t$cmd\n";
00428         }
00429 
00430         my $start_time = time();
00431         open( BLAST, "| $cmd") || die qq{could not execute "${cmd}", returned error code: $!};
00432         print BLAST @$fasta_list;
00433         close BLAST;
00434         
00435         print "Time for blast " . (time() - $start_time) . "\n";
00436 
00437         my $features = $self->parse_blast_table_into_paf($blast_outfile, -log($evalue_limit)/log(10), $self->param('genome_db_id'), $genome_db->dbID);
00438         if (defined $features) {
00439         foreach my $qmember_id (keys %$features) {
00440             my $qfeatures = $features->{$qmember_id};
00441             push @$cross_pafs, @$qfeatures;
00442         }
00443         }
00444         unless($debug) {
00445         unlink $blast_outfile;
00446         }
00447     }
00448     }
00449      $self->compara_dba->dbc->disconnect_when_inactive(0); 
00450 
00451     unless($debug) {
00452     $self->param('cross_pafs', $cross_pafs);
00453     }
00454 
00455 }
00456 
00457 sub write_output {
00458     my ($self) = @_;
00459     
00460     if ($self->param('do_transactions')) {
00461     my $compara_conn = $self->compara_dba->dbc;
00462 
00463     my $compara_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $compara_conn);
00464     $compara_helper->transaction(-CALLBACK => sub {
00465                      $self->_write_output;
00466                      });
00467     } else {
00468     $self->_write_output;
00469     }
00470 
00471 }
00472 
00473 
00474 sub _write_output {
00475     my $self = shift @_;
00476 
00477     my $cross_pafs = $self->param('cross_pafs');
00478     #foreach my $genome_db_id (keys %$cross_pafs) {
00479     #    $self->compara_dba->get_PeptideAlignFeatureAdaptor->store(@{$cross_pafs->{$genome_db_id}});
00480     #}
00481     print "numbers pafs " . scalar(@$cross_pafs) . "\n";
00482     foreach my $feature (@$cross_pafs) {
00483     my $peptide_table = $self->get_table_name_from_dbID($feature->{qgenome_db_id});
00484 
00485     #AWFUL HACK to insert into the peptide_align_feature table but without going through the API. Only fill in
00486     #some the of fields
00487     my $sql = "INSERT INTO $peptide_table (qmember_id, hmember_id, qgenome_db_id, hgenome_db_id, analysis_id, qstart, qend, hstart, hend, score, evalue, hit_rank,identical_matches, perc_ident,align_length,positive_matches, perc_pos) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?, ?,?)";
00488     my $sth = $self->compara_dba->dbc->prepare( $sql );
00489 
00490     #print "INSERT INTO $peptide_table (qmember_id, hmember_id, qgenome_db_id, hgenome_db_id, analysis_id, qstart, qend, hstart, hend, score, evalue, hit_rank,identical_matches, perc_ident,align_length,positive_matches, perc_pos) VALUES ('" . $feature->{qmember_id} , "','" . $feature->{hmember_id} . "'," . $feature->{qgenome_db_id} . "," . $feature->{hgenome_db_id} . "," . $feature->{analysis_id} . "," . $feature->{qstart} . "," . $feature->{qend} . "," . $feature->{hstart} . "," . $feature->{hend} . "," . $feature->{score} . "," . $feature->{evalue} . "," . $feature->{hit_rank} . "," . $feature->{identical_matches} . "," . $feature->{perc_ident} . "," . $feature->{length} . "," . $feature->{positive} . "," . $feature->{perc_pos} . "\n";
00491 
00492     $sth->execute($feature->{qmember_id},
00493               $feature->{hmember_id},
00494               $feature->{qgenome_db_id},
00495               $feature->{hgenome_db_id},
00496               $feature->{analysis_id},
00497               $feature->{qstart},
00498               $feature->{qend},
00499               $feature->{hstart},
00500               $feature->{hend},
00501               $feature->{score},
00502               $feature->{evalue},
00503               $feature->{hit_rank},
00504               $feature->{identical_matches},
00505               $feature->{perc_ident},
00506               $feature->{length},
00507               $feature->{positive},
00508               $feature->{perc_pos});
00509     }
00510 }
00511 
00512 sub get_table_name_from_dbID {
00513   my ($self, $gdb_id) = @_;
00514   my $table_name = "peptide_align_feature";
00515 
00516   my $gdba = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
00517   my $gdb = $gdba->fetch_by_dbID($gdb_id);
00518   return $table_name if (!$gdb);
00519 
00520   $table_name .= "_" . lc($gdb->name) . "_" . $gdb_id;
00521   $table_name =~ s/ /_/g;
00522 
00523   return $table_name;
00524 }
00525 
00526 
00527 1;
00528