Archive Ensembl HomeArchive Ensembl Home
BlastFactory.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::MercatorPecan::BlastFactory 
00022 
00023 =head1 SYNOPSIS
00024 
00025 
00026 =head1 DESCRIPTION
00027 
00028 Fetch sorted list of member_ids and create jobs for BlastAndParsePAF. 
00029 Supported keys:
00030    'genome_db_id' => <number>
00031        Genome_db id. Obligatory
00032 
00033    'subset_id' => <number>
00034        Subset id. If this is not defined, will retrieve from database
00035 
00036    'step' => <number>
00037        How many sequences to write into the blast query file. Default 1000
00038 
00039 
00040 
00041 =cut
00042 
00043 package Bio::EnsEMBL::Compara::RunnableDB::MercatorPecan::BlastFactory;
00044 
00045 use strict;
00046 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00047 
# fetch_input
#
# Resolves the Subset whose members will be split into blast jobs and stores
# the values that write_output() needs as job parameters.
#
# Parameters read:
#   'subset_id' (or legacy 'ss')     - dbID of the Subset to use.
#   'genome_db_id' (or legacy 'gdb') - used to look the Subset up by its
#                                      description when no subset id is given.
#   'step'                           - batch size; set to 1000 when undefined.
#
# Parameters written: 'genome_db_id' (when only 'gdb' was supplied),
# 'subset_id', 'subset' and 'step'.
#
# Dies when the Subset or GenomeDB cannot be fetched, or when neither a subset
# id nor a genome_db id is available.
sub fetch_input {
    my $self = shift @_;

    # Normalise the legacy 'gdb' key up front so that both branches below, and
    # write_output(), only ever have to look at 'genome_db_id'.
    my $genome_db_id = $self->param('genome_db_id');
    if (!$genome_db_id and defined $self->param('gdb')) {
        $genome_db_id = $self->param('gdb');
        $self->param('genome_db_id', $genome_db_id);
    }

    my $subset_id = $self->param('subset_id') || $self->param('ss');
    my $subset;

    if (defined $subset_id) {
        $subset = $self->compara_dba->get_SubsetAdaptor()->fetch_by_dbID($subset_id)
            or die "cannot fetch Subset with id '$subset_id'";
    }
    else {
        die "'genome_db_id' is an obligatory parameter" unless $genome_db_id;

        my $genome_db = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_dbID($genome_db_id)
            or die "cannot fetch GenomeDB with id '$genome_db_id'";

        my $species         = $genome_db->name;
        my $set_description = "gdb:" . $genome_db_id . " $species coding exons";

        $subset = $self->compara_dba->get_SubsetAdaptor->fetch_by_set_description($set_description);

        # Check before touching the object: calling ->dbID on an undefined
        # subset would otherwise hide this message behind a generic
        # "Can't call method" error.
        die ("Unable to find subset for " . $genome_db_id) if (!defined $subset);
    }

    # Always record the resolved id. When it arrived through the legacy 'ss'
    # key, 'subset_id' would otherwise stay unset for write_output().
    $self->param('subset_id', $subset->dbID);
    $self->param('subset', $subset);

    if (!defined $self->param('step')) {
        $self->param('step', 1000);
    }
}
00075 
00076 
# write_output
#
# Reads the member_ids belonging to the 'subset_id' parameter, sorts them
# numerically and flows one job per batch of 'step' consecutive ids on
# branch 2. Each output_id carries genome_db_id, the first member_id of the
# batch (start_member_id), the position of that id in the sorted list
# (offset), the number of ids in the batch (batch_size) and subset_id.
#
# Dies when 'genome_db_id' is undefined or when the subset has no members,
# because either case would otherwise produce a malformed output_id.
sub write_output {
    my $self = shift @_;

    my $genome_db_id = $self->param('genome_db_id');
    my $subset_id    = $self->param('subset_id');
    my $step         = $self->param('step');

    # The id is interpolated into the output_id string below; an undefined
    # value would leave a hole in the generated hash literal.
    die "'genome_db_id' is an obligatory parameter" unless defined $genome_db_id;

    #Fetch members for subset_id
    my $sql = "SELECT member_id FROM member JOIN subset_member USING (member_id) WHERE  subset_id=?";
    my $sth = $self->compara_dba->dbc->prepare( $sql );
    $sth->execute($subset_id);

    # A real array (not an undefined reference) so that an empty result set
    # can be detected instead of failing on an undefined dereference.
    my @member_ids;
    while( my ($member_id) = $sth->fetchrow_array() ) {
        push @member_ids, $member_id;
    }

    die "No members found for subset_id '" . (defined $subset_id ? $subset_id : '') . "'"
        unless @member_ids;

    #Sort on member_id
    my @sorted_ids = sort { $a <=> $b } @member_ids;

    # Single place where the output_id string is assembled; the format is the
    # stringified hash that is handed to dataflow_output_id().
    my $build_output_id = sub {
        my ($start_member_id, $offset, $batch_size) = @_;
        return "{genome_db_id => " . $genome_db_id . ", start_member_id => " . $start_member_id . ", offset=>". $offset . ", batch_size=>" . $batch_size . ", subset_id=>" . $subset_id .  "}";
    };

    my $start_member_id = $sorted_ids[0];
    my $offset          = 0;
    my $batch_size      = 0;

    #Create jobs for BlastAndParsePAF
    for my $member_id (@sorted_ids) {

        # The current batch is full: flow it and start a new one at this id.
        if ($batch_size == $step) {
            $self->dataflow_output_id($build_output_id->($start_member_id, $offset, $batch_size), 2);

            $offset          += $batch_size;
            $batch_size       = 0;
            $start_member_id  = $member_id;
        }
        $batch_size++;
    }

    # Flow the last batch, which the loop above never emits.
    $self->dataflow_output_id($build_output_id->($start_member_id, $offset, $batch_size), 2);
}
00124 return 1;