Archive Ensembl Home
CreateOtherJobs.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign::CreateOtherJobs
00022 
00023 =head1 SYNOPSIS
00024 
00025 This RunnableDB module is part of the DumpMultiAlign pipeline.
00026 
00027 =head1 DESCRIPTION
00028 
00029 This RunnableDB module generates DumpMultiAlign jobs for the genomic_align_blocks
00030 which do not contain the given species. One job is created for every
00031 chunk of $split_size blocks.
00032 
00033 =cut
00034 
00035 
00036 package Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign::CreateOtherJobs;
00037 
00038 use strict;
00039 use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor;
00040 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00041 
00042 use POSIX qw(ceil);
00043 
00044 =head2 strict_hash_format
00045 
00046     Description : Implements strict_hash_format() interface method of Bio::EnsEMBL::Hive::Process that is used to set the strictness level of the parameters' parser.
00047                   Here we return 0 in order to indicate that neither input_id() nor parameters() is required to contain a hash.
00048 
00049 =cut
00050 
00051 #sub strict_hash_format {
00052 #    return 0;
00053 #}
00054 
00055 
# fetch_input
#
# Intentionally a no-op: nothing is fetched here, this module does all of
# its work in write_output().
#
# Arg [1]  : the invocant
# Returns  : the invocant
sub fetch_input {
    my ($self) = @_;

    return $self;
}
00059 
00060 
# run
#
# Intentionally a no-op: no computation happens in this step, the jobs are
# created in write_output().
#
# Arg [1]  : the invocant
# Returns  : the invocant
sub run {
    my ($self) = @_;

    return $self;
}
00066 
# write_output
#
# Creates the DumpMultiAlign jobs for the genomic_align_blocks of the
# method_link_species_set 'mlss_id' that contain neither the 'species'
# genome nor "ancestral_sequences".
#
# For every such block its dbID is inserted into the other_gab table, and
# for every run of 'split_size' consecutive blocks (plus a final, possibly
# shorter, run) one output id is flowed on branch 2.
#
# Parameters read through $self->param():
#   reg_conf    - registry configuration file (optional)
#   db_url      - array ref of registry URLs, used when reg_conf is not set
#   compara_db  - passed to go_figure_compara_dba()
#   filename    - prefix of the output file name
#   format      - output format, also used as the file extension
#   species     - registry name of the species the blocks must NOT contain
#   mlss_id     - dbID of the method_link_species_set
#   split_size  - number of blocks per job (integer part is used, must be >= 1)
#
# Dies when the species or the mlss cannot be found, or when there are
# blocks to process and split_size is not at least 1.
sub write_output {
    my $self = shift @_;

    #
    #Load registry and get compara database adaptor
    #
    if ($self->param('reg_conf')) {
        Bio::EnsEMBL::Registry->load_all($self->param('reg_conf'), 1);
    } elsif ($self->param('db_url')) {
        foreach my $db_url (@{ $self->param('db_url') }) {
            Bio::EnsEMBL::Registry->load_registry_from_url($db_url);
        }
    } else {
        Bio::EnsEMBL::Registry->load_all();
    }

    my $compara_dba = $self->go_figure_compara_dba($self->param('compara_db'));

    my $format     = $self->param('format');
    my $species    = $self->param('species');
    my $split_size = $self->param('split_size');

    my $tag         = "other";
    my $output_file = $self->param('filename') . "." . $tag . "." . $format;

    my $mlss_adaptor      = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
    my $genome_db_adaptor = $compara_dba->get_GenomeDBAdaptor;
    my $gab_adaptor       = $compara_dba->get_GenomicAlignBlockAdaptor;

    my $genome_db = $genome_db_adaptor->fetch_by_registry_name($species);
    die "Could not find a genome_db for species '"
        . (defined $species ? $species : '') . "'\n"
        unless defined $genome_db;
    my $species_name = $genome_db->name;

    my $mlss = $mlss_adaptor->fetch_by_dbID($self->param('mlss_id'));
    die "Could not find a method_link_species_set with dbID '"
        . (defined $self->param('mlss_id') ? $self->param('mlss_id') : '') . "'\n"
        unless defined $mlss;

    #
    #Find genomic_align_blocks which do not contain $self->param('species')
    #
    my $all_blocks = $gab_adaptor->fetch_all_by_MethodLinkSpeciesSet($mlss);
    my @other_blocks;

    # Consume the fetched list with shift so that rejected blocks are
    # released as soon as they have been examined.
    BLOCK:
    while (@$all_blocks) {
        my $block = shift @$all_blocks;
        foreach my $genomic_align (@{ $block->get_all_GenomicAligns() }) {
            my $name = $genomic_align->genome_db->name;
            next BLOCK
                if ($name eq $species_name or $name eq "ancestral_sequences");
        }
        push @other_blocks, $block;
    }

    # Nothing to store and no job to create
    return unless @other_blocks;

    # Only the integer part of split_size was ever significant (it was used
    # as a modulus); anything below 1 cannot define a chunk.
    my $chunk_size = defined $split_size ? int($split_size) : 0;
    die "Parameter 'split_size' must be a positive integer\n"
        if $chunk_size < 1;

    #
    #Store the genomic_align_block_ids of those blocks which do not contain
    #$self->param('species') in the other_gab table. The statement is
    #prepared once and reused for every block.
    #
    my $insert_sth = $self->analysis->adaptor->dbc->prepare(
        "INSERT INTO other_gab (genomic_align_block_id) VALUES (?)");

    my $total_blocks    = scalar @other_blocks;
    my $gab_num         = 0;
    my $blocks_in_chunk = 0;
    my $chunk           = 1;
    my $start_gab_id;

    foreach my $gab (@other_blocks) {
        my $gab_id = $gab->dbID;
        $insert_sth->execute($gab_id);

        $gab_num++;
        $blocks_in_chunk++;
        $start_gab_id = $gab_id unless defined $start_gab_id;

        #Create a job after each $chunk_size gabs and for the last,
        #possibly shorter, chunk
        next unless ($blocks_in_chunk == $chunk_size
                     or $gab_num == $total_blocks);

        my $end_gab_id = $gab_id;

        # Counting the blocks actually seen gives the right answer for the
        # final chunk too, including when the total is an exact multiple of
        # the chunk size.
        my $this_num_blocks = $blocks_in_chunk;

        # Replace only the trailing ".<format>" extension; the format is
        # quoted so that it is matched literally.
        my $this_suffix = "_" . $chunk . "." . $format;
        (my $dump_output_file = $output_file) =~ s/\.\Q$format\E\z/$this_suffix/;

        #Write out cmd from DumpMultiAlign
        my $dump_cmd = qq{"output_file"=> "$output_file", "extra_args" => " --skip_species $species --chunk_num $chunk", "num_blocks"=>"$this_num_blocks", "dumped_output_file"=>"$dump_output_file"};

        #Used to create a file of genomic_align_block_ids to pass to
        #DumpMultiAlign
        my $output_ids = qq({"start"=>"$start_gab_id", "end"=>"$end_gab_id", $dump_cmd});

        $self->dataflow_output_id($output_ids, 2);

        undef $start_gab_id;
        $blocks_in_chunk = 0;
        $chunk++;
    }

    $insert_sth->finish();

    return;
}
00177 
00178 
00179 1;