Archive Ensembl HomeArchive Ensembl Home
CreateSuperJobs.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign::CreateSuperJobs
00022 
00023 =head1 SYNOPSIS
00024 
00025 This RunnableDB module is part of the DumpMultiAlign pipeline.
00026 
00027 =head1 DESCRIPTION
00028 
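00029 This RunnableDB module counts the genomic_align rows located on dnafrags of the
00030 given coord_system_name (e.g. the species supercontigs) and flows a single
00030 DumpMultiAlign job covering all of them. The job is not split into $split_size chunks here.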
00031 
00032 =cut
00033 
00034 
00035 package Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign::CreateSuperJobs;
00036 
00037 use strict;
00038 use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor;
00039 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00040 
00041 use POSIX qw(ceil);
00042 
00043 =head2 strict_hash_format
00044 
00045     Description : Implements strict_hash_format() interface method of Bio::EnsEMBL::Hive::Process that is used to set the strictness level of the parameters' parser.
00046                   Here we return 0 in order to indicate that neither input_id() nor parameters() is required to contain a hash.
00047 
00048 =cut
00049 
00050 #sub strict_hash_format {
00051 #    return 0;
00052 #}
00053 
# fetch_input
#
# Lifecycle stub: this module does nothing in this step; everything happens
# in write_output(). Hands back the invocant it was called with, which is
# what the implicit return of the final assignment yields.
sub fetch_input {
    my ($self) = @_;
    return $self;
}
00057 
00058 
# run
#
# Lifecycle stub: no computation is done in this step; the job is created in
# write_output(). Hands back the invocant it was called with, which is what
# the implicit return of the final assignment yields.
sub run {
    my ($self) = @_;
    return $self;
}
00062 
# write_output
#
# Counts the genomic_align rows that sit on dnafrags of the requested
# coordinate system for one genome_db / method_link_species_set, then flows a
# single output id (as a stringified hash) on branch 2.
#
# Parameters read through $self->param():
#   reg_conf          - registry configuration file; takes precedence if set
#   db_url            - array ref of registry URLs (a single URL string is
#                       also accepted); used only when reg_conf is not set
#   compara_db        - handed to go_figure_compara_dba()
#   coord_system_name - bound to the SQL and used as the file-name tag
#   genome_db_id      - bound to the SQL
#   mlss_id           - bound to the SQL (method_link_species_set_id)
#   filename, format  - used to build the output file names
#
# Output id keys: coord_system, output_file, num_blocks, dumped_output_file,
# format.
sub write_output {
    my $self = shift @_;

    #
    #Load registry and get compara database adaptor
    #
    if ($self->param('reg_conf')) {
        Bio::EnsEMBL::Registry->load_all($self->param('reg_conf'), 1);
    } elsif ($self->param('db_url')) {
        my $db_urls = $self->param('db_url');
        # Accept a lone URL string as well as the usual array ref, so that
        # dereferencing a plain string does not die under "use strict".
        my @url_list = ref($db_urls) eq 'ARRAY' ? @$db_urls : ($db_urls);
        foreach my $db_url (@url_list) {
            Bio::EnsEMBL::Registry->load_registry_from_url($db_url);
        }
    } else {
        Bio::EnsEMBL::Registry->load_all();
    }

    my $compara_dba = $self->go_figure_compara_dba($self->param('compara_db'));

    my $coord_system_name = $self->param('coord_system_name');
    my $format            = $self->param('format');

    #
    #Find supercontigs and number of genomic_align_blocks
    #
    my $sql = "
    SELECT count(*) 
    FROM genomic_align 
    LEFT JOIN dnafrag 
    USING (dnafrag_id) 
    WHERE coord_system_name = ? 
    AND genome_db_id= ? 
    AND method_link_species_set_id=?";

    my $sth = $compara_dba->dbc->prepare($sql);
    $sth->execute($coord_system_name, $self->param('genome_db_id'), $self->param('mlss_id'));
    my ($total_blocks) = $sth->fetchrow_array;
    # Only one row is read, so release the statement handle explicitly.
    $sth->finish;
    # An undefined count would yield a malformed output id string below.
    $total_blocks = 0 unless defined $total_blocks;

    # <filename>.<coord_system_name>.<format>
    my $file_stem   = $self->param('filename') . "." . $coord_system_name;
    my $output_file = $file_stem . "." . $format;

    #DumpMultiAlignment adds _1 to the output file and can create more if
    #there are lots of supercontigs. Since only one job is created, the
    #compress will only start when all the chunks have been produced (if more
    #than one), so a "*" wildcard before the extension matches them all.
    #The name is built directly rather than by substituting ".<format>" inside
    #$output_file: an unanchored substitution would hit the first
    #".<format>" found anywhere in the path, not necessarily the extension.
    my $dump_output_file = $file_stem . "*" . "." . $format;

    #Write out cmd for DumpMultiAlign and a few other parameters 
    #used in downstream analyses 
    my $output_id = "{\"coord_system\"=> \"$coord_system_name\", \"output_file\"=> \"$output_file\", \"num_blocks\"=> $total_blocks, \"dumped_output_file\"=> \"$dump_output_file\", \"format\"=> \"$format\"}";

    $self->dataflow_output_id($output_id, 2);
}
00120 
00121 1;