Archive Ensembl Home
CreateChrJobs.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign::CreateChrJobs
00022 
00023 =head1 SYNOPSIS
00024 
00025 This RunnableDB module is part of the DumpMultiAlign pipeline.
00026 
00027 =head1 DESCRIPTION
00028 
00029 This RunnableDB module generates DumpMultiAlign jobs from genomic_align_blocks
00030 on the species chromosomes. Each chromosome is split into chunks of at most $split_size blocks, and one job is created per chunk.
00031 
00032 =cut
00033 
00034 
00035 package Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign::CreateChrJobs;
00036 
00037 use strict;
00038 use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor;
00039 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00040 
00041 use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
00042 use Bio::EnsEMBL::Registry;
00043 
00044 use POSIX qw(ceil);
00045 
=head2 strict_hash_format

    Description : Implementation of the strict_hash_format() interface method
                  of Bio::EnsEMBL::Hive::Process, which sets the strictness
                  level of the parameters' parser.
    Returntype  : 0, meaning that neither input_id() nor parameters() has to
                  contain a hash.

=cut

sub strict_hash_format {
    my $strictness_level = 0;    # non-strict: hashes are not mandatory
    return $strictness_level;
}
00056 
# fetch_input
#
# Intentionally a no-op: this module reads nothing at this stage, all of the
# work happens in write_output(). The invocant is handed back unchanged.
sub fetch_input {
    my ($self) = @_;
    return $self;
}
00060 
# run
#
# Intentionally a no-op: there is no computation to perform here, the jobs
# are created in write_output(). The invocant is handed back unchanged.
sub run {
    my ($self) = @_;
    return $self;
}
00066 
=head2 write_output

    Description : Loads the registry, counts the genomic_align rows per
                  dnafrag name for the requested genome_db / coordinate
                  system / method_link_species_set, and flows one
                  DumpMultiAlign job per chunk of at most 'split_size'
                  blocks down branch 2.
    Parameters  : read through $self->param():
                    reg_conf, db_url (a single URL or an array ref of URLs),
                    compara_db, genome_db_id, coord_system_name, mlss_id,
                    filename, format, split_size
    Returntype  : none
    Exceptions  : dies if 'split_size' is not a positive number

=cut

sub write_output {
    my $self = shift @_;

    my $coord_system_name = $self->param('coord_system_name');
    my $split_size        = $self->param('split_size');
    my $format            = $self->param('format');
    my $filename          = $self->param('filename');

    # Fail early with a clear message instead of an "Illegal division by
    # zero" (split_size == 0) or silently creating no jobs (split_size < 0).
    unless ($split_size && $split_size > 0) {
        die "CreateChrJobs: 'split_size' must be a positive number, got '"
            . (defined $split_size ? $split_size : 'undef') . "'\n";
    }

    #
    #Load registry and get compara database adaptor
    #
    if ($self->param('reg_conf')) {
        Bio::EnsEMBL::Registry->load_all($self->param('reg_conf'), 1);
    } elsif ($self->param('db_url')) {
        my $db_urls = $self->param('db_url');

        # Accept a single URL string as well as an array ref of URLs.
        my @urls = (ref($db_urls) eq 'ARRAY') ? @$db_urls : ($db_urls);
        foreach my $db_url (@urls) {
            Bio::EnsEMBL::Registry->load_registry_from_url($db_url);
        }
    } else {
        Bio::EnsEMBL::Registry->load_all();
    }

    my $compara_dba = $self->go_figure_compara_dba($self->param('compara_db'));

    #
    #Find chromosome names and numbers of genomic_align_blocks
    #
    my $sql = qq {
    SELECT
       name,
       count(*)
    FROM
       dnafrag,
       genomic_align
    WHERE
       dnafrag.dnafrag_id = genomic_align.dnafrag_id
    AND
       genome_db_id = ?
    AND
       coord_system_name = ?
    AND
       method_link_species_set_id = ?
    GROUP BY name};

    my $sth = $compara_dba->dbc->prepare($sql);
    $sth->execute($self->param('genome_db_id'),
                  $coord_system_name,
                  $self->param('mlss_id'));

    # Each fetch fills these two bound variables with the current row.
    my ($name, $total_blocks);
    $sth->bind_columns(\$name, \$total_blocks);

    # File names get a "chr" prefix for chromosomes only; always defined so
    # that the concatenation below never sees undef.
    my $tag = (defined $coord_system_name && $coord_system_name eq "chromosome")
            ? "chr"
            : "";

    while ($sth->fetchrow_arrayref) {
        my $file_stem   = $filename . "." . $tag . $name;
        my $output_file = $file_stem . "." . $format;

        my $num_chunks = ceil($total_blocks / $split_size);

        foreach my $chunk (1 .. $num_chunks) {

            #Number of gabs in this chunk (used for healthcheck);
            #the last chunk holds whatever is left over
            my $this_num_blocks = ($chunk == $num_chunks)
                                ? $total_blocks - ($chunk - 1) * $split_size
                                : $split_size;

            # Built directly rather than by regex-substituting the format
            # suffix, so a format containing regex metacharacters cannot
            # leave the chunk file name unchanged.
            my $dump_output_file = $file_stem . "_" . $chunk . "." . $format;

            #Write out cmd for DumpMultiAlign and a few other parameters
            #used in downstream analyses
            # NOTE(review): values are embedded in a Perl-hash-like string;
            # presumably this is evaluated downstream, so names containing
            # quotes, '$' or '@' would need escaping - confirm with callers.
            my $output_id = sprintf(
                '{"coord_system"=> "%s", "output_file"=> "%s", '
                . '"extra_args"=> "--seq_region %s --chunk_num %s", '
                . '"num_blocks"=> %s, "dumped_output_file"=> "%s", "format"=> "%s"}',
                $coord_system_name, $output_file,
                $name, $chunk,
                $this_num_blocks, $dump_output_file, $format
            );

            $self->dataflow_output_id($output_id, 2);
        }
    }

    return;
}
00163 
00164 1;