Archive Ensembl Home
DumpMultiAlign_conf.pm
Go to the documentation of this file.
00001 ## Configuration file for DumpMultiAlign pipeline
00002 #Release 65
00003 #
00004 #epo 6 way
00005 #init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::DumpMultiAlign_conf --password **** --mlss_id 548 --output_dir /lustre/scratch101/ensembl/kb3/scratch/hive/release_65/emf_dumps/epo_6_primate --species human -dbname dumpMultiAlign_6way_primate_65 -pipeline_name DUMP_6_65
00006 #3.4 hours
00007 #
00008 #epo 12 way
00009 #init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::DumpMultiAlign_conf --password **** --mlss_id 547 --output_dir /lustre/scratch101/ensembl/kb3/scratch/hive/release_65/emf_dumps/epo_12_eutherian --species human -dbname dumpMultiAlign_12way_eutherian_65 -pipeline_name DUMP_12_65
00010 #2.7 hours
00011 #
00012 #mercator/pecan 19 way
00013 #init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::DumpMultiAlign_conf --password **** --mlss_id 50035 --output_dir /lustre/scratch101/ensembl/kb3/scratch/hive/release_65/emf_dumps/pecan_19_amniota --species human -dbname dumpMultiAlign_19way_amniota_65 -pipeline_name DUMP_19_65
00014 #5.5 hours
00015 #
00016 #low coverage epo 35 way
00017 #init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::DumpMultiAlign_conf --password **** --mlss_id 50036 --output_dir /lustre/scratch101/ensembl/kb3/scratch/hive/release_65/emf_dumps/epo_35_eutherian --species human --high_coverage_mlss_id 547 -dbname dumpMultiAlign_35way_eutherian_65 -pipeline_name DUMP_35_65
00018 #43 hours (1.8 days)
00019 #
00020 
00021 package Bio::EnsEMBL::Compara::PipeConfig::DumpMultiAlign_conf;
00022 
00023 use strict;
00024 use warnings;
00025 
00026 use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
00027 
00028 
# Returns a hashref holding the default value of every option defined by this
# pipeline configuration.
#
# Some defaults are derived from other options through $self->o() (for example
# 'pipeline_name' and 'dbname' are built from 'release').  The options
# 'password', 'mlss_id' and 'output_dir' are read elsewhere in this file but
# are not given a default in this sub.
sub default_options {
    my ($self) = @_;

    my $release      = $self->o('release');
    my $scripts_root = $self->o('ensembl_cvs_root_dir') . "/ensembl-compara/scripts";

    # The two staging servers differ only by host name, so build both
    # connection hashes from one template.
    my $staging_server = sub {
        my ($host) = @_;
        return {
            -host   => $host,
            -port   => 3306,
            -user   => 'ensro',
            -pass   => '',
            -driver => 'mysql',
            -dbname => $release,
        };
    };

    my %options = (
        'ensembl_cvs_root_dir' => $ENV{'HOME'} . '/src/ensembl_main/',
        'release'              => 65,

        # name used by the beekeeper to prefix job names on the farm
        'pipeline_name' => 'DUMP_' . $release,

        # database suffix (without user name prepended)
        'dbname' => 'dumpMultiAlign' . $release,

        # connection parameters of the pipeline (hive) database
        'pipeline_db' => {
            -host   => 'compara4',
            -port   => 3306,
            -user   => 'ensadmin',
            -pass   => $self->o('password'),
            -dbname => $ENV{USER} . '_' . $self->o('dbname'),
        },

        # general location of half of the current release core databases
        'staging_loc1' => $staging_server->('ens-staging1'),

        # general location of the other half of the current release core databases
        'staging_loc2' => $staging_server->('ens-staging2'),

        # general location of the previous release core databases
        # (for checking their reusability)
        'livemirror_loc' => {
            -host => 'ens-livemirror',
            -port => 3306,
            -user => 'ensro',
            -pass => '',
        },

        # Location of core and, optionally, compara db
        'db_urls' => [
            $self->dbconn_2_url('staging_loc1'),
            $self->dbconn_2_url('staging_loc2'),
        ],

        # Alternative method of defining location of dbs
        'reg_conf' => '',

        # Default compara. Can be the database name (if loading via db_urls) or the url
        'compara_db' => 'Multi',

        # What to dump and how
        'species'            => "human",
        'coord_system_name1' => "chromosome",
        'coord_system_name2' => "supercontig",
        'split_size'         => 200,
        'masked_seq'         => 1,
        'format'             => 'emf',

        # Scripts and data files taken from the ensembl-compara checkout
        'dump_program'      => $scripts_root . "/dumps/DumpMultiAlign.pl",
        'emf2maf_program'   => $scripts_root . "/dumps/emf2maf.pl",
        'species_tree_file' => $scripts_root . "/pipeline/species_tree_blength.nh",

        # Empty by default
        'maf_output_dir'        => "",
        'species_tree_data_id'  => "",
        'high_coverage_mlss_id' => "",
    );

    return \%options;
}
00094 
# Returns an arrayref of shell commands to run when the pipeline is created:
# the commands inherited from the parent class, two extra table creations in
# the pipeline database, and creation of the output directory.
sub pipeline_create_commands {
    my ($self) = @_;

    # inheriting database and hive tables' creation
    my @commands = @{ $self->SUPER::pipeline_create_commands };

    my @create_table_sql = (
        # Store DumpMultiAlign other_gab genomic_align_block_ids
        'CREATE TABLE other_gab (genomic_align_block_id bigint NOT NULL)',

        # Store DumpMultiAlign healthcheck results
        'CREATE TABLE healthcheck (filename VARCHAR(400) NOT NULL, expected INT NOT NULL, dumped INT NOT NULL)',
    );

    foreach my $sql (@create_table_sql) {
        push @commands,
            'mysql ' . $self->dbconn_2_mysql('pipeline_db', 1) . " -e '" . $sql . "'";
    }

    # Make dump_dir directory
    push @commands, 'mkdir -p ' . $self->o('output_dir');

    return \@commands;
}
00109 
# Returns a hashref of parameters that are visible to all analyses; they can
# be overridden by an analysis' parameters{} and by a job's input_id{}.
sub pipeline_wide_parameters {
    my ($self) = @_;

    my %shared_params = (
        # This must be defined for the beekeeper to work properly
        'pipeline_name' => $self->o('pipeline_name'),
    );

    return \%shared_params;
}
00117 
00118 
# Returns a hashref mapping a numeric resource-class id to its description
# (-desc) and the LSF submission options to use for that class ('LSF').
# Analyses pick a class by id through their -rc_id setting.
sub resource_classes {
    my ($self) = @_;

    my %resource_class_for;

    $resource_class_for{0} = { -desc => 'default, 8h', 'LSF' => '' };
    $resource_class_for{1} = { -desc => 'urgent',      'LSF' => '-q yesterday' };
    $resource_class_for{2} = {
        -desc => 'compara1',
        'LSF' => '-R"select[mycompara1<800] rusage[mycompara1=10:duration=3]"',
    };

    return \%resource_class_for;
}
00127 
# Returns an arrayref describing the analyses of the pipeline, in this order:
#
#   initJobs        - seeded with one empty input_id; flows into the three
#                     job creators (branches 2, 3, 4) and into md5sum
#                     (branch 1, and branch 5 if maf_output_dir is defined)
#   createChrJobs   \
#   createSuperJobs  } each flows into dumpMultiAlign on branch 2
#   createOtherJobs /
#   dumpMultiAlign  - flows into emf2maf (branch 2) and compress (branch 1)
#   emf2maf         - flows into compress (branch 2)
#   compress
#   md5sum          - waits for dumpMultiAlign and compress
#   readme          - seeded with one empty input_id
sub pipeline_analyses {
    my ($self) = @_;

    my $runnable_ns = 'Bio::EnsEMBL::Compara::RunnableDB::DumpMultiAlign';

    # Option values used by more than one analysis.
    my $species        = $self->o('species');
    my $mlss_id        = $self->o('mlss_id');
    my $format         = $self->o('format');
    my $split_size     = $self->o('split_size');
    my $output_dir     = $self->o('output_dir');
    my $maf_output_dir = $self->o('maf_output_dir');
    my $compara_db     = $self->o('compara_db');
    my $db_urls        = $self->o('db_urls');
    my $reg_conf       = $self->o('reg_conf');

    # Database location parameters shared by the job creators and the readme.
    # Note the key is 'db_url' here, whereas dumpMultiAlign uses 'db_urls'.
    my %db_location = (
        'compara_db' => $compara_db,
        'db_url'     => $db_urls,
        'reg_conf'   => $reg_conf,
    );

    # Command line handed to dumpMultiAlign.  #coord_system# and #extra_args#
    # are left as literal placeholders.  The last element starts with a space
    # so that, once joined, two spaces precede #extra_args#.
    my $dump_cmd = join(' ',
        'perl',            $self->o('dump_program'),
        '--species',       $species,
        '--mlss_id',       $mlss_id,
        '--coord_system',  '#coord_system#',
        '--masked_seq',    $self->o('masked_seq'),
        '--split_size',    $split_size,
        '--output_format', $format,
        ' #extra_args#',
    );

    my %init_jobs = (
        -logic_name => 'initJobs',
        -module     => $runnable_ns . '::InitJobs',
        -parameters => {
            'species'        => $species,
            'dump_mlss_id'   => $mlss_id,
            'output_dir'     => $output_dir,
            %db_location,
            'maf_output_dir' => $maf_output_dir,    # define if want to run emf2maf
        },
        -input_ids => [ {} ],
        -flow_into => {
            1 => [ 'md5sum' ],
            2 => [ 'createChrJobs' ],
            3 => [ 'createSuperJobs' ],
            4 => [ 'createOtherJobs' ],
            5 => [ 'md5sum' ],                      # if defined maf_output_dir
        },
    );

    my %create_chr_jobs = (
        -logic_name => 'createChrJobs',
        -module     => $runnable_ns . '::CreateChrJobs',
        -parameters => {
            'coord_system_name' => $self->o('coord_system_name1'),
            'format'            => $format,
            %db_location,
            'split_size'        => $split_size,
        },
        -input_ids => [],
        -flow_into => {
            2 => [ 'dumpMultiAlign' ],    # must be on branch2 incase there are no results
        },
    );

    my %create_super_jobs = (
        -logic_name => 'createSuperJobs',
        -module     => $runnable_ns . '::CreateSuperJobs',
        -parameters => {
            'coord_system_name' => $self->o('coord_system_name2'),
            'format'            => $format,
            'output_dir'        => $output_dir,
            %db_location,
        },
        -input_ids => [],
        -flow_into => {
            2 => [ 'dumpMultiAlign' ],
        },
    );

    my %create_other_jobs = (
        -logic_name => 'createOtherJobs',
        -module     => $runnable_ns . '::CreateOtherJobs',
        -parameters => {
            'species'    => $species,
            'format'     => $format,
            %db_location,
            'split_size' => $split_size,
        },
        -input_ids     => [],
        -hive_capacity => 10,    # make this large to allow any dumpMultiAlign jobs to start
        -flow_into     => {
            2 => [ 'dumpMultiAlign' ],
        },
    );

    my %dump_multi_align = (
        -logic_name => 'dumpMultiAlign',
        -module     => $runnable_ns . '::DumpMultiAlign',
        -parameters => {
            "cmd"                => $dump_cmd,
            "reg_conf"           => $reg_conf,
            "db_urls"            => $db_urls,
            "compara_db"         => $compara_db,
            "num_blocks"         => "#num_blocks#",
            "output_dir"         => $output_dir,
            "output_file"        => "#output_file#",
            "dumped_output_file" => "#dumped_output_file#",
            "format"             => $format,
            "maf_output_dir"     => $maf_output_dir,
        },
        -input_ids     => [],
        -hive_capacity => 15,
        -rc_id         => 2,
        -flow_into     => {
            1 => [ 'compress' ],
            2 => [ 'emf2maf' ],
        },
    );

    my %emf2maf = (
        -logic_name => 'emf2maf',
        -module     => $runnable_ns . '::Emf2Maf',
        -parameters => {
            "output_dir"      => $output_dir,
            "emf2maf_program" => $self->o('emf2maf_program'),
            "maf_output_dir"  => $maf_output_dir,
        },
        -input_ids     => [],
        -can_be_empty  => 1,
        -hive_capacity => 200,
        -flow_into     => {
            2 => [ 'compress' ],
        },
    );

    my %compress = (
        -logic_name    => 'compress',
        -module        => $runnable_ns . '::Compress',
        -parameters    => { "output_dir" => $output_dir },
        -input_ids     => [],
        -hive_capacity => 200,
    );

    my %md5sum = (
        -logic_name => 'md5sum',
        -module     => $runnable_ns . '::MD5SUM',
        -parameters => { 'output_dir' => $output_dir },
        -input_ids  => [],
        -wait_for   => [ 'dumpMultiAlign', 'compress' ],
    );

    my %readme = (
        -logic_name => 'readme',
        -module     => $runnable_ns . '::Readme',
        -parameters => {
            'format'                => $format,
            %db_location,
            'mlss_id'               => $mlss_id,
            'output_dir'            => $output_dir,
            'split_size'            => $split_size,
            'species_tree_file'     => $self->o('species_tree_file'),
            'species_tree_data_id'  => $self->o('species_tree_data_id'),
            'high_coverage_mlss_id' => $self->o('high_coverage_mlss_id'),
        },
        -input_ids => [ {} ],
    );

    return [
        \%init_jobs,
        \%create_chr_jobs,
        \%create_super_jobs,
        \%create_other_jobs,
        \%dump_multi_align,
        \%emf2maf,
        \%compress,
        \%md5sum,
        \%readme,
    ];
}
00267 
00268 1;