Archive Ensembl Home
EpoLowCoverage_conf.pm
Go to the documentation of this file.
00001 ## Configuration file for the Epo Low Coverage pipeline
00002 
00003 package Bio::EnsEMBL::Compara::PipeConfig::EpoLowCoverage_conf;
00004 
00005 use strict;
00006 use warnings;
00007 use base ('Bio::EnsEMBL::Compara::PipeConfig::ComparaGeneric_conf');  # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
00008 
# Default option values for the EPO low-coverage pipeline.
#
# Starts from the options of the parent PipeConfig class and adds (or
# overrides) the release numbers, database locations, MLSS ids and GERP
# settings used by the analyses of this pipeline.
#
# Returns: hash reference mapping option name => default value.
sub default_options {
    my $self = shift;

    # Settings shared by every read-only ('ensro') connection below.
    my %ensro_conn = (
        -port => 3306,
        -user => 'ensro',
        -pass => '',
    );

    # Settings shared by the two 'ensadmin' connections on compara1.
    my %ensadmin_conn = (
        -host => 'compara1',
        -port => 3306,
        -user => 'ensadmin',
        -pass => $self->o('password'),
    );

    return {
        %{ $self->SUPER::default_options() },    # start from the generic options

        'ensembl_cvs_root_dir' => $ENV{'HOME'} . '/src/ensembl_main/',

        'release'        => 65,
        'prev_release'   => 64,
        'release_suffix' => '',    # leave empty for the actual release

        # Name used by the beekeeper to prefix job names on the farm.
        'pipeline_name' => 'LOW35_' . $self->o('release') . $self->o('release_suffix'),

        # Pairwise mlss ids (keys) that are NOT stored in
        # 'pairwise_default_location', with the url where each one lives.
        # Use an empty hash ({ }) when there are no exceptions.
        'pairwise_exception_location' => { 545 => 'mysql://ensro@compara1/kb3_hsap_ogar_lastz_65' },

        # The database this pipeline creates and works in.
        'pipeline_db' => {
            %ensadmin_conn,
            -dbname => $ENV{USER} . '_epo_35way_' . $self->o('release') . $self->o('release_suffix'),
        },

        # Compara database holding most of the pairwise mlss, i.e. the
        # previous release.
        'live_compara_db' => {
            -host => 'ens-livemirror',
            %ensro_conn,
            -dbname => 'ensembl_compara_' . $self->o('prev_release'),
            -driver => 'mysql',
        },

        # 'epo_db' (the compara database containing the high coverage
        # alignments) has no default here: provide it as a url on the
        # command line.

        'master_db' => {
            %ensadmin_conn,
            -dbname => 'sf5_ensembl_compara_master',
            -driver => 'mysql',
        },

        'populate_new_database_program' => $self->o('ensembl_cvs_root_dir') . "/ensembl-compara/scripts/pipeline/populate_new_database.pl",

        # Servers passed as 'registry_dbs' to the load_genomedb analysis.
        'staging_loc1' => {
            -host => 'ens-staging1',
            %ensro_conn,
            -db_version => $self->o('release'),
        },
        'staging_loc2' => {
            -host => 'ens-staging2',
            %ensro_conn,
            -db_version => $self->o('release'),
        },
        'livemirror_loc' => {
            -host => 'ens-livemirror',
            %ensro_conn,
            -db_version => $self->o('prev_release'),
        },

        # These options refer to themselves, so no real default is defined
        # here. NOTE(review): presumably they must be supplied externally
        # (e.g. on the command line) - confirm against the eHive version in use.
        'low_epo_mlss_id'  => $self->o('low_epo_mlss_id'),     # mlss_id for low coverage epo alignment
        'high_epo_mlss_id' => $self->o('high_epo_mlss_id'),    # mlss_id for high coverage epo alignment
        'ce_mlss_id'       => $self->o('ce_mlss_id'),          # mlss_id for low coverage constrained elements
        'cs_mlss_id'       => $self->o('cs_mlss_id'),          # mlss_id for low coverage conservation scores

        'ref_species'    => 'homo_sapiens',    # reference species for pairwise alignments
        'max_block_size' => 1000000,           # max size of alignment before splitting

        # Default location for pairwise alignments.
        'pairwise_default_location' => $self->dbconn_2_url('live_compara_db'),

        # GERP parameters
        'gerp_version'                => '2.1',               # gerp program version
        'gerp_window_sizes'           => '[1,10,100,500]',    # gerp window sizes
        'no_gerp_conservation_scores' => 0,                   # not used in production but is a valid argument

        # Location of the full species tree; it will be pruned.
        'species_tree_file' => $self->o('ensembl_cvs_root_dir') . '/ensembl-compara/scripts/pipeline/species_tree_blength.nh',
        'newick_format'     => 'simple',
        'work_dir'          => $self->o('work_dir'),    # where the pruned tree file is put

        # Location of executables (or paths to executables)
        'gerp_exe_dir' => '/software/ensembl/compara/gerp/GERPv2.1',    # gerp program
    };
}
00107 
# Shell commands run when the pipeline database is created.
#
# This pipeline adds nothing of its own: the list consists solely of the
# commands inherited from the parent class (database and hive tables'
# creation).
#
# Returns: a new array reference holding the inherited commands.
sub pipeline_create_commands {
    my $self = shift;

    my @inherited_commands = @{ $self->SUPER::pipeline_create_commands() };

    return \@inherited_commands;
}
00114 
# Parameters visible to all analyses; they can be overridden by an
# analysis' parameters{} and by a job's input_id{}.
#
# Returns: hash reference with a single entry, 'pipeline_name', which is
# essential for the beekeeper to work correctly.
sub pipeline_wide_parameters {
    my $self = shift;

    my %wide_params;
    $wide_params{'pipeline_name'} = $self->o('pipeline_name');

    return \%wide_params;
}
00122 
# Resource classes available to the analyses of this pipeline.
#
# Returns: hash reference keyed by resource class id:
#   0 - 'default': LSF resource requirement string (see below)
#   1 - 'urgent' : submits to the 'yesterday' LSF queue
sub resource_classes {
    my $self = shift;

    # Only select hosts where the mycompara1, myens_staging1, myens_staging2
    # and myens_livemirror resources are all <= 800, and reserve 10 units of
    # each for a duration of 3.
    # NOTE(review): these look like site-specific database-load counters -
    # confirm with the LSF configuration.
    my $default_lsf = '-R"select[mycompara1 <=800 && myens_staging1 <= 800 && myens_staging2 <=800 && myens_livemirror <=800] rusage[mycompara1=10:duration=3,myens_staging1=10:duration=3,myens_staging2=10:duration=3,myens_livemirror=10:duration=3]"';

    my %classes = (
        0 => { -desc => 'default', 'LSF' => $default_lsf },
        1 => { -desc => 'urgent',  'LSF' => '-q yesterday' },
    );

    return \%classes;
}
00131 
00132 
# Definition of every analysis of the EPO low-coverage pipeline and of the
# way jobs flow between them.
#
# The backbone is:
#   innodbise_table_factory -> populate_new_database -> set_internal_ids
#   -> load_genomedb_factory -> load_genomedb_funnel
#   -> create_default_pairwise_mlss -> import_alignment
#   -> create_low_coverage_genome_jobs -> delete_alignment
#   -> update_max_alignment_length -> create_neighbour_nodes_jobs_alignment
#   -> conservation_score_healthcheck
# with fans into innodbise_table, load_genomedb,
# low_coverage_genome_alignment (-> gerp) and set_neighbour_nodes.
#
# Reads the options 'pipeline_db', 'master_db', 'epo_db', the *_mlss_id
# options, 'ref_species', 'max_block_size', the pairwise locations, the
# gerp_* options, 'species_tree_file' and the registry locations.
#
# Returns: array reference of analysis definitions (hash references).
sub pipeline_analyses {
    my ($self) = @_;

    return [
# ------------------------------------------[Turn all MyISAM tables except 'meta' to InnoDB]---------------------------------------------
        {   -logic_name => 'innodbise_table_factory',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputquery'      => "SELECT table_name FROM information_schema.tables WHERE table_schema ='".$self->o('pipeline_db','-dbname')."' AND table_name!='meta' AND engine='MyISAM' ",
                'fan_branch_code' => 2,
            },
            -input_ids => [{}],
            -flow_into => {
                2 => [ 'innodbise_table'  ],
                1 => [ 'populate_new_database' ],
            },
        },

        {   -logic_name    => 'innodbise_table',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SqlCmd',
            -parameters    => {
                'sql'         => "ALTER TABLE #table_name# ENGINE='InnoDB'",
            },
            -hive_capacity => 10,
            -can_be_empty  => 1
        },

# ---------------------------------------------[Run populate_new_database.pl script ]----------------------------------------------------
        {   -logic_name => 'populate_new_database',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters => {
                'program'        => $self->o('populate_new_database_program'),
                #'master'         => $self->o('master_db_name'),
                'mlss_id'        => $self->o('low_epo_mlss_id'),
                'ce_mlss_id'     => $self->o('ce_mlss_id'),
                'cs_mlss_id'     => $self->o('cs_mlss_id'),
                'cmd'            => "#program# --master " . $self->dbconn_2_url('master_db') . " --new " . $self->dbconn_2_url('pipeline_db') . " --mlss #mlss_id# --mlss #ce_mlss_id# --mlss #cs_mlss_id# ",
            },
            -wait_for  => [ 'innodbise_table' ],
            -flow_into => {
                1 => [ 'set_internal_ids' ],
            },
        },

# ------------------------------------------------------[Set internal ids ]---------------------------------------------------------------
        # Internal ids of the alignment tables start at (low_epo_mlss_id * 10**10) + 1.
        {   -logic_name => 'set_internal_ids',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SqlCmd',
            -parameters => {
                'low_epo_mlss_id' => $self->o('low_epo_mlss_id'),
                'sql'   => [
                    'ALTER TABLE genomic_align_block AUTO_INCREMENT=#expr(($low_epo_mlss_id * 10**10) + 1)expr#',
                    'ALTER TABLE genomic_align AUTO_INCREMENT=#expr(($low_epo_mlss_id * 10**10) + 1)expr#',
                    'ALTER TABLE genomic_align_group AUTO_INCREMENT=#expr(($low_epo_mlss_id * 10**10) + 1)expr#',
                    'ALTER TABLE genomic_align_tree AUTO_INCREMENT=#expr(($low_epo_mlss_id * 10**10) + 1)expr#',
                ],
            },
            -flow_into => {
                1 => [ 'load_genomedb_factory' ],
            },
        },

# ---------------------------------------------[Load GenomeDB entries from master+cores]--------------------------------------------------
        {   -logic_name => 'load_genomedb_factory',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectFactory',
            -parameters => {
                'compara_db'    => $self->o('master_db'),   # that's where genome_db_ids come from
                'mlss_id'       => $self->o('low_epo_mlss_id'),

                'adaptor_name'          => 'MethodLinkSpeciesSetAdaptor',
                'adaptor_method'        => 'fetch_by_dbID',
                'method_param_list'     => [ '#mlss_id#' ],
                'object_method'         => 'species_set',

                'column_names2getters'  => { 'genome_db_id' => 'dbID', 'species_name' => 'name', 'assembly_name' => 'assembly', 'genebuild' => 'genebuild', 'locator' => 'locator' },

                'fan_branch_code'       => 2,
            },
            -flow_into => {
                2 => [ 'load_genomedb' ],
                1 => [ 'load_genomedb_funnel' ],    # backbone
            },
        },

        {   -logic_name => 'load_genomedb',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::LoadOneGenomeDB',
            -parameters => {
                'registry_dbs'  => [ $self->o('staging_loc1'), $self->o('staging_loc2'), $self->o('livemirror_loc')],
            },
            -hive_capacity => 1,    # they are all short jobs, no point doing them in parallel
        },

        # Does no work itself: only holds the backbone until every load_genomedb job is done.
        {   -logic_name => 'load_genomedb_funnel',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
            -wait_for   => [ 'load_genomedb' ],
            -flow_into  => {
                1 => [ 'create_default_pairwise_mlss'],
            },
        },

# -------------------------------------------------------------[Load species tree]--------------------------------------------------------
        {   -logic_name    => 'make_species_tree',
            -module        => 'Bio::EnsEMBL::Compara::RunnableDB::MakeSpeciesTree',
            -parameters    => { },
            -input_ids     => [
                {'blength_tree_file' => $self->o('species_tree_file'), 'newick_format' => 'simple' }, #species_tree
                {'newick_format'     => 'njtree' },                                                   #taxon_tree
            ],
            -hive_capacity => -1,   # to allow for parallelization
            -wait_for      => [ 'load_genomedb_funnel' ],
            -flow_into     => {
                3 => { 'mysql:////meta' => { 'meta_key' => 'taxon_tree', 'meta_value' => '#species_tree_string#' } },
                4 => { 'mysql:////meta' => { 'meta_key' => 'tree_string', 'meta_value' => '#species_tree_string#' } },
            },
        },

# -----------------------------------[Create a list of pairwise mlss found in the default compara database]-------------------------------
        {   -logic_name => 'create_default_pairwise_mlss',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::CreateDefaultPairwiseMlss',
            -parameters => {
                'new_method_link_species_set_id' => $self->o('low_epo_mlss_id'),
                'base_method_link_species_set_id' => $self->o('high_epo_mlss_id'),
                'pairwise_default_location' => $self->o('pairwise_default_location'),
                #'base_location' => $self->dbconn_2_url('epo_db'),
                'base_location' => $self->o('epo_db'),
                'reference_species' => $self->o('ref_species'),
                'fan_branch_code' => 3,
            },
            -flow_into => {
                1 => [ 'import_alignment' ],
                3 => [ 'mysql:////meta' ],
            }
        },

# ------------------------------------------------[Import the high coverage alignments]---------------------------------------------------
        {   -logic_name => 'import_alignment',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::ImportAlignment',
            -parameters => {
                'method_link_species_set_id'       => $self->o('high_epo_mlss_id'),
                #'from_db_url'                      => $self->dbconn_2_url('epo_db'),
                'from_db_url'                      => $self->o('epo_db'),
            },
            -wait_for  => [ 'create_default_pairwise_mlss', 'make_species_tree'],
            -flow_into => {
                1 => [ 'create_low_coverage_genome_jobs' ],
            },
        },

# ------------------------------------------------------[Low coverage alignment]----------------------------------------------------------
        # One job per high coverage genomic_align_block.
        # NOTE(review): genome_db_id 63 is hard-coded in the query below; it is
        # presumably the genome_db to skip (e.g. ancestral sequences) - confirm
        # it is still correct for the master database in use.
        {   -logic_name => 'create_low_coverage_genome_jobs',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputquery' => 'SELECT genomic_align_block_id FROM genomic_align ga LEFT JOIN dnafrag USING (dnafrag_id) WHERE method_link_species_set_id=' . $self->o('high_epo_mlss_id') . ' AND genome_db_id <> 63 GROUP BY genomic_align_block_id',
                'fan_branch_code' => 2,
            },
            -flow_into => {
                1 => [ 'delete_alignment' ],
                2 => [ 'low_coverage_genome_alignment' ],
            }
        },

        {   -logic_name => 'low_coverage_genome_alignment',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::LowCoverageGenomeAlignment',
            -parameters => {
                'max_block_size' => $self->o('max_block_size'),
                'method_link_species_set_id' => $self->o('low_epo_mlss_id'),
                'reference_species' => $self->o('ref_species'),
                'pairwise_exception_location' => $self->o('pairwise_exception_location'),
                'pairwise_default_location' => $self->o('pairwise_default_location'),
            },
            -batch_size      => 5,
            -hive_capacity   => 30,
            # Do not die immediately on a memory failure: memory leaks are the
            # likely cause and rerunning is the solution, so flow to the
            # '_again' analysis on branch -1.
            -flow_into => {
                2 => [ 'gerp' ],
                -1 => [ 'low_coverage_genome_alignment_again' ],
            },
        },

        # If a job fails due to MEMLIMIT, probably because of a memory leak,
        # rerunning it with the default memory should be fine.
        {   -logic_name => 'low_coverage_genome_alignment_again',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::LowCoverageGenomeAlignment',
            -parameters => {
                'max_block_size' => $self->o('max_block_size'),
                'method_link_species_set_id' => $self->o('low_epo_mlss_id'),
                'reference_species' => $self->o('ref_species'),
                'pairwise_exception_location' => $self->o('pairwise_exception_location'),
                'pairwise_default_location' => $self->o('pairwise_default_location'),
            },
            -batch_size      => 5,
            -hive_capacity   => 30,
            # Usually receives no jobs at all; flagged so that the analysis
            # waiting for it (delete_alignment) is not held up when it is empty.
            -can_be_empty    => 1,
            -flow_into => {
                2 => [ 'gerp' ],
            },
        },

# ---------------------------------------------------------------[Gerp]-------------------------------------------------------------------
        {   -logic_name => 'gerp',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::GenomicAlignBlock::Gerp',
            -parameters => {
                'program_version' => $self->o('gerp_version'),
                'window_sizes' => $self->o('gerp_window_sizes'),
                'gerp_exe_dir' => $self->o('gerp_exe_dir'),
            },
            -hive_capacity   => 600,
        },

# ---------------------------------------------------[Delete high coverage alignment]-----------------------------------------------------
        {   -logic_name => 'delete_alignment',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SqlCmd',
            -parameters => {
                'sql' => [
                    'DELETE gag, gat, ga FROM genomic_align_group gag JOIN genomic_align_tree gat USING (node_id) JOIN genomic_align ga USING (genomic_align_id) WHERE method_link_species_set_id=' . $self->o('high_epo_mlss_id'),
                    'DELETE FROM genomic_align_block WHERE method_link_species_set_id=' . $self->o('high_epo_mlss_id'),
                ],
            },
            #-input_ids => [{}],
            # The high coverage alignment must stay in place until every
            # alignment job has finished, including those re-routed to the
            # '_again' analysis after a memory failure.
            -wait_for  => [ 'low_coverage_genome_alignment', 'low_coverage_genome_alignment_again', 'gerp' ],
            -flow_into => {
                1 => [ 'update_max_alignment_length' ],
            },
        },

# ---------------------------------------------------[Update the max_align data in meta]--------------------------------------------------
        {   -logic_name => 'update_max_alignment_length',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::GenomicAlignBlock::UpdateMaxAlignmentLength',
            -flow_into  => {
                1 => [ 'create_neighbour_nodes_jobs_alignment' ],
            },
        },

# --------------------------------------[Populate the left and right node_id of the genomic_align_tree table]-----------------------------
        {   -logic_name => 'create_neighbour_nodes_jobs_alignment',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputquery' => 'SELECT root_id FROM genomic_align_tree WHERE parent_id = 0',
                'fan_branch_code' => 2,
            },
            -flow_into => {
                1 => [ 'conservation_score_healthcheck' ],
                2 => [ 'set_neighbour_nodes' ],
            }
        },

        {   -logic_name => 'set_neighbour_nodes',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::SetNeighbourNodes',
            -parameters => {
                'method_link_species_set_id' => $self->o('low_epo_mlss_id')
            },
            -batch_size    => 10,
            -hive_capacity => 15,
        },

# -----------------------------------------------------------[Run healthcheck]------------------------------------------------------------
        {   -logic_name => 'conservation_score_healthcheck',
            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::HealthCheck',
            -wait_for   => [ 'set_neighbour_nodes' ],
            -input_ids  => [
                {'test' => 'conservation_jobs', 'logic_name'=>'gerp','method_link_type'=>'EPO_LOW_COVERAGE'},
                {'test' => 'conservation_scores','method_link_species_set_id'=>$self->o('cs_mlss_id')},
            ],
        },

    ];
}
00395 1;