Archive Ensembl HomeArchive Ensembl Home
ImportNCBItaxonomy_conf.pm
Go to the documentation of this file.
00001 
=pod

=head1 NAME

  Bio::EnsEMBL::Compara::PipeConfig::ImportNCBItaxonomy_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::ImportNCBItaxonomy_conf -password <your_password> -ensembl_cvs_root_dir <path_to_your_ensembl_cvs_root>
    init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::ImportNCBItaxonomy_conf -password <your_password>

=head1 DESCRIPTION

    A pipeline to import the NCBI taxonomy database into the ncbi_taxonomy@ens-livemirror database.

=head1 CONTACT

  Please contact the ehive-users@ebi.ac.uk mailing list with questions/suggestions.

=cut
00022 
00023 package Bio::EnsEMBL::Compara::PipeConfig::ImportNCBItaxonomy_conf;
00024 
00025 use strict;
00026 use warnings;
00027 
00028 use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');      # we want to treat it as a 'pure' Hive pipeline
00029 
00030 
=head2 default_options

    Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.

                  One option used below has no default (this makes it mandatory on the command line):
                    o('password')       your read-write password for creation and maintenance of the hive database

                  'ensembl_cvs_root_dir' defaults to $ENV{'ENSEMBL_CVS_ROOT_DIR'} and 'work_dir' is placed under $ENV{'HOME'}.

    Returntype  : hashref mapping option names to their default values

=cut

sub default_options {
    my ($self) = @_;

    return {
        'ensembl_cvs_root_dir' => $ENV{'ENSEMBL_CVS_ROOT_DIR'},     # it will make sense to set this variable if you are going to use ehive frequently

        'pipeline_name' => 'ncbi_taxonomy',            # name used by the beekeeper to prefix job names on the farm
        'name_suffix'   => '',                         # use a non-empty value if you want to test the pipeline

            # connection parameters of the database that is both the hive database and the import target:
        'pipeline_db' => {
            -host   => 'ens-livemirror',
            -port   => 3306,
            -user   => 'ensadmin',
            -pass   => $self->o('password'),
            -dbname => $self->o('pipeline_name').$self->o('name_suffix'),   # the suffix lets a test run use a separate database
        },

        'taxdump_loc'   => 'ftp://ftp.ncbi.nih.gov/pub/taxonomy',   # the original location of the dump
        'taxdump_file'  => 'taxdump.tar.gz',                        # the filename of the dump

        'work_dir'      => $ENV{'HOME'}.'/ncbi_taxonomy_loading',   # scratch directory for the downloaded and unpacked dump
    };
}
00062 
=head2 pipeline_create_commands

    Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database.
                  In addition to the standard creation of the database and populating it with Hive tables and procedures it also creates
                  the two taxonomy tables (ncbi_taxa_node, ncbi_taxa_name) and a working directory to store intermediate files.

    Returntype  : arrayref of shell command strings, in the order in which they are listed below

=cut

sub pipeline_create_commands {
    my ($self) = @_;

    return [
        @{$self->SUPER::pipeline_create_commands},  # inheriting database and hive tables' creation

            # additional tables that we use here (taken from ensembl-compara schema):

        $self->db_execute_command('pipeline_db', qq{
            CREATE TABLE ncbi_taxa_node (
              taxon_id                        INT(10) UNSIGNED NOT NULL,
              parent_id                       INT(10) UNSIGNED NOT NULL,

              rank                            CHAR(32) DEFAULT \"\" NOT NULL,
              genbank_hidden_flag             TINYINT(1) DEFAULT 0 NOT NULL,

              left_index                      INT(10) NOT NULL,
              right_index                     INT(10) NOT NULL,
              root_id                         INT(10) DEFAULT 1 NOT NULL,

              PRIMARY KEY (taxon_id),
              KEY (parent_id),
              KEY (rank),
              KEY (left_index),
              KEY (right_index)
            )
        }),

        $self->db_execute_command('pipeline_db', qq{
            CREATE TABLE ncbi_taxa_name (
              taxon_id                    INT(10) UNSIGNED NOT NULL,

              name                        VARCHAR(255),
              name_class                  VARCHAR(50),

              KEY (taxon_id),
              KEY (name),
              KEY (name_class)
            )
        }),

            # '-p' keeps this step from failing when the directory is already there (e.g. when the pipeline is initialised again):
        'mkdir -p '.$self->o('work_dir'),
    ];
}
00113 
=head2 resource_classes

    Description : Lists the resource classes that the analyses of this pipeline refer to by number via their -rc_id:
                    0   'default'   no extra LSF options
                    1   'urgent'    submitted to the 'yesterday' queue
                    2   'himem'     'yesterday' queue plus the memory selection/reservation options given below

    Returntype  : hashref { rc_id => { -desc => description, 'LSF' => LSF submission options } }

=cut

sub resource_classes {
    my ($self) = @_;

    return {
         0 => { -desc => 'default',          'LSF' => '' },
         1 => { -desc => 'urgent',           'LSF' => '-q yesterday' },
         2 => { -desc => 'himem',            'LSF' => '-q yesterday -R"select[mem>3000] rusage[mem=3000]" -M3000000' },
    };
}
00122 
00123 
=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.

                  The flow, as wired by the -flow_into rules below:

                    download_tarball -> untar -> load_nodes -> zero_parent_id -> build_left_right_indices
                                             \-> load_names -> load_merged_names -> web_name_patches
                                                            -> ensembl_name_aliases -> add_import_date -> cleanup

                  add_import_date additionally waits for build_left_right_indices, so that cleanup only follows once both branches are done.

    Returntype  : arrayref of analysis definition hashrefs

=cut

sub pipeline_analyses {
    my ($self) = @_;

    my $taxdump_file     = $self->o('taxdump_file');
    my $taxonomy_scripts = $self->o('ensembl_cvs_root_dir').'/ensembl-compara/scripts/taxonomy';

        # Field separator shared by all three *.dmp loaders: a tab, a literal '|' and an optional tab.
    my $dmp_delimiter    = "\t\Q|\E\t?";

    return [
        {   -logic_name    => 'download_tarball',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => 'curl '.$self->o('taxdump_loc').'/'.$taxdump_file.' > #work_dir#/'.$taxdump_file,
            },
            -input_ids     => [
                { 'work_dir' => $self->o('work_dir') }     # the only seeded job of the pipeline
            ],
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'untar' ],
            },
            -rc_id => 1,
        },

        {   -logic_name    => 'untar',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                    # '&&' rather than ';' so that tar never unpacks into the wrong directory if the cd fails:
                'cmd'       => 'cd #work_dir# && tar -xzf #work_dir#/'.$taxdump_file,
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'load_nodes', 'load_names' ],       # the two branches start here
            },
            -rc_id => 1,
        },

        {   -logic_name => 'load_nodes',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputfile'       => '#work_dir#/nodes.dmp',
                'delimiter'       => $dmp_delimiter,
                'input_id'        => { 'taxon_id' => '#_0#', 'parent_id' => '#_1#', 'rank' => '#_2#', 'genbank_hidden_flag' => '#_10#'},
                'fan_branch_code' => 2,
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'zero_parent_id' ],
                2 => [ ':////ncbi_taxa_node' ],             # the fan (branch 2) is directed at the ncbi_taxa_node table
            },
            -rc_id => 1,
        },

        {   -logic_name    => 'zero_parent_id',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SqlCmd',
            -parameters    => {
                    # nodes that name themselves as their parent get parent_id=0 (add_import_date selects on that value)
                'sql'         => "update ncbi_taxa_node set parent_id=0 where parent_id=taxon_id",
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'build_left_right_indices' ],
            },
            -rc_id => 1,
        },

        {   -logic_name    => 'build_left_right_indices',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => $taxonomy_scripts.'/taxonTreeTool.pl -url '.$self->dbconn_2_url('pipeline_db').' -index',
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -rc_id => 2,            # the only analysis that asks for the 'himem' resource class
        },



        {   -logic_name => 'load_names',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputfile'       => '#work_dir#/names.dmp',
                'delimiter'       => $dmp_delimiter,
                'input_id'        => { 'taxon_id' => '#_0#', 'name' => '#_1#', 'name_class' => '#_3#'},
                'fan_branch_code' => 2,
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'load_merged_names' ],
                2 => [ ':////ncbi_taxa_name' ],
            },
            -rc_id => 1,
        },

        {   -logic_name => 'load_merged_names',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputfile'       => '#work_dir#/merged.dmp',
                'delimiter'       => $dmp_delimiter,
                    # the first column of merged.dmp is stored as a 'name' of the taxon in the second column, under a fixed name_class:
                'input_id'        => { 'name' => '#_0#', 'taxon_id' => '#_1#', 'name_class' => 'merged_taxon_id'},
                'fan_branch_code' => 2,
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'web_name_patches' ],
                2 => [ ':////ncbi_taxa_name' ],
            },
            -rc_id => 1,
        },

        {   -logic_name    => 'web_name_patches',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$taxonomy_scripts.'/web_name_patches.sql',
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'ensembl_name_aliases' ],
            },
            -rc_id => 1,
        },

        {   -logic_name    => 'ensembl_name_aliases',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$taxonomy_scripts.'/ensembl_aliases.sql',
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'add_import_date' ],
            },
            -rc_id => 1,
        },


        {   -logic_name => 'add_import_date',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                    # one 'import date' name row per node that has parent_id=0, stamped with the current time:
                'inputquery'      => 'select distinct taxon_id, CURRENT_TIMESTAMP this_moment from ncbi_taxa_node where parent_id=0',
                'input_id'        => { 'taxon_id' => '#taxon_id#', 'name' => '#this_moment#', 'name_class' => 'import date' },
                'fan_branch_code' => 2,
            },
            -wait_for => [ 'build_left_right_indices' ],    # joins the nodes branch back in before the final steps
            -hive_capacity  => 10,  # to allow parallel branches
            -flow_into => {
                1 => [ 'cleanup' ],
                2 => [ ':////ncbi_taxa_name' ],
            },
            -rc_id => 1,
        },

        {   -logic_name    => 'cleanup',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'work_dir'  => '/tmp/not_so_important', # make sure $self->param('work_dir') contains something by default, or else.
                'cmd'       => 'rm -rf #work_dir#',
            },
            -hive_capacity  => 10,  # to allow parallel branches
            -rc_id => 1,
        },

    ];
}
00285 
00286 1;
00287