Archive Ensembl HomeArchive Ensembl Home
SplitAndPartialGenesOnTrees_conf.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006   Bio::EnsEMBL::Compara::PipeConfig::SplitAndPartialGenesOnTrees_conf
00007 
00008 =head1 SYNOPSIS
00009 
00010 #1. update ensembl-hive, ensembl and ensembl-compara CVS repositories before each new release
00011 #2. you may need to update 'schema_version' in meta table to the current release number in ensembl-hive/sql/tables.sql
00012 
00013 #3. make sure that all default_options are set correctly
00014 
00015 #4. Run init_pipeline.pl script:
00016 init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::SplitAndPartialGenesOnTrees_conf -password <your_password>
00017 
00018 #5. Sync and loop the beekeeper.pl as shown in init_pipeline.pl's output
00019 
00020 
00021 =head1 DESCRIPTION  
00022 
00023     The PipeConfig file for the SplitAndPartialGenesOnTrees pipeline, which should automate most of the pre-execution tasks.
00024     The pipeline executes 4 analyses on each protein tree:
00025     -> looking for split genes
00026     -> looking for partial genes
00027     -> getting the coverage on the core region of each tree
00028     -> looking for genes that are the unique gene of a species in a tree.
00029 
00030 =head1 CONTACT
00031 
00032   Please contact maurel@ebi.ac.uk with questions/suggestions.
00033 
00034 =cut
00035 
00036 package Bio::EnsEMBL::Compara::PipeConfig::SplitAndPartialGenesOnTrees_conf;
00037 
00038 use strict;
00039 use warnings;
00040 use base ('Bio::EnsEMBL::Compara::PipeConfig::ComparaGeneric_conf');
00041 
00042 
# Returns a hashref of default option values for this pipeline.
#
# The result starts from the defaults of the parent PipeConfig class; every
# key assigned below replaces an inherited key of the same name.
sub default_options {
    my ($self) = @_;

    # Inherit the generic defaults first.
    my %options = %{ $self->SUPER::default_options };

    # ENSEMBL_CVS_ROOT_DIR should be defined in your shell configuration.
    $options{'ensembl_cvs_root_dir'} = $ENV{'ENSEMBL_CVS_ROOT_DIR'};

    # NB: your EBI address may differ from the Sanger one!
    $options{'email'} = $ENV{'USER'} . '@ebi.ac.uk';

    # The pipeline name differentiates the submitted processes.
    $options{'pipeline_name'} = 'SG';

    # --- connection parameters to the various databases ---

    # The production database itself (will be created).
    $options{'pipeline_db'} = {
        -host   => 'compara3',
        -port   => 3306,
        -user   => 'ensadmin',
        -pass   => $self->o('password'),
        -dbname => $ENV{'USER'} . '_split_and_partial_genes',
    };

    # The source database (read-only mode).
    $options{'source_db'} = {
        -host   => 'compara1',
        -user   => 'ensro',
        -pass   => '',
        -dbname => 'lg4_ensembl_compara_63',
    };

    # Commented-out alternative 'source_db' settings, kept for reference:
    #   -host => 'ens-livemirror',        -port => 3306, -user => 'ensro',     -pass => '', -dbname => 'ensembl_compara_61'
    #   -host => 'ensembldb.ensembl.org', -port => 5306, -user => 'anonymous', -pass => '', -dbname => 'ensembl_compara_60'
    #   -host => 'ensdb-archive',         -port => 5304, -user => 'ensro',     -pass => '', -dbname => 'ensembl_compara_59'

    return \%options;
}
00100 
00101 
# Returns a hashref of parameters that are visible to all analyses; they can
# be overridden by an analysis' parameters{} and by a job's input_id{}.
sub pipeline_wide_parameters {
    my ($self) = @_;

    # Inherit anything the base class defines.
    my %parameters = %{ $self->SUPER::pipeline_wide_parameters };

    # For (future) automatic notifications (may be unsupported by your Meadows).
    $parameters{'email'} = $self->o('email');

    return \%parameters;
}
00110 
00111 
# Returns an arrayref of shell commands that create the pipeline database.
#
# The list is the parent class' creation commands (database and hive tables)
# followed by one 'mysql ... -e' command per extra table defined below.
sub pipeline_create_commands {
    my ($self) = @_;

    # Inherited database and hive tables' creation commands come first.
    my @commands = @{ $self->SUPER::pipeline_create_commands };

    # Additional tables: split_gene is needed for keeping the output of the
    # 'find_split_genes_on_tree' analysis; the other three tables are the
    # 'mysql:////<table>' dataflow targets named in pipeline_analyses().
    # The statement text is passed to the shell verbatim inside single quotes,
    # so it must not contain a single quote itself.
    my @table_definitions = (
        q{CREATE TABLE split_gene (id_spg MEDIUMINT NOT NULL AUTO_INCREMENT, tagged_as_split_gene_by_gene_tree_pipeline int(1) NOT NULL, overlap int(10) NOT NULL, score_inter_union float(4,2) NOT NULL, first_aa_prot char(1), unknown_aa_prot1 int(10) NOT NULL, unknown_aa_prot2 int(10) NOT NULL, rounded_duplication_confidence_score float(4,3) NOT NULL, intersection_duplication_score int(10) NOT NULL, union_duplication_confidence_score int(10) NOT NULL, merged_by_gene_tree_pipeline char(50) NOT NULL, chr_name char(40) NOT NULL, chr_strand int(5) NOT NULL, first_part_split_gene_stable_id char(30) NOT NULL, second_part_split_gene_stable_id char(30) NOT NULL, protein1_label char(40) NOT NULL, protein1_length_in_aa int(20) NOT NULL, alignment_length int(20) NOT NULL, species_name char(40) NOT NULL, PRIMARY KEY (id_spg)) ENGINE=InnoDB},
        q{CREATE TABLE partial_gene (id_spg MEDIUMINT NOT NULL AUTO_INCREMENT, gene_stable_id char(30) NOT NULL, protein_tree_stable_id char(30) NOT NULL, coverage_on_core_regions_score float(6,3) NOT NULL, alignment_overlap_score float(6,3) NOT NULL, species_name char(40) NOT NULL,  PRIMARY KEY (id_spg)) ENGINE=InnoDB},
        q{CREATE TABLE cocr_length (protein_tree_stable_id char(30) NOT NULL, coverage_on_core_regions_length int(30) NOT NULL, number_of_gene int(30) NOT NULL,  PRIMARY KEY (protein_tree_stable_id)) ENGINE=InnoDB},
        q{CREATE TABLE single_genes (id_spg MEDIUMINT NOT NULL AUTO_INCREMENT, gene_stable_id char(30) NOT NULL, protein_tree_stable_id char(30) NOT NULL, species_name char(40) NOT NULL,  PRIMARY KEY (id_spg)) ENGINE=InnoDB},
    );

    # One mysql client invocation per table, run against 'pipeline_db'.
    for my $ddl (@table_definitions) {
        push @commands,
            'mysql ' . $self->dbconn_2_mysql('pipeline_db', 1) . " -e '" . $ddl . "'";
    }

    return \@commands;
}
00130 
00131 
# Returns an arrayref of analysis definitions: one 'tree_factory' analysis
# followed by the four per-tree analyses it flows into on branch 2.
# Each per-tree analysis wires its branch 3 to a 'mysql:////<table>' target.
sub pipeline_analyses {
    my ($self) = @_;

    # --- Looking for possible split genes on each protein tree id ---
    my $split_genes = {
        -logic_name    => 'find_split_genes_on_tree',
        -module        => 'Bio::EnsEMBL::Compara::RunnableDB::FindSplitGenesOnTree',
        -parameters    => {},
        -batch_size    => 25,
        -hive_capacity => 100,
        -flow_into     => { 3 => ['mysql:////split_gene'] },
    };

    # --- Looking for possible partial genes on each protein tree id ---
    my $partial_genes = {
        -logic_name      => 'find_partial_genes_on_tree',
        -module          => 'Bio::EnsEMBL::Compara::RunnableDB::FindPartialGenesOnTree',
        -parameters      => { 'threshold' => 90 },
        -batch_size      => 50,
        -hive_capacity   => 200,
        -max_retry_count => 20,
        -flow_into       => { 3 => ['mysql:////partial_gene'] },
    };

    # --- Get the coverage on core region length for each tree ---
    my $core_region_length = {
        -logic_name      => 'coverage_on_core_region_length',
        -module          => 'Bio::EnsEMBL::Compara::RunnableDB::FindCoreRegionLength',
        -parameters      => { 'threshold' => 90 },
        -batch_size      => 50,
        -hive_capacity   => 200,
        -max_retry_count => 10,
        -flow_into       => { 3 => ['mysql:////cocr_length'] },
    };

    # --- Find single genes of a species in each tree ---
    my $single_genes = {
        -logic_name      => 'find_single_genes_on_tree',
        -module          => 'Bio::EnsEMBL::Compara::RunnableDB::FindSingleGenesOnTree',
        -parameters      => {},
        -batch_size      => 50,
        -hive_capacity   => 200,
        -max_retry_count => 10,
        -flow_into       => { 3 => ['mysql:////single_genes'] },
    };

    # The order here is both the order of the returned list and the order of
    # the factory's branch-2 targets.
    my @per_tree_analyses = ($split_genes, $partial_genes, $core_region_length, $single_genes);

    # --- Get all protein tree ids from the database ---
    my $tree_factory = {
        -logic_name => 'tree_factory',
        -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectFactory',
        -parameters => {
            'compara_db'           => $self->o('source_db'),
            'adaptor_name'         => 'ProteinTreeAdaptor',
            'adaptor_method'       => 'fetch_all',
            'column_names2getters' => { 'protein_tree_id' => 'node_id' },
            'input_id'             => {
                'protein_tree_id' => '#protein_tree_id#',
                'compara_db'      => '#compara_db#',
            },
            'fan_branch_code'      => 2,
        },
        -input_ids => [
            { 'compara_db' => $self->o('source_db') },
        ],
        -flow_into => {
            2 => [ map { $_->{'-logic_name'} } @per_tree_analyses ],
        },
    };

    return [ $tree_factory, @per_tree_analyses ];
}
00212 
00213 1;
00214