Archive Ensembl HomeArchive Ensembl Home
SplitAndPartialGenesOnTrees_ensembl_genomes_conf.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006   Bio::EnsEMBL::Compara::PipeConfig::SplitAndPartialGenesOnTrees_ensembl_genomes_conf
00007 
00008 =head1 SYNOPSIS
00009 
00010 #1. update ensembl-hive, ensembl and ensembl-compara CVS repositories before each new release
00011 #2. you may need to update 'schema_version' in meta table to the current release number in ensembl-hive/sql/tables.sql
00012 
00013 #3. make sure that all default_options are set correctly
00014 
00015 #4. Run init_pipeline.pl script:
00016 init_pipeline.pl Bio::EnsEMBL::Compara::PipeConfig::SplitAndPartialGenesOnTrees_ensembl_genomes_conf -password <your_password> -kingdom <kingdom_name>
00017 
00018 #5. Sync and loop the beekeeper.pl as shown in init_pipeline.pl's output
00019 
00020 
00021 =head1 DESCRIPTION  
00022 
00023     The PipeConfig file for the split, partial and single genes on trees pipeline (Ensembl Genomes), which should automate most of the pre-execution tasks.
00024 
00025 =head1 CONTACT
00026 
00027   Please contact maurel@ebi.ac.uk with questions/suggestions.
00028 
00029 =cut
00030 
00031 package Bio::EnsEMBL::Compara::PipeConfig::SplitAndPartialGenesOnTrees_ensembl_genomes_conf;
00032 
00033 use strict;
00034 use warnings;
00035 use base ('Bio::EnsEMBL::Compara::PipeConfig::ComparaGeneric_conf');
00036 
00037 
# Returns a hashref of default option values for this pipeline: everything
# inherited from the parent class, overlaid with the pipeline-specific
# settings and the connection parameters of the two databases involved.
sub default_options {
    my ($self) = @_;

    # Start from a copy of the generic defaults, then overlay our own.
    my %options = %{ $self->SUPER::default_options };

    # ENSEMBL_CVS_ROOT_DIR should be defined in your shell configs.
    $options{'ensembl_cvs_root_dir'} = $ENV{'ENSEMBL_CVS_ROOT_DIR'};

    # NB: your EBI address may differ from the Sanger one!
    $options{'email'} = $ENV{'USER'} . '@ebi.ac.uk';

    # Name of the pipeline, used to differentiate the submitted processes.
    $options{'pipeline_name'} = 'SG';

    # Ensembl Genomes kingdom: bacteria, fungi, metazoa, pan_homology,
    # plants or protists. The SYNOPSIS passes it as -kingdom on the
    # init_pipeline.pl command line.
    # NOTE(review): the self-referential o('kingdom') presumably marks the
    # option as mandatory -- confirm against the eHive documentation.
    $options{'kingdom'} = $self->o('kingdom');

    # The production database itself (will be created).
    $options{'pipeline_db'} = {
        -host   => 'compara3',
        -port   => 3306,
        -user   => 'ensadmin',
        -pass   => $self->o('password'),
        -dbname => $ENV{'USER'} . '_split_and_partial_genes_e_genomes',
    };

    # The source Compara database (read only mode).
    # Alternative previously used here, the public server:
    #   -host   => 'mysql.ebi.ac.uk',
    #   -port   => 4157,
    #   -user   => 'anonymous',
    #   -pass   => '',
    #   -dbname => 'ensembl_compara_'.$self->o('kingdom').'_9_62',
    $options{'source_db'} = {
        -host   => 'farm2-head1',
        -port   => 4275,
        -user   => 'ensro',
        -pass   => '',
        -dbname => 'ensembl_compara_' . $self->o('kingdom') . '_10_63',
    };

    return \%options;
}
00087 
00088 
# Returns a hashref of the parameters visible to all analyses (they can be
# overridden by an analysis' parameters{} and by a job's input_id{}):
# whatever the base class provides, plus 'email'.
sub pipeline_wide_parameters {
    my ($self) = @_;

    # Copy the inherited parameters so the parent's hash is left untouched.
    my %wide_params = %{ $self->SUPER::pipeline_wide_parameters };

    # For (future) automatic notifications (may be unsupported by your Meadows).
    $wide_params{'email'} = $self->o('email');

    return \%wide_params;
}
00097 
00098 
# Returns an arrayref of shell commands: the inherited commands (database
# and hive tables' creation) followed by one 'mysql ... -e CREATE TABLE'
# command for each of the four result tables that the analyses of this
# pipeline flow their output into.
sub pipeline_create_commands {
    my ($self) = @_;

    # Inherited commands come first.
    my @commands = @{ $self->SUPER::pipeline_create_commands };

    # [ table name => column/key definitions ], in creation order.
    # The definitions are spliced verbatim between the parentheses of the
    # CREATE TABLE statement, so their spacing is kept exactly as is.
    my @result_tables = (
        # output of the 'find_split_genes_on_tree' analysis
        [ 'split_genes_e_genomes' =>
              'id_spg MEDIUMINT NOT NULL AUTO_INCREMENT, '
            . 'tagged_as_split_gene_by_gene_tree_pipeline int(1) NOT NULL, '
            . 'overlap int(10) NOT NULL, '
            . 'score_inter_union float(4,2) NOT NULL, '
            . 'first_aa_prot char(1), '
            . 'unknown_aa_prot1 int(10) NOT NULL, '
            . 'unknown_aa_prot2 int(10) NOT NULL, '
            . 'rounded_duplication_confidence_score float(4,3) NOT NULL, '
            . 'intersection_duplication_score int(10) NOT NULL, '
            . 'union_duplication_confidence_score int(10) NOT NULL, '
            . 'merged_by_gene_tree_pipeline char(50) NOT NULL, '
            . 'chr_name char(40) NOT NULL, '
            . 'chr_strand int(5) NOT NULL, '
            . 'first_part_split_gene_stable_id varchar(128) NOT NULL, '
            . 'second_part_split_gene_stable_id varchar(128) NOT NULL, '
            . 'protein1_label char(40) NOT NULL, '
            . 'protein1_length_in_aa int(20) NOT NULL, '
            . 'alignment_length int(20) NOT NULL, '
            . 'species_name char(40) NOT NULL, '
            . 'kingdom char(40) NOT NULL, '
            . 'PRIMARY KEY (id_spg)'
        ],
        # output of the 'find_partial_genes_on_tree' analysis
        [ 'partial_genes_e_genomes' =>
              'id_spg MEDIUMINT NOT NULL AUTO_INCREMENT, '
            . 'gene_stable_id varchar(128) NOT NULL, '
            . 'protein_tree_stable_id varchar(128) NOT NULL, '
            . 'coverage_on_core_regions_score float(6,3) NOT NULL, '
            . 'alignment_overlap_score float(6,3) NOT NULL, '
            . 'species_name char(40) NOT NULL, '
            . 'kingdom char(40) NOT NULL, '
            . 'PRIMARY KEY (id_spg)'
        ],
        # output of the 'coverage_on_core_region_length' analysis
        [ 'cocr_length_e_genomes' =>
            'protein_tree_stable_id char(30) NOT NULL, coverage_on_core_regions_length int(30) NOT NULL, number_of_gene int(30) NOT NULL, kingdom char(40) NOT NULL,  PRIMARY KEY (protein_tree_stable_id)'
        ],
        # output of the 'find_single_genes_on_tree' analysis
        [ 'single_genes_e_genomes' =>
            'id_spg MEDIUMINT NOT NULL AUTO_INCREMENT, gene_stable_id char(30) NOT NULL, protein_tree_stable_id char(30) NOT NULL, species_name char(40) NOT NULL,kingdom char(40) NOT NULL, PRIMARY KEY (id_spg)'
        ],
    );

    # One mysql invocation per table, run against 'pipeline_db'.
    for my $table_spec (@result_tables) {
        my ($table_name, $column_defs) = @{$table_spec};
        push @commands,
              'mysql '
            . $self->dbconn_2_mysql('pipeline_db', 1)
            . " -e 'CREATE TABLE "
            . $table_name . ' (' . $column_defs
            . ") ENGINE=InnoDB'";
    }

    return \@commands;
}
00118 
# Returns an arrayref describing the five analyses of the pipeline:
# 'tree_factory', which fans out on branch 2 into the four per-tree
# analyses, each of which flows its branch 3 output into the result table
# of the matching name (see pipeline_create_commands).
sub pipeline_analyses {
    my ($self) = @_;

    # Settings shared by all four per-tree analyses.
    my @per_tree_limits = (
        -batch_size    => 50,
        -hive_capacity => 200,
    );

    # -----------------------------[getting all tree node ids]-----------------------------
    # Seeded with a single job carrying the source database and the kingdom.
    my $tree_factory = {
        -logic_name => 'tree_factory',
        -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectFactory',
        -parameters => {
            'adaptor_name'         => 'ProteinTreeAdaptor',
            'adaptor_method'       => 'fetch_all',
            'column_names2getters' => { 'protein_tree_id' => 'node_id' },
            'input_id'             => {
                'protein_tree_id' => '#protein_tree_id#',
                'kingdom'         => '#kingdom#',
                'compara_db'      => '#compara_db#',
            },
            'fan_branch_code'      => 2,
        },
        -input_ids => [
            {
                'compara_db' => $self->o('source_db'),
                'kingdom'    => $self->o('kingdom'),
            },
        ],
        -flow_into => {
            2 => [
                'find_split_genes_on_tree',
                'find_partial_genes_on_tree',
                'coverage_on_core_region_length',
                'find_single_genes_on_tree',
            ],
        },
    };

    # -----------------------------[Split Genes analysis]-----------------------------
    # Note: the only per-tree analysis that does not set -max_retry_count.
    my $split_genes = {
        -logic_name => 'find_split_genes_on_tree',
        -module     => 'Bio::EnsEMBL::Compara::RunnableDB::FindSplitGenesOnTree',
        -parameters => {},
        @per_tree_limits,
        -flow_into  => {
            3 => [ 'mysql:////split_genes_e_genomes' ],
        },
    };

    # -----------------------------[Partial Genes analysis]-----------------------------
    my $partial_genes = {
        -logic_name      => 'find_partial_genes_on_tree',
        -module          => 'Bio::EnsEMBL::Compara::RunnableDB::FindPartialGenesOnTree',
        -parameters      => {
            'threshold' => 90,
        },
        @per_tree_limits,
        -max_retry_count => 20,
        -flow_into       => {
            3 => [ 'mysql:////partial_genes_e_genomes' ],
        },
    };

    # -----------------------------[Coverage on core region length analysis]-----------------------------
    my $core_region_length = {
        -logic_name      => 'coverage_on_core_region_length',
        -module          => 'Bio::EnsEMBL::Compara::RunnableDB::FindCoreRegionLength',
        -parameters      => {
            'threshold' => 90,
        },
        @per_tree_limits,
        -max_retry_count => 10,
        -flow_into       => {
            3 => [ 'mysql:////cocr_length_e_genomes' ],
        },
    };

    # -----------------------------[Single Genes in species analysis]-----------------------------
    my $single_genes = {
        -logic_name      => 'find_single_genes_on_tree',
        -module          => 'Bio::EnsEMBL::Compara::RunnableDB::FindSingleGenesOnTree',
        -parameters      => {},
        @per_tree_limits,
        -max_retry_count => 10,
        -flow_into       => {
            3 => [ 'mysql:////single_genes_e_genomes' ],
        },
    };

    return [
        $tree_factory,
        $split_genes,
        $partial_genes,
        $core_region_length,
        $single_genes,
    ];
}
00198 
00199 1;
00200