Archive Ensembl HomeArchive Ensembl Home
PairAlignerConfig.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::PairAligner::PairAlignerConfig
00022 
00023 =cut
00024 
00025 =head1 SYNOPSIS
00026 
00027 $module->fetch_input
00028 
00029 $module->run
00030 
00031 $module->write_output
00032 
00033 =cut
00034 
00035 =head1 DESCRIPTION
00036 
00037 This module is intended to update the pair_aligner_conf database by firstly adding any new bed files to the correct directory and running compare_beds to generate the statistics
00038 
00039 =head1 OPTIONS
00040 
00041 =over
00041 
00042 =item ref_species
00043 
00044 Reference species
00045 
00046 =item reg_conf
00047 
00048 Registry configuration file if not able to provide ref_dbc_url or non_ref_dbc_url (eg local genebuild database)
00049 
00050 =item [method_link_type]
00051 
00052 method_link_type for the multiple alignments.
00053 
00054 =item [genome_db_ids]
00055 
00056 List of genome_dbs, should be 2 for a pairwise alignment
00057 
00058 =item [mlss_id]
00059 
00060 Method link species set id for the pairwise alignment
00061 
00062 =item bed_dir
00063 
00064 Location of directory to write any new bed files
00065 
00066 =item config_url
00067 
00068 Location of the pair aligner configuration database
00069 
00070 =item config_file
00071 
00072 Location of the pair aligner configuration file containing the RAW analysis parameters (if not the input conf_file)
00073 
00074 =item perl_path
00075 
00076 Location of ensembl-compara directory
00077 
00078 =item ensembl_release
00079 
00080 Ensembl release if not the same as contained in the pair aligner compara database in the meta table
00081 
00082 =back
00083 
00084 =head1 EXAMPLES
00085 
00086 =item {'ref_species' => 'danio_rerio', 'method_link_type'=>'TRANSLATED_BLAT_NET', 'genome_db_ids'=>'[65,110]', 'bed_dir' => '/lustre/scratch103/ensembl/kb3/scratch/tests/test_config/pipeline', 'config_url' => 'mysql://USER:PASS@compara1:3306/kb3_pair_aligner_config_test', 'config_file' => '/nfs/users/nfs_k/kb3/work/projects/tests/test_config/tblat.conf',}
00087 
00088 =back
00089 
00090 =head1 APPENDIX
00091 
00092 The rest of the documentation details each of the object methods.
00093 Internal methods are usually preceded with a _
00094 
00095 =cut
00096 
00097 package Bio::EnsEMBL::Compara::RunnableDB::PairAligner::PairAlignerConfig;
00098 
00099 use strict;
00100 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00101 #use Bio::EnsEMBL::Utils::Exception;
00102 
00103 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00104 use Bio::EnsEMBL::Hive::Utils 'stringify';  # import 'stringify()'
00105 
00106 =head2 fetch_input
00107 
00108   Implementation of the Bio::EnsEMBL::Hive::Process interface
00109 
00110 =cut
00111 
# Validate the job parameters and derive everything run() needs.
#
# Reads:  skip_pairaligner_config, bed_dir, mlss_id, method_link_type,
#         genome_db_ids, ref_species, non_ref_species, perl_path,
#         dump_features, update_config_database, create_pair_aligner_page,
#         ensembl_release
# Writes: bed_dir (default), mlss_id, non_ref_species, ref_dbc_url,
#         non_ref_dbc_url, the three script paths and ensembl_release.
# Returns nothing when skip_pairaligner_config is set, otherwise 1.
# Dies if the method_link_species_set or a genome_db cannot be found, or if
# one of the helper scripts does not exist on disk.
sub fetch_input {
  my ($self) = @_;

  return if ($self->param('skip_pairaligner_config'));

  #Default directory containing bed files.
  if (!defined $self->param('bed_dir')) {
      $self->param('bed_dir', "/nfs/ensembl/compara/dumps/bed/");
  }

  #Find the mlss, either directly from mlss_id or from the method_link_type and genome_db_ids
  my $mlss;
  my $mlss_adaptor = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor;
  die ("No method_link_species_set") if (!$mlss_adaptor);

  if (defined $self->param('mlss_id')) {
      $mlss = $mlss_adaptor->fetch_by_dbID($self->param('mlss_id'));
      die("No method_link_species_set with mlss_id " . $self->param('mlss_id')) if (!$mlss);
  } elsif (defined $self->param('method_link_type') && $self->param('genome_db_ids')) {
      my $genome_db_ids = $self->param('genome_db_ids');

      #genome_db_ids is normally a string such as '[65,110]'; accept an array ref as well.
      #NOTE(review): string eval of a job parameter - only safe while the parameter source is trusted.
      $genome_db_ids = eval($genome_db_ids) unless (ref($genome_db_ids));
      die("genome_db_ids must evaluate to an array reference, eg '[65,110]'") unless (ref($genome_db_ids) eq 'ARRAY');

      $mlss = $mlss_adaptor->fetch_by_method_link_type_genome_db_ids($self->param('method_link_type'), $genome_db_ids);
      die("No method_link_species_set for method_link_type " . $self->param('method_link_type') .
          " and genome_db_ids [" . join(",", @$genome_db_ids) . "]") if (!$mlss);
      $self->param('mlss_id', $mlss->dbID);
  } else {
      die("must define either mlss_id or method_link_type and genome_db_ids");
  }

  #Find the non_ref_species name
  if (!defined $self->param('non_ref_species')) {
      my $species_set = $mlss->species_set;

      #Self alignment: the non-reference species is the reference species
      if (@$species_set == 1) {
          $self->param('non_ref_species', $self->param('ref_species'));
      }
      foreach my $genome_db (@$species_set) {
          if ($self->param('ref_species') ne $genome_db->name) {
              $self->param('non_ref_species', $genome_db->name);
          }
      }
  }
  my $genome_db_adaptor = $self->compara_dba->get_GenomeDBAdaptor;

  my $ref_genome_db = $genome_db_adaptor->fetch_by_registry_name($self->param('ref_species'));
  die("Unable to find a genome_db for ref_species") if (!$ref_genome_db);
  my $non_ref_genome_db = $genome_db_adaptor->fetch_by_registry_name($self->param('non_ref_species'));
  die("Unable to find a genome_db for non_ref_species") if (!$non_ref_genome_db);

  #Get ref_dbc_url and non_ref_dbc_url from the genome_db table.
  #This does not produce a valid "core" url, ie it appends the database name instead of just the db_version, so
  #load_registry_from_url doesn't work but it is useful to store the database info
  my $ref_db = $ref_genome_db->connect_to_genome_locator;
  $self->param('ref_dbc_url', $ref_db->dbc->url);

  my $non_ref_db = $non_ref_genome_db->connect_to_genome_locator;
  $self->param('non_ref_dbc_url', $non_ref_db->dbc->url);

  #Set up paths to the various perl scripts: default each one relative to perl_path
  #and make sure it exists before run() tries to execute it
  my @scripts = (
      ['dump_features',            "/scripts/dumps/dump_features.pl"],
      ['update_config_database',   "/scripts/pipeline/update_config_database.pl"],
      ['create_pair_aligner_page', "/scripts/pipeline/create_pair_aligner_page.pl"],
  );
  foreach my $script (@scripts) {
      my ($param_name, $relative_path) = @$script;

      unless ($self->param($param_name)) {
          $self->param($param_name, $self->param('perl_path') . $relative_path);
      }
      unless (-e $self->param($param_name)) {
          die($self->param($param_name) . " does not exist");
      }
  }

  #Get ensembl schema version from meta table if not defined
  if (!defined $self->param('ensembl_release')) {
      $self->param('ensembl_release', $self->compara_dba->get_MetaContainer->list_value_by_key("schema_version")->[0]);
  }

  return 1;
}
00205 
00206 =head2 run
00207 
00208   Implementation of the Bio::EnsEMBL::Hive::Process interface
00209 
00210 =cut
00211 
# Perform the work of the job: make sure bed files exist for both species,
# then refresh the pair aligner configuration database and generate the web page.
# Returns nothing when skip_pairaligner_config is set, otherwise 1.
sub run {
  my ($self) = @_;

  if ($self->param('skip_pairaligner_config')) {
      return;
  }

  #Dump bed files if necessary, reference species first and then the non-reference species
  foreach my $prefix ('ref', 'non_ref') {
      $self->dump_bed_file($self->param($prefix . '_species'), $self->param($prefix . '_dbc_url'), $self->param('reg_conf'));
  }

  #Update the pair aligner configuration database
  $self->run_update_config_database();

  #Create the pair aligner html and png files for display on the web
  $self->run_create_pair_aligner_page();

  return 1;
}
00230 
00231 
00232 =head2 write_output
00233 
00234   Implementation of the Bio::EnsEMBL::Hive::Process interface
00235 
00236 =cut
00237 
# Nothing to store: all the work is done by run(). Always returns 1.
sub write_output {
  my $self = shift;

  return 1;
}
00243 
00244 
#
#Write bed files to the general repository for a new species or assembly. The naming scheme assumes the format
#production_name.assembly.genome.bed for toplevel regions and production_name.assembly.coding_exons.bed for exonic
#regions. A file that already exists and is not empty is never overwritten; a missing or zero-length file
#(eg left behind by a previously failed dump, since the shell redirect creates the file) is (re)generated.
#
#Arguments: species name (looked up with fetch_by_registry_name), url of the core database (used only when no
#           reg_conf is given) and an optional registry configuration file.
#Dies if the genome_db cannot be found, the dbc_url cannot be parsed or the dump command fails.
#
sub dump_bed_file {
    my ($self, $species, $dbc_url, $reg_conf) = @_;

    #Need assembly
    my $genome_db = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_registry_name($species);
    die("Unable to find a genome_db for species " . (defined $species ? $species : 'undef')) if (!$genome_db);
    my $assembly = $genome_db->assembly;
    my $name = $genome_db->name; #get production_name

    my $bed_prefix = $self->param('bed_dir') . "/" . $name . "." . $assembly . ".";
    my $genome_bed_file = $bed_prefix . "genome.bed";
    my $exon_bed_file = $bed_prefix . "coding_exons.bed";

    #Toplevel features
    if (-e $genome_bed_file && !(-z $genome_bed_file)) {
        print "$genome_bed_file already exists and not empty. Not overwriting.\n";
    } else {
        #Need to define compara_url even though it isn't used to stop dump_features complaining
        my $compara_url = $self->compara_dba->dbc->url;
        my $cmd = _dump_features_cmd($self, 'toplevel', $name, $dbc_url, $reg_conf, $genome_bed_file, "--compara_url $compara_url");
        unless (system($cmd) == 0) {
            die("$cmd execution failed\n");
        }
    }

    #Coding exons. Same policy as the toplevel file: keep a non-empty file, regenerate a missing or empty one
    if (-e $exon_bed_file && !(-z $exon_bed_file)) {
        print "$exon_bed_file already exists and not empty. Not overwriting.\n";
    } else {
        my $cmd = _dump_features_cmd($self, 'coding-exons', $name, $dbc_url, $reg_conf, $exon_bed_file);
        unless (system($cmd) == 0) {
            die("$cmd execution failed\n");
        }
    }
}

#
#Build the dump_features shell command that writes $feature for species $name into $bed_file.
#With a reg_conf the registry is used (plus any $extra_reg_conf_args); otherwise the connection details are
#taken from $dbc_url, which must look like mysql://user@host:port/dbname.
#
sub _dump_features_cmd {
    my ($self, $feature, $name, $dbc_url, $reg_conf, $bed_file, $extra_reg_conf_args) = @_;

    my $cmd = $self->param('dump_features');
    if ($reg_conf) {
        $cmd .= " --reg_conf $reg_conf --species $name --feature $feature";
        $cmd .= " $extra_reg_conf_args" if ($extra_reg_conf_args);
    } else {
        #Non-standard core name. Use DBAdaptor info
        my ($user, $host, $port, $dbname);
        if (defined $dbc_url) {
            ($user, $host, $port, $dbname) = $dbc_url =~ /mysql:\/\/(\w*)@(.*):(\d*)\/(.*)/;
        }
        #Fail here rather than running dump_features with empty connection arguments
        unless (defined $dbname) {
            die("Unable to parse dbc_url '" . (defined $dbc_url ? $dbc_url : 'undef') .
                "' for species $name (expected mysql://user\@host:port/dbname)\n");
        }
        $cmd .= " --host $host --user $user --port $port --dbname $dbname --species $name --feature $feature";
    }
    return $cmd . " > $bed_file";
}
00300 
#
#Run the update_config_database script to update the pair aligner configuration database.
#The command line is printed, the combined stdout/stderr of the script is recorded with $self->warning
#and the method dies if the script exits with a non-zero status.
#
sub run_update_config_database {
    my ($self) = @_;

    #Mandatory arguments
    my $cmd = "perl " . $self->param('update_config_database') .
      " --ref_species " . $self->param('ref_species') .
      " --compara_url " . $self->compara_dba->dbc->url .
      " --mlss_id " . $self->param('mlss_id') .
      " --ensembl_release " . $self->param('ensembl_release');

    #Optional arguments whose option name is the same as the parameter name
    foreach my $name ('config_url', 'config_file', 'ref_dbc_url', 'non_ref_dbc_url', 'reg_conf', 'output_dir') {
        my $value = $self->param($name);
        $cmd .= " --$name $value" if ($value);
    }

    #Optional arguments that may contain spaces and so are single-quoted for the shell
    $cmd .= " --pair_aligner_options \'" . $self->param('pair_aligner_options') ."\'" if ($self->param('pair_aligner_options')) ;
    $cmd .= " --ref_dna_collection \'" . stringify($self->param('ref_dna_collection')) ."\'" if ($self->param('ref_dna_collection'));
    $cmd .= " --non_ref_dna_collection \'" . stringify($self->param('non_ref_dna_collection')) ."\'" if ($self->param('non_ref_dna_collection'));
    $cmd .= " --bed_file_location " . $self->param('bed_dir') if ($self->param('bed_dir'));

    print "$cmd\n";
    my $output = `$cmd 2>&1`;

    #Save the exit status straight away: anything run by warning() could overwrite $?
    my $exit_status = $?;

    $self->warning($output);
    unless ($exit_status == 0) {
        die("$cmd execution failed\n");
    }
}
00332 
#
#Generate the html and png files for the web by running the create_pair_aligner_page script.
#The html is redirected to ./mlss_<mlss_id>.html in the current directory and will need to be copied
#to the correct location afterwards. Nothing is run (a message is printed instead) when no config_url
#is set. Dies if the script fails.
#
sub run_create_pair_aligner_page {
    my $self = shift;

    my $config_url = $self->param('config_url');
    unless ($config_url) {
        print "Must before config_url to print out the page information. Stats info is written to the job_message table\n";
        return;
    }

    my $mlss_id = $self->param('mlss_id');

    #Assemble the command word by word; the words are joined with single spaces
    my @words = ("perl", $self->param('create_pair_aligner_page'),
                 "--config_url", $config_url,
                 "--mlss_id", $mlss_id);

    my $ucsc_url = $self->param('ucsc_url');
    if ($ucsc_url) {
        push @words, "--ucsc_url", $ucsc_url;
    }

    push @words, ">", "./mlss_" . $mlss_id . ".html";
    my $cmd = join(" ", @words);

    unless (system($cmd) == 0) {
        die("$cmd execution failed\n");
    }
}
00356 
00357 1;