Archive Ensembl HomeArchive Ensembl Home
LoadOneGenomeDB.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006 Bio::EnsEMBL::Compara::RunnableDB::LoadOneGenomeDB
00007 
00008 =head1 SYNOPSIS
00009 
00010         # load a genome_db given a class/keyvalue locator (genome_db_id will be generated)
00011     standaloneJob.pl LoadOneGenomeDB.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_load1genome" \
00012         -locator 'Bio::EnsEMBL::DBSQL::DBAdaptor/host=ens-staging;port=3306;user=ensro;pass=;dbname=homo_sapiens_core_64_37;species=homo_sapiens;species_id=1;disconnect_when_inactive=1'
00013 
00014         # load a genome_db given a url-style locator
00015     standaloneJob.pl LoadOneGenomeDB.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_load1genome" \
00016         -locator "mysql://ensro@ens-staging2/mus_musculus_core_64_37"
00017 
00018         # load a genome_db given a reg_conf and species_name as locator
00019     standaloneJob.pl LoadOneGenomeDB.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_load1genome" \
00020         -reg_conf $ENSEMBL_CVS_ROOT_DIR/ensembl-compara/scripts/pipeline/production_reg_conf.pl \
00021         -locator 'mus_musculus'
00022 
00023         # load a genome_db given a reg_conf and species_name as locator with a specific genome_db_id
00024     standaloneJob.pl LoadOneGenomeDB.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_load1genome" \
00025         -reg_conf $ENSEMBL_CVS_ROOT_DIR/ensembl-compara/scripts/pipeline/production_reg_conf.pl \
00026         -locator 'homo_sapiens' -genome_db_id 90
00027 
00028 =head1 DESCRIPTION
00029 
00030 This Runnable loads one entry into 'genome_db' table and passes on the genome_db_id.
00031 
00032 The format of the input_id follows the format of a Perl hash reference.
00033 Examples:
00034     { 'species_name' => 'Homo sapiens', 'assembly_name' => 'GRCh37' }
00035     { 'species_name' => 'Mus musculus' }
00036 
00037 supported keys:
00038     'locator'       => <string>
00039         one of the ways to specify the connection parameters to the core database (overrides 'species_name' and 'assembly_name')
00040 
00041     'registry_dbs'  => <list_of_dbconn_hashes>
00042         another, simple way to specify the genome_db (and let the registry search across multiple mysql instances to do the rest)
00043     'species_name'  => <string>
00044         mandatory unless 'locator' is given; used to look the species up on each of the 'registry_dbs' servers
00045 
00046     'first_found'   => <0|1>
00047         optional, defaults to 0.
00048         Defines whether we emulate (to a certain extent) the behaviour of load_registry_from_multiple_dbs
00049         or try the last one that still fits (this would allow trying ens-staging[12] *first*, and only then checking whether ens-livemirror has a suitable copy).
00050 
00051     'assembly_name' => <string>
00052         optional: in most cases it should be possible to find the species just by using 'species_name'
00053 
00054     'genome_db_id'  => <integer>
00055         optional, in case you want to specify it (otherwise it will be generated by the adaptor when storing)
00056 
00057     'pseudo_stableID_prefix' => <string>
00058         optional?, see 'GenomeLoadMembers.pm', 'GenomeLoadReuseMembers.pm', 'GeneStoreNCMembers.pm', 'GenomePrepareNCMembers.pm'
00059 
00060     'ensembl_genomes' => <0|1>
00061         optional, sets the preferential order of precedence of species_name sources, depending on whether the module is run by EG or Compara
00062 
00063 =cut
00064 
package Bio::EnsEMBL::Compara::RunnableDB::LoadOneGenomeDB;

use strict;
use warnings;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::DBLoader;
use Bio::EnsEMBL::Compara::GenomeDB;

use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');

# Marker inserted between a species name and the index of the registry server it was loaded from
# (passed as -species_suffix in fetch_input), so that the same species loaded from several servers
# does not collide in the Registry. The locator() helper below cuts it off again.
my $suffix_separator = '__cut_here__';
00075 
# Finds the core database of the genome to be loaded and stores its adaptor in param('core_dba').
#
# Parameters read:
#   'locator'       - if set, the core database is opened directly from it and the registry is skipped.
#                     It is first tried with Bio::EnsEMBL::DBLoader, then as a DBC via go_figure_dbc();
#                     in the latter case param('locator') is rewritten in the conventional format.
#   'species_name'  - used only when 'locator' is not set; the species is looked up on every server
#                     listed in 'registry_dbs' (which then becomes obligatory).
#   'assembly_name',
#   'genebuild'     - optional filters for the registry search; when not set they default to the
#                     values found on the first server that has the species.
#   'first_found'   - if true, the search stops at the first matching server, otherwise the last
#                     matching server wins.
#
# Parameters set: 'core_dba', and 'assembly_name' when a true value is known.
#
# Dies if neither 'locator' nor 'species_name' is given, if 'registry_dbs' is missing for a
# registry search, or if no suitable core database could be found.
sub fetch_input {
    my $self = shift @_;

    my $assembly_name = $self->param('assembly_name');
    my $core_dba;

    if(my $locator = $self->param('locator') ) {   # use the locator and skip the registry

        # DBLoader is expected to fail on url-style locators, so its error is not fatal by itself;
        # it is only kept to be reported if the fallback fails as well.
        my $dbloader_error = '';
        eval {
            $core_dba = Bio::EnsEMBL::DBLoader->new($locator);
            1;
        } or do {
            $dbloader_error = $@ || 'unknown error';
        };

        unless($core_dba) {     # assume this is a hive-type locator and try more tricks:
            my $dbc = $self->go_figure_dbc( $locator, 'core' )
                or die "Could not connect to '$locator' as DBC"
                     . ($dbloader_error ? " (DBLoader also failed: $dbloader_error)" : '');

            $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new( -DBCONN => $dbc );

            $self->param('locator', $core_dba->locator() );  # substitute the given locator by one in conventional format
        }

    } elsif( my $species_name = $self->param('species_name') ) {    # perform our tricky multiregistry search: find the last one still suitable

        my $genebuild    = $self->param('genebuild');
        my $registry_dbs = $self->param('registry_dbs')
            || die "unless 'locator' is specified, 'registry_dbs' becomes obligatory parameter";

        my $no_alias_check = 1;

        for my $r_ind (0 .. $#{$registry_dbs}) {

            # each server gets its own suffix, so that the same species can be loaded from several servers
            Bio::EnsEMBL::Registry->load_registry_from_db( %{ $registry_dbs->[$r_ind] }, -species_suffix => $suffix_separator.$r_ind );

            my $this_core_dba   = Bio::EnsEMBL::Registry->get_DBAdaptor($species_name.$suffix_separator.$r_ind, 'core', $no_alias_check) || next;
            my $this_assembly   = $this_core_dba->extract_assembly_name();
            my $this_start_date = $this_core_dba->get_MetaContainer->get_genebuild();

            # when not requested explicitly, the first server that has the species defines what we are looking for
            $genebuild     ||= $this_start_date;
            $assembly_name ||= $this_assembly;

            if($this_assembly eq $assembly_name && $this_start_date eq $genebuild) {
                $core_dba = $this_core_dba;

                if($self->param('first_found')) {
                    last;
                }
            } else {
                warn "Found assembly '$this_assembly' when looking for '$assembly_name' or '$this_start_date' when looking for '$genebuild'";
            }

        } # try next registry server

    } else {
        die "Neither 'locator' nor 'species_name' was specified, so there is no way to find the core database";
    }

    unless( $core_dba ) {
        die "Could not find species_name='".$self->param('species_name')."', assembly_name='".$self->param('assembly_name')."' on the servers provided, please investigate";
    }

    $self->param('core_dba', $core_dba);
    if($assembly_name) {
        $self->param('assembly_name', $assembly_name);
    }
}
00138 
# Builds the Bio::EnsEMBL::Compara::GenomeDB object from the core database found by fetch_input()
# and leaves it in param('genome_db').
#
# The optional 'assembly_name' and 'taxon_id' parameters are cross-checked against the values
# stored in the core database; any disagreement is fatal. A missing production_name is fatal too.
sub run {
    my ($self) = @_;

    my $core_dba = $self->param('core_dba');
    my $meta     = $core_dba->get_MetaContainer;

    # assembly: the parameter (if any) has to agree with the database
    my $db_assembly   = $core_dba->extract_assembly_name();
    my $assembly_name = $self->param('assembly_name') || $db_assembly;
    unless($assembly_name eq $db_assembly) {
        die "The required assembly_name ('$assembly_name') is different from the one found in the database ('$db_assembly'), please investigate";
    }

    # taxon: same kind of check, but numeric
    my $db_taxon_id = $meta->get_taxonomy_id();
    my $taxon_id    = $self->param('taxon_id') || $db_taxon_id;
    unless($taxon_id == $db_taxon_id) {
        die "taxon_id parameter ($taxon_id) is different from the one defined in the database ($db_taxon_id), please investigate";
    }

    # the remaining attributes are simply collected
    my $genome_db_id = $self->param('genome_db_id') || undef;     # undef lets the adaptor generate the dbID when storing
    my $genebuild    = $meta->get_genebuild() || '';
    my $genome_name  = $meta->get_production_name()
        or die "Could not fetch production_name, please investigate";
    my $locator      = $self->param('locator') || $core_dba->locator();

    my $genome_db = Bio::EnsEMBL::Compara::GenomeDB->new();
    my @attributes = (
        [ 'dbID'      => $genome_db_id  ],
        [ 'taxon_id'  => $taxon_id      ],
        [ 'name'      => $genome_name   ],
        [ 'assembly'  => $assembly_name ],
        [ 'genebuild' => $genebuild     ],
        [ 'locator'   => $locator       ],
    );
    for my $attribute (@attributes) {
        my ($setter, $value) = @{$attribute};
        $genome_db->$setter( $value );
    }

    $self->param('genome_db', $genome_db);
}
00172 
# Stores the GenomeDB object prepared by run() in the Compara database and flows its
# genome_db_id (together with 'pseudo_stableID_prefix', when one was given) into branch 1.
sub write_output {
    my ($self) = @_;

    my $genome_db = $self->param('genome_db');
    $self->compara_dba->get_GenomeDBAdaptor->store($genome_db);

    # the dbID is read back only after storing, as it may have been generated by the adaptor
    my $genome_db_id = $genome_db->dbID();
    my %output_id    = ( 'genome_db_id' => $genome_db_id );

    my $pseudo_stableID_prefix = $self->param('pseudo_stableID_prefix');
    if($pseudo_stableID_prefix) {
        $output_id{'pseudo_stableID_prefix'} = $pseudo_stableID_prefix;
    }

    $self->dataflow_output_id( \%output_id, 1 );
}
00188 
00189 # ------------------------- non-interface subroutines -----------------------------------
00190 
# Method injected into Bio::EnsEMBL::DBSQL::DBAdaptor (with much regret, but it is highly demanded).
#
# Returns the version of the first coordinate system returned by the CoordSystemAdaptor,
# which is what this module uses as the assembly name.
# NOTE(review): presumably fetch_all() returns the top-ranked coordinate system first - confirm against the Core API.
#
# Dies with an explicit message if the core database has no coordinate systems at all
# (instead of failing on a method call on an undefined value).
sub Bio::EnsEMBL::DBSQL::DBAdaptor::extract_assembly_name {
    my $self = shift @_;

    my $coord_systems = $self->get_CoordSystemAdaptor->fetch_all() || [];
    my ($cs) = @{$coord_systems};

    unless($cs) {
        die "Could not find any coord_system in the core database, so the assembly name cannot be extracted";
    }

    my $assembly_name = $cs->version;

    return $assembly_name;
}
00199 
# Another method injected into Bio::EnsEMBL::DBSQL::DBAdaptor (to be included or at least offered
# for inclusion into the Core codebase).
#
# Returns a class/keyvalue locator string for this adaptor: the adaptor's class name, followed by
# the connection parameters taken from its DBConnection, the species name (without the uniqueness
# suffix) and the species_id. 'disconnect_when_inactive' is always set to 1.
sub Bio::EnsEMBL::DBSQL::DBAdaptor::locator {
    my ($dba) = @_;

    # The suffix was added to attain uniqueness and avoid collision, only the part before it is the real species name.
    my $bare_species = ( split(/$suffix_separator/, $dba->species()) )[0];

    my $connection = $dba->dbc();

    my @fields = (
        ref($dba),
        $connection->host(),
        $connection->port(),
        $connection->username(),
        $connection->password(),
        $connection->dbname(),
        $bare_species,
        $dba->species_id,
        1,
    );

    return sprintf(
          "%s/host=%s;port=%s;user=%s;pass=%s;dbname=%s;species=%s;species_id=%s;disconnect_when_inactive=%d",
          @fields,
    );
}
00212 
00213 1;
00214