Archive Ensembl HomeArchive Ensembl Home
MemberDisplayLabelUpdater.pm
Go to the documentation of this file.
00001 package Bio::EnsEMBL::Compara::RunnableDB::MemberDisplayLabelUpdater;
00002 
00003 =pod
00004 
00005 =head1 NAME
00006 
00007 Bio::EnsEMBL::Compara::RunnableDB::MemberDisplayLabelUpdater
00008 
00009 =head1 SYNOPSIS
00010 
00011 This runnable can be used either as a Hive pipeline component or in standalone mode.
00012 At the moment Compara runs it standalone, EnsEMBL Genomes runs it in both modes.
00013 
00014 In standalone mode you will need to set --reg_conf to your registry configuration file in order to access the core databases.
00015 You will have to refer to your compara database either via the full URL or (if you have a corresponding registry entry) via registry.
00016 Here are both examples:
00017 
00018     standaloneJob.pl Bio::EnsEMBL::Compara::RunnableDB::MemberDisplayLabelUpdater --reg_conf $ENSEMBL_CVS_ROOT_DIR/ensembl-compara/scripts/pipeline/production_reg_conf.pl --compara_db compara_homology_merged --debug 1
00019 
00020     standaloneJob.pl Bio::EnsEMBL::Compara::RunnableDB::MemberDisplayLabelUpdater --reg_conf $ENSEMBL_CVS_ROOT_DIR/ensembl-compara/scripts/pipeline/production_reg_conf.pl --compara_db mysql://ensadmin:${ENSADMIN_PSW}@compara3:3306/lg4_compara_homology_merged_64 --debug 1
00021 
00022 You should be able to limit the set of species being updated by adding --species "[ 90, 3 ]" or --species "[ 'human', 'rat' ]"
00023 
00024 =head1 DESCRIPTION
00025 
00026 The module loops through all genome_dbs given via the parameter C<species> and attempts to update any gene/translation with the display identifier from the core database.
00027 If the list of genome_dbs is not specified, it will attempt all genome_dbs with entries in the member table.
00028 
00029 This code uses direct SQL statements because of the relationship between translations and their display labels
00030 being stored at the transcript level. If the DB changes this will break.
00031 
00032 =head1 AUTHOR
00033 
00034 Andy Yates
00035 
00036 =head1 MAINTAINER
00037 
00038 $Author: lg4 $
00039 
00040 =head1 VERSION
00041 
00042 $Revision: 1.10 $
00043 
00044 =head1 APPENDIX
00045 
00046 The rest of the documentation details each of the object methods.
00047 Internal methods are usually preceded with a _
00048 
00049 =cut
00050 
00051 use strict;
00052 use warnings;
00053 
00054 use Scalar::Util qw(looks_like_number);
00055 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00056 use Bio::EnsEMBL::Utils::Exception qw(throw);
00057 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref);
00058 use Bio::EnsEMBL::Utils::SqlHelper;
00059 
00060 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00061 
00062 
00063 #--- Non-hive methods
00064 
00065 =head2 new_without_hive()
00066 
00067   Arg [DB_ADAPTOR]              : (DBAdaptor) Compara DBAdaptor to use
00068   Arg [REPLACE]                 : (Boolean)   Forces the code to replace display labels 
00069   Arg [DIE_IF_NO_CORE_ADAPTOR]  : (Boolean)   Kills the process if there is no core adaptor
00070   Arg [GENOME_DB_IDS]           : (ArrayRef)  GenomeDB IDs to run this process over
00071   Arg [DEBUG]                   : (Boolean)   Force debug output to STDOUT
00072   
00073   Example    : See synopsis
00074   Description: Non-hive version of the object construction to be used with scripts
00075   Returntype : Bio::EnsEMBL::Compara::RunnableDB::MemberDisplayLabelUpdater
00076   Exceptions : if DB_ADAPTOR was not given or was not a valid object
00077   Caller     : general
00078 
00079 =cut
00080 
# Constructs the runnable for use outside of a Hive pipeline.
#
# Named arguments (parsed with rearrange(), so -UPPER_CASE keys are expected):
#   -DB_ADAPTOR              Compara DBAdaptor, handed to compara_dba()
#   -REPLACE                 overwrite display labels that are already set
#   -DIE_IF_NO_CORE_ADAPTOR  throw when a GenomeDB has no core adaptor
#   -SPECIES                 array ref of GenomeDB IDs and/or registry names
#   -GENOME_DB_IDS           alternative to -SPECIES; previously this key was
#                            silently discarded by rearrange()
#   -DEBUG                   print progress messages
#
# Returns the new, blessed object.
sub new_without_hive {
  my ($class, @params) = @_;

  my $self = bless {}, $class;

  # An AnalysisJob is attached so that $self->param() can be used.
  # The class is loaded here because this file never 'use's it itself;
  # require is a no-op if something else has already loaded it.
  require Bio::EnsEMBL::Hive::AnalysisJob;
  my $job = Bio::EnsEMBL::Hive::AnalysisJob->new();
  $self->input_job($job);

  my ($db_adaptor, $replace, $die_if_no_core_adaptor, $species, $genome_db_ids, $debug) =
    rearrange(
      [qw(db_adaptor replace die_if_no_core_adaptor species genome_db_ids debug)],
      @params
  );

  $self->compara_dba($db_adaptor);
  $self->param('replace', $replace);
  $self->param('die_if_no_core_adaptor', $die_if_no_core_adaptor);
  $self->param('species', $species);
  # fetch_input() reads 'genome_db_ids' when 'species' is not set; only store
  # it when it was actually supplied so the param store is otherwise untouched.
  $self->param('genome_db_ids', $genome_db_ids) if defined $genome_db_ids;
  $self->debug($debug);

  return $self;
}
00103 
00104 =head2 run_without_hive()
00105 
00106 Performs fetch_input(), run() and write_output() calls in one method.
00107 
00108 =cut
00109 
# Standalone driver: executes the three job stages in the order
# fetch_input(), run(), write_output().
sub run_without_hive {
  my $self = shift;

  # The first two stages only prepare state on $self
  for my $stage (qw(fetch_input run)) {
    $self->$stage();
  }

  # write_output() is the final statement, exactly as in a plain call sequence
  return $self->write_output();
}
00117 
00118 
00119 #--- Hive methods
00120 
00121 =head2 fetch_input
00122 
00123     Title   :   fetch_input
00124     Usage   :   $self->fetch_input
00125     Function:   prepares global variables and DB connections
00126     Returns :   none
00127     Args    :   none
00128 
00129 =cut
00130 
# Resolves the species to work on into GenomeDB objects and stores them
# (as an array ref) in the 'genome_dbs' parameter.
#
# The species come from the 'species' parameter or, failing that, from
# 'genome_db_ids'. When neither is set, every distinct non-zero genome_db_id
# found in the member table is used.
#
# Dies if any entry cannot be resolved to a GenomeDB.
sub fetch_input {
  my ($self) = @_;

  my $species_list = $self->param('species') || $self->param('genome_db_ids');

  if( $species_list ) {
      # A single species may be given as a plain scalar (e.g. --species 90);
      # wrap it so the loop below does not die with a strict-refs error.
      $species_list = [ $species_list ] unless ref($species_list) eq 'ARRAY';
  } else {
      my $h = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $self->compara_dba()->dbc());
      my $sql = q{SELECT DISTINCT genome_db_id FROM member WHERE genome_db_id IS NOT NULL AND genome_db_id <> 0};
      $species_list = $h->execute_simple( -SQL => $sql);
  }

  my $genome_db_adaptor = $self->compara_dba()->get_GenomeDBAdaptor();

  my @genome_dbs = ();
  foreach my $species (@$species_list) {
    # Numeric entries are treated as GenomeDB dbIDs, anything else as a registry name
    my $genome_db = ( looks_like_number( $species )
        ? $genome_db_adaptor->fetch_by_dbID( $species )
        : $genome_db_adaptor->fetch_by_registry_name( $species ) )
    or die "Could not fetch genome_db object given '$species'";

    push @genome_dbs, $genome_db;
  }
  $self->param('genome_dbs', \@genome_dbs);
}
00155 
00156 =head2 run
00157 
00158     Title   :   run
00159     Usage   :   $self->run
00160     Function:   Retrieves the Members to update
00161     Returns :   none
00162     Args    :   none
00163 
00164 =cut
00165 
# Works out, per GenomeDB, which members need a new display label.
# The outcome of _process_genome_db() for each GenomeDB is stored in the
# 'results' parameter, keyed by the GenomeDB's dbID.
sub run {
  my $self = shift;

  my $genome_dbs = $self->param('genome_dbs');

  if ($self->debug()) {
    my @species_names = map { $_->name() } @$genome_dbs;
    my $names = join(q{, }, @species_names);
    print "Working with: [${names}]\n";
  }

  # The hash is registered as a parameter first and then filled in place
  my $results = $self->param('results', {});

  for my $gdb (@$genome_dbs) {
    $results->{ $gdb->dbID() } = $self->_process_genome_db($gdb);
  }
}
00182 
00183 =head2 write_output
00184 
00185     Title   :   write_output
00186     Usage   :   $self->write_output
00187     Function:   Writes the display labels/members back to the Compara DB
00188     Returns :   none
00189     Args    :   none
00190 
00191 =cut
00192 
# Hands every GenomeDB held in the 'genome_dbs' parameter to
# _update_display_labels(), one at a time.
sub write_output {
  my $self = shift;

  my $genome_dbs = $self->param('genome_dbs');

  for my $gdb (@$genome_dbs) {
    $self->_update_display_labels($gdb);
  }
}
00201 
00202 
00203 #--- Generic Logic
00204 
00205 
# Collects the members of one GenomeDB whose display label should be written.
#
# Both member sources (ENSEMBLGENE and ENSEMBLPEP) are inspected. A source is
# skipped when 'replace' is off and no member of that source lacks a label.
#
# Arg     : the GenomeDB to process
# Returns : array ref of members with their new display label already set,
#           or nothing (bare return) when the GenomeDB has no core adaptor
# Throws  : when there is no core adaptor and 'die_if_no_core_adaptor' is set
sub _process_genome_db {
    my ($self, $genome_db) = @_;

    my $name = $genome_db->name();
    my $replace = $self->param('replace');

    print "Processing ${name}\n" if $self->debug();

    if(!$genome_db->db_adaptor()) {
        throw('Cannot get an adaptor for GenomeDB '.$name) if $self->param('die_if_no_core_adaptor');
        return;
    }

    my @members_to_update;
    my @sources = qw(ENSEMBLGENE ENSEMBLPEP);
    foreach my $source_name (@sources) {
      print "Working with ${source_name}\n" if $self->debug();
      # $replace is tested first: when it is set the source is processed
      # regardless, so the COUNT query would be wasted work.
      if(!$replace && !$self->_need_to_process_genome_db_source($genome_db, $source_name)) {
        if($self->debug()) {
          print "No need to update as all members for ${name} and source ${source_name} have display labels\n";
        }
        next;
      }
      my $results = $self->_process($genome_db, $source_name);
      push(@members_to_update, @{$results});
    }

    return \@members_to_update;
}
00235 
# For one GenomeDB and one member source, pairs up compara members with the
# display labels found in the core database.
#
# Returns an array ref of the members whose display_label() was set here.
# Members are left alone when they already carry a label (unless 'replace'
# is on) or when the core lookup has no entry for their stable ID.
sub _process {
  my ($self, $genome_db, $source_name) = @_;

  my $replace = $self->param('replace');
  my $members = $self->_get_members_by_source($genome_db, $source_name);

  # Nothing to do for this source: report (in debug mode) and leave early
  unless (%{$members}) {
    my $name = $genome_db->name();
    print "No members found for ${name} and ${source_name}\n" if $self->debug();
    return [];
  }

  my $core_labels = $self->_get_display_label_lookup($genome_db, $source_name);

  my @changed;
  for my $stable_id (keys %{$members}) {
    my $member = $members->{$stable_id};

    # An existing label is only overwritten when replacing was requested
    my $has_label = defined $member->display_label();
    next if $has_label && !$replace;

    # No core object (or no label) for this stable ID
    my $label = $core_labels->{$stable_id};
    next unless defined $label;

    $member->display_label($label);
    push @changed, $member;
  }

  return \@changed;
}
00266 
# Counts the members of the given GenomeDB and source whose display_label
# is NULL in the compara member table. The count doubles as a boolean:
# zero means every member of that source already has a label.
sub _need_to_process_genome_db_source {
    my ($self, $genome_db, $source_name) = @_;

    my $helper = Bio::EnsEMBL::Utils::SqlHelper->new(
        -DB_CONNECTION => $self->compara_dba()->dbc(),
    );

    return $helper->execute_single_result(
        -SQL    => q{select count(*) from member 
where genome_db_id =? and display_label is null and source_name =?},
        -PARAMS => [ $genome_db->dbID(), $source_name ],
    );
}
00275 
# Fetches all members of one source for one GenomeDB through the
# MemberAdaptor and returns them as a hash ref keyed by stable ID.
sub _get_members_by_source {
    my ($self, $genome_db, $source_name) = @_;

    my $adaptor = $self->compara_dba()->get_MemberAdaptor();

    # Constraint is appended to the adaptor's generic member query
    my $constraint = sprintf(
        q{m.source_name = '%s' and m.genome_db_id = %s},
        $source_name, $genome_db->dbID(),
    );
    my $fetched = $adaptor->_generic_fetch($constraint);

    my %member_for;
    for my $m (@{$fetched}) {
        $member_for{ $m->stable_id() } = $m;
    }

    return \%member_for;
}
00288 
# Queries the core database attached to the GenomeDB and returns a hash ref
# mapping stable ID => xref display label for the requested member source.
#
# ENSEMBLGENE uses the gene's display xref; ENSEMBLPEP maps translation
# stable IDs to the display xref of the owning transcript. Both queries are
# restricted to the adaptor's species_id via coord_system.
sub _get_display_label_lookup {
  my ($self, $genome_db, $source_name) = @_;

  # One hand-written query per supported member source
  my %sql_for = (
      'ENSEMBLGENE'  => q{select gsi.stable_id, x.display_label 
from gene_stable_id gsi 
join gene g using (gene_id)  
join xref x on (g.display_xref_id = x.xref_id) 
join seq_region sr on (g.seq_region_id = sr.seq_region_id) 
join coord_system cs using (coord_system_id) 
where cs.species_id =?},
      'ENSEMBLPEP'   => q{select tsi.stable_id, x.display_label 
from translation_stable_id tsi 
join translation tr using (translation_id) 
join transcript t using (transcript_id) 
join xref x on (t.display_xref_id = x.xref_id) 
join seq_region sr on (t.seq_region_id = sr.seq_region_id) 
join coord_system cs using (coord_system_id) 
where cs.species_id =?},
  );

  my $core_dba = $genome_db->db_adaptor();
  my $helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $core_dba->dbc());

  my $query = $sql_for{$source_name};
  my $bind_values = [ $core_dba->species_id() ];

  return $helper->execute_into_hash( -SQL => $query, -PARAMS => $bind_values );
}
00319 
# Writes the display labels collected by run() for one GenomeDB back to the
# compara member table.
#
# The members are read from the 'results' parameter (keyed by GenomeDB dbID).
# All UPDATE statements are issued through one prepared statement inside a
# single SqlHelper transaction.
#
# Returns the sum of the values returned by execute() for each member, or
# nothing (bare return) when there is nothing to write.
sub _update_display_labels {
    my ($self, $genome_db) = @_;

    my $name = $genome_db->name();
    my $members = $self->param('results')->{$genome_db->dbID()};

    # $members is undef when the GenomeDB was skipped (no core adaptor)
    if(! defined $members || scalar(@{$members}) == 0) {
      print "No members to write back for ${name}\n" if $self->debug();
      return;
    }

    print "Writing members out for ${name}\n" if $self->debug();

    my $total = 0;

    my $h = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $self->compara_dba()->dbc());

    $h->transaction( -CALLBACK => sub {
      my $sql = 'update member set display_label =? where member_id =?';
      $h->batch(
       -SQL => $sql,
       -CALLBACK => sub {
         my ($sth) = @_;
         foreach my $member (@{$members}) {
           my $updated = $sth->execute($member->display_label(), $member->dbID());
           $total += $updated;
         }
         return;
       }
      );
    });

    # The statement is an UPDATE, so report rows as updated rather than inserted
    print "Updated ${total} member(s) for ${name}\n" if $self->debug();

    return $total;
}
00356 
00357 1;