Archive Ensembl HomeArchive Ensembl Home
LoadMembers.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006     Bio::EnsEMBL::Compara::RunnableDB::LoadMembers
00007 
00008 =cut
00009 
00010 =head1 SYNOPSIS
00011 
00012         # load reference peptide+gene members of a particular genome_db (mouse)
00013     standaloneJob.pl LoadMembers.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_loadmembers" -genome_db_id 57
00014 
00015         # load nonreference peptide+gene members of a particular genome_db (human)
00016     standaloneJob.pl LoadMembers.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_loadmembers" -genome_db_id 90 -include_nonreference 1 -include_reference 0
00017 
00018         # load reference coding exon members of a particular genome_db (rat)
00019     standaloneJob.pl LoadMembers.pm -compara_db "mysql://ensadmin:${ENSADMIN_PSW}@compara2/lg4_test_loadmembers" -genome_db_id 3 -coding_exons 1 -min_length 20
00020 
00021 =cut
00022 
00023 =head1 DESCRIPTION
00024 
00025 This RunnableDB works in two major modes, depending on the trueness of 'coding_exons' parameter.
00026 
00027 ProteinTree pipeline uses this module with $self->param('coding_exons') set to false.
00028 Which is a request to load peptide+gene members from a particular core database defined by $self->param('genome_db_id').
00029 
00030 MercatorPecan pipeline uses this module with $self->param('coding_exons') set to true.
00031 Which is a request to load coding exon members from a particular core database defined by $self->param('genome_db_id').
00032 
00033 You can also choose whether you want your members (peptides or coding exons) extracted from reference slices, nonreference slices (including LRGs) or both
00034 by using -include_reference <0|1> and -include_nonreference <0|1> parameters.
00035 
00036 =cut
00037 
00038 =head1 CONTACT
00039 
00040 Contact anybody in Compara.
00041 
00042 =cut
00043 
00044 package Bio::EnsEMBL::Compara::RunnableDB::LoadMembers;
00045 
00046 use strict;
00047 use warnings;
00048 
00049 use Bio::EnsEMBL::Compara::Member;
00050 use Bio::EnsEMBL::Compara::Subset;
00051 
00052 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00053 
00054 
00055 sub param_defaults {
00056     return {
00057         'include_reference'     => 1,
00058         'include_nonreference'  => 0,
00059         'store_genes'           => 1, # whether the genes are also stored as members
00060     };
00061 }
00062 
00063 
00064 sub fetch_input {
00065     my $self = shift @_;
00066 
00067         # not sure if this can be done directly in param_defaults because of the order things get initialized:
00068     unless(defined($self->param('verbose'))) {
00069         $self->param('verbose', $self->debug == 2);
00070     }
00071 
00072     my $genome_db_id = $self->param('genome_db_id')
00073         or die "'genome_db_id' is an obligatory parameter";
00074 
00075     my $compara_dba = $self->compara_dba();
00076 
00077         #get the Compara::GenomeDB object for the genome_db_id:
00078     my $genome_db = $self->param('genome_db', $compara_dba->get_GenomeDBAdaptor->fetch_by_dbID($genome_db_id) )
00079         or die "Can't fetch the genome_db object (gdb_id=$genome_db_id) from Compara";
00080   
00081         #using genome_db_id, connect to external core database:
00082     $self->param('core_dba', $genome_db->db_adaptor() )
00083         or die "Can't connect to external core database for gdb=$genome_db_id";
00084 
00085     my $genome_db_name = $genome_db->name;
00086 
00087         # this one will be used later for dataflow:
00088     $self->param('per_genome_suffix', $genome_db_name.'_'.$genome_db_id );
00089 
00090     unless($self->param('include_reference') or $self->param('include_nonreference')) {
00091         die "Either 'include_reference' or 'include_nonreference' or both have to be true";
00092     }
00093 
00094     my $ref_nonref = join('+', ($self->param('include_reference') ? ('ref') : ()), ($self->param('include_nonreference') ? ('nonref') : ()));
00095 
00096     if ($self->param('coding_exons')) {
00097 
00098         $self->param('exonSubset', Bio::EnsEMBL::Compara::Subset->new(
00099           -name=>"gdb:$genome_db_id $genome_db_name $ref_nonref coding exons") );
00100 
00101             # This does an INSERT IGNORE or a SELECT if already exists:
00102         $compara_dba->get_SubsetAdaptor->store($self->param('exonSubset'));
00103 
00104     } else {
00105 
00106         $self->param('pepSubset', Bio::EnsEMBL::Compara::Subset->new(
00107           -name=>"gdb:$genome_db_id $genome_db_name $ref_nonref canonical translations") );
00108 
00109         $self->param('geneSubset', Bio::EnsEMBL::Compara::Subset->new(
00110           -name=>"gdb:$genome_db_id $genome_db_name $ref_nonref genes") );
00111 
00112             # This does an INSERT IGNORE or a SELECT if already exists:
00113         $compara_dba->get_SubsetAdaptor->store($self->param('pepSubset'));
00114         $compara_dba->get_SubsetAdaptor->store($self->param('geneSubset'));
00115     }
00116 }
00117 
00118 
00119 sub run {
00120     my $self = shift @_;
00121 
00122     my $compara_dba = $self->compara_dba();
00123     my $core_dba    = $self->param('core_dba');
00124 
00125     $compara_dba->dbc->disconnect_when_inactive(0);
00126     $core_dba->dbc->disconnect_when_inactive(0);
00127 
00128     my $unfiltered_slices = $core_dba->get_SliceAdaptor->fetch_all('toplevel', $self->param('include_nonreference') ? (undef, 1, undef, 1) : ());
00129     die "Could not fetch any toplevel slices from ".$core_dba->dbc->dbname() unless(scalar(@$unfiltered_slices));
00130 
00131     my $slices = $self->param('include_reference')
00132                     ? $unfiltered_slices
00133                     : [ grep { not $_->is_reference() } @$unfiltered_slices ];
00134 
00135     if(scalar(@$slices)) {
00136 
00137         $self->loadMembersFromCoreSlices( $slices );
00138 
00139     } else {
00140 
00141         $self->warning("No suitable toplevel slices found in ".$core_dba->dbc->dbname());
00142     }
00143 }
00144 
00145 
00146 sub write_output {
00147     my $self = shift @_;
00148 
00149     $self->dataflow_output_id( {
00150         'genome_db_id'      => $self->param('genome_db_id'),
00151         'reuse_this'        => 0,
00152         'subset_id'         => $self->param('coding_exons')
00153                                 ? $self->param('exonSubset')->dbID  # MercatorPecan pipeline
00154                                 : $self->param('pepSubset')->dbID,  # ProteinTrees pipeline
00155         'per_genome_suffix' => $self->param('per_genome_suffix'),
00156     } , 1);
00157 }
00158 
00159 
00160 ######################################
00161 #
00162 # subroutines
00163 #
00164 #####################################
00165 
00166 
00167 sub loadMembersFromCoreSlices {
00168     my ($self, $slices) = @_;
00169 
00170         # initialize internal counters for tracking success of process:
00171     $self->param('sliceCount',      0);
00172     $self->param('geneCount',       0);
00173     $self->param('realGeneCount',   0);
00174     $self->param('transcriptCount', 0);
00175 
00176   #from core database, get all slices, and then all genes in slice
00177   #and then all transcripts in gene to store as members in compara
00178 
00179   my @genes;
00180 
00181   SLICE: foreach my $slice (@$slices) {
00182     $self->param('sliceCount', $self->param('sliceCount')+1 );
00183     #print("slice " . $slice->name . "\n");
00184 
00185     @genes = ();
00186     my $current_end;
00187 
00188     foreach my $gene (sort {$a->start <=> $b->start} @{$slice->get_all_Genes}) {
00189       $self->param('geneCount', $self->param('geneCount')+1 );
00190       # LV and C are for the Ig/TcR family, which rearranges
00191       # somatically so is considered as a different biotype in EnsEMBL
00192       # D and J are very short or have no translation at all
00193 
00194       if (defined $self->param('coding_exons')) {
00195           $current_end = $gene->end unless (defined $current_end);
00196           if((lc($gene->biotype) eq 'protein_coding')) {
00197               $self->param('realGeneCount', $self->param('realGeneCount')+1 );
00198               if ($gene->start <= $current_end) {
00199                   push @genes, $gene;
00200                   $current_end = $gene->end if ($gene->end > $current_end);
00201               } else {
00202                   $self->store_all_coding_exons(\@genes);
00203                   @genes = ();
00204                   $current_end = $gene->end;
00205                   push @genes, $gene;
00206               }
00207           }
00208       } else {
00209           if (   lc($gene->biotype) eq 'protein_coding'
00210              || lc($gene->biotype) eq 'ig_v_gene'
00211              || lc($gene->biotype) eq 'ig_c_gene'
00212              #         || lc($gene->biotype) eq 'polymorphic_pseudogene'     # lg4: not sure if this biotype is ok, as it has a stop codon in the middle
00213              ) {
00214               $self->param('realGeneCount', $self->param('realGeneCount')+1 );
00215               
00216               $self->store_gene_and_all_transcripts($gene);
00217               
00218               print STDERR $self->param('realGeneCount') , " genes stored\n" if ($self->debug && (0 == ($self->param('realGeneCount') % 100)));
00219           }
00220       }
00221     } # foreach
00222 
00223     if ($self->param('coding_exons')) {
00224         $self->store_all_coding_exons(\@genes);
00225     }
00226   }
00227 
00228   print("loaded ".$self->param('sliceCount')." slices\n");
00229   print("       ".$self->param('geneCount')." genes\n");
00230   print("       ".$self->param('realGeneCount')." real genes\n");
00231   print("       ".$self->param('transcriptCount')." transcripts\n");
00232 
00233   if($self->param('coding_exons')) {
00234       print("       ".$self->param('exonSubset')->count()." in exonSubset\n");
00235   } else {
00236       print("       ".$self->param('pepSubset')->count()." in pepSubset\n");
00237   }
00238 }
00239 
00240 
00241 sub store_gene_and_all_transcripts {
00242   my $self = shift;
00243   my $gene = shift;
00244 
00245   my $member_adaptor = $self->compara_dba->get_MemberAdaptor();
00246   
00247   my @canonicalPeptideMember;
00248   my $gene_member;
00249   my $gene_member_not_stored = 1;
00250 
00251   if(defined($self->param('pseudo_stableID_prefix'))) {
00252     $gene->stable_id($self->param('pseudo_stableID_prefix') ."G_". $gene->dbID);
00253   }
00254 
00255   my $canonical_transcript; my $canonical_transcript_stable_id;
00256   eval {
00257     $canonical_transcript = $gene->canonical_transcript;
00258     $canonical_transcript_stable_id = $canonical_transcript->stable_id;
00259   };
00260   if (!defined($canonical_transcript) && !defined($self->param('force_unique_canonical'))) {
00261     print STDERR "WARN: ", $gene->stable_id, " has no canonical transcript\n" if ($self->debug);
00262     return 1;
00263   }
00264 
00265   foreach my $transcript (@{$gene->get_all_Transcripts}) {
00266     my $translation = $transcript->translation;
00267     next unless (defined $translation);
00268 
00269     if (!defined($self->param('force_unique_canonical'))) {
00270       if ($canonical_transcript->biotype ne $gene->biotype) {
00271         # This can happen when the only transcripts are, e.g., NMDs
00272         print STDERR "INFO: ", $canonical_transcript->stable_id, " biotype ", $canonical_transcript->biotype, " is canonical\n" if ($self->debug);
00273       }
00274     }
00275 #    This test might be useful to put here, thus avoiding to go further in trying to get a peptide
00276 #    my $next = 0;
00277 #    try {
00278 #      $transcript->translate;
00279 #    } catch {
00280 #      warn("COREDB error: transcript does not translate", $transcript->stable_id, "(dbID=",$transcript->dbID.")\n");
00281 #      $next = 1;
00282 #    };
00283 #    next if ($next);
00284 
00285     if(defined($self->param('pseudo_stableID_prefix'))) {
00286       $transcript->stable_id($self->param('pseudo_stableID_prefix') ."T_". $transcript->dbID);
00287       $translation->stable_id($self->param('pseudo_stableID_prefix') ."P_". $translation->dbID);
00288     }
00289 
00290     $self->param('transcriptCount', $self->param('transcriptCount')+1 );
00291     #print("gene " . $gene->stable_id . "\n");
00292     print("     transcript " . $transcript->stable_id ) if($self->param('verbose'));
00293 
00294     unless (defined $translation->stable_id) {
00295       die "CoreDB error: does not contain translation stable id for translation_id ". $translation->dbID;
00296     }
00297 
00298     my $description = $self->fasta_description($gene, $transcript);
00299 
00300     my $pep_member = Bio::EnsEMBL::Compara::Member->new_from_transcript(
00301          -transcript=>$transcript,
00302          -genome_db=>$self->param('genome_db'),
00303          -translate=>'yes',
00304          -description=>$description);
00305 
00306     print(" => member " . $pep_member->stable_id) if($self->param('verbose'));
00307 
00308     unless($pep_member->sequence) {
00309       print "  => NO SEQUENCE for member " . $pep_member->stable_id;
00310       next;
00311     }
00312     print(" len=",$pep_member->seq_length ) if($self->param('verbose'));
00313 
00314     # store gene_member here only if at least one peptide is to be loaded for
00315     # the gene.
00316     if($self->param('store_genes') && $gene_member_not_stored) {
00317       print("     gene       " . $gene->stable_id ) if($self->param('verbose'));
00318       $gene_member = Bio::EnsEMBL::Compara::Member->new_from_gene(
00319                                                                   -gene=>$gene,
00320                                                                   -genome_db=>$self->param('genome_db'));
00321       print(" => member " . $gene_member->stable_id) if($self->param('verbose'));
00322 
00323       my $stable_id = $gene_member->stable_id;
00324       $member_adaptor->store($gene_member);
00325       print(" : stored") if($self->param('verbose'));
00326 
00327       $self->param('geneSubset')->add_member($gene_member);
00328       print("\n") if($self->param('verbose'));
00329       $gene_member_not_stored = 0;
00330     }
00331 
00332     my $stable_id = $pep_member->stable_id;
00333     $member_adaptor->store($pep_member);
00334 
00335     $member_adaptor->store_gene_peptide_link($gene_member->dbID, $pep_member->dbID);
00336     print(" : stored\n") if($self->param('verbose'));
00337 
00338     if(($transcript->stable_id eq $canonical_transcript_stable_id) || defined($self->param('force_unique_canonical'))) {
00339       @canonicalPeptideMember = ($transcript, $pep_member);
00340     }
00341 
00342   }
00343 
00344   if(@canonicalPeptideMember) {
00345     my ($transcript, $member) = @canonicalPeptideMember;
00346     $self->param('pepSubset')->add_member($member);
00347     # print("     LONGEST " . $transcript->stable_id . "\n");
00348   }
00349   return 1;
00350 }
00351 
00352 
00353 sub store_all_coding_exons {
00354   my ($self, $genes) = @_;
00355 
00356   return 1 if (scalar @$genes == 0);
00357 
00358   my $min_exon_length = $self->param('min_length') or die "'min_length' is an obligatory parameter";
00359 
00360   my $member_adaptor = $self->compara_dba->get_MemberAdaptor();
00361   my $genome_db = $self->param('genome_db');
00362   my @exon_members = ();
00363 
00364   foreach my $gene (@$genes) {
00365       #print " gene " . $gene->stable_id . "\n";
00366 
00367     foreach my $transcript (@{$gene->get_all_Transcripts}) {
00368       $self->param('transcriptCount', $self->param('transcriptCount')+1);
00369 
00370       print("     transcript " . $transcript->stable_id ) if($self->param('verbose'));
00371       
00372       foreach my $exon (@{$transcript->get_all_translateable_Exons}) {
00373 #     print "        exon " . $exon->stable_id . "\n";
00374         unless (defined $exon->stable_id) {
00375           warn("COREDB error: does not contain exon stable id for translation_id ".$exon->dbID."\n");
00376           next;
00377         }
00378         my $description = $self->fasta_description($exon, $transcript);
00379         
00380         my $exon_member = new Bio::EnsEMBL::Compara::Member;
00381         $exon_member->taxon_id($genome_db->taxon_id);
00382         if(defined $description ) {
00383           $exon_member->description($description);
00384         } else {
00385           $exon_member->description("NULL");
00386         }
00387         $exon_member->genome_db_id($genome_db->dbID);
00388         $exon_member->chr_name($exon->seq_region_name);
00389         $exon_member->chr_start($exon->seq_region_start);
00390         $exon_member->chr_end($exon->seq_region_end);
00391         $exon_member->chr_strand($exon->seq_region_strand);
00392         $exon_member->version($exon->version);
00393         $exon_member->stable_id($exon->stable_id);
00394         $exon_member->source_name("ENSEMBLEXON");
00395 
00396     #Not sure what this should be but need to set it to something or else the members do not get added
00397     #to the member table in the store method of MemberAdaptor
00398     $exon_member->display_label("NULL");
00399         
00400         my $seq_string = $exon->peptide($transcript)->seq;
00401         ## a star or a U (selenocysteine) in the seq breaks the pipe to the cast filter for Blast
00402         $seq_string =~ tr/\*U/XX/;
00403         if ($seq_string =~ /^X+$/) {
00404           warn("X+ in sequence from exon " . $exon->stable_id."\n");
00405         }
00406         else {
00407           $exon_member->sequence($seq_string);
00408         }
00409 
00410         print(" => member " . $exon_member->stable_id) if($self->param('verbose'));
00411 
00412         unless($exon_member->sequence) {
00413           print("  => NO SEQUENCE!\n") if($self->param('verbose'));
00414           next;
00415         }
00416         print(" len=",$exon_member->seq_length ) if($self->param('verbose'));
00417         next if ($exon_member->seq_length < $min_exon_length);
00418         push @exon_members, $exon_member;
00419       }
00420     }
00421   }
00422   @exon_members = sort {$b->seq_length <=> $a->seq_length} @exon_members;
00423   my @exon_members_stored = ();
00424   while (my $exon_member = shift @exon_members) {
00425     my $not_to_store = 0;
00426     foreach my $stored_exons (@exon_members_stored) {
00427       if ($exon_member->chr_start <=$stored_exons->chr_end &&
00428           $exon_member->chr_end >= $stored_exons->chr_start) {
00429         $not_to_store = 1;
00430         last;
00431       }
00432     }
00433     next if ($not_to_store);
00434     push @exon_members_stored, $exon_member;
00435 
00436     eval {
00437         my $stable_id = $exon_member->stable_id;
00438         #print "New member\n";
00439         $member_adaptor->store($exon_member);
00440         print(" : stored\n") if($self->param('verbose'));
00441     };
00442     $self->param('exonSubset')->add_member($exon_member);
00443   }
00444 }
00445 
00446 
00447 sub fasta_description {
00448   my ($self, $gene, $transcript) = @_;
00449 
00450   my $description = "Transcript:" . $transcript->stable_id .
00451                     " Gene:" .      $gene->stable_id .
00452                     " Chr:" .       $gene->seq_region_name .
00453                     " Start:" .     $gene->seq_region_start .
00454                     " End:" .       $gene->seq_region_end;
00455   return $description;
00456 }
00457 
00458 
00459 1;