Archive Ensembl HomeArchive Ensembl Home
FindSplitGenesOnTree.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::FindSplitGenesOnTree
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $find_split_genes = Bio::EnsEMBL::Compara::RunnableDB::FindSplitGenesOnTree->new
00018   (
00019    -db         => $db,
00020    -input_id   => $input_id,
00021    -analysis   => $analysis
00022   );
00023 $find_split_genes->fetch_input(); #reads from DB
00024 $find_split_genes->run();
00025 $find_split_genes->output();
00026 $find_split_genes->write_output(); #writes to DB
00027 
00028 =cut
00029 
00030 
00031 =head1 DESCRIPTION
00032 
00033 This Analysis will take a protein tree id and calcul the species intersection score, alignment overlap score, overlap, etc. 
00034 for each possible pair of split genes 
00035 
00036 =cut
00037 
00038 
00039 =head1 CONTACT
00040 
00041   Contact Thomas Maurel on module implementation/design detail: maurel@ebi.ac.uk
00042   Contact Javier Herrero on Split/partial genes in general: jherrero@ebi.ac.uk
00043 
00044 =cut
00045 
00046 
00047 =head1 APPENDIX
00048 
00049 The rest of the documentation details each of the object methods. 
00050 Internal methods are usually preceded with a _
00051 
00052 =cut
00053 
00054 
00055 package Bio::EnsEMBL::Compara::RunnableDB::FindSplitGenesOnTree;
00056 
00057 use strict;
00058 
00059 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00060 
00061 sub fetch_input {
00062   my $self = shift @_;
00063 
00064   my $protein_tree_id      = $self->param('protein_tree_id') or die "'protein_tree_id' is an obligatory parameter";
00065   my $protein_tree_adaptor = $self->compara_dba->get_ProteinTreeAdaptor();
00066       # if fetch_node_by_node_id is insufficient, try fetch_tree_at_node_id
00067   my $protein_tree         = $protein_tree_adaptor->fetch_node_by_node_id($protein_tree_id) or die "Could not fetch protein_tree by id=$protein_tree_id";
00068   $self->param('protein_tree', $protein_tree);
00069   my $homology_adaptor     = $self->compara_dba->get_HomologyAdaptor();
00070   my $homologies           = $homology_adaptor->fetch_all_by_tree_node_id($protein_tree_id);
00071   $self->param('homologies', $homologies);
00072 
00073   $self->dbc->disconnect_when_inactive(1);
00074 }
00075 
00076 
00077 sub run {
00078   my $self = shift @_;
00079 
00080   my $protein_tree = $self->param('protein_tree');
00081   my $homologies   = $self->param('homologies');
00082   my $kingdom      = $self->param('kingdom') or '(none)';
00083   my @output_ids = ();
00084 
00085         # That will return a reference to an array with all homologies (orthologues in
00086         # other species and paralogues in the same one)
00087         # And then only split gene are kept in a array called @homologie_split_gene
00088         my @homologies_split_gene=();
00089         foreach my $homologies_search (@{$homologies}) 
00090         {
00091           if ($homologies_search->description =~ /gene_split/)
00092           {
00093             push (@homologies_split_gene,$homologies_search);
00094           } 
00095         }
00096  
00097 
00098         my %pairs_of_split_genes = ();
00099         foreach my $homology (@homologies_split_gene)
00100         {
00101         # You will find different kind of description
00102         # UBRH, MBRH, RHS, YoungParalogues
00103         # see ensembl-compara/docs/docs/schema_doc.html for more details
00104           my ($member1, $member2) = @{$homology->get_all_Members};
00105           if(defined $member1)
00106           {
00107             $pairs_of_split_genes{$member1->stable_id}{$member2->stable_id} = 1;
00108             $pairs_of_split_genes{$member2->stable_id}{$member1->stable_id} = 1;
00109           }
00110         }
00111 
00112       # get all leaves,  all proteins of the tree
00113       my @aligned_members = @{$protein_tree->get_all_leaves};                                                                                                                                          
00114       # Group aligned members per species
00115       my $aligned_member_per_species = {};
00116       for (my $i = 0; $i < @aligned_members; $i++) 
00117       {
00118         push(@{$aligned_member_per_species->{$aligned_members[$i]->genome_db->name}}, $aligned_members[$i]);
00119       }                                     
00120                                            
00121       # Study each set of aligned members in turn                                                          
00122       foreach  my $these_aligned_members (values %$aligned_member_per_species)                            
00123       {                                                                                              
00124                                                                                                   
00125       for (my $i = 0; $i < @$these_aligned_members; $i++)                                          
00126       {                                                                                          
00127           my $score = 0;                                                                      
00128           my $first_aa_length =0;                                                              
00129           my $length =0;                                                                    
00130           my $first_aa=0;                                                                    
00131           my $overlap = 0;                                                                 
00132           my $is_a_split_gene=0;                                                         
00133           my $perc_confidence=0;                                                          
00134           my $unknown_aa_seq = 0;                                                      
00135           my $unknown2_aa_seq =0;                                                       
00136           my $possible_split_gene=0;
00137           my $ancestor =0;                                                            
00138           my $duplication_confidence_score=0;                                        
00139           my $duplication_confidence_score_rounded=0;                              
00140           my $isec = 0;                                                             
00141           my $union = 0;                                                          
00142           my $msplit= "NA";                                                     
00143                                                                                  
00144           for (my $j = $i + 1 ; $j < @$these_aligned_members; $j++)           
00145           {                                                                    
00146               #used of a subroutine compare_aligned_members to compare proteins with each others and find shortests.  
00147               ($score,$first_aa_length,$length, $first_aa,$overlap, $unknown_aa_seq,$unknown2_aa_seq)=compare_aligned_members(@$these_aligned_members[$i], @$these_aligned_members[$j]); 
00148                                                                                                                      
00149               #Fetch Gene Stable ID of the possible split gene                                                      
00150               $possible_split_gene= @$these_aligned_members[$j];
00151               # get the ancestor of two aligned members                                                            
00152               $ancestor =$these_aligned_members->[$i]->find_first_shared_ancestor(@$these_aligned_members[$j]);   
00153                                                                                                                   
00154                                                                                                                 
00155             #Result of a subroutine checking if those two aligned members are referenced as a split gene in the homology table 
00156             $is_a_split_gene = 0;
00157             if ($pairs_of_split_genes{$these_aligned_members->[$i]->gene_member->stable_id}{$these_aligned_members->[$j]->gene_member->stable_id}) {
00158               $is_a_split_gene = 1;
00159             }                  
00160 
00161             #Concatenate Number of unknonwn aa on two partial genes matching                                      
00162             #$final_unknown_aa_seq=$final_unknown_aa_seq."X".$final_unknown2_aa_seq."X";                        
00163                                                                                                                  
00164             #Result of a subroutine that will get the percentage of confidence of the duplication node         
00165             ($duplication_confidence_score,$isec,$union,$msplit) = duplication_confidence_score($protein_tree,$ancestor,@$these_aligned_members[$i],$possible_split_gene);                       
00166                                                                                                               
00167             #Round up the duplication confidence score.                                                      
00168             $duplication_confidence_score_rounded = sprintf("%.3f",$duplication_confidence_score);          
00169        
00170 
00171            #Push all the result into an array
00172             push @output_ids, {
00173               'tagged_as_split_gene_by_gene_tree_pipeline' => $is_a_split_gene,
00174               'overlap' => $overlap,
00175               'score_inter_union' => int($score*1000)/10,
00176               'first_aa_prot' => $first_aa,
00177               'unknown_aa_prot1' => $unknown_aa_seq,
00178               'unknown_aa_prot2' => $unknown2_aa_seq,
00179               'rounded_duplication_confidence_score' => $duplication_confidence_score_rounded,
00180               'intersection_duplication_score' => $isec,
00181               'union_duplication_confidence_score' => $union,
00182               'merged_by_gene_tree_pipeline' => $msplit,
00183               'chr_name' => @$these_aligned_members[$i]->chr_name,
00184               'chr_strand' => @$these_aligned_members[$i]->chr_strand,
00185               'first_part_split_gene_stable_id' => @$these_aligned_members[$i]->gene_member->stable_id,
00186               'second_part_split_gene_stable_id' => $possible_split_gene->gene_member->stable_id,
00187               'protein1_label' => (@$these_aligned_members[$i]->gene_member->display_label or "NA"),
00188               'protein1_length_in_aa' => $first_aa_length,
00189               'alignment_length' => $length,
00190               'species_name' => @$these_aligned_members[$i]->genome_db->name,
00191               'kingdom' => $kingdom,
00192             };
00193           }           
00194         }
00195       }
00196       $self->param('output_ids', \@output_ids);
00197 }
00198 
00199 sub write_output {
00200   my $self = shift @_;
00201 
00202   my $output_ids = $self->param('output_ids');
00203 
00204   $self->dbc->disconnect_when_inactive(0);
00205 
00206   $self->dataflow_output_id($output_ids, 3);
00207 }
00208 
00209 
00210 #Subroutine witch compare two proteins and return a score, lower score is for a shorter protein and highest for a longest
00211 sub compare_aligned_members {
00212     my ($first_aligned_member, $second_aligned_member) = @_;
00213     my $first_alignment_string = $first_aligned_member->alignment_string;
00214     my $second_alignment_string = $second_aligned_member->alignment_string;
00215 
00216     #Compare the length of two proteins
00217     my $length = length($first_alignment_string);
00218     die "Whoohoohoho" if ($length != length($second_alignment_string));
00219 
00220     #initialise variables
00221     my $union = 0;
00222     my $intersection = 0;
00223     my $second_aa_length = 0;
00224     my $first_aa_length =0 ;
00225     my $first_aa = 0;
00226     my $overlap=1000000;
00227     my $first_aa_boolean =0;
00228     my $unknown_seq =0;
00229     my $unknown2_seq = 0;
00230     my $unknown_aa_seq = 0;
00231     my $unknown2_aa_seq =0;
00232 
00233     # PROT1:  ATGDSGDFAS----DFS---GERGEW------
00234     # PROT2:  --AGFGJEJSGSDHJKYITSDEWRW-------
00235     # inter   00111111110000111000111110000000 => 16 aa
00236     # union   11111111111111111111111111000000 => 26 aa
00237     # score = 16/26
00238 
00239     # PROT1:  ATGDSGDFASFGDWDFS---GERGEW------
00240     # PROT2:  -----------------YITSDEWRW------
00241     # inter   00000000000000000000111111000000 => 6 aa
00242     # union   11111111111111111111111111000000 => 26 aa
00243     # score = 6/26
00244 
00245     # We want to know if prot1 is shorter -> compare intersection with second_aa_length
00246     
00247     for (my $i = 0; $i <= $length - 1; $i++) {
00248     #substr used to cut the string in pieces, to get each amino acid.
00249     my $first = substr($first_alignment_string, $i, 1);
00250     my $second = substr($second_alignment_string, $i, 1);
00251     #if the character of the second protein is not a gap "_" increment second_aa_length.
00252     if ($second ne "-") {
00253         $second_aa_length++;
00254     }
00255     if ($first ne "-"){
00256         $first_aa_length++;
00257     }
00258     if ($first ne "-" and $second ne "-") {
00259         $union++;
00260         $intersection++;
00261     } elsif ($first ne "-" or $second ne "-") {
00262         $union++;
00263     }
00264     if (($first ne "-") and $first_aa_boolean == 0 )
00265     {
00266         $first_aa = $first;
00267         $first_aa_boolean = 1;
00268     }
00269     #if the Protein aa is Unknown
00270     if ($first eq "X")
00271     {
00272         $unknown_seq++;
00273     }
00274     #same for the other gene
00275     if ($second eq "X")
00276     {
00277         $unknown2_seq++;
00278     }
00279     }   
00280     # score is the number of intersection over the number of union.
00281     my $score = $intersection / $union;
00282 
00283     # Check if two members of the same species are overlapping, throw the nomber of overlap 
00284     if(($first_aligned_member->genome_db->name eq $second_aligned_member->genome_db->name))
00285     {
00286     $overlap=$intersection; 
00287     $unknown_aa_seq = $unknown_seq;
00288     $unknown2_aa_seq = $unknown2_seq;
00289     }
00290     return ($score,$first_aa_length,$length,$first_aa,$overlap,$unknown_aa_seq,$unknown2_aa_seq);
00291 }
00292 
00293 sub duplication_confidence_score {   
00294     
00295     my ($protein_tree,$ancestor,$aligned_members_i,$possible_split_gene)=@_;
00296     # This assumes bifurcation!!! No multifurcations allowed 
00297     my ($child_a, $child_b, $dummy) = @{$ancestor->children}; 
00298     throw("tree is multifurcated in duplication_confidence_score\n") if (defined($dummy)); 
00299     my $child_a_id =$aligned_members_i->stable_id;
00300     my $child_b_id =$possible_split_gene->stable_id;
00301     my @child_a_gdbs = keys %{get_ancestor_species_hash($protein_tree,$child_a)}; 
00302     #return undef if (!defined($child_b));
00303     my @child_b_gdbs = keys %{get_ancestor_species_hash($protein_tree,$child_b)}; 
00304     #$DB::single=1;1;
00305     my %seen = ();my @gdb_a = grep { ! $seen{$_} ++ } @child_a_gdbs;
00306     %seen = (); my @gdb_b = grep { ! $seen{$_} ++ } @child_b_gdbs; 
00307     my @isect = my @diff = my @union = ();my %count;
00308     foreach my $e (@gdb_a, @gdb_b) { $count{$e}++ } 
00309     foreach my $e (keys %count) 
00310     {  
00311     push(@union, $e); push @{ $count{$e} == 2 ? \@isect : \@diff }, $e;
00312     } 
00313     my $duplication_confidence_score = 0;
00314     my $scalar_isect = scalar(@isect);
00315     my $scalar_union = scalar(@union); 
00316     my $ancestor_node_id="";
00317     $duplication_confidence_score = (($scalar_isect)/$scalar_union) unless (0 == $scalar_isect);
00318     my $msplit="NA"; 
00319     #Check if gene  was already merged by the pipeline
00320     $msplit = "msplit_${child_a_id}_${child_b_id}" if $protein_tree->has_tag("msplit_${child_a_id}_${child_b_id}");
00321     $msplit = "msplit_${child_b_id}_${child_a_id}" if $protein_tree->has_tag("msplit_${child_b_id}_${child_a_id}");
00322 
00323     # subroutine won't tried to write on the database, it's not the point here.
00324     $protein_tree->{_readonly} = 1;
00325     $ancestor->store_tag ( 
00326                "duplication_confidence_score", 
00327                $duplication_confidence_score 
00328                ) unless ($protein_tree->{'_readonly'});
00329     my $rounded_duplication_confidence_score = (int((100.0 * $scalar_isect / $scalar_union + 0.5))); 
00330     my $species_intersection_score = $ancestor->get_tagvalue("species_intersection_score");
00331     unless (defined($species_intersection_score)) 
00332     { 
00333     $ancestor_node_id = $ancestor->node_id;
00334     #warn("Difference in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] whereas species_intersection_score [$species_intersection_score] is undefined in njtree - ancestor $ancestor_node_id\n");
00335     return ($duplication_confidence_score,$scalar_isect,$scalar_union,$msplit);
00336     } 
00337     if ($species_intersection_score ne $rounded_duplication_confidence_score && !defined($protein_tree->{_readonly})) 
00338     { 
00339     $ancestor_node_id = $ancestor->node_id;
00340     $protein_tree->throw("Inconsistency in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] != species_intersection_score [$species_intersection_score] - $ancestor_node_id\n");
00341     } 
00342     return ($duplication_confidence_score,$scalar_isect,$scalar_union,$msplit);
00343 
00344 }
00345 
00346 sub get_ancestor_species_hash {
00347     my ($protein_tree,$node)=@_;
00348     my $species_hash=0;
00349     $species_hash = $node->get_tagvalue('species_hash');
00350     return ($species_hash) if($species_hash); 
00351     $species_hash = {}; 
00352     my $duplication_hash = {};
00353     my $is_dup=0; 
00354     if($node->isa('Bio::EnsEMBL::Compara::GeneTreeMember')) 
00355     { 
00356     my $node_genome_db_id = $node->genome_db_id;
00357     $species_hash->{$node_genome_db_id} = 1; 
00358     $node->add_tag('species_hash', $species_hash);
00359     return ($species_hash);
00360     } 
00361     foreach my $child (@{$node->children}) 
00362     { 
00363     my $t_species_hash = get_ancestor_species_hash($protein_tree,$child);
00364     next unless(defined($t_species_hash)); 
00365         #shouldn't happen 
00366     foreach my $genome_db_id (keys(%$t_species_hash)) 
00367     {
00368         unless(defined($species_hash->{$genome_db_id})) 
00369         { 
00370         $species_hash->{$genome_db_id} = $t_species_hash->{$genome_db_id};
00371         } 
00372         else 
00373         { #this species already existed in one of the other children 
00374               #this means this species was duplicated at this point between 
00375               #the species 
00376         $is_dup=1;
00377         $duplication_hash->{$genome_db_id} = 1; 
00378         $species_hash->{$genome_db_id} += $t_species_hash->{$genome_db_id};
00379         } 
00380     } 
00381     } 
00382     $node->add_tag("species_hash", $species_hash); 
00383     if($is_dup && !($protein_tree->{'_treefam'})) 
00384     { 
00385       if ($node->get_tagvalue('node_type', '') eq 'speciation')
00386     { 
00387             # RAP did not predict a duplication here 
00388         $node->add_tag('duplication_hash', $duplication_hash);
00389         $node->store_tag('node_type', 'duplication') unless ($protein_tree->{'_readonly'});
00390     } 
00391     } 
00392     return ($species_hash); 
00393 } 
00394 
00395 1;