Archive Ensembl HomeArchive Ensembl Home
NCOrthoTree.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCOrthoTree
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db    = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $otree = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCOrthoTree->new ( 
00018                                                     -db      => $db,
00019                                                     -input_id   => $input_id,
00020                                                     -analysis   => $analysis );
00021 $otree->fetch_input(); #reads from DB
00022 $otree->run();
00023 $otree->output();
00024 $otree->write_output(); #writes to DB
00025 
00026 =cut
00027 
00028 =head1 DESCRIPTION
00029 
00030 This Analysis/RunnableDB is designed to take a GeneTree as input
00031 
00032 This must already have a rooted tree with duplication/sepeciation tags
00033 on the nodes.
00034 
00035 It analyzes that tree structure to pick Orthologues and Paralogs for
00036 each genepair.
00037 
00038 input_id/parameters format eg: "{'nc_tree_id'=>1234}"
00039     nc_tree_id : use 'id' to fetch a cluster from the GeneTree
00040 
00041 =cut
00042 
00043 =head1 CONTACT
00044 
00045   Contact Albert Vilella on module implementation: avilella@ebi.ac.uk
00046   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00047 
00048 =cut
00049 
00050 =head1 APPENDIX
00051 
00052 The rest of the documentation details each of the object methods. 
00053 Internal methods are usually preceded with a _
00054 
00055 =cut
00056 
00057 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCOrthoTree;
00058 
00059 use strict;
00060 use IO::File;
00061 use File::Basename;
00062 use Time::HiRes qw(time gettimeofday tv_interval);
00063 use Scalar::Util qw(looks_like_number);
00064 
00065 use Bio::EnsEMBL::Compara::Member;
00066 use Bio::EnsEMBL::Compara::Graph::Link;
00067 use Bio::EnsEMBL::Compara::Graph::Node;
00068 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00069 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00070 use Bio::EnsEMBL::Compara::Homology;
00071 use Bio::EnsEMBL::Hive::Utils 'stringify';  # import 'stringify()'
00072 
00073 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00074 
00075 
00076 sub param_defaults {
00077     return {
00078             'tree_scale'            => 1,
00079             'store_homologies'      => 1,
00080             'no_between'            => 0.25, # dont store all possible_orthologs
00081     };
00082 }
00083 
00084 
00085 =head2 fetch_input
00086 
00087     Title   :   fetch_input
00088     Usage   :   $self->fetch_input
00089     Function:   Fetches input data from the database
00090     Returns :   none
00091     Args    :   none
00092 
00093 =cut
00094 
00095 
00096 sub fetch_input {
00097     my $self = shift @_;
00098 
00099     $self->param('treeDBA', $self->compara_dba->get_NCTreeAdaptor);
00100     $self->param('homologyDBA', $self->compara_dba->get_HomologyAdaptor);
00101 
00102     my $starttime = time();
00103     my $tree_id = $self->param('nc_tree_id') or die "'protein_tree_id' is an obligatory parameter";
00104     my $gene_tree = $self->param('treeDBA')->fetch_tree_at_node_id($tree_id) or die "Could not fetch gene_tree with tree_id='$tree_id'";
00105     $self->param('gene_tree', $gene_tree);
00106 
00107     if($self->debug) {
00108         $self->param('gene_tree')->print_tree($self->param('tree_scale'));
00109         printf("time to fetch tree : %1.3f secs\n" , time()-$starttime);
00110     }
00111     unless($self->param('gene_tree')) {
00112         $self->throw("undefined GeneTree as input\n");
00113     }
00114     $self->delete_old_homologies;
00115     $self->delete_old_orthotree_tags;
00116     $self->param('taxon_tree', $self->load_species_tree_from_string( $self->get_species_tree_string ) );
00117 }
00118 
00119 
00120 =head2 run
00121 
00122     Title   :   run
00123     Usage   :   $self->run
00124     Function:   runs OrthoTree
00125     Returns :   none
00126     Args    :   none
00127 
00128 =cut
00129 
00130 
00131 sub run {
00132     my $self = shift @_;
00133 
00134     $self->run_analysis;
00135 }
00136 
00137 
00138 =head2 write_output
00139 
00140     Title   :   write_output
00141     Usage   :   $self->write_output
00142 
00143     Function: parse clustalw output and update homology and
00144               homology_member tables
00145     Returns : none 
00146     Args    : none 
00147 
00148 =cut
00149 
00150 sub write_output {
00151     my $self = shift @_;
00152 
00153     $self->store_homologies;
00154 }
00155 
00156 
00157 sub DESTROY {
00158   my $self = shift;
00159 
00160   if($self->param('gene_tree')) {
00161     printf("OrthoTree::DESTROY  releasing gene_tree\n") if($self->debug);
00162     $self->param('gene_tree')->release_tree;
00163     $self->param('gene_tree', undef);
00164   }
00165   if($self->param('taxon_tree')) {
00166     printf("OrthoTree::DESTROY  releasing taxon_tree\n") if($self->debug);
00167     $self->param('taxon_tree')->release_tree;
00168     $self->param('taxon_tree', undef);
00169   }
00170 
00171   $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
00172 }
00173 
00174 
00175 ##########################################
00176 #
00177 # internal methods
00178 #
00179 ##########################################
00180 
00181 
00182 sub run_analysis {
00183   my $self = shift;
00184 
00185   my $starttime = time()*1000;
00186   my $tmp_time = time();
00187   my $gene_tree = $self->param('gene_tree');
00188 
00189   print "Getting all leaves\n";
00190   my @all_gene_leaves = @{$gene_tree->get_all_leaves};
00191   my $tree_node_id = $gene_tree->node_id;
00192 
00193   #precalculate the ancestor species_hash (caches into the metadata of
00194   #nodes) also augments the Duplication tagging
00195   printf("Calculating ancestor species hash\n") if ($self->debug);
00196   $self->get_ancestor_species_hash($gene_tree);
00197 
00198   if($self->debug) {
00199     $gene_tree->print_tree($self->param('tree_scale'));
00200     printf("%d genes in tree\n", scalar(@all_gene_leaves));
00201   }
00202 
00203   #compare every gene in the tree with every other each gene/gene
00204   #pairing is a potential ortholog/paralog and thus we need to analyze
00205   #every possibility
00206   #Accomplish by creating a fully connected graph between all the
00207   #genes under the tree (hybrid graph structure) and then analyze each
00208   #gene/gene link
00209   $tmp_time = time();
00210   printf("build fully linked graph\n") if($self->debug);
00211   my @genepairlinks;
00212   my $graphcount = 0;
00213   while (my $gene1 = shift @all_gene_leaves) {
00214     foreach my $gene2 (@all_gene_leaves) {
00215       my $ancestor = $gene1->find_first_shared_ancestor($gene2);
00216       # Line below will is a speed-up of the find_first_shared_ancestor, but relies on root_id/clusterset_id schema
00217       # my $ancestor = $self->param('treeDBA')->fetch_first_shared_ancestor_indexed($gene1,$gene2);
00218       my $taxon_level = $self->get_ancestor_taxon_level($ancestor);
00219       my $distance = $gene1->distance_to_ancestor($ancestor) +
00220                      $gene2->distance_to_ancestor($ancestor);
00221       my $genepairlink = new Bio::EnsEMBL::Compara::Graph::Link
00222         (
00223          $gene1, $gene2, $distance
00224         );
00225       $genepairlink->add_tag("hops", 0);
00226       $genepairlink->add_tag("ancestor", $ancestor);
00227       $genepairlink->add_tag("taxon_name", $taxon_level->name);
00228       $genepairlink->add_tag("tree_node_id", $tree_node_id);
00229       push @genepairlinks, $genepairlink;
00230     }
00231     print STDERR "build graph $graphcount\n" if ($graphcount++ % 10 == 0);
00232   }
00233   printf("%1.3f secs build links and features\n", time()-$tmp_time) 
00234     if($self->debug>1);
00235 
00236   $gene_tree->print_tree($self->param('tree_scale')) 
00237     if($self->debug);
00238 
00239   #sort the gene/gene links by distance
00240   #   makes debug display easier to read, not required by algorithm
00241   $tmp_time = time();
00242   printf("sort links\n") if($self->debug);
00243   my @sorted_genepairlinks = 
00244     sort {$a->distance_between <=> $b->distance_between} @genepairlinks;
00245   printf("%1.3f secs to sort links\n", time()-$tmp_time) if($self->debug > 1);
00246 
00247   #analyze every gene pair (genepairlink) to get its classification
00248   printf("analyze links\n") if($self->debug);
00249   printf("%d links\n", scalar(@genepairlinks)) if ($self->debug);
00250   $tmp_time = time();
00251   $self->param('old_homology_count', 0);
00252   $self->param('orthotree_homology_counts', {});
00253   foreach my $genepairlink (@sorted_genepairlinks) {
00254     $self->analyze_genepairlink($genepairlink);
00255   }
00256   printf("%1.3f secs to analyze genepair links\n", time()-$tmp_time) 
00257     if($self->debug > 1);
00258   
00259   #display summary stats of analysis 
00260   my $runtime = time()*1000-$starttime;  
00261   $gene_tree->tree->store_tag('OrthoTree_runtime_msec', $runtime) 
00262     unless ($self->param('_readonly'));
00263 
00264   if($self->debug) {
00265     printf("%d genes in tree\n", scalar(@{$gene_tree->get_all_leaves}));
00266     printf("%d pairings\n", scalar(@genepairlinks));
00267     printf("%d old homologies\n", $self->param('old_homology_count'));
00268     printf("orthotree homologies\n");
00269     foreach my $type (keys(%{$self->param('orthotree_homology_counts')})) {
00270       printf ( "  %13s : %d\n", $type, $self->param('orthotree_homology_counts')->{$type} );
00271     }
00272   }
00273   $self->param('homology_links', \@sorted_genepairlinks);
00274 
00275   return undef;
00276 }
00277 
00278 
00279 sub analyze_genepairlink {
00280   my $self = shift;
00281   my $genepairlink = shift;
00282 
00283   my ($gene1, $gene2) = $genepairlink->get_nodes;
00284 
00285   #run feature detectors: precalcs and caches into metadata
00286   $self->genepairlink_check_dups($genepairlink);
00287   $self->genepairlink_fetch_homology($genepairlink) if($self->debug);
00288 
00289   #do classification analysis : as filter stack
00290   if($self->inspecies_paralog_test($genepairlink)) { }
00291   elsif($self->direct_ortholog_test($genepairlink)) { } 
00292   elsif($self->ancient_residual_test($genepairlink)) { } 
00293   elsif($self->one2many_ortholog_test($genepairlink)) { } 
00294   elsif($self->outspecies_test($genepairlink)) { }
00295   else {
00296     printf
00297       (
00298        "OOPS!!!! %s - %s\n",
00299        $gene1->gene_member->stable_id,
00300        $gene2->gene_member->stable_id
00301       );
00302   }
00303 
00304   $self->param('old_homology_count', $self->param('old_homology_count') + 1)
00305     if($genepairlink->get_tagvalue('old_homology'));
00306 
00307   my $type = $genepairlink->get_tagvalue('orthotree_type');
00308   if($type) {
00309     if(!defined($self->param('orthotree_homology_counts')->{$type})) {
00310       $self->param('orthotree_homology_counts')->{$type} = 1;
00311     } else {
00312       $self->param('orthotree_homology_counts')->{$type}++;
00313     }
00314   }
00315 
00316   #display results
00317   $self->display_link_analysis($genepairlink) if($self->debug >1);
00318 
00319   return undef;
00320 }
00321 
00322 
00323 sub display_link_analysis
00324 {
00325   my $self = shift;
00326   my $genepairlink = shift;
00327 
00328   #display raw feature analysis
00329   my ($gene1, $gene2) = $genepairlink->get_nodes;
00330   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00331   printf("%21s(%7d) - %21s(%7d) : %10.3f dist : %3d hops : ", 
00332     $gene1->gene_member->stable_id, $gene1->gene_member->member_id,
00333     $gene2->gene_member->stable_id, $gene2->gene_member->member_id,
00334     $genepairlink->distance_between, $genepairlink->get_tagvalue('hops'));
00335 
00336   if($genepairlink->get_tagvalue('has_dups')) { printf("%5s ", 'DUP');
00337   } else { printf("%5s ", ""); }
00338 
00339   my $homology = $genepairlink->get_tagvalue('old_homology');
00340   if($homology) { printf("%5s ", $homology->description);
00341   } else { printf("%5s ", ""); }
00342 
00343   print("ancestor:(");
00344   my $node_type = $ancestor->get_tagvalue('node_type', '');
00345   if ($node_type eq 'duplication') {
00346     print "DUP ";
00347   } elsif ($node_type eq 'dubious') {
00348     print "DD  ";
00349   } elsif ($node_type eq 'gene_split') {
00350     print "SPL ";
00351   } else {
00352     print "    ";
00353   }
00354   printf("%9s)", $ancestor->node_id);
00355   printf(" %.4f ", $ancestor->get_tagvalue('duplication_confidence_score'));
00356 
00357   my $taxon_level = $ancestor->get_tagvalue('taxon_level');
00358   printf(" %s %s %s\n", 
00359          $genepairlink->get_tagvalue('orthotree_type'), 
00360          $genepairlink->get_tagvalue('orthotree_subtype'),
00361          $taxon_level->name
00362         );
00363 
00364   return undef;
00365 }
00366 
00367 
00368 sub get_species_tree_string {
00369     my $self = shift @_;
00370 
00371     my $species_tree_string = $self->param('species_tree_string');
00372 
00373     unless( $species_tree_string ) {
00374         if( my $species_tree_file = $self->param('species_tree_file') ) {
00375 
00376             $species_tree_string = $self->_slurp( $species_tree_file );
00377 
00378         } else {
00379             my $tag_table_name = 'gene_tree_root_tag';
00380 
00381             my $sth = $self->dbc->prepare( "select value from $tag_table_name where tag='species_tree_string'" );
00382             $sth->execute;
00383             ($species_tree_string) = $sth->fetchrow_array;
00384             $sth->finish;
00385         }
00386     }
00387     return $species_tree_string;
00388 }
00389 
00390 
00391 sub load_species_tree_from_string {
00392   my ($self, $species_tree_string) = @_;
00393 
00394   my $taxonDBA = $self->compara_dba->get_NCBITaxonAdaptor;
00395 
00396   my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($species_tree_string);
00397   foreach my $node (@{$tree->all_nodes_in_graph}) {
00398     my ($id) = split('-',$node->name);
00399     $id =~ s/\*//; # internal nodes have asterisk
00400     if (looks_like_number($id)) {
00401       $node->node_id($id);
00402       my $ncbi_node = $taxonDBA->fetch_node_by_taxon_id($id);
00403       $node->name($ncbi_node->name) if (defined $ncbi_node);
00404     } else { # doesnt look like number
00405       $node->name($id);
00406     }
00407     $node->add_tag('taxon_id', $id);
00408   }
00409   return $tree;
00410 }
00411 
00412 sub _slurp {
00413   my ($self, $file_name) = @_;
00414   my $slurped;
00415   {
00416     local $/ = undef;
00417     open(my $fh, '<', $file_name);
00418     $slurped = <$fh>;
00419     close($fh);
00420   }
00421   return $slurped;
00422 }
00423 
00424 sub get_ancestor_species_hash
00425 {
00426   my $self = shift;
00427   my $node = shift;
00428 
00429   my $species_hash = $node->get_tagvalue('species_hash');
00430   return $species_hash if($species_hash);
00431 
00432   $species_hash = {};
00433   my $duplication_hash = {};
00434   my $is_dup=0;
00435 
00436   if($node->isa('Bio::EnsEMBL::Compara::GeneTreeMember')) {
00437     my $node_genome_db_id = $node->genome_db_id;
00438     $species_hash->{$node_genome_db_id} = 1;
00439     $node->add_tag('species_hash', $species_hash);
00440     return $species_hash;
00441   }
00442 
00443   foreach my $child (@{$node->children}) {
00444     my $t_species_hash = $self->get_ancestor_species_hash($child);
00445     next unless(defined($t_species_hash)); #shouldn't happen
00446     foreach my $genome_db_id (keys(%$t_species_hash)) {
00447       unless(defined($species_hash->{$genome_db_id})) {
00448         $species_hash->{$genome_db_id} = $t_species_hash->{$genome_db_id};
00449       } else {
00450         #this species already existed in one of the other children
00451         #this means this species was duplicated at this point between
00452         #the species
00453         $is_dup=1;
00454         $duplication_hash->{$genome_db_id} = 1;
00455         $species_hash->{$genome_db_id} += $t_species_hash->{$genome_db_id};
00456       }
00457     }
00458   }
00459   
00460   #printf("\ncalc_ancestor_species_hash : %s\n", stringify($species_hash));
00461   #$node->print_tree(20);
00462   
00463   $node->add_tag("species_hash", $species_hash);
00464   if($is_dup && !($self->param('_treefam'))) {
00465 
00466     $node->add_tag("duplication_hash", $duplication_hash);
00467 
00468     my $original_node_type = $node->get_tagvalue('node_type');
00469     if ((not defined $original_node_type) or ($original_node_type eq 'speciation')) {
00470       # RAP did not predict a duplication here
00471       $node->store_tag('node_type', 'duplication') unless ($self->param('_readonly'));
00472 
00473     } # The other values should not need any treatment
00474 
00475   }
00476   return $species_hash;
00477 }
00478 
00479 
00480 sub get_ancestor_taxon_level {
00481   my ($self, $ancestor) = @_;
00482 
00483   my $taxon_level = $ancestor->get_tagvalue('taxon_level');
00484   return $taxon_level if($taxon_level);
00485 
00486   #printf("\ncalculate ancestor taxon level\n");
00487   my $taxon_tree = $self->param('taxon_tree');
00488   my $species_hash = $self->get_ancestor_species_hash($ancestor);
00489 
00490   foreach my $gdbID (keys(%$species_hash)) {
00491     my $gdb = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_dbID($gdbID);
00492     my $taxon = $taxon_tree->find_node_by_node_id($gdb->taxon_id);
00493 
00494     unless($taxon) {
00495       $self->throw("oops missing taxon " . $gdb->taxon_id ."\n");
00496     }
00497 
00498     if($taxon_level) {
00499       $taxon_level = $taxon_level->find_first_shared_ancestor($taxon);
00500     } else {
00501       $taxon_level = $taxon;
00502     }
00503   }
00504   my $taxon_id = $taxon_level->get_tagvalue("taxon_id");
00505   $ancestor->add_tag("taxon_level", $taxon_level);
00506   $ancestor->store_tag("taxon_id", $taxon_id) 
00507     unless ($self->param('_readonly'));
00508   $ancestor->store_tag("taxon_name", $taxon_level->name) 
00509     unless ($self->param('_readonly'));
00510 
00511   #$ancestor->print_tree($self->param('tree_scale'));
00512   #$taxon_level->print_tree(10);
00513 
00514   return $taxon_level;
00515 }
00516 
00517 
00518 sub duplication_confidence_score {
00519   my $self = shift;
00520   my $ancestor = shift;
00521 
00522   # This assumes bifurcation!!! No multifurcations allowed
00523   my ($child_a, $child_b, $dummy) = @{$ancestor->children};
00524   $self->throw("tree is multifurcated in duplication_confidence_score\n") if (defined($dummy));
00525   my @child_a_gdbs = keys %{$self->get_ancestor_species_hash($child_a)};
00526   my @child_b_gdbs = keys %{$self->get_ancestor_species_hash($child_b)};
00527   my %seen = ();  my @gdb_a = grep { ! $seen{$_} ++ } @child_a_gdbs;
00528      %seen = ();  my @gdb_b = grep { ! $seen{$_} ++ } @child_b_gdbs;
00529   my @isect = my @diff = my @union = (); my %count;
00530   foreach my $e (@gdb_a, @gdb_b) { $count{$e}++ }
00531   foreach my $e (keys %count) {
00532     push(@union, $e); push @{ $count{$e} == 2 ? \@isect : \@diff }, $e; 
00533   }
00534 
00535   my $duplication_confidence_score = 0;
00536   my $scalar_isect = scalar(@isect);
00537   my $scalar_union = scalar(@union);
00538   $duplication_confidence_score = 
00539     (($scalar_isect)/$scalar_union) unless (0 == $scalar_isect);
00540 
00541   $ancestor->store_tag
00542     (
00543      "duplication_confidence_score",
00544      $duplication_confidence_score
00545     ) unless ($self->param('_readonly'));
00546 
00547   my $rounded_duplication_confidence_score = (int((100.0 * $scalar_isect / $scalar_union + 0.5)));
00548   my $species_intersection_score = $ancestor->get_tagvalue("species_intersection_score");
00549   unless (defined($species_intersection_score)) {
00550     my $ancestor_node_id = $ancestor->node_id;
00551     warn("Difference in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] whereas species_intersection_score [$species_intersection_score] is undefined in njtree - ancestor $ancestor_node_id\n");
00552     return;
00553   }
00554   if ($species_intersection_score ne $rounded_duplication_confidence_score && !defined($self->param('_readonly'))) {
00555     my $ancestor_node_id = $ancestor->node_id;
00556     $self->throw("Inconsistency in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] != species_intersection_score [$species_intersection_score] -  $ancestor_node_id\n");
00557   } else {
00558     $ancestor->delete_tag('species_intersection_score');
00559   }
00560 }
00561 
00562 
00563 sub genepairlink_fetch_homology
00564 {
00565   my $self = shift;
00566   my $genepairlink = shift;
00567 
00568   my ($member1, $member2) = $genepairlink->get_nodes;
00569 
00570   my $sql = "select homology.homology_id from homology ".
00571             "join homology_member hm1 using(homology_id) ".
00572             "join homology_member hm2 using (homology_id) " .
00573             "where hm1.peptide_member_id=? and hm2.peptide_member_id=?";
00574 
00575   my $sth = $self->dbc->prepare($sql);
00576   $sth->execute($member1->member_id, $member2->member_id);
00577   my $ref = $sth->fetchrow_arrayref();
00578   return undef unless($ref);
00579   $sth->finish;
00580   my ($homology_id) = @$ref;
00581   return undef unless($homology_id);
00582 
00583   my $homology = 
00584     $self->compara_dba->get_HomologyAdaptor->fetch_by_dbID($homology_id);
00585   $genepairlink->add_tag("old_homology", $homology);
00586 
00587   return $homology;
00588 }
00589 
00590 sub delete_old_orthotree_tags
00591 {
00592   my $self = shift;
00593 
00594   return undef unless ($self->input_job->retry_count > 0);
00595 
00596   print "deleting old orthotree tags\n" if ($self->debug);
00597   my @node_ids;
00598 
00599   foreach my $node ($self->param('gene_tree')->get_all_subnodes) {
00600     push @node_ids, $node->node_id;
00601   }
00602 
00603   my @list_ids;
00604   foreach my $id (@node_ids) {
00605     push @list_ids, $id;
00606     if (scalar @list_ids == 2000) {
00607       # FIXME
00608       my $sql = "delete from gene_tree_root_tag where root_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
00609       my $sth = $self->dbc->prepare($sql);
00610       $sth->execute;
00611       $sth->finish;
00612       @list_ids = ();
00613     }
00614   }
00615 
00616   if (scalar @list_ids) {
00617   }
00618 
00619   return undef;
00620 }
00621 
00622 sub delete_old_homologies {
00623   my $self = shift;
00624 
00625   return undef unless ($self->input_job->retry_count > 0);
00626 
00627   print "deleting old homologies\n" if ($self->debug);
00628 
00629   # New method all in one go -- requires key on tree_node_id
00630   my $delete_time = time();
00631   my $tree_node_id = $self->param('gene_tree')->node_id;
00632 
00633   # Delete first the members
00634   my $sql1 = 'DELETE homology_member FROM homology JOIN homology_member USING (homology_id) WHERE tree_node_id=?';
00635   my $sth1 = $self->dbc->prepare($sql1);
00636   $sth1->execute($tree_node_id);
00637   $sth1->finish;
00638 
00639   # And then the homologies
00640   my $sql2 = 'DELETE FROM homology WHERE tree_node_id=?';
00641   my $sth2 = $self->dbc->prepare($sql2);
00642   $sth2->execute($tree_node_id);
00643   $sth2->finish;
00644   printf("%1.3f secs build links and features\n", time()-$delete_time);
00645 
00646   $self->param('old_homologies_deleted', 1);
00647   return undef;
00648 }
00649 
00650 sub delete_old_homologies_old {
00651   my $self = shift;
00652   my $genepairlink = shift;
00653 
00654   return undef unless ($self->input_job->retry_count > 0);
00655 
00656   my ($member1, $member2) = $genepairlink->get_nodes;
00657 
00658   my @homologies = @{$self->compara_dba->get_HomologyAdaptor->fetch_by_Member_Member_method_link_type
00659                        ($member1->gene_member, $member2->gene_member, 'ENSEMBL_ORTHOLOGUES')};
00660   push @homologies, @{$self->compara_dba->get_HomologyAdaptor->fetch_by_Member_Member_method_link_type
00661                         ($member1->gene_member, $member2->gene_member, 'ENSEMBL_PARALOGUES')};
00662 
00663   my $sql1 = "DELETE FROM homology WHERE homology_id=?";
00664   my $sth1 = $self->dbc->prepare($sql1);
00665   my $sql2 = "DELETE FROM homology_member WHERE homology_id=?";
00666   my $sth2 = $self->dbc->prepare($sql2);
00667 
00668   foreach my $homology (@homologies) {
00669     $sth1->execute($homology->dbID);
00670     $sth2->execute($homology->dbID);
00671   }
00672 
00673   $sth1->finish;
00674   $sth2->finish;
00675 
00676   return undef;
00677 }
00678 
00679 sub genepairlink_check_dups
00680 {
00681   my $self = shift;
00682   my $genepairlink = shift;
00683 
00684   my ($pep1, $pep2) = $genepairlink->get_nodes;
00685 
00686   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00687   my $has_dup=0;
00688   my %nodes_between;
00689   my $tnode = $pep1;
00690   do {
00691     $tnode = $tnode->parent;
00692     if ($tnode->get_tagvalue('node_type', '') eq 'duplication') {
00693       $has_dup = 1;
00694     }
00695     $nodes_between{$tnode->node_id} = $tnode;
00696   } while(!($tnode->equals($ancestor)));
00697 
00698   $tnode = $pep2;
00699   do {
00700     $tnode = $tnode->parent;
00701     if ($tnode->get_tagvalue('node_type', '') eq 'duplication') {
00702       $has_dup = 1;
00703     }
00704     $nodes_between{$tnode->node_id} = $tnode;
00705   } while(!($tnode->equals($ancestor)));
00706 
00707   $genepairlink->add_tag("hops", scalar(keys(%nodes_between)));
00708   $genepairlink->add_tag("has_dups", $has_dup);
00709   return undef;
00710 }
00711 
00712 
00713 ########################################################
00714 #
00715 # Classification analysis
00716 #
00717 ########################################################
00718 
00719 
00720 sub direct_ortholog_test
00721 {
00722   my $self = shift;
00723   my $genepairlink = shift;
00724 
00725   #strictest ortholog test: 
00726   #  - genes are from different species
00727   #  - no ancestral duplication events
00728   #  - these genes are only copies of the ancestor for their species
00729 
00730   return undef if($genepairlink->get_tagvalue('has_dups'));
00731 
00732   my ($pep1, $pep2) = $genepairlink->get_nodes;
00733   return undef if($pep1->genome_db_id == $pep2->genome_db_id);
00734 
00735   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00736   my $species_hash = $self->get_ancestor_species_hash($ancestor);
00737 
00738   #RAP seems to miss some duplication events so check the species 
00739   #counts for these two species to make sure they are the only
00740   #representatives of these species under the ancestor
00741   my $count1 = $species_hash->{$pep1->genome_db_id};
00742   my $count2 = $species_hash->{$pep2->genome_db_id};
00743 
00744   return undef if($count1>1);
00745   return undef if($count2>1);
00746 
00747   #passed all the tests -> it's a simple ortholog
00748 #  $self->delete_old_homologies_old($genepairlink) unless ($self->param('_readonly'));
00749   $genepairlink->add_tag("orthotree_type", 'ortholog_one2one');
00750   my $taxon = $self->get_ancestor_taxon_level($ancestor);
00751   $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00752   return 1;
00753 }
00754 
00755 
00756 sub inspecies_paralog_test
00757 {
00758   my $self = shift;
00759   my $genepairlink = shift;
00760 
00761   #simplest paralog test: 
00762   #  - both genes are from the same species
00763   #  - and just label with taxonomic level
00764 
00765   my ($pep1, $pep2) = $genepairlink->get_nodes;
00766   return undef unless($pep1->genome_db_id == $pep2->genome_db_id);
00767 
00768   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00769   my $taxon = $self->get_ancestor_taxon_level($ancestor);
00770 
00771   #my $species_hash = $self->get_ancestor_species_hash($ancestor);
00772   #foreach my $gdbID (keys(%$species_hash)) {
00773   #  return undef unless($gdbID == $pep1->genome_db_id);
00774   #}
00775 
00776   #passed all the tests -> it's an inspecies_paralog
00777 #  $genepairlink->add_tag("orthotree_type", 'inspecies_paralog');
00778 #  $self->delete_old_homologies_old($genepairlink) unless ($self->param('_readonly'));
00779   $genepairlink->add_tag("orthotree_type", 'within_species_paralog');
00780   $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00781   # Duplication_confidence_score
00782   if (not $ancestor->has_tag("duplication_confidence_score")) {
00783     $self->duplication_confidence_score($ancestor);
00784     $genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
00785   }
00786   return 1;
00787 }
00788 
00789 
00790 sub ancient_residual_test
00791 {
00792   my $self = shift;
00793   my $genepairlink = shift;
00794 
00795   #test 3: getting a bit more complex:
00796   #  - genes are from different species
00797   #  - there is evidence for duplication events elsewhere in the history
00798   #  - but these two genes are the only remaining representative of
00799   #    the ancestor
00800 
00801   my ($pep1, $pep2) = $genepairlink->get_nodes;
00802   return undef if($pep1->genome_db_id == $pep2->genome_db_id);
00803 
00804   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00805   my $species_hash = $self->get_ancestor_species_hash($ancestor);
00806 
00807   #check these are the only representatives of the ancestor
00808   my $count1 = $species_hash->{$pep1->genome_db_id};
00809   my $count2 = $species_hash->{$pep2->genome_db_id};
00810 
00811   return undef if($count1>1);
00812   return undef if($count2>1);
00813 
00814   #passed all the tests -> it's a simple ortholog
00815   # print $ancestor->node_id, " ", $ancestor->name,"\n";
00816 
00817   # little hack to work around some weird treefam trees
00818   if ($self->param('_treefam')) {
00819     if(not $ancestor->has_tag('node_type')) {
00820       $ancestor->add_tag('node_type', 'speciation');
00821     }
00822   }
00823 
00824 #  my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
00825   if ($ancestor->get_tagvalue('node_type', '') eq 'duplication') {
00826 #    $self->delete_old_homologies_old($genepairlink) unless ($self->param('_readonly'));
00827     $genepairlink->add_tag("orthotree_type", 'apparent_ortholog_one2one');
00828     my $taxon = $self->get_ancestor_taxon_level($ancestor);
00829     $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00830     # Duplication_confidence_score
00831     if (not $ancestor->has_tag("duplication_confidence_score")) {
00832       $self->duplication_confidence_score($ancestor);
00833       $genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
00834     }
00835   } else {
00836 #    $self->delete_old_homologies_old($genepairlink) unless ($self->param('_readonly'));
00837     $genepairlink->add_tag("orthotree_type", 'ortholog_one2one');
00838     my $taxon = $self->get_ancestor_taxon_level($ancestor);
00839     $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00840   }
00841   return 1;
00842 }
00843 
00844 
00845 sub one2many_ortholog_test
00846 {
00847   my $self = shift;
00848   my $genepairlink = shift;
00849 
00850   #test 4: getting a bit more complex yet again:
00851   #  - genes are from different species
00852   #  - but there is evidence for duplication events in the history
00853   #  - one of the genes is the only remaining representative of the
00854   #  ancestor in its species
00855   #  - but the other gene has multiple copies in it's species 
00856   #  (first level of orthogroup analysis)
00857 
00858   my ($pep1, $pep2) = $genepairlink->get_nodes;
00859   return undef if($pep1->genome_db_id == $pep2->genome_db_id);
00860 
00861   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00862   my $species_hash = $self->get_ancestor_species_hash($ancestor);
00863 
00864   my $count1 = $species_hash->{$pep1->genome_db_id};
00865   my $count2 = $species_hash->{$pep2->genome_db_id};
00866 
00867   #one of the genes must be the only copy of the gene
00868   #and the other must appear more than once in the ancestry
00869   return undef unless 
00870     (
00871      ($count1==1 and $count2>1) or ($count1>1 and $count2==1)
00872     );
00873 
00874   if ($ancestor->get_tagvalue('node_type', '') eq 'duplication') {
00875     return undef;
00876   }
00877 
00878   #passed all the tests -> it's a one2many ortholog
00879 #  $self->delete_old_homologies_old($genepairlink) unless ($self->param('_readonly'));
00880   $genepairlink->add_tag("orthotree_type", 'ortholog_one2many');
00881   my $taxon = $self->get_ancestor_taxon_level($ancestor);
00882   $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00883   return 1;
00884 }
00885 
00886 
00887 sub outspecies_test
00888 {
00889   my $self = shift;
00890   my $genepairlink = shift;
00891 
00892   #last test: left over pairs:
00893   #  - genes are from different species
00894   #  - if ancestor is 'DUP' -> paralog else 'ortholog'
00895 
00896   my ($pep1, $pep2) = $genepairlink->get_nodes;
00897   return undef if($pep1->genome_db_id == $pep2->genome_db_id);
00898 
00899   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00900   my $taxon = $self->get_ancestor_taxon_level($ancestor);
00901 
00902   #ultra simple ortho/paralog classification
00903   if ($ancestor->get_tagvalue('node_type', '') eq 'duplication') {
00904     $genepairlink->add_tag("orthotree_type", 'possible_ortholog');
00905     $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00906     # duplication_confidence_score
00907     if (not $ancestor->has_tag("duplication_confidence_score")) {
00908       $self->duplication_confidence_score($ancestor);
00909      $genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
00910     }
00911   } else {
00912 #      $self->delete_old_homologies_old($genepairlink) unless ($self->param('_readonly'));
00913       $genepairlink->add_tag("orthotree_type", 'ortholog_many2many');
00914       $genepairlink->add_tag("orthotree_subtype", $taxon->name);
00915   }
00916   return 1;
00917 }
00918 
00919 
00920 ########################################################
00921 #
00922 # Tree input/output section
00923 #
00924 ########################################################
00925 
00926 sub store_homologies {
00927   my $self = shift;
00928 
00929   $self->param('homology_consistency', {});
00930 
00931   my $hlinkscount = 0;
00932   foreach my $genepairlink (@{$self->param('homology_links')}) {
00933     $self->display_link_analysis($genepairlink) if($self->debug>2);
00934     my $type = $genepairlink->get_tagvalue("orthotree_type");
00935     my $dcs = $genepairlink->get_tagvalue('ancestor')->get_tagvalue('duplication_confidence_score');
00936     next if ($type eq 'possible_ortholog' and $dcs > $self->param('no_between'));
00937 
00938     $self->store_gene_link_as_homology($genepairlink);
00939     print STDERR "homology links $hlinkscount\n" if ($hlinkscount++ % 500 == 0);
00940   }
00941 
00942   my $counts_str = stringify($self->param('orthotree_homology_counts'));
00943   printf("$counts_str\n");
00944 
00945   $self->check_homology_consistency;
00946 
00947   $self->param('gene_tree')->tree->store_tag(
00948       'OrthoTree_types_hashstr', 
00949       stringify($self->param('orthotree_homology_counts'))) unless ($self->param('_readonly'));
00950 
00951   return undef;
00952 }
00953 
00954 
00955 sub store_gene_link_as_homology {
00956   my $self = shift;
00957   my $genepairlink  = shift;
00958 
00959   my $type = $genepairlink->get_tagvalue('orthotree_type');
00960   return unless($type);
00961   my $subtype = $genepairlink->get_tagvalue('taxon_name');
00962   my $ancestor = $genepairlink->get_tagvalue('ancestor');
00963   my $tree_node_id = $genepairlink->get_tagvalue('tree_node_id');
00964   warn("Tag tree_node_id undefined\n") unless(defined($tree_node_id) && $tree_node_id ne '');
00965 
00966   my ($gene1, $gene2) = $genepairlink->get_nodes;
00967 
00968   #
00969   # create method_link_species_set
00970   #
00971   my $mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00972   $mlss->method_link_type("ENSEMBL_ORTHOLOGUES") 
00973     unless ($type eq 'possible_ortholog' || $type eq 'within_species_paralog');
00974   $mlss->method_link_type("ENSEMBL_PARALOGUES") if ($type eq 'possible_ortholog' || $type eq 'within_species_paralog');
00975   if ($gene1->genome_db->dbID == $gene2->genome_db->dbID) {
00976     $mlss->species_set([$gene1->genome_db]);
00977   } else {
00978     $mlss->species_set([$gene1->genome_db, $gene2->genome_db]);
00979   }
00980   $self->compara_dba->get_MethodLinkSpeciesSetAdaptor->store($mlss) unless ($self->param('_readonly'));
00981 
00982   # create an Homology object
00983   my $homology = new Bio::EnsEMBL::Compara::Homology;
00984   $homology->description($type);
00985   $homology->subtype($subtype);
00986   # $homology->node_id($ancestor->node_id);
00987   $homology->ancestor_node_id($ancestor->node_id);
00988   $homology->tree_node_id($tree_node_id);
00989   $homology->method_link_type($mlss->method_link_type);
00990   $homology->method_link_species_set($mlss);
00991 
00992   my $key = $mlss->dbID . "_" . $gene1->dbID;
00993   $self->param('homology_consistency')->{$key}{$type} = 1;
00994   #$homology->dbID(-1);
00995 
00996   # NEED TO BUILD THE Attributes (ie homology_members)
00997   my ($cigar_line1, $perc_id1, $perc_pos1,
00998       $cigar_line2, $perc_id2, $perc_pos2) = 
00999         $self->generate_attribute_arguments($gene1, $gene2,$type);
01000 
01001   # QUERY member
01002   #
01003   my $attribute;
01004   $attribute = new Bio::EnsEMBL::Compara::Attribute;
01005   $attribute->peptide_member_id($gene1->dbID);
01006   $attribute->cigar_line($cigar_line1);
01007   $attribute->perc_cov(100);
01008   $attribute->perc_id(int($perc_id1));
01009   $attribute->perc_pos(int($perc_pos1));
01010 
01011   $homology->add_Member_Attribute([$gene1->gene_member, $attribute]);
01012 
01013   #
01014   # HIT member
01015   #
01016   $attribute = new Bio::EnsEMBL::Compara::Attribute;
01017   $attribute->peptide_member_id($gene2->dbID);
01018   $attribute->cigar_line($cigar_line2);
01019   $attribute->perc_cov(100);
01020   $attribute->perc_id(int($perc_id2));
01021   $attribute->perc_pos(int($perc_pos2));
01022 
01023   $homology->add_Member_Attribute([$gene2->gene_member, $attribute]);
01024 
01025   ## Check if it has already been stored, in which case we dont need to store again
01026   my $matching_homology = 0;
01027   if (($self->input_job->retry_count > 0) and !defined($self->param('old_homologies_deleted'))) {
01028     my $member_id1 = $gene1->gene_member->member_id;
01029     my $member_id2 = $gene2->gene_member->member_id;
01030     if ($member_id1 == $member_id2) {
01031       my $tree_id = $self->param('gene_tree')->node_id;
01032       my $pmember_id1 = $gene1->member_id; my $pstable_id1 = $gene1->stable_id;
01033       my $pmember_id2 = $gene2->member_id; my $pstable_id2 = $gene2->stable_id;
01034       $self->throw("$member_id1 ($pmember_id1 - $pstable_id1) and $member_id2 ($pmember_id2 - $pstable_id2) shouldn't be the same");
01035     }
01036     my $stored_homology = $self->param('homologyDBA')->fetch_by_Member_id_Member_id($member_id1,$member_id2);
01037     if (defined($stored_homology)) {
01038       $matching_homology = 1;
01039       $matching_homology = 0 if ($stored_homology->description ne $homology->description);
01040       $matching_homology = 0 if ($stored_homology->subtype ne $homology->subtype);
01041       $matching_homology = 0 if ($stored_homology->ancestor_node_id ne $homology->ancestor_node_id);
01042       $matching_homology = 0 if ($stored_homology->tree_node_id ne $homology->tree_node_id);
01043       $matching_homology = 0 if ($stored_homology->method_link_type ne $homology->method_link_type);
01044       $matching_homology = 0 if ($stored_homology->method_link_species_set->dbID ne $homology->method_link_species_set->dbID);
01045     }
01046 
01047     # Delete old one, then proceed to store new one
01048     if (defined($stored_homology) && (0 == $matching_homology)) {
01049       my $homology_id = $stored_homology->dbID;
01050       my $sql1 = "delete from homology where homology_id=$homology_id";
01051       my $sql2 = "delete from homology_member where homology_id=$homology_id";
01052       my $sth1 = $self->dbc->prepare($sql1);
01053       my $sth2 = $self->dbc->prepare($sql2);
01054       $sth1->execute;
01055       $sth2->execute;
01056       $sth1->finish;
01057       $sth2->finish;
01058     }
01059   }
01060   if($self->param('store_homologies') and (0 == $matching_homology)) {
01061     $self->param('homologyDBA')->store($homology);
01062   }
01063 
01064   my $stable_id;
01065   if($gene1->taxon_id < $gene2->taxon_id) {
01066     $stable_id = $gene1->taxon_id . "_" . $gene2->taxon_id . "_";
01067   } else {
01068     $stable_id = $gene2->taxon_id . "_" . $gene1->taxon_id . "_";
01069   }
01070   $stable_id .= sprintf ("%011.0d",$homology->dbID) if($homology->dbID);
01071   $homology->stable_id($stable_id);
01072   #TODO: update the stable_id of the homology
01073 
01074   return $homology;
01075 }
01076 
01077 
01078 sub check_homology_consistency {
01079   my $self = shift;
01080 
01081   if ($self->debug) {
01082     print "checking homology consistency\n";
01083     foreach my $mlss_member_id ( keys %{$self->param('homology_consistency')} ) {
01084       my $count = scalar(keys %{$self->param('homology_consistency')->{$mlss_member_id}});
01085       if ($count > 1) {
01086         my ($mlss, $member_id) = split("_",$mlss_member_id);
01087         print "mlss member_id : $mlss $member_id\n";
01088       }
01089     }
01090   }
01091 
01092   foreach my $mlss_member_id ( keys %{$self->param('homology_consistency')} ) {
01093     my $count = scalar(keys %{$self->param('homology_consistency')->{$mlss_member_id}});
01094     if ($count > 1) {
01095       my ($mlss, $member_id) = split("_",$mlss_member_id);
01096       $self->throw("Inconsistent homologies in mlss $mlss and member_id $member_id");
01097     }
01098   }
01099 }
01100 
01101 
01102 sub generate_attribute_arguments {
01103   my ($self, $gene1, $gene2, $type) = @_;
01104 
01105   my $new_aln1_cigarline = "";
01106   my $new_aln2_cigarline = "";
01107 
01108   my $perc_id1 = 0;
01109   my $perc_pos1 = 0;
01110   my $perc_id2 = 0;
01111   my $perc_pos2 = 0;
01112   # This speeds up the pipeline for this portion of the homology table. If no_between option is used,
01113   # only the low SIS are dealt with, so we will calculate the cigar_lines
01114   if ($type eq 'possible_ortholog' && !defined($self->param('no_between'))) {
01115     return ($new_aln1_cigarline, $perc_id1, $perc_pos1, $new_aln2_cigarline, $perc_id2, $perc_pos2);
01116   }
01117 
01118   my $identical_matches = 0;
01119   my $positive_matches = 0;
01120   my $m_hash = $self->get_matrix_hash;
01121 
01122   my ($aln1state, $aln2state);
01123   my ($aln1count, $aln2count);
01124 
01125   # my @aln1 = split(//, $gene1->alignment_string); # Speed up
01126   # my @aln2 = split(//, $gene2->alignment_string);
01127   my $alignment_string = $gene1->alignment_string;
01128   my @aln1 = unpack("A1" x length($alignment_string), $alignment_string);
01129   $alignment_string = $gene2->alignment_string;
01130   my @aln2 = unpack("A1" x length($alignment_string), $alignment_string);
01131 
01132   for (my $i=0; $i <= $#aln1; $i++) {
01133     next if ($aln1[$i] eq "-" && $aln2[$i] eq "-");
01134     my ($cur_aln1state, $cur_aln2state) = qw(M M);
01135     if ($aln1[$i] eq "-") {
01136       $cur_aln1state = "D";
01137     }
01138     if ($aln2[$i] eq "-") {
01139       $cur_aln2state = "D";
01140     }
01141     if ($cur_aln1state eq "M" && $cur_aln2state eq "M" && $aln1[$i] eq $aln2[$i]) {
01142       $identical_matches++;
01143       $positive_matches++;
01144     } elsif ($cur_aln1state eq "M" && $cur_aln2state eq "M" && $m_hash->{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
01145         $positive_matches++;
01146     }
01147     unless (defined $aln1state) {
01148       $aln1count = 1;
01149       $aln2count = 1;
01150       $aln1state = $cur_aln1state;
01151       $aln2state = $cur_aln2state;
01152       next;
01153     }
01154     if ($cur_aln1state eq $aln1state) {
01155       $aln1count++;
01156     } else {
01157       if ($aln1count == 1) {
01158         $new_aln1_cigarline .= $aln1state;
01159       } else {
01160         $new_aln1_cigarline .= $aln1count.$aln1state;
01161       }
01162       $aln1count = 1;
01163       $aln1state = $cur_aln1state;
01164     }
01165     if ($cur_aln2state eq $aln2state) {
01166       $aln2count++;
01167     } else {
01168       if ($aln2count == 1) {
01169         $new_aln2_cigarline .= $aln2state;
01170       } else {
01171         $new_aln2_cigarline .= $aln2count.$aln2state;
01172       }
01173       $aln2count = 1;
01174       $aln2state = $cur_aln2state;
01175     }
01176   }
01177   if ($aln1count == 1) {
01178     $new_aln1_cigarline .= $aln1state;
01179   } else {
01180     $new_aln1_cigarline .= $aln1count.$aln1state;
01181   }
01182   if ($aln2count == 1) {
01183     $new_aln2_cigarline .= $aln2state;
01184   } else {
01185     $new_aln2_cigarline .= $aln2count.$aln2state;
01186   }
01187   my $seq_length1 = $gene1->seq_length;
01188   unless (0 == $seq_length1) {
01189     $perc_id1  = (int((100.0 * $identical_matches / $seq_length1 + 0.5)));
01190     $perc_pos1 = (int((100.0 * $positive_matches  / $seq_length1 + 0.5)));
01191   }
01192   my $seq_length2 = $gene2->seq_length;
01193   unless (0 == $seq_length2) {
01194     $perc_id2  = (int((100.0 * $identical_matches / $seq_length2 + 0.5)));
01195     $perc_pos2 = (int((100.0 * $positive_matches  / $seq_length2 + 0.5)));
01196   }
01197 
01198 #   my $perc_id1 = $identical_matches*100.0/$gene1->seq_length;
01199 #   my $perc_pos1 = $positive_matches*100.0/$gene1->seq_length;
01200 #   my $perc_id2 = $identical_matches*100.0/$gene2->seq_length;
01201 #   my $perc_pos2 = $positive_matches*100.0/$gene2->seq_length;
01202 
01203   return ($new_aln1_cigarline, $perc_id1, $perc_pos1, $new_aln2_cigarline, $perc_id2, $perc_pos2);
01204 }
01205 
01206 sub get_matrix_hash {
01207   my $self = shift;
01208 
01209   return $self->param('matrix_hash') if (defined $self->param('matrix_hash'));
01210 
01211   my $BLOSUM62 = "#  Matrix made by matblas from blosum62.iij
01212 #  * column uses minimum score
01213 #  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
01214 #  Blocks Database = /data/blocks_5.0/blocks.dat
01215 #  Cluster Percentage: >= 62
01216 #  Entropy =   0.6979, Expected =  -0.5209
01217    A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
01218 A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
01219 R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
01220 N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
01221 D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
01222 C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
01223 Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
01224 E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
01225 G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
01226 H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
01227 I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
01228 L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
01229 K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
01230 M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
01231 F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
01232 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
01233 S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
01234 T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
01235 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
01236 Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
01237 V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
01238 B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
01239 Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
01240 X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
01241 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
01242 ";
01243   my $matrix_string;
01244   my @lines = split(/\n/,$BLOSUM62);
01245   foreach my $line (@lines) {
01246     next if ($line =~ /^\#/);
01247     if ($line =~ /^[A-Z\*\s]+$/) {
01248       $matrix_string .= sprintf "$line\n";
01249     } else {
01250       my @t = split(/\s+/,$line);
01251       shift @t;
01252       #       print scalar @t,"\n";
01253       $matrix_string .= sprintf(join(" ",@t)."\n");
01254     }
01255   }
01256 
01257   my %matrix_hash;
01258   @lines = ();
01259   @lines = split /\n/, $matrix_string;
01260   my $lts = shift @lines;
01261   $lts =~ s/^\s+//;
01262   $lts =~ s/\s+$//;
01263   my @letters = split /\s+/, $lts;
01264 
01265   foreach my $letter (@letters) {
01266     my $line = shift @lines;
01267     $line =~ s/^\s+//;
01268     $line =~ s/\s+$//;
01269     my @penalties = split /\s+/, $line;
01270     die "Size of letters array and penalties array are different\n"
01271       unless (scalar @letters == scalar @penalties);
01272     for (my $i=0; $i < scalar @letters; $i++) {
01273       $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
01274     }
01275   }
01276 
01277   $self->param('matrix_hash', \%matrix_hash);
01278   return $self->param('matrix_hash');
01279 }
01280 
01281 1;