Archive Ensembl HomeArchive Ensembl Home
NJTREE_PHYML.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009    http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::NJTREE_PHYML
00022 
00023 =head1 DESCRIPTION
00024 
00025 This Analysis/RunnableDB is designed to take ProteinTree as input
00026 This must already have a multiple alignment run on it. It uses that alignment
00027 as input into the NJTREE PHYML program which then generates a phylogenetic tree
00028 
00029 input_id/parameters format eg: "{'protein_tree_id'=>1234}"
00030     protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
00031 
00032 =head1 SYNOPSIS
00033 
00034 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00035 my $njtree_phyml = Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::NJTREE_PHYML->new
00036   (
00037    -db         => $db,
00038    -input_id   => $input_id,
00039    -analysis   => $analysis
00040   );
00041 $njtree_phyml->fetch_input(); #reads from DB
00042 $njtree_phyml->run();
00043 $njtree_phyml->output();
00044 $njtree_phyml->write_output(); #writes to DB
00045 
00046 =head1 AUTHORSHIP
00047 
00048 Ensembl Team. Individual contributions can be found in the CVS log.
00049 
00050 =head1 MAINTAINER
00051 
00052 $Author: mm14 $
00053 
00054 =head VERSION
00055 
00056 $Revision: 1.31 $
00057 
00058 =head1 APPENDIX
00059 
00060 The rest of the documentation details each of the object methods.
00061 Internal methods are usually preceded with an underscore (_)
00062 
00063 =cut
00064 
00065 package Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::NJTREE_PHYML;
00066 
00067 use strict;
00068 
00069 use IO::File;
00070 use File::Basename;
00071 use Time::HiRes qw(time gettimeofday tv_interval);
00072 use Data::Dumper;
00073 
00074 use Bio::AlignIO;
00075 use Bio::SimpleAlign;
00076 
00077 use Bio::EnsEMBL::Compara::AlignedMember;
00078 use Bio::EnsEMBL::Compara::Member;
00079 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00080 use Bio::EnsEMBL::Compara::Graph::ConnectedComponentGraphs;
00081 
00082 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable', 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthoTree');
00083 
00084 
00085 sub param_defaults {
00086     return {
00087             'cdna'              => 1,   # always use cdna for njtree_phyml
00088             'bootstrap'         => 1,
00089         'check_split_genes' => 1,
00090             'correction_mode'   => 'max_diff_lk',   # can be either max_diff_lk or jackknife
00091     };
00092 }
00093 
00094 
00095 sub fetch_input {
00096     my $self = shift @_;
00097 
00098     $self->check_if_exit_cleanly;
00099 
00100     $self->param('member_adaptor',       $self->compara_dba->get_MemberAdaptor);
00101     $self->param('protein_tree_adaptor', $self->compara_dba->get_ProteinTreeAdaptor);
00102 
00103     my $protein_tree_id     = $self->param('protein_tree_id') or die "'protein_tree_id' is an obligatory parameter";
00104     my $protein_tree        = $self->param('protein_tree_adaptor')->fetch_node_by_node_id( $protein_tree_id )
00105                                         or die "Could not fetch protein_tree with protein_tree_id='$protein_tree_id'";
00106     $protein_tree->print_tree(10) if($self->debug);
00107 
00108     $self->param('protein_tree', $protein_tree);
00109 }
00110 
00111 
00112 sub run {
00113   my $self = shift;
00114 
00115   $self->check_if_exit_cleanly;
00116   $self->run_njtree_phyml;
00117 }
00118 
00119 
00120 sub write_output {
00121   my $self = shift;
00122 
00123   $self->check_if_exit_cleanly;
00124   $self->store_proteintree;
00125   if (defined $self->param('output_dir')) {
00126     system("zip -r -9 ".($self->param('output_dir'))."/".($self->param('protein_tree_id')).".zip ".$self->worker_temp_directory);
00127   }
00128 }
00129 
00130 
00131 sub DESTROY {
00132   my $self = shift;
00133 
00134   if(my $protein_tree = $self->param('protein_tree')) {
00135     printf("NJTREE_PHYML::DESTROY  releasing tree\n") if($self->debug);
00136     $protein_tree->release_tree;
00137     $self->param('protein_tree', undef);
00138   }
00139 
00140   $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
00141 }
00142 
00143 
00144 ##########################################
00145 #
00146 # internal methods
00147 #
00148 ##########################################
00149 
00150 sub get_species_tree_file {
00151     my $self = shift @_;
00152 
00153     unless( $self->param('species_tree_file') ) {
00154 
00155         unless( $self->param('species_tree_string') ) {
00156 
00157             my $tag_table_name = 'gene_tree_root_tag';
00158 
00159             my $sth = $self->compara_dba->dbc->prepare( "select value from $tag_table_name where tag='species_tree_string'" );
00160             $sth->execute;
00161             my ($species_tree_string) = $sth->fetchrow_array;
00162             $sth->finish;
00163 
00164             $self->param('species_tree_string', $species_tree_string)
00165                 or die "Could not fetch 'species_tree_string' from $tag_table_name";
00166         }
00167 
00168         my $species_tree_string = $self->param('species_tree_string');
00169         eval {
00170             my $eval_species_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($species_tree_string);
00171             my @leaves = @{$eval_species_tree->get_all_leaves};
00172         };
00173         if($@) {
00174             die "Error parsing species tree from the string '$species_tree_string'";
00175         }
00176 
00177             # store the string in a local file:
00178         my $species_tree_file = $self->worker_temp_directory . "spec_tax.nh";
00179         open SPECIESTREE, ">$species_tree_file" or die "Could not open '$species_tree_file' for writing : $!";
00180         print SPECIESTREE $species_tree_string;
00181         close SPECIESTREE;
00182         $self->param('species_tree_file', $species_tree_file);
00183     }
00184     return $self->param('species_tree_file');
00185 }
00186 
00187 
00188 sub run_njtree_phyml {
00189   my $self = shift;
00190 
00191     my $protein_tree = $self->param('protein_tree');
00192 
00193   my $starttime = time()*1000;
00194 
00195   $self->check_for_split_genes if ($self->param('check_split_genes')) ;
00196 
00197   my $input_aln = $self->dumpTreeMultipleAlignmentToWorkdir ( $protein_tree ) or return;
00198 
00199   my $newick_file = $input_aln . "_njtree_phyml_tree.txt ";
00200 
00201   my $treebest_exe = $self->param('treebest_exe')
00202       or die "'treebest_exe' is an obligatory parameter";
00203 
00204   die "Cannot execute '$treebest_exe'" unless(-x $treebest_exe);
00205 
00206   my $species_tree_file = $self->get_species_tree_file();
00207 
00208   # ./njtree best -f spec-v4.1.nh -p tree -o $BASENAME.best.nhx \
00209   # $BASENAME.nucl.mfa -b 100 2>&1/dev/null
00210 
00211   my $cmd = $treebest_exe;
00212   if (1 == $self->param('bootstrap')) {
00213     $cmd .= " best ";
00214     if(my $max_diff_lk = $self->param('max_diff_lk')) {
00215         $cmd .= " -Z $max_diff_lk";
00216     }
00217     if ($species_tree_file) {
00218       $cmd .= " -f ". $species_tree_file;
00219     }
00220     $cmd .= " ". $input_aln;
00221     $cmd .= " -p interm ";
00222     $cmd .= " -o " . $newick_file;
00223     if ($self->param('extra_args')) {
00224       $cmd .= " ".($self->param('extra_args')).' ';
00225     }
00226     my $logfile = $self->worker_temp_directory. "proteintree_". $protein_tree->node_id . ".log";
00227     my $errfile = $self->worker_temp_directory. "proteintree_". $protein_tree->node_id . ".err";
00228     $cmd .= " 1>$logfile 2>$errfile";
00229     #     $cmd .= " 2>&1 > /dev/null" unless($self->debug);
00230 
00231     my $worker_temp_directory = $self->worker_temp_directory;
00232     my $full_cmd = defined $worker_temp_directory ? "cd $worker_temp_directory; $cmd" : "$cmd";
00233     print STDERR "Running:\n\t$full_cmd\n" if($self->debug);
00234 
00235     $self->compara_dba->dbc->disconnect_when_inactive(1);
00236     
00237     if(my $rc = system($full_cmd)) {
00238       my $system_error = $!;
00239 
00240       if(my $segfault = (($rc != -1) and ($rc & 127 == 11))) {
00241           $self->throw("'$full_cmd' resulted in a segfault");
00242       }
00243       print STDERR "$full_cmd\n";
00244       open(ERRFILE, $errfile) or die "Could not open logfile '$errfile' for reading : $!\n";
00245     my $logfile = "";
00246     my $handled_failure = 0;
00247       while (<ERRFILE>) {
00248         if (!($_ =~ /^Large distance/)) {
00249          $logfile .= $_;
00250         }
00251         if (($_ =~ /NNI/) || ($_ =~ /Optimize_Br_Len_Serie/) || ($_ =~ /Optimisation failed/) || ($_ =~ /Brent failed/))  {
00252          $handled_failure = 1;
00253       }
00254     }
00255     if ($handled_failure) {
00256         if ($self->param("correction_mode") eq "jackknife") {
00257           # Do jack-knife treebest starting by the sequence with more Ns
00258           my $jackknife_value = $self->param('jackknife') if ($self->param('jackknife'));
00259           $jackknife_value++;
00260           my $output_id = sprintf("{'protein_tree_id'=>%d, 'jackknife'=>%d}", $protein_tree->node_id, $jackknife_value);
00261           $self->input_job->input_id($output_id);
00262           $self->dataflow_output_id($output_id, 2);
00263           $protein_tree->release_tree;
00264           $self->param('protein_tree', undef);
00265           $self->input_job->incomplete(0);
00266           die "PHYML error, dataflowing to NJTREE_PHYML+jackknife\n$logfile";
00267 
00268         } elsif ($self->param("correction_mode") eq "max_diff_lk") {
00269         # Increase the tolerance max_diff_lk in the computation
00270 
00271           my $max_diff_lk_value = $self->param('max_diff_lk') ?  $self->param('max_diff_lk') : 1e-5;
00272           print STDERR sprintf("*%f*%f*\n", $self->param('max_diff_lk'), $max_diff_lk_value);
00273         $max_diff_lk_value *= 10;
00274           print STDERR sprintf("*%f*%f*\n", $self->param('max_diff_lk'), $max_diff_lk_value);
00275           my $output_id = sprintf("{'protein_tree_id'=>%d, 'max_diff_lk'=>%f}", $protein_tree->node_id, $max_diff_lk_value);
00276           $self->input_job->input_id($output_id);
00277           $self->dataflow_output_id($output_id, 2);
00278           $protein_tree->release_tree;
00279           $self->param('protein_tree', undef);
00280           $self->input_job->incomplete(0);
00281           die "PHYML error, dataflowing to NJTREE_PHYML+max_diff_lk\n$logfile";
00282         }
00283       }
00284       $self->throw("error running njtree phyml: $system_error\n$logfile");
00285     }
00286 
00287     $self->compara_dba->dbc->disconnect_when_inactive(0);
00288   } elsif (0 == $self->param('bootstrap')) {
00289     # first part
00290     # ./njtree phyml -nS -f species_tree.nh -p 0.01 -o $BASENAME.cons.nh $BASENAME.nucl.mfa
00291     $cmd = $treebest_exe;
00292     $cmd .= " phyml -nS";
00293     if($species_tree_file) {
00294       $cmd .= " -f ". $species_tree_file;
00295     }
00296     $cmd .= " ". $input_aln;
00297     $cmd .= " -p 0.01 ";
00298 
00299     my $intermediate_newick_file = $input_aln . "_intermediate_njtree_phyml_tree.txt ";
00300     $cmd .= " -o " . $intermediate_newick_file;
00301     $cmd .= " 2>&1 > /dev/null" unless($self->debug);
00302 
00303     print("$cmd\n") if($self->debug);
00304     my $worker_temp_directory = $self->worker_temp_directory;
00305     if(system("cd $worker_temp_directory; $cmd")) {
00306       my $system_error = $!;
00307       $self->throw("Error running njtree phyml noboot (step 1 of 2) : $system_error");
00308     }
00309     # second part
00310     # nice -n 19 ./njtree sdi -s species_tree.nh $BASENAME.cons.nh > $BASENAME.cons.nhx
00311     $cmd = $treebest_exe;
00312     $cmd .= " sdi ";
00313     if ($species_tree_file) {
00314       $cmd .= " -s ". $species_tree_file;
00315     }
00316     $cmd .= " ". $intermediate_newick_file;
00317     $cmd .= " 1> " . $newick_file;
00318     $cmd .= " 2> /dev/null" unless($self->debug);
00319 
00320     $self->compara_dba->dbc->disconnect_when_inactive(1);
00321     print("$cmd\n") if($self->debug);
00322     my $worker_temp_directory = $self->worker_temp_directory;
00323     if(system("cd $worker_temp_directory; $cmd")) {
00324       my $system_error = $!;
00325       $self->throw("Error running njtree phyml noboot (step 2 of 2) : $system_error");
00326     }
00327     $self->compara_dba->dbc->disconnect_when_inactive(0);
00328   } else {
00329     $self->throw("NJTREE PHYML -- wrong bootstrap option");
00330   }
00331 
00332       #parse the tree into the datastucture:
00333   $protein_tree = $self->parse_newick_into_proteintree( $newick_file );
00334 
00335   my $runtime = time()*1000-$starttime;
00336 
00337   $protein_tree->tree->store_tag('NJTREE_PHYML_runtime_msec', $runtime);
00338 }
00339 
00340 
00341 ########################################################
00342 #
00343 # GeneTree input/output section
00344 #
00345 ########################################################
00346 
00347 sub dumpTreeMultipleAlignmentToWorkdir {
00348   my $self = shift;
00349   my $protein_tree = shift;
00350 
00351   my $alignment_edits = $self->param('alignment_edits');
00352 
00353   my $leafcount = scalar(@{$protein_tree->get_all_leaves});
00354   if($leafcount<3) {
00355     printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n", 
00356            $protein_tree->node_id);
00357     return undef;
00358   }
00359 
00360   my $file_root = $self->worker_temp_directory. "proteintree_". $protein_tree->node_id;
00361   $file_root =~ s/\/\//\//g;  # converts any // in path to /
00362 
00363   my $aln_file = $file_root . '.aln';
00364   return $aln_file if(-e $aln_file);
00365   if($self->debug) {
00366     printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $leafcount);
00367     print("aln_file = '$aln_file'\n");
00368   }
00369 
00370   open(OUTSEQ, ">$aln_file")
00371     or $self->throw("Error opening $aln_file for write");
00372 
00373   # Using append_taxon_id will give nice seqnames_taxonids needed for
00374   # njtree species_tree matching
00375   my %sa_params = $self->param('use_genomedb_id') ? ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00376 
00377   my %split_genes;
00378 
00379   ########################################
00380   # Gene split mirroring code
00381   #
00382   # This will have the effect of grouping the different
00383   # fragments of a gene split event together in a subtree
00384   #
00385   unless ($self->param('gs_mirror') =~ /FALSE/) {
00386     my $holding_node = $alignment_edits->holding_node;
00387     foreach my $link (@{$holding_node->links}) {
00388       my $node1 = $link->get_neighbor($holding_node);
00389       my $protein1 = $protein_tree->find_leaf_by_node_id($node1->node_id);
00390       #print STDERR "node1 ", $node1, " ", $protein1, "\n";
00391       my $name1 = ($protein1->member_id)."_".($self->param('use_genomedb_id') ? $protein1->genome_db_id : $protein1->taxon_id);
00392       my $cdna = $protein1->cdna_alignment_string;
00393       #print STDERR "cnda1 $cdna\n";
00394       foreach my $node2 (@{$node1->all_nodes_in_graph}) {
00395         my $protein2 = $protein_tree->find_leaf_by_node_id($node2->node_id);
00396         #print STDERR "node2 ", $node2, " ", $protein2, "\n";
00397         next if $node2->node_id eq $node1->node_id;
00398         my $name2 = ($protein2->member_id)."_".($self->param('use_genomedb_id') ? $protein2->genome_db_id : $protein2->taxon_id);
00399         $split_genes{$name2} = $name1;
00400         #print STDERR Dumper(%split_genes);
00401         print STDERR "Joining in ", $protein1->stable_id, " / $name1 and ", $protein2->stable_id, " / $name2 in input cdna alignment\n" if ($self->debug);
00402         my $other_cdna = $protein2->cdna_alignment_string;
00403         $cdna =~ s/-/substr($other_cdna, pos($cdna), 1)/eg;
00404         #print STDERR "cnda2 $cdna\n";
00405       }
00406       $protein1->{'cdna_alignment_string'} = $cdna;
00407         # We start with the original cdna alignment string of the first gene, and
00408         # add the position in the other cdna for every gap position, and iterate
00409         # through all the other cdnas
00410         # cdna1 = AAA AAA AAA AAA AAA --- --- --- --- --- --- --- --- --- --- --- ---
00411         # cdna2 = --- --- --- --- --- --- TTT TTT TTT TTT TTT --- --- --- --- --- ---
00412         # become
00413         # cdna1 = AAA AAA AAA AAA AAA --- TTT TTT TTT TTT TTT --- --- --- --- --- ---
00414         # and now then paired with 3, they becomes the full gene model:
00415         # cdna3 = --- --- --- --- --- --- --- --- --- --- --- --- CCC CCC CCC CCC CCC
00416         # and form:
00417         # cdna1 = AAA AAA AAA AAA AAA --- TTT TTT TTT TTT TTT --- CCC CCC CCC CCC CCC
00418         # We then directly override the cached cdna_alignment_string
00419         # hash, which will be used next time is called for
00420     }
00421   }
00422   ########################################
00423 
00424   print STDERR "fetching alignment\n" if ($self->debug);
00425   my $sa = $protein_tree->get_SimpleAlign
00426     (
00427      -id_type => 'MEMBER',
00428      -cdna=>$self->param('cdna'),
00429      -stop2x => 1,
00430      %sa_params
00431     );
00432   # Removing duplicate sequences of split genes
00433   print STDERR "split_genes hash: ", Dumper(%split_genes), "\n" if $self->debug;
00434   foreach my $gene_to_remove (keys %split_genes) {
00435     $sa->remove_seq($sa->each_seq_with_id($gene_to_remove));
00436   }
00437   $self->param('split_genes', \%split_genes);
00438 
00439   if ($self->param('jackknife')) {
00440     # my $coverage_hash;
00441     my $empty_hash;
00442     foreach my $seq ($sa->each_seq) {
00443       my $sequence = $seq->seq;
00444       $sequence =~ s/\-//g;
00445       my $full_length = length($sequence);
00446       $sequence =~ s/N//g;
00447       my $covered_length = length($sequence);
00448       my $coverage_proportion = $covered_length/$full_length;
00449       my $empty_length = $full_length - $covered_length;
00450       # $coverage_hash->{$coverage_proportion} = $seq->display_id if ($coverage_proportion < 1);
00451       $empty_hash->{$empty_length} = $seq->display_id if ($empty_length > 0);
00452     }
00453     my @lowest = sort {$b<=>$a} keys %$empty_hash;
00454     my $i = 0;
00455     while ($i < $self->param('jackknife')) {
00456       $sa->remove_seq($sa->each_seq_with_id($empty_hash->{$lowest[$i]}));
00457       $i++;
00458     }
00459     $sa = $sa->remove_gaps(undef,1);
00460   }
00461   $sa->set_displayname_flat(1);
00462 
00463   my $alignIO = Bio::AlignIO->newFh
00464     (
00465      -fh => \*OUTSEQ,
00466      -format => "fasta"
00467     );
00468   print $alignIO $sa;
00469 
00470   close OUTSEQ;
00471 
00472   return $aln_file;
00473 }
00474 
00475 sub store_proteintree
00476 {
00477     my $self = shift;
00478 
00479     my $tree = $self->param('protein_tree') or return;
00480     my $tree_adaptor = $self->param('protein_tree_adaptor');
00481 
00482     printf("PHYML::store_proteintree\n") if($self->debug);
00483 
00484     $tree->build_leftright_indexing(1);
00485     $tree_adaptor->store($tree);
00486     $tree_adaptor->delete_nodes_not_in_tree($tree);
00487 
00488     if($self->debug >1) {
00489         print("done storing - now print\n");
00490         $tree->print_tree;
00491     }
00492 
00493     $self->store_tags($tree);
00494 
00495     if($self->param('jackknife')) {
00496         my $leaf_count = $tree->num_leaves;
00497         $tree->tree->store_tag( 'gene_count', $leaf_count );
00498     }
00499     $self->_store_tree_tags;
00500 
00501 }
00502 
00503 sub store_tags
00504 {
00505     my $self = shift;
00506     my $node = shift;
00507 
00508     if (not $node->is_leaf) {
00509         my $node_type;
00510         if ($node->has_tag('node_type')) {
00511             $node_type = $node->get_tagvalue('node_type');
00512         } elsif ($node->get_tagvalue("DD", 0)) {
00513             $node_type = 'dubious';
00514         } elsif ($node->get_tagvalue('Duplication', '') eq '1') {
00515             $node_type = 'duplication';
00516         } else {
00517             $node_type = 'speciation';
00518         }
00519         $node->store_tag('node_type', $node_type);
00520         if ($self->debug) {
00521             print "store node_type: $node_type"; $node->print_node;
00522         }
00523     }
00524 
00525     if ($node->has_tag("E")) {
00526         my $n_lost = $node->get_tagvalue("E");
00527         $n_lost =~ s/.{2}//;        # get rid of the initial $-
00528         my @lost_taxa = split('-', $n_lost);
00529         foreach my $taxon (@lost_taxa) {
00530             if ($self->debug) {
00531                 printf("store lost_taxon_id : $taxon "); $node->print_node;
00532             }
00533             $node->store_tag('lost_taxon_id', $taxon, 1);
00534         }
00535     }
00536 
00537     my %mapped_tags = ('B' => 'bootstrap', 'SIS' => 'species_intersection_score', 'T' => 'tree_support');
00538     foreach my $tag (keys %mapped_tags) {
00539         if ($node->has_tag($tag)) {
00540             my $value = $node->get_tagvalue($tag);
00541             my $db_tag = $mapped_tags{$tag};
00542             # Because the duplication_confidence_score won't be computed for dubious nodes
00543             $db_tag = 'duplication_confidence_score' if ($node->get_tagvalue('node_type') eq 'dubious' and $tag eq 'SIS');
00544             $node->store_tag($db_tag, $value);
00545             if ($self->debug) {
00546                 printf("store $tag as $db_tag: $value"); $node->print_node;
00547             }
00548         }
00549     }
00550 
00551     foreach my $child (@{$node->children}) {
00552         $self->store_tags($child);
00553     }
00554 }
00555 
00556 sub parse_newick_into_proteintree {
00557   my $self = shift;
00558   my $newick_file = shift;
00559 
00560   my $tree = $self->param('protein_tree');
00561   
00562   #cleanup old tree structure- 
00563   #  flatten and reduce to only GeneTreeMember leaves
00564   $tree->flatten_tree;
00565   $tree->print_tree(20) if($self->debug);
00566   foreach my $node (@{$tree->get_all_leaves}) {
00567     next if($node->isa('Bio::EnsEMBL::Compara::GeneTreeMember'));
00568     $node->disavow_parent;
00569   }
00570 
00571   #parse newick into a new tree object structure
00572   my $newick = '';
00573   print("load from file $newick_file\n") if($self->debug);
00574   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00575   while(<FH>) { $newick .= $_;  }
00576   close(FH);
00577 
00578   my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick, "Bio::EnsEMBL::Compara::GeneTreeNode");
00579   $newtree->print_tree(20) if($self->debug > 1);
00580 
00581   my $nsplits = 0;
00582   my $split_genes = $self->param('split_genes');
00583   print STDERR "Retrieved split_genes hash: ", Dumper($split_genes) if $self->debug;
00584 
00585   while ( my ($name, $other_name) = each(%{$split_genes})) {
00586         print STDERR "$name is split_gene of $other_name\n" if $self->debug;
00587         my $node = new Bio::EnsEMBL::Compara::GeneTreeNode;
00588         $node->name($name);
00589         my $othernode = $newtree->find_node_by_name($other_name);
00590         print STDERR "$node is split_gene of $othernode\n" if $self->debug;
00591         my $newnode = new Bio::EnsEMBL::Compara::GeneTreeNode;
00592         $nsplits++;
00593         $newnode->node_id(-$nsplits);
00594         $othernode->parent->add_child($newnode);
00595         $newnode->add_child($othernode);
00596         $newnode->add_child($node);
00597         $newnode->add_tag('node_type', 'gene_split');
00598         $newnode->print_tree(10);
00599     }
00600 
00601   # get rid of the taxon_id needed by njtree -- name tag
00602   foreach my $leaf (@{$newtree->get_all_leaves}) {
00603     my $njtree_phyml_name = $leaf->get_tagvalue('name');
00604     $njtree_phyml_name =~ /(\d+)\_\d+/;
00605     my $member_id = $1;
00606     $leaf->add_tag('name', $member_id);
00607   }
00608   $newtree->print_tree(20) if($self->debug > 1);
00609 
00610   # Leaves of newick tree are named with member_id of members from
00611   # input tree move members (leaves) of input tree into newick tree to
00612   # mirror the 'member_id' nodes
00613   foreach my $member (@{$tree->get_all_leaves}) {
00614     my $tmpnode = $newtree->find_node_by_name($member->member_id);
00615     if($tmpnode) {
00616       $member->Bio::EnsEMBL::Compara::AlignedMember::copy($tmpnode);
00617       bless $tmpnode, 'Bio::EnsEMBL::Compara::GeneTreeMember';
00618       $tmpnode->node_id($member->node_id);
00619       $tmpnode->adaptor($member->adaptor);
00620     } else {
00621       print("unable to find node in newick for member");
00622       $member->print_member;
00623     }
00624   }
00625 
00626   $newtree->node_id($tree->node_id);
00627   $newtree->adaptor($tree->adaptor);
00628   $newtree->tree($tree->tree);
00629   $self->param('protein_tree', $newtree);
00630   # to keep the link to the super-tree
00631   if ($tree->has_parent) {
00632       $tree->parent->add_child($newtree);
00633    }
00634 
00635   # Newick tree is now empty so release it
00636   $tree->release_tree;
00637 
00638   $newtree->print_tree if($self->debug);
00639   # check here on the leaf to test if they all are GeneTreeMembers as
00640   # minimize_tree/minimize_node might not work properly
00641   foreach my $leaf (@{$newtree->get_all_leaves}) {
00642     unless($leaf->isa('Bio::EnsEMBL::Compara::GeneTreeMember')) {
00643       $self->throw("Phyml tree does not have all leaves as GeneTreeMembers\n");
00644     }
00645   }
00646   return $newtree;
00647 }
00648 
00649 sub _store_tree_tags {
00650     my $self = shift;
00651     my $tree = $self->param('protein_tree');
00652     my $pta = $self->compara_dba->get_ProteinTreeAdaptor;
00653 
00654     print "Storing Tree tags...\n";
00655 
00656     my @leaves = @{$tree->get_all_leaves};
00657     my @nodes = @{$tree->get_all_nodes};
00658 
00659     # Tree number of leaves.
00660     my $tree_num_leaves = scalar(@leaves);
00661     $tree->tree->store_tag("tree_num_leaves",$tree_num_leaves);
00662 
00663     # Tree number of human peptides contained.
00664     my $num_hum_peps = 0;
00665     foreach my $leaf (@leaves) {
00666     $num_hum_peps++ if ($leaf->taxon_id == 9606);
00667     }
00668     $tree->tree->store_tag("tree_num_human_peps",$num_hum_peps);
00669 
00670     # Tree max root-to-tip distance.
00671     my $tree_max_length = $tree->max_distance;
00672     $tree->tree->store_tag("tree_max_length",$tree_max_length);
00673 
00674     # Tree max single branch length.
00675     my $tree_max_branch = 0;
00676     foreach my $node (@nodes) {
00677         my $dist = $node->distance_to_parent;
00678         $tree_max_branch = $dist if ($dist > $tree_max_branch);
00679     }
00680     $tree->tree->store_tag("tree_max_branch",$tree_max_branch);
00681 
00682     # Tree number of duplications and speciations.
00683     my $num_dups = 0;
00684     my $num_specs = 0;
00685     foreach my $node (@nodes) {
00686         my $node_type = $node->get_tagvalue("node_type");
00687         if ((defined $node_type) and ($node_type ne 'speciation')) {
00688             $num_dups++;
00689         } else {
00690             $num_specs++;
00691         }
00692     }
00693     $tree->tree->store_tag("tree_num_dup_nodes",$num_dups);
00694     $tree->tree->store_tag("tree_num_spec_nodes",$num_specs);
00695 
00696     print "Done storing stuff!\n" if ($self->debug);
00697 }
00698 
00699 sub check_for_split_genes {
00700   my $self = shift;
00701   my $protein_tree = $self->param('protein_tree');
00702 
00703   my $alignment_edits = $self->param('alignment_edits', new Bio::EnsEMBL::Compara::Graph::ConnectedComponentGraphs);
00704 
00705   my $tmp_time = time();
00706 
00707   my @all_protein_leaves = @{$protein_tree->get_all_leaves};
00708   printf("%1.3f secs to fetch all leaves\n", time()-$tmp_time) if ($self->debug);
00709 
00710   if($self->debug) {
00711     printf("%d proteins in tree\n", scalar(@all_protein_leaves));
00712   }
00713   printf("build paralogs graph\n") if($self->debug);
00714   my @genepairlinks;
00715   my $graphcount = 0;
00716   my $tree_node_id = $protein_tree->node_id;
00717   while (my $protein1 = shift @all_protein_leaves) {
00718     foreach my $protein2 (@all_protein_leaves) {
00719       next unless ($protein1->genome_db_id == $protein2->genome_db_id);
00720       my $genepairlink = new Bio::EnsEMBL::Compara::Graph::Link
00721         (
00722          $protein1, $protein2, 0
00723         );
00724       $genepairlink->add_tag("tree_node_id", $tree_node_id);
00725       push @genepairlinks, $genepairlink;
00726       print STDERR "build graph $graphcount\n" if ($graphcount++ % 10 == 0);
00727     }
00728   }
00729   printf("%1.3f secs build links and features\n", time()-$tmp_time) if($self->debug>1);
00730 
00731   # We sort the pairings by seq_region (chr) name, then by distance between
00732   # the start of link_node pairs.
00733   # This is to try to do the joining up of cdnas in the best order in
00734   # cases of e.g. 2 cases of 3-way split genes in same species.
00735   my @sorted_genepairlinks = sort { 
00736     $a->{_link_node1}->chr_name <=> $b->{_link_node1}->chr_name 
00737  || $a->{_link_node2}->chr_name <=> $b->{_link_node2}->chr_name 
00738  || abs($a->{_link_node1}->chr_start - $a->{_link_node2}->chr_start) <=> abs($b->{_link_node1}->chr_start - $b->{_link_node2}->chr_start) } @genepairlinks;
00739 
00740   foreach my $genepairlink (@sorted_genepairlinks) {
00741     my $type = 'within_species_paralog';
00742     my ($protein1, $protein2) = $genepairlink->get_nodes;
00743     my ($cigar_line1, $perc_id1, $perc_pos1,
00744         $cigar_line2, $perc_id2, $perc_pos2) = 
00745         $self->generate_attribute_arguments($protein1, $protein2,$type);
00746     print STDERR "Pair: ", $protein1->stable_id, " - ", $protein2->stable_id, "\n" if ($self->debug);
00747 
00748     # Checking for gene_split cases
00749     if ($type eq 'within_species_paralog' && 0 == $perc_id1 && 0 == $perc_id2 && 0 == $perc_pos1 && 0 == $perc_pos2) {
00750 
00751       # Condition A1: If same seq region and less than 1MB distance
00752       my $gene_member1 = $protein1->gene_member; my $gene_member2 = $protein2->gene_member;
00753       if ($gene_member1->chr_name eq $gene_member2->chr_name 
00754           && (1000000 > abs($gene_member1->chr_start - $gene_member2->chr_start)) 
00755           && $gene_member1->chr_strand eq $gene_member2->chr_strand ) {
00756 
00757         # Condition A2: there have to be the only 2 or 3 protein coding
00758         # genes in the range defined by the gene pair. This should
00759         # strictly be 2, only the pair in question, but in clean perc_id
00760         # = 0 cases, we allow for 2+1: the rare case where one extra
00761         # protein coding gene is partially or fully embedded in another.
00762         my $start1 = $gene_member1->chr_start; my $start2 = $gene_member2->chr_start; my $starttemp;
00763         my $end1 = $gene_member1->chr_end; my $end2 = $gene_member2->chr_end; my $endtemp;
00764         if ($start1 > $start2) { $starttemp = $start1; $start1 = $start2; $start2 = $starttemp; }
00765         if ($end1   <   $end2) {   $endtemp = $end1;     $end1 = $end2;     $end2 = $endtemp; }
00766         my $strand1 = $gene_member1->chr_strand; my $taxon_id1 = $gene_member1->taxon_id; my $name1 = $gene_member1->chr_name;
00767         print STDERR "Checking split genes overlap\n";
00768         my @genes_in_range = @{$self->param('member_adaptor')->_fetch_all_by_source_taxon_chr_name_start_end_strand_limit('ENSEMBLGENE',$taxon_id1,$name1,$start1,$end1,$strand1,4)};
00769 
00770         if (3 < scalar @genes_in_range) {
00771           foreach my $gene (@genes_in_range) {
00772             print STDERR "More than 2 genes in range...";
00773             print STDERR "Genes in range ($start1,$end1,$strand1): ", $gene->stable_id,",", $gene->chr_start,",", $gene->chr_end,"\n";
00774           }
00775           next;
00776         }
00777         $alignment_edits->add_connection($protein1->node_id, $protein2->node_id);
00778       }
00779 
00780 
00781       # This is a second level of contiguous gene split events, more
00782       # stringent on contig but less on alignment, for "skidding"
00783       # alignment cases.
00784 
00785       # These cases take place when a few of the aminoacids in the
00786       # alignment have been wrongly displaced from the columns that
00787       # correspond to the fragment, so the identity level is slightly
00788       # above 0. This small number of misplaced aminoacids look like
00789       # "skid marks" in the alignment view.
00790 
00791       # Condition B1: all 4 percents below 10
00792     } elsif ($type eq 'within_species_paralog' 
00793              && $perc_id1 < 10 
00794              && $perc_id2 < 10 
00795              && $perc_pos1 < 10 
00796              && $perc_pos2 < 10) {
00797       my $gene_member1 = $protein1->gene_member; my $gene_member2 = $protein2->gene_member;
00798 
00799     # Condition B2: If non-overlapping and smaller than 500kb start and 500kb end distance
00800       if ($gene_member1->chr_name eq $gene_member2->chr_name 
00801           && (500000 > abs($gene_member1->chr_start - $gene_member2->chr_start)) 
00802           && (500000 > abs($gene_member1->chr_end - $gene_member2->chr_end)) 
00803           && (($gene_member1->chr_start - $gene_member2->chr_start)*($gene_member1->chr_end - $gene_member2->chr_end)) > 1
00804           && $gene_member1->chr_strand eq $gene_member2->chr_strand ) {
00805 
00806     # Condition B3: they have to be the only 2 genes in the range:
00807         my $start1 = $gene_member1->chr_start; my $start2 = $gene_member2->chr_start; my $starttemp;
00808         my $end1 = $gene_member1->chr_end; my $end2 = $gene_member2->chr_end; my $endtemp;
00809         if ($start1 > $start2) { $starttemp = $start1; $start1 = $start2; $start2 = $starttemp; }
00810         if ($end1   <   $end2) {   $endtemp = $end1;     $end1 = $end2;     $end2 = $endtemp; }
00811         my $strand1 = $gene_member1->chr_strand; my $taxon_id1 = $gene_member1->taxon_id; my $name1 = $gene_member1->chr_name;
00812 
00813         my @genes_in_range = @{$self->param('member_adaptor')->_fetch_all_by_source_taxon_chr_name_start_end_strand_limit('ENSEMBLGENE',$taxon_id1,$name1,$start1,$end1,$strand1,4)};
00814         if (2 < scalar @genes_in_range) {
00815           foreach my $gene (@genes_in_range) {
00816             print STDERR "More than 2 genes in range...";
00817             print STDERR "Genes in range ($start1,$end1,$strand1): ", $gene->stable_id,",", $gene->chr_start,",", $gene->chr_end,"\n";
00818           }
00819           next;
00820         }
00821 
00822     # Condition B4: discard if the smaller protein is 1/10 or less of the larger and all percents above 2
00823         my $len1 = length($protein1->sequence); my $len2 = length($protein2->sequence); my $temp;
00824         if ($len1 < $len2) { $temp = $len1; $len1 = $len2; $len2 = $temp; }
00825         if ($len1/$len2 > 10 && $perc_id1 > 2 && $perc_id2 > 2 && $perc_pos1 > 2 && $perc_pos2 > 2) {
00826           next;
00827         }
00828         $alignment_edits->add_connection($protein1->node_id, $protein2->node_id);
00829       }
00830     }
00831   }
00832 
00833   printf("%1.3f secs label gene splits\n", time()-$tmp_time) if($self->debug>1);
00834 
00835   if($self->debug) {
00836     printf("%d pairings\n", scalar(@sorted_genepairlinks));
00837   }
00838 
00839   return 1;
00840 }
00841 
00842 
00843 1;