Archive Ensembl HomeArchive Ensembl Home
LowCoverageGenomeAlignment.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::Runnable::EpoLowCoverage::LowCoverageGenomeAlignment
00022 
00023 =head1 SYNOPSIS
00024 
00025 =head1 DESCRIPTION
00026 
00027 This module acts as a layer between the Hive system and the Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment module since the ensembl-analysis API does not know about ensembl-compara
00028 
00029 This module is the alternative module to Ortheus.pm for aligning low coverage (2X) genomes. This takes an existing high coverage alignment and maps the pairwise BlastZ-Net alignments of the low coverage genomes to the human sequence in the high coverage alignment. Any insertions in the low coverage alignments are removed, that is, no gaps are added to the human sequence. In regions where there are duplications, a phylogenetic tree is generated using either treeBest where there are more than 3 sequences in the alignment or semphy where there are 3 or less sequences. This module is still under development.
00030 
00031 =head1 APPENDIX
00032 
00033 The rest of the documentation details each of the object methods. 
00034 Internal methods are usually preceded with a _
00035 
00036 =cut
00037 
00038 package Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::LowCoverageGenomeAlignment;
00039 
00040 use strict;
00041 use Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment;
00042 use Bio::EnsEMBL::Compara::DnaFragRegion;
00043 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00044 use Bio::EnsEMBL::Utils::Exception;
00045 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00046 use Bio::EnsEMBL::Compara::NestedSet;
00047 use Bio::EnsEMBL::Compara::GenomicAlignGroup;
00048 use Data::Dumper;
00049 
00050 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00051 
00052 
00053 =head2 fetch_input
00054 
00055     Arg        :   -none-
00056     Example    :   $self->fetch_input
00057     Description:   Fetches input data for the module from the database
00058     Returntype :   none
00059     Excptions  :
00060     Caller     :
00061     Status     :   At risk
00062 
00063 =cut
00064 
00065 sub fetch_input {
00066   my( $self) = @_;
00067 
00068   if (!$self->param('method_link_species_set_id')) {
00069     throw("MethodLinkSpeciesSet->dbID is not defined for this LowCoverageAlignment job");
00070   }
00071 
00072   #set default to do transactions
00073   if (!defined $self->param('do_transactions')) {
00074       $self->param('do_transactions', 1);
00075   }
00076 
00077   #load from genomic_align_block ie using in 2X mode
00078   $self->_load_GenomicAligns($self->param('genomic_align_block_id'));
00079 
00080   if ($self->param('genomic_aligns')) {
00081       #load 2X genomes
00082       $self->_load_2XGenomes($self->param('genomic_align_block_id'));
00083 
00084       ## Get the tree string by taking into account duplications and deletions. Resort dnafrag_regions
00085       ## in order to match the name of the sequences in the tree string (seq1, seq2...)
00086       if ($self->get_species_tree) {
00087       $self->_build_tree_string;
00088       }
00089       ## Dumps fasta files for the DnaFragRegions. Fasta files order must match the entries in the
00090       ## newick tree. The order of the files will match the order of sequences in the tree_string.
00091       
00092       #create multi-fasta file with 2X genomes 
00093       $self->_create_mfa;
00094       
00095       $self->_dump_fasta_and_mfa;
00096 
00097   } else {
00098     #throw("Cannot start alignment because some information is missing");
00099     print("No valid genomic_aligns left in genomic_align_block. Unable to start alignment\n") if ($self->debug);
00100   }
00101   return 1;
00102 }
00103 
00104 =head2 run
00105 
00106     Arg        : -none-
00107     Example    : $self->run
00108     Description: runs the  LowCoverageGenomeAlignment analysis module and 
00109                   parses the results
00110     Returntype : none
00111     Exceptions : none
00112     Caller     :
00113     Status     :   At risk
00114 
00115 =cut
00116 
00117 sub run
00118 {
00119   my $self = shift;
00120 
00121   #check that have genomic_aligns (may not have if removed some because they
00122   #consist entirely of Ns)
00123   return if (!$self->param('genomic_aligns'));
00124 
00125   print "tmp " . $self->worker_temp_directory . " mfa=" . $self->param('multi_fasta_file') . " tree=" . $self->tree_string . " taxon=" . $self->get_taxon_tree . "\n" if $self->debug;
00126 
00127   my $runnable = new Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment(
00128       -analysis => $self->analysis,
00129       -workdir => $self->worker_temp_directory,
00130       -multi_fasta_file => $self->param('multi_fasta_file'),
00131       -tree_string => $self->tree_string,
00132       -taxon_species_tree => $self->get_taxon_tree,
00133       );
00134   $self->param('runnable', $runnable);
00135 
00136   #disconnect pairwise compara database
00137   if (defined $self->param('pairwise_compara_dba')) {
00138       foreach my $dba (values %{$self->param('pairwise_compara_dba')}) {
00139       $dba->dbc->disconnect_if_idle;
00140       }
00141   }
00142 
00143   #disconnect compara database
00144   $self->compara_dba->dbc->disconnect_if_idle;
00145 
00146   $runnable->run_analysis;
00147   $self->_parse_results();
00148 }
00149 
00150 =head2 write_output
00151 
00152     Arg        : -none
00153     Example    : $self->write_output
00154     Description: stores results in a database
00155     Returntype : none
00156     Exceptions : none
00157     Caller     :
00158     Status     : At risk
00159 
00160 =cut
00161 sub write_output {
00162     my ($self) = @_;
00163 
00164     print "WRITE OUTPUT\n" if $self->debug;
00165 
00166     if ($self->param('do_transactions')) {
00167     my $compara_conn = $self->compara_dba->dbc;
00168 
00169     my $compara_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $compara_conn);
00170     $compara_helper->transaction(-CALLBACK => sub {
00171          $self->_write_output;
00172          });
00173     } else {
00174     $self->_write_output;
00175     }
00176 
00177   return 1;
00178 
00179 }
00180 
00181 sub _write_output {
00182   my ($self) = @_;
00183 
00184   #check that have genomic_aligns (may not have if removed some because they
00185   #consist entirely of Ns)
00186   if (!$self->param('genomic_aligns')) {
00187       #do not produce gerp jobs
00188       $self->input_job->autoflow(0);
00189       return 1;
00190   }
00191 
00192   my $use_fresh_connection = 1;
00193   my $skip_left_right_index = 0;
00194 
00195   #Set use_autoincrement to 1 otherwise the GenomicAlignBlockAdaptor will use
00196   #LOCK TABLES which does an implicit commit and prevent any rollback
00197   $self->compara_dba->get_GenomicAlignBlockAdaptor->use_autoincrement(1);
00198   
00199   my $mlssa = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor;
00200   my $mlss = $mlssa->fetch_by_dbID($self->param('method_link_species_set_id'));
00201   my $mlss_id = $mlss->dbID;
00202   my $dnafrag_adaptor = $self->compara_dba->get_DnaFragAdaptor;
00203   my $gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
00204   my $gaa = $self->compara_dba->get_GenomicAlignAdaptor;
00205   
00206   my $gaga = $self->compara_dba->get_GenomicAlignGroupAdaptor;
00207   
00208   my $gata = $self->compara_dba->get_GenomicAlignTreeAdaptor;
00209 
00210   foreach my $genomic_align_tree (@{$self->param('runnable')->output}) {
00211       my $all_nodes;
00212 
00213       foreach my $genomic_align_node (@{$genomic_align_tree->get_all_nodes}) {
00214       next if (!defined $genomic_align_node->genomic_align_group);
00215       foreach my $genomic_align (@{$genomic_align_node->genomic_align_group->get_all_GenomicAligns}) {
00216           $genomic_align->adaptor($gaa);
00217           
00218           $genomic_align->method_link_species_set($mlss);
00219           $genomic_align->visible(1);
00220       }
00221       }
00222       my $split_trees;
00223       #restrict genomic_align_tree if it is too long and store as groups
00224       
00225       #need to do it this way in case have no ancestral nodes
00226       my $gat_length;
00227       $all_nodes = $genomic_align_tree->get_all_leaves;
00228       $gat_length = $all_nodes->[0]->length;
00229       
00230       if ($self->param('max_block_size') && $gat_length >  $self->param('max_block_size')) {
00231       for (my $start = 1; $start <= $gat_length; $start += $self->param('max_block_size')) {
00232           my $end = $start+$self->param('max_block_size')-1;
00233           if ($end > $gat_length) {
00234           $end = $gat_length;
00235           }
00236           my $new_gat = $genomic_align_tree->restrict_between_alignment_positions($start, $end, "skip_empty_GenomicAligns");
00237           push @$split_trees, $new_gat;
00238       }
00239       $gata->store_group($split_trees);
00240       foreach my $tree (@$split_trees) {
00241           $self->_write_gerp_dataflow($tree->modern_genomic_align_block_id, $mlss);
00242       }
00243       } else {
00244       #If commit is causing a problem again, could theoretically not store the left and right indexes at all and hence
00245       #remove the need to call NestedSetAdaptor.pm. However, this would mean making other alterations to the API which use
00246       #the left and right indexes.
00247       #       $gata->store($genomic_align_tree, "skip_left_right_indexes");
00248       $gata->store($genomic_align_tree, $skip_left_right_index);
00249       $self->_write_gerp_dataflow($genomic_align_tree->modern_genomic_align_block_id,
00250                       $mlss);
00251       }
00252   }
00253 
00254   chdir("$self->worker_temp_directory");
00255   foreach(glob("*")){
00256       #DO NOT COMMENT THIS OUT!!! (at least not permenantly). Needed
00257       #to clean up after each job otherwise you get files left over from
00258       #the previous job.
00259       unlink($_);
00260   }
00261   
00262   return 1;
00263 }
00264 
00265 sub _write_gerp_dataflow {
00266     my ($self, $gab_id, $mlss) = @_;
00267     
00268     my $species_set = "[";
00269     my $genome_db_set  = $mlss->species_set;
00270     
00271     foreach my $genome_db (@$genome_db_set) {
00272     $species_set .= $genome_db->dbID . ","; 
00273     }
00274     $species_set .= "]";
00275     
00276     my $output_id = "{genomic_align_block_id=>" . $gab_id . ",species_set=>" .  $species_set . "}";
00277     $self->dataflow_output_id($output_id, 2);
00278 }
00279 
00280 =head2 _parse_results
00281 
00282     Arg        : none
00283     Example    : $self->_parse_results
00284     Description: parse the alignment and tree files
00285     Returntype : none
00286     Exceptions : 
00287     Caller     : run
00288     Status     : At risk
00289 
00290 =cut
00291 
00292 sub _parse_results {
00293     my ($self) = @_;
00294 
00295     #Taken from Analysis/Runnable/LowCoverageGenomeAlignment.pm module
00296     print "PARSE RESULTS\n" if $self->debug;
00297 
00298     ## The output file contains one fasta aligned sequence per original sequence + ancestral sequences.
00299     ## The first seq. corresponds to the fist leaf of the tree, the second one will be an internal
00300     ## node, the third is the second leaf and so on. The fasta header in the result file correspond
00301     ## to the names of the leaves for the leaf nodes and to the concatenation of the names of all the
00302     ## underlying leaves for internal nodes. For instance:
00303     ## ----------------------------------
00304     ## >0
00305     ## ACTTGG--CCGT
00306     ## >0_1
00307     ## ACTTGGTTCCGT
00308     ## >1
00309     ## ACTTGGTTCCGT
00310     ## >1_2_3
00311     ## ACTTGCTTCCGT
00312     ## >2
00313     ## CCTTCCTTCCGT
00314     ## ----------------------------------
00315     ## The sequence of fasta files and leaves in the tree have the same order. If Ortheus is run
00316     ## with a given tree, the sequences in the file follow the tree. If Ortheus estimate the tree,
00317     ## the tree output file contains also the right order of files:
00318     ## ----------------------------------
00319     ## ((1:0.0157,0:0.0697):0.0000,2:0.0081);
00320     ## /tmp/file3.fa /tmp/file1.fa /tmp/file2.fa
00321     ## ----------------------------------
00322 
00323     my $workdir;
00324     my $tree_file = $self->worker_temp_directory . "/output.$$.tree";
00325 
00326     #print "tree_file $tree_file\n";
00327 
00328     my $ordered_fasta_files = $self->fasta_files;
00329 
00330     #Check for existance of self->tree_string. It will only exist if there are no duplications
00331     #in the high coverage alignment. If there are duplications, self->tree_string will not 
00332     #exist and treebest will have been run to generate a tree.
00333     unless ($self->tree_string) {
00334     if (-e $tree_file) {
00335         ## Estimated tree. Overwrite the order of the fasta files and get the tree
00336         open(F, $tree_file) || throw("Could not open tree file <$tree_file>");
00337 
00338         my ($newick, $files);
00339         while (<F>) {
00340         $newick .= $_;
00341         }
00342         close(F);
00343         $newick =~ s/[\r\n]+$//;
00344         $self->tree_string($newick);
00345         
00346         #print "newick $newick\n";
00347     
00348         my $this_tree =
00349           Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
00350         
00351         #created tree via semphy or treebest
00352         $ordered_fasta_files = undef;
00353         my $all_leaves = $this_tree->get_all_leaves;
00354         foreach my $this_leaf (@$all_leaves) {
00355         push @$ordered_fasta_files, $this_leaf->name . ".fa";
00356         }
00357         
00358         $self->fasta_files(@$ordered_fasta_files);
00359         #print STDOUT "**NEWICK: $newick\nFILES: ", join(" -- ", @$ordered_fasta_files), "\n";
00360     } else {
00361         throw "Expected to find $tree_file created by treebest, but did not\n";
00362     }
00363     }
00364 
00365     my (@ordered_leaves) = $self->tree_string =~ /[(,]([^(:)]+)/g;
00366     #print "++NEWICK: ", $self->tree_string, "\nLEAVES: ", join(" -- ", @ordered_leaves), "\nFILES: ", join(" -- ", @{$self->fasta_files}), "\n";
00367 
00368     my $alignment_file;
00369     #read in mfa file created for input into treebest
00370     $alignment_file = $self->param('multi_fasta_file');
00371 
00372     my $this_genomic_align_block = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
00373     open(F, $alignment_file) || throw("Could not open $alignment_file");
00374     my $seq = "";
00375     my $this_genomic_align;
00376 
00377     #Create genomic_align_group object to store genomic_aligns for
00378     #each node. For 2x genomes, there may be several genomic_aligns
00379     #for a node but for other genomes there will only be one
00380     #genomic_align in the genomic_align_group
00381     my $genomic_align_group;
00382 
00383     my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->tree_string);
00384     $tree->print_tree(100) if ($self->debug);
00385     
00386     print $tree->newick_format("simple"), "\n";
00387     print join(" -- ", map {$_->name} @{$tree->get_all_leaves}), "\n";
00388     print "Reading $alignment_file...\n";
00389     my $ids;
00390 
00391     foreach my $this_file (@$ordered_fasta_files) {
00392     push(@$ids, qx"head -1 $this_file");
00393     #print "add ids $this_file " . $ids->[-1] . "\n";
00394     }
00395     #print join(" :: ", @$ids), "\n\n";
00396 
00397     my $genomic_aligns_2x_array = [];
00398     my @num_frag_pads;
00399     my $frag_limits;
00400     my @ga_lengths;
00401     my $ga_deletions;
00402 
00403     while (<F>) {
00404     next if (/^\s*$/);
00405     chomp;
00406     ## FASTA headers correspond to the tree and the order of the leaves in the tree corresponds
00407     ## to the order of the files
00408 
00409     if (/^>/) {
00410         print "PARSING $_\n" if ($self->debug);
00411         #print $tree->newick_format(), "\n" if ($self->debug);
00412         my ($name) = $_ =~ /^>(.+)/;
00413         if (defined($this_genomic_align) and  $seq) {
00414         if (@$genomic_aligns_2x_array) {
00415             print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
00416             #starting offset
00417             my $offset = $num_frag_pads[0];
00418             #how many X's to add at the start of the cigar_line
00419             my $start_X;
00420 
00421             #how many X's to add to the end of the cigar_line
00422             my $end_X;
00423 
00424             my $align_offset = 0;
00425             for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
00426             my $genomic_align = $genomic_aligns_2x_array->[$i];
00427             my $num_pads = $num_frag_pads[$i+1];
00428             my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
00429 
00430             #print "ga-length $ga_length " . $genomic_align->dnafrag_start . " " . $genomic_align->dnafrag_end . " " , $ga_lengths[$i] . "\n";
00431 
00432             my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $align_offset, $ga_lengths[$i]);
00433 
00434             $align_offset = $aligned_end;
00435             #print "final subseq $aligned_start $aligned_end $subseq\n";
00436             #Add aligned sequence
00437             $genomic_align->aligned_sequence($subseq);
00438 
00439             my $cigar_line = create_2x_cigar_line($subseq, $ga_deletions->[$i]);
00440             $genomic_align->cigar_line($cigar_line);
00441 
00442 
00443             #Add X padding characters to ends of seq
00444             $start_X = $aligned_start;
00445             $end_X = length($seq) - ($start_X+length($subseq));
00446 
00447             print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
00448 
00449             #print "before cigar_line " . $genomic_align->cigar_line . "\n";
00450 
00451             
00452             $genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
00453 
00454             #print "after cigar_line " . $genomic_align->cigar_line . "\n";
00455 
00456             #free aligned_sequence now that I've used it to 
00457             #create the cigar_line
00458             undef($genomic_align->{'aligned_sequence'});
00459 
00460             #Add genomic align to genomic align block
00461             $this_genomic_align_block->add_GenomicAlign($genomic_align);
00462             #$offset += $num_pads + $ga_length;
00463             $offset += $ga_length;
00464             }
00465             $genomic_aligns_2x_array = [];
00466             undef @num_frag_pads;
00467             undef @ga_lengths;
00468             undef $ga_deletions;
00469             undef $frag_limits;
00470         } else {
00471 
00472             print "add aligned_sequence " . $this_genomic_align->dnafrag_id . " " . $this_genomic_align->dnafrag_start . " " . $this_genomic_align->dnafrag_end . "\n" if $self->debug;
00473 
00474             $this_genomic_align->aligned_sequence($seq);
00475 
00476             #need to add original sequence here because the routine
00477             #remove_empty_columns can delete parts of the alignment and
00478             #so the original_sequence cannot be reconstructed from the
00479             #aligned_sequence
00480             if ($this_genomic_align->dnafrag_id == -1) {
00481             $this_genomic_align->original_sequence;
00482             }
00483             #undef aligned_sequence now. Necessary because otherwise 
00484             #when I remove_empty_columns, this
00485             #modifies the cigar_line only and not the aligned_sequence
00486             #so not removing it here causes the genomic_align_block
00487             #length to be wrong since it finds the length of the
00488             #aligned_sequence
00489             $this_genomic_align->cigar_line;
00490             undef($this_genomic_align->{'aligned_sequence'});
00491 
00492             $this_genomic_align_block->add_GenomicAlign($this_genomic_align);
00493 
00494         }
00495         }
00496         my $header = shift(@$ids);
00497         $this_genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
00498 
00499         if (!defined($header)) {
00500         print "INTERNAL NODE $name\n" if ($self->debug);
00501         my $this_node;
00502         foreach my $this_leaf_name (split("_", $name)) {
00503             if ($this_node) {
00504             my $other_node = $tree->find_node_by_name($this_leaf_name);
00505             if (!$other_node) {
00506                 throw("Cannot find node <$this_leaf_name>\n");
00507             }
00508             $this_node = $this_node->find_first_shared_ancestor($other_node);
00509             } else {
00510             print $tree->newick_format() if ($self->debug);
00511             print " LEAF: $this_leaf_name\n" if ($self->debug);
00512             $this_node = $tree->find_node_by_name($this_leaf_name);
00513             }
00514         }
00515         print join("_", map {$_->name} @{$this_node->get_all_leaves}), "\n" if ($self->debug);
00516         ## INTERNAL NODE: dnafrag_id and dnafrag_end must be edited somewhere else
00517 
00518         $this_genomic_align->dnafrag_id(-1);
00519         $this_genomic_align->dnafrag_start(1);
00520         $this_genomic_align->dnafrag_end(0);
00521         $this_genomic_align->dnafrag_strand(1);
00522 
00523         bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00524         $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
00525                 -type => "epo");
00526         $genomic_align_group->add_GenomicAlign($this_genomic_align);
00527 
00528         $this_node->genomic_align_group($genomic_align_group);
00529         $this_node->name($name);
00530         } elsif ($header =~ /^>SeqID(\d+)/) {
00531         #print "old $name\n";
00532         print "leaf_name?? $name\n" if ($self->debug);
00533 
00534         my $this_leaf = $tree->find_node_by_name($name);
00535         if (!$this_leaf) {
00536             #print $tree->newick_format(), " ****\n" if ($self->debug);
00537             print $tree->newick_format(), " ****\n";
00538             die "Unable to find leaf with name $name";
00539         }
00540         #print "$this_leaf\n";
00541         #         print "****** $name -- $header -- ";
00542         #         if ($this_leaf) {
00543         #           $this_leaf->print_node();
00544         #         } else {
00545         #           print "[none]\n";
00546         #         }
00547 
00548         #information extracted from fasta header
00549         my $seq_id = ($1);
00550 
00551         my $all_genomic_aligns = $self->param('genomic_aligns');
00552 
00553         my $ga = $all_genomic_aligns->[$seq_id-1];
00554 
00555         if (!UNIVERSAL::isa($ga, 'Bio::EnsEMBL::Compara::GenomicAlign')) {
00556             print "FOUND 2X GENOME\n" if $self->debug;
00557             print "num of frags " . @$ga . "\n" if $self->debug;
00558             print "*** NAME  " . $ga->[0]->{genomic_align}->genome_db->name . " start " . $ga->[0]->{genomic_align}->dnafrag_start . " end " . $ga->[0]->{genomic_align}->dnafrag_end . " name " . $ga->[0]->{genomic_align}->dnafrag->name . "\n" if $self->debug;
00559             #reorder fragments if reference slice is on the reverse
00560             #strand
00561             my $first_ref_ga = $ga->[0]->{ref_ga};
00562 
00563             if ($first_ref_ga->dnafrag_strand == -1) { 
00564               @{$ga} = sort {$b->{genomic_align_block}->reference_genomic_align->dnafrag_start <=> $a->{genomic_align_block}->reference_genomic_align->dnafrag_start} @{$ga};
00565           }
00566 
00567             #first pads
00568             push @num_frag_pads, $ga->[0]->{first_pads};
00569             #create new genomic_align for each pairwise fragment
00570             foreach my $ga_frag (@$ga) {
00571             my $genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
00572             my $genomic_align_block = $ga_frag->{genomic_align_block};
00573             my $non_ref_genomic_align = $genomic_align_block->get_all_non_reference_genomic_aligns->[0];
00574 
00575             $genomic_align->dnafrag_id($non_ref_genomic_align->dnafrag_id);
00576             $genomic_align->dnafrag_start($non_ref_genomic_align->dnafrag_start);
00577             $genomic_align->dnafrag_end($non_ref_genomic_align->dnafrag_end);
00578             $genomic_align->dnafrag_strand($non_ref_genomic_align->dnafrag_strand);
00579 
00580             print "store start " . $genomic_align->dnafrag_start . " end " . $genomic_align->dnafrag_end . " strand " . $genomic_align->dnafrag_strand . "\n" if $self->debug;
00581 
00582             #print "LENGTHS " . $ga_frag->{length} . "\n";
00583             push @$ga_deletions, $ga_frag->{deletions};
00584             push @ga_lengths, $ga_frag->{length};
00585             push @num_frag_pads, $ga_frag->{num_pads};
00586             push @$genomic_aligns_2x_array, $genomic_align;
00587             }
00588             #Add genomic align to genomic align group 
00589             $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
00590                                             #-genomic_align_array => $genomic_aligns_2x_array,
00591                                                 -type => "epo");
00592             foreach my $this_genomic_align (@$genomic_aligns_2x_array) {
00593             $genomic_align_group->add_GenomicAlign($this_genomic_align);
00594             }
00595 
00596             bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00597             $this_leaf->genomic_align_group($genomic_align_group);
00598             print "size of array " . @$genomic_aligns_2x_array . "\n" if $self->debug;
00599             print "store gag1 $this_leaf\n" if $self->debug;
00600 
00601             #$self->{$this_leaf} = $genomic_align_group;
00602         } else  {
00603             print "normal name " . $ga->genome_db->name . "\n" if $self->debug;
00604 
00605 
00606             $this_genomic_align->dnafrag_id($ga->dnafrag_id);
00607             $this_genomic_align->dnafrag_start($ga->dnafrag_start);
00608             $this_genomic_align->dnafrag_end($ga->dnafrag_end);
00609             $this_genomic_align->dnafrag_strand($ga->dnafrag_strand);
00610 
00611             $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
00612                                             #-genomic_align_array => [$this_genomic_align],
00613                                                 -type => "epo");
00614             $genomic_align_group->add_GenomicAlign($this_genomic_align);
00615 
00616             bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00617             $this_leaf->genomic_align_group($genomic_align_group);
00618             print "store gag2 $this_leaf\n" if $self->debug;
00619         }
00620         } else {
00621         throw("Error while parsing the FASTA header. It must start by \">DnaFrag#####\" where ##### is the dnafrag_id\n$_");
00622         }
00623         $seq = "";
00624     } else {
00625         $seq .= $_;
00626     }
00627     }
00628     close F;
00629 
00630     #last genomic_align
00631     print "Last genomic align\n" if ($self->debug);
00632     if (@$genomic_aligns_2x_array) {
00633     print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
00634 
00635     #starting offset
00636     my $offset = $num_frag_pads[0];
00637 
00638     #how many X's to add at the start and end of the cigar_line
00639     my ($start_X , $end_X);
00640     
00641     my $align_offset = 0;
00642     for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
00643         my $genomic_align = $genomic_aligns_2x_array->[$i];
00644 
00645         my $num_pads = $num_frag_pads[$i+1];
00646         my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
00647 
00648         print "extract_sequence $offset " .($offset+$ga_length) . " num pads $num_pads\n" if ($self->debug); 
00649         my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $align_offset, $ga_lengths[$i]);
00650 
00651         $align_offset = $aligned_end;
00652 
00653 #       #Add aligned sequence
00654         $genomic_align->aligned_sequence($subseq);
00655 
00656         my $cigar_line = create_2x_cigar_line($subseq, $ga_deletions->[$i]);
00657         $genomic_align->cigar_line($cigar_line);
00658 
00659 #       #Add X padding characters to ends of seq
00660         $start_X = $aligned_start;
00661         $end_X = length($seq) - ($start_X+length($subseq));
00662         print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
00663         
00664         $genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
00665         
00666         #free aligned_sequence now that I've used it to 
00667         #create the cigar_line
00668         undef($genomic_align->{'aligned_sequence'});
00669 
00670         #Add genomic align to genomic align block
00671         $this_genomic_align_block->add_GenomicAlign($genomic_align);
00672         $offset += $num_pads + $ga_length;
00673     }
00674     } else {
00675     if (defined $this_genomic_align && 
00676         $this_genomic_align->dnafrag_id != -1) {
00677         $this_genomic_align->aligned_sequence($seq);
00678         $this_genomic_align_block->add_GenomicAlign($this_genomic_align);
00679     }
00680     }
00681 
00682     #Where there is no ancestral sequences ie 2X genomes
00683     #convert all the nodes of the tree to Bio::EnsEMBL::GenomicAlignTree objects
00684     foreach my $this_node (@{$tree->get_all_nodes}) {
00685     if (!UNIVERSAL::isa($this_node, 'Bio::EnsEMBL::Compara::GenomicAlignTree')) {
00686         bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00687     }
00688     }
00689 
00690     #fetch group_id from base alignment block if there is one
00691     my $multi_gab_id = $self->param('genomic_align_block_id');
00692     my $multi_gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
00693     my $multi_gab = $multi_gaba->fetch_by_dbID($multi_gab_id);
00694     my $group_id = $multi_gab->group_id;
00695 
00696     #fix the group_id so that it starts with the current mlss_id not that of
00697     #the base alignment. Will always do this.
00698     if ($group_id) {
00699     $group_id = _fix_internal_ids($multi_gab->group_id, $multi_gab->method_link_species_set_id, $self->param('method_link_species_set_id'));
00700     } 
00701     $tree->group_id($group_id);
00702 
00703     #print $tree->newick_format("simple"), "\n";
00704     #print join(" -- ", map {$_."+".$_->node_id."+".$_->name} (@{$tree->get_all_nodes()})), "\n";
00705     #$self->output([$tree]);
00706     $self->param('runnable')->output([$tree]);
00707 }
00708 
00709 #Fix the group_id so that it starts with the current mlss_id not that of
00710 #the base alignment. 
00711 sub _fix_internal_ids {
00712     my ($group_id, $multi_mlss_id, $new_mlss_id) = @_;
00713     my $multiplier = 10**10;
00714     my $lower_limit = $multi_mlss_id * $multiplier;
00715     my $upper_limit = ($multi_mlss_id+1) * $multiplier;
00716     my $new_group_id;
00717     my $new_lower_limit = $new_mlss_id * $multiplier;
00718 
00719 
00720     #group_id has previous fix applied
00721     if ($group_id > $lower_limit && $group_id < $upper_limit) {
00722     $new_group_id = $group_id - $lower_limit + $new_lower_limit;
00723     } elsif ($group_id < $multiplier) {
00724     #group_id has had no fix applied
00725     $new_group_id = $group_id + $new_lower_limit;
00726     } else {
00727     #fix has already been applied!
00728     $new_group_id = $group_id;
00729     }
00730     return $new_group_id;
00731 }
00732 
00733 #create cigar line for 2x genomes manually because I need to add in the
00734 #insertions, that is "I" in the cigar_line to represent the 2x-only sequences
00735 #that are not found in the reference species which I removed during the
00736 #creation of the _create_mfa routine.
00737 sub create_2x_cigar_line {
00738     my ($aligned_sequence, $ga_deletions) = @_;
00739 
00740     my $cigar_line = "";
00741     my $base_pos = 0;
00742     my $current_deletion;
00743     if (defined $ga_deletions && @$ga_deletions > 0) {
00744     $current_deletion = shift @$ga_deletions;
00745     }
00746     
00747     my @pieces = grep {$_} split(/(\-+)|(\.+)/, $aligned_sequence);
00748     foreach my $piece (@pieces) {
00749     my $elem;
00750 
00751     #length of current piece
00752     my $this_len = length($piece);
00753     
00754     my $mode;
00755     if ($piece =~ /\-/) {
00756         $mode = "D"; # D for gaps (deletions)
00757         $elem = cigar_element($mode, $this_len);
00758     } elsif ($piece =~ /\./) {
00759         $mode = "X"; # X for pads (in 2X genomes)
00760         $elem = cigar_element($mode, $this_len);
00761     } else {
00762         $mode = "M"; # M for matches/mismatches
00763         my $next_pos = $base_pos + $this_len;
00764 
00765         #TODO need special case if have insertion as the last base.
00766         #need to have >= and < (not <=) otherwise if an insertion occurs
00767         #in the same position as a - then I is added twice.
00768 
00769         #check to see if next deletion occurs in this cigar element
00770         if (defined $current_deletion && 
00771         $current_deletion->{pos} >= $base_pos && 
00772         $current_deletion->{pos} < $next_pos) {
00773         
00774         #find all deletions that occur in this cigar element
00775         my $this_del_array;
00776         while ($current_deletion->{pos} >= $base_pos && 
00777                $current_deletion->{pos} < $next_pos) {
00778             push @$this_del_array, $current_deletion;
00779 
00780             last if (@$ga_deletions == 0);
00781             $current_deletion = shift @$ga_deletions;
00782         } 
00783         
00784         #loop through all deletions, adding them instead of this cigar element
00785         my $prev_pos = $base_pos;
00786         foreach my $this_del (@$this_del_array) {
00787             my $piece_len = ($this_del->{pos} - $prev_pos);
00788             $elem .= cigar_element($mode, $piece_len);
00789             $elem .= cigar_element("I", $this_del->{len});
00790             $prev_pos = $this_del->{pos};
00791             
00792         }
00793         #add final bit
00794         $elem .= cigar_element($mode, ($base_pos+$this_len) - $this_del_array->[-1]->{pos});
00795         } else {
00796         $elem = cigar_element($mode, $this_len);
00797         }
00798         
00799         $base_pos += $this_len;
00800         #print "LENGTH $this_len BASE POS $base_pos\n";
00801     }
00802     $cigar_line .= $elem;
00803     }   
00804     #print "cigar $cigar_line\n";
00805     return $cigar_line;
00806 }
00807 
00808 #create cigar element from mode and length
00809 sub cigar_element {
00810     my ($mode, $len) = @_;
00811     my $elem;
00812     if ($len == 1) {
00813     $elem = $mode;
00814     } elsif ($len > 1) { #length can be 0 if the sequence starts with a gap
00815     $elem = $len.$mode;
00816     }
00817     return $elem;
00818 }
00819 
00820 #check the new cigar_line is consistent ie the seq_length and number of (M+I) 
00821 #agree and the alignment length and total of cig_elems agree.
00822 sub check_cigar_line {
00823     my ($genomic_align, $total_gap) = @_;
00824 
00825     #can't check ancestral nodes because these don't have a dnafarg_start
00826     #or dnafrag_end.
00827     return if ($genomic_align->dnafrag_id == -1);
00828 
00829     my $seq_pos = 0;
00830     my $align_len = 0;
00831     my $cigar_line = $genomic_align->cigar_line;
00832     my $length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
00833     my $gab = $genomic_align->genomic_align_block;
00834 
00835     my @cig = ( $cigar_line =~ /(\d*[GMDXI])/g );
00836     for my $cigElem ( @cig ) {
00837     my $cigType = substr( $cigElem, -1, 1 );
00838     my $cigCount = substr( $cigElem, 0 ,-1 );
00839     $cigCount = 1 unless ($cigCount =~ /^\d+$/);
00840 
00841     if( $cigType eq "M" ) {
00842         $seq_pos += $cigCount;
00843     } elsif( $cigType eq "I") {
00844         $seq_pos += $cigCount;
00845     } elsif( $cigType eq "X") {
00846     } elsif( $cigType eq "G" || $cigType eq "D") {  
00847     }
00848     if ($cigType ne "I") {
00849         $align_len += $cigCount;
00850     }
00851     }
00852 
00853     throw ("Cigar line aligned length $align_len does not match (genomic_align_block_length (" . $gab->length . ") - num of gaps ($total_gap)) " . ($gab->length - $total_gap) . " for gab_id " . $gab->dbID . "\n")
00854       if ($align_len != ($gab->length - $total_gap));
00855 
00856     throw("Cigar line ($seq_pos) does not match sequence length $length\n") 
00857       if ($seq_pos != $length);
00858 }
00859 
00860 
00861 
00862 #If a gap has been found in a cig_elem of type M, need to split it into
00863 #firstM - I - lastM. This function adds firstM and I to new_cigar_line
00864 sub add_match_elem {
00865     my ($firstM, $gap_len, $new_cigar_line) = @_;
00866 
00867     #add firstM
00868     if ($firstM == 1) {
00869     $new_cigar_line .= "M";
00870     } elsif($firstM > 1) {
00871     $new_cigar_line .= $firstM . "M";
00872     } 
00873     
00874     if ($gap_len == 1) {
00875     $new_cigar_line .= "I";
00876     } elsif ($gap_len > 1) {
00877     $new_cigar_line .= $gap_len . "I";
00878     } 
00879     return ($new_cigar_line);
00880 }
00881 
00882 #
00883 # Extract the sequence corresponding to the 2X genome fragment
00884 # extracts subsequence from seq starting from aligned_start (alignment coords)
00885 # for $seq_length bases (not counting pads)
00886 sub _extract_sequence {
00887     my ($seq, $aligned_start, $seq_length) = @_;
00888     my $curr_length = 0;
00889     my $aligned_length = 0;
00890     #my $aligned_start;
00891     my $aligned_end;
00892 
00893     #print "original_start $aligned_start length $seq_length\n";
00894 
00895     #create new seq starting from aligned_start to the end
00896     my $new_seq = substr($seq, $aligned_start);
00897 
00898     #find the end in alignment coords counting seq_length bases.
00899     foreach my $subseq (grep {$_} split /(\-+)/, $new_seq) {
00900     #print "subseq $subseq\n";
00901     my $length = length($subseq);
00902     if ($subseq !~ /\-/) {
00903         if (!defined($aligned_end) && ($curr_length + $length >= $seq_length)) {
00904         $aligned_end = $aligned_length + ($seq_length - $curr_length) + $aligned_start;
00905         #print "aligned_end $aligned_end\n";
00906         last;
00907         }
00908         #length in bases
00909         $curr_length += $length;
00910     }
00911     #length in alignment coords
00912     $aligned_length += $length;
00913     }
00914     
00915     my $subseq = substr($seq, $aligned_start, ($aligned_end-$aligned_start));
00916     return ($subseq, $aligned_start, $aligned_end);
00917 }
00918 
00919 ##########################################
00920 #
00921 # getter/setter methods
00922 # 
00923 ##########################################
00924 
00925 sub fasta_files {
00926   my $self = shift;
00927 
00928   $self->param('_fasta_files', []) unless (defined $self->param('_fasta_files'));
00929   my $fasta_files = $self->param('_fasta_files');
00930 
00931   if (@_) {
00932     my $value = shift;
00933     push @$fasta_files, $value;
00934   }
00935   $self->param('_fasta_files', $fasta_files);
00936 
00937   return $self->param('_fasta_files');
00938 }
00939 
00940 sub species_order {
00941   my $self = shift;
00942 
00943   $self->param('_species_order', []) unless (defined $self->param('_species_order'));
00944   my $species_order = $self->param('_species_order');
00945 
00946   if (@_) {
00947     my $value = shift;
00948     push @$species_order, $value;
00949   }
00950   $self->param('_species_order', $species_order);
00951 
00952   return $self->param('_species_order');
00953 }
00954 
00955 sub get_species_tree {
00956   my $self = shift;
00957 
00958   my $newick_species_tree;
00959   if (defined($self->param('_species_tree'))) {
00960     return $self->param('_species_tree');
00961   } elsif ($self->param('tree_string')) {
00962     $newick_species_tree = $self->param('tree_string');
00963   } elsif ($self->param('tree_analysis_data_id')) {
00964       print "Taking tree from tree_analysis_data. Not implemented yet\n";
00965       #Leo has a new method for doing this so I don't need the analysis data adaptor here
00966       #my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
00967       #$newick_species_tree = $analysis_data_adaptor->fetch_by_dbID($self->param('tree_analysis_data_id'));
00968   } elsif ($self->param('tree_file')) {
00969       print "Taking tree from tree_file\n";
00970     open(TREE_FILE, $self->param('tree_file')) or throw("Cannot open file ".$self->param('tree_file'));
00971     $newick_species_tree = join("", <TREE_FILE>);
00972     close(TREE_FILE);
00973   }
00974 
00975   if (!defined($newick_species_tree)) {
00976     return undef;
00977   }
00978 
00979   $newick_species_tree =~ s/^\s*//;
00980   $newick_species_tree =~ s/\s*$//;
00981   $newick_species_tree =~ s/[\r\n]//g;
00982 
00983   my $species_tree =
00984       Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick_species_tree);
00985 
00986   #if the tree leaves are species names, need to convert these into genome_db_ids
00987   my $genome_dbs = $self->compara_dba->get_GenomeDBAdaptor->fetch_all();
00988 
00989   my %leaf_name;
00990   my %leaf_check;
00991   foreach my $genome_db (@$genome_dbs) {
00992       my $name = $genome_db->name;
00993       $name =~ tr/ /_/;
00994       $leaf_name{$name} = $genome_db->dbID;
00995       if ($name ne "ancestral_sequences") {
00996       $leaf_check{$genome_db->dbID} = 2;
00997       }
00998   }
00999   foreach my $leaf (@{$species_tree->get_all_leaves}) {
01000       #check have names rather than genome_db_ids
01001       if ($leaf->name =~ /\D+/) {
01002       $leaf->name(($leaf_name{lc($leaf->name)}));
01003       }
01004       $leaf_check{$leaf->name}++;
01005   }
01006 
01007   #Check have one instance in the tree of each genome_db in the database
01008   #Don't worry about having extra elements in the tree that aren't in the
01009   #genome_db table because these will be removed later
01010   foreach my $name (keys %leaf_check) {
01011       if ($leaf_check{$name} == 2) {
01012       throw("Unable to find genome_db_id $name in species_tree\n");
01013       }
01014   }
01015   
01016 
01017   $self->param('_species_tree', $species_tree);
01018   return $self->param('_species_tree');
01019 }
01020 
01021 sub get_taxon_tree {
01022   my $self = shift;
01023 
01024   my $newick_taxon_tree;
01025   if (defined($self->param('_taxon_tree'))) {
01026       #already read in taxon_tree, simply return
01027     return $self->param('_taxon_tree');
01028   } elsif ($self->param('taxon_tree')) {
01029       #read from a string
01030       $newick_taxon_tree = $self->param('taxon_tree');
01031   } elsif ($self->param('taxon_tree_analysis_data_id')) {
01032       #read from analysis_data table
01033       #Leo has a new method for doing this so I don't need the analysis data adaptor here
01034       #my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
01035       #$newick_taxon_tree = $analysis_data_adaptor->fetch_by_dbID($self->param('taxon_tree_analysis_data_id'));
01036       print "Taking taxon tree from analysis_data. Not implemented yet\n";
01037   } elsif ($self->param('taxon_tree_file')) {
01038       print "Taking taxon tree from file\n";
01039       #read from file
01040       open(TREE_FILE, $self->param('taxon_tree_file')) or throw("Cannot open file ".$self->param('taxon_tree_file'));
01041       $newick_taxon_tree = join("", <TREE_FILE>);
01042       close(TREE_FILE);
01043   }
01044 
01045   if (!defined($newick_taxon_tree)) {
01046     return undef;
01047   }
01048 
01049   $self->param('_taxon_tree', $newick_taxon_tree);
01050 
01051   return $self->param('_taxon_tree');
01052   
01053 }
01054 
01055 sub tree_string {
01056   my ($self, $tree_string) = @_;
01057   $self->param('_tree_string', $tree_string) if(defined $tree_string);
01058   return $self->param('_tree_string');
01059 }
01060 
01061 sub _load_GenomicAligns {
01062   my ($self, $genomic_align_block_id) = @_;
01063   my $genomic_aligns = [];
01064 
01065   # Fail if dbID has not been provided
01066   return $genomic_aligns if (!$genomic_align_block_id);
01067 
01068   my $gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
01069   my $gab = $gaba->fetch_by_dbID($genomic_align_block_id);
01070 
01071   foreach my $ga (@{$gab->get_all_GenomicAligns}) {  
01072       #check that the genomic_align sequence is not just N's. This causes 
01073       #complications with treeBest and we end up with very long branch lengths
01074 
01075       my $slice = $ga->get_Slice();
01076       my $seqlevel = $slice->coord_system->adaptor->fetch_sequence_level();
01077       my @projection = @{$slice->project($seqlevel->name(), $seqlevel->version())};
01078       if (@projection > 0) {
01079       push(@{$genomic_aligns}, $ga);
01080       }
01081   }
01082 
01083   #only store genomic_aligns if there are more than 1 genomic_align left in the
01084   #genomic_align_block
01085   if (@$genomic_aligns > 1) {
01086       $self->param('genomic_aligns', $genomic_aligns);
01087   } 
01088 
01089 }
01090 
01091 =head2 _load_2XGenomes
01092 
01093   Arg [1]    : int genomic_align_block_id
01094   Arg [2]    : int analysis_data_id
01095   Description: Creates a fake assembly for each 2X genome by stitching
01096                together the BLASTZ_NET alignments found on this synteny_region
01097                between the reference species and each 2X genome. The list of
01098                the pairwise database locations and  
01099                Bio::EnsEMBL::Compara::MethodLinkSpeciesSet ids are obtained
01100                from the analysis_data_id. Creates a listref of genomic_align 
01101                fragments
01102   Returntype : 
01103   Exception  : 
01104   Warning    :
01105 
01106 =cut
01107 
01108 sub _load_2XGenomes {
01109   my ($self, $genomic_align_block_id) = @_;
01110 
01111   #extract location of pairwise blocks
01112   my $pairwise_exception_location = $self->param('pairwise_exception_location');
01113   my $default_location = $self->param('pairwise_default_location');
01114 
01115   my $pairwise_locations = $self->_construct_pairwise_locations();
01116 
01117 
01118   #if no 2x genomes defined, return
01119   if (!defined $pairwise_locations) {
01120       print "No 2x genomes to load\n" if $self->debug;
01121       return;
01122   }
01123 
01124   #Find the slice on the reference genome
01125   my $genome_db_adaptor = $self->compara_dba->get_GenomeDBAdaptor;
01126 
01127   #DEBUG this opens up connections to all the databases
01128   my $ref_genome_db = $genome_db_adaptor->fetch_by_name_assembly($self->param('reference_species'));
01129   my $ref_dba = $ref_genome_db->db_adaptor;
01130   my $ref_slice_adaptor = $ref_dba->get_SliceAdaptor();
01131 
01132   #Get multiple alignment genomic_align_block adaptor
01133   my $multi_gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
01134 
01135   #Find all the dnafrag_regions for the reference genome in this synteny region
01136   my $ref_gas =[];
01137   my $multi_gab = $multi_gaba->fetch_by_dbID($genomic_align_block_id);
01138   my $all_gas = $multi_gab->get_all_GenomicAligns;
01139 
01140   foreach my $ga (@$all_gas) {
01141       if ($ga->genome_db->dbID == $ref_genome_db->dbID) {
01142       push @$ref_gas, $ga;
01143       }
01144   }
01145   
01146   #Return if there is no reference sequence in this gab region
01147   if (scalar(@$ref_gas) == 0) {
01148       print "No " . $self->param('reference_species') . " sequences found in genomic_align_block $genomic_align_block_id\n";
01149       return;
01150   }
01151 
01152   print "GAB $genomic_align_block_id num ref copies " . scalar(@$ref_gas) . "\n" if $self->debug;
01153 
01154   #Find the BLASTZ_NET alignments between the reference species and each
01155   #2X genome.
01156 
01157   foreach my $mlss_id (keys %$pairwise_locations) {
01158       my $target_species;
01159 
01160       #open compara database containing 2x genome vs $ref_name blastz results
01161       #my $compara_db_url = $param->{'compara_db_url'};
01162       my $compara_db_url = $pairwise_locations->{$mlss_id};
01163 
01164       #if the database name is defined in the url, then open that
01165       my $compara_dba;
01166       my $locator;
01167       if ($compara_db_url =~ /mysql:\/\/.*@.*\/.+/) {
01168       #open database defined in url
01169       $locator = "Bio::EnsEMBL::Compara::DBSQL::DBAdaptor/url=>$compara_db_url";
01170       } else {
01171       throw "Invalid url $compara_db_url. Should be of the form: mysql://user:pass\@host:port/db_name\n";
01172       }
01173 
01174       $compara_dba = Bio::EnsEMBL::DBLoader->new($locator);
01175 
01176       #need to store this to allow disconnect when call ortheus
01177       my $pairwise_compara_dba = $self->param('pairwise_compara_dba');
01178       $pairwise_compara_dba->{$compara_dba->dbc->dbname} = $compara_dba;
01179 
01180       #Get pairwise genomic_align_block adaptor
01181       my $pairwise_gaba = $compara_dba->get_GenomicAlignBlockAdaptor;
01182 
01183       #Get pairwise method_link_species_set
01184       my $p_mlss_adaptor = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
01185       #my $pairwise_mlss = $p_mlss_adaptor->fetch_by_dbID($param->{'method_link_species_set_id'});
01186       my $pairwise_mlss = $p_mlss_adaptor->fetch_by_dbID($mlss_id);
01187 
01188       #find non_reference species name in pairwise alignment
01189       my $species_set = $pairwise_mlss->species_set;
01190       foreach my $genome_db (@$species_set) {
01191       if ($genome_db->name ne $self->param('reference_species')) {
01192           $target_species = $genome_db->name;
01193           last;
01194       }
01195       }
01196      
01197       my $target_genome_db = $genome_db_adaptor->fetch_by_name_assembly($target_species);
01198       my $target_dba = $target_genome_db->db_adaptor;
01199       my $target_slice_adaptor = $target_dba->get_SliceAdaptor();
01200 
01201       #Foreach copy of the ref_genome in the multiple alignment block, 
01202       #find the alignment blocks between the ref_genome and the 2x 
01203       #target_genome in the pairwise database
01204 
01205       my $ga_frag_array = $self->_create_frag_array($pairwise_gaba, $pairwise_mlss, $ref_gas);
01206   
01207       #not found 2x genome
01208       next if (!defined $ga_frag_array);
01209 
01210       #must first sort so I have a reasonable chance of finding duplicates
01211 
01212 
01213       #4/10/08 don't think sorting here is a good idea. The order is very
01214       #important for when I store the fragments and work out their offsets.
01215 #      for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
01216 #     @{$ga_frag_array->[$i]} = sort {$a->{genomic_align}->dnafrag_start <=> $b->{genomic_align}->dnafrag_start} @{$ga_frag_array->[$i]};
01217 #      }
01218     
01219       #find the total length of all the fragments in the ref_region
01220       my $sum_lengths;
01221       for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
01222       for (my $j = 0; $j < scalar(@{$ga_frag_array->[$i]}); $j++) {
01223           #print "*** gab *** " . $ga_frag_array->[$i][$j]->{genomic_align}->genome_db->name . " " . $ga_frag_array->[$i][$j]->{genomic_align}->genomic_align_block . "\n";
01224 
01225           $sum_lengths->[$i] += ($ga_frag_array->[$i][$j]->{genomic_align}->dnafrag_end - $ga_frag_array->[$i][$j]->{genomic_align}->dnafrag_start + 1);
01226       }
01227       }
01228 
01229       #check if there is any overlap between pairwise blocks on the ref_genomes
01230       #if there is an overlap, then choose ref_genome duplication which is the 
01231       #longest in 2x genome
01232       #if there is no overlap, save dnafrags on all duplications
01233       my $found_overlap;
01234       my $j = 0;
01235     
01236       #Simple case: only found one reference region containing 2x genome 
01237       #fragments
01238       if (@$ga_frag_array == 1) {
01239       my $cluster;
01240       $cluster = _add_to_cluster($cluster, 0);
01241       _print_cluster($cluster) if $self->debug;
01242       my $longest_ref_region = 0;
01243 
01244       print "SIMPLE CASE: longest_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
01245       
01246       #_build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
01247       
01248       my $ga_frag = $self->param('ga_frag');
01249       my $x2_dnafrag_region = $self->param('2x_dnafrag_region');
01250       push @$ga_frag, $ga_frag_array->[$longest_ref_region];
01251       push @$x2_dnafrag_region, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
01252       $self->param('ga_frag', $ga_frag);
01253       $self->param('2x_dnafrag_region', $x2_dnafrag_region);
01254 
01255       #push @{$self->param('ga_frag')}, $ga_frag_array->[$longest_ref_region];
01256       #push @{$self->param('2x_dnafrag_region')}, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
01257       next;
01258       }
01259 
01260       #Found more than one reference region in this synteny block
01261       for (my $region1 = 0; $region1 < scalar(@$ga_frag_array)-1; $region1++) {
01262       for (my $region2 = $region1+1; $region2 <  scalar(@$ga_frag_array); $region2++) {
01263           #initialise found_overlap hash
01264           if (!defined $found_overlap->{$region1}{$region2}) {
01265           $found_overlap->{$region1}{$region2} = 0;
01266           }
01267           
01268           #loop through the 2x genome fragments on region1
01269           for (my $j = 0; ($j < @{$ga_frag_array->[$region1]}); $j++) {
01270           
01271           #if I've already found an overlap, then stop
01272           last if ($found_overlap->{$region1}{$region2});
01273           
01274           #loop through 2x genome fragments on region2
01275           for (my $k = 0; ($k < @{$ga_frag_array->[$region2]}); $k++) {
01276 
01277             #if I've already found an overlap, then stop
01278               last if ($found_overlap->{$region1}{$region2});
01279               
01280               #check if 2x genome fragments have the same name
01281               if ($ga_frag_array->[$region1][$j]->{seq_region_name} eq $ga_frag_array->[$region2][$k]->{seq_region_name}) {
01282               
01283               #check these overlap
01284               if (($ga_frag_array->[$region1][$j]->{start} <= $ga_frag_array->[$region2][$k]->{end}) && ($ga_frag_array->[$region1][$j]->{end} >= $ga_frag_array->[$region2][$k]->{start})) {
01285 
01286                   $found_overlap->{$region1}{$region2} = 1;
01287                   print "found overlap $region1 $region2\n" if $self->debug;
01288                   #found overlap so can stop.
01289                   last;
01290               }
01291               }
01292           }
01293           }
01294       }
01295       }
01296 
01297       #Cluster all the alignment blocks that are overlapping together
01298       my $cluster = $self->_cluster_regions($found_overlap);
01299       _print_cluster($cluster) if $self->debug;
01300       my $longest_regions = $self->_find_longest_region_in_cluster($cluster, $sum_lengths);
01301 
01302       #find the reference with the longest region
01303       my $slice_array;
01304       foreach my $longest_ref_region (@$longest_regions) {
01305       print "longest_ref_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
01306 
01307       #store composite_seq in ga_frag_array->[$longest_ref_region]
01308       #_build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
01309       my $ga_frag = $self->param('ga_frag');
01310       my $x2_dnafrag_region = $self->param('2x_dnafrag_region');
01311       push @$ga_frag, $ga_frag_array->[$longest_ref_region];
01312       push @$x2_dnafrag_region, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
01313       $self->param('ga_frag', $ga_frag);
01314       $self->param('2x_dnafrag_region', $x2_dnafrag_region);
01315 
01316       #push @{$self->param('ga_frag')}, $ga_frag_array->[$longest_ref_region];
01317 
01318       #push @{$self->param('2x_dnafrag_region')}, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
01319 
01320       }
01321   } 
01322 }
01323 
01324 sub _construct_pairwise_locations {
01325     my $self = shift;
01326 
01327     my $pairwise_location;
01328 
01329     #list of mlss to be found in the default location (usually previous compara database)
01330     my @mlsss = eval($self->param('pairwise_default_mlss'));
01331 
01332     #Add exceptions
01333     $pairwise_location = $self->param('pairwise_exception_location');
01334 
01335     #Add defaults which may overwrite the exceptions if the exception mlss is now in
01336     #default location
01337     foreach my $mlss (@mlsss) {
01338     $pairwise_location->{$mlss} = $self->param('pairwise_default_location');
01339     }
01340 
01341     #Need a check here that I have pairwise_mlss for all the low coverage genomes.
01342     my $gab_adaptor = $self->compara_dba->get_GenomicAlignBlockAdaptor;
01343     my $gab = $gab_adaptor->fetch_by_dbID($self->param('genomic_align_block_id'));
01344     my $high_epo_mlss = $gab->method_link_species_set;
01345     my $genome_db_adaptor = $self->compara_dba->get_GenomeDBAdaptor;
01346 
01347     my $mlss_adaptor = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor;
01348     my $low_mlss = $mlss_adaptor->fetch_by_dbID($self->param('method_link_species_set_id'));
01349 
01350     my $low_coverage_species_set;
01351     my $species_set = $low_mlss->species_set;
01352 
01353     my %all_species_set;
01354     foreach my $genome_db (@$species_set) {
01355     $all_species_set{$genome_db->dbID} = 1;
01356     }
01357     #Find only low coverage species by removing the high coverage ones from the list of all of them
01358     foreach my $high_species (@{$high_epo_mlss->species_set}) {
01359     $all_species_set{$high_species->dbID} = 2;
01360     }
01361     foreach my $genome_db_id (keys %all_species_set) {
01362     if ($all_species_set{$genome_db_id} == 1) {
01363         push @$low_coverage_species_set, $genome_db_id;
01364     }
01365     }
01366 
01367     #Check found all low coverage pairwise mlss_ids
01368     if (@$low_coverage_species_set != scalar keys %$pairwise_location) {
01369     print("Failed to find all the pairwise low coverage species method_link_species_set ids\n");
01370 
01371     foreach my $genome_db_id (@$low_coverage_species_set) {
01372         my $species;
01373         my $found = 0;
01374         foreach my $mlss_id (keys %$pairwise_location) {
01375         
01376         my $compara_db_url = $pairwise_location->{$mlss_id};
01377         
01378         #if the database name is defined in the url, then open that
01379         my $compara_dba;
01380         my $locator;
01381         if ($compara_db_url =~ /mysql:\/\/.*@.*\/.+/) {
01382             #open database defined in url
01383             $locator = "Bio::EnsEMBL::Compara::DBSQL::DBAdaptor/url=>$compara_db_url";
01384         } else {
01385             throw "Invalid url $compara_db_url. Should be of the form: mysql://user:pass\@host:port/db_name\n";
01386         }
01387 
01388         $compara_dba = Bio::EnsEMBL::DBLoader->new($locator);
01389         
01390         my $mlss_adaptor = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
01391         my $mlss = $mlss_adaptor->fetch_by_dbID($mlss_id);
01392         $species = $genome_db_adaptor->fetch_by_dbID($genome_db_id)->name;
01393         
01394         foreach my $this_spp (@{$mlss->species_set}) {
01395             if ($this_spp->name ne $self->param('reference_species')) {
01396             if ($this_spp->dbID == $genome_db_id) {
01397                 $found = 1;
01398                 print "Found $species mlss $mlss_id " . $pairwise_location->{$mlss_id} . "\n";
01399                 last;
01400             }
01401             }
01402         }
01403         }
01404         if (!$found) {
01405         throw ("Failed to find a pairwise method_link_species_set for " . $species);
01406         }
01407     }
01408     }
01409 
01410     if ($self->debug) {
01411     foreach my $key (keys %$pairwise_location) {
01412         print "key $key " . $pairwise_location->{$key} . "\n";
01413     }
01414     }
01415     return $pairwise_location;
01416 }
01417 
01418 =head2 _dump_fasta_and_mfa
01419 
01420   Arg [1]    : -none-
01421   Example    : $self->_dump_fasta();
01422   Description: Dumps FASTA files in the order given by the tree
01423                string (needed by Pecan). Resulting file names are
01424                stored using the fasta_files getter/setter
01425   Returntype : 1
01426   Exception  :
01427   Warning    :
01428 
01429 =cut
01430 sub _dump_fasta_and_mfa {
01431   my $self = shift;
01432   my $all_genomic_aligns = $self->param('genomic_aligns');
01433 
01434   ## Dump FASTA files in the order given by the tree string (needed by Pecan)
01435   my @seqs;
01436   if ($self->tree_string) {
01437     @seqs = ($self->tree_string =~ /seq(\d+)/g);
01438   } else {
01439     @seqs = (1..scalar(@$all_genomic_aligns));
01440   }
01441 
01442   my $mfa_file = $self->worker_temp_directory . "/epo_alignment.$$.mfa";
01443   $self->param('multi_fasta_file', $mfa_file);
01444 
01445   print "mfa_file $mfa_file\n" if $self->debug;
01446   open MFA, ">$mfa_file" || throw("Couldn't open $mfa_file");
01447 
01448   foreach my $seq_id (@seqs) {
01449     my $ga = $all_genomic_aligns->[$seq_id-1];
01450 
01451     my $file = $self->worker_temp_directory . "/seq" . $seq_id;
01452 
01453     #Check if I have a DnaFragRegion object or my 2x genome object
01454     #if (!UNIVERSAL::isa($dfr, 'Bio::EnsEMBL::Compara::DnaFragRegion')) {
01455     if (!UNIVERSAL::isa($ga, 'Bio::EnsEMBL::Compara::GenomicAlign')) {
01456     print "FOUND 2X GENOME\n" if $self->debug;
01457     print "num of frags " . @$ga . "\n" if $self->debug;
01458     $self->_dump_2x_fasta($ga, $file, $seq_id, \*MFA);
01459     next;
01460     }
01461 
01462     #add taxon_id to end of fasta files
01463     $file .= "_" . $ga->genome_db->taxon_id . ".fa";
01464     print "file $file\n" if $self->debug;
01465     
01466     open F, ">$file" || throw("Couldn't open $file");
01467 
01468     print F ">SeqID" . $seq_id . "\n";
01469     #print MFA ">SeqID" . $seq_id . "\n";
01470 
01471     #print MFA ">seq" . $seq_id . "\n";
01472     print MFA ">seq" . $seq_id . "_" . $ga->genome_db->taxon_id . "\n";
01473 
01474     print ">DnaFrag", $ga->dnafrag->dbID, "|", $ga->dnafrag->name, ".",
01475         $ga->dnafrag_start, "-", $ga->dnafrag_end, ":", $ga->dnafrag_strand,"\n" if $self->debug;
01476 
01477     my $slice = $ga->get_Slice;
01478     throw("Cannot get slice for DnaFragRegion in DnaFrag #".$ga->dnafrag->dbID) if (!$slice);
01479     
01480     my $seq = $slice->get_repeatmasked_seq(undef, 1)->seq;
01481 
01482     if ($seq =~ /[^ACTGactgNnXx]/) {
01483       print STDERR $slice->name, " contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
01484       $seq =~ s/[^ACTGactgNnXx]/N/g;
01485     }
01486     $seq =~ s/(.{80})/$1\n/g;
01487 
01488     chomp $seq;
01489     print F $seq,"\n";
01490 
01491     close F;
01492 
01493     my $aligned_seq = $ga->aligned_sequence;
01494     $aligned_seq =~ s/(.{60})/$1\n/g;
01495     $aligned_seq =~ s/\n$//;
01496     print MFA $aligned_seq, "\n";
01497 
01498     push @{$self->fasta_files}, $file;
01499     push @{$self->species_order}, $ga->dnafrag->genome_db_id;
01500   }
01501   close MFA;
01502 
01503   return 1;
01504 }
01505 
01506 =head2 _build_tree_string
01507 
01508   Arg [1]    : -none-
01509   Example    : $self->_build_tree_string();
01510   Description: This method sets the tree_string using the orginal
01511                species tree and the set of DnaFragRegions. The
01512                tree is edited by the _update_tree method which
01513                resort the DnaFragRegions (see _update_tree elsewwhere
01514                in this document)
01515   Returntype : -none-
01516   Exception  :
01517   Warning    :
01518 
01519 =cut
01520 
01521 sub _build_tree_string {
01522   my $self = shift;
01523 
01524   my $tree = $self->get_species_tree->copy;
01525   return if (!$tree);
01526   
01527   $tree = $self->_update_tree_2x($tree);
01528 
01529   return if (!$tree);
01530 
01531   my $tree_string = $tree->newick_simple_format;
01532 
01533 
01534   # Remove quotes around node labels
01535   $tree_string =~ s/"(seq\d+)"/$1/g;
01536   # Remove branch length if 0
01537   $tree_string =~ s/\:0\.0+(\D)/$1/g;
01538   $tree_string =~ s/\:0([^\.\d])/$1/g;
01539 
01540   $tree->release_tree;
01541 
01542   $self->tree_string($tree_string);
01543 }
01544 
01545 
01546 =head2 _update_tree_2x
01547 
01548   Arg [1]    : Bio::EnsEMBL::Compara::NestedSet $tree_root
01549   Example    : $self->_update_nodes_names($tree);
01550   Description: This method updates the tree by removing or
01551                duplicating the leaves according to the orginal
01552                tree and the set of DnaFragRegions. The tree nodes
01553                will be renamed seq1, seq2, seq3 and so on and the
01554                DnaFragRegions will be resorted in order to match
01555                the names of the nodes (the first DnaFragRegion will
01556                correspond to seq1, the second to seq2 and so on).
01557   Returntype : Bio::EnsEMBL::Compara::NestedSet (a tree)
01558   Exception  :
01559   Warning    :
01560 
01561 =cut
01562 
01563 sub _update_tree_2x {
01564   my $self = shift;
01565   my $tree = shift;
01566 
01567   my $all_genomic_aligns = $self->param('genomic_aligns');
01568   my $ordered_genomic_aligns = [];
01569   my $ordered_2x_genomes = [];
01570 
01571   my $idx = 1;
01572   my $all_leaves = $tree->get_all_leaves;
01573   foreach my $this_leaf (@$all_leaves) {
01574     my $these_genomic_aligns = [];
01575     my $these_2x_genomes = [];
01576     ## Look for genomic_aligns belonging to this genome_db_id
01577     foreach my $this_genomic_align (@$all_genomic_aligns) {
01578       if ($this_genomic_align->dnafrag->genome_db_id == $this_leaf->name) {
01579         push (@$these_genomic_aligns, $this_genomic_align);
01580       }
01581     }
01582 
01583     my $index = 0;
01584     if ($self->param('ga_frag')) {
01585     foreach my $ga_frags (@{$self->param('ga_frag')}) {
01586         my $first_frag = $ga_frags->[0];
01587         
01588         #print "update_tree first_frag " . $first_frag->{genomic_align}->genome_db->dbID . " this leaf " . $this_leaf->name . "\n";
01589         if ($first_frag->{genomic_align}->dnafrag->genome_db->dbID == $this_leaf->name) {
01590         push(@$these_2x_genomes, $index);
01591         }
01592         $index++;
01593     }
01594     }
01595     print "num " . @$these_genomic_aligns . " " . @$these_2x_genomes . "\n" if $self->debug;
01596 
01597     if (@$these_genomic_aligns == 1) {
01598       ## If only 1 has been found...
01599     my $taxon_id = $these_genomic_aligns->[0]->dnafrag->genome_db->taxon_id;
01600       print "seq$idx" . "_" . $taxon_id . " genome_db_id=" . $these_genomic_aligns->[0]->dnafrag->genome_db_id . "\n" if $self->debug;
01601       
01602       $this_leaf->name("seq".$idx++."_".$taxon_id); #.".".$these_dnafrag_regions->[0]->dnafrag_id);
01603 
01604       push(@$ordered_genomic_aligns, $these_genomic_aligns->[0]);
01605 
01606     } elsif (@$these_genomic_aligns > 1) {
01607       ## If more than 1 has been found, let Ortheus estimate the Tree
01608     print "Let ortheus estimate the tree\n" if $self->debug;
01609     #need to add on 2x genomes to genomic_aligns array
01610     my $ga = $self->param('genomic_aligns');
01611     if ($self->param('ga_frag')) {
01612         foreach my $ga_frags (@{$self->param('ga_frag')}) {
01613         push @$ga, $ga_frags;
01614         }
01615     }
01616     $self->param('genomic_aligns', $ga);
01617     return undef;
01618 
01619    } elsif (@$these_2x_genomes == 1) {
01620     #See what happens...
01621     #Find 2x genomes
01622        my $ga_frags = $self->param('ga_frag')->[$these_2x_genomes->[0]];
01623        print "number of frags " . @$ga_frags . "\n" if $self->debug;
01624 
01625        my $taxon_id = $ga_frags->[0]->{taxon_id};
01626     print "2x seq$idx" . "_" . $taxon_id . " " . $ga_frags->[0]->{genome_db_id} . "\n" if $self->debug;
01627     $this_leaf->name("seq".$idx++."_".$taxon_id);
01628     #push(@$ordered_2x_genomes, $these_2x_genomes->[0]);
01629     push(@$ordered_genomic_aligns, $ga_frags);
01630    } else {
01631       ## If none has been found...
01632       $this_leaf->disavow_parent;
01633       $tree = $tree->minimize_tree;
01634     }
01635   }
01636 
01637   $self->param('genomic_aligns', $ordered_genomic_aligns);
01638 
01639   $self->param('ordered_2x_genomes', $ordered_2x_genomes);
01640 
01641   if ($tree->get_child_count == 1) {
01642     my $child = $tree->children->[0];
01643     $child->parent->merge_children($child);
01644     $child->disavow_parent;
01645   }
01646   
01647   return $tree;
01648 }
01649 
01650 #
01651 # Create array of 2x fragments defined by the $pairwise_mlss found for the reference 
01652 #genomic_aligns (may be more than one) in $ref_gas
01653 #
01654 sub _create_frag_array {
01655     my ($self, $gab_adaptor, $pairwise_mlss, $ref_gas) = @_;
01656 
01657     my $ga_frag_array;
01658 
01659     my $ga_num_ns = 0;
01660 
01661     #Multiple alignment reference genomic_aligns (maybe more than 1)
01662     foreach my $ref_ga (@$ref_gas) {
01663     print "  " . $ref_ga->dnafrag->name . " " . $ref_ga->dnafrag_start . " " . $ref_ga->dnafrag_end . " " . $ref_ga->dnafrag_strand . "\n" if $self->debug;
01664     
01665     #find the slice corresponding to the ref_genome
01666     my $slice = $ref_ga->get_Slice;
01667 
01668     print "ref_seq " . $slice->start . " " . $slice->end . " " . $slice->strand . " " . substr($slice->seq,0,120) . "\n" if $self->debug;
01669 
01670     #find the pairwise blocks between ref_genome and the 2x genome
01671     my $pairwise_gabs = $gab_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($pairwise_mlss, $slice, undef,undef,"restrict");
01672     
01673     #sort by reference_genomic_align start position (NB I sort again when parsing
01674     #the results if the ref strand is reverse since the fragments will be in the
01675     #reverse order ie A-B-C should be C-B-A). Don't do it here because I try to find
01676     #duplicates in load_2XGenomes.
01677     @$pairwise_gabs = sort {$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start} @$pairwise_gabs;
01678 
01679     print "    pairwise gabs " . scalar(@$pairwise_gabs) . "\n" if $self->debug;
01680     #if there are no pairwise matches found to 2x genome, then escape
01681     #back to loop
01682     next if (scalar(@$pairwise_gabs) == 0);
01683     
01684     my $ga_frags;
01685 
01686     #need to save each match separately but still use same structure as
01687     #create_span_frag_array in case we change our minds back again
01688     foreach my $pairwise_gab (@$pairwise_gabs) {
01689 
01690         #should only have 1!
01691         my $ga = $pairwise_gab->get_all_non_reference_genomic_aligns->[0];
01692 
01693         my $ref_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start;
01694         my $ref_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;
01695 
01696         #need to store gab too otherwise it goes out of scope and I can't
01697         #access it from ga
01698         my $ga_fragment = {genomic_align => $ga,
01699                    genomic_align_block => $ga->genomic_align_block,
01700                    taxon_id => $ga->genome_db->taxon_id,
01701                    genome_db_id => $ga->dnafrag->genome_db_id,
01702                    ref_ga => $ref_ga,
01703                   };
01704 
01705         print "GAB " . $ga_fragment->{genomic_align}->genome_db->name . " " . $ga_fragment->{genomic_align}->dnafrag_start . " " . $ga_fragment->{genomic_align}->dnafrag_end . " " . $ga_fragment->{genomic_align}->dnafrag_strand . " " . substr($ga_fragment->{genomic_align}->get_Slice->seq,0,120) . "\n" if $self->debug;
01706         push @$ga_frags, $ga_fragment;
01707     }
01708     #add to array of fragments for each reference genomic_align
01709     push @$ga_frag_array, $ga_frags;
01710     }
01711     return $ga_frag_array;
01712 }
01713 
01714 
01715 #foreach cluster, find the longest region.
01716 sub _find_longest_region_in_cluster {
01717     my ($self, $cluster, $sum_lengths) = @_;
01718 
01719     my $max_frag = 0;
01720     my $final_region = -1;
01721     my @overlap_frag;
01722 
01723     my $overlap_cnt = 0;
01724     my $not_overlap_cnt = 0;
01725     my $longest_clusters;
01726 
01727     foreach my $this_cluster (@$cluster) {
01728     my $longest_frag;
01729     my $longest_region;
01730 
01731     foreach my $region (keys %{$this_cluster}) {
01732         #initialise variables
01733         if (!defined $longest_frag) {
01734         $longest_frag = $sum_lengths->[$region];
01735         $longest_region = $region;
01736         }
01737 
01738         if ($sum_lengths->[$region] >= $longest_frag) {
01739         $longest_frag = $sum_lengths->[$region];
01740         $longest_region = $region;
01741         }
01742     }
01743     push @$longest_clusters, $longest_region;
01744     }
01745     print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
01746     return $longest_clusters;
01747 }
01748 
01749 #Put overlapping regions in the same cluster. If region 0 overlaps with region 
01750 #1 and region 2, but not with region 3, create 2 clusters: (0,1,2), (3)
01751 sub _cluster_regions {
01752     my ($self, $found_overlap) = @_;
01753 
01754     my $overlap_cnt = 0;
01755     my $not_overlap_cnt = 0;
01756 
01757     my $cluster;
01758 
01759     foreach my $region1 (keys %$found_overlap) {
01760     foreach my $region2 (keys %{$found_overlap->{$region1}}) {
01761         print "FOUND OVERLAP $region1 $region2 " . $found_overlap->{$region1}{$region2} . "\n" if $self->debug;
01762         if ($found_overlap->{$region1}{$region2}) {
01763         $overlap_cnt++;
01764 
01765         $cluster = _add_to_same_cluster($cluster, $region1, $region2);
01766         } else {
01767         $not_overlap_cnt++;
01768         $cluster = _add_to_different_cluster($cluster, $region1, $region2);
01769         }
01770     }
01771     }
01772     print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
01773     return $cluster;
01774 }
01775 
01776 #add single region to cluster. No overlaps found.
01777 sub _add_to_cluster {
01778     my ($cluster, $region1) = @_;
01779 
01780     if (!defined $cluster) {
01781      $cluster->[0]->{$region1} = 1;
01782      }
01783     return $cluster;
01784 }
01785 
01786  sub _add_to_same_cluster {
01787      my ($cluster, $region1, $region2) = @_;
01788 
01789      #print "add to same cluster $region1 $region2\n";
01790 
01791      if (!defined $cluster) {
01792      $cluster->[0]->{$region1} = 1;
01793      $cluster->[0]->{$region2} = 1;
01794      return $cluster;
01795      }
01796 
01797      my $cluster_size = @$cluster;
01798 
01799      my $index1 = _in_cluster($cluster, $region1);
01800      my $index2 = _in_cluster($cluster, $region2);
01801 
01802      if ($index1 == -1 && $index2 == -1) {
01803      #neither found, add both to new cluster
01804      $cluster->[$cluster_size]->{$region1} = 1;
01805      $cluster->[$cluster_size]->{$region2} = 1;
01806      } elsif ($index1 != -1 && $index2 == -1) {
01807      #id1 found, id2 not. add id2 to id1 cluster
01808      $cluster->[$index1]->{$region2} = 1;
01809      } elsif ($index1 == -1 && $index2 != -1) {
01810      #id2 found, id1 not. add id1 to id2 cluster
01811      $cluster->[$index2]->{$region1} = 1;
01812      } else {
01813      #both ids set in different clusters. Merge the clusters together
01814      $cluster = _merge_clusters($cluster, $index1, $index2);
01815      }
01816      return $cluster;
01817  }
01818 
01819 sub _add_to_different_cluster {
01820      my ($cluster, $region1, $region2) = @_;
01821 
01822      if (!defined $cluster) {
01823      $cluster->[0]->{$region1} = 1;
01824      $cluster->[1]->{$region2} = 1;
01825      return $cluster;
01826      }
01827      my $cluster_size = @$cluster;
01828 
01829      my $index1 = _in_cluster($cluster, $region1);
01830      my $index2 = _in_cluster($cluster, $region2);
01831 
01832      if ($index1 == -1) {
01833      $cluster->[@$cluster]->{$region1} = 1;
01834      } 
01835      if ($index2 == -1) {
01836      $cluster->[@$cluster]->{$region2} = 1;
01837      }
01838 
01839      return $cluster;
01840  }
01841 
01842  sub _in_cluster {
01843      my ($cluster, $region) = @_;
01844 
01845      for (my $i = 0; $i < @$cluster; $i++) {
01846     if ($cluster->[$i]->{$region}) {
01847         return $i;
01848     }
01849      }
01850      return -1;
01851  }
01852 
01853 sub _merge_clusters {
01854     my ($cluster, $index1, $index2) = @_;
01855     
01856     #already in same cluster
01857     if ($index1 != -1 && $index1 == $index2) {
01858     return $cluster;
01859     }
01860 
01861     #copy over keys from index2 to index1
01862     foreach my $region (keys %{$cluster->[$index2]}) {
01863     #print "region $region\n";
01864     $cluster->[$index1]->{$region} = 1;
01865     }
01866  
01867     #delete index2
01868     splice(@$cluster, $index2, 1);
01869 
01870     return $cluster;
01871 }
01872 
01873 sub _print_cluster {
01874     my ($cluster) = @_;
01875 
01876     print "FINAL cluster ";
01877     foreach my $this_cluster (@$cluster) {
01878     print "(";
01879     foreach my $group (keys %{$this_cluster}) {
01880         print "$group ";
01881     }
01882     print "), ";
01883     }
01884     print "\n";
01885 }
01886 
01887 sub _dump_2x_fasta {
01888     my ($self, $ga_frags, $file, $seq_id, $mfa_fh) = @_;
01889 
01890     $file .= "_" . $ga_frags->[0]->{taxon_id} . ".fa";
01891 
01892     open F, ">$file" || throw("Couldn't open $file");
01893 
01894     print F ">SeqID" . $seq_id . "\n";
01895     #print MFA ">SeqID" . $seq_id . "\n";
01896     #print MFA ">seq" . $seq_id . "\n";
01897     print MFA ">seq" . $seq_id . "_" . $ga_frags->[0]->{taxon_id} . "\n";
01898     my $aligned_seq = $ga_frags->[0]->{aligned_seq};
01899     my $seq = $aligned_seq;
01900     $seq =~ tr/-//d;
01901     print F "$seq\n";
01902     close F;
01903 
01904     $aligned_seq =~ s/(.{60})/$1\n/g;
01905     $aligned_seq =~ s/\n$//;
01906     print $mfa_fh $aligned_seq, "\n";
01907 
01908     push @{$self->fasta_files}, $file;
01909     
01910     push @{$self->species_order}, $ga_frags->[0]->{genome_db_id};
01911 
01912 }
01913 
01914 #create alignment from multiple genomic_align_block and 2X genomes.
01915 sub _create_mfa {
01916     my ($self) = @_;
01917 
01918     my $multi_gab_id = $self->param('genomic_align_block_id');
01919     my $multi_gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
01920     my $multi_gab = $multi_gaba->fetch_by_dbID($multi_gab_id);
01921     my $multi_gas = $multi_gab->get_all_GenomicAligns;
01922 
01923     my $pairwise_frags = $self->param('ga_frag');
01924 
01925     my $species_order = $self->species_order;
01926 
01927     
01928     foreach my $ga_frag_array (@$pairwise_frags) {
01929     my $multi_ref_ga = $ga_frag_array->[0]->{ref_ga};
01930     my $multi_mapper = $multi_ref_ga->get_Mapper;
01931     #my $multi_gab_length = length($multi_ref_ga->aligned_sequence);
01932     my $multi_gab_length = $multi_ref_ga->genomic_align_block->length;
01933 
01934     ## New aligned sequence for the 2X genome. Empty (only dashes) at creation time
01935     my $aligned_sequence = "-" x $multi_gab_length;
01936     
01937     foreach my $ga_frag (@$ga_frag_array) {
01938         my $pairwise_gab = $ga_frag->{genomic_align_block};
01939         
01940         my $pairwise_non_ref_ga = $pairwise_gab->get_all_non_reference_genomic_aligns->[0];
01941         my $pairwise_ref_ga = $pairwise_gab->reference_genomic_align;
01942         
01943         my $pairwise_fixed_seq = $pairwise_non_ref_ga->aligned_sequence("+FIX_SEQ");
01944         
01945         #undef($pairwise_non_ref_ga->{'aligned_sequence'});
01946 
01947         print "pairwise " . $pairwise_non_ref_ga->genome_db->name . " " . substr($pairwise_fixed_seq,0,120) . "\n" if $self->debug;
01948         my $depad = $pairwise_fixed_seq;
01949         $depad =~ tr/-//d;
01950         #print "depad length " . length($depad) . "\n";
01951 
01952         #my $deletion_array = find_ref_deletions($pairwise_ref_ga);
01953         my $deletion_array = find_ref_deletions($pairwise_gab);
01954 
01955         my @coords = $multi_mapper->map_coordinates("sequence",
01956                             $pairwise_ref_ga->dnafrag_start,
01957                             $pairwise_ref_ga->dnafrag_end,
01958                             $pairwise_ref_ga->dnafrag_strand,
01959                                 "sequence");
01960         my $length = 0;
01961         my $start = undef;
01962         my $end = undef;
01963         foreach my $this_coord (@coords) {
01964         next if ($this_coord->isa("Bio::EnsEMBL::Mapper::Gap"));
01965         $length += $this_coord->length;
01966         ## Extract the first N characters from $other_fixed_seq
01967         my $subseq = substr($pairwise_fixed_seq, 0, $this_coord->length, "");
01968         ## Copy extracted characters into the new aligned sequence for the 2X genome.
01969         substr($aligned_sequence, $this_coord->start-1, $this_coord->length, $subseq);
01970         $start = $this_coord->start if (!defined($start) or $this_coord->start < $start);
01971         $end = $this_coord->end if (!defined($end) or $this_coord->end > $end);
01972         }
01973 
01974         $ga_frag->{deletions} = $deletion_array;
01975         $ga_frag->{length} = length($depad);
01976 
01977         $ga_frag_array->[0]->{cigar_line} = undef;
01978         $ga_frag_array->[0]->{aligned_seq} = $aligned_sequence;
01979     } 
01980     #for (my $x = 0; $x < length($multi_ref_ga->aligned_sequence); $x += 80) {
01981     #   print substr($aligned_sequence, $x, 80), "\n";
01982     #  print substr($multi_ref_ga->aligned_sequence, $x, 80), "\n\n";
01983     #}
01984     }
01985 }
01986 
01987 
01988 #find deletions in reference species and store the position in slice coords
01989 #and the length
01990 sub find_ref_deletions {
01991     my ($gab) = @_;
01992     my $deletion_array;
01993 
01994     my $ref_ga = $gab->reference_genomic_align;
01995     my $non_ref_ga = $gab->get_all_non_reference_genomic_aligns->[0];
01996 
01997     my $ref_mapper = $ref_ga->get_Mapper;
01998 
01999     my $non_ref_mapper = $non_ref_ga->get_Mapper;
02000 
02001     my @ref_coords = $ref_mapper->map_coordinates("sequence",
02002                           $ref_ga->dnafrag_start,
02003                           $ref_ga->dnafrag_end,
02004                           $ref_ga->dnafrag_strand,
02005                           "sequence");
02006     #print "num coords " . @ref_coords . "\n";
02007     #print "non_ref start " . $non_ref_ga->dnafrag_start . " end " . $non_ref_ga->dnafrag_end . "\n";
02008     my $start_del;
02009     my $end_del;
02010     my $num_del = 0;
02011     foreach my $this_coord (@ref_coords) {
02012     #print "coords " . $this_coord->start . " end " . $this_coord->end . " strand " . $this_coord->strand . "\n";
02013     my @non_ref_coords = $non_ref_mapper->map_coordinates("alignment",
02014                                   $this_coord->start,
02015                                   $this_coord->end,
02016                                   $this_coord->strand,
02017                                   "alignment");
02018 
02019     #want all coords starting from left hand end
02020     if ($non_ref_ga->dnafrag_strand == -1) {
02021         #print "start " . $non_ref_ga->dnafrag_end . " " . $non_ref_coords[0]->start . "\n";
02022         $end_del = ($non_ref_ga->dnafrag_end - $non_ref_coords[0]->end +1);
02023     } else {
02024         $end_del = $non_ref_coords[0]->start - $non_ref_ga->dnafrag_start+1;
02025     }
02026     if (defined $start_del) {
02027         #print "found del $start_del $end_del\n";
02028         my $deletion;
02029         $deletion->{pos} = $start_del - $num_del;
02030         $deletion->{len} = ($end_del-$start_del-1);
02031         push @$deletion_array, $deletion;
02032         $num_del += $deletion->{len};
02033 
02034         #print "del pos " . $deletion->{pos} . " len " . $deletion->{len} . "\n";
02035     }
02036     if ($non_ref_ga->dnafrag_strand == -1) {
02037         #print "end " . $non_ref_ga->dnafrag_end . " " . $non_ref_coords[-1]->end . "\n";
02038         $start_del = ($non_ref_ga->dnafrag_end - $non_ref_coords[-1]->start +1);
02039     } else {    
02040         $start_del = $non_ref_coords[-1]->end-$non_ref_ga->dnafrag_start+1;
02041     }
02042 #   foreach my $non_ref (@non_ref_coords) {
02043 #       if ($non_ref->isa("Bio::EnsEMBL::Mapper::Gap")) {
02044 #       print "   found gap " . $non_ref->start . " end ". $non_ref->end . "\n";
02045 #       } else {
02046 #       print "   non ref " . ($non_ref->start-$non_ref_ga->dnafrag_start+1) . " end " . ($non_ref->end-$non_ref_ga->dnafrag_start+1) . "\n";
02047 #       print "   non ref real " . $non_ref->start . " end " . $non_ref->end . "\n";
02048       
02049 #       }
02050 #   }
02051 
02052     }
02053     return $deletion_array;
02054 }
02055 
02056 sub get_seq_length_from_cigar {
02057     my ($cigar_line) = @_;
02058     my $seq_pos;
02059 
02060     my @cig = ( $cigar_line =~ /(\d*[GMDXI])/g );
02061     for my $cigElem ( @cig ) {
02062     my $cigType = substr( $cigElem, -1, 1 );
02063     my $cigCount = substr( $cigElem, 0 ,-1 );
02064     $cigCount = 1 unless ($cigCount =~ /^\d+$/);
02065 
02066     if( $cigType eq "M" ) {
02067         $seq_pos += $cigCount;
02068     } elsif( $cigType eq "I") {
02069         $seq_pos += $cigCount;
02070     }
02071     }
02072     return $seq_pos;
02073 }
02074 
02075 1;