Archive Ensembl HomeArchive Ensembl Home
Ortheus.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::Ortheus
00022 
00023 =head1 SYNOPSIS
00024 
00025 
00026 =head1 DESCRIPTION
00027 
00028 This module acts as a layer between the Hive system and the Bio::EnsEMBL::Analysis::Runnable::Ortheus
00029 module since the ensembl-analysis API does not know about ensembl-compara
00030 
00031 Ortheus wants the files to be provided in the same order as in the tree string. This module starts
00032 by getting all the DnaFragRegions of the SyntenyRegion and then use them to edit the tree (some
00033 nodes must be removed and other ones must be duplicated in order to cope with deletions and
00034 duplications). The build_tree_string methods numbers the sequences in order and changes the
00035 order of the dnafrag_regions array accordingly. Last, the dumpFasta() method dumps the sequences
00036 according to the tree_string order.
00037 
00038 This module can be used to include low coverage 2X genomes in the alignment. To do this, the pairwise BLASTZ_NET alignments between each 2X genome and a reference species (eg human) are retrieved from specified databases. 
00039 
00040 Ortheus also generates a set of aligned ancestral sequences. This module stores them in a core-like database.
00041 
00042 
00043 =head1 PARAMETERS
00044 
00045 The fetch_input method reads the parameters of the analysis (analysis.parameters) first and then
00046 the input_id of the analysis job (analysis_job.input_id). Both are expected to be string
00047 representing hash references like {key1 => "value1", key2 => "value2"}. Values defined in the
00048 analysis_job.input_id column will overwrite values in the analysis.parameters.
00049 
00050 =over 5
00051 
00052 =item * synteny_region_id (int)
00053 
00054 Ortheus will align the segments defined in the SyntenyRegion with this dbID.
00055 
00056 =item * method_link_species_set_id (int)
00057 
00058 Ortheus will store alignments with this method_link_species_set_id
00059 
00060 =item * java_options
00061 
00062 Options used to run java eg: '-server -Xmx1000M'
00063 
00064 =item * tree_file
00065 
00066 FIXME
00067 
00068 =item * tree_analysis_data_id (int)
00069 
00070 FIXME
00071 
00072 =item * pairwise_analysis_data_id (int)
00073 
00074 Optional. A list of database locations and method_link_species_set_id pairs for the 2X geonome BLASTZ_NET alignments. The database locations should be identified using the url format.ie mysql://user:pass\@host:port/db_name.
00075 
00076 =item * reference_species 
00077 
00078 Optional. The reference species for the 2X genome BLASTZ_NET alignments
00079 
00080 =item * options
00081 
00082 Additional pecan options eg "-p 15"
00083 
00084 =item * max_block_size (int)
00085 
00086 If an alignment is longer than this value, it will be split in several blocks in the database. All resulting blocks will share the same genomic_align_group_id. 
00087 
00088 =back
00089 
00090 =head1 APPENDIX
00091 
00092 The rest of the documentation details each of the object methods. 
00093 Internal methods are usually preceded with a _
00094 
00095 =cut
00096 
00097 package Bio::EnsEMBL::Compara::RunnableDB::Ortheus;
00098 
00099 use strict;
00100 use Bio::EnsEMBL::Analysis::Runnable::Ortheus;
00101 use Bio::EnsEMBL::Compara::DnaFragRegion;
00102 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00103 use Bio::EnsEMBL::Utils::Exception;
00104 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00105 use Bio::EnsEMBL::Compara::NestedSet;
00106 use Bio::EnsEMBL::Analysis::Config::Compara; #for $PYTHON and $ORTHEUS and $EXONERATE
00107 use Bio::EnsEMBL::Analysis::Runnable::Ortheus;
00108 use Bio::EnsEMBL::Compara::GenomicAlignGroup;
00109 use Bio::EnsEMBL::Utils::SqlHelper;
00110 use Data::Dumper;
00111 
00112 use Bio::EnsEMBL::Hive::Process;
00113 
00114 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00115 
00116 #Padding character and max_pads to be added when creating the 2X genome
00117 #composite sequence
00118 my $pad_char = "N";
00119 
00120 my $max_pads = 100;
00121 #my $max_pads = 1000000;
00122 
00123 #percentage of max_pads to use ie to use 80% of the actual pad number, set max_pads to be
00124 #very large (so won't be trimmed) and set max_pad_percent to 0.8. 
00125 #my $max_pads_percent = 0.8; 
00126 my $max_pads_percent = 1.0; 
00127 
00128 #which method to use for creating the 2X fragments. If this is true (1), use
00129 #only the pairwise matching blocks. If this is false (0), use the entire net
00130 #including the inter-block spanning regions aswell. This leads to large regions
00131 #in the final alignment containing a single sequence but was useful for 
00132 #aligning gorilla in the 5way primate alignment before gorilla had chromosomes
00133 my $create_block_frag_array = 1;
00134 
00135 =head2 fetch_input
00136 
00137     Title   :   fetch_input
00138     Usage   :   $self->fetch_input
00139     Function:   Fetches input data for repeatmasker from the database
00140     Returns :   none
00141     Args    :   none
00142 
00143 =cut
00144 
00145 sub fetch_input {
00146   my( $self) = @_;
00147 
00148   my $ortheus_mlssid = $self->param('ortheus_mlssid');
00149   unless ($ortheus_mlssid) {
00150     throw("MethodLinkSpeciesSet is not defined for this Ortheus job");
00151   }
00152 
00153   #set default to do transactions
00154   if (!defined $self->param('do_transactions')) {
00155       $self->param('do_transactions', 1);
00156   }
00157 
00158   ## Store DnaFragRegions corresponding to the SyntenyRegion in $self->dnafrag_regions(). At this point the
00159   ## DnaFragRegions are in random order
00160   $self->_load_DnaFragRegions($self->param('synteny_region_id'));
00161 
00162   if ($self->dnafrag_regions) {
00163     ## Get the tree string by taking into account duplications and deletions. Resort dnafrag_regions
00164     ## in order to match the name of the sequences in the tree string (seq1, seq2...)
00165     if ($self->get_species_tree and $self->dnafrag_regions) {
00166       $self->_build_tree_string;
00167       print "seq_string ", $self->tree_string, "\n";
00168     }
00169     ## Dumps fasta files for the DnaFragRegions. Fasta files order must match the entries in the
00170     ## newick tree. The order of the files will match the order of sequences in the tree_string.
00171 
00172     $self->_dump_fasta;
00173 
00174   } else {
00175     throw("Cannot start Pecan job because some information is missing");
00176   }
00177 
00178   return 1;
00179 }
00180 
00181 sub run
00182 {
00183   my $self = shift;
00184   my $species_tree_meta_key = "tree_" . $self->param('ortheus_mlssid');
00185   my $runnable = new Bio::EnsEMBL::Analysis::Runnable::Ortheus(
00186       -workdir => $self->worker_temp_directory,
00187       -fasta_files => $self->fasta_files,
00188       -tree_string => $self->tree_string,
00189       -species_tree => $self->get_species_tree->newick_simple_format,
00190       -species_order => $self->species_order,
00191       -analysis => $self->analysis,
00192       -parameters => $self->param('java_options'),
00193       -options => $self->options,
00194       );
00195   $self->param('runnable', $runnable);
00196 
00197 
00198   #disconnect pairwise compara database
00199   if ($self->{pairwise_compara_dba}) {
00200       foreach my $dba (values %{$self->{pairwise_compara_dba}}) {
00201       $dba->dbc->disconnect_if_idle;
00202       }
00203   }
00204 
00205   #disconnect ancestral core database
00206   my $ancestor_genome_db = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_name_assembly("ancestral_sequences");
00207   my $ancestor_dba = $ancestor_genome_db->db_adaptor;
00208   $ancestor_dba->dbc->disconnect_if_idle;
00209 
00210   #disconnect compara database
00211   $self->compara_dba->dbc->disconnect_if_idle;
00212 
00213   $runnable->run_analysis;
00214 
00215   $self->parse_results();
00216 }
00217 sub write_output {
00218     my ($self) = @_;
00219 
00220     print "WRITE OUTPUT\n" if $self->debug;
00221     if ($self->param('do_transactions')) {
00222     my $compara_conn = $self->compara_dba->dbc;
00223     my $ancestor_genome_db = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_name_assembly("ancestral_sequences");
00224     my $ancestral_conn = $ancestor_genome_db->db_adaptor->dbc;
00225 
00226     my $compara_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $compara_conn);
00227     my $ancestral_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $ancestral_conn);
00228     $compara_helper->transaction(-CALLBACK => sub {
00229         $ancestral_helper->transaction(-CALLBACK => sub {
00230          $self->_write_output;
00231          });
00232          });
00233     } else {
00234     $self->_write_output;
00235     }
00236 
00237   return 1;
00238 
00239 }
00240 
00241 sub _write_output {
00242     my ($self) = @_;
00243 
00244   my $skip_left_right_index = 0;
00245 
00246   if ($self->param('runnable')->{tree_to_save}) {
00247     my $meta_container = $self->compara_dba->get_MetaContainer;
00248     $meta_container->store_key_value("synteny_region_tree_".$self->param('synteny_region_id'),
00249         $self->param('runnable')->{tree_to_save});
00250   }
00251 
00252   my $mlssa = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor;
00253   my $mlss = $mlssa->fetch_by_dbID($self->param('ortheus_mlssid'));
00254   my $dnafrag_adaptor = $self->compara_dba->get_DnaFragAdaptor;
00255   my $gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
00256   my $gaa = $self->compara_dba->get_GenomicAlignAdaptor;
00257 
00258   my $gata = $self->compara_dba->get_GenomicAlignTreeAdaptor;
00259 
00260   my $ancestor_genome_db = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_name_assembly("ancestral_sequences");
00261   my $ancestor_dba = $ancestor_genome_db->db_adaptor;
00262 
00263   my $slice_adaptor = $ancestor_dba->get_SliceAdaptor();
00264   my $ancestor_coord_system_adaptor = $ancestor_dba->get_CoordSystemAdaptor();
00265   my $ancestor_coord_system;
00266   eval{
00267     $ancestor_coord_system = $ancestor_coord_system_adaptor->fetch_by_name("ancestralsegment");
00268   };
00269   if(!$ancestor_coord_system){
00270     $ancestor_coord_system = new Bio::EnsEMBL::CoordSystem(
00271             -name            => "ancestralsegment",
00272             -VERSION         => undef,
00273             -DEFAULT         => 1,
00274             -SEQUENCE_LEVEL  => 1,
00275             -RANK            => 1
00276         );
00277     $ancestor_coord_system_adaptor->store($ancestor_coord_system);
00278   }
00279 
00280   my $seq_region_sql = "UPDATE seq_region SET name = ? WHERE seq_region_id = ?";
00281   my $sth = $ancestor_coord_system_adaptor->prepare($seq_region_sql);
00282 
00283   foreach my $genomic_align_tree (@{$self->param('runnable')->output}) {
00284        foreach my $genomic_align_node (@{$genomic_align_tree->get_all_nodes}) {
00285        foreach my $genomic_align (@{$genomic_align_node->genomic_align_group->get_all_GenomicAligns}) {
00286           $genomic_align->adaptor($gaa);
00287           $genomic_align->method_link_species_set($mlss);
00288           $genomic_align->visible(1);
00289 
00290           if ($genomic_align->dnafrag_id == -1) {
00291           ## INTERNAL NODE, i.e. an ancestral sequence
00292           my $length = length($genomic_align->original_sequence);
00293           
00294           #Trigger loading of seq adaptor to avoid locked table problems
00295           $slice_adaptor->db()->get_SequenceAdaptor();
00296 
00297           #Insert into seq_region with dummy name to get the seq_region_id and then update with the new name
00298           #"Ancestor_" . $mlss_id . "_$seq_region_id";
00299           #Need to make unique dummy name
00300           my $dummy_name = "dummy_" . $$;
00301           my $slice = new Bio::EnsEMBL::Slice(
00302                               -seq_region_name   => $dummy_name,
00303                               -start             => 1,
00304                               -end               => $length,
00305                               -seq_region_length => $length,
00306                               -strand            => 1,
00307                               -coord_system      => $ancestor_coord_system,
00308                              );
00309           my $this_seq_region_id = $slice_adaptor->store($slice, \$genomic_align->original_sequence);
00310 
00311           my $name = "Ancestor_" . $mlss->dbID . "_" . $this_seq_region_id;
00312           #print "name $name\n";
00313           $sth->execute($name, $this_seq_region_id);
00314           my $dnafrag = new Bio::EnsEMBL::Compara::DnaFrag(
00315                        -name => $name,
00316                        -genome_db => $ancestor_genome_db,
00317                        -length => $length,
00318                        -coord_system_name => "ancestralsegment",
00319                                   );
00320           $dnafrag_adaptor->store($dnafrag);
00321 
00322           $genomic_align->dnafrag_id($dnafrag->dbID);
00323           $genomic_align->dnafrag_end($length);
00324           $genomic_align->dnafrag($dnafrag);
00325           }
00326       }
00327        }
00328 
00329        my $split_trees;
00330        #restrict genomic_align_tree if it is too long and store as groups
00331        if ($self->param('max_block_size') && 
00332        $genomic_align_tree->length >  $self->param('max_block_size')) {
00333        for (my $start = 1; $start <= $genomic_align_tree->length; 
00334         $start += $self->param('max_block_size')) {
00335            my $end = $start+$self->param('max_block_size')-1;
00336            if ($end > $genomic_align_tree->length) {
00337            $end = $genomic_align_tree->length;
00338            }
00339            my $new_gat = $genomic_align_tree->restrict_between_alignment_positions($start, $end, "skip_empty_GenomicAligns");
00340            push @$split_trees, $new_gat;
00341        }
00342        $gata->store_group($split_trees);
00343        foreach my $tree (@$split_trees) {
00344            $self->_write_gerp_dataflow($tree->modern_genomic_align_block_id,
00345                        $mlss);
00346            
00347        }
00348        } else {
00349        $gata->store($genomic_align_tree, $skip_left_right_index);
00350        $self->_write_gerp_dataflow(
00351                 $genomic_align_tree->modern_genomic_align_block_id,
00352                 $mlss);
00353        }
00354    }
00355     #print "tmp worker dir " . $self->worker_temp_directory . "\n";
00356     chdir("$self->worker_temp_directory");
00357     foreach(glob("*")){
00358     #DO NOT COMMENT THIS OUT!!! (at least not permenantly). Needed
00359     #to clean up after each job otherwise you get files left over from
00360     #the previous job.
00361     unlink($_);
00362     }
00363     #throw("Test commit");
00364     return 1;
00365 }
00366 
00367 
00368 #trim genomic align block from the left hand edge to first position having at
00369 #least 2 genomic aligns which overlap
00370 sub _trim_gab_left {
00371     my ($gab) = @_;
00372     
00373     if (!defined($gab)) {
00374     return undef;
00375     }
00376     my $align_length = $gab->length;
00377     
00378     my $gas = $gab->get_all_GenomicAligns();
00379     my $d_length;
00380     my $m_length;
00381     my $min_d_length = $align_length;
00382     
00383     my $found_min = 0;
00384 
00385     #take first element in cigar string for each genomic_align and if it is a
00386     #match, it must extend to the start of the block. Find the shortest delete.
00387     #If the shortest delete and the match are the same length, there is no
00388     #overlap between them so restrict to the end of the delete and try again.
00389     #If the delete is shorter than the match, there must be an overlap.
00390     foreach my $ga (@$gas) {
00391     my ($cigLength, $cigType) = ( $ga->cigar_line =~ /^(\d*)([GMD])/ );
00392     $cigLength = 1 unless ($cigLength =~ /^\d+$/);
00393 
00394     if ($cigType eq "D" or $cigType eq "G") {
00395         $d_length = $cigLength; 
00396         if ($d_length < $min_d_length) {
00397         $min_d_length = $d_length;
00398         }
00399     } else {
00400         $m_length = $cigLength;
00401         $found_min++;
00402     }
00403     }
00404     #if more than one alignment filled to the left edge, no need to restrict
00405     if ($found_min > 1) {
00406     return $gab;
00407     }
00408 
00409     my $new_gab = ($gab->restrict_between_alignment_positions(
00410                                   $min_d_length+1, $align_length, 1));
00411 
00412     #no overlapping genomic_aligns
00413     if ($new_gab->length == 0) {
00414     return $new_gab;
00415     }
00416 
00417     #if delete length is less than match length then must have sequence overlap
00418     if ($min_d_length < $m_length) {
00419     return $new_gab;
00420     }
00421     #otherwise try again with restricted gab
00422     return _trim_gab_left($new_gab);
00423 }
00424 
00425 #trim genomic align block from the right hand edge to first position having at
00426 #least 2 genomic aligns which overlap
00427 sub _trim_gab_right {
00428     my ($gab) = @_;
00429     
00430     if (!defined($gab)) {
00431     return undef;
00432     }
00433     my $align_length = $gab->length;
00434 
00435     my $max_pos = 0;
00436     my $gas = $gab->get_all_GenomicAligns();
00437     
00438     my $found_max = 0;
00439     my $d_length;
00440     my $m_length;
00441     my $min_d_length = $align_length;
00442 
00443     #take last element in cigar string for each genomic_align and if it is a
00444     #match, it must extend to the end of the block. Find the shortest delete.
00445     #If the shortest delete and the match are the same length, there is no
00446     #overlap between them so restrict to the end of the delete and try again.
00447     #If the delete is shorter than the match, there must be an overlap.
00448     foreach my $ga (@$gas) {
00449     my ($cigLength, $cigType) = ( $ga->cigar_line =~ /(\d*)([GMD])$/ );
00450     $cigLength = 1 unless ($cigLength =~ /^\d+$/);
00451 
00452     if ($cigType eq "D" or $cigType eq "G") {
00453         $d_length =$cigLength;
00454         if ($d_length < $min_d_length) {
00455         $min_d_length = $d_length;
00456         }
00457     } else {
00458         $m_length = $cigLength;
00459         $found_max++;
00460     }
00461     }
00462     #if more than one alignment filled the right edge, no need to restrict
00463     if ($found_max > 1) {
00464     return $gab;
00465     }
00466 
00467     my $new_gab = $gab->restrict_between_alignment_positions(1, $align_length - $min_d_length, 1);
00468 
00469     #no overlapping genomic_aligns
00470     if ($new_gab->length == 0) {
00471     return $new_gab;
00472     }
00473 
00474     #if delete length is less than match length then must have sequence overlap
00475     if ($min_d_length < $m_length) {
00476     return $new_gab;
00477     }
00478     #otherwise try again with restricted gab
00479     return _trim_gab_right($new_gab);
00480 }
00481 
00482 sub _write_gerp_dataflow {
00483     my ($self, $gab_id, $mlss) = @_;
00484     
00485     my $species_set = "[";
00486     my $genome_db_set  = $mlss->species_set;
00487     
00488     foreach my $genome_db (@$genome_db_set) {
00489     $species_set .= $genome_db->dbID . ","; 
00490     }
00491     $species_set .= "]";
00492     
00493     my $output_id = "{genomic_align_block_id=>" . $gab_id . ",species_set=>" .  $species_set . "}";
00494     $self->dataflow_output_id($output_id);
00495 }
00496 
00497 #Taken from Analysis/Runnable/Ortheus.pm module
00498 sub parse_results {
00499     my ($self) = @_;
00500 
00501     my ($self, $run_number) = @_;
00502 
00503     #print STDERR 
00504       ## The output file contains one fasta aligned sequence per original sequence + ancestral sequences.
00505       ## The first seq. corresponds to the fist leaf of the tree, the second one will be an internal
00506       ## node, the third is the second leaf and so on. The fasta header in the result file correspond
00507       ## to the names of the leaves for the leaf nodes and to the concatenation of the names of all the
00508       ## underlying leaves for internal nodes. For instance:
00509       ## ----------------------------------
00510       ## >0
00511       ## ACTTGG--CCGT
00512       ## >0_1
00513       ## ACTTGGTTCCGT
00514       ## >1
00515       ## ACTTGGTTCCGT
00516       ## >1_2_3
00517       ## ACTTGCTTCCGT
00518       ## >2
00519       ## CCTTCCTTCCGT
00520       ## ----------------------------------
00521       ## The sequence of fasta files and leaves in the tree have the same order. If Ortheus is run
00522       ## with a given tree, the sequences in the file follow the tree. If Ortheus estimate the tree,
00523       ## the tree output file contains also the right order of files:
00524       ## ----------------------------------
00525       ## ((1:0.0157,0:0.0697):0.0000,2:0.0081);
00526       ## /tmp/file3.fa /tmp/file1.fa /tmp/file2.fa
00527       ## ----------------------------------
00528 
00529 
00530       #   $self->workdir("/home/jherrero/ensembl/worker.8139/");
00531     my $tree_file;
00532     my $workdir;
00533     $tree_file = $self->worker_temp_directory . "/output.$$.tree";
00534 
00535     my $ordered_fasta_files = $self->fasta_files;
00536 
00537     if (-e $tree_file) {
00538     ## Ortheus estimated the tree. Overwrite the order of the fasta files and get the tree
00539     open(F, $tree_file) || throw("Could not open tree file <$tree_file>");
00540     my ($newick, $files) = <F>;
00541     close(F);
00542     $newick =~ s/[\r\n]+$//;
00543     $self->tree_string($newick);
00544     $files =~ s/[\r\n]+$//;
00545 
00546     my $all_files = [split(" ", $files)];
00547     
00548     #store ordered fasta_files
00549     $ordered_fasta_files = $all_files;
00550     $self->fasta_files(@$all_files);
00551     print STDOUT "**NEWICK: $newick\nFILES: ", join(" -- ", @$all_files), "\n";
00552     }
00553     
00554     
00555     #   $self->tree_string("((0:0.06969,1:0.015698):1e-05,2:0.008148):1e-05;");
00556     #   $self->fasta_files(["/home/jherrero/ensembl/worker.8139/seq1.fa", "/home/jherrero/ensembl/worker.8139/seq2.fa", "/home/jherrero/ensembl/worker.8139/seq3.fa"]);
00557     
00558     
00559     my (@ordered_leaves) = $self->tree_string =~ /[(,]([^(:)]+)/g;
00560     print "++NEWICK: ", $self->tree_string, "\nLEAVES: ", join(" -- ", @ordered_leaves), "\nFILES: ", join(" -- ", @{$self->fasta_files}), "\n";
00561 
00562     #my $alignment_file = $self->workdir . "/output.$$.mfa";
00563 
00564     my $alignment_file = $self->worker_temp_directory . "/output.$$.mfa";
00565 
00566     my $this_genomic_align_block = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
00567     
00568     open(F, $alignment_file) || throw("Could not open $alignment_file");
00569     my $seq = "";
00570     my $this_genomic_align;
00571 
00572     #Create genomic_align_group object to store genomic_aligns for
00573     #each node. For 2x genomes, there may be several genomic_aligns
00574     #for a node but for other genomes there will only be one
00575     #genomic_align in the genomic_align_group
00576     my $genomic_align_group;
00577 
00578     my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->tree_string);
00579   $tree->print_tree(100);
00580     
00581     print $tree->newick_format("simple"), "\n";
00582     print join(" -- ", map {$_->name} @{$tree->get_all_leaves}), "\n";
00583     print "Reading $alignment_file...\n";
00584     my $ids;
00585 
00586     #foreach my $this_file (@{$self->fasta_files}) {
00587     foreach my $this_file (@$ordered_fasta_files) {
00588 
00589     push(@$ids, qx"head -1 $this_file");
00590     push(@$ids, undef); ## There is an internal node after each leaf..
00591     }
00592     pop(@$ids); ## ...except for the last leaf which is the end of the tree
00593     #print join(" :: ", @$ids), "\n\n";
00594 
00595     my $genomic_aligns_2x_array = [];
00596     my @num_frag_pads;
00597 
00598     while (<F>) {
00599     next if (/^\s*$/);
00600     chomp;
00601     ## FASTA headers correspond to the tree and the order of the leaves in the tree corresponds
00602     ## to the order of the files
00603 
00604     if (/^>/) {
00605         print "PARSING $_\n" if ($self->debug);
00606         print $tree->newick_format(), "\n" if ($self->debug);
00607         my ($name) = $_ =~ /^>(.+)/;
00608         if (defined($this_genomic_align) and  $seq) {
00609         if (@$genomic_aligns_2x_array) {
00610             print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
00611             #starting offset
00612             #my $offset = $max_pads;
00613             my $offset = $num_frag_pads[0];
00614             #how many X's to add at the start of the cigar_line
00615             my $start_X;
00616 
00617             #how many X's to add to the end of the cigar_line
00618             my $end_X;
00619 
00620             my $align_offset = 0;
00621             for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
00622             my $genomic_align = $genomic_aligns_2x_array->[$i];
00623             my $num_pads = $num_frag_pads[$i+1];
00624             my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
00625 
00626             print "extract_sequence $offset " .($offset+$ga_length) . " num pads $num_pads\n" if ($self->debug); 
00627 
00628             my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $offset+1, ($offset+$ga_length));
00629 
00630             #Add aligned sequence
00631             $genomic_align->aligned_sequence($subseq);
00632 
00633             #Add X padding characters to ends of seq
00634             $start_X = $aligned_start;
00635             $end_X = length($seq) - ($start_X+length($subseq));
00636 
00637             print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
00638 
00639             $genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
00640 
00641             #my $aln_seq = "." x $start_X;
00642             #$aln_seq .= $genomic_align->aligned_sequence();
00643             #$aln_seq .= "." x $end_X;
00644             #$genomic_align->aligned_sequence($aln_seq);
00645 
00646             #free aligned_sequence now that I've used it to 
00647             #create the cigar_line
00648             undef($genomic_align->{'aligned_sequence'});
00649 
00650             #Add genomic align to genomic align block
00651             $this_genomic_align_block->add_GenomicAlign($genomic_align);
00652             $offset += $num_pads + $ga_length;
00653             }
00654             $genomic_aligns_2x_array = [];
00655             undef @num_frag_pads;
00656         } else {
00657 
00658             print "add aligned_sequence " . $this_genomic_align->dnafrag_id . " " . $this_genomic_align->dnafrag_start . " " . $this_genomic_align->dnafrag_end . "\n" if $self->debug;
00659 
00660             $this_genomic_align->aligned_sequence($seq);
00661             $this_genomic_align_block->add_GenomicAlign($this_genomic_align);
00662         }
00663         }
00664         my $header = shift(@$ids);
00665         $this_genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
00666 
00667         if (!defined($header)) {
00668         print "INTERNAL NODE $name\n" if ($self->debug);
00669         my $this_node;
00670         foreach my $this_leaf_name (split("_", $name)) {
00671             if ($this_node) {
00672             my $other_node = $tree->find_node_by_name($this_leaf_name);
00673             if (!$other_node) {
00674                 throw("Cannot find node <$this_leaf_name>\n");
00675             }
00676             $this_node = $this_node->find_first_shared_ancestor($other_node);
00677             } else {
00678             print $tree->newick_format() if ($self->debug);
00679             print "LEAF: $this_leaf_name\n" if ($self->debug);
00680             $this_node = $tree->find_node_by_name($this_leaf_name);
00681             }
00682         }
00683         print join("_", map {$_->name} @{$this_node->get_all_leaves}), "\n" if ($self->debug);
00684         ## INTERNAL NODE: dnafrag_id and dnafrag_end must be edited somewhere else
00685 
00686         $this_genomic_align->dnafrag_id(-1);
00687         $this_genomic_align->dnafrag_start(1);
00688         $this_genomic_align->dnafrag_end(0);
00689         $this_genomic_align->dnafrag_strand(1);
00690         bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00691         #$this_node->genomic_align($this_genomic_align);
00692         $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup();
00693         $genomic_align_group->add_GenomicAlign($this_genomic_align);
00694 
00695         
00696         $this_node->genomic_align_group($genomic_align_group);
00697         $this_node->name($name);
00698         #} elsif ($header =~ /^>DnaFrag(\d+)\|(.+)\.(\d+)\-(\d+)\:(\-?1)$/) {
00699         } elsif ($header =~ /^>SeqID(\d+)/) {
00700         #print "old $name\n";
00701 
00702         print "leaf_name?? $name\n" if ($self->debug);
00703         my $this_leaf = $tree->find_node_by_name($name);
00704         if (!$this_leaf) {
00705             print $tree->newick_format(), " ****\n" if ($self->debug);
00706             die "Unable to find_node_by_name $name";
00707         }
00708         #print "$this_leaf\n";
00709         #         print "****** $name -- $header -- ";
00710         #         if ($this_leaf) {
00711         #           $this_leaf->print_node();
00712         #         } else {
00713         #           print "[none]\n";
00714         #         }
00715 
00716         #information extracted from fasta header
00717         my $seq_id = ($1);
00718 
00719         my $all_dnafrag_regions = $self->dnafrag_regions;
00720 
00721         my $dfr = $all_dnafrag_regions->[$seq_id-1];
00722 
00723         if (!UNIVERSAL::isa($dfr, 'Bio::EnsEMBL::Compara::DnaFragRegion')) {
00724             print "FOUND 2X GENOME\n" if $self->debug;
00725             print "num of frags " . @$dfr . "\n" if $self->debug;
00726 
00727             #first pads
00728             push @num_frag_pads, $dfr->[0]->{first_pads};
00729 
00730             #create new genomic_align for each pairwise fragment
00731             foreach my $ga_frag (@$dfr) {
00732             my $genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
00733             
00734             print "2x dnafrag_id " . $ga_frag->{dnafrag_region}->dnafrag_id . "\n" if $self->debug;
00735 
00736             $genomic_align->dnafrag_id($ga_frag->{dnafrag_region}->dnafrag_id);
00737             $genomic_align->dnafrag_start($ga_frag->{dnafrag_region}->dnafrag_start);
00738             $genomic_align->dnafrag_end($ga_frag->{dnafrag_region}->dnafrag_end);
00739             $genomic_align->dnafrag_strand($ga_frag->{dnafrag_region}->dnafrag_strand);
00740 
00741             push @num_frag_pads, $ga_frag->{num_pads};
00742             push @$genomic_aligns_2x_array, $genomic_align;
00743             }
00744             #Add genomic align to genomic align group 
00745             $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
00746                                             #-genomic_align_array => $genomic_aligns_2x_array,
00747                                                 -type => "epo");
00748             foreach my $this_genomic_align (@$genomic_aligns_2x_array) {
00749             $genomic_align_group->add_GenomicAlign($this_genomic_align);
00750             }
00751 
00752             bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00753             $this_leaf->genomic_align_group($genomic_align_group);
00754             print "size of array " . @$genomic_aligns_2x_array . "\n" if $self->debug;
00755             print "store gag1 $this_leaf\n" if $self->debug;
00756 
00757             #$self->{$this_leaf} = $genomic_align_group;
00758         } else  {
00759             print "normal dnafrag_id " . $dfr->dnafrag_id . "\n" if $self->debug;
00760 
00761             $this_genomic_align->dnafrag_id($dfr->dnafrag_id);
00762             $this_genomic_align->dnafrag_start($dfr->dnafrag_start);
00763             $this_genomic_align->dnafrag_end($dfr->dnafrag_end);
00764             $this_genomic_align->dnafrag_strand($dfr->dnafrag_strand);
00765 
00766             $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
00767                                             #-genomic_align_array => [$this_genomic_align],
00768                                                 -type => "epo");
00769             $genomic_align_group->add_GenomicAlign($this_genomic_align);
00770 
00771             bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
00772             $this_leaf->genomic_align_group($genomic_align_group);
00773             print "store gag2 $this_leaf\n" if $self->debug;
00774         }
00775         } else {
00776         throw("Error while parsing the FASTA header. It must start by \">DnaFrag#####\" where ##### is the dnafrag_id\n$_");
00777         }
00778         $seq = "";
00779     } else {
00780         $seq .= $_;
00781     }
00782     }
00783     close F;
00784 
00785     #last genomic_align
00786     print "Last genomic align\n" if ($self->debug);
00787     if (@$genomic_aligns_2x_array) {
00788     print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
00789 
00790     #starting offset
00791     #my $offset = $max_pads;
00792     my $offset = $num_frag_pads[0];
00793 
00794     #how many X's to add at the start and end of the cigar_line
00795     my ($start_X , $end_X);
00796     
00797     my $align_offset = 0;
00798     for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
00799         my $genomic_align = $genomic_aligns_2x_array->[$i];
00800         my $num_pads = $num_frag_pads[$i+1];
00801         my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
00802         print "extract_sequence $offset " .($offset+$ga_length) . " num pads $num_pads\n" if ($self->debug); 
00803         my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $offset+1, ($offset+$ga_length));
00804         
00805         #Add aligned sequence
00806         $genomic_align->aligned_sequence($subseq);
00807         
00808         #Add X padding characters to ends of seq
00809         $start_X = $aligned_start;
00810         $end_X = length($seq) - ($start_X+length($subseq));
00811         print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
00812         
00813         $genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
00814         my $aln_seq = "." x $start_X;
00815         $aln_seq .= $genomic_align->aligned_sequence();
00816         $aln_seq .= "." x $end_X;
00817         $genomic_align->aligned_sequence($aln_seq);
00818         
00819         #Add genomic align to genomic align block
00820         $this_genomic_align_block->add_GenomicAlign($genomic_align);
00821         $offset += $num_pads + $ga_length;
00822     }
00823     } else {
00824     if ($this_genomic_align->dnafrag_id == -1) {
00825     } else {
00826         $this_genomic_align->aligned_sequence($seq);
00827         $this_genomic_align_block->add_GenomicAlign($this_genomic_align);
00828     }
00829     }
00830 
00831     $self->remove_empty_cols($tree);
00832     print $tree->newick_format("simple"), "\n";
00833     print join(" -- ", map {$_."+".$_->node_id."+".$_->name} (@{$tree->get_all_nodes()})), "\n";
00834     #$self->output([$tree]);
00835     $self->param('runnable')->output([$tree]);
00836 
00837 #     foreach my $ga_node (@{$tree->get_all_nodes}) {
00838 #   if ($ga_node) {
00839 #       my $ga = $ga_node->genomic_align;
00840 #       print "name " . $ga_node->name . " $ga \n";
00841 #       my $gab = $ga->genomic_align_block;
00842 #       if (defined $gab) {
00843 #       print "Parse number of genomic_aligns " . $gab . " " . @{$gab->genomic_align_array} . "\n";
00844 #       } else {
00845 #       print "Parse no genomic_aligns\n";
00846 #       }
00847 #   } else {
00848 #       print "no ga_node\n";
00849 #   }
00850     
00851 #     }
00852 
00853 
00854 }
00855 
00856 sub remove_empty_cols {
00857     my ($self, $tree) = @_;
00858 
00859     my $gaa = $self->compara_dba->get_GenomicAlignAdaptor;
00860 
00861     ## $seqs is a hash for storing segments of sequence in the alignment
00862     my $seqs = {}; ## key => start, value => end; both in e! coord.
00863     foreach my $this_leaf (@{$tree->get_all_leaves}) {
00864         foreach my $this_genomic_align (@{$this_leaf->genomic_align_group->get_all_GenomicAligns}) {
00865             my $cigar_line = $this_genomic_align->cigar_line;
00866             my $pos = 1; ## $pos in e! coordinates
00867             foreach my $cig_elem (grep {$_} split(/(\d*[DMIGX])/, $cigar_line)) {
00868                 my ($num, $mode) = $cig_elem =~ /(\d*)([DMIGX])/;
00869                 $num = 1 if ($num eq "");
00870                 if ($mode eq "M" or $mode eq "I") {
00871                     my $start = $pos;
00872                     my $end = $pos + $num - 1;
00873                     unless (exists($seqs->{$start}) and $seqs->{$start} >= $end) {
00874                         $seqs->{$start} = $end;
00875                     }
00876                 }
00877                 $pos += $num;
00878             }
00879     }
00880     }
00881 
00882     ## Now goes through all the segments and detect gap-only cols as coordinates with no sequence
00883     my $last_start_pos = 0;
00884     my $last_end_pos = 0;
00885     my $gaps = {};
00886     foreach my $start_pos (sort {$a <=> $b} keys %$seqs) {
00887         my $end_pos = $seqs->{$start_pos};
00888         print " $start_pos -> $end_pos\n" if $self->debug;
00889         if ($end_pos <= $last_end_pos) {
00890             ## Included in the current block. Skip this
00891             print " XXX\n" if $self->debug;
00892             next;
00893         } elsif ($start_pos <= $last_end_pos + 1) {
00894             ## Overlapping or consecutive segments. Change last_end
00895             $last_end_pos = $end_pos;
00896             print " ---> $end_pos\n" if $self->debug;
00897         } else {
00898             ## New segment: there are gap-only cols
00899             $gaps->{$last_end_pos + 1} = $start_pos - 1 if ($last_end_pos);
00900             print " ---> GAP (" . ($last_end_pos + 1) . "-" . ($start_pos - 1) . ")\n" if $self->debug;
00901             $last_start_pos = $start_pos;
00902             $last_end_pos = $end_pos;
00903         }
00904     }
00905 
00906     ## Trim the sequences to remove gap-only cols.
00907     foreach my $this_leaf (@{$tree->get_all_nodes}) {
00908     foreach my $this_genomic_align (@{$this_leaf->genomic_align_group->get_all_GenomicAligns}) {
00909         #set adaptor to get the aligned sequence using the dnafrag_id
00910         if (!defined $this_genomic_align->{'adaptor'}) {
00911         $this_genomic_align->adaptor($gaa);
00912         }
00913             my $aligned_sequence = $this_genomic_align->aligned_sequence;
00914         print "before cigar " . $this_genomic_align->cigar_line . "\n" if $self->debug;
00915             foreach my $start_pos (sort {$b <=> $a} keys %$gaps) { ## IN REVERSE ORDER!!
00916                 my $end_pos = $gaps->{$start_pos};
00917                 ## substr works with 0-based coordinates
00918                 substr($aligned_sequence, $start_pos - 1, ($end_pos - $start_pos + 1), "");
00919         }
00920         ## Uses the new sequence
00921             $this_genomic_align->{cigar_line} = undef;
00922             $this_genomic_align->aligned_sequence($aligned_sequence);
00923         print "after cigar " . $this_genomic_align->cigar_line . "\n" if $self->debug;
00924     }
00925     }
00926 }
00927 
00928 
00929 #
00930 # Extract the sequence corresponding to the 2X genome fragment
00931 #
00932 sub _extract_sequence {
00933     my ($seq, $original_start, $original_end) = @_;
00934     my $original_count = 0;
00935     my $aligned_count = 0;
00936     my $aligned_start;
00937     my $aligned_end;
00938 
00939     #print "original_start $original_start original_end $original_end\n";
00940     foreach my $subseq (grep {$_} split /(\-+)/, $seq) {
00941     my $length = length($subseq);
00942     if ($subseq !~ /\-/) {
00943         if (!defined($aligned_start) && ($original_count + $length >= $original_start)) {
00944         $aligned_start = $aligned_count + ($original_start - $original_count) - 1;
00945         }
00946         if (!defined($aligned_end) && ($original_count + $length >= $original_end)) {
00947         $aligned_end = $aligned_count + $original_end - $original_count - 1;
00948         last;
00949         }
00950         $original_count += $length;
00951     }
00952     $aligned_count += $length;
00953     }
00954 
00955     my $subseq = substr($seq, $aligned_start, ($aligned_end-$aligned_start+1));
00956     return ($subseq, $aligned_start, $aligned_end);
00957 }
00958 
00959 ##########################################
00960 #
00961 # getter/setter methods
00962 # 
00963 ##########################################
00964 
00965 sub input_dir {
00966   my $self = shift;
00967   $self->{'_input_dir'} = shift if(@_);
00968   return $self->{'_input_dir'};
00969 }
00970 
00971 sub synteny_region_id {
00972   my ($self, $value) = shift;
00973   $self->param('synteny_region_id') = shift if(@_);
00974   return $self->param('synteny_region_id');
00975 }
00976 
00977 sub dnafrag_regions {
00978   my $self = shift;
00979   $self->{'_dnafrag_regions'} = shift if(@_);
00980   return $self->{'_dnafrag_regions'};
00981 }
00982 
00983 sub options {
00984   my $self = shift;
00985   $self->{'_options'} = shift if(@_);  return $self->{'_options'};
00986 }
00987 
00988 sub fasta_files {
00989   my $self = shift;
00990 
00991   $self->{'_fasta_files'} = [] unless (defined $self->{'_fasta_files'});
00992 
00993   if (@_) {
00994     my $value = shift;
00995     push @{$self->{'_fasta_files'}}, $value;
00996   }
00997 
00998   return $self->{'_fasta_files'};
00999 }
01000 
01001 sub species_order {
01002   my $self = shift;
01003 
01004   $self->{'_species_order'} = [] unless (defined $self->{'_species_order'});
01005 
01006   if (@_) {
01007     my $value = shift;
01008     push @{$self->{'_species_order'}}, $value;
01009   }
01010 
01011   return $self->{'_species_order'};
01012 }
01013 
01014 
01015 sub get_species_tree {
01016   my $self = shift;
01017 
01018   my $newick_species_tree;
01019   my $species_tree_meta_key = "tree_" . $self->param('ortheus_mlssid');
01020 
01021   if (defined($self->param('species_tree'))) {
01022     return $self->param('species_tree');
01023   } elsif ($self->param($species_tree_meta_key)) {
01024     $newick_species_tree = $self->param($species_tree_meta_key);
01025   } elsif ($self->param('tree_file')) {
01026     open(TREE_FILE, $self->param('tree_file')) or throw("Cannot open file ".$self->param('tree_file'));
01027     $newick_species_tree = join("", <TREE_FILE>);
01028     close(TREE_FILE);
01029   }
01030 
01031   if (!defined($newick_species_tree)) {
01032     return undef;
01033   }
01034 
01035   $newick_species_tree =~ s/^\s*//;
01036   $newick_species_tree =~ s/\s*$//;
01037   $newick_species_tree =~ s/[\r\n]//g;
01038 
01039   my $species_tree =
01040       Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick_species_tree);
01041 
01042   #if the tree leaves are species names, need to convert these into genome_db_ids
01043   my $genome_dbs = $self->compara_dba->get_GenomeDBAdaptor->fetch_all();
01044 
01045   my %leaf_name;
01046   my %leaf_check;
01047   foreach my $genome_db (@$genome_dbs) {
01048       my $name = $genome_db->name;
01049       $name =~ tr/ /_/;
01050       $leaf_name{$name} = $genome_db->dbID;
01051       if ($name ne "ancestral_sequences") {
01052       $leaf_check{$genome_db->dbID} = 2;
01053       }
01054   }
01055   foreach my $leaf (@{$species_tree->get_all_leaves}) {
01056       #check have names rather than genome_db_ids
01057       if ($leaf->name =~ /\D+/) {
01058       $leaf->name($leaf_name{lc($leaf->name)});
01059       }
01060       $leaf_check{lc($leaf->name)}++;
01061   }
01062 
01063   #Check have one instance in the tree of each genome_db in the database
01064   #Don't worry about having extra elements in the tree that aren't in the
01065   #genome_db table because these will be removed later
01066   foreach my $name (keys %leaf_check) {
01067       if ($leaf_check{$name} == 2) {
01068       throw("Unable to find genome_db_id $name in species_tree\n");
01069       }
01070   }
01071   
01072   $self->param('species_tree', $species_tree);
01073   return $self->param('species_tree');
01074 }
01075 
01076 sub tree_string {
01077   my $self = shift;
01078   $self->{'_tree_string'} = shift if(@_);
01079   return $self->{'_tree_string'};
01080 }
01081 
01082 #sub method_link_species_set_id {
01083 #  my $self = shift;
01084 #  $self->{'_method_link_species_set_id'} = shift if(@_);
01085 #  return $self->{'_method_link_species_set_id'};
01086 #}
01087 #
01088 #sub max_block_size {
01089 #  my $self = shift;
01090 #  $self->{'_max_block_size'} = shift if(@_);
01091 #  return $self->{'_max_block_size'};
01092 #}
01093 
01094 
01095 ##########################################
01096 #
01097 # internal methods
01098 #
01099 ##########################################
01100 
01101 =head2 _load_DnaFragRegions
01102 
01103   Arg [1]    : int syteny_region_id
01104   Example    : $self->_load_DnaFragRegions();
01105   Description: Gets the list of DnaFragRegions for this
01106                syteny_region_id. Resulting DnaFragRegions are
01107                stored using the dnafrag_regions getter/setter.
01108   Returntype : listref of Bio::EnsEMBL::Compara::DnaFragRegion objects
01109   Exception  :
01110   Warning    :
01111 
01112 =cut
01113 
01114 sub _load_DnaFragRegions {
01115   my ($self, $synteny_region_id) = @_;
01116 
01117   my $dnafrag_regions = [];
01118 
01119   # Fail if dbID has not been provided
01120   return $dnafrag_regions if (!$synteny_region_id);
01121 
01122   my $sra = $self->compara_dba->get_SyntenyRegionAdaptor;
01123   my $sr = $sra->fetch_by_dbID($self->param('synteny_region_id'));
01124 
01125   my $regions = $sr->get_all_DnaFragRegions();
01126   foreach my $dfr (@$regions) { 
01127     push(@{$dnafrag_regions}, $dfr);
01128   }
01129 
01130   $self->dnafrag_regions($dnafrag_regions);
01131 }
01132 
01133 
01134 =head2 _load_2XGenomes
01135 
01136   Arg [1]    : int syteny_region_id
01137   Arg [2]    : int analysis_data_id
01138   Description: Creates a fake assembly for each 2X genome by stitching
01139                together the BLASTZ_NET alignments found on this synteny_region
01140                between the reference species and each 2X genome. The list of
01141                the pairwise database locations and  
01142                Bio::EnsEMBL::Compara::MethodLinkSpeciesSet ids are obtained
01143                from the analysis_data_id. Creates a listref of genomic_align 
01144                fragments
01145   Returntype : 
01146   Exception  : 
01147   Warning    :
01148 
01149 =cut
01150 
01151 sub _load_2XGenomes {
01152   my ($self, $synteny_region_id, $analysis_data_id) = @_;
01153 
01154   #get data from analysis_data table
01155   my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
01156   my @parameters = split (" ",$analysis_data_adaptor->fetch_by_dbID($analysis_data_id));
01157 
01158   #if no 2x genomes defined, return
01159   if (scalar(@parameters) == 0) {
01160       print "No 2x genomes to load\n" if $self->debug;
01161       return;
01162   }
01163 
01164   #Find the slice on the reference genome
01165   my $genome_db_adaptor = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
01166 
01167   #DEBUG this opens up connections to all the databases
01168   my $ref_genome_db = $genome_db_adaptor->fetch_by_name_assembly($self->param('reference_species'));
01169   my $ref_dba = $ref_genome_db->db_adaptor;
01170   my $ref_slice_adaptor = $ref_dba->get_SliceAdaptor();
01171 
01172   #Find all the dnafrag_regions for the reference genome in this synteny region
01173   my $ref_dnafrags =[];
01174   foreach my $dnafrag_region (@{$self->dnafrag_regions}) {
01175       if ($dnafrag_region->genome_db->dbID == $ref_genome_db->dbID) {
01176       push @$ref_dnafrags, $dnafrag_region;
01177       }
01178   }
01179   
01180   #Return if there is no reference sequence in this synteny region
01181   if (scalar(@$ref_dnafrags) == 0) {
01182       print "No " . $self->param('reference_species') . " sequences found in syntenic block $synteny_region_id\n";
01183       return;
01184   }
01185 
01186   print "Synteny region $synteny_region_id num copies " . scalar(@$ref_dnafrags) . "\n" if $self->debug;
01187 
01188   #Find the BLASTZ_NET alignments between the reference species and each
01189   #2X genome.
01190   foreach my $params (@parameters) {
01191       my $param = eval($params);
01192       my $target_species;
01193 
01194       #open compara database containing 2x genome vs $ref_name blastz results
01195       my $compara_db_url = $param->{'compara_db_url'};
01196 
01197       #if the database name is defined in the url, then open that
01198       my $compara_dba;
01199       my $locator;
01200       if ($compara_db_url =~ /mysql:\/\/.*@.*\/.+/) {
01201       #open database defined in url
01202       $locator = "Bio::EnsEMBL::Compara::DBSQL::DBAdaptor/url=>$compara_db_url";
01203       } else {
01204       throw "Invalid url $compara_db_url. Should be of the form: mysql://user:pass\@host:port/db_name\n";
01205       }
01206 
01207       $compara_dba = Bio::EnsEMBL::DBLoader->new($locator);
01208 
01209       #need to store this to allow disconnect when call ortheus
01210       $self->{pairwise_compara_dba}->{$compara_dba->dbc->dbname} = $compara_dba;
01211       #Get pairwise genomic_align_block adaptor
01212       my $gaba = $compara_dba->get_GenomicAlignBlockAdaptor;
01213 
01214       #Get pairwise method_link_species_set
01215       my $p_mlss_adaptor = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
01216       my $pairwise_mlss = $p_mlss_adaptor->fetch_by_dbID($param->{'method_link_species_set_id'});
01217 
01218       #find non_reference species name in pairwise alignment
01219       my $species_set = $pairwise_mlss->species_set;
01220       foreach my $genome_db (@$species_set) {
01221       if ($genome_db->name ne $self->param('reference_species')) {
01222           $target_species = $genome_db->name;
01223           last;
01224       }
01225       }
01226      
01227       my $target_genome_db = $genome_db_adaptor->fetch_by_name_assembly($target_species);
01228       my $target_dba = $target_genome_db->db_adaptor;
01229       my $target_slice_adaptor = $target_dba->get_SliceAdaptor();
01230 
01231       #Foreach copy of the ref_genome in the multiple alignment block, 
01232       #find the alignment blocks between the ref_genome and the 2x 
01233       #target_genome in the pairwise database
01234 
01235       my $ga_frag_array;
01236       if ($create_block_frag_array) {
01237       $ga_frag_array = $self->_create_block_frag_array($gaba, $ref_slice_adaptor, $pairwise_mlss, $ref_dnafrags);
01238       } else {
01239       $ga_frag_array = $self->_create_span_frag_array($gaba, $ref_slice_adaptor, $pairwise_mlss, $ref_dnafrags);
01240       }
01241 
01242       #not found 2x genome
01243       next if (!defined $ga_frag_array);
01244 
01245       #must first sort so I have a reasonable chance of finding duplicates
01246       for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
01247       @{$ga_frag_array->[$i]} = sort {$a->{dnafrag_region}->dnafrag_start <=> $b->{dnafrag_region}->dnafrag_start} @{$ga_frag_array->[$i]};
01248       }
01249     
01250       #find the total length of all the fragments in the ref_region
01251       my $sum_lengths;
01252       for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
01253       for (my $j = 0; $j < scalar(@{$ga_frag_array->[$i]}); $j++) {
01254           $sum_lengths->[$i] += ($ga_frag_array->[$i][$j]->{dnafrag_region}->dnafrag_end - $ga_frag_array->[$i][$j]->{dnafrag_region}->dnafrag_start + 1);
01255       }
01256       }
01257 
01258       #check if there is any overlap between pairwise blocks on the ref_genomes
01259       #if there is an overlap, then choose ref_genome duplication which is the 
01260       #longest in 2x genome
01261       #if there is no overlap, save dnafrags on all duplications
01262       my $found_overlap;
01263       my $j = 0;
01264     
01265       #Simple case: only found one reference region containing 2x genome 
01266       #fragments
01267       if (@$ga_frag_array == 1) {
01268       my $cluster;
01269       $cluster = _add_to_cluster($cluster, 0);
01270       _print_cluster($cluster) if $self->debug;
01271       my $longest_ref_region = 0;
01272 
01273       print "SIMPLE CASE: longest_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
01274       
01275       _build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
01276 
01277       push @{$self->{ga_frag}}, $ga_frag_array->[$longest_ref_region];
01278       push @{$self->{'2x_dnafrag_region'}}, $ga_frag_array->[$longest_ref_region]->[0]->{dnafrag_region};
01279       next;
01280       }
01281 
01282       #Found more than one reference region in this synteny block
01283       for (my $region1 = 0; $region1 < scalar(@$ga_frag_array)-1; $region1++) {
01284       for (my $region2 = $region1+1; $region2 <  scalar(@$ga_frag_array); $region2++) {
01285           #initialise found_overlap hash
01286           if (!defined $found_overlap->{$region1}{$region2}) {
01287           $found_overlap->{$region1}{$region2} = 0;
01288           }
01289           
01290           #loop through the 2x genome fragments on region1
01291           for (my $j = 0; ($j < @{$ga_frag_array->[$region1]}); $j++) {
01292           
01293           #if I've already found an overlap, then stop
01294           last if ($found_overlap->{$region1}{$region2});
01295           
01296           #loop through 2x genome fragments on region2
01297           for (my $k = 0; ($k < @{$ga_frag_array->[$region2]}); $k++) {
01298 
01299             #if I've already found an overlap, then stop
01300               last if ($found_overlap->{$region1}{$region2});
01301               
01302               #check if 2x genome fragments have the same name
01303               if ($ga_frag_array->[$region1][$j]->{seq_region_name} eq $ga_frag_array->[$region2][$k]->{seq_region_name}) {
01304               
01305               #check these overlap
01306               if (($ga_frag_array->[$region1][$j]->{start} <= $ga_frag_array->[$region2][$k]->{end}) && ($ga_frag_array->[$region1][$j]->{end} >= $ga_frag_array->[$region2][$k]->{start})) {
01307 
01308                   $found_overlap->{$region1}{$region2} = 1;
01309                   print "found overlap $region1 $region2\n" if $self->debug;
01310                   #found overlap so can stop.
01311                   last;
01312               }
01313               }
01314           }
01315           }
01316       }
01317       }
01318 
01319       #Cluster all the alignment blocks that are overlapping together
01320       my $cluster = $self->_cluster_regions($found_overlap);
01321       _print_cluster($cluster) if $self->debug;
01322       my $longest_regions = $self->_find_longest_region_in_cluster($cluster, $sum_lengths);
01323 
01324       #find the reference with the longest region
01325       my $slice_array;
01326       foreach my $longest_ref_region (@$longest_regions) {
01327       print "longest_ref_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
01328 
01329       #store composite_seq in ga_frag_array->[$longest_ref_region]
01330       _build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
01331       push @{$self->{ga_frag}}, $ga_frag_array->[$longest_ref_region];
01332 
01333       push @{$self->{'2x_dnafrag_region'}}, $ga_frag_array->[$longest_ref_region]->[0]->{dnafrag_region};
01334 
01335       }
01336   } 
01337 }
01338 
01339 =head2 _dump_fasta
01340 
01341   Arg [1]    : -none-
01342   Example    : $self->_dump_fasta();
01343   Description: Dumps FASTA files in the order given by the tree
01344                string (needed by Pecan). Resulting file names are
01345                stored using the fasta_files getter/setter
01346   Returntype : 1
01347   Exception  :
01348   Warning    :
01349 
01350 =cut
01351 
01352 sub _dump_fasta {
01353   my $self = shift;
01354 
01355   my $all_dnafrag_regions = $self->dnafrag_regions;
01356 
01357   ## Dump FASTA files in the order given by the tree string (needed by Pecan)
01358   my @seqs;
01359   if ($self->tree_string) {
01360     @seqs = ($self->tree_string =~ /seq(\d+)/g);
01361   } else {
01362     @seqs = (1..scalar(@$all_dnafrag_regions));
01363   }
01364 
01365   foreach my $seq_id (@seqs) {
01366     my $dfr = $all_dnafrag_regions->[$seq_id-1];
01367 
01368     my $file = $self->worker_temp_directory . "/seq" . $seq_id . ".fa";
01369 
01370     print "file $file name " . $dfr->dnafrag->genome_db->name . "\n" if $self->debug;
01371 
01372     #Check if I have a DnaFragRegion object or my 2x genome object
01373     if (!UNIVERSAL::isa($dfr, 'Bio::EnsEMBL::Compara::DnaFragRegion')) {
01374     print "FOUND 2X GENOME\n" if $self->debug;
01375     print "num of frags " . @$dfr . "\n" if $self->debug;
01376     $self->_dump_2x_fasta($dfr, $file, $seq_id);
01377     next;
01378     }
01379     open F, ">$file" || throw("Couldn't open $file");
01380 
01381     print F ">SeqID" . $seq_id . "\n";
01382 
01383     print ">DnaFrag", $dfr->dnafrag_id, "|", $dfr->dnafrag->name, "|", $dfr->dnafrag->genome_db->name, "|", $dfr->dnafrag->genome_db_id, "|",
01384         $dfr->dnafrag_start, "-", $dfr->dnafrag_end, ":", $dfr->dnafrag_strand," $seq_id***\n" if $self->debug;
01385 
01386 # my $slice = $dfr->dnafrag->slice->sub_Slice($dfr->dnafrag_start,$dfr->dnafrag_end,$dfr->dnafrag_strand);
01387  
01388     my $slice = $dfr->slice;
01389     throw("Cannot get slice for DnaFragRegion in DnaFrag #".$dfr->dnafrag_id) if (!$slice);
01390     my $seq = $slice->get_repeatmasked_seq(undef, 1)->seq;
01391     if ($seq =~ /[^ACTGactgNnXx]/) {
01392       print STDERR $slice->name, " contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
01393       $seq =~ s/[^ACTGactgNnXx]/N/g;
01394     }
01395     $seq =~ s/(.{80})/$1\n/g;
01396     chomp $seq;
01397     print F $seq,"\n";
01398 
01399     close F;
01400 
01401     push @{$self->fasta_files}, $file;
01402     push @{$self->species_order}, $dfr->dnafrag->genome_db_id;
01403   }
01404 
01405   return 1;
01406 }
01407 
01408 
01409 =head2 _build_tree_string
01410 
01411   Arg [1]    : -none-
01412   Example    : $self->_build_tree_string();
01413   Description: This method sets the tree_string using the orginal
01414                species tree and the set of DnaFragRegions. The
01415                tree is edited by the _update_tree method which
01416                resort the DnaFragRegions (see _update_tree elsewwhere
01417                in this document)
01418   Returntype : -none-
01419   Exception  :
01420   Warning    :
01421 
01422 =cut
01423 
01424 sub _build_tree_string {
01425   my $self = shift;
01426 
01427   my $tree = $self->get_species_tree->copy;
01428   return if (!$tree);
01429 
01430   $tree = $self->_update_tree($tree);
01431 
01432   return if (!$tree);
01433 
01434   my $tree_string = $tree->newick_simple_format;
01435   # Remove quotes around node labels
01436   $tree_string =~ s/"(seq\d+)"/$1/g;
01437   # Remove branch length if 0
01438   $tree_string =~ s/\:0\.0+(\D)/$1/g;
01439   $tree_string =~ s/\:0([^\.\d])/$1/g;
01440 
01441   $tree->release_tree;
01442 
01443   $self->tree_string($tree_string);
01444 }
01445 
01446 
01447 =head2 _update_tree
01448 
01449   Arg [1]    : Bio::EnsEMBL::Compara::NestedSet $tree_root
01450   Example    : $self->_update_nodes_names($tree);
01451   Description: This method updates the tree by removing or
01452                duplicating the leaves according to the orginal
01453                tree and the set of DnaFragRegions. The tree nodes
01454                will be renamed seq1, seq2, seq3 and so on and the
01455                DnaFragRegions will be resorted in order to match
01456                the names of the nodes (the first DnaFragRegion will
01457                correspond to seq1, the second to seq2 and so on).
01458   Returntype : Bio::EnsEMBL::Compara::NestedSet (a tree)
01459   Exception  :
01460   Warning    :
01461 
01462 =cut
01463 
01464 sub _update_tree {
01465   my $self = shift;
01466   my $tree = shift;
01467 
01468   my $all_dnafrag_regions = $self->dnafrag_regions;
01469   my $ordered_dnafrag_regions = [];
01470   my $ordered_2x_genomes = [];
01471 
01472   my $idx = 1;
01473   my $all_leaves = $tree->get_all_leaves;
01474   foreach my $this_leaf (@$all_leaves) {
01475     my $these_dnafrag_regions = [];
01476     my $these_2x_genomes = [];
01477     ## Look for DnaFragRegions belonging to this genome_db_id
01478     foreach my $this_dnafrag_region (@$all_dnafrag_regions) {
01479       if ($this_dnafrag_region->dnafrag->genome_db_id == $this_leaf->name) {
01480         push (@$these_dnafrag_regions, $this_dnafrag_region);
01481       }
01482     }
01483 
01484     my $index = 0;
01485     foreach my $ga_frags (@{$self->{ga_frag}}) {
01486     my $first_frag = $ga_frags->[0];
01487     if ($first_frag->{genome_db_id} == $this_leaf->name) {
01488         push(@$these_2x_genomes, $index);
01489     }
01490     $index++;
01491     }
01492     print "num " . @$these_dnafrag_regions . " " . @$these_2x_genomes . "\n" if $self->debug;
01493 
01494     if (@$these_dnafrag_regions == 1) {
01495       ## If only 1 has been found...
01496       print "seq$idx genome_db_id=" . $these_dnafrag_regions->[0]->dnafrag->genome_db_id . "\n" if $self->debug;
01497 
01498       $this_leaf->name("seq".$idx++); #.".".$these_dnafrag_regions->[0]->dnafrag_id);
01499 
01500       push(@$ordered_dnafrag_regions, $these_dnafrag_regions->[0]);
01501 
01502     } elsif (@$these_dnafrag_regions > 1) {
01503       ## If more than 1 has been found, let Ortheus estimate the Tree
01504     #need to add on 2x genomes to dnafrag_regions array
01505     my $dfa = $self->dnafrag_regions;
01506     foreach my $ga_frags (@{$self->{ga_frag}}) {
01507         push @$dfa, $ga_frags;
01508     }
01509     $self->dnafrag_regions($dfa);
01510     return undef;
01511 
01512    } elsif (@$these_2x_genomes == 1) {
01513     #See what happens...
01514     #Find 2x genomes
01515        my $ga_frags = $self->{ga_frag}->[$these_2x_genomes->[0]];
01516        print "number of frags " . @$ga_frags . "\n" if $self->debug;
01517 
01518     print "2x seq$idx " . $ga_frags->[0]->{genome_db_id} . "\n" if $self->debug;
01519     $this_leaf->name("seq".$idx++);
01520     #push(@$ordered_2x_genomes, $these_2x_genomes->[0]);
01521     push(@$ordered_dnafrag_regions, $ga_frags);
01522   } else {
01523       ## If none has been found...
01524       $this_leaf->disavow_parent;
01525       $tree = $tree->minimize_tree;
01526     }
01527  }
01528  $self->dnafrag_regions($ordered_dnafrag_regions);
01529 
01530  $self->{ordered_2x_genomes} = $ordered_2x_genomes;
01531 
01532 
01533   #if (scalar(@$all_dnafrag_regions) != scalar(@$ordered_dnafrag_regions) or
01534    #   scalar(@$all_dnafrag_regions) != scalar(@{$tree->get_all_leaves})) {
01535    # throw("Tree has a wrong number of leaves after updating the node names");
01536   #}
01537 
01538   if ($tree->get_child_count == 1) {
01539     my $child = $tree->children->[0];
01540     $child->parent->merge_children($child);
01541     $child->disavow_parent;
01542   }
01543   return $tree;
01544 }
01545 
01546 #
01547 #From each reference genomic_align, find all the pairwise alignments for this
01548 #pairwise_mlss. Store only the pairwise match, NOT the region between blocks
01549 # as the create_span_frag_array does. Return an array
01550 #of ga_fragments for each reference genomic_align
01551 #
01552 sub _create_block_frag_array {
01553     my ($self, $gab_adaptor, $ref_slice_adaptor, $pairwise_mlss, $ref_dnafrags) = @_;
01554 
01555     my $ga_frag_array;
01556 
01557     my $ga_num_ns = 0;
01558 
01559     #Multiple alignment reference genomic_aligns (maybe more than 1)
01560     foreach my $ref_dnafrag (@$ref_dnafrags) {
01561     print "  " . $ref_dnafrag->dnafrag->name . " " . $ref_dnafrag->dnafrag_start . " " . $ref_dnafrag->dnafrag_end . " " . $ref_dnafrag->dnafrag_strand . "\n" if $self->debug;
01562     
01563     #find the slice corresponding to the ref_genome
01564     my $slice = $ref_slice_adaptor->fetch_by_region('toplevel', $ref_dnafrag->dnafrag->name, $ref_dnafrag->dnafrag_start, $ref_dnafrag->dnafrag_end, $ref_dnafrag->dnafrag_strand);
01565 
01566     print "ref_seq " . $slice->start . " " . $slice->end . " " . $slice->strand . " " . substr($slice->seq,0,120) . "\n" if $self->debug;
01567 
01568     #find the pairwise blocks between ref_genome and the 2x genome
01569     my $pairwise_gabs = $gab_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($pairwise_mlss, $slice, undef,undef,1);
01570     
01571     #sort by reference_genomic_align start position
01572     @$pairwise_gabs = sort {$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start} @$pairwise_gabs;
01573 
01574 
01575     print "    pairwise gabs " . scalar(@$pairwise_gabs) . "\n" if $self->debug;
01576     
01577     #if there are no pairwise matches found to 2x genome, then escape
01578     #back to loop
01579     next if (scalar(@$pairwise_gabs) == 0);
01580     
01581     my $ga_frags;
01582 
01583     #need to save each match separately but still use same structure as
01584     #create_span_frag_array in case we change our minds back again
01585 
01586     foreach my $pairwise_gab (@$pairwise_gabs) {
01587 
01588         #should only have 1!
01589         my $gas = $pairwise_gab->get_all_non_reference_genomic_aligns;
01590         
01591         my $ga = $gas->[0];
01592         my $ref_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start;
01593         my $ref_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;
01594 
01595         #need to reverse order of fragments if ref is on reverse strand
01596         if ($slice->strand == -1) {
01597         my $tmp_start = $ref_start;
01598         $ref_start = $slice->end - $ref_end + $slice->start;
01599         $ref_end = $slice->end - $tmp_start + $slice->start;
01600         #print "REVERSE $ref_start $ref_end\n";
01601         }
01602 
01603         my $dnafrag_adaptor = $self->{'comparaDBA'}->get_DnaFragAdaptor;
01604         
01605         my $dnafrag_region = new Bio::EnsEMBL::Compara::DnaFragRegion(
01606                   -dnafrag_id => $ga->dnafrag->dbID,
01607                   -dnafrag_start => $ga->dnafrag_start,
01608                   -dnafrag_end => $ga->dnafrag_end,
01609                               -dnafrag_strand => $ga->dnafrag_strand,
01610                               -adaptor => $dnafrag_adaptor
01611                       );
01612         
01613 
01614         my $ga_fragment = {dnafrag_region => $dnafrag_region,
01615                    genome_db => $ga->dnafrag->genome_db,
01616                    genome_db_id => $ga->dnafrag->genome_db_id,
01617                    ref_dnafrag => $ref_dnafrag,
01618                    ref_start => $ref_start,
01619                    ref_end => $ref_end,
01620                    ref_slice_start => $slice->start,
01621                    ref_slice_end => $slice->end};
01622         
01623         push @$ga_frags, $ga_fragment;
01624     }
01625     #add to array of fragments for each reference genomic_align
01626     push @$ga_frag_array, $ga_frags;
01627     }
01628     return $ga_frag_array;
01629 }
01630 
01631 
01632 
01633 #
01634 #From each reference genomic_align, find all the pairwise alignments for this
01635 #pairwise_mlss. Summarise the genomic_aligns in the same group_id by storing 
01636 #the min start and max end and create a new DnaFragRegion. Return an array
01637 #of ga_fragments for each reference genomic_align
01638 #
01639 sub _create_span_frag_array {
01640     my ($self, $gab_adaptor, $ref_slice_adaptor, $pairwise_mlss, $ref_dnafrags) = @_;
01641 
01642     my $ga_frag_array;
01643 
01644     my $ga_num_ns = 0;
01645 
01646     #Multiple alignment reference genomic_aligns (maybe more than 1)
01647     foreach my $ref_dnafrag (@$ref_dnafrags) {
01648     print "  " . $ref_dnafrag->dnafrag->name . " " . $ref_dnafrag->dnafrag_start . " " . $ref_dnafrag->dnafrag_end . " " . $ref_dnafrag->dnafrag_strand . "\n" if $self->debug;
01649     
01650 
01651     #find the slice corresponding the ref_genome
01652     my $slice = $ref_slice_adaptor->fetch_by_region('toplevel', $ref_dnafrag->dnafrag->name, $ref_dnafrag->dnafrag_start, $ref_dnafrag->dnafrag_end, $ref_dnafrag->dnafrag_strand);
01653 
01654     print "ref_seq " . $slice->start . " " . $slice->end . " " . $slice->strand . " " . substr($slice->seq,0,120) . "\n" if $self->debug;
01655 
01656     #find the pairwise blocks between ref_genome and the 2x genome
01657     my $pairwise_gabs = $gab_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($pairwise_mlss, $slice, undef,undef,1);
01658     
01659     #sort by reference_genomic_align start position
01660     @$pairwise_gabs = sort {$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start} @$pairwise_gabs;
01661 
01662 
01663     print "    pairwise gabs " . scalar(@$pairwise_gabs) . "\n" if $self->debug;
01664     
01665     #if there are no pairwise matches found to 2x genome, then escape
01666     #back to loop
01667     next if (scalar(@$pairwise_gabs) == 0);
01668     
01669     my $ga_frags;
01670     
01671     #Group together blocks in the same contiguous group and store the left most
01672     #and right most coords in a slice object
01673     
01674     #initialise prev_group_id
01675     my $prev_group_id = $pairwise_gabs->[0]->group_id;
01676     my $min_start;
01677     my $max_end;
01678     my $dnafrag_name;
01679     my $genome_db_id;
01680     my $genome_db;
01681     my $dnafrag_strand;
01682     my $prev_ga;
01683     my $ref_min_start;
01684     my $ref_max_end;
01685     my $dnafrag;
01686 
01687     foreach my $pairwise_gab (@$pairwise_gabs) {
01688 
01689         #should only have 1!
01690         my $gas = $pairwise_gab->get_all_non_reference_genomic_aligns;
01691 
01692         my $ga = $gas->[0];
01693 
01694 
01695         print "    " . $ga->genome_db->name . " " . $ga->dnafrag->name . " " . $ga->dnafrag_start . " " . $ga->dnafrag_end . " " . $ga->dnafrag_strand . " " . $pairwise_gab->group_id . " " . $ga->dnafrag->coord_system_name . " " . $ga->genomic_align_block->reference_genomic_align->dnafrag_start . " " . $ga->genomic_align_block->reference_genomic_align->dnafrag_end . " " . $ga->genomic_align_block->reference_genomic_align->dnafrag_strand . "\n" if $self->debug;
01696         
01697         my $ga_slice = $ga->get_Slice;
01698 
01699         $ga_num_ns += $ga_slice->seq =~ tr/N/N/;
01700 
01701         #need to group all genomic_aligns of the same group together
01702         if ($prev_group_id == $pairwise_gab->group_id) {
01703         if (!defined $min_start || $ga->dnafrag_start < $min_start) {
01704             $min_start = $ga->dnafrag_start;
01705 
01706             #if the ref and non-ref genomic_aligns are on different
01707             #strands, need to swap start and end
01708             if ($ga->dnafrag_strand == $slice->strand) {
01709             $ref_min_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start;
01710             } else {
01711             $ref_max_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;
01712             }
01713         } 
01714         if (!defined $max_end || $ga->dnafrag_end > $max_end) {
01715             $max_end = $ga->dnafrag_end;
01716 
01717             #if the ref and non-ref genomic_aligns are on different
01718             #strands, need to swap start and end
01719             if ($ga->dnafrag_strand == $slice->strand) {
01720             $ref_max_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;
01721             } else { 
01722             $ref_min_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start; 
01723             }
01724         } 
01725         } else {
01726 
01727         print "ref_min_start $ref_min_start ref_max_end $ref_max_end\n" if ($self->debug);
01728         #need to reverse order of fragments if ref is on reverse strand
01729         if ($slice->strand == -1) {
01730             $ref_min_start = $slice->end - $ref_min_start + $slice->start;
01731             $ref_max_end = $slice->end - $ref_max_end + $slice->start;
01732         }
01733 
01734         #ensure than ref_start is always smaller than ref_end (can be
01735         #larger if strand is 0)
01736         my $ref_start;
01737         if ($ref_min_start > $ref_max_end) {
01738             $ref_start = $ref_max_end;
01739             $ref_max_end = $ref_min_start;
01740             $ref_min_start = $ref_start;
01741         }
01742         
01743         my $dnafrag_adaptor = $self->{'comparaDBA'}->get_DnaFragAdaptor;
01744 
01745         my $dnafrag_region = new Bio::EnsEMBL::Compara::DnaFragRegion(
01746                    -dnafrag_id => $dnafrag->dbID,
01747                    -dnafrag_start => $min_start,
01748                        -dnafrag_end => $max_end,
01749                        -dnafrag_strand => $dnafrag_strand,
01750                        -adaptor => $dnafrag_adaptor
01751                );
01752         
01753 
01754         my $ga_fragment = {dnafrag_region => $dnafrag_region,
01755                    genome_db => $genome_db,
01756                    genome_db_id => $genome_db_id,
01757                    ref_dnafrag => $ref_dnafrag,
01758                    ref_start => $ref_min_start,
01759                    ref_end => $ref_max_end,
01760                    ref_slice_start => $slice->start,
01761                    ref_slice_end => $slice->end};
01762         
01763         print "store frag $min_start $max_end " . ($max_end - $min_start) . "\n" if $self->debug;
01764         print "final seq $ref_min_start $ref_max_end " . substr($dnafrag_region->slice->seq,0,10) . "\n" if $self->debug;
01765         
01766         push @$ga_frags, $ga_fragment;
01767         
01768         #reinitialise min_start and max_end
01769         $min_start = $ga->dnafrag_start;
01770         $ref_min_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start;
01771         $max_end = $ga->dnafrag_end;
01772         $ref_max_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;
01773         }
01774         $dnafrag_name = $ga->dnafrag->name;
01775         $genome_db_id = $ga->dnafrag->genome_db_id;
01776         $genome_db = $ga->dnafrag->genome_db;
01777         $dnafrag = $ga->dnafrag;
01778         
01779         #now get ref slice in correct orientation so this is fine now.
01780         $dnafrag_strand = $ga->dnafrag_strand;
01781         
01782         $prev_group_id = $pairwise_gab->group_id;
01783         $prev_ga = $ga;
01784     }
01785     #store last frag
01786 
01787     #need to reverse order of fragments if ref is on reverse strand
01788     if ($slice->strand == -1) {
01789         $ref_min_start = $slice->end - $ref_min_start + $slice->start;
01790         $ref_max_end = $slice->end - $ref_max_end + $slice->start;
01791     }
01792 
01793     #ensure than ref_start is always smaller than ref_end (can be
01794     #larger if strand is -1)
01795     my $ref_start;
01796     if ($ref_min_start > $ref_max_end) {
01797         $ref_start = $ref_max_end;
01798         $ref_max_end = $ref_min_start;
01799         $ref_min_start = $ref_start;
01800     }
01801     print "store last $min_start $max_end $ref_min_start $ref_max_end \n" if $self->debug;
01802 
01803     my $dnafrag_region = new Bio::EnsEMBL::Compara::DnaFragRegion(
01804            -dnafrag_id => $dnafrag->dbID,
01805            -dnafrag_start => $min_start,
01806                -dnafrag_end => $max_end,
01807                -dnafrag_strand => $dnafrag_strand
01808            );
01809 
01810     my $ga_fragment = {dnafrag_region => $dnafrag_region,
01811                genome_db => $genome_db,
01812                genome_db_id => $genome_db_id,
01813                ref_dnafrag => $ref_dnafrag,
01814                ref_start => $ref_min_start,
01815                ref_end => $ref_max_end,
01816                ref_slice_start => $slice->start,
01817                ref_slice_end => $slice->end};
01818 
01819     #store last $ga_fragment
01820     push @$ga_frags, $ga_fragment;
01821 
01822     #add to array of fragments for each reference genomic_align
01823     push @$ga_frag_array, $ga_frags;
01824     }
01825     return $ga_frag_array;
01826 }
01827 
01828 
01829 #foreach cluster, find the longest region.
01830 sub _find_longest_region_in_cluster {
01831     my ($self, $cluster, $sum_lengths) = @_;
01832 
01833     my $max_frag = 0;
01834     my $final_region = -1;
01835     my @overlap_frag;
01836 
01837     my $overlap_cnt = 0;
01838     my $not_overlap_cnt = 0;
01839     my $longest_clusters;
01840 
01841     foreach my $this_cluster (@$cluster) {
01842     my $longest_frag;
01843     my $longest_region;
01844 
01845     foreach my $region (keys %{$this_cluster}) {
01846         #initialise variables
01847         if (!defined $longest_frag) {
01848         $longest_frag = $sum_lengths->[$region];
01849         $longest_region = $region;
01850         }
01851 
01852         if ($sum_lengths->[$region] >= $longest_frag) {
01853         $longest_frag = $sum_lengths->[$region];
01854         $longest_region = $region;
01855         }
01856     }
01857     push @$longest_clusters, $longest_region;
01858     }
01859     print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
01860     return $longest_clusters;
01861 }
01862 
01863 #Put overlapping regions in the same cluster. If region 0 overlaps with region 
01864 #1 and region 2, but not with region 3, create 2 clusters: (0,1,2), (3)
01865 sub _cluster_regions {
01866     my ($self, $found_overlap) = @_;
01867 
01868     my $overlap_cnt = 0;
01869     my $not_overlap_cnt = 0;
01870 
01871     my $cluster;
01872 
01873     foreach my $region1 (keys %$found_overlap) {
01874     foreach my $region2 (keys %{$found_overlap->{$region1}}) {
01875         print "FOUND OVERLAP $region1 $region2 " . $found_overlap->{$region1}{$region2} . "\n" if $self->debug;
01876         if ($found_overlap->{$region1}{$region2}) {
01877         $overlap_cnt++;
01878 
01879         $cluster = _add_to_same_cluster($cluster, $region1, $region2);
01880         } else {
01881         $not_overlap_cnt++;
01882         $cluster = _add_to_different_cluster($cluster, $region1, $region2);
01883         }
01884     }
01885     }
01886     print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
01887     return $cluster;
01888 }
01889 
01890 #add single region to cluster. No overlaps found.
01891 sub _add_to_cluster {
01892     my ($cluster, $region1) = @_;
01893 
01894     if (!defined $cluster) {
01895      $cluster->[0]->{$region1} = 1;
01896      }
01897     return $cluster;
01898 }
01899 
01900  sub _add_to_same_cluster {
01901      my ($cluster, $region1, $region2) = @_;
01902 
01903      #print "add to same cluster $region1 $region2\n";
01904 
01905      if (!defined $cluster) {
01906      $cluster->[0]->{$region1} = 1;
01907      $cluster->[0]->{$region2} = 1;
01908      return $cluster;
01909      }
01910 
01911      my $cluster_size = @$cluster;
01912 
01913      my $index1 = _in_cluster($cluster, $region1);
01914      my $index2 = _in_cluster($cluster, $region2);
01915 
01916      if ($index1 == -1 && $index2 == -1) {
01917      #neither found, add both to new cluster
01918      $cluster->[$cluster_size]->{$region1} = 1;
01919      $cluster->[$cluster_size]->{$region2} = 1;
01920      } elsif ($index1 != -1 && $index2 == -1) {
01921      #id1 found, id2 not. add id2 to id1 cluster
01922      $cluster->[$index1]->{$region2} = 1;
01923      } elsif ($index1 == -1 && $index2 != -1) {
01924      #id2 found, id1 not. add id1 to id2 cluster
01925      $cluster->[$index2]->{$region1} = 1;
01926      } else {
01927      #both ids set in different clusters. Merge the clusters together
01928      $cluster = _merge_clusters($cluster, $index1, $index2);
01929      }
01930      return $cluster;
01931  }
01932 
01933 sub _add_to_different_cluster {
01934      my ($cluster, $region1, $region2) = @_;
01935 
01936      if (!defined $cluster) {
01937      $cluster->[0]->{$region1} = 1;
01938      $cluster->[1]->{$region2} = 1;
01939      return $cluster;
01940      }
01941      my $cluster_size = @$cluster;
01942 
01943      my $index1 = _in_cluster($cluster, $region1);
01944      my $index2 = _in_cluster($cluster, $region2);
01945 
01946      if ($index1 == -1) {
01947      $cluster->[@$cluster]->{$region1} = 1;
01948      } 
01949      if ($index2 == -1) {
01950      $cluster->[@$cluster]->{$region2} = 1;
01951      }
01952 
01953      return $cluster;
01954  }
01955 
01956  sub _in_cluster {
01957      my ($cluster, $region) = @_;
01958 
01959      for (my $i = 0; $i < @$cluster; $i++) {
01960     if ($cluster->[$i]->{$region}) {
01961         return $i;
01962     }
01963      }
01964      return -1;
01965  }
01966 
01967 sub _merge_clusters {
01968     my ($cluster, $index1, $index2) = @_;
01969     
01970     #already in same cluster
01971     if ($index1 != -1 && $index1 == $index2) {
01972     return $cluster;
01973     }
01974 
01975     #copy over keys from index2 to index1
01976     foreach my $region (keys %{$cluster->[$index2]}) {
01977     #print "region $region\n";
01978     $cluster->[$index1]->{$region} = 1;
01979     }
01980  
01981     #delete index2
01982     splice(@$cluster, $index2, 1);
01983 
01984     return $cluster;
01985 }
01986 
01987 sub _print_cluster {
01988     my ($cluster) = @_;
01989 
01990     print "FINAL cluster ";
01991     foreach my $this_cluster (@$cluster) {
01992     print "(";
01993     foreach my $group (keys %{$this_cluster}) {
01994         print "$group ";
01995     }
01996     print "), ";
01997     }
01998     print "\n";
01999 }
02000 
02001 #Build a sequence by concatenating all the fragments together and adding
02002 #num_pads between each fragment. If the distance between one fragment and the
02003 #next is less than max_pads, num_pads = distance else num_pads = max_pads.
02004 #Store the number of pads added in the ga_fragment structure as num_pads
02005 #This is the num_pads added to the end of fragment so should be indexed using
02006 #ga_frag->{ref_end}+1. Note that the number of pads added to the beginning is
02007 #currently *not* stored.
02008 sub _build_2x_composite_seq {
02009     my ($self, $pairwise_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frags) = @_;
02010 
02011     my $slice_array;
02012     my $composite_seq;
02013 
02014     #need to sort on ref_start
02015     @$ga_frags = sort {$a->{ref_start} <=> $b->{ref_start}} @$ga_frags;
02016 
02017     my $first_frag = $ga_frags->[0];
02018     my $num_pads;
02019 
02020     my $prev_end;
02021     my $prev_frag;
02022 
02023     my $dnafrag_adaptor = $pairwise_dba->get_DnaFragAdaptor;
02024 
02025     #work out how many pads to add to the beginning from the reference seq
02026     $num_pads = $first_frag->{ref_start} - $first_frag->{ref_slice_start};
02027     #print "slice start " . $first_frag->{ref_slice_start} . " end " . $first_frag->{ref_slice_end} . " num pads $num_pads\n";
02028     $num_pads = $max_pads if ($num_pads > $max_pads);
02029     $num_pads = int($num_pads * $max_pads_percent);
02030 
02031     $composite_seq .= $pad_char x $num_pads;
02032 
02033     #store first set of pads in {first_pads}
02034     $first_frag->{first_pads} = $num_pads;
02035 
02036     #always add $max_pads to the beginning
02037     #$composite_seq .= $pad_char x $max_pads;
02038     foreach my $ga_frag (@$ga_frags) {
02039 
02040     my $dnafrag = $dnafrag_adaptor->fetch_by_dbID($ga_frag->{dnafrag_region}->dnafrag_id);
02041 
02042     print "species " . $dnafrag->genome_db->name . " name " . $dnafrag->name . " start " . $ga_frag->{dnafrag_region}->dnafrag_start . " end " . $ga_frag->{dnafrag_region}->dnafrag_end . " len " . ($ga_frag->{dnafrag_region}->dnafrag_end-$ga_frag->{dnafrag_region}->dnafrag_start+1) . " strand " . $ga_frag->{dnafrag_region}->dnafrag_strand . " ref_name " . $ga_frag->{ref_dnafrag}->dnafrag->name . " ref_start " . $ga_frag->{ref_start} . " ref_end " . $ga_frag->{ref_end} . " ref_len " . ($ga_frag->{ref_end}-$ga_frag->{ref_start}+1) . "\n" if $self->debug;
02043     if (defined($prev_frag)) {
02044 
02045         print "prev_end " . $prev_frag->{ref_end} . " start " . $ga_frag->{ref_start} . "\n" if $self->debug;
02046 
02047         #Find the number of bases between fragments
02048         $num_pads = $ga_frag->{ref_start} - $prev_frag->{ref_end} - 1;
02049 
02050         print "before max_pads $num_pads\n" if $self->debug;
02051 
02052         #Add up to $max_pads between fragments
02053         $num_pads = $max_pads if ($num_pads > $max_pads);
02054         $num_pads = int($num_pads * $max_pads_percent);
02055 
02056         print "pads $num_pads\n" if $self->debug;
02057         $composite_seq .= $pad_char x $num_pads;
02058 
02059         #Store number of pads added to the end of previous frag. Use
02060         #{ref_end} to identify where the pads have been added
02061         $prev_frag->{num_pads} = $num_pads;
02062     }
02063     
02064     my $ref_slice = $ref_slice_adaptor->fetch_by_region('toplevel', $ga_frag->{ref_dnafrag}->dnafrag->name, $ga_frag->{ref_dnafrag}->dnafrag_start, $ga_frag->{ref_dnafrag}->dnafrag_end, $ga_frag->{ref_dnafrag}->dnafrag_strand);
02065     
02066     my $slice = $target_slice_adaptor->fetch_by_region('toplevel', $dnafrag->name, $ga_frag->{dnafrag_region}->dnafrag_start, $ga_frag->{dnafrag_region}->dnafrag_end, $ga_frag->{dnafrag_region}->dnafrag_strand);
02067                                
02068     my $seq = $slice->get_repeatmasked_seq(undef, 1)->seq;
02069     if ($seq =~ /[^ACTGactgNnXx]/) {
02070         print STDERR $slice->name, " contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
02071         $seq =~ s/[^ACTGactgNnXx]/N/g;
02072     }
02073     $composite_seq .= $seq;
02074 
02075     #store end of previous fragment 
02076     $prev_frag = $ga_frag;
02077     }
02078 
02079     my $last_frag = $ga_frags->[-1];
02080 
02081     #work out how many pads to add to the end
02082     $num_pads = $first_frag->{ref_slice_end} - $last_frag->{ref_end};
02083     $num_pads = $max_pads if ($num_pads > $max_pads);
02084     $num_pads = int($num_pads * $max_pads_percent);
02085 
02086     #print "ref slice end " . $first_frag->{ref_slice_end} . " last ele " . $ga_frags->[-1]->{ref_end} . " num pads $num_pads\n";
02087     $composite_seq .= $pad_char x $num_pads;
02088 
02089     #store last pads
02090     $last_frag->{num_pads} = $num_pads;
02091 
02092     #always write $max_pads at the end
02093     #print "last pads $max_pads\n" if $self->debug;
02094 
02095     #$composite_seq .= $pad_char x $max_pads;
02096 
02097     $composite_seq =~ s/(.{80})/$1\n/g;
02098     chomp $composite_seq;
02099 
02100     #store sequence in first object in ga_frags array
02101     $first_frag->{seq} = $composite_seq;
02102 
02103     return $composite_seq;
02104 }
02105 
02106 sub _dump_2x_fasta {
02107     my ($self, $ga_frags, $file, $seq_id) = @_;
02108 
02109     open F, ">$file" || throw("Couldn't open $file");
02110 
02111     print F ">SeqID" . $seq_id . "\n";
02112 
02113     #stored concatenated mfa sequence on first frag
02114     print F $ga_frags->[0]->{seq},"\n";
02115     
02116     close F;
02117     push @{$self->fasta_files}, $file;
02118     push @{$self->species_order}, $ga_frags->[0]->{genome_db_id};
02119 
02120 }
02121 
02122 1;