Archive Ensembl HomeArchive Ensembl Home
TrimAnchorAlign.pm
Go to the documentation of this file.
00001 
00002 # You may distribute this module under the same terms as perl itself
00003 # POD documentation - main docs before the code
00004 
00005 =pod 
00006 
00007 =head1 NAME
00008 
00009 Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign
00010 
00011 =cut
00012 
00013 =head1 SYNOPSIS
00014 
00015 parameters
00016 {input_analysis_id=> ?,method_link_species_set_id=> ?,test_method_link_species_set_id=> ?, genome_db_ids => [?],}
00017 
00018 =cut
00019 
00020 =head1 DESCRIPTION
00021 
00022 Finds a good splitting point within the anchors. Leave the anchors as a 2bp long region.
00023 
00024 - fetch_input
00025 - run
00026 - write_output
00027 
00028 =cut
00029 
00030 =head1 CONTACT
00031 
00032 dev@ensembl.org
00033 
00034 =cut
00035 
00036 =head1 APPENDIX
00037 
00038 The rest of the documentation details each of the object methods.
00039 Internal methods are usually preceded with a _
00040 
00041 =cut
00042 
00043 package Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign;
00044 
00045 use strict;
00046 use Data::Dumper;
00047 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00048 use Bio::EnsEMBL::Hive::Process;
00049 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00050 use Bio::EnsEMBL::Compara::Production::EPOanchors::AnchorAlign;
00051 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00052 use Bio::EnsEMBL::Analysis::Runnable::Pecan;
00053 
00054 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00055 
00056 
00057 sub fetch_input {
00058   my ($self) = @_;
00059 
00060   $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
00061   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00062   $self->get_params($self->parameters);
00063   $self->get_params($self->input_id);
00064 
00065   if (!$self->anchor_id) {
00066     return 0;
00067   }
00068 #   $self->{'fasta_files'} = [];
00069   my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor();
00070   ## This method returns a hash at the moment, not the objects
00071 #   print "Fetching AnchorAligns for Anchor ", $self->{'anchor_id'}, " and MLSS ", $self->{'input_method_link_species_set_id'}, "\n";
00072   my $anchor_align_hash = $anchor_align_adaptor->fetch_all_by_anchor_id_and_mlss_id(
00073       $self->{'anchor_id'}, $self->{'input_method_link_species_set_id'});
00074   die "Cannot find any anchor_align with anchor_id = ". $self->{'anchor_id'}.
00075     " and method_link_species_set_id = ". $self->{'input_method_link_species_set_id'}
00076       if (!$anchor_align_hash);
00077   foreach my $anchor_align_id (keys %$anchor_align_hash) {
00078 #     print "Fetching AnchorAlign $anchor_align_id\n";
00079     my $anchor_align = $anchor_align_adaptor->fetch_by_dbID($anchor_align_id);
00080     push(@{$self->{'anchor_aligns'}}, $anchor_align);
00081   }
00082   $self->_dump_fasta();
00083   exit(1) if (!$self->{'fasta_files'} or !@{$self->{'fasta_files'}});
00084 
00085 
00086   return 1;
00087 }
00088 
00089 sub run {
00090   my $self = shift;
00091   exit(1) if (!$self->{'fasta_files'} or !@{$self->{'fasta_files'}});
00092   exit(1) if (!$self->{'tree_string'});
00093 
00094   my $run_str = "/software/ensembl/compara/OrtheusC/bin/OrtheusC";
00095   $run_str .= " -a " . join(" ", @{$self->{'fasta_files'}});
00096   $run_str .= " -b '" . $self->{'tree_string'} . "'";
00097   $run_str .= " -h"; # output leaves only
00098 
00099   $self->{'msa_string'} = qx"$run_str";
00100 
00101   my $trim_position = $self->get_best_trimming_position($self->{'msa_string'});
00102   $self->{'trimmed_anchor_aligns'} = $self->get_trimmed_anchor_aligns($trim_position);
00103 
00104   return 1;
00105 }
00106 
00107 
00108 sub write_output {
00109   my ($self) = @_;
00110 
00111   my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor();
00112   foreach my $this_trimmed_anchor_align (@{$self->{'trimmed_anchor_aligns'}}) {
00113     $anchor_align_adaptor->store($this_trimmed_anchor_align);
00114   }
00115 
00116   return 1;
00117 }
00118 
00119 sub anchor_id {
00120   my $self = shift;
00121   if (@_) {
00122     $self->{anchor_id} = shift;
00123   }
00124   return $self->{anchor_id};
00125 }
00126 
00127 sub input_method_link_species_set_id {
00128   my $self = shift;
00129   if (@_) {
00130     $self->{input_method_link_species_set_id} = shift;
00131   }
00132   return $self->{input_method_link_species_set_id};
00133 }
00134 
00135 sub output_method_link_species_set_id {
00136   my $self = shift;
00137   if (@_) {
00138     $self->{output_method_link_species_set_id} = shift;
00139   }
00140   return $self->{output_method_link_species_set_id};
00141 }
00142 
00143 sub get_params {
00144   my $self         = shift;
00145   my $param_string = shift;
00146 
00147   return unless($param_string);
00148   print STDERR "parsing parameter string : ",$param_string,"\n";
00149 
00150   my $params = eval($param_string);
00151   return unless($params);
00152 
00153   if(defined($params->{'anchor_id'})) {
00154     $self->anchor_id($params->{'anchor_id'});
00155   }
00156   if(defined($params->{'input_method_link_species_set_id'})) {
00157     $self->input_method_link_species_set_id($params->{'input_method_link_species_set_id'});
00158   }
00159   if(defined($params->{'output_method_link_species_set_id'})) {
00160     $self->output_method_link_species_set_id($params->{'output_method_link_species_set_id'});
00161   }
00162   if(defined($params->{'method_link_species_set_id'})) {
00163     $self->method_link_species_set_id($params->{'method_link_species_set_id'});
00164   }
00165 
00166   return 1;
00167 }
00168 
00169 =head2 _dump_fasta
00170 
00171   Arg [1]    : -none-
00172   Example    : $self->_dump_fasta();
00173   Description: Dumps FASTA files in the order given by the tree
00174                string (needed by Pecan). Resulting file names are
00175                stored using the fasta_files getter/setter
00176   Returntype : 1
00177   Exception  :
00178   Warning    :
00179 
00180 =cut
00181 
00182 sub _dump_fasta {
00183   my $self = shift;
00184 
00185   my $all_anchor_aligns = $self->{'anchor_aligns'};
00186   $self->{tree_string} = "(";
00187 
00188   foreach my $anchor_align (@$all_anchor_aligns) {
00189     my $anchor_align_id = $anchor_align->dbID;
00190     $self->{tree_string} .= "aa$anchor_align_id:0.1,";
00191     my $file = $self->worker_temp_directory . "/seq" . $anchor_align_id . ".fa";
00192 
00193     open F, ">$file" || throw("Couldn't open $file");
00194 
00195     print F ">AnchorAlign", $anchor_align_id, "|", $anchor_align->dnafrag->name, ".",
00196         $anchor_align->dnafrag_start, "-", $anchor_align->dnafrag_end, ":",
00197         $anchor_align->dnafrag_strand, "\n";
00198     my $seq = $anchor_align->seq;
00199     if ($seq =~ /[^ACTGactgNnXx]/) {
00200       print STDERR "AnchorAlign $anchor_align_id contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
00201       $seq =~ s/[^ACTGactgNnXx]/N/g;
00202     }
00203     $seq =~ s/(.{80})/$1\n/g;
00204     chomp $seq;
00205     print F $seq,"\n";
00206 
00207     close F;
00208 
00209     push @{$self->{'fasta_files'}}, $file;
00210   }
00211   substr($self->{tree_string}, -1, 1, ");");
00212 
00213   return 1;
00214 }
00215 
00216 sub get_best_trimming_position {
00217   my ($self, $msa_string) = @_;
00218   my $num_of_leaves = 1;
00219 
00220   ################################
00221   # Parse the multiple alignment
00222   ################################
00223   my @lines = split("\n", $msa_string);
00224   my $anchor_align_id;
00225   my $seqs;
00226   my $this_seq;
00227   foreach my $this_line (@lines) {
00228     if ($this_line =~ /score/) {
00229     } elsif ($this_line =~ /^>aa(\d+)$/) {
00230       $seqs->{$anchor_align_id} = $this_seq if ($anchor_align_id);
00231       $anchor_align_id = $1;
00232       $this_seq = "";
00233       $num_of_leaves++;
00234     } else {
00235       $this_seq .= $this_line;
00236     }
00237   }
00238   $seqs->{$anchor_align_id} = $this_seq;
00239   $self->{'aligned_sequences'} = $seqs;
00240 
00241   my $ideal_score = 4 * $num_of_leaves;
00242 
00243   #####################################
00244   # consensus seq, gaps, conservation
00245   #####################################
00246   my @gaps;
00247   my @consensus;
00248   my @conservation;
00249   foreach $this_seq (values %$seqs) {
00250     my @seq = split("", $this_seq);
00251     for (my $i = 0; $i < @seq; $i++) {
00252       if ($seq[$i] eq "-") {
00253         $gaps[$i]++;
00254       } elsif ($seq[$i] =~ /[ACTG]/) {
00255         $consensus[$i]->{$seq[$i]}++;
00256       }
00257     }
00258   }
00259   for (my $i=0; $i<@consensus; $i++) {
00260     my $cons_hash_ref = $consensus[$i];
00261     my $max = 0;
00262     my $consensus_nucleotide;
00263     while (my ($nucleotide, $this_count) = each %$cons_hash_ref) {
00264       if ($this_count > $max) {
00265         $max = $this_count;
00266         $consensus_nucleotide = $nucleotide;
00267       }
00268     }
00269     push(@conservation, $max);
00270     $consensus[$i] = $consensus_nucleotide;
00271   }
00272 
00273   ####################################################
00274   # Score each position according to previous values
00275   ####################################################
00276   my @final_scores;
00277   for (my $i = 0; $i < @consensus - 3; $i++) {
00278     my @these_bases = @consensus[$i..($i+3)];
00279     my @these_gaps = @gaps[$i..($i+3)];
00280     my @these_scores = @conservation[$i..($i+3)];
00281     my $total_score = 0;
00282     # Same as max score - 1 * $num_mismatches
00283     for (my $j=0; $j<4; $j++) {
00284       $total_score += $these_scores[$j];
00285     }
00286     # 3 points for every gap
00287     for (my $j=0; $j<4; $j++) {
00288       $total_score -= 3 * $these_gaps[$j];
00289     }
00290     my $all_bases;
00291     for (my $j=0; $j<4; $j++) {
00292       $all_bases->{$these_bases[$j]}++;
00293     }
00294     if ($these_bases[0] eq $these_bases[2] and
00295       # Avoid repetitive sequence
00296         $these_bases[1] eq $these_bases[3]) {
00297       $total_score -= $num_of_leaves;
00298       if ($these_bases[0] eq $these_bases[1]) {
00299         $total_score -= $num_of_leaves;
00300       }
00301     } elsif (scalar(keys %$all_bases) == 2) {
00302       # simple sequence (only 2 diff nucleotides in 4 bp)
00303       $total_score -= $num_of_leaves/2;
00304     } elsif (scalar(keys %$all_bases) == 4) {
00305       # Give an extra point to runs of all 4 diff nucleotides
00306       $total_score += 1;
00307     }
00308     push(@final_scores, $total_score);
00309 #     print join(" : ", @these_bases, @these_gaps,@these_scores, $total_score), "\n";
00310   }
00311 
00312   my $max_score = 0;
00313   my $best_position = 0;
00314   for (my $i = 0; $i < @final_scores; $i++) {
00315     if ($final_scores[$i] > $max_score) {
00316       $max_score = $final_scores[$i];
00317       $best_position = $i+2;
00318     }
00319   }
00320 
00321   warn "Cannot find a good position" if ($best_position == 0 or $max_score < 0.8 * $ideal_score);
00322 
00323   return $best_position;
00324 }
00325 
00326 sub get_trimmed_anchor_aligns {
00327   my ($self, $best_position) = @_;
00328   my $trimmed_anchor_aligns;
00329 
00330   my $aligned_sequences = $self->{'aligned_sequences'};
00331 # #   while (my ($align_anchor_id, $aligned_sequence) = each %$aligned_sequences) {
00332 # #     print substr($aligned_sequence, 0, $best_position), " ** ", substr($aligned_sequence, $best_position),
00333 # #         " :: $align_anchor_id\n";
00334 # #   }
00335   my $all_anchor_aligns = $self->{'anchor_aligns'};
00336   foreach my $this_anchor_align (@$all_anchor_aligns) {
00337     my $this_anchor_align_id = $this_anchor_align->dbID;
00338     my $this_aligned_sequence = $aligned_sequences->{$this_anchor_align_id};
00339 # #     print substr($this_aligned_sequence, 0, ($best_position -1)), " ** ",
00340 # #         substr($this_aligned_sequence, $best_position-1, 2) , " ** ",
00341 # #         substr($this_aligned_sequence, ($best_position + 1)),
00342 # #         " ++ $this_anchor_align_id\n";
00343     my $seq_before = substr($this_aligned_sequence, 0, $best_position);
00344     my $seq_after = substr($this_aligned_sequence, $best_position);
00345     my $start = $this_anchor_align->dnafrag_start;
00346     my $end = $this_anchor_align->dnafrag_end;
00347     my ($count_before) = $seq_before =~ tr/ACTGactgNn/ACTGactgNn/;
00348     my ($count_after) = $seq_after =~ tr/ACTGactgNn/ACTGactgNn/;
00349     if ($count_before + $count_after != $end - $start + 1) {
00350       die "Wrong length $count_before $count_after $start $end";
00351     }
00352     if ($this_anchor_align->dnafrag_strand == 1) {
00353       $start += $count_before - 1;
00354       $end -= $count_after - 1;
00355     } elsif ($this_anchor_align->dnafrag_strand == -1) {
00356       $start += $count_after - 1;
00357       $end -= $count_before - 1;
00358     } else {
00359       die "Wrong strand: ".$this_anchor_align->dnafrag_strand;
00360     }
00361     # Check we get a 2bp long anchor_align
00362     if ($end - $start + 1 != 2) {
00363       die "Wrong length $start $end";
00364     }
00365     my $new_anchor_align;
00366     %$new_anchor_align = %$this_anchor_align;
00367     bless $new_anchor_align, ref($this_anchor_align);
00368     delete($new_anchor_align->{'_seq'});
00369     delete($new_anchor_align->{'_dbID'});
00370     $new_anchor_align->dnafrag_start($start);
00371     $new_anchor_align->dnafrag_end($end);
00372     $new_anchor_align->method_link_species_set_id($self->output_method_link_species_set_id);
00373     push(@$trimmed_anchor_aligns, $new_anchor_align);
00374   }
00375 
00376   return $trimmed_anchor_aligns;
00377 }
00378 
00379 1;
00380