Archive Ensembl HomeArchive Ensembl Home
FilterAnchors.pm
Go to the documentation of this file.
00001 
00002 # You may distribute this module under the same terms as perl itself
00003 # POD documentation - main docs before the code
00004 
00005 =pod 
00006 
00007 =head1 NAME
00008 
00009 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::EPOanchors::FilterAnchors
00010 
00011 =cut
00012 
00013 =head1 SYNOPSIS
00014 
00015 parameters
00016 {input_analysis_id=> ?,method_link_species_set_id=> ?,test_method_link_species_set_id=> ?}
00017 
00018 =cut
00019 
00020 =head1 DESCRIPTION
00021 
00022 There are 3 hard-coded filtering conditions for          
00023 anchor removal: 
00024 1). Anchors which hit > 5 dnafrags in any one genome.
00025 2). Anchors which hit the same dnafrag > 10 in any one genome.
00026 3). Anchors which hit any one genome > 20 times.
00027 
00028 =cut
00029 
00030 =head1 CONTACT
00031 
00032 ensembl-compara@ebi.ac.uk
00033 
00034 =cut
00035 
00036 =head1 APPENDIX
00037 
00038 The rest of the documentation details each of the object methods.
00039 Internal methods are usually preceded with a _
00040 
00041 =cut
00042 
00043 
00044 package Bio::EnsEMBL::Compara::Production::EPOanchors::FilterAnchors;
00045 
00046 use strict;
00047 use Data::Dumper;
00048 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00049 use Bio::EnsEMBL::Hive::Process;
00050 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00051 use Bio::EnsEMBL::Compara::Production::EPOanchors::AnchorAlign;
00052 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00053 
00054 use constant MAX_NUMBER_OF_DNAFRAGS_HIT => 5; #max number of dnafrags hit per genome
00055 use constant MAX_NUMBER_OF_HITS_TO_SAME_DNAFRAG => 10;
00056 use constant MAX_NUMBER_OF_HITS_TO_ANY_ONE_GENOME => 20;
00057 
00058 
00059 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00060 
00061 
00062 sub configure_defaults {
00063   my $self = shift;
00064   $self->max_number_of_dnafrags_hit(MAX_NUMBER_OF_DNAFRAGS_HIT);
00065   $self->max_number_of_hits_to_same_dnafrag(MAX_NUMBER_OF_HITS_TO_SAME_DNAFRAG);
00066   $self->max_number_of_hits_to_any_one_genome(MAX_NUMBER_OF_HITS_TO_ANY_ONE_GENOME);
00067   return 0;
00068 }
00069 
00070 sub fetch_input {
00071   my ($self) = @_;
00072 
00073   $self->configure_defaults();
00074 
00075   #create a Compara::DBAdaptor which shares the same DBI handle
00076   #with $self->db (Hive DBAdaptor)
00077   $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
00078   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00079 
00080   my $dnafrag_adaptor = $self->{'comparaDBA'}->get_DnaFragAdaptor();
00081   my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor();
00082 
00083   $self->get_params($self->parameters);
00084   $self->get_params($self->input_id);
00085   return 1;
00086 }
00087 
00088 sub run {
00089 
00090     my ($self) = @_;
00091     my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor();
00092     my(@palindromic_anchors, $all_anchors, %anchor_hits2genomes_and_dnafrags, %anchors2remove);
00093     $self->anchor_ids_with_zero_strand(
00094         $anchor_align_adaptor->fetch_all_anchors_with_zero_strand(
00095             $self->test_method_link_species_set_id));
00096     $all_anchors = $anchor_align_adaptor->fetch_all_anchor_ids_by_test_mlssid_and_genome_db_ids(
00097         $self->test_method_link_species_set_id, $self->genome_db_ids);
00098     foreach my $anchor (@{$all_anchors}) {
00099         my $dnafrags_and_genomedbs = $anchor_align_adaptor->fetch_dnafrag_and_genome_db_ids_by_test_mlssid(
00100             $self->test_method_link_species_set_id, $anchor->[0]);
00101         foreach my $genome_db_dnafrag(@{$dnafrags_and_genomedbs}) {
00102             $anchor_hits2genomes_and_dnafrags{$anchor->[0]}{$genome_db_dnafrag->[1]}{$genome_db_dnafrag->[0]}++;
00103         }
00104     }
00105     foreach my $anchor_id(sort keys %anchor_hits2genomes_and_dnafrags) {
00106         foreach my $genome_db_id(sort keys %{$anchor_hits2genomes_and_dnafrags{$anchor_id}}) {
00107             last if(exists($anchors2remove{$anchor_id}));
00108             my $num_hits_to_each_genome = 0;
00109             if(scalar(keys %{$anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}}) > $self->max_number_of_dnafrags_hit) {
00110                 $anchors2remove{$anchor_id}++;
00111 #               print "MAX_NUM_DNAFRAGS_HIT : $anchor_id\n";
00112                 last;
00113             }
00114             foreach my $dnafrag_id(%{$anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}}) {
00115                 if($anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}{$dnafrag_id} > $self->max_number_of_hits_to_same_dnafrag) {
00116                     $anchors2remove{$anchor_id}++;
00117 #                   print "MAX_NUM_HITS_TO_SAME_DNAFRAG : $anchor_id\n";    
00118                     last;
00119                 }
00120                 else{
00121                     $num_hits_to_each_genome += $anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}{$dnafrag_id};
00122                 }
00123             }
00124             if($num_hits_to_each_genome > $self->max_number_of_hits_to_any_one_genome) {
00125                 $anchors2remove{$anchor_id}++;
00126 #               print "MAX_NUM_HITS_TO_ANY_ONE_GENOME : $anchor_id\n";
00127                 last;
00128             }
00129         }
00130     }
00131     $self->anchors2remove(\%anchors2remove);
00132     return 1;
00133 }
00134 
00135 sub write_output {
00136     my ($self) = @_;
00137     my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor();
00138     print join(":", $self->input_analysis_id, $self->test_method_link_species_set_id), "\n";
00139     $anchor_align_adaptor->update_zero_strand_anchors($self->anchor_ids_with_zero_strand, 
00140         $self->input_analysis_id, $self->test_method_link_species_set_id);
00141     if(scalar (keys %{$self->anchors2remove})) {
00142         $anchor_align_adaptor->update_failed_anchor($self->anchors2remove, $self->input_analysis_id, $self->test_method_link_species_set_id);
00143     }           
00144     else{
00145         print "No anchors to remove\n";
00146     }
00147   return 1;
00148 }
00149 
00150 sub anchor_ids_with_zero_strand {
00151     my $self = shift;
00152     if (@_) {
00153         $self->{anchor_ids_with_zero_strand} = shift;
00154     }
00155     return $self->{anchor_ids_with_zero_strand};
00156 }
00157 
00158 sub anchors2remove {
00159     my $self = shift;
00160     if (@_) {
00161         $self->{anchors2remove} = shift;
00162     }
00163     return $self->{anchors2remove};
00164 }
00165 
00166 sub max_number_of_dnafrags_hit {
00167     my $self = shift;
00168     if (@_) {
00169         $self->{max_number_of_dnafrags_hit} = shift;
00170     }
00171     return $self->{max_number_of_dnafrags_hit};
00172 }
00173 
00174 sub max_number_of_hits_to_same_dnafrag {
00175     my $self = shift;
00176     if (@_) {
00177         $self->{max_number_of_hits_to_same_dnafrag} = shift;
00178     }
00179     return $self->{max_number_of_hits_to_same_dnafrag};
00180 }
00181 
00182 sub max_number_of_hits_to_any_one_genome {
00183     my $self = shift;
00184     if (@_) {
00185         $self->{max_number_of_hits_to_any_one_genome} = shift;
00186     }
00187     return $self->{max_number_of_hits_to_any_one_genome};
00188 }
00189 
00190 sub test_method_link_species_set_id {
00191     my $self = shift;
00192     if (@_) {
00193         $self->{test_method_link_species_set_id} = shift;
00194     }
00195     return $self->{test_method_link_species_set_id};
00196 }
00197 
00198 sub method_link_species_set_id {
00199     my $self = shift;
00200     if (@_) {
00201         $self->{method_link_species_set_id} = shift;
00202     }
00203     return $self->{method_link_species_set_id};
00204 }
00205 
00206 sub input_analysis_id {
00207     my $self = shift;
00208     if (@_) {
00209         $self->{input_analysis_id} = shift;
00210     }
00211     return $self->{input_analysis_id};
00212 }
00213 
00214 sub genome_db_ids {
00215     my $self = shift;
00216     if (@_) {
00217         $self->{genome_db_ids} = shift;
00218     }
00219     return $self->{genome_db_ids};
00220 }
00221 
00222 #sub method_link_type {
00223 #   my $self = shift;
00224 #   if (@_) {
00225 #       $self->{method_link_type} = shift;
00226 #   }
00227 #   return $self->{method_link_type};
00228 #}
00229 #
00230 #sub input_anchor_id {
00231 #   my $self = shift;
00232 #   if (@_) {
00233 #       $self->{input_anchor_id} = shift;
00234 #   }
00235 #   return $self->{input_anchor_id};
00236 #}
00237 
00238 sub get_params {
00239   my $self         = shift;
00240   my $param_string = shift;
00241 
00242   return unless($param_string);
00243   print("parsing parameter string : ",$param_string,"\n");
00244 
00245   my $params = eval($param_string);
00246   return unless($params);
00247 
00248   if(defined($params->{'test_method_link_species_set_id'})) {
00249     $self->test_method_link_species_set_id($params->{'test_method_link_species_set_id'});
00250   }
00251   if(defined($params->{'method_link_species_set_id'})) {
00252     $self->method_link_species_set_id($params->{'method_link_species_set_id'});
00253   }
00254   if(defined($params->{'input_analysis_id'})) {
00255     $self->input_analysis_id($params->{'input_analysis_id'});
00256   }
00257   if(defined($params->{'genome_db_ids'})) {
00258     $self->genome_db_ids($params->{'genome_db_ids'});
00259   }
00260   
00261   if(defined($params->{'method_link_type'})) {
00262     $self->method_link_type($params->{'method_link_type'});
00263   }
00264   if(defined($params->{'input_anchor_id'})) { #same as anchor_id
00265     $self->input_anchor_id($params->{'input_anchor_id'});
00266   }
00267   if(defined($params->{'anchor_id'})) { #same as input_anchor_id
00268     $self->input_anchor_id($params->{'anchor_id'});
00269   }
00270   return 1;
00271 }
00272 
00273 #sub store_input {
00274 #  my $self = shift;
00275 #
00276 #  if (@_) {
00277 #    $self->{store_input} = shift;
00278 #  }
00279 #
00280 #  return $self->{store_input};
00281 #}
00282 
00283 
00284 1;
00285