Archive Ensembl HomeArchive Ensembl Home
GetBlastzOverlaps.pm
Go to the documentation of this file.
00001 #Ensembl module for Bio::EnsEMBL::Compara::Production::EPOanchors::GetBlastzOverlaps
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 =head1 NAME
00006 
00007 Bio::EnsEMBL::Compara::Production::EPOanchors:GetBlastzOverlaps:
00008 
00009 =head1 SYNOPSIS
00010 
00011 $exonate_anchors->fetch_input();
00012 $exonate_anchors->run();
00013 $exonate_anchors->write_output(); writes to database
00014 
00015 =head1 DESCRIPTION
00016 
00017 Given a database with anchor sequences and a target genome. This modules exonerates 
00018 the anchors against the target genome. The required information (anchor batch size,
00019 target genome file, exonerate parameters are provided by the analysis, analysis_job 
00020 and analysis_data tables  
00021 
00022 This modules is part of the Ensembl project http://www.ensembl.org
00023 
00024 Email compara@ebi.ac.uk
00025 
00026 =head1 CONTACT
00027 
00028 This modules is part of the EnsEMBL project (http://www.ensembl.org)
00029 
00030 Questions can be posted to the ensembl-dev mailing list:
00031 dev@ensembl.org
00032 
00033 
00034 =head1 APPENDIX
00035 
00036 The rest of the documentation details each of the object methods. 
00037 Internal methods are usually preceded with a _
00038 
00039 =cut
00040 #
00041 package Bio::EnsEMBL::Compara::Production::EPOanchors::GetBlastzOverlaps;
00042 
00043 use strict;
00044 use Data::Dumper;
00045 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00046 use Bio::EnsEMBL::Hive::Process;
00047 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00048 use Bio::EnsEMBL::Analysis;
00049 use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
00050 use Bio::EnsEMBL::Registry;
00051 
00052 Bio::EnsEMBL::Registry->load_all;
00053 Bio::EnsEMBL::Registry->no_version_check(1);
00054 
00055 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00056 
00057 
00058 sub configure_defaults {
00059     my $self = shift;
00060     $self->ref_dnafrag_strand(1); #reference strand is always 1 
00061     return 1;
00062 }
00063 
00064 sub fetch_input {
00065     my ($self) = @_;
00066     $self->configure_defaults();
00067     $self->get_parameters($self->parameters);
00068     #create a Compara::DBAdaptor which shares the same DBI handle with $self->db (Hive DBAdaptor)
00069     $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc) or die "cant connect\n";
00070     $self->{'comparaDBA'}->dbc->disconnect_if_idle();
00071     $self->{'hiveDBA'} = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(-DBCONN => $self->{'comparaDBA'}->dbc) or die "cant connect\n";
00072     $self->{'hiveDBA'}->dbc->disconnect_if_idle();
00073     my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
00074     my $mlssid_adaptor = Bio::EnsEMBL::Registry->get_adaptor("Multi", "compara", "MethodLinkSpeciesSet");
00075     my $genome_db_adaptor = Bio::EnsEMBL::Registry->get_adaptor("Multi", "compara", "GenomeDB");
00076     my $genomic_align_block_adaptor = Bio::EnsEMBL::Registry->get_adaptor("Multi", "compara", "GenomicAlignBlock"); 
00077     $self->genomic_align_block_adaptor( $genomic_align_block_adaptor );
00078     my $dnafrag_adaptor = Bio::EnsEMBL::Registry->get_adaptor("Multi", "compara", "DNAFrag");
00079     $self->analysis_data( eval $analysis_data_adaptor->fetch_by_dbID($self->analysis_data_id) );
00080     $self->get_input_id($self->input_id);
00081     $self->reference_genome_db( $genome_db_adaptor->fetch_by_dbID($self->genome_db_ids->[0]) );
00082     $self->ref_dnafrag( $dnafrag_adaptor->fetch_by_dbID($self->ref_dnafrag_id) );
00083     my (@ref_dnafrag_coords, @mlssid_adaptors, $chunk_from, $chunk_to);
00084     for(my$i=1;$i<@{$self->genome_db_ids};$i++){ #$self->genome_db_ids->[0] is the reference genome_db_id
00085         my $mlss_id = $mlssid_adaptor->fetch_by_method_link_type_GenomeDBs(
00086                 $self->method_type, 
00087                 [ 
00088                     $self->reference_genome_db,
00089                     $genome_db_adaptor->fetch_by_dbID( $self->genome_db_ids->[$i] ),
00090                 ] );
00091         push(@mlssid_adaptors, $mlss_id);
00092         my $gabs = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_DnaFrag( 
00093                         $mlss_id, $self->ref_dnafrag, $self->dnafrag_chunks->[0], $self->dnafrag_chunks->[1] );
00094         foreach my $genomic_align_block( @$gabs ) {
00095             next if $genomic_align_block->length < $self->analysis_data->{min_anc_size};
00096             push( @ref_dnafrag_coords, [ $genomic_align_block->reference_genomic_align->dnafrag_start,
00097                     $genomic_align_block->reference_genomic_align->dnafrag_end,
00098                     $self->genome_db_ids->[$i] ] );
00099         }
00100     }               
00101     $self->mlssids( \@mlssid_adaptors );
00102     $self->ref_dnafrag_coords( [ sort {$a->[0] <=> $b->[0]} @ref_dnafrag_coords ] ); #sort reference genomic_align_blocks (gabs) by start position
00103     print "INPUT: ", scalar(@ref_dnafrag_coords), "\n"; 
00104     return 1;
00105 }
00106 
00107 sub run {
00108     my ($self) = @_;
00109     my(@dnafrag_overlaps, @ref_coords_to_gerp);
00110     for(my$i=0;$i<@{$self->ref_dnafrag_coords}-1;$i++) { #find overlapping gabs in reference seq coords
00111         my $temp_end = $self->ref_dnafrag_coords->[$i]->[1];
00112         for(my$j=$i+1;$j<@{$self->ref_dnafrag_coords};$j++) {
00113             if($temp_end >= $self->ref_dnafrag_coords->[$j]->[0]) {
00114                 $temp_end = $temp_end > $self->ref_dnafrag_coords->[$j]->[1] ? $temp_end : $self->ref_dnafrag_coords->[$j]->[1];
00115             }
00116             else {
00117                 push(@dnafrag_overlaps, [$i, --$j]);
00118                 $i = $j;
00119                 last;
00120             }
00121         }
00122     }
00123     for(my$k=0;$k<@dnafrag_overlaps;$k++) {
00124         my(%bases, @bases);
00125         for(my$l=$dnafrag_overlaps[$k]->[0];$l<=$dnafrag_overlaps[$k]->[1];$l++) {#indices for $self->ref_dnafrag_coords
00126             for(my$m=$self->ref_dnafrag_coords->[$l]->[0];$m<=$self->ref_dnafrag_coords->[$l]->[1];$m++) {
00127                 $bases{$m}{$self->ref_dnafrag_coords->[$l]->[2]}++; #count the number of non_ref org hits per base
00128             }
00129         }
00130         foreach my $base(sort {$a <=> $b} keys %bases) {
00131             if((keys %{$bases{$base}}) >= $self->analysis_data->{min_number_of_org_hits_per_base}) {
00132                 push(@bases, $base);
00133             }
00134         }
00135         if(@bases) {
00136             if($bases[-1] - $bases[0] >= $self->analysis_data->{min_anc_size}) {
00137                 push(@ref_coords_to_gerp, [ $bases[0], $bases[-1] ]);
00138             }
00139         }
00140     }
00141     my (%genomic_aligns_on_ref_slice);
00142     my $query_slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($self->reference_genome_db->name, "core", "Slice");
00143     foreach my $coord_pair(@ref_coords_to_gerp) {
00144         my $ref_slice = $query_slice_adaptor->fetch_by_region( 
00145             $self->ref_dnafrag->coord_system_name, $self->ref_dnafrag->name, $coord_pair->[0], $coord_pair->[1] );
00146         foreach my $mlss_id(@{$self->mlssids}) {            
00147             my $gabs = $self->genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($mlss_id, $ref_slice);
00148             foreach my $gab (@$gabs) {
00149                 my $rgab = $gab->restrict_between_reference_positions($coord_pair->[0], $coord_pair->[1]);
00150                 my $restricted_non_reference_genomic_aligns = $rgab->get_all_non_reference_genomic_aligns;
00151                 foreach my $genomic_align (@$restricted_non_reference_genomic_aligns) {
00152                     push(@{ $genomic_aligns_on_ref_slice{"$coord_pair->[0]-$coord_pair->[1]"}{$genomic_align->dnafrag->dbID}{$genomic_align->dnafrag_strand} },
00153                         [ $genomic_align->dnafrag_start, $genomic_align->dnafrag_end ]);
00154                 }
00155             }
00156         }
00157         foreach my $reference_from_to(sort keys %genomic_aligns_on_ref_slice) {
00158             foreach my $dnafrag_id(sort keys %{$genomic_aligns_on_ref_slice{$reference_from_to}}) {
00159                 foreach my $dnafrag_strand(sort keys %{$genomic_aligns_on_ref_slice{$reference_from_to}{$dnafrag_id}}) {
00160                     my $sorted = \@{$genomic_aligns_on_ref_slice{$reference_from_to}{$dnafrag_id}{$dnafrag_strand}};
00161                     @{$sorted} = sort { $a->[0] <=> $b->[0] } @{$sorted};
00162                 }
00163             }
00164         }
00165     }
00166     $self->genomic_aligns_on_ref_slice( \%genomic_aligns_on_ref_slice );
00167     print "RUN: ", scalar(keys %genomic_aligns_on_ref_slice), "\n";
00168     return 1;
00169 }
00170 
00171 sub write_output {
00172     my ($self) = @_;
00173     my %sql_statements = (
00174         insert_synteny_region => "INSERT INTO synteny_region (method_link_species_set_id) VALUES (?)",
00175         insert_dnafrag_region => "INSERT INTO dnafrag_region (synteny_region_id, dnafrag_id, dnafrag_start, dnafrag_end, dnafrag_strand) VALUES (?,?,?,?,?)",
00176         insert_next_analysis_job => "INSERT INTO analysis_job (analysis_id, input_id) VALUES (?,?)",
00177         select_max_synteny_region_id => "SELECT MAX(synteny_region_id) FROM synteny_region WHERE method_link_species_set_id = ?", 
00178         select_logic_name => "SELECT logic_name FROM analysis WHERE analysis_id = ?",
00179         select_next_analysis_id => "SELECT ctrled_analysis_id FROM analysis_ctrl_rule WHERE condition_analysis_url = (SELECT logic_name FROM analysis WHERE analysis_id = ?)",
00180         select_next_mlssid => "SELECT method_link_species_set_id FROM method_link_species_set WHERE name = (SELECT logic_name FROM analysis WHERE analysis_id = ?)",
00181         select_species_set_id => "SELECT species_set_id FROM method_link_species_set WHERE method_link_species_set_id = ?",
00182         select_genome_db_ids => "SELECT GROUP_CONCAT(genome_db_id) FROM species_set WHERE species_set_id = ?",
00183     );
00184     foreach my$sql_statement(keys %sql_statements) {#prepare all the sql statements
00185             $sql_statements{$sql_statement} = $self->{'comparaDBA'}->dbc->prepare($sql_statements{$sql_statement});
00186     }
00187     my $query_slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($self->reference_genome_db->name, "core", "Slice");
00188     eval {
00189         $sql_statements{select_next_analysis_id}->execute( $self->analysis_id ) or die;
00190     };
00191     if($@) {
00192         die $@;
00193     }
00194     my $next_analysis_id = ($sql_statements{select_next_analysis_id}->fetchrow_array)[0];
00195     $sql_statements{select_next_mlssid}->execute( $next_analysis_id );
00196     my $next_method_link_species_set_id = ($sql_statements{select_next_mlssid}->fetchrow_array)[0];
00197     $sql_statements{select_species_set_id}->execute( $next_method_link_species_set_id );
00198     $sql_statements{select_genome_db_ids}->execute( ($sql_statements{select_species_set_id}->fetchrow_array)[0] );
00199     my $genome_db_ids = ($sql_statements{select_genome_db_ids}->fetchrow_array)[0];
00200     foreach my $ref_coords(sort keys %{$self->genomic_aligns_on_ref_slice}) {
00201         my @Synteny_blocks_to_insert;
00202         my $temp_next_analysis_id = $next_analysis_id;
00203         my ($ref_from, $ref_to) = split("-", $ref_coords);
00204         push(@Synteny_blocks_to_insert, [ $self->ref_dnafrag->dbID, $ref_from, $ref_to, $self->ref_dnafrag_strand ]);
00205         foreach my $non_ref_dnafrag_id(sort keys %{$self->genomic_aligns_on_ref_slice->{$ref_coords}}) {
00206             foreach my $non_ref_strand(sort keys %{$self->genomic_aligns_on_ref_slice->{$ref_coords}->{$non_ref_dnafrag_id}}) {
00207                 my $non_ref_coords = $self->genomic_aligns_on_ref_slice->{$ref_coords}->{$non_ref_dnafrag_id}->{$non_ref_strand};
00208                 next if ($non_ref_coords->[-1]->[1] - $non_ref_coords->[0]->[0] < $self->analysis_data->{min_anc_size} || 
00209                     ($non_ref_coords->[-1]->[1] - $non_ref_coords->[0]->[0]) < ($ref_to - $ref_from) * 0.2 ||
00210                     ($non_ref_coords->[-1]->[1] - $non_ref_coords->[0]->[0]) > ($ref_to - $ref_from) * 5 ); #need to change - gets rid of unalignable rubbish 
00211                 push(@Synteny_blocks_to_insert, [$non_ref_dnafrag_id, $non_ref_coords->[0]->[0], 
00212                                 $non_ref_coords->[-1]->[1], $non_ref_strand]);
00213             }
00214         }
00215         if(@Synteny_blocks_to_insert > 2) { #need at least 3 sequences for gerp
00216             $self->{'comparaDBA'}->dbc->db_handle->do("LOCK TABLES synteny_region WRITE");
00217             $sql_statements{insert_synteny_region}->execute( $self->method_link_species_set_id );
00218             $sql_statements{select_max_synteny_region_id}->execute( $self->method_link_species_set_id );
00219             my $synteny_region_id = ($sql_statements{select_max_synteny_region_id}->fetchrow_array)[0];
00220             $self->{'comparaDBA'}->dbc->db_handle->do("UNLOCK TABLES");
00221             while($temp_next_analysis_id) {
00222                 my($input_id_string, $next_logic_name);
00223                 $sql_statements{select_logic_name}->execute( $temp_next_analysis_id );
00224                 $next_logic_name = ($sql_statements{select_logic_name}->fetchrow_array)[0];
00225                 if( $next_logic_name=~/pecan/i ) {
00226                     $input_id_string = "{ synteny_region_id=>$synteny_region_id, method_link_species_set_id=>" .
00227                         $next_method_link_species_set_id . ", tree_analysis_data_id=>" . $self->tree_analysis_data_id . ", }"; 
00228                 }
00229                 elsif( $next_logic_name=~/gerp/i ) {
00230                     $input_id_string = "{genomic_align_block_id=>$synteny_region_id,species_set=>[$genome_db_ids]}";
00231                 }
00232                 eval { #add jobs to analysis_job for next analyses (pecan or gerp)
00233                     $sql_statements{insert_next_analysis_job}->execute($temp_next_analysis_id, $input_id_string);
00234                 };
00235                 $sql_statements{select_next_analysis_id}->execute( $temp_next_analysis_id );
00236                 $temp_next_analysis_id = ($sql_statements{select_next_analysis_id}->fetchrow_array)[0];
00237             }
00238             foreach my $dnafrag_region(@Synteny_blocks_to_insert) {
00239                 eval { 
00240                     $sql_statements{insert_dnafrag_region}->execute( $synteny_region_id, @{$dnafrag_region} );
00241                 };
00242                 if($@) {
00243                     die $@;
00244                 }
00245             }
00246         }
00247     }
00248     print "Genome_db_ids: $genome_db_ids\n";
00249     return 1;
00250 }
00251 
00252 sub mlssids {
00253     my $self = shift;
00254     if (@_) {
00255         $self->{_mlssids} = shift;
00256     }
00257     return $self->{_mlssids};
00258 }
00259 
00260 sub ref_dnafrag_strand {
00261     my $self = shift;
00262     if (@_) {
00263         $self->{_ref_dnafrag_strand} = shift;
00264     }
00265     return $self->{_ref_dnafrag_strand};
00266 }
00267 
00268 sub tree_analysis_data_id {
00269     my $self = shift;
00270     if (@_) {
00271         $self->{_tree_analysis_data_id} = shift;
00272     }
00273     return $self->{_tree_analysis_data_id};
00274 }
00275 
00276 sub analysis_data_id {
00277     my $self = shift;
00278     if (@_) {
00279         $self->{_analysis_data_id} = shift;
00280     }
00281     return $self->{_analysis_data_id};
00282 }
00283 
00284 sub analysis_data {
00285     my $self = shift;
00286     if (@_) {
00287         $self->{_analysis_data} = shift;
00288     }
00289     return $self->{_analysis_data};
00290 }
00291 
00292 sub analysis_id {
00293     my $self = shift;
00294     if (@_) {
00295         $self->{_analysis_id} = shift;
00296     }
00297     return $self->{_analysis_id};
00298 }
00299 
00300 sub ref_dnafrag_coords {
00301     my $self = shift;
00302     if (@_) {
00303         $self->{_ref_dnafrag_coords} = shift;
00304     }
00305     return $self->{_ref_dnafrag_coords};
00306 }
00307 
00308 sub genomic_aligns_on_ref_slice {
00309     my $self = shift;
00310     if (@_) {
00311         $self->{_genomic_aligns_on_ref_slice} = shift;
00312     }
00313     return $self->{_genomic_aligns_on_ref_slice};
00314 }
00315 
00316 sub dnafrag_overlaps {
00317     my $self = shift;
00318     if (@_) {
00319         $self->{_dnafrag_overlaps} = shift;
00320     }
00321     return $self->{_dnafrag_overlaps};
00322 }
00323 
00324 sub reference_genome_db {
00325     my $self = shift;
00326     if (@_){
00327         $self->{_reference_genome_db} = shift;
00328     }
00329     return $self->{_reference_genome_db};
00330 }
00331 
00332 sub genome_db_ids {
00333     my $self = shift;
00334     if (@_){
00335         $self->{_genome_db_ids} = shift;
00336     }
00337     return $self->{_genome_db_ids};
00338 }
00339 
00340 sub genomic_align_block_adaptor {
00341     my $self = shift;
00342     if (@_){
00343         $self->{_genomic_align_block_adaptor} = shift;
00344     }
00345     return $self->{_genomic_align_block_adaptor};
00346 }
00347 
00348 sub ref_dnafrag_id {
00349     my $self = shift;
00350     if (@_){
00351         $self->{_ref_dnafrag_id} = shift;
00352     }
00353     return $self->{_ref_dnafrag_id};
00354 }
00355 
00356 sub dnafrag_chunks {
00357     my $self = shift;
00358     if (@_){
00359         $self->{_dnafrag_chunks} = shift;
00360     }
00361     return $self->{_dnafrag_chunks};
00362 }
00363 
00364 sub ref_dnafrag {
00365     my $self = shift;
00366     if (@_){
00367         $self->{_ref_dnafrag} = shift;
00368     }
00369     return $self->{_ref_dnafrag};
00370 }
00371 
00372 sub method_type {
00373     my $self = shift;
00374     if (@_){
00375         $self->{_method_type} = shift;
00376     }
00377     return $self->{_method_type};
00378 }
00379 
00380 sub method_link_species_set_id {
00381     my $self = shift;
00382     if (@_){
00383         $self->{_method_link_species_set_id} = shift;
00384     }
00385     return $self->{_method_link_species_set_id};
00386 }
00387 
00388 sub get_parameters {
00389     my $self = shift;
00390     my $param_string = shift;
00391     
00392     return unless($param_string);
00393     my $params = eval($param_string);
00394     if(defined($params->{'analysis_data_id'})) {
00395         $self->analysis_data_id($params->{'analysis_data_id'});
00396     }
00397     if(defined($params->{'method_link_species_set_id'})) {
00398         $self->method_link_species_set_id($params->{'method_link_species_set_id'});
00399     }
00400     if(defined($params->{'tree_analysis_data_id'})) {
00401         $self->tree_analysis_data_id($params->{'tree_analysis_data_id'});
00402     }
00403     if(defined($params->{'analysis_id'})) {
00404         $self->analysis_id($params->{'analysis_id'});
00405     }
00406 }
00407 
00408 sub get_input_id {
00409     my $self = shift;
00410     my $input_id_string = shift;
00411 
00412     return unless($input_id_string);
00413     print("parsing input_id string : ",$input_id_string,"\n");
00414     
00415     my $params = eval($input_id_string);
00416     return unless($params);
00417     
00418     if(defined($params->{'method_type'})) {
00419         $self->method_type($params->{'method_type'});
00420     }
00421     if(defined($params->{'genome_db_ids'})) {
00422         $self->genome_db_ids($params->{'genome_db_ids'});
00423     }
00424     if(defined($params->{'ref_dnafrag_id'})) {
00425         $self->ref_dnafrag_id($params->{'ref_dnafrag_id'});
00426     }
00427     if(defined($params->{'dnafrag_chunks'})) {
00428         $self->dnafrag_chunks($params->{'dnafrag_chunks'});
00429     }
00430     return 1;
00431 }
00432 
00433 1;
00434