Archive Ensembl HomeArchive Ensembl Home
compara.pm
Go to the documentation of this file.
00001 #
00002 # Bio::Das::ProServer::SourceAdaptor::compara
00003 #
00004 # Copyright EnsEMBL Team
00005 #
00006 # You may distribute this module under the same terms as perl itself
00007 #
00008 # pod documentation - main docs before the code
00009 
00010 =head1 NAME
00011 
00012 Bio::Das::ProServer::SourceAdaptor::compara - Extension of the ProServer for e! genomic alignments
00013 and synteny block.
00014 
00015 =head1 INHERITANCE
00016 
00017 This module inherits attributes and methods from Bio::Das::ProServer::SourceAdaptor
00018 
00019 =head1 DAS CONFIGURATION FILE
00020 
00021 There are some specific parameters for this module you can use in the DAS server configuration file
00022 
00023 =head2 registry
00024 
00025 Your registry configuration file to connect to the compara database
00026 
00027 =head2 database
00028 
00029 The species name in your Registry configuration file.
00030 
00031 =head2 this_species
00032 
00033 The main species. Features will be shown for this species.
00034 
00035 =head2 other_species
00036 
00037 The other species. This DAS track will show alignments between this_species and other_species.
00038 You will have to skip this one for self-alignments. You can add more than one other species
00039 separated by comas.
00040 
00041 =head2 analysis
00042 
00043 The method_link_type. This defines the type of alignments. E.g. TRANSLATED_BLAT, BLASTZ_NET...
00044 See perldoc Bio::EnsEMBL::Compara::MethodLinkSpeciesSet for more details about the
00045 method_link_type
00046 
00047 =head2 Example
00048 
00049 =head3 registry configuration file
00050 
00051   use strict;
00052   use Bio::EnsEMBL::Utils::ConfigRegistry;
00053   use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
00054 
00055   new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor(
00056       -host => 'ensembldb.ensembl.org',
00057       -user => 'anonymous',
00058       -port => 3306,
00059       -species => 'ensembl-compara-41',
00060       -dbname => 'ensembl_compara_41');
00061 
00062 
00063 =head3 DAS server configuration file
00064 
00065   [general]
00066   hostname    = ecs4b.internal.sanger.ac.uk
00067   prefork     = 6
00068   maxclients  = 100
00069   port        = 9013
00070 
00071   [Hsap-Mmus-blastznet]
00072   transport       = ensembl
00073   adaptor         = compara
00074   registry        = /home/foo/ProServer/eg/reg.pl
00075   state           = on
00076   database        = ensembl-compara-41
00077   this_species    = Homo sapiens
00078   other_species   = Mus musculus
00079   analysis        = BLASTZ_NET
00080   description     = Human-mouse blastz-net alignments
00081 
00082   [Mmus-Hsap-blastznet]
00083   transport       = ensembl
00084   adaptor         = compara
00085   registry        = /home/foo/ProServer/eg/reg.pl
00086   state           = on
00087   database        = ensembl-compara-41
00088   this_species    = Mus musculus
00089   other_species   = Homo sapiens
00090   analysis        = BLASTZ_NET
00091   description     = Mouse-Human blastz-net alignments
00092 
00093   [primates-mlagan-hs]
00094   transport       = ensembl
00095   adaptor         = compara
00096   registry        = /home/foo/ProServer/eg/reg.pl
00097   state           = on
00098   database        = ensembl-compara-41
00099   this_species    = Homo sapiens
00100   other_species   = Pan troglodytes, Macaca mulatta
00101   analysis        = MLAGAN
00102   description     = Primates Mlagan alignments on human
00103 
00104   [human-platypus-bz]
00105   transport       = ensembl
00106   adaptor         = compara
00107   registry        = /home/foo/ProServer/eg/reg.pl
00108   state           = on
00109   database        = ensembl-compara-41
00110   this_species    = Homo sapiens
00111   other_species   = Ornithorhynchus anatinus
00112   analysis        = BLASTZ_NET
00113   description     = Human-platypus blastz alignments
00114 
00115 
00116 =cut
00117 
00118 package Bio::Das::ProServer::SourceAdaptor::compara;
00119 
00120 use strict;
00121 use Bio::EnsEMBL::Registry;
00122 use Bio::EnsEMBL::Utils::Exception;
00123 
00124 use base qw( Bio::Das::ProServer::SourceAdaptor );
00125 
00126 sub init
00127 {
00128     my ($self) = @_;
00129 
00130     $self->{'capabilities'} = { 'features' => '1.0',
00131                                 'stylesheet' => '1.0' };
00132 
00133     my $registry = $self->config()->{'registry'};
00134     unless (defined $registry) {
00135      throw("registry not defined\n");
00136     }
00137 
00138     if (not $Bio::EnsEMBL::Registry::registry_register->{'seen'}) {
00139         Bio::EnsEMBL::Registry->load_all($registry);
00140     }
00141 }
00142 
00143 sub build_features
00144 {
00145     my ($self, $opts) = @_;
00146 
00147     my $daschr      = $opts->{'segment'} || return ( );
00148     my $dasstart    = $opts->{'start'} || return ( );
00149     my $dasend      = $opts->{'end'} || return ( );
00150 
00151     my $species1    = $self->config()->{'this_species'};
00152     my @other_species = split(/\s*\,\s*/, $self->config()->{'other_species'});
00153     my $chr1        = $daschr;
00154     my $start1      = $dasstart;
00155     my $end1        = $dasend;
00156 
00157     my $db      = "Bio::EnsEMBL::Registry";
00158     $db->no_version_check(1);
00159     my $dbname  = $self->config()->{'database'};
00160 
00161     #need to put adaptors here and not in init 
00162     my $mlss_adaptor =
00163         $db->get_adaptor($dbname, 'compara', 'MethodLinkSpeciesSet') or
00164             die "can't get $dbname, 'compara', 'MethodLinkSpeciesSet'\n";
00165 
00166     my $dnafrag_adaptor =
00167         $db->get_adaptor($dbname, 'compara', 'DnaFrag') or
00168             die "can't get $dbname, 'compara', 'DnaFrag'\n";
00169 
00170    my $genomic_align_block_adaptor =
00171         $db->get_adaptor($dbname, 'compara', 'GenomicAlignBlock') or
00172             die "can't get $dbname, 'compara', 'GenomicAlignBlock'\n";
00173 
00174     my $synteny_region_adaptor =
00175         $db->get_adaptor($dbname, 'compara', 'SyntenyRegion') or
00176             die "can't get $dbname, 'compara', 'SyntenyRegion'\n";
00177 
00178     my $genome_db_adaptor =
00179         $db->get_adaptor($dbname, 'compara', 'GenomeDB') or
00180             die "can't get $dbname, 'compara', 'GenomeDB'\n";
00181 
00182     my $genomedbs = $genome_db_adaptor->fetch_all();
00183 
00184     my $method_link = $self->config()->{'analysis'};
00185 
00186     my $link_template = $self->config()->{'link_template'} || 'http://www.ensembl.org/';
00187 
00188     $link_template .= '%s/Location/View?r=%s:%d-%d';
00189     $self->{'compara'}->{'link_template'} = $link_template;
00190 
00191     my $stored_max_alignment_length;
00192 
00193     my $species1_genome_db;
00194     my @other_species_genome_dbs;
00195 
00196     ## Get the Bio::EnsEMBL::Compara::GenomeDB object for the primary species
00197     foreach my $this_genome_db (@$genomedbs){
00198       if ($this_genome_db->name eq $species1) {
00199         $species1_genome_db = $this_genome_db;
00200       }
00201     }
00202     if (!defined($species1_genome_db)) {
00203       die "No species called $species1 in the database -- check spelling\n";
00204     }
00205 
00206     ## Get the Bio::EnsEMBL::Compara::GenomeDB objects for the remaining species
00207     foreach my $this_other_species (@other_species) {
00208       my $this_other_genome_db;
00209       foreach my $this_genome_db (@$genomedbs){
00210         if ($this_genome_db->name eq $this_other_species) {
00211           $this_other_genome_db = $this_genome_db;
00212           last;
00213         }
00214       }
00215       if (!defined($this_other_genome_db)) {
00216         die "No species called $this_other_species in the database -- check spelling\n";
00217       }
00218       push(@other_species_genome_dbs, $this_other_genome_db);
00219     }
00220 
00221     ## Fetch the Bio::EnsEMBL::Compara::DnaFrag object for the query segment
00222     my $dnafrag1 =
00223         $dnafrag_adaptor->fetch_by_GenomeDB_and_name($species1_genome_db, $chr1);
00224 
00225     return ( ) if (!defined $dnafrag1);
00226 
00227     ## Fetch the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet object
00228     my $method_link_species_set;
00229     $method_link_species_set =
00230         $mlss_adaptor->fetch_by_method_link_type_GenomeDBs(
00231             $method_link, [$species1_genome_db, @other_species_genome_dbs]);
00232 
00233     ## Fetch the alginments on the query segment
00234     my $features;
00235     if ($method_link_species_set->method_link_class =~ /SyntenyRegion/) {
00236       $features = $self->get_features_from_SyntenyRegions($synteny_region_adaptor,
00237           $method_link_species_set, $dnafrag1, $start1, $end1);
00238     } else {
00239       $features = $self->get_features_from_GenomicAlingBlocks(
00240           $genomic_align_block_adaptor, $method_link_species_set, $dnafrag1, $start1, $end1);
00241     }
00242 
00243     return @$features;
00244 }
00245 
00246 sub get_features_from_SyntenyRegions {
00247   my ($self, $synteny_region_adaptor, $method_link_species_set, $dnafrag1, $start1, $end1) = @_;
00248   my $features = [];
00249 
00250   my $synteny_regions = $synteny_region_adaptor->fetch_all_by_MethodLinkSpeciesSet_DnaFrag(
00251       $method_link_species_set, $dnafrag1, $start1, $end1);
00252 
00253   my $data;
00254   foreach my $this_synteny_region (@$synteny_regions) {
00255     my $these_dnafrag_regions = $this_synteny_region->get_all_DnaFragRegions();
00256     foreach my $this_dnafrag_region (@{$these_dnafrag_regions}) {
00257       if ($this_dnafrag_region->genome_db->name == $dnafrag1->genome_db->name and
00258           $this_dnafrag_region->dnafrag->name == $dnafrag1->name and
00259           $this_dnafrag_region->dnafrag_start <= $end1 and
00260           $this_dnafrag_region->dnafrag_end >= $start1) {
00261         push(@$data, [$this_dnafrag_region, $these_dnafrag_regions]);
00262       }
00263     }
00264   }
00265 
00266   foreach my $this_data (@$data) {
00267     my ($reference_dnafrag_region, $all_dnafrag_regions) = @$this_data;
00268 
00269     ## Set link, linktxt, grouplink, grouplinktxt
00270     my @links;
00271     my @link_txts;
00272     foreach my $this_dnafrag_region (@{$all_dnafrag_regions}) {
00273       next if ($this_dnafrag_region == $reference_dnafrag_region);
00274       my ($species2, $name2, $start2, $end2) = (
00275           $this_dnafrag_region->dnafrag->genome_db->name(),
00276           $this_dnafrag_region->dnafrag->name(),
00277           $this_dnafrag_region->dnafrag_start(),
00278           $this_dnafrag_region->dnafrag_end(),
00279         );
00280       my $ens_species = $species2;
00281       $ens_species =~ s/ /_/g;
00282       my $short_species2 = $species2;
00283       $short_species2 =~ /(.)\S+\s(.{3})/;
00284       $short_species2 = "$1.$2";
00285       push(@links, sprintf($self->{compara}->{link_template}, $ens_species, $name2, $start2, $end2, $short_species2));
00286       push(@link_txts, sprintf("%s:%s:%d-%d", $short_species2, $name2, $start2, $end2));
00287     }
00288 
00289     my $synteny_region_id = $reference_dnafrag_region->synteny_region_id;
00290     push @$features, {
00291         'id'    => $synteny_region_id,
00292         'label' => "Synteny Region #$synteny_region_id",
00293         'method'=> $method_link_species_set->method_link_type,
00294         'start' => $reference_dnafrag_region->dnafrag_start,
00295         'end'   => $reference_dnafrag_region->dnafrag_end,
00296         'ori'   => ($reference_dnafrag_region->dnafrag_strand() == 1 ? '+' : '-'),
00297         'score' => '-',
00298         'link'  => [@links],
00299         'linktxt' => [@link_txts],
00300         'typecategory' => $method_link_species_set->method_link_class,
00301         'type'  => $method_link_species_set->name,
00302       };
00303   }
00304 
00305   return $features;
00306 }
00307 
00308 sub get_features_from_GenomicAlingBlocks {
00309   my ($self, $genomic_align_block_adaptor, $method_link_species_set, $dnafrag1, $start1, $end1) = @_;
00310   my $features = [];
00311 
00312   my $genomic_align_blocks = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_DnaFrag(
00313       $method_link_species_set, $dnafrag1, $start1, $end1);
00314 
00315   ## Get the start and end coordinates for each group. The coordinates are indexed
00316   ## by species_name and chr_name to ensure the continuity in the coordinates.
00317   my $group_coordinates;
00318   foreach my $gab (@$genomic_align_blocks) {
00319     my $group_id = $gab->group_id;
00320     if (!defined $group_id) {
00321     $group_id = $gab->dbID;
00322     if (!defined $group_id) {
00323         $group_id = $gab->{original_dbID};
00324     }
00325     }
00326     
00327     foreach my $genomic_align2 (@{$gab->get_all_non_reference_genomic_aligns()}) {
00328       my $species_name = $genomic_align2->dnafrag->genome_db->name;
00329       my $chr_name = $genomic_align2->dnafrag->name;
00330       my $chr_start = $genomic_align2->dnafrag_start;
00331       my $chr_end = $genomic_align2->dnafrag_end;
00332 
00333       if (!defined($group_coordinates->{$group_id}->{$species_name}->{$chr_name})) {
00334         $group_coordinates->{$group_id}->{$species_name}->{$chr_name}->{start} = $chr_start;
00335         $group_coordinates->{$group_id}->{$species_name}->{$chr_name}->{end} = $chr_end;
00336       } else {
00337         if ($chr_start < $group_coordinates->{$group_id}->{$species_name}->{$chr_name}->{start}) {
00338           $group_coordinates->{$group_id}->{$species_name}->{$chr_name}->{start} = $chr_start;
00339         }
00340         if ($chr_end > $group_coordinates->{$group_id}->{$species_name}->{$chr_name}->{end}) {
00341           $group_coordinates->{$group_id}->{$species_name}->{$chr_name}->{end} = $chr_end;
00342         }
00343       }
00344     }
00345   }
00346 
00347   #$id must be unique so use $cnt to make it so (it isn't unique for low
00348   #coverage genomes where more one gab contains several gas for a single
00349   #species
00350   my $cnt = 0;
00351   foreach my $gab (@$genomic_align_blocks) {
00352     $genomic_align_block_adaptor->retrieve_all_direct_attributes($gab);
00353 
00354     my $genomic_align1 = $gab->reference_genomic_align();
00355     my $other_genomic_aligns = $gab->get_all_non_reference_genomic_aligns();
00356     my $group_id = $gab->group_id;
00357     if (!defined $group_id) {
00358     $group_id = $gab->dbID;
00359     }
00360 
00361     my $id = $gab->dbID;
00362     if (!defined $id) {
00363     $id = $gab->{original_dbID} . "_$cnt";
00364     }
00365     $cnt++;
00366 
00367     my $label;
00368     my $group_label;
00369     # note will contain the perc_id if it exists
00370     my $note = $gab->perc_id?$gab->perc_id.'% identity':undef;
00371     
00372     my $group_note="";
00373     foreach my $gab_gp (@$genomic_align_blocks) {
00374     if (defined $gab_gp->group_id && defined $group_id && $gab_gp->group_id == $group_id) {
00375         my $ga_gp = $gab_gp->reference_genomic_align();
00376         #$group_note .= "".$ga_gp->dnafrag->name .":".$ga_gp->dnafrag_start ."-".$ga_gp->dnafrag_end.";";
00377         $group_note .= "".$gab_gp->dbID .":".$ga_gp->dnafrag_start ."-".$ga_gp->dnafrag_end.";";
00378     }
00379     }
00380     #my $group_note = "".$genomic_align1->dnafrag->name .":".$genomic_align1->dnafrag_start ."-".$genomic_align1->dnafrag_end;
00381 
00382     ## Set link, linktxt, grouplink, grouplinktxt
00383     my @links;
00384     my @link_txts;
00385     my @group_links;
00386     my @group_link_txts;
00387     foreach my $this_genomic_align (@{$gab->get_all_non_reference_genomic_aligns()}) {
00388       my ($species2, $name2, $start2, $end2) = (
00389           $this_genomic_align->dnafrag->genome_db->name(),
00390           $this_genomic_align->dnafrag->name(),
00391           $this_genomic_align->dnafrag_start(),
00392           $this_genomic_align->dnafrag_end(),
00393         );
00394       my $ens_species = $species2;
00395       $ens_species =~ s/ /_/g;
00396       my $short_species2 = $species2;
00397       $short_species2 =~ /(.)\S+\s(.{3})/;
00398       $short_species2 = "$1.$2";
00399       push(@links, sprintf($self->{compara}->{link_template}, $ens_species, $name2, $start2, $end2, $short_species2));
00400       push(@link_txts, sprintf("%s:%s:%d-%d", $short_species2, $name2, $start2, $end2));
00401       next if (!defined($group_id));
00402       my $group_start2 = $group_coordinates->{$group_id}->{$species2}->{$name2}->{start};
00403       my $group_end2 = $group_coordinates->{$group_id}->{$species2}->{$name2}->{end};
00404       $group_label = "$name2: $group_start2-$group_end2";
00405       push(@group_links, sprintf($self->{compara}->{link_template}, $ens_species, $name2,
00406           $group_start2, $group_end2));
00407       push(@group_link_txts, sprintf("%s:%s:%d-%d", $short_species2, $name2,
00408           $group_start2, $group_end2));
00409     }
00410 
00411     if (@{$method_link_species_set->species_set} <= 2) {
00412       ## for pairwise and self-alignments
00413       my $ga = $gab->get_all_non_reference_genomic_aligns()->[0];
00414       $label = $ga->dnafrag->name.": ".$ga->dnafrag_start."-".$ga->dnafrag_end;
00415     } else {
00416       ## for multiple alignments
00417       $group_label = $group_id?"group $group_id":undef;
00418     }
00419 
00420     my $score = $gab->score;
00421     if (!defined $score) {
00422     $score = 0;
00423     }
00424 
00425     push @$features, {
00426         'id'    => $id,
00427         'label' => $label,
00428         'method'=> $method_link_species_set->method_link_type,
00429         'start' => $genomic_align1->dnafrag_start,
00430         'end'   => $genomic_align1->dnafrag_end,
00431         'ori'   => ($genomic_align1->dnafrag_strand() == 1 ? '+' : '-'),
00432         'score' => $score,
00433         'note'  => $note,
00434         'link'  => [@links],
00435         'linktxt' => [@link_txts],
00436         'group' => $group_id,
00437         'grouplabel'=> $group_label,
00438         'grouplink' => [@group_links],
00439         'grouplinktxt' => [@group_link_txts],
00440     'groupnote'=> $group_note,
00441         'typecategory' => $method_link_species_set->method_link_class,
00442         'type'  => $method_link_species_set->name,
00443       };
00444   }
00445 
00446   return $features;
00447 }
00448 
00449 sub das_stylesheet
00450 {
00451     my $self = shift;
00452 
00453     return <<EOT;
00454 <!DOCTYPE DASSTYLE SYSTEM "http://www.biodas.org/dtd/dasstyle.dtd">
00455 <DASSTYLE>
00456     <STYLESHEET version="1.0">
00457         <CATEGORY id="GenomicAlignBlock.pairwise_alignment">
00458             <TYPE id="default">
00459                 <GLYPH>
00460                     <BOX>
00461                         <FGCOLOR>blue</FGCOLOR>
00462                         <BGCOLOR>aquamarine2</BGCOLOR>
00463                     </BOX>
00464                 </GLYPH>
00465             </TYPE>
00466         </CATEGORY>
00467         <CATEGORY id="GenomicAlignBlock.multiple_alignment">
00468             <TYPE id="default">
00469                 <GLYPH>
00470                     <BOX>
00471                         <FGCOLOR>brown</FGCOLOR>
00472                         <BGCOLOR>chocolate</BGCOLOR>
00473                     </BOX>
00474                 </GLYPH>
00475             </TYPE>
00476         </CATEGORY>
00477         <CATEGORY id="GenomicAlignTree.tree_alignment">
00478             <TYPE id="default">
00479                 <GLYPH>
00480                     <BOX>
00481                         <FGCOLOR>firebrick3</FGCOLOR>
00482                         <BGCOLOR>firebrick1</BGCOLOR>
00483                         <BUMP>no</BUMP>
00484                         <LABEL>no</LABEL>
00485                     </BOX>
00486                 </GLYPH>
00487             </TYPE>
00488         </CATEGORY>
00489         <CATEGORY id="SyntenyRegion.synteny">
00490             <TYPE id="default">
00491                 <GLYPH>
00492                     <BOX>
00493                         <FGCOLOR>black</FGCOLOR>
00494                         <BGCOLOR>DarkSeaGreen3</BGCOLOR>
00495                     </BOX>
00496                 </GLYPH>
00497             </TYPE>
00498         </CATEGORY>
00499     </STYLESHEET>
00500 </DASSTYLE>
00501 EOT
00502 }
00503 
00504 1;