Archive Ensembl HomeArchive Ensembl Home
DumpMercatorFiles.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::MercatorPecan::DumpMercatorFiles 
00022 
00023 =head1 SYNOPSIS
00024 
00025 
00026 =head1 DESCRIPTION
00027 
00028     Create Chromosome, Anchor and Hit files needed by Mercator.
00029 
00030 Supported keys:
00031     'genome_db_id' => <number>
00032         The id of the query genome 
00033 
00034      'genome_db_ids' => < list_of_genome_db_ids >
00035         eg genome_db_ids => ' [61,108,111,112,38,60,101,43,31] '
00036         List of genome ids to match against
00037 
00038      'all_hits' => <0|1>
00039         Whether to perform all best hits (1) or best reciprocal hits only (0)
00040 
00041      'input_dir' => < directory_path >
00042         Location to write files required by Mercator
00043 
00044      'maximum_gap' => <number>
00045         eg 50000
00046 
00047      'cutoff_score' => <number>
00048          Filter by score. Not normally used
00049 
00050      'cutoff_evalue' => <number>
00051          Filter by evalue. Not normally used
00052 
00053 =cut
00054 
00055 package Bio::EnsEMBL::Compara::RunnableDB::MercatorPecan::DumpMercatorFiles;
00056 
00057 use strict;
00058 use Bio::EnsEMBL::Analysis::Runnable::Mercator;
00059 use Bio::EnsEMBL::Compara::DnaFragRegion;
00060 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;;
00061 use Bio::EnsEMBL::Utils::Exception;
00062 use Time::HiRes qw(time gettimeofday tv_interval);
00063 
00064 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00065 
00066 sub fetch_input {
00067   my( $self) = @_;
00068 
00069   return 1;
00070 }
00071 
00072 sub run
00073 {
00074   my $self = shift;
00075   $self->dumpMercatorFiles;
00076 }
00077 
00078 sub write_output {
00079   my ($self) = @_;
00080   return 1;
00081 }
00082 
00083 sub dumpMercatorFiles {
00084   my $self = shift;
00085   
00086   my $starttime = time();
00087 
00088   unless (defined $self->param('input_dir')) {
00089     my $input_dir = $self->worker_temp_directory . "/input_dir";
00090     $self->param('input_dir', $input_dir);
00091   }
00092   if (! -e $self->param('input_dir')) {
00093     mkdir($self->param('input_dir'), 0777);
00094   }
00095 
00096   my $dfa = $self->compara_dba->get_DnaFragAdaptor;
00097   my $gdba = $self->compara_dba->get_GenomeDBAdaptor;
00098   my $ma = $self->compara_dba->get_MemberAdaptor;
00099   my $ssa = $self->compara_dba->get_SubsetAdaptor;
00100 
00101   my $max_gap = $self->param('maximum_gap');
00102 
00103   my $gdb_id = $self->param('genome_db_id');
00104 
00105   my $dnafrags;
00106 
00107   ## Create the Chromosome file for Mercator
00108   my $gdb = $gdba->fetch_by_dbID($gdb_id);
00109   my $file = $self->param('input_dir') . "/$gdb_id.chroms";
00110   open F, ">$file";
00111   foreach my $df (@{$dfa->fetch_all_by_GenomeDB_region($gdb)}) {
00112       print F $df->name . "\t" . $df->length,"\n";
00113       if ($max_gap and $df->coord_system_name eq "chromosome") {
00114       my $core_dba = $gdb->db_adaptor;
00115       my $coord_system_adaptor = $core_dba->get_CoordSystemAdaptor();
00116       my $assembly_mapper_adaptor = $core_dba->get_AssemblyMapperAdaptor();
00117       my $chromosome_coord_system = $coord_system_adaptor->fetch_by_name("chromosome");
00118       my $seq_level_coord_system = $coord_system_adaptor->fetch_sequence_level;
00119 
00120       my $assembly_mapper = $assembly_mapper_adaptor->fetch_by_CoordSystems($chromosome_coord_system, $seq_level_coord_system);
00121       my @mappings = $assembly_mapper->map($df->name, 1, $df->length, 1, $chromosome_coord_system);
00122       
00123       my $part = 1;
00124       foreach my $this_mapping (@mappings) {
00125           next if ($this_mapping->isa("Bio::EnsEMBL::Mapper::Coordinate"));
00126           next if ($this_mapping->length < $max_gap);
00127           # print join(" :: ", $df->name, $this_mapping->length, $this_mapping->start, $this_mapping->end), "\n";
00128           print F $df->name . "--$part\t" . $df->length,"\n";
00129           $dnafrags->{$df->name}->{$this_mapping->start} = $df->name."--".$part;
00130           $part++;
00131       }
00132       }
00133   }
00134   close F;
00135 
00136   ## Create the anchor file for Mercator
00137   my $ss = $ssa->fetch_by_set_description("gdb:".$gdb->dbID ." ". $gdb->name . ' ref coding exons');
00138   $file = $self->param('input_dir') . "/$gdb_id.anchors";
00139   open F, ">$file";
00140   foreach my $member (@{$ma->fetch_all_by_subset_id($ss->dbID)}) {
00141       my $strand = "+";
00142       $strand = "-" if ($member->chr_strand == -1);
00143       my $chr_name = $member->chr_name;
00144       if (defined($dnafrags->{$member->chr_name})) {
00145       foreach my $this_start (sort {$a <=> $b} keys %{$dnafrags->{$member->chr_name}}) {
00146           if ($this_start > $member->chr_start - 1) {
00147           last;
00148           } else {
00149           $chr_name = ($dnafrags->{$member->chr_name}->{$this_start} or $member->chr_name);
00150           }
00151       }
00152       }
00153       print F $member->dbID . "\t" .
00154         $chr_name ."\t" .
00155           $strand . "\t" .
00156             ($member->chr_start - 1) ."\t" .
00157               $member->chr_end ."\t1\n";
00158   }
00159   close F;
00160 
00161   my $genome_db_ids = eval $self->param('genome_db_ids');
00162 
00163   my $gdb_id1 = $self->param('genome_db_id');
00164 
00165   foreach my $gdb_id2 (@$genome_db_ids) {
00166       my $file = $self->param('input_dir') . "/$gdb_id1" . "-$gdb_id2.hits";
00167       open F, ">$file";
00168       my $sql = $self->get_sql_for_peptide_hits($gdb_id1, $gdb_id2);
00169       my $sth = $self->{'comparaDBA'}->dbc->prepare($sql);
00170       my ($qmember_id,$hmember_id,$score1,$evalue1,$score2,$evalue2);
00171       $sth->execute($gdb_id1, $gdb_id2);
00172       $sth->bind_columns( \$qmember_id,\$hmember_id,\$score1,\$evalue1,\$score2,\$evalue2);
00173       my %pair_seen = ();
00174       while ($sth->fetch()) {
00175         next if ($pair_seen{$qmember_id . "_" . $hmember_id});
00176         my $score = ($score1>$score2)?$score2:$score1; ## Use smallest score
00177         my $evalue = ($evalue1>$evalue2)?$evalue1:$evalue2; ## Use largest e-value
00178         next if (defined $self->param('cutoff_score') && $score < $self->param('cutoff_score'));
00179         next if (defined $self->param('cutoff_evalue') && $evalue > $self->param('cutoff_evalue'));
00180         print F "$qmember_id\t$hmember_id\t" . int($score). "\t$evalue\n";
00181         $pair_seen{$qmember_id . "_" . $hmember_id} = 1;
00182     }
00183       close F;
00184       $sth->finish();
00185   }
00186 
00187   if($self->debug){printf("%1.3f secs to dump nib for \"%s\" collection\n", (time()-$starttime), $self->collection_name);}
00188 
00189   return 1;
00190 }
00191 
00192 
00193 sub get_sql_for_peptide_hits {
00194   my ($self, $gdb_id1, $gdb_id2) = @_;
00195   my $sql;
00196 
00197   my $table_name1 = $self->get_table_name_from_dbID($gdb_id1);
00198   my $table_name2 = $self->get_table_name_from_dbID($gdb_id2);
00199 
00200   if ($self->param('all_hits')) {
00201     ## Use all best hits
00202     $sql = "SELECT paf1.qmember_id, paf1.hmember_id, paf1.score, paf1.evalue, paf2.score, paf2.evalue
00203       FROM $table_name1 paf1, $table_name2 paf2
00204       WHERE paf1.qgenome_db_id = ? AND paf1.hgenome_db_id = ?
00205         AND paf1.qmember_id = paf2.hmember_id AND paf1.hmember_id = paf2.qmember_id
00206         AND (paf1.hit_rank = 1 OR paf2.hit_rank = 1)";
00207   } else {
00208     ## Use best reciprocal hits only
00209     $sql = "SELECT paf1.qmember_id, paf1.hmember_id, paf1.score, paf1.evalue, paf2.score, paf2.evalue
00210       FROM $table_name1 paf1, $table_name2 paf2
00211       WHERE paf1.qgenome_db_id = ? AND paf1.hgenome_db_id = ?
00212         AND paf1.qmember_id = paf2.hmember_id AND paf1.hmember_id = paf2.qmember_id
00213         AND paf1.hit_rank = 1 AND paf2.hit_rank = 1";
00214   }
00215 
00216   return $sql;
00217 }
00218 
00219 
00220 sub get_table_name_from_dbID {
00221   my ($self, $gdb_id) = @_;
00222   my $table_name = "peptide_align_feature";
00223 
00224   my $gdba = $self->compara_dba->get_GenomeDBAdaptor;
00225   my $gdb = $gdba->fetch_by_dbID($gdb_id);
00226   return $table_name if (!$gdb);
00227 
00228   $table_name .= "_" . lc($gdb->name) . "_" . $gdb_id;
00229   $table_name =~ s/ /_/g;
00230 
00231   return $table_name;
00232 }
00233 
00234 1;