Archive Ensembl HomeArchive Ensembl Home
HMMsearch.pm
Go to the documentation of this file.
00001 package Bio::EnsEMBL::Compara::Production::EPOanchors::HMMer::HMMsearch;
00002 
00003 use strict;
00004 use Data::Dumper;
00005 use Bio::EnsEMBL::Registry;
00006 use Bio::EnsEMBL::Utils::Exception qw(throw);
00007 use File::Basename;
00008 
00009 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00010 
00011 sub fetch_input {
00012     my ($self) = @_;
00013 }
00014 
00015 sub run {
00016     my ($self) = @_;
00017 
00018     my $self_dba = $self->compara_dba;
00019 
00020     my $dnafrag_adaptor = $self_dba->get_adaptor("DnaFrag");
00021     my $gab_adaptor = $self_dba->get_adaptor("GenomicAlignBlock");
00022     my $genome_db_adaptor = $self_dba->get_adaptor("GenomeDB");
00023     my ($gab_id) = $self->param('gab_id');
00024     my $self_gab_adaptor = $self_dba->get_adaptor("GenomicAlignBlock");
00025     my @hits = ();
00026     my $gab = $self_gab_adaptor->fetch_by_dbID($gab_id);
00027     my $stk_file = "/tmp/sf5_$gab_id.stk";
00028     my $hmm_file = "/tmp/sf5_$gab_id.hmm";
00029 
00030     $self_dba->dbc->disconnect_when_inactive(1); 
00031 
00032     open(IN, ">$stk_file") or throw("can not open stockholm file $stk_file for writing");
00033     print IN "# STOCKHOLM 1.0\n";
00034     foreach my $genomic_align( @{$gab->get_all_GenomicAligns} ){
00035         my $aligned_seq = $genomic_align->aligned_sequence;
00036         next if($aligned_seq=~/^[N-]+[N-]$/);
00037         $aligned_seq=~s/\./-/g;
00038         print IN $gab_id, "\/", $genomic_align->dnafrag_start, ":", $genomic_align->dnafrag_end,
00039             "\/", $genomic_align->dnafrag->genome_db->name, "\t",
00040             $aligned_seq, "\n"; 
00041     }
00042     print IN "//";
00043     close(IN);
00044 
00045     my $genome_seq_file = $self->param('target_genome')->{"genome_seq"};
00046     #Copy genome_seq to local disk only if md5sum parameter is set. 
00047     if ($self->param('md5sum')) {
00048         #Copy genome_seq to local disk if it doesn't already exist
00049         my $name = basename($self->param('target_genome')->{"genome_seq"});
00050         my $tmp_file = "/tmp/" . $ENV{USER} . "_" . $self->param('target_genome')->{name} . "_" . $name;
00051         
00052         if (-e $tmp_file) {
00053         print "$tmp_file already exists\n";
00054         $genome_seq_file = $tmp_file;
00055         } else {
00056         my $start_time = time;
00057         print "Need to copy file\n";
00058         my $rsync_cmd = "rsync $genome_seq_file $tmp_file";
00059         print "$rsync_cmd\n";
00060 
00061         system($rsync_cmd) == 0 or die "system $rsync_cmd failed:$?";
00062 
00063         print "Time to rsync " . (time - $start_time) . "\n";
00064         my $rsync_time = time;
00065 
00066         #Check md5sum
00067         my $md5sum = `md5sum $tmp_file`;
00068         if ($md5sum == $self->param('md5sum')) {
00069             $genome_seq_file = $tmp_file;
00070         } else {
00071             print "md5sum failed. Use $genome_seq_file\n";
00072         }
00073         print "Time to md5sum " . (time - $rsync_time) . "\n";
00074         print "Total time" . (time - $start_time) . "\n";
00075         }
00076     }
00077 
00078     my $hmm_build_command = $self->param('hmmbuild') . " $hmm_file $stk_file";  
00079     print $hmm_build_command, " **\n";
00080     system($hmm_build_command);
00081     return unless(-e $hmm_file); # The sequences in the alignment are probably too short
00082     my $hmm_len = `egrep "^LENG  " $hmm_file`;
00083     chomp($hmm_len);
00084     $hmm_len=~s/^LENG  //;
00085 #   my $nhmmer_command = $self->param('nhmmer') . " --cpu 1 --noali" ." $hmm_file " . $self->param('target_genome')->{"genome_seq"};
00086 
00087     my $nhmmer_command = $self->param('nhmmer') . " --cpu 1 --noali" ." $hmm_file $genome_seq_file";
00088     print $nhmmer_command, " **\n";
00089     my $nhmm_fh;
00090     open( $nhmm_fh, "$nhmmer_command |" ) or throw("Error opening nhmmer command: $? $!"); 
00091     { local $/ = ">>";
00092         while(my $mapping = <$nhmm_fh>){
00093             next unless $mapping=~/!/;
00094             push(@hits, [$gab_id, $mapping]);
00095         }
00096     }
00097 
00098     $self_dba->dbc->disconnect_when_inactive(0); 
00099 
00100     my @anchor_align_records;
00101     foreach my $hit(@hits){
00102         my $mapping_id = $hit->[0];
00103         my($target_info, $mapping_info) = (split("\n", $hit->[1]))[0,3];
00104         my($coor_sys, $species, $seq_region_name) = (split(":", $target_info))[0,1,2];
00105         my($score,$bias, $evalue, $hmm_from, $hmm_to, $alifrom, $alito) = (split(/ +/, $mapping_info))[2,3,4,5,6,8,9];
00106         my $strand = $alifrom > $alito ? "-1" : "1";
00107         ($alifrom, $alito) = ($alito, $alifrom) if $strand eq "-1";
00108         my $taregt_genome_db = $genome_db_adaptor->fetch_by_registry_name($self->param('target_genome')->{"name"});
00109         my $dnafrag = $dnafrag_adaptor->fetch_by_GenomeDB_and_name($taregt_genome_db, $seq_region_name);
00110         next unless($dnafrag);  
00111         push( @anchor_align_records, [ $self->param('mlssid_of_alignments'), $mapping_id, $dnafrag->dbID, $alifrom, $alito,
00112                         $strand, $score, $hmm_from, $hmm_to, $evalue, $hmm_len ] );  
00113     }   
00114     $self->param('mapping_hits', \@anchor_align_records) if scalar( @anchor_align_records );
00115     unlink("$stk_file");
00116     unlink("$hmm_file");
00117 }
00118 
00119 sub write_output{
00120     my ($self) = @_;
00121     my $self_anchor_align_adaptor = $self->compara_dba->get_adaptor("AnchorAlign");
00122     $self_anchor_align_adaptor->store_mapping_hits( $self->param('mapping_hits') ) if $self->param('mapping_hits');
00123 }
00124 
00125 1;
00126