Archive Ensembl HomeArchive Ensembl Home
NCRecoverSearch.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCRecoverSearch
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $ncrecoversearch = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCRecoverSearch->new
00018   (
00019    -db         => $db,
00020    -input_id   => $input_id,
00021    -analysis   => $analysis
00022   );
00023 $ncrecoversearch->fetch_input(); #reads from DB
00024 $ncrecoversearch->run();
00025 $ncrecoversearch->output();
00026 $ncrecoversearch->write_output(); #writes to DB
00027 
00028 =cut
00029 
00030 
00031 =head1 DESCRIPTION
00032 
00033 This Analysis will take the sequences from a cluster, the cm from
00034 nc_profile and run a profiled alignment, storing the results as
00035 cigar_lines for each sequence.
00036 
00037 =cut
00038 
00039 
00040 =head1 CONTACT
00041 
00042   Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
00043   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00044 
00045 =cut
00046 
00047 
00048 =head1 APPENDIX
00049 
00050 The rest of the documentation details each of the object methods. 
00051 Internal methods are usually preceded with a _
00052 
00053 =cut
00054 
00055 
00056 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCRecoverSearch;
00057 
00058 use strict;
00059 use Time::HiRes qw(time gettimeofday tv_interval);
00060 use Bio::EnsEMBL::Registry;
00061 
00062 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00063 
00064 sub param_defaults {
00065     return {
00066         'clusterset_id' => 1,
00067         'context_size'  => '120%',
00068     };
00069 }
00070 
00071 =head2 fetch_input
00072 
00073     Title   :   fetch_input
00074     Usage   :   $self->fetch_input
00075     Function:   Fetches input data from the database
00076     Returns :   none
00077     Args    :   none
00078 
00079 =cut
00080 
00081 
00082 sub fetch_input {
00083     my $self = shift @_;
00084 
00085     $self->input_job->transient_error(0);
00086     my $nc_tree_id = $self->param('nc_tree_id') || die "'nc_tree_id' is an obligatory numeric parameter\n";
00087     $self->input_job->transient_error(1);
00088 
00089     my $nc_tree    = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id) or die "Could not fetch nc_tree with id=$nc_tree_id\n";
00090     $self->param('nc_tree', $nc_tree);
00091 
00092     $self->param('model_id', $nc_tree->tree->get_tagvalue('clustering_id'));
00093 
00094     $self->fetch_recovered_member_entries($nc_tree->node_id);
00095 
00096         # Get the needed adaptors here:
00097     $self->param('genomedb_adaptor', $self->compara_dba->get_GenomeDBAdaptor);
00098 }
00099 
00100 =head2 run
00101 
00102     Title   :   run
00103     Usage   :   $self->run
00104     Function:   runs something
00105     Returns :   none
00106     Args    :   none
00107 
00108 =cut
00109 
00110 sub run {
00111   my $self = shift;
00112 
00113   ## This is disabled right now
00114   # $self->run_ncrecoversearch;
00115 }
00116 
00117 
00118 =head2 write_output
00119 
00120     Title   :   write_output
00121     Usage   :   $self->write_output
00122     Function:   stores something
00123     Returns :   none
00124     Args    :   none
00125 
00126 =cut
00127 
00128 
00129 sub write_output {
00130   my $self = shift;
00131 
00132   # Default autoflow
00133 }
00134 
00135 
00136 ##########################################
00137 #
00138 # internal methods
00139 #
00140 ##########################################
00141 
00142 sub run_ncrecoversearch {
00143   my $self = shift;
00144 
00145   next unless(keys %{$self->param('recovered_members')});
00146 
00147   my $cmsearch_exe = $self->param('cmsearch_exe')
00148       or die "'cmsearch_exe' is an obligatory parameter";
00149 
00150   die "Cannot execute '$cmsearch_exe'" unless(-x $cmsearch_exe);
00151 
00152   my $worker_temp_directory = $self->worker_temp_directory;
00153   my $root_id = $self->param('nc_tree')->node_id;
00154 
00155   my $input_fasta = $worker_temp_directory . $root_id . ".db";
00156   open FILE,">$input_fasta" or die "$!\n";
00157 
00158   foreach my $genome_db_id (keys %{$self->param('recovered_members')}) {
00159     my $gdb = $self->param('genomedb_adaptor')->fetch_by_dbID($genome_db_id);
00160     my $core_slice_adaptor = $gdb->db_adaptor->get_SliceAdaptor;
00161     foreach my $recovered_entry (keys %{$self->param('recovered_members')->{$genome_db_id}}) {
00162       my $recovered_id = $self->param('recovered_members')->{$genome_db_id}{$recovered_entry};
00163       unless ($recovered_entry =~ /(\S+)\:(\S+)\-(\S+)/) {
00164         warn("failed to parse coordinates for recovered entry: [$genome_db_id] $recovered_entry\n");
00165         next;
00166       } else {
00167         my ($seq_region_name,$start,$end) = ($1,$2,$3);
00168         my $temp; if ($start > $end) { $temp = $start; $start = $end; $end = $temp;}
00169         my $size = $self->param('context_size');
00170         $size = int( ($1-100)/200 * ($end-$start+1) ) if( $size =~/([\d+\.]+)%/ );
00171         my $slice = $core_slice_adaptor->fetch_by_region('toplevel',$seq_region_name,$start-$size,$end+$size);
00172         my $seq = $slice->seq; $seq =~ s/(.{60})/$1\n/g; chomp $seq;
00173         print FILE ">$recovered_id\n$seq\n";
00174       }
00175     }
00176   }
00177   close FILE;
00178 
00179   my $model_id = $self->param('model_id');
00180   my $ret1 = $self->dump_model('model_id', $model_id);
00181   my $ret2 = $self->dump_model('name',     $model_id) if (1 == $ret1);
00182   if (1 == $ret2) {
00183     $self->param('nc_tree')->release_tree;
00184     $self->param('nc_tree', undef);
00185     $self->input_job->transient_error(0);
00186     die "Failed to find '$model_id' both in 'model_id' and 'name' fields of 'nc_profile' table";
00187   }
00188 
00189   my $cmd = $cmsearch_exe;
00190   # /nfs/users/nfs_a/avilella/src/infernal/infernal-1.0/src/cmsearch --tabfile 110257.tab snoU89_profile.cm 110257.db
00191 
00192   return 1;   # FIXME: this is not ready -- disabling
00193 
00194   my $tabfilename = $worker_temp_directory . $root_id . ".tab";
00195   $cmd .= " --tabfile " . $tabfilename;
00196   $cmd .= " " . $self->param('profile_file');
00197   $cmd .= " " . $input_fasta;
00198 
00199   $self->compara_dba->dbc->disconnect_when_inactive(1);
00200   print("$cmd\n") if($self->debug);
00201   unless(system($cmd) == 0) {
00202     $self->throw("error running cmsearch, $!\n");
00203   }
00204   $self->compara_dba->dbc->disconnect_when_inactive(0);
00205 
00206   open TABFILE,"$tabfilename" or die "$!\n";
00207   while (<TABFILE>) {
00208     next if /^\#/;
00209     my ($dummy,$target_name,$target_start,$target_stop,$query_start,$query_stop,$bit_sc,$evalue,$gc) = split(/\s+/,$_);
00210     my $sth = $self->compara_dba->dbc->prepare
00211       ("INSERT IGNORE INTO cmsearch_hit 
00212                            (recovered_id,
00213                             node_id,
00214                             target_start,
00215                             target_stop,
00216                             query_start,
00217                             query_stop,
00218                             bit_sc,
00219                             evalue) VALUES (?,?,?,?,?,?,?,?)");
00220     $sth->execute($target_name,
00221                   $root_id,
00222                   $target_start,
00223                   $target_stop,
00224                   $query_start,
00225                   $query_stop,
00226                   $bit_sc,
00227                   $evalue);
00228     $sth->finish;
00229   }
00230   close TABFILE;
00231 
00232   return 1;
00233 }
00234 
00235 
00236 sub dump_model {
00237   my $self = shift;
00238   my $field = shift;
00239   my $model_id = shift;
00240 
00241   my $sql = 
00242     "SELECT hc_profile FROM nc_profile ".
00243       "WHERE $field=\"$model_id\"";
00244   my $sth = $self->compara_dba->dbc->prepare($sql);
00245   $sth->execute();
00246   my $nc_profile  = $sth->fetchrow;
00247   unless (defined($nc_profile)) {
00248     return 1;
00249   }
00250   my $profile_file = $self->worker_temp_directory . $model_id . "_profile.cm";
00251   open FILE, ">$profile_file" or die "$!";
00252   print FILE $nc_profile;
00253   close FILE;
00254 
00255   $self->param('profile_file', $profile_file);
00256   return 0;
00257 }
00258 
00259 sub fetch_recovered_member_entries {
00260   my $self = shift;
00261   my $root_id = shift;
00262 
00263   $self->param('recovered_members', {});
00264 
00265   my $sql = 
00266     "SELECT rcm.recovered_id, rcm.node_id, rcm.stable_id, rcm.genome_db_id ".
00267     "FROM recovered_member rcm ".
00268     "WHERE rcm.node_id=\"$root_id\" AND ".
00269     "rcm.stable_id not in ".
00270     "(SELECT stable_id FROM member WHERE source_name='ENSEMBLGENE' AND genome_db_id=rcm.genome_db_id)";
00271   my $sth = $self->compara_dba->dbc->prepare($sql);
00272   $sth->execute();
00273   while ( my $ref  = $sth->fetchrow_arrayref() ) {
00274     my ($recovered_id, $node_id, $stable_id, $genome_db_id) = @$ref;
00275     $self->param('recovered_members')->{$genome_db_id}{$stable_id} = $recovered_id;
00276   }
00277 }
00278 
00279 
00280 1;