Archive Ensembl HomeArchive Ensembl Home
RFAMClassify.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009    http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::RFAMClassify
00024 
00025 =head1 SYNOPSIS
00026 
00027 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00028 my $rfamclassify = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::RFAMClassify->new
00029   (
00030    -db         => $db,
00031    -input_id   => $input_id,
00032    -analysis   => $analysis
00033   );
00034 
00035 $rfamclassify->fetch_input(); #reads from DB
00036 $rfamclassify->run();
00037 $rfamclassify->output();
00038 $rfamclassify->write_output(); #writes to DB
00039 
00040 
00041 =head1 DESCRIPTION
00042 
00043 This Analysis/RunnableDB is designed to take the descriptions of each
00044 ncrna member and classify them into the respective cluster according
00045 to their RFAM id. It also takes into account information from mirBase.
00046 
00047 =cut
00048 
00049 
00050 =head1 CONTACT
00051 
00052   Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
00053 
00054 =cut
00055 
00056 
00057 =head1 APPENDIX
00058 
00059 The rest of the documentation details each of the object methods.
00060 Internal methods are usually preceded with a _
00061 
00062 =cut
00063 
00064 
00065 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::RFAMClassify;
00066 
00067 use strict;
00068 use Data::Dumper;
00069 use IO::File;
00070 use File::Basename;
00071 use Time::HiRes qw(time gettimeofday tv_interval);
00072 
00073 use Bio::EnsEMBL::Utils::Exception;
00074 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00075 use Bio::EnsEMBL::Compara::GeneTreeNode;
00076 use LWP::Simple;
00077 
00078 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00079 
00080 
00081 
00082 sub fetch_input {
00083     my $self = shift @_;
00084 
00085     $self->input_job->transient_error(0);
00086     my $mlss_id = $self->param('mlss_id') or die "'mlss_id' is an obligatory parameter\n";
00087     $self->input_job->transient_error(1);
00088 
00089     my $mlss = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor->fetch_by_dbID($mlss_id) or die "Could not fetch MLSS with dbID=$mlss_id";
00090 
00091     $self->param('cluster_mlss', $mlss);
00092 }
00093 
00094 
00095 sub run {
00096     my $self = shift @_;
00097 
00098     $self->tag_assembly_coverage_depth;
00099     $self->load_mirbase_families;
00100     $self->run_rfamclassify;
00101 }
00102 
00103 
00104 sub write_output {
00105     my $self = shift @_;
00106 
00107     $self->dataflow_clusters;
00108 }
00109 
00110 
00111 ##########################################
00112 #
00113 # internal methods
00114 #
00115 ##########################################
00116 
00117 sub run_rfamclassify {
00118     my $self = shift;
00119 
00120     my $mlss_id = $self->param('mlss_id');
00121     # vivification:
00122     $self->param('rfamcms', {});
00123 
00124     $self->build_hash_cms('model_id');
00125     $self->load_names_model_id();
00126     #  $self->build_hash_cms('name');
00127     $self->build_hash_models();
00128 
00129     my $nctree_adaptor = $self->compara_dba->get_NCTreeAdaptor;
00130 
00131     # Create the clusterset and associate mlss
00132     my $clusterset = new Bio::EnsEMBL::Compara::GeneTree;
00133     $clusterset->tree_type('ncrnaclusterset');
00134     $clusterset->method_link_species_set_id($mlss_id);
00135     $self->param('clusterset', $clusterset);
00136 
00137     my $clusterset_root = new Bio::EnsEMBL::Compara::GeneTreeNode;
00138     $clusterset->root($clusterset_root);
00139     $nctree_adaptor->store($clusterset);
00140 
00141     my @allclusters;
00142     $self->param('allclusters', \@allclusters);
00143 
00144 
00145     # Classify the cluster that already have an RFAM id or mir id
00146     print STDERR "Storing clusters...\n" if ($self->debug);
00147     my $counter = 1;
00148     foreach my $cm_id (keys %{$self->param('rfamcms')->{'model_id'}}) {
00149         print STDERR "++ $cm_id\n" if ($self->debug);
00150         next if not defined($self->param('rfamclassify')->{$cm_id});
00151         my @cluster_list = keys %{$self->param('rfamclassify')->{$cm_id}};
00152 
00153         print STDERR Dumper \@cluster_list if ($self->debug);
00154         # If it's a singleton, we don't store it as a nc tree
00155         next if (scalar(@cluster_list < 2));
00156 
00157         #printf("%10d clusters\n", $counter) if (($counter % 20 == 0) && ($self->debug));
00158         $counter++;
00159 
00160         my $model_name;
00161         if    (defined($self->param('model_id_names')->{$cm_id})) { $model_name = $self->param('model_id_names')->{$cm_id}; }
00162         elsif (defined($self->param('model_name_ids')->{$cm_id})) { $model_name = $cm_id; }
00163 
00164         print STDERR "ModelName: $model_name\n" if ($self->debug);
00165 
00166         my $clusterset_leaf = new Bio::EnsEMBL::Compara::GeneTreeNode;
00167         $clusterset_leaf->no_autoload_children();
00168         $clusterset_root->add_child($clusterset_leaf);
00169 
00170         my $cluster = new Bio::EnsEMBL::Compara::GeneTree;
00171         $cluster->tree_type('nctree');
00172         $cluster->method_link_species_set_id($mlss_id);
00173 
00174         my $cluster_root = new Bio::EnsEMBL::Compara::GeneTreeNode;
00175         $cluster->root($cluster_root);
00176         $cluster->clusterset_id($cluster_root->node_id);
00177         $cluster_root->tree($cluster);
00178         $clusterset_leaf->add_child($cluster_root);
00179 
00180         foreach my $pmember_id (@cluster_list) {
00181             print STDERR "Adding $pmember_id\n" if ($self->debug);
00182             my $node = new Bio::EnsEMBL::Compara::GeneTreeNode;
00183             $cluster_root->add_child($node);
00184             #leaves are GeneTreeNode objects, bless to make into GeneTreeMember objects
00185             bless $node, "Bio::EnsEMBL::Compara::GeneTreeMember";
00186 
00187             #the building method uses member_id's to reference unique nodes
00188             #which are stored in the node_id value, copy to member_id
00189             $node->member_id($pmember_id);
00190         }
00191         $DB::single=1;1;
00192 
00193         # Store the cluster:
00194         $nctree_adaptor->store($clusterset_leaf);
00195         push @allclusters, $cluster->root_id;
00196 
00197         my $leafcount = scalar(@{$cluster->root->get_all_leaves});
00198         print STDERR "cluster $cluster with $leafcount leaves\n" if $self->debug;
00199         $cluster->store_tag('gene_count', $leafcount);
00200         $cluster->store_tag('clustering_id', $cm_id);
00201         $cluster->store_tag('model_name', $model_name) if (defined($model_name));
00202         $cluster_root->disavow_parent();
00203 
00204     }
00205     $clusterset_root->build_leftright_indexing(1);
00206     $nctree_adaptor->store($clusterset);
00207     $self->param('clusterset_id', $clusterset_root->node_id);
00208     my $leafcount = scalar(@{$clusterset->root->get_all_leaves});
00209     print STDERR "clusterset $clusterset with $leafcount leaves\n" if $self->debug;
00210 
00211     # Flow the members that havent been associated to a cluster at this
00212     # stage to the search for all models
00213 
00214     return 1;
00215 }
00216 
00217 sub dataflow_clusters {
00218     my $self = shift;
00219 
00220     foreach my $tree_id (@{$self->param('allclusters')}) {
00221         $self->dataflow_output_id({ 'nc_tree_id' => $tree_id, }, 2);
00222     }
00223 
00224 }
00225 
00226 sub build_hash_models {
00227   my $self = shift;
00228 
00229     # vivification:
00230   $self->param('rfamclassify', {});
00231   $self->param('orphan_transcript_model_id', {});
00232 
00233   # We only take the longest transcript by doing a join with subset_member.
00234   # Right now, this only affects a few transcripts in Drosophila, but it's safer this way.
00235   my $sql = 
00236     "SELECT gene.member_id, gene.description, transcript.member_id, transcript.description ".
00237     "FROM subset_member sm, ".
00238     "member gene, ".
00239     "member transcript ".
00240     "WHERE ".
00241     "sm.member_id=transcript.member_id ".
00242     "AND gene.source_name='ENSEMBLGENE' ".
00243     "AND transcript.source_name='ENSEMBLTRANS' ".
00244     "AND transcript.gene_member_id=gene.member_id ".
00245     "AND transcript.description not like '%Acc:NULL%' ".
00246     "AND transcript.description not like '%Acc:'";
00247   my $sth = $self->compara_dba->dbc->prepare($sql);
00248   $sth->execute;
00249   while( my $ref  = $sth->fetchrow_arrayref() ) {
00250     my ($gene_member_id, $gene_description, $transcript_member_id, $transcript_description) = @$ref;
00251     $transcript_description =~ /Acc:(\w+)/;
00252     my $transcript_model_id = $1;
00253     if ($transcript_model_id =~ /MI\d+/) {
00254       # We use mirbase families to link
00255       my @family_ids = keys %{$self->param('mirbase_families')->{$transcript_model_id}};
00256       my $family_id = $family_ids[0];
00257       if (defined $family_id) {
00258         $transcript_model_id = $family_id;
00259       } else {
00260         if ($gene_description =~ /\w+\-(mir-\S+)\ / || $gene_description =~ /\w+\-(let-\S+)\ /) {
00261           # We take the mir and let ids from the gene description
00262           $transcript_model_id = $1;
00263           # We correct the model_id for genes like 'mir-129-2' which have a 'mir-129' model
00264           if ($transcript_model_id =~ /(mir-\d+)-\d+/) {
00265             $transcript_model_id = $1;
00266           }
00267         } else {
00268             print STDERR "$transcript_model_id is not a mirbase family and is not a mir or let gene\n" if ($self->debug);
00269         }
00270       }
00271     }
00272     # A simple hash classified by the Acc model ids
00273     if (defined $self->param('model_name_ids')->{$transcript_model_id}) {
00274         print STDERR "$transcript_model_id has been converted to id " . $self->param('model_name_ids')->{$transcript_model_id} . "\n" if ($self->debug);
00275         $transcript_model_id = $self->param('model_name_ids')->{$transcript_model_id};
00276     }
00277     $self->param('rfamclassify')->{$transcript_model_id}{$transcript_member_id} = 1;
00278 
00279     # Store list of orphan ids
00280     unless (defined($self->param('rfamcms')->{'model_id'}{$transcript_model_id}) || defined($self->param('rfamcms')->{'name'}{$transcript_model_id})) {
00281       $self->param('orphan_transcript_model_id')->{$transcript_model_id}++;     # NB: this data is never used afterwards
00282     }
00283   }
00284 
00285   $sth->finish;
00286   return 1;
00287 }
00288 
00289 sub build_hash_cms {
00290   my $self = shift;
00291   my $field = shift;
00292 
00293   my $sql = "SELECT $field from nc_profile";
00294   my $sth = $self->compara_dba->dbc->prepare($sql);
00295   $sth->execute;
00296   while( my $ref  = $sth->fetchrow_arrayref() ) {
00297     my ($field_value) = @$ref;
00298     $self->param('rfamcms')->{$field}{$field_value} = 1;
00299   }
00300 }
00301 
00302 sub load_names_model_id {
00303   my $self = shift;
00304 
00305     # vivification:
00306   $self->param('model_id_names', {});
00307   $self->param('model_name_ids', {});
00308 
00309   my $sql = "SELECT model_id, name from nc_profile";
00310   my $sth = $self->compara_dba->dbc->prepare($sql);
00311   $sth->execute;
00312   while( my $ref  = $sth->fetchrow_arrayref() ) {
00313     my ($model_id, $name) = @$ref;
00314     $self->param('model_id_names')->{$model_id} = $name;
00315     $self->param('model_name_ids')->{$name} = $model_id;
00316   }
00317 }
00318 
00319 sub load_mirbase_families {
00320   my $self = shift;
00321 
00322   my $starttime = time();
00323   print STDERR "fetching mirbase families...\n" if ($self->debug);
00324 
00325   my $worker_temp_directory = $self->worker_temp_directory;
00326   my $url  = 'ftp://mirbase.org/pub/mirbase/CURRENT/';
00327   my $file = 'miFam.dat.gz';
00328 
00329   my $tmp_file = $worker_temp_directory . $file;
00330   my $mifam = $tmp_file;
00331 
00332   my $ftp_file = $url . $file;
00333   my $status = getstore($ftp_file, $tmp_file);
00334   die "load_mirbase_families: error $status on $ftp_file" unless is_success($status);
00335 
00336   $mifam =~ s/\.gz//;
00337   my $cmd = "rm -f $mifam";
00338   unless(system("cd $worker_temp_directory; $cmd") == 0) {
00339     print("$cmd\n");
00340     $self->throw("error deleting previously downloaded file $!\n");
00341   }
00342 
00343   my $cmd = "gunzip $tmp_file";
00344   unless(system("cd $worker_temp_directory; $cmd") == 0) {
00345     print("$cmd\n");
00346     $self->throw("error expanding mirbase families $!\n");
00347   }
00348 
00349     # vivfication:
00350   $self->param('mirbase_families', {});
00351 
00352   open (FH, $mifam) or $self->throw("Couldnt open miFam file [$mifam]");
00353   my $family_ac; my $family_id;
00354   while (<FH>) {
00355     if ($_ =~ /^AC\s+(\S+)/) {
00356       $family_ac = $1;
00357     } elsif ($_ =~ /^ID\s+(\S+)/) {
00358       $family_id = $1;
00359     } elsif ($_ =~ /^MI\s+(\S+)\s+(\S+)/) {
00360       my $mi_id = $1; my $mir_id = $2;
00361       $self->param('mirbase_families')->{$mi_id}{$family_id}{$family_ac}{$mir_id} = 1;
00362     } elsif ($_ =~ /\/\//) {
00363     } else {
00364       $self->throw("Unexpected line: [$_] in mifam file\n");
00365     }
00366   }
00367 
00368   printf("time for mirbase families fetch : %1.3f secs\n" , time()-$starttime);
00369 }
00370 
00371 sub tag_assembly_coverage_depth {
00372   my $self = shift;
00373 
00374   my @low_coverage  = ();
00375   my @high_coverage = ();
00376 
00377   foreach my $gdb (@{$self->param('cluster_mlss')->species_set}) {
00378     my $name = $gdb->name;
00379     my $coreDBA = $gdb->db_adaptor;
00380     my $metaDBA = $coreDBA->get_MetaContainerAdaptor;
00381     my $assembly_coverage_depth = @{$metaDBA->list_value_by_key('assembly.coverage_depth')}->[0];
00382     next unless (defined($assembly_coverage_depth) || $assembly_coverage_depth ne '');
00383     if ($assembly_coverage_depth eq 'low' || $assembly_coverage_depth eq '2x') {
00384       push @low_coverage, $gdb;
00385     } elsif ($assembly_coverage_depth eq 'high' || $assembly_coverage_depth eq '6x' || $assembly_coverage_depth >= 6) {
00386       push @high_coverage, $gdb;
00387     } else {
00388       $self->throw("Unrecognised assembly.coverage_depth value in core meta table: $assembly_coverage_depth [$name]\n");
00389     }
00390   }
00391   return undef unless(scalar(@low_coverage));
00392 
00393   my $species_set_adaptor = $self->compara_dba->get_SpeciesSetAdaptor;
00394 
00395   my $ss = new Bio::EnsEMBL::Compara::SpeciesSet(-genome_dbs => \@low_coverage);
00396   # Stores if necessary. Updates $ss->dbID anyway
00397   $species_set_adaptor->store($ss);
00398   my $value = $ss->get_tagvalue('name');
00399   if ($value eq 'low-coverage') {
00400     # Already stored, nothing needed
00401   } else {
00402     # We need to add the tag
00403     $ss->store_tag('name','low-coverage');
00404   }
00405 }
00406 
00407 1;