Archive Ensembl HomeArchive Ensembl Home
GenomePrepareNCMembers.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::GenomePrepareNCMembers
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db      = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $g_load_members = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::GenomePrepareNCMembers->new (
00018                                                     -db      => $db,
00019                                                     -input_id   => $input_id,
00020                                                     -analysis   => $analysis );
00021 $g_load_members->fetch_input(); #reads from DB
00022 $g_load_members->run();
00023 $g_load_members->output();
00024 $g_load_members->write_output(); #writes to DB
00025 
00026 =cut
00027 
00028 =head1 DESCRIPTION
00029 
00030 A job factory that first iterates through all top-level slices of the corresponding core database and collects ncRNA gene stable_ids,
00031 then creates downstream jobs that will be loading individual ncRNA members.
00032 
00033 =cut
00034 
00035 =head1 CONTACT
00036 
00037 Describe contact details here
00038 
00039 =cut
00040 
00041 =head1 APPENDIX
00042 
00043 The rest of the documentation details each of the object methods.
00044 Internal methods are usually preceded with a _
00045 
00046 =cut
00047 
00048 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::GenomePrepareNCMembers;
00049 
00050 use strict;
00051 use Bio::EnsEMBL::Slice;
00052 use Bio::EnsEMBL::Gene;
00053 use Bio::EnsEMBL::Compara::Subset;
00054 
00055 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00056 
00057 
00058 =head2 fetch_input
00059 
00060     Read the parameters and set up all necessary objects.
00061 
00062 =cut
00063 
sub fetch_input {
    my ($self) = @_;

    # The job's transient_error flag is switched off only around the
    # mandatory-parameter check and switched back on right after it
    # (presumably so that a missing parameter is not treated as a
    # retryable failure -- confirm against the eHive documentation).
    $self->input_job->transient_error(0);
    my $genome_db_id = $self->param('genome_db_id');
    die "'genome_db_id' parameter is an obligatory one, please specify" unless $genome_db_id;
    $self->input_job->transient_error(1);

    # Look up the Compara GenomeDB object and keep it for later use.
    my $genome_db = $self->compara_dba->get_GenomeDBAdaptor->fetch_by_dbID($genome_db_id);
    die "Could not fetch genome_db with id=$genome_db_id" unless $genome_db;
    $self->param('genome_db', $genome_db);

    # The GenomeDB supplies the adaptor for the external core database.
    my $core_db = $genome_db->db_adaptor();
    die "Can't connect to genome database for id=$genome_db_id" unless $core_db;
    $self->param('core_db', $core_db);

    # Two subsets are created and stored for this genome: one named
    # "... longest ncRNAs" and one named "... ncRNA genes".

# FIXME: change the fan dataflow branch to 2, allowing branch 1 to output something too
    my $genome_db_name = $genome_db->name;
    my $subset_adaptor = $self->compara_dba->get_SubsetAdaptor;

    my $ncrna_subset = Bio::EnsEMBL::Compara::Subset->new(
        -name => "genome_db_id:${genome_db_id} ${genome_db_name} longest ncRNAs",
    );
    my $gene_subset = Bio::EnsEMBL::Compara::Subset->new(
        -name => "genome_db_id:${genome_db_id} ${genome_db_name} ncRNA genes",
    );

    # Store order is kept: the ncRNA subset first, then the gene subset.
    my $ncrna_subset_id = $subset_adaptor->store($ncrna_subset);
    die "Could not store ncRNA subset" unless $ncrna_subset_id;

    my $gene_subset_id = $subset_adaptor->store($gene_subset);
    die "Could not store gene subset" unless $gene_subset_id;

    # Both subset ids are stashed as parameters; write_output reads them back.
    $self->param('ncrna_subset_id', $ncrna_subset_id);
    $self->param('gene_subset_id',  $gene_subset_id);
}
00094 
00095 
00096 =head2 run
00097 
00098     Iterate through all top-level slices of the corresponding core database and collect ncRNA gene stable_ids
00099 
00100 =cut
00101 
sub run {
    my ($self) = @_;

    # Turn off disconnect_when_inactive on both connections for the
    # duration of the scan.
    $self->compara_dba->dbc->disconnect_when_inactive(0);
    my $core_db = $self->param('core_db');
    $core_db->dbc->disconnect_when_inactive(0);

    # Every top-level slice of the core database is scanned.
    my $slices      = $core_db->get_SliceAdaptor->fetch_all('toplevel');
    my $slice_count = scalar(@$slices);
    print("fetched ", $slice_count, " slices to load from\n");
    die "No toplevel slices, cannot fetch anything" unless $slice_count;

    my @collected_ids;

    for my $slice (@$slices) {
        # Genes are visited in ascending order of their start coordinate.
        my @genes_by_start = sort { $a->start <=> $b->start } @{ $slice->get_all_Genes };

        for my $gene (@genes_by_start) {
            # Only genes whose biotype contains "rna" (case-insensitive) are kept.
            next unless $gene->biotype =~ /rna/i;

            my $stable_id = $gene->stable_id;
            die "Could not get stable_id from gene with id=".$gene->dbID() unless $stable_id;

            push @collected_ids, $stable_id;
        }
    }

    # Hand the collected stable_ids over to write_output.
    $self->param('stable_ids', \@collected_ids);

    # Only the core connection's setting is switched back here; the
    # Compara connection is left as set above.
    $core_db->dbc->disconnect_when_inactive(1);
}
00129 
00130 
00131 =head2 write_output
00132 
00133     Create downstream jobs that will be loading individual ncRNA members
00134 
00135 =cut
00136 
sub write_output {
    my ($self) = @_;

    # Fields that are identical for every output id emitted below.
    # Each param() call is forced into scalar context so that it always
    # contributes exactly one value to the hash.
    my %shared_fields = (
        'genome_db_id'    => scalar( $self->param('genome_db_id') ),
        'ncrna_subset_id' => scalar( $self->param('ncrna_subset_id') ),
        'gene_subset_id'  => scalar( $self->param('gene_subset_id') ),
    );

    my $stable_ids = $self->param('stable_ids');

    # One dataflow_output_id call per collected stable_id; the second
    # argument is 2 (presumably the dataflow branch number -- confirm
    # against the eHive documentation).
    for my $stable_id (@$stable_ids) {
        my %output_id = ( %shared_fields, 'stable_id' => $stable_id );
        $self->dataflow_output_id( \%output_id, 2 );
    }
}
00153 
00154 1;
00155