Archive Ensembl HomeArchive Ensembl Home
StoreMembersSequence.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006 Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::FromScratch::StoreMembersSequence
00007 
00008 =head1 DESCRIPTION
00009 
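00010 This Runnable reads the sequences of one species from a file, stores one gene member and one peptide member per sequence (adding them to the species' gene and translation subsets), and passes on the genome_db_id and species_name.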
00011 
00012 The format of the input_id follows the format of a Perl hash reference.
00013 Examples:
00014     { 'species_name' => 'Homo sapiens', 'assembly_name' => 'GRCh37' }
00015     { 'species_name' => 'Mus musculus' }
00016 
00017 supported keys:
00018     'locator'       => <string>
00019         one of the ways to specify the connection parameters to the core database (overrides 'species_name' and 'assembly_name')
00020 
00021     'registry_dbs'  => <list_of_dbconn_hashes>
00022         another, simple way to specify the genome_db (and let the registry search across multiple mysql instances to do the rest)
00023     'species_name'  => <string>
00024         mandatory, but what would you expect?
00025 
00026     'first_found'   => <0|1>
00027         optional, defaults to 0.
00028         Defines whether we emulate (to a certain extent) the behaviour of load_registry_from_multiple_dbs
00029         or try the last one that still fits (this would allow trying ens-staging[12] *first*, and only then checking whether ens-livemirror has a suitable copy).
00030 
00031     'assembly_name' => <string>
00032         optional: in most cases it should be possible to find the species just by using 'species_name'
00033 
00034     'genome_db_id'  => <integer>
00035         optional, in case you want to specify it (otherwise it will be generated by the adaptor when storing)
00036 
00037     'pseudo_stableID_prefix' => <string>
00038         optional?, see 'GenomeLoadMembers.pm', 'GenomeLoadReuseMembers.pm', 'GeneStoreNCMembers.pm', 'GenomePrepareNCMembers.pm'
00039 
00040     'ensembl_genomes' => <0|1>
00041         optional, sets the preferential order of precedence of species_name sources, depending on whether the module is run by EG or Compara
00042 
00043 =cut
00044 
00045 package Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::FromScratch::StoreMembersSequence;
00046 
00047 use strict;
00048 use Bio::Perl;
00049 use Bio::EnsEMBL::Registry;
00050 use Bio::EnsEMBL::DBLoader;
00051 use Bio::EnsEMBL::Compara::GenomeDB;
00052 use Bio::EnsEMBL::Compara::Member;
00053 use Bio::EnsEMBL::Compara::Subset;
00054 
00055 use Data::Dumper;
00056 
00057 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00058 
00059 
# fetch_input
#
# Prepares everything run() needs and caches it as job parameters:
#   'member_adaptor' - the MemberAdaptor of the Compara database
#   'genome_db_id'   - dbID of the GenomeDB fetched by 'ncbi_taxon_id'
#   'pepSubset'      - Subset named "gdb:<genome_db_id> <species_name> translations"
#   'geneSubset'     - Subset named "gdb:<genome_db_id> <species_name> genes"
#   'subset_adaptor' - the SubsetAdaptor of the Compara database
#
# Input parameters read: 'ncbi_taxon_id', 'species_name'.
#
# Dies with an explicit message when no GenomeDB can be fetched for the given
# taxon id, instead of failing on a method call on an undefined value.
sub fetch_input {
    my $self = shift @_;

    # Adaptors
    my $compara_dba = $self->compara_dba();
    $self->param('member_adaptor', $compara_dba->get_MemberAdaptor());

    # Resolve the GenomeDB first and check the result before asking for its dbID.
    my $taxon_id  = $self->param('ncbi_taxon_id');
    my $genome_db = $compara_dba->get_GenomeDBAdaptor->fetch_by_taxon_id($taxon_id);
    unless (defined $genome_db) {
        die "Could not fetch a GenomeDB for ncbi_taxon_id '"
            . (defined $taxon_id ? $taxon_id : 'undef')
            . "'";
    }
    $self->param('genome_db_id', $genome_db->dbID);

    # Both subsets share the same "gdb:<id> <species>" name prefix.
    my $subset_prefix = "gdb:" . ($self->param('genome_db_id')) . " " . ($self->param('species_name'));
    $self->param('pepSubset',  Bio::EnsEMBL::Compara::Subset->new(-name => ($subset_prefix . ' translations')));
    $self->param('geneSubset', Bio::EnsEMBL::Compara::Subset->new(-name => ($subset_prefix . ' genes')));

    # NOTE(review): the subsets are stored here, while still empty, and members
    # are added to them later in run(); presumably the stored subset is required
    # for add_member() to persist the links -- confirm against the Subset API.
    $self->param('subset_adaptor', $compara_dba->get_SubsetAdaptor());
    $self->param('subset_adaptor')->store($self->param('pepSubset'));
    $self->param('subset_adaptor')->store($self->param('geneSubset'));

}
00077 
# run
#
# Reads all sequences from the file "<data_dir>/<filename>" and, for each one,
# creates and stores:
#   - a gene member    (stable_id "GENE<taxon_id><index>", source ENSEMBLGENE)
#   - a peptide member (stable_id "PEPT<taxon_id><index>", source ENSEMBLPEP)
# adds them to 'geneSubset' / 'pepSubset' respectively, and links the two
# through the member adaptor.
#
# Parameters read: 'genome_db_id', 'member_adaptor', 'geneSubset', 'pepSubset',
# 'ncbi_taxon_id', 'species_name', 'data_dir', 'filename'.
sub run {
    my ($self) = @_;

    my $genome_db_id   = $self->param('genome_db_id');
    my $member_adaptor = $self->param('member_adaptor');
    my $taxon_id       = $self->param('ncbi_taxon_id');
    my $gene_subset    = $self->param('geneSubset');
    my $pep_subset     = $self->param('pepSubset');

    # FASTA file
    my $fasta_path = join("/", $self->param('data_dir'), $self->param('filename'));
    my @records    = read_all_sequences($fasta_path);
    if ($self->debug) {
        print scalar(@records), " sequences read in ", $self->param('filename'), " (taxon_id: ", $taxon_id, " taxon_name: ", $self->param('species_name'), ")\n";
    }

    # Builds a member carrying the fields common to the gene and the peptide.
    my $new_member = sub {
        my ($id_format, $source_name, $record, $index, $label) = @_;

        my $member = Bio::EnsEMBL::Compara::Member->new();
        $member->stable_id(sprintf($id_format, $taxon_id, $index));
        $member->display_label($label);
        $member->source_name($source_name);
        $member->taxon_id($taxon_id);
        $member->description($record->id." ".$record->desc);
        $member->genome_db_id($genome_db_id);
        return $member;
    };

    my $index = 0;
    foreach my $record (@records) {

        $index++;

        # The display label is the second ':'-separated field of the sequence id.
        my @id_fields = split(":", $record->id);
        my $label     = $id_fields[1];

        if ($self->debug > 1) {
            print "sequence $index: name ", $record->id, "\n";
            print "sequence $index: description ", $record->desc, "\n";
            print "sequence $index: length ", $record->length, "\n";
        }

        my $gene_member = $new_member->("GENE%06d%06d", "ENSEMBLGENE", $record, $index, $label);
        $member_adaptor->store($gene_member);
        $gene_subset->add_member($gene_member);

        my $pep_member = $new_member->("PEPT%06d%06d", "ENSEMBLPEP", $record, $index, $label);
        # Every 'O' residue is replaced by 'X' before the sequence is stored.
        (my $residues = $record->seq) =~ s/O/X/g;
        $pep_member->sequence($residues);
        $member_adaptor->store($pep_member);
        $pep_subset->add_member($pep_member);

        $member_adaptor->store_gene_peptide_link($gene_member->dbID, $pep_member->dbID);

    }

    print $gene_subset->count(), " genes and ", $pep_subset->count(), " peptides in subsets\n" if ($self->debug);
}
00128 
# write_output
#
# Dataflow step: emits one output id holding the 'genome_db_id' and
# 'species_name' parameters of this job on branch 1.
sub write_output {
    my ($self) = @_;

    my $output_id = {
        'genome_db_id' => $self->param('genome_db_id'),
        'species_name' => $self->param('species_name'),
    };

    $self->dataflow_output_id($output_id, 1);
}
00135 
00136 
00137 1;
00138