Archive Ensembl HomeArchive Ensembl Home
LoadUniProtIndex.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006 Bio::EnsEMBL::Compara::RunnableDB::Families::LoadUniProtIndex
00007 
00008 =head1 DESCRIPTION
00009 
00010 This RunnableDB uses 'mfetch' to get the list of UniProt accession numbers and dataflows them, in batches of 'buffer_size' ids, to the actual loading jobs.
00011 
00012 The format of the input_id follows the format of a Perl hash reference.
00013 Examples:
00014   "{'uniprot_source' => 'SWISSPROT', taxon_id=>4932}"      # loads all SwissProt for S.cerevisiae
00015   "{'uniprot_source' => 'SPTREMBL'}"                       # loads all SPTrEMBL Fungi/Metazoa
00016   "{'uniprot_source' => 'SPTREMBL', taxon_id=>4932}"       # loads all SPTrEMBL for S.cerevisiae
00017   "{'uniprot_source' => 'SWISSPROT', 'tax_div' => 'FUN'}"  # loads all SwissProt fungi proteins
00018   "{'uniprot_source' => 'SPTREMBL',  'tax_div' => 'ROD'}"  # loads all SPTrEMBL rodent proteins
00019 
00020 supported keys:
00021   uniprot_source    =>  'SWISSPROT' or 'SPTREMBL'
00022   taxon_id          => <taxon_id>
00023                             optional if one wants to load from a specific species
00024                             if not specified it will load all Fungi/Metazoa from the uniprot_source 
00025   tax_div           => <tax_div>
00026                             optional taxonomic division
00027 
00028 =cut
00029 
00030 package Bio::EnsEMBL::Compara::RunnableDB::Families::LoadUniProtIndex;
00031 
00032 use strict;
00033 
00034 use Bio::EnsEMBL::Compara::Subset;
00035 
00036 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00037 
# Returns a reference to a hash holding the default values of the
# parameters that this module reads through $self->param().
sub param_defaults {
    my %defaults = (
        # 'uniprot' by default, but a specific version that mfetch recognizes may be given instead
        'uniprot_version' => 'uniprot',

        # without an ncbi_taxid filter all Fungi/Metazoa are fetched
        'taxon_id'        => undef,

        # number of uniprot ids handed to each downstream loading job
        'buffer_size'     => 16,

        # metazoa can be split into 6 taxonomic divisions and loaded in parallel
        'tax_div'         => undef,
    );

    return \%defaults;
}
00046 
# Fetches the list of UniProt ids to be loaded and makes sure that a Subset
# describing them exists in the Compara database.
#
# Reads params : uniprot_version, uniprot_source (mandatory), taxon_id, tax_div
# Sets params  : uniprot_ids (array reference returned by mfetch_uniprot_ids)
#                subset_id   (dbID of the found or newly stored Subset)
# Dies if 'uniprot_source' is empty or not set.
sub fetch_input {
    my ($self) = @_;

    my $uniprot_version = $self->param('uniprot_version');
    my $uniprot_source  = $self->param('uniprot_source')
        or die "'uniprot_source' has to be either 'SWISSPROT' or 'SPTREMBL'";

    # The subset name always starts with the source and is extended below
    # with whatever filter is in effect.
    my $subset_name = $uniprot_source;
    my $fetched_ids;

    # disconnect_when_inactive is switched on for the duration of the mfetch
    # calls and off again afterwards (presumably so that an idle database
    # connection is not kept open while mfetch runs -- confirm).
    $self->compara_dba()->dbc->disconnect_when_inactive(1);

    my $taxon_id = $self->param('taxon_id');
    if ($taxon_id) {
        # a single species was requested
        $subset_name .= " ncbi_taxid:$taxon_id";
        $fetched_ids = $self->mfetch_uniprot_ids($uniprot_version, $uniprot_source, $taxon_id);
    }
    else {
        # no species filter: optionally restricted to one taxonomic division
        my $tax_div = $self->param('tax_div');

        $subset_name .= " metazoa";
        if ($tax_div) {
            $subset_name .= ", tax_div:$tax_div";
        }

        $fetched_ids = $self->mfetch_uniprot_ids($uniprot_version, $uniprot_source, '', $tax_div && [ $tax_div ]);
    }
    $self->param('uniprot_ids', $fetched_ids);

    $self->compara_dba()->dbc->disconnect_when_inactive(0);

    # Reuse the subset with this description if there is one, otherwise store a new one.
    my $subset_adaptor = $self->compara_dba()->get_SubsetAdaptor();
    my $subset         = $subset_adaptor->fetch_by_set_description($subset_name);
    if (!$subset) {
        $subset = Bio::EnsEMBL::Compara::Subset->new(-name => $subset_name);
        $subset_adaptor->store($subset);
    }

    $self->param('subset_id', $subset->dbID);
}
00079 
00080 
# Cuts the fetched list of UniProt ids into chunks of at most 'buffer_size'
# ids and dataflows one output id per chunk on branch 2.
#
# Reads params : buffer_size, subset_id, uniprot_source, uniprot_ids
# Each output id is a hash with the keys 'uniprot_source', 'subset_id' and
# 'ids' (array reference holding the ids of the chunk).
#
# Note: the array behind the 'uniprot_ids' param is consumed (emptied) here.
#
# Dies if 'buffer_size' is undefined or smaller than 1: with such a value
# splice() never drains the list, so the loop below would spin forever
# emitting jobs.
sub write_output {
    my $self = shift @_;

    my $buffer_size     = $self->param('buffer_size');
    my $subset_id       = $self->param('subset_id');
    my $uniprot_source  = $self->param('uniprot_source');
    my $uniprot_ids     = $self->param('uniprot_ids');

    # splice() truncates its LENGTH argument to an integer, so do the same
    # before validating; every value that used to work is still accepted.
    my $chunk_size = defined($buffer_size) ? int($buffer_size) : 0;
    if ($chunk_size < 1) {
        die "'buffer_size' has to be a positive integer";
    }

    while (@$uniprot_ids) {
        # @id_buffer is a fresh lexical on every iteration, so a reference to it is safe to hand out
        my @id_buffer = splice(@$uniprot_ids, 0, $chunk_size);
        $self->dataflow_output_id( { 'uniprot_source' => $uniprot_source, 'subset_id' => $subset_id, 'ids' => \@id_buffer }, 2);
    }
}
00094 
00095 
00096 ######################################
00097 #
00098 # subroutines
00099 #
00100 #####################################
00101 
# Runs 'mfetch' once per taxonomic division and collects the ids it prints.
#
# Arguments (positional):
#   $uniprot_version - value passed to mfetch's -d option ('uniprot' or a specific version of it)
#   $uniprot_source  - 'SPTREMBL' (case-insensitive) selects div:PRE, anything else div:STD
#   $taxon_id        - if true, filter by 'txi:<taxon_id>'; otherwise by 'txt:33154' (Fungi/Metazoa)
#   $tax_divs        - optional array reference of taxonomic divisions, each added as a 'txd:' filter.
#                      Defaults to a single run without 'txd:' filter when $taxon_id is given,
#                      and to the six divisions FUN HUM MAM ROD VRT INV otherwise.
#
# Returns an array reference with all ids, in the order mfetch printed them.
# Dies if any of the mfetch invocations produces no output at all.
sub mfetch_uniprot_ids {
    my $self            = shift;
    my $uniprot_version = shift;  # 'uniprot' or a specific version of it
    my $uniprot_source  = shift;  # 'SWISSPROT' or 'SPTREMBL'
    my $taxon_id        = shift;  # assume Fungi/Metazoa if not set
    # the false value 0 stands for "one run without a txd: filter"
    my $tax_divs        = shift || [ $taxon_id ? 0 : qw(FUN HUM MAM ROD VRT INV) ];

    my @filters = ( 'div:'.((uc($uniprot_source) eq 'SPTREMBL') ? 'PRE' : 'STD') );
    if($taxon_id) {
        push @filters, "txi:$taxon_id";
    } else {
        push @filters, "txt:33154"; # anything that belongs to Fungi/Metazoa subtree (clade)
    }

    my @all_ids = ();
    foreach my $txd (@$tax_divs) {
        my $cmd = "mfetch -d $uniprot_version -v av -i '".join('&', @filters).($txd ? "&txd:$txd" : '')."'";
        print("$cmd\n") if($self->debug);
        if( my $output_text = `$cmd` ) {
            # split(' ', ...) skips leading whitespace and treats any run of
            # whitespace as one separator, so doubled blanks or CRLF line ends
            # in the output cannot produce empty-string "ids".
            push @all_ids, split(' ', $output_text);
        } else {
            die "[$cmd] returned nothing, mole server probably down";
        }
    }
    printf("fetched %d ids from %s\n", scalar(@all_ids), $uniprot_source) if($self->debug);
    return \@all_ids;
}
00130 
00131 1;
00132