Archive Ensembl HomeArchive Ensembl Home
FindCoreRegionLength.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::FindCoreRegionLength
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $core_region_length = Bio::EnsEMBL::Compara::RunnableDB::FindCoreRegionLength->new
00018   (
00019    -db         => $db,
00020    -input_id   => $input_id,
00021    -analysis   => $analysis
00022   );
00023 $core_region_length->fetch_input(); #reads from DB
00024 $core_region_length->run();
00025 $core_region_length->output();
00026 $core_region_length->write_output(); #writes to DB
00027 
00028 =cut
00029 
00030 
00031 =head1 DESCRIPTION
00032 
00033 This Analysis will take a protein tree id and calculate the core region length of the alignment.
00034 
00035 =cut
00036 
00037 
00038 =head1 CONTACT
00039 
00040   Contact Thomas Maurel on module implementation/design detail: maurel@ebi.ac.uk
00041   Contact Javier Herrero on Split/partial genes in general: jherrero@ebi.ac.uk
00042 
00043 =cut
00044 
00045 
00046 =head1 APPENDIX
00047 
00048 The rest of the documentation details each of the object methods. 
00049 Internal methods are usually preceded with a _
00050 
00051 =cut
00052 
00053 
00054 package Bio::EnsEMBL::Compara::RunnableDB::FindCoreRegionLength;
00055 
00056 use strict;
00057 
00058 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00059 use List::Util qw[min max];
00060 
00061 sub fetch_input {
00062   my $self = shift @_; 
00063 
00064   my $protein_tree_id      = $self->param('protein_tree_id') or die "'protein_tree_id' is an obligatory parameter";
00065   my $protein_tree_adaptor = $self->compara_dba->get_ProteinTreeAdaptor();
00066       # if fetch_node_by_node_id is insufficient, try fetch_tree_at_node_id
00067   my $protein_tree         = $protein_tree_adaptor->fetch_node_by_node_id($protein_tree_id) or die "Could not fetch protein_tree by id=$protein_tree_id";
00068   $self->param('protein_tree', $protein_tree);
00069   $self->dbc->disconnect_when_inactive(1);
00070 }
00071 
00072 sub run {
00073   my $self = shift @_; 
00074   my $protein_tree = $self->param('protein_tree');
00075   my $threshold      = $self->param('threshold') or die "'threshold' is an obligatory parameter";
00076   my $kingdom = $self->param('kingdom') or '(none)'; 
00077   my @output_ids = (); 
00078   my @perc_pos=();
00079   my $first_loop=0;
00080   my %members=();
00081   my %pos_occupancy=();
00082 
00083 # get all leaves,  all members of the tree
00084 my @aligned_members = @{$protein_tree->get_all_leaves};
00085 #initialize the array for each new tree
00086 my $first_member_string = $aligned_members[0]->alignment_string;
00087 my $alignment_length = length($first_member_string);
00088 #the array is initilise at 0 for each position of the aligned_member
00089 for(my $j=0;$j<$alignment_length;$j++)
00090 {
00091   $perc_pos[$j]=0;
00092 }
00093 
00094 for (my $i = 0; $i < @aligned_members ; $i++) 
00095 {
00096 #get the cigar line of an aligned_member
00097   my $cigar_line=$aligned_members[$i]->cigar_line();
00098 #get the length of the aligned member
00099   my $first_alignment_string = $aligned_members[$i]->alignment_string;
00100   my $alignment_length = length($first_alignment_string);
00101   my $pos=0;
00102   my $final_pos=0;
00103 #create a new array for a members
00104   $members{$i}=[];
00105 ###First step : for each members, getting the coverage of each protein compared to the alignment###
00106 
00107 #get the value and the code of a cigar line using Perl regular expressions
00108 # for 2D50M2M
00109 # in the first loop $value=2 and $code=D
00110 # if there is only one letter for one match or one deletion $code_alone will take this letter and be defined.
00111  while ($cigar_line =~ m/(\d+)([MID])|([MID])/g) 
00112   {
00113     my $value = $1;
00114     my $code  = $2;
00115     my $code_alone=$3;
00116  if ( defined($code_alone) and $code_alone eq "D")
00117     {   
00118       $final_pos=1+$pos;
00119       $members{$i}[$pos]=0;
00120       $pos++;
00121     }   
00122     if (defined($code) and $code eq "D")
00123     {   
00124 #if its a D, do nothing in the array, just increased the position variable
00125       $final_pos=$value+$pos;
00126       while($pos < $final_pos)
00127       {   
00128 #for a deletion (D) the members will have a 0 at the given position
00129         $members{$i}[$pos]=0;
00130         $pos++;
00131       }   
00132     }   
00133     if (defined($code_alone) and $code_alone eq "M")
00134     {   
00135       my $las=$perc_pos[$pos];
00136       $las++;
00137       $final_pos=1+$pos;
00138       $members{$i}[$pos]=1;
00139 $members{$i}[$pos]=1;
00140       $perc_pos[$pos]=$las;
00141       $pos++;
00142     }   
00143     if (defined($code) and $code eq "M")
00144     {   
00145 #if its a M , increased the value at the position gived by $pos
00146       $final_pos=$value+$pos;
00147       while($pos < $final_pos)
00148       {   
00149 #for multiple match , members will have a 1 at each positions given by the number in front of the match (M)
00150 # and perc_pos array will increased the total for each position
00151       my $last=$perc_pos[$pos];
00152         $last ++; 
00153         $perc_pos[$pos]=$last;
00154         $members{$i}[$pos]=1;
00155         $pos++;
00156        }   
00157     }   
00158   }
00159 } 
00160 
00161 #getting the maximum of core position 
00162 my $max=0; 
00163 foreach my $maxpos (@perc_pos)
00164 { 
00165   if ($maxpos>$max)
00166   {
00167     $max=$maxpos;
00168   }
00169 }
00170 
00171 #create a Treshold for example with (@aligned_members*90)/100, the maximum occupancy position will be kept if there is 90% of all members overlaping this positions.
00172 my $COCR_length=0;
00173 my $threshold_T1 = ($max*$threshold)/100;
00174 #Find postions with maximum of position occupancy taking the treshold in account and add it on a hash table.
00175 for(my $j=0;$j<$alignment_length;$j++)
00176 {
00177   if ($perc_pos[$j] >= $threshold_T1)
00178   {
00179     $pos_occupancy{$j}=$perc_pos[$j];
00180     $COCR_length++;
00181   }    
00182 }
00183 
00184 #Push all result into an array
00185       push @output_ids, {
00186       'protein_tree_stable_id' => $protein_tree->stable_id,
00187       'coverage_on_core_regions_length' => $COCR_length,
00188       'number_of_gene' => scalar @aligned_members,
00189       'kingdom' => $kingdom,
00190  };
00191 $self->param('output_ids', \@output_ids);
00192 }
00193 
00194 sub write_output {
00195   my $self = shift @_;
00196  
00197   my $output_ids = $self->param('output_ids');
00198  
00199   $self->dbc->disconnect_when_inactive(0);
00200   $self->dataflow_output_id($output_ids, 3);
00201 }
00202 
00203 
00204 
00205 1;
00206