Archive Ensembl HomeArchive Ensembl Home
FindPartialGenesOnTree.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::FindPartialGenesOnTree
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 
00017 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00018 my $find_partial_genes = Bio::EnsEMBL::Compara::RunnableDB::FindPartialGenesOnTree->new
00019   (
00020    -db         => $db,
00021    -input_id   => $input_id,
00022    -analysis   => $analysis
00023   );
00024 $find_partial_genes->fetch_input(); #reads from DB
00025 $find_partial_genes->run();
00026 $find_partial_genes->output();
00027 $find_partial_genes->write_output(); #writes to DB
00028 
00029 =cut
00030 
00031 
00032 =head1 DESCRIPTION
00033 
00034 This Analysis will take a protein tree id and calcul the coverage on core region score and  alignment overlap score, 
00035 in order to find possible partial gene of a tree.
00036 
00037 =cut
00038 
00039 
00040 =head1 CONTACT
00041 
00042   Contact Thomas Maurel on module implementation/design detail: maurel@ebi.ac.uk
00043   Contact Javier Herrero on Split/partial genes in general: jherrero@ebi.ac.uk
00044 
00045 =cut
00046 
00047 =head1 APPENDIX
00048 
00049 The rest of the documentation details each of the object methods. 
00050 Internal methods are usually preceded with a _
00051 
00052 =cut
00053 
00054 
00055 package Bio::EnsEMBL::Compara::RunnableDB::FindPartialGenesOnTree;
00056 
00057 use strict;
00058 
00059 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00060 use List::Util qw[min max];
00061 
00062 sub fetch_input {
00063   my $self = shift @_; 
00064 
00065   my $protein_tree_id      = $self->param('protein_tree_id') or die "'protein_tree_id' is an obligatory parameter";
00066   my $protein_tree_adaptor = $self->compara_dba->get_ProteinTreeAdaptor();
00067       # if fetch_node_by_node_id is insufficient, try fetch_tree_at_node_id
00068   my $protein_tree         = $protein_tree_adaptor->fetch_node_by_node_id($protein_tree_id) or die "Could not fetch protein_tree by id=$protein_tree_id";
00069   $self->param('protein_tree', $protein_tree);
00070   $self->dbc->disconnect_when_inactive(1);
00071 }
00072 
00073 sub run {
00074   my $self = shift @_; 
00075   my $protein_tree = $self->param('protein_tree');
00076   my $threshold = $self->param('threshold') or die "'threshold' is an obligatory parameter";
00077   my $kingdom = $self->param('kingdom') or '(none)';
00078   my @output_ids = (); 
00079   my @perc_pos=();
00080   my $first_loop=0;
00081   my %members=();
00082   my %pos_occupancy=();
00083 
00084 # get all leaves,  all members of the tree
00085 my @aligned_members = @{$protein_tree->get_all_leaves};
00086 #initialize the array for each new tree
00087 my $first_member_string = $aligned_members[0]->alignment_string;
00088 my $alignment_length = length($first_member_string);
00089 #the array is initilise at 0 for each position of the aligned_member
00090 for(my $j=0;$j<$alignment_length;$j++)
00091 {
00092   $perc_pos[$j]=0;
00093 }
00094 
00095 for (my $i = 0; $i < @aligned_members ; $i++) 
00096 {
00097 #get the cigar line of an aligned_member
00098   my $cigar_line=$aligned_members[$i]->cigar_line();
00099 #get the length of the aligned member
00100   my $first_alignment_string = $aligned_members[$i]->alignment_string;
00101   my $alignment_length = length($first_alignment_string);
00102   my $pos=0;
00103   my $final_pos=0;
00104 #create a new array for a members
00105   $members{$i}=[];
00106 ###First step : for each members, getting the coverage of each protein compared to the alignment###
00107 
00108 #get the value and the code of a cigar line using Perl regular expressions
00109 # for 2D50M2M
00110 # in the first loop $value=2 and $code=D
00111 # if there is only one letter for one match or one deletion $code_alone will take this letter and be defined.
00112  while ($cigar_line =~ m/(\d+)([MID])|([MID])/g) 
00113   {
00114     my $value = $1;
00115     my $code  = $2;
00116     my $code_alone=$3;
00117  if ( defined($code_alone) and $code_alone eq "D")
00118     {   
00119       $final_pos=1+$pos;
00120       $members{$i}[$pos]=0;
00121       $pos++;
00122     }   
00123     if (defined($code) and $code eq "D")
00124     {   
00125 #if its a D, do nothing in the array, just increased the position variable
00126       $final_pos=$value+$pos;
00127       while($pos < $final_pos)
00128       {   
00129 #for a deletion (D) the members will have a 0 at the given position
00130         $members{$i}[$pos]=0;
00131         $pos++;
00132       }   
00133     }   
00134     if (defined($code_alone) and $code_alone eq "M")
00135     {   
00136       my $las=$perc_pos[$pos];
00137       $las++;
00138       $final_pos=1+$pos;
00139       $members{$i}[$pos]=1;
00140 $members{$i}[$pos]=1;
00141       $perc_pos[$pos]=$las;
00142       $pos++;
00143     }   
00144     if (defined($code) and $code eq "M")
00145     {   
00146 #if its a M , increased the value at the position gived by $pos
00147       $final_pos=$value+$pos;
00148       while($pos < $final_pos)
00149       {   
00150 #for multiple match , members will have a 1 at each positions given by the number in front of the match (M)
00151 # and perc_pos array will increased the total for each position
00152       my $last=$perc_pos[$pos];
00153         $last ++; 
00154         $perc_pos[$pos]=$last;
00155         $members{$i}[$pos]=1;
00156         $pos++;
00157        }   
00158     }   
00159   }
00160 } 
00161 
00162 #getting the maximum of core position 
00163 my $max=0; 
00164 foreach my $maxpos (@perc_pos)
00165 { 
00166   if ($maxpos>$max)
00167   {
00168     $max=$maxpos;
00169   }
00170 }
00171 
00172 #create a Treshold for example with (@aligned_members*90)/100, the maximum occupancy position will be kept if there is 90% of all members overlaping this positions.
00173 my $threshold_T1 = ($max*$threshold)/100;
00174 #Find postions with maximum of position occupancy taking the treshold in account and add it on a hash table.
00175 for(my $j=0;$j<$alignment_length;$j++)
00176 {
00177   if ($perc_pos[$j] >= $threshold_T1)
00178   {
00179     $pos_occupancy{$j}=$perc_pos[$j];
00180   }    
00181 }
00182 
00183 #Getting the alignment overlap score for each genes
00184 #the alignment overlap score i = average(i!=j)(intersection ij/length j)
00185 my @alignment_overlap_score=();
00186 for (my $i=0;$i<@aligned_members;$i++)
00187 {
00188   my $final_score=0;
00189   for (my $j=0;$j<@aligned_members;$j++)
00190   {
00191     if($i==$j)
00192     {
00193       next;
00194     }
00195     my $alignment_overlap_score =0;
00196     my $intersection=0;
00197     my $length=0;
00198     for (my $p=0; $p<@{$members{$j}};$p++)
00199     {
00200     if ($members{$i}[$p]==1 and $members{$j}[$p]==1)
00201     {
00202     $intersection++;
00203     }
00204     if ($members{$j}[$p]==1)
00205     {
00206     $length++;
00207     }
00208     }
00209 # alignment overlap score is the intersection over length
00210     $alignment_overlap_score=$intersection/$length;
00211 #add previous score
00212     $final_score=$final_score+$alignment_overlap_score;
00213   }
00214 #final score is the average of score over number total of members
00215   $alignment_overlap_score[$i]=$final_score/(@aligned_members-1);
00216 }
00217 
00218 #Now check for each aligned member at the occupancy position if there is an overlap
00219 for (my $l=0; $l<@aligned_members; $l++)
00220 {
00221   my $total_occupancy=0;
00222   my $member_occupancy=0;
00223   my $coverage_on_core_region=0;
00224   my $alignment_overlap_score=0;
00225 #foreach occupancy positions
00226   foreach my $pos (keys %pos_occupancy)
00227   {
00228     $total_occupancy++;
00229 #if match between member and occupancy position
00230     if (defined $members{$l}[$pos])
00231     {
00232       if ($members{$l}[$pos] eq 1)
00233       {
00234         $member_occupancy++;
00235       }
00236     }
00237   }
00238   if ($total_occupancy!=0){
00239     $coverage_on_core_region=($member_occupancy*100)/$total_occupancy;
00240   }
00241 #Push all result into an array
00242       push @output_ids, {
00243       'gene_stable_id' => $aligned_members[$l]->gene_member ? $aligned_members[$l]->gene_member->stable_id : 'protein_member_id='.$aligned_members[$l]->dbID(),
00244       'protein_tree_stable_id' => $protein_tree->stable_id,
00245       'coverage_on_core_regions_score' => $coverage_on_core_region,
00246       'alignment_overlap_score' => $alignment_overlap_score[$l],
00247       'species_name' => $aligned_members[$l]->genome_db->name,
00248       'kingdom' => $kingdom,
00249  };
00250 
00251 #  push @output_ids, {
00252 #    'gene_stable_id' => $aligned_members[$l]->gene_member->stable_id,
00253 #      'coverage_on_core_regions_score' => $coverage_on_core_region,
00254 #      'species_name' => $aligned_members[$l]->genome_db->name,
00255 #  };  
00256 
00257 }
00258 $self->param('output_ids', \@output_ids);
00259 }
00260 
00261 sub write_output {
00262   my $self = shift @_;
00263  
00264   my $output_ids = $self->param('output_ids');
00265  
00266   $self->dbc->disconnect_if_idle();
00267   #$self->dbc->disconnect_when_inactive(0);
00268   $self->dataflow_output_id($output_ids, 3);
00269 }
00270 
00271 
00272 
00273 1;
00274