Archive Ensembl HomeArchive Ensembl Home
Translation.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005   This software is distributed under a modified Apache license.
00006   For license details, please see
00007     http://www.ensembl.org/info/about/code_licence.html
00008 =head1 CONTACT
00009   Please email comments or questions to the public Ensembl
00010   developers list at <dev@ensembl.org>.
00011   Questions may also be sent to the Ensembl help desk at
00012   <helpdesk@ensembl.org>.
00013 
00014 =cut
00015 =head1 NAME
00016 =head1 SYNOPSIS
00017 =head1 DESCRIPTION
00018 =head1 METHODS
00019 =cut
00020 
00021 package Bio::EnsEMBL::Utils::VegaCuration::Translation;
00022 use strict;
00023 use warnings;
00024 use vars qw(@ISA);
00025 use Data::Dumper;
00026 use Bio::EnsEMBL::Utils::VegaCuration::Transcript;
00027 @ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Transcript);
00028 
00029 =head2 check_CDS_start_end_remarks
00030 
00031    Args       : B::E::Transcript
00032    Example    : my $results = $support->check_CDS_end_remarks($transcript)
00033    Description: identifies incorrect 'CDS end...' transcript remarks in a
00034                 otter-derived Vega database
00035    Returntype : hashref
00036 
00037 =cut
00038 
00039 sub check_CDS_start_end_remarks {
00040   my $self = shift; 
00041   my $trans = shift;
00042   # info for checking
00043   my @remarks = @{$trans->get_all_Attributes('remark')};
00044   my $coding_end   = $trans->cdna_coding_end;
00045   my $coding_start = $trans->cdna_coding_start;
00046   my $trans_end    = $trans->length;
00047   my $trans_seq    = $trans->seq->seq;
00048   my $stop_codon   = substr($trans_seq, $coding_end-3, 3);
00049   my $start_codon  = substr($trans_seq, $coding_start-1, 3);
00050   #hashref to return results
00051   my $results;
00052   
00053   #extra CDS end not found remarks
00054   if (grep {$_->value eq 'CDS end not found'} @remarks) {
00055     if (   ($coding_end != $trans_end) 
00056          && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
00057       $results->{'END_EXTRA'} = 1;
00058     }
00059   }
00060   #missing CDS end not found remark
00061   if ( $coding_end == $trans_end ) {
00062     if (! grep {$_->value eq 'CDS end not found'} @remarks) {
00063       if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
00064     $results->{'END_MISSING_2'} = 1;
00065       }
00066       else {
00067     $results->{'END_MISSING_1'} = $stop_codon;
00068       }
00069     }
00070   }
00071   #extra CDS start not found remark
00072   if (grep {$_->value eq 'CDS start not found'} @remarks) {
00073     if (   ($coding_start != 1) 
00074          && ($start_codon eq 'ATG') ) {
00075       $results->{'START_EXTRA'} = 1;
00076     }
00077   }
00078   #missing CDS start not found remark
00079   if ( $coding_start == 1) {
00080     if ( ! grep {$_->value eq 'CDS start not found'} @remarks) {
00081       if ($start_codon eq 'ATG') {
00082     $results->{'START_MISSING_2'} = 1;
00083       } else {
00084     $results->{'START_MISSING_1'} = $start_codon;
00085       }
00086     }
00087   }
00088   return $results;
00089 }
00090 
00091 =head2 check_CDS_start_end_remarks_loutre
00092 
00093    Args       : B::E::Transcript
00094    Example    : my $results = $support->check_CDS_end_remarks($transcript)
00095    Description: identifies incorrect 'CDS end...' transcript attribs in a loutre
00096                 of a loutre-derived Vega database.
00097    Returntype : hashref
00098 
00099 =cut
00100 
00101 sub check_CDS_start_end_remarks_loutre {
00102   my $self = shift;
00103   my $trans = shift;
00104 
00105   # info for checking
00106   my @stops = qw(TGA TAA TAG);
00107   my %attributes;
00108   foreach my $attribute (@{$trans->get_all_Attributes()}) {
00109     push @{$attributes{$attribute->code}}, $attribute;
00110   }
00111 #  warn $trans->stable_id;
00112 #  warn Data::Dumper::Dumper(\%attributes);
00113   my $coding_end   = $trans->cdna_coding_end;
00114   my $coding_start = $trans->cdna_coding_start;
00115   my $trans_end    = $trans->length;
00116   my $trans_seq    = $trans->seq->seq;
00117   my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
00118   my $initial_exon_phase = $trans->translation->start_Exon->phase;
00119   my $stop_codon   = substr($trans_seq, $coding_end-3, 3);
00120   my $start_codon  = substr($trans_seq, $coding_start-1, 3);
00121 
00122   my $start_codon_incorrect = 1;
00123   if ($start_codon eq 'ATG' ) {
00124     $start_codon_incorrect = 0;
00125   }
00126   elsif ($start_codon eq 'CTG') {
00127     foreach my $attrib (@{$attributes{'remark'}}) {
00128       if ($attrib->value =~ /non[- ]ATG start/) {
00129     $start_codon_incorrect = 0;
00130       }
00131     }
00132   }
00133 
00134 #  warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
00135 
00136   #hashref to return results
00137   my $results;
00138 
00139   #extra CDS end not found remarks
00140   if ($attributes{'cds_end_NF'}) {
00141     if ( ($attributes{'cds_end_NF'}->[0]->value == 1)
00142        && ($coding_end != $trans_end) 
00143        && ( grep {$_ eq $stop_codon} @stops) ) {
00144 #      warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
00145 #      warn $trans->translation->end_Exon->end_phase;
00146       $results->{'END_EXTRA'} = $stop_codon;
00147     }
00148   }
00149   #missing CDS end not found remark
00150   if ( $coding_end == $trans_end ) {
00151     if ($attributes{'cds_end_NF'}) {
00152       if ($attributes{'cds_end_NF'}->[0]->value == 0 ) {
00153     if (! grep {$_ eq $stop_codon} @stops) {
00154 #     warn $trans->translation->end_Exon->end_phase;
00155       $results->{'END_MISSING'}{'WRONG'} = $stop_codon;
00156     }
00157       }
00158     }
00159     elsif (! grep {$_ eq $stop_codon} @stops) {
00160       $results->{'END_MISSING'}{'ABSENT'} = $stop_codon;
00161     }
00162   }
00163   #extra CDS start not found remark 
00164   if ( $attributes{'cds_start_NF'}) {
00165     if (   ($attributes{'cds_start_NF'}->[0]->value == 1 )
00166         && (! $start_codon_incorrect)) {
00167       unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
00168     $results->{'START_EXTRA'} = $start_codon;
00169       }
00170     }
00171   }
00172   #missing CDS start not found remark
00173   if ( $coding_start == 1) {
00174     if ( $attributes{'cds_start_NF'} ) {
00175       if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) {
00176     if ($start_codon_incorrect) {
00177       $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
00178     }
00179     elsif ($initial_exon_phase > 0) {
00180       $results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase;
00181     }
00182       }
00183     }
00184     elsif ($start_codon ne 'ATG') {
00185       $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
00186     }
00187 
00188   }
00189   return $results;
00190 }
00191 
00192 =head2 get_havana_seleno_comments
00193 
00194    Args       : none
00195    Example    : my $results = $support->get_havana_seleno_comments
00196    Description: parses the HEREDOC containing Havana comments in this module
00197    Returntype : hashref
00198 
00199 =cut
00200 
00201 sub get_havana_seleno_comments {
00202   my $seen_translations;
00203   while (<DATA>) {
00204     next if /^\s+$/ or /#+/;
00205     my ($obj,$comment) = split /=/;
00206     $obj =~ s/^\s+|\s+$//g;
00207     $comment =~ s/^\s+|\s+$//g;
00208     # We add the origin as now "seen" can come from a number of places, and have
00209     # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
00210     $seen_translations->{$obj} = [ $comment,"notsec-havana" ];
00211   }
00212   return $seen_translations;
00213 }
00214 
00215 sub check_for_stops {
00216   my $support = shift;
00217   my ($gene,$seen_transcripts,$log_object) = @_;
00218   my $transcripts;
00219   my $has_log_object=defined($log_object);
00220   if($has_log_object){
00221     my @help = $log_object->species_params->get_trans($gene->stable_id);
00222     $transcripts=\@help;
00223   }else{
00224     $log_object=$support;
00225     $transcripts=$gene->get_all_Transcripts;
00226   }
00227 
00228   my $gname = $gene->get_all_Attributes('name')->[0]->value;
00229   my $gsi = $gene->stable_id;
00230   my $scodon = 'TGA';
00231   my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
00232   my $hidden_remak_ttributes;
00233  TRANS:
00234   foreach my $trans (@$transcripts) {
00235     my $tsi = $trans->stable_id;
00236     my $tID = $trans->dbID;
00237     my $tname = $trans->get_all_Attributes('name')->[0]->value;
00238     if($has_log_object){
00239       $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
00240     }else{
00241       $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
00242     }
00243     foreach my $rem (@$hidden_remak_ttributes) {
00244       if ($rem->value =~ /not_for_Vega/) {
00245         #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
00246         $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
00247         next TRANS;
00248       }
00249     }
00250 
00251     # go no further if there is a ribosomal framshift attribute
00252     foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
00253       if ($attrib->value) {
00254         $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
00255         next TRANS;
00256       }
00257     }
00258 
00259     #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
00260     $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
00261     my $peptide;
00262         
00263     # go no further if the transcript doesn't translate or if there are no stops
00264     next TRANS unless ($peptide = $trans->translate);
00265     my $pseq = $peptide->seq;
00266     my $orig_seq = $pseq;
00267     # (translate method trims stops from sequence end)
00268 
00269     next TRANS unless ($pseq =~ /\*/);
00270     # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
00271     #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
00272     $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");
00273 
00274     # find out where and how many stops there are
00275     my @found_stops;
00276     my $mrna   = $trans->translateable_seq;
00277     my $offset = 0;
00278     my $tstop;
00279     while ($pseq =~ /^([^\*]*)\*(.*)/) {
00280       my $pseq1_f = $1;
00281       $pseq = $2;
00282       my $seq_flag = 0;
00283       $offset += length($pseq1_f) * 3;
00284       my $stop = substr($mrna, $offset, 3);
00285       my $aaoffset = int($offset / 3)+1;
00286       push(@found_stops, [ $stop, $aaoffset ]);
00287       $tstop .= "$aaoffset ";
00288       $offset += 3;
00289     }
00290         
00291     # are all stops TGA...?
00292     my $num_stops = scalar(@found_stops);
00293     my $num_tga = 0;
00294     my $positions;
00295     foreach my $stop (@found_stops) {
00296       $positions .= $stop->[0]."(".$stop->[1].") ";
00297       if ($stop->[0] eq $scodon) {
00298     $num_tga++;
00299       }
00300     }
00301     my $source = $gene->source;
00302     #...no - an internal stop codon error in the database...
00303     if ($num_tga < $num_stops) {
00304       if ($source eq 'havana') {
00305         #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
00306         $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
00307       }
00308       else {
00309         #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
00310         $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");  
00311       }
00312     }
00313     #...yes - check remarks
00314     else {
00315       my $flag_remark  = 0; # 1 if word seleno has been used
00316       my $flag_remark2 = 0; # 1 if existing remark has correct numbering
00317       my $alabel       = 'Annotation_remark- selenocysteine ';
00318       my $alabel2      = 'selenocysteine ';
00319       my $annot_stops;
00320       my $remarks;
00321       my $att;
00322       #get both hidden_remarks and remarks
00323       foreach my $remark_type ('remark','hidden_remark') {
00324         if($has_log_object){
00325           $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
00326         }else{
00327           $att=$trans->get_all_Attributes($remark_type)
00328         }
00329         foreach my $attrib ( @$att) {
00330           push @{$remarks->{$remark_type}}, $attrib->value;
00331         }
00332       }
00333       #parse remarks to check syntax for location of edits
00334       while (my ($attrib,$remarks)= each %$remarks) {
00335         foreach my $text (@{$remarks}) {                    
00336           if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
00337             #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
00338             $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");        
00339             $annot_stops=$1;
00340           }
00341           elsif ($text =~ /^$alabel2(.*)/) {
00342             my $maybe = $1;
00343             if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
00344               $annot_stops=$maybe;
00345             } else {
00346               $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
00347                                                                                 " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
00348             }
00349           }
00350         }
00351       }
00352 
00353       #check the location of the annotated edits matches actual stops in the sequence
00354       my @annotated_stops;
00355       if ($annot_stops){
00356         my $i = 0;
00357         foreach my $offset (split(/\s+/, $annot_stops)) {
00358           #OK if it matches a known stop
00359           if (
00360             defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
00361             push  @annotated_stops, $offset;
00362           }
00363           # catch old annotations where number was in DNA not peptide coordinates
00364           elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
00365             $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
00366           }
00367           # catch old annotations where number off by one
00368           elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
00369             $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
00370           }
00371           elsif (defined($offset)  && ($offset=~/^\d+$/)){
00372             if ($offset == length($orig_seq)+1) {
00373               if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
00374                 $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
00375               } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
00376                 $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
00377               } else {
00378                 $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
00379               }
00380             }
00381             else {
00382               $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
00383               push  @annotated_stops, $offset;
00384             }
00385           }                     
00386           $i++;
00387         }
00388       }
00389 
00390       #check location of found stops matches annotated ones - any new ones are reported
00391       foreach my $stop ( @found_stops ) {
00392         my $pos = $stop->[1];
00393         my $seq = $stop->[0];
00394         unless ( grep { $pos == $_} @annotated_stops) {
00395           if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
00396             #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
00397             $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
00398           }
00399           else {
00400             #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
00401             $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
00402           }
00403         }
00404       }
00405     }
00406   }
00407 }
00408 sub _save_log{
00409   my $self=shift;
00410   my $log_type = shift;
00411   my $chrom_name=shift || '';
00412   my $gsi=shift || '';
00413   my $type=shift || '';  
00414   my $tsi=shift || '';  
00415   my $tag=shift || '';
00416   my $txt=shift || '';
00417   $self->$log_type($txt."\n");
00418 }
00419 
00420 #details of annotators comments
00421 __DATA__
00422 OTTHUMT00000144659 = FIXED- changed to transcript
00423 OTTHUMT00000276377 = FIXED- changed to transcript
00424 OTTHUMT00000257741 = FIXED- changed to nmd
00425 OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
00426 OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
00427 OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
00428 OTTHUMT00000285227 = FIXED- changed start site
00429 OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
00430 OTTHUMT00000157999 = FIXED- changed incorrect stop
00431 OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
00432 OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
00433 OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
00434 OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
00435 OTTHUMT00000246819 = FIXED- corrected frame
00436 OTTHUMT00000314078 = FIXED- corrected frame
00437 OTTHUMT00000080133 = FIXED- corrected frame
00438 OTTHUMT00000286423 = FIXED- changed to transcript
00439 OTTMUST00000055509 = FIXED- error
00440 OTTMUST00000038729 = FIXED- corrected frame
00441 OTTMUST00000021760 = FIXED- corrected frame
00442 OTTMUST00000023057 = FIXED- corrected frame
00443 OTTMUST00000015207 = FIXED- corrected frame
00444 OTTMUST00000056646 = FIXED- error
00445 OTTMUST00000059686 = FIXED- corrected frame
00446 OTTMUST00000013426 = FIXED- corrected frame
00447 OTTMUST00000044412 = FIXED- error