Archive Ensembl HomeArchive Ensembl Home
TranscriptSNPs.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 TranscriptSNPs - A utility class used to obtain information about the
00024 relationships between a transcript and SNPs
00025 
00026 =head1 SYNOPSIS
00027 
00028   use Bio::EnsEMBL::Utils::TranscriptSNPs;
00029 
00030   # get and type all snps in the region of the transcript
00031 
00032   %snps = %{
00033     Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript,
00034       $flanking ) };
00035 
00036   # get all snps overlapping the transcript in cdna coordinates
00037 
00038   %snps =
00039     %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(
00040       $transcript) };
00041 
00042   # get the peptide variations caused by a set of SNPs
00043 
00044   %variations = %{
00045     Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations(
00046       $transcript, $snps ) };
00047 
00048 =head1 DESCRIPTION
00049 
00050 This is a utility class which can be used to get snps associated with a
00051 transcript, and to determine the amino acid changes caused by the SNPs
00052 
00053 =head1 METHODS
00054 
00055 =cut
00056 
00057 package Bio::EnsEMBL::Utils::TranscriptSNPs;
00058 
00059 use strict;
00060 use warnings;
00061 no warnings 'uninitialized';
00062 
00063 
00064 
00065 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00066 
00067 
00068 =head2 get_all_peptide_variations
00069 
00070   Arg [1]    : $transcript the transcript to obtain the peptide variations for
00071   Arg [2]    : $snps listref of coding snps in cdna coordinates
00072   Example    : $pep_hash = get_all_peptide_variations($transcript, \@snps);
00073   Description: Takes a list of coding snps on this transcript in
00074                which are in cdna coordinates and returns a hash with peptide
00075                coordinate keys and listrefs of alternative amino acids as
00076                values.  The SNPs must additionally have a strand of 1 for the
00077                sake of simplicity.  Normally these could be generated using the
00078                get_all_cdna_SNPs method.
00079 
00080                Note that the peptide encoded by the reference sequence is
00081                also present in the results and that duplicate peptides
00082                (e.g. resulting from synonomous mutations) are discarded.
00083                It is possible to have greated than two peptides variations
00084                at a given location given adjacent or overlapping snps.
00085                Insertion/deletion variations are ignored by this method.
00086                Example of a data structure that could be returned:
00087                {  1  => ['I', 'M'],
00088                  10  => ['I', 'T'],
00089                  37  => ['N', 'D'],
00090                  56  => ['G', 'E'],
00091                  118 => ['R', 'K'],
00092                  159 => ['D', 'E'],
00093                  167 => ['Q', 'R'],
00094                  173 => ['H', 'Q'] }
00095   Returntype : hashref
00096   Exceptions : none
00097   Caller     : general
00098 
00099 =cut
00100 
00101 sub get_all_peptide_variations {
00102   my $transcript = shift;
00103   my $snps = shift;
00104 
00105   if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
00106     throw('Bio::EnsEMBL::Transcript argument is required.');
00107   }
00108 
00109   if(!ref($snps) eq 'ARRAY') {
00110     throw('Reference to a list of Bio::EnsEMBL::SNP objects is required');
00111   }
00112 
00113   my $codon_table = Bio::Tools::CodonTable->new;
00114   my $codon_length = 3;
00115   my $cdna = $transcript->spliced_seq;
00116 
00117   my $variant_alleles;
00118   my $translation_start = $transcript->cdna_coding_start;
00119   foreach my $snp (@$snps) {
00120     #skip variations not on a single base
00121     next if ($snp->start != $snp->end);
00122 
00123     my $start = $snp->start;
00124     my $strand = $snp->strand;
00125 
00126     #calculate offset of the nucleotide from codon start (0|1|2)
00127     my $codon_pos = ($start - $translation_start) % $codon_length;
00128 
00129     #calculate the peptide coordinate of the snp
00130     my $peptide = ($start - $translation_start +
00131            ($codon_length - $codon_pos)) / $codon_length;
00132     
00133     # skip this SNP if it falls in a partial codon
00134     next if $start - $codon_pos + $codon_length > length($cdna);
00135 
00136     #retrieve the codon
00137     my $codon = substr($cdna, $start - $codon_pos-1, $codon_length);
00138 
00139     #store each alternative allele by its location in the peptide
00140     my @alleles = split(/\/|\|/, lc($snp->allele_string));
00141     #my @alleles = split(/\/|\|/, lc($snp->alleles));
00142 
00143     foreach my $allele (@alleles) {
00144       next if $allele eq '-';       #skip deletions
00145       next if CORE::length($allele) != 1; #skip insertions
00146 
00147       #get_all_cdna_SNPs always gives strand of 1 now
00148       #if($strand == -1) {
00149       #  #complement the allele if the snp is on the reverse strand
00150       #  $allele =~ 
00151       #  tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
00152       #}
00153 
00154       #create a data structure of variant alleles sorted by both their
00155       #peptide position and their position within the peptides codon
00156       $variant_alleles ||= {};
00157       if(exists $variant_alleles->{$peptide}) {
00158         my $alleles_arr = $variant_alleles->{$peptide}->[1];
00159         push @{$alleles_arr->[$codon_pos]}, $allele;
00160       } else {
00161         #create a list of 3 lists (one list for each codon position)
00162         my $alleles_arr = [[],[],[]];
00163         push @{$alleles_arr->[$codon_pos]}, $allele;
00164         $variant_alleles->{$peptide} = [$codon, $alleles_arr];
00165       }
00166     }
00167   }
00168 
00169   my %out;
00170   #now generate all possible codons for each peptide and translate them
00171   foreach my $peptide (keys %$variant_alleles) {
00172     my ($codon, $alleles) = @{$variant_alleles->{$peptide}};
00173 
00174     #need to push original nucleotides onto each position
00175     #so that all possible combinations can be generated
00176     push @{$alleles->[0]}, substr($codon,0,1);
00177     push @{$alleles->[1]}, substr($codon,1,1);
00178     push @{$alleles->[2]}, substr($codon,2,1);
00179 
00180     my %alt_amino_acids;
00181     foreach my $a1 (@{$alleles->[0]}) {
00182       substr($codon, 0, 1) = $a1;
00183       foreach my $a2 (@{$alleles->[1]}) {
00184         substr($codon, 1, 1) = $a2;
00185         foreach my $a3 (@{$alleles->[2]}) {
00186           substr($codon, 2, 1) = $a3;
00187           my $aa = $codon_table->translate($codon);
00188           #print "$codon translation is $aa\n";
00189           $alt_amino_acids{$aa} = 1;
00190         }
00191       }
00192     }
00193 
00194     my @aas = keys %alt_amino_acids;
00195     $out{$peptide} = \@aas;
00196   }
00197 
00198   return \%out;
00199 }
00200 
00201 
00202 
00203 =head2 get_all_SNPs
00204 
00205   Arg [1]    : Bio::EnsEMBL::Transcript $transcript
00206 
00207   Arg [2]    : (optional) int $flanking
00208                The number of basepairs of transcript flanking sequence to
00209                retrieve snps from (default 0)
00210   Arg [3]    : $source type of database source (dbSNP, Glovar)
00211   Example    : $snp_hashref = get_all_transcript_SNPs($transcript)
00212   Description: Retrieves all snps found within the region of the 
00213                provided transcript
00214                The snps are returned in a hash with keys corresponding
00215                to the region the snp was found in.  Possible keys are:
00216                'three prime UTR', 'five prime UTR', 'coding', 'intronic',
00217                'three prime flanking', 'five prime flanking'
00218                If no flanking argument is provided no flanking snps will be
00219                obtained.
00220                The listrefs which are the values of the returned hash
00221                contain snps in coordinates of the transcript region 
00222                (i.e. first base = first base of the first exon on the
00223                postive strand - flanking bases + 1)
00224 
00225                Multiple base variations and inserts/deletes are discarded
00226                by this method and not used.
00227 
00228   Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
00229                values
00230   Exceptions : none
00231   Caller     : general
00232 
00233 =cut
00234 
00235 sub get_all_SNPs {
00236   my $transcript = shift;
00237   my $flanking = shift || 0;
00238   my $source = shift;
00239 
00240   if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
00241     throw('Bio::EnsEMBL::Transcript argument required.');
00242   }
00243 
00244   my $slice = $transcript->slice();
00245 
00246   if(!$slice) {
00247     warning("Cannot obtain SNPs for transcript without attached Slice.");
00248     return {};
00249   }
00250 
00251   my $sa = $slice->adaptor();
00252 
00253   if(!$sa) {
00254     warning('Cannot obtain SNPs for transcript unless attached slice ' .
00255             'has attached adaptor');
00256     return {};
00257   }
00258 
00259   my %snp_hash;
00260 
00261   # retrieve slice in the region of the transcript
00262   $slice = $sa->fetch_by_Feature($transcript, $flanking );
00263 
00264   # copy transcript, to work in coord system we are interested in
00265   $transcript = $transcript->transfer( $slice );
00266 
00267   # get all snps in the transcript region
00268   my $snps;
00269   if ($source eq 'glovar') {
00270     $snps = $slice->get_all_ExternalFeatures('GlovarSNP');
00271   }
00272   elsif ($source eq 'variation') {
00273     $snps = $slice->get_all_VariationFeatures;
00274   }
00275   else {
00276     $snps = $slice->get_all_SNPs;   # dont need once use new snp api (i think)
00277   }
00278 
00279   my $trans_start  = $flanking + 1;
00280   my $trans_end    = $slice->length - $flanking;
00281   my $trans_strand = $transcript->get_all_Exons->[0]->strand;
00282 
00283   # classify each snp
00284   foreach my $snp (@$snps) {
00285     my $key;
00286 
00287     if(($trans_strand == 1 && $snp->end < $trans_start) ||
00288        ($trans_strand == -1 && $snp->start > $trans_end)) {
00289       # this snp is upstream from the transcript
00290       $key = 'five prime flanking';
00291     }
00292 
00293     elsif(($trans_strand == 1 && $snp->start > $trans_end) ||
00294       ($trans_strand == -1 && $snp->start < $trans_start)) {
00295       # this snp is downstream from the transcript
00296       $key = 'three prime flanking';
00297     }
00298 
00299     else {
00300       #snp is inside transcript region check if it overlaps an exon
00301       foreach my $e (@{$transcript->get_all_Exons}) {
00302         if($snp->end >= $e->start && $snp->start <= $e->end) {
00303           # this snp is in an exon
00304       
00305           if(($trans_strand == 1 && 
00306               $snp->end < $transcript->coding_region_start) ||
00307              ($trans_strand == -1 && 
00308               $snp->start > $transcript->coding_region_end)) {
00309             # this snp is in the 5' UTR
00310             $key = 'five prime UTR';
00311           }
00312 
00313           elsif(($trans_strand == 1 && 
00314                  $snp->start > $transcript->coding_region_end)||
00315                 ($trans_strand == -1 && 
00316                  $snp->end < $transcript->coding_region_start)) {
00317             # this snp is in the 3' UTR
00318             $key = 'three prime UTR';
00319           }
00320 
00321           else {
00322             # snp is coding
00323             $key = 'coding';
00324           }
00325           last;
00326         }
00327       }
00328       unless($key) {
00329         # snp was not in an exon and is therefore intronic
00330         $key = 'intronic';
00331       }
00332     }
00333 
00334     unless($key) {
00335       #warning('SNP could not be mapped. In/Dels not supported yet...');
00336       next;
00337     }
00338 
00339     if(exists $snp_hash{$key}) {
00340       push @{$snp_hash{$key}}, $snp;
00341     } 
00342     else {
00343       $snp_hash{$key} = [$snp];
00344     }
00345   }
00346 
00347   return \%snp_hash;
00348 }
00349 
00350 
00351 
00352 =head2 get_all_cdna_SNPs
00353 
00354   Arg [1]    : Bio::EnsEMBL::Transcript $transcript
00355   Arg [2]    : $source type of database source (dbSNP, Glovar)
00356   Example    : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs;
00357   Description: Retrieves all snps found within exons of the provided
00358                transcript.
00359                The snps are returned in a hash with three keys corresponding
00360                to the region the snp was found in.  Valid keys are:
00361                'three prime UTR', 'five prime UTR', 'coding'
00362                The listrefs which are the values of the returned hash
00363                contain snps in CDNA coordinates.
00364 
00365                Multiple base variations and insertions/deletions are not
00366                used by this function and are discarded.
00367   Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for 
00368                values
00369   Exceptions : none
00370   Caller     : general
00371 
00372 =cut
00373 
00374 sub get_all_cdna_SNPs {
00375   my ($transcript, $source) = @_;
00376 
00377   #retrieve all of the snps from this transcript
00378   my $all_snps = get_all_SNPs($transcript, 0, $source);
00379   my %snp_hash;
00380 
00381   my @cdna_types = ('three prime UTR', 'five prime UTR','coding');
00382 
00383   my $slice = $transcript->slice();
00384   my $sa    = $slice->adaptor();
00385 
00386   $slice = $sa->fetch_by_Feature($transcript);
00387 
00388   # copy transcript in order to work in coord system of interest
00389   $transcript = $transcript->transfer($slice);
00390 
00391   foreach my $type (@cdna_types) {
00392     $snp_hash{$type} = [];
00393     foreach my $snp (@{$all_snps->{$type}}) {
00394       my @coords = $transcript->genomic2cdna($snp->start, $snp->end,
00395                                              $snp->strand);
00396 
00397       #skip snps that don't map cleanly (possibly an indel...)
00398       if(scalar(@coords) != 1) {
00399         #warning("snp of type $type does not map cleanly\n");
00400         next;
00401       }
00402 
00403       my ($coord) = @coords;
00404 
00405       unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
00406         #warning("snp of type $type maps to gap\n");
00407         next;
00408       }
00409 
00410       my $alleles;
00411       my $ambicode;
00412 
00413       # get alleles and ambig_code (with fallback to old snp API)
00414       $alleles = $snp->allele_string || $snp->{'alleles'};
00415       $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'};
00416 
00417       #we arbitrarily put the SNP on the +ve strand because it is easier to
00418       #work with in the webcode
00419       if($coord->strand == -1) {
00420         $alleles =~
00421          tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
00422         $ambicode =~
00423          tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
00424       }
00425       #copy the snp and convert to cdna coords...
00426       my $new_snp;
00427       %$new_snp = %$snp;
00428       bless $new_snp, ref $snp;
00429       $new_snp->start($coord->start);
00430       $new_snp->end($coord->end);
00431       $new_snp->strand(1);
00432       $new_snp->allele_string($alleles);
00433       $new_snp->ambig_code($ambicode);
00434       push @{$snp_hash{$type}}, $new_snp;
00435     }
00436   }
00437 
00438   return \%snp_hash;
00439 }
00440 
00441 
00442 
00443 
00444 1;