Archive Ensembl HomeArchive Ensembl Home
RepeatMaskedSlice.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::RepeatMaskedSlice - Arbitrary Slice of a genome that returns repeat masked sequence
00024 
00025 =head1 SYNOPSIS
00026 
00027   $sa = $db->get_SliceAdaptor();
00028 
00029   $slice =
00030     $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 );
00031 
00032   $repeat_masked_slice = $slice->get_repeatmasked_seq();
00033 
00034   # get repeat masked sequence:
00035   my $dna = $repeat_masked_slice->seq();
00036   $dna = $repeat_masked_slice->subseq( 1, 1000 );
00037 
00038 =head1 DESCRIPTION
00039 
00040 This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve
00041 repeat masked genomic sequence rather than normal genomic sequence.
00042 
00043 =head1 METHODS
00044 
00045 =cut
00046 
00047 package Bio::EnsEMBL::RepeatMaskedSlice;
00048 
00049 use strict;
00050 use warnings;
00051 
00052 use Bio::EnsEMBL::Slice;
00053 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00054 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
00055 use Bio::EnsEMBL::Utils::Exception;
00056 
00057 use vars qw(@ISA);
00058 
00059 @ISA = ('Bio::EnsEMBL::Slice');
00060 
00061 # BLOCK_PWR is the base-2 logarithm of the chunk size in which repeat features
00062 # are retrieved (e.g. 18 gives blocks of 2**18 = 262,144 bp). Retrieving on fixed
00063 # block boundaries makes successive repeat feature retrieval calls likely to hit
00063 # the same slice, which should produce cache hits and less database traffic.
00064 my $BLOCK_PWR = 18;
00065 
00066 
00067 
=head2 new

  Arg [-REPEAT_MASK] : Reference to a list of logic names of the repeats to
                       be used for masking. If not provided (or false), it
                       defaults to [''].
  Arg [-SOFT_MASK]   : Boolean; stored as the soft masking flag.
  Arg [-NOT_DEFAULT_MASKING_CASES] :
                       Hash reference of masking exceptions; defaults to {}.
                       See not_default_masking_cases().
  Arg [...]  : Named superclass arguments. See B<Bio::EnsEMBL::Slice>.
  Example    : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new
                  (-START  => $start,
                   -END    => $end,
                   -STRAND => $strand,
                   -SEQ_REGION_NAME => $seq_region,
                   -SEQ_REGION_LENGTH => $seq_region_length,
                   -COORD_SYSTEM  => $cs,
                   -ADAPTOR => $adaptor,
                   -REPEAT_MASK => ['repeat_masker'],
                   -SOFT_MASK => 1,
                   -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1,
                                                  "repeat_name_AluSp" => 0});
  Description: Creates a Slice which behaves exactly as a normal slice but
               that returns repeat masked sequence from the seq method.
  Returntype : Bio::EnsEMBL::RepeatMaskedSlice
  Exceptions : thrown if -REPEAT_MASK is given and is not an array reference
  Caller     : RawComputes (PredictionTranscript creation code).
  Status     : Stable

=cut

sub new {
  my $caller = shift;
  my $class  = ref($caller) || $caller;

  # Pick out the arguments specific to this class; the complete argument
  # list is still handed to the superclass constructor untouched.
  my ($logic_names, $soft_mask, $not_default_masking_cases) =
    rearrange(['REPEAT_MASK', 'SOFT_MASK', 'NOT_DEFAULT_MASKING_CASES'], @_);

  my $self = $class->SUPER::new(@_);

  # A single empty logic name is the default when none was supplied.
  $logic_names = [''] unless $logic_names;

  throw("Reference to list of logic names argument expected.")
    if ref($logic_names) ne 'ARRAY';

  $self->{'repeat_mask_logic_names'}   = $logic_names;
  $self->{'soft_mask'}                 = $soft_mask;
  $self->{'not_default_masking_cases'} = $not_default_masking_cases || {};

  return $self;
}
00117 
00118 
=head2 repeat_mask_logic_names

  Arg [1]    : reference to list of strings $logic_names (optional)
  Example    : $rm_slice->repeat_mask_logic_names(['repeat_masker']);
  Description: Getter/Setter for the logic_names of the repeats that are used
               to mask this slice's sequence. When an argument is supplied
               it must be an array reference; it replaces the stored list.
  Returntype : reference to list of strings
  Exceptions : thrown if the argument is not an array reference
  Caller     : seq() method
  Status     : Stable

=cut

sub repeat_mask_logic_names {
  my $self = shift;

  if (@_) {
    my $logic_names = shift;
    if (ref($logic_names) ne 'ARRAY') {
      throw('Reference to list of logic names argument expected.');
    }

    # Store the validated list. Previously the value was checked and then
    # discarded, so the setter had no effect.
    $self->{'repeat_mask_logic_names'} = $logic_names;
  }

  return $self->{'repeat_mask_logic_names'};
}
00144 
00145 
=head2 soft_mask

  Arg [1]    : boolean $soft_mask (optional)
  Example    : $rm_slice->soft_mask(0);
  Description: Getter/Setter for the flag that turns softmasking of the
               sequence returned by seq on or off. A false or unset flag
               is always reported as 0.
  Returntype : boolean
  Exceptions : none
  Caller     : seq() method
  Status     : Stable

=cut

sub soft_mask {
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'soft_mask'} = $new_value[0];
  }

  # Normalise any false value (undef, '', 0) to a plain 0.
  return $self->{'soft_mask'} ? $self->{'soft_mask'} : 0;
}
00164 
=head2 not_default_masking_cases

  Arg [1]    : hash reference $not_default_masking_cases (optional, default is {})
               The values are 0 or 1 for hard and soft masking respectively.
               The keys of the hash should take one of 2 forms:
               "repeat_class_" . $repeat_consensus->repeat_class,
                e.g. "repeat_class_SINE/MIR"
               "repeat_name_" . $repeat_consensus->name
                e.g. "repeat_name_MIR"
               depending on whether the non-default masking should be applied
               by repeat_class or by repeat_name. Both forms can be used in
               the same hash, in which case a repeat_name setting has priority
               over a repeat_class setting. For example, with hard masking as
               the default, you may want soft masking of all repeat_class
               SINE/MIR, but still hard masking of repeat_name AluSp (which
               are also from repeat_class SINE/MIR).
  Example    : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1,
                                                     "repeat_name_AluSp" => 0});
  Description: Getter/Setter which is used to exempt some repeat classes or
               names from the default masking in place.
  Returntype : hash reference
  Exceptions : none
  Caller     : seq() and subseq() methods
  Status     : Stable

=cut

sub not_default_masking_cases {
  my ($self, @new_cases) = @_;

  if (@new_cases) {
    $self->{'not_default_masking_cases'} = $new_cases[0];
  }

  return $self->{'not_default_masking_cases'};
}
00196 
=head2 seq

  Arg [...]  : any arguments are passed through to the superclass seq method
  Example    : print $rmslice->seq(), "\n";
  Description: Retrieves the entire repeat masked sequence for this slice.
               Repeat features are collected for every configured logic
               name, the sequence is obtained from the superclass, and
               both are handed to _mask_features together with the soft
               masking flag and the non-default masking cases.
               See also the B<Bio::EnsEMBL::Slice> implementation of this
               method.
  Returntype : string
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub seq {
  my $self = shift;

  # Masking configuration held by this slice.
  my $logic_names   = $self->repeat_mask_logic_names();
  my $soft_mask     = $self->soft_mask();
  my $masking_cases = $self->not_default_masking_cases;

  # Collect the repeat features for every configured logic name.
  my @repeats;
  for my $logic_name (@{$logic_names}) {
    push @repeats, @{ $self->get_all_RepeatFeatures($logic_name) };
  }

  # Sequence as returned by the superclass implementation.
  my $dna = $self->SUPER::seq(@_);

  # The sequence is passed by reference and masked in place.
  $self->_mask_features(\$dna, \@repeats, $soft_mask, $masking_cases);

  return $dna;
}
00237 
00238 
00239 
=head2 subseq

  Arg [1]    : int $start  - start of the subregion, passed to sub_Slice
  Arg [2]    : int $end    - end of the subregion, passed to sub_Slice
  Arg [3]    : int $strand - strand of the subregion, passed to sub_Slice
               (optional)
  Example    : print $rmslice->subseq(1, 1000);
  Description: Retrieves a repeat masked sequence from a specified subregion
               of this slice.  Repeat features are fetched from a slice of
               the whole sequence region whose boundaries are aligned to
               blocks of 2**BLOCK_PWR bases around this slice, not from the
               requested subregion itself.  See also the
               B<Bio::EnsEMBL::Slice> implementation of this method.
  Returntype : string
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub subseq {
  my ($self, $start, $end, $strand) = @_;

  # Masking configuration held by this slice.
  my $logic_names   = $self->repeat_mask_logic_names();
  my $soft_mask     = $self->soft_mask();
  my $masking_cases = $self->not_default_masking_cases;

  # Frequent subseq calls on a repeat masked slice would otherwise trigger
  # a feature retrieval from the database every time. To make cache hits
  # more likely, features are only ever requested for slices with fixed,
  # block-aligned boundaries. The block size is set by $BLOCK_PWR at the
  # top of this module.
  my $seq_region_slice = $self->seq_region_Slice();

  # Zero-based indices of the blocks holding this slice's first and last base.
  my $first_block = ($self->start() - 1) >> $BLOCK_PWR;
  my $last_block  = ($self->end() - 1) >> $BLOCK_PWR;

  # One-based coordinates of the block-aligned region, clipped so that it
  # does not run past the end of the sequence region.
  my $block_start   = ($first_block << $BLOCK_PWR) + 1;
  my $block_end     = ($last_block + 1) << $BLOCK_PWR;
  my $region_length = $seq_region_slice->length;
  $block_end = $region_length if $block_end > $region_length;

  my $block_slice = $seq_region_slice->sub_Slice($block_start, $block_end);

  # Collect the repeat features of the block-aligned region for every
  # configured logic name.
  my @repeats;
  for my $logic_name (@{$logic_names}) {
    push @repeats, @{ $block_slice->get_all_RepeatFeatures($logic_name) };
  }

  # Unmasked sequence of the requested subregion.
  my $requested_slice = $self->sub_Slice($start, $end, $strand);
  my $dna             = $requested_slice->seq();

  # The sequence is passed by reference and masked in place.
  $requested_slice->_mask_features(\$dna, \@repeats, $soft_mask, $masking_cases);

  return $dna;
}
00306 
00307 
00308 1;