Archive Ensembl Home
UcscChainFactory.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::PairAligner::UcscChainFactory
00022 
00023 =head1 SYNOPSIS
00024 
00025 
00026 =head1 DESCRIPTION
00027 
00028 Splits a UCSC chain file into smaller chunks for parallel processing. The file itself is not rewritten: each chunk is described by a seek position (byte offset of its first chain header) and the number of lines to read from that position.
00029 
00030 =cut
00031 
00032 package Bio::EnsEMBL::Compara::RunnableDB::PairAligner::UcscChainFactory;
00033 
00034 use strict;
00035 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00036 
00037 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00038 
00039 
00040 ############################################################
00041 
=head2 fetch_input

    Title   :   fetch_input
    Usage   :   $self->fetch_input
    Function:   Scans the UCSC chain file named by the 'chain_file' parameter
                and divides it into chunks that each start on a chain header
                line. For every chunk one output id of the form
                {seek_offset=>BYTE_OFFSET,num_lines=>LINE_COUNT} is flowed
                on branch 2.
    Returns :   nothing
    Args    :   none (reads the 'chain_file' and 'step' parameters)
    Throws  :   dies if 'chain_file' is not set or the file cannot be opened

=cut

sub fetch_input {
    my ($self) = @_;

    # Chunk size threshold. A chunk is closed at the first chain header that
    # is reached once (lines in the chunk + 1) >= $step. An undefined step
    # behaves like 0, i.e. every chain header starts a new chunk.
    my $step = $self->param('step');
    $step = 0 unless defined $step;

    my $chain_file = $self->param('chain_file');
    defined $chain_file
        or die "Unable to open chain file: parameter 'chain_file' is not set";

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as a mode or a pipe, and the handle is not a package global.
    open(my $chain_fh, '<', $chain_file)
        or die "Unable to open $chain_file: $!";

    # Each boundary records the byte offset of a chain header ('pos') and the
    # number of lines read since the previous boundary ('line'). The 'line'
    # value of boundary N+1 is therefore the length of the chunk that starts
    # at boundary N.
    my @boundaries;
    my $line_start     = 0;    # byte offset at which the current line begins
    my $lines_in_chunk = 0;    # lines read since the last recorded boundary
    my $seen_chain     = 0;    # false until the first chain header is found

    while (my $line = <$chain_fh>) {
        # Only lines that begin with the "chain" keyword are headers; a line
        # that merely contains "chain " (e.g. a comment) is not a boundary.
        if ($line =~ /^chain /) {
            if (!$seen_chain || $lines_in_chunk + 1 >= $step) {
                push @boundaries,
                    { 'pos' => $line_start, 'line' => $lines_in_chunk };
                $lines_in_chunk = 0;
                $seen_chain     = 1;
            }
        }

        # tell() now points just past the line that was read, which is where
        # the next line starts.
        $line_start = tell($chain_fh);
        $lines_in_chunk++;
    }

    # Sentinel boundary at end of file; it carries the length of the last chunk.
    push @boundaries, { 'pos' => $line_start, 'line' => $lines_in_chunk };

    close $chain_fh;

    # Pair every boundary with the line count stored on its successor. When
    # no chain header was found only the sentinel exists and nothing is flowed.
    for my $index (0 .. $#boundaries - 1) {
        my $seek_offset = $boundaries[$index]->{'pos'};
        my $num_lines   = $boundaries[ $index + 1 ]->{'line'};

        my $output_id = "{seek_offset=>$seek_offset,num_lines=>$num_lines}";
        $self->dataflow_output_id($output_id, 2);
    }

    return;
}
00103 
00104 1;