Archive Ensembl HomeArchive Ensembl Home
DumpDnaCollection.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::DumpDnaCollection
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 =cut
00017 
00018 =head1 DESCRIPTION
00019 
00020 
00021 =cut
00022 
00023 =head1 CONTACT
00024 
00025 Abel Ureta-Vidal <abel@ebi.ac.uk>
00026 
00027 =cut
00028 
00029 =head1 APPENDIX
00030 
00031 The rest of the documentation details each of the object methods.
00032 Internal methods are usually preceded with a _
00033 
00034 =cut
00035 
00036 package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::DumpDnaCollection;
00037 
00038 use strict;
00039 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;;
00040 use Bio::EnsEMBL::Utils::Exception;
00041 use Time::HiRes qw(time gettimeofday tv_interval);
00042 use Bio::EnsEMBL::Analysis::Runnable::Blat;
00043 
00044 #use Bio::EnsEMBL::Pipeline::Runnable::Blat;
00045 #our @ISA = qw(Bio::EnsEMBL::Pipeline::RunnableDB);
00046 
00047 use Bio::EnsEMBL::Analysis::RunnableDB;
00048 use Bio::EnsEMBL::Hive::Process;
00049 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00050 
# Fallback for the dump_min_size option: fetch_input installs this value when
# the job supplies none, and the dump routines skip any DnaFragChunk whose
# length is less than or equal to it.
my $DEFAULT_DUMP_MIN_SIZE = 11_500_000;

# Directory holding the preferred faToNib binary. dumpNibFiles only uses it
# when it is defined and exists on disk; otherwise the faToNib found on the
# PATH is run. To force the PATH version set this to undef rather than
# removing the declaration, because dumpNibFiles refers to the variable.
my $BIN_DIR = '/software/ensembl/compara/bin';
00055 
=head2 fetch_input

    Title   :   fetch_input
    Usage   :   $self->fetch_input
    Function:   Creates the Compara DBAdaptor on the Hive database connection,
                reads the options from the analysis parameters and from the
                job input_id, and checks that the mandatory ones are present
    Returns :   1
    Args    :   none
    Throws  :   if dna_collection_name is missing, or if neither dump_nib
                nor dump_dna is set

=cut

sub fetch_input {
  my ($self) = @_;

  # Create a Compara::DBAdaptor which shares the same DBI handle
  # as $self->db (the Hive DBAdaptor).
  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);

  # input_id is parsed second, so its values override those from parameters.
  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  throw("Missing dna_collection_name") unless($self->dna_collection_name);

  # Test definedness rather than truth, so that an explicit dump_min_size of 0
  # (dump every chunk) is honoured instead of being replaced by the default.
  unless (defined $self->dump_min_size) {
    $self->dump_min_size($DEFAULT_DUMP_MIN_SIZE);
  }

  # At least one of dump_nib / dump_dna must be set, otherwise run() has
  # nothing to do.
  throw("Missing dump_nib or dump_dna: at least one must be set") unless ($self->dump_nib || $self->dump_dna);

  return 1;
}
00087 
00088 
00089 
# Performs the requested dumps: nib files when dump_nib is set, fasta files
# when dump_dna is set (both, in that order, when both are set).
# Always returns 1.
sub run
{
  my ($self) = @_;

  $self->dumpNibFiles if $self->dump_nib;
  $self->dumpDnaFiles if $self->dump_dna;

  return 1;
}
00103 
00104 
# Nothing is stored by this step (the dump routines are invoked from run),
# so it simply reports success.
sub write_output {
  return 1;
}
00109 
00110 ##########################################
00111 #
00112 # getter/setter methods
00113 #
00114 ##########################################
00115 
# Getter/setter for the dna_collection_name option.
# When called with an argument (even undef) the value is stored first;
# the currently stored value is returned in every case.
sub dna_collection_name {
  my ($self, @args) = @_;

  if (@args) {
    $self->{'_dna_collection_name'} = $args[0];
  }

  return $self->{'_dna_collection_name'};
}
00121 
# Getter/setter for the dump_dna flag, which run() tests to decide whether
# dumpDnaFiles is called. A supplied argument (even undef) is stored before
# the current value is returned.
sub dump_dna {
  my ($self, @args) = @_;

  if (@args) {
    $self->{'_dump_dna'} = $args[0];
  }

  return $self->{'_dump_dna'};
}
00127 
# Getter/setter for the dump_nib flag, which run() tests to decide whether
# dumpNibFiles is called. A supplied argument (even undef) is stored before
# the current value is returned.
sub dump_nib {
  my ($self, @args) = @_;

  if (@args) {
    $self->{'_dump_nib'} = $args[0];
  }

  return $self->{'_dump_nib'};
}
00133 
# Getter/setter for the dump_min_size threshold; the dump routines skip any
# DnaFragChunk whose length is less than or equal to it. A supplied argument
# (even undef) is stored before the current value is returned.
sub dump_min_size {
  my ($self, @args) = @_;

  if (@args) {
    $self->{'_dump_min_size'} = $args[0];
  }

  return $self->{'_dump_min_size'};
}
00139 
00140 ##########################################
00141 #
00142 # internal methods
00143 #
00144 ##########################################
00145 
# Parses a parameter string and copies the options it recognises onto $self
# through the matching setters.
#
#   Arg [1]   : string $param_string - Perl source that evaluates to a hash
#               reference, e.g. "{'dna_collection_name'=>'human','dump_nib'=>1}"
#   Returns   : 1 when a hash of parameters was parsed; nothing (undef in
#               scalar context) when the string is empty, cannot be
#               evaluated, or does not evaluate to a hash reference
#   Options   : dna_collection_name, dump_min_size, dump_dna, dump_nib
#               (keys whose value is undef are ignored)
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  #print("parsing parameter string : ",$param_string,"\n");

  # NOTE(review): string eval runs whatever code the string contains. It is
  # kept because the parameters are passed as Perl hash literals, but the
  # string must only ever come from a trusted source.
  my $params = eval($param_string);
  if ($@) {
    # Report the parse failure instead of ignoring the parameters silently.
    warn "get_params: could not evaluate parameter string '$param_string': $@";
    return;
  }
  return unless($params);

  # A true value that is not a hash reference (e.g. a bare number) carries no
  # named options; dereferencing it below would die with an obscure error.
  unless (ref($params) eq 'HASH') {
    warn "get_params: parameter string '$param_string' is not a hash reference, ignoring it\n";
    return;
  }

  # Each recognised option has a setter method of the same name.
  foreach my $option (qw(dna_collection_name dump_min_size dump_dna dump_nib)) {
    next unless defined($params->{$option});
    $self->$option($params->{$option});
  }

  return 1;
}
00170 
# Converts the large DnaFragChunks of the collection named by
# dna_collection_name into .nib files under the collection's dump_loc.
#
# For every DnaFragChunk longer than dump_min_size whose
# "<dump_loc>/<dnafrag name>.nib" does not exist yet, the chunk is written to
# a temporary "<dump_loc>/<dnafrag name>.fa", converted with faToNib, and the
# fasta file is removed again. DnaFragChunkSet objects are not expected here
# and are skipped with a warning.
#
#   Returns : 1
#   Throws  : if the collection cannot be fetched, if it has no dump_loc, or
#             if faToNib fails
sub dumpNibFiles {
  my $self = shift;

  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);

  my $starttime = time();

  my $dna_collection = $self->{'comparaDBA'}->get_DnaCollectionAdaptor->fetch_by_set_description($self->dna_collection_name);
  # Fail with a clear message rather than a method call on an undefined value.
  unless (defined $dna_collection) {
    throw("Could not fetch dna_collection '" . $self->dna_collection_name . "', can not dump nib files\n");
  }
  my $dump_loc = $dna_collection->dump_loc;
  unless (defined $dump_loc) {
    throw("dump_loc directory is not defined, can not dump nib files\n");
  }

  # Use the newer faToNib from $BIN_DIR when that directory exists, otherwise
  # whichever faToNib is found on the PATH.
  my $fa_to_nib = (defined $BIN_DIR && -e $BIN_DIR) ? "$BIN_DIR/faToNib" : "faToNib";

  foreach my $dna_object (@{$dna_collection->get_all_dna_objects}) {
    if($dna_object->isa('Bio::EnsEMBL::Compara::Production::DnaFragChunkSet')) {
      warn "At this point you should get DnaFragChunk objects not DnaFragChunkSet objects!\n";
      next;
    }
    next unless($dna_object->isa('Bio::EnsEMBL::Compara::Production::DnaFragChunk'));
    next if ($dna_object->length <= $self->dump_min_size);

    my $nibfile = "$dump_loc/". $dna_object->dnafrag->name . ".nib";

    #don't dump nibfile if it already exists
    next if (-e $nibfile);

    my $fastafile = "$dump_loc/". $dna_object->dnafrag->name . ".fa";

    #$dna_object->dump_to_fasta_file($fastafile);
    #use this version to solve problem of very large chromosomes eg opossum
    $dna_object->dump_chunks_to_fasta_file($fastafile);

    # List-form system: the arguments are passed without going through a shell.
    my $status = system($fa_to_nib, $fastafile, $nibfile);
    if ($status != 0) {
      # $! is only meaningful when the program could not be started (-1);
      # otherwise the failure is described by the exit status / signal in $?.
      my $reason = ($status == -1)
        ? "failed to execute $fa_to_nib: $!"
        : sprintf("%s exited with status %d (signal %d)", $fa_to_nib, $? >> 8, $? & 127);
      throw("Could not convert fasta file $fastafile to nib: $reason\n");
    }

    unlink $fastafile;
    # The loop variable aliases the element of the list being iterated, so
    # this clears that element and lets the dumped chunk be freed.
    $dna_object = undef;
  }

  if($self->debug){printf("%1.3f secs to dump nib for \"%s\" collection\n", (time()-$starttime), $self->dna_collection_name);}

  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);

  return 1;
}
00221 
# Writes the sequences of the collection named by dna_collection_name as
# fasta files under the collection's dump_loc.
#
# A DnaFragChunkSet becomes one file holding all of its chunks, named after
# the first chunk: "<dnafrag name>_<seq_start>_<seq_end>.fa". A DnaFragChunk
# longer than dump_min_size becomes its own file, named the same way after
# itself. Existing files of the same name are removed first.
#
#   Returns : 1
#   Throws  : if the collection cannot be fetched or has no dump_loc
sub dumpDnaFiles {
  my $self = shift;

  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);

  my $starttime = time();

  my $dna_collection = $self->{'comparaDBA'}->get_DnaCollectionAdaptor->fetch_by_set_description($self->dna_collection_name);
  # Fail with a clear message rather than a method call on an undefined value.
  unless (defined $dna_collection) {
    throw("Could not fetch dna_collection '" . $self->dna_collection_name . "', can not dump dna files\n");
  }
  my $dump_loc = $dna_collection->dump_loc;
  unless (defined $dump_loc) {
    throw("dump_loc directory is not defined, can not dump dna files\n");
  }

  foreach my $dna_object (@{$dna_collection->get_all_dna_objects}) {
    if($dna_object->isa('Bio::EnsEMBL::Compara::Production::DnaFragChunkSet')) {

      my $chunk_array = $dna_object->get_all_DnaFragChunks;
      my $first_dna_object = $chunk_array->[0];

      if (defined $first_dna_object) {
        my $name = $first_dna_object->dnafrag->name . "_" . $first_dna_object->seq_start . "_" . $first_dna_object->seq_end;

        my $fastafile = "$dump_loc/". $name . ".fa";

        #Must always dump new fasta files because different runs call the chunks
        #different names and the chunk name is what is stored in the fasta file.
        if (-e $fastafile) {
          unlink $fastafile
        }
        foreach my $chunk (@$chunk_array) {
          #A chunk_set will contain several seq_regions which will be appended
          #to a single fastafile. This means I can't use
          #dump_chunks_to_fasta_file because this deletes the fastafile each
          #time!
          #NOTE(review): the leading ">" presumably makes dump_to_fasta_file
          #open the file for appending - confirm against DnaFragChunk.
          $chunk->dump_to_fasta_file(">".$fastafile);
        }
      } else {
        # An empty set has no first chunk to name the file after.
        warn "Skipping DnaFragChunkSet without any DnaFragChunk, nothing to dump\n";
      }
    }
    if($dna_object->isa('Bio::EnsEMBL::Compara::Production::DnaFragChunk')) {
      next if ($dna_object->length <= $self->dump_min_size);

      my $name = $dna_object->dnafrag->name . "_" . $dna_object->seq_start . "_" . $dna_object->seq_end;

      my $fastafile = "$dump_loc/". $name . ".fa";

      if (-e $fastafile) {
        unlink $fastafile
      }
      $dna_object->dump_to_fasta_file(">".$fastafile);
    }
    # The loop variable aliases the element of the list being iterated, so
    # this clears that element and lets the processed object be freed.
    $dna_object = undef;
  }

  if($self->debug){printf("%1.3f secs to dump dna for \"%s\" collection\n", (time()-$starttime), $self->dna_collection_name);}

  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);

  return 1;
}
00279 
# Create an ooc file used in blat analysis. Not used for translated blat.
#
# Written as a plain function: it takes ($dir, $seq_region), not $self.
#
#   Arg [1]   : string $dir - directory that holds "<seq_region>.fa"
#   Arg [2]   : string $seq_region - name used for the fasta file and for the
#               sub-directory that receives the ooc file
#   Returns   : string, the path "<dir>/<seq_region>/5ooc"
#   Throws    : if the "<dir>/<seq_region>" directory cannot be created
sub create_ooc_file {
  my ($dir, $seq_region) = @_;

  my $ooc_dir  = "$dir/$seq_region";
  my $ooc_file = "$ooc_dir/5ooc";

  #make new directory to store 5ooc file for each seq_region
  if (!-e $ooc_dir) {
      mkdir($ooc_dir)
        or throw("Directory $ooc_dir cannot be created: $!");
  }

  # Blat is run with -makeOoc pointing at $ooc_file; the path is returned
  # whatever the outcome of the run.
  my $runnable = Bio::EnsEMBL::Analysis::Runnable::Blat->new(
                                 -database => "$dir/$seq_region.fa",
                                 -query_type => "dnax",
                                 -target_type => "dnax",
                                 -options => "-ooc=$ooc_file -tileSize=5 -makeOoc=$ooc_file -mask=lower -qMask=lower");
  $runnable->run;

  return $ooc_file;
}
00301 
00302 1;