Archive Ensembl HomeArchive Ensembl Home
PhastCons.pm
Go to the documentation of this file.
00001 #
00002 # Ensembl module for Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PhastCons
00003 #
00004 # Copyright Ensembl Team
00005 #
00006 # You may distribute this module under the same terms as perl itself
00007 
00008 # POD documentation - main docs before the code
00009 
00010 =head1 NAME
00011 
00012 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PhastCons
00013 
00014 =head1 SYNOPSIS
00015 
00016     $phastCons->fetch_input();
00017     $phastCons->run();
00018     $phastCons->write_output(); writes to database
00019 
00020 =head1 DESCRIPTION
00021 
00022     Given a genomic_align_tree Bio::EnsEMBL::Compara::DBSQL::GenomicAlignTree
00023     identifier it fetches the alignment and the tree from a compara database and runs
00024     the program phastCons. At the moment, only conserved elements are supported,
00025     the parser for conservation scores is not implemented yet.
00026 
00027     WARNING: This is preliminary version nd the paths/options/etc are hard-coded.
00028     Please, refer to the TODO section for more details.
00029 
00030 =head1 AUTHOR - Javier Herrero
00031 
00032 
00033 =head1 CONTACT
00034 
00035 This modules is part of the EnsEMBL project (http://www.ensembl.org)
00036 
00037 Questions can be posted to the ensembl-dev mailing list:
00038 dev@ensembl.org
00039 
00040 
00041 =head1 TODO
00042 
00043 - Implement the parsing and storing of conservation scores
00044 
00045 - Move the path to software to a configuration module
00046 
00047 - Allow to specify the method_link_type for phastCons using the input_id or the parameters hash
00048 
00049 - Allow to change the model file, in particular the alphabet, background distribution and
00050   rate matrix.
00051 
00052 - Allow to specify phastCons options using the input_id or the parameters hash
00053 
00054 - Allow to specify the species you want to skip (if any), the uninformative species
00055   and the reference species using the input_id or the parameters hash.
00056 
00057 - Support to run phastCons with no reference sequence.
00058 
00059 - Maybe store the MethodLinkSpeciesSet in the database if it doesn't exist yet.
00060 
00061 =head1 APPENDIX
00062 
00063 The rest of the documentation details each of the object methods. 
00064 Internal methods are usually preceded with a _
00065 
00066 =cut
00067 
00068 package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PhastCons;
00069 
00070 use strict;
00071 use File::Basename;
00072 
00073 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00074 use Bio::EnsEMBL::Utils::Exception;
00075 use Bio::EnsEMBL::Compara::ConservationScore;
00076 
00077 use Bio::EnsEMBL::Hive::Process;
00078 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00079 
00080 #location of phast binaries:
00081 my $BIN_DIR = "/software/ensembl/compara/phast";
00082 my $MSA_VIEW_EXE = "$BIN_DIR/msa_view";
00083 my $PHAST_CONS_EXE = "$BIN_DIR/phastCons";
00084 
00085 my $METHOD_LINK_TYPE = "PHASTCONS_CONSTRAINED_ELEMENT";
00086 
00087 =head2 fetch_input
00088 
00089     Title   :   fetch_input
00090     Usage   :   $self->fetch_input
00091     Function:   Fetches input data for gerp from the database
00092     Returns :   none
00093     Args    :   none
00094 
00095 =cut
00096 
00097 sub fetch_input {
00098   my( $self) = @_;
00099 
00100   #create a Compara::DBAdaptor which shares the same DBI handle
00101   #with $self->db (Hive DBAdaptor)
00102   $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
00103   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00104 
00105   #read from analysis table
00106   $self->get_params($self->parameters); 
00107 
00108   #read from analysis_job table
00109   $self->get_params($self->input_id);
00110 
00111   my $gata = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor;
00112   my $root_id = $self->root_id;
00113   $self->{gat} = $gata->fetch_node_by_node_id($root_id);
00114   if ($root_id != $self->{gat}->node_id) {
00115     die "Cannot find a tree with this node ID: ".$root_id."\n";
00116   }
00117 
00118   my $reference_leaves = [];
00119   my @species_to_skip = ("Gorilla gorilla", "Pongo pygmaeus");
00120   my @uninformative_species = ("Pan troglodytes");
00121   my $reference_species = "Homo sapiens";
00122 
00123   ## Trim sequences from species to skip, find all the reference ones and tag uninformatives ones
00124   foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
00125     my $species_name = $leaf->genomic_align_group->genome_db->name;
00126     if (grep {$_ eq $species_name} @species_to_skip) {
00127       $leaf->disavow_parent;
00128       $self->{gat} = $self->{gat}->minimize_tree;
00129       next;
00130     }
00131     # phastCons might take number only names as indexes. Prepend gat_ to node_id to build the name
00132     $leaf->name("gat_".$leaf->node_id);
00133     if (grep {$_ eq $species_name} @uninformative_species) {
00134       push(@{$self->{uninformative_leaves}}, $leaf->node_id);
00135     } elsif (!$reference_species or $species_name eq $reference_species) {
00136       push(@{$self->{uninformative_leaves}}, $leaf->node_id);
00137       push(@$reference_leaves, $leaf->node_id);
00138     }
00139   }
00140 
00141   ## Store the model file
00142   $self->{mod_file} = $self->worker_temp_directory . "gat_".$root_id.".mod";
00143   open(MOD, ">".$self->{mod_file}) or die;
00144   print MOD "ALPHABET: A C G T
00145 ORDER: 0
00146 SUBST_MOD: REV
00147 BACKGROUND: 0.295000 0.205000 0.205000 0.295000
00148   -0.976030    0.165175    0.539722    0.271133
00149    0.237691   -0.990352    0.189637    0.563024
00150    0.776673    0.189637   -1.248143    0.281833
00151    0.271133    0.391254    0.195849   -0.858237
00152 TREE: ";
00153   print MOD $self->{gat}->newick_format("simple"), "\n";
00154   close(MOD);
00155 
00156   foreach my $this_ref_node_id (@$reference_leaves) {
00157     ## Check if the file already exists
00158     my $root_file_name = $self->worker_temp_directory . "gat_".$root_id.".$this_ref_node_id";
00159     my $multifasta_file = "$root_file_name.mfa";
00160     my $ref_fasta_file = "$root_file_name.fa";
00161     my $suff_stats_file = "$root_file_name.ss";
00162     open(MULTIFASTA, ">$multifasta_file") or die;
00163 
00164     ## Write the ref sequence first
00165     my $chr_name;
00166     foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
00167       next if ($leaf->node_id != $this_ref_node_id);
00168       # Make sure the reference sequence is in the forward strand
00169       if ($leaf->genomic_align_group->get_all_GenomicAligns->[0]->dnafrag_strand == -1) {
00170         $self->{gat}->reverse_complement;
00171       }
00172       print MULTIFASTA ">", $leaf->name, "\n";
00173       my $aligned_sequence = $leaf->aligned_sequence;
00174       $aligned_sequence =~ tr/./-/;
00175       print MULTIFASTA $aligned_sequence, "\n";
00176 
00177       # Write the ref sequence in FASTA format, required for msa_view -> phastCons
00178       open(FASTA, ">$ref_fasta_file ") or die;
00179       print FASTA ">", $leaf->name, "\n";
00180       my $original_sequence = $leaf->aligned_sequence;
00181       $original_sequence =~ tr/./-/;
00182       print FASTA $original_sequence, "\n";
00183       close(FASTA);
00184 
00185       last;
00186     }
00187     ## Write the remaining sequences
00188     foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
00189       next if ($leaf->node_id == $this_ref_node_id);
00190       print MULTIFASTA ">", $leaf->name, "\n";
00191       my $aligned_sequence = $leaf->aligned_sequence;
00192       $aligned_sequence =~ tr/./-/;
00193       print MULTIFASTA $aligned_sequence, "\n";
00194     }
00195     close(MULTIFASTA);
00196 
00197     # Get the multiple alignment in Sufficient Statistics format:
00198     my $run_str = "$MSA_VIEW_EXE -i FASTA $multifasta_file -o SS".
00199         " --refseq 2$ref_fasta_file > $suff_stats_file";
00200     system($run_str) == 0
00201         or die "$run_str failed: $?";
00202     die if (!-e $suff_stats_file);
00203     $self->{ss_files}->{$this_ref_node_id} = $suff_stats_file;
00204   }
00205 
00206   # free up some memory
00207   foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
00208     free_aligned_sequence($leaf);
00209   }
00210 
00211   return 1;
00212 }
00213 
00214 =head2 run
00215 
00216     Title   :   run
00217     Usage   :   $self->run
00218     Function:   Run gerp
00219     Returns :   none
00220     Args    :   none
00221 
00222 =cut
00223 
00224 sub run {
00225   my $self = shift;
00226 
00227   while (my ($node_id, $ss_file_name) = each %{$self->{ss_files}}) {
00228     my $bed_file_name = $ss_file_name;
00229     $bed_file_name =~ s/(\.ss)?$/\.bed/;
00230     my $run_str = "$PHAST_CONS_EXE $ss_file_name ". $self->{mod_file}.
00231         " -i SS --rho 0.3 --expected-length 45 --target-coverage 0.3";
00232     if ($self->{uninformative_leaves} and @{$self->{uninformative_leaves}}) {
00233       $run_str .= " --not-informative gat_".join(",gat_", grep {$_ ne $node_id} @{$self->{uninformative_leaves}});
00234     }
00235     $run_str .= " --most-conserved $bed_file_name --no-post-probs".
00236         " --seqname gat_".$node_id." --idpref phastCons.$node_id";
00237     print $run_str, "\n";
00238     system($run_str) == 0
00239         or die "$run_str failed: $?";
00240     die if (!-e $bed_file_name);
00241     $self->{bed_files}->{$node_id} = $bed_file_name;
00242   }
00243 }
00244 
00245 =head2 write_output
00246 
00247     Title   :   write_output
00248     Usage   :   $self->write_output
00249     Function:   Write results to the database
00250     Returns :   1
00251     Args    :   none
00252 
00253 =cut
00254 
00255 sub write_output {
00256   my ($self) = @_;
00257 
00258   print STDERR "Write Output\n";
00259   ## Get the MethodLinkSpeciesSet
00260   my $gat_method_link_species_set = $self->{gat}->get_all_leaves->[0]->
00261       genomic_align_group->get_all_GenomicAligns->[0]->method_link_species_set;
00262   my $phastCons_mlss = $gat_method_link_species_set;
00263   $phastCons_mlss = $gat_method_link_species_set->adaptor->fetch_by_method_link_type_GenomeDBs(
00264       $METHOD_LINK_TYPE, $phastCons_mlss->species_set);
00265   print "MLSS: ", $phastCons_mlss->dbID;
00266 
00267   my $constrained_element_adaptor = $gat_method_link_species_set->adaptor->db->get_ConstrainedElementAdaptor;
00268 
00269   while (my ($node_id, $bed_file_name) = each %{$self->{bed_files}}) {
00270     print "OUTPUT FOR NODE $node_id: $bed_file_name\n";
00271     print qx"head -20 $bed_file_name", "[...]\n\n";
00272 
00273     # Find the leaf
00274     my $genomic_align_node;
00275     foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
00276       if ($leaf->node_id == $node_id) {
00277         $genomic_align_node = $leaf;
00278         last;
00279       }
00280     }
00281 
00282     ## We always run phastCons on the FWD strand
00283     my $genomic_align = $genomic_align_node->genomic_align_group->get_all_GenomicAligns->[0];
00284     my $dnafrag_id = $genomic_align->dnafrag_id;
00285     my $genomic_align_start = $genomic_align->dnafrag_start;
00286 
00287     ## Store the constrained elements
00288     my @constrained_elements;
00289     open(BED, $bed_file_name) or die;
00290     while (<BED>) {
00291       my ($seq_name, $start0, $end, @rest) = split(/\s+/, $_);
00292       #create new genomic align blocks by converting alignment
00293       #coords to chromosome coords
00294       my $constrained_element_block;
00295       my $constrained_element =  new Bio::EnsEMBL::Compara::ConstrainedElement(
00296             -reference_dnafrag_id => $dnafrag_id,
00297             -start => $genomic_align_start + $start0,
00298             -end => $genomic_align_start + $end - 1,
00299             -method_link_species_set_id => $phastCons_mlss->dbID,
00300             -score => 0,
00301         );
00302       push(@$constrained_element_block, $constrained_element);
00303       push(@constrained_elements, $constrained_element_block);
00304     }
00305     close(BED);
00306     #store in constrained_element table
00307     $constrained_element_adaptor->store($phastCons_mlss, \@constrained_elements);
00308   }
00309 
00310   return 1;
00311 }
00312 
00313 ##########################################
00314 #
00315 # getter/setter methods
00316 # 
00317 ##########################################
00318 #read from input_id from analysis_job table
00319 sub root_id {
00320   my $self = shift;
00321   $self->{'_root_id'} = shift if(@_);
00322   return $self->{'_root_id'};
00323 }
00324 
00325 #read method_link_type from analysis table
00326 sub constrained_element_method_link_type {
00327   my $self = shift;
00328   $self->{'_constrained_element_method_link_type'} = shift if(@_);
00329   return $self->{'_constrained_element_method_link_type'};
00330 }
00331 
00332 #read options from analysis table
00333 sub options {
00334   my $self = shift;
00335   $self->{'_options'} = shift if(@_);
00336   return $self->{'_options'};
00337 }
00338 
00339 #read from parameters of analysis table
00340 sub program {
00341   my $self = shift;
00342   $self->{'_program'} = shift if(@_);
00343   return $self->{'_program'};
00344 }
00345 
00346 sub program_file {
00347   my $self = shift;
00348   $self->{'_program_file'} = shift if(@_);
00349   return $self->{'_program_file'};
00350 }
00351 
00352 #read from parameters of analysis table
00353 sub program_version {
00354   my $self = shift;
00355   $self->{'_program_version'} = shift if(@_);
00356   return $self->{'_program_version'};
00357 }
00358 
00359 #read from parameters of analysis table
00360 sub param_file {
00361   my $self = shift;
00362   $self->{'_param_file'} = shift if(@_);
00363   return $self->{'_param_file'};
00364 }
00365 
00366 #read from parameters of analysis table
00367 sub tree_file {
00368   my $self = shift;
00369   $self->{'_tree_file'} = shift if(@_);
00370   return $self->{'_tree_file'};
00371 }
00372 
00373 #name of temporary parameter file
00374 sub param_file_tmp {
00375   my $self = shift;
00376   $self->{'_param_file_tmp'} = shift if(@_);
00377   return $self->{'_param_file_tmp'};
00378 }
00379 
00380 
00381 ##########################################
00382 #
00383 # internal methods
00384 #
00385 ##########################################
00386 sub get_params {
00387     my $self         = shift;
00388     my $param_string = shift;
00389 
00390     return unless($param_string);
00391     
00392     my $params = eval($param_string);
00393     return unless($params);
00394 
00395     if (defined($params->{'program'})) {
00396     $self->program($params->{'program'}); 
00397     }
00398     
00399     #read from parameters in analysis table
00400     if (defined($params->{'param_file'})) {
00401     $self->param_file($params->{'param_file'});
00402     }
00403     if (defined($params->{'tree_file'})) {
00404     $self->tree_file($params->{'tree_file'});
00405     }
00406     if (defined($params->{'window_sizes'})) {
00407     $self->window_sizes($params->{'window_sizes'});
00408     }
00409     if (defined($params->{'constrained_element_method_link_type'})) {
00410     $self->constrained_element_method_link_type($params->{'constrained_element_method_link_type'});
00411     }
00412     if (defined($params->{'options'})) {
00413     $self->options($params->{'options'});
00414     }
00415 
00416     #read from input_id in analysis_job table
00417     if (defined($params->{'root_id'})) {
00418         $self->root_id($params->{'root_id'}); 
00419     }
00420     if(defined($params->{'species_set'})) {
00421         $self->species_set($params->{'species_set'});
00422     }
00423     return 1;
00424 }
00425 
00426 sub free_aligned_sequence {
00427   my ($leaf) = @_;
00428 
00429   my $genomic_align_group = $leaf->genomic_align_group;
00430 
00431   foreach my $this_genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
00432     undef($this_genomic_align->{'aligned_sequence'});
00433   }
00434 }
00435 
00436 
00437 1;