Archive Ensembl HomeArchive Ensembl Home
CAFEAnalysis.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::CAFEAnalysis
00024 
00025 =head1 SYNOPSIS
00026 
00027 =head1 DESCRIPTION
00028 
00029 This RunnableDB calculates the dynamics of a ncRNA family (based on the tree obtained and the CAFE software) in terms of gains losses per branch tree. It needs a CAFE-compliant species tree.
00030 
00031 =head1 INHERITANCE TREE
00032 
00033 Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable
00034 
00035 =head1 APPENDIX
00036 
00037 The rest of the documentation details each of the object methods.
00038 Internal methods are usually preceded with an underscore (_)
00039 
00040 =cut
00041 
00042 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::CAFEAnalysis;
00043 
00044 use strict;
00045 use Data::Dumper;
00046 
00047 use Bio::EnsEMBL::Compara::CAFETreeNode;
00048 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00049 
00050 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00051 
00052 sub param_defaults {
00053     return {
00054             'pvalue_lim' => 0.05,
00055            };
00056 }
00057 
00058 =head2 fetch_input
00059 
00060     Title     : fetch_input
00061     Usage     : $self->fetch_input
00062     Function  : Fetches input data from database
00063     Returns   : none
00064     Args      : none
00065 
00066 =cut
00067 
00068 sub fetch_input {
00069     my ($self) = @_;
00070 
00071     unless ( $self->param('cafe_tree_string') ) {
00072         die ('cafe_species_tree can not be found');
00073     }
00074 
00075     unless ( $self->param('cafe_table_file') ) {
00076         die ('cafe_table_file must be set');
00077     }
00078 
00079     unless ( $self->param('mlss_id') ) {
00080         die ('mlss_id must be set')
00081     }
00082 
00083 #    my $nctree_Adaptor = $self->compara_dba->get_NCTreeAdaptor;
00084 #    $self->param('nctree_Adaptor', $nctree_Adaptor);
00085 
00086     my $cafetree_Adaptor = $self->compara_dba->get_CAFETreeAdaptor;
00087     $self->param('cafeTree_Adaptor', $cafetree_Adaptor);
00088 
00089     my $genomeDB_Adaptor = $self->compara_dba->get_GenomeDBAdaptor;
00090     $self->param('genomeDB_Adaptor', $genomeDB_Adaptor);
00091 
00092     # cafe_shell, mlss_id, cafe_lambdas and cafe_struct_tree_str are also defined parameters
00093 
00094     return;
00095 }
00096 
00097 sub run {
00098     my ($self) = @_;
00099     $self->run_cafe_script;
00100     $self->parse_cafe_output;
00101 }
00102 
00103 sub write_output {
00104     my ($self) = @_;
00105 #    $self->store_expansion_contraction();
00106 #    $self->store_cafe_tree();
00107 
00108     my $lambda = $self->param('lambda');
00109     $self->dataflow_output_id ( {
00110                                  'cafe_lambda' => $self->param('lambda'),
00111                                  'cafe_table_file' => $self->param('work_dir') . "/" . $self->param('cafe_table_file'),
00112                                  'cafe_tree_string' => $self->param('cafe_tree_string'),
00113                                 }, 3);
00114 
00115 }
00116 
00117 ###########################################
00118 ## Internal methods #######################
00119 ###########################################
00120 
00121 sub run_cafe_script {
00122     my ($self) = @_;
00123 
00124     my $mlss_id = $self->param('mlss_id');
00125     my $pval_lim = $self->param('pvalue_lim');
00126     my $cafe_out_file = $self->worker_temp_directory() . "cafe_${mlss_id}.out";
00127     print STDERR "CAFE results will be written into [$cafe_out_file]\n";
00128     my $script_file = $self->worker_temp_directory() . "cafe_${mlss_id}.sh";
00129     open my $sf, ">", $script_file or die $!;
00130     print STDERR "Script file is [$script_file]\n" if ($self->debug());
00131 
00132     my $cafe_shell = $self->param('cafe_shell');
00133     my $cafe_tree_str = $self->param('cafe_tree_string');
00134     chop($cafe_tree_str); #remove final semicolon
00135     $cafe_tree_str =~ s/:\d+$//; # remove last branch length
00136 
00137     my $cafe_table_file = $self->param('work_dir') . "/" . $self->param('cafe_table_file');
00138     my $cafe_lambdas = $self->param('cafe_lambdas');
00139     my $cafe_struct_tree = $self->param('cafe_struct_tree_str');
00140 
00141     print $sf '#!' . $cafe_shell . "\n\n";
00142     print $sf "tree $cafe_tree_str\n\n";
00143     print $sf "load -p ${pval_lim} -i $cafe_table_file\n\n";
00144     print $sf "lambda ";
00145     print $sf $cafe_lambdas ? "-l $cafe_lambdas -t $cafe_struct_tree\n\n" : " -s\n\n";
00146     print $sf "report $cafe_out_file\n\n";
00147     close ($sf);
00148 
00149     print STDERR "CAFE output in [$cafe_out_file]\n" if ($self->debug());
00150 
00151     $self->param('cafe_out_file', $cafe_out_file);
00152 
00153     chmod 0755, $script_file;
00154 
00155     $self->compara_dba->dbc->disconnect_when_inactive(0);
00156     unless ((my $err = system($script_file)) == 4096) {
00157         print STDERR "CAFE returning error $err\n";
00158 #         for my $f (glob "$cafe_out_file*") {
00159 #             system(`head $f >> /lustre/scratch101/ensembl/mp12/kkkk`);
00160 #         }
00161         # It seems that CAFE doesn't exit with error code 0 never (usually 4096?)
00162 #        $self->throw("problem running script $cafe_out_file: $err\n");
00163     }
00164     $self->compara_dba->dbc->disconnect_when_inactive(1);
00165     return;
00166 }
00167 
00168 sub parse_cafe_output {
00169     my ($self) = @_;
00170     my $fmt = '%{-n}%{":"o}';
00171 
00172     my $cafeTree_Adaptor = $self->param('cafeTree_Adaptor');
00173     my $mlss_id = $self->param('mlss_id');
00174     my $pvalue_lim = $self->param('pvalue_lim');
00175     my $cafe_out_file = $self->param('cafe_out_file') . ".cafe";
00176     my $genomeDB_Adaptor = $self->param('genomeDB_Adaptor');
00177 
00178     print STDERR "CAFE OUT FILE [$cafe_out_file]\n" if ($self->debug);
00179 
00180     open my $fh, "<". $cafe_out_file or die $!;
00181 
00182     my $tree_line = <$fh>;
00183     my $tree_str = substr($tree_line, 5, length($tree_line) - 6);
00184     $tree_str .= ";";
00185     my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($tree_str, "Bio::EnsEMBL::Compara::CAFETreeNode");
00186     print STDERR "CAFE TREE: $tree_str\n" if ($self->debug);
00187 
00188     my $lambda_line = <$fh>;
00189     my $lambda = substr($lambda_line, 8, length($lambda_line) - 9);
00190     print STDERR "CAFE LAMBDAS: $lambda\n" if ($self->debug);
00191 
00192     my $ids_line = <$fh>;
00193     my $ids_tree_str = substr($ids_line, 15, length($ids_line) - 16);
00194     $ids_tree_str =~ s/<(\d+)>/:$1/g;
00195     $ids_tree_str .= ";";
00196     print STDERR "CAFE IDsTREE: $ids_tree_str\n" if ($self->debug);
00197 
00198     my $idsTree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($ids_tree_str);
00199     print STDERR $idsTree->newick_format('ryo', '%{-n}%{":"d}'), "\n" if ($self->debug);
00200 
00201     my %cafeIDs2nodeIDs = ();
00202     for my $node (@{$idsTree->get_all_nodes()}) {
00203         $cafeIDs2nodeIDs{$node->distance_to_parent()} = $node->node_id;
00204     }
00205 
00206     my $format_ids_line = <$fh>;
00207     my ($formats_ids) = (split /:/, $format_ids_line)[2];
00208     $formats_ids =~ s/^\s+//;
00209     $formats_ids =~ s/\s+$//;
00210     my @format_pairs_cafeIDs = split /\s+/, $formats_ids;
00211     my @format_pairs_nodeIDs = map {my ($fst,$snd) = $_ =~ /\((\d+),(\d+)\)/; [($cafeIDs2nodeIDs{$fst}, $cafeIDs2nodeIDs{$snd})]} @format_pairs_cafeIDs;
00212 
00213 
00214 # Store the tree
00215     $tree->method_link_species_set_id($mlss_id);
00216     $tree->species_tree($ids_tree_str);
00217     $tree->lambdas($lambda);
00218 
00219     $cafeTree_Adaptor->store($tree);
00220 
00221     while (<$fh>) {
00222         last if $. == 10; # We skip several lines and go directly to the family information.
00223 # Is it always 10??
00224     }
00225 
00226     while (my $fam_line = <$fh>) {
00227         my @flds = split/\s+/, $fam_line;
00228         my $fam_id = $flds[0];
00229         my $fam_tree_str = $flds[1];
00230         my $avg_pvalue = $flds[2];
00231         my $pvalue_pairs = $flds[3];
00232 
00233         #print "FAM_PVALUE:$avg_pvalue VS PVALUE_LIM:$pvalue_lim\n";
00234 
00235         next if ($avg_pvalue >= $pvalue_lim);
00236         print STDERR "FAM_ID:$fam_id\n";
00237 
00238         my $fam_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($fam_tree_str . ";");
00239 
00240         my %info_by_nodes;
00241         for my $node (@{$fam_tree->get_all_nodes()}) {
00242             my $name = $node->name();
00243             my ($n_members) = $name =~ /_(\d+)/;
00244             $name =~ s/_\d+//;
00245             $name =~ s/\./_/g;
00246             $info_by_nodes{$name}{'gene_tree_root_id'} = $fam_id;
00247             $info_by_nodes{$name}{'n_members'} = $n_members;
00248 
00249             my $taxon_id;
00250             if (! $node->is_leaf()) {
00251                 $taxon_id = $name;
00252             } else {
00253                 my $genomeDB = $genomeDB_Adaptor->_fetch_by_name($name);
00254                 $taxon_id = $genomeDB->taxon_id();
00255             }
00256 
00257             $info_by_nodes{$name}{'taxon_id'} = $taxon_id;
00258         }
00259 
00260         $pvalue_pairs =~ tr/(/[/;
00261         $pvalue_pairs =~ tr/)/]/;
00262         $pvalue_pairs = eval $pvalue_pairs;
00263 
00264         die "Problem processing the $pvalue_pairs\n" if (ref $pvalue_pairs ne "ARRAY");
00265 
00266         for (my $i=0; $i<scalar(@$pvalue_pairs); $i++) {
00267             my ($val_fst, $val_snd) = @{$pvalue_pairs->[$i]};
00268             my ($id_fst, $id_snd) = @{$format_pairs_nodeIDs[$i]};
00269             my $name1 = $idsTree->find_node_by_node_id($id_fst)->name();
00270             my $name2 = $idsTree->find_node_by_node_id($id_snd)->name();
00271             $name1 =~ s/\./_/g;
00272             $name2 =~ s/\./_/g;
00273 
00274             $info_by_nodes{$name1}{'p_value'} = $val_fst;
00275             $info_by_nodes{$name2}{'p_value'} = $val_snd;
00276 
00277         }
00278 
00279         $tree->print_tree(0.2) if ($self->debug());
00280 
00281         # We store the attributes
00282 #        $tree->store_tag("p_value", $avg_pvalue);
00283         for my $node (@{$tree->get_all_nodes()}) {
00284             my $n = $node->name();
00285             $n =~ s/\./_/g;
00286 
00287             my $fam_id = $info_by_nodes{$n}{gene_tree_root_id};
00288 #            $node->store_tag("gene_tree_root_id", $fam_id);
00289 
00290             my $taxon_id = $info_by_nodes{$n}{taxon_id};
00291 #            $node->store_tag("taxon_id", $taxon_id);
00292 
00293             my $n_members = $info_by_nodes{$n}{n_members};
00294 #            $node->store_tag("n_members", $n_members);
00295 
00296             my $p_value = $info_by_nodes{$n}{p_value};
00297 #            $node->store_tag("p_value", $p_value);
00298             $cafeTree_Adaptor->store_tagvalues($node, $fam_id, $taxon_id, $n_members, $p_value, $avg_pvalue);
00299         }
00300 #         print STDERR "INFO BY NODES:\n";
00301 #         print STDERR Dumper \%info_by_nodes;
00302 #         print STDERR Dumper \%info_by_nodes if ($self->debug);
00303 
00304     }
00305     return
00306 }
00307 
00308 # sub store_cafe_tree {
00309 #     my ($self) = @_;
00310 
00311 #     my $cafe_tree = new Bio::EnsEMBL::Compara::CAFETreeNode;
00312 # }
00313 
00314 # Not used anymore for now
00315 # sub store_expansion_contraction {
00316 #     my ($self) = @_;
00317 #     my $cafe_out_file = $self->param('cafe_out_file');
00318 #     my $nctree_Adaptor = $self->param('nctree_Adaptor');
00319 
00320 #     open my $fh, "<", $cafe_out_file.".cafe" or die $!;
00321 # #     my $tree_line = <$fh>;
00322 # #     my $lambda_line = <$fh>;
00323 # #     my $ids_line = <$fh>;
00324 
00325 # ## WARNI1NG: if the lambda tree is provided, 1 more line in the output file will be present.
00326 
00327 #     while (my $fam_line = <$fh>) {
00328 #         if ($fam_line =~ /^Lambda:\s(\d+\.\d+)/) {
00329 #             $self->param('lambda', $1);
00330 #             next;
00331 #         }
00332 #         next unless $fam_line =~ /^\d+/;
00333 #         chomp $fam_line;
00334 #         my @flds = split /\s+/, $fam_line;
00335 #         my ($node_id, $avg_expansion) = @flds[0,2];
00336 #         my $nc_tree = $nctree_Adaptor->fetch_node_by_node_id($node_id);
00337 #         $nc_tree->store_tag('average_expansion', $avg_expansion);
00338 #     }
00339 
00340 #     return;
00341 # }
00342 
00343 1;