Archive Ensembl HomeArchive Ensembl Home
ParseEnredo.pm
Go to the documentation of this file.
00001 #ensembl module for bio::ensembl::compara::production::epoanchors::parseenredo
00002 # you may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 =head1 NAME
00006 
00007 Bio::EnsEMBL::Compara::Production::EPOanchors::ParseEnredo
00008 
00009 =head1 SYNOPSIS
00010 
00011 $self->fetch_input();
00012 $self->run();
00013 $self->write_output(); writes to database
00014 
00015 =head1 DESCRIPTION
00016 
00017 Module to set up the production database for generating multiple alignments usng Ortheus.
00018 
00019 
00020 =head1 AUTHOR - compara
00021 
00022 This modules is part of the Ensembl project http://www.ensembl.org
00023 
00024 Email dev@ensembl.org
00025 
00026 =head1 CONTACT
00027 
00028 This modules is part of the EnsEMBL project (http://www.ensembl.org)
00029 
00030 Questions can be posted to the ensembl-dev mailing list:
00031 dev@ensembl.org
00032 
00033 
00034 =head1 APPENDIX
00035 
00036 The rest of the documentation details each of the object methods. 
00037 Internal methods are usually preceded with a _
00038 
00039 =cut
00040 #
00041 package Bio::EnsEMBL::Compara::Production::EPOanchors::ParseEnredo;
00042 
00043 use strict;
00044 use Data::Dumper;
00045 use Bio::EnsEMBL::Registry;
00046 use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
00047 
00048 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00049 
00050 
00051 sub fetch_input {
00052     my ($self) = @_;
00053     my (%DF, %SEG, %Zero_st, @Zero_st, $dnafrag_region);
00054     open(IN, $self->param('enredo_out_file')) or die;
00055     {
00056         local $/ = "block";
00057         while(<IN>){
00058             next if /#/;
00059             $dnafrag_region++;
00060             foreach my $seg(split("\n", $_)){
00061                 next unless $seg=~/:/;
00062                 my($species,$chromosome,$start,$end,$strand) =
00063                 $seg=~/^([\w:]+):([^\:]+):(\d+):(\d+) \[(.*)\]/;
00064                 ($start,$end) = ($start+1,$end-1); # assuming anchors have been split and have a one base overlap
00065                 $DF{$species}++;
00066                 $Zero_st{$dnafrag_region}++ unless $strand; # catch the dnafrag_region_id if there is a least one zero strand
00067                 $SEG{$dnafrag_region}{ join(":", $species,$chromosome,$start,$end,$strand) }++; 
00068                 }
00069         }
00070     }
00071     $self->param('genome_dbs', [ keys %DF ]);
00072     $self->param('dnafrag_regions', \%SEG);
00073     foreach my $dnafrag_region_id(keys %Zero_st){   
00074         push(@Zero_st, { zero_st_dnafrag_region_id => $dnafrag_region_id });
00075     }
00076     $self->param('dfrs_with_zero_st', \@Zero_st);
00077 }
00078 
00079 sub write_output {
00080     my ($self) = @_;
00081     our $master_db;
00082     *master_db = \$self->param('compara_master');
00083     my $master_params = join(" ", "-u", $master_db->{'-user'}, "-h", $master_db->{'-host'}, $master_db->{'-dbname'});
00084     my $master_select = "mysql ".$master_params.' -NB -e"SELECT ';
00085     my $master_dump = "mysqldump -t ".$master_params;
00086 
00087     my $to_db = " | mysql -h".$self->dbc->host." -u".$self->dbc->username.
00088         " -p".$self->dbc->password." -D".$self->dbc->dbname." -P".$self->dbc->port;
00089     # sort the species names from the enredo output files
00090     my $ancestor_db = $self->param('ancestor_db');
00091     my $genome_dbs_names_from_file = join("','", sort {$a cmp $b} @{ $self->param('genome_dbs') }, $ancestor_db->{'-name'});
00092 
00093     my $genomeDB_where_clause = "assembly_default AND name IN ('".$genome_dbs_names_from_file."')";
00094     my $genomeDB_pipe = $master_dump." -w \"$genomeDB_where_clause\""." genome_db".$to_db;
00095     system($genomeDB_pipe);
00096     my $ss_cmd = $master_select.'species_set_id FROM method_link_species_set WHERE method_link_species_set_id ='.
00097         $self->param('ortheus_mlssid').'"'; # get the species_set_id from the method_link_species_set table
00098     my ($ss_id) = map{ chomp $_;$_ } `$ss_cmd`;
00099     my $species_set_where_clause = "species_set_id = ".$ss_id;
00100     my $species_set_pipe = $master_dump." -w \"$species_set_where_clause\""." species_set".$to_db;
00101     system($species_set_pipe);
00102     my $gdb_from_mlssid = $master_select.'GROUP_CONCAT(gdb.name) FROM genome_db gdb INNER JOIN species_set ss ON ss.genome_db_id='.
00103         'gdb.genome_db_id WHERE ss.species_set_id='.$ss_id.'"';
00104 
00105     my ($gdb_names_from_master) = map{ chop $_;$_ } `$gdb_from_mlssid`; 
00106     $genome_dbs_names_from_file=~s/\'//g;
00107     my (%gdb_names_from_master, $species_number_from_file);
00108     foreach my $master_gdb(split(",", $gdb_names_from_master)){
00109         $gdb_names_from_master{ $master_gdb }++;
00110     }
00111     foreach my $gdb_from_file(split(",", $genome_dbs_names_from_file)){
00112         die "species $gdb_from_file from enredo file not in species_set from mlssid", $! 
00113             unless(exists($gdb_names_from_master{$gdb_from_file}) || $gdb_from_file eq $ancestor_db->{'-name'});
00114         $species_number_from_file++;
00115     }
00116     warn "WARNING : species numbers from enredo file are different from db mlssid ". $self->param('ortheus_mlssid')
00117         if( $species_number_from_file <=> (keys %gdb_names_from_master) + 1);
00118     # create the ancestral core db
00119     my $ancestral_create = "mysql -u" . $ancestor_db->{'-user'} . " -h" . $ancestor_db->{'-host'} . " -p" . $ancestor_db->{'-pass'};
00120     system($ancestral_create . " -e" . "\"CREATE DATABASE " . $ancestor_db->{'-dbname'} . "\"");
00121     system($ancestral_create . " " . $ancestor_db->{'-dbname'} . " < " . $self->param('core_cvs_sql_schema'));  
00122     # set the locator field in the genome_db table
00123     $self->set_gdb_locator($genome_dbs_names_from_file);
00124     $genome_dbs_names_from_file=~s/,/','/g;
00125     my $quoted_gdb_names = '\''.$genome_dbs_names_from_file.'\'';
00126     my $genome_db_ids_from_names = $master_select.'GROUP_CONCAT(genome_db_id) FROM genome_db WHERE assembly_default AND name IN ('.$quoted_gdb_names.')"';
00127     my($gdb_ids) = map{ chomp $_;$_ } `$genome_db_ids_from_names`;
00128     # populate the method_link, method_link_species_set and dnafrag tables
00129     my $dnafrag_where_clause = "genome_db_id in (".$gdb_ids.")";
00130     my $dnafrag_pipe = $master_dump." -w \"$dnafrag_where_clause\""." dnafrag".$to_db;
00131     system($dnafrag_pipe);
00132     my $method_link_pipe = $master_dump." method_link".$to_db;
00133     system($method_link_pipe);
00134     my $mlss_clause = "method_link_species_set_id=".$self->param('ortheus_mlssid');
00135     my $mlss_pipe = $master_dump." -w \"$mlss_clause\""." method_link_species_set".$to_db;
00136     system($mlss_pipe);
00137     my $enredo_name = "ENREDO";
00138     my $sth1 = $self->dbc->prepare("SELECT method_link_id FROM method_link WHERE type = \"$enredo_name\"");
00139     $sth1->execute();
00140     my $sth2 = $self->dbc->prepare("REPLACE INTO method_link_species_set (method_link_id, species_set_id, name) VALUES (" . 
00141         $sth1->fetchrow_arrayref->[0] . ", $ss_id, \"$enredo_name\")");
00142     $sth2->execute();
00143     my (%DNAFRAGS, @synteny_region_ids);
00144     # populate dnafrag_region table
00145     my $sth3 = $self->dbc->prepare("INSERT INTO dnafrag_region VALUES (?,?,?,?,?)");
00146     my $sth4 = $self->dbc->prepare("INSERT INTO synteny_region VALUES (?,?)");
00147     foreach my $synteny_region_id(sort {$a <=> $b} keys %{ $self->param('dnafrag_regions') }){
00148         $sth4->execute($synteny_region_id,$self->param('ortheus_mlssid'));
00149         foreach my $dnafrag_region(keys %{ $self->param('dnafrag_regions')->{$synteny_region_id} }){
00150             my($species_name,$dnafrag_name,$start,$end,$strand)=split(":", $dnafrag_region);
00151             # get only the dnafrags used by enredo
00152             unless (exists($DNAFRAGS{$species_name}{$dnafrag_name})) {
00153                 my $dnaf_sth = $self->dbc->prepare("SELECT df.dnafrag_id FROM dnafrag df INNER JOIN genome_db " . 
00154                     "gdb ON gdb.genome_db_id = df.genome_db_id WHERE gdb.name = ? AND df.name = ?");
00155                 $dnaf_sth->execute($species_name,$dnafrag_name);
00156                 while(my $dnafrag_info = $dnaf_sth->fetchrow_arrayref()) {
00157                     $DNAFRAGS{ $species_name }{ $dnafrag_name } = $dnafrag_info->[0];
00158                 }
00159             }
00160             $sth3->execute($synteny_region_id,$DNAFRAGS{$species_name}{$dnafrag_name},$start,$end,$strand);     
00161         }
00162         push(@synteny_region_ids, {synteny_region_id => $synteny_region_id})
00163     }
00164     # add the MTs to the dnafrag_region table
00165     if($self->param('addMT')) {
00166         my $max_synteny_region_id = $synteny_region_ids[-1]->{'synteny_region_id'} + 1;
00167         $sth4->execute($max_synteny_region_id, $self->param('ortheus_mlssid'));
00168         my $sth_mt = $self->dbc->prepare("SELECT dnafrag_id, length FROM dnafrag WHERE name =\"MT\"");
00169         $sth_mt->execute;
00170         foreach my $dnafrag_region ( @{ $sth_mt->fetchall_arrayref } ) {
00171             $sth3->execute($max_synteny_region_id, $dnafrag_region->[0], 1, $dnafrag_region->[1], 1);
00172         }
00173         push(@synteny_region_ids, {synteny_region_id => $max_synteny_region_id});
00174     }
00175     # add the species_tree
00176     my $compara_master = new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor(%$master_db);
00177     my $species_tree = $compara_master->get_SpeciesTreeAdaptor()->create_species_tree( -species_set_id => $ss_id );
00178     my $newick_tree = lc( $species_tree->newick_format() );
00179     $newick_tree=~s/ /_/g;
00180     my $meta_sth = $self->dbc->prepare("REPLACE INTO meta (meta_key, meta_value) VALUES (?,?)");
00181     $meta_sth->execute("tree_" . $self->param('ortheus_mlssid'), "$newick_tree");
00182     $self->dataflow_output_id( $self->param('dfrs_with_zero_st'), 1 );
00183     $self->dataflow_output_id( \@synteny_region_ids, 2 );
00184 }
00185 
00186 sub set_gdb_locator { # fill in the locator field in the genome_db table
00187     my $self = shift;
00188     my $ancestor_db = $self->param('ancestor_db');
00189     my $genome_db_names = shift;
00190     Bio::EnsEMBL::Registry->load_registry_from_multiple_dbs( $self->param('main_core_dbs') );
00191     my @dbas = @{ Bio::EnsEMBL::Registry->get_all_DBAdaptors() };
00192     foreach my $genome_db_name(split(",", $genome_db_names), $ancestor_db->{'-name'}){
00193         my $core_genome_db_name = $genome_db_name . "_core_";
00194         my($host,$port,$db_name,$user,$pass,$locator_string);
00195         foreach my $dba(@dbas){
00196             if($dba->dbc->dbname=~m/$core_genome_db_name/) {
00197                 $host = $dba->dbc->host;
00198                 $port = $dba->dbc->port;
00199                 $db_name = $dba->dbc->dbname;
00200                 $user = $dba->dbc->username;
00201                 $locator_string = "Bio::EnsEMBL::DBSQL::DBAdaptor/host=$host;port=$port;user=$user;".
00202                     "dbname=$db_name;species=$genome_db_name;disconnect_when_inactive=1";
00203                 last;
00204             } 
00205         } 
00206         if ($ancestor_db->{'-dbname'}=~m/$core_genome_db_name/){
00207             $host = $ancestor_db->{'-host'};
00208             $port = $ancestor_db->{'-port'};
00209             $db_name = $ancestor_db->{'-dbname'};
00210             $user = $ancestor_db->{'-user'};
00211             $pass = $ancestor_db->{'-pass'};
00212             $locator_string = "Bio::EnsEMBL::DBSQL::DBAdaptor/host=$host;port=$port;user=$user;pass=$pass;". 
00213                 "dbname=$db_name;species=$genome_db_name;disconnect_when_inactive=1";
00214         }
00215         my $sth = $self->dbc->prepare("UPDATE genome_db SET locator=\"$locator_string\" WHERE name = \"$genome_db_name\"");
00216         $sth->execute;
00217     }
00218 }
00219 
00220 1;
00221