Archive Ensembl HomeArchive Ensembl Home
ImportAlignment.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::ImportAlignment
00022 
00023 =head1 SYNOPSIS
00024 
00025 =head1 DESCRIPTION
00026 
00027 This module imports a specified alignment. This is used in the low coverage genome alignment pipeline for importing the high coverage alignment which is used to build the low coverage genomes on.
00028 
00029 =head1 APPENDIX
00030 
00031 The rest of the documentation details each of the object methods. 
00032 Internal methods are usually preceded with a _
00033 
00034 =cut
00035 
00036 package Bio::EnsEMBL::Compara::RunnableDB::EpoLowCoverage::ImportAlignment;
00037 
00038 use strict;
00039 use Bio::EnsEMBL::Registry;
00040 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00041 use Bio::EnsEMBL::Utils::Exception qw(throw);
00042 
00043 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00044 
00045 
00046 =head2 fetch_input
00047 
00048     Title   :   fetch_input
00049     Usage   :   $self->fetch_input
00050     Function:   Fetches input data for gerp from the database
00051     Returns :   none
00052     Args    :   none
00053 
00054 =cut
00055 
00056 sub fetch_input {
00057   my( $self) = @_;
00058 
00059   #create a Compara::DBAdaptor which shares the same DBI handle
00060   #with $self->db (Hive DBAdaptor)
00061   $self->compara_dba->dbc->disconnect_when_inactive(0);
00062 
00063   my $reg = "Bio::EnsEMBL::Registry";
00064   $reg->load_registry_from_url($self->param('from_db_url'));
00065   
00066 }
00067 
00068 =head2 run
00069 
00070     Title   :   run
00071     Usage   :   $self->run
00072     Function:   Run gerp
00073     Returns :   none
00074     Args    :   none
00075 
00076 =cut
00077 
00078 sub run {
00079     my $self = shift;
00080 
00081     #Quick and dirty import, assuming the 2 databases are on the same server. Useful for debugging
00082     if ($self->param('quick')) {
00083     $self->importAlignment_quick();
00084     } else {
00085     $self->importAlignment();
00086     }
00087 
00088 }
00089 
00090 =head2 write_output
00091 
00092     Title   :   write_output
00093     Usage   :   $self->write_output
00094     Function:   Write results to the database
00095     Returns :   1
00096     Args    :   none
00097 
00098 =cut
00099 
00100 sub write_output {
00101     my ($self) = @_;
00102 
00103     return 1;
00104 }
00105 
00106 #Uses copy_data method from copy_data.pl script
00107 sub importAlignment {
00108     my $self = shift;
00109 
00110     #if the database name is defined in the url, then open that
00111     if ($self->param('from_db_url') =~ /mysql:\/\/.*@.*\/.+/) {
00112     $self->param('from_comparaDBA', new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor(-url=>$self->param('from_db_url')));
00113     } else {
00114     #open the most recent compara database
00115     $self->param('from_comparaDBA', Bio::EnsEMBL::Registry->get_DBAdaptor("Multi", "compara"));
00116     }
00117     my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("import_alignment");
00118     
00119     my $dbname = $self->param('from_comparaDBA')->dbc->dbname;
00120     my $analysis_id = $analysis->dbID;
00121     my $mlss_id = $self->param('method_link_species_set_id');
00122 
00123     ##Find min and max of the relevant internal IDs in the FROM database
00124     my $sth = $self->param('from_comparaDBA')->dbc->prepare("SELECT
00125         MIN(gab.genomic_align_block_id), MAX(gab.genomic_align_block_id),
00126         MIN(ga.genomic_align_id), MAX(ga.genomic_align_id),
00127         MIN(gat.node_id), MAX(gat.node_id),
00128         MIN(gat.root_id), MAX(gat.root_id)
00129       FROM genomic_align_block gab
00130         LEFT JOIN genomic_align ga using (genomic_align_block_id)
00131     LEFT JOIN genomic_align_tree gat ON gat.node_id = ga.node_id
00132       WHERE
00133         gab.method_link_species_set_id = ?");
00134     
00135     $sth->execute($mlss_id);
00136     my ($min_gab, $max_gab, $min_ga, $max_ga, $min_node_id, $max_node_id, 
00137     $min_root_id, $max_root_id) =
00138       $sth->fetchrow_array();
00139     
00140     $sth->finish();
00141 
00142     #HACK to just copy over one chr (22) for testing purposes
00143     #my $dnafrag_id = 905407;
00144     my $dnafrag_id;
00145 
00146     #Copy the method_link_species_set
00147     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00148           "method_link_species_set",
00149           undef, undef, undef,
00150           "SELECT * FROM method_link_species_set WHERE method_link_species_set_id = $mlss_id");
00151 
00152     #Copy the species_set
00153     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00154           "species_set",
00155           undef, undef, undef,
00156           "SELECT species_set.* FROM species_set JOIN method_link_species_set USING (species_set_id) WHERE method_link_species_set_id = $mlss_id");
00157 
00158     #copy genomic_align_block table
00159     if ($dnafrag_id) {
00160     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00161           "genomic_align_block",
00162           "genomic_align_block_id",
00163           $min_gab, $max_gab,
00164           "SELECT gab.* FROM genomic_align_block gab LEFT JOIN genomic_align ga USING (genomic_align_block_id) WHERE ga.method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00165     } else {
00166     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00167           "genomic_align_block",
00168           "genomic_align_block_id",
00169           $min_gab, $max_gab,
00170           "SELECT * FROM genomic_align_block WHERE method_link_species_set_id = $mlss_id");
00171     }
00172 
00173     #copy genomic_align table
00174     if ($dnafrag_id) {
00175     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00176           "genomic_align",
00177           "genomic_align_id",
00178           $min_ga, $max_ga,
00179           "SELECT ga.*".
00180           " FROM genomic_align ga ".
00181           " WHERE method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00182 
00183     } else {
00184 #   copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00185 #         "genomic_align",
00186 #         "genomic_align_id",
00187 #         $min_ga, $max_ga,
00188 #         "SELECT *".
00189 #         " FROM genomic_align".
00190 #         " WHERE method_link_species_set_id = $mlss_id");
00191 
00192     #Don't copy over ancestral genomic_aligns 
00193     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00194           "genomic_align",
00195           "genomic_align_id",
00196           $min_ga, $max_ga,
00197           "SELECT genomic_align.*".
00198           " FROM genomic_align JOIN dnafrag USING (dnafrag_id)".
00199           " WHERE method_link_species_set_id = $mlss_id AND genome_db_id != 63");
00200     }
00201     #copy genomic_align_tree table
00202     if ($dnafrag_id) {
00203     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00204           "genomic_align_tree",
00205           "root_id",
00206           $min_root_id, $max_root_id,
00207           "SELECT gat.*".
00208           " FROM genomic_align_tree gat LEFT JOIN genomic_align_group USING (node_id)".
00209           " LEFT JOIN genomic_align USING (genomic_align_id)".
00210           " WHERE node_id IS NOT NULL AND method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00211 
00212     } else {
00213     copy_data($self->param('from_comparaDBA'), $self->compara_dba,
00214           "genomic_align_tree",
00215           "root_id",
00216           $min_root_id, $max_root_id,
00217           "SELECT gat.*".
00218           " FROM genomic_align ga".
00219           " JOIN dnafrag USING (dnafrag_id)".
00220           " LEFT JOIN genomic_align_group gag USING (genomic_align_id)".
00221           " LEFT JOIN genomic_align_tree gat USING (node_id) WHERE gag.node_id IS NOT NULL AND ga.method_link_species_set_id = $mlss_id AND genome_db_id != 63");
00222     }
00223 }
00224 
00225 
00226 =head2 copy_data
00227 
00228   Arg[1]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $from_dba
00229   Arg[2]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $to_dba
00230   Arg[3]      : Bio::EnsEMBL::Compara::MethodLinkSpeciesSet $this_mlss
00231   Arg[4]      : string $table
00232   Arg[5]      : string $sql_query
00233 
00234   Description : copy data in this table using this SQL query.
00235   Returns     :
00236   Exceptions  : throw if argument test fails
00237 
00238 =cut
00239 
00240 sub copy_data {
00241   my ($from_dba, $to_dba, $table_name, $index_name, $min_id, $max_id, $query) = @_;
00242 
00243   print "Copying data in table $table_name\n";
00244 
00245   my $sth = $from_dba->dbc->db_handle->column_info($from_dba->dbc->dbname, undef, $table_name, '%');
00246   $sth->execute;
00247   my $all_rows = $sth->fetchall_arrayref;
00248   my $binary_mode = 0;
00249   foreach my $this_col (@$all_rows) {
00250     if (($this_col->[5] eq "BINARY") or ($this_col->[5] eq "VARBINARY") or
00251         ($this_col->[5] eq "BLOB") or ($this_col->[5] eq "BIT")) {
00252       $binary_mode = 1;
00253       last;
00254     }
00255   }
00256   #speed up writing of data by disabling keys, write the data, then enable 
00257   $to_dba->dbc->do("ALTER TABLE `$table_name` DISABLE KEYS");
00258   if ($binary_mode) {
00259     #copy_data_in_binary_mode($from_dba, $to_dba, $table_name, $query);
00260   } else {
00261     copy_data_in_text_mode($from_dba, $to_dba, $table_name, $index_name, $min_id, $max_id, $query);
00262   }
00263   $to_dba->dbc->do("ALTER TABLE `$table_name` ENABLE KEYS");
00264 }
00265 
00266 
00267 =head2 copy_data_in_text_mode
00268 
00269   Arg[1]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $from_dba
00270   Arg[2]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $to_dba
00271   Arg[3]      : Bio::EnsEMBL::Compara::MethodLinkSpeciesSet $this_mlss
00272   Arg[4]      : string $table
00273   Arg[5]      : string $sql_query
00274 
00275   Description : copy data in this table using this SQL query.
00276   Returns     :
00277   Exceptions  : throw if argument test fails
00278 
00279 =cut
00280 
00281 sub copy_data_in_text_mode {
00282   my ($from_dba, $to_dba, $table_name, $index_name, $min_id, $max_id, $query) = @_;
00283 
00284   my $user = $to_dba->dbc->username;
00285   my $pass = $to_dba->dbc->password;
00286   my $host = $to_dba->dbc->host;
00287   my $port = $to_dba->dbc->port;
00288   my $dbname = $to_dba->dbc->dbname;
00289   my $use_limit = 0;
00290   my $start = $min_id;
00291 #  my $step = 100000;
00292   my $step = 10000;
00293 
00294   #If not using BETWEEN, revert back to LIMIT
00295   if (!defined $index_name && !defined $min_id && !defined $max_id) {
00296       $use_limit = 1;
00297       $start = 0;
00298   }
00299 
00300   while (1) {
00301     my $end = $start + $step - 1;
00302     my $sth;
00303     
00304     if (!$use_limit) {
00305     $sth = $from_dba->dbc->prepare($query." AND $index_name BETWEEN $start AND $end");
00306     } else {
00307     $sth = $from_dba->dbc->prepare($query." LIMIT $start, $step");
00308     }
00309     $start += $step;
00310     $sth->execute();
00311     my $all_rows = $sth->fetchall_arrayref;
00312     ## EXIT CONDITION
00313     return if (!@$all_rows);
00314   
00315     my $filename = "/tmp/$table_name.copy_data.$$.txt";
00316     open(TEMP, ">$filename") or die;
00317     foreach my $this_row (@$all_rows) {
00318       print TEMP join("\t", map {defined($_)?$_:'\N'} @$this_row), "\n";
00319     }
00320     close(TEMP);
00321     if ($pass) {
00322     unless (system("mysqlimport", "-u$user", "-p$pass", "-h$host", "-P$port", "-L", "-l", "-i", $dbname, $filename) == 0) {
00323         throw("Failed mysqlimport -u$user -p$pass -h$host -P$port -L -l -i $dbname $filename");
00324     }
00325     } else {
00326     unless (system("mysqlimport", "-u$user", "-h$host", "-P$port", "-L", "-l", "-i", $dbname, $filename) ==0) {
00327         throw("Failed mysqlimport -u$user -h$host -P$port -L -l -i $dbname $filename");
00328     }
00329     }
00330     unlink("$filename");
00331   }
00332 }
00333 
00334 #Assumes the from and to databases are on the same server and downloads all entries from genomic_align_block, genomic_align,
00335 #genomic_align_group and genomic_align_tree
00336 sub importAlignment_quick {
00337     my $self = shift;
00338 
00339     #if the database name is defined in the url, then open that
00340     if ($self->param('from_db_url') =~ /mysql:\/\/.*@.*\/.+/) {
00341     $self->param('from_comparaDBA', new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor(-url=>$self->param('from_db_url')));
00342     } else {
00343     #open the most recent compara database
00344     $self->param('from_comparaDBA', Bio::EnsEMBL::Registry->get_DBAdaptor("Multi", "compara"));
00345     }
00346     my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("import_alignment");
00347     
00348     my $dbname = $self->param('from_comparaDBA')->dbc->dbname;
00349     my $analysis_id = $analysis->dbID;
00350     my $mlss_id = $self->param('method_link_species_set_id');
00351 
00352     #my $sql = "INSERT INTO genomic_align_block SELECT * FROM ?.genomic_align_block WHERE method_link_species_set_id = ?\n";
00353     my $sql = "INSERT INTO genomic_align_block SELECT * FROM $dbname.genomic_align_block\n";
00354 
00355     my $sth = $self->compara_dba->dbc->prepare($sql);
00356     $sth->execute();
00357     #$sth->execute($dbname, $mlss_id);
00358     $sth->finish();
00359 
00360      #$sql = "INSERT INTO genomic_align SELECT genomic_align.* FROM ?.genomic_align LEFT JOIN WHERE method_link_species_set_id = ?\n";
00361     $sql = "INSERT INTO genomic_align SELECT * FROM $dbname.genomic_align\n";
00362     my $sth = $self->compara_dba->dbc->prepare($sql);
00363     $sth->execute();
00364     #$sth->execute($dbname, $mlss_id);
00365     $sth->finish();
00366 
00367     #$sql = "INSERT INTO genomic_align_tree SELECT genomic_align_tree.* FROM ?.genomic_align_tree LEFT JOIN ?.genomic_align_group USING (node_id) LEFT JOIN ?.genomic_align USING (genomic_align_id) LEFT JOIN ?.genomic_align_block WHERE genomic_align_block.method_link_species_set_id = ?\n";
00368     $sql = "INSERT INTO genomic_align_tree SELECT * FROM $dbname.genomic_align_tree\n";
00369     my $sth = $self->compara_dba->dbc->prepare($sql);
00370 
00371     #$sth->execute($dbname, $dbname, $dbname, $dbname, $mlss_id);
00372     $sth->execute();
00373     $sth->finish();
00374 
00375 }
00376 
00377 1;