Archive Ensembl HomeArchive Ensembl Home
ImportAlignment.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::ImportAlignment
00011 
00012 =head1 SYNOPSIS
00013 
00014 
00015 =head1 DESCRIPTION
00016 
00017 This module imports a specified alignment. This is used in the low coverage genome alignment pipeline for importing the high coverage alignment which is used to build the low coverage genomes on.
00018 
00019 =head1 PARAMETERS
00020 
00021 =head1 CONTACT
00022 
00023 Post questions to the Ensembl development list: dev@ensembl.org
00024 
00025 
00026 =head1 APPENDIX
00027 
00028 The rest of the documentation details each of the object methods. 
00029 Internal methods are usually preceded with a _
00030 
00031 =cut
00032 
00033 package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::ImportAlignment;
00034 
00035 use strict;
00036 use Bio::EnsEMBL::Registry;
00037 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00038 use Bio::EnsEMBL::Utils::Exception qw(throw);
00039 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00040 
00041 use Bio::EnsEMBL::Hive::Process;
00042 
00043 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00044 
00045 
00046 =head2 fetch_input
00047 
00048     Title   :   fetch_input
00049     Usage   :   $self->fetch_input
00050     Function:   Fetches input data for gerp from the database
00051     Returns :   none
00052     Args    :   none
00053 
00054 =cut
00055 
00056 sub fetch_input {
00057   my( $self) = @_;
00058 
00059   #create a Compara::DBAdaptor which shares the same DBI handle
00060   #with $self->db (Hive DBAdaptor)
00061   $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
00062   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00063 
00064    $self->{'hiveDBA'} = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(-DBCONN => $self->{'comparaDBA'}->dbc);
00065 
00066   #read from analysis table
00067   $self->get_params($self->parameters); 
00068 
00069   #read from analysis_job table
00070   $self->get_params($self->input_id);
00071 
00072   my $reg = "Bio::EnsEMBL::Registry";
00073   $reg->load_registry_from_url($self->from_db_url);
00074   
00075 }
00076 
00077 =head2 run
00078 
00079     Title   :   run
00080     Usage   :   $self->run
00081     Function:   Run gerp
00082     Returns :   none
00083     Args    :   none
00084 
00085 =cut
00086 
00087 sub run {
00088     my $self = shift;
00089     $self->importAlignment();
00090     
00091 
00092 }
00093 
00094 =head2 write_output
00095 
00096     Title   :   write_output
00097     Usage   :   $self->write_output
00098     Function:   Write results to the database
00099     Returns :   1
00100     Args    :   none
00101 
00102 =cut
00103 
00104 sub write_output {
00105     my ($self) = @_;
00106 
00107     return 1;
00108 }
00109 
00110 #Uses copy_data method from copy_data.pl script
00111 sub importAlignment {
00112     my $self = shift;
00113 
00114     #if the database name is defined in the url, then open that
00115     if ($self->from_db_url =~ /mysql:\/\/.*@.*\/.+/) {
00116     $self->{'from_comparaDBA'} = new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor(-url=>$self->from_db_url);
00117     } else {
00118     #open the most recent compara database
00119     $self->{'from_comparaDBA'} = Bio::EnsEMBL::Registry->get_DBAdaptor("Multi", "compara");
00120     }
00121     my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("ImportAlignment");
00122     
00123     my $dbname = $self->{'from_comparaDBA'}->dbc->dbname;
00124     my $analysis_id = $analysis->dbID;
00125     my $mlss_id = $self->method_link_species_set_id;
00126 
00127      ##Find min and max of the relevant internal IDs in the FROM database
00128     my $sth = $self->{'from_comparaDBA'}->dbc->prepare("SELECT
00129         MIN(gab.genomic_align_block_id), MAX(gab.genomic_align_block_id),
00130         MIN(ga.genomic_align_id), MAX(ga.genomic_align_id),
00131         MIN(gag.node_id), MAX(gag.node_id),
00132         MIN(gat.root_id), MAX(gat.root_id)
00133       FROM genomic_align_block gab
00134         LEFT JOIN genomic_align ga using (genomic_align_block_id)
00135         LEFT JOIN genomic_align_group gag using (genomic_align_id)
00136     LEFT JOIN genomic_align_tree gat ON gat.node_id = gag.node_id
00137       WHERE
00138         gab.method_link_species_set_id = ?");
00139     
00140     $sth->execute($mlss_id);
00141     my ($min_gab, $max_gab, $min_ga, $max_ga, $min_gag, $max_gag, 
00142     $min_root_id, $max_root_id) =
00143       $sth->fetchrow_array();
00144     
00145     $sth->finish();
00146 
00147     #HACK to just copy over one chr (22) for testing purposes
00148     #my $dnafrag_id = 905407;
00149     my $dnafrag_id;
00150 
00151     #copy genomic_align_block table
00152     if ($dnafrag_id) {
00153     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00154           "genomic_align_block",
00155           "genomic_align_block_id",
00156           $min_gab, $max_gab,
00157           "SELECT gab.* FROM genomic_align_block gab LEFT JOIN genomic_align ga USING (genomic_align_block_id) WHERE ga.method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00158     } else {
00159     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00160           "genomic_align_block",
00161           "genomic_align_block_id",
00162           $min_gab, $max_gab,
00163           "SELECT * FROM genomic_align_block WHERE method_link_species_set_id = $mlss_id");
00164     }
00165 
00166     #copy genomic_align table
00167     if ($dnafrag_id) {
00168     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00169           "genomic_align",
00170           "genomic_align_id",
00171           $min_ga, $max_ga,
00172           "SELECT ga.*".
00173           " FROM genomic_align ga ".
00174           " WHERE method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00175 
00176     } else {
00177     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00178           "genomic_align",
00179           "genomic_align_id",
00180           $min_ga, $max_ga,
00181           "SELECT *".
00182           " FROM genomic_align".
00183           " WHERE method_link_species_set_id = $mlss_id");
00184     }
00185     #copy genomic_align_group table
00186     if ($dnafrag_id) {
00187     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00188           "genomic_align_group",
00189           "gag.node_id", 
00190           $min_gag, $max_gag,
00191           "SELECT gag.*".
00192           " FROM genomic_align_group gag LEFT JOIN genomic_align USING (genomic_align_id)".
00193           " WHERE gag.node_id IS NOT NULL AND method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00194     } else {
00195     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00196           "genomic_align_group",
00197           "gag.node_id", 
00198           $min_gag, $max_gag,
00199           "SELECT gag.*".
00200           " FROM genomic_align ga ".
00201           " LEFT JOIN genomic_align_group gag USING (genomic_align_id)".
00202           " WHERE gag.node_id IS NOT NULL AND ga.method_link_species_set_id = $mlss_id");
00203     }
00204     #copy genomic_align_tree table
00205     if ($dnafrag_id) {
00206     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00207           "genomic_align_tree",
00208           "root_id",
00209           $min_root_id, $max_root_id,
00210           "SELECT gat.*".
00211           " FROM genomic_align_tree gat LEFT JOIN genomic_align_group USING (node_id)".
00212           " LEFT JOIN genomic_align USING (genomic_align_id)".
00213           " WHERE node_id IS NOT NULL AND method_link_species_set_id = $mlss_id AND dnafrag_id=$dnafrag_id");
00214 
00215     } else {
00216     copy_data($self->{'from_comparaDBA'}, $self->{'comparaDBA'},
00217           "genomic_align_tree",
00218           "root_id",
00219           $min_root_id, $max_root_id,
00220           "SELECT gat.*".
00221           " FROM genomic_align ga".
00222           " LEFT JOIN genomic_align_group gag USING (genomic_align_id)".
00223           " LEFT JOIN genomic_align_tree gat USING (node_id) WHERE gag.node_id IS NOT NULL AND ga.method_link_species_set_id = $mlss_id");
00224     }
00225 }
00226 
00227 
00228 =head2 copy_data
00229 
00230   Arg[1]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $from_dba
00231   Arg[2]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $to_dba
00232   Arg[3]      : Bio::EnsEMBL::Compara::MethodLinkSpeciesSet $this_mlss
00233   Arg[4]      : string $table
00234   Arg[5]      : string $sql_query
00235 
00236   Description : copy data in this table using this SQL query.
00237   Returns     :
00238   Exceptions  : throw if argument test fails
00239 
00240 =cut
00241 
00242 sub copy_data {
00243   my ($from_dba, $to_dba, $table_name, $index_name, $min_id, $max_id, $query) = @_;
00244 
00245   print "Copying data in table $table_name\n";
00246 
00247   my $sth = $from_dba->dbc->db_handle->column_info($from_dba->dbc->dbname, undef, $table_name, '%');
00248   $sth->execute;
00249   my $all_rows = $sth->fetchall_arrayref;
00250   my $binary_mode = 0;
00251   foreach my $this_col (@$all_rows) {
00252     if (($this_col->[5] eq "BINARY") or ($this_col->[5] eq "VARBINARY") or
00253         ($this_col->[5] eq "BLOB") or ($this_col->[5] eq "BIT")) {
00254       $binary_mode = 1;
00255       last;
00256     }
00257   }
00258   #speed up writing of data by disabling keys, write the data, then enable 
00259   $to_dba->dbc->do("ALTER TABLE `$table_name` DISABLE KEYS");
00260   if ($binary_mode) {
00261     #copy_data_in_binary_mode($from_dba, $to_dba, $table_name, $query);
00262   } else {
00263     copy_data_in_text_mode($from_dba, $to_dba, $table_name, $index_name, $min_id, $max_id, $query);
00264   }
00265   $to_dba->dbc->do("ALTER TABLE `$table_name` ENABLE KEYS");
00266 }
00267 
00268 
00269 =head2 copy_data_in_text_mode
00270 
00271   Arg[1]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $from_dba
00272   Arg[2]      : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor $to_dba
00273   Arg[3]      : Bio::EnsEMBL::Compara::MethodLinkSpeciesSet $this_mlss
00274   Arg[4]      : string $table
00275   Arg[5]      : string $sql_query
00276 
00277   Description : copy data in this table using this SQL query.
00278   Returns     :
00279   Exceptions  : throw if argument test fails
00280 
00281 =cut
00282 
00283 sub copy_data_in_text_mode {
00284   my ($from_dba, $to_dba, $table_name, $index_name, $min_id, $max_id, $query) = @_;
00285 
00286   my $user = $to_dba->dbc->username;
00287   my $pass = $to_dba->dbc->password;
00288   my $host = $to_dba->dbc->host;
00289   my $port = $to_dba->dbc->port;
00290   my $dbname = $to_dba->dbc->dbname;
00291   my $use_limit = 0;
00292   my $start = $min_id;
00293   my $step = 100000;
00294 
00295   #If not using BETWEEN, revert back to LIMIT
00296   if (!defined $index_name && !defined $min_id && !defined $max_id) {
00297       $use_limit = 1;
00298       $start = 0;
00299   }
00300 
00301 
00302   while (1) {
00303     my $end = $start + $step - 1;
00304     my $sth;
00305     
00306     if (!$use_limit) {
00307     $sth = $from_dba->dbc->prepare($query." AND $index_name BETWEEN $start AND $end");
00308     } else {
00309     $sth = $from_dba->dbc->prepare($query." LIMIT $start, $step");
00310     }
00311     $start += $step;
00312     $sth->execute();
00313     my $all_rows = $sth->fetchall_arrayref;
00314     ## EXIT CONDITION
00315     return if (!@$all_rows);
00316   
00317     my $filename = "/tmp/$table_name.copy_data.$$.txt";
00318     open(TEMP, ">$filename") or die;
00319     foreach my $this_row (@$all_rows) {
00320       print TEMP join("\t", map {defined($_)?$_:'\N'} @$this_row), "\n";
00321     }
00322     close(TEMP);
00323     if ($pass) {
00324     unless (system("mysqlimport", "-u$user", "-p$pass", "-h$host", "-P$port", "-L", "-l", "-i", $dbname, $filename) == 0) {
00325         throw("Failed mysqlimport -u$user -p$pass -h$host -P$port -L -l -i $dbname $filename");
00326     }
00327     } else {
00328     unless (system("mysqlimport", "-u$user", "-h$host", "-P$port", "-L", "-l", "-i", $dbname, $filename) ==0) {
00329         throw("Failed mysqlimport -u$user -h$host -P$port -L -l -i $dbname $filename");
00330     }
00331     }
00332     unlink("$filename");
00333   }
00334 }
00335 
00336 #this assumes the from and to databases are on the same server.
00337 sub importAlignment_old {
00338     my $self = shift;
00339 
00340     #if the database name is defined in the url, then open that
00341     if ($self->from_db_url =~ /mysql:\/\/.*@.*\/.+/) {
00342     $self->{'from_comparaDBA'} = new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor(-url=>$self->from_db_url);
00343     } else {
00344     #open the most recent compara database
00345     $self->{'from_comparaDBA'} = Bio::EnsEMBL::Registry->get_DBAdaptor("Multi", "compara");
00346     }
00347     my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("ImportAlignment");
00348     
00349     my $dbname = $self->{'from_comparaDBA'}->dbc->dbname;
00350     my $analysis_id = $analysis->dbID;
00351     my $mlss_id = $self->method_link_species_set_id;
00352 
00353     my $sql = "INSERT INTO genomic_align_block SELECT * FROM ?.genomic_align_block WHERE method_link_species_set_id = ?\n";
00354 
00355     my $sth = $self->{'comparaDBA'}->dbc->prepare($sql);
00356     $sth->execute($dbname, $mlss_id);
00357     $sth->finish();
00358 
00359      $sql = "INSERT INTO genomic_align SELECT genomic_align.* FROM ?.genomic_align LEFT JOIN WHERE method_link_species_set_id = ?\n";
00360     my $sth = $self->{'comparaDBA'}->dbc->prepare($sql);
00361     $sth->execute($dbname, $mlss_id);
00362     $sth->finish();
00363 
00364     $sql = "INSERT INTO genomic_align_group SELECT genomic_align_group.* FROM ?.genomic_align_group LEFT JOIN ?.genomic_align USING (genomic_align_id) LEFT JOIN ?.genomic_align_block USING (genomic_align_block_id) WHERE genomic_align_block.method_link_species_set_id = ?\n";
00365     my $sth = $self->{'comparaDBA'}->dbc->prepare($sql);
00366     $sth->execute($dbname, $dbname, $mlss_id);
00367     $sth->finish();
00368 
00369     $sql = "INSERT INTO genomic_align_tree SELECT genomic_align_tree.* FROM ?.genomic_align_tree LEFT JOIN ?.genomic_align_group USING (node_id) LEFT JOIN ?.genomic_align USING (genomic_align_id) LEFT JOIN ?.genomic_align_block WHERE genomic_align_block.method_link_species_set_id = ?\n";
00370     my $sth = $self->{'comparaDBA'}->dbc->prepare($sql);
00371     $sth->execute($dbname, $dbname, $dbname, $dbname, $mlss_id);
00372     $sth->finish();
00373 
00374 }
00375 
00376 ##########################################
00377 #
00378 # getter/setter methods
00379 # 
00380 ##########################################
00381 
00382 sub method_link_species_set_id {
00383   my $self = shift;
00384   $self->{'_method_link_species_set_id'} = shift if(@_);
00385   return $self->{'_method_link_species_set_id'};
00386 }
00387 
00388 sub from_db_url {
00389   my $self = shift;
00390   $self->{'_from_db_url'} = shift if(@_);
00391   return $self->{'_from_db_url'};
00392 }
00393 
00394 ##########################################
00395 #
00396 # internal methods
00397 #
00398 ##########################################
00399 
00400 sub get_params {
00401   my $self         = shift;
00402   my $param_string = shift;
00403 
00404   return unless($param_string);
00405   print("parsing parameter string : ",$param_string,"\n");
00406 
00407   my $params = eval($param_string);
00408   return unless($params);
00409 
00410   if(defined($params->{'method_link_species_set_id'})) {
00411     $self->method_link_species_set_id($params->{'method_link_species_set_id'});
00412   }
00413   if (defined($params->{'from_db_url'})) {
00414       $self->from_db_url($params->{'from_db_url'});
00415   }
00416   return 1;
00417 }