Archive Ensembl HomeArchive Ensembl Home
UcscToEnsemblMapping.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::PairAligner::UcscToEnsemblMapping
00022 
00023 =head1 SYNOPSIS
00024 
00025 
00026 =head1 DESCRIPTION
00027 
00028 
00029 Convert UCSC names to ensembl names (reference only chromosomes and supercontigs, ie no haplotypes)
00030 First check the names using chromInfo.txt and then go to mapping file if necessary eg ctgPos.txt for human
00031 Download from:
00032 http://hgdownload.cse.ucsc.edu/downloads.html
00033 Choose species
00034 Choose Annotation database
00035 wget http://hgdownload.cse.ucsc.edu/goldenPath/ponAbe2/database/chromInfo.txt.gz
00036 
00037 =cut
00038 
00039 package Bio::EnsEMBL::Compara::RunnableDB::PairAligner::UcscToEnsemblMapping;
00040 
00041 use strict;
00042 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00043 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00044 
00045 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00046 
00047 ############################################################
00048 
00049 =head2 fetch_input
00050 
00051     Title   :   fetch_input
00052     Usage   :   $self->fetch_input
00053     Returns :   nothing
00054     Args    :   none
00055 
00056 =cut
00057 
00058 sub fetch_input {
00059   my( $self) = @_; 
00060 
00061   #Must define chromInfo file
00062   return if (!defined $self->param('chromInfo_file') || $self->param('chromInfo_file') eq "");
00063 
00064   my $gdba = $self->compara_dba->get_GenomeDBAdaptor;
00065   
00066   my $genome_db = $gdba->fetch_by_name_assembly($self->param('species'));
00067   $self->param('genome_db', $genome_db);
00068 
00069   #Get slice adaptor
00070   my $slice_adaptor = $genome_db->db_adaptor->get_SliceAdaptor;
00071 
00072   my $ensembl_names;
00073   my $ucsc_to_ensembl_mapping;
00074   
00075   #Get all toplevel slices
00076   my $ref_slices = $slice_adaptor->fetch_all("toplevel");
00077   
00078   foreach my $this_slice ( @$ref_slices ) {
00079       $ensembl_names->{$this_slice->seq_region_name} = 1;
00080   }
00081   
00082   #Open UCSC chromInfo file
00083   open UCSC, $self->param('chromInfo_file') or die ("Unable to open " . $self->param('chromInfo_file'));
00084   while (<UCSC>) {
00085       my ($ucsc_chr, $size, $file) = split " ";
00086       my $chr = $ucsc_chr;
00087       $chr =~ s/chr//;
00088       if ($ensembl_names->{$chr}) {
00089       $ensembl_names->{$chr} = 2;
00090       $ucsc_to_ensembl_mapping->{$ucsc_chr} = $chr;
00091       } elsif ($chr eq "M") {
00092       #Special case for MT
00093       $ensembl_names->{"MT"} = 2;
00094       $ucsc_to_ensembl_mapping->{$ucsc_chr} = "MT";
00095       } else {
00096       #Try extracting gl from filename
00097       if (defined $self->param('ucsc_map')) {
00098           read_ucsc_map($self->param('ucsc_map'), $ensembl_names, $ucsc_to_ensembl_mapping);
00099       } else {
00100           die ("You must provide a UCSC mapping file");
00101       }
00102       }
00103   }
00104   
00105   close UCSC;
00106   foreach my $chr (keys %$ensembl_names) {
00107       if ($ensembl_names->{$chr} != 2) {
00108       die ("Failed to find $chr in UCSC");
00109       }
00110   }
00111   $self->param('ucsc_to_ensembl_mapping', $ucsc_to_ensembl_mapping);
00112 }
00113 
00114 sub read_ucsc_map {
00115     my ($ucsc_map, $ensembl_names, $ucsc_to_ensembl_mapping) = @_;
00116 
00117     open MAP, $ucsc_map or die ("Unable to open " . $ucsc_map);
00118 
00119     while (<MAP>) {
00120     my ($contig, $size, $chrom, $chromStart, $chromEnd) = split " ";
00121     if ($ensembl_names->{$contig}) {
00122         #print "FOUND $contig\n";
00123         $ensembl_names->{$contig} = 2;
00124         $ucsc_to_ensembl_mapping->{$chrom} = $contig;
00125     }
00126     }
00127 
00128     close MAP;
00129 }
00130 
00131 sub run {
00132     my $self = shift;
00133 
00134 }
00135 
00136 sub write_output {
00137     my ($self) = shift;
00138  
00139     return if (!defined $self->param('chromInfo_file') || $self->param('chromInfo_file') eq "");
00140 
00141     my $genome_db_id = $self->param('genome_db')->dbID;
00142 
00143     #Insert into ucsc_to_ensembl_mapping table
00144     my $sql = "INSERT INTO ucsc_to_ensembl_mapping (genome_db_id, ucsc, ensembl) VALUES (?,?,?)";
00145     my $sth = $self->compara_dba->dbc->prepare($sql);
00146 
00147     my $ucsc_to_ensembl_mapping = $self->param('ucsc_to_ensembl_mapping');
00148     foreach my $ucsc_chr (keys %$ucsc_to_ensembl_mapping) {
00149     #print "$ucsc_chr " . $ucsc_to_ensembl_mapping->{$ucsc_chr} . "\n";
00150     $sth->execute($genome_db_id, $ucsc_chr, $ucsc_to_ensembl_mapping->{$ucsc_chr});
00151     }
00152     $sth->finish();
00153 
00154 }
00155 
00156 1;