Archive Ensembl HomeArchive Ensembl Home
NamedClusterSetLink.pm
Go to the documentation of this file.
00001 =pod 
00002 
00003 =head1 NAME
00004 
00005   Bio::EnsEMBL::Compara::StableId::NamedClusterSetLink
00006 
00007 =head1 SYNOPSIS
00008 
00009 =head1 DESCRIPTION
00010 
00011     A linked pair of NamedClusterSet objects with extra stats + MNR algorithm
00012 
00013 =cut
00014 
00015 package Bio::EnsEMBL::Compara::StableId::NamedClusterSetLink;
00016 
00017 use strict;
00018 use Bio::EnsEMBL::Utils::Argument;  # import 'rearrange()'
00019 use Bio::EnsEMBL::Compara::StableId::NamedClusterSet;
00020 use Bio::EnsEMBL::Compara::StableId::Generator;
00021 use Bio::EnsEMBL::Compara::StableId::Map;
00022 
00023 # The $threshold parameter determines whether:
00024 #   * the name is reused via simply the biggest subset of the from & to ($threshold=0, $matchtype='Majority'),
00025 #   * or you also have to have a 'quorum' of, say, 67% ($threshold==0.67, $matchtype=='Major_67').
00026 # The 'NextBest' matchtype of name reuse is only applied if $threshold==0.
00027 #
00028 #
00029 #my $threshold = 0;    # switches off thresholding
00030 my $threshold = 0.67;
00031 #
00032 my $maj_label =  $threshold ? sprintf("Major_%d", int($threshold*100) ) : 'Majority';
00033 my @labels = ('Exact', 'Exact_o', $maj_label, $maj_label.'_o', 'NextBest', 'NextBest_o', 'NewName', 'NewName_o', 'NewFam', 'NewFam_o');
00034 
00035 
00036 sub new {
00037     my $class = shift @_;
00038 
00039     my $self = bless { }, $class;
00040 
00041     my ($from, $to, $filename) =
00042          rearrange([qw(from to filename) ], @_);
00043 
00044     if($filename) {
00045 
00046         $self->load($filename);
00047 
00048     } elsif($from and $to) {
00049 
00050         $self->from($from);
00051         $self->to($to);
00052 
00053         $self->compute_stats();
00054 
00055     } else {
00056         die "Please define either -filename or (-from and -to) in the '$class' constructor";
00057     }
00058 
00059     return $self;
00060 }
00061 
00062 sub compute_stats {
00063     my $self       = shift @_;
00064 
00065     my $total_count  = 0;
00066     my $common_count = 0;
00067 
00068     foreach my $from_member (@{$self->from->get_all_members}) {
00069         my $from_clid = $self->from->mname2clid($from_member);
00070         if(my $to_clid = $self->to->mname2clid($from_member)) {
00071             $self->direct_contrib->{$from_clid}{$to_clid}++;
00072             $self->rev_contrib->{$to_clid}{$from_clid}++;
00073             $self->from_size->{$from_clid}++;
00074             $self->to_size->{$to_clid}++;
00075             $common_count++;
00076         } else { # disappeared members (disappearing either with or without the containing cluster)
00077             $self->xfrom_size->{$from_clid}++;
00078         }
00079         $total_count++;
00080     }
00081     foreach my $to_member (@{$self->to->get_all_members}) {
00082         my $to_clid = $self->to->mname2clid($to_member);
00083         if(!defined($self->from->mname2clid($to_member))) { # new members (going either into existing or new cluster)
00084             $self->xto_size->{$to_clid}++;
00085             $total_count++;
00086         }
00087     }
00088 
00089     warn "Total number of keys:  $total_count\n";
00090     warn "Number of common keys: $common_count\n";
00091 
00092         # "cleanup" procedure 1: forget the newborn members that do not make a new class, as they will not influence the MNR algorithm
00093     foreach my $to_clid (sort {$a <=> $b} keys %{$self->xto_size}) { # iterate through families that contain new members
00094         if($self->rev_contrib->{$to_clid}) {
00095             delete $self->xto_size->{$to_clid};
00096         }
00097     }
00098         # "cleanup" procedure 2: forget the disappearing members that did not make a separate class, as they will not influence the MNR algorithm either
00099     foreach my $from_clid (sort {$a <=> $b} keys %{$self->xfrom_size}) { # iterate through families that lost some members
00100         if($self->direct_contrib->{$from_clid}) {
00101             delete $self->xfrom_size->{$from_clid};
00102         }
00103     }
00104 }
00105 
00106 sub load {
00107     my $self     = shift @_;
00108     my $filename = shift @_;
00109 
00110     $self->from(Bio::EnsEMBL::Compara::StableId::NamedClusterSet->new());
00111     $self->to(  Bio::EnsEMBL::Compara::StableId::NamedClusterSet->new());
00112 
00113     open(LINKFILE, $filename) || die "Cannot open '$filename' file: $@";
00114     while (my ($from_clid, $from_clname, $from_size, $to_clid, $to_clname, $to_size, $contrib) = split(/\s/,<LINKFILE>)) {
00115 
00116         next unless($contrib=~/^\d+$/); # skip the header line if present
00117 
00118         if($from_size and $to_size) { # Shared
00119             $self->from->clid2clname($from_clid, $from_clname);
00120             $self->to->clid2clname($to_clid, $to_clname);
00121 
00122             $self->direct_contrib->{$from_clid}{$to_clid} = $contrib;
00123             $self->rev_contrib->{$to_clid}{$from_clid}    = $contrib;
00124             $self->from_size->{$from_clid}                = $from_size;
00125             $self->to_size->{$to_clid}                    = $to_size;
00126         } elsif($to_size) { # Newborn
00127             $self->to->clid2clname($to_clid, $to_clname);
00128 
00129             $self->xto_size->{$to_clid}                   = $to_size;
00130         } elsif($from_size) { # Disappearing
00131             $self->from->clid2clname($from_clid, $from_clname);
00132 
00133             $self->xfrom_size->{$from_clid}               = $from_size;
00134         }
00135     }
00136     close LINKFILE;
00137 }
00138 
00139 sub save {
00140     my $self     = shift @_;
00141     my $filename = shift @_;
00142 
00143     open(LINKFILE, ">$filename");
00144 
00145     foreach my $from_clid (sort {$a <=> $b} keys %{$self->direct_contrib}) {
00146         my $from_clname = $self->from->clid2clname($from_clid);
00147         my $subhash = $self->direct_contrib->{$from_clid};
00148 
00149         foreach my $to_clid (sort { $subhash->{$b} <=> $subhash->{$a} } keys %$subhash) {
00150             my $to_clname = $self->to->clid2clname($to_clid);
00151             my $cnt = $self->direct_contrib->{$from_clid}{$to_clid};
00152 
00153             print LINKFILE join("\t", $from_clid, $from_clname, $self->from_size->{$from_clid}, $to_clid, $to_clname, $self->to_size->{$to_clid}, $cnt)."\n";
00154         }
00155     }
00156     foreach my $to_clid (sort {$a <=> $b} keys %{$self->xto_size}) { # iterate through families that contain new members
00157         my $to_clname = $self->to->clid2clname($to_clid);
00158 
00159         print LINKFILE join("\t", 0, '-', 0, $to_clid, $to_clname, $self->xto_size->{$to_clid}, $self->xto_size->{$to_clid})."\n";
00160     }
00161     foreach my $from_clid (sort {$a <=> $b} keys %{$self->xfrom_size}) { # iterate through families that lost some members
00162         my $from_clname = $self->from->clid2clname($from_clid);
00163 
00164         print LINKFILE join("\t", $from_clid, $from_clname, $self->xfrom_size->{$from_clid}, 0, '-', 0, $self->xfrom_size->{$from_clid})."\n";
00165     }
00166 
00167     close LINKFILE;
00168 }
00169 
00170 # Sort new families by size (desc)
00171 # Try to assign a name according to the biggest contributor in the new family (and "grab" it)
00172 # If unsuccessful (has been "grabbed" earlier), count this attempt and take the next best candidate.
00173 #
00174 sub maximum_name_reuse {
00175     my ($self, $generator) = @_;
00176 
00177     $generator ||= Bio::EnsEMBL::Compara::StableId::Generator->new(-TYPE => $self->to->type, -RELEASE => $self->to->release, -MAP => $self->from );
00178 
00179     my $postmap   = Bio::EnsEMBL::Compara::StableId::Map->new(-TYPE => $self->to->type);
00180 
00181     my %from_taken        = (); # indicates the 'from' name has been taken 
00182 
00183     TOPAIR: foreach my $topair (sort { $b->[1] <=> $a->[1] } map { [$_,$self->to_size->{$_}] } keys %{$self->to_size} ) {
00184         my ($to_clid, $to_size) = @$topair;
00185 
00186         my $subhash = $self->rev_contrib->{$to_clid};
00187 
00188         my $td_counts  = 0;
00189         my $matchtype  = '';
00190         my $matchscore = 0;
00191         my $given_name = ''; # serves both as logical flag and the actual name
00192 
00193         FROMPAIR: foreach my $frompair (sort { $b->[1] <=> $a->[1] } map { [$_,$subhash->{$_}] } keys %$subhash ) {
00194             my ($from_clid, $contrib) = @$frompair;
00195             my $from_size = $self->from_size->{$from_clid};
00196             my $from_name = $self->from->clid2clname($from_clid);
00197 
00198             if(!defined $from_taken{$from_name}) { # means the '$from' name is still unused, so we can reuse it now
00199 
00200                 if($contrib==$from_size and $contrib==$to_size) {
00201                     $matchtype  = 'Exact';
00202                 } elsif($threshold>0) {  # either the majority rule is applicable or we don't bother looking at other possibilities (as they are even smaller)
00203                     if($contrib/$from_size>=$threshold and $contrib/$to_size>=$threshold) {
00204                         $matchtype  = $maj_label;
00205                     } # otherwise we have an implicit 'NewName' case
00206                 } else { # non-threshold mode
00207                     $matchtype = $td_counts ? 'NextBest' : $maj_label;
00208                 }
00209 
00210                 if($matchtype) {
00211                     if($matchtype eq 'Exact') {
00212                         # $from_name =~ /^(\w+)(?:\.(\d+))?/;
00213                         # $given_name = $1.'.'. (defined($2) ? $2 : $generator->default_version() ); # same version (but we may want to make it more obvious)
00214                         $given_name = $from_name;
00215                     } else {
00216                         $from_name =~ /^(\w+)(?:\.(\d+))?/;
00217                         $given_name = $1.'.'. ((defined($2) ? $2 : $generator->default_version())+1); # change the version if the match is not exact (or set it if previously unset)
00218                     }
00219                     $from_taken{$from_name} = 1;
00220                     $matchscore = int(100*$contrib/$to_size);
00221                 }
00222                 last FROMPAIR;
00223 
00224             } # if name not taken
00225 
00226             $td_counts++; # counts all attempts, not only the ones where the '$from' name was unused
00227 
00228         } # FROMPAIR
00229 
00230             # the following two lines work either if we arrive here from 'last FROMPAIR' after implicit 'NewName'
00231             # or by exhausting all FROMPAIRS (beacause they were all taken)
00232         $matchtype  ||= 'NewName';
00233         $given_name ||= $generator->generate_new_name();
00234 
00235         $postmap->clid2clname($to_clid, $given_name);
00236         $postmap->clid2score($to_clid, $matchscore);
00237 
00238         if($to_size == 1) { $matchtype .= '_o'; }
00239     } # TOPAIR
00240 
00241     foreach my $to_clid (sort {$a<=>$b} keys %{$self->xto_size}) {
00242 
00243         my $given_name = $generator->generate_new_name();
00244         $postmap->clid2clname($to_clid, $given_name);
00245         $postmap->clid2score($to_clid, 0);
00246 
00247 #        my $matchtype = ($self->xto_size->{$to_clid} == 1) ? 'NewFam_o' : 'NewFam';
00248     }
00249 
00250     return $postmap;
00251 }
00252 
00253     # simply captures cluster inclusion events both ways: SymMatch|Included|Contains
00254 sub mnr_lite {
00255     my $self = shift @_;
00256 
00257     my %accu = ();  # accumulator is a HoHoL
00258 
00259     TOPAIR: foreach my $topair (sort { $b->[1] <=> $a->[1] } map { [$_,$self->to_size->{$_}] } keys %{$self->to_size} ) {
00260         my ($to_clid, $to_size) = @$topair;
00261         my $subhash = $self->rev_contrib->{$to_clid};
00262 
00263         FROMPAIR: foreach my $frompair (sort { $b->[1] <=> $a->[1] } map { [$_,$subhash->{$_}] } keys %$subhash ) {
00264             my ($from_clid, $contrib) = @$frompair;
00265             my $from_size = $self->from_size->{$from_clid};
00266             my $from_name = $self->from->clid2clname($from_clid);
00267 
00268             my $from_contrib = $contrib/$from_size;
00269             my $to_contrib   = $contrib/$to_size;
00270 
00271             if(my $matchtype = 
00272                 ($to_contrib >= $threshold)
00273                     ? (($from_contrib >= $threshold)
00274                         ? 'SymMatch'
00275                         : 'Included')
00276                     : (($from_contrib >= $threshold)
00277                         ? 'Contains'
00278                         : '')
00279             ) {
00280                 push @{$accu{$to_clid}{$matchtype}}, $from_name;
00281             }
00282         } # FROMPAIR
00283     } # TOPAIR
00284 
00285     return \%accu;
00286 }
00287 
00288 
00289 # -------------------------------------getters and setters----------------------------------------
00290 
00291 sub from {
00292     my $self = shift @_;
00293 
00294     if(@_) {
00295         $self->{'_from'} = shift @_;
00296     }
00297     return $self->{'_from'};
00298 }
00299 
00300 sub to {
00301     my $self = shift @_;
00302 
00303     if(@_) {
00304         $self->{'_to'} = shift @_;
00305     }
00306     return $self->{'_to'};
00307 }
00308 
00309 sub direct_contrib {
00310     my $self = shift @_;
00311 
00312     return ($self->{_direct_contrib} ||= {});
00313 }
00314 
00315 sub rev_contrib {
00316     my $self = shift @_;
00317 
00318     return ($self->{_rev_contrib} ||= {});
00319 }
00320 
00321 sub from_size {
00322     my $self = shift @_;
00323 
00324     return ($self->{_from_size} ||= {});
00325 }
00326 
00327 sub to_size {
00328     my $self = shift @_;
00329 
00330     return ($self->{_to_size} ||= {});
00331 }
00332 
00333 sub xfrom_size {
00334     my $self = shift @_;
00335 
00336     return ($self->{_xfrom_size} ||= {});
00337 }
00338 
00339 sub xto_size {
00340     my $self = shift @_;
00341 
00342     return ($self->{_xto_size} ||= {});
00343 }
00344 
00345 1;
00346