Source code for g2gtools.bins

# -*- coding: utf-8 -*-

# http://genomewiki.ucsc.edu/index.php/Bin_indexing_system

BIN_NEXT_SHIFT = 3    # How much to shift to get to finest bin
BIN_FIRST_SHIFT = 17  # How much to shift to get to next larger bin
BIN_OFFSETS = [4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0]
#BIN_OFFSETS = [32768+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0]

# for BED (0-based, half-open) or GFF (1-based, closed intervals)
COORD_OFFSETS = {'bed': 0, 'gff': 1}


[docs]def bins(start, stop, fmt='gff', one=True): """ Uses the definition of a "genomic bin" described in Fig 7 of http://genome.cshlp.org/content/12/6/996.abstract. Parameters ---------- one : boolean If `one=True` (default), then only return the smallest bin that completely contains these coordinates (useful for assigning a single bin). If `one=False`, then return the set of *all* bins that overlap these coordinates (useful for looking for features that could intersect) fmt : 'gff' | 'bed' This specifies 1-based start coords (gff) or 0-based start coords (bed) """ # Jump to highest resolution bin that will fit these coords (depending on # whether we have a BED or GFF-style coordinate). start = (start - COORD_OFFSETS[fmt]) >> BIN_FIRST_SHIFT stop = (stop) >> BIN_FIRST_SHIFT # We always at least fit within the chrom, which is bin 1. bins = set([1]) for offset in BIN_OFFSETS: # Since we're going from smallest to largest bins, the first one where # the feature's start and stop positions are both within the same bin # is the smallest one these coords fit within. if one: if start == stop: # Note that at this point, because of the bit-shifting, `start` # is the number of bins (at this current level). So we need to # add it to `offset` to get the actual bin ID. return offset + start # See the Fig 7 reproduction above to see why range(). bins.update(list(range(offset + start, offset + stop + 1))) # Move to the next level (8x larger bin size; i.e., 2**NEXT_SHIFT # larger bin size) start >>= BIN_NEXT_SHIFT stop >>= BIN_NEXT_SHIFT return bins
[docs]def bin_info(): """ Useful for debugging: how large is each bin, and what are the bin IDs? """ print("level\t#bins\tstart\tend\tsize") level = -1 for i, offset in reversed(list(enumerate(BIN_OFFSETS))): level += 1 number_of_bins = 8 ** level binstart = offset try: binstop = BIN_OFFSETS[i+1] binstop = number_of_bins + binstart - 1 except IndexError: binstop = binstart bin_size = 2 ** (BIN_FIRST_SHIFT + (i * BIN_NEXT_SHIFT)) actual_size = bin_size # nice formatting bin_size, suffix = bin_size / 1024, 'Kb' if bin_size >= 1024: bin_size, suffix = bin_size / 1024, 'Mb' if bin_size >= 1024: bin_size, suffix = bin_size / 1024, 'Gb' size = '(%s %s)' % (bin_size, suffix) actual_size = '%s bp' % (actual_size) print('{level}\t{number_of_bins}\t{binstart}\t{binstop}\t{actual_size}\t{size}'.format(**locals()))
if __name__ == "__main__": bin_info() print(bins(10000, 20000, one=False)) print(bins(10000, 10001, one=True)) print(bins(50000000, 60000000, one=False)) print(bins(50000000, 60000001, one=True))