Source code for g2gtools.bins

# -*- coding: utf-8 -*-

# http://genomewiki.ucsc.edu/index.php/Bin_indexing_system

BIN_NEXT_SHIFT = 3    # How much to shift to get to finest bin
BIN_FIRST_SHIFT = 17  # How much to shift to get to next larger bin
BIN_OFFSETS = [4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0]
#BIN_OFFSETS = [32768+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0]

# for BED (0-based, half-open) or GFF (1-based, closed intervals)
COORD_OFFSETS = {'bed': 0, 'gff': 1}


[docs]def bins(start, stop, fmt='gff', one=True):
    """
    Uses the definition of a "genomic bin" described in Fig 7 of
    http://genome.cshlp.org/content/12/6/996.abstract.

    Parameters
    ----------
    one : boolean
        If `one=True` (default), then only return the smallest bin that
        completely contains these coordinates (useful for assigning a single
        bin).

        If `one=False`, then return the set of *all* bins that overlap these
        coordinates (useful for looking for features that could intersect)

    fmt : 'gff' | 'bed'
        This specifies 1-based start coords (gff) or 0-based start coords (bed)
    """
    # Jump to highest resolution bin that will fit these coords (depending on
    # whether we have a BED or GFF-style coordinate).
    start = (start - COORD_OFFSETS[fmt]) >> BIN_FIRST_SHIFT
    stop = (stop) >> BIN_FIRST_SHIFT

    # We always at least fit within the chrom, which is bin 1.
    bins = set([1])

    for offset in BIN_OFFSETS:
        # Since we're going from smallest to largest bins, the first one where
        # the feature's start and stop positions are both within the same bin
        # is the smallest one these coords fit within.
        if one:
            if start == stop:
                # Note that at this point, because of the bit-shifting, `start`
                # is the number of bins (at this current level).  So we need to
                # add it to `offset` to get the actual bin ID.
                return offset + start

        # See the Fig 7 reproduction above to see why range().
        bins.update(list(range(offset + start, offset + stop + 1)))

        # Move to the next level (8x larger bin size; i.e., 2**NEXT_SHIFT
        # larger bin size)
        start >>= BIN_NEXT_SHIFT
        stop >>= BIN_NEXT_SHIFT
    return bins


[docs]def bin_info():
    """
    Useful for debugging: how large is each bin, and what are the bin IDs?
    """
    print("level\t#bins\tstart\tend\tsize")
    level = -1
    for i, offset in reversed(list(enumerate(BIN_OFFSETS))):
        level += 1
        number_of_bins = 8 ** level

        binstart = offset
        try:
            binstop = BIN_OFFSETS[i+1]
            binstop = number_of_bins + binstart - 1
        except IndexError:
            binstop = binstart

        bin_size = 2 ** (BIN_FIRST_SHIFT + (i * BIN_NEXT_SHIFT))
        actual_size = bin_size

        # nice formatting
        bin_size, suffix = bin_size / 1024, 'Kb'
        if bin_size >= 1024:
            bin_size, suffix = bin_size / 1024, 'Mb'
        if bin_size >= 1024:
            bin_size, suffix = bin_size / 1024, 'Gb'
        size = '(%s %s)' % (bin_size, suffix)
        actual_size = '%s bp' % (actual_size)

        print('{level}\t{number_of_bins}\t{binstart}\t{binstop}\t{actual_size}\t{size}'.format(**locals()))


if __name__ == "__main__":
    bin_info()
    print(bins(10000, 20000, one=False))
    print(bins(10000, 10001, one=True))
    print(bins(50000000, 60000000, one=False))
    print(bins(50000000, 60000001, one=True))