Computational Biology 2025
Ghent University
2025-02-14
The scientific Python ecosystem:
- core data structures: numpy (ndarray), pandas (DataFrame), matplotlib (plots)
- numerical computing: numpy, scipy, pandas
- distributed computing: dask, pyspark
- machine learning: scikit-learn, tensorflow, pytorch
- visualization: matplotlib, seaborn, plotly
- acceleration: numba, polars, JAX, torch.compile
- workflows: nextflow, snakemake, kubeflow, airflow

The iterator protocol: implement __iter__ and __next__.

class MyCounter:
    # my_counter = MyCounter()
    # next(my_counter) --> 0
    # next(my_counter) --> 1
    def __init__(self):
        self.current = 0
    def __iter__(self):
        return self
    def __next__(self):
        current = self.current
        self.current += 1
        return current

yield from

class Tree:
    def __init__(self, left=None, right=None):
        self.left = left
        self.right = right
    def __iter__(self):
        return TreePath(self)
class TreePath:
    def __init__(self, tree):
        self.path = [tree]
    def __iter__(self):
        return self
    def __next__(self):
        # pop until we find a real node (None children are skipped)
        current = None
        while current is None:
            if not self.path: raise StopIteration
            current = self.path.pop()
        # push the children so the left subtree is visited before the right one
        self.path.append(current.right)
        self.path.append(current.left)
        return current
class A(Tree):
    def __init__(self, item, left=None, right=None):
        super().__init__(left=left, right=right)
        self.item = item

The same pre-order traversal, written as a generator with yield from:

class Tree:
    def __init__(self, left=None, right=None):
        self.left = left
        self.right = right
    def __iter__(self):
        yield self
        if self.left:
            yield from self.left
        if self.right:
            yield from self.right
class A(Tree):
    def __init__(self, item, left=None, right=None):
        super().__init__(left=left, right=right)
        self.item = item
Or build the iterator with itertools.chain:

import itertools
class Tree:
    def __init__(self, left=None, right=None):
        self.left = left
        self.right = right
    def __iter__(self):
        # "or []" guards against None children, which chain cannot iterate
        return itertools.chain([self], self.left or [], self.right or [])
class A(Tree):
    def __init__(self, item, left=None, right=None):
        super().__init__(left=left, right=right)
        self.item = item
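A quick usage sketch (hypothetical tree; works with any of the three __iter__ variants above, all of which yield nodes in pre-order):

root = A("root", left=A("l", left=A("l.l")), right=A("r"))
print([node.item for node in root])  # ['root', 'l', 'l.l', 'r']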
itertools.filterfalse, itertools.takewhile

from itertools import filterfalse, count, takewhile
from operator import ge
from functools import partial
def div_by(m, n):
    return n % m == 0
def primes():
    # lazy sieve of Eratosthenes: each prime found filters its own
    # multiples out of the remaining (infinite) stream of candidates
    l = count(2)
    while True:
        first = next(l)
        yield first
        l = filterfalse(partial(div_by, first), l)
def smallprimes():
    # ge(100, p) is 100 >= p, so this prints all primes up to 100
    print(list(takewhile(partial(ge, 100), primes())))

Everything is a reference, so beware of pointer aliasing.
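A minimal sketch of that aliasing pitfall (variable names are illustrative):

a = [1, 2, 3]
b = a            # b is a second reference to the same list, not a copy
b.append(4)
print(a)         # [1, 2, 3, 4] -- the mutation is visible through both names
c = list(a)      # an explicit (shallow) copy breaks the aliasing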
create your own decorator
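A minimal hand-written decorator, as a sketch (the timed name and its output format are illustrative):

import functools
import time

def timed(func):
    @functools.wraps(func)  # keep the wrapped function's __name__, __doc__, ...
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} took {time.perf_counter() - start:.6f} s")
        return result
    return wrapper

@timed
def slow_add(a, b):
    return a + b

slow_add(1, 2)  # prints the elapsed time, returns 3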
A least-recently-used cache makes this recursive function faster.
from functools import lru_cache, total_ordering
@lru_cache
def fibo(n):
    if n <= 1:
        return n
    return fibo(n - 1) + fibo(n - 2)
fibo(100)
354224848179261915075

Total ordering ensures that all comparison methods are defined.
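A sketch of total_ordering (the Version class is hypothetical): define __eq__ plus one ordering method, and the decorator derives the rest (<=, >, >=).

from functools import total_ordering

@total_ordering
class Version:
    def __init__(self, major, minor):
        self.major, self.minor = major, minor
    def __eq__(self, other):
        return (self.major, self.minor) == (other.major, other.minor)
    def __lt__(self, other):
        return (self.major, self.minor) < (other.major, other.minor)

Version(1, 2) >= Version(1, 0)  # True, even though __ge__ was never written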
Also called dunder methods (Double leading and trailing UNDERscore).
Private library methods have only one leading underscore, e.g. _private_function.
# iterators
__iter__
__next__
# operator overloading
a < b       a.__lt__(b)
a <= b      a.__le__(b)
a > b       a.__gt__(b)
a >= b      a.__ge__(b)
a == b      a.__eq__(b)
a != b      a.__ne__(b)
__add__, __sub__, __mul__,
__truediv__, __floordiv__, __mod__,
__divmod__, __pow__, __lshift__,
__rshift__, __and__, __xor__,
__or__, __neg__, __pos__, __abs__,
# other special attributes
__doc__, __name__, __module__, 
__code__, __defaults__, 
__kwdefaults__, __globals__, 
__dict__, __closure__, __bases__, 
hash(x)          x.__hash__()
bytes(x)         x.__bytes__()
"{}".format(x)   x.__format__("")
if x: ...        x.__bool__()
x()              x.__call__()
len(x)           x.__len__()

# object lifecycle
__init__(self, ...)
__new__(cls, ...)
__del__(self)
int(a)         a.__int__()
float(a)       a.__float__()
complex(a)     a.__complex__()
oct(a)         a.__index__()  # Python 3: oct and hex go through __index__
hex(a)         a.__index__()
str(a)         a.__str__()
repr(a)        a.__repr__()
a[key]            a.__getitem__(key)
a[key] = value    a.__setitem__(key, value)
del a[key]        a.__delitem__(key)
item in a         a.__contains__(item)
reversed(a)       a.__reversed__()
a[non_key]        a.__missing__(non_key)
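A small sketch tying several of these operators together (the Vec2 class is hypothetical):

class Vec2:
    def __init__(self, x, y):
        self.x, self.y = x, y
    def __eq__(self, other):           # v == w
        return (self.x, self.y) == (other.x, other.y)
    def __add__(self, other):          # v + w
        return Vec2(self.x + other.x, self.y + other.y)
    def __mul__(self, scalar):         # v * 3
        return Vec2(self.x * scalar, self.y * scalar)
    def __repr__(self):                # repr(v), also used by print
        return f"Vec2({self.x}, {self.y})"

Vec2(1, 2) + Vec2(3, 4) * 2  # Vec2(7, 10)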
Attribute access can be intercepted too:

class A:
    def __getattribute__(self, name):
        print('I want to know %s' % name)
        try:
            return super().__getattribute__(name)
        except AttributeError:
            return -1
    # note: the real hook names are __setattr__ and __delattr__
    # (there is no __setattribute__ or __delattribute__)
    def __setattr__(self, name, value):
        print('I want to set %s to %s' % (name, value))
        super().__setattr__(name, value)
    def __delattr__(self, name):
        print('I want to delete %s' % name)
        super().__delattr__(name)
# Note that this is not a magic method, but a guard using the __name__ attribute of the executed module
if __name__ == '__main__':
    a = A()
    print(a.x)
I want to know x
-1

Database access
from contextlib import contextmanager
@contextmanager
def managed_resource(*args, **kwds):
    # Code to acquire resource, e.g.:
    resource = acquire_resource(*args, **kwds)
    try:
        yield resource
    finally:
        # Code to release resource, e.g.:
        release_resource(resource)
>>> with managed_resource(timeout=3600) as resource:
...     # Resource is released at the end of this block,
...     # even if code in the block raises an exception
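A concrete, runnable sketch of the same pattern (the timer name is illustrative):

from contextlib import contextmanager
import time

@contextmanager
def timer(label):
    start = time.perf_counter()
    try:
        yield
    finally:  # runs even if the with-block raises
        print(f"{label}: {time.perf_counter() - start:.3f} s")

with timer("sleep"):
    time.sleep(0.1)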
NumPy broadcasting and vectorized operators (here a, b, c are ndarrays of compatible shapes):
a + 2      a + b     a + c
a - 2      a - b     a - c
a * 2      a * b     a * c
a ** 2     a ** b    a ** c
a < 2      a < b     a < c
           a @ b.T

a = np.arange(6).reshape(2, 3)  # [[0, 1, 2], [3, 4, 5]], reconstructed from the results below
a.sum()          == 15
a.sum(axis=0)    == array([3, 5, 7])
a.sum(axis=1)    == array([3, 12])
a.cumsum(axis=1) == array([[0, 1, 3],
                           [3, 7, 12]])
Many other reductions work the same way: min, max, mean…
a = np.arange(16).reshape(4, 4)
a[1, 1]     == 5
a[1]        == array([4, 5, 6, 7])     
a[:, 1]     == array([1, 5, 9, 13])   
a[:2, :2]   == array([[0, 1], [4, 5]])
a[1, [1,2]] == array([5, 6])
a[a%3 == 0] == array([ 0,  3,  6,  9, 12, 15])

NumPy docs: indexing on ndarrays; NumPy docs: copies and views
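A short sketch of the views-versus-copies distinction those docs cover: basic slicing returns a view that shares memory, while fancy and boolean indexing return copies.

b = a[:2, :2]         # view: shares memory with a
b[0, 0] = 99
a[0, 0] == 99         # True: the change is visible through a
c = a[a % 3 == 0]     # boolean indexing: an independent copy
c[0] = -1
a[0, 0] == 99         # still True: a is unaffected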
import numpy as np
def stochastic_function(high=10, size=5, rng=None, *args, **kwargs):
    rng = np.random.default_rng(rng)
    return rng.integers(high, *args, size=size, **kwargs)
# seed created with:
#   import secrets
#   secrets.randbits(128)
seed = 319929794527176038403653493598663843656
# create the RNG that you want to pass around
rng = np.random.default_rng(seed)
stochastic_function(rng=rng)  # deterministic result instead of a random default
array([2, 9, 9, 4, 0])

Numba is an open source JIT compiler that translates a subset of Python and NumPy code into fast machine code.
Note that the Generator.size argument is not used here. NumPy Generator objects are supported by Numba, but are not thread-safe as of version 0.56. (See also the Numba example for CUDA GPU acceleration.)
Numba can be difficult to use and debug, so use it only if you want to optimize even further!
from numba import njit
import numpy as np
# seed created with:
#   import secrets
#   secrets.randbits(128)
seed = 319929794527176038403653493598663843656
@njit
def monte_carlo_pi(gen, nsamples):
    acc = 0
    for i in range(nsamples):
        x = gen.random()
        y = gen.random()
        if (x ** 2 + y ** 2) < 1.0:
            acc += 1
    return 4.0 * acc / nsamples
rng = np.random.default_rng(seed)
print(monte_carlo_pi(rng, 1000))
3.132

from numba import njit, prange
import numpy as np
@njit(parallel=True)
def ident_parallel(x):
    return np.cos(x) ** 2 + np.sin(x) ** 2
@njit(parallel=True, fastmath=True)
def do_sum_parallel(A):
    # fastmath=True allows some unsafe floating-point optimizations:
    # each thread can accumulate its own partial sum, and then a cross
    # thread reduction is performed to obtain the result to return
    n = len(A)
    acc = 0.
    # Without "parallel=True" in the jit-decorator
    # the prange statement is equivalent to range
    for i in prange(n):
        acc += np.sqrt(A[i])
    return acc
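A usage sketch for the two jitted functions above (the input array is arbitrary):

x = np.linspace(0.0, 10.0, 1_000_000)
print(ident_parallel(x))   # ~1.0 everywhere, computed in parallel
print(do_sum_parallel(x))  # parallel sum of sqrt over x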
import random
import timeit
import numpy as np
from collections import Counter, defaultdict
# create datasets; note that this does not take different frequency patterns into account
smalllist = random.choices(range(100), k=1_000)
biglist = random.choices(range(10_000), k=1_000_000)
arr_biglist = np.array(biglist)
def if_contains_else(biglist):
    # using if-else is often assumed to be the 'slowest' method
    # (the timings below show it is not the worst)
    frequencies = dict()
    for i in biglist:
        if i in frequencies:
            frequencies[i] += 1
        else:
            frequencies[i] = 1
def for_counter(biglist):
    # this can be the least efficient method because of the for loop
    frequencies = Counter()
    for i in biglist:
        frequencies[i] += 1
def counter(biglist):
    # using the Counter class directly is the fastest method
    frequencies = Counter(biglist)
def get_or_default(biglist):
    frequencies = dict()
    for i in biglist:
        frequencies[i] = frequencies.get(i, 0) + 1
def default_dict(biglist):
    # more elegant than get_or_default
    frequencies = defaultdict(int)
    for i in biglist:
        frequencies[i] += 1
def numpy_freq(biglist):
    # numpy array creation overhead makes this slow
    unique, counts = np.unique(biglist, return_counts=True)
def numpy_freq_precreated(biglist):
    # ignores its argument and uses the precreated global array;
    # even without conversion overhead, still slower than Counter at this size
    unique, counts = np.unique(arr_biglist, return_counts=True)
def try_except(biglist):
    # using a try clause is surprisingly good
    frequencies = dict()
    for i in biglist:
        try:
            frequencies[i] += 1
        except KeyError:
            frequencies[i] = 1
Function         Dataset                Time (s)
if_contains_else smalllist k=1_000      0.000
counter          smalllist k=1_000      0.000
for_counter      smalllist k=1_000      0.000
get_or_default   smalllist k=1_000      0.000
default_dict     smalllist k=1_000      0.000
try_except       smalllist k=1_000      0.000
numpy_freq       smalllist k=1_000      0.002
if_contains_else biglist k=1_000_000    0.211
counter          biglist k=1_000_000    0.109
for_counter      biglist k=1_000_000    0.378
get_or_default   biglist k=1_000_000    0.193
default_dict     biglist k=1_000_000    0.178
try_except       biglist k=1_000_000    0.173
numpy_freq       biglist k=1_000_000    0.145

Type hints can be checked statically, e.g. with mypy:

def greeting(name: str) -> str:
    return 'Hello ' + name
greeting(3)         # Argument 1 to "greeting" has incompatible type "int"; expected "str"
greeting(b'Alice')  # Argument 1 to "greeting" has incompatible type "bytes"; expected "str"
greeting("World!")  # No error
def bad_greeting(name: str) -> str:
    return 'Hello ' * name  # Unsupported operand types for * ("str" and "str")

# Before Python 3.12: typing.TypeAlias (available since 3.10)
# from typing import TypeAlias
# Vector: TypeAlias = list[float]
# Python 3.12
type Vector = list[float]
def scale(scalar: float, vector: Vector) -> Vector:
    return [scalar * num for num in vector]
# passes type checking; a list of floats qualifies as a Vector.
new_vector = scale(2.0, [1.0, -4.2, 5.4])

from typing import TypedDict, Unpack, Required, NotRequired
class KWArgs(TypedDict):
    a: Required[int]
    b: NotRequired[str]
def consume(**kwargs: Unpack[KWArgs]) -> None:
    a = kwargs["a"]
    b = kwargs.get("b", "default value")
try:
    consume()          # Rejected by the type checker ("a" is required);
                       # at runtime the missing key raises KeyError.
except KeyError:
    pass
consume(a=1)           # Allowed.
consume(a=1, b="abc")  # Allowed.Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!

>>> from Bio.Seq import Seq
>>> simple_seq = Seq("GATC")
>>> from Bio.SeqRecord import SeqRecord
>>> simple_seq_r = SeqRecord(simple_seq)
>>> simple_seq_r.id
'<unknown id>'
>>> simple_seq_r.id = "AC12345"
>>> simple_seq_r.description = "Made up sequence I wish I could write a paper about"
>>> print(simple_seq_r.description)
Made up sequence I wish I could write a paper about
>>> simple_seq_r.seq
Seq('GATC')

>>> from Bio import SeqIO
>>> records = (
        rec.reverse_complement(id="rc_"+rec.id, description = "reverse complement") 
        for rec in SeqIO.parse("file.fasta", "fasta")
        # optional guard clause:
        # if len(rec)<700
    )
>>> SeqIO.write(records, "rev_file.fasta", "fasta")
Notice the generator usage!
# solexa2sanger_fq.py
import sys
from Bio import SeqIO
SeqIO.convert(sys.stdin, "fastq-solexa", sys.stdout, "fastq")

gunzip -c some_solexa.fastq.gz | python solexa2sanger_fq.py
python solexa2sanger_fq.py < some_solexa.fastq > some_phred.fastq

Great for xargs or GNU parallel!
Alternative: use joblib or Dask task submissions for parallel processing in Python.
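A hedged sketch of the joblib route (the file names and the convert helper are hypothetical):

from joblib import Parallel, delayed
from Bio import SeqIO

def convert(path):
    # hypothetical helper: write a Sanger-encoded copy next to the input file
    return SeqIO.convert(path, "fastq-solexa", path + ".sanger.fastq", "fastq")

files = ["a_solexa.fastq", "b_solexa.fastq"]
counts = Parallel(n_jobs=4)(delayed(convert)(f) for f in files)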