Python

pandas

import pandas as pd
# Widen pandas' console display limits so large frames print without truncation.
# NOTE: the old 'display.height' option no longer exists in pandas (setting it
# raises OptionError); 'display.max_rows' controls how many rows are shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

Source.

Replace NaN in one column with value from corresponding row of second column

# Assign the result back instead of calling fillna(inplace=True) on the
# df['colA'] selection: that selection can be a temporary copy (and always is
# under pandas Copy-on-Write), so an in-place fill may never reach `df`.
df['colA'] = df['colA'].fillna(df['colB'])

Source

Identify rows with NaN values

df[df['a'].isnull()]

Source

Conditionally replace values

# Method 1; See https://stackoverflow.com/a/21608417/3998252
# (.ix was removed in pandas 1.0; .loc accepts the same boolean mask + column label)
df.loc[df['A'] > 20000, 'A'] = 0

# Method 2; See https://stackoverflow.com/a/19913845/3998252
import numpy as np
df['A'] = np.where(df['B'] == 'Z', 1, 0)

# Method 3; See https://stackoverflow.com/a/31173785/3998252
df['A'] = [1 if x == 'Z' else 0 for x in df['B']]

Remove columns

# to get into a new dataframe
df = df.drop('column_name', axis=1)

# to remove it inplace
df.drop('column_name', axis=1, inplace=True)

Source

Splitting column into multiple columns

# create a df
> df = pd.DataFrame(data={'aaa':['a 2 3','b 6 7 8']})
> df
       aaa
0    a 2 3
1  b 6 7 8

# to keep 1 item in a column and store remaining in another column
> df['x'], df['y'] = df['aaa'].str.split(' ', 1).str
> df[['x', 'y']]
   x      y
0  a    2 3
1  b  6 7 8

# split column into multiple columns by a delimiter
> df = df['aaa'].str.split(' ', expand=True)
> df
   0  1  2     3
0  a  2  3  None
1  b  6  7     8

Source

Extract strings by position into new column

data_pd['new_col'] = data_pd['col_X'].str[:1]

Source

Dataframe in percentage

# if by row
df.apply(lambda  x: x / x.sum() * 100, axis=1)

# if by column
df.apply(lambda  x: x / x.sum() * 100, axis=0)

Sorting multilevel dataframe (pivot table)

    Group1    Group2
    A B C     A B C
1   1 0 3     2 5 7
2   5 6 9     1 0 0
3   7 0 2     0 3 5

# Sorted by column 'C' of 'Group1'. They need to be in a TUPLE.
# (DataFrame.sort was removed from pandas; sort_values is its replacement)
df.sort_values(by=[('Group1', 'C')], ascending=False)

  Group1       Group2
       A  B  C      A  B  C
2      5  6  9      1  0  0
1      1  0  3      2  5  7
3      7  0  2      0  3  5

Source

Adding new column with mapped value from a dictionary

df["col_2"] = df["col_1"].map(dict_name)

Source

If desired to replace existing column with dictionary value: df.replace({"col_1": dict_name})

Source

Make dictionary from Pandas columns

# works only if 1:1 key to value pairing
dict_name = df.set_index('key_col')['value_col'].to_dict()

Source

Convert dataframe to file object

from io import StringIO  # Python 3 location of StringIO (cStringIO exists only in Python 2)

# Write the frame as tab-separated text into an in-memory buffer.
xx = StringIO()
data_pd.to_csv(xx, index=False, sep='\t')
xx.seek(0)  # this is important; position set to the beginning

Source

Rename column name

df2 = df.rename(columns={'old_name' : 'new_name'})

# in-place
df2.rename(columns={'old_name' : 'new_name'}, inplace = True)

Ignore commented lines when reading files

Set comment='#' when using pd.read_csv

Reading massive files in chunks

Use iterator option for iterating file in chunks

chunksize = 10**6
# Each iteration receives the next block of up to `chunksize` rows as its own DataFrame.
for df in pd.read_csv('filename.gz',sep='\t', header=None, chunksize=chunksize, iterator=True):
    print(df[0].value_counts())  # print() call: the Python 2 print statement is a SyntaxError on Python 3

See here for more info on this.

Checking if item is NaN or not

# using numpy
numpy.isnan(item)   # item can't be string dtype

# using math
math.isnan(item)    # item can't be string dtype

# using pandas
pandas.isnull(item) # allows string dtype

Charts

Creating reproducible charts

http://www.jesshamrick.com/2016/04/13/reproducible-plots/

Why use fig, ax = plt.subplots()

See here

Change figure parameters globally for all figures in a script

See this. See this page for parameters that can be modified -http://matplotlib.org/users/customizing.html.

Legends for chart with two Y-axes

fig, ax1 = plt.subplots()   # primary
ax2 = ax1.twinx()   # for second y-axis

ax1.bar(x,y)
ax2.bar(x,y)

# for legend: collect handles/labels from BOTH axes and draw one combined legend
plots1, labels1 = ax1.get_legend_handles_labels()   # was `ax`, which is never defined here
plots2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(plots1 + plots2, labels1 + labels2)

Source

Force axis tick labels to be integers

from matplotlib.ticker import MaxNLocator
....
ax.yaxis.set_major_locator(MaxNLocator(integer=True))

Source Documentation

xticks and xtick_labels

ax.set_xticks(x_axis)
ax.set_xticklabels(x_labels, rotation=45, ha='right')

# when using seaborn where x_labels were preset, for example-seaborn
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

importing matplotlib in cluster

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

Source

Axis in exponential format

ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

Legend positioning without crowding

Method 1 (Preferred):

lgd = ax.legend(bbox_to_anchor=(0., -0.3, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
fig.savefig('a.png', bbox_extra_artists=(lgd,), bbox_inches='tight')

Source

Method 2:

# Shrink the current axis to 85% of its width to accommodate the legend.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.85, box.height])

# change 'bbox_to_anchor' for legend positions
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))

constrained_layout

constrained_layout automatically adjusts subplots and decorations like legends and colorbars so that they fit in the figure window while still preserving, as best they can, the logical layout requested by the user.

Warning: Constrained Layout is experimental in matplotlib v3

plt.subplots(constrained_layout=True)

Source

subprocess

Stream stdout to terminal

try:
    # Merge stderr into stdout: a separate stderr pipe that is never read can
    # fill up and block the child process while we wait on its stdout.
    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
    )

    # Echo each line as soon as the child process produces it.
    for line in proc.stdout:
        print(line.strip("\n"))

    # Wait for the child process to exit after its output is exhausted.
    proc.wait()
except Exception:
    LOGGER.exception(f"Exception occurred when running command: '{cmd}'")
    # Exit with a non-zero status; a bare `raise SystemExit` reports success (status 0).
    raise SystemExit(1)

return None

logging

Boilerplate using colorlog

colorlog makes logs colorful and works across OS platforms.

import logging

import colorlog

logger = colorlog.getLogger(__name__)
# Take the level constant from the stdlib logging module; the nested
# `colorlog.colorlog` module path is not available in every colorlog release.
logger.setLevel(logging.DEBUG)

# Send records to a stream handler that colors each line by log level.
handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter(
        "%(log_color)s%(asctime)s %(name)s %(levelname)-8s %(message)s",
        datefmt="%m-%d-%y %H:%M:%S"))
logger.addHandler(handler)

Why use __name__?

The logger is named after the __name__ variable. When the file is run directly, __name__ is __main__, which is the name Python assigns to the module where execution starts. If this file is imported by some other module, __name__ is instead the module's own name, e.g. logging_example. (See Source for how it would look.)

Levels available example:

logger.debug("Debug message")
logger.info("Information message")
logger.warning("Warning message")
logger.error("Error message")
logger.critical("Critical message")

Using fileConfig

Setup config file

[loggers]
keys=root

[logger_root]
handlers=stream
level=DEBUG

[formatters]
keys=color

[formatter_color]
class=colorlog.ColoredFormatter
format=%(log_color)s%(asctime)s %(name)-12s %(levelname)-8s%(reset)s %(message)s
datefmt=%m-%d-%y %H:%M:%S

[handlers]
keys=stream

[handler_stream]
class=StreamHandler
formatter=color
args=(sys.stdout,)

Use this config in a script as shown below. Note that, in this implementation, colorlog does not need to be imported explicitly in the script: fileConfig imports it because the config file references the colorlog formatter.

import logging.config
logging.config.fileConfig('config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)

logging in modules

Above config based setup works for multiple modules as well. Example:

Module file: xxx.py

import logging.config
logging.config.fileConfig('config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)

def xxx():
    """Emit one INFO and one WARNING record through the module-level ``logger``."""
    logger.info('info msg')
    logger.warning('warning msg')

if __name__ == '__main__':
    xxx()

Main file: main.py

import xxx
import logging.config
logging.config.fileConfig('config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)

def main():
    """Log a message from the main script, then call ``xxx.xxx()`` so the imported module logs too."""
    logger.info('main script message')
    xxx.xxx()


if __name__ == '__main__':
    main()

Source

Disable logs from specific libraries

When logging in debug mode, logs from some third party python libraries (eg.: matplotlib, requests) are also produced. To turn them off:

# Add this code after code for logging config
logging.getLogger('matplotlib').setLevel(logging.WARNING)

Source

Capturing stack traces

Use exc_info=True with error level.

logging.error("Exception occurred", exc_info=True)

Shortcut for above:

logging.exception("Exception occurred")

rich

Source

Logging

Boilerplate:

import logging
from rich.logging import RichHandler

logging.basicConfig(
    level="DEBUG",
    format="%(name)-12s %(message)s",
    datefmt="%m-%d-%y %H:%M:%S",
    handlers=[RichHandler(rich_tracebacks=True)]
)

logger = logging.getLogger(__name__)

Traceback

# makes rich default traceback handler so that all uncaught exceptions will be rendered with highlighting
from rich.traceback import install
install()

Progress bar

Basic usage:

from rich.progress import track

# Use a separate loop variable so the total count `n` is not overwritten while iterating.
for i in track(range(n), description="Processing..."):
    do_work(i)

See here for advanced setup.

Virtual environment

pipenv

pipenv is the easiest way to manage virtual environments, as it automates several easy-to-forget-but-required steps. It requires Python 3 to be installed, though.

a. Creating new/fresh virtual environment

cd project_dir

# to initiate pipenv virtual env
pipenv install

# to install required packages
pipenv install [packages]

# **IMPORTANT** To LOCK pipfile with EXACT version info
pipenv lock

# activate virtual environment
pipenv shell

# exit virtual environment
exit

b. Recreating virtual environment from Pipfile

# installs all packages from Pipfile
pipenv install

# activate virtual environment
pipenv shell

# exit virtual environment
exit

c. Opening existing pipenv project

cd project_dir

# activate virtual environment
pipenv shell

# exit virtual environment
exit

d. Remove virtual environment

cd <project_dir>
pipenv --rm

e. Simply running a python script with pipenv without spawning a new shell

pipenv run python script.py

Jupyter notebook

Running jupyter notebook under pipenv is possible. See jupyter_notebook.md

pytest

Random tips

  • Use -s flag to show stdout and stderr, which are otherwise not shown by default.

Testing multiple parameters for a test

https://docs.pytest.org/en/latest/parametrize.html

Example:

import pytest
@pytest.mark.parametrize(
    "test_input, expected",
    [("3+5", 8), ("2+4", 6), ("6*9", 42)],
    ids=['test1', 'test2', 'test3'],
)
def test_eval(test_input, expected):
    """Evaluate each parametrized expression string and compare it with its expected value."""
    # NOTE(review): the "6*9" case expects 42, so 'test3' fails as written (6*9 is 54)
    # — presumably kept on purpose to demonstrate a failing case; confirm.
    result = eval(test_input)
    assert result == expected

Using parameters with fixtures

This is allowed using indirect.

@pytest.fixture()
def yup_docker(request):
    """Run hello.py inside the ``yup_docker`` image and yield its stripped stdout.

    The text passed to hello.py is ``request.param`` with surrounding
    whitespace removed.
    """
    docker_image = "yup_docker"
    cmd = f'docker run --rm {docker_image} python hello.py "{request.param.strip()}"'

    # NOTE(review): splitting on single spaces also splits the quoted argument and keeps
    # the literal quote characters — verify that hello.py expects its input this way.
    completed = subprocess.run(cmd.split(' '), stdout=subprocess.PIPE)
    captured = completed.stdout.decode("utf-8").strip()
    yield captured


@pytest.mark.parametrize(
    'yup_docker, expected',
    [
        ('yo yo yo', '3'),
        ('yo', '1'),
    ],
    ids=['cellobiose', 'urea'],
    indirect=['yup_docker'],
)
def test_hello(yup_docker, expected):
    """Check that the value provided by the ``yup_docker`` fixture equals the expected string."""
    # indirect=['yup_docker'] routes the first parameter through the fixture of the same name.
    assert yup_docker == expected

xfail

Use xfail for test cases that are expected to fail.

Auto-run tests on saving

Install plugin pytest-xdist and then use -f flag in pytest command.

Note: the -f flag suppresses color output; use --color=yes to restore colors.

Coverage

https://pytest-cov.readthedocs.io/en/latest/readme.html#id1

Simple execution: py.test --cov=myproj tests/

Generate reports: py.test --cov-report term --cov=myproj tests/

--cov-report takes values html, xml, term and annotate

temp files/dir

Built-in module tempfile helps to easily manage temp files/dirs. They will get deleted automatically when the script exits, unless mentioned otherwise. They can be named or unnamed.

Example:

import tempfile

# Create a named temporary file; `tmp.name` holds its path on disk.
# NOTE(review): per the tempfile docs, reopening the file by name while `tmp` is still
# open works on Unix but not on Windows — confirm for your platform.
tmp = tempfile.NamedTemporaryFile()

# Open the file for writing. NOTICE: pass `tmp.name` (the path), not `tmp` itself.
with open(tmp.name, 'w') as f:
    f.write(stuff) # where `stuff` is the string to write

...

# Open the file for reading. NOTICE: again pass `tmp.name`, not `tmp`.
with open(tmp.name) as f:
    for line in f:
        ... # more things here