Python
pandas
Print without breaking rows/columns
import pandas as pd
# Raise the display limits so frames print without truncated rows/columns
# or wrapped lines. 'display.height' is not set: the option no longer exists
# in pandas and setting it raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)
Replace NaN in one column with value from corresponding row of second column
# Assign the result back instead of using inplace=True on df['colA']:
# the in-place form is a chained assignment that may act on a temporary
# copy and leave df unchanged (always the case under Copy-on-Write).
df['colA'] = df['colA'].fillna(df['colB'])
Identify rows with NaN values
df[df['a'].isnull()]
Conditionally replace values
# Method 1; See https://stackoverflow.com/a/21608417/3998252
# (.loc replaces the .ix indexer, which was removed from pandas)
df.loc[df['A'] > 20000, 'A'] = 0
# Method 2; See https://stackoverflow.com/a/19913845/3998252
import numpy as np
df['A'] = np.where(df['B'] == 'Z', 1, 0)
# Method 3; See https://stackoverflow.com/a/31173785/3998252
df['A'] = [1 if x == 'Z' else 0 for x in df['B']]
Remove columns
# to get into a new dataframe
df = df.drop('column_name', axis=1)
# to remove it inplace
df.drop('column_name', axis=1, inplace=True)
Splitting column into multiple columns
# create a df
> df = pd.DataFrame(data={'aaa':['a 2 3','b 6 7 8']})
> df
aaa
0 a 2 3
1 b 6 7 8
# to keep 1 item in a column and store remaining in another column
> df['x'], df['y'] = df['aaa'].str.split(' ', 1).str
> df[['x', 'y']]
x y
0 a 2 3
1 b 6 7 8
# split column into multiple columns by a delimiter
> df = df['aaa'].str.split(' ', expand=True)
> df
0 1 2 3
0 a 2 3 None
1 b 6 7 8
Extract strings by position into new column
data_pd['new_col'] = data_pd['col_X'].str[:1]
Dataframe in percentage
# if by row
df.apply(lambda x: x / x.sum() * 100, axis=1)
# if by column
df.apply(lambda x: x / x.sum() * 100, axis=0)
Sorting multilevel dataframe (pivot table)
Group1 Group2
A B C A B C
1 1 0 3 2 5 7
2 5 6 9 1 0 0
3 7 0 2 0 3 5
# Sorted by column 'C' of 'Group1'. They need to be in a TUPLE.
# (DataFrame.sort was removed from pandas; sort_values is its replacement.)
df.sort_values([('Group1', 'C')], ascending=False)
Group1 Group2
A B C A B C
2 5 6 9 1 0 0
1 1 0 3 2 5 7
3 7 0 2 0 3 5
Adding new column with mapped value from a dictionary
df["col_2"] = df["col_1"].map(dict_name)
If desired to replace existing column with dictionary value: df.replace({"col_1": dict_name})
Make dictionary from Pandas columns
# works only if 1:1 key to value pairing
dict_name = df.set_index('key_col')['value_col'].to_dict()
Convert dataframe to file object
from io import StringIO  # cStringIO exists only in Python 2
xx = StringIO()
data_pd.to_csv(xx, index=False, sep='\t')
xx.seek(0) # this is important; position set to the beginning
Rename column name
df2 = df.rename(columns={'old_name' : 'new_name'})
# in-place
df2.rename(columns={'old_name' : 'new_name'}, inplace = True)
Ignore commented lines when reading files
Set comment='#' when using pd.read_csv
Reading massive files in chunks
Use iterator option for iterating file in chunks
chunksize = 10**6
# each iteration yields a DataFrame holding at most `chunksize` rows
for df in pd.read_csv('filename.gz', sep='\t', header=None, chunksize=chunksize, iterator=True):
    print(df[0].value_counts())
See here for more info on this.
Checking if item is NaN or not
# using numpy
numpy.isnan(item) # item can't be string dtype
# using math
math.isnan(item) # item can't be string dtype
# using pandas
pandas.isnull(item) # allows string dtype
Charts
Creating reproducible charts
http://www.jesshamrick.com/2016/04/13/reproducible-plots/
Why use fig, ax = plt.subplots()
Change figure parameters globally for all figures in a script
See this. See this page for the parameters that can be modified: http://matplotlib.org/users/customizing.html
Legends for chart with two Y-axes
fig, ax1 = plt.subplots() # primary
ax2 = ax1.twinx() # for second y-axis
ax1.bar(x,y)
ax2.bar(x,y)
# for legend: gather handles/labels from BOTH axes and draw one combined
# legend (only artists plotted with a label=... are returned)
plots1, labels1 = ax1.get_legend_handles_labels()
plots2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(plots1 + plots2, labels1 + labels2)
Force axis tick labels to be integers
from matplotlib.ticker import MaxNLocator
....
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
xticks and xtick_labels
ax.set_xticks(x_axis)
ax.set_xticklabels(x_labels, rotation=45, ha='right')
# when using seaborn where x_labels were preset, for example-seaborn
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
importing matplotlib in cluster
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
Axis in exponential format
ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
Legend positioning without crowding
Method 1 (Preferred):
lgd = ax.legend(bbox_to_anchor=(0., -0.3, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
fig.savefig('a.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
Method 2:
# Shrink the current axis to 85% of its width to accommodate the legend.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.85, box.height])
# change 'bbox_to_anchor' for legend positions
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))
constrained_layout
constrained_layout automatically adjusts subplots and decorations like legends and colorbars so that they fit in the figure window while still preserving, as best they can, the logical layout requested by the user.
Warning: constrained_layout is experimental in Matplotlib v3.
plt.subplots(constrained_layout=True)
subprocess
Stream stdout to terminal
def run_command(cmd):
    """Run ``cmd`` and echo its output to the terminal line by line.

    stderr is merged into stdout: a separate stderr pipe that is never read
    can fill up and block the child process indefinitely.

    Logs the exception and exits with a non-zero status if the command
    cannot be started or its output cannot be read.
    """
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
        )
        for line in proc.stdout:
            print(line.strip("\n"))
        proc.wait()
    except Exception:
        LOGGER.exception(f"Exception occurred when running command: '{cmd}'")
        # exit status 1 so callers/shells see the failure (bare SystemExit exits 0)
        raise SystemExit(1)
    return None
logging
Boilerplate using colorlog
colorlog makes logs colorful and works across OS platforms.
import logging
import colorlog
logger = colorlog.getLogger(__name__)
# take the level constant from the stdlib rather than reaching through
# colorlog's internal module layout (colorlog.colorlog.logging)
logger.setLevel(logging.DEBUG)
handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter(
    "%(log_color)s%(asctime)s %(name)s %(levelname)-8s %(message)s",
    datefmt="%m-%d-%y %H:%M:%S"))
logger.addHandler(handler)
Why use __name__?
`__name__` holds the name of the current module, so the logger is named after the module that created it. In the module where execution starts, Python sets `__name__` to `__main__`, so the logger name is logged as `__main__`. If this file is imported by some other module, `__name__` instead corresponds to the file's own module name, e.g. `logging_example`. Here’s how it would look: Source
Levels available example:
logger.debug("Debug message")
logger.info("Information message")
logger.warning("Warning message")
logger.error("Error message")
logger.critical("Critical message")
Using fileConfig
Setup config file
[loggers]
keys=root
[logger_root]
handlers=stream
level=DEBUG
[formatters]
keys=color
[formatter_color]
class=colorlog.ColoredFormatter
format=%(log_color)s%(asctime)s %(name)-12s %(levelname)-8s%(reset)s %(message)s
datefmt=%m-%d-%y %H:%M:%S
[handlers]
keys=stream
[handler_stream]
class=StreamHandler
formatter=color
args=(sys.stdout,)
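Use this config in the script as shown below. Note that, in this implementation, colorlog does not need to be imported explicitly in the script: it is imported when the config file is loaded, because the config file uses the
colorlog formatter (colorlog must still be installed).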
import logging.config
logging.config.fileConfig('config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)
logging in modules
Above config based setup works for multiple modules as well. Example:
Module file: xxx.py
import logging.config
logging.config.fileConfig('config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)
def xxx():
logger.info('info msg')
logger.warning('warning msg')
return None
if __name__ == '__main__':
xxx()
Main file: main.py
import xxx
import logging.config
logging.config.fileConfig('config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)
def main():
logger.info('main script message')
xxx.xxx()
return None
if __name__ == '__main__':
main()
Disable logs from specific libraries
When logging in debug mode, logs from some third party python libraries (eg.: matplotlib, requests) are also produced. To turn them off:
# Add this code after code for logging config
logging.getLogger('matplotlib').setLevel(logging.WARNING)
Capturing stack traces
Use exc_info=True with error level.
logging.error("Exception occurred", exc_info=True)
Shortcut for above:
logging.exception("Exception occurred")
rich
Logging
Boilerplate:
import logging
from rich.logging import RichHandler
logging.basicConfig(
level="DEBUG",
format="%(name)-12s %(message)s",
datefmt="%m-%d-%y %H:%M:%S",
handlers=[RichHandler(rich_tracebacks=True)]
)
logger = logging.getLogger(__name__)
Traceback
# makes rich default traceback handler so that all uncaught exceptions will be rendered with highlighting
from rich.traceback import install
install()
Progress bar
Basic usage:
from rich.progress import track
# use a separate loop variable so the total count `n` is not overwritten
for i in track(range(n), description="Processing..."):
    do_work(i)
See here for advanced setup.
Virtual environment
pipenv
pipenv is the easiest way to manage a virtual environment, as it automates several easy-to-forget-but-required steps. It requires Python 3 to be installed, though.
a. Creating new/fresh virtual environment
cd project_dir
# to initiate pipenv virtual env
pipenv install
# to install required packages
pipenv install [packages]
# **IMPORTANT** To LOCK pipfile with EXACT version info
pipenv lock
# activate virtual environment
pipenv shell
# exit virtual environment
exit
b. Recreating virtual environment from Pipfile
# installs all packages from Pipfile
pipenv install
# activate virtual environment
pipenv shell
# exit virtual environment
exit
c. Opening existing pipenv project
cd project_dir
# activate virtual environment
pipenv shell
# exit virtual environment
exit
d. Remove virtual environment
cd <project_dir>
pipenv --rm
e. Simply running a python script with pipenv without spawning a new shell
pipenv run python script.py
Jupyter notebook
Running jupyter notebook under pipenv is possible. See jupyter_notebook.md
pytest
Random tips
- Use the `-s` flag to show `stdout` and `stderr`, which are otherwise not shown by default.
Testing multiple parameters for a test
https://docs.pytest.org/en/latest/parametrize.html
Example:
import pytest
@pytest.mark.parametrize("test_input, expected", [
("3+5", 8),
("2+4", 6),
("6*9", 42)
],
ids=['test1', 'test2', 'test3']
)
def test_eval(test_input, expected):
assert eval(test_input) == expected
Using parameters with fixtures
This is allowed using indirect.
@pytest.fixture()
def yup_docker(request):
# docker cmd
docker_image = "yup_docker"
cmd = f'docker run --rm {docker_image} python hello.py "{request.param.strip()}"'
output = subprocess.run(cmd.split(' '), stdout=subprocess.PIPE)
yield output.stdout.decode("utf-8").strip()
@pytest.mark.parametrize('yup_docker, expected',[
('yo yo yo', '3'),
('yo', '1'),
],
ids=['cellobiose', 'urea'],
indirect=['yup_docker'])
def test_hello(yup_docker, expected):
output = yup_docker
assert output == expected
xfail
- Use `xfail` for test cases that are expected to fail.
- The `raises` parameter can be used to ensure the failure is due to the expected exception.
- They can be used with `@pytest.mark.parametrize` as well.
Auto-run tests on saving
Install plugin pytest-xdist and then use -f flag in pytest command.
!!! info This -f flag suppresses color output; Use --color=yes to
restore colors.
Coverage
https://pytest-cov.readthedocs.io/en/latest/readme.html#id1
Simple execution: py.test --cov=myproj tests/
Generate reports: py.test --cov-report term --cov=myproj tests/
--cov-report takes values html, xml, term and annotate
temp files/dir
Built-in module tempfile helps to easily manage temp files/dirs.
They will get deleted automatically when the script exits, unless mentioned otherwise. They can be named or unnamed.
Example:
import tempfile
tmp = tempfile.NamedTemporaryFile()
# Open the file for writing. NOTICE calling 'tmp.name' instead of just `tmp'.
with open(tmp.name, 'w') as f:
f.write(stuff) # where `stuff` is, y'know... stuff to write (a string)
...
# Open the file for reading. NOTICE calling 'tmp.name' instead of just `tmp'
with open(tmp.name) as f:
for line in f:
... # more things here