linkFiles

Manipulate files for automating ETL pipelines.

linkStats

import os
from datetime import datetime
f = 'file.txt'
size = os.stat(f).st_size
last_modified = datetime.fromtimestamp(os.stat(f).st_mtime)

linkTemporary

from tempfile import TemporaryDirectory
with TemporaryDirectory() as tmpdir:
    # do stuff
    # context manager cleans up after itself
# OR
import tempfile
import shutil
tmpdir = tempfile.mkdtemp()
# do stuff
# and clean up after yourself
shutil.rmtree(tmpdir)

linkCreating

import os
folder = os.path.abspath('folder')
if not os.path.exists(folder):
    os.makedirs(folder)

linkCopying

import os
import glob
import shutil
src_dir = 'folder_in'
dst_dir = 'folder_out'
if not os.path.exists(dst_dir):
  os.makedirs(dst_dir)
for f in glob.glob(os.path.join(src_dir, '*.txt')):
  shutil.copy(f, dst_dir)

linkZipping

import zipfile
import os
src_dir = 'folder'
dst_dir = 'folder.zip'
zp = zipfile.ZipFile(dst_dir, 'w')
for folder, subfolder, files in os.walk(src_dir):
  for f in files:
    zp.write(os.path.join(folder,f), os.path.relpath(os.path.join(folder,f), dst_dir), compress_type=zipfile.ZIP_DEFLATED)
zp.close()

linkDecompressing

import gzip
import io
infile = 'compressed.gz'
outfile = 'decompressed.txt'
with gzip.open(infile, 'rb') as inf:
  with io.open(outfile, 'wb+') as outf:
    outf.write(inf.read())

linkShell

Run command line operations from within python.

linkExecuting Commands

import subprocess
infiles = os.path.join('folder', '*.gz')
outfile = 'combined.gz'

# windows
cmd = 'copy /B /Y {} {}'.format(infiles, outfile)
# unix
cmd = 'cat {} > {}'.format(infiles, outfile)

subprocess.run(cmd, shell=True)

linkSecurity

Always hide credentials or keys behind the scenes.

linkAccessing Credentials

From files. The file should contain a dictionary, looking something like this:

{
  'username': 'fake_user',
  'password': 'fake_password'
}
import json
with open('credentials', 'r') as f:
    creds = json.loads(f.read())

From environmental variables.

import os
username = os.environ['USERNAME']
password = os.environ['PASSWORD']

linkSystem

Access and use system settings in a workflow.

linkTime

import time
from datetime import datetime

start = datetime.now()
# do something
time.sleep(2)
# or sleep two seconds
end = datetime.now()

time_delta = end - start
processing_speed = time_delta.seconds

linkOS

import platform
print(platform.system())

linkPython Installation

import sys
# python installation location
print(sys.executable)
# python version
print(sys.version_info.major)

linkModify PATH

Temporarily add a folder to you path, maybe for importing a custom python module.

import sys
import os
custom_module = os.path.abspath('custom_module')
sys.path.append(custom_module)

linkHTTP

Operations over HTTP(s)

linkAPI

import requests
url = 'https://site/endpoint'
payload = {
  'item1': item1,
  'item2': item2
}
headers = {'hitem1': hitem1}
r = requests.post(url, headers=headers, json=payload)

linkDownload

import urllib
url = 'https://site/endpoint/file'
urllib.urlretrieve(url)

linkSQL

Connect to a database and execute a sql statement with the following.

This following example uses SQLAlchemy. Depending on the type of database you are connecting to, there is an extra configuration step to install the right Python-to-ODBC connection, then choose the correct engine below.

PostgreSQL: pip install psycopg2, see psycopg homepage

MySQL: pip install mysqlclient, see mysqlclient-python on github which supports python 3.6. If on Windows, find the correct wheel file for your platform on pypi

Oracle: pip install cx_Oracle see python-cx_Oracle on github. You will also need the Oracle Client.

import json
# read database credentials from json file
with open('creds_file') as f:
  creds = json.loads(f.read())

from sqlalchemy import create_engine
# create appropriate database connection engine

#postgresql
db = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
  creds['user'],
  creds['password'],
  creds['host'],
  creds['port'],
  creds['database']),
    encoding='utf8')

#mysql
db = create_engine('mysql+mysqldb://{}:{}@{}:{}/{}?charset=utf8&local_infile=1'.format(
    creds['user'],
    creds['password'],
    creds['host'],
    creds['port'],
    creds['database']),
      encoding='utf8')

#oracle
db = create_engine('oracle+cx_oracle://{}:{}@{}:{}/{}'.format(
  creds['user'],
  creds['password'],
  creds['host'],
  creds['port'],
  creds['tns']),
    encoding='utf8')

#write some data to file from pandas
#(format NULLs for MySQL acceptance)
df.fillna('\\N').to_csv('datafile.txt', sep='\t', encoding='utf8', quotechar='"', line_terminator='\n')

# define some sql statement
sql = """
  LOAD DATA LOCAL INFILE '{data}'
  INTO TABLE {schema}.{table}
  CHARACTER  SET utf8
  FIELDS TERMINATED BY '\t'
  OPTIONALLY] ENCLOSED BY '"'
  LINES TERMINATED BY '\n'
  IGNORE 1 LINES
""".format(data='datafile.txt',
           schema='db',
           table='data')

# execute sql
connection = db.connect()
r = connection.execute(sql)
connection.close()

linkProcessing

linkProgress

tqdm is a super nice library for showing a progress bar when executing loops. They even have a pandas integration.

It runs in terminal by default, you can also run it in a Jupyter Notebook. Here is good example, using enumerate.

from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook

with tqdm_notebook(total=len(data)) as pbar:
  for i,file in enumerate(data):
    pbar.update(1)

linkRuntime

Various procedures to consider when building .py executables.

linkArguments

Pass command line arguments to a .py during execution. Simply copy the below to a test.py file and run test.py -h from the command line.

import argparse
parser = argparse.ArgumentParser(description='Example on how to use ArgumentParser', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

def bool_string_input(s):
    if s in ['True','False']:
        return s == 'True'
    else:
        raise ValueError('Incorrect boolean value provided. Please specify one of "True" or "False"')

parser.add_argument('-a', '--argument', dest='myargument', type=str, help='Example argument description.', default='test')
parser.add_argument('-b', '--boolean', dest='myboolean', type=bool_string_input, help='Example boolean argument description.', default=False)

args = parser.parse_args()

myargument = args.myargument
myboolean = args.myboolean

linkLogging

Setup a basic logging configuration for outputting log messages to console during runtime.

import Logging

logger = logging.getLogger('MyScript')

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)-15s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

logger.addHandler(ch)

logger.info('Doing stuff')
logger.info('Doing other stuff')

linkSMTP Email

Find a full working example of how to send emails with attachments, in python, using the SMTP protocol.

import smtplib
from email.message import EmailMessage
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

message_text = """
Automated message sent form Python job.

{custom}
""".format(custom="some custom message")

message_html = """
<html>
  <head></head>
  <body>
      <p>Automated message sent form Python job.</p>
      <br><br>
      <p>{custom}</p>
  </body>
</html>
""".format(custom="some custom message")

msg = MIMEMultipart('alternative')

msg_text_part = MIMEText(message_text, 'plain')
msg_html_part = MIMEText(message_html, 'html')

msg.attach(msg_text_part)
msg.attach(msg_html_part)

msg['Subject'] = 'Python Job Finished'
from_email = 'your_email@domain.com'
msg['From'] = from_email
to_email = 'their_email@domain.com'
msg['To'] = to_email

for f in attachments:
    with open(f,'rb') as b:
        part = MIMEApplication(
            b.read(),
            Name=os.path.basename(f)
        )
        part['Content-Disposition'] = 'attachment; filename="{}"'.format(os.path.basename(f))
        msg.attach(part)

s = smtplib.SMTP('smtp-server.yourcompany')
s.sendmail(from_email, to_email, msg.as_string())

linkJupyter

Jupyter Notebooks are a great place to write and debug code on the fly and interactively. Here are a few added magic functions or tricks to make going back and forth a bit more convenient.

Run a python file from within a Jupyter Notebook with:

!python program.py -h

Load a python file into a Jupyter cell with the following.

%load program.py

And if running some code inside a Jupyter Notebook, which utilizes argparse and thus undesirable breaks execution, you can allow Jupyter Notebook to pass this section by doing the following.

if get_ipython().__class__.__name__ == 'ZMQInteractiveShell'
    sys.argv = ['-f']