Automating Everything In Python
Files
Manipulate files for automating ETL pipelines.
Stats
import os
from datetime import datetime
f = 'file.txt'
size = os.stat(f).st_size
last_modified = datetime.fromtimestamp(os.stat(f).st_mtime)
Temporary
from tempfile import TemporaryDirectory
with TemporaryDirectory() as tmpdir:
# do stuff
# context manager cleans up after itself
# OR
import tempfile
import shutil
tmpdir = tempfile.mkdtemp()
# do stuff
# and clean up after yourself
shutil.rmtree(tmpdir)
Creating
import os
folder = os.path.abspath('folder')
if not os.path.exists(folder):
os.makedirs(folder)
Copying
import os
import glob
import shutil
src_dir = 'folder_in'
dst_dir = 'folder_out'
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
for f in glob.glob(os.path.join(src_dir, '*.txt')):
shutil.copy(f, dst_dir)
Zipping
import zipfile
import os
src_dir = 'folder'
dst_dir = 'folder.zip'
zp = zipfile.ZipFile(dst_dir, 'w')
for folder, subfolder, files in os.walk(src_dir):
for f in files:
zp.write(os.path.join(folder,f), os.path.relpath(os.path.join(folder,f), dst_dir), compress_type=zipfile.ZIP_DEFLATED)
zp.close()
Decompressing
import gzip
import io
infile = 'compressed.gz'
outfile = 'decompressed.txt'
with gzip.open(infile, 'rb') as inf:
with io.open(outfile, 'wb+') as outf:
outf.write(inf.read())
Shell
Run command line operations from within python.
Executing Commands
import subprocess
infiles = os.path.join('folder', '*.gz')
outfile = 'combined.gz'
# windows
cmd = 'copy /B /Y {} {}'.format(infiles, outfile)
# unix
cmd = 'cat {} > {}'.format(infiles, outfile)
subprocess.run(cmd, shell=True)
Security
Always hide credentials or keys behind the scenes.
Accessing Credentials
From files. The file should contain a dictionary, looking something like this:
{
'username': 'fake_user',
'password': 'fake_password'
}
import json
with open('credentials', 'r') as f:
creds = json.loads(f.read())
From environmental variables.
import os
username = os.environ['USERNAME']
password = os.environ['PASSWORD']
System
Access and use system settings in a workflow.
Time
import time
from datetime import datetime
start = datetime.now()
# do something
time.sleep(2)
# or sleep two seconds
end = datetime.now()
time_delta = end - start
processing_speed = time_delta.seconds
OS
import platform
print(platform.system())
Python Installation
import sys
# python installation location
print(sys.executable)
# python version
print(sys.version_info.major)
Modify PATH
Temporarily add a folder to you path, maybe for importing a custom python module.
import sys
import os
custom_module = os.path.abspath('custom_module')
sys.path.append(custom_module)
HTTP
Operations over HTTP(s)
API
import requests
url = 'https://site/endpoint'
payload = {
'item1': item1,
'item2': item2
}
headers = {'hitem1': hitem1}
r = requests.post(url, headers=headers, json=payload)
Download
import urllib
url = 'https://site/endpoint/file'
urllib.urlretrieve(url)
SQL
Connect to a database and execute a sql statement with the following.
This following example uses SQLAlchemy. Depending on the type of database you are connecting to, there is an extra configuration step to install the right Python-to-ODBC connection, then choose the correct engine below.
PostgreSQL: pip install psycopg2
, see psycopg homepage
MySQL: pip install mysqlclient
, see mysqlclient-python on github which supports python 3.6. If on Windows, find the correct wheel file for your platform on pypi
Oracle: pip install cx_Oracle
see python-cx_Oracle on github. You will also need the Oracle Client.
import json
# read database credentials from json file
with open('creds_file') as f:
creds = json.loads(f.read())
from sqlalchemy import create_engine
# create appropriate database connection engine
#postgresql
db = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
creds['user'],
creds['password'],
creds['host'],
creds['port'],
creds['database']),
encoding='utf8')
#mysql
db = create_engine('mysql+mysqldb://{}:{}@{}:{}/{}?charset=utf8&local_infile=1'.format(
creds['user'],
creds['password'],
creds['host'],
creds['port'],
creds['database']),
encoding='utf8')
#oracle
db = create_engine('oracle+cx_oracle://{}:{}@{}:{}/{}'.format(
creds['user'],
creds['password'],
creds['host'],
creds['port'],
creds['tns']),
encoding='utf8')
#write some data to file from pandas
#(format NULLs for MySQL acceptance)
df.fillna('\\N').to_csv('datafile.txt', sep='\t', encoding='utf8', quotechar='"', line_terminator='\n')
# define some sql statement
sql = """
LOAD DATA LOCAL INFILE '{data}'
INTO TABLE {schema}.{table}
CHARACTER SET utf8
FIELDS TERMINATED BY '\t'
OPTIONALLY] ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES
""".format(data='datafile.txt',
schema='db',
table='data')
# execute sql
connection = db.connect()
r = connection.execute(sql)
connection.close()
Processing
Progress
tqdm is a super nice library for showing a progress bar when executing loops. They even have a pandas integration.
It runs in terminal by default, you can also run it in a Jupyter Notebook. Here is good example, using enumerate.
from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook
with tqdm_notebook(total=len(data)) as pbar:
for i,file in enumerate(data):
pbar.update(1)
Runtime
Various procedures to consider when building .py
executables.
Arguments
Pass command line arguments to a .py during execution. Simply copy the below to a test.py
file and run test.py -h
from the command line.
import argparse
parser = argparse.ArgumentParser(description='Example on how to use ArgumentParser', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
def bool_string_input(s):
if s in ['True','False']:
return s == 'True'
else:
raise ValueError('Incorrect boolean value provided. Please specify one of "True" or "False"')
parser.add_argument('-a', '--argument', dest='myargument', type=str, help='Example argument description.', default='test')
parser.add_argument('-b', '--boolean', dest='myboolean', type=bool_string_input, help='Example boolean argument description.', default=False)
args = parser.parse_args()
myargument = args.myargument
myboolean = args.myboolean
Logging
Setup a basic logging configuration for outputting log messages to console during runtime.
import Logging
logger = logging.getLogger('MyScript')
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)-15s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.info('Doing stuff')
logger.info('Doing other stuff')
SMTP Email
Find a full working example of how to send emails with attachments, in python, using the SMTP protocol.
import smtplib
from email.message import EmailMessage
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
message_text = """
Automated message sent form Python job.
{custom}
""".format(custom="some custom message")
message_html = """
<html>
<head></head>
<body>
<p>Automated message sent form Python job.</p>
<br><br>
<p>{custom}</p>
</body>
</html>
""".format(custom="some custom message")
msg = MIMEMultipart('alternative')
msg_text_part = MIMEText(message_text, 'plain')
msg_html_part = MIMEText(message_html, 'html')
msg.attach(msg_text_part)
msg.attach(msg_html_part)
msg['Subject'] = 'Python Job Finished'
from_email = 'your_email@domain.com'
msg['From'] = from_email
to_email = 'their_email@domain.com'
msg['To'] = to_email
for f in attachments:
with open(f,'rb') as b:
part = MIMEApplication(
b.read(),
Name=os.path.basename(f)
)
part['Content-Disposition'] = 'attachment; filename="{}"'.format(os.path.basename(f))
msg.attach(part)
s = smtplib.SMTP('smtp-server.yourcompany')
s.sendmail(from_email, to_email, msg.as_string())
Jupyter
Jupyter Notebooks are a great place to write and debug code on the fly and interactively. Here are a few added magic functions or tricks to make going back and forth a bit more convenient.
Run a python file from within a Jupyter Notebook with:
!python program.py -h
Load a python file into a Jupyter cell with the following.
%load program.py
And if running some code inside a Jupyter Notebook, which utilizes argparse
and thus undesirable breaks execution, you can allow Jupyter Notebook to pass this section by doing the following.
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell'
sys.argv = ['-f']