Sample PDF
Let's take a publicly accessible PDF as a sample, and for fun let's use my Master's thesis.
import urllib.request
import shutil
# Download location of the sample PDF.
url = 'https://raw.githubusercontent.com/knanne/vu_msc_tweetsumm/master/research/KainNanne_MSc_Thesis_ACM.pdf'
# Copy the HTTP response body into a local file; both handles are closed
# when their `with` blocks exit.
with urllib.request.urlopen(url) as response:
    with open('sample.pdf', 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
# Path of the local copy, used by the code further down.
file = 'sample.pdf'
PDF to Text using pdfminer
Below is a function to convert the file to text. Source Credit: https://stackoverflow.com/a/26495057/5356898
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(file):
    """Extract the text of a PDF file as a single string.

    Args:
        file: Path of the PDF to read; it is opened in binary mode.

    Returns:
        The text that pdfminer's ``TextConverter`` wrote for the processed
        pages, after the ``replace`` calls at the end of this function.

    Raises:
        OSError: If ``file`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        input file, the converter and the text buffer are still closed.
    """
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    password = ""
    # presumably maxpages=0 and an empty pagenos mean "all pages" -- confirm
    # against the pdfminer version in use.
    maxpages = 0
    caching = True
    pagenos = set()
    # The context managers close the input file and the in-memory buffer even
    # when pdfminer raises part-way through the document.
    with open(file, 'rb') as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed by the `with` block.
            text = retstr.getvalue()
        finally:
            device.close()
    # NOTE(review): as written, both calls replace a space with a space. The
    # original literals may have been other whitespace (e.g. a non-breaking
    # space or a double space) lost in extraction -- confirm.
    text = text.replace(' ',' ').replace(' ',' ')
    return text
# Convert the sample file to text and keep the whole document in `pdf`.
pdf = convert_pdf_to_txt(file)
Now that we have the text of the PDF document as a single string, you may want to apply some fancy regular expressions to split and parse the text as you wish.
# First 2500 characters of the extracted text (shown as the cell result when
# this is run in a notebook).
pdf[:2500]
PDF Metadata using pdfminer
You will notice below the metadata in this particular PDF is virtually nonexistent. However, this code is simply a demonstration as to how one would extract such data.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
# fp is intentionally left open: `doc` is used again further down (catalog
# lookup), and presumably pdfminer reads objects from the file on demand.
fp = open(file, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
# Build a plain dict of the first Info dictionary. resolve1 follows indirect
# references and returns every other value unchanged, so values that are
# neither bytes nor references no longer fail on a missing .resolve().
# An empty dict is produced when the document has no Info dictionary.
{k: resolve1(v) for k, v in (doc.info[0] if doc.info else {}).items()}
Resolve Metadata to XML if exists
Depending on which system your PDF was created in (for example, if it was electronically signed in something like DocuSign), you may find information on the signers here, including emails, names, and form completion dates.
# Look up the 'Metadata' entry of the document catalog.
# NOTE(review): presumably this raises KeyError when the PDF has no such
# entry -- confirm before running on arbitrary files.
catalog_metadata = doc.catalog['Metadata']
# Resolve the entry to the object it refers to; the code below reads its data
# and parses it as XML.
resolved_xml = catalog_metadata.resolve()
from bs4 import BeautifulSoup
# Parse the metadata stream as XML. Prefer the decoded stream data and fall
# back to the raw bytes when decoding fails.
try:
    soup = BeautifulSoup(resolved_xml.get_data(), 'lxml')
except Exception:  # expected case: PDFNotImplementedError
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt
    # and SystemExit are no longer swallowed by the fallback.
    soup = BeautifulSoup(resolved_xml.rawdata, 'lxml')
print(soup.prettify(formatter=None))
You may now want to extract certain data by tags and process it as you like.
# Find the first 'xmp:createdate' tag in the parsed metadata.
# NOTE(review): BeautifulSoup's find() returns None when nothing matches, so
# `d.text` below would fail for a PDF without this tag -- confirm it exists.
d = soup.find('xmp:createdate')
import pandas as pd
# Parse the tag's text as a timestamp and format it as YYYY-MM-DD.
pd.to_datetime(d.text).strftime('%Y-%m-%d')