Extract XMP metadata from PDFs in Python

Metadata (title, author, etc.) can be embedded in PDF files in a number of different ways, and can be a bit of a pain to extract. Older PDFs store it in an “Info” dictionary referenced from the file trailer, whereas newer ones use XMP metadata. Using Python’s PDFMiner library, it’s possible to extract the “Info” as a Python dictionary, but the XMP metadata is just extracted as raw XML.

I couldn’t find a nice lightweight XMP parser in Python, so I put together something that seemed to work on all the PDFs I threw at it.

You can install PDFMiner using pip:

pip install pdfminer

Once installed, use it to open the PDF and get the XMP:

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import resolve1
from xmp import xmp_to_dict
# Keep the file open for the whole block: the document object reads from
# the underlying file handle while metadata streams are being resolved.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    # With this (legacy) PDFMiner API the parser and the document have to be
    # linked to each other and the document initialized before ``doc.info``
    # and ``doc.catalog`` are populated.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    print(doc.info)  # The "Info" metadata
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))  # The XMP metadata parsed into a dict

The xmp_to_dict function is defined as follows:

#!/usr/bin/env python
"""
Parses XMP metadata from PDF files.

By Matt Swain. Released under the MIT license.
"""
from collections import defaultdict
from xml.etree import ElementTree as ET
# Namespace prefixes in the "{uri}" form that ElementTree puts in front of
# qualified tag and attribute names.
RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
XML_NS = '{http://www.w3.org/XML/1998/namespace}'

# Maps a namespace URI to the short prefix used as the top-level key of the
# parsed metadata dictionary. URIs that are not listed here are used as keys
# unchanged.
NS_MAP = {
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://ns.adobe.com/xap/1.0/': 'xap',
    'http://ns.adobe.com/pdf/1.3/': 'pdf',
    'http://ns.adobe.com/xap/1.0/mm/': 'xapmm',
    'http://ns.adobe.com/pdfx/1.3/': 'pdfx',
    'http://prismstandard.org/namespaces/basic/2.0/': 'prism',
    'http://crossref.org/crossmark/1.0/': 'crossmark',
    'http://ns.adobe.com/xap/1.0/rights/': 'rights',
    'http://www.w3.org/XML/1998/namespace': 'xml',
}
class XmpParser(object):
    """Parser that converts an XMP string into a dictionary.

    Usage::

        parser = XmpParser(xmpstring)
        meta = parser.meta
    """

    def __init__(self, xmp):
        """Parse the XMP packet and locate its ``rdf:RDF`` element.

        :param xmp: The raw XMP XML, as accepted by ``ElementTree.XML``.
        :raises xml.etree.ElementTree.ParseError: If ``xmp`` is not
            well-formed XML.
        """
        self.tree = ET.XML(xmp)
        if self.tree.tag == RDF_NS + 'RDF':
            # The packet has no wrapper element: the root itself is rdf:RDF.
            self.rdftree = self.tree
        else:
            self.rdftree = self.tree.find(RDF_NS + 'RDF')

    @property
    def meta(self):
        """A dictionary of all the parsed metadata.

        The result is keyed first by namespace prefix (see ``NS_MAP``) and
        then by tag name. It is empty when the packet has no ``rdf:RDF``
        element.
        """
        if self.rdftree is None:
            return {}
        meta = defaultdict(dict)
        for desc in self.rdftree.findall(RDF_NS + 'Description'):
            # Iterate the element directly; Element.getchildren() no longer
            # exists on Python 3.9+.
            for el in desc:
                ns, tag = self._parse_tag(el)
                meta[ns][tag] = self._parse_value(el)
        return dict(meta)

    def _parse_tag(self, el):
        """Extract the namespace and tag from an element.

        :returns: A ``(namespace, tag)`` tuple. The namespace is the short
            prefix from ``NS_MAP`` when the URI is known, the URI itself
            when it is not, and ``None`` for an unqualified tag.
        """
        tag = el.tag
        if not tag.startswith('{'):
            return None, tag
        uri, tag = tag[1:].split('}', 1)
        return NS_MAP.get(uri, uri), tag

    def _parse_value(self, el):
        """Extract the metadata value from an element.

        :returns: A list of item texts for ``rdf:Bag`` and ``rdf:Seq``
            containers, a dict mapping ``xml:lang`` to item text for
            ``rdf:Alt``, and the element's own text otherwise.
        """
        for container in ('Bag', 'Seq'):
            if el.find(RDF_NS + container) is not None:
                items = el.findall(RDF_NS + container + '/' + RDF_NS + 'li')
                return [li.text for li in items]
        if el.find(RDF_NS + 'Alt') is not None:
            items = el.findall(RDF_NS + 'Alt/' + RDF_NS + 'li')
            return dict((li.get(XML_NS + 'lang'), li.text) for li in items)
        # Plain property: only reached when no RDF container is present.
        return el.text
def xmp_to_dict(xmp):
    """Convert a raw XMP string into a metadata dictionary.

    Convenience wrapper: builds an ``XmpParser`` from ``xmp`` and hands back
    its ``meta``.
    """
    parser = XmpParser(xmp)
    return parser.meta