Extract XMP metadata from PDFs in Python

22 June 2012

Metadata (title, author, etc.) can be embedded in PDF files in a number of different ways, and can be a bit of a pain to extract. Older PDFs use “Info” in the XRefs trailer, whereas newer ones use XMP metadata. Using Python’s PDFMiner library, it’s possible to extract the “Info” as a Python dictionary, but the XMP metadata is just extracted as raw XML.

I couldn’t find a nice lightweight XMP parser in Python, so I put together something that seemed to work on all the PDFs I threw at it.

You can install PDFMiner using pip:

pip install pdfminer

Once installed, use it to open the PDF and get the XMP:

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import resolve1
from xmp import xmp_to_dict
# Open the PDF in binary mode. The context manager guarantees the file handle
# is closed even if parsing raises; all PDFMiner calls stay inside the block
# because the parser was built on ``fp`` and may still read from it.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    # The parser and the document must be linked to each other before use.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    print(doc.info)        # The "Info" metadata
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))

The xmp_to_dict function is defined as follows:

#!/usr/bin/env python
"""
    xmp.py
    ~~~~~~
    Parses XMP metadata from PDF files.
    By Matt Swain. Released under the MIT license.
"""
from collections import defaultdict
from xml.etree import ElementTree as ET
# Namespace URIs wrapped in braces, ready to be prepended to a local name
# when looking up qualified element or attribute names with ElementTree.
RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
XML_NS = '{http://www.w3.org/XML/1998/namespace}'

# Maps a namespace URI to the short prefix reported in the parsed metadata.
NS_MAP = {
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://ns.adobe.com/xap/1.0/': 'xap',
    'http://ns.adobe.com/pdf/1.3/': 'pdf',
    'http://ns.adobe.com/xap/1.0/mm/': 'xapmm',
    'http://ns.adobe.com/pdfx/1.3/': 'pdfx',
    'http://prismstandard.org/namespaces/basic/2.0/': 'prism',
    'http://crossref.org/crossmark/1.0/': 'crossmark',
    'http://ns.adobe.com/xap/1.0/rights/': 'rights',
    'http://www.w3.org/XML/1998/namespace': 'xml',
}
class XmpParser(object):
    """Parser that converts an XMP string into a dictionary.

    Usage::

        parser = XmpParser(xmpstring)
        meta = parser.meta

    ``meta`` maps a namespace prefix (or the raw namespace URI when the URI
    is not listed in ``NS_MAP``) to a dictionary of property names and their
    parsed values.
    """

    def __init__(self, xmp):
        """Parse the raw XMP packet *xmp* and locate its ``rdf:RDF`` element.

        Raises whatever ``ElementTree.XML`` raises when *xmp* is not
        well-formed XML.
        """
        self.tree = ET.XML(xmp)
        if self.tree.tag == RDF_NS + 'RDF':
            # The packet has no wrapper element: the root itself is rdf:RDF,
            # which ``find`` (children only) would not return.
            self.rdftree = self.tree
        else:
            self.rdftree = self.tree.find(RDF_NS + 'RDF')

    @property
    def meta(self):
        """A dictionary of all the parsed metadata.

        Returns an empty dictionary when the packet contains no ``rdf:RDF``
        element.
        """
        meta = {}
        if self.rdftree is None:
            return meta
        for desc in self.rdftree.findall(RDF_NS + 'Description'):
            # Iterate the element directly: ``Element.getchildren()`` was
            # removed in Python 3.9.
            for el in desc:
                ns, tag = self._parse_tag(el)
                meta.setdefault(ns, {})[tag] = self._parse_value(el)
        return meta

    def _parse_tag(self, el):
        """Extract the namespace and tag from an element.

        Returns ``(ns, tag)`` where ``ns`` is the prefix from ``NS_MAP`` if
        the namespace URI is listed there, the URI itself if it is not, or
        ``None`` when the tag carries no namespace.
        """
        tag = el.tag
        if not tag.startswith('{'):
            return None, tag
        # Qualified tags look like "{uri}local"; split on the first "}".
        uri, local = tag[1:].split('}', 1)
        return NS_MAP.get(uri, uri), local

    def _parse_value(self, el):
        """Extract the metadata value from an element.

        Returns a list of item texts for ``rdf:Bag`` / ``rdf:Seq`` children,
        a ``{xml:lang: text}`` dictionary for ``rdf:Alt``, and the element's
        own text otherwise.
        """
        # Bag is checked before Seq, and both before Alt, so an element that
        # somehow holds several containers resolves the same way every time.
        for container in ('Bag', 'Seq'):
            if el.find(RDF_NS + container) is not None:
                items = el.findall(RDF_NS + container + '/' + RDF_NS + 'li')
                return [li.text for li in items]
        if el.find(RDF_NS + 'Alt') is not None:
            items = el.findall(RDF_NS + 'Alt/' + RDF_NS + 'li')
            return {li.get(XML_NS + 'lang'): li.text for li in items}
        return el.text
def xmp_to_dict(xmp):
    """Return the metadata dictionary parsed from the XMP packet *xmp*."""
    parser = XmpParser(xmp)
    return parser.meta