Extract XMP metadata from PDFs in Python
Metadata (title, author, etc.) can be embedded in PDF files in a number of different ways, and can be a bit of a pain to extract. Older PDFs use “Info” in the XRefs trailer, whereas newer ones use XMP metadata. Using Python’s PDFMiner library, it’s possible to extract the “Info” as a Python dictionary, but the XMP metadata is just extracted as raw XML.
I couldn’t find a nice lightweight XMP parser in Python, so I put together something that seemed to work on all the PDFs I threw at it.
You can install PDFMiner using pip:
pip install pdfminer
Once installed, use it to open the PDF and get the XMP:
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import resolve1
from xmp import xmp_to_dict

# Use a context manager so the PDF file handle is always closed, even if
# parsing fails. Everything that touches the document stays inside the block
# because the stream data may still be read from the open file.
with open('mypdf.pdf', 'rb') as fp:
    # Wire the parser and the document together, then load the document.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()

    print(doc.info)  # The "Info" metadata

    # XMP lives in the optional 'Metadata' stream of the document catalog.
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))
The xmp_to_dict function is defined as follows:
#!/usr/bin/env python
"""
xmp.py
~~~~~~

Parses XMP metadata from PDF files.

By Matt Swain. Released under the MIT license.
"""

from collections import defaultdict
from xml.etree import ElementTree as ET

# ElementTree spells qualified names as "{namespace-uri}localname".
RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
XML_NS = '{http://www.w3.org/XML/1998/namespace}'

# Namespace URI -> short prefix used as the top-level key in the result.
# Namespaces that are not listed here are keyed by their full URI.
NS_MAP = {
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://ns.adobe.com/xap/1.0/': 'xap',
    'http://ns.adobe.com/pdf/1.3/': 'pdf',
    'http://ns.adobe.com/xap/1.0/mm/': 'xapmm',
    'http://ns.adobe.com/pdfx/1.3/': 'pdfx',
    'http://prismstandard.org/namespaces/basic/2.0/': 'prism',
    'http://crossref.org/crossmark/1.0/': 'crossmark',
    'http://ns.adobe.com/xap/1.0/rights/': 'rights',
    'http://www.w3.org/XML/1998/namespace': 'xml',
}


class XmpParser(object):
    """Parser that converts an XMP string into a dictionary.

    Usage::

        parser = XmpParser(xmpstring)
        meta = parser.meta

    """

    def __init__(self, xmp):
        """Parse the XMP packet and locate its ``rdf:RDF`` element.

        :param xmp: The XMP packet as an XML string (or bytes).
        :raises xml.etree.ElementTree.ParseError: If ``xmp`` is not
            well-formed XML.
        """
        self.tree = ET.XML(xmp)
        # The rdf:RDF element is normally a child of a wrapper element, but
        # the packet root may also be rdf:RDF itself; find() only searches
        # children, so check the root explicitly.
        if self.tree.tag == RDF_NS + 'RDF':
            self.rdftree = self.tree
        else:
            self.rdftree = self.tree.find(RDF_NS + 'RDF')

    @property
    def meta(self):
        """A dictionary of all the parsed metadata.

        The result maps a namespace prefix (or the full namespace URI when
        it is not in ``NS_MAP``) to a dictionary of ``{tag: value}``.
        An empty dictionary is returned when the packet has no ``rdf:RDF``
        element.
        """
        if self.rdftree is None:
            return {}
        meta = defaultdict(dict)
        for desc in self.rdftree.findall(RDF_NS + 'Description'):
            # Iterate the element directly: Element.getchildren() was
            # removed in Python 3.9.
            for el in desc:
                ns, tag = self._parse_tag(el)
                meta[ns][tag] = self._parse_value(el)
        return dict(meta)

    def _parse_tag(self, el):
        """Extract the namespace and tag from an element.

        :returns: ``(ns, tag)`` where ``ns`` is the short prefix from
            ``NS_MAP``, the raw namespace URI if unmapped, or ``None`` for
            an element without a namespace.
        """
        ns = None
        tag = el.tag
        if tag[0] == '{':
            # "{uri}local" -> ("uri", "local")
            ns, tag = tag[1:].split('}', 1)
            if ns in NS_MAP:
                ns = NS_MAP[ns]
        return ns, tag

    def _parse_value(self, el):
        """Extract the metadata value from an element.

        :returns: A list of item texts for ``rdf:Bag`` / ``rdf:Seq``
            containers, a ``{xml:lang: text}`` dictionary for ``rdf:Alt``
            containers, and the element's own text otherwise.
        """
        # Bag is checked before Seq, then Alt, matching the original order.
        for container in ('Bag', 'Seq'):
            if el.find(RDF_NS + container) is not None:
                items = el.findall(RDF_NS + container + '/' + RDF_NS + 'li')
                return [li.text for li in items]
        if el.find(RDF_NS + 'Alt') is not None:
            value = {}
            for li in el.findall(RDF_NS + 'Alt/' + RDF_NS + 'li'):
                value[li.get(XML_NS + 'lang')] = li.text
            return value
        return el.text


def xmp_to_dict(xmp):
    """Parse an XMP string into a dictionary.

    :param xmp: The XMP packet as an XML string (or bytes).
    :returns: ``{namespace: {tag: value}}`` as produced by
        :attr:`XmpParser.meta`.
    """
    return XmpParser(xmp).meta