Source code for pyaota.util.bundle

"""
Bundle PDF documents from a directory into uniform-sized bundles.

Optionally generates a "co-bundle" consisting of the last page of each
input document concatenated into a single PDF.
"""

from __future__ import annotations

import math
from pathlib import Path

import PyPDF2

import logging
logger = logging.getLogger(__name__)


def bundle_pdfs(input_dir: Path, output_dir: Path, bundle_size: int, co_bundle: bool = False) -> int:
    """
    Bundle all PDFs in *input_dir* into new PDF files, each containing
    *bundle_size* documents concatenated in sorted order.

    A blank page (sized like the preceding document's last page) is inserted
    between consecutive documents inside a bundle. Bundles are written as
    ``bundle_001.pdf``, ``bundle_002.pdf``, ... in *output_dir*.

    Parameters
    ----------
    input_dir : Path
        Directory to scan for ``*.pdf`` files.
    output_dir : Path
        Directory where bundle PDFs (and optional co-bundle) are written.
        Created (including parents) if it does not exist.
    bundle_size : int
        Number of source documents per bundle. Must be at least 1.
    co_bundle : bool
        If True, also produce ``co_bundle.pdf``, which concatenates the last
        page of every source document that has at least one page.

    Returns
    -------
    int
        0 on success, 1 on error (non-positive *bundle_size* or no PDF files
        found in *input_dir*).
    """
    # A zero size would divide by zero below, and a negative size would yield
    # a negative bundle count and silently "succeed" without writing anything.
    if bundle_size < 1:
        print(f"bundle_size must be a positive integer, got {bundle_size}")
        return 1

    pdf_files = sorted(input_dir.glob("*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return 1

    output_dir.mkdir(parents=True, exist_ok=True)

    num_bundles = math.ceil(len(pdf_files) / bundle_size)
    print(f"Found {len(pdf_files)} PDFs; creating {num_bundles} bundle(s) "
          f"of up to {bundle_size} documents each.")

    for bundle_idx in range(num_bundles):
        start = bundle_idx * bundle_size
        chunk = pdf_files[start:start + bundle_size]  # slicing clamps at the end

        writer = PyPDF2.PdfWriter()
        for doc_idx, pdf_path in enumerate(chunk):
            reader = PyPDF2.PdfReader(str(pdf_path))
            pages = reader.pages
            for page in pages:
                writer.add_page(page)

            # Insert a blank page after every document except the last.
            # A document with no pages has no last page to take dimensions
            # from (indexing it would fail), so no separator is added for it;
            # this mirrors the empty-document guard used for the co-bundle.
            if doc_idx < len(chunk) - 1 and len(pages) > 0:
                last_page = pages[-1]
                w = float(last_page.mediabox.width)
                h = float(last_page.mediabox.height)
                writer.add_blank_page(width=w, height=h)

        bundle_name = f"bundle_{bundle_idx + 1:03d}.pdf"
        bundle_path = output_dir / bundle_name
        with open(bundle_path, "wb") as f:
            writer.write(f)
        print(f" {bundle_name}: {len(chunk)} documents")

    if co_bundle:
        writer = PyPDF2.PdfWriter()
        for pdf_path in pdf_files:
            reader = PyPDF2.PdfReader(str(pdf_path))
            # Page-less documents contribute nothing to the co-bundle.
            if len(reader.pages) > 0:
                writer.add_page(reader.pages[-1])

        co_bundle_path = output_dir / "co_bundle.pdf"
        with open(co_bundle_path, "wb") as f:
            writer.write(f)
        print(f" co_bundle.pdf: last page from each of {len(pdf_files)} documents")

    print("Done.")
    return 0
def bundle_subcommand(args) -> int:
    """
    CLI entry point for the ``bundle`` subcommand.

    Reads ``input_dir``, ``output_dir``, ``bundle_size`` and ``co_bundle``
    from *args* and forwards them to :func:`bundle_pdfs`.

    Returns
    -------
    int
        1 if ``args.input_dir`` is not an existing directory; otherwise the
        return value of :func:`bundle_pdfs`.
    """
    source_dir, target_dir = Path(args.input_dir), Path(args.output_dir)

    if source_dir.is_dir():
        options = {
            "input_dir": source_dir,
            "output_dir": target_dir,
            "bundle_size": args.bundle_size,
            "co_bundle": args.co_bundle,
        }
        return bundle_pdfs(**options)

    print(f"Input directory does not exist: {source_dir}")
    return 1