Script to extract pages of PDF as images & apply crop

Description of this Notebook
Author
Published

November 5, 2023

Code

Script to extract pages of PDF as images & apply crop

Description of this Notebook
Author
Published

November 5, 2023

Goal

Imagine you want to extract the pages of a PDF as images and then crop them to a specific size. The following script does exactly that.

pip install Pillow
pip install pdf2image
brew install poppler
from pdf2image import convert_from_path
from PIL import Image
import os

# Function to crop an image to specific dimensions
def crop_image(input_path, output_path, left, top, right, bottom):
    """Crop the image stored at ``input_path`` and save it to ``output_path``.

    Args:
        input_path: Path of the image file to read.
        output_path: Path the cropped image is written to.
        left: X coordinate of the left edge of the crop box.
        top: Y coordinate of the upper edge of the crop box.
        right: X coordinate of the right edge of the crop box.
        bottom: Y coordinate of the lower edge of the crop box.
    """
    # Open through a context manager so the source file handle is released
    # deterministically, even if cropping or saving raises.
    with Image.open(input_path) as image:
        cropped_image = image.crop((left, top, right, bottom))
        cropped_image.save(output_path)

# Function to extract all pages from a PDF and crop them as images
def extract_and_crop_pdf(pdf_path, output_dir, width, crop_up, crop_down):
    """Render every page of a PDF and save a cropped PNG of each one.

    Each page is written to ``output_dir`` as ``page_<n>.png`` with ``n``
    starting at 1. The crop box is centred horizontally with the requested
    ``width``; vertically it drops ``crop_up`` from the top and ``crop_down``
    from the bottom of the rendered page.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the PNG files; created if it
            does not exist yet.
        width: Width of the region to keep.
        crop_up: Amount removed from the top of each page.
        crop_down: Amount removed from the bottom of each page.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    images = convert_from_path(pdf_path)

    for page_number, pdf_image in enumerate(images, start=1):
        output_path = os.path.join(output_dir, f'page_{page_number}.png')

        # Centre the crop box horizontally; trim the top and bottom margins.
        left = (pdf_image.width - width) // 2
        top = crop_up
        right = left + width
        bottom = pdf_image.height - crop_down

        # Crop the in-memory page directly. Writing it to a temporary PNG and
        # reopening it only added a second encode pass and could leave the
        # temporary file behind if cropping or saving failed.
        pdf_image.crop((left, top, right, bottom)).save(output_path)

if __name__ == '__main__':
    # --- Run settings: adjust these to match your document ---
    # Where to read from and where to write to.
    input_pdf = 'input.pdf'
    output_directory = 'output_images'

    # Crop geometry: amount trimmed from the top and bottom of each page,
    # and the width of the horizontally centred region that is kept.
    crop_up = 177
    crop_down = 118
    target_width = 2667

    extract_and_crop_pdf(
        pdf_path=input_pdf,
        output_dir=output_directory,
        width=target_width,
        crop_up=crop_up,
        crop_down=crop_down,
    )