# pdf2json

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Filename: pdf2json.py
# Version: 1.0.0
# Author: Jeoi Reqi

"""
Description:
This script converts a PDF file (.pdf) to a JSON file (.json).
It extracts text from each page of the PDF and saves the data as a JSON file with page-wise content.

Requirements:
- Python 3.x
- PyMuPDF library (install using: pip install PyMuPDF)

Usage:
1. Save this script as 'pdf2json.py'.
2. Ensure your PDF file ('example.pdf') is in the same directory as the script.
3. Install the PyMuPDF library using the command: 'pip install PyMuPDF'
4. Run the script.

Note: Adjust the 'pdf_filename' and 'json_filename' variables in the script as needed.
"""

import fitz  # PyMuPDF
import json

def pdf_to_json(pdf_filename, json_filename):
    """
    Extract the text of every page of a PDF and write it to a JSON file.

    The output has the form
    {"pdf_content": [{"page": 1, "text": "..."}, ...]}, where "page" is the
    1-based page number and "text" is the result of the page's get_text().

    Args:
        pdf_filename: Path of the PDF file to read.
        json_filename: Path of the JSON file to create or overwrite.
    """
    # Open the document as a context manager so it is closed as soon as the
    # text has been extracted, even if extraction raises part-way through.
    with fitz.open(pdf_filename) as pdf_document:
        pages = [
            {"page": page_num, "text": page.get_text()}
            for page_num, page in enumerate(pdf_document, start=1)
        ]

    # The PDF is already closed here; only the extracted text is kept around.
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump({"pdf_content": pages}, json_file, indent=2)

if __name__ == "__main__":
    # Input PDF and output JSON paths; edit these to point at your own files.
    pdf_filename, json_filename = 'example.pdf', 'pdf2json.json'

    # Run the conversion, then report what was written.
    pdf_to_json(pdf_filename, json_filename)
    print(f"Converted '{pdf_filename}' to '{json_filename}'.")