Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Filename: pdf2json.py
- # Version: 1.0.0
- # Author: Jeoi Reqi
- """
- Description:
- This script converts a PDF file (.pdf) to a JSON file (.json).
- It extracts text from each page of the PDF and saves the data as a JSON file with page-wise content.
- Requirements:
- - Python 3.x
- - PyMuPDF library (install using: pip install PyMuPDF)
- Usage:
- 1. Save this script as 'pdf2json.py'.
- 2. Ensure your PDF file ('example.pdf') is in the directory you run the script from (the current working directory).
- 3. Install the PyMuPDF library using the command: 'pip install PyMuPDF'
- 4. Run the script.
- Note: Adjust the 'pdf_filename' and 'json_filename' variables in the script as needed.
- """
- import fitz # PyMuPDF
- import json
def pdf_to_json(pdf_filename, json_filename) -> None:
    """Extract the text of every page of a PDF and save it as a JSON file.

    The written JSON has the shape
    ``{"pdf_content": [{"page": 1, "text": "..."}, ...]}``, with 1-based
    page numbers in document order.

    Args:
        pdf_filename: Path of the PDF file to read.
        json_filename: Path of the JSON file to write; it is opened in
            ``'w'`` mode, so an existing file is overwritten.
    """
    # Open the document in a context manager so it is closed (and its file
    # handle released) even if text extraction fails part-way through.
    with fitz.open(pdf_filename) as pdf_document:
        pages = [
            {"page": page_num + 1, "text": pdf_document[page_num].get_text()}
            for page_num in range(pdf_document.page_count)
        ]

    json_data = {"pdf_content": pages}
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=2)
if __name__ == "__main__":
    import sys

    # Optional positional arguments override the default filenames:
    #     python pdf2json.py [input.pdf] [output.json]
    # With no arguments the original hard-coded names are used.
    cli_args = sys.argv[1:]
    pdf_filename = cli_args[0] if len(cli_args) > 0 else 'example.pdf'
    json_filename = cli_args[1] if len(cli_args) > 1 else 'pdf2json.json'

    # Convert the PDF to a JSON file
    pdf_to_json(pdf_filename, json_filename)
    print(f"Converted '{pdf_filename}' to '{json_filename}'.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement