Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Filename: pdf2html.py
- # Version: 1.0.0
- # Author: Jeoi Reqi
- """
- Description:
- This script converts a PDF file (.pdf) to an HTML file (.html).
- It extracts text and formatting information from each page of the PDF and writes it to an HTML file.
- Requirements:
- - Python 3.x
- - PyMuPDF library (install using: pip install PyMuPDF)
- Usage:
- 1. Save this script as 'pdf2html.py'.
- 2. Ensure your PDF file ('example.pdf') is in the directory you run the script from (relative paths are resolved against the current working directory).
- 3. Install the PyMuPDF library using the command: 'pip install PyMuPDF'
- 4. Run the script.
- Note: Adjust the 'pdf_filename' and 'html_filename' variables in the script as needed.
- """
- import fitz # PyMuPDF
def pdf_to_html(pdf_filename, html_filename) -> None:
    """Convert every page of a PDF into HTML markup and save it to a file.

    Each page is rendered with PyMuPDF's ``get_text("html")`` and the
    per-page fragments are written to the output file in page order.

    Args:
        pdf_filename: Path of the PDF document to read.
        html_filename: Path of the HTML file to create or overwrite;
            it is written with UTF-8 encoding.
    """
    # Opening the document as a context manager guarantees it is closed
    # (releasing the underlying file handle) even if extraction fails.
    with fitz.open(pdf_filename) as pdf_document:
        # Collect the fragments in a list rather than growing one string
        # with "+=", which can degrade badly on large documents.
        page_fragments = [page.get_text("html") for page in pdf_document]

    with open(html_filename, 'w', encoding='utf-8') as html_file:
        html_file.writelines(page_fragments)
if __name__ == "__main__":
    import sys

    # Optional command-line overrides:
    #     python pdf2html.py [input.pdf [output.html]]
    # When an argument is omitted, the original default filename is used.
    cli_args = sys.argv[1:]
    pdf_filename = cli_args[0] if len(cli_args) >= 1 else 'example.pdf'
    html_filename = cli_args[1] if len(cli_args) >= 2 else 'pdf2html.html'

    # Convert the PDF to an HTML file
    pdf_to_html(pdf_filename, html_filename)
    print(f"Converted '{pdf_filename}' to '{html_filename}'.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement