Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Filename: pdf2txt.py
- # Version: 1.0.0
- # Author: Jeoi Reqi
- """
- Description:
- This script converts a PDF file (.pdf) to a text file (.txt).
- It extracts text from each page of the PDF and saves the combined text as a text file.
- Requirements:
- - Python 3.x
- - PyMuPDF library (install using: pip install PyMuPDF)
- Usage:
- 1. Save this script as 'pdf2txt.py'.
- 2. Ensure your PDF file ('example.pdf') is in the directory you run the script from (the filename is resolved relative to the current working directory).
- 3. Install the PyMuPDF library using the command: 'pip install PyMuPDF'
- 4. Run the script.
- Note: Adjust the 'pdf_filename' and 'txt_filename' variables in the script as needed.
- """
- import fitz # PyMuPDF
def pdf_to_txt(pdf_filename, txt_filename):
    """Extract the text of every page of a PDF and write it to a text file.

    The text of each page is obtained with ``page.get_text()`` and the
    per-page strings are concatenated in page order, with no separator
    added between pages.

    Args:
        pdf_filename: Path of the PDF document to open with PyMuPDF.
        txt_filename: Path of the output file. It is opened in ``'w'`` mode
            (created or overwritten) and written as UTF-8.
    """
    # Open the document as a context manager so it is closed even if text
    # extraction raises; the original left the document open.
    with fitz.open(pdf_filename) as pdf_document:
        # Join once instead of growing a string with += for every page.
        text_content = "".join(page.get_text() for page in pdf_document)

    # The output file is only opened after all pages were read successfully.
    with open(txt_filename, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text_content)
if __name__ == "__main__":
    # Input PDF and output text file; edit these two names as needed.
    pdf_filename, txt_filename = 'example.pdf', 'pdf2txt.txt'

    # Run the conversion, then report which files were involved.
    pdf_to_txt(pdf_filename, txt_filename)
    print(f"Converted '{pdf_filename}' to '{txt_filename}'.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement