Advertisement
YaBoiSwayZ

PDF to Audio Converter (with Text Chunking)

May 26th, 2024
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.26 KB | Source Code | 0 0
  1. import pyttsx3
  2. import PyPDF2
  3. from pydub import AudioSegment
  4. import re
  5. import os
  6. from contextlib import closing
  7.  
  8. def clean_text(text):
  9.     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
  10.     text = re.sub(r'\s+', ' ', text)
  11.     return text.strip()
  12.  
  13. def dynamic_chunk_text(text, delimiter='. ', max_chunk_length=1000):
  14.     sentences = re.split(f'(?<={delimiter})', text)
  15.     chunks = []
  16.     current_chunk = ''
  17.  
  18.     for sentence in sentences:
  19.         if len(current_chunk) + len(sentence) <= max_chunk_length:
  20.             current_chunk += sentence
  21.         else:
  22.             if current_chunk:
  23.                 chunks.append(current_chunk)
  24.             current_chunk = sentence
  25.     if current_chunk:
  26.         chunks.append(current_chunk)
  27.     return chunks
  28.  
  29. def export_audio(combined_audio, filename, format='wav'):
  30.     try:
  31.         combined_audio.export(filename, format=format, bitrate="192k")
  32.         print(f"Exported audio file: {filename}")
  33.     except Exception as e:
  34.         print(f"Failed to export audio file: {e}")
  35.  
  36. def pdf_to_audio_converter(pdf_path, audio_format='mp3'):
  37.     speaker = pyttsx3.init()
  38.     combined_audio = AudioSegment.empty()
  39.  
  40.     try:
  41.         with open(pdf_path, 'rb') as pdf_file:
  42.             pdfreader = PyPDF2.PdfReader(pdf_file)
  43.             full_text = ''
  44.  
  45.             for page_num in range(len(pdfreader.pages)):
  46.                 text = pdfreader.pages[page_num].extract_text()
  47.                 full_text += clean_text(text) + ' '
  48.  
  49.         chunks = dynamic_chunk_text(full_text)
  50.        
  51.         for i, chunk in enumerate(chunks):
  52.             with closing(speaker) as engine:
  53.                 audio_filename = f'temp_part_{i+1}.{audio_format}'
  54.                 engine.save_to_file(chunk, audio_filename)
  55.                 engine.runAndWait()
  56.                 combined_audio += AudioSegment.from_file(audio_filename, format=audio_format)
  57.                 os.remove(audio_filename)
  58.  
  59.         final_audio_filename = f"final_story.{audio_format}"
  60.         export_audio(combined_audio, final_audio_filename, audio_format)
  61.  
  62.     except (PyPDF2.errors.PdfReadError, IOError) as e:
  63.         print(f"Error reading PDF file: {e}")
  64.     except Exception as e:
  65.         print(f"An error occurred: {e}")
  66.  
  67. pdf_to_audio_converter('Brief answers.pdf', 'mp3')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement