Advertisement
jarekmor

translate_pdf

Aug 11th, 2024 (edited)
273
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.34 KB | None | 0 0
  1. # Based on: https://docs.unstructured.io/open-source/introduction/quick-start
  2. ## and https://docs.unstructured.io/open-source/installation/full-installation
  3. # Full installation in a Python venv environment
  4. ## pip install "unstructured[all-docs]"
  5.  
  6. from unstructured.partition.pdf import partition_pdf
  7. from unstructured.cleaners.translate import translate_text
  8. from transformers import MarianMTModel, MarianTokenizer
  9.  
  10. # Partition the PDF and process the elements
  11. elements = partition_pdf(
  12.     "./pdf_trans.pdf",                                 # name of pdf file to extract text from
  13.     languages=["pl"],
  14.     content_type="application/pdf",                    # content type is pdf.
  15.     include_page_breaks=True,
  16.     strategy="fast",
  17.     extract_image_block_types=["Table"],
  18.     infer_table_structure=True,
  19.     max_partition=None
  20. )
  21.  
  22. # Combine the elements into a single string
  23. pol_text = "\n\n".join([str(el) for el in elements])
  24.  
  25. # Write the original text to a file
  26. with open("pl_pdf.txt", "w", encoding="utf-8") as f:      # name of translated text
  27.     f.write(pol_text)
  28.  
  29. file_path = "./en_pdf.txt"    # name of a pdf file to translate
  30.  
  31. with open(file_path, 'w', encoding='utf-8') as file:
  32.     for i, el in enumerate(elements):
  33.         translated_chunk = translate_text(str(el),'pl','en')
  34.         print(translated_chunk)
  35.         file.write(f"{translated_chunk}\n\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement