Advertisement
Abhisek92

content_extraction.py

Oct 21st, 2024
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.90 KB | None | 0 0
  1. #!pip install pypdf
  2. #!pip install reportlab
  3.  
  4. import io
  5. import re
  6. from pypdf import PdfReader
  7. from reportlab.pdfbase.pdfdoc import PDFDictionary
  8.  
  9. def clean_text_preserve_format(text):
  10.     # Handle escaped characters
  11.     text = text.replace('\\(', '(').replace('\\)', ')').replace('\\.', '.')
  12.     text = text.replace('\\\\', '\\').replace('\\ ', ' ')
  13.    
  14.     # Extract text within parentheses, preserving spaces
  15.     parts = re.findall(r'\((.*?)\)', text)
  16.    
  17.     # Join parts, replacing empty parentheses with spaces
  18.     result = ''
  19.     for part in parts:
  20.         if part.strip():
  21.             result += part
  22.         else:
  23.             result += ' '
  24.    
  25.     # Remove any remaining parentheses
  26.     result = result.replace('(', '').replace(')', '')
  27.    
  28.     # Handle special cases like "\,"
  29.     result = result.replace('\\,', ',')
  30.    
  31.     return result
  32.  
  33. def extract_text_by_font_size(pdf_path, min_font_size):
  34.     with open(pdf_path, 'rb') as file:
  35.         pdf = PdfReader(file)
  36.         extracted_text = []
  37.  
  38.         for page in pdf.pages:
  39.             content = page['/Contents'].get_object()
  40.             if not isinstance(content, PDFDictionary):
  41.                 content = [content]
  42.  
  43.             page_text = []
  44.             current_font_size = 0
  45.             current_line = ''
  46.             last_y = None
  47.  
  48.             for obj in content:
  49.                 if not hasattr(obj, 'get_data'):
  50.                     continue
  51.                 data = obj.get_data()
  52.                 commands = data.split(b'\n')
  53.  
  54.                 for command in commands:
  55.                     if b'Tf' in command:
  56.                         parts = command.split()
  57.                         if len(parts) >= 3:
  58.                             try:
  59.                                 current_font_size = float(parts[-2])
  60.                             except ValueError:
  61.                                 pass
  62.                     elif b'Td' in command or b'TD' in command:
  63.                         # Extract Y coordinate
  64.                         parts = command.split()
  65.                         if len(parts) >= 3:
  66.                             try:
  67.                                 y = float(parts[-1])
  68.                                 if last_y is not None and y != last_y:
  69.                                     if current_line.strip():
  70.                                         page_text.append(current_line.strip())
  71.                                         current_line = ''
  72.                                 last_y = y
  73.                             except ValueError:
  74.                                 pass
  75.                     elif b'Tj' in command or b'TJ' in command:
  76.                         if current_font_size >= min_font_size:
  77.                             text_obj = command.split(b'[', 1)[-1].rsplit(b']', 1)[0]
  78.                             try:
  79.                                 decoded_text = text_obj.decode('utf-8')
  80.                                 cleaned_text = clean_text_preserve_format(decoded_text)
  81.                                 if current_line and not current_line.endswith(' '):
  82.                                     current_line += ' '
  83.                                 current_line += cleaned_text
  84.                             except UnicodeDecodeError:
  85.                                 pass
  86.                     elif b'T*' in command:
  87.                         # New line
  88.                         if current_line.strip():
  89.                             page_text.append(current_line.strip())
  90.                             current_line = ''
  91.  
  92.             # Add any remaining text in the current line
  93.             if current_line.strip():
  94.                 page_text.append(current_line.strip())
  95.  
  96.             extracted_text.append('\n'.join(page_text))
  97.  
  98.     return '\n\n'.join(extracted_text)
  99.  
  100. # Example usage
  101. pdf_path = 'input.pdf'
  102. min_font_size = 12  # Specify the minimum font size here
  103. result = extract_text_by_font_size(pdf_path, min_font_size)
  104. print(result)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement