# content_extraction.py

#!pip install pypdf
#!pip install reportlab

import io
import re
from pypdf import PdfReader
from reportlab.pdfbase.pdfdoc import PDFDictionary

def clean_text_preserve_format(text):
    """Return the text of every PDF literal string ``(...)`` found in *text*.

    *text* is the already-decoded operand part of a text-showing operator,
    e.g. ``(Hello) -250 (World)``.  Everything outside the parenthesised
    literals (kerning numbers, brackets, operator names) is ignored.  The
    literals are concatenated in order; a literal that is empty or contains
    only whitespace contributes a single space.

    The string is scanned once, left to right, so escapes are interpreted
    *while* the literal boundaries are being located.  Unescaping first and
    searching for parentheses afterwards would turn an escaped ``\\)`` into a
    bare ``)`` and end the literal too early, dropping the rest of its text.

    Inside a literal:
      * ``\\(``, ``\\)`` and ``\\\\`` yield the character itself;
      * ``\\n``, ``\\r``, ``\\t``, ``\\b``, ``\\f`` yield the matching control
        character;
      * ``\\ddd`` (one to three octal digits) yields the character with that
        code, masked to one byte; no font-encoding translation is applied;
      * a backslash before any other character is dropped (so ``\\,`` -> ``,``);
      * balanced unescaped parentheses are kept as part of the text.

    An unterminated literal at the end of *text* is discarded.

    Args:
        text: Decoded content-stream fragment to scan.

    Returns:
        The concatenated literal text, or ``''`` when *text* has no literal.
    """
    simple_escapes = {'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'f': '\f'}
    octal_digits = '01234567'

    pieces = []
    buffer = []
    depth = 0  # 0 means "outside any literal"
    pos = 0
    length = len(text)

    while pos < length:
        char = text[pos]
        pos += 1

        if depth == 0:
            # Outside a literal only an opening parenthesis is meaningful.
            if char == '(':
                depth = 1
                buffer = []
            continue

        if char == '\\':
            if pos >= length:
                break  # dangling backslash: the literal is unterminated
            escaped = text[pos]
            pos += 1
            if escaped in octal_digits:
                digits = escaped
                while (len(digits) < 3 and pos < length
                       and text[pos] in octal_digits):
                    digits += text[pos]
                    pos += 1
                buffer.append(chr(int(digits, 8) & 0xFF))
            else:
                buffer.append(simple_escapes.get(escaped, escaped))
        elif char == '(':
            depth += 1
            buffer.append(char)
        elif char == ')':
            depth -= 1
            if depth:
                buffer.append(char)
            else:
                literal = ''.join(buffer)
                pieces.append(literal if literal.strip() else ' ')
        else:
            buffer.append(char)

    return ''.join(pieces)

# Minimal lexer for PDF content streams.  Literal strings, hex strings and
# names are listed first so that letters inside them are consumed as operands
# and never mistaken for operators; only the named group ``op`` is acted on.
# Numbers and array brackets match no alternative and are simply skipped.
_CONTENT_TOKEN = re.compile(
    rb"\((?:\\.|[^\\()]|\((?:\\.|[^\\()])*\))*\)"  # literal string (one nesting level)
    rb"|<[0-9A-Fa-f\s]*>"                            # hex string
    rb"|/[^\s/\[\]()<>{}%]*"                         # name, e.g. /F1
    rb"|(?P<op>[A-Za-z\x22\x27][A-Za-z0-9*]*)",      # operator/keyword (\x22 = ", \x27 = ')
    re.DOTALL,
)


def _last_number(operands):
    """Return the last whitespace-separated token of *operands* as a float, or None."""
    tokens = operands.split()
    if not tokens:
        return None
    try:
        return float(tokens[-1])
    except ValueError:
        return None


def _page_stream_data(page):
    """Return the decoded bytes of each content stream of *page*, in order."""
    if '/Contents' not in page:
        return []  # a page without content has nothing to extract
    content = page['/Contents'].get_object()
    # /Contents is either one stream or an array of streams; pypdf's array
    # type is a list subclass, so a plain isinstance check tells them apart.
    candidates = content if isinstance(content, list) else [content]

    streams = []
    for candidate in candidates:
        # Array entries are indirect references and must be resolved first.
        obj = candidate.get_object() if hasattr(candidate, 'get_object') else candidate
        if hasattr(obj, 'get_data'):
            streams.append(obj.get_data())
    return streams


def _extract_page_lines(streams, min_font_size):
    """Return the text lines found in *streams* at or above *min_font_size*."""
    lines = []
    current_line = ''
    current_font_size = 0  # persists across the streams of one page

    for data in streams:
        operand_start = 0
        for match in _CONTENT_TOKEN.finditer(data):
            operator = match.group('op')
            if operator is None:
                continue  # string / hex string / name: an operand, not an operator

            # Everything between the previous operator and this one is this
            # operator's operand list.
            operands = data[operand_start:match.start()]
            operand_start = match.end()

            line_break = False
            if operator == b'Tf':
                # "/Font size Tf": the size is the operand right before Tf.
                size = _last_number(operands)
                if size is not None:
                    current_font_size = size
            elif operator in (b'Td', b'TD'):
                # "tx ty Td": ty is an offset from the current line start, so
                # any non-zero value moves to a different line.
                line_break = bool(_last_number(operands))
            elif operator == b'T*':
                line_break = True
            elif operator in (b'Tj', b'TJ'):
                if current_font_size >= min_font_size:
                    try:
                        decoded_text = operands.decode('utf-8')
                    except UnicodeDecodeError:
                        # Not UTF-8: decode byte-for-byte instead of dropping
                        # the text.
                        decoded_text = operands.decode('latin-1')
                    cleaned_text = clean_text_preserve_format(decoded_text)
                    if current_line and not current_line.endswith(' '):
                        current_line += ' '
                    current_line += cleaned_text

            if line_break and current_line.strip():
                lines.append(current_line.strip())
                current_line = ''

    # Add any remaining text in the current line
    if current_line.strip():
        lines.append(current_line.strip())
    return lines


def extract_text_by_font_size(pdf_path, min_font_size):
    """Extract text drawn at ``min_font_size`` or larger from a PDF file.

    Each page's content streams are scanned operator by operator.  ``Tf``
    updates the current font size, ``Tj``/``TJ`` contribute their string
    operands when the current size is at least *min_font_size*, and ``T*`` or
    a ``Td``/``TD`` with a non-zero vertical offset ends the current line.
    Several operators on one physical line are handled, as are pages whose
    ``/Contents`` is an array of streams and pages with no ``/Contents``.

    Limitations: the size used is the raw ``Tf`` operand (text-matrix scaling
    via ``Tm`` is not taken into account), the ``'`` and ``"`` operators are
    not interpreted, hex strings are not decoded, and string bytes are decoded
    as UTF-8 with a Latin-1 fallback rather than through the font's encoding.

    Args:
        pdf_path: Path of the PDF file to read.
        min_font_size: Smallest ``Tf`` size whose text is kept.

    Returns:
        One block of text per page (lines joined with ``'\\n'``), the blocks
        joined with a blank line (``'\\n\\n'``).  A page without matching text
        contributes an empty block.
    """
    extracted_text = []
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        # Pages are read lazily, so all processing happens while the file is open.
        for page in pdf.pages:
            page_lines = _extract_page_lines(_page_stream_data(page), min_font_size)
            extracted_text.append('\n'.join(page_lines))

    return '\n\n'.join(extracted_text)

# Example usage.  Guarded so the demo only runs when this file is executed as
# a script; importing the module for its functions no longer tries to open
# 'input.pdf' or print anything.
if __name__ == '__main__':
    pdf_path = 'input.pdf'
    min_font_size = 12  # Specify the minimum font size here
    result = extract_text_by_font_size(pdf_path, min_font_size)
    print(result)