Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!pip install pypdf
- #!pip install reportlab
- import io
- import re
- from pypdf import PdfReader
- from reportlab.pdfbase.pdfdoc import PDFDictionary
# One PDF literal string "( ... )". A backslash escapes the character that
# follows it, so an escaped "\)" does not terminate the string.
_LITERAL_STRING_RE = re.compile(r'\(((?:\\.|[^\\)\n])*)\)')
# Inside a literal string: a backslash escape pair, or a bare parenthesis.
_ESCAPE_OR_PAREN_RE = re.compile(r'\\(.)|[()]')
# Escaped characters that are replaced by the character itself.
_UNESCAPED_CHARS = frozenset('()\\., ')


def _decode_literal(content):
    """Resolve backslash escapes in the body of a single literal string."""
    def _replace(match):
        escaped = match.group(1)
        if escaped is None:
            # Bare (unescaped) parenthesis left inside the body: dropped.
            return ''
        if escaped in _UNESCAPED_CHARS:
            return escaped
        # Any other escape sequence (e.g. "\n") is kept verbatim.
        return match.group(0)

    return _ESCAPE_OR_PAREN_RE.sub(_replace, content)


def clean_text_preserve_format(text):
    """Return the text held in the parenthesised literal strings of *text*.

    *text* is the decoded operand of a text-showing command, for example
    ``(Hello) -250 (World)``. Everything outside the parentheses (kerning
    numbers, operator names) is discarded and the string bodies are
    concatenated in order. A body that is empty or whitespace-only
    contributes a single space.

    Escapes are resolved per string in one pass, *after* the strings have
    been located. This keeps escaped parentheses as literal characters
    (``(f\\(x\\))`` yields ``f(x)``) and prevents a sequence from being
    unescaped twice. The escapes ``\\(``, ``\\)``, ``\\\\``, ``\\.``,
    ``\\,`` and backslash-space are replaced by the escaped character; any
    other escape sequence is left unchanged.

    Args:
        text: Decoded operand text of a ``Tj``/``TJ`` command.

    Returns:
        The concatenated string contents, or ``''`` when *text* contains no
        complete literal string.
    """
    pieces = []
    for content in _LITERAL_STRING_RE.findall(text):
        decoded = _decode_literal(content)
        # Blank strings act as word separators in the caller's output.
        pieces.append(decoded if decoded.strip() else ' ')
    return ''.join(pieces)
def _page_content_streams(page):
    """Return the resolved content-stream objects of *page* as a list.

    ``/Contents`` may be absent, a single stream, or an array of streams;
    all three shapes are normalised to a (possibly empty) list.
    """
    try:
        contents = page['/Contents']
    except KeyError:
        return []
    contents = contents.get_object()
    entries = contents if isinstance(contents, (list, tuple)) else [contents]
    # Array entries may still be indirect references; resolve each of them.
    return [
        entry.get_object() if hasattr(entry, 'get_object') else entry
        for entry in entries
    ]


def extract_text_by_font_size(pdf_path, min_font_size):
    """Extract the text of a PDF that is set at *min_font_size* or larger.

    The page content streams are scanned line by line with a lightweight
    heuristic (it is not a full content-stream parser). Only these
    operators are recognised, and each is expected on its own line:

    * ``Tf``       - updates the current font size.
    * ``Td``/``TD`` - a change of vertical position starts a new output line.
    * ``Tj``/``TJ`` - text is kept when the current font size qualifies.
    * ``T*``       - starts a new output line.

    Args:
        pdf_path: Path of the PDF file to read.
        min_font_size: Smallest font size (inclusive) whose text is kept.

    Returns:
        The extracted text. Lines within a page are joined with ``'\\n'``
        and pages with ``'\\n\\n'``; a page without qualifying text
        contributes an empty string.
    """
    extracted_text = []
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        for page in pdf.pages:
            page_text = []
            current_font_size = 0
            current_line = ''
            line_y = 0.0   # vertical line origin, accumulated since last BT
            last_y = None  # line origin seen at the previous Td/TD

            for obj in _page_content_streams(page):
                if not hasattr(obj, 'get_data'):
                    continue
                for command in obj.get_data().split(b'\n'):
                    parts = command.split()

                    # BT starts a text object and resets the line origin,
                    # so the offsets that follow are counted from zero.
                    if parts[:1] == [b'BT']:
                        line_y = 0.0

                    if b'Tf' in command:
                        # "/Font size Tf": the size precedes the operator.
                        if len(parts) >= 3:
                            try:
                                current_font_size = float(parts[-2])
                            except ValueError:
                                pass
                    elif b'Td' in command or b'TD' in command:
                        # "tx ty Td": ty precedes the operator and is an
                        # offset relative to the start of the current line.
                        if len(parts) >= 3:
                            try:
                                line_y += float(parts[-2])
                            except ValueError:
                                pass
                            else:
                                if last_y is not None and line_y != last_y:
                                    if current_line.strip():
                                        page_text.append(current_line.strip())
                                    current_line = ''
                                last_y = line_y
                    elif b'Tj' in command or b'TJ' in command:
                        if current_font_size >= min_font_size:
                            # Keep only what lies inside a TJ array's
                            # brackets; a plain Tj has none and is used whole.
                            text_obj = command.split(b'[', 1)[-1].rsplit(b']', 1)[0]
                            try:
                                decoded_text = text_obj.decode('utf-8')
                            except UnicodeDecodeError:
                                # Best effort: undecodable operands are skipped.
                                continue
                            cleaned_text = clean_text_preserve_format(decoded_text)
                            if current_line and not current_line.endswith(' '):
                                current_line += ' '
                            current_line += cleaned_text
                    elif b'T*' in command:
                        # Explicit move to the next line.
                        if current_line.strip():
                            page_text.append(current_line.strip())
                        current_line = ''

            # Flush whatever is left of the last line on the page.
            if current_line.strip():
                page_text.append(current_line.strip())
            extracted_text.append('\n'.join(page_text))
    return '\n\n'.join(extracted_text)
# Example usage: print the text of input.pdf whose font size is at least 12.
min_font_size = 12  # Minimum font size (inclusive) a text run needs to be kept
pdf_path = 'input.pdf'
result = extract_text_by_font_size(
    pdf_path=pdf_path,
    min_font_size=min_font_size,
)
print(result)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement