# Llama OCR

# export TOGETHER_API_KEY=your_api_key
# pip install -U pip together

import base64
import os

from together import Together

# API key for the Together client, read from the environment (None when unset).
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")

# Pass the key explicitly so the value read above is the one actually used.
client = Together(api_key=TOGETHER_API_KEY)

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode; the base64 output is decoded as UTF-8
    so the result is a ``str`` rather than ``bytes``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def convert_image_to_text(image_path):
    """Convert an image to Markdown text with a vision model.

    The image is base64-encoded, sent to the chat completions endpoint as a
    data URL, and the streamed reply is echoed to stdout as it arrives.

    Args:
        image_path (str): Path to the image to be converted

    Returns:
        str: The full text streamed back by the model ("" if no content
        was received).
    """
    # Function-scope import so this block brings its own dependency.
    import mimetypes

    encoded_image = encode_image(image_path)

    # Label the data URL with the image type guessed from the file name
    # instead of always claiming JPEG. Unknown or non-image guesses fall
    # back to the previous hard-coded value.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # NOTE(review): the "\\`" sequences below put a literal backslash before
    # each backtick in the prompt; presumably left over from template-literal
    # escaping in another language — confirm before changing the prompt.
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    "Convert the provided image into Markdown format. Ensure that all content from the page is included, "
                    "such as headers, footers, subtexts, images (with alt text if possible), tables, and any other elements.\n\n"
                    "Requirements:\n\n"
                    "- Output Only Markdown: Return solely the Markdown content without any additional explanations or comments.\n"
                    "- No Delimiters: Do not use code fences or delimiters like \\`\\`\\`markdown.\n"
                    "- Complete Content: Do not omit any part of the page, including headers, footers, and subtext."
                )
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "convert the uploaded image to text"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{encoded_image}"
                        }
                    }
                ]
            }
        ],

        temperature=0.0,
        stream=True
    )

    pieces = []
    for token in response:
        # Some stream chunks carry no choices; skip them.
        if not getattr(token, "choices", None):
            continue
        content = getattr(token.choices[0].delta, "content", None)
        # A chunk may have no content; printing it would emit the literal
        # text "None" into the output.
        if content:
            print(content, end='', flush=True)
            pieces.append(content)

    return "".join(pieces)

# Directory scanned for input images.
IMAGES_DIR = "images"
# Original (misspelled) name, kept as an alias so existing references still work.
IMAGAES_DIR = IMAGES_DIR

# os.listdir returns entries in arbitrary order; sort for a stable run order.
for image in sorted(os.listdir(IMAGES_DIR)):
    images_path = os.path.join(IMAGES_DIR, image)
    # Skip sub-directories: only regular files can be opened and encoded.
    if not os.path.isfile(images_path):
        continue
    convert_image_to_text(images_path)