# Untitled

import asyncio
import os
import argparse
import logging
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from aiofiles import open as aio_open
import re
import json

# Configure the root logger so INFO-and-above records from this script are shown.
logging.basicConfig(level=logging.INFO)

# HTML snippet that loads MathJax 2.7.7 from cdnjs with the
# TeX-MML-AM_CHTML configuration. It begins and ends with a newline so it
# can be concatenated directly with other markup.
MATHJAX_SCRIPT = (
    "\n"
    '<script type="text/javascript" async\n'
    '  src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">\n'
    "</script>\n"
)

async def read_file(file_path):
    """Read a UTF-8 text file asynchronously.

    Args:
        file_path: Path of the file to read.

    Returns:
        The full file contents as a string, or ``None`` if opening or
        reading raised any exception (the error is logged, not re-raised).
    """
    try:
        async with aio_open(file_path, 'r', encoding='utf-8') as source:
            text = await source.read()
    except Exception as e:
        # Best-effort read: callers detect failure through the None result.
        logging.error(f"Error reading {file_path}: {e}")
        return None
    else:
        return text

async def write_file(file_path, content):
    """Write ``content`` to ``file_path`` asynchronously as UTF-8 text.

    The file is opened in ``'w'`` mode, so existing contents are replaced.
    Any exception is logged and swallowed; nothing is returned either way.

    Args:
        file_path: Destination path.
        content: Text to write.
    """
    try:
        async with aio_open(file_path, 'w', encoding='utf-8') as sink:
            await sink.write(content)
        # Only reached once the file has been written and closed cleanly.
        logging.info(f"Content saved to {file_path}")
    except Exception as e:
        logging.error(f"Error writing to {file_path}: {e}")

def clean_html(html_content):
    """Strip scripts, links and childless tags from an HTML string.

    Every ``<script>`` and ``<a>`` element is destroyed together with its
    contents, then every tag that has no children at all is removed.

    The empty-tag sweep is a single pass: a tag is judged by its children
    before any of its descendants are removed, so a parent that only
    becomes empty during the sweep is kept.

    NOTE(review): void elements such as ``<br>`` or ``<img>`` have no
    children, so they are removed as well -- confirm this is intended.

    Args:
        html_content: HTML markup to clean.

    Returns:
        The cleaned document serialized back to a string.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    for unwanted in document.find_all(['script', 'a']):
        unwanted.decompose()

    # Parents precede their descendants in document order, so deciding
    # emptiness up front matches checking each tag just before removal.
    childless = [element for element in document.find_all(True) if not element.contents]
    for element in childless:
        element.extract()

    return str(document)

# async def create_json_from_html(html_path, json_path):
#     html_content = await read_file(html_path)
#     if not html_content:
#         print("No HTML content found")  # Debug statement
#         return

#     soup = BeautifulSoup(html_content, 'html.parser')
#     tr_tags = soup.find_all('tr')
#     if not tr_tags:
#         print("No <tr> tags found")  # Debug statement
#         return

#     questions_json = []
#     question_text = ''
#     options = {}

#     for tr in tr_tags:
#         text_content = tr.get_text().strip()

#         question_match = re.match(r"(\d+\.)\s*(.*)", text_content)
#         option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", text_content)

#         if question_match:
#             print(f"Question matched: {question_match.group(2)}")  # Debug statement
#             if question_text and options:
#                 questions_json.append({
#                     "Question": question_text,
#                     "Options": options
#                 })
#             question_text = question_match.group(2).strip()
#             options = {}

#         elif option_match:
#             print(f"Option matched: {option_match.group(2)}")  # Debug statement
#             option_key = option_match.group(1)
#             option_value = option_match.group(2).strip()
#             options[option_key] = option_value

#     if question_text and options:
#         questions_json.append({
#             "Question": question_text,
#             "Options": options
#         })

#     await write_file(json_path, json.dumps(questions_json, indent=4))
# Patterns used to classify table cells; compiled once rather than per cell.
# A question cell starts with a number and a dot ("12. ..."); an option cell
# starts with one of the lowercase labels "a)", "b)", "c)" or "d)".
_QUESTION_PATTERN = re.compile(r"(\d+\.)\s*(.*)")
_OPTION_PATTERN = re.compile(r"(a\)|b\)|c\)|d\))\s*(.*)")


def _group_rows_into_questions(rows):
    """Turn table rows (lists of stripped cell texts) into question records.

    A row with exactly one cell that matches the question pattern starts a
    new question. A row with more than one cell contributes every cell that
    matches the option pattern to the current option set. Rows with no
    cells are ignored.

    Args:
        rows: Iterable of lists of strings, one list per ``<tr>``.

    Returns:
        A list of ``{"Question": str, "Options": {label: text}}`` dicts.
        A question is emitted only if at least one option was collected
        for it.
    """
    questions = []
    question_text = ''
    options = {}

    for cells in rows:
        if len(cells) == 1:
            question_match = _QUESTION_PATTERN.match(cells[0])
            if question_match is None:
                continue
            # A new question begins: flush the previous one if it is complete.
            if question_text and options:
                questions.append({
                    "Question": question_text,
                    "Options": options
                })
            question_text = question_match.group(2).strip()
            options = {}
        elif len(cells) > 1:
            for cell_text in cells:
                option_match = _OPTION_PATTERN.match(cell_text)
                if option_match:
                    options[option_match.group(1)] = option_match.group(2).strip()

    # Flush the final question, which has no successor to trigger the flush.
    if question_text and options:
        questions.append({
            "Question": question_text,
            "Options": options
        })

    return questions


async def create_json_from_html(html_path, json_path):
    """Extract questions and options from an HTML file's tables into JSON.

    Reads ``html_path``, collects the stripped text of every ``<td>`` in
    every ``<tr>``, groups the rows into questions, and writes the result
    to ``json_path`` as an indented JSON array. If the HTML cannot be read
    or is empty, nothing is written.

    Diagnostics go through ``logging.debug`` rather than ``print`` so they
    stay off stdout unless debug logging is enabled.

    Args:
        html_path: Path of the HTML file to parse.
        json_path: Path of the JSON file to write.
    """
    html_content = await read_file(html_path)
    if not html_content:
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    rows = [
        [td.get_text().strip() for td in tr.find_all('td')]
        for tr in soup.find_all('tr')
    ]
    logging.debug("Extracted %d table rows from %s", len(rows), html_path)

    questions_json = _group_rows_into_questions(rows)
    logging.debug("Parsed %d questions from %s", len(questions_json), html_path)

    await write_file(json_path, json.dumps(questions_json, indent=4))

async def main(args):
    """Run the DOCX -> HTML -> cleaned HTML -> JSON pipeline.

    Steps:
      1. Run Pandoc to convert ``args.input_docx`` to ``args.output_html``.
      2. Clean the generated HTML and prepend the MathJax loader snippet,
         saving the result to ``args.output_modified_html``.
      3. Parse the modified HTML into ``args.output_json``.

    Pandoc is started with an argument list instead of a shell command
    string, so paths containing spaces or shell metacharacters are passed
    through verbatim. The pipeline stops if Pandoc cannot be started or
    exits with a non-zero status.

    Args:
        args: Namespace providing ``input_docx``, ``output_html``,
            ``output_modified_html`` and ``output_json`` path attributes.
    """
    try:
        process = await asyncio.create_subprocess_exec(
            'pandoc', '-s', '--toc', '--section-divs',
            args.input_docx, '-o', args.output_html,
        )
        await process.communicate()
    except Exception as e:
        # Top-level boundary: e.g. the pandoc executable is missing.
        logging.error(f"Error executing Pandoc commands: {e}")
        return

    # A non-zero status means the conversion failed; do not continue with a
    # missing or stale HTML file.
    if process.returncode != 0:
        logging.error(f"Pandoc exited with status {process.returncode}; aborting.")
        return
    logging.info("HTML Pandoc command executed successfully.")

    html_content = await read_file(args.output_html)
    if html_content is None:
        return

    cleaned_html = clean_html(html_content)
    # clean_html removes every <script>, so the MathJax loader is added after
    # cleaning; it is placed in front of the whole cleaned document.
    cleaned_html_with_mathjax = MATHJAX_SCRIPT + cleaned_html
    await write_file(args.output_modified_html, cleaned_html_with_mathjax)

    await create_json_from_html(args.output_modified_html, args.output_json)

if __name__ == "__main__":
    # (flag, default, help text) for each option, in --help display order.
    cli_options = (
        ('--input_docx', 'sdss.docx', 'Input DOCX file path'),
        ('--output_html', 'soutput.html', 'Output HTML file path'),
        ('--output_json', 'soutput.json', 'Output JSON file path'),
        ('--output_modified_html', 'soutput_modified.html', 'Modified output HTML file path'),
    )

    parser = argparse.ArgumentParser(description='Convert DOCX to HTML to JSON.')
    for flag, default_value, help_text in cli_options:
        parser.add_argument(flag, type=str, default=default_value, help=help_text)
    args = parser.parse_args()

    # Drive the async pipeline to completion on a fresh event loop.
    asyncio.run(main(args))