Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import asyncio
- import os
- import argparse
- import logging
- import xml.etree.ElementTree as ET
- from bs4 import BeautifulSoup
- from aiofiles import open as aio_open
- import re
- import json
# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)

# HTML snippet with a <script> tag that asynchronously loads MathJax 2.7.7
# (TeX-MML-AM_CHTML configuration) from the cdnjs CDN. main() concatenates it
# in front of the cleaned HTML before saving the modified HTML file.
MATHJAX_SCRIPT = """
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
"""
async def read_file(file_path):
    """Asynchronously read a UTF-8 text file and return its contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The complete text of the file, or ``None`` when opening or reading
        raised an exception (the error is logged rather than propagated).
    """
    try:
        async with aio_open(file_path, 'r', encoding='utf-8') as handle:
            text = await handle.read()
    except Exception as exc:
        # Best-effort read: callers test for None instead of catching errors.
        logging.error(f"Error reading {file_path}: {exc}")
        return None
    return text
async def write_file(file_path, content):
    """Asynchronously write *content* to *file_path* as UTF-8 text.

    The file is opened in ``'w'`` mode, so any existing content is replaced.
    Failures are logged and swallowed; nothing is returned either way.

    Args:
        file_path: Destination path.
        content: Text to write.
    """
    try:
        async with aio_open(file_path, 'w', encoding='utf-8') as handle:
            await handle.write(content)
        saved_message = f"Content saved to {file_path}"
        logging.info(saved_message)
    except Exception as exc:
        # Best-effort write: report the problem and let the caller carry on.
        logging.error(f"Error writing to {file_path}: {exc}")
def clean_html(html_content):
    """Strip scripts, links and childless tags from an HTML document.

    Args:
        html_content: HTML markup as a string.

    Returns:
        The cleaned markup, serialised back to a string.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Drop every <script> and <a> element together with everything inside it.
    for unwanted in document.find_all(['script', 'a']):
        unwanted.decompose()

    # Single pass over the remaining tags: detach any tag that has no children.
    # NOTE(review): tags that never have children (e.g. <br>, <img>, <meta>)
    # are removed by this check as well; confirm that is intended.
    for node in document.find_all(True):
        if node.contents:
            continue
        node.extract()

    return str(document)
- # async def create_json_from_html(html_path, json_path):
- # html_content = await read_file(html_path)
- # if not html_content:
- # print("No HTML content found") # Debug statement
- # return
- # soup = BeautifulSoup(html_content, 'html.parser')
- # tr_tags = soup.find_all('tr')
- # if not tr_tags:
- # print("No <tr> tags found") # Debug statement
- # return
- # questions_json = []
- # question_text = ''
- # options = {}
- # for tr in tr_tags:
- # text_content = tr.get_text().strip()
- # question_match = re.match(r"(\d+\.)\s*(.*)", text_content)
- # option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", text_content)
- # if question_match:
- # print(f"Question matched: {question_match.group(2)}") # Debug statement
- # if question_text and options:
- # questions_json.append({
- # "Question": question_text,
- # "Options": options
- # })
- # question_text = question_match.group(2).strip()
- # options = {}
- # elif option_match:
- # print(f"Option matched: {option_match.group(2)}") # Debug statement
- # option_key = option_match.group(1)
- # option_value = option_match.group(2).strip()
- # options[option_key] = option_value
- # if question_text and options:
- # questions_json.append({
- # "Question": question_text,
- # "Options": options
- # })
- # await write_file(json_path, json.dumps(questions_json, indent=4))
# A question cell starts with a number and a dot, e.g. "12. What is ...".
_QUESTION_RE = re.compile(r"(\d+\.)\s*(.*)")
# An option cell starts with one of the labels a) b) c) d).
_OPTION_RE = re.compile(r"(a\)|b\)|c\)|d\))\s*(.*)")


def _append_if_complete(questions, question_text, options):
    """Append a question record, but only if it has both text and options."""
    if question_text and options:
        questions.append({
            "Question": question_text,
            "Options": options
        })


def _collect_questions(rows):
    """Group table rows (lists of stripped cell texts) into question records.

    A row with exactly one cell that matches the question pattern starts a new
    question. A row with more than one cell contributes every cell that
    matches the option pattern to the current question. Rows with no cells,
    and cells matching neither pattern, are ignored.

    Args:
        rows: Iterable of lists of strings, one list per table row.

    Returns:
        A list of ``{"Question": str, "Options": {label: text}}`` dicts, in
        document order. Questions that never received an option are omitted.
    """
    questions = []
    question_text = ''
    options = {}

    for cells in rows:
        if len(cells) == 1:
            question_match = _QUESTION_RE.match(cells[0])
            if question_match:
                # A new question closes the previous one.
                _append_if_complete(questions, question_text, options)
                question_text = question_match.group(2).strip()
                options = {}
        elif len(cells) > 1:
            for cell_text in cells:
                option_match = _OPTION_RE.match(cell_text)
                if option_match:
                    options[option_match.group(1)] = option_match.group(2).strip()

    # The last question has no following question row to close it.
    _append_if_complete(questions, question_text, options)
    return questions


async def create_json_from_html(html_path, json_path):
    """Extract numbered questions and their options from HTML tables into JSON.

    Reads *html_path*, takes the stripped text of every ``<td>`` in every
    ``<tr>``, groups the rows into question records and writes the result to
    *json_path* as JSON indented by four spaces. If the HTML cannot be read
    or is empty, nothing is written.

    Args:
        html_path: Path of the HTML file to parse.
        json_path: Path of the JSON file to create.
    """
    html_content = await read_file(html_path)
    if not html_content:
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    rows = [
        [td.get_text().strip() for td in tr.find_all('td')]
        for tr in soup.find_all('tr')
    ]
    questions_json = _collect_questions(rows)

    # Diagnostics go through logging (hidden at the default INFO level)
    # instead of unconditional prints to stdout.
    logging.debug("Parsed %d question(s) from %s", len(questions_json), html_path)

    await write_file(json_path, json.dumps(questions_json, indent=4))
async def main(args):
    """Run the DOCX -> HTML -> cleaned HTML -> JSON pipeline.

    Steps:
        1. Convert ``args.input_docx`` to ``args.output_html`` with pandoc.
        2. Clean the HTML and prepend the MathJax loader script, saving the
           result to ``args.output_modified_html``.
        3. Extract the questions from the modified HTML into
           ``args.output_json``.

    The pipeline stops (after logging an error) if pandoc cannot be started,
    exits with a non-zero status, or if the generated HTML cannot be read.

    Args:
        args: Object with ``input_docx``, ``output_html``,
            ``output_modified_html`` and ``output_json`` attributes, e.g. an
            ``argparse.Namespace``.
    """
    try:
        # Pass an argument list rather than a shell string, so paths that
        # contain spaces or shell metacharacters reach pandoc unmodified.
        process = await asyncio.create_subprocess_exec(
            'pandoc', '-s', '--toc', '--section-divs',
            args.input_docx, '-o', args.output_html,
        )
        returncode = await process.wait()
    except (OSError, ValueError) as e:
        # OSError: pandoc missing or not executable; ValueError: invalid argument.
        logging.error(f"Error executing Pandoc commands: {e}")
        return

    if returncode != 0:
        # Do not continue with a missing or stale HTML file after a failed conversion.
        logging.error(
            f"Error executing Pandoc commands: pandoc exited with status {returncode}"
        )
        return
    logging.info("HTML Pandoc command executed successfully.")

    html_content = await read_file(args.output_html)
    if html_content is None:
        return

    cleaned_html = clean_html(html_content)
    # NOTE(review): the script is placed in front of the whole document, i.e.
    # before any doctype/<html> that pandoc emitted; confirm that is acceptable.
    cleaned_html_with_mathjax = MATHJAX_SCRIPT + cleaned_html
    await write_file(args.output_modified_html, cleaned_html_with_mathjax)
    await create_json_from_html(args.output_modified_html, args.output_json)
if __name__ == "__main__":
    # (flag, default path, help text) for each option, in --help order.
    path_options = (
        ('--input_docx', 'sdss.docx', 'Input DOCX file path'),
        ('--output_html', 'soutput.html', 'Output HTML file path'),
        ('--output_json', 'soutput.json', 'Output JSON file path'),
        ('--output_modified_html', 'soutput_modified.html', 'Modified output HTML file path'),
    )

    parser = argparse.ArgumentParser(description='Convert DOCX to HTML to JSON.')
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=str, default=default_path, help=help_text)

    args = parser.parse_args()
    asyncio.run(main(args))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement