Advertisement
A_God

Untitled

Sep 28th, 2023
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.95 KB | None | 0 0
  1. import asyncio
  2. import os
  3. import argparse
  4. import logging
  5. import xml.etree.ElementTree as ET
  6. from bs4 import BeautifulSoup
  7. from aiofiles import open as aio_open
  8. import re
  9. import json
  10.  
# Log INFO and above for the whole script (read/write status, pandoc result).
logging.basicConfig(level=logging.INFO)

# MathJax loader prepended to the cleaned HTML so math in the converted
# document renders in the browser. Runtime string — must not be altered.
MATHJAX_SCRIPT = """
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
"""
  19. async def read_file(file_path):
  20. try:
  21. async with aio_open(file_path, 'r', encoding='utf-8') as f:
  22. return await f.read()
  23. except Exception as e:
  24. logging.error(f"Error reading {file_path}: {e}")
  25. return None
  26.  
  27. async def write_file(file_path, content):
  28. try:
  29. async with aio_open(file_path, 'w', encoding='utf-8') as f:
  30. await f.write(content)
  31. logging.info(f"Content saved to {file_path}")
  32. except Exception as e:
  33. logging.error(f"Error writing to {file_path}: {e}")
  34.  
  35. def clean_html(html_content):
  36. soup = BeautifulSoup(html_content, 'html.parser')
  37. for tag in soup(['script', 'a']):
  38. tag.decompose()
  39.  
  40. for tag in soup.find_all(True):
  41. if not tag.contents:
  42. tag.extract()
  43. return str(soup)
  44.  
  45. # async def create_json_from_html(html_path, json_path):
  46. # html_content = await read_file(html_path)
  47. # if not html_content:
  48. # print("No HTML content found") # Debug statement
  49. # return
  50.  
  51. # soup = BeautifulSoup(html_content, 'html.parser')
  52. # tr_tags = soup.find_all('tr')
  53. # if not tr_tags:
  54. # print("No <tr> tags found") # Debug statement
  55. # return
  56.  
  57. # questions_json = []
  58. # question_text = ''
  59. # options = {}
  60.  
  61. # for tr in tr_tags:
  62. # text_content = tr.get_text().strip()
  63.  
  64. # question_match = re.match(r"(\d+\.)\s*(.*)", text_content)
  65. # option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", text_content)
  66.  
  67. # if question_match:
  68. # print(f"Question matched: {question_match.group(2)}") # Debug statement
  69. # if question_text and options:
  70. # questions_json.append({
  71. # "Question": question_text,
  72. # "Options": options
  73. # })
  74. # question_text = question_match.group(2).strip()
  75. # options = {}
  76.  
  77. # elif option_match:
  78. # print(f"Option matched: {option_match.group(2)}") # Debug statement
  79. # option_key = option_match.group(1)
  80. # option_value = option_match.group(2).strip()
  81. # options[option_key] = option_value
  82.  
  83. # if question_text and options:
  84. # questions_json.append({
  85. # "Question": question_text,
  86. # "Options": options
  87. # })
  88.  
  89. # await write_file(json_path, json.dumps(questions_json, indent=4))
  90. async def create_json_from_html(html_path, json_path):
  91. html_content = await read_file(html_path)
  92. if not html_content:
  93. return
  94.  
  95. soup = BeautifulSoup(html_content, 'html.parser')
  96. tr_tags = soup.find_all('tr')
  97.  
  98. questions_json = []
  99. question_text = ''
  100. options = {}
  101.  
  102. for tr in tr_tags:
  103. td_tags = tr.find_all('td')
  104. print(f"DEBUG: {td_tags}") # Debug line
  105.  
  106. if len(td_tags) == 1:
  107. text_content = td_tags[0].get_text().strip()
  108. print(f"DEBUG TEXT: {text_content}") # Debug line
  109.  
  110. question_match = re.match(r"(\d+\.)\s*(.*)", text_content)
  111.  
  112. if question_match:
  113. if question_text and options:
  114. questions_json.append({
  115. "Question": question_text,
  116. "Options": options
  117. })
  118. question_text = question_match.group(2).strip()
  119. options = {}
  120.  
  121. elif len(td_tags) > 1:
  122. for td in td_tags:
  123. text_content = td.get_text().strip()
  124. option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", text_content)
  125.  
  126. if option_match:
  127. option_key = option_match.group(1)
  128. option_value = option_match.group(2).strip()
  129. options[option_key] = option_value
  130.  
  131. if question_text and options:
  132. questions_json.append({
  133. "Question": question_text,
  134. "Options": options
  135. })
  136.  
  137. print(f"DEBUG JSON: {questions_json}") # Debug line
  138. await write_file(json_path, json.dumps(questions_json, indent=4))
  139.  
  140. async def main(args):
  141. try:
  142. process = await asyncio.create_subprocess_shell(f'pandoc -s --toc --section-divs {args.input_docx} -o {args.output_html}')
  143. await process.communicate()
  144. logging.info("HTML Pandoc command executed successfully.")
  145. except Exception as e:
  146. logging.error(f"Error executing Pandoc commands: {e}")
  147. return
  148.  
  149. html_content = await read_file(args.output_html)
  150. if html_content is None:
  151. return
  152.  
  153. cleaned_html = clean_html(html_content)
  154. cleaned_html_with_mathjax = MATHJAX_SCRIPT + cleaned_html
  155. await write_file(args.output_modified_html, cleaned_html_with_mathjax)
  156.  
  157. await create_json_from_html(args.output_modified_html, args.output_json)
  158.  
  159. if __name__ == "__main__":
  160. parser = argparse.ArgumentParser(description='Convert DOCX to HTML to JSON.')
  161. parser.add_argument('--input_docx', type=str, default='sdss.docx', help='Input DOCX file path')
  162. parser.add_argument('--output_html', type=str, default='soutput.html', help='Output HTML file path')
  163. parser.add_argument('--output_json', type=str, default='soutput.json', help='Output JSON file path')
  164. parser.add_argument('--output_modified_html', type=str, default='soutput_modified.html', help='Modified output HTML file path')
  165. args = parser.parse_args()
  166.  
  167. asyncio.run(main(args))
  168.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement