The last attempt
A_God, Sep 28th, 2023
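# Pipeline: DOCX -> pandoc -> HTML -> cleaned HTML with a MathJax loader -> JSON
# of numbered questions with their a)-d) options, scraped from the table rows.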
import asyncio
import argparse
import logging
import re
import json

from bs4 import BeautifulSoup
from aiofiles import open as aio_open

logging.basicConfig(level=logging.INFO)

# MathJax loader prepended to the cleaned HTML so formulas still render
# after all original <script> tags have been stripped.
MATHJAX_SCRIPT = """
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
"""
async def read_file(file_path):
    """Read a text file asynchronously; return its contents, or None on error."""
    try:
        async with aio_open(file_path, 'r', encoding='utf-8') as f:
            return await f.read()
    except Exception as e:
        logging.error(f"Error reading {file_path}: {e}")
        return None

async def write_file(file_path, content):
    """Write text to a file asynchronously, logging success or failure."""
    try:
        async with aio_open(file_path, 'w', encoding='utf-8') as f:
            await f.write(content)
        logging.info(f"Content saved to {file_path}")
    except Exception as e:
        logging.error(f"Error writing to {file_path}: {e}")

def clean_html(html_content):
    """Drop <script> and <a> tags, then remove any element left with no children."""
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup(['script', 'a']):
        tag.decompose()

    for tag in soup.find_all(True):
        if not tag.contents:
            tag.extract()
    return str(soup)
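# Illustrative example of what clean_html does (not part of the script):
# clean_html('<div><a href="#">x</a><p>1. What is 2+2?</p><span></span></div>')
# returns '<div><p>1. What is 2+2?</p></div>' -- the link and the empty
# <span> are gone, the text content stays.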
async def create_json_from_html(html_path, json_path):
    """Scrape numbered questions and their a)-d) options out of table rows into JSON."""
    html_content = await read_file(html_path)
    if not html_content:
        logging.warning("No HTML content found")
        return

    soup = BeautifulSoup(html_content, 'html.parser')
    tr_tags = soup.find_all('tr')
    if not tr_tags:
        logging.warning("No <tr> tags found")
        return

    questions_json = []
    question_text = ''
    options = {}

    for tr in tr_tags:
        td_tags = tr.find_all('td')

        for td in td_tags:
            td_content = td.get_text().strip()
            question_match = re.match(r"(\d+\.)\s*(.*)", td_content)
            option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", td_content)

            if question_match:
                # Store the previous question and its options, if any
                if question_text and options:
                    questions_json.append({
                        "Question": question_text,
                        "Options": options
                    })

                # Start a new question
                question_text = question_match.group(2).strip()
                options = {}

            elif option_match:
                option_key = option_match.group(1)
                option_value = option_match.group(2).strip()
                options[option_key] = option_value

    # Flush the last question
    if question_text and options:
        questions_json.append({
            "Question": question_text,
            "Options": options
        })

    await write_file(json_path, json.dumps(questions_json, indent=4))
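# Illustrative output shape (assuming a table where one cell holds
# "1. What is 2 + 2?" and later cells hold "a) 3" and "b) 4"):
# [
#     {
#         "Question": "What is 2 + 2?",
#         "Options": {"a)": "3", "b)": "4"}
#     }
# ]
# Note the option keys keep the closing parenthesis, because the first
# regex group captures "a)" etc. verbatim.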
# Earlier version: worked, but options were not found for some questions.
# async def create_json_from_html(html_path, json_path):
#     html_content = await read_file(html_path)
#     if not html_content:
#         print("No HTML content found")
#         return
#
#     soup = BeautifulSoup(html_content, 'html.parser')
#     tr_tags = soup.find_all('tr')
#     if not tr_tags:
#         print("No <tr> tags found")
#         return
#
#     questions_json = []
#     question_text = ''
#     options = {}
#
#     for tr in tr_tags:
#         text_content = tr.get_text().strip()
#
#         # Identify questions
#         question_match = re.match(r"(\d+\.)\s*(.*)", text_content)
#         if question_match:
#             if question_text and options:
#                 questions_json.append({
#                     "Question": question_text,
#                     "Options": options
#                 })
#             question_text = question_match.group(2).strip()
#             options = {}
#             continue  # Move on to the next tr; the question has been processed
#
#         # Identify options
#         td_tags = tr.find_all('td')
#         for td in td_tags:
#             td_content = td.get_text().strip()
#             option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", td_content)
#             if option_match:
#                 option_key = option_match.group(1)
#                 option_value = option_match.group(2).strip()
#                 options[option_key] = option_value
#
#     # Handle the last question
#     if question_text and options:
#         questions_json.append({
#             "Question": question_text,
#             "Options": options
#         })
#
#     await write_file(json_path, json.dumps(questions_json, indent=4))
# Earlier version: worked, but left the output unorganised.
# async def create_json_from_html(html_path, json_path):
#     html_content = await read_file(html_path)
#     if not html_content:
#         return
#
#     soup = BeautifulSoup(html_content, 'html.parser')
#     tr_tags = soup.find_all('tr')
#
#     questions_json = []
#     question_text = ''
#     options = {}
#
#     for tr in tr_tags:
#         text_content = tr.get_text().strip()
#
#         question_match = re.match(r"(\d+\.)\s*(.*)", text_content)
#         option_match = re.match(r"(a\)|b\)|c\)|d\))\s*(.*)", text_content)
#
#         if question_match:
#             if question_text and options:
#                 questions_json.append({
#                     "Question": question_text,
#                     "Options": options
#                 })
#             question_text = question_match.group(2).strip()
#             options = {}
#
#         elif option_match:
#             option_key = option_match.group(1)
#             option_value = option_match.group(2).strip()
#             options[option_key] = option_value
#
#     if question_text and options:
#         questions_json.append({
#             "Question": question_text,
#             "Options": options
#         })
#
#     await write_file(json_path, json.dumps(questions_json, indent=4))
async def main(args):
    # Convert the DOCX to standalone HTML with pandoc; bail out on failure.
    try:
        process = await asyncio.create_subprocess_shell(
            f'pandoc -s --toc --section-divs {args.input_docx} -o {args.output_html}'
        )
        await process.communicate()
        if process.returncode != 0:
            logging.error(f"Pandoc exited with code {process.returncode}")
            return
        logging.info("HTML Pandoc command executed successfully.")
    except Exception as e:
        logging.error(f"Error executing Pandoc commands: {e}")
        return

    html_content = await read_file(args.output_html)
    if html_content is None:
        return

    cleaned_html = clean_html(html_content)
    cleaned_html_with_mathjax = MATHJAX_SCRIPT + cleaned_html
    await write_file(args.output_modified_html, cleaned_html_with_mathjax)

    await create_json_from_html(args.output_modified_html, args.output_json)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert DOCX to HTML to JSON.')
    parser.add_argument('--input_docx', type=str, default='sdss.docx', help='Input DOCX file path')
    parser.add_argument('--output_html', type=str, default='soutput.html', help='Output HTML file path')
    parser.add_argument('--output_json', type=str, default='soutput.json', help='Output JSON file path')
    parser.add_argument('--output_modified_html', type=str, default='soutput_modified.html', help='Modified output HTML file path')
    args = parser.parse_args()

    asyncio.run(main(args))
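# Typical invocation (script filename here is a placeholder; pandoc must be on PATH):
#   python docx_to_json.py --input_docx sdss.docx --output_html soutput.html \
#       --output_modified_html soutput_modified.html --output_json soutput.json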
# Earlier, API-based approach: upload the PDF to a text-extraction endpoint
# and parse questions out of the returned text with a regex.
# import json
# import re
# import asyncio
# import aiohttp
#
# async def fetch_pdf_and_log_json():
#     url = "http://20.244.0.255:8080/api/transformer"
#     headers = {}
#     try:
#         async with aiohttp.ClientSession() as session:
#             form_data = aiohttp.FormData()
#             form_data.add_field('file', open('sdss.pdf', 'rb'), filename='sdss.pdf', content_type='application/pdf')
#             async with session.post(url, headers=headers, data=form_data) as response:
#                 if response.status == 200:
#                     json_response = await response.json()
#                     with open('api_response_log.json', 'w') as json_file:
#                         json.dump(json_response, json_file, indent=4)
#                     print("Successfully logged the JSON response to 'api_response_log.json'")
#                     return json_response  # Assuming 'text' is a key in the returned JSON that holds the text data.
#                 else:
#                     print(f"Failed to make the API request. Status code: {response.status}")
#                     return None
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return None
#
# def extract_questions(text):
#     questions_json = []
#     pattern = r"\d+\.\s*([\w\s\W]+?)(?:a\)|b\)|c\)|d\))([\w\s\W]+?)(?:\d+\.\s*|$)"
#     matches = re.findall(pattern, text, re.DOTALL)
#     print(f"Matches: {matches}")  # Debug line
#     for i, (question, options) in enumerate(matches):
#         question_dict = {"Question": question.strip(), "Options": {}}
#         for idx, option in enumerate(re.split(r"a\)|b\)|c\)|d\)", options.strip())[1:]):
#             question_dict["Options"][chr(ord('a') + idx)] = option.strip()
#         questions_json.append(question_dict)
#     return questions_json
#
# async def main():
#     fetched_text = await fetch_pdf_and_log_json()
#     if fetched_text:
#         with open('debug_text.txt', 'w') as debug_file:
#             debug_file.write(json.dumps(fetched_text))  # Debug line
#         for text in fetched_text.items():
#             print(text[1])
#
#         extracted_questions = extract_questions(text[1])
#
#         with open('structured_questions.json', 'w') as f:
#             json.dump(extracted_questions, f, indent=4)
#         print("Successfully saved the structured questions to 'structured_questions.json'")
#
# if __name__ == "__main__":
#     asyncio.run(main())