Advertisement
getivan

Colab XML Parser for the KJV Bible PCE 1900

Apr 7th, 2025
299
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 18.64 KB | Software | 0 0
  1. #@title KJV Bible Formatter Setup and UI (v8.1 - Corrected Exception Syntax)
  2. import requests
  3. import xml.etree.ElementTree as ET
  4. import ipywidgets as widgets
  5. from IPython.display import display, clear_output, HTML
  6. import re
  7. import io
  8. import traceback
  9.  
  10. # --- Configuration ---
  11. BIBLE_XML_URL = "https://raw.githubusercontent.com/seven1m/open-bibles/master/eng-kjv.osis.xml"
  12. OSIS_NAMESPACE = "http://www.bibletechnologies.net/2003/OSIS/namespace"
  13. NS_MAP = {'osis': OSIS_NAMESPACE}
  14.  
  15. # --- Configuration Variables ---
  16. BOOK_ID_ATTR = 'osisID'
  17. CHAPTER_ID_ATTR = 'osisRef' # Identifies the chapter marker tag
  18. VERSE_ID_ATTR = 'osisID'    # Identifies the verse marker tag AND contains chapter/verse info
  19.  
  20. # --- Global Variables ---
  21. bible_data = {} # { Book: { Chap: { VerseNum: verse_element } } }
  22. book_list = []
  23. raw_xml_root = None
  24. _parent_map_cache = None
  25.  
  26. # --- UI Elements --- (No changes)
  27. load_button = widgets.Button(description="Load & Process XML", button_style='info', tooltip='Download, parse, and process the Bible XML.')
  28. load_status = widgets.Label(value="Status: Not Loaded")
  29. diagnostic_output = widgets.Output(layout={'border': '1px solid black', 'max_height': '100px', 'overflow_y': 'scroll'})
  30. book_dropdown = widgets.Dropdown(description="Book:", tooltip='Select the Bible book.')
  31. chapter_dropdown = widgets.Dropdown(description="Chapter:", tooltip='Select the chapter number.')
  32. verse_selection_text = widgets.Text(description="Verses:", value="all", placeholder="e.g., all, 1-5, 1,3,7-9", tooltip='Enter verses: "all", single number (5), range (1-5), or mixed (1,3,5-7).')
  33. title_checkbox = widgets.Checkbox(value=True, description="Include Title", tooltip='Prepend output with "Book Chapter".')
  34. italic_style_radio = widgets.RadioButtons(
  35.     options=[('Markdown (*italic*)', 'markdown'), ('HTML (<i>italic</i>)', 'html'), ('Plain (no italics)', 'plain')],
  36.     description='Italics:', value='markdown', tooltip='Choose how italicized text (KJV added words) is formatted.'
  37. )
  38. verse_separator_radio = widgets.RadioButtons(
  39.     options=[('Blank Line', '\n\n'), ('Single Newline', '\n')], description='Separator:', value='\n\n', tooltip='Choose the separator between verses.'
  40. )
  41. generate_button = widgets.Button(description="Generate", button_style='success', disabled=True, tooltip='Generate the formatted text based on selections.')
  42. output_textarea = widgets.Textarea(
  43.     value='', placeholder='Formatted text will appear here...', description='Output:',
  44.     layout={'height': '300px', 'width': '95%'}, disabled=False
  45. )
  46.  
  47. # --- Helper Functions --- (No changes from v8)
  48.  
  49. def build_parent_map(root):
  50.     global _parent_map_cache
  51.     if _parent_map_cache is None:
  52.         _parent_map_cache = {c: p for p in root.iter() for c in p}
  53.     return _parent_map_cache
  54.  
  55. def find_parent(root, element):
  56.     parent_map = build_parent_map(root)
  57.     return parent_map.get(element)
  58.  
  59. def parse_osis_id(osis_id_str):
  60.     """Parses 'Book.Chap.Verse' into (book, chap_num, verse_num). Returns None on failure."""
  61.     if not osis_id_str: return None
  62.     parts = osis_id_str.split('.')
  63.     if len(parts) >= 3: # Allow for potential subparts like Rev.12.7a
  64.         book = parts[0]
  65.         chap_str = parts[1]
  66.         verse_str = parts[2]
  67.         try:
  68.             verse_match = re.match(r'^(\d+)', verse_str)
  69.             if verse_match:
  70.                  return book, int(chap_str), int(verse_match.group(1))
  71.         except (ValueError, TypeError):
  72.             return None
  73.     return None
  74.  
  75.  
  76. def fetch_and_populate_data_v8(root):
  77.     """Parses XML by iterating book children, identifying chapters and verses."""
  78.     global bible_data, book_list
  79.     bible_data = {}
  80.     book_list = []
  81.  
  82.     if root is None: raise ValueError("XML Root is None.")
  83.  
  84.     book_tag = f'{{{OSIS_NAMESPACE}}}div'
  85.     chapter_tag = f'{{{OSIS_NAMESPACE}}}chapter'
  86.     verse_tag = f'{{{OSIS_NAMESPACE}}}verse'
  87.  
  88.     found_books = 0; found_chapters = 0; processed_verses = 0
  89.  
  90.     with diagnostic_output:
  91.         clear_output(wait=True)
  92.         print("Processing (v8): Finding books...")
  93.  
  94.     osis_text_element = root.find(f".//{{{OSIS_NAMESPACE}}}osisText", namespaces=NS_MAP)
  95.     if osis_text_element is None:
  96.          with diagnostic_output: print("ERROR: Cannot find <osisText> element.")
  97.          raise ValueError("<osisText> not found.")
  98.  
  99.     # Combine direct book divs and those under bookGroup
  100.     book_elements_query = f"./{{{OSIS_NAMESPACE}}}div[@type='book']"
  101.     book_group_query = f"./{{{OSIS_NAMESPACE}}}div[@type='bookGroup']/{{{OSIS_NAMESPACE}}}div[@type='book']"
  102.     all_potential_books = osis_text_element.findall(book_elements_query, namespaces=NS_MAP) + \
  103.                           osis_text_element.findall(book_group_query, namespaces=NS_MAP)
  104.  
  105.     valid_book_elements = [b for b in all_potential_books if b.get(BOOK_ID_ATTR)]
  106.  
  107.     if not valid_book_elements:
  108.          with diagnostic_output: print(f"ERROR: No book elements with '{BOOK_ID_ATTR}' found.")
  109.          raise ValueError("No valid book elements found.")
  110.  
  111.     for book_element in valid_book_elements:
  112.         book_name = book_element.get(BOOK_ID_ATTR)
  113.         found_books += 1
  114.         book_list.append(book_name)
  115.         bible_data[book_name] = {}
  116.         current_chapter_num = None
  117.  
  118.         for child in book_element:
  119.             if child.tag == chapter_tag and child.get(CHAPTER_ID_ATTR):
  120.                 chapter_ref = child.get(CHAPTER_ID_ATTR)
  121.                 try:
  122.                     parsed_ref = parse_osis_id(chapter_ref + ".0")
  123.                     if parsed_ref:
  124.                          _, chap_num, _ = parsed_ref
  125.                          n_attr = child.get('n')
  126.                          if n_attr and int(n_attr) == chap_num:
  127.                               current_chapter_num = chap_num
  128.                               if current_chapter_num not in bible_data[book_name]:
  129.                                    bible_data[book_name][current_chapter_num] = {}
  130.                                    found_chapters += 1
  131.                 except Exception as e_chap:
  132.                     print(f"Warning: Error processing chapter marker {chapter_ref}: {e_chap}")
  133.                 continue
  134.  
  135.             container_tags = {f'{{{OSIS_NAMESPACE}}}p', f'{{{OSIS_NAMESPACE}}}lg',
  136.                               f'{{{OSIS_NAMESPACE}}}list', f'{{{OSIS_NAMESPACE}}}table'}
  137.             if child.tag in container_tags:
  138.                 for element in child.iter():
  139.                     if element.tag == verse_tag and element.get(VERSE_ID_ATTR):
  140.                         verse_osisID = element.get(VERSE_ID_ATTR)
  141.                         parsed_verse_ref = parse_osis_id(verse_osisID)
  142.                         if parsed_verse_ref:
  143.                             v_book, v_chap, v_num = parsed_verse_ref
  144.                             if v_book == book_name and v_chap == current_chapter_num:
  145.                                 if current_chapter_num is not None and v_num not in bible_data[book_name][current_chapter_num]:
  146.                                      bible_data[book_name][current_chapter_num][v_num] = element
  147.                                      processed_verses += 1
  148.  
  149.     with diagnostic_output:
  150.         print("Population Summary (v8):")
  151.         print(f"- Found and processed {found_books} books.")
  152.         print(f"- Found and processed {found_chapters} chapters.")
  153.         print(f"- Found and stored {processed_verses} verse elements (using osisID).")
  154.         if processed_verses == 0 and found_chapters > 0:
  155.              print(f"ERROR: Failed to store any verse elements. Check parsing logic within book's children loop.")
  156.         elif found_chapters == 0 and found_books > 0:
  157.              print(f"ERROR: No chapters identified correctly.")
  158.  
  159.  
  160. def extract_and_format_verse_text_v8(verse_start_element, italic_style):
  161.     """Extracts text starting from verse_start_element until the next verse osisID marker, iterating within parent container."""
  162.     verse_tag = f'{{{OSIS_NAMESPACE}}}verse'
  163.     transChange_tag = f'{{{OSIS_NAMESPACE}}}transChange'
  164.     container_tags = {f'{{{OSIS_NAMESPACE}}}p', f'{{{OSIS_NAMESPACE}}}lg'}
  165.     ignore_tags = {f'{{{OSIS_NAMESPACE}}}note', f'{{{OSIS_NAMESPACE}}}rdg', f'{{{OSIS_NAMESPACE}}}title'}
  166.  
  167.     text_parts = []
  168.     in_target_verse = False
  169.     start_verse_osisID = verse_start_element.get(VERSE_ID_ATTR)
  170.  
  171.     current = verse_start_element
  172.     parent_container = None
  173.     while current is not None:
  174.         parent = find_parent(raw_xml_root, current)
  175.         if parent is None: break
  176.         if parent.tag in container_tags:
  177.             parent_container = parent; break
  178.         if parent.tag == f'{{{OSIS_NAMESPACE}}}chapter' or \
  179.            (parent.tag == f'{{{OSIS_NAMESPACE}}}div' and parent.get('type') == 'book'):
  180.              break
  181.         current = parent
  182.  
  183.     if parent_container is None:
  184.         print(f"Warning: Could not find parent container (e.g., <p>) for verse {start_verse_osisID}. Text incomplete.")
  185.         return (verse_start_element.tail or '').strip()
  186.  
  187.     for node in parent_container.iter():
  188.         if node == verse_start_element:
  189.             in_target_verse = True
  190.             if node.tail: text_parts.append(node.tail.strip())
  191.             continue
  192.  
  193.         if in_target_verse and node.tag == verse_tag and node.get(VERSE_ID_ATTR) and node != verse_start_element:
  194.              if node.get(VERSE_ID_ATTR) != start_verse_osisID:
  195.                  in_target_verse = False; break
  196.  
  197.         if in_target_verse:
  198.             if node.tag in ignore_tags:
  199.                 if node.tail: text_parts.append(node.tail.strip())
  200.                 continue
  201.  
  202.             if node.tag == transChange_tag:
  203.                 start_marker, end_marker = "", ""
  204.                 if italic_style == 'markdown': start_marker, end_marker = "*", "*"
  205.                 elif italic_style == 'html': start_marker, end_marker = "<i>", "</i>"
  206.                 if node.text: text_parts.append(start_marker + node.text.strip() + end_marker)
  207.                 if node.tail: text_parts.append(node.tail.strip())
  208.  
  209.             elif node.tag != verse_tag:
  210.                 if node.text:
  211.                     is_sub_container = node.tag in {f'{{{OSIS_NAMESPACE}}}div', f'{{{OSIS_NAMESPACE}}}list'}
  212.                     if not is_sub_container: text_parts.append(node.text.strip())
  213.                 if node.tail:
  214.                     text_parts.append(node.tail.strip())
  215.  
  216.     full_text = " ".join(filter(None, text_parts))
  217.     return ' '.join(full_text.split())
  218.  
  219.  
  220. def update_chapter_dropdown(change):
  221.     selected_book = change['new']
  222.     chapter_dropdown.options = []
  223.     if selected_book in bible_data:
  224.         if isinstance(bible_data[selected_book], dict):
  225.              chapters = sorted([ch for ch in bible_data[selected_book].keys() if isinstance(ch, int)])
  226.              if chapters:
  227.                  chapter_dropdown.options = chapters
  228.                  chapter_dropdown.value = chapters[0]
  229.              else:
  230.                   if load_status.value.startswith("Status: Ready"):
  231.                      output_textarea.value = f"No chapters found for {selected_book}."
  232.                      print(f"Warning: No chapters populated for {selected_book}.")
  233.         else: print(f"Error: Data format issue for book {selected_book}.")
  234.  
  235.  
  236. def parse_verse_selection(selection_str, available_verses_set):
  237.     selected_verses = set()
  238.     if not available_verses_set: return []
  239.     max_verse = max(available_verses_set); min_verse = min(available_verses_set)
  240.     if not selection_str: return []
  241.     selection_str = selection_str.lower().strip()
  242.     if selection_str == 'all': return sorted(list(available_verses_set))
  243.     parts = selection_str.split(',')
  244.     for part in parts:
  245.         part = part.strip();
  246.         if not part: continue
  247.         try:
  248.             if '-' in part:
  249.                 start_str, end_str = part.split('-'); start, end = int(start_str.strip()), int(end_str.strip())
  250.                 if start < min_verse or end > max_verse or start > end: raise ValueError(f"Range {part} invalid ({min_verse}-{max_verse})")
  251.                 selected_verses.update(v for v in available_verses_set if start <= v <= end)
  252.             else:
  253.                 verse_num = int(part)
  254.                 if verse_num not in available_verses_set: raise ValueError(f"Verse {part} not available ({min_verse}-{max_verse})")
  255.                 selected_verses.add(verse_num)
  256.         except ValueError as e: raise ValueError(f"Invalid format/verse in '{part}': {e}")
  257.     return sorted(list(selected_verses))
  258.  
  259. # --- Event Handlers ---
  260.  
  261. def on_load_button_clicked(b):
  262.     """Handler for the load button: Downloads, Parses, Populates using v8 logic."""
  263.     global raw_xml_root, _parent_map_cache
  264.     raw_xml_root = None; _parent_map_cache = None; bible_data.clear(); book_list.clear()
  265.     book_dropdown.options = []; chapter_dropdown.options = []
  266.     generate_button.disabled = True; output_textarea.value = ""; load_status.value = "Status: Not Loaded"
  267.     diagnostic_output.clear_output()
  268.     clear_output(wait=True); display_ui()
  269.  
  270.     try:
  271.         load_status.value = "Status: Downloading..."
  272.         response = requests.get(BIBLE_XML_URL); response.raise_for_status()
  273.         response.encoding = response.apparent_encoding or 'utf-8'; xml_content = response.text
  274.         load_status.value = "Status: Parsing XML..."
  275.         raw_xml_root = ET.fromstring(xml_content)
  276.         load_status.value = "Status: XML Parsed. Populating data (v8)..."
  277.  
  278.         fetch_and_populate_data_v8(raw_xml_root) # Use v8 population logic
  279.  
  280.         if not book_list:
  281.              load_status.value = "Status: Failed: No books loaded."
  282.              generate_button.disabled = True
  283.              # ** SYNTAX FIX **
  284.              with diagnostic_output:
  285.                  print("ERROR: No books found/processed.")
  286.         elif not any(bible_data[b][c] for b in bible_data for c in bible_data.get(b, {}) if isinstance(bible_data[b].get(c), dict) and bible_data[b][c]): # More robust check for non-empty verse dicts
  287.              load_status.value = "Status: Failed: No verses loaded. Check Warnings/XML."
  288.              generate_button.disabled = True
  289.              # ** SYNTAX FIX **
  290.              with diagnostic_output:
  291.                  print("ERROR: Books/Chapters processed, but NO verse data stored.")
  292.         else:
  293.              load_status.value = f"Status: Ready. {len(book_list)} books processed."
  294.              generate_button.disabled = False
  295.              book_dropdown.options = book_list
  296.              if book_list: update_chapter_dropdown({'new': book_list[0]})
  297.  
  298.     except requests.exceptions.RequestException as e:
  299.         load_status.value = f"Status: Error downloading - {e}"
  300.         # ** SYNTAX FIX **
  301.         with diagnostic_output:
  302.             print(f"Download Error: {e}")
  303.     except ET.ParseError as e:
  304.         load_status.value = f"Status: Error parsing XML - {e}"
  305.         # ** SYNTAX FIX **
  306.         with diagnostic_output:
  307.             print(f"XML Parse Error: {e}")
  308.     except Exception as e:
  309.         load_status.value = f"Status: Error during processing - {e}"
  310.         generate_button.disabled = True
  311.         # ** SYNTAX FIX **
  312.         with diagnostic_output:
  313.             print(f"Unexpected Error:")
  314.             traceback.print_exc()
  315.  
  316.  
  317. def on_generate_button_clicked(b):
  318.     """Handler for the generate button - uses v8 data and formatting."""
  319.     output_textarea.value = "Generating..."
  320.     if raw_xml_root is None:
  321.         output_textarea.value = "Error: XML data not loaded. Click 'Load & Process XML'."; return
  322.     # Ensure parent map is built before extracting text
  323.     # It's built on demand by find_parent if needed, but building upfront might be slightly cleaner
  324.     build_parent_map(raw_xml_root)
  325.  
  326.     try:
  327.         book = book_dropdown.value;
  328.         if not chapter_dropdown.value: raise ValueError("Please select a chapter.")
  329.         chapter = int(chapter_dropdown.value)
  330.         verse_sel_str = verse_selection_text.value; show_title = title_checkbox.value
  331.         italic_style = italic_style_radio.value; separator = verse_separator_radio.value
  332.  
  333.         if not book: raise ValueError("Please select a book.")
  334.         if book not in bible_data: raise ValueError(f"Book '{book}' not found.")
  335.         if chapter not in bible_data.get(book, {}): raise ValueError(f"Chapter {chapter} not found for book '{book}'.") # Safer check
  336.  
  337.         chapter_verse_elements = bible_data[book][chapter]
  338.         if not isinstance(chapter_verse_elements, dict):
  339.              raise TypeError(f"Data error: Expected dict for {book} {chapter}, got {type(chapter_verse_elements)}")
  340.  
  341.         available_verses_set = set(chapter_verse_elements.keys())
  342.         if not available_verses_set:
  343.              output_textarea.value = f"{book} {chapter}: No verses loaded for this chapter."; return
  344.  
  345.         selected_verse_numbers = parse_verse_selection(verse_sel_str, available_verses_set)
  346.         if not selected_verse_numbers and verse_sel_str.lower() != 'all' and verse_sel_str.strip() != '':
  347.              min_v, max_v = min(available_verses_set), max(available_verses_set)
  348.              raise ValueError(f"No valid verses selected/found: '{verse_sel_str}'. Available: {min_v}-{max_v}")
  349.  
  350.         verse_texts = []
  351.         for verse_num in selected_verse_numbers:
  352.             if verse_num in chapter_verse_elements:
  353.                 verse_start_element = chapter_verse_elements[verse_num]
  354.                 formatted_text = extract_and_format_verse_text_v8(verse_start_element, italic_style)
  355.                 verse_texts.append(formatted_text)
  356.  
  357.         final_output_lines = []
  358.         if show_title: final_output_lines.append(f"{book} {chapter}")
  359.         if verse_texts:
  360.             verses_block = separator.join(verse_texts)
  361.             if show_title: final_output_lines.append("")
  362.             final_output_lines.append(verses_block)
  363.         elif show_title: pass
  364.         else: final_output_lines.append("(No verses selected or found)")
  365.         output_textarea.value = "\n".join(final_output_lines)
  366.  
  367.     except ValueError as e: output_textarea.value = f"Input Error: {e}"
  368.     except Exception as e:
  369.         output_textarea.value = f"Generation Error: {e}"; print(f"Detailed generation error:"); traceback.print_exc()
  370.  
  371.  
# --- Wire Up Events ---
# Clicking "Load & Process XML" downloads, parses and indexes the document.
load_button.on_click(on_load_button_clicked)
# Changing the selected book refreshes the list of chapters on offer.
book_dropdown.observe(update_chapter_dropdown, names='value')
# Clicking "Generate" formats the selected verses into the output area.
generate_button.on_click(on_generate_button_clicked)
  376.  
  377. # --- Display UI ---
  378. def display_ui():
  379.     controls_col1 = widgets.VBox([book_dropdown, chapter_dropdown, verse_selection_text])
  380.     controls_col2 = widgets.VBox([title_checkbox, italic_style_radio, verse_separator_radio])
  381.     controls_row = widgets.HBox([controls_col1, controls_col2])
  382.     display(widgets.VBox([widgets.HBox([load_button, load_status]), diagnostic_output, controls_row, generate_button, output_textarea]))
  383.  
  384. # --- Initial Display ---
  385. display_ui()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement