# Split a Word (.docx) file into parts at the paragraphs matching the given strings.

import tempfile
from typing import Generator, List, Tuple, Union

from docx import Document
from docx.document import Document as DocType
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.text.paragraph import Paragraph

import pandas as pd
import os

def iterparts(doc_path: str, split_points: List[str], bias: int = 0) -> Generator[Tuple[int, DocType], None, None]:
    """
    Split a document into consecutive parts at paragraphs listed in ``split_points``.

    The remaining document is scanned for split-point paragraphs. The first one
    found only marks the start of the current part; the second one marks where
    the document is cut (shifted by ``bias`` body elements). Everything before
    the cut is yielded as one part and scanning restarts on the remainder.

    :param doc_path: Path to the document.
    :param split_points: Paragraph texts that mark split points.
    :param bias: Offset added to the body index of the cutting element.
    :return: Generator of ``(index, document)`` tuples. Parts of a document that
        was cut are numbered 1, 2, 3, ... with a unique index each; a document
        that is never cut is yielded once with index 0.
    :raises ValueError: If ``bias`` moves a cut to or before the first body
        element, because such a cut removes nothing and would never terminate.
    """
    doc = Document(doc_path)
    part_no = 0
    while doc is not None:
        cut_idx = None
        seen_first = False
        for idx, elem in enumerate(doc.element.body.iterchildren()):
            if not is_split_point(elem, split_points):
                continue
            if seen_first:
                # Second split point: the current part ends here.
                cut_idx = idx + bias
                break
            seen_first = True

        if cut_idx is None:
            # No further cut: whatever remains is the last part. It gets its own
            # index instead of reusing the previous part's number, so callers
            # that build file names from the index do not overwrite a part.
            yield ((part_no + 1) if part_no else 0, doc)
            return

        if cut_idx <= 0:
            raise ValueError(
                f"bias={bias} moves the cut to element index {cut_idx}; "
                "the cut must be after the first body element"
            )

        first_part, doc = split(doc, cut_idx)
        part_no += 1
        yield (part_no, first_part)


def is_split_point(element: BaseOxmlElement, split_points: List[str]) -> bool:
    """
    Tell whether a body element is a split point.

    Only paragraph elements can be split points; a paragraph qualifies when its
    text, with surrounding whitespace removed, is one of ``split_points``.

    :param element: Document body element.
    :param split_points: Paragraph texts that mark split points.
    :return: True if the element is a split point.
    """
    if not isinstance(element, CT_P):
        return False
    paragraph_text = Paragraph(element, element.getparent()).text
    return paragraph_text.strip() in split_points


def split(doc: DocType, cut_idx: int) -> Tuple[DocType, DocType]:
    """
    Split a document into two parts at a body-element index.

    A snapshot of ``doc`` is written to a temporary file and reloaded as an
    independent copy. The copy keeps the body elements before ``cut_idx`` and
    becomes the first part. ``doc`` itself is modified in place: its elements
    before ``cut_idx`` are removed and it is returned as the second part.

    The body-level section properties element (``w:sectPr``) is never removed,
    so both parts keep the page setup of the source document.

    :param doc: Document to split; mutated in place.
    :param cut_idx: Index of the first body element that belongs to the second part.
    :return: Tuple ``(first_part, second_part)``.
    :raises IndexError: If ``cut_idx`` is outside ``0..len(body elements)``.
    """
    second_part = doc
    second_elems = list(second_part.element.body.iterchildren())
    # Validate before touching anything so a bad index cannot leave ``doc``
    # half-trimmed.
    if not 0 <= cut_idx <= len(second_elems):
        raise IndexError(
            f"cut_idx {cut_idx} is out of range for a body with "
            f"{len(second_elems)} elements"
        )

    # Snapshot the untouched document; the temp file is closed even when
    # reloading fails.
    tmpdocfile = write_tmp_doc(doc)
    try:
        first_part = Document(tmpdocfile)
    finally:
        tmpdocfile.close()

    def is_section_properties(elem) -> bool:
        # Comments / processing instructions have a non-string tag in lxml.
        return isinstance(elem.tag, str) and elem.tag.endswith("}sectPr")

    for elem in second_elems[:cut_idx]:
        if not is_section_properties(elem):
            remove_element(elem)

    first_elems = list(first_part.element.body.iterchildren())
    for elem in first_elems[cut_idx:]:
        if not is_section_properties(elem):
            remove_element(elem)

    return (first_part, second_part)


def remove_element(elem: Union[CT_P, CT_Tbl]):
    """
    Detach an element from its parent in the document tree.

    :param elem: Element to delete (a paragraph or a table).
    """
    parent = elem.getparent()
    parent.remove(elem)


def write_tmp_doc(doc: DocType):
    """
    Save a document into a temporary file.

    :param doc: Document to save; only its ``save(stream)`` method is used.
    :return: Open temporary file object containing the saved document,
        positioned at the start. The caller is responsible for closing it.
    """
    tmp = tempfile.TemporaryFile()
    try:
        doc.save(tmp)
        # Rewind so the caller can read the content straight away.
        tmp.seek(0)
    except BaseException:
        # Do not leak the temporary file when saving fails.
        tmp.close()
        raise
    return tmp


def load_key_phrases(file_path):
    """
    Load key phrases from a TXT, CSV or XLSX file and return them as a list.

    * TXT: one phrase per line, decoded as UTF-8 (a leading BOM is ignored).
    * CSV: values of the first column; ``pd.read_csv`` treats the first row as
      a header, so that row is not returned as a phrase.
    * XLSX: values of the first column, read without a header row.

    Every phrase is stripped of surrounding whitespace and empty phrases are
    dropped. Paragraph text is stripped before it is compared with the phrases,
    so an unstripped phrase could never match, and an empty phrase would match
    every empty paragraph.

    :param file_path: Path to the phrase file; the extension (case-insensitive)
        selects the parser.
    :return: List of non-empty, stripped phrases in file order.
    :raises ValueError: If the extension is not .txt, .csv or .xlsx.
    :raises UnicodeEncodeError: If a phrase cannot be encoded as UTF-8.
    """
    path_text = os.fspath(file_path)
    lowered = path_text.lower()

    if lowered.endswith('.csv'):
        column = pd.read_csv(path_text).iloc[:, 0]
    elif lowered.endswith('.xlsx'):
        column = pd.read_excel(path_text, header=None).iloc[:, 0]
    elif lowered.endswith('.txt'):
        column = None
    else:
        raise ValueError("Підтримуються лише файли TXT, CSV або XLSX.")

    if column is None:
        # Explicit encoding: the locale default is not UTF-8 on every platform.
        with open(path_text, 'r', encoding='utf-8-sig') as phrase_file:
            raw_phrases = phrase_file.read().splitlines()
    else:
        # astype(str) keeps numeric cells usable as phrases instead of failing
        # on the ``.str`` accessor.
        raw_phrases = column.dropna().astype(str).tolist()

    stripped = (phrase.strip() for phrase in raw_phrases)
    key_phrases = [phrase for phrase in stripped if phrase]

    # Validation only: raises if a phrase holds characters that cannot be
    # encoded as UTF-8 (e.g. lone surrogates); the encoded bytes are not used.
    for phrase in key_phrases:
        phrase.encode(encoding="UTF-8", errors='strict')
    return key_phrases


def process_document(doc_path, key_phrases_path, output_dir='output'):
    """
    Split a document at its key phrases and save each part into ``output_dir``.

    :param doc_path: Path to the .docx document to split.
    :param key_phrases_path: Path to the key-phrase file (TXT, CSV or XLSX).
    :param output_dir: Directory that receives the ``part_<index>.docx`` files;
        created, including missing parents, when it does not exist.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)
    key_phrases = load_key_phrases(key_phrases_path)

    for index, part in iterparts(doc_path, key_phrases):
        part.save(os.path.join(output_dir, f"part_{index}.docx"))


if __name__ == "__main__":
    # Example invocation: replace the paths below with your own document and
    # key-phrase file.
    process_document(
        doc_path="test.docx",
        key_phrases_path="test_phrases.txt",
        output_dir="output",
    )