Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import shutil
import uuid
from collections import defaultdict

import pandas as pd
from lxml import etree
def parse_element(element, tag_path, data):
    """Flatten an XML element into ``data``, keyed by slash-joined tag path.

    Walks ``element`` depth-first. Each leaf (an element without children)
    appends its ``text`` to ``data["ancestor/.../leaf"]``; elements that have
    children contribute nothing themselves, only their descendants do.

    Args:
        element: Element-like node exposing ``tag``, ``text``, ``len()`` and
            iteration over its children.
        tag_path: List of ancestor tag names leading to ``element``. It is
            not mutated; a new list is built for each level.
        data: Mapping from path string to list of texts. Must create the
            list on first access (e.g. ``defaultdict(list)``); mutated in
            place.
    """
    tag = element.tag
    if not isinstance(tag, str):
        # Comments and processing instructions show up as children whose
        # ``tag`` is a callable rather than a name; they carry no column
        # data and would make ``'/'.join`` raise TypeError.
        return

    path = [*tag_path, tag]

    # Explicit child count: truth-testing an element is deprecated and
    # ambiguous (a childless element is falsy even though it exists).
    if len(element):
        for child in element:
            parse_element(child, path, data)
    else:
        data['/'.join(path)].append(element.text)
def process_element(element, data, output_dir, chunk_size=10000):
    """Buffer one parsed element and spill the buffer to a CSV chunk when full.

    The element is flattened into ``data`` via ``parse_element``. Once the
    longest buffered column holds at least ``chunk_size`` values, the buffer
    is written to ``temp_NNN.csv`` inside ``output_dir`` (``NNN`` being the
    number of entries already in that directory) and emptied. The element is
    always cleared afterwards.

    Args:
        element: Parsed XML element to flatten and then clear.
        data: ``defaultdict(list)`` mapping tag paths to collected texts;
            mutated in place and cleared after each flush.
        output_dir: Existing directory that receives the chunk files.
        chunk_size: Number of buffered values in the longest column that
            triggers a flush. Defaults to 10000.
    """
    parse_element(element, [], data)

    # Measure buffered values, not the number of distinct paths: the column
    # count stays small for typical documents, so testing ``len(data)``
    # would practically never flush and the buffer would grow unbounded.
    buffered_rows = max(map(len, data.values()), default=0)
    if buffered_rows >= chunk_size:
        chunk_index = len(os.listdir(output_dir))
        temp_file = os.path.join(output_dir, f"temp_{chunk_index:03d}.csv")
        # Paths rarely collect the same number of values; wrapping each list
        # in a Series pads short columns with NaN instead of raising
        # "All arrays must be of the same length".
        frame = pd.DataFrame(
            {path: pd.Series(values, dtype=object) for path, values in data.items()}
        )
        frame.to_csv(temp_file, index=False)
        data.clear()

    element.clear()
def xml_to_csv(xml_file, output_csv):
    """Stream an XML file into a single CSV with one column per tag path.

    Elements are parsed incrementally and buffered through
    ``process_element``, which spills chunk CSVs into a uniquely named
    temporary directory under the current working directory. The chunks are
    then concatenated into ``output_csv`` and the temporary directory is
    removed, whether or not the conversion succeeded.

    Args:
        xml_file: Path (or file-like object) of the XML document to read.
        output_csv: Destination path for the combined CSV.
    """
    temp_output_dir = f"temp_output_{uuid.uuid4()}"
    os.makedirs(temp_output_dir)
    try:
        data = defaultdict(list)

        # NOTE(review): 'end' events with tag='*' fire for every element,
        # children before parents, and process_element clears each element
        # after handling it. A parent therefore sees already-emptied
        # children and records empty text under "parent/child" paths.
        # Confirm this is intended, or restrict `tag` to record elements.
        context = etree.iterparse(xml_file, events=('end',), tag='*')
        for _, element in context:
            process_element(element, data, temp_output_dir)

        # Flush whatever is still buffered after the last element.
        if data:
            chunk_index = len(os.listdir(temp_output_dir))
            temp_file = os.path.join(temp_output_dir, f"temp_{chunk_index:03d}.csv")
            # Series-wrapping pads columns of unequal length with NaN
            # instead of letting the DataFrame constructor raise.
            remainder = pd.DataFrame(
                {path: pd.Series(values, dtype=object) for path, values in data.items()}
            )
            remainder.to_csv(temp_file, index=False)

        # os.listdir yields bare names in arbitrary order: join them with
        # the directory so read_csv can find them, and sort so chunks are
        # concatenated in the order they were written.
        chunk_paths = sorted(
            os.path.join(temp_output_dir, name)
            for name in os.listdir(temp_output_dir)
            if name.endswith('.csv')
        )
        frames = [pd.read_csv(chunk_path) for chunk_path in chunk_paths]
        # pd.concat rejects an empty list, so fall back to an empty frame.
        combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        combined.to_csv(output_csv, index=False)
    finally:
        # The chunk directory is scratch space; never leave it behind.
        shutil.rmtree(temp_output_dir, ignore_errors=True)
# Script driver: convert the hard-coded input document into the output CSV.
# Both paths are relative to the current working directory.
xml_file, output_csv = 'input.xml', 'output.csv'
xml_to_csv(xml_file=xml_file, output_csv=output_csv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement