Advertisement
bbcqx

XML to CSV gpt-4

Apr 6th, 2023
871
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.40 KB | None | 0 0
  1. import os
  2. import pandas as pd
  3. from lxml import etree
  4. from collections import defaultdict
  5. import uuid
  6.  
  7. def parse_element(element, tag_path, data):
  8.     tag_path = tag_path.copy()
  9.     tag_path.append(element.tag)
  10.  
  11.     if element:
  12.         for child in element:
  13.             parse_element(child, tag_path, data)
  14.     else:
  15.         data['/'.join(tag_path)].append(element.text)
  16.  
  17. def process_element(element, data, output_dir):
  18.     parse_element(element, [], data)
  19.  
  20.     if len(data) >= 10000:
  21.         temp_file = os.path.join(output_dir, f"temp_{len(os.listdir(output_dir)):03d}.csv")
  22.         pd.DataFrame(data).to_csv(temp_file, index=False)
  23.         data.clear()
  24.  
  25.     element.clear()
  26.  
  27. def xml_to_csv(xml_file, output_csv):
  28.     temp_output_dir = f"temp_output_{str(uuid.uuid4())}"
  29.     os.makedirs(temp_output_dir)
  30.  
  31.     data = defaultdict(list)
  32.     context = etree.iterparse(xml_file, events=('end',), tag='*')
  33.     for _, element in context:
  34.         process_element(element, data, temp_output_dir)
  35.  
  36.     if data:
  37.         temp_file = os.path.join(temp_output_dir, f"temp_{len(os.listdir(temp_output_dir)):03d}.csv")
  38.         pd.DataFrame(data).to_csv(temp_file, index=False)
  39.  
  40.     pd.concat([pd.read_csv(f) for f in os.listdir(temp_output_dir) if f.endswith('.csv')], ignore_index=True).to_csv(output_csv, index=False)
  41.  
  42. xml_file = 'input.xml'
  43. output_csv = 'output.csv'
  44. xml_to_csv(xml_file, output_csv)
  45.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement