Advertisement
tomba2k

telemach_parse.py

Jun 27th, 2024
400
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.01 KB | None | 0 0
  1. def telemachmob_parse(line):
  2.     err = ''
  3.     line = line.encode('utf-8')
  4.     # Split the line by semicolons
  5.     f = line.split(';')
  6.     if f[8] != '1':
  7.         return (None, err)
  8.     rec = Record()
  9.     rec.tel = f[0].strip()
  10.     if not re.match("\d+", rec.tel):
  11.         err = "tel=%s" % rec.tel
  12.  
  13.     # Extract relevant parts assuming the order as per the provided format
  14.     rec.naziv1 = f[1].strip() if len(f) > 2 else ''  # company name
  15.     rec.naziv2 = f[2].strip() if len(f) > 2 else ''  # company name
  16.     mjesto_ptt_adresa = f[3].strip() if len(f) > 3 else ''  # combined address field
  17.     rec.tip = f[4].strip() if len(f) > 4 else ''  # type
  18.     #aktivan = f[8].strip() if len(f) > 8 else ''  # active status
  19.    
  20.     #fix za prazne zapise, samo tel je prisutan
  21.     if rec.naziv1 == '' and rec.naziv2 == '':
  22.         return (None, '') #ignore
  23.     if rec.naziv1 == '':
  24.         err = "%s nedostaje naziv1" % err
  25.  
  26.     rec.mjesto = ''
  27.     rec.ptt = ''
  28.     rec.adresa = ''
  29.     rec.kbr = ''
  30.     rec.kbp = ''
  31.  
  32.     # Split the combined address field by comma
  33.     address_parts = mjesto_ptt_adresa.split(',')
  34.    
  35.     rec.mjesto = address_parts[0].strip() if len(address_parts) > 0 else ''
  36.     rec.ptt = address_parts[1].strip() if len(address_parts) > 1 else ''
  37.     adresa_kbrkpo = address_parts[2].strip() if len(address_parts) > 2 else ''
  38.    
  39.     # Process the adresa_kbrkpo to split into adresa, kbr, and kbp using the provided regex pattern
  40.     m = re.match(r"(?P<adresa>.*) (?P<kbr>[0-9]+)( )?(?P<kbp>[a-zA-Z])?", adresa_kbrkpo)
  41.     if m:
  42.         rec.adresa = m.group('adresa').strip()
  43.         rec.kbr = m.group('kbr').strip()
  44.         rec.kbp = m.group('kbp').strip() if m.group('kbp') else ''
  45.     else:
  46.         rec.adresa = adresa_kbrkpo
  47.         rec.kbr = ''
  48.         rec.kbp = ''
  49.  
  50.     try:
  51.         rec.tip = {'Poslovni': 'Poslovni', 'Privatni':'Privatni', 'Obrtnici':'Obrtnici', 'N/A':'N/A'}[f[4]]
  52.     except:
  53.         pass
  54.         #err = "%s tip=%s" % (err, f[4])
  55.     if rec.tip == 'N/A' or not rec.tip:
  56.         #ako ne ime onda je poslovni
  57.         if rec.naziv2 == '' or not rec.naziv2:
  58.             rec.tip = 'Poslovni'
  59.         else:
  60.             #ako ima ime i prezime možda je firma ako je u prezimenu d.o.o. ili d.d. TODO
  61.             #ako ima space u ime ili prezime onda err
  62.             tmp = rec.naziv2.lower()
  63.             tmp1 = rec.naziv1.lower()
  64.             if tmp.find("d.d.") >= 0 or tmp.find("d.o.o.") >= 0 or tmp1.find("d.d.") >= 0 or tmp1.find("d.o.o.") >= 0:
  65.                 rec.tip = 'Poslovni'
  66.             else:
  67.                 if tmp1.find(" ") < 0 and tmp.find(" ") < 0 and tmp.find("*") < 0:
  68.                     #nema space u imeni i prezimenu
  69.                     rec.tip = 'Privatni'
  70.     if rec.tip == 'Privatni':
  71.         if not rec.naziv2 or rec.naziv2 == '*':
  72.             #privatni bez imena
  73.             m = re.match(".* (d\.d\.|d\.o\.o|obrt).*", rec.naziv1, re.I | re.U)
  74.             if m:
  75.                 rec.tip = 'Poslovni'
  76.             else:
  77.                 rec.tip = 'N/A'
  78.         elif rec.naziv2.lower() == 'd.o.o.':
  79.             rec.tip = 'Poslovni'
  80.     if rec.tip == 'N/A':
  81.         if rec.naziv1 and rec.naziv2 and rec.naziv2 != '*':
  82.             rec.tip='Privatni'
  83.     if rec.tip not in tipovi:
  84.         err = "%s tip=%s" % (err, rec.tip)
  85.     if rec.tip == 'Privatni':
  86.         if rec.naziv1:
  87.             rec.naziv1 = rec.naziv1.decode('utf-8').capitalize().encode('utf-8')
  88.             rec.naziv1 = re.sub("\(.*", "", rec.naziv1).strip()
  89.         if rec.naziv2:
  90.             rec.naziv2 = rec.naziv2.decode('utf-8').capitalize().encode('utf-8')
  91.             rec.naziv2 = re.sub("\(.*", "", rec.naziv2).strip()
  92.     else:
  93.         if rec.naziv1:
  94.             rec.naziv1 = rec.naziv1.replace("\\\"", "\"")
  95.     rec.ppb = 0 #za sada nemamo flag za reverse search
  96.     if rec.tip == 'Poslovni' or rec.tip == 'Obrtnici':
  97.         rec.ppb = 1
  98.     #pprint.pprint(rec.__dict__)
  99.     if err == ' tip=N/A':
  100.         logger.warn('ignore %s', line)
  101.         return (None, '')
  102.     return (rec, err)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement