Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def _to_native_str(text):
    """Return *text* as the interpreter's native ``str`` type.

    The parser works on native strings throughout so that the ``str``
    literals used for splitting, comparing and formatting are always of a
    compatible type.

    * native ``str`` (byte string on Python 2, text on Python 3) is
      returned unchanged;
    * Python 3 ``bytes`` is decoded as UTF-8;
    * anything else (Python 2 ``unicode``) is encoded to UTF-8, which is
      what the original ``line.encode('utf-8')`` produced.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text.encode('utf-8')


def _capitalize_name(value):
    """Capitalize a name and drop everything from the first "(" onwards."""
    if isinstance(value, bytes):
        # Byte string (native str on Python 2): capitalize the decoded text
        # so non-ASCII letters are handled, then go back to UTF-8 bytes.
        value = value.decode('utf-8').capitalize().encode('utf-8')
    else:
        value = value.capitalize()
    return re.sub(r"\(.*", "", value).strip()


def _split_address(combined):
    """Split the combined address field into its five components.

    *combined* is expected to look like ``"mjesto, ptt, adresa kbr[ ][kbp]"``.
    Returns ``(mjesto, ptt, adresa, kbr, kbp)``; parts that are missing come
    back as empty strings.
    """
    # NOTE(review): comma-separated parts beyond the third are discarded,
    # exactly as in the original implementation.
    parts = [part.strip() for part in combined.split(',')]
    mjesto = parts[0]
    ptt = parts[1] if len(parts) > 1 else ''
    street = parts[2] if len(parts) > 2 else ''

    # Street name, then a house number, optionally followed by one letter.
    m = re.match(
        r"(?P<adresa>.*) (?P<kbr>[0-9]+)( )?(?P<kbp>[a-zA-Z])?", street)
    if not m:
        # No house number found: keep the whole text as the street.
        return mjesto, ptt, street, '', ''
    return (mjesto, ptt, m.group('adresa').strip(), m.group('kbr'),
            m.group('kbp') or '')


def _infer_tip(tip, naziv1, naziv2):
    """Derive the record type from the declared type and the two names.

    Returns the (possibly unchanged) type string. The three steps run in
    order and each one sees the result of the previous one.
    """
    # Step 1: no usable declared type -> guess from the names.
    if tip == 'N/A' or not tip:
        if not naziv2:
            # No second name at all: treat as a business.
            tip = 'Poslovni'
        else:
            low2 = naziv2.lower()
            low1 = naziv1.lower()
            # Either name carrying a company suffix marks a business.
            if ("d.d." in low2 or "d.o.o." in low2
                    or "d.d." in low1 or "d.o.o." in low1):
                tip = 'Poslovni'
            elif " " not in low1 and " " not in low2 and "*" not in low2:
                # Neither name contains a space: looks like a private person.
                tip = 'Privatni'

    # Step 2: a "private" record without a real second name is suspicious.
    if tip == 'Privatni':
        if not naziv2 or naziv2 == '*':
            # Private record without a name: business if the first name
            # contains a company/craft marker, otherwise unknown.
            if re.match(r".* (d\.d\.|d\.o\.o|obrt).*", naziv1, re.I | re.U):
                tip = 'Poslovni'
            else:
                tip = 'N/A'
        elif naziv2.lower() == 'd.o.o.':
            tip = 'Poslovni'

    # Step 3: still unknown but both names are present -> private.
    if tip == 'N/A' and naziv1 and naziv2 and naziv2 != '*':
        tip = 'Privatni'
    return tip


def telemachmob_parse(line):
    """Parse one semicolon-separated line into a ``Record``.

    Fields used, by position: 0 = phone number (``tel``), 1 and 2 = the two
    name fields (``naziv1``, ``naziv2``), 3 = combined address
    (``"mjesto, ptt, adresa kbr[kbp]"``), 4 = type (``tip``), 8 = active
    flag.

    Returns a tuple ``(rec, err)``:

    * ``(None, '')`` when the line is skipped: it has fewer than 9 fields,
      field 8 is not exactly ``'1'``, both name fields are empty, or the
      only problem found is an unaccepted type ``'N/A'``;
    * ``(rec, err)`` otherwise, where ``err`` is ``''`` or a space-joined
      list of problems (``tel=...``, ``nedostaje naziv1``, ``tip=...``).

    ``Record``, ``tipovi`` (the accepted type values) and ``logger`` are
    expected to be defined at module level outside this block.
    """
    line = _to_native_str(line)
    f = line.split(';')

    if len(f) <= 8:
        # Too short to contain the active flag. The original raised
        # IndexError here; skip the line instead, but leave a trace.
        logger.warning('ignore %s', line)
        return (None, '')
    if f[8] != '1':
        # Not flagged as active.
        return (None, '')

    err = ''
    rec = Record()

    rec.tel = f[0].strip()
    # NOTE(review): re.match only anchors at the start, so a value that
    # merely begins with a digit passes. Kept as in the original; confirm
    # whether a full match was intended.
    if not re.match(r"\d+", rec.tel):
        err = "tel=%s" % rec.tel

    rec.naziv1 = f[1].strip()
    rec.naziv2 = f[2].strip()
    if not rec.naziv1 and not rec.naziv2:
        # Empty record, only the phone number is present: ignore.
        return (None, '')
    if rec.naziv1 == '':
        err = "%s nedostaje naziv1" % err

    (rec.mjesto, rec.ptt, rec.adresa,
     rec.kbr, rec.kbp) = _split_address(f[3].strip())

    rec.tip = _infer_tip(f[4].strip(), rec.naziv1, rec.naziv2)
    if rec.tip not in tipovi:
        err = "%s tip=%s" % (err, rec.tip)

    if rec.tip == 'Privatni':
        if rec.naziv1:
            rec.naziv1 = _capitalize_name(rec.naziv1)
        if rec.naziv2:
            rec.naziv2 = _capitalize_name(rec.naziv2)
    elif rec.naziv1:
        # Un-escape backslash-escaped double quotes in business names.
        rec.naziv1 = rec.naziv1.replace('\\"', '"')

    # For now there is no dedicated flag for reverse search; derive it
    # from the type instead.
    rec.ppb = 1 if rec.tip in ('Poslovni', 'Obrtnici') else 0

    if err == ' tip=N/A':
        # The unknown type is the only problem: drop the record.
        logger.warning('ignore %s', line)
        return (None, '')
    return (rec, err)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement