Advertisement
Pandaaaa906

Untitled

May 4th, 2023
806
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.14 KB | None | 0 0
  1. import csv
  2. import re
  3. from concurrent.futures.process import ProcessPoolExecutor
  4. from pathlib import Path
  5.  
  6. from PyPDF2 import PdfReader
  7. from loguru import logger
  8.  
# Directory whose top-level *.pdf files are processed (passed to main() at script entry).
COA_ROOT = Path(r"D:\lgc_coa")
# CSV file that main() overwrites with one row per processed PDF.
OUT_FP = Path(r"D:\lgc_coa_info.csv")
  11.  
  12.  
  13. def extract_coa_info(fp: Path):
  14.     reader = PdfReader(fp)
  15.     page = reader.pages[0]
  16.  
  17.     text = page.extract_text()
  18.  
  19.     coa_cat_no = lot = None
  20.     d = (m := re.search(r'Product code\s+Lot number\s+(?P<cat_no>\S+)\s+(?P<lot>\S+)', text)) and m.groupdict()
  21.     d2 = (m := re.search(r'Product code\s+(?P<cat_no>\S+)\s+Lot number\s+(?P<lot>\S+)', text)) and m.groupdict()
  22.     if d:
  23.         coa_cat_no, lot = d['cat_no'], d['lot']
  24.     if d2:
  25.         coa_cat_no, lot = d2['cat_no'], d2['lot']
  26.     ret = {
  27.         "cat_no": (m := re.search(r"(?<=COA_)[^_]+", fp.name)) and m.group(),
  28.         "coa_cat_no": coa_cat_no,
  29.         "lot": lot,
  30.         # "appearance": (m := re.search(r"Appearance\s+\n([^\n]+\n){6}([^\n]+)", text)) and m.group(2).strip(),
  31.         "appearance": (m := re.search(r"([^\n]+(solid)|(liquid)|(oil))[^\n]+", text)) and m.group(1),
  32.         "assay": (m := re.search(r'Assay\s?\w? [“"]as is[”"]\s+(\d+\.\d+\s?%)', text)) and m.group(1),
  33.         "date_of_ship": (m := re.search(r'Date of shipment:\s+(\d+\s+\w+\s+\d+)', text)) and m.group(1),
  34.         "fname": fp.name
  35.     }
  36.     if ret['assay'] is None or lot is None:
  37.         logger.debug(f"assay or lot is None, ({lot}, {ret['assay']})")
  38.         pass
  39.     if ret['appearance'] is None:
  40.         logger.debug(f"appearance is None")
  41.     return ret
  42.  
  43.  
  44. def main(dir_path: Path, max_workers: int=4):
  45.     logger.info('starting')
  46.     pdfs = dir_path.glob('./*.pdf')
  47.     headers = ('cat_no', 'coa_cat_no', 'lot', 'appearance', 'assay', 'date_of_ship', 'fname')
  48.     with OUT_FP.open('w', newline='', encoding='u8') as f, ProcessPoolExecutor(max_workers=max_workers) as pool:
  49.         csv_writer = csv.DictWriter(f, fieldnames=headers)
  50.         result = pool.map(extract_coa_info, pdfs)
  51.         for ret in result:
  52.             csv_writer.writerow(ret)
  53.     logger.info('finished')
  54.  
  55.  
  56. if __name__ == '__main__':
  57.     main(COA_ROOT, max_workers=1)
  58.     pass
  59.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement