Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import re
- from concurrent.futures.process import ProcessPoolExecutor
- from pathlib import Path
- from PyPDF2 import PdfReader
- from loguru import logger
# Directory that is scanned for COA PDF files.
COA_ROOT: Path = Path(r"D:\lgc_coa")
# CSV file that receives one row of extracted fields per PDF.
OUT_FP: Path = Path(r"D:\lgc_coa_info.csv")
def _first_group(pattern, text, group=1):
    """Return *group* of the first match of *pattern* in *text*, or None when nothing matches."""
    found = re.search(pattern, text)
    return found.group(group) if found else None


def extract_coa_info(fp: Path):
    """Extract selected fields from the first page of a COA PDF.

    Args:
        fp: Path of the PDF file to read.

    Returns:
        A dict with the keys ``cat_no``, ``coa_cat_no``, ``lot``,
        ``appearance``, ``assay``, ``date_of_ship`` and ``fname``.
        ``cat_no`` is taken from the file name (the text following ``COA_``
        up to the next underscore); ``fname`` is the file name itself; the
        remaining fields are searched for in the text of the first page.
        Any field that cannot be found is ``None``.
    """
    reader = PdfReader(fp)
    pages = reader.pages
    # A PDF without pages (or a first page without extractable text) is
    # treated as empty text, so the caller still gets a row of None fields
    # instead of an IndexError that would abort the whole batch.
    text = (pages[0].extract_text() or "") if len(pages) else ""

    # Two layouts are handled: both labels first and then both values, or
    # label/value pairs. When both patterns match, the second one wins.
    coa_cat_no = lot = None
    layouts = (
        r'Product code\s+Lot number\s+(?P<cat_no>\S+)\s+(?P<lot>\S+)',
        r'Product code\s+(?P<cat_no>\S+)\s+Lot number\s+(?P<lot>\S+)',
    )
    for layout in layouts:
        found = re.search(layout, text)
        if found:
            coa_cat_no, lot = found['cat_no'], found['lot']

    info = {
        "cat_no": _first_group(r"(?<=COA_)[^_]+", fp.name, group=0),
        "coa_cat_no": coa_cat_no,
        "lot": lot,
        # The keywords are grouped with (?:...) so that the captured text is
        # the line up to and including the keyword for every alternative.
        # Ungrouped, "|" split the whole pattern and "liquid"/"oil" lines
        # yielded only the bare keyword. No trailing text is required, so a
        # line that ends with the keyword is matched as well.
        "appearance": _first_group(r"([^\n]*(?:solid|liquid|oil))", text),
        "assay": _first_group(r'Assay\s?\w? [“"]as is[”"]\s+(\d+\.\d+\s?%)', text),
        "date_of_ship": _first_group(r'Date of shipment:\s+(\d+\s+\w+\s+\d+)', text),
        "fname": fp.name,
    }

    if info['assay'] is None or lot is None:
        logger.debug(f"assay or lot is None, ({lot}, {info['assay']})")
    if info['appearance'] is None:
        logger.debug("appearance is None")
    return info
def main(dir_path: Path, max_workers: int = 4):
    """Extract COA info from every PDF directly inside *dir_path* and write it to ``OUT_FP``.

    Args:
        dir_path: Directory whose ``*.pdf`` files (non-recursive) are processed.
        max_workers: Number of worker processes used for the extraction.
    """
    logger.info('starting')
    # Path.glob yields entries in arbitrary order; sorting makes the order of
    # the output rows reproducible between runs.
    pdfs = sorted(dir_path.glob('*.pdf'))
    headers = ('cat_no', 'coa_cat_no', 'lot', 'appearance', 'assay', 'date_of_ship', 'fname')
    with OUT_FP.open('w', newline='', encoding='utf-8') as f, \
            ProcessPoolExecutor(max_workers=max_workers) as pool:
        csv_writer = csv.DictWriter(f, fieldnames=headers)
        # Write the column names first; without this row the CSV columns
        # are unlabelled.
        csv_writer.writeheader()
        # pool.map yields results in input order, so rows follow the sorted
        # file order.
        csv_writer.writerows(pool.map(extract_coa_info, pdfs))
    logger.info('finished')
# Script entry point: process the PDFs under COA_ROOT with a single worker.
if __name__ == "__main__":
    main(COA_ROOT, max_workers=1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement