xosski

Data extraction to external server

Dec 4th, 2024
6
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.20 KB | None | 0 0
import json
import os
import re
import shutil
import subprocess
import zipfile
from datetime import datetime
from pathlib import Path

import openpyxl
import pandas as pd
import requests
  13.  
  14. # make a pathlib folder to the user
  15. user_folder = Path.home()
  16.  
  17. # set path to data folder: \The Data City\The Data City - Documents\Projects\HVMC\Data\CIC
  18. if not os.name == 'posix':
  19. data_folder = user_folder / "The Data City" / "The Data City - Documents" / "Projects" / "HVMC" / "Data" / "Directory" / "CIC"
  20. else:
  21. data_folder = user_folder / "Library" / "CloudStorage" / "OneDrive-SharedLibraries-TheDataCity" / "The Data City - Documents" / "Projects" / "HVMC" / "Directory" / "Data" / "CIC"
  22.  
  23. # Read in all SIC codes
  24. with open(data_folder / "SICCodesOfInterest.txt", "r") as f:
  25. sic_codes = [line.strip() for line in f]
  26.  
  27. # if SICData.zip already exists then delete it and delete the SICData folder
  28. if (data_folder / "SICData.zip").exists():
  29. print("SICData.zip already exists, deleting...")
  30. (data_folder / "SICData.zip").unlink()
  31. print("SICData.zip deleted...")
  32.  
  33. if (data_folder / "SICData").exists():
  34. print("SICData folder already exists, deleting...")
  35. shutil.rmtree(data_folder / "SICData")
  36. print("SICData folder deleted...")
  37.  
# Request a CSV export of every company matching the selected SIC codes,
# then download the resulting zip archive.
# NOTE(review): server_download_url, current_month and current_year are not
# defined anywhere in this file — as written this section raises NameError.
# They presumably held the API endpoint and the components of the monthly
# server hostname (e.g. "server1224" for Dec 2024, given current_year[2:]
# takes the last two digits of a 4-char year string) — confirm and restore.
# serialise the json
json_to_send = json.dumps({"SelectedSICs": sic_codes, "ReturnCount": 10000000, "DownloadFormat": "csv", "PreFilter": {"OnlyCompaniesWithWebsites": True, "OnlyManufacturingCompanies": True}})
print("Making request to server...")
req = requests.post(server_download_url, data=json_to_send)
print("Request made to server...")
r_json = req.json()
# The server responds with a relative download path for the prepared export.
download_url = r_json["Download_URL"]
# download the data
print("Downloading data...")
r = requests.get(f"https://server{current_month}{current_year[2:]}.thedatacity.com" + download_url)
print("Data downloaded...")
# write the downloaded zip archive to disk (binary mode)
with open(data_folder / "SICData.zip", "wb") as f:
    f.write(r.content)
  53.  
  54. # extract the zip file and output to a folder called SICData
  55. print("Extracting data...")
  56. file_to_get = str(data_folder / "SICData.zip")
  57. # patoolib.extract_archive(str(data_folder / "SICData.zip"), outdir=str(data_folder / "SICData"))
  58. # result = subprocess.run(['unar', '-o', 'SICData', file_to_get], capture_output=True, text=True)
  59. with zipfile.ZipFile(data_folder / "SICData.zip", 'r') as zip_ref:
  60. zip_ref.extractall(data_folder / "SICData")
  61. print("Data extracted...")
  62.  
  63. df = pd.read_csv(data_folder / "SICData" / "companieslist_financialsColumnLayout_.csv",
  64. usecols=["Companynumber", "Description", "SICs"], dtype={"Companynumber": str, "Description": str,
  65. "SICs": str})
  66.  
  67. # read in excel file
  68. df2 = pd.read_excel(data_folder / "70100 list with keywords to remove.xlsx",
  69. sheet_name="Sheet1", engine="openpyxl", header=1)
  70. keywords = df2["Description"].tolist()
  71. keywords = [str(kw) for kw in keywords]
  72. # if the keyword ends in a * then remove the * and replace it in the list
  73. keywords = [kw.replace("*", "") if kw.endswith("*") else kw for kw in keywords]
  74. # strip the keywords of whitespace
  75. keywords = [kw.strip() for kw in keywords]
  76. # remove any words inside brackets and the contents of the brackets
  77. keywords = [re.sub(r'\s*\(.*?\)\s*', '', word) for word in keywords]
  78. # if 70100 in SICs and description contains any of the keywords then remove, ignore if the description is nan
  79. df = df[~((df["SICs"].str.contains("70100")) & (df["Description"].str.contains("|".join(keywords), na=False)))]
  80.  
  81. all_company_numbers = df.Companynumber.tolist()
  82.  
  83. # Read in all RTIC codes
  84. with open(data_folder / "RTICCodesOfInterest.txt", "r") as f:
  85. rtic_codes = [line.strip() for line in f]
  86.  
  87. print("Getting RTIC data...")
  88. r = requests.get("https://products.thedatacity.com/v2/rtics/add_companies_to_rtics.php")
  89. r_json = r.json()
  90. print("RTIC data retrieved...")
  91.  
  92. filtered_result = [d for d in r_json if d['code'] in rtic_codes]
  93.  
  94. all_rtic_company_numbers = []
  95. for rtic in filtered_result:
  96. for vertical in rtic["verticals"]:
  97. all_rtic_company_numbers += vertical["companies"]
  98.  
  99. all_rtic_company_numbers = list(set(all_rtic_company_numbers))
  100.  
  101. all_company_numbers = all_company_numbers + all_rtic_company_numbers
  102.  
  103. all_company_numbers = list(set(all_company_numbers))
  104.  
  105. # Read in CompaniesToRemove.txt and zfill
  106. with open(data_folder / "CompaniesToRemove.txt", "r") as f:
  107. companies_to_remove = [line.strip().zfill(8) for line in f]
  108.  
  109. companies_to_remove_set = set(companies_to_remove)
  110.  
  111. # Filter out unwanted company numbers in one go
  112. all_company_numbers = [number for number in all_company_numbers if number not in companies_to_remove_set]
  113.  
  114. # add company numbers
  115. with open(data_folder / "CompaniestoAdd.txt", "r") as f:
  116. companies_to_add = [line.strip() for line in f]
  117.  
  118. all_company_numbers += companies_to_add
  119.  
  120. with open(data_folder / "HVMC_AllCompanyNumbers.txt",
  121. "w") as f:
  122. for number in all_company_numbers:
  123. f.write(number + "\n")
  124.  
  125. with open("HVMC_AllCompanyNumbers.txt", "w") as f:
  126. for number in all_company_numbers:
  127. f.write(number + "\n")
  128.  
Add Comment
Please, Sign In to add comment