Advertisement
xosski

Data exfiltration/API injection

Dec 4th, 2024
11
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.08 KB | None | 0 0
  # HVMC SIC export: request company data for a list of SIC codes, filter it, and save the company numbers.
import json
import os
import re
import shutil
import zipfile
from pathlib import Path

import pandas as pd
import requests

# (connect, read) timeouts in seconds for both HTTP calls, so a stalled
# connection fails instead of hanging forever. The read timeout is generous
# because the request asks for up to 10,000,000 rows.
REQUEST_TIMEOUT = (10, 600)

# Size of each chunk written to disk while streaming the ZIP download.
DOWNLOAD_CHUNK_BYTES = 1024 * 1024

# NOTE(review): `server_download_url`, `current_month`, `current_year` and
# `keywords` are used below but are not defined anywhere in the visible
# script. They must be supplied (presumably by the elided sections marked
# with `...`) before this can run - confirm.

# Paths
user_folder = Path.home()
data_folder = (
    user_folder
    / "The Data City"
    / "The Data City - Documents"
    / "Projects"
    / "HVMC"
    / "Data"
    / "Directory"
    / "CIC"
)

# SIC Code Reading
with open(data_folder / "SICCodesOfInterest.txt", "r") as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file) so
    # that empty strings are never sent to the server as SIC codes.
    sic_codes = [code for code in (line.strip() for line in f) if code]

# Handle Existing Files/Folders: remove the output of any previous run so the
# fresh download is never mixed with stale data.
sic_zip = data_folder / "SICData.zip"
if sic_zip.exists():
    print("Deleting existing SICData.zip...")
    sic_zip.unlink()

sic_data_folder = data_folder / "SICData"
if sic_data_folder.exists():
    print("Deleting existing SICData folder...")
    shutil.rmtree(sic_data_folder)

# Make Request to Server
json_to_send = json.dumps(
    {
        "SelectedSICs": sic_codes,
        "ReturnCount": 10000000,
        "DownloadFormat": "csv",
        "PreFilter": {
            "OnlyCompaniesWithWebsites": True,
            "OnlyManufacturingCompanies": True,
        },
    }
)
print("Requesting data from server...")
req = requests.post(server_download_url, data=json_to_send, timeout=REQUEST_TIMEOUT)
# Fail here with a clear HTTP error rather than later with a JSON/KeyError.
req.raise_for_status()
r_json = req.json()
download_url = r_json["Download_URL"]

# Download ZIP: stream it to disk in chunks so the whole archive is never
# held in memory at once.
with requests.get(
    f"https://server{current_month}{current_year[2:]}.thedatacity.com" + download_url,
    stream=True,
    timeout=REQUEST_TIMEOUT,
) as r:
    # Do not write an HTTP error page to disk as if it were the archive.
    r.raise_for_status()
    with open(sic_zip, "wb") as f:
        for chunk in r.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
            f.write(chunk)

# Extract ZIP
with zipfile.ZipFile(sic_zip, "r") as zip_ref:
    zip_ref.extractall(sic_data_folder)

# Process Data: only the three columns used below are loaded, all as strings.
df = pd.read_csv(
    sic_data_folder / "companieslist_financialsColumnLayout_.csv",
    usecols=["Companynumber", "Description", "SICs"],
    dtype={"Companynumber": str, "Description": str, "SICs": str},
)

# Keywords Processing (as you already did)
...

# Remove Companies based on SIC and Keywords: drop a row only when its SICs
# text contains "70100" AND its description matches one of the keywords.
# na=False on both masks treats a missing value as "no match".
# NOTE(review): the keywords are joined into a regular expression unescaped,
# and an empty keywords list gives an empty pattern that matches every
# non-missing description - confirm that is intended.
has_sic_70100 = df["SICs"].str.contains("70100", regex=False, na=False)
matches_keyword = df["Description"].str.contains("|".join(keywords), na=False)
df = df[~(has_sic_70100 & matches_keyword)]

# Collect Company Numbers: rows with a missing company number are skipped,
# since a missing value is not a string and cannot be written out below.
all_company_numbers = df["Companynumber"].dropna().tolist()

# RTIC Data (continue as before)
...

# Write Final Output: one company number per line.
with open(data_folder / "HVMC_AllCompanyNumbers.txt", "w") as f:
    f.writelines(f"{number}\n" for number in all_company_numbers)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement