YanivHaliwa
Untitled
Dec 17th, 2024
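
# A small llama_index "learn and ask" tool: it converts PDF and Excel files under
# ./data to plain text, optionally scrapes a URL into ./data, builds a persistent
# GPTVectorStoreIndex under ./storage, and then answers questions about the
# indexed content in an interactive loop.
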
import sys
import argparse
from llama_index.core import (
    Document,
    GPTVectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.vector_stores import SimpleVectorStore
import requests
from bs4 import BeautifulSoup
import re
import shutil
from datetime import datetime
import pandas as pd
import PyPDF2
import os

import warnings
warnings.filterwarnings("ignore", message='Field ".*" has conflict with protected namespace ".*"')
warnings.filterwarnings("ignore")
import logging
logging.getLogger("filelock").setLevel(logging.ERROR)

data_dir = "./data"
not_learn = os.path.join(data_dir, 'not_learning')

def extract_text_from_pdf(file):
    try:
        # Check that the file is located in the data directory
        if os.path.isfile(os.path.join(data_dir, file)) and file.endswith(".pdf"):
            # Open the PDF file in read-binary mode
            with open(os.path.join(data_dir, file), "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)

                text = ""

                # Extract text from each page of the PDF
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    text += page_text if page_text else ""

                # Save the extracted text into a text file
                txt_name = file.replace(".pdf", ".txt")
                with open(os.path.join(data_dir, txt_name), "w") as txt_file:
                    txt_file.write(text)

            # Move the original PDF to the "not_learning" folder so it is not re-indexed
            os.rename(os.path.join(data_dir, file), os.path.join(not_learn, file))
    except Exception as e:
        print(f"Failed to process {file}: {e}")


def extract_text_from_excel(file):
    # Convert the Excel file to CSV. Let pandas pick the reader engine by
    # extension: openpyxl handles .xlsx, while .xls needs xlrd installed.
    df = pd.read_excel(os.path.join(data_dir, file))
    csv_file = os.path.splitext(file)[0] + ".csv"
    df.to_csv(os.path.join(data_dir, csv_file), index=False)

    # Move the original Excel file to the "not_learning" folder so it is not re-indexed
    os.rename(os.path.join(data_dir, file), os.path.join(not_learn, file))


def extract_text_from_url(url, output_dir="./data"):
    response = requests.get(url)

    # Raise an exception if the request was unsuccessful
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Get the text
    text = soup.get_text()

    # Break into lines and strip leading/trailing whitespace
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # Get the page title (falling back if the page has none) and make it filename-safe
    title = soup.title.string if soup.title and soup.title.string else "untitled"
    title = re.sub(r'[\W_]+', '_', title)
    # Append a timestamp for uniqueness
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    filename = f"{title}_{timestamp}.txt"

    # Save the text into a file
    with open(os.path.join(output_dir, filename), 'w') as f:
        f.write(text)


class AddingDataToGPT:
    def __init__(self, retrain=False):
        self.index = None
        self.persist_dir = "./storage"
        self.data_dir = data_dir
        # Exit early if there is nothing to learn from (ignoring .gitignore and subdirectories)
        if not any(fname for fname in os.listdir(data_dir) if fname != '.gitignore' and not os.path.isdir(
                os.path.join(data_dir, fname))):
            print("No files to learn from in the 'data' directory. Exiting the program.")
            sys.exit()
        if os.path.exists(self.persist_dir) and not retrain:
            self.read_from_storage()
        else:
            self.build_storage()
        self.query_engine = self.index.as_query_engine()

    def add_conversation_to_storage(self, question, response):
        # from_documents expects Document objects, not plain dicts
        documents = [Document(text=f"Question: {question}\nResponse: {response}")]

        self.index = GPTVectorStoreIndex.from_documents(documents)
        self.index.storage_context.persist()

    def build_storage(self):
        print("please wait...")
        documents = []
        if not os.path.exists(not_learn):
            os.makedirs(not_learn)

        for filename in os.listdir(data_dir):
            if os.path.isfile(os.path.join(data_dir, filename)):
                if filename == ".gitignore" or filename.startswith(".~lock"):
                    continue
                elif filename.endswith((".xls", ".xlsx")):
                    extract_text_from_excel(filename)
                    print("Learned:", filename)
                elif filename.endswith(".pdf"):
                    extract_text_from_pdf(filename)
                    print("Learned:", filename)
                elif filename.endswith((".txt", ".html", ".md")):
                    print("Learned:", filename)
                else:
                    shutil.move(os.path.join(data_dir, filename), os.path.join(data_dir, "not_learning"))
                    print(f"Warning: Ignoring file \033[91m{filename}\033[0m because it does not have a supported file extension.")

        document = SimpleDirectoryReader(self.data_dir).load_data()
        documents.extend(document)
        self.index = GPTVectorStoreIndex.from_documents(documents)
        self.index.storage_context.persist()

    @staticmethod
    def initialize_default_vector_store():
        # Use llama_index's in-memory SimpleVectorStore as the default
        # (this library default is an assumption; swap in another store if needed)
        return SimpleVectorStore()

    def read_from_storage(self):
        storage_context = StorageContext.from_defaults(persist_dir=self.persist_dir)
        if 'default' not in storage_context.vector_stores:
            # Call the static method using the class name
            storage_context.vector_stores['default'] = AddingDataToGPT.initialize_default_vector_store()
        self.index = load_index_from_storage(storage_context)

    def run_conversation(self):
        while True:
            question = input("Enter your question (or 'exit' to quit): ")
            if question.lower() == "exit":
                break

            if question.lower() == "learn!":
                self.build_storage()
                self.read_from_storage()
                print("learning done")
            else:
                self.query_engine = self.index.as_query_engine()
                response = self.query_engine.query(question)
                print(f"\033[1;32m{response}\033[0m")

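
# Programmatic use (a sketch, using only the names defined above): build or load
# the index once, then query it directly instead of via run_conversation().
#
#   bot = AddingDataToGPT(retrain=False)
#   print(bot.query_engine.query("What do my notes say about X?"))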


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--train", action="store_true", help="Retrain the model")
    parser.add_argument("-u", "--url", type=str, help="URL to extract text from")
    return parser.parse_args()


def main():
    args = parse_arguments()

    if args.url:
        # Scraping a new page into ./data implies the index must be rebuilt
        extract_text_from_url(args.url)
        args.train = True

    if args.train:
        adding_data = AddingDataToGPT(retrain=True)
    else:
        adding_data = AddingDataToGPT()
    adding_data.run_conversation()


if __name__ == "__main__":
    main()
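
# Example CLI usage (a sketch; the filename ask_gpt.py is assumed, and the
# llama-index, PyPDF2, pandas, requests, and beautifulsoup4 packages must be installed):
#   python ask_gpt.py                          # chat against the existing ./storage index
#   python ask_gpt.py -t                       # rebuild the index from ./data first
#   python ask_gpt.py -u https://example.com   # scrape a page into ./data, then retrain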