import bs4
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredExcelLoader,
    WebBaseLoader,
)
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings  # used by the commented-out embedding alternatives below
from dotenv import load_dotenv
import os
import sys
import uuid


def from_web(url):
    """Load a web page, keeping only the post body, title, and header elements."""
    # These class names match a specific blog layout; other sites will need
    # different selectors.
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
        ),
    )
    return loader.load()


def from_excel(file_address):
    """Load a single .xlsx file, or every .xlsx file in a directory."""
    if file_address.endswith(".xlsx"):
        loader = UnstructuredExcelLoader(file_path=file_address)
        return loader.load()
    docs = []
    for file_name in os.listdir(file_address):
        file_path = os.path.join(file_address, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".xlsx"):
            # Load each Excel file. (The original passed the directory path,
            # file_address, here instead of the file itself.)
            loader = UnstructuredExcelLoader(file_path=file_path)
            docs.extend(loader.load())
    return docs


def from_csv(file_address):
    """Load a single .csv file."""
    docs = []
    if file_address.endswith(".csv"):
        loader = CSVLoader(file_path=file_address, encoding="utf-8")
        docs = loader.load()
    return docs


def from_pdf(file_address):
    """Load a single .pdf file."""
    loader = PyPDFLoader(file_path=file_address)
    return loader.load()


def from_text_files(file_address):
    """Load every .txt file in a directory."""
    docs = []
    for file_name in os.listdir(file_address):
        file_path = os.path.join(file_address, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".txt"):
            loader = TextLoader(file_path)
            docs.extend(loader.load())
    return docs


def retriever_from_docs(docs):
    """Split documents into chunks, embed them, and persist them in a Chroma store."""
    if not docs:
        print("No documents to process.")
        return

    # Split the documents into smaller, overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    print(f"Number of document chunks: {len(splits)}")

    # Embedding model for the document chunks. Chroma invokes this itself when
    # documents are added, so the embeddings do not need to be precomputed.
    embeddings = OpenAIEmbeddings()  # 1536-dimensional embeddings
    #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # 384 dimensions
    #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")  # 768 dimensions
    #embeddings = HuggingFaceEmbeddings(model_name="bert-large-uncased")  # 1024 dimensions

    # Generate a unique ID for each document chunk.
    doc_ids = [str(uuid.uuid4()) for _ in range(len(splits))]
    print(f"Number of IDs generated: {len(doc_ids)}")

    # Create the Chroma vector store, or append to it if it already exists.
    # (The original both created a new store from the chunks and then added the
    # same chunks again to the loaded store, inserting every chunk twice.)
    persist_directory = "../../chroma_db"
    if os.path.exists(persist_directory):
        # Load the existing vector store and add the new chunks to it.
        chroma_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        chroma_store.add_documents(splits, ids=doc_ids)
    else:
        print(f"{persist_directory} does not exist")
        # Create a new vector store from the chunks.
        chroma_store = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            ids=doc_ids,
            persist_directory=persist_directory,
        )

    print("Embeddings are added to vector store.")
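
# A minimal retrieval sketch, separate from the ingestion flow above: once the
# store has been persisted, it can be reopened and queried. The function name,
# default k, and the idea of a standalone helper are illustrative assumptions,
# not part of the original script.
def query_store(question, k=4):
    """Return the k chunks most similar to the question from the persisted store."""
    store = Chroma(
        persist_directory="../../chroma_db",
        embedding_function=OpenAIEmbeddings(),  # must match the model used at ingestion
    )
    return store.similarity_search(question, k=k)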

def main():
    print(sys.argv)
    load_dotenv()

    if len(sys.argv) < 2:
        print("Usage: python <script> <url | file | directory>")
        return

    # The original hard-coded file_address here and checked it for existence,
    # but then dispatched on sys.argv[1]; the command-line argument is now used
    # consistently. Sample inputs from development:
    #file_address = "../../../db_28_2_text/db_28_2_text/"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/"
    #file_address = "../../../International Job Dataset/allJobs.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Technology Skills.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Tools Used.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Alternate Titles.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Emerging Tasks.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Job Zone Reference.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Job Zones.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Occupation Data.xlsx"
    #file_address = "../../../db_28_2_excel/db_28_2_excel/Related Occupations.xlsx"
    file_address = sys.argv[1]

    # URLs are fetched directly; anything else must exist on disk.
    if 'http' not in file_address.lower() and not os.path.exists(file_address):
        print("File address does not exist.")
        return

    # Determine the input type and load the documents accordingly.
    if 'http' in file_address.lower():
        retriever_from_docs(from_web(file_address))
    elif '.xls' in file_address.lower():
        retriever_from_docs(from_excel(file_address))
    elif '.csv' in file_address.lower():
        retriever_from_docs(from_csv(file_address))
    elif '.pdf' in file_address.lower():
        retriever_from_docs(from_pdf(file_address))
    elif '.txt' in file_address.lower():
        retriever_from_docs(from_text_files(file_address))
    elif 'excel' in file_address.lower():
        # Directory whose name contains "excel": load every .xlsx file in it.
        retriever_from_docs(from_excel(file_address))
    elif 'text' in file_address.lower():
        # Directory whose name contains "text": load every .txt file in it.
        retriever_from_docs(from_text_files(file_address))
    else:
        print(f"Unsupported source: {file_address}")


if __name__ == "__main__":
    main()
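
# Example invocations (the script name is illustrative; quote paths that
# contain spaces):
#   python load_to_chroma.py https://example.com/blog/some-post
#   python load_to_chroma.py "../../../International Job Dataset/allJobs.xlsx"
#   python load_to_chroma.py ../../../db_28_2_text/db_28_2_text/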