import os

print(os.getenv('KEY'))
KEY = os.getenv('KEY')
os.environ['HF_TOKEN'] = KEY
os.environ['HUGGINGFACEHUB_API_TOKEN'] = KEY

# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from langchain import HuggingFaceHub
# from langchain.vectorstores import Chroma
# from langchain.chains import ConversationalRetrievalChain
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.docstore.document import Document
# import pandas as pd

# # Load the CSV file
# df = pd.read_csv("web_data.csv")

# # Load the HTML and TS files
# with open("reports.component.html", "r", encoding="utf-8") as f:
#     reports_component_html = f.read()
# with open("reports.module.ts", "r", encoding="utf-8") as f:
#     reports_module_ts = f.read()

# # Create the embeddings
# embeddings = HuggingFaceEmbeddings()
# print(embeddings)

# # Combine questions, answers, and file contents into a list of strings
# texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
# texts.append(f"File: reports.component.html\nContent:\n{reports_component_html}")
# texts.append(f"File: reports.module.ts\nContent:\n{reports_module_ts}")

# # Split the texts into chunks
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# docs = []
# for text in texts:
#     chunks = text_splitter.split_text(text)
#     for chunk in chunks:
#         doc = Document(page_content=chunk, metadata={})
#         docs.append(doc)

# # Create the vector store
# db = Chroma.from_documents(docs, embeddings)

# # Load the language model
# model = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})
# # model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B", model_kwargs={"temperature": 0.7, "max_length": 512})
# # model = HuggingFaceHub(repo_id="mlabonne/AlphaMonarch-7B", model_kwargs={"temperature": 0.7, "max_length": 512})

# # Create the conversational retrieval chain
# qa = ConversationalRetrievalChain.from_llm(model, db.as_retriever())

# query = '''what all is present in reports module '''
# result = qa({"question": query, "chat_history": []})
# print(result['answer'])

# def get_helpful_answer(context, query):
#     import re
#     pattern = re.compile(r"Helpful Answer:\s*(.*?)(?:Question:|\Z)", re.DOTALL)
#     match = pattern.search(context)
#     if match:
#         return match.group(1).strip()
#     else:
#         return "No helpful answer found."
# # print the helpful answer
# print(get_helpful_answer(result['answer'], query))


# CLAUDE IMPROVEMENT TRY
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.schema import Document

# Load and process data
df = pd.read_csv("web_data.csv")
with open("accounting.component.html", "r", encoding="utf-8") as f:
    accounting_component_html = f.read()
with open("accounting.component.ts", "r", encoding="utf-8") as f:
    accounting_component_ts = f.read()

# Improved text processing
texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
texts.append(f"File: accounting.component.html\nContent:\n{accounting_component_html}")
texts.append(f"File: accounting.component.ts\nContent:\n{accounting_component_ts}")

# More granular text splitting
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = [
    Document(page_content=chunk, metadata={})
    for text in texts
    for chunk in text_splitter.split_text(text)
]

# Create embeddings and vector store
# (HuggingFaceEmbeddings loads a sentence-transformers model; a chat LLM repo such as
# meta-llama/Meta-Llama-3-8B-Instruct is not an embedding model and is not usable here)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = Chroma.from_documents(docs, embeddings)

# Improved language model configuration
model = HuggingFaceHub(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={"temperature": 0.3, "max_length": 512, "top_p": 0.95}
)

# Enhanced prompt template
prompt_template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, say "I don't have enough information to answer this question accurately."
Aim to provide a concise yet informative answer within 500 characters.

Context: {context}

Question: {question}

Confident and Accurate Answer:
"""

# Updated chains
combine_docs_chain = StuffDocumentsChain(
    llm_chain=LLMChain(
        prompt=PromptTemplate(input_variables=['context', 'question'], template=prompt_template),
        llm=model
    ),
    document_variable_name='context'
)

question_generator = LLMChain(
    prompt=PromptTemplate(
        input_variables=['chat_history', 'question'],
        template='Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question focused on Angular and TypeScript concepts.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:'
    ),
    llm=model
)

# Create the improved conversational retrieval chain
qa = ConversationalRetrievalChain(
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    combine_docs_chain=combine_docs_chain,
    question_generator=question_generator,
    return_source_documents=True,
    verbose=True
)

# Function to run a query (avoid a mutable default argument for chat_history)
def run_query(query, chat_history=None):
    chat_history = chat_history if chat_history is not None else []
    result = qa({"question": query, "chat_history": chat_history})
    print("Question:", query)
    print("Answer:", result['answer'])
    print("Sources:", [doc.page_content[:50] + "..." for doc in result['source_documents']])
    return result

# Example usage
query = "Explain the code in summary in the accounting component's TypeScript file."
result = run_query(query)
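
# A minimal sketch of a follow-up turn, assuming the qa chain above and that
# chat_history is a list of (question, answer) tuples, as ConversationalRetrievalChain
# expects. The follow-up question below is purely illustrative.
followup_history = [(query, result['answer'])]
followup_query = "Which Angular components does that file import?"
followup_result = run_query(followup_query, chat_history=followup_history)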