import os
# Read the Hugging Face token from the KEY secret and expose it under the
# environment variable names expected by huggingface_hub and LangChain.
# (Avoid printing the raw token, since Space logs are visible.)
KEY = os.getenv('KEY')
os.environ['HF_TOKEN'] = KEY
os.environ['HUGGINGFACEHUB_API_TOKEN'] = KEY
# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from langchain import HuggingFaceHub
# from langchain.vectorstores import Chroma
# from langchain.chains import ConversationalRetrievalChain
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.docstore.document import Document
# import pandas as pd
# # Load the CSV file
# df = pd.read_csv("web_data.csv")
# # Load the HTML and TS files
# with open("reports.component.html", "r", encoding="utf-8") as f:
# reports_component_html = f.read()
# with open("reports.module.ts", "r", encoding="utf-8") as f:
# reports_module_ts = f.read()
# # Create the embeddings
# embeddings = HuggingFaceEmbeddings()
# print(embeddings)
# # Combine questions, answers, and file contents into a list of strings
# texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
# texts.append(f"File: reports.component.html\nContent:\n{reports_component_html}")
# texts.append(f"File: reports.module.ts\nContent:\n{reports_module_ts}")
# # Split the texts into chunks
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# docs = []
# for text in texts:
#     chunks = text_splitter.split_text(text)
#     for chunk in chunks:
#         doc = Document(page_content=chunk, metadata={})
#         docs.append(doc)
# # Create the vector store
# db = Chroma.from_documents(docs, embeddings)
# # Load the language model
# model = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})
# # model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B", model_kwargs={"temperature": 0.7, "max_length": 512})
# # model = HuggingFaceHub(repo_id="mlabonne/AlphaMonarch-7B", model_kwargs={"temperature": 0.7, "max_length": 512})
# # Create the conversational retrieval chain
# qa = ConversationalRetrievalChain.from_llm(model, db.as_retriever())
# query = '''what all is present in reports module '''
# result = qa({"question": query, "chat_history": []})
# print(result['answer'])
# def get_helpful_answer(context, query):
#     import re
#     pattern = re.compile(r"Helpful Answer:\s*(.*?)(?:Question:|\Z)", re.DOTALL)
#     match = pattern.search(context)
#     if match:
#         return match.group(1).strip()
#     else:
#         return "No helpful answer found."
# # print the helpful answer
# print(get_helpful_answer(result['answer'], query))
# Revised implementation (Claude-assisted improvement of the attempt above)
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.schema import Document
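# Pipeline overview: CSV Q&A pairs plus the Angular component's HTML/TS sources are
# chunked, embedded into a Chroma vector store, and served through a
# ConversationalRetrievalChain backed by a HuggingFaceHub-hosted model.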
# Load and process the source data
df = pd.read_csv("web_data.csv")
with open("accounting.component.html", "r", encoding="utf-8") as f:
    accounting_component_html = f.read()
with open("accounting.component.ts", "r", encoding="utf-8") as f:
    accounting_component_ts = f.read()
# Combine the Q&A pairs and the component source files into plain-text documents
texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
texts.append(f"File: accounting.component.html\nContent:\n{accounting_component_html}")
texts.append(f"File: accounting.component.ts\nContent:\n{accounting_component_ts}")
# More granular text splitting
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = [Document(page_content=chunk, metadata={}) for text in texts for chunk in text_splitter.split_text(text)]
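# Optional sanity check (illustrative addition): report how many chunks were produced,
# which makes it easy to spot when the HTML/TS files dominate the corpus.
print(f"Prepared {len(docs)} chunks from {len(texts)} source texts")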
# Create embeddings and vector store
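# Note: HuggingFaceEmbeddings loads the model locally through sentence-transformers,
# and meta-llama/Meta-Llama-3-8B-Instruct is a large, gated generative model rather
# than an embedding model; a dedicated sentence-embedding checkpoint is the usual choice here.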
embeddings = HuggingFaceEmbeddings(model_name="meta-llama/Meta-Llama-3-8B-Instruct")
db = Chroma.from_documents(docs, embeddings)
# Improved language model configuration
model = HuggingFaceHub(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={"temperature": 0.3, "max_length": 512, "top_p": 0.95}
)
# Enhanced prompt template
prompt_template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, say "I don't have enough information to answer this question accurately."
Aim to provide a concise yet informative answer within 500 characters.
Context:
{context}
Question: {question}
Confident and Accurate Answer:
"""
# Updated chains
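# The StuffDocumentsChain below simply concatenates the retrieved chunks into the
# {context} slot of the prompt before calling the model.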
combine_docs_chain = StuffDocumentsChain(
    llm_chain=LLMChain(
        prompt=PromptTemplate(input_variables=['context', 'question'], template=prompt_template),
        llm=model
    ),
    document_variable_name='context'
)
question_generator = LLMChain(
    prompt=PromptTemplate(
        input_variables=['chat_history', 'question'],
        template='Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question focused on Angular and TypeScript concepts.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:'
    ),
    llm=model
)
# Create the improved conversational retrieval chain
qa = ConversationalRetrievalChain(
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    combine_docs_chain=combine_docs_chain,
    question_generator=question_generator,
    return_source_documents=True,
    verbose=True
)
# Run a query through the chain and print the answer together with its sources
def run_query(query, chat_history=None):
    chat_history = chat_history or []  # avoid a mutable default argument
    result = qa({"question": query, "chat_history": chat_history})
    print("Question:", query)
    print("Answer:", result['answer'])
    print("Sources:", [doc.page_content[:50] + "..." for doc in result['source_documents']])
    return result
# Example usage
query = "Explain the code in summary in the accounting components TypeScript file."
result = run_query(query)
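# Illustrative follow-up turn (a sketch, not part of the original script):
# ConversationalRetrievalChain expects chat_history as a list of (question, answer)
# tuples, so the first result can be threaded into the next call. The follow-up
# question below is only an example.
follow_up_query = "Which fields from the accounting component are shown in its HTML template?"
follow_up_result = run_query(follow_up_query, chat_history=[(query, result['answer'])])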