Spaces · Runtime error

Commit ceb805c · Update app.py
Parent(s): 353952a

app.py CHANGED
@@ -5,69 +5,164 @@ KEY = os.getenv('KEY')
 os.environ['HF_TOKEN']=KEY
 os.environ['HUGGINGFACEHUB_API_TOKEN']=KEY

-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-from langchain import HuggingFaceHub
-from langchain.vectorstores import Chroma
-from langchain.chains import ConversationalRetrievalChain
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.docstore.document import Document
-import pandas as pd


-# Load the CSV file
-df = pd.read_csv("web_data.csv")

-# Load the HTML and TS files
-with open("reports.component.html", "r", encoding="utf-8") as f:
-    reports_component_html = f.read()

-with open("reports.module.ts", "r", encoding="utf-8") as f:
-    reports_module_ts = f.read()

-# Create the embeddings
-embeddings = HuggingFaceEmbeddings()

-print(embeddings)

-# Combine questions, answers, and file contents into a list of strings
-texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
-texts.append(f"File: reports.component.html\nContent:\n{reports_component_html}")
-texts.append(f"File: reports.module.ts\nContent:\n{reports_module_ts}")
-
-# Split the texts into chunks
-text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-docs = []
-for text in texts:
-    chunks = text_splitter.split_text(text)
-    for chunk in chunks:
-        doc = Document(page_content=chunk, metadata={})
-        docs.append(doc)
-
-# Create the vector store
-db = Chroma.from_documents(docs, embeddings)

-# Load the language model
-model = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})
-# model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B", model_kwargs={"temperature": 0.7, "max_length": 512})
-# model = HuggingFaceHub(repo_id="mlabonne/AlphaMonarch-7B", model_kwargs={"temperature": 0.7, "max_length": 512})

-# Create the conversational retrieval chain
-qa = ConversationalRetrievalChain.from_llm(model, db.as_retriever())


-query = '''what all is present in reports module '''
-result = qa({"question": query, "chat_history": []})
-print(result['answer'])

-def get_helpful_answer(context, query):
-    import re
-    pattern = re.compile(r"Helpful Answer:\s*(.*?)(?:Question:|\Z)", re.DOTALL)
-    match = pattern.search(context)
-    if match:
-        return match.group(1).strip()
-    else:
-        return "No helpful answer found."


-# print the helpful answer
-print(get_helpful_answer(result['answer'], query))
 os.environ['HF_TOKEN']=KEY
 os.environ['HUGGINGFACEHUB_API_TOKEN']=KEY

+# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+# from langchain import HuggingFaceHub
+# from langchain.vectorstores import Chroma
+# from langchain.chains import ConversationalRetrievalChain
+# from langchain.text_splitter import CharacterTextSplitter
+# from langchain.docstore.document import Document
+# import pandas as pd


+# # Load the CSV file
+# df = pd.read_csv("web_data.csv")

+# # Load the HTML and TS files
+# with open("reports.component.html", "r", encoding="utf-8") as f:
+#     reports_component_html = f.read()

+# with open("reports.module.ts", "r", encoding="utf-8") as f:
+#     reports_module_ts = f.read()

+# # Create the embeddings
+# embeddings = HuggingFaceEmbeddings()

+# print(embeddings)

+# # Combine questions, answers, and file contents into a list of strings
+# texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
+# texts.append(f"File: reports.component.html\nContent:\n{reports_component_html}")
+# texts.append(f"File: reports.module.ts\nContent:\n{reports_module_ts}")
+
+# # Split the texts into chunks
+# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+# docs = []
+# for text in texts:
+#     chunks = text_splitter.split_text(text)
+#     for chunk in chunks:
+#         doc = Document(page_content=chunk, metadata={})
+#         docs.append(doc)
+
+# # Create the vector store
+# db = Chroma.from_documents(docs, embeddings)
+
+# # Load the language model
+# model = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})
+# # model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B", model_kwargs={"temperature": 0.7, "max_length": 512})
+# # model = HuggingFaceHub(repo_id="mlabonne/AlphaMonarch-7B", model_kwargs={"temperature": 0.7, "max_length": 512})
+
+# # Create the conversational retrieval chain
+# qa = ConversationalRetrievalChain.from_llm(model, db.as_retriever())
+
+
+# query = '''what all is present in reports module '''
+# result = qa({"question": query, "chat_history": []})
+# print(result['answer'])
+
+# def get_helpful_answer(context, query):
+#     import re
+#     pattern = re.compile(r"Helpful Answer:\s*(.*?)(?:Question:|\Z)", re.DOTALL)
+#     match = pattern.search(context)
+#     if match:
+#         return match.group(1).strip()
+#     else:
+#         return "No helpful answer found."


+# # print the helpful answer
+# print(get_helpful_answer(result['answer'], query))



+# CLAUDE IMPROVEMENT TRY


+import pandas as pd
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.chains import ConversationalRetrievalChain
+from langchain.llms import HuggingFaceHub
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.schema import Document
+
+# Load and process data (unchanged)
+df = pd.read_csv("web_data.csv")
+
+with open("accounting.component.html", "r", encoding="utf-8") as f:
+    reports_component_html = f.read()
+
+with open("accounting.component.ts", "r", encoding="utf-8") as f:
+    reports_module_ts = f.read()
+
+# Improved text processing
+texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
+texts.append(f"File: accounting.component.html\nContent:\n{reports_component_html}")
+texts.append(f"File: accounting.component.ts\nContent:\n{reports_module_ts}")
+
+# More granular text splitting
+text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+docs = [Document(page_content=chunk, metadata={}) for text in texts for chunk in text_splitter.split_text(text)]
+
+# Create embeddings and vector store
+embeddings = HuggingFaceEmbeddings(model_name="meta-llama/Meta-Llama-3-8B-Instruct")
+db = Chroma.from_documents(docs, embeddings)
+
+# Improved language model configuration
+model = HuggingFaceHub(
+    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
+    model_kwargs={"temperature": 0.3, "max_length": 512, "top_p": 0.95}
+)
+
+# Enhanced prompt template
+prompt_template = """
+Use the following pieces of context to answer the question at the end. If you don't know the answer, say "I don't have enough information to answer this question accurately."
+Aim to provide a concise yet informative answer within 500 characters.
+
+Context:
+{context}
+
+Question: {question}
+
+Confident and Accurate Answer:
+"""
+
+# Updated chains
+combine_docs_chain = StuffDocumentsChain(
+    llm_chain=LLMChain(
+        prompt=PromptTemplate(input_variables=['context', 'question'], template=prompt_template),
+        llm=model
+    ),
+    document_variable_name='context'
+)
+
+question_generator = LLMChain(
+    prompt=PromptTemplate(
+        input_variables=['chat_history', 'question'],
+        template='Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question focused on Angular and TypeScript concepts.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:'
+    ),
+    llm=model
+)
+
+# Create the improved conversational retrieval chain
+qa = ConversationalRetrievalChain(
+    retriever=db.as_retriever(search_kwargs={"k": 3}),
+    combine_docs_chain=combine_docs_chain,
+    question_generator=question_generator,
+    return_source_documents=True,
+    verbose=True
+)
+
+# Function to run a query
+def run_query(query, chat_history=[]):
+    result = qa({"question": query, "chat_history": chat_history})
+    print("Question:", query)
+    print("Answer:", result['answer'])
+    print("Sources:", [doc.page_content[:50] + "..." for doc in result['source_documents']])
+    return result
+
+# Example usage
+query = "Explain the code in summary in the accounting components TypeScript file."
+result = run_query(query)
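
A note on the embedding line added in this commit: HuggingFaceEmbeddings wraps sentence-transformers, which expects an encoder checkpoint, while meta-llama/Meta-Llama-3-8B-Instruct is a gated 8B generative model; pulling it in as an embedder needs gated-repo access and far more memory than a typical Space, which is a plausible source of the Runtime error status above. A minimal sketch of the embedding setup with a small sentence-transformers encoder instead (the model choice is an assumption, not from the commit):

# Hedged sketch, not part of this commit: use a compact sentence-transformers
# encoder for embeddings rather than a generative chat model.
# "all-MiniLM-L6-v2" is an assumed choice; any sentence-transformers encoder works.
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)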
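
One further caveat on the new run_query helper: chat_history=[] is a mutable default argument, so the same list object is shared across calls. A minimal sketch of the usual None-default pattern, with the body otherwise as committed:

# Hedged sketch: swap the shared mutable default for None; behavior on the
# first call is unchanged, but each call now gets a fresh history list.
def run_query(query, chat_history=None):
    if chat_history is None:
        chat_history = []
    result = qa({"question": query, "chat_history": chat_history})
    print("Question:", query)
    print("Answer:", result['answer'])
    return result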