shubhampal committed on
Commit ceb805c
1 parent: 353952a

Update app.py

Files changed (1): app.py +147 -52
app.py CHANGED
@@ -5,69 +5,164 @@ KEY = os.getenv('KEY')
 os.environ['HF_TOKEN']=KEY
 os.environ['HUGGINGFACEHUB_API_TOKEN']=KEY
 
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-from langchain import HuggingFaceHub
-from langchain.vectorstores import Chroma
-from langchain.chains import ConversationalRetrievalChain
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.docstore.document import Document
-import pandas as pd
-
-
-# Load the CSV file
-df = pd.read_csv("web_data.csv")
-
-# Load the HTML and TS files
-with open("reports.component.html", "r", encoding="utf-8") as f:
-    reports_component_html = f.read()
-
-with open("reports.module.ts", "r", encoding="utf-8") as f:
-    reports_module_ts = f.read()
-
-# Create the embeddings
-embeddings = HuggingFaceEmbeddings()
-
-print(embeddings)
-
-# Combine questions, answers, and file contents into a list of strings
-texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
-texts.append(f"File: reports.component.html\nContent:\n{reports_component_html}")
-texts.append(f"File: reports.module.ts\nContent:\n{reports_module_ts}")
-
-# Split the texts into chunks
-text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-docs = []
-for text in texts:
-    chunks = text_splitter.split_text(text)
-    for chunk in chunks:
-        doc = Document(page_content=chunk, metadata={})
-        docs.append(doc)
-
-# Create the vector store
-db = Chroma.from_documents(docs, embeddings)
-
-# Load the language model
-model = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})
-# model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B", model_kwargs={"temperature": 0.7, "max_length": 512})
-# model = HuggingFaceHub(repo_id="mlabonne/AlphaMonarch-7B", model_kwargs={"temperature": 0.7, "max_length": 512})
-
-# Create the conversational retrieval chain
-qa = ConversationalRetrievalChain.from_llm(model, db.as_retriever())
-
-
-query = '''what all is present in reports module '''
-result = qa({"question": query, "chat_history": []})
-print(result['answer'])
-
-def get_helpful_answer(context, query):
-    import re
-    pattern = re.compile(r"Helpful Answer:\s*(.*?)(?:Question:|\Z)", re.DOTALL)
-    match = pattern.search(context)
-    if match:
-        return match.group(1).strip()
-    else:
-        return "No helpful answer found."
-
-
-# print the helpful answer
-print(get_helpful_answer(result['answer'], query))
+# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+# from langchain import HuggingFaceHub
+# from langchain.vectorstores import Chroma
+# from langchain.chains import ConversationalRetrievalChain
+# from langchain.text_splitter import CharacterTextSplitter
+# from langchain.docstore.document import Document
+# import pandas as pd
+
+
+# # Load the CSV file
+# df = pd.read_csv("web_data.csv")
+
+# # Load the HTML and TS files
+# with open("reports.component.html", "r", encoding="utf-8") as f:
+#     reports_component_html = f.read()
+
+# with open("reports.module.ts", "r", encoding="utf-8") as f:
+#     reports_module_ts = f.read()
+
+# # Create the embeddings
+# embeddings = HuggingFaceEmbeddings()
+
+# print(embeddings)
+
+# # Combine questions, answers, and file contents into a list of strings
+# texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
+# texts.append(f"File: reports.component.html\nContent:\n{reports_component_html}")
+# texts.append(f"File: reports.module.ts\nContent:\n{reports_module_ts}")
+
+# # Split the texts into chunks
+# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+# docs = []
+# for text in texts:
+#     chunks = text_splitter.split_text(text)
+#     for chunk in chunks:
+#         doc = Document(page_content=chunk, metadata={})
+#         docs.append(doc)
+
+# # Create the vector store
+# db = Chroma.from_documents(docs, embeddings)
+
+# # Load the language model
+# model = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})
+# # model = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B", model_kwargs={"temperature": 0.7, "max_length": 512})
+# # model = HuggingFaceHub(repo_id="mlabonne/AlphaMonarch-7B", model_kwargs={"temperature": 0.7, "max_length": 512})
+
+# # Create the conversational retrieval chain
+# qa = ConversationalRetrievalChain.from_llm(model, db.as_retriever())
+
+
+# query = '''what all is present in reports module '''
+# result = qa({"question": query, "chat_history": []})
+# print(result['answer'])
+
+# def get_helpful_answer(context, query):
+#     import re
+#     pattern = re.compile(r"Helpful Answer:\s*(.*?)(?:Question:|\Z)", re.DOTALL)
+#     match = pattern.search(context)
+#     if match:
+#         return match.group(1).strip()
+#     else:
+#         return "No helpful answer found."
+
+# # print the helpful answer
+# print(get_helpful_answer(result['answer'], query))
+
+# CLAUDE IMPROVEMENT TRY
+
+import pandas as pd
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.chains import ConversationalRetrievalChain
+from langchain.llms import HuggingFaceHub
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.schema import Document
+
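+# NB: these import paths match the pre-0.1 langchain package layout; in newer
+# releases most of these classes live in langchain_community instead.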
+# Load and process data (same flow as before, now pointing at the accounting component files)
+df = pd.read_csv("web_data.csv")
+
+with open("accounting.component.html", "r", encoding="utf-8") as f:
+    reports_component_html = f.read()
+
+with open("accounting.component.ts", "r", encoding="utf-8") as f:
+    reports_module_ts = f.read()
+
+# Improved text processing
+texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(df['query'], df['responses'])]
+texts.append(f"File: accounting.component.html\nContent:\n{reports_component_html}")
+texts.append(f"File: accounting.component.ts\nContent:\n{reports_module_ts}")
+
+# More granular text splitting
+text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+docs = [Document(page_content=chunk, metadata={}) for text in texts for chunk in text_splitter.split_text(text)]
+
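+# (CharacterTextSplitter splits on its separator, "\n\n" by default, then merges
+# pieces up to chunk_size, so 500 characters is a soft cap: a single paragraph
+# longer than that is kept whole.)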
+# Create embeddings and vector store
+embeddings = HuggingFaceEmbeddings(model_name="meta-llama/Meta-Llama-3-8B-Instruct")
+db = Chroma.from_documents(docs, embeddings)
+
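+# NB: HuggingFaceEmbeddings wraps sentence-transformers, which expects an
+# encoder-style embedding model; a decoder-only chat model like
+# Meta-Llama-3-8B-Instruct is unlikely to load here, so something like
+# sentence-transformers/all-MiniLM-L6-v2 is the safer choice.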
+# Improved language model configuration
+model = HuggingFaceHub(
+    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
+    model_kwargs={"temperature": 0.3, "max_length": 512, "top_p": 0.95}
+)
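+# NB: HuggingFaceHub calls the hosted Inference API, so this needs a valid
+# HUGGINGFACEHUB_API_TOKEN and, for gated repos like Meta-Llama-3, approved
+# access on the Hub.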
+
+# Enhanced prompt template
+prompt_template = """
+Use the following pieces of context to answer the question at the end. If you don't know the answer, say "I don't have enough information to answer this question accurately."
+Aim to provide a concise yet informative answer within 500 characters.
+
+Context:
+{context}
+
+Question: {question}
+
+Confident and Accurate Answer:
+"""
+
+# Updated chains
+combine_docs_chain = StuffDocumentsChain(
+    llm_chain=LLMChain(
+        prompt=PromptTemplate(input_variables=['context', 'question'], template=prompt_template),
+        llm=model
+    ),
+    document_variable_name='context'
+)
+
+question_generator = LLMChain(
+    prompt=PromptTemplate(
+        input_variables=['chat_history', 'question'],
+        template='Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question focused on Angular and TypeScript concepts.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:'
+    ),
+    llm=model
+)
+
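+# Building the chain from its parts below (instead of using
+# ConversationalRetrievalChain.from_llm) is what lets the two custom prompts
+# above be wired in explicitly.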
+# Create the improved conversational retrieval chain
+qa = ConversationalRetrievalChain(
+    retriever=db.as_retriever(search_kwargs={"k": 3}),
+    combine_docs_chain=combine_docs_chain,
+    question_generator=question_generator,
+    return_source_documents=True,
+    verbose=True
+)
+
+# Function to run a query
+def run_query(query, chat_history=[]):
+    result = qa({"question": query, "chat_history": chat_history})
+    print("Question:", query)
+    print("Answer:", result['answer'])
+    print("Sources:", [doc.page_content[:50] + "..." for doc in result['source_documents']])
+    return result
+
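+# (result is a dict; 'source_documents' is present because the chain was built
+# with return_source_documents=True. The mutable [] default for chat_history is
+# safe only because the function never mutates it.)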
+# Example usage
+query = "Explain the code in summary in the accounting components TypeScript file."
+result = run_query(query)