Spaces:
Sleeping
Sleeping
Delete utils
Browse files- utils/__pycache__/ingest1.cpython-310.pyc +0 -0
- utils/__pycache__/qa.cpython-310.pyc +0 -0
- utils/__pycache__/stt.cpython-310.pyc +0 -0
- utils/__pycache__/tts.cpython-310.pyc +0 -0
- utils/ingest1.py +0 -107
- utils/qa.py +0 -87
- utils/stt.py +0 -45
- utils/tts.py +0 -38
utils/__pycache__/ingest1.cpython-310.pyc
DELETED
Binary file (2.71 kB)
|
|
utils/__pycache__/qa.cpython-310.pyc
DELETED
Binary file (2.2 kB)
|
|
utils/__pycache__/stt.cpython-310.pyc
DELETED
Binary file (1.01 kB)
|
|
utils/__pycache__/tts.cpython-310.pyc
DELETED
Binary file (944 Bytes)
|
|
utils/ingest1.py
DELETED
@@ -1,107 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import nest_asyncio # noqa: E402
|
3 |
-
nest_asyncio.apply()
|
4 |
-
|
5 |
-
# bring in our LLAMA_CLOUD_API_KEY
|
6 |
-
from dotenv import load_dotenv
|
7 |
-
load_dotenv()
|
8 |
-
|
9 |
-
##### LLAMAPARSE #####
|
10 |
-
from llama_parse import LlamaParse
|
11 |
-
|
12 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
-
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
|
14 |
-
from langchain_community.vectorstores import Qdrant
|
15 |
-
from langchain_community.document_loaders import DirectoryLoader
|
16 |
-
|
17 |
-
|
18 |
-
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
|
19 |
-
qdrant_url = os.getenv("QDRANT_URL")
|
20 |
-
qdrant_api_key = os.getenv("QDRANT_API_KEY")
|
21 |
-
|
22 |
-
#to_parse_documents = ["./data/example.pdf", "./data/uber_10q_march_2022.pdf"]
|
23 |
-
|
24 |
-
parsed_data_file = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\data\parsed_data.pkl"
|
25 |
-
output_md = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\data\output.md"
|
26 |
-
loki = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\data"
|
27 |
-
|
28 |
-
import pickle
|
29 |
-
# Define a function to load parsed data if available, or parse if not
|
30 |
-
def load_or_parse_data(loc):
    """Return the parsed documents for ``loc``, using an on-disk pickle cache.

    If ``parsed_data_file`` already exists it is unpickled and returned
    without looking at ``loc``. Otherwise ``loc`` is parsed with LlamaParse
    (markdown output), the result is pickled to ``parsed_data_file`` and
    then returned.

    Args:
        loc: Document location(s), passed unchanged to
            ``LlamaParse.load_data``.

    Returns:
        The object produced by ``LlamaParse.load_data``, either freshly
        parsed or restored from the cache file.
    """
    cache_path = parsed_data_file

    if os.path.exists(cache_path):
        # NOTE(review): the cache is not keyed on `loc`; a cache written for
        # one document is returned for any other. Delete the cache file to
        # force a re-parse.
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # safe while the cache file is written exclusively by this function.
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    parsing_instruction = (
        "The provided document is an entry level machine learning textbook with example code and outputs.\n"
        "It contains many images and tables.\n"
        "Try to be precise while answering the questions"
    )
    parser = LlamaParse(
        api_key=llamaparse_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
    )
    documents = parser.load_data(loc)

    # Make sure the cache directory exists before writing; otherwise the
    # open() below raises FileNotFoundError and the parse result is lost.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with open(cache_path, "wb") as f:
        pickle.dump(documents, f)

    return documents
|
54 |
-
|
55 |
-
|
56 |
-
# Create vector database
|
57 |
-
def create_vector_database(loc):
    """Build the "rag" Qdrant collection from the document(s) at ``loc``.

    Steps, in order:
      1. Get the parsed documents from ``load_or_parse_data(loc)``.
      2. Append each document's text to the markdown file ``output_md``.
      3. Load every ``*.md`` file under ``loki`` with ``DirectoryLoader``.
      4. Split the loaded documents with ``RecursiveCharacterTextSplitter``
         (``chunk_size=2000``, ``chunk_overlap=100``).
      5. Embed the chunks with ``FastEmbedEmbeddings`` and send them to the
         Qdrant instance at ``qdrant_url`` via ``Qdrant.from_documents``.

    Prints a confirmation message when done; returns nothing.
    """
    parsed_docs = load_or_parse_data(loc)

    # Append mode: text written by earlier runs stays in the file.
    with open(output_md, 'a', encoding='utf-8') as md_file:
        md_file.writelines(doc.text + '\n' for doc in parsed_docs)

    markdown_docs = DirectoryLoader(loki, glob="**/*.md", show_progress=True).load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    chunks = splitter.split_documents(markdown_docs)

    # The returned vector-store handle is not needed here.
    Qdrant.from_documents(
        documents=chunks,
        embedding=FastEmbedEmbeddings(),
        url=qdrant_url,
        collection_name="rag",
        api_key=qdrant_api_key,
    )

    print('Vector DB created successfully !')
|
104 |
-
|
105 |
-
|
106 |
-
if __name__ == "__main__":
    import sys  # local import: only needed when run as a script

    # create_vector_database() requires the document location. The previous
    # bare call raised TypeError, so the path(s) now come from the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: python ingest1.py <document> [<document> ...]")

    # A single path is passed as a string, several as a list.
    create_vector_database(sys.argv[1] if len(sys.argv) == 2 else sys.argv[1:])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/qa.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import List
|
3 |
-
from langchain_groq import ChatGroq
|
4 |
-
from langchain.prompts import PromptTemplate
|
5 |
-
from langchain_community.vectorstores import Qdrant
|
6 |
-
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
|
7 |
-
from qdrant_client import QdrantClient
|
8 |
-
from langchain_community.chat_models import ChatOllama
|
9 |
-
|
10 |
-
|
11 |
-
#import chainlit as cl
|
12 |
-
from langchain.chains import RetrievalQA
|
13 |
-
|
14 |
-
# bring in our GROQ_API_KEY
|
15 |
-
from dotenv import load_dotenv
|
16 |
-
load_dotenv()
|
17 |
-
|
18 |
-
groq_api_key = os.getenv("GROQ_API_KEY")
|
19 |
-
qdrant_url = os.getenv("QDRANT_URL")
|
20 |
-
qdrant_api_key = os.getenv("QDRANT_API_KEY")
|
21 |
-
|
22 |
-
custom_prompt_template = """Use the following pieces of information to answer the user's question.
|
23 |
-
If you don't know the answer, just say that you don't know,if it is out of context say that it is out of context and also try to provide the answer and don't be rude.
|
24 |
-
|
25 |
-
Context: {context}
|
26 |
-
Question: {question}
|
27 |
-
|
28 |
-
Only return the helpful answer below and nothing else.
|
29 |
-
Helpful answer:
|
30 |
-
"""
|
31 |
-
|
32 |
-
def set_custom_prompt():
    """Return a ``PromptTemplate`` built from ``custom_prompt_template``.

    The template exposes two input variables, ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )
|
39 |
-
|
40 |
-
|
41 |
-
chat_model = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
|
42 |
-
#chat_model = ChatGroq(temperature=0, model_name="Llama2-70b-4096")
|
43 |
-
#chat_model = ChatOllama(model="llama2", request_timeout=30.0)
|
44 |
-
|
45 |
-
client = QdrantClient(api_key=qdrant_api_key, url=qdrant_url,)
|
46 |
-
|
47 |
-
|
48 |
-
def retrieval_qa_chain(llm, prompt, vectorstore):
    """Build a "stuff"-type ``RetrievalQA`` chain over ``vectorstore``.

    Args:
        llm: Language model passed to the chain.
        prompt: Prompt forwarded through ``chain_type_kwargs``.
        vectorstore: Store whose ``as_retriever`` supplies the retriever,
            configured with ``search_kwargs={'k': 2}``.

    Returns:
        The chain from ``RetrievalQA.from_chain_type``, created with
        ``return_source_documents=True``.
    """
    retriever = vectorstore.as_retriever(search_kwargs={'k': 2})
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={'prompt': prompt},
    )
|
57 |
-
|
58 |
-
|
59 |
-
def qa_bot():
    """Wire up and return the question-answering chain.

    Uses the module-level ``client`` and ``chat_model``, the "rag"
    collection with ``FastEmbedEmbeddings``, and the prompt from
    ``set_custom_prompt()``.
    """
    store = Qdrant(
        client=client,
        embeddings=FastEmbedEmbeddings(),
        collection_name="rag",
    )
    return retrieval_qa_chain(chat_model, set_custom_prompt(), store)
|
66 |
-
|
67 |
-
#---------------------------------------------------------------------#
|
68 |
-
|
69 |
-
#qdrant_cloud_api_key="your_qdrant_cloud_api_key"
|
70 |
-
#qdrant_url="your_qdrant_url"
|
71 |
-
|
72 |
-
#qdrant_cloud = Qdrant.from_documents(
|
73 |
-
# docs,
|
74 |
-
# embeddings,
|
75 |
-
# url=qdrant_url,
|
76 |
-
# prefer_grpc=True,
|
77 |
-
# api_key=qdrant_cloud_api_key,
|
78 |
-
# collection_name="qdrant_cloud_documents",
|
79 |
-
#)
|
80 |
-
|
81 |
-
#---------------------------------------------------------------------#
|
82 |
-
query="how to make coffee"
|
83 |
-
print(query)
|
84 |
-
|
85 |
-
chain = qa_bot()
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/stt.py
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
# main.py (python example)
|
2 |
-
|
3 |
-
import os
|
4 |
-
from dotenv import load_dotenv
|
5 |
-
|
6 |
-
from deepgram import (
|
7 |
-
DeepgramClient,
|
8 |
-
PrerecordedOptions,
|
9 |
-
FileSource,
|
10 |
-
)
|
11 |
-
|
12 |
-
load_dotenv()
|
13 |
-
|
14 |
-
# Path to the audio file
|
15 |
-
AUDIO_FILE = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\media\recorded.mp3"
|
16 |
-
API_KEY = os.getenv("DG_API_KEY")
|
17 |
-
|
18 |
-
|
19 |
-
def speech_to_text():
    """Transcribe ``AUDIO_FILE`` with Deepgram and return the transcript.

    Returns:
        The transcript of the first alternative of the first channel, as a
        ``str``. If any step raises, the exception is printed (not
        re-raised) and ``None`` is returned.
    """
    try:
        dg_client = DeepgramClient(API_KEY)

        # Read the whole recording into memory as the request payload.
        with open(AUDIO_FILE, "rb") as audio:
            payload: FileSource = {"buffer": audio.read()}

        options = PrerecordedOptions(model="nova-2", smart_format=True)

        response = dg_client.listen.prerecorded.v("1").transcribe_file(payload, options)
        best = response["results"]["channels"][0]["alternatives"][0]
        return str(best["transcript"])

    except Exception as e:
        print(f"Exception: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/tts.py
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
import requests
|
2 |
-
import os
|
3 |
-
from dotenv import load_dotenv
|
4 |
-
|
5 |
-
load_dotenv()
|
6 |
-
API_KEY = os.getenv("DG_API_KEY")
|
7 |
-
AUDIO_FILE=r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\media\ouput_file.mp3"
|
8 |
-
|
9 |
-
def text_to_speech(llm_response, timeout=30):
    """Synthesize ``llm_response`` with Deepgram and save it to ``AUDIO_FILE``.

    Sends the text to the Deepgram ``/v1/speak`` endpoint
    (``aura-asteria-en`` model). On HTTP 200 the response body is written to
    ``AUDIO_FILE``; on any other status the status code and response text
    are printed and nothing is written.

    Args:
        llm_response: Text to convert to speech.
        timeout: Seconds to wait for the HTTP request, passed to
            ``requests.post``. Without a timeout the call could block
            indefinitely on a stalled connection.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            ``timeout`` is exceeded.
    """
    response = requests.post(
        "https://api.deepgram.com/v1/speak?model=aura-asteria-en",
        headers={
            "Authorization": f"Token {API_KEY}",
            "Content-Type": "application/json",
        },
        json={"text": llm_response},
        timeout=timeout,
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return

    with open(AUDIO_FILE, "wb") as f:
        f.write(response.content)
    print("File saved successfully.")
|
35 |
-
|
36 |
-
# Example usage
|
37 |
-
#transcribed_text = "Hello, how can I help you today?"
|
38 |
-
#text_to_speech(transcribed_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|