NEXAS committed on
Commit
441637d
1 Parent(s): ae136ee

Delete utils

utils/__pycache__/ingest1.cpython-310.pyc DELETED
Binary file (2.71 kB)
 
utils/__pycache__/qa.cpython-310.pyc DELETED
Binary file (2.2 kB)
 
utils/__pycache__/stt.cpython-310.pyc DELETED
Binary file (1.01 kB)
 
utils/__pycache__/tts.cpython-310.pyc DELETED
Binary file (944 Bytes)
 
utils/ingest1.py DELETED
@@ -1,107 +0,0 @@
- import os
- import nest_asyncio  # noqa: E402
- nest_asyncio.apply()
-
- # bring in our LLAMA_CLOUD_API_KEY
- from dotenv import load_dotenv
- load_dotenv()
-
- ##### LLAMAPARSE #####
- from llama_parse import LlamaParse
-
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
- from langchain_community.vectorstores import Qdrant
- from langchain_community.document_loaders import DirectoryLoader
-
-
- llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
- qdrant_url = os.getenv("QDRANT_URL")
- qdrant_api_key = os.getenv("QDRANT_API_KEY")
-
- #to_parse_documents = ["./data/example.pdf", "./data/uber_10q_march_2022.pdf"]
-
- parsed_data_file = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\data\parsed_data.pkl"
- output_md = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\data\output.md"
- loki = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\data"
-
- import pickle
-
- # Load the cached parse if available; otherwise parse the document and cache it
- def load_or_parse_data(loc):
-     data_file = parsed_data_file
-
-     if os.path.exists(data_file):
-         # Load the parsed data from the cache file
-         with open(data_file, "rb") as f:
-             parsed_data = pickle.load(f)
-     else:
-         # Perform the parsing step and store the result in llama_parse_documents
-         parsingInstructiontest10k = """The provided document is an entry-level machine learning textbook with example code and outputs.
- It contains many images and tables.
- Try to be precise while answering the questions."""
-         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown",
-                             parsing_instruction=parsingInstructiontest10k)
-         llama_parse_documents = parser.load_data(loc)
-
-         # Save the parsed data to the cache file
-         with open(data_file, "wb") as f:
-             pickle.dump(llama_parse_documents, f)
-
-         parsed_data = llama_parse_documents
-
-     return parsed_data
-
-
- # Create vector database
- def create_vector_database(loc):
-     """
-     Creates a vector database using document loaders and embeddings.
-
-     Parses the source document, appends the parsed markdown to output.md,
-     splits the loaded documents into chunks, embeds the chunks with
-     FastEmbedEmbeddings, and persists the embeddings into a Qdrant collection.
-     """
-     # Either load the cached parse or parse the document now
-     llama_parse_documents = load_or_parse_data(loc)
-     #print(llama_parse_documents[1].text[:100])
-
-     with open(output_md, 'a', encoding='utf-8') as f:  # append mode
-         for doc in llama_parse_documents:
-             f.write(doc.text + '\n')
-
-     loader = DirectoryLoader(loki, glob="**/*.md", show_progress=True)
-     documents = loader.load()
-     # Split loaded documents into chunks
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
-     docs = text_splitter.split_documents(documents)
-
-     #len(docs)
-     #docs[0]
-
-     # Initialize embeddings
-     embeddings = FastEmbedEmbeddings()
-
-     # Create and persist a Qdrant vector database from the chunked documents
-     qdrant = Qdrant.from_documents(
-         documents=docs,
-         embedding=embeddings,
-         url=qdrant_url,
-         collection_name="rag",
-         api_key=qdrant_api_key
-     )
-
-     #query it
-     #query = "what is the agenda of Financial Statements for 2022?"
-     #found_doc = qdrant.similarity_search(query, k=3)
-     #print(found_doc[0][:100])
-
-     print('Vector DB created successfully!')
-
-
- if __name__ == "__main__":
-     create_vector_database(loki)
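
A minimal sketch of how this ingestion module was driven before its removal, assuming a .env with LLAMA_CLOUD_API_KEY, QDRANT_URL and QDRANT_API_KEY; the document path below is a placeholder for illustration, not a file tracked in this repo:

    from utils.ingest1 import create_vector_database

    # Parse with LlamaParse, chunk with RecursiveCharacterTextSplitter,
    # embed with FastEmbed, and upsert into the "rag" Qdrant collection.
    create_vector_database(r"data/ml_textbook.pdf")  # placeholder path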
 
utils/qa.py DELETED
@@ -1,87 +0,0 @@
- import os
- from typing import List
- from langchain_groq import ChatGroq
- from langchain.prompts import PromptTemplate
- from langchain_community.vectorstores import Qdrant
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
- from qdrant_client import QdrantClient
- from langchain_community.chat_models import ChatOllama
-
-
- #import chainlit as cl
- from langchain.chains import RetrievalQA
-
- # bring in our GROQ_API_KEY
- from dotenv import load_dotenv
- load_dotenv()
-
- groq_api_key = os.getenv("GROQ_API_KEY")
- qdrant_url = os.getenv("QDRANT_URL")
- qdrant_api_key = os.getenv("QDRANT_API_KEY")
-
- custom_prompt_template = """Use the following pieces of information to answer the user's question.
- If you don't know the answer, just say that you don't know. If the question is out of context, say so, but still try to provide a helpful answer. Don't be rude.
-
- Context: {context}
- Question: {question}
-
- Only return the helpful answer below and nothing else.
- Helpful answer:
- """
-
- def set_custom_prompt():
-     """
-     Prompt template for QA retrieval for each vectorstore
-     """
-     prompt = PromptTemplate(template=custom_prompt_template,
-                             input_variables=['context', 'question'])
-     return prompt
-
-
- chat_model = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
- #chat_model = ChatGroq(temperature=0, model_name="Llama2-70b-4096")
- #chat_model = ChatOllama(model="llama2", request_timeout=30.0)
-
- client = QdrantClient(api_key=qdrant_api_key, url=qdrant_url)
-
-
- # Build a RetrievalQA chain over the given vectorstore with the custom prompt
- def retrieval_qa_chain(llm, prompt, vectorstore):
-     qa_chain = RetrievalQA.from_chain_type(
-         llm=llm,
-         chain_type="stuff",
-         retriever=vectorstore.as_retriever(search_kwargs={'k': 2}),
-         return_source_documents=True,
-         chain_type_kwargs={'prompt': prompt}
-     )
-     return qa_chain
-
-
- def qa_bot():
-     embeddings = FastEmbedEmbeddings()
-     vectorstore = Qdrant(client=client, embeddings=embeddings, collection_name="rag")
-     llm = chat_model
-     qa_prompt = set_custom_prompt()
-     qa = retrieval_qa_chain(llm, qa_prompt, vectorstore)
-     return qa
-
- #---------------------------------------------------------------------#
-
- #qdrant_cloud_api_key="your_qdrant_cloud_api_key"
- #qdrant_url="your_qdrant_url"
-
- #qdrant_cloud = Qdrant.from_documents(
- #    docs,
- #    embeddings,
- #    url=qdrant_url,
- #    prefer_grpc=True,
- #    api_key=qdrant_cloud_api_key,
- #    collection_name="qdrant_cloud_documents",
- #)
-
- #---------------------------------------------------------------------#
-
- query = "how to make coffee"
- print(query)
-
- chain = qa_bot()
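
A minimal sketch of querying the chain returned by qa_bot(), assuming the "rag" collection already exists in Qdrant. RetrievalQA chains take a dict with a "query" key and, because the chain is built with return_source_documents=True, return both "result" and "source_documents":

    from utils.qa import qa_bot

    chain = qa_bot()
    response = chain.invoke({"query": "how to make coffee"})
    print(response["result"])                 # the model's answer
    for doc in response["source_documents"]:
        print(doc.metadata)                   # provenance of retrieved chunks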
 
utils/stt.py DELETED
@@ -1,45 +0,0 @@
- # main.py (python example)
-
- import os
- from dotenv import load_dotenv
-
- from deepgram import (
-     DeepgramClient,
-     PrerecordedOptions,
-     FileSource,
- )
-
- load_dotenv()
-
- # Path to the audio file
- AUDIO_FILE = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\media\recorded.mp3"
- API_KEY = os.getenv("DG_API_KEY")
-
-
- def speech_to_text():
-     try:
-         # STEP 1: Create a Deepgram client using the API key
-         deepgram = DeepgramClient(API_KEY)
-
-         with open(AUDIO_FILE, "rb") as file:
-             buffer_data = file.read()
-
-         payload: FileSource = {
-             "buffer": buffer_data,
-         }
-
-         # STEP 2: Configure Deepgram options for audio analysis
-         options = PrerecordedOptions(
-             model="nova-2",
-             smart_format=True,
-         )
-
-         # STEP 3: Call the transcribe_file method with the audio payload and options
-         response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
-
-         # STEP 4: Return the transcript of the first channel's best alternative
-         #print(response.to_json(indent=4))
-         return str(response["results"]["channels"][0]["alternatives"][0]["transcript"])
-
-     except Exception as e:
-         print(f"Exception: {e}")
 
utils/tts.py DELETED
@@ -1,38 +0,0 @@
- import requests
- import os
- from dotenv import load_dotenv
-
- load_dotenv()
- API_KEY = os.getenv("DG_API_KEY")
- AUDIO_FILE = r"C:\Users\Naresh Kumar Lahajal\Desktop\FINAL\media\ouput_file.mp3"
-
- def text_to_speech(llm_response):
-     # Define the API endpoint
-     url = "https://api.deepgram.com/v1/speak?model=aura-asteria-en"
-
-     # Define the headers
-     headers = {
-         "Authorization": f"Token {API_KEY}",
-         "Content-Type": "application/json"
-     }
-
-     # Define the payload
-     payload = {
-         "text": llm_response
-     }
-
-     # Make the POST request
-     response = requests.post(url, headers=headers, json=payload)
-
-     # Check if the request was successful
-     if response.status_code == 200:
-         # Save the response content to a file
-         with open(AUDIO_FILE, "wb") as f:
-             f.write(response.content)
-         print("File saved successfully.")
-     else:
-         print(f"Error: {response.status_code} - {response.text}")
-
- # Example usage
- #transcribed_text = "Hello, how can I help you today?"
- #text_to_speech(transcribed_text)
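
A minimal usage sketch for this deleted helper, assuming DG_API_KEY is set; text_to_speech() POSTs the text to Deepgram's /v1/speak endpoint and writes the returned MP3 to the hard-coded AUDIO_FILE path, printing only a status message:

    from utils.tts import text_to_speech

    text_to_speech("Hello, how can I help you today?")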