did it work?
Browse files
- App/Embedding/utils/Initialize.py +19 -18
- requirements.txt +5 -5
App/Embedding/utils/Initialize.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from langchain.embeddings import HuggingFaceEmbeddings
|
2 |
from langchain.docstore.document import Document
|
3 |
-
from
|
4 |
from pinecone import ServerlessSpec
|
5 |
import pinecone
|
6 |
import os
|
@@ -18,28 +18,29 @@ def initDocument():
|
|
18 |
model_name = "thenlper/gte-base"
|
19 |
embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
20 |
|
|
|
|
|
21 |
try:
|
22 |
-
pc
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
)
|
30 |
-
|
31 |
-
|
32 |
-
#docsearch = Pinecone.from_existing_index(index_name, embeddings)
|
33 |
|
34 |
-
|
|
|
35 |
|
36 |
|
37 |
|
38 |
|
39 |
async def delete_documents(task_id):
|
40 |
-
|
41 |
|
42 |
-
|
43 |
filter={
|
44 |
"task_id": {"$eq": task_id},
|
45 |
}
|
@@ -68,12 +69,12 @@ def generateChunks(chunks, task_id, n=100):
|
|
68 |
|
69 |
|
70 |
def search(query: str, task_id: str):
|
71 |
-
|
72 |
|
73 |
filtering_conditions = {
|
74 |
"task_id": {"$eq": task_id},
|
75 |
}
|
76 |
-
data =
|
77 |
return [
|
78 |
{"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
|
79 |
for d in data
|
@@ -82,8 +83,8 @@ def search(query: str, task_id: str):
|
|
82 |
|
83 |
|
84 |
def encode(temp: list[Document]):
|
85 |
-
|
86 |
-
|
87 |
# return embeddings.embed_documents(texts = [d.page_content for d in temp])
|
88 |
|
89 |
|
|
|
1 |
from langchain.embeddings import HuggingFaceEmbeddings
|
2 |
from langchain.docstore.document import Document
|
3 |
+
from langchain_pinecone import PineconeVectorStore
|
4 |
from pinecone import ServerlessSpec
|
5 |
import pinecone
|
6 |
import os
|
|
|
18 |
model_name = "thenlper/gte-base"
|
19 |
embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
20 |
|
21 |
+
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
|
22 |
+
|
23 |
try:
|
24 |
+
index = pc.Index(index_name)
|
25 |
+
except pinecone.core.client.exceptions.NotFoundException:
|
26 |
+
pc.create_index(
|
27 |
+
name=index_name,
|
28 |
+
dimension=768,
|
29 |
+
metric="cosine",
|
30 |
+
spec=ServerlessSpec(cloud="aws", region="us-east-1")
|
31 |
)
|
32 |
+
index = pc.Index(index_name)
|
|
|
|
|
33 |
|
34 |
+
vector_store = PineconeVectorStore(index=index, embedding=embeddings)
|
35 |
+
return vector_store
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
async def delete_documents(task_id):
|
41 |
+
vector_store = initDocument()
|
42 |
|
43 |
+
vector_store.delete(
|
44 |
filter={
|
45 |
"task_id": {"$eq": task_id},
|
46 |
}
|
|
|
69 |
|
70 |
|
71 |
def search(query: str, task_id: str):
|
72 |
+
vector_store = initDocument()
|
73 |
|
74 |
filtering_conditions = {
|
75 |
"task_id": {"$eq": task_id},
|
76 |
}
|
77 |
+
data =vector_store.similarity_search(query, k=3, filter=filtering_conditions)
|
78 |
return [
|
79 |
{"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
|
80 |
for d in data
|
|
|
83 |
|
84 |
|
85 |
def encode(temp: list[Document]):
|
86 |
+
vector_store = initDocument()
|
87 |
+
vector_store.add_documents(temp)
|
88 |
# return embeddings.embed_documents(texts = [d.page_content for d in temp])
|
89 |
|
90 |
|
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
asyncpg==0.27.0
|
2 |
databases==0.7.0
|
3 |
-
fastapi
|
4 |
orm==0.3.
|
5 |
transformers
|
6 |
faster-whisper
|
@@ -13,7 +13,7 @@ typesystem==0.3.1
|
|
13 |
Werkzeug==2.2.2
|
14 |
passlib # for password hashing
|
15 |
pydantic[email]
|
16 |
-
uvicorn
|
17 |
ujson
|
18 |
yt-dlp
|
19 |
psutil
|
@@ -27,12 +27,12 @@ telethon
|
|
27 |
fastapi-jwt-auth
|
28 |
bcrypt
|
29 |
aiomysql
|
30 |
-
|
31 |
asyncpg
|
32 |
sentence_transformers
|
33 |
google-generativeai
|
34 |
openai
|
35 |
tiktoken
|
36 |
-
langchain
|
37 |
mysqlclient
|
38 |
-
pinecone-client[grpc]
|
|
|
1 |
asyncpg==0.27.0
|
2 |
databases==0.7.0
|
3 |
+
fastapi
|
4 |
orm==0.3.
|
5 |
transformers
|
6 |
faster-whisper
|
|
|
13 |
Werkzeug==2.2.2
|
14 |
passlib # for password hashing
|
15 |
pydantic[email]
|
16 |
+
uvicorn
|
17 |
ujson
|
18 |
yt-dlp
|
19 |
psutil
|
|
|
27 |
fastapi-jwt-auth
|
28 |
bcrypt
|
29 |
aiomysql
|
30 |
+
langchain-pinecone
|
31 |
asyncpg
|
32 |
sentence_transformers
|
33 |
google-generativeai
|
34 |
openai
|
35 |
tiktoken
|
36 |
+
langchain
|
37 |
mysqlclient
|
38 |
+
pinecone-client[grpc]
|