UPDATE: speed ups
- app.py +2 -9
- functions.py +31 -4
- requirements.txt +1 -1
app.py
CHANGED
@@ -2,12 +2,10 @@ import io
 import os
 import tempfile
 from functions import *
-from langchain_community.document_loaders import PDFMinerLoader
 import pandas as pd
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
-from langchain_community.document_loaders import UnstructuredURLLoader
 from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
@@ -158,8 +156,7 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
-    loader = PDFMinerLoader(temp_file_path)
-    text = loader.load()[0].page_content
+    text = extractTextFromPdf(temp_file_path)
     os.remove(temp_file_path)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
@@ -229,11 +226,7 @@ async def addText(addQaPair: AddQAPair):

 @app.post("/addWebsite")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
-    loader = UnstructuredURLLoader(urls=websiteUrls)
-    docs = loader.load()
-    text = "\n\n".join(
-        [f"{docs[doc].page_content}" for doc in range(len(docs))]
-    )
+    text = extractTextFromUrlList(urls=websiteUrls)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
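
Both endpoints now delegate extraction to helpers added in functions.py (below), replacing the per-request PDFMinerLoader and UnstructuredURLLoader. A minimal sketch of the new /addPDFData flow, assuming FastAPI's UploadFile semantics: the diff's unchanged context writes the pdf argument straight to the temp file, but UploadFile is not bytes, so a working handler would read the upload first (e.g. with await pdf.read()). The response body here is a placeholder, not the endpoint's real return value.

import os
import tempfile

from fastapi import FastAPI, File, UploadFile
from functions import extractTextFromPdf  # PyMuPDF helper added in this commit

app = FastAPI()

@app.post("/addPDFData")
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
    # Persist the upload to disk so PyMuPDF can open it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
        temp_file.write(await pdf.read())  # raw bytes, not the UploadFile object
        temp_file_path = temp_file.name
    text = extractTextFromPdf(temp_file_path)
    os.remove(temp_file_path)
    return {"characters": len(text)}  # placeholder; the real endpoint updates Supabase
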
functions.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
|
2 |
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
3 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
@@ -38,12 +40,12 @@ qdrantClient = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.environ["QD
|
|
38 |
model_kwargs = {"device": "cuda"}
|
39 |
encode_kwargs = {"normalize_embeddings": True}
|
40 |
vectorEmbeddings = HuggingFaceEmbeddings(
|
41 |
-
model_name="
|
42 |
model_kwargs=model_kwargs,
|
43 |
encode_kwargs=encode_kwargs
|
44 |
)
|
45 |
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
|
46 |
-
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25")
|
47 |
prompt = """
|
48 |
INSTRUCTIONS:
|
49 |
=====================================
|
@@ -123,11 +125,11 @@ def addDocuments(text: str, source: str, vectorstore: str):
|
|
123 |
global sparseEmbeddings
|
124 |
global store
|
125 |
parentSplitter = RecursiveCharacterTextSplitter(
|
126 |
-
chunk_size=
|
127 |
add_start_index=True
|
128 |
)
|
129 |
childSplitter = RecursiveCharacterTextSplitter(
|
130 |
-
chunk_size=
|
131 |
add_start_index=True
|
132 |
)
|
133 |
texts = [Document(page_content=text, metadata={"source": source})]
|
@@ -323,3 +325,28 @@ def analyzeData(query, dataframe):
|
|
323 |
return f"data:image/png;base64,{b64string}"
|
324 |
else:
|
325 |
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymupdf
|
2 |
+
from concurrent.futures import ThreadPoolExecutor
|
3 |
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
|
4 |
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
5 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
40 |
model_kwargs = {"device": "cuda"}
|
41 |
encode_kwargs = {"normalize_embeddings": True}
|
42 |
vectorEmbeddings = HuggingFaceEmbeddings(
|
43 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
44 |
model_kwargs=model_kwargs,
|
45 |
encode_kwargs=encode_kwargs
|
46 |
)
|
47 |
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
|
48 |
+
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25", threads = 20 , parallel = 0)
|
49 |
prompt = """
|
50 |
INSTRUCTIONS:
|
51 |
=====================================
|
|
|
125 |
global sparseEmbeddings
|
126 |
global store
|
127 |
parentSplitter = RecursiveCharacterTextSplitter(
|
128 |
+
chunk_size=2000,
|
129 |
add_start_index=True
|
130 |
)
|
131 |
childSplitter = RecursiveCharacterTextSplitter(
|
132 |
+
chunk_size=400,
|
133 |
add_start_index=True
|
134 |
)
|
135 |
texts = [Document(page_content=text, metadata={"source": source})]
|
|
|
325 |
return f"data:image/png;base64,{b64string}"
|
326 |
else:
|
327 |
return response
|
328 |
+
|
329 |
+
|
330 |
+
|
331 |
+
def extractTextFromPage(page):
|
332 |
+
return page.get_text()
|
333 |
+
|
334 |
+
def extractTextFromPdf(pdf_path):
|
335 |
+
doc = pymupdf.open(pdf_path)
|
336 |
+
pages = [doc.load_page(i) for i in range(len(doc))]
|
337 |
+
with ThreadPoolExecutor() as executor:
|
338 |
+
texts = list(executor.map(extractTextFromPage, pages))
|
339 |
+
doc.close()
|
340 |
+
return '.'.join(texts)
|
341 |
+
|
342 |
+
def extractTextFromUrl(url):
|
343 |
+
response = requests.get(url)
|
344 |
+
response.raise_for_status()
|
345 |
+
html = response.text
|
346 |
+
soup = BeautifulSoup(html, 'lxml')
|
347 |
+
return soup.get_text(separator=' ', strip=True)
|
348 |
+
|
349 |
+
def extractTextFromUrlList(urls):
|
350 |
+
with ThreadPoolExecutor() as executor:
|
351 |
+
texts = list(executor.map(extractTextFromUrl, urls))
|
352 |
+
return '.'.join(texts)
|
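
The new helpers fan out work with ThreadPoolExecutor. URL fetches are I/O-bound, so threads overlap the network waits, which is where most of the speedup comes from; for the PDF path, note that PyMuPDF does not document Document objects as thread-safe, so extracting pages of one document across threads trades safety for speed. Also note both helpers glue their pieces together with '.'.join, so page and document boundaries become bare periods. A self-contained sketch of the URL path for experimenting outside the Space; the example.com URLs are placeholders, and the timeout is an addition not present in the commit:

import time
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def extractTextFromUrl(url):
    response = requests.get(url, timeout=30)  # timeout added here; the commit omits it
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.get_text(separator=' ', strip=True)


def extractTextFromUrlList(urls):
    # I/O-bound fetches overlap in a thread pool instead of running serially.
    with ThreadPoolExecutor() as executor:
        texts = list(executor.map(extractTextFromUrl, urls))
    return '.'.join(texts)  # the commit joins documents with a period


if __name__ == '__main__':
    urls = ['https://example.com', 'https://example.org']  # placeholders
    start = time.time()
    text = extractTextFromUrlList(urls)
    print(f"{len(text)} chars in {time.time() - start:.2f}s")
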
requirements.txt
CHANGED
@@ -73,6 +73,7 @@ fastembed-gpu
 nest_asyncio
 beautifulsoup4
 flashrank
+PyMuPDF
 langchain
 langchain-community
 langchain-cohere
@@ -80,7 +81,6 @@ langchain-huggingface
 langchain-qdrant
 langchain-groq
 lxml
-pdfminer.six
 python-dotenv
 pillow
 pandas
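
Since pdfminer.six is dropped in the same commit that adds PyMuPDF, a quick sanity check (a sketch, not part of the repo) confirms the replacement dependency resolves; PyMuPDF installs the pymupdf module (importable as fitz in older releases):

import pymupdf

doc = pymupdf.open()   # new, empty in-memory PDF
doc.new_page()
print(doc.page_count)  # -> 1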