Rauhan committed on
Commit 4c37639
1 Parent(s): 212a8b1

UPDATE: speed ups

Files changed (3):
  1. app.py +2 -9
  2. functions.py +31 -4
  3. requirements.txt +1 -1
app.py CHANGED
@@ -2,12 +2,10 @@ import io
 import os
 import tempfile
 from functions import *
-from langchain_community.document_loaders import PDFMinerLoader
 import pandas as pd
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
-from langchain_community.document_loaders import UnstructuredURLLoader
 from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
@@ -158,8 +156,7 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
-    loader = PDFMinerLoader(file_path = temp_file_path, concatenate_pages = True)
-    text = loader.load()[0].page_content
+    text = extractTextFromPdf(temp_file_path)
     os.remove(temp_file_path)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
@@ -229,11 +226,7 @@ async def addText(addQaPair: AddQAPair):
 
 @app.post("/addWebsite")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
-    loader = UnstructuredURLLoader(urls=websiteUrls)
-    docs = loader.load()
-    text = "\n\n".join(
-        [f"{docs[doc].page_content}" for doc in range(len(docs))]
-    )
+    text = extractTextFromUrlList(urls = websiteUrls)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
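
Note on the addPDFData hunk above: pdf is declared as a FastAPI UploadFile, so
temp_file.write(pdf) only works if the raw bytes were already read (e.g. via
await pdf.read() earlier in the handler, outside this hunk). A minimal sketch of
that step under that assumption; the helper name is hypothetical, not part of
this commit:

import tempfile
from fastapi import UploadFile

# Hypothetical helper sketching the temp-file step of addPDFData.
# Assumption: the UploadFile must be read to bytes before writing.
async def saveUploadToTempFile(pdf: UploadFile) -> str:
    data = await pdf.read()  # raw PDF bytes from the request body
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(data)
        return temp_file.name  # caller removes the file after extraction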
functions.py CHANGED
@@ -1,3 +1,5 @@
+import pymupdf
+from concurrent.futures import ThreadPoolExecutor
 from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -38,12 +40,12 @@ qdrantClient = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.environ["QD
 model_kwargs = {"device": "cuda"}
 encode_kwargs = {"normalize_embeddings": True}
 vectorEmbeddings = HuggingFaceEmbeddings(
-    model_name="BAAI/bge-m3",
+    model_name="sentence-transformers/all-MiniLM-L6-v2",
     model_kwargs=model_kwargs,
     encode_kwargs=encode_kwargs
 )
 reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
-sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25")
+sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25", threads = 20 , parallel = 0)
 prompt = """
 INSTRUCTIONS:
 =====================================
@@ -123,11 +125,11 @@ def addDocuments(text: str, source: str, vectorstore: str):
     global sparseEmbeddings
     global store
     parentSplitter = RecursiveCharacterTextSplitter(
-        chunk_size=2100,
+        chunk_size=2000,
         add_start_index=True
     )
     childSplitter = RecursiveCharacterTextSplitter(
-        chunk_size=300,
+        chunk_size=400,
         add_start_index=True
     )
     texts = [Document(page_content=text, metadata={"source": source})]
@@ -323,3 +325,28 @@ def analyzeData(query, dataframe):
         return f"data:image/png;base64,{b64string}"
     else:
         return response
+
+
+
+def extractTextFromPage(page):
+    return page.get_text()
+
+def extractTextFromPdf(pdf_path):
+    doc = pymupdf.open(pdf_path)
+    pages = [doc.load_page(i) for i in range(len(doc))]
+    with ThreadPoolExecutor() as executor:
+        texts = list(executor.map(extractTextFromPage, pages))
+    doc.close()
+    return '.'.join(texts)
+
+def extractTextFromUrl(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    html = response.text
+    soup = BeautifulSoup(html, 'lxml')
+    return soup.get_text(separator=' ', strip=True)
+
+def extractTextFromUrlList(urls):
+    with ThreadPoolExecutor() as executor:
+        texts = list(executor.map(extractTextFromUrl, urls))
+    return '.'.join(texts)
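
For reference, a minimal usage sketch of the helpers added above. The PDF path
and URLs are placeholders, and requests and BeautifulSoup are assumed to be
imported elsewhere in functions.py, since this hunk does not add them:

import time

# Time the new threaded extractors; "sample.pdf" and the URLs are placeholders.
start = time.perf_counter()
pdfText = extractTextFromPdf("sample.pdf")
print(f"PDF: {len(pdfText)} chars in {time.perf_counter() - start:.2f}s")

start = time.perf_counter()
webText = extractTextFromUrlList(["https://example.com", "https://example.org"])
print(f"URLs: {len(webText)} chars in {time.perf_counter() - start:.2f}s")

The URL fetches are network-bound, so the thread pool should pay off there; the
gain on PDF pages depends on how much of page.get_text() releases the GIL,
which makes timing it as above worthwhile.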
requirements.txt CHANGED
@@ -73,6 +73,7 @@ fastembed-gpu
 nest_asyncio
 beautifulsoup4
 flashrank
+PyMuPDF
 langchain
 langchain-community
 langchain-cohere
@@ -80,7 +81,6 @@ langchain-huggingface
 langchain-qdrant
 langchain-groq
 lxml
-pdfminer.six
 python-dotenv
 pillow
 pandas
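
One packaging note on the swap above: the new code uses "import pymupdf", the
module name exposed by recent PyMuPDF releases (older versions import it as
fitz), so the unpinned PyMuPDF entry assumes a reasonably current install. A
quick sanity check, with "sample.pdf" as a placeholder:

import pymupdf  # provided by the PyMuPDF package; older releases use "import fitz"

doc = pymupdf.open("sample.pdf")  # placeholder file name
print(doc.page_count)             # pages PyMuPDF can see
doc.close()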