Rauhan committed on
Commit
88d2fdc
1 Parent(s): 6febb6b

UPDATE: New Endpoints

Browse files
Files changed (4) hide show
  1. Dockerfile +3 -1
  2. app.py +18 -1
  3. functions.py +20 -1
  4. requirements.txt +4 -0
Dockerfile CHANGED
@@ -10,7 +10,9 @@ RUN apt-get update && apt-get install -y \
10
  build-essential \
11
  cmake \
12
  && apt-get clean \
13
- && rm -rf /var/lib/apt/lists/*
 
 
14
 
15
  RUN mkdir -p /app/nltk_data && chmod -R 777 /app/nltk_data
16
 
 
10
  build-essential \
11
  cmake \
12
  && apt-get clean \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
# The apt package lists were deleted by the previous RUN step, so they must be
# refreshed here before poppler-utils can be located and installed.
RUN apt-get update && apt-get install -y poppler-utils \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
16
 
17
  RUN mkdir -p /app/nltk_data && chmod -R 777 /app/nltk_data
18
 
app.py CHANGED
@@ -8,7 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
8
  from langchain_community.document_loaders import UnstructuredURLLoader
9
 
10
 
11
-
12
  app = FastAPI(title = "ConversAI", root_path = "/api/v1")
13
  app.add_middleware(
14
  CORSMiddleware,
@@ -64,6 +63,24 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
64
  }
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  @app.post("/addText")
68
  async def addText(vectorstore: str, text: str):
69
  username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
 
8
  from langchain_community.document_loaders import UnstructuredURLLoader
9
 
10
 
 
11
  app = FastAPI(title = "ConversAI", root_path = "/api/v1")
12
  app.add_middleware(
13
  CORSMiddleware,
 
63
  }
64
 
65
 
66
@app.post("/addImagePDF")
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
    """Add the OCR-extracted text of an uploaded image PDF to a vector store.

    The upload is read into memory and passed to ``getTextFromImagePDF``. The
    text is stored through ``addDocuments`` only when the chatbot's current
    character count plus the length of the new text is strictly below the
    user's ``tokenLimit``; in that case the stored character count is updated
    first. Otherwise a dict carrying an "output" message is returned and
    nothing is written.
    """
    # NOTE(review): another handler in this file appears to use the name
    # addPDFData as well - confirm that sharing the name is intended.
    pdfBytes = await pdf.read()
    text = getTextFromImagePDF(pdfBytes = pdfBytes)

    # The 2nd and 3rd "-"-separated fields of the vectorstore name are used as
    # the username and the chatbot name.
    nameParts = vectorstore.split("-")
    username, chatbotname = nameParts[1], nameParts[2]

    chatbotInfo = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
    isThisChatbot = (chatbotInfo["username"] == username) & (chatbotInfo["chatbotname"] == chatbotname)
    currentCount = chatbotInfo[isThisChatbot]["charactercount"].iloc[0]

    userConfigRows = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data
    limit = userConfigRows[0]["tokenLimit"]

    newCount = currentCount + len(text)
    if not newCount < int(limit):
        return {
            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
        }

    # Record the new character count before the text is added to the store.
    chatbotTable = client.table("ConversAI_ChatbotInfo")
    chatbotTable.update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
    return addDocuments(text = text, vectorstore = vectorstore)
82
+
83
+
84
  @app.post("/addText")
85
  async def addText(vectorstore: str, text: str):
86
  username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
functions.py CHANGED
@@ -18,6 +18,9 @@ from langchain.retrievers.document_compressors import FlashrankRerank
18
  from supabase.client import create_client
19
  from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
 
 
 
21
  from bs4 import BeautifulSoup
22
  from urllib.parse import urlparse, urljoin
23
  from supabase import create_client
@@ -37,6 +40,7 @@ vectorEmbeddings = HuggingFaceEmbeddings(
37
  model_kwargs = model_kwargs,
38
  encode_kwargs = encode_kwargs
39
  )
 
40
  sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
41
  prompt = """
42
  INSTRUCTIONS:
@@ -282,4 +286,19 @@ def getLinks(url: str, timeout = 30):
282
  break
283
  else:
284
  uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
285
- return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  from supabase.client import create_client
19
  from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
21
+ from pdf2image import convert_from_bytes
22
+ import numpy as np
23
+ from paddleocr import PaddleOCR
24
  from bs4 import BeautifulSoup
25
  from urllib.parse import urlparse, urljoin
26
  from supabase import create_client
 
40
  model_kwargs = model_kwargs,
41
  encode_kwargs = encode_kwargs
42
  )
43
+ ocr = PaddleOCR(use_angle_cls=True, lang='en')
44
  sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
45
  prompt = """
46
  INSTRUCTIONS:
 
286
  break
287
  else:
288
  uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
289
+ return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
290
+
291
+
292
def getTextFromImagePDF(pdfBytes):
    """Extract text from an image-based PDF with OCR.

    Every page is rendered to an image with ``convert_from_bytes`` and passed,
    as a NumPy array, to the module-level ``ocr`` engine.

    Args:
        pdfBytes: Raw bytes of the PDF document.

    Returns:
        A single string: the recognised lines of each page joined by one
        newline, and the pages joined by three newlines. A page on which
        nothing was recognised contributes an empty string.
    """
    pageWiseText = []
    # Convert and OCR one page at a time so a second full list of arrays is
    # never held in memory alongside the rendered images.
    for image in convert_from_bytes(pdfBytes):
        result = ocr.ocr(np.array(image))
        # The first entry carries the detections for this image; guard against
        # an empty/None result as well as a falsy first entry.
        detections = result[0] if result else None
        if detections:
            # Each detection's [1][0] element is the recognised text.
            retrievedText = "\n".join(detection[1][0] for detection in detections)
        else:
            retrievedText = ""
        pageWiseText.append(retrievedText)
    return "\n\n\n".join(pageWiseText)
requirements.txt CHANGED
@@ -12,10 +12,14 @@ langchain-qdrant
12
  langchain-groq
13
  langsmith
14
  lxml
 
15
  PyPDF2
16
  python-dotenv
17
  pydantic
18
  pandas
 
 
 
19
  sentence-transformers
20
  supabase
21
  unstructured
 
12
  langchain-groq
13
  langsmith
14
  lxml
15
+ numpy
16
  PyPDF2
17
  python-dotenv
18
  pydantic
19
  pandas
20
+ paddlepaddle-gpu
21
+ paddleocr
22
+ pdf2image
23
  sentence-transformers
24
  supabase
25
  unstructured