abdul-rafey committed • ce6fba2
Parent(s): 9c0e4cf
implement semantic-search and upload
Files changed:
- app/main.py +91 -0
- app/pinecone.py +21 -0
- app/utils/data_cleaner.py +110 -0
- config.py +12 -0
- requirements.txt +79 -0
app/main.py
ADDED
@@ -0,0 +1,91 @@
from uuid import uuid4
from fastapi import FastAPI, HTTPException, File, UploadFile, status
from fastapi.responses import FileResponse
import os


from app.utils.data_cleaner import read_pdf, text_to_pdf
from app.pinecone import pinecone_index
from app.semantic_search import generate_embeddings, get_results, insert_new_document
from config import settings

app = FastAPI()


UPLOAD_DIRECTORY = "./static"
if not os.path.exists(UPLOAD_DIRECTORY):
    os.makedirs(UPLOAD_DIRECTORY)


# TODO: Remove this extra endpoint
@app.get("/")
async def root():
    return {"message": "Hello World!"}

# TODO: replace "search" with "docs"
@app.get("/search")
async def search(query: str):
    result = await get_results(query)
    return result


@app.post("/upload")
async def upload(
    title: str,
    file: UploadFile = File(...),
):
    if file.content_type != "application/pdf":
        raise HTTPException(
            status_code=400,
            detail="Only PDF files are allowed."
        )
    await insert_new_document(title, file)
    return {"success": True, "message": "File uploaded!"}


@app.get("/media/{filename}")
async def get_file(filename: str):  # parameter name must match the path placeholder
    pdf_file_path = f"./media/{filename}"
    if not os.path.exists(pdf_file_path):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="PDF file not found"
        )
    return FileResponse(pdf_file_path, media_type='application/pdf', filename=filename)


@app.get("/generate_db")
async def generate_db():
    await insert_all_pdfs_to_pinecone()
    return {"message": "success"}


async def insert_all_pdfs_to_pinecone():
    # NOTE: Requires one PDF per case study stored in a directory named "media".
    # Use our functions in data_cleaner.py to create PDFs automatically from a
    # list of sample text files. The data in the text files is cleaned
    # automatically to find case studies and is saved in PDF format.

    media_dir = os.path.join(settings.BASE_DIR, settings.MEDIA)
    if not os.path.exists(media_dir):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Directory does not exist"
        )
    for filename in os.listdir(media_dir):
        if filename.endswith('.pdf'):
            content = read_pdf(os.path.join(media_dir, filename))
            pos = content.index('Industry:')
            title = content[:pos].replace('\n', '')

            content = content.replace('\n', ' ')  # NOTE: this should be done before extracting the title
            embedding = generate_embeddings(content)
            index = pinecone_index()
            uid = str(uuid4())
            fname = filename.split('_')[-1]
            path = f"{uid}_{fname}"
            text_to_pdf(content, os.path.join(media_dir, path))
            index.upsert([(uid, embedding.tolist(), {"title": title, "path": path, "filename": filename})])
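
Note that app/main.py imports generate_embeddings, get_results, and insert_new_document from app/semantic_search, but that module is not part of this diff. A minimal sketch of what it might look like, assuming sentence-transformers' all-MiniLM-L6-v2 (whose 384-dimensional output matches the index dimension declared in app/pinecone.py); everything here beyond the three imported names is a guess, not the author's code:

# Hypothetical sketch of app/semantic_search.py -- not included in this commit.
# Assumes all-MiniLM-L6-v2 (384 dims, matching the index in app/pinecone.py).
from uuid import uuid4

from sentence_transformers import SentenceTransformer

from app.pinecone import pinecone_index
from app.utils.data_cleaner import read_pdf

model = SentenceTransformer("all-MiniLM-L6-v2")


def generate_embeddings(text: str):
    # Returns a numpy array; callers convert with .tolist() before upserting.
    return model.encode(text)


async def get_results(query: str, top_k: int = 5):
    index = pinecone_index()
    embedding = generate_embeddings(query)
    response = index.query(vector=embedding.tolist(), top_k=top_k, include_metadata=True)
    return [{"score": m.score, **(m.metadata or {})} for m in response.matches]


async def insert_new_document(title: str, file):
    # file is a FastAPI UploadFile; its .file attribute is a readable stream
    # that PyPDF2 can consume directly.
    content = read_pdf(file.file).replace("\n", " ")
    embedding = generate_embeddings(content)
    uid = str(uuid4())
    pinecone_index().upsert([(uid, embedding.tolist(), {"title": title, "filename": file.filename})])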
app/pinecone.py
ADDED
@@ -0,0 +1,21 @@
from pinecone import Pinecone, ServerlessSpec
from config import settings

pc = Pinecone(api_key=settings.PINECONE_API_KEY)

index_name = "case-studies-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

def pinecone_index():
    return pc.Index(index_name)
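
A quick way to sanity-check the helper; the ID, vector values, and metadata below are placeholders, and serverless indexes are eventually consistent, so the query may lag the upsert by a moment:

# Smoke test for pinecone_index(); "smoke-test" and the vector are made up.
from app.pinecone import pinecone_index

index = pinecone_index()
index.upsert([("smoke-test", [0.1] * 384, {"title": "probe"})])
res = index.query(vector=[0.1] * 384, top_k=1, include_metadata=True)
print(res.matches[0].id, res.matches[0].metadata)  # -> smoke-test {'title': 'probe'}
index.delete(ids=["smoke-test"])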
app/utils/data_cleaner.py
ADDED
@@ -0,0 +1,110 @@
import textwrap
from uuid import uuid4
import PyPDF2
from bs4 import BeautifulSoup
from config import settings
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas


def read_pdf(file):
    """
    Read and return the text from a PDF.
    """
    reader = PyPDF2.PdfReader(file)
    pdf_text = ""
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        pdf_text += page.extract_text() + "\n"
    return pdf_text


def text_to_pdf(text, output_filename):
    """
    Create a PDF file with the given text.
    """
    c = canvas.Canvas(output_filename, pagesize=letter)
    width, height = letter
    max_width = width - 200
    avg_char_width = 7  # Adjust this value as needed based on font size
    max_chars_per_line = max_width // avg_char_width
    wrapped_lines = []
    for line in text.split('\n'):
        wrapped_lines.extend(textwrap.wrap(line, width=max_chars_per_line))  # Adjust width as needed

    y_position = height - 100  # Start from the top of the page

    for line in wrapped_lines:
        if y_position < 50:  # start a new page instead of drawing off the bottom edge
            c.showPage()
            y_position = height - 100
        c.drawString(100, y_position, line)
        y_position -= 15
    c.save()


def clean_html(html_content: str):
    """
    Extract all the text from HTML.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for script in soup(["script", "style"]):
        script.decompose()
    main_content = soup.find_all(['h1', 'h2', 'h3', 'p'])
    text = '\n'.join([tag.get_text(separator=" ", strip=True) for tag in main_content])
    return text


def clean_txt(txt_content: str):
    content = txt_content.split('\n')
    cleaned_content = []
    for line in content:
        if line:
            cleaned_content.append(line.strip())
    return cleaned_content


async def make_case_study_pdfs():
    case_studies = []
    with open(os.path.join(settings.BASE_DIR, 'samples', 'case_studies.txt'), 'r') as f:
        case_studies = f.read().split("CASE_STUDY /\n")
    # save these as new pdf files
    for i in range(1, 34):
        uid = str(uuid4())
        filename = str(i) + '.pdf'
        filename = filename.replace(" ", "_")
        filename = f"{uid}_{filename}"
        filepath = os.path.join(settings.BASE_DIR, settings.MEDIA, filename)
        content = case_studies[i]
        text_to_pdf(content, filepath)


def generate_case_studies():
    '''
    Extract case studies from the given samples and store them in a text file.
    NOTE: Must be run only once.
    '''
    for filename in os.listdir(os.path.join(settings.BASE_DIR, 'samples')):
        if filename.endswith('.txt'):
            with open(
                os.path.join(settings.BASE_DIR, 'samples', filename),
                'r',
                encoding='utf-8'
            ) as file:
                content = file.read()
            cleaned_text = clean_txt(content)
            # Find the start and end positions of the case study in the sample document.
            start_idx = cleaned_text.index('customer stories /')
            end_idx = cleaned_text.index(
                'We’re proud to be recognized as an industry leader, view our full list of honors to learn more.'
            )
            case_study = cleaned_text[start_idx:end_idx]
            with open(os.path.join(settings.BASE_DIR, 'samples', 'case_studies.txt'), 'a') as f:
                for line in case_study:
                    f.write(line)
                    f.write('\n')
    # NOTE: A little cleaning is done manually after this :D
    # But that can also be automated. A low priority task for now!

# RUN ONLY ONCE
# generate_case_studies()
# make_case_study_pdfs()
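
A quick round trip over the two PDF helpers; the sample text and file name are arbitrary, chosen only to exercise the 'Industry:' marker that main.py looks for:

# Round-trip check for text_to_pdf / read_pdf; "sample.pdf" is an arbitrary name.
sample = "Industry: Retail\nAcme cut infrastructure costs by 40% after migrating."
text_to_pdf(sample, "sample.pdf")
print(read_pdf("sample.pdf"))  # wrapped lines come back joined with "\n"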
config.py
ADDED
@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file='.config.env')

    BASE_DIR: str
    MEDIA: str
    PINECONE_API_KEY: str


settings = Settings()
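
Settings loads its three required values from .config.env. A placeholder example of that file (the values below are illustrative, not from the commit):

# .config.env -- placeholder values
BASE_DIR=/path/to/project
MEDIA=media
PINECONE_API_KEY=your-pinecone-api-key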
requirements.txt
ADDED
@@ -0,0 +1,79 @@
annotated-types==0.7.0
anyio==4.3.0
beautifulsoup4==4.12.3
certifi==2024.2.2
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
dnspython==2.6.1
email_validator==2.1.1
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
filelock==3.14.0
fsspec==2024.5.0
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.23.1
idna==3.7
Jinja2==3.1.4
joblib==1.4.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.5.40
nvidia-nvtx-cu12==12.1.105
orjson==3.10.3
packaging==24.0
pillow==10.3.0
pinecone-client==4.1.0
pydantic==2.7.1
pydantic-settings==2.2.1
pydantic_core==2.18.2
Pygments==2.18.0
PyPDF2==3.0.1
python-dotenv==1.0.1
python-multipart==0.0.9
PyYAML==6.0.1
regex==2024.5.15
reportlab==4.2.0
requests==2.32.2
rich==13.7.1
safetensors==0.4.3
scikit-learn==1.5.0
scipy==1.13.1
sentence-transformers==2.7.0
shellingham==1.5.4
sniffio==1.3.1
soupsieve==2.5
starlette==0.37.2
sympy==1.12
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.3.0
tqdm==4.66.4
transformers==4.41.1
triton==2.3.0
typer==0.12.3
typing_extensions==4.11.0
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.29.0
uvloop==0.19.0
watchfiles==0.21.0
websockets==12.0