abdul-rafey committed • ce6fba2
Parent(s): 9c0e4cf
implement semantic-search and upload
Files changed:
- app/main.py +91 -0
- app/pinecone.py +21 -0
- app/utils/data_cleaner.py +110 -0
- config.py +12 -0
- requirements.txt +79 -0
app/main.py
ADDED
@@ -0,0 +1,91 @@
from uuid import uuid4
from fastapi import FastAPI, HTTPException, File, UploadFile, status
from fastapi.responses import FileResponse
import os


from app.utils.data_cleaner import read_pdf, text_to_pdf
from app.pinecone import pinecone_index
from app.semantic_search import generate_embeddings, get_results, insert_new_document
from config import settings

app = FastAPI()


UPLOAD_DIRECTORY = "./static"
if not os.path.exists(UPLOAD_DIRECTORY):
    os.makedirs(UPLOAD_DIRECTORY)


# TODO: Remove this extra endpoint
@app.get("/")
async def root():
    return {"message": "Hello World!"}

# TODO: replace "search" with "docs"
@app.get("/search")
async def search(query: str):
    result = await get_results(query)
    return result


@app.post("/upload")
async def upload(
    title: str,
    file: UploadFile = File(...),
):
    if file.content_type != "application/pdf":
        raise HTTPException(
            status_code=400,
            detail="Only PDF files are allowed."
        )
    await insert_new_document(title, file)
    return {"success": True, "message": "File uploaded!"}


@app.get("/media/{filename}")
async def get_file(filename: str):  # parameter name must match the path placeholder
    pdf_file_path = f"./media/{filename}"
    if not os.path.exists(pdf_file_path):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="PDF file not found"
        )
    return FileResponse(pdf_file_path, media_type='application/pdf', filename=filename)


@app.get("/generate_db")
async def generate_db():
    await insert_all_pdfs_to_pinecone()
    return {"message": "success"}


async def insert_all_pdfs_to_pinecone():
    # NOTE: Requires one PDF per case study stored in a directory named "media".
    # Use our functions in data_cleaner.py to create PDFs automatically from a
    # list of sample text files. The data in the text files is cleaned
    # automatically to find case studies and is saved in PDF format.

    media_dir = os.path.join(settings.BASE_DIR, settings.MEDIA)
    if not os.path.exists(media_dir):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Directory does not exist"
        )
    for filename in os.listdir(media_dir):
        if filename.endswith('.pdf'):
            content = read_pdf(os.path.join(media_dir, filename))
            pos = content.index('Industry:')
            title = content[:pos].replace('\n', '')

            content = content.replace('\n', ' ')  # NOTE: this should be done before extracting the title
            embedding = generate_embeddings(content)
            index = pinecone_index()
            uid = str(uuid4())
            fname = filename.split('_')[-1]
            path = f"{uid}_{fname}"
            text_to_pdf(content, os.path.join(media_dir, path))
            index.upsert([(uid, embedding.tolist(), {"title": title, "path": path, "filename": filename})])
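
Note that app/main.py imports generate_embeddings, get_results, and insert_new_document from app/semantic_search, but that module is not part of this diff. A minimal sketch of what it might look like, assuming sentence-transformers' all-MiniLM-L6-v2 (whose 384-dimensional output matches the index dimension declared in app/pinecone.py); everything here beyond the three imported names is a guess, not the author's code:

# Hypothetical sketch of app/semantic_search.py -- not included in this commit.
# Assumes all-MiniLM-L6-v2 (384 dims, matching the index in app/pinecone.py).
from uuid import uuid4

from sentence_transformers import SentenceTransformer

from app.pinecone import pinecone_index
from app.utils.data_cleaner import read_pdf

model = SentenceTransformer("all-MiniLM-L6-v2")


def generate_embeddings(text: str):
    # Returns a numpy array; callers convert with .tolist() before upserting.
    return model.encode(text)


async def get_results(query: str, top_k: int = 5):
    index = pinecone_index()
    embedding = generate_embeddings(query)
    response = index.query(vector=embedding.tolist(), top_k=top_k, include_metadata=True)
    return [{"score": m.score, **(m.metadata or {})} for m in response.matches]


async def insert_new_document(title: str, file):
    # file is a FastAPI UploadFile; its .file attribute is a readable stream
    # that PyPDF2 can consume directly.
    content = read_pdf(file.file).replace("\n", " ")
    embedding = generate_embeddings(content)
    uid = str(uuid4())
    pinecone_index().upsert([(uid, embedding.tolist(), {"title": title, "filename": file.filename})])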
app/pinecone.py
ADDED
@@ -0,0 +1,21 @@
from pinecone import Pinecone, ServerlessSpec
from config import settings

pc = Pinecone(api_key=settings.PINECONE_API_KEY)

index_name = "case-studies-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

def pinecone_index():
    return pc.Index(index_name)
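
A quick way to sanity-check the helper; the ID, vector values, and metadata below are placeholders, and serverless indexes are eventually consistent, so the query may lag the upsert by a moment:

# Smoke test for pinecone_index(); "smoke-test" and the vector are made up.
from app.pinecone import pinecone_index

index = pinecone_index()
index.upsert([("smoke-test", [0.1] * 384, {"title": "probe"})])
res = index.query(vector=[0.1] * 384, top_k=1, include_metadata=True)
print(res.matches[0].id, res.matches[0].metadata)  # -> smoke-test {'title': 'probe'}
index.delete(ids=["smoke-test"])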
app/utils/data_cleaner.py
ADDED
@@ -0,0 +1,110 @@
import textwrap
from uuid import uuid4
import PyPDF2
from bs4 import BeautifulSoup
from config import settings
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas


def read_pdf(file):
    """
    Read and return the text from a PDF.
    """
    reader = PyPDF2.PdfReader(file)
    pdf_text = ""
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        pdf_text += page.extract_text() + "\n"
    return pdf_text


def text_to_pdf(text, output_filename):
    """
    Create a PDF file with the given text.
    """
    c = canvas.Canvas(output_filename, pagesize=letter)
    width, height = letter
    max_width = width - 200
    avg_char_width = 7  # Adjust this value as needed based on font size
    max_chars_per_line = max_width // avg_char_width
    wrapped_lines = []
    for line in text.split('\n'):
        wrapped_lines.extend(textwrap.wrap(line, width=max_chars_per_line))  # Adjust width as needed

    y_position = height - 100  # Start from the top of the page

    for line in wrapped_lines:
        if y_position < 50:  # start a new page instead of drawing off the bottom edge
            c.showPage()
            y_position = height - 100
        c.drawString(100, y_position, line)
        y_position -= 15
    c.save()


def clean_html(html_content: str):
    """
    Extract all the text from HTML.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for script in soup(["script", "style"]):
        script.decompose()
    main_content = soup.find_all(['h1', 'h2', 'h3', 'p'])
    text = '\n'.join([tag.get_text(separator=" ", strip=True) for tag in main_content])
    return text


def clean_txt(txt_content: str):
    content = txt_content.split('\n')
    cleaned_content = []
    for line in content:
        if line:
            cleaned_content.append(line.strip())
    return cleaned_content


async def make_case_study_pdfs():
    case_studies = []
    with open(os.path.join(settings.BASE_DIR, 'samples', 'case_studies.txt'), 'r') as f:
        case_studies = f.read().split("CASE_STUDY /\n")
    # save these as new pdf files
    for i in range(1, 34):
        uid = str(uuid4())
        filename = str(i) + '.pdf'
        filename = filename.replace(" ", "_")
        filename = f"{uid}_{filename}"
        filepath = os.path.join(settings.BASE_DIR, settings.MEDIA, filename)
        content = case_studies[i]
        text_to_pdf(content, filepath)


def generate_case_studies():
    '''
    Extract case studies from the given samples and store them in a text file.
    NOTE: Must be run only once.
    '''
    for filename in os.listdir(os.path.join(settings.BASE_DIR, 'samples')):
        if filename.endswith('.txt'):
            with open(
                os.path.join(settings.BASE_DIR, 'samples', filename),
                'r',
                encoding='utf-8'
            ) as file:
                content = file.read()
            cleaned_text = clean_txt(content)
            # Find the start and end positions of the case study in the sample document.
            start_idx = cleaned_text.index('customer stories /')
            end_idx = cleaned_text.index(
                'We’re proud to be recognized as an industry leader, view our full list of honors to learn more.'
            )
            case_study = cleaned_text[start_idx:end_idx]
            with open(os.path.join(settings.BASE_DIR, 'samples', 'case_studies.txt'), 'a') as f:
                for line in case_study:
                    f.write(line)
                    f.write('\n')
    # NOTE: A little cleaning is done manually after this :D
    # But that can also be automated. A low priority task for now!

# RUN ONLY ONCE
# generate_case_studies()
# make_case_study_pdfs()
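
A quick round trip over the two PDF helpers; the sample text and file name are arbitrary, chosen only to exercise the 'Industry:' marker that main.py looks for:

# Round-trip check for text_to_pdf / read_pdf; "sample.pdf" is an arbitrary name.
sample = "Industry: Retail\nAcme cut infrastructure costs by 40% after migrating."
text_to_pdf(sample, "sample.pdf")
print(read_pdf("sample.pdf"))  # wrapped lines come back joined with "\n"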
config.py
ADDED
@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file='.config.env')

    BASE_DIR: str
    MEDIA: str
    PINECONE_API_KEY: str


settings = Settings()
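
Settings loads its three required values from .config.env. A placeholder example of that file (the values below are illustrative, not from the commit):

# .config.env -- placeholder values
BASE_DIR=/path/to/project
MEDIA=media
PINECONE_API_KEY=your-pinecone-api-key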
requirements.txt
ADDED
@@ -0,0 +1,79 @@
annotated-types==0.7.0
anyio==4.3.0
beautifulsoup4==4.12.3
certifi==2024.2.2
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
dnspython==2.6.1
email_validator==2.1.1
exceptiongroup==1.2.1
fastapi==0.111.0
fastapi-cli==0.0.4
filelock==3.14.0
fsspec==2024.5.0
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.23.1
idna==3.7
Jinja2==3.1.4
joblib==1.4.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.5.40
nvidia-nvtx-cu12==12.1.105
orjson==3.10.3
packaging==24.0
pillow==10.3.0
pinecone-client==4.1.0
pydantic==2.7.1
pydantic-settings==2.2.1
pydantic_core==2.18.2
Pygments==2.18.0
PyPDF2==3.0.1
python-dotenv==1.0.1
python-multipart==0.0.9
PyYAML==6.0.1
regex==2024.5.15
reportlab==4.2.0
requests==2.32.2
rich==13.7.1
safetensors==0.4.3
scikit-learn==1.5.0
scipy==1.13.1
sentence-transformers==2.7.0
shellingham==1.5.4
sniffio==1.3.1
soupsieve==2.5
starlette==0.37.2
sympy==1.12
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.3.0
tqdm==4.66.4
transformers==4.41.1
triton==2.3.0
typer==0.12.3
typing_extensions==4.11.0
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.29.0
uvloop==0.19.0
watchfiles==0.21.0
websockets==12.0