abdul-rafey commited on
Commit
ce6fba2
1 Parent(s): 9c0e4cf

implement semantic-search and upload

Browse files
Files changed (5) hide show
  1. app/main.py +91 -0
  2. app/pinecone.py +21 -0
  3. app/utils/data_cleaner.py +110 -0
  4. config.py +12 -0
  5. requirements.txt +79 -0
app/main.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from uuid import uuid4
2
+ from fastapi import FastAPI, HTTPException, File, UploadFile, status
3
+ from fastapi.responses import FileResponse
4
+ import os
5
+
6
+
7
+ from app.utils.data_cleaner import read_pdf, text_to_pdf
8
+ from app.pinecone import pinecone_index
9
+ from app.semantic_search import generate_embeddings, get_results, insert_new_document
10
+ from config import settings
11
+
12
# FastAPI application instance shared by every route handler below.
app = FastAPI()

# Directory where uploaded files are stored; ensured to exist at import time.
UPLOAD_DIRECTORY = "./static"
os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
18
+
19
+
20
# TODO: Remove this extra endpoint
@app.get("/")
async def root():
    """Placeholder root endpoint; returns a static greeting."""
    greeting = {"message": "Hello World!"}
    return greeting
24
+
25
# TODO: replace "search" with "docs"
@app.get("/search")
async def search(query: str):
    """Run a semantic search for `query` and return the raw results."""
    return await get_results(query)
30
+
31
+
32
@app.post("/upload")
async def upload(
    title: str,
    file: UploadFile = File(...),
):
    """Accept a PDF upload and index it as a new document.

    Raises:
        HTTPException: 400 when the upload is not a PDF.
    """
    if file.content_type == "application/pdf":
        await insert_new_document(title, file)
        return {"success": True, "message": "File uploaded!"}
    raise HTTPException(
        status_code=400,
        detail="Only PDF files are allowed."
    )
44
+
45
+
46
@app.get("/media/{file_path}")  # fixed: route must declare the path parameter
async def get_file(file_path: str):
    """Serve a stored PDF from the ./media directory by file name.

    Raises:
        HTTPException: 404 when the requested file does not exist.
    """
    # file_path is untrusted input: strip any directory components so a
    # crafted name cannot escape the ./media directory.
    filename = os.path.basename(file_path)
    pdf_file_path = f"./media/{filename}"
    if not os.path.exists(pdf_file_path):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="PDF file not found"
        )
    return FileResponse(pdf_file_path, media_type='application/pdf', filename=filename)
56
+
57
+
58
@app.get("/generate_db")
async def generate_db():
    """Trigger a bulk re-index of all media PDFs into Pinecone."""
    response = {"message": "success"}
    await insert_all_pdfs_to_pinecone()
    return response
62
+
63
+
64
async def insert_all_pdfs_to_pinecone():
    """Embed every PDF in the media directory and upsert it into Pinecone.

    NOTE: Requires one PDF per case study stored in a directory named "media".
    Use our functions in data_cleaner.py to create pdfs automatically from a
    list of sample text files. The data in the text files will be cleaned
    automatically to find case studies and will be saved in pdf format.

    Raises:
        HTTPException: 404 when the media directory does not exist.
    """
    media_dir = os.path.join(settings.BASE_DIR, settings.MEDIA)
    if not os.path.exists(media_dir):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Directory does not exists"
        )
    # Loop-invariant: create the Pinecone index handle once, not per file.
    index = pinecone_index()
    for filename in os.listdir(media_dir):
        if not filename.endswith('.pdf'):
            continue
        content = read_pdf(os.path.join(media_dir, filename))
        # Normalize newlines BEFORE extracting the title (resolves the old
        # in-code NOTE: word boundaries in titles are no longer lost).
        content = content.replace('\n', ' ')
        # Title is everything before the "Industry:" marker; fall back to
        # an empty title instead of crashing when the marker is missing.
        pos = content.find('Industry:')
        title = content[:pos].strip() if pos != -1 else ""
        embedding = generate_embeddings(content)
        uid = str(uuid4())
        fname = filename.split('_')[-1]
        path = f"{uid}_{fname}"
        text_to_pdf(content, os.path.join(media_dir, path))
        index.upsert([(uid, embedding.tolist(), {"title": title, "path": path, "filename": filename})])
app/pinecone.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pinecone import Pinecone, ServerlessSpec
from config import settings

# Module-level Pinecone client. NOTE: this performs network calls at import
# time (list_indexes / create_index below), so importing this module requires
# a valid PINECONE_API_KEY and connectivity.
pc = Pinecone(api_key=settings.PINECONE_API_KEY)

index_name = "case-studies-index"
# Create the serverless index on first run only. dimension=384 presumably
# matches the embedding size produced by app.semantic_search — TODO confirm
# against the model used there.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
18
+
19
def pinecone_index():
    """Return a handle to the case-studies Pinecone index."""
    index = pc.Index(index_name)
    return index
21
+
app/utils/data_cleaner.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ from uuid import uuid4
3
+ import PyPDF2
4
+ from bs4 import BeautifulSoup
5
+ from config import settings
6
+ import os
7
+ from reportlab.lib.pagesizes import letter
8
+ from reportlab.pdfgen import canvas
9
+
10
+
11
def read_pdf(file):
    """Extract the text of every page in a PDF.

    Args:
        file: A path or binary file-like object accepted by PyPDF2.PdfReader.

    Returns:
        str: The concatenated page texts, each page followed by a newline.
    """
    reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly instead of indexing via range(len(...)).
    return "".join(page.extract_text() + "\n" for page in reader.pages)
21
+
22
+
23
def text_to_pdf(text, output_filename):
    """Create a PDF file with the given text.

    Lines are word-wrapped to fit the printable width, and a new page is
    started when the current one runs out of vertical space (previously
    long texts were silently drawn below the bottom edge of the page).
    """
    c = canvas.Canvas(output_filename, pagesize=letter)
    width, height = letter
    max_width = width - 200
    avg_char_width = 7  # Adjust this value as needed based on font size
    # textwrap.wrap expects an int width; letter dimensions are floats.
    max_chars_per_line = int(max_width // avg_char_width)
    wrapped_lines = []
    for line in text.split('\n'):
        wrapped_lines.extend(textwrap.wrap(line, width=max_chars_per_line))

    top_margin = height - 100  # Start from the top of the page
    bottom_margin = 50
    y_position = top_margin

    for line in wrapped_lines:
        if y_position < bottom_margin:
            # Page is full: flush it and continue on a fresh page.
            c.showPage()
            y_position = top_margin
        c.drawString(100, y_position, line)
        y_position -= 15
    c.save()
42
+
43
+
44
def clean_html(html_content: str):
    """Extract the readable text (headings and paragraphs) from HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove non-content elements before extracting any text.
    for element in soup(["script", "style"]):
        element.decompose()
    tags = soup.find_all(['h1', 'h2', 'h3', 'p'])
    lines = [tag.get_text(separator=" ", strip=True) for tag in tags]
    return '\n'.join(lines)
54
+
55
+
56
def clean_txt(txt_content: str):
    """Split text into stripped lines, dropping lines that are empty.

    Note: a line containing only whitespace is kept (it is truthy before
    stripping) and therefore appears in the output as an empty string.
    """
    return [line.strip() for line in txt_content.split('\n') if line]
63
+
64
+
65
async def make_case_study_pdfs():
    """Split samples/case_studies.txt into one PDF per case study.

    The source file uses the line "CASE_STUDY /" as a delimiter; chunk 0 is
    the preamble before the first delimiter and is skipped. Each PDF is
    written under settings.MEDIA with a uuid4-prefixed file name.
    """
    with open(os.path.join(settings.BASE_DIR, 'samples', 'case_studies.txt'), 'r') as f:
        case_studies = f.read().split("CASE_STUDY /\n")
    # Save these as new pdf files. The upper bound is derived from the data
    # instead of the previous hard-coded range(1, 34).
    for i in range(1, len(case_studies)):
        uid = str(uuid4())
        base_name = (str(i) + '.pdf').replace(" ", "_")
        # Fix: prefix the uid to the real file name (the old code dropped
        # the computed name entirely due to a garbled f-string).
        filename = f"{uid}_{base_name}"
        filepath = os.path.join(settings.BASE_DIR, settings.MEDIA, filename)
        content = case_studies[i]
        text_to_pdf(content, filepath)
78
+
79
+
80
def generate_case_studies():
    '''
    Extract case studies from the given samples and store them in a text file.

    Scans every .txt file under <BASE_DIR>/samples, takes the lines between
    the exact line 'customer stories /' and the honors blurb, and appends
    them to samples/case_studies.txt.

    NOTE: Must be run only once — the output file is opened in append mode,
    so re-running duplicates its contents.

    Raises:
        ValueError: when a sample file lacks either boundary marker
        (list.index raises if the exact line is not present).
    '''
    for filename in os.listdir(os.path.join(settings.BASE_DIR, 'samples')):
        if filename.endswith('.txt'):
            with open(
                os.path.join(settings.BASE_DIR, 'samples', filename),
                'r',
                encoding='utf-8'
            ) as file:
                content = file.read()
                # clean_txt returns a list of stripped lines, so the markers
                # below are matched against whole lines, not substrings.
                cleaned_text = clean_txt(content)
                # Finding starting and ending positions of Case Study in the sample document.
                start_idx = cleaned_text.index('customer stories /')
                end_idx = cleaned_text.index(
                    'We’re proud to be recognized as an industry leader, view our full list of honors to learn more.'
                )
                case_study = cleaned_text[start_idx:end_idx]
                # Append mode: results from all sample files accumulate in
                # one shared output file, one line per cleaned text line.
                with open(os.path.join(settings.BASE_DIR, 'samples', 'case_studies.txt') , 'a') as f:
                    for line in case_study:
                        f.write(line)
                        f.write('\n')
    # NOTE: A little cleaning is done manually after this :D
    # But that can also be automated. A low priority task for now!

# RUN ONLY ONCE
# generate_case_studies()
# make_case_study_pdfs()
110
+
config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+
3
+
4
class Settings(BaseSettings):
    """Application settings, populated by pydantic-settings from the
    environment and the .config.env file; validation fails at startup
    when a required value is missing."""
    model_config = SettingsConfigDict(env_file='.config.env')

    # Project root path; joined with 'samples' and MEDIA elsewhere.
    BASE_DIR: str
    # Media directory (relative to BASE_DIR) that holds the case-study PDFs.
    MEDIA: str
    # API key used to construct the Pinecone client in app/pinecone.py.
    PINECONE_API_KEY: str


# Shared settings instance imported throughout the application.
settings = Settings()
requirements.txt ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.3.0
3
+ beautifulsoup4==4.12.3
4
+ certifi==2024.2.2
5
+ chardet==5.2.0
6
+ charset-normalizer==3.3.2
7
+ click==8.1.7
8
+ dnspython==2.6.1
9
+ email_validator==2.1.1
10
+ exceptiongroup==1.2.1
11
+ fastapi==0.111.0
12
+ fastapi-cli==0.0.4
13
+ filelock==3.14.0
14
+ fsspec==2024.5.0
15
+ h11==0.14.0
16
+ httpcore==1.0.5
17
+ httptools==0.6.1
18
+ httpx==0.27.0
19
+ huggingface-hub==0.23.1
20
+ idna==3.7
21
+ Jinja2==3.1.4
22
+ joblib==1.4.2
23
+ markdown-it-py==3.0.0
24
+ MarkupSafe==2.1.5
25
+ mdurl==0.1.2
26
+ mpmath==1.3.0
27
+ networkx==3.3
28
+ numpy==1.26.4
29
+ nvidia-cublas-cu12==12.1.3.1
30
+ nvidia-cuda-cupti-cu12==12.1.105
31
+ nvidia-cuda-nvrtc-cu12==12.1.105
32
+ nvidia-cuda-runtime-cu12==12.1.105
33
+ nvidia-cudnn-cu12==8.9.2.26
34
+ nvidia-cufft-cu12==11.0.2.54
35
+ nvidia-curand-cu12==10.3.2.106
36
+ nvidia-cusolver-cu12==11.4.5.107
37
+ nvidia-cusparse-cu12==12.1.0.106
38
+ nvidia-nccl-cu12==2.20.5
39
+ nvidia-nvjitlink-cu12==12.5.40
40
+ nvidia-nvtx-cu12==12.1.105
41
+ orjson==3.10.3
42
+ packaging==24.0
43
+ pillow==10.3.0
44
+ pinecone-client==4.1.0
45
+ pydantic==2.7.1
46
+ pydantic-settings==2.2.1
47
+ pydantic_core==2.18.2
48
+ Pygments==2.18.0
49
+ PyPDF2==3.0.1
50
+ python-dotenv==1.0.1
51
+ python-multipart==0.0.9
52
+ PyYAML==6.0.1
53
+ regex==2024.5.15
54
+ reportlab==4.2.0
55
+ requests==2.32.2
56
+ rich==13.7.1
57
+ safetensors==0.4.3
58
+ scikit-learn==1.5.0
59
+ scipy==1.13.1
60
+ sentence-transformers==2.7.0
61
+ shellingham==1.5.4
62
+ sniffio==1.3.1
63
+ soupsieve==2.5
64
+ starlette==0.37.2
65
+ sympy==1.12
66
+ threadpoolctl==3.5.0
67
+ tokenizers==0.19.1
68
+ torch==2.3.0
69
+ tqdm==4.66.4
70
+ transformers==4.41.1
71
+ triton==2.3.0
72
+ typer==0.12.3
73
+ typing_extensions==4.11.0
74
+ ujson==5.10.0
75
+ urllib3==2.2.1
76
+ uvicorn==0.29.0
77
+ uvloop==0.19.0
78
+ watchfiles==0.21.0
79
+ websockets==12.0