Spaces:

dataprincess
/

ask-anjibot-anything

Sleeping

App Files Files Community

dataprincess committed on Oct 5

Commit

48b4cd8

•

1 Parent(s): 9636641

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -12

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ with open(FILE_PATH, 'r') as file:
 pc = Pinecone(api_key=PINECONE_API_KEY)
 spec = ServerlessSpec(cloud="aws", region='us-east-1')
 existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
-# Check if index already exists; if not, create it
 if INDEX_NAME not in existing_indexes:
     pc.create_index(INDEX_NAME, dimension=DIMS, metric='cosine', spec=spec)
@@ -54,41 +54,32 @@ for i in tqdm(range(0, len(data['id']), BATCH_SIZE)):
     index.upsert(vectors=to_upsert)
 def extract_course_code(text) -> list[str]:
-    # Improved pattern with correct case insensitivity and spacing allowance
     pattern = r'\b(?:geds?|stats?|maths?|cosc|seng|itgy)\s*\d{3}\b'
     match = re.findall(pattern, text, re.IGNORECASE)
     return match if match else None
 def get_docs(query: str, top_k: int) -> list[str]:
-    # Extract course code(s) from the query
     course_code = extract_course_code(query)
     exact_matches = []
     if course_code:
-        # Normalize course_code to lowercase for case-insensitive matching
         course_code = [code.lower() for code in course_code]
-        # Check for exact match in metadata
         exact_matches = [
             x['content'] for x in data['metadata']
             if any(code in x['content'].lower() for code in course_code)
         ]
-    # Calculate remaining slots if we have fewer than top_k exact matches
     remaining_slots = top_k - len(exact_matches)
     if remaining_slots > 0:
-        # Perform embedding search for either the entire top_k if no exact match, or the remaining slots
         xq = encoder.encode(query)
         res = index.query(vector=xq.tolist(), top_k=remaining_slots if exact_matches else top_k, include_metadata=True)
-        # Add embedding-based matches (avoiding duplicates)
         embedding_matches = [x["metadata"]['content'] for x in res["matches"]]
-        # Combine exact matches with embedding matches
         exact_matches.extend(embedding_matches)
-    # Return the first top_k results
     return exact_matches[:top_k]
 def get_response(query: str, docs: list[str]) -> str:
@@ -114,10 +105,8 @@ def get_response(query: str, docs: list[str]) -> str:
 def handle_query(user_query: str):
-    # Get relevant documents
     docs = get_docs(user_query, top_k=5)
-    # Generate and return response
     response = get_response(user_query, docs=docs)
     for word in response.split():

 pc = Pinecone(api_key=PINECONE_API_KEY)
 spec = ServerlessSpec(cloud="aws", region='us-east-1')
 existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
 if INDEX_NAME not in existing_indexes:
     pc.create_index(INDEX_NAME, dimension=DIMS, metric='cosine', spec=spec)
     index.upsert(vectors=to_upsert)
 def extract_course_code(text) -> "list[str] | None":
     """Find course codes (e.g. ``COSC 101``) mentioned in *text*.

     A code is one of the prefixes ``ged``/``geds``, ``stat``/``stats``,
     ``math``/``maths``, ``cosc``, ``seng`` or ``itgy``, followed by optional
     whitespace and exactly three digits. Matching is case-insensitive and
     anchored on word boundaries, so ``cosc 1011`` is not a match.

     Args:
         text: The string to scan.

     Returns:
         The matched substrings exactly as they appear in *text*, in order of
         appearance, or ``None`` (not an empty list) when nothing matches.
     """
     pattern = r'\b(?:geds?|stats?|maths?|cosc|seng|itgy)\s*\d{3}\b'
     # re.findall yields [] when nothing matches; callers receive None instead.
     return re.findall(pattern, text, re.IGNORECASE) or None
 def get_docs(query: str, top_k: int) -> list[str]:
     """Collect up to *top_k* document texts relevant to *query*.

     Documents whose content mentions a course code found in the query come
     first (case-insensitive substring match over ``data['metadata']``). If
     those do not fill *top_k* slots, the rest are taken from a vector query
     against ``index`` using ``encoder.encode(query)``, skipping any text that
     is already in the result.

     Args:
         query: The user's question.
         top_k: Maximum number of document texts to return.

     Returns:
         At most *top_k* content strings, keyword matches before vector matches.
     """
     course_codes = extract_course_code(query)
     results = []
     if course_codes:
         wanted = [code.lower() for code in course_codes]
         results = [
             entry['content'] for entry in data['metadata']
             if any(code in entry['content'].lower() for code in wanted)
         ]

     if len(results) < top_k:
         xq = encoder.encode(query)
         # Request the full top_k rather than only the free slots: hits that
         # duplicate a keyword match are dropped below, and the spare hits
         # keep the result from coming up short.
         res = index.query(vector=xq.tolist(), top_k=top_k, include_metadata=True)
         seen = set(results)
         for hit in res["matches"]:
             content = hit["metadata"]['content']
             if content not in seen:
                 seen.add(content)
                 results.append(content)

     return results[:top_k]
 def get_response(query: str, docs: list[str]) -> str:
 def handle_query(user_query: str):
     docs = get_docs(user_query, top_k=5)
     response = get_response(user_query, docs=docs)
     for word in response.split():