Spaces:

ibombonato
/

Semantic-search-br

Running

App Files Community

ibombonato committed on Jul 9

Commit

caf6350

verified ·

1 Parent(s): a5c77b7

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

app.py +10 -21
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -3,20 +3,17 @@ import gradio as gr
 import chromadb
 import pandas as pd
 from sentence_transformers import SentenceTransformer
-import nltk
 # --- 1. SETUP MODELS AND DATABASE ---
-# This single download is all we need.
-print("Downloading NLTK's 'punkt' model...")
-nltk.download('punkt')
 print("Loading embedding model...")
 embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
 client = chromadb.Client()
 collection = client.get_or_create_collection(
-    name="transcript_demo_br_model_final",
     metadata={"hnsw:space": "cosine"}
 )
 print("ChromaDB collection ready.")
@@ -27,16 +24,11 @@ def index_transcript(transcript_text):
     if not transcript_text.strip():
         return "Please paste a transcript before indexing.", pd.DataFrame()
-    # --- FIX: Explicitly load the Portuguese tokenizer to avoid lookup errors ---
-    # This file is included in the 'punkt' download.
-    try:
-        pt_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
-        chunks = pt_tokenizer.tokenize(transcript_text)
-    except Exception as e:
-        # Fallback to default tokenizer if the Portuguese one fails for any reason
-        print(f"Could not load Portuguese tokenizer, falling back to default. Error: {e}")
-        chunks = nltk.sent_tokenize(transcript_text)
     chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
     # Debugging logs to confirm the chunking
@@ -47,6 +39,7 @@ def index_transcript(transcript_text):
     ids = [f"chunk_{i}" for i in range(len(chunks))]
     if collection.count() > 0:
         collection.delete(ids=collection.get()['ids'])
@@ -70,12 +63,8 @@ def search_transcript(query):
     })
     return df, "Search complete."
-# --- 3. GRADIO INTERFACE (No changes) ---
-sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
-Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
-Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra?
-Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
-"""
 with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
     gr.Markdown("# 🤖 Guideline Compliance Prototype")

 import chromadb
 import pandas as pd
 from sentence_transformers import SentenceTransformer
+import re # Import the regular expression library
 # --- 1. SETUP MODELS AND DATABASE ---
 print("Loading embedding model...")
+# Using the recommended Portuguese model
 embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
 client = chromadb.Client()
 collection = client.get_or_create_collection(
+    name="transcript_demo_final_v3",
     metadata={"hnsw:space": "cosine"}
 )
 print("ChromaDB collection ready.")
     if not transcript_text.strip():
         return "Please paste a transcript before indexing.", pd.DataFrame()
+    # --- THE FIX: Remove NLTK and use a reliable Regex to split sentences ---
+    # This splits the text on the whitespace that follows a period, question mark, or exclamation point.
+    chunks = re.split(r'(?<=[.!?])\s+', transcript_text)
+    # Clean up any empty strings or very short fragments that might result
     chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
     # Debugging logs to confirm the chunking
     ids = [f"chunk_{i}" for i in range(len(chunks))]
+    # Clear previous entries before adding new ones
     if collection.count() > 0:
         collection.delete(ids=collection.get()['ids'])
     })
     return df, "Search complete."
+# --- 3. GRADIO INTERFACE ---
+sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar? Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar. Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra? Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes."""
 with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
     gr.Markdown("# 🤖 Guideline Compliance Prototype")

requirements.txt CHANGED Viewed

@@ -2,4 +2,3 @@ gradio
 chromadb
 sentence-transformers
 pandas
-nltk

 chromadb
 sentence-transformers
 pandas