ibombonato committed
Commit caf6350 · verified · 1 Parent(s): a5c77b7

Upload folder using huggingface_hub

Files changed (2):
  1. app.py +10 -21
  2. requirements.txt +0 -1
app.py CHANGED
@@ -3,20 +3,17 @@ import gradio as gr
 import chromadb
 import pandas as pd
 from sentence_transformers import SentenceTransformer
-import nltk
+import re  # Import the regular expression library
 
 # --- 1. SETUP MODELS AND DATABASE ---
 
-# This single download is all we need.
-print("Downloading NLTK's 'punkt' model...")
-nltk.download('punkt')
-
 print("Loading embedding model...")
+# Using the recommended Portuguese model
 embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
 
 client = chromadb.Client()
 collection = client.get_or_create_collection(
-    name="transcript_demo_br_model_final",
+    name="transcript_demo_final_v3",
     metadata={"hnsw:space": "cosine"}
 )
 print("ChromaDB collection ready.")
@@ -27,16 +24,11 @@ def index_transcript(transcript_text):
     if not transcript_text.strip():
         return "Please paste a transcript before indexing.", pd.DataFrame()
 
-    # --- FIX: Explicitly load the Portuguese tokenizer to avoid lookup errors ---
-    # This file is included in the 'punkt' download.
-    try:
-        pt_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
-        chunks = pt_tokenizer.tokenize(transcript_text)
-    except Exception as e:
-        # Fallback to default tokenizer if the Portuguese one fails for any reason
-        print(f"Could not load Portuguese tokenizer, falling back to default. Error: {e}")
-        chunks = nltk.sent_tokenize(transcript_text)
+    # --- THE FIX: Remove NLTK and use a reliable regex to split sentences ---
+    # This splits the text after any period, question mark, or exclamation point.
+    chunks = re.split(r'(?<=[.!?])\s+', transcript_text)
 
+    # Clean up any empty strings or very short fragments that might result
     chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
 
     # Debugging logs to confirm the chunking
@@ -47,6 +39,7 @@ def index_transcript(transcript_text):
 
     ids = [f"chunk_{i}" for i in range(len(chunks))]
 
+    # Clear previous entries before adding new ones
    if collection.count() > 0:
         collection.delete(ids=collection.get()['ids'])
 
@@ -70,12 +63,8 @@ def search_transcript(query):
         })
     return df, "Search complete."
 
-# --- 3. GRADIO INTERFACE (No changes) ---
-sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
-Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
-Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra?
-Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
-"""
+# --- 3. GRADIO INTERFACE ---
+sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar? Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar. Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra? Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes."""
 
 with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
     gr.Markdown("# 🤖 Guideline Compliance Prototype")
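For reference, a minimal standalone sketch of the new chunking step introduced by this commit, using the same regex and length filter as the diff above (the sample string is a shortened excerpt of sample_transcript):

import re

# Split after '.', '!' or '?' followed by whitespace, then drop very short fragments,
# mirroring the chunking added in this commit.
sample = (
    "Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar? "
    "Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, "
    "e ela simplesmente parou de funcionar."
)
chunks = re.split(r'(?<=[.!?])\s+', sample)
chunks = [c.strip() for c in chunks if len(c.strip()) > 5]
print(chunks)
# One chunk per sentence, e.g.
# ['Atendente: Olá, bem-vindo à EletroMax.',
#  'Meu nome é Sofia, em que posso ajudar?', ...]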
requirements.txt CHANGED
@@ -2,4 +2,3 @@ gradio
 chromadb
 sentence-transformers
 pandas
-nltk
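The hunks above cover the chunking and collection setup but not the embed/add/query calls themselves, which sit outside the changed lines. Below is an illustrative sketch of how such a pipeline is typically wired with these libraries; the variable names, sample chunks, and query text are assumptions, not code from app.py:

import chromadb
from sentence_transformers import SentenceTransformer

# Model name, collection name, id scheme, and cosine setting are taken from the diff;
# everything else below is illustrative.
embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
client = chromadb.Client()
collection = client.get_or_create_collection(
    name="transcript_demo_final_v3",
    metadata={"hnsw:space": "cosine"},  # cosine distance for the HNSW index
)

chunks = [
    "Atendente: Olá, bem-vindo à EletroMax.",
    "Cliente: Eu comprei uma cafeteira e ela parou de funcionar.",
]
collection.add(
    ids=[f"chunk_{i}" for i in range(len(chunks))],
    documents=chunks,
    embeddings=embedding_model.encode(chunks).tolist(),
)

# Query with an embedded question; lower cosine distance means a closer match.
results = collection.query(
    query_embeddings=embedding_model.encode(["A cafeteira parou de funcionar"]).tolist(),
    n_results=2,
)
print(results["documents"][0])
print(results["distances"][0])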