Update app.py
Browse files
app.py
CHANGED
@@ -46,29 +46,31 @@ def build_pdf_index():
|
|
46 |
|
47 |
documents = [LangchainDocument(page_content=t) for t in texts]
|
48 |
|
49 |
-
#
|
50 |
sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
# پروگرس بار اضافه میکنیم
|
55 |
progress_bar = st.progress(0)
|
56 |
total_docs = len(documents)
|
57 |
|
58 |
-
#
|
59 |
-
|
60 |
-
batch_embedding = sentence_model.encode(doc.page_content, convert_to_numpy=True)
|
61 |
-
embeddings.append(batch_embedding)
|
62 |
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
-
# اطمینان از اینکه خروجی NumpyArray است
|
67 |
embeddings = np.array(embeddings)
|
68 |
|
69 |
return documents, embeddings
|
70 |
|
71 |
-
|
72 |
# ----------------- تعریف LLM از Groq -----------------
|
73 |
# groq_api_key = "<REDACTED>"
|
74 |
|
|
|
46 |
|
47 |
documents = [LangchainDocument(page_content=t) for t in texts]
|
48 |
|
49 |
+
# مدل Embedding
|
50 |
sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
|
51 |
|
52 |
+
# پروگرس بار
|
|
|
|
|
53 |
progress_bar = st.progress(0)
|
54 |
total_docs = len(documents)
|
55 |
|
56 |
+
# آماده‌سازی داده‌ها
|
57 |
+
texts_to_encode = [doc.page_content for doc in documents]
|
|
|
|
|
58 |
|
59 |
+
# انکود بچی
|
60 |
+
batch_size = 32 # سایز دلخواه
|
61 |
+
embeddings = []
|
62 |
+
for i in range(0, total_docs, batch_size):
|
63 |
+
batch_texts = texts_to_encode[i:i+batch_size]
|
64 |
+
batch_embeddings = sentence_model.encode(batch_texts, convert_to_numpy=True)
|
65 |
+
embeddings.extend(batch_embeddings)
|
66 |
+
|
67 |
+
# به‌روزرسانی پروگرس بار
|
68 |
+
progress_bar.progress(min((i + batch_size) / total_docs, 1.0))
|
69 |
|
|
|
70 |
embeddings = np.array(embeddings)
|
71 |
|
72 |
return documents, embeddings
|
73 |
|
|
|
74 |
# ----------------- تعریف LLM از Groq -----------------
|
75 |
# groq_api_key = "<REDACTED>"
|
76 |
|