Commit 477cc73 · 0 parent(s)
Initial commit to Hugging Face

Files changed:
- README.md +62 -0
- __pycache__/demo.cpython-312.pyc +0 -0
- demo.py +37 -0
- deployment_push-to-hf_Version2.ps1 +29 -0
- indexer.py +44 -0
- requirements.txt +7 -0
- setup.py +73 -0
- simple_demo.py +76 -0
README.md
ADDED

# ZamAI Multilingual Embeddings

This directory contains tools and utilities for working with multilingual embedding models, with a focus on Pashto language support. The embeddings enable semantic search, document retrieval, and other natural language processing tasks across multiple languages.

## Model Information

- **Base Model**: [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
- **Languages Supported**: 50+ including Pashto, English, Arabic, Urdu, Farsi, and more
- **Vector Database**: ChromaDB
- **Integration Framework**: LlamaIndex

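As a quick sanity check that the base model loads and produces multilingual vectors, here is a minimal sketch that calls sentence-transformers directly (only the model name comes from this repo; the sentences are illustrative):

```python
from sentence_transformers import SentenceTransformer

# Load the multilingual base model (downloaded from Hugging Face on first use)
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode one English and one Pashto sentence; each becomes a 384-dim vector
vectors = model.encode(["Hello, world.", "سلام نړۍ."])
print(vectors.shape)  # (2, 384)
```
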
## Directory Structure

```
embeddings/
├── setup.py           # Setup script for the embeddings model and vector store
├── demo.py            # Demo application with Gradio web UI
├── indexer.py         # Utility for indexing new documents
├── requirements.txt   # Dependencies for the embeddings components
└── chroma_db/         # Directory for the vector database (created on first run)
```

## Getting Started

1. Install the dependencies:

   ```bash
   pip install -r models/embeddings/requirements.txt
   ```

2. Add documents to index:

   ```bash
   # Place your text files in the data/text_corpus directory
   python models/embeddings/indexer.py --corpus data/text_corpus/
   ```

3. Run the demo application:

   ```bash
   python models/embeddings/demo.py
   ```

## Using the Embeddings in Your Code

```python
from models.embeddings.setup import setup_embedding_model

# Initialize the model and related components
embedding_components = setup_embedding_model()

# Get the query engine
query_engine = embedding_components["query_engine"]

# Query in any language
result = query_engine.query("What is the capital of Afghanistan?")
# Or in Pashto
result = query_engine.query("د افغانستان پلازمېنه څه ده؟")

print(result)
```
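
To see which indexed passages supported an answer, the response object returned by the query engine exposes the retrieved nodes. A minimal sketch, assuming LlamaIndex's standard `Response` attributes:

```python
# Each source node carries a retrieval score and the matched text
for node_with_score in result.source_nodes:
    print(node_with_score.score, node_with_score.node.get_content()[:100])
```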

## Reference

This implementation is based on the [Multilingula-ZamAI-Embeddings](https://huggingface.co/tasal9/Multilingula-ZamAI-Embeddings) model from Hugging Face.
__pycache__/demo.cpython-312.pyc
ADDED
Binary file (1.68 kB)
demo.py
ADDED

"""
ZamAI Embeddings Demo Application
This script creates a web interface for querying documents using multilingual embeddings.
"""
import gradio as gr
from setup import setup_embedding_model

# Set up the embedding model and query engine
print("Setting up embedding model and vector database...")
embedding_components = setup_embedding_model()
query_engine = embedding_components["query_engine"]

# Define the query function
def answer_query(query):
    """Process a user query and return relevant information from indexed documents"""
    if not query.strip():
        return "Please enter a query."

    try:
        result = query_engine.query(query)
        return str(result)
    except Exception as e:
        return f"Error processing query: {str(e)}"

# Create the Gradio interface
iface = gr.Interface(
    fn=answer_query,
    inputs=gr.Textbox(lines=2, placeholder="Ask in any language (English, Pashto, etc.)"),
    outputs="text",
    title="ZamAI Multilingual Embeddings Demo",
    description="Ask questions about your documents in any language, including Pashto and English."
)

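# A sketch of common alternatives to the default launch() call below
# (standard Gradio parameters; shown here for illustration):
#   iface.launch(server_name="0.0.0.0")  # serve on all network interfaces
#   iface.launch(share=True)             # request a temporary public link
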
if __name__ == "__main__":
    print("Starting Gradio web interface...")
    iface.launch()
    print("Interface closed.")
deployment_push-to-hf_Version2.ps1
ADDED

# push-to-hf.ps1

# 1) Log in to the HF CLI (you'll be prompted for your token)
huggingface-cli login

# 2) Move into your embeddings dir
cd .\models\embeddings

# 3) Init git if needed
if (-not (Test-Path .git)) {
    git init
}

# 4) Add or update the HF remote
$remoteUrl = 'https://huggingface.co/tasal9/Multilingual-ZamAI-Embeddings'
if (git remote | Select-String '^origin$') {
    git remote set-url origin $remoteUrl
} else {
    git remote add origin $remoteUrl
}

# 5) Commit & push
git add .
git commit -m "Initial commit to Hugging Face"
git branch -M main
git push origin main --force

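# Caution: --force overwrites any existing history on the remote 'main'
# branch; drop it if the repo on Hugging Face already has commits to keep.
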
Write-Host "✅ Your models/embeddings folder is now live at:"
Write-Host "   https://huggingface.co/tasal9/Multilingual-ZamAI-Embeddings/tree/main/models/embeddings"
indexer.py
ADDED

"""
ZamAI Document Indexer
This script helps add new documents to the embedding vector database.
"""
import os
import argparse
from setup import setup_embedding_model

def index_documents(corpus_path, db_path=None):
    """
    Index documents from the specified corpus path into the vector database.

    Args:
        corpus_path: Path to the directory containing documents to index
        db_path: Optional custom path for the ChromaDB database
    """
    if not os.path.exists(corpus_path):
        print(f"Error: Directory {corpus_path} does not exist.")
        return

    if not os.listdir(corpus_path):
        print(f"Error: No files found in {corpus_path}")
        return

    # Set up embedding model and components
    db_path = db_path or "./models/embeddings/chroma_db"
    embedding_components = setup_embedding_model(corpus_path=corpus_path, db_path=db_path)

    print(f"Successfully indexed documents from {corpus_path}")
    print(f"Vector database stored at {db_path}")

    # Return the components if needed for further processing
    return embedding_components

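# CLI sketch (flags as defined by the parser below; paths are illustrative):
#   python indexer.py --corpus data/text_corpus/
#   python indexer.py --corpus my_docs/ --db ./custom_chroma_db
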
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Index documents for ZamAI embeddings")
    parser.add_argument("--corpus", type=str, default="data/text_corpus/",
                        help="Path to the directory containing documents to index")
    parser.add_argument("--db", type=str, default=None,
                        help="Path to store the ChromaDB database (optional)")

    args = parser.parse_args()
    index_documents(args.corpus, args.db)
requirements.txt
ADDED

# ZamAI Embeddings Model Dependencies
llama-index>=0.9.0
chromadb>=0.4.0
llama-index-vector-stores-chroma>=0.1.0
# Needed for the llama_index.embeddings.huggingface import in setup.py
llama-index-embeddings-huggingface>=0.1.0
sentence-transformers>=2.2.0
gradio>=3.50.0
pydantic>=2.0.0
setup.py
ADDED

"""
ZamAI Embeddings Model Setup
This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
import os
import chromadb
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex

def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """
    Set up the embedding model and vector database for multilingual document retrieval.

    Args:
        corpus_path: Path to the text corpus directory
        db_path: Path where the ChromaDB database will be stored
        model_name: Name of the HuggingFace embedding model to use

    Returns:
        A dict with the index, query engine, embedding model, and vector store
    """
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    # Load documents if corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []

    # Initialize embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
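    # get_or_create_collection reuses the same named collection on later runs,
    # so vectors already stored under db_path persist across restarts (note:
    # re-indexing the same corpus via from_documents below adds duplicate entries).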
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build the index if we have documents
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # If no documents yet, initialize the index from the existing vector store
        # (from_vector_store derives its own storage context from the store)
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model
        )

    # Create a query engine
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store
    }

if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use embedding_components['query_engine'] to search your documents.")
simple_demo.py
ADDED

"""
ZamAI Simple Multilingual Embeddings Demo
This script demonstrates embedding sentences in multiple languages, including Pashto.
"""
from sentence_transformers import SentenceTransformer
import numpy as np

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors"""
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

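# Equivalent library routine: sentence_transformers.util.cos_sim computes the
# same cosine similarity over batches of embeddings, if preferred.
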
def print_similarities(model, sentences, query_idx=0):
    """Print similarity scores between a query and all other sentences"""
    # Get embeddings for all sentences
    embeddings = model.encode(sentences)

    # Get the query embedding
    query_embedding = embeddings[query_idx]
    query = sentences[query_idx]

    print(f"Query: '{query}'")
    print("Similarities:")

    # Calculate similarities with all other sentences
    for i, sentence in enumerate(sentences):
        if i == query_idx:
            continue

        similarity = cosine_similarity(query_embedding, embeddings[i])
        print(f"- {similarity:.4f}: '{sentence}'")
    print()

def main():
    # Load the multilingual model
    print("Loading multilingual embedding model...")
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    model = SentenceTransformer(model_name)
    print(f"Model loaded: {model_name}")

    # English-English similarity
    print("\n===== English-English Similarity =====")
    english_sentences = [
        "This is a sample sentence in English.",
        "This sentence is similar to the first one.",
        "This sentence has nothing to do with the others."
    ]
    print_similarities(model, english_sentences, query_idx=0)

    # Pashto-Pashto similarity
    print("\n===== Pashto-Pashto Similarity =====")
    pashto_sentences = [
        "دا په پښتو کې یوه نمونه جمله ده.",  # This is a sample sentence in Pashto.
        "دا جمله د لومړۍ جملې سره ورته ده.",  # This sentence is similar to the first one.
        "دا جمله د نورو سره هېڅ تړاو نلري."  # This sentence has nothing to do with the others.
    ]
    print_similarities(model, pashto_sentences, query_idx=0)

    # Cross-lingual similarity (English-Pashto)
    print("\n===== Cross-lingual Similarity (English-Pashto) =====")
    cross_lingual_sentences = [
        "This is a sample sentence in English.",
        "دا په پښتو کې یوه نمونه جمله ده.",  # This is a sample sentence in Pashto.
        "I'm learning to speak Pashto.",
        "زه د پښتو ژبې زده کړه کوم."  # I'm learning the Pashto language.
    ]
    print_similarities(model, cross_lingual_sentences, query_idx=0)

    # Cross-lingual similarity (Pashto-English)
    print("\n===== Cross-lingual Similarity (Pashto-English) =====")
    print_similarities(model, cross_lingual_sentences, query_idx=1)

if __name__ == "__main__":
    main()