Commit 477cc73 · 0 parent(s)
Initial commit to Hugging Face

Files changed:
- README.md +62 -0
- __pycache__/demo.cpython-312.pyc +0 -0
- demo.py +37 -0
- deployment_push-to-hf_Version2.ps1 +29 -0
- indexer.py +44 -0
- requirements.txt +7 -0
- setup.py +73 -0
- simple_demo.py +76 -0
README.md
ADDED

# ZamAI Multilingual Embeddings

This directory contains tools and utilities for working with multilingual embedding models, with a focus on Pashto language support. The embeddings enable semantic search, document retrieval, and other natural language processing tasks across multiple languages.

## Model Information

- **Base Model**: [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
- **Languages Supported**: 50+ including Pashto, English, Arabic, Urdu, Farsi, and more
- **Vector Database**: ChromaDB
- **Integration Framework**: LlamaIndex

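As a quick sanity check that the base model loads and produces multilingual vectors, here is a minimal sketch that calls sentence-transformers directly (only the model name comes from this repo; the sentences are illustrative):

```python
from sentence_transformers import SentenceTransformer

# Load the multilingual base model (downloaded from Hugging Face on first use)
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode one English and one Pashto sentence; each becomes a 384-dim vector
vectors = model.encode(["Hello, world.", "سلام نړۍ."])
print(vectors.shape)  # (2, 384)
```
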
## Directory Structure

```
embeddings/
├── setup.py           # Setup script for the embeddings model and vector store
├── demo.py            # Demo application with Gradio web UI
├── indexer.py         # Utility for indexing new documents
├── requirements.txt   # Dependencies for the embeddings components
└── chroma_db/         # Directory for the vector database (created on first run)
```

## Getting Started

1. Install the dependencies:

   ```bash
   pip install -r models/embeddings/requirements.txt
   ```

2. Add documents to index:

   ```bash
   # Place your text files in the data/text_corpus directory
   python models/embeddings/indexer.py --corpus data/text_corpus/
   ```

3. Run the demo application:

   ```bash
   python models/embeddings/demo.py
   ```

## Using the Embeddings in Your Code

```python
from models.embeddings.setup import setup_embedding_model

# Initialize the model and related components
embedding_components = setup_embedding_model()

# Get the query engine
query_engine = embedding_components["query_engine"]

# Query in any language
result = query_engine.query("What is the capital of Afghanistan?")
# Or in Pashto
result = query_engine.query("د افغانستان پلازمېنه څه ده؟")

print(result)
```
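
To see which indexed passages supported an answer, the response object returned by the query engine exposes the retrieved nodes. A minimal sketch, assuming LlamaIndex's standard `Response` attributes:

```python
# Each source node carries a retrieval score and the matched text
for node_with_score in result.source_nodes:
    print(node_with_score.score, node_with_score.node.get_content()[:100])
```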

## Reference

This implementation is based on the [Multilingula-ZamAI-Embeddings](https://huggingface.co/tasal9/Multilingula-ZamAI-Embeddings) model from Hugging Face.
__pycache__/demo.cpython-312.pyc
ADDED
Binary file (1.68 kB)
demo.py
ADDED

"""
ZamAI Embeddings Demo Application
This script creates a web interface for querying documents using multilingual embeddings.
"""
import gradio as gr
from setup import setup_embedding_model

# Set up the embedding model and query engine
print("Setting up embedding model and vector database...")
embedding_components = setup_embedding_model()
query_engine = embedding_components["query_engine"]

# Define the query function
def answer_query(query):
    """Process a user query and return relevant information from indexed documents"""
    if not query.strip():
        return "Please enter a query."

    try:
        result = query_engine.query(query)
        return str(result)
    except Exception as e:
        return f"Error processing query: {str(e)}"

# Create the Gradio interface
iface = gr.Interface(
    fn=answer_query,
    inputs=gr.Textbox(lines=2, placeholder="Ask in any language (English, Pashto, etc.)"),
    outputs="text",
    title="ZamAI Multilingual Embeddings Demo",
    description="Ask questions about your documents in any language, including Pashto and English."
)

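# A sketch of common alternatives to the default launch() call below
# (standard Gradio parameters; shown here for illustration):
#   iface.launch(server_name="0.0.0.0")  # serve on all network interfaces
#   iface.launch(share=True)             # request a temporary public link
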
if __name__ == "__main__":
    print("Starting Gradio web interface...")
    iface.launch()
    print("Interface closed.")
deployment_push-to-hf_Version2.ps1
ADDED

# push-to-hf.ps1

# 1) Log in to the HF CLI (you'll be prompted for your token)
huggingface-cli login

# 2) Move into your embeddings dir
cd .\models\embeddings

# 3) Init git if needed
if (-not (Test-Path .git)) {
    git init
}

# 4) Add or update the HF remote
$remoteUrl = 'https://huggingface.co/tasal9/Multilingual-ZamAI-Embeddings'
if (git remote | Select-String '^origin$') {
    git remote set-url origin $remoteUrl
} else {
    git remote add origin $remoteUrl
}

# 5) Commit & push
git add .
git commit -m "Initial commit to Hugging Face"
git branch -M main
git push origin main --force

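# Caution: --force overwrites any existing history on the remote 'main'
# branch; drop it if the repo on Hugging Face already has commits to keep.
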
Write-Host "✅ Your models/embeddings folder is now live at:"
Write-Host "   https://huggingface.co/tasal9/Multilingual-ZamAI-Embeddings/tree/main/models/embeddings"
indexer.py
ADDED

"""
ZamAI Document Indexer
This script helps add new documents to the embedding vector database.
"""
import os
import argparse
from setup import setup_embedding_model

def index_documents(corpus_path, db_path=None):
    """
    Index documents from the specified corpus path into the vector database.

    Args:
        corpus_path: Path to the directory containing documents to index
        db_path: Optional custom path for the ChromaDB database
    """
    if not os.path.exists(corpus_path):
        print(f"Error: Directory {corpus_path} does not exist.")
        return

    if not os.listdir(corpus_path):
        print(f"Error: No files found in {corpus_path}")
        return

    # Set up embedding model and components
    db_path = db_path or "./models/embeddings/chroma_db"
    embedding_components = setup_embedding_model(corpus_path=corpus_path, db_path=db_path)

    print(f"Successfully indexed documents from {corpus_path}")
    print(f"Vector database stored at {db_path}")

    # Return the components if needed for further processing
    return embedding_components

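# CLI sketch (flags as defined by the parser below; paths are illustrative):
#   python indexer.py --corpus data/text_corpus/
#   python indexer.py --corpus my_docs/ --db ./custom_chroma_db
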
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Index documents for ZamAI embeddings")
    parser.add_argument("--corpus", type=str, default="data/text_corpus/",
                        help="Path to the directory containing documents to index")
    parser.add_argument("--db", type=str, default=None,
                        help="Path to store the ChromaDB database (optional)")

    args = parser.parse_args()
    index_documents(args.corpus, args.db)
requirements.txt
ADDED

# ZamAI Embeddings Model Dependencies
llama-index>=0.9.0
chromadb>=0.4.0
llama-index-vector-stores-chroma>=0.1.0
# Needed for the llama_index.embeddings.huggingface import in setup.py
llama-index-embeddings-huggingface>=0.1.0
sentence-transformers>=2.2.0
gradio>=3.50.0
pydantic>=2.0.0
setup.py
ADDED

"""
ZamAI Embeddings Model Setup
This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
import os
import chromadb
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex

def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """
    Set up the embedding model and vector database for multilingual document retrieval.

    Args:
        corpus_path: Path to the text corpus directory
        db_path: Path where the ChromaDB database will be stored
        model_name: Name of the HuggingFace embedding model to use

    Returns:
        A dict with the index, query engine, embedding model, and vector store
    """
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    # Load documents if corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []

    # Initialize embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
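    # get_or_create_collection reuses the same named collection on later runs,
    # so vectors already stored under db_path persist across restarts (note:
    # re-indexing the same corpus via from_documents below adds duplicate entries).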
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build the index if we have documents
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # If no documents yet, initialize the index from the existing vector store
        # (from_vector_store derives its own storage context from the store)
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model
        )

    # Create a query engine
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store
    }

if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use embedding_components['query_engine'] to search your documents.")
simple_demo.py
ADDED

"""
ZamAI Simple Multilingual Embeddings Demo
This script demonstrates embedding sentences in multiple languages, including Pashto.
"""
from sentence_transformers import SentenceTransformer
import numpy as np

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors"""
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

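# Equivalent library routine: sentence_transformers.util.cos_sim computes the
# same cosine similarity over batches of embeddings, if preferred.
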
def print_similarities(model, sentences, query_idx=0):
    """Print similarity scores between a query and all other sentences"""
    # Get embeddings for all sentences
    embeddings = model.encode(sentences)

    # Get the query embedding
    query_embedding = embeddings[query_idx]
    query = sentences[query_idx]

    print(f"Query: '{query}'")
    print("Similarities:")

    # Calculate similarities with all other sentences
    for i, sentence in enumerate(sentences):
        if i == query_idx:
            continue

        similarity = cosine_similarity(query_embedding, embeddings[i])
        print(f"- {similarity:.4f}: '{sentence}'")
    print()

def main():
    # Load the multilingual model
    print("Loading multilingual embedding model...")
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    model = SentenceTransformer(model_name)
    print(f"Model loaded: {model_name}")

    # English-English similarity
    print("\n===== English-English Similarity =====")
    english_sentences = [
        "This is a sample sentence in English.",
        "This sentence is similar to the first one.",
        "This sentence has nothing to do with the others."
    ]
    print_similarities(model, english_sentences, query_idx=0)

    # Pashto-Pashto similarity
    print("\n===== Pashto-Pashto Similarity =====")
    pashto_sentences = [
        "دا په پښتو کې یوه نمونه جمله ده.",  # This is a sample sentence in Pashto.
        "دا جمله د لومړۍ جملې سره ورته ده.",  # This sentence is similar to the first one.
        "دا جمله د نورو سره هېڅ تړاو نلري."  # This sentence has nothing to do with the others.
    ]
    print_similarities(model, pashto_sentences, query_idx=0)

    # Cross-lingual similarity (English-Pashto)
    print("\n===== Cross-lingual Similarity (English-Pashto) =====")
    cross_lingual_sentences = [
        "This is a sample sentence in English.",
        "دا په پښتو کې یوه نمونه جمله ده.",  # This is a sample sentence in Pashto.
        "I'm learning to speak Pashto.",
        "زه د پښتو ژبې زده کړه کوم."  # I'm learning the Pashto language.
    ]
    print_similarities(model, cross_lingual_sentences, query_idx=0)

    # Cross-lingual similarity (Pashto-English)
    print("\n===== Cross-lingual Similarity (Pashto-English) =====")
    print_similarities(model, cross_lingual_sentences, query_idx=1)

if __name__ == "__main__":
    main()