tasal9 committed on
Commit
477cc73
·
0 Parent(s):

Initial commit to Hugging Face

Browse files
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ZamAI Multilingual Embeddings
2
+
3
+ This directory contains tools and utilities for working with multilingual embedding models, with a focus on Pashto language support. The embeddings enable semantic search, document retrieval, and other natural language processing tasks across multiple languages.
4
+
5
+ ## Model Information
6
+
7
+ - **Base Model**: [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
8
+ - **Languages Supported**: 50+ including Pashto, English, Arabic, Urdu, Farsi, and more
9
+ - **Vector Database**: ChromaDB
10
+ - **Integration Framework**: LlamaIndex
11
+
12
+ ## Directory Structure
13
+
14
+ ```
15
+ embeddings/
16
+ ├── setup.py # Setup script for the embeddings model and vector store
17
+ ├── demo.py # Demo application with Gradio web UI
18
+ ├── indexer.py # Utility for indexing new documents
19
+ ├── requirements.txt # Dependencies for the embeddings components
20
+ └── chroma_db/ # Directory for the vector database (created on first run)
21
+ ```
22
+
23
+ ## Getting Started
24
+
25
+ 1. Install the dependencies:
26
+ ```bash
27
+ pip install -r models/embeddings/requirements.txt
28
+ ```
29
+
30
+ 2. Add documents to index:
31
+ ```bash
32
+ # Place your text files in the data/text_corpus directory
33
+ python models/embeddings/indexer.py --corpus data/text_corpus/
34
+ ```
35
+
36
+ 3. Run the demo application:
37
+ ```bash
38
+ python models/embeddings/demo.py
39
+ ```
40
+
41
+ ## Using the Embeddings in Your Code
42
+
43
+ ```python
44
+ from models.embeddings.setup import setup_embedding_model
45
+
46
+ # Initialize the model and related components
47
+ embedding_components = setup_embedding_model()
48
+
49
+ # Get the query engine
50
+ query_engine = embedding_components["query_engine"]
51
+
52
+ # Query in any language
53
+ result = query_engine.query("What is the capital of Afghanistan?")
54
+ # Or in Pashto
55
+ result = query_engine.query("د افغانستان پلازمېنه څه ده؟")
56
+
57
+ print(result)
58
+ ```
59
+
60
+ ## Reference
61
+
62
+ This implementation is based on the [Multilingual-ZamAI-Embeddings](https://huggingface.co/tasal9/Multilingual-ZamAI-Embeddings) model from Hugging Face.
__pycache__/demo.cpython-312.pyc ADDED
Binary file (1.68 kB). View file
 
demo.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZamAI Embeddings Demo Application
3
+ This script creates a web interface for querying documents using multilingual embeddings.
4
+ """
5
+ import gradio as gr
6
+ from setup import setup_embedding_model
7
+
8
+ # Set up the embedding model and query engine
9
+ print("Setting up embedding model and vector database...")
10
+ embedding_components = setup_embedding_model()
11
+ query_engine = embedding_components["query_engine"]
12
+
13
+ # Define the query function
14
def answer_query(query):
    """Answer a user query from the indexed documents.

    Args:
        query: Text submitted through the web interface. ``None`` and
            whitespace-only input are both treated as an empty query.

    Returns:
        The query engine's response converted to a string, the prompt
        "Please enter a query." for empty input, or an error message
        if running the query raised an exception.
    """
    # Guard against None as well as blank text: None has no .strip()
    if query is None or not str(query).strip():
        return "Please enter a query."

    try:
        result = query_engine.query(query)
        return str(result)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        return f"Error processing query: {str(e)}"
24
+
25
+ # Create the Gradio interface
26
+ iface = gr.Interface(
27
+ fn=answer_query,
28
+ inputs=gr.Textbox(lines=2, placeholder="Ask in any language (English, Pashto, etc.)"),
29
+ outputs="text",
30
+ title="ZamAI Multilingual Embeddings Demo",
31
+ description="Ask questions about your documents in any language, including Pashto and English."
32
+ )
33
+
34
+ if __name__ == "__main__":
35
+ print("Starting Gradio web interface...")
36
+ iface.launch()
37
+ print("Interface closed.")
deployment_push-to-hf_Version2.ps1 ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# push-to-hf.ps1
# Publishes the contents of .\models\embeddings as the root of a Hugging Face repo.

# 1) Login to HF CLI (you'll be prompted for your token)
huggingface-cli login

# 2) Move into your embeddings dir
cd .\models\embeddings

# 3) Init git if needed
if (-not (Test-Path .git)) {
    git init
}

# 4) Add or update the HF remote
# Repo URLs have the form https://huggingface.co/<user>/<repo>; the user
# segment must appear only once.
$remoteUrl = 'https://huggingface.co/tasal9/Multilingual-ZamAI-Embeddings'
if (git remote | Select-String '^origin$') {
    git remote set-url origin $remoteUrl
} else {
    git remote add origin $remoteUrl
}

# 5) Commit & push
git add .
git commit -m "Initial commit to Hugging Face"
git branch -M main
# NOTE: --force overwrites any history that already exists on the remote main branch
git push origin main --force

# The git repo is rooted inside models/embeddings, so the files appear at the
# top level of the remote repository, not under models/embeddings.
Write-Host "✅ Your models/embeddings folder is now live at:"
Write-Host "   $remoteUrl/tree/main"
indexer.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZamAI Document Indexer
3
+ This script helps add new documents to the embedding vector database.
4
+ """
5
+ import os
6
+ import argparse
7
+ from llama_index.readers.file import SimpleDirectoryReader
8
+ from setup import setup_embedding_model
9
+
10
def index_documents(corpus_path, db_path=None):
    """
    Index documents from the specified corpus path into the vector database.

    Args:
        corpus_path: Path to the directory containing documents to index
        db_path: Optional custom path for the ChromaDB database; falls back
            to "./models/embeddings/chroma_db" when omitted or empty

    Returns:
        The dictionary returned by setup_embedding_model, or None when
        corpus_path is not an existing directory or contains no entries.
    """
    # isdir (not exists): a regular file would pass an exists() check and
    # then make os.listdir raise NotADirectoryError
    if not os.path.isdir(corpus_path):
        print(f"Error: Directory {corpus_path} does not exist.")
        return None

    if not os.listdir(corpus_path):
        print(f"Error: No files found in {corpus_path}")
        return None

    # Set up embedding model and components
    db_path = db_path or "./models/embeddings/chroma_db"
    embedding_components = setup_embedding_model(corpus_path=corpus_path, db_path=db_path)

    print(f"Successfully indexed documents from {corpus_path}")
    print(f"Vector database stored at {db_path}")

    # Return the components if needed for further processing
    return embedding_components
35
+
36
+ if __name__ == "__main__":
37
+ parser = argparse.ArgumentParser(description="Index documents for ZamAI embeddings")
38
+ parser.add_argument("--corpus", type=str, default="data/text_corpus/",
39
+ help="Path to the directory containing documents to index")
40
+ parser.add_argument("--db", type=str, default=None,
41
+ help="Path to store the ChromaDB database (optional)")
42
+
43
+ args = parser.parse_args()
44
+ index_documents(args.corpus, args.db)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # ZamAI Embeddings Model Dependencies
2
+ llama-index>=0.9.0
3
+ chromadb>=0.4.0
4
+ llama-index-vector-stores-chroma>=0.1.0
5
+ sentence-transformers>=2.2.0
6
+ gradio>=3.50.0
7
+ pydantic>=2.0.0
setup.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZamAI Embeddings Model Setup
3
+ This script sets up the Multilingual ZamAI Embeddings model and vector database.
4
+ """
5
+ import os
6
+ import chromadb
7
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
8
+ from llama_index.vector_stores.chroma import ChromaVectorStore
9
+ from llama_index.core import StorageContext, VectorStoreIndex
10
+ from llama_index.readers.file import SimpleDirectoryReader
11
+
12
def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """
    Set up the embedding model and vector database for multilingual document retrieval.

    Args:
        corpus_path: Path to the text corpus directory (created if missing)
        db_path: Path where the ChromaDB database will be stored
        model_name: Name of the HuggingFace embedding model to use

    Returns:
        A dict with the keys "index", "query_engine", "embed_model" and
        "vector_store".
    """
    # NOTE(review): SimpleDirectoryReader is usually imported from
    # llama_index.core -- confirm the module-level import resolves.

    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    # dirname() is "" for a bare name such as "chroma_db", and
    # os.makedirs("") raises FileNotFoundError, so only create a real parent
    db_parent = os.path.dirname(db_path)
    if db_parent:
        os.makedirs(db_parent, exist_ok=True)

    # Load documents if corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []

    # Initialize embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build or load index if we have documents
    if text_docs:
        # NOTE(review): the same collection is reused on every call, so
        # re-running over an unchanged corpus presumably stores the
        # documents again -- verify and de-duplicate if that matters.
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # If no documents yet, just initialize the index with the embedding model
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model,
            storage_context=storage_context
        )

    # Create a query engine
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store
    }
68
+
69
+ if __name__ == "__main__":
70
+ # Example usage
71
+ embedding_components = setup_embedding_model()
72
+ print("Embedding model and vector store setup complete!")
73
+ print("You can now use the embedding_components['query_engine'] to search your documents.")
simple_demo.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZamAI Simple Multilingual Embeddings Demo
3
+ This script demonstrates embedding sentences in multiple languages, including Pashto.
4
+ """
5
+ from sentence_transformers import SentenceTransformer
6
+ import numpy as np
7
+
8
def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (any 1-D array-like accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two norms, or 0.0
        when either vector has zero norm (the similarity is undefined
        there, and dividing would produce NaN plus a runtime warning).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator
14
+
15
def print_similarities(model, sentences, query_idx=0):
    """Print the cosine similarity of one sentence against all the others.

    Args:
        model: Object whose ``encode(sentences)`` returns one embedding
            per sentence, indexable by position.
        sentences: Sequence of sentences to embed.
        query_idx: Position of the sentence used as the query.
    """
    # Embed everything in one call, then pick out the query's vector
    vectors = model.encode(sentences)
    anchor_vector = vectors[query_idx]
    anchor_text = sentences[query_idx]

    print(f"Query: '{anchor_text}'")
    print("Similarities:")

    # Score every sentence except the query itself
    for position, text in enumerate(sentences):
        if position != query_idx:
            score = cosine_similarity(anchor_vector, vectors[position])
            print(f"- {score:.4f}: '{text}'")
    print()
35
+
36
def main():
    """Load the multilingual model and print four similarity comparisons."""
    # Load the multilingual model
    print("Loading multilingual embedding model...")
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    model = SentenceTransformer(model_name)
    print(f"Model loaded: {model_name}")

    english_sentences = [
        "This is a sample sentence in English.",
        "This sentence is similar to the first one.",
        "This sentence has nothing to do with the others.",
    ]
    pashto_sentences = [
        "دا په پښتو کې یوه نمونه جمله ده.",  # This is a sample sentence in Pashto.
        "دا جمله د لومړۍ جملې سره ورته ده.",  # This sentence is similar to the first one.
        "دا جمله د نورو سره هېڅ تړاو نلري.",  # This sentence has nothing to do with the others.
    ]
    cross_lingual_sentences = [
        "This is a sample sentence in English.",
        "دا په پښتو کې یوه نمونه جمله ده.",  # This is a sample sentence in Pashto.
        "I'm learning to speak Pashto.",
        "زه د پښتو ژبې زده کړه کوم.",  # I'm learning the Pashto language.
    ]

    # (section heading, sentences, index of the query sentence), in print order
    sections = [
        ("\n===== English-English Similarity =====", english_sentences, 0),
        ("\n===== Pashto-Pashto Similarity =====", pashto_sentences, 0),
        ("\n===== Cross-lingual Similarity (English-Pashto) =====", cross_lingual_sentences, 0),
        ("\n===== Cross-lingual Similarity (Pashto-English) =====", cross_lingual_sentences, 1),
    ]
    for heading, group, anchor in sections:
        print(heading)
        print_similarities(model, group, query_idx=anchor)
74
+
75
+ if __name__ == "__main__":
76
+ main()