ramailkk commited on
Commit
32005ff
·
1 Parent(s): 04f14ca

data to generation

Browse files
data_loader.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import requests
3
+ import io
4
+ import arxiv
5
+ import pandas as pd
6
+
7
def extract_text_from_url(pdf_url):
    """Download a PDF and extract all of its text as a single line.

    Args:
        pdf_url: Direct URL of a PDF file.

    Returns:
        The concatenated text of every page with newlines collapsed to
        spaces, or "" if the download or parse fails (the error is printed,
        never raised, so a bad paper does not abort a whole crawl).
    """
    try:
        # Timeout prevents one hung connection from stalling the crawl forever.
        response = requests.get(pdf_url, timeout=30)
        # Fail fast on HTTP errors instead of feeding an error page to the parser.
        response.raise_for_status()
        # Open the PDF directly from the in-memory byte stream (no temp file).
        with fitz.open(stream=io.BytesIO(response.content), filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        return text.replace('\n', ' ')
    except Exception as e:
        print(f"Error downloading {pdf_url}: {e}")
        return ""
20
+
21
def fetch_arxiv_data(category="cs.AI", limit=5):
    """Fetch the most recently submitted arXiv papers in a category.

    Returns a pandas DataFrame with one row per paper: id, title,
    abstract, full PDF text, and PDF URL.
    """
    api = arxiv.Client()
    query = arxiv.Search(
        query=f"cat:{category}",
        max_results=limit,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    rows = []
    for paper in api.results(query):
        print(f"Downloading full text for: {paper.title[:50]}...")
        rows.append({
            "id": paper.entry_id.split('/')[-1],
            "title": paper.title,
            "abstract": paper.summary.replace('\n', ' '),
            # Full-body text pulled from the PDF itself ("" on download failure).
            "full_text": extract_text_from_url(paper.pdf_url),
            "url": paper.pdf_url,
        })
    return pd.DataFrame(rows)
models/deepseek_v3.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+
3
class DeepSeek_V3:
    """Streaming chat wrapper around the hosted DeepSeek-V3 endpoint."""

    def __init__(self, token):
        self.client = InferenceClient(token=token)
        self.model_id = "deepseek-ai/DeepSeek-V3"

    def generate(self, prompt, max_tokens=500, temperature=0.15):
        """Send a single-turn prompt and return the streamed completion text.

        On any API failure an error string is returned instead of raising,
        so the caller can surface it directly in the UI.
        """
        pieces = []
        try:
            stream = self.client.chat_completion(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            )
            for event in stream:
                if event.choices:
                    delta = event.choices[0].delta.content
                    if delta:
                        pieces.append(delta)
        except Exception as e:
            return f"⚠️ DeepSeek API Busy: {e}"
        return "".join(pieces)
models/llama_3_8b.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+
3
class Llama3_8B:
    """Streaming chat wrapper around Meta-Llama-3-8B-Instruct on the HF Inference API."""

    def __init__(self, token):
        self.client = InferenceClient(token=token)
        self.model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

    def generate(self, prompt, max_tokens=500, temp=0.1):
        """Send a single-turn prompt and return the streamed completion text.

        Fix: the streaming call is now wrapped in try/except so a transient
        API failure returns an error string instead of raising — matching the
        DeepSeek / Mistral / TinyAya wrappers in this package.
        """
        response = ""
        try:
            for message in self.client.chat_completion(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temp,
                stream=True,
            ):
                if message.choices:
                    content = message.choices[0].delta.content
                    if content:
                        response += content
        except Exception as e:
            return f"⚠️ Llama-3 API Error: {e}"
        return response
models/mistral_7b.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import InferenceClient
3
+
4
class Mistral_7b:
    """Streaming chat wrapper for Mistral-7B-Instruct served via Featherless AI."""

    def __init__(self, token):
        # The newer client accepts the token through `api_key`.
        self.client = InferenceClient(api_key=token)
        # Provider-routed model id: the ":featherless-ai" suffix selects the provider.
        self.model_id = "mistralai/Mistral-7B-Instruct-v0.2:featherless-ai"

    def generate(self, prompt, max_tokens=500, **kwargs):
        """Return the streamed completion for `prompt`, or an error string on failure.

        The sampling temperature may be passed as either `temperature` or
        `temp` (default 0.2) so every wrapper can be called interchangeably.
        """
        temperature = kwargs.get('temperature', kwargs.get('temp', 0.2))

        parts = []
        try:
            # Featherless uses the OpenAI-style .chat.completions.create surface.
            stream = self.client.chat.completions.create(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            )
            for chunk in stream:
                delta = chunk.choices[0].delta.content if chunk.choices else None
                if delta:
                    parts.append(delta)
        except Exception as e:
            return f"❌ Mistral Featherless Error: {e}"

        return "".join(parts)
models/qwen_2_5.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+
3
class Qwen2_5:
    """Streaming chat wrapper around Qwen2.5-72B-Instruct on the HF Inference API."""

    def __init__(self, token):
        self.client = InferenceClient(token=token)
        self.model_id = "Qwen/Qwen2.5-72B-Instruct"

    def generate(self, prompt, max_tokens=500, temperature=0.3):
        """Send a single-turn prompt and return the streamed completion text.

        Fix: the streaming call is now wrapped in try/except so a transient
        API failure returns an error string instead of raising — matching the
        DeepSeek / Mistral / TinyAya wrappers in this package.
        """
        response = ""
        try:
            for message in self.client.chat_completion(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            ):
                if message.choices:
                    content = message.choices[0].delta.content
                    if content:
                        response += content
        except Exception as e:
            return f"⚠️ Qwen API Error: {e}"
        return response
models/tiny_aya.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+
3
class TinyAya:
    """Streaming chat wrapper around the compact multilingual Tiny Aya model."""

    def __init__(self, token):
        self.client = InferenceClient(token=token)
        # 3.3B parameter model, great for multilingual/efficient RAG
        self.model_id = "CohereLabs/tiny-aya-global"

    def generate(self, prompt, max_tokens=400, **kwargs):
        """Return the streamed completion, or an error string on API failure.

        The sampling temperature may be passed as `temperature` or `temp`
        (default 0.3), keeping the call signature compatible with callers
        that use either spelling.
        """
        # Prefer 'temperature', fall back to 'temp', then to the 0.3 default.
        temperature = kwargs.get('temperature', kwargs.get('temp', 0.3))

        collected = []
        try:
            for event in self.client.chat_completion(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            ):
                if event.choices:
                    piece = event.choices[0].delta.content
                    if piece:
                        collected.append(piece)
        except Exception as e:
            return f"❌ TinyAya Error: {e}"

        return "".join(collected)
retriever/generator.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class RAGGenerator:
    """Assembles RAG prompts and delegates answering to a model wrapper."""

    def generate_prompt(self, query, retrieved_contexts):
        """Prepares the academic prompt template."""
        labelled_sources = [
            f"--- Source {i+1} ---\n{c}" for i, c in enumerate(retrieved_contexts)
        ]
        context_text = "\n\n".join(labelled_sources)

        return f"""You are an expert academic assistant. Use the following pieces of retrieved context to answer the question.
If the answer isn't in the context, say you don't know based on the provided documents.

Context:
{context_text}

Question: {query}

Answer:"""

    def get_answer(self, model_instance, query, retrieved_contexts, **kwargs):
        """Uses a specific model instance to generate the final answer."""
        final_prompt = self.generate_prompt(query, retrieved_contexts)
        return model_instance.generate(final_prompt, **kwargs)
retriever/processor.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import (
2
+ RecursiveCharacterTextSplitter,
3
+ CharacterTextSplitter,
4
+ SentenceTransformersTokenTextSplitter
5
+ )
6
+ from langchain_experimental.text_splitter import SemanticChunker
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+ from sentence_transformers import SentenceTransformer
9
+
10
class ChunkProcessor:
    """Splits document text into chunks and embeds them for vector upserts."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        # Shared bi-encoder used for chunk embeddings.
        self.encoder = SentenceTransformer(model_name)
        # Required for Semantic Chunking
        self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)

    def get_splitter(self, technique, chunk_size=500, chunk_overlap=50, **kwargs):
        """
        Factory method to return different chunking strategies.

        Raises ValueError for an unknown `technique`.
        """
        # Lazy factories: only the selected splitter is ever constructed.
        factories = {
            "fixed": lambda: CharacterTextSplitter(
                separator=kwargs.get('separator', ""),
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            ),
            "recursive": lambda: RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            ),
            "character": lambda: CharacterTextSplitter(
                separator=kwargs.get('separator', "\n\n"),
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            ),
            # Recursive splitter configured for sentence boundaries; avoids the
            # Spacy [E050] error while still respecting full sentences.
            "sentence": lambda: RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""],
            ),
            "semantic": lambda: SemanticChunker(
                self.hf_embeddings,
                breakpoint_threshold_type="percentile",
            ),
            "token": lambda: SentenceTransformersTokenTextSplitter(
                model_name=self.model_name,
                tokens_per_chunk=chunk_size,
                chunk_overlap=chunk_overlap,
            ),
        }

        if technique not in factories:
            raise ValueError(f"Technique '{technique}' is not supported.")
        return factories[technique]()

    def process(self, df, technique="recursive", chunk_size=500, chunk_overlap=50, **kwargs):
        """
        Processes a DataFrame into vector-ready chunks with full output for 5 documents.
        """
        splitter = self.get_splitter(technique, chunk_size, chunk_overlap, **kwargs)
        vector_records = []

        # Take the first 5 documents as requested
        for _, doc_row in df.head(5).iterrows():
            print(f"\n" + "="*80)
            print(f"📄 DOCUMENT: {doc_row['title']}")
            print(f"🔗 URL: {doc_row['url']}")
            print("-" * 80)

            # Split the document body with the chosen strategy.
            pieces = splitter.split_text(doc_row['full_text'])
            print(f"🎯 Technique: {technique.upper()} | Total Chunks: {len(pieces)}")

            for idx, piece in enumerate(pieces):
                # Some splitters return Document objects, others plain strings.
                body = piece.page_content if hasattr(piece, 'page_content') else piece

                # Print the full content of every chunk
                print(f"\n[Chunk {idx}] ({len(body)} chars):")
                print(f" {body}")

                vector_records.append({
                    "id": f"{doc_row['id']}-chunk-{idx}",
                    "values": self.encoder.encode(body).tolist(),
                    "metadata": {
                        "title": doc_row['title'],
                        "text": body,
                        "url": doc_row['url'],
                        "chunk_index": idx,
                        "technique": technique,
                    },
                })
            print("="*80)

        print(f"\n✅ Finished processing 5 documents into {len(vector_records)} chunks.")
        return vector_records
retriever/retriever.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from rank_bm25 import BM25Okapi
3
+ from sentence_transformers import CrossEncoder
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
class HybridRetriever:
    """Hybrid dense + lexical retriever with pluggable re-ranking.

    Candidates come from a dense vector index (semantic), from BM25
    (lexical), or from both ("hybrid"); the candidate set can then be
    re-ranked with a cross-encoder, fused with Reciprocal Rank Fusion,
    or diversified with Maximal Marginal Relevance.
    """
    def __init__(self, final_chunks, embed_model, rerank_model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        """
        :param final_chunks: The list of chunk dictionaries with metadata.
        :param embed_model: The SentenceTransformer model used for query and chunk embedding.
        :param rerank_model_name: Cross-encoder checkpoint used for pairwise re-ranking.
        """
        self.final_chunks = final_chunks
        self.embed_model = embed_model
        self.rerank_model = CrossEncoder(rerank_model_name)

        # Initialize BM25 corpus
        # NOTE(review): naive lowercase whitespace tokenization — punctuation
        # stays attached to tokens; confirm this matches how queries are tokenized.
        self.tokenized_corpus = [chunk['metadata']['text'].lower().split() for chunk in final_chunks]
        self.bm25 = BM25Okapi(self.tokenized_corpus)

    def _rrf_score(self, semantic_results, bm25_results, k=60):
        """Reciprocal Rank Fusion (RRF) Implementation.

        Each ranked list contributes 1/(k + rank + 1) per chunk; a chunk
        appearing in both lists accumulates both contributions, so consensus
        hits float to the top. Returns chunk texts sorted by fused score.
        """
        scores = {}
        for rank, chunk in enumerate(semantic_results):
            scores[chunk] = scores.get(chunk, 0) + 1 / (k + rank + 1)
        for rank, chunk in enumerate(bm25_results):
            scores[chunk] = scores.get(chunk, 0) + 1 / (k + rank + 1)

        # Highest fused score first.
        sorted_chunks = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [item[0] for item in sorted_chunks]

    def _maximal_marginal_relevance(self, query_embedding, chunk_texts, lambda_param=0.5, top_k=3):
        """
        MMR Re-ranking to balance relevance and diversity.

        Greedily picks chunks maximizing
        lambda * relevance - (1 - lambda) * max-similarity-to-selected,
        so later picks are penalized for redundancy with earlier ones.
        Returns up to top_k chunk texts in selection order.
        """
        if not chunk_texts: return []

        chunk_embeddings = self.embed_model.encode(chunk_texts)
        query_embedding = query_embedding.reshape(1, -1)

        # Initial relevance scores
        relevance_scores = cosine_similarity(query_embedding, chunk_embeddings)[0]

        selected_indices = []
        unselected_indices = list(range(len(chunk_texts)))

        # First pick: most relevant
        idx = np.argmax(relevance_scores)
        selected_indices.append(idx)
        unselected_indices.remove(idx)

        while len(selected_indices) < min(top_k, len(chunk_texts)):
            mmr_scores = []
            for un_idx in unselected_indices:
                # Similarity to query
                rel = relevance_scores[un_idx]
                # Max similarity to already selected chunks (redundancy)
                sim_to_selected = max([cosine_similarity(chunk_embeddings[un_idx].reshape(1, -1),
                                                         chunk_embeddings[sel_idx].reshape(1, -1))[0][0]
                                       for sel_idx in selected_indices])

                mmr_score = lambda_param * rel - (1 - lambda_param) * sim_to_selected
                mmr_scores.append((un_idx, mmr_score))

            # Greedy step: take the best marginal candidate this round.
            next_idx = max(mmr_scores, key=lambda x: x[1])[0]
            selected_indices.append(next_idx)
            unselected_indices.remove(next_idx)

        return [chunk_texts[i] for i in selected_indices]

    def search(self, query, index, top_k=10, final_k=3, mode="hybrid", rerank_strategy="cross-encoder"):
        """
        Retrieve up to final_k chunk texts for `query`.

        :param query: Natural-language query string.
        :param index: Vector index with a Pinecone-style .query() API
                      (returns {'matches': [{'metadata': {'text': ...}}]}).
        :param top_k: Candidate pool size per retrieval source.
        :param final_k: Number of chunks returned after re-ranking.
        :param mode: "semantic", "bm25", or "hybrid"
        :param rerank_strategy: "cross-encoder", "rrf", "mmr", or "none"
        """
        semantic_chunks = []
        bm25_chunks = []
        # Cached so MMR can reuse the encoding when mode already computed it.
        query_vector = None

        # 1. Fetch Candidates
        if mode in ["semantic", "hybrid"]:
            query_vector = self.embed_model.encode(query)
            res = index.query(vector=query_vector.tolist(), top_k=top_k, include_metadata=True)
            semantic_chunks = [match['metadata']['text'] for match in res['matches']]

        if mode in ["bm25", "hybrid"]:
            tokenized_query = query.lower().split()
            bm25_scores = self.bm25.get_scores(tokenized_query)
            # Indices of the top_k highest BM25 scores, best first.
            top_indices = np.argsort(bm25_scores)[::-1][:top_k]
            bm25_chunks = [self.final_chunks[i]['metadata']['text'] for i in top_indices]

        # 2. Re-Ranking / Fusion
        # RRF needs the two rank-ordered lists separately, so it is handled
        # before the lists are merged.
        if mode == "hybrid" and rerank_strategy == "rrf":
            return self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]

        # Standard combination for other methods
        combined = list(dict.fromkeys(semantic_chunks + bm25_chunks)) # Deduplicate keep order

        if rerank_strategy == "cross-encoder" and combined:
            # Joint (query, chunk) scoring; strongest but slowest re-ranker.
            pairs = [[query, chunk] for chunk in combined]
            scores = self.rerank_model.predict(pairs)
            results = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
            return [res[0] for res in results[:final_k]]

        elif rerank_strategy == "mmr" and combined:
            # bm25-only mode reaches here without a query embedding; compute it now.
            if query_vector is None: query_vector = self.embed_model.encode(query)
            return self._maximal_marginal_relevance(query_vector, combined, top_k=final_k)

        # rerank_strategy == "none" (or empty candidate set): truncate in rank order.
        return combined[:final_k]