ramailkk commited on
Commit
7513ffe
·
1 Parent(s): 32005ff

deleted files

Browse files
Files changed (2) hide show
  1. data_processor.py +0 -69
  2. retriever.py +0 -78
data_processor.py DELETED
@@ -1,69 +0,0 @@
1
- import arxiv
2
- import pandas as pd
3
- from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
4
- from sentence_transformers import SentenceTransformer
5
-
6
def fetch_arxiv_data(category="cs.AI", limit=10):
    """Fetch recent paper metadata and abstracts from arXiv.

    :param category: arXiv category to query (e.g. "cs.AI").
    :param limit: maximum number of papers to fetch.
    :return: DataFrame with columns ``id``, ``title``, ``abstract``, ``url``.
             The columns are present even when the query returns no results.
    """
    client = arxiv.Client()
    search = arxiv.Search(
        query=f"cat:{category}",
        max_results=limit,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    results = [
        {
            # entry_id is a full URL; keep only the trailing arXiv identifier.
            "id": r.entry_id.split('/')[-1],
            "title": r.title,
            # Abstracts arrive with hard line breaks; flatten to one line.
            "abstract": r.summary.replace('\n', ' '),
            "url": r.pdf_url,
        }
        for r in client.results(search)
    ]

    # Fix: pin the column schema so an empty fetch still yields a DataFrame
    # that downstream code can index by column without a KeyError.
    return pd.DataFrame(results, columns=["id", "title", "abstract", "url"])
25
-
26
def get_text_splitter(technique="recursive", chunk_size=500, chunk_overlap=50):
    """Build a text splitter for the requested chunking technique.

    :param technique: "recursive" or "character".
    :param chunk_size: maximum characters per chunk.
    :param chunk_overlap: characters shared between adjacent chunks.
    :return: a configured LangChain text splitter instance.
    :raises ValueError: if ``technique`` is not one of the supported names.
    """
    if technique == "recursive":
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    elif technique == "character":
        splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    else:
        raise ValueError(f"Technique '{technique}' not supported.")
    return splitter
41
-
42
def process_to_chunks(df, model_name='all-MiniLM-L6-v2', technique="recursive", chunk_size=500, chunk_overlap=50):
    """Split each paper abstract into chunks and embed every chunk.

    :param df: DataFrame with ``id``, ``title``, ``abstract`` and ``url`` columns.
    :param model_name: SentenceTransformer model used for the embeddings.
    :param technique: chunking technique forwarded to ``get_text_splitter``.
    :param chunk_size: maximum characters per chunk.
    :param chunk_overlap: characters shared between adjacent chunks.
    :return: list of dicts shaped for vector-index upsert
             (``id``, ``values``, ``metadata``).
    """
    # Load the requested embedding model up front (this can be slow).
    print(f"🔧 Initializing Model: {model_name}...")
    model = SentenceTransformer(model_name)

    splitter = get_text_splitter(technique, chunk_size, chunk_overlap)

    processed_chunks = []
    for _, row in df.iterrows():
        # One record per chunk; chunk ids stay unique via the running index.
        for chunk_idx, chunk_text in enumerate(splitter.split_text(row['abstract'])):
            processed_chunks.append({
                "id": f"{row['id']}-chunk-{chunk_idx}",
                "values": model.encode(chunk_text).tolist(),
                "metadata": {
                    "title": row['title'],
                    "text": chunk_text,
                    "url": row['url'],
                },
            })
    return processed_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
retriever.py DELETED
@@ -1,78 +0,0 @@
1
- import numpy as np
2
- from rank_bm25 import BM25Okapi
3
- from sentence_transformers import CrossEncoder
4
-
5
class HybridRetriever:
    """Hybrid semantic + BM25 retriever with optional re-ranking."""

    def __init__(self, final_chunks, embed_model, rerank_model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        """
        Initializes the search engines.

        :param final_chunks: The list of chunk dictionaries with metadata.
        :param embed_model: The SentenceTransformer model object used for query embedding.
        :param rerank_model_name: CrossEncoder model used for re-ranking.
        """
        self.final_chunks = final_chunks
        self.embed_model = embed_model
        self.rerank_model = CrossEncoder(rerank_model_name)

        # Build the local BM25 keyword index over lower-cased whitespace tokens.
        self.tokenized_corpus = [chunk['metadata']['text'].lower().split() for chunk in final_chunks]
        self.bm25 = BM25Okapi(self.tokenized_corpus)

    def _rrf_score(self, semantic_results, bm25_results, k=60):
        """
        Reciprocal Rank Fusion (RRF): score = sum over lists of 1 / (k + rank),
        where rank is 1-based within each result list.

        :param semantic_results: chunk texts ranked by semantic similarity.
        :param bm25_results: chunk texts ranked by BM25 score.
        :param k: RRF smoothing constant (60 is the standard from the RRF paper).
        :return: fused chunk texts, best first.
        """
        scores = {}
        for rank, chunk in enumerate(semantic_results):
            scores[chunk] = scores.get(chunk, 0) + 1 / (k + rank + 1)
        for rank, chunk in enumerate(bm25_results):
            scores[chunk] = scores.get(chunk, 0) + 1 / (k + rank + 1)

        # Sort by fused score, descending.
        return [chunk for chunk, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)]

    def search(self, query, index, top_k=10, mode="all", rerank_type="cross-encoder", final_k=3):
        """
        Run a hybrid search over the vector index and the local BM25 index.

        :param query: free-text query string.
        :param index: vector index supporting ``query(vector=..., top_k=..., include_metadata=...)``.
        :param top_k: candidates retrieved per engine before fusion.
        :param mode: "semantic", "bm25", or "all".
        :param rerank_type: "cross-encoder", "rrf", or "none".
        :param final_k: number of chunks returned (generalizes the previous
                        hard-coded cutoff of 3; default preserves old behavior).
        :return: list of up to ``final_k`` chunk texts, best first.
        """
        semantic_chunks = []
        bm25_chunks = []

        # A. Semantic search against the external vector index.
        if mode in ["semantic", "all"]:
            query_vector = self.embed_model.encode(query).tolist()
            res = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
            semantic_chunks = [match['metadata']['text'] for match in res['matches']]

        # B. Keyword search against the local BM25 index.
        if mode in ["bm25", "all"]:
            tokenized_query = query.lower().split()
            bm25_scores = self.bm25.get_scores(tokenized_query)
            top_indices = np.argsort(bm25_scores)[::-1][:top_k]
            bm25_chunks = [self.final_chunks[i]['metadata']['text'] for i in top_indices]

        # C. Combination and fusion.
        if mode == "semantic":
            combined = semantic_chunks
        elif mode == "bm25":
            combined = bm25_chunks
        else:
            # Mode is "all".
            if rerank_type == "rrf":
                return self._rrf_score(semantic_chunks, bm25_chunks)[:final_k]
            # Fix: dedupe with dict.fromkeys instead of set() so the candidate
            # order is deterministic (semantic hits first, then new BM25 hits).
            combined = list(dict.fromkeys(semantic_chunks + bm25_chunks))

        # D. Cross-encoder re-ranking of the fused candidates.
        if rerank_type == "cross-encoder" and len(combined) > 0:
            pairs = [[query, chunk] for chunk in combined]
            scores = self.rerank_model.predict(pairs)
            results = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
            return [res[0] for res in results[:final_k]]

        return combined[:final_k]