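"""Semantic repo indexing and issue-context retrieval.

Selects the repository files most relevant to an issue by embedding file paths
with Mistral's codestral-embed model, builds a LlamaIndex VectorStoreIndex over
their contents, and queries it for context grounded in the retrieved nodes.
"""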
import asyncio
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Optional
from llama_index.core import VectorStoreIndex, Document, Settings, get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI
from config import MISTRAL_API_KEY
from tools.utils import fetch_repo_files, fetch_file_content


INCLUDE_FILE_EXTENSIONS = {".py", ".js", ".ts", ".json", ".md", ".txt"}

def safe_normalize(vec: np.ndarray) -> Optional[np.ndarray]:
    """L2-normalize a vector; return None when the norm is zero or non-finite."""
    vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
    norm = np.linalg.norm(vec)
    if norm == 0 or np.isnan(norm) or np.isinf(norm):
        return None
    return vec / norm
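
# For reference, safe_normalize yields a unit vector or None for degenerate input:
#   safe_normalize(np.array([3.0, 4.0]))        -> array([0.6, 0.8])
#   safe_normalize(np.array([0.0, 0.0]))        -> None
#   safe_normalize(np.array([np.nan, np.inf]))  -> None (non-finite entries are zeroed first)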

def select_relevant_files_semantic(issue_description: str, file_paths: List[str]) -> List[str]:
    """Rank candidate files by cosine similarity between the issue description
    and each file *path* (the path string is embedded, not the file contents),
    keeping the two best matches plus README.md when the repo has one."""
    embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)

    issue_embedding = np.array(embed_model.get_text_embedding(issue_description), dtype=np.float64)
    issue_embedding = safe_normalize(issue_embedding)
    if issue_embedding is None:
        print("[Warning] Issue description embedding invalid (zero or NaN norm). Returning empty list.")
        return []

    scored_files = []

    for path in file_paths:
        try:
            file_embedding = np.array(embed_model.get_text_embedding(path), dtype=np.float64)
            file_embedding = safe_normalize(file_embedding)
            if file_embedding is None:
                print(f"[Warning] Skipping {path} due to zero or invalid embedding norm.")
                continue
            
            with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
                score = cosine_similarity([issue_embedding], [file_embedding])[0][0]

            if np.isnan(score) or np.isinf(score):
                print(f"[Warning] Skipping {path} due to invalid similarity score.")
                continue

            scored_files.append((path, score))
        except Exception as e:
            print(f"[Warning] Skipping {path} due to error: {e}")

    # Keep the two highest-scoring paths; always surface README.md if the repo has one.
    top_files = [f[0] for f in sorted(scored_files, key=lambda x: x[1], reverse=True)[:2]]

    if "README.md" in file_paths and "README.md" not in top_files:
        top_files.insert(0, "README.md")

    return top_files
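
# Illustrative call (hypothetical paths; actual ranking depends on the embedding model):
#   select_relevant_files_semantic(
#       "Fix login token refresh",
#       ["auth/login.py", "docs/api.md", "README.md"],
#   )
#   might return ["README.md", "auth/login.py", "docs/api.md"]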

async def async_retry_on_429(func, *args, max_retries=3, delay=1, **kwargs):
    """Await func(*args, **kwargs), retrying with exponential backoff on HTTP 429.

    Any other exception, or a 429 on the final attempt, is re-raised so callers
    never receive a silent None."""
    for attempt in range(max_retries):
        try:
            return await func(*args, **kwargs)
        except Exception as e:
            response = getattr(e, "response", None)
            status = getattr(response, "status_code", None)
            if status != 429 or attempt == max_retries - 1:
                raise
            print(f"[Retry] Rate limit hit while calling {getattr(func, '__name__', repr(func))}. Attempt {attempt + 1}/{max_retries}. Retrying in {delay} seconds...")
            await asyncio.sleep(delay)
            delay *= 2

async def build_repo_index(owner: str, repo: str, ref: str = "main", issue_description: str = "") -> VectorStoreIndex:
    model_name = "codestral-embed"
    embed_model = MistralAIEmbedding(model_name=model_name, api_key=MISTRAL_API_KEY)
    print(f"[Indexing] Starting to index repository: {owner}/{repo} at ref {ref}...")

    file_paths = await async_retry_on_429(fetch_repo_files, owner, repo, ref)

    if issue_description:
        # The embedding calls inside are synchronous HTTP requests; run them in a
        # worker thread so they don't block the event loop.
        file_paths = await asyncio.to_thread(select_relevant_files_semantic, issue_description, file_paths)

    documents = []

    for path in file_paths:
        _, ext = os.path.splitext(path)
        if ext.lower() not in INCLUDE_FILE_EXTENSIONS:
            continue

        try:
            content = await async_retry_on_429(fetch_file_content, owner, repo, path, ref)
            documents.append(Document(text=content, metadata={"file_path": path}))
            print(f"[Indexing] Added file: {path}")
            await asyncio.sleep(0.1)
        except Exception as e:
            print(f"[Warning] Skipping file {path} due to error: {e}")

    try:
        # VectorStoreIndex.from_documents is synchronous, so bridge it through
        # asyncio.to_thread; async_retry_on_429 then awaits the resulting coroutine.
        index = await async_retry_on_429(
            asyncio.to_thread, VectorStoreIndex.from_documents, documents, embed_model=embed_model
        )
    except Exception as e:
        print(f"[Error] Failed to build index due to: {e}")
        raise

    print(f"[Indexing] Finished indexing {len(documents)} files.")
    return index


async def retrieve_context(owner: str, repo: str, ref: str, issue_description: str) -> str:
    index = await build_repo_index(owner, repo, ref, issue_description)
    Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
    Settings.embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)

    retriever = index.as_retriever(similarity_top_k=3)

    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(),
        node_postprocessors=[
            # SimilarityPostprocessor only takes a cutoff; the retriever above
            # already limits results via similarity_top_k=3.
            SimilarityPostprocessor(similarity_cutoff=0.75)
        ],
    )

    query = (
        f"Please give relevant information from the codebase that closely matches the keywords of this issue and is useful for solving or understanding it: {issue_description}\n"
        "STRICT RULES:\n"
        "- ONLY use information available in the retriever context.\n"
        "- DO NOT generate or assume any information outside the given context.\n"
        f"- ONLY include context that is highly relevant and clearly useful for understanding or solving this issue: {issue_description}\n"
        "- DO NOT include generic, loosely related, or unrelated content.\n"
    )

    # query_engine.query is synchronous; run it off the event loop.
    response = await asyncio.to_thread(query_engine.query, query)

    print(response)
    return str(response)
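

# --- Usage sketch ----------------------------------------------------------
# A minimal, hypothetical example of driving retrieve_context end to end. The
# owner/repo values are placeholders; running this requires a valid
# MISTRAL_API_KEY in config plus network access to the target repository.
if __name__ == "__main__":
    context = asyncio.run(
        retrieve_context(
            owner="octocat",        # hypothetical owner
            repo="hello-world",     # hypothetical repository
            ref="main",
            issue_description="App crashes on startup when the config file is missing",
        )
    )
    print(context)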