| | import os |
| | import sys |
| | import re |
| | import requests |
| | from langchain_community.vectorstores import FAISS |
| | from langchain_community.embeddings import HuggingFaceEmbeddings |
| | from langchain_core.documents import Document |
| | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| |
|
# Make the project root importable so the `rag` package resolves when this
# file is executed directly from its own directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger
logger = get_logger(__name__)

# Project root (two levels above this file) and the on-disk FAISS store location.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')

# Module-level embedding model (downloads/loads the sentence-transformer at
# import time). NOTE(review): add_github_to_vectorstore() constructs its own
# embeddings instance, so this one looks unused within this file — confirm no
# other module imports it before removing.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
| |
|
| |
|
def clean_personal_info(text: str) -> str:
    """Redact common personal information from *text*.

    Emails, phone numbers, URLs, LinkedIn/GitHub links, social handles,
    and street addresses are each replaced with the literal "[REMOVED]".
    Patterns are applied in order, case-insensitively.
    """
    pii_patterns = (
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                      # email address
        r"\b\d{10}\b",                                                      # bare 10-digit phone
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",   # formatted phone
        r"(http|https)://\S+",                                              # any URL
        r"linkedin\.com/\S+",                                               # LinkedIn link
        r"github\.com/\S+",                                                 # GitHub link
        r"@[A-Za-z0-9_]+",                                                  # social handle
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",          # street address
    )

    for pattern in pii_patterns:
        text = re.sub(pattern, "[REMOVED]", text, flags=re.IGNORECASE)
    return text
| | |
| |
|
def fetch_github_data(username: str):
    """
    Fetch a GitHub user profile and all of their public repos.

    Uses an optional personal access token from the GITHUB_TOKEN environment
    variable (higher rate limits when set) and paginates the repo listing
    100 items at a time.

    Args:
        username: GitHub login to fetch.

    Returns:
        tuple: (user_dict, list_of_repo_dicts) as decoded from the API JSON.

    Raises:
        Exception: when the API responds with an error payload (a dict
            carrying a "message" key, e.g. "Not Found" or rate limiting).
    """
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    # Profile lookup. A dict containing "message" signals an API error.
    # timeout added so a stalled connection cannot hang the ingestion run.
    user_resp = requests.get(
        f"https://api.github.com/users/{username}",
        headers=headers,
        timeout=30,
    ).json()
    if isinstance(user_resp, dict) and "message" in user_resp:
        raise Exception(f"GitHub API error (user): {user_resp['message']}")

    # Page through the repository listing until an empty or short page.
    repos = []
    page = 1
    per_page = 100
    while True:
        repos_resp = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        ).json()

        if isinstance(repos_resp, dict) and repos_resp.get("message"):
            raise Exception(f"GitHub API error (repos): {repos_resp['message']}")
        if not repos_resp:
            break

        repos.extend(repos_resp)
        # A short page means this was the last one — stop without an
        # extra empty-page request.
        if len(repos_resp) < per_page:
            break
        page += 1
    logger.info('Completed fetching github data')
    return user_resp, repos
| |
|
| |
|
| | |
| | |
| | |
def format_github_data(username: str):
    """
    Convert a GitHub profile plus repo info into structured text for RAG
    ingestion.

    Args:
        username: GitHub login to fetch and format.

    Returns:
        str: newline-joined summary of the profile and every public repo,
        including a per-repo language breakdown. Repo sections are separated
        by a blank line so the downstream splitter sees paragraph breaks.
    """
    user, repos = fetch_github_data(username)

    lines = []

    # Profile header. BUGFIX: GitHub returns explicit JSON nulls for unset
    # fields, so dict.get(key, default) yields None rather than the default;
    # `or` covers both the missing-key and the null-value cases.
    lines.append(f"GitHub User: {user.get('name') or username}")
    lines.append(f"Bio: {user.get('bio') or 'No bio'}")
    lines.append(f"Public Repos: {user.get('public_repos', 0)}")
    lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}")
    lines.append("\n--- Repositories ---\n")

    # Same optional-token auth as fetch_github_data, for the per-repo
    # language requests below.
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    for repo in repos:
        # Defensive: skip any non-dict entries in the repo listing.
        if not isinstance(repo, dict):
            continue

        lines.append(f"Repository: {repo.get('name') or 'Unknown'}")
        lines.append(f"Description: {repo.get('description') or 'No description'}")
        lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}")

        # Per-repo language byte counts; best-effort — any request failure
        # falls back to "No language data" instead of aborting the run.
        lang_data = {}
        lang_url = repo.get("languages_url")
        if lang_url:
            try:
                lang_data = requests.get(lang_url, headers=headers, timeout=30).json()
            except Exception:
                lang_data = {}

        if isinstance(lang_data, dict) and lang_data:
            lang_summary = ", ".join(f"{k} ({v} bytes)" for k, v in lang_data.items())
        else:
            lang_summary = "No language data"

        lines.append(f"Languages: {lang_summary}")
        lines.append("")  # blank line -> paragraph break between repos
    logger.info('Completed formatting Github data')
    return "\n".join(lines)
| |
|
| |
|
def github_to_documents(username: str):
    """Fetch, redact, and chunk a GitHub profile into LangChain Documents.

    The formatted profile text is scrubbed of personal info, split into
    non-empty paragraphs at double-newline boundaries, wrapped in Document
    objects tagged with source/username/paragraph metadata, and finally
    chunked with a recursive character splitter (700 chars, 200 overlap).
    """
    raw_text = format_github_data(username)
    cleaned = clean_personal_info(raw_text)

    # Non-empty paragraphs, kept in document order.
    paragraphs = [part.strip() for part in cleaned.split("\n\n") if part.strip()]

    docs = [
        Document(
            page_content=paragraph,
            metadata={
                "source": "github",
                "username": username,
                "paragraph_id": idx,
            },
        )
        for idx, paragraph in enumerate(paragraphs)
    ]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_documents(docs)
| |
|
def add_github_to_vectorstore(username: str):
    """
    Ingest a GitHub profile into the on-disk FAISS vectorstore.

    Loads the existing index at `vector_store_path` when present and appends
    the new chunks to it; otherwise builds a fresh index directly from the
    chunks. The updated index is persisted back to `vector_store_path`.

    Args:
        username: GitHub login to ingest.
    """
    docs = github_to_documents(username)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    index_path = os.path.join(vector_store_path, "index.faiss")

    if os.path.exists(index_path):
        # allow_dangerous_deserialization: the index is pickled on disk; we
        # only load files this pipeline wrote itself.
        vectorstore = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info("Loaded existing FAISS index")
        vectorstore.add_documents(docs)
    else:
        # BUGFIX: FAISS.from_documents([], ...) raises because the embedding
        # dimension cannot be inferred from zero documents — build the new
        # index from the real chunks instead of an empty list + add.
        vectorstore = FAISS.from_documents(docs, embeddings)
        logger.info("Created new FAISS index")

    logger.info(f"Added {len(docs)} GitHub chunks")

    vectorstore.save_local(vector_store_path)
    logger.info("Vectorstore updated successfully")
| |
|
| |
|
if __name__ == '__main__':
    # Allow the target username to be supplied on the command line while
    # keeping the original hard-coded account as the default.
    target = sys.argv[1] if len(sys.argv) > 1 else 'Raheel31'
    add_github_to_vectorstore(username=target)
    logger.info("Ingestion Run Successful")
| |
|