Upload folder using huggingface_hub
- .env.example +18 -0
- .gitignore +60 -0
- Dockerfile +26 -0
- HUGGINGFACE_SPACE_CONFIG.md +119 -0
- README.md +71 -0
- app.py +266 -0
- config.py +42 -0
- plan.txt +1199 -0
- requirements.txt +16 -0
- run_local.py +63 -0
- services/__init__.py +1 -0
- services/chat_service.py +148 -0
- services/embedding_service.py +89 -0
- services/github_service.py +43 -0
- setup.py +39 -0
- utils/__init__.py +1 -0
- utils/file_processor.py +71 -0
.env.example
ADDED
@@ -0,0 +1,18 @@
# Hugging Face API Token (optional, for better rate limits)
HUGGINGFACE_API_KEY=your_huggingface_token_here

# GitHub Token (optional, for private repos or better rate limits)
GITHUB_TOKEN=your_github_token_here

# LLM Provider Configuration
LLM_PROVIDER=huggingface
EMBEDDING_PROVIDER=sentence_transformers

# Hugging Face Configuration
HUGGINGFACE_MODEL=microsoft/DialoGPT-medium

# Embedding Model Configuration
SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2

# Vector Database Path
VECTOR_DB_PATH=./chroma_db
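In this repo the values above are consumed by `config.py` (pydantic-settings with `env_file = ".env"`, shown later in this commit). For reference, a minimal sketch of the same lookup done directly, assuming python-dotenv (listed in plan.txt's requirements) is installed:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from .env into the process environment
hf_token = os.getenv("HUGGINGFACE_API_KEY", "")        # empty string if unset
db_path = os.getenv("VECTOR_DB_PATH", "./chroma_db")   # falls back to default
print(f"Vector DB at {db_path}, HF token set: {bool(hf_token)}")
```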
.gitignore
ADDED
@@ -0,0 +1,60 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
venv/
env/
ENV/
.venv/
.ENV/

# Environment variables
.env
.env.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Project specific
chroma_db/
models/
*.log
logs/
temp/
tmp/

# Git
.git/
Dockerfile
ADDED
@@ -0,0 +1,26 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download embedding models to cache them
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2', cache_folder='./models')"

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p chroma_db models

EXPOSE 7860

CMD ["python", "app.py"]
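The pre-download `RUN` step above is what keeps cold starts fast: the embedding model is fetched once at image build time instead of on the first request. Written out as plain Python, it amounts to the following (the `(1, 384)` output shape is specific to all-MiniLM-L6-v2):

```python
from sentence_transformers import SentenceTransformer

# Fetch and cache the embedding model at build time so the running
# container never has to download it.
model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="./models")
print(model.encode(["warm-up sentence"]).shape)  # (1, 384) for this model
```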
HUGGINGFACE_SPACE_CONFIG.md
ADDED
@@ -0,0 +1,119 @@
# Hugging Face Space Configuration

This document contains the configuration needed to deploy this application as a Hugging Face Space.

## Space Configuration

### Basic Settings
- **Space Name**: `chat-with-github-repo`
- **Space Type**: `Gradio`
- **Python Version**: `3.11`
- **Visibility**: `Public`

### Environment Variables (Optional)

Set these in your Hugging Face Space settings for better performance:

```
HUGGINGFACE_API_KEY=your_hf_token_here
GITHUB_TOKEN=your_github_token_here
```

### Hardware Requirements

- **CPU**: Basic (free tier works)
- **RAM**: 8GB+ recommended for larger repositories
- **Storage**: 10GB+ for model caching

## Deployment Steps

1. **Create a new Hugging Face Space**:
   - Go to https://huggingface.co/new-space
   - Choose "Gradio" as the Space SDK
   - Set the space name and visibility

2. **Upload files**:
   - Upload all files from this directory to your space
   - Ensure the main `app.py` file is in the root directory

3. **Configure environment variables** (optional):
   - Go to your space settings
   - Add the environment variables listed above
   - This improves rate limits and enables private repo access

4. **Deploy**:
   - The space will automatically build and deploy
   - First deployment may take 5-10 minutes due to model downloads

## File Structure for Hugging Face Space

```
your-space/
├── app.py                 # Main Gradio application
├── requirements.txt       # Python dependencies
├── README.md              # Space documentation
├── config.py              # Configuration settings
├── services/              # Service modules
│   ├── __init__.py
│   ├── github_service.py
│   ├── embedding_service.py
│   └── chat_service.py
├── utils/                 # Utility modules
│   ├── __init__.py
│   └── file_processor.py
└── models/                # Data models
    ├── __init__.py
    └── schemas.py
```

## Performance Optimization

### For Free Tier:
- Uses a lightweight embedding model (`all-MiniLM-L6-v2`)
- Processes files in batches
- Implements file size limits
- Caches models locally

### For Better Performance:
- Upgrade to paid hardware
- Use larger embedding models
- Increase batch sizes
- Add Redis caching

## Troubleshooting

### Common Issues:

1. **Out of Memory**:
   - Reduce the batch size in the embedding service
   - Use a smaller embedding model
   - Upgrade hardware

2. **Slow Processing**:
   - Add a Hugging Face API token for better rate limits
   - Use GPU hardware
   - Optimize chunk sizes

3. **Git Clone Failures**:
   - Add a GitHub token for private repos
   - Check the repository URL format
   - Ensure the repository is public

### Debug Mode:
Set `debug=True` in `demo.launch()` for detailed error messages.

## Monitoring

Monitor your space performance:
- Check space logs for errors
- Monitor memory usage
- Track processing times
- Review user feedback

## Updates

To update your space:
1. Modify files locally
2. Upload the changed files to your space
3. The space will automatically rebuild
4. Test functionality after deployment
README.md
ADDED
@@ -0,0 +1,71 @@
# 🤖 Chat with GitHub Repository

A powerful AI application that allows you to analyze any GitHub repository and ask questions about the codebase in natural language!

## 🌟 Features

- **Repository Analysis**: Clone and process any public GitHub repository
- **AI-Powered Chat**: Ask questions about the code using natural language
- **Smart Code Understanding**: Uses advanced embeddings to understand code structure and context
- **Source References**: Get direct references to relevant code files
- **Multiple File Types**: Supports Python, JavaScript, TypeScript, Java, C++, Go, Rust, PHP, Ruby, Swift, Kotlin, Scala, Markdown, JSON, YAML, and more

## 🚀 How It Works

1. **Enter Repository URL**: Paste any public GitHub repository URL
2. **Processing**: The app clones the repo, extracts code files, and creates embeddings
3. **Ask Questions**: Chat with the AI about the codebase using natural language
4. **Get Answers**: Receive detailed answers with references to specific code files

## 💡 Example Questions

- "What is this project about?"
- "How is the code structured?"
- "What are the main functions/classes?"
- "How does authentication work?"
- "What dependencies does this project use?"
- "Are there any tests in this codebase?"
- "How is error handling implemented?"
- "What are the main API endpoints?"

## 🛠️ Technology Stack

- **Frontend**: Gradio for the user interface
- **AI/ML**: Hugging Face Transformers, Sentence Transformers
- **Vector Database**: ChromaDB for storing code embeddings
- **Code Processing**: GitPython for repository cloning
- **Language Models**: Hugging Face Inference API

## 📁 Supported File Types

The application processes the following file types:
- **Programming Languages**: `.py`, `.js`, `.ts`, `.jsx`, `.tsx`, `.java`, `.cpp`, `.c`, `.cs`, `.go`, `.rs`, `.php`, `.rb`, `.swift`, `.kt`, `.scala`
- **Configuration**: `.json`, `.yaml`, `.yml`, `.toml`
- **Documentation**: `.md`, `.txt`

## 🔧 Configuration

The app uses Hugging Face's free inference API and Sentence Transformers for embeddings. No additional setup required!

## 📝 Usage Tips

- **Repository Size**: Works best with small to medium-sized repositories
- **Processing Time**: Larger repositories may take longer to process
- **Question Quality**: More specific questions tend to get better answers
- **File Limits**: Files larger than 1MB are skipped to ensure optimal performance

## 🤝 Contributing

This project is open source and contributions are welcome! Feel free to:
- Report bugs
- Suggest new features
- Submit pull requests
- Improve documentation

## 📄 License

This project is licensed under the MIT License.

---

**Note**: This application processes public repositories only. Private repositories require authentication tokens.
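The "Processing" and "Ask Questions" steps described above reduce to embed-then-search. A self-contained mini example of that retrieval step using the same stack the app uses (toy documents, not this repo's real data):

```python
from langchain.schema import Document
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

docs = [
    Document(page_content="def login(user, password): ...", metadata={"path": "auth.py"}),
    Document(page_content="# Toy project readme", metadata={"path": "README.md"}),
]
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
store = Chroma.from_documents(docs, embeddings)  # in-memory collection

hits = store.similarity_search("How does authentication work?", k=1)
print(hits[0].metadata["path"])  # -> auth.py
```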
app.py
ADDED
@@ -0,0 +1,266 @@
import gradio as gr
import os
from typing import List
import asyncio
from datetime import datetime
from urllib.parse import urlparse

# Import our custom modules
from services.github_service import GitHubService
from services.embedding_service import FreeEmbeddingService
from services.chat_service import FreeChatService
from utils.file_processor import FileProcessor
from config import settings

# Initialize services
github_service = GitHubService(settings.github_token)
embedding_service = FreeEmbeddingService(
    embedding_provider=settings.embedding_provider.value,
    vector_db_path=settings.vector_db_path,
    model_name=settings.sentence_transformer_model
)

# Initialize chat service for Hugging Face
chat_service = FreeChatService(
    llm_provider="huggingface",
    api_key=os.getenv("HUGGINGFACE_API_KEY", ""),
    model="microsoft/DialoGPT-medium"
)

file_processor = FileProcessor(settings.supported_extensions, settings.max_file_size)

# Global state management
repo_data = {}

def validate_github_url(url: str) -> bool:
    """Validate that the URL points to a GitHub repository"""
    try:
        parsed = urlparse(url)
        if parsed.netloc != "github.com":
            return False
        path_parts = parsed.path.strip('/').split('/')
        return len(path_parts) >= 2
    except Exception:
        return False

async def process_repository(repo_url: str, branch: str = "main"):
    """Process a GitHub repository, yielding (status, message) updates.

    Note: as an async generator this function must yield its final status
    rather than return it ('return <value>' is a syntax error here).
    """
    if not validate_github_url(repo_url):
        yield "❌ Error", "Invalid GitHub URL. Please provide a valid GitHub repository URL."
        return

    try:
        repo_id = github_service.generate_repo_id(repo_url)

        # Check if already processed
        if repo_id in repo_data:
            yield "✅ Ready", "Repository already processed! You can now ask questions about the code."
            return

        # Clone repository
        yield "🔄 Processing", "Cloning repository..."
        repo_path = await github_service.clone_repository(repo_url, branch)

        # Extract files
        yield "🔄 Processing", "Extracting and processing files..."
        files = list(file_processor.extract_files(repo_path))

        if not files:
            github_service.cleanup_repo(repo_path)
            yield "❌ Error", "No supported files found in the repository."
            return

        # Create embeddings
        yield "🔄 Processing", f"Creating embeddings for {len(files)} files (this may take a while)..."
        vectorstore = await embedding_service.create_embeddings(files, repo_id)

        # Store in global state
        repo_data[repo_id] = {
            'vectorstore': vectorstore,
            'files_count': len(files),
            'processed_at': datetime.now(),
            'repo_url': repo_url
        }

        # Cleanup
        github_service.cleanup_repo(repo_path)

        yield "✅ Ready", f"Repository processed successfully! Found {len(files)} files. You can now ask questions about the code."

    except Exception as e:
        yield "❌ Error", f"Error processing repository: {str(e)}"

def process_repo_sync(repo_url: str, branch: str = "main"):
    """Synchronous wrapper for repository processing"""
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        async def run_process():
            # Drain the async generator, keeping the last (status, message) pair
            result = None
            async for result in process_repository(repo_url, branch):
                pass
            return result

        result = loop.run_until_complete(run_process())
        loop.close()
        return result if result else ("❌ Error", "Processing failed")

    except Exception as e:
        return "❌ Error", f"Error: {str(e)}"

async def chat_with_repository(message: str, repo_url: str, history: List) -> tuple:
    """Chat with the processed repository"""
    if not repo_url:
        return history + [(message, "❌ Please process a repository first.")], ""

    if not message.strip():
        return history, ""

    try:
        repo_id = github_service.generate_repo_id(repo_url)

        if repo_id not in repo_data:
            return history + [(message, "❌ Please process the repository first before asking questions.")], ""

        # Get vectorstore
        vectorstore = repo_data[repo_id]['vectorstore']

        # Get answer
        result = await chat_service.answer_question(message, vectorstore, repo_id)

        # Format response with sources
        response = result['response']
        if result['sources']:
            response += "\n\n**Sources:**\n"
            for i, source in enumerate(result['sources'][:3], 1):
                response += f"{i}. `{source['path']}`\n"

        return history + [(message, response)], ""

    except Exception as e:
        return history + [(message, f"❌ Error: {str(e)}")], ""

def chat_sync(message: str, repo_url: str, history: List):
    """Synchronous wrapper for the chat function"""
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        result = loop.run_until_complete(chat_with_repository(message, repo_url, history))
        loop.close()
        return result
    except Exception as e:
        return history + [(message, f"❌ Error: {str(e)}")], ""

def get_example_questions():
    """Get example questions users can ask"""
    return [
        "What is this project about?",
        "How is the code structured?",
        "What are the main functions/classes?",
        "How does authentication work?",
        "What dependencies does this project use?",
        "Are there any tests in this codebase?",
        "How is error handling implemented?",
        "What are the main API endpoints?"
    ]

# Create Gradio interface
with gr.Blocks(title="Chat with GitHub Repository", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🤖 Chat with GitHub Repository

    Analyze any GitHub repository and ask questions about the codebase using AI!

    **How it works:**
    1. Enter a GitHub repository URL
    2. Wait for the repository to be processed
    3. Ask questions about the code in natural language
    """)

    with gr.Row():
        with gr.Column(scale=2):
            repo_url = gr.Textbox(
                label="GitHub Repository URL",
                placeholder="https://github.com/username/repository",
                info="Enter the URL of a public GitHub repository"
            )
            branch = gr.Textbox(
                label="Branch (optional)",
                value="main",
                placeholder="main"
            )
            process_btn = gr.Button("🔄 Process Repository", variant="primary")

        with gr.Column(scale=1):
            status = gr.Textbox(
                label="Status",
                value="⏳ Waiting",
                interactive=False
            )
            status_msg = gr.Textbox(
                label="Details",
                value="Enter a repository URL and click 'Process Repository'",
                interactive=False,
                lines=3
            )

    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            chatbot = gr.Chatbot(
                label="Chat with Repository",
                height=400,
                placeholder="Process a repository first, then ask questions about the code!"
            )

            with gr.Row():
                msg = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask anything about the codebase...",
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            gr.Examples(
                examples=get_example_questions(),
                inputs=msg,
                label="Example Questions"
            )

    gr.Markdown("""
    ---
    **Supported file types:** Python, JavaScript, TypeScript, Java, C++, Go, Rust, PHP, Ruby, Swift, Kotlin, Scala, Markdown, JSON, YAML, and more.

    **Note:** This app uses Hugging Face's free inference API. Processing large repositories may take some time.
    """)

    # Event handlers
    process_btn.click(
        fn=process_repo_sync,
        inputs=[repo_url, branch],
        outputs=[status, status_msg]
    )

    def handle_chat(message, repo_url_val, history):
        return chat_sync(message, repo_url_val, history)

    send_btn.click(
        fn=handle_chat,
        inputs=[msg, repo_url, chatbot],
        outputs=[chatbot, msg]
    )

    msg.submit(
        fn=handle_chat,
        inputs=[msg, repo_url, chatbot],
        outputs=[chatbot, msg]
    )

if __name__ == "__main__":
    demo.launch()
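The `new_event_loop()` wrappers in app.py are one way to bridge sync Gradio callbacks to the async services; Gradio also accepts coroutine functions as event handlers directly. A minimal standalone sketch of that alternative (hypothetical `echo` handler, not part of this repo):

```python
import asyncio
import gradio as gr

async def echo(message: str) -> str:
    await asyncio.sleep(0)  # stand-in for awaiting a real service call
    return f"You said: {message}"

with gr.Blocks() as sketch:
    box = gr.Textbox(label="Message")
    out = gr.Textbox(label="Reply")
    box.submit(fn=echo, inputs=box, outputs=out)  # async fn passed as-is

# sketch.launch()  # uncomment to run
```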
config.py
ADDED
@@ -0,0 +1,42 @@
import os
from pydantic_settings import BaseSettings
from enum import Enum

class LLMProvider(str, Enum):
    OLLAMA = "ollama"
    GROQ = "groq"
    HUGGINGFACE = "huggingface"

class EmbeddingProvider(str, Enum):
    SENTENCE_TRANSFORMERS = "sentence_transformers"
    HUGGINGFACE = "huggingface"

class Settings(BaseSettings):
    # LLM Configuration
    llm_provider: LLMProvider = LLMProvider.HUGGINGFACE
    ollama_base_url: str = "http://localhost:11434"
    ollama_model: str = "llama2"
    groq_api_key: str = ""
    groq_model: str = "mixtral-8x7b-32768"
    huggingface_api_key: str = os.getenv("HUGGINGFACE_API_KEY", "")
    huggingface_model: str = "microsoft/DialoGPT-medium"

    # Embedding Configuration
    embedding_provider: EmbeddingProvider = EmbeddingProvider.SENTENCE_TRANSFORMERS
    sentence_transformer_model: str = "all-MiniLM-L6-v2"

    # Other settings
    github_token: str = os.getenv("GITHUB_TOKEN", "")
    redis_url: str = "redis://localhost:6379"
    vector_db_path: str = "./chroma_db"
    max_file_size: int = 1024 * 1024  # 1MB
    supported_extensions: list = [
        ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".cpp", ".c",
        ".cs", ".go", ".rs", ".php", ".rb", ".swift", ".kt", ".scala",
        ".md", ".txt", ".json", ".yaml", ".yml", ".toml"
    ]

    class Config:
        env_file = ".env"

settings = Settings()
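Usage sketch for the `settings` singleton above: pydantic-settings resolves each field from the process environment first, then the `.env` file named in `Config`, then the declared default, matching field names to environment variables case-insensitively:

```python
from config import settings

print(settings.llm_provider.value)           # "huggingface" unless overridden
print(settings.sentence_transformer_model)   # "all-MiniLM-L6-v2"
# e.g. `LLM_PROVIDER=groq GROQ_API_KEY=... python app.py` switches providers
# with no code change.
```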
plan.txt
ADDED
@@ -0,0 +1,1199 @@
# Chat with GitHub Repo - GenAI Project

## Project Overview

A GenAI application that allows developers to paste a GitHub repository URL and ask natural language questions about the codebase. The system clones the repo, processes and embeds the code files, then uses RAG (Retrieval Augmented Generation) to answer questions about the code.

## Architecture

```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│    Frontend     │    │   Backend API   │    │    Vector DB    │
│  (React/Next)   │───▶│    (FastAPI)    │───▶│   (Pinecone/    │
│                 │    │                 │    │    Chroma)      │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                               │
                               ▼
                       ┌─────────────────┐
                       │   GitHub API    │
                       │   + Git Clone   │
                       └─────────────────┘
```

## Tech Stack

### Backend
- **FastAPI** - High-performance Python web framework
- **LangChain** - LLM orchestration and RAG implementation
- **Free LLM Options** - Ollama (local), Groq (fast inference), or Hugging Face
- **Free Embeddings** - Sentence Transformers (local) or Hugging Face
- **Chroma** - Vector database for embeddings
- **GitPython** - For cloning and processing repositories
- **Celery + Redis** - For background processing

### Frontend
- **Next.js** - React framework
- **TypeScript** - Type safety
- **Tailwind CSS** - Styling
- **Socket.io** - Real-time updates

## Backend Implementation

### 1. Project Structure

```
backend/
├── app/
│   ├── __init__.py
│   ├── main.py
│   ├── models/
│   │   ├── __init__.py
│   │   └── schemas.py
│   ├── services/
│   │   ├── __init__.py
│   │   ├── github_service.py
│   │   ├── embedding_service.py
│   │   └── chat_service.py
│   ├── utils/
│   │   ├── __init__.py
│   │   └── file_processor.py
│   └── config.py
├── requirements.txt
├── docker-compose.yml
└── Dockerfile
```

### 2. Core Dependencies (requirements.txt)

```txt
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-multipart==0.0.6
pydantic==2.5.0
langchain==0.1.0
langchain-community==0.0.10
chromadb==0.4.18
GitPython==3.1.40
python-dotenv==1.0.0
celery==5.3.4
redis==5.0.1
socketio==5.10.0
python-socketio==5.10.0
aiofiles==23.2.1

# Free LLM & Embedding Options
sentence-transformers==2.2.2
transformers==4.36.0
torch==2.1.0
ollama==0.1.7
groq==0.4.1
huggingface-hub==0.19.4
```

### 3. Configuration (config.py)

```python
import os
from pydantic_settings import BaseSettings
from enum import Enum

class LLMProvider(str, Enum):
    OLLAMA = "ollama"
    GROQ = "groq"
    HUGGINGFACE = "huggingface"

class EmbeddingProvider(str, Enum):
    SENTENCE_TRANSFORMERS = "sentence_transformers"
    HUGGINGFACE = "huggingface"

class Settings(BaseSettings):
    # LLM Configuration
    llm_provider: LLMProvider = LLMProvider.OLLAMA
    ollama_base_url: str = "http://localhost:11434"
    ollama_model: str = "llama2"  # or codellama, mistral, etc.
    groq_api_key: str = ""
    groq_model: str = "mixtral-8x7b-32768"
    huggingface_api_key: str = ""
    huggingface_model: str = "microsoft/DialoGPT-medium"

    # Embedding Configuration
    embedding_provider: EmbeddingProvider = EmbeddingProvider.SENTENCE_TRANSFORMERS
    sentence_transformer_model: str = "all-MiniLM-L6-v2"  # Fast and good
    # Alternative: "all-mpnet-base-v2" (better quality, slower)

    # Other settings
    github_token: str = ""
    redis_url: str = "redis://localhost:6379"
    vector_db_path: str = "./chroma_db"
    max_file_size: int = 1024 * 1024  # 1MB
    supported_extensions: list = [
        ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".cpp", ".c",
        ".cs", ".go", ".rs", ".php", ".rb", ".swift", ".kt", ".scala",
        ".md", ".txt", ".json", ".yaml", ".yml", ".toml"
    ]

    class Config:
        env_file = ".env"

settings = Settings()
```

### 4. Data Models (models/schemas.py)

```python
from pydantic import BaseModel, HttpUrl
from typing import List, Optional
from enum import Enum

class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

class RepoProcessRequest(BaseModel):
    repo_url: HttpUrl
    branch: Optional[str] = "main"

class ChatMessage(BaseModel):
    message: str
    repo_id: str

class ChatResponse(BaseModel):
    response: str
    sources: List[dict]
    repo_id: str

class RepoStatus(BaseModel):
    repo_id: str
    status: ProcessingStatus
    progress: int
    message: str
    total_files: Optional[int] = None
    processed_files: Optional[int] = None
```

### 5. GitHub Service (services/github_service.py)

```python
import os
import tempfile
import shutil
from git import Repo
from typing import List, Tuple
import hashlib
from urllib.parse import urlparse

class GitHubService:
    def __init__(self, github_token: str = ""):
        self.github_token = github_token

    def generate_repo_id(self, repo_url: str) -> str:
        """Generate a unique ID for the repository"""
        return hashlib.md5(repo_url.encode()).hexdigest()

    def parse_github_url(self, url: str) -> Tuple[str, str]:
        """Extract owner and repo name from GitHub URL"""
        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')
        if len(path_parts) >= 2:
            return path_parts[0], path_parts[1]
        raise ValueError("Invalid GitHub URL format")

    async def clone_repository(self, repo_url: str, branch: str = "main") -> str:
        """Clone repository to temporary directory"""
        temp_dir = tempfile.mkdtemp()
        try:
            if self.github_token:
                # Use token for private repos or higher rate limits
                auth_url = repo_url.replace("https://", f"https://{self.github_token}@")
                Repo.clone_from(auth_url, temp_dir, branch=branch, depth=1)
            else:
                Repo.clone_from(repo_url, temp_dir, branch=branch, depth=1)
            return temp_dir
        except Exception as e:
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise Exception(f"Failed to clone repository: {str(e)}")

    def cleanup_repo(self, repo_path: str):
        """Clean up cloned repository"""
        if os.path.exists(repo_path):
            shutil.rmtree(repo_path, ignore_errors=True)
```

### 6. File Processor (utils/file_processor.py)

```python
import os
from typing import List, Dict, Generator
import mimetypes
from pathlib import Path

class FileProcessor:
    def __init__(self, supported_extensions: List[str], max_file_size: int):
        self.supported_extensions = supported_extensions
        self.max_file_size = max_file_size
        self.ignore_dirs = {
            '.git', '__pycache__', 'node_modules', '.pytest_cache',
            'venv', 'env', '.venv', 'build', 'dist', '.next',
            'coverage', '.coverage', 'logs', 'log'
        }
        self.ignore_files = {
            '.gitignore', '.env', '.env.local', '.DS_Store',
            'package-lock.json', 'yarn.lock', 'poetry.lock'
        }

    def should_process_file(self, file_path: str) -> bool:
        """Check if file should be processed"""
        path = Path(file_path)

        # Check if any parent directory is in ignore list
        for parent in path.parents:
            if parent.name in self.ignore_dirs:
                return False

        # Check file name
        if path.name in self.ignore_files:
            return False

        # Check extension
        if path.suffix.lower() not in self.supported_extensions:
            return False

        # Check file size
        try:
            if os.path.getsize(file_path) > self.max_file_size:
                return False
        except OSError:
            return False

        return True

    def extract_files(self, repo_path: str) -> Generator[Dict, None, None]:
        """Extract and yield file information"""
        for root, dirs, files in os.walk(repo_path):
            # Filter out ignored directories
            dirs[:] = [d for d in dirs if d not in self.ignore_dirs]

            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, repo_path)

                if not self.should_process_file(file_path):
                    continue

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    yield {
                        'path': relative_path,
                        'content': content,
                        'extension': Path(file_path).suffix.lower(),
                        'size': len(content)
                    }
                except Exception as e:
                    print(f"Error reading file {relative_path}: {e}")
                    continue
```

### 7. Free Embedding Service (services/embedding_service.py)

```python
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
import chromadb
from typing import List, Dict
import os

class FreeEmbeddingService:
    def __init__(self, embedding_provider: str, vector_db_path: str, model_name: str = "all-MiniLM-L6-v2"):
        self.vector_db_path = vector_db_path
        self.embedding_provider = embedding_provider

        # Initialize embedding function based on provider
        if embedding_provider == "sentence_transformers":
            self.embeddings = SentenceTransformerEmbeddings(
                model_name=model_name,
                cache_folder="./models"  # Cache models locally
            )
        elif embedding_provider == "huggingface":
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder="./models"
            )
        else:
            raise ValueError(f"Unsupported embedding provider: {embedding_provider}")

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def create_documents(self, files: List[Dict], repo_id: str) -> List[Document]:
        """Create documents from file contents"""
        documents = []

        for file_info in files:
            # Create document with metadata
            doc = Document(
                page_content=file_info['content'],
                metadata={
                    'path': file_info['path'],
                    'extension': file_info['extension'],
                    'repo_id': repo_id,
                    'size': file_info['size']
                }
            )
            documents.append(doc)

        return documents

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks"""
        return self.text_splitter.split_documents(documents)

    async def create_embeddings(self, files: List[Dict], repo_id: str):
        """Create and store embeddings for repository files"""
        # Create documents
        documents = self.create_documents(files, repo_id)

        # Split into chunks
        chunks = self.split_documents(documents)

        # Create vector store
        collection_name = f"repo_{repo_id}"
        vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path
        )

        # Add documents to vector store in batches
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            vectorstore.add_documents(batch)

        return vectorstore

    def get_vectorstore(self, repo_id: str):
        """Get existing vector store for repository"""
        collection_name = f"repo_{repo_id}"
        return Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path
        )


# Alternative: Direct SentenceTransformers implementation for more control
class DirectEmbeddingService:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name, cache_folder="./models")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for texts"""
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Generate embedding for a single query"""
        return self.model.encode([query], convert_to_numpy=True)[0].tolist()
```

### 8. Free Chat Service (services/chat_service.py)

```python
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama
from typing import Dict, List
import json
import requests
import os

# For Groq (free tier available)
class GroqLLM:
    def __init__(self, api_key: str, model: str = "mixtral-8x7b-32768"):
        self.api_key = api_key
        self.model = model
        self.base_url = "https://api.groq.com/openai/v1"

    def __call__(self, prompt: str) -> str:
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 1024
        }

        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=data
        )

        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            raise Exception(f"Groq API error: {response.text}")

# For Hugging Face Inference API
class HuggingFaceLLM:
    def __init__(self, api_key: str, model: str = "microsoft/DialoGPT-medium"):
        self.api_key = api_key
        self.model = model
        self.base_url = f"https://api-inference.huggingface.co/models/{model}"

    def __call__(self, prompt: str) -> str:
        headers = {"Authorization": f"Bearer {self.api_key}"}
        data = {"inputs": prompt, "parameters": {"max_length": 1000, "temperature": 0.1}}

        response = requests.post(self.base_url, headers=headers, json=data)

        if response.status_code == 200:
            result = response.json()
            if isinstance(result, list) and len(result) > 0:
                return result[0].get("generated_text", "").replace(prompt, "").strip()
            return str(result)
        else:
            raise Exception(f"HuggingFace API error: {response.text}")

class FreeChatService:
    def __init__(self, llm_provider: str, **kwargs):
        self.llm_provider = llm_provider

        if llm_provider == "ollama":
            self.llm = Ollama(
                model=kwargs.get("model", "llama2"),
                base_url=kwargs.get("base_url", "http://localhost:11434"),
                temperature=0.1
            )
        elif llm_provider == "groq":
            self.llm = GroqLLM(
                api_key=kwargs.get("api_key"),
                model=kwargs.get("model", "mixtral-8x7b-32768")
            )
        elif llm_provider == "huggingface":
            self.llm = HuggingFaceLLM(
                api_key=kwargs.get("api_key"),
                model=kwargs.get("model", "microsoft/DialoGPT-medium")
            )
        else:
            raise ValueError(f"Unsupported LLM provider: {llm_provider}")

        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""
You are a helpful AI assistant that analyzes code repositories. Use the following code snippets to answer the user's question about the repository.

Context from repository:
{context}

Question: {question}

Please provide a detailed answer based on the code context provided. If you reference specific files or functions, mention their file paths. If the question cannot be fully answered from the provided context, say so clearly.

Answer:"""
        )

    async def answer_question(self, question: str, vectorstore, repo_id: str) -> Dict:
        """Answer question using RAG with free LLM"""
        try:
            if self.llm_provider == "ollama":
                # Use LangChain's RetrievalQA for Ollama
                qa_chain = RetrievalQA.from_chain_type(
                    llm=self.llm,
                    chain_type="stuff",
                    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
                    chain_type_kwargs={"prompt": self.prompt_template},
                    return_source_documents=True
                )

                result = qa_chain({"query": question})
                answer = result["result"]
                source_docs = result.get("source_documents", [])

            else:
                # Manual RAG for other providers
                docs = vectorstore.similarity_search(question, k=5)
                context = "\n\n".join([doc.page_content for doc in docs])

                prompt = self.prompt_template.format(
                    context=context,
                    question=question
                )

                answer = self.llm(prompt)
                source_docs = docs

            # Format sources
            sources = []
            for doc in source_docs:
                sources.append({
                    "path": doc.metadata.get("path", "Unknown"),
                    "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
                })

            return {
                "response": answer,
                "sources": sources,
                "repo_id": repo_id
            }

        except Exception as e:
            return {
                "response": f"Error processing question: {str(e)}",
                "sources": [],
                "repo_id": repo_id
            }
```

### 9. Updated Main FastAPI Application (main.py)

```python
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from typing import Dict
import asyncio
from datetime import datetime

from models.schemas import RepoProcessRequest, ChatMessage, ChatResponse, RepoStatus, ProcessingStatus
from services.github_service import GitHubService
from services.embedding_service import FreeEmbeddingService
from services.chat_service import FreeChatService
from utils.file_processor import FileProcessor
from config import settings

app = FastAPI(title="Chat with GitHub Repo (Free Version)", version="1.0.0")

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure properly for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize services based on configuration
github_service = GitHubService(settings.github_token)

embedding_service = FreeEmbeddingService(
    embedding_provider=settings.embedding_provider.value,
    vector_db_path=settings.vector_db_path,
    model_name=settings.sentence_transformer_model
)

# Initialize chat service based on provider
chat_kwargs = {}
if settings.llm_provider.value == "ollama":
    chat_kwargs = {
        "model": settings.ollama_model,
        "base_url": settings.ollama_base_url
    }
elif settings.llm_provider.value == "groq":
    chat_kwargs = {
        "api_key": settings.groq_api_key,
        "model": settings.groq_model
    }
elif settings.llm_provider.value == "huggingface":
    chat_kwargs = {
        "api_key": settings.huggingface_api_key,
        "model": settings.huggingface_model
    }

chat_service = FreeChatService(
    llm_provider=settings.llm_provider.value,
    **chat_kwargs
)

file_processor = FileProcessor(settings.supported_extensions, settings.max_file_size)

# In-memory status tracking (use Redis in production)
repo_status: Dict[str, RepoStatus] = {}

async def process_repository(repo_url: str, branch: str, repo_id: str):
    """Background task to process repository"""
    try:
        repo_status[repo_id] = RepoStatus(
            repo_id=repo_id,
            status=ProcessingStatus.PROCESSING,
            progress=10,
            message="Cloning repository..."
        )

        # Clone repository
        repo_path = await github_service.clone_repository(repo_url, branch)

        repo_status[repo_id].progress = 30
        repo_status[repo_id].message = "Processing files..."

        # Extract files
        files = list(file_processor.extract_files(repo_path))

        repo_status[repo_id].total_files = len(files)
        repo_status[repo_id].progress = 50
        repo_status[repo_id].message = "Creating embeddings (this may take a while for large repos)..."

        # Create embeddings
        await embedding_service.create_embeddings(files, repo_id)

        # Cleanup
        github_service.cleanup_repo(repo_path)

        repo_status[repo_id].status = ProcessingStatus.COMPLETED
        repo_status[repo_id].progress = 100
        repo_status[repo_id].message = f"Repository processed successfully! Using {settings.llm_provider.value} for chat."

    except Exception as e:
        repo_status[repo_id].status = ProcessingStatus.FAILED
        repo_status[repo_id].message = f"Error: {str(e)}"

@app.post("/api/process-repo")
async def process_repo(request: RepoProcessRequest, background_tasks: BackgroundTasks):
    """Process a GitHub repository"""
    repo_id = github_service.generate_repo_id(str(request.repo_url))

    # Check if already processed
    if repo_id in repo_status and repo_status[repo_id].status == ProcessingStatus.COMPLETED:
        return {"repo_id": repo_id, "message": "Repository already processed"}

    # Start processing
    repo_status[repo_id] = RepoStatus(
        repo_id=repo_id,
        status=ProcessingStatus.PENDING,
        progress=0,
        message="Starting processing..."
    )

    background_tasks.add_task(process_repository, str(request.repo_url), request.branch, repo_id)
+
|
| 687 |
+
return {"repo_id": repo_id, "message": "Processing started"}
|
| 688 |
+
|
| 689 |
+
@app.get("/api/status/{repo_id}", response_model=RepoStatus)
|
| 690 |
+
async def get_repo_status(repo_id: str):
|
| 691 |
+
"""Get repository processing status"""
|
| 692 |
+
if repo_id not in repo_status:
|
| 693 |
+
raise HTTPException(status_code=404, detail="Repository not found")
|
| 694 |
+
|
| 695 |
+
return repo_status[repo_id]
|
| 696 |
+
|
| 697 |
+
@app.post("/api/chat", response_model=ChatResponse)
|
| 698 |
+
async def chat_with_repo(message: ChatMessage):
|
| 699 |
+
"""Chat with repository"""
|
| 700 |
+
repo_id = message.repo_id
|
| 701 |
+
|
| 702 |
+
# Check if repo is processed
|
| 703 |
+
if repo_id not in repo_status or repo_status[repo_id].status != ProcessingStatus.COMPLETED:
|
| 704 |
+
raise HTTPException(status_code=400, detail="Repository not processed")
|
| 705 |
+
|
| 706 |
+
try:
|
| 707 |
+
# Get vector store
|
| 708 |
+
vectorstore = embedding_service.get_vectorstore(repo_id)
|
| 709 |
+
|
| 710 |
+
# Get answer
|
| 711 |
+
result = await chat_service.answer_question(message.message, vectorstore, repo_id)
|
| 712 |
+
|
| 713 |
+
return ChatResponse(**result)
|
| 714 |
+
|
| 715 |
+
except Exception as e:
|
| 716 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 717 |
+
|
| 718 |
+
@app.get("/api/health")
|
| 719 |
+
async def health_check():
|
| 720 |
+
return {
|
| 721 |
+
"status": "healthy",
|
| 722 |
+
"timestamp": datetime.utcnow(),
|
| 723 |
+
"llm_provider": settings.llm_provider.value,
|
| 724 |
+
"embedding_provider": settings.embedding_provider.value
|
| 725 |
+
}
|
| 726 |
+
|
| 727 |
+
@app.get("/api/config")
|
| 728 |
+
async def get_config():
|
| 729 |
+
"""Get current configuration"""
|
| 730 |
+
return {
|
| 731 |
+
"llm_provider": settings.llm_provider.value,
|
| 732 |
+
"embedding_provider": settings.embedding_provider.value,
|
| 733 |
+
"embedding_model": settings.sentence_transformer_model,
|
| 734 |
+
"supported_extensions": settings.supported_extensions
|
| 735 |
+
}
|
| 736 |
+
|
| 737 |
+
if __name__ == "__main__":
|
| 738 |
+
uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
|
| 739 |
+
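
With the server running, the full flow is: submit a repository, poll its status, then chat. Here is a minimal client sketch using `requests`; the base URL, the example repository, and the lowercase status strings are assumptions (the field names match the request models used above):

```python
import time
import requests

BASE = "http://localhost:8000"  # assumes the default host/port from main.py

# 1. Kick off processing
resp = requests.post(f"{BASE}/api/process-repo",
                     json={"repo_url": "https://github.com/user/repo", "branch": "main"})
repo_id = resp.json()["repo_id"]

# 2. Poll until the embeddings are ready
#    (assumes ProcessingStatus serializes to lowercase strings)
while True:
    status = requests.get(f"{BASE}/api/status/{repo_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)

# 3. Ask a question
answer = requests.post(f"{BASE}/api/chat",
                       json={"repo_id": repo_id, "message": "What are the main API endpoints?"})
print(answer.json()["response"])
```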

### 10. Environment Configuration (.env)

```env
# LLM Provider Configuration
LLM_PROVIDER=ollama  # Options: ollama, groq, huggingface
EMBEDDING_PROVIDER=sentence_transformers  # Options: sentence_transformers, huggingface

# Ollama Configuration (Local LLM - Free)
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_MODEL=llama2  # Options: llama2, codellama, mistral, phi, etc.

# Groq Configuration (Fast inference - Free tier available)
GROQ_API_KEY=your_groq_api_key_here
GROQ_MODEL=mixtral-8x7b-32768  # Options: mixtral-8x7b-32768, llama2-70b-4096

# Hugging Face Configuration (Free inference API)
HUGGINGFACE_API_KEY=your_hf_api_key_here
HUGGINGFACE_MODEL=microsoft/DialoGPT-medium

# Embedding Model Configuration
SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2  # Fast and good quality

# Other Configuration
GITHUB_TOKEN=your_github_token_here
REDIS_URL=redis://localhost:6379
VECTOR_DB_PATH=./chroma_db
```
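
These variables can be loaded with `pydantic-settings` along the lines of the sketch below. This is a minimal sketch, not the project's actual config.py (which may differ); the extension list and size limit shown here are illustrative defaults:

```python
# A minimal settings sketch; the project's real config.py may differ.
from enum import Enum
from typing import List
from pydantic_settings import BaseSettings, SettingsConfigDict

class LLMProvider(str, Enum):
    ollama = "ollama"
    groq = "groq"
    huggingface = "huggingface"

class EmbeddingProvider(str, Enum):
    sentence_transformers = "sentence_transformers"
    huggingface = "huggingface"

class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    llm_provider: LLMProvider = LLMProvider.ollama
    embedding_provider: EmbeddingProvider = EmbeddingProvider.sentence_transformers
    ollama_base_url: str = "http://localhost:11434"
    ollama_model: str = "llama2"
    groq_api_key: str = ""
    groq_model: str = "mixtral-8x7b-32768"
    huggingface_api_key: str = ""
    huggingface_model: str = "microsoft/DialoGPT-medium"
    sentence_transformer_model: str = "all-MiniLM-L6-v2"
    github_token: str = ""
    vector_db_path: str = "./chroma_db"
    supported_extensions: List[str] = [".py", ".js", ".ts", ".md"]  # illustrative
    max_file_size: int = 1_000_000  # bytes, illustrative

settings = Settings()
```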

### 11. Updated Docker Configuration

**Dockerfile:**
```dockerfile
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download embedding models
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2', cache_folder='./models')"

COPY . .

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
```

**docker-compose.yml:**
```yaml
version: '3.8'

services:
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - LLM_PROVIDER=${LLM_PROVIDER}
      - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER}
      - OLLAMA_BASE_URL=http://ollama:11434
      - OLLAMA_MODEL=${OLLAMA_MODEL}
      - GROQ_API_KEY=${GROQ_API_KEY}
      - GROQ_MODEL=${GROQ_MODEL}
      - HUGGINGFACE_API_KEY=${HUGGINGFACE_API_KEY}
      - HUGGINGFACE_MODEL=${HUGGINGFACE_MODEL}
      - SENTENCE_TRANSFORMER_MODEL=${SENTENCE_TRANSFORMER_MODEL}
      - GITHUB_TOKEN=${GITHUB_TOKEN}
      - REDIS_URL=redis://redis:6379
    volumes:
      - ./chroma_db:/app/chroma_db
      - ./models:/app/models  # Cache for models
    depends_on:
      - redis
      - ollama

  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    environment:
      - OLLAMA_KEEP_ALIVE=24h

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"

volumes:
  ollama_data:
```

**Setup script for Ollama models (setup_ollama.sh):**
```bash
#!/bin/bash
# Pull required models
docker exec chat-with-github-repo-ollama-1 ollama pull llama2
docker exec chat-with-github-repo-ollama-1 ollama pull codellama
docker exec chat-with-github-repo-ollama-1 ollama pull mistral
```
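
Run `docker compose up -d` first so the `ollama` container exists, then run `bash setup_ollama.sh` to pull the models. Note that the container name in the script assumes Compose's default `<project>-<service>-1` naming, so adjust it if your project directory is named differently.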

## Performance Comparison

| Provider | Speed | Quality | Cost | Setup Difficulty |
|----------|-------|---------|------|------------------|
| Ollama + Llama2 | Medium | Good | Free | Easy |
| Ollama + CodeLlama | Medium | Excellent (Code) | Free | Easy |
| Groq + Mixtral | Very Fast | Excellent | Free Tier | Very Easy |
| HuggingFace | Slow | Variable | Free | Very Easy |
| Local Transformers | Slow-Medium | Good | Free | Medium |

## Recommended Configurations

### For Development/Testing:
```env
LLM_PROVIDER=groq
GROQ_MODEL=mixtral-8x7b-32768
EMBEDDING_PROVIDER=sentence_transformers
SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2
```

### For Production (Local):
```env
LLM_PROVIDER=ollama
OLLAMA_MODEL=codellama
EMBEDDING_PROVIDER=sentence_transformers
SENTENCE_TRANSFORMER_MODEL=all-mpnet-base-v2
```

### For Minimal Resources:
```env
LLM_PROVIDER=ollama
OLLAMA_MODEL=phi
EMBEDDING_PROVIDER=sentence_transformers
SENTENCE_TRANSFORMER_MODEL=all-MiniLM-L6-v2
```

## Troubleshooting

### Ollama Issues:
```bash
# Check if Ollama is running
curl http://localhost:11434/api/version

# List available models
ollama list

# Check server logs (the CLI has no `logs` subcommand;
# on Linux with systemd, for example:)
journalctl -u ollama
```

### Memory Issues:
- Use smaller models (`phi` instead of `llama2`)
- Reduce batch size in embedding service
- Use quantized models
- Process repos in smaller chunks

### Performance Optimization:
```python
# For faster embeddings
embedding_service = FreeEmbeddingService(
    embedding_provider="sentence_transformers",
    vector_db_path="./chroma_db",
    model_name="all-MiniLM-L6-v2"  # Faster than all-mpnet-base-v2
)

# Batch processing
async def create_embeddings_batch(self, files: List[Dict], repo_id: str, batch_size: int = 50):
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        # Reuse the existing per-batch pipeline for each slice
        await self.create_embeddings(batch, repo_id)
```

## Deployment & Scaling

### Project Structure
```
frontend/
├── src/
│   ├── app/
│   │   ├── page.tsx
│   │   ├── layout.tsx
│   │   └── globals.css
│   ├── components/
│   │   ├── RepoInput.tsx
│   │   ├── ChatInterface.tsx
│   │   ├── ProcessingStatus.tsx
│   │   └── SourceDisplay.tsx
│   └── lib/
│       └── api.ts
├── package.json
├── tailwind.config.js
└── next.config.js
```

### Key Features
- Repository URL input with validation
- Real-time processing status updates
- Chat interface with message history
- Source code display with syntax highlighting
- Responsive design

## Free Alternatives to OpenAI

### 1. **Ollama (Recommended for Local Deployment)**
**Pros:**
- 100% free and private
- Runs locally, no API calls
- Supports many models (Llama2, CodeLlama, Mistral, Phi, etc.)
- Good performance on decent hardware

**Setup:**
```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Pull models
ollama pull llama2      # General purpose
ollama pull codellama   # Better for code
ollama pull mistral     # Good balance
ollama pull phi         # Lightweight

# Start Ollama server
ollama serve
```

**Requirements:** 8GB+ RAM, preferably with GPU

### 2. **Groq (Fast Inference - Free Tier)**
**Pros:**
- Extremely fast inference
- Free tier: 100 requests/minute
- High-quality models (Mixtral, Llama2)
- Simple API

**Setup:**
1. Sign up at [groq.com](https://groq.com)
2. Get API key from console
3. Set `GROQ_API_KEY` in environment

**Free Limits:** 100 requests/minute, 1000 requests/day

### 3. **Hugging Face Inference API (Free)**
**Pros:**
- Completely free
- Access to thousands of models
- No setup required
- Good for experimentation

**Setup:**
1. Sign up at [huggingface.co](https://huggingface.co)
2. Get API token from settings
3. Set `HUGGINGFACE_API_KEY` in environment

**Note:** Can be slower due to cold starts

### 4. **Local Transformers (Completely Free)**
For maximum control, you can run models directly:

```python
# services/local_llm_service.py
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

class LocalLLMService:
    def __init__(self, model_name="microsoft/DialoGPT-small"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )

    def generate_response(self, prompt: str, max_length: int = 512):
        response = self.generator(
            prompt,
            max_length=max_length,
            temperature=0.7,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )
        return response[0]["generated_text"].replace(prompt, "").strip()
```
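
A quick usage sketch for the class above; the model name is the default shown, and the first call downloads the weights:

```python
# Illustrative usage of the LocalLLMService sketch above.
service = LocalLLMService("microsoft/DialoGPT-small")
print(service.generate_response("Explain what a vector store is in one sentence."))
```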

## Embedding Models Comparison

### 1. **Sentence Transformers (Recommended)**
```python
# Best models for code:
"all-MiniLM-L6-v2"           # Fast, good quality
"all-mpnet-base-v2"          # Better quality, slower
"multi-qa-MiniLM-L6-cos-v1"  # Good for Q&A
```

### 2. **Hugging Face Embeddings**
```python
# Popular models:
"sentence-transformers/all-MiniLM-L6-v2"
"sentence-transformers/paraphrase-MiniLM-L6-v2"
```
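
To compare candidates quickly, a small similarity check is often enough. This sketch scores one query against one code snippet (the model name is taken from the list above; the query and snippet are illustrative):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
query = "How is authentication handled?"
snippet = "def verify_token(token: str) -> bool: ..."
q_emb, s_emb = model.encode([query, snippet], convert_to_tensor=True)
print(util.cos_sim(q_emb, s_emb))  # cosine similarity between query and code chunk
```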

## Quick Start Guide

### Option 1: Ollama (Local)
```bash
# 1. Install and start Ollama
curl -fsSL https://ollama.ai/install.sh | sh
ollama serve

# 2. Pull a model
ollama pull llama2

# 3. Set environment variables
export LLM_PROVIDER=ollama
export OLLAMA_MODEL=llama2
export EMBEDDING_PROVIDER=sentence_transformers

# 4. Start the application
python main.py
```

### Option 2: Groq (Cloud)
```bash
# 1. Get API key from groq.com
export GROQ_API_KEY=your_api_key_here
export LLM_PROVIDER=groq
export GROQ_MODEL=mixtral-8x7b-32768
export EMBEDDING_PROVIDER=sentence_transformers

# 2. Start the application
python main.py
```

### Option 3: Hugging Face (Cloud)
```bash
# 1. Get token from huggingface.co
export HUGGINGFACE_API_KEY=your_token_here
export LLM_PROVIDER=huggingface
export HUGGINGFACE_MODEL=microsoft/DialoGPT-medium
export EMBEDDING_PROVIDER=sentence_transformers

# 2. Start the application
python main.py
```
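
Whichever option you pick, you can confirm which providers the server actually loaded via the health endpoint (default host/port assumed):

```python
import requests

info = requests.get("http://localhost:8000/api/health").json()
print(info["llm_provider"], info["embedding_provider"])
```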

### Production Considerations
1. **Vector Database**: Use Pinecone for better scalability
2. **Background Processing**: Implement with Celery + Redis
3. **Caching**: Add Redis caching for frequent queries
4. **Rate Limiting**: Implement API rate limiting (see the sketch below)
5. **Authentication**: Add user authentication
6. **Monitoring**: Add logging and monitoring (Sentry, DataDog)
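
As a sketch of item 4, a per-client sliding-window limiter can be written as a FastAPI dependency. The window and quota below are illustrative, and production setups would back this with Redis or an API gateway:

```python
import time
from collections import defaultdict
from fastapi import Depends, HTTPException, Request

WINDOW_SECONDS = 60   # illustrative window
MAX_REQUESTS = 30     # illustrative quota
_hits = defaultdict(list)

async def rate_limit(request: Request):
    now = time.time()
    client = request.client.host if request.client else "unknown"
    # Keep only timestamps inside the current window
    recent = [t for t in _hits[client] if now - t < WINDOW_SECONDS]
    if len(recent) >= MAX_REQUESTS:
        raise HTTPException(status_code=429, detail="Too many requests")
    recent.append(now)
    _hits[client] = recent

# Attach to an endpoint, e.g.:
# @app.post("/api/chat", response_model=ChatResponse, dependencies=[Depends(rate_limit)])
```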

### Performance Optimizations
1. **Chunking Strategy**: Optimize chunk size and overlap
2. **Embedding Model**: Consider using smaller models for faster processing
3. **Retrieval**: Implement hybrid search (dense + sparse)
4. **Caching**: Cache embeddings and frequently asked questions (a minimal sketch follows)
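
The sketch below caches answers per `(repo_id, question)` pair. It assumes the `chat_service` instance from main.py is in scope, and a real deployment would use Redis with a TTL instead of a process-local dict:

```python
from typing import Dict, Tuple

_answer_cache: Dict[Tuple[str, str], dict] = {}

async def cached_answer(question: str, vectorstore, repo_id: str) -> dict:
    key = (repo_id, question.strip().lower())
    if key not in _answer_cache:
        # chat_service is the FreeChatService instance created in main.py
        _answer_cache[key] = await chat_service.answer_question(question, vectorstore, repo_id)
    return _answer_cache[key]
```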

## Usage Examples

### Sample Questions Users Can Ask:
- "How is authentication handled in this project?"
- "Explain the UserService class and its methods"
- "What testing framework is used and where are the tests?"
- "How does the database connection work?"
- "What are the main API endpoints?"
- "Show me how error handling is implemented"

## Next Steps & Enhancements

1. **Multi-language Support**: Better handling of different programming languages
2. **Code Analysis**: Add static code analysis features
3. **Visualization**: Generate architecture diagrams
4. **Collaboration**: Multi-user support with shared repositories
5. **Integration**: GitHub webhook integration for auto-updates
6. **AI Features**: Code suggestions and improvements

This project provides a solid foundation for building a production-ready "Chat with GitHub Repo" application that developers will find incredibly useful for understanding and navigating codebases!
requirements.txt
ADDED
@@ -0,0 +1,16 @@
gradio==4.44.0
fastapi==0.104.1
pydantic==2.5.0
pydantic-settings==2.1.0
langchain==0.1.0
langchain-community==0.0.10
chromadb==0.4.18
GitPython==3.1.40
python-dotenv==1.0.0
aiofiles==23.2.1
sentence-transformers==2.2.2
transformers==4.36.0
torch==2.1.0
huggingface-hub==0.19.4
requests==2.31.0
numpy==1.24.3
run_local.py
ADDED
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Local development runner for the Chat with GitHub Repository application.
This script sets up the environment and runs the Gradio app locally.
"""

import os
import sys
from pathlib import Path

# Add the current directory to Python path
current_dir = Path(__file__).parent
sys.path.insert(0, str(current_dir))

def setup_environment():
    """Setup environment variables for local development"""
    # Set default values if not already set
    env_vars = {
        "LLM_PROVIDER": "huggingface",
        "EMBEDDING_PROVIDER": "sentence_transformers",
        "HUGGINGFACE_MODEL": "microsoft/DialoGPT-medium",
        "SENTENCE_TRANSFORMER_MODEL": "all-MiniLM-L6-v2",
        "VECTOR_DB_PATH": "./chroma_db",
    }

    for key, value in env_vars.items():
        if key not in os.environ:
            os.environ[key] = value

    # Create necessary directories
    os.makedirs("chroma_db", exist_ok=True)
    os.makedirs("models", exist_ok=True)

def main():
    """Main function to run the application"""
    print("🚀 Starting Chat with GitHub Repository...")
    print("📁 Setting up environment...")

    setup_environment()

    print("🤖 Loading AI models (this may take a moment on first run)...")

    # Import and run the app
    try:
        from app import demo
        print("✅ Application ready!")
        print("🌐 Open your browser and go to the URL shown below:")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=True
        )
    except ImportError as e:
        print(f"❌ Error importing app: {e}")
        print("💡 Make sure you've installed all requirements: pip install -r requirements.txt")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error starting application: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
services/__init__.py
ADDED
@@ -0,0 +1 @@
# Services package
services/chat_service.py
ADDED
@@ -0,0 +1,148 @@
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama
from typing import Dict, List
import json
import requests
import os

# For Groq (free tier available)
class GroqLLM:
    def __init__(self, api_key: str, model: str = "mixtral-8x7b-32768"):
        self.api_key = api_key
        self.model = model
        self.base_url = "https://api.groq.com/openai/v1"

    def __call__(self, prompt: str) -> str:
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 1024
        }

        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=data
        )

        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            raise Exception(f"Groq API error: {response.text}")

# For Hugging Face Inference API
class HuggingFaceLLM:
    def __init__(self, api_key: str, model: str = "microsoft/DialoGPT-medium"):
        self.api_key = api_key
        self.model = model
        self.base_url = f"https://api-inference.huggingface.co/models/{model}"

    def __call__(self, prompt: str) -> str:
        headers = {"Authorization": f"Bearer {self.api_key}"}
        data = {"inputs": prompt, "parameters": {"max_length": 1000, "temperature": 0.1}}

        response = requests.post(self.base_url, headers=headers, json=data)

        if response.status_code == 200:
            result = response.json()
            if isinstance(result, list) and len(result) > 0:
                return result[0].get("generated_text", "").replace(prompt, "").strip()
            return str(result)
        else:
            raise Exception(f"HuggingFace API error: {response.text}")

class FreeChatService:
    def __init__(self, llm_provider: str, **kwargs):
        self.llm_provider = llm_provider

        if llm_provider == "ollama":
            self.llm = Ollama(
                model=kwargs.get("model", "llama2"),
                base_url=kwargs.get("base_url", "http://localhost:11434"),
                temperature=0.1
            )
        elif llm_provider == "groq":
            self.llm = GroqLLM(
                api_key=kwargs.get("api_key"),
                model=kwargs.get("model", "mixtral-8x7b-32768")
            )
        elif llm_provider == "huggingface":
            self.llm = HuggingFaceLLM(
                api_key=kwargs.get("api_key"),
                model=kwargs.get("model", "microsoft/DialoGPT-medium")
            )
        else:
            raise ValueError(f"Unsupported LLM provider: {llm_provider}")

        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""
You are a helpful AI assistant that analyzes code repositories. Use the following code snippets to answer the user's question about the repository.

Context from repository:
{context}

Question: {question}

Please provide a detailed answer based on the code context provided. If you reference specific files or functions, mention their file paths. If the question cannot be fully answered from the provided context, say so clearly.

Answer:"""
        )

    async def answer_question(self, question: str, vectorstore, repo_id: str) -> Dict:
        """Answer question using RAG with free LLM"""
        try:
            if self.llm_provider == "ollama":
                # Use LangChain's RetrievalQA for Ollama
                qa_chain = RetrievalQA.from_chain_type(
                    llm=self.llm,
                    chain_type="stuff",
                    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
                    chain_type_kwargs={"prompt": self.prompt_template},
                    return_source_documents=True
                )

                result = qa_chain({"query": question})
                answer = result["result"]
                source_docs = result.get("source_documents", [])

            else:
                # Manual RAG for other providers
                docs = vectorstore.similarity_search(question, k=5)
                context = "\n\n".join([doc.page_content for doc in docs])

                prompt = self.prompt_template.format(
                    context=context,
                    question=question
                )

                answer = self.llm(prompt)
                source_docs = docs

            # Format sources
            sources = []
            for doc in source_docs:
                sources.append({
                    "path": doc.metadata.get("path", "Unknown"),
                    "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
                })

            return {
                "response": answer,
                "sources": sources,
                "repo_id": repo_id
            }

        except Exception as e:
            return {
                "response": f"Error processing question: {str(e)}",
                "sources": [],
                "repo_id": repo_id
            }
services/embedding_service.py
ADDED
@@ -0,0 +1,89 @@
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
import chromadb
from typing import List, Dict
import os

class FreeEmbeddingService:
    def __init__(self, embedding_provider: str, vector_db_path: str, model_name: str = "all-MiniLM-L6-v2"):
        self.vector_db_path = vector_db_path
        self.embedding_provider = embedding_provider

        # Initialize embedding function based on provider
        if embedding_provider == "sentence_transformers":
            self.embeddings = SentenceTransformerEmbeddings(
                model_name=model_name,
                cache_folder="./models"  # Cache models locally
            )
        elif embedding_provider == "huggingface":
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder="./models"
            )
        else:
            raise ValueError(f"Unsupported embedding provider: {embedding_provider}")

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def create_documents(self, files: List[Dict], repo_id: str) -> List[Document]:
        """Create documents from file contents"""
        documents = []

        for file_info in files:
            # Create document with metadata
            doc = Document(
                page_content=file_info['content'],
                metadata={
                    'path': file_info['path'],
                    'extension': file_info['extension'],
                    'repo_id': repo_id,
                    'size': file_info['size']
                }
            )
            documents.append(doc)

        return documents

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks"""
        return self.text_splitter.split_documents(documents)

    async def create_embeddings(self, files: List[Dict], repo_id: str):
        """Create and store embeddings for repository files"""
        # Create documents
        documents = self.create_documents(files, repo_id)

        # Split into chunks
        chunks = self.split_documents(documents)

        # Create vector store
        collection_name = f"repo_{repo_id}"
        vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path
        )

        # Add documents to vector store in batches
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            vectorstore.add_documents(batch)

        return vectorstore

    def get_vectorstore(self, repo_id: str):
        """Get existing vector store for repository"""
        collection_name = f"repo_{repo_id}"
        return Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path
        )
services/github_service.py
ADDED
@@ -0,0 +1,43 @@
import os
import tempfile
import shutil
from git import Repo
from typing import List, Tuple
import hashlib
from urllib.parse import urlparse

class GitHubService:
    def __init__(self, github_token: str = ""):
        self.github_token = github_token

    def generate_repo_id(self, repo_url: str) -> str:
        """Generate a unique ID for the repository"""
        return hashlib.md5(repo_url.encode()).hexdigest()

    def parse_github_url(self, url: str) -> Tuple[str, str]:
        """Extract owner and repo name from GitHub URL"""
        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')
        if len(path_parts) >= 2:
            return path_parts[0], path_parts[1]
        raise ValueError("Invalid GitHub URL format")

    async def clone_repository(self, repo_url: str, branch: str = "main") -> str:
        """Clone repository to temporary directory"""
        temp_dir = tempfile.mkdtemp()
        try:
            if self.github_token:
                # Use token for private repos or higher rate limits
                auth_url = repo_url.replace("https://", f"https://{self.github_token}@")
                Repo.clone_from(auth_url, temp_dir, branch=branch, depth=1)
            else:
                Repo.clone_from(repo_url, temp_dir, branch=branch, depth=1)
            return temp_dir
        except Exception as e:
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise Exception(f"Failed to clone repository: {str(e)}")

    def cleanup_repo(self, repo_path: str):
        """Clean up cloned repository"""
        if os.path.exists(repo_path):
            shutil.rmtree(repo_path, ignore_errors=True)
setup.py
ADDED
@@ -0,0 +1,39 @@
from setuptools import setup, find_packages

setup(
    name="chat-with-github-repo",
    version="1.0.0",
    description="AI-powered GitHub repository analysis and chat application",
    author="Your Name",
    author_email="your.email@example.com",
    packages=find_packages(),
    install_requires=[
        "gradio>=4.44.0",
        "fastapi>=0.104.1",
        "pydantic>=2.5.0",
        "pydantic-settings>=2.1.0",
        "langchain>=0.1.0",
        "langchain-community>=0.0.10",
        "chromadb>=0.4.18",
        "GitPython>=3.1.40",
        "python-dotenv>=1.0.0",
        "aiofiles>=23.2.1",
        "sentence-transformers>=2.2.2",
        "transformers>=4.36.0",
        "torch>=2.1.0",
        "huggingface-hub>=0.19.4",
        "requests>=2.31.0",
        "numpy>=1.24.3",
    ],
    python_requires=">=3.8",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
)
utils/__init__.py
ADDED
@@ -0,0 +1 @@
# Utils package
utils/file_processor.py
ADDED
@@ -0,0 +1,71 @@
import os
from typing import List, Dict, Generator
import mimetypes
from pathlib import Path

class FileProcessor:
    def __init__(self, supported_extensions: List[str], max_file_size: int):
        self.supported_extensions = supported_extensions
        self.max_file_size = max_file_size
        self.ignore_dirs = {
            '.git', '__pycache__', 'node_modules', '.pytest_cache',
            'venv', 'env', '.venv', 'build', 'dist', '.next',
            'coverage', '.coverage', 'logs', 'log'
        }
        self.ignore_files = {
            '.gitignore', '.env', '.env.local', '.DS_Store',
            'package-lock.json', 'yarn.lock', 'poetry.lock'
        }

    def should_process_file(self, file_path: str) -> bool:
        """Check if file should be processed"""
        path = Path(file_path)

        # Check if any parent directory is in ignore list
        for parent in path.parents:
            if parent.name in self.ignore_dirs:
                return False

        # Check file name
        if path.name in self.ignore_files:
            return False

        # Check extension
        if path.suffix.lower() not in self.supported_extensions:
            return False

        # Check file size
        try:
            if os.path.getsize(file_path) > self.max_file_size:
                return False
        except OSError:
            return False

        return True

    def extract_files(self, repo_path: str) -> Generator[Dict, None, None]:
        """Extract and yield file information"""
        for root, dirs, files in os.walk(repo_path):
            # Filter out ignored directories
            dirs[:] = [d for d in dirs if d not in self.ignore_dirs]

            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, repo_path)

                if not self.should_process_file(file_path):
                    continue

                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    yield {
                        'path': relative_path,
                        'content': content,
                        'extension': Path(file_path).suffix.lower(),
                        'size': len(content)
                    }
                except Exception as e:
                    print(f"Error reading file {relative_path}: {e}")
                    continue