Upload folder using huggingface_hub
Browse files
- .docker +0 -17
- data/hadith.json +0 -62
- data/quran.json +0 -50
- enrich_dataset.py +0 -210
- fetch_data.py +0 -294
.docker
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
# Use an official Python runtime as a parent image
|
| 2 |
-
FROM python:3
|
| 3 |
-
|
| 4 |
-
# Set the working directory in the container to /app
|
| 5 |
-
WORKDIR /app
|
| 6 |
-
|
| 7 |
-
# Copy the current directory contents into the container at /app
|
| 8 |
-
COPY . /app
|
| 9 |
-
|
| 10 |
-
# Install any needed packages specified in requirements.txt
|
| 11 |
-
RUN pip install --trusted-host pypi.python.org -r requirements.txt
|
| 12 |
-
|
| 13 |
-
# Make port 80 available to the world outside this container
|
| 14 |
-
EXPOSE 80
|
| 15 |
-
|
| 16 |
-
# Run main.py when the container launches
|
| 17 |
-
CMD ["python", "main.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/hadith.json
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"id": "bukhari_1",
|
| 4 |
-
"arabic": "إِنَّمَا الْأَعْمَالُ بِالنِّيَّاتِ...",
|
| 5 |
-
"english": "Actions are judged by intentions...",
|
| 6 |
-
"reference": "Sahih al-Bukhari 1"
|
| 7 |
-
},
|
| 8 |
-
{
|
| 9 |
-
"id": "bukhari_8",
|
| 10 |
-
"arabic": "بُنِيَ الإِسْلامُ عَلَى خَمْسٍ...",
|
| 11 |
-
"english": "Islam is built upon five [pillars]...",
|
| 12 |
-
"reference": "Sahih al-Bukhari 8"
|
| 13 |
-
},
|
| 14 |
-
{
|
| 15 |
-
"id": "muslim_1",
|
| 16 |
-
"arabic": "الإِيمَانُ أَنْ تُؤْمِنَ بِاللَّهِ وَمَلائِكَتِهِ...",
|
| 17 |
-
"english": "Faith is to believe in Allah, His angels...",
|
| 18 |
-
"reference": "Sahih Muslim 1"
|
| 19 |
-
},
|
| 20 |
-
{
|
| 21 |
-
"id": "muslim_1907",
|
| 22 |
-
"arabic": "مَنْ صَامَ رَمَضَانَ إِيمَانًا وَاحْتِسَابًا...",
|
| 23 |
-
"english": "Whoever fasts Ramadan with faith and seeking reward...",
|
| 24 |
-
"reference": "Sahih Muslim 1907"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"id": "ahmad_3784",
|
| 28 |
-
"arabic": "بَدَأَ الإِسْلاَمُ غَرِيبًا وَسَيَعُودُ كَمَا بَدَأَ غَرِيبًا فَطُوبَى لِلْغُرَبَاءِ",
|
| 29 |
-
"english": "Islam began as something strange and will revert to being strange as it began, so give glad tidings to the strangers.",
|
| 30 |
-
"reference": "Musnad Ahmad 3784"
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"id": "ahmad_2107",
|
| 34 |
-
"arabic": "أَحَبُّ الدِّينِ إِلَى اللَّهِ الْحَنِيفِيَّةُ السَّمْحَةُ",
|
| 35 |
-
"english": "The most beloved of religions to Allah is the easy monotheism (Hanifiyyah).",
|
| 36 |
-
"reference": "Musnad Ahmad 2107"
|
| 37 |
-
},
|
| 38 |
-
{
|
| 39 |
-
"id": "ahmad_8030",
|
| 40 |
-
"arabic": "الْمُؤْمِنُ مِرْآةُ أَخِيهِ",
|
| 41 |
-
"english": "The believer is a mirror for his brother.",
|
| 42 |
-
"reference": "Musnad Ahmad 8030"
|
| 43 |
-
},
|
| 44 |
-
{
|
| 45 |
-
"id": "bukhari_6018",
|
| 46 |
-
"arabic": "مَنْ كَانَ يُؤْمِنُ بِاللَّهِ وَالْيَوْمِ الآخِرِ فَلْيُحْسِنْ إِلَى جَارِهِ...",
|
| 47 |
-
"english": "Whoever believes in Allah and the Last Day should be kind to his neighbor...",
|
| 48 |
-
"reference": "Sahih al-Bukhari 6018"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"id": "tirmidhi_2003",
|
| 52 |
-
"arabic": "أَكْمَلُ الْمُؤْمِنِينَ إِيمَانًا أَحْسَنُهُمْ خُلُقًا...",
|
| 53 |
-
"english": "The most complete of believers in faith are those with the best character...",
|
| 54 |
-
"reference": "Jami` at-Tirmidhi 2003"
|
| 55 |
-
},
|
| 56 |
-
{
|
| 57 |
-
"id": "ibnmajah_224",
|
| 58 |
-
"arabic": "طَلَبُ الْعِلْمِ فَرِيضَةٌ عَلَى كُلِّ مُسْلِمٍ...",
|
| 59 |
-
"english": "Seeking knowledge is an obligation upon every muslim...",
|
| 60 |
-
"reference": "Sunan Ibn Majah 224"
|
| 61 |
-
}
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/quran.json
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"id": "2:153",
|
| 4 |
-
"arabic": "يَا أَيُّهَا الَّذِينَ آمَنُوا اسْتَعِينُوا بِالصَّبْرِ وَالصَّلَاةِ...",
|
| 5 |
-
"english": "O you who have believed, seek help through patience and prayer...",
|
| 6 |
-
"source": "Surah Al-Baqarah 2:153"
|
| 7 |
-
},
|
| 8 |
-
{
|
| 9 |
-
"id": "1:1-7",
|
| 10 |
-
"arabic": "بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ... اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
|
| 11 |
-
"english": "In the name of Allah, the Entirely Merciful, the Especially Merciful... Guide us to the straight path",
|
| 12 |
-
"source": "Surah Al-Fatihah 1:1-7"
|
| 13 |
-
},
|
| 14 |
-
{
|
| 15 |
-
"id": "2:255",
|
| 16 |
-
"arabic": "اللَّهُ لَا إِلَهَ إِلَّا هُوَ الْحَيُّ الْقَيُّومُ...",
|
| 17 |
-
"english": "Allah - there is no deity except Him, the Ever-Living, the Sustainer of [all] existence...",
|
| 18 |
-
"source": "Surah Al-Baqarah 2:255 (Ayat al-Kursi)"
|
| 19 |
-
},
|
| 20 |
-
{
|
| 21 |
-
"id": "112:1-4",
|
| 22 |
-
"arabic": "قُلْ هُوَ اللَّهُ أَحَدٌ... وَلَمْ يَكُن لَّهُ كُفُوًا أَحَدٌ",
|
| 23 |
-
"english": "Say, He is Allah, [who is] One... And there is none co-equal to Him.",
|
| 24 |
-
"source": "Surah Al-Ikhlas 112:1-4"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"id": "2:286",
|
| 28 |
-
"arabic": "لَا يُكَلِّفُ اللَّهُ نَفْسًا إِلَّا وُسْعَهَا...",
|
| 29 |
-
"english": "Allah does not charge a soul except [with that within] its capacity...",
|
| 30 |
-
"source": "Surah Al-Baqarah 2:286"
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"id": "3:103",
|
| 34 |
-
"arabic": "وَاعْتَصِمُوا بِحَبْلِ اللَّهِ جَمِيعًا وَلَا تَفَرَّقُوا...",
|
| 35 |
-
"english": "And hold firmly to the rope of Allah all together and do not become divided...",
|
| 36 |
-
"source": "Surah Al-Imran 3:103"
|
| 37 |
-
},
|
| 38 |
-
{
|
| 39 |
-
"id": "5:8",
|
| 40 |
-
"arabic": "يَا أَيُّهَا الَّذِينَ آمَنُوا كُونُوا قَوَّامِينَ لِلَّهِ شُهَدَاءَ بِالْقِسْطِ...",
|
| 41 |
-
"english": "O you who have believed, be persistently standing firm for Allah, witnesses in justice...",
|
| 42 |
-
"source": "Surah Al-Ma'idah 5:8"
|
| 43 |
-
},
|
| 44 |
-
{
|
| 45 |
-
"id": "94:5",
|
| 46 |
-
"arabic": "فَإِنَّ مَعَ الْعُسْرِ يُسْرًا",
|
| 47 |
-
"english": "For indeed, with hardship [will be] ease.",
|
| 48 |
-
"source": "Surah Ash-Sharh 94:5"
|
| 49 |
-
}
|
| 50 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
enrich_dataset.py
DELETED
|
@@ -1,210 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Script to enrich the QModel dataset with hadith collections from GitHub.
|
| 4 |
-
Fetches Musnad Ahmad and other major hadith collections from:
|
| 5 |
-
https://github.com/AhmedBaset/hadith-json/tree/main/db/by_book/the_9_books
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import json
|
| 9 |
-
import requests
|
| 10 |
-
from typing import Dict, List
|
| 11 |
-
from collections import defaultdict
|
| 12 |
-
|
| 13 |
-
# The 9 canonical hadith books
|
| 14 |
-
HADITH_BOOKS = {
|
| 15 |
-
"ahmed.json": {
|
| 16 |
-
"collection": "Musnad Ahmad",
|
| 17 |
-
"id_prefix": "ahmad",
|
| 18 |
-
"grade": "Hasan/Sahih",
|
| 19 |
-
"author": "Imam Ahmad ibn Hanbal"
|
| 20 |
-
},
|
| 21 |
-
"bukhari.json": {
|
| 22 |
-
"collection": "Sahih al-Bukhari",
|
| 23 |
-
"id_prefix": "bukhari",
|
| 24 |
-
"grade": "Sahih",
|
| 25 |
-
"author": "Muhammad al-Bukhari"
|
| 26 |
-
},
|
| 27 |
-
"muslim.json": {
|
| 28 |
-
"collection": "Sahih Muslim",
|
| 29 |
-
"id_prefix": "muslim",
|
| 30 |
-
"grade": "Sahih",
|
| 31 |
-
"author": "Muslim ibn al-Hajjaj"
|
| 32 |
-
},
|
| 33 |
-
"abudawud.json": {
|
| 34 |
-
"collection": "Sunan Abu Dawood",
|
| 35 |
-
"id_prefix": "abudawud",
|
| 36 |
-
"grade": "Hasan",
|
| 37 |
-
"author": "Abu Dawood Sulaiman"
|
| 38 |
-
},
|
| 39 |
-
"tirmidhi.json": {
|
| 40 |
-
"collection": "Jami' at-Tirmidhi",
|
| 41 |
-
"id_prefix": "tirmidhi",
|
| 42 |
-
"grade": "Hasan",
|
| 43 |
-
"author": "Al-Tirmidhi"
|
| 44 |
-
},
|
| 45 |
-
"ibnmajah.json": {
|
| 46 |
-
"collection": "Sunan Ibn Majah",
|
| 47 |
-
"id_prefix": "ibnmajah",
|
| 48 |
-
"grade": "Hasan",
|
| 49 |
-
"author": "Ibn Majah al-Qazwini"
|
| 50 |
-
},
|
| 51 |
-
"nasai.json": {
|
| 52 |
-
"collection": "Sunan an-Nasai",
|
| 53 |
-
"id_prefix": "nasai",
|
| 54 |
-
"grade": "Sahih",
|
| 55 |
-
"author": "Ahmad al-Nasai"
|
| 56 |
-
},
|
| 57 |
-
"malik.json": {
|
| 58 |
-
"collection": "Muwatta Malik",
|
| 59 |
-
"id_prefix": "malik",
|
| 60 |
-
"grade": "Sahih",
|
| 61 |
-
"author": "Malik ibn Anas"
|
| 62 |
-
},
|
| 63 |
-
"darimi.json": {
|
| 64 |
-
"collection": "Sunan al-Darimi",
|
| 65 |
-
"id_prefix": "darimi",
|
| 66 |
-
"grade": "Hasan",
|
| 67 |
-
"author": "Al-Darimi"
|
| 68 |
-
}
|
| 69 |
-
}
|
| 70 |
-
|
| 71 |
-
BASE_URL = "https://raw.githubusercontent.com/AhmedBaset/hadith-json/main/db/by_book/the_9_books"
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
def fetch_hadith_book(filename: str) -> Dict:
|
| 75 |
-
"""Fetch a hadith book JSON from GitHub."""
|
| 76 |
-
url = f"{BASE_URL}/{filename}"
|
| 77 |
-
print(f"Fetching {filename}...")
|
| 78 |
-
response = requests.get(url, timeout=30)
|
| 79 |
-
response.raise_for_status()
|
| 80 |
-
return response.json()
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def transform_hadith(hadith: Dict, book_config: Dict, book_data: Dict) -> Dict:
|
| 84 |
-
"""Transform hadith from GitHub format to our metadata format."""
|
| 85 |
-
|
| 86 |
-
# Find chapter name if available
|
| 87 |
-
chapter_name = ""
|
| 88 |
-
if "chapterId" in hadith:
|
| 89 |
-
for chapter in book_data.get("chapters", []):
|
| 90 |
-
if chapter.get("id") == hadith.get("chapterId"):
|
| 91 |
-
chapter_name = chapter.get("arabic", "")
|
| 92 |
-
break
|
| 93 |
-
|
| 94 |
-
# Build the reference string
|
| 95 |
-
hadith_num = hadith.get("idInBook", hadith.get("id", ""))
|
| 96 |
-
reference = f"{book_config['collection']} {hadith_num}"
|
| 97 |
-
|
| 98 |
-
# Combine narrator and text for English
|
| 99 |
-
english_parts = []
|
| 100 |
-
if isinstance(hadith.get("english"), dict):
|
| 101 |
-
if hadith["english"].get("narrator"):
|
| 102 |
-
english_parts.append(hadith["english"]["narrator"])
|
| 103 |
-
if hadith["english"].get("text"):
|
| 104 |
-
english_parts.append(hadith["english"]["text"])
|
| 105 |
-
english = " ".join(english_parts)
|
| 106 |
-
else:
|
| 107 |
-
english = str(hadith.get("english", ""))
|
| 108 |
-
|
| 109 |
-
return {
|
| 110 |
-
"id": f"{book_config['id_prefix']}_{hadith_num}",
|
| 111 |
-
"arabic": hadith.get("arabic", ""),
|
| 112 |
-
"english": english,
|
| 113 |
-
"reference": reference,
|
| 114 |
-
"hadith_number": hadith_num,
|
| 115 |
-
"collection": book_config["collection"],
|
| 116 |
-
"chapter": chapter_name,
|
| 117 |
-
"grade": "", # Will be inferred by main.py's infer_hadith_grade()
|
| 118 |
-
"type": "hadith",
|
| 119 |
-
"author": book_config["author"]
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def load_existing_metadata(filepath: str) -> List[Dict]:
|
| 124 |
-
"""Load existing metadata.json file."""
|
| 125 |
-
print(f"Loading existing metadata from {filepath}...")
|
| 126 |
-
with open(filepath, 'r', encoding='utf-8') as f:
|
| 127 |
-
return json.load(f)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
def save_enriched_metadata(filepath: str, data: List[Dict], stats: Dict) -> None:
|
| 131 |
-
"""Save enriched metadata to file."""
|
| 132 |
-
print(f"Saving enriched metadata to {filepath}...")
|
| 133 |
-
with open(filepath, 'w', encoding='utf-8') as f:
|
| 134 |
-
json.dump(data, f, ensure_ascii=False, indent=2)
|
| 135 |
-
|
| 136 |
-
print("\n" + "="*60)
|
| 137 |
-
print("Dataset Enrichment Summary")
|
| 138 |
-
print("="*60)
|
| 139 |
-
print(f"Total documents: {len(data)}")
|
| 140 |
-
print(f"\nBreakdown by collection:")
|
| 141 |
-
for collection, count in sorted(stats.items()):
|
| 142 |
-
print(f" {collection}: {count}")
|
| 143 |
-
print("="*60)
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
def main():
|
| 147 |
-
"""Main enrichment process."""
|
| 148 |
-
|
| 149 |
-
# Load existing metadata
|
| 150 |
-
metadata_path = "/Users/elgendy/Projects/QModel/metadata.json"
|
| 151 |
-
existing_data = load_existing_metadata(metadata_path)
|
| 152 |
-
|
| 153 |
-
# Track which existing hadiths we have
|
| 154 |
-
existing_ids = {item["id"] for item in existing_data if item.get("type") == "hadith"}
|
| 155 |
-
print(f"Existing hadith entries: {len(existing_ids)}")
|
| 156 |
-
|
| 157 |
-
# New hadiths to add
|
| 158 |
-
new_hadiths = []
|
| 159 |
-
stats = defaultdict(int)
|
| 160 |
-
|
| 161 |
-
# Count existing Quran verses
|
| 162 |
-
for item in existing_data:
|
| 163 |
-
if item.get("type") == "quran":
|
| 164 |
-
stats["Quran"] += 1
|
| 165 |
-
elif item.get("type") == "hadith":
|
| 166 |
-
collection = item.get("collection", "Unknown")
|
| 167 |
-
stats[collection] += 1
|
| 168 |
-
|
| 169 |
-
# Fetch and process each hadith book
|
| 170 |
-
for filename, book_config in HADITH_BOOKS.items():
|
| 171 |
-
try:
|
| 172 |
-
book_data = fetch_hadith_book(filename)
|
| 173 |
-
hadiths = book_data.get("hadiths", [])
|
| 174 |
-
|
| 175 |
-
skipped = 0
|
| 176 |
-
added = 0
|
| 177 |
-
|
| 178 |
-
for hadith in hadiths:
|
| 179 |
-
# Transform to our format
|
| 180 |
-
transformed = transform_hadith(hadith, book_config, book_data)
|
| 181 |
-
|
| 182 |
-
# Check if we already have this hadith
|
| 183 |
-
if transformed["id"] in existing_ids:
|
| 184 |
-
skipped += 1
|
| 185 |
-
continue
|
| 186 |
-
|
| 187 |
-
new_hadiths.append(transformed)
|
| 188 |
-
existing_ids.add(transformed["id"])
|
| 189 |
-
added += 1
|
| 190 |
-
|
| 191 |
-
collection_name = book_config["collection"]
|
| 192 |
-
stats[collection_name] += added
|
| 193 |
-
|
| 194 |
-
print(f" ✓ {filename}: {added} new hadiths added, {skipped} already exist")
|
| 195 |
-
|
| 196 |
-
except Exception as e:
|
| 197 |
-
print(f" ✗ Error fetching {filename}: {e}")
|
| 198 |
-
|
| 199 |
-
# Merge with existing data
|
| 200 |
-
enriched_data = existing_data + new_hadiths
|
| 201 |
-
|
| 202 |
-
print(f"\nTotal new hadiths added: {len(new_hadiths)}")
|
| 203 |
-
print(f"Total documents after enrichment: {len(enriched_data)}")
|
| 204 |
-
|
| 205 |
-
# Save enriched metadata
|
| 206 |
-
save_enriched_metadata(metadata_path, enriched_data, stats)
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
if __name__ == "__main__":
|
| 210 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fetch_data.py
DELETED
|
@@ -1,294 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
fetch_data.py — QModel Full Data Fetcher
|
| 3 |
-
=========================================
|
| 4 |
-
Fetches the COMPLETE Quran (6,236 verses, all 114 surahs) from risan/quran-json
|
| 5 |
-
via jsDelivr CDN, using the per-chapter endpoint which contains both Arabic text
|
| 6 |
-
AND English translation (Saheeh International) in a single request per surah.
|
| 7 |
-
|
| 8 |
-
Also fetches major Hadith collections from fawazahmed0/hadith-api.
|
| 9 |
-
|
| 10 |
-
Output files are drop-in replacements for quran.json / hadith.json and are
|
| 11 |
-
fully compatible with build_index.py and main.py.
|
| 12 |
-
|
| 13 |
-
Schema produced:
|
| 14 |
-
quran.json → [{ "id": "2:1", "arabic": "...", "english": "...",
|
| 15 |
-
"source": "Surah Al-Baqarah 2:1",
|
| 16 |
-
"surah_number": 2, "surah_name_en": "Al-Baqarah",
|
| 17 |
-
"surah_name_ar": "البقرة", "verse_number": 1,
|
| 18 |
-
"transliteration": "..." }, ...]
|
| 19 |
-
|
| 20 |
-
hadith.json → [{ "id": "bukhari_1", "arabic": "...", "english": "...",
|
| 21 |
-
"reference": "Sahih al-Bukhari 1",
|
| 22 |
-
"hadith_number": 1, "collection": "Sahih al-Bukhari",
|
| 23 |
-
"grade": "Sahih" }, ...]
|
| 24 |
-
|
| 25 |
-
Usage:
|
| 26 |
-
pip install requests
|
| 27 |
-
python fetch_data.py # full download
|
| 28 |
-
python fetch_data.py --out-dir ./data # custom output dir
|
| 29 |
-
python fetch_data.py --hadith-limit 500 # quick test run
|
| 30 |
-
python fetch_data.py --quran-only # skip hadith
|
| 31 |
-
python fetch_data.py --hadith-only # skip quran
|
| 32 |
-
"""
|
| 33 |
-
|
| 34 |
-
from __future__ import annotations
|
| 35 |
-
|
| 36 |
-
import argparse
|
| 37 |
-
import json
|
| 38 |
-
import sys
|
| 39 |
-
import time
|
| 40 |
-
from pathlib import Path
|
| 41 |
-
from typing import Optional
|
| 42 |
-
|
| 43 |
-
try:
|
| 44 |
-
import requests
|
| 45 |
-
except ImportError:
|
| 46 |
-
sys.exit("❌ Install requests first: pip install requests")
|
| 47 |
-
|
| 48 |
-
# ── CDN roots ─────────────────────────────────────────────────────────────────
|
| 49 |
-
# risan/quran-json: per-chapter endpoint has BOTH arabic + english translation
|
| 50 |
-
# Format: { "id": 1, "name": "Al-Fatihah", "transliteration": "...",
|
| 51 |
-
# "type": "meccan", "total_verses": 7,
|
| 52 |
-
# "verses": [ { "id": 1, "text": "<arabic>",
|
| 53 |
-
# "translation": "<saheeh-international>",
|
| 54 |
-
# "transliteration": "..." }, ... ] }
|
| 55 |
-
QURAN_CHAPTER_URL = "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{n}.json"
|
| 56 |
-
|
| 57 |
-
# fawazahmed0/hadith-api: full-book JSON per edition
|
| 58 |
-
# Format: { "metadata": {...},
|
| 59 |
-
# "hadiths": [ { "hadithnumber": 1, "text": "...",
|
| 60 |
-
# "grades": [{"grade": "Sahih", ...}] }, ... ] }
|
| 61 |
-
HADITH_CDN = "https://cdn.jsdelivr.net/gh/fawazahmed0/hadith-api@1/editions"
|
| 62 |
-
|
| 63 |
-
# ── Surah metadata ─────────────────────────────────────────────────────────────
|
| 64 |
-
SURAH_AR = {
|
| 65 |
-
1:"الفاتحة",2:"البقرة",3:"آل عمران",4:"النساء",5:"المائدة",
|
| 66 |
-
6:"الأنعام",7:"الأعراف",8:"الأنفال",9:"التوبة",10:"يونس",
|
| 67 |
-
11:"هود",12:"يوسف",13:"الرعد",14:"إبراهيم",15:"الحجر",
|
| 68 |
-
16:"النحل",17:"الإسراء",18:"الكهف",19:"مريم",20:"طه",
|
| 69 |
-
21:"الأنبياء",22:"الحج",23:"المؤمنون",24:"النور",25:"الفرقان",
|
| 70 |
-
26:"الشعراء",27:"النمل",28:"القصص",29:"العنكبوت",30:"الروم",
|
| 71 |
-
31:"لقمان",32:"السجدة",33:"الأحزاب",34:"سبأ",35:"فاطر",
|
| 72 |
-
36:"يس",37:"الصافات",38:"ص",39:"الزمر",40:"غافر",
|
| 73 |
-
41:"فصلت",42:"الشورى",43:"الزخرف",44:"الدخان",45:"الجاثية",
|
| 74 |
-
46:"الأحقاف",47:"محمد",48:"الفتح",49:"الحجرات",50:"ق",
|
| 75 |
-
51:"الذاريات",52:"الطور",53:"النجم",54:"القمر",55:"الرحمن",
|
| 76 |
-
56:"الواقعة",57:"الحديد",58:"المجادلة",59:"الحشر",60:"الممتحنة",
|
| 77 |
-
61:"الصف",62:"الجمعة",63:"المنافقون",64:"التغابن",65:"الطلاق",
|
| 78 |
-
66:"التحريم",67:"الملك",68:"القلم",69:"الحاقة",70:"المعارج",
|
| 79 |
-
71:"نوح",72:"الجن",73:"المزمل",74:"المدثر",75:"القيامة",
|
| 80 |
-
76:"الإنسان",77:"المرسلات",78:"النبأ",79:"النازعات",80:"عبس",
|
| 81 |
-
81:"التكوير",82:"الانفطار",83:"المطففين",84:"الانشقاق",85:"البروج",
|
| 82 |
-
86:"الطارق",87:"الأعلى",88:"الغاشية",89:"الفجر",90:"البلد",
|
| 83 |
-
91:"الشمس",92:"الليل",93:"الضحى",94:"الشرح",95:"التين",
|
| 84 |
-
96:"العلق",97:"القدر",98:"البينة",99:"الزلزلة",100:"العاديات",
|
| 85 |
-
101:"القارعة",102:"التكاثر",103:"العصر",104:"الهمزة",105:"الفيل",
|
| 86 |
-
106:"قريش",107:"الماعون",108:"الكوثر",109:"الكافرون",110:"النصر",
|
| 87 |
-
111:"المسد",112:"الإخلاص",113:"الفلق",114:"الناس",
|
| 88 |
-
}
|
| 89 |
-
|
| 90 |
-
# ── Hadith collections ─────────────────────────────────────────────────────────
|
| 91 |
-
# (arabic_edition, english_edition, human_label, id_prefix)
|
| 92 |
-
HADITH_EDITIONS = [
|
| 93 |
-
("ara-bukhari", "eng-bukhari", "Sahih al-Bukhari", "bukhari"),
|
| 94 |
-
("ara-muslim", "eng-muslim", "Sahih Muslim", "muslim"),
|
| 95 |
-
("ara-abudawud", "eng-abudawud", "Sunan Abu Dawud", "abudawud"),
|
| 96 |
-
("ara-tirmidhi", "eng-tirmidhi", "Jami' at-Tirmidhi", "tirmidhi"),
|
| 97 |
-
("ara-ibnmajah", "eng-ibnmajah", "Sunan Ibn Majah", "ibnmajah"),
|
| 98 |
-
("ara-nasai", "eng-nasai", "Sunan an-Nasa'i", "nasai"),
|
| 99 |
-
("ara-malik", "eng-malik", "Muwatta Malik", "malik"),
|
| 100 |
-
]
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
# ── HTTP helper ────────────────────────────────────────────────────────────────
|
| 104 |
-
def get_json(url: str, retries: int = 4, backoff: float = 2.0) -> Optional[dict | list]:
|
| 105 |
-
for attempt in range(1, retries + 1):
|
| 106 |
-
try:
|
| 107 |
-
r = requests.get(url, timeout=60)
|
| 108 |
-
r.raise_for_status()
|
| 109 |
-
return r.json()
|
| 110 |
-
except Exception as exc:
|
| 111 |
-
print(f" ⚠️ Attempt {attempt}/{retries}: {exc}")
|
| 112 |
-
if attempt < retries:
|
| 113 |
-
time.sleep(backoff * attempt)
|
| 114 |
-
return None
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
def save(path: Path, data: list) -> None:
|
| 118 |
-
path.parent.mkdir(parents=True, exist_ok=True)
|
| 119 |
-
path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
| 120 |
-
kb = path.stat().st_size / 1024
|
| 121 |
-
print(f" 💾 {path} — {len(data):,} records ({kb:,.0f} KB)")
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
# ── Quran ──────────────────────────────────────────────────────────────────────
|
| 125 |
-
def fetch_quran() -> list:
|
| 126 |
-
"""
|
| 127 |
-
Uses the risan/quran-json per-chapter English endpoint:
|
| 128 |
-
cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{N}.json
|
| 129 |
-
|
| 130 |
-
Each file contains:
|
| 131 |
-
{
|
| 132 |
-
"id": 1,
|
| 133 |
-
"name": "Al-Fatihah",
|
| 134 |
-
"transliteration": "Al-Fatihah",
|
| 135 |
-
"type": "meccan",
|
| 136 |
-
"total_verses": 7,
|
| 137 |
-
"verses": [
|
| 138 |
-
{
|
| 139 |
-
"id": 1,
|
| 140 |
-
"text": "<uthmani arabic>",
|
| 141 |
-
"translation": "<saheeh international english>",
|
| 142 |
-
"transliteration": "<latin>"
|
| 143 |
-
}, ...
|
| 144 |
-
]
|
| 145 |
-
}
|
| 146 |
-
|
| 147 |
-
This single endpoint gives us Arabic + English + transliteration per verse —
|
| 148 |
-
no need to join two separate files.
|
| 149 |
-
"""
|
| 150 |
-
print("\n📖 Fetching full Quran (114 surahs, 6,236 verses) …")
|
| 151 |
-
records = []
|
| 152 |
-
failed = []
|
| 153 |
-
|
| 154 |
-
for n in range(1, 115):
|
| 155 |
-
url = QURAN_CHAPTER_URL.format(n=n)
|
| 156 |
-
data = get_json(url)
|
| 157 |
-
|
| 158 |
-
if not data:
|
| 159 |
-
print(f" ❌ Surah {n} — failed, skipping")
|
| 160 |
-
failed.append(n)
|
| 161 |
-
continue
|
| 162 |
-
|
| 163 |
-
surah_name_en = data.get("name") or data.get("transliteration") or f"Surah {n}"
|
| 164 |
-
surah_name_ar = SURAH_AR.get(n, "")
|
| 165 |
-
verses = data.get("verses", [])
|
| 166 |
-
|
| 167 |
-
for v in verses:
|
| 168 |
-
vid = int(v.get("id", 0))
|
| 169 |
-
arabic = (v.get("text") or "").strip()
|
| 170 |
-
english = (v.get("translation") or "").strip()
|
| 171 |
-
translit= (v.get("transliteration") or "").strip()
|
| 172 |
-
|
| 173 |
-
if not vid or not arabic:
|
| 174 |
-
continue
|
| 175 |
-
|
| 176 |
-
records.append({
|
| 177 |
-
# ── core fields (required by main.py / build_index.py) ──
|
| 178 |
-
"id": f"{n}:{vid}",
|
| 179 |
-
"arabic": arabic,
|
| 180 |
-
"english": english,
|
| 181 |
-
"source": f"Surah {surah_name_en} {n}:{vid}",
|
| 182 |
-
# ── enriched metadata ──
|
| 183 |
-
"surah_number": n,
|
| 184 |
-
"surah_name_en": surah_name_en,
|
| 185 |
-
"surah_name_ar": surah_name_ar,
|
| 186 |
-
"verse_number": vid,
|
| 187 |
-
"transliteration": translit,
|
| 188 |
-
})
|
| 189 |
-
|
| 190 |
-
# Brief progress every 10 surahs
|
| 191 |
-
if n % 10 == 0 or n == 114:
|
| 192 |
-
print(f" ✔ Surahs 1–{n} fetched ({len(records):,} verses so far)")
|
| 193 |
-
time.sleep(0.15) # be polite to the CDN
|
| 194 |
-
|
| 195 |
-
if failed:
|
| 196 |
-
print(f"\n ⚠️ {len(failed)} surahs failed: {failed}")
|
| 197 |
-
print(f"\n ✅ Quran complete — {len(records):,} verses")
|
| 198 |
-
return records
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
# ── Hadith ─────────────────────────────────────────────────────────────────────
|
| 202 |
-
def fetch_hadith_edition(
|
| 203 |
-
ar_edition: str, en_edition: str,
|
| 204 |
-
label: str, prefix: str,
|
| 205 |
-
limit: Optional[int],
|
| 206 |
-
) -> list:
|
| 207 |
-
ar_data = get_json(f"{HADITH_CDN}/{ar_edition}.json")
|
| 208 |
-
en_data = get_json(f"{HADITH_CDN}/{en_edition}.json")
|
| 209 |
-
|
| 210 |
-
if not ar_data:
|
| 211 |
-
print(f" ❌ {label} Arabic — unavailable, skipping")
|
| 212 |
-
return []
|
| 213 |
-
|
| 214 |
-
en_lookup = {
|
| 215 |
-
int(h["hadithnumber"]): (h.get("text") or "")
|
| 216 |
-
for h in (en_data or {}).get("hadiths", [])
|
| 217 |
-
if "hadithnumber" in h
|
| 218 |
-
}
|
| 219 |
-
|
| 220 |
-
records = []
|
| 221 |
-
for h in ar_data.get("hadiths", []):
|
| 222 |
-
num = h.get("hadithnumber")
|
| 223 |
-
arabic = (h.get("text") or "").strip()
|
| 224 |
-
if not num or not arabic:
|
| 225 |
-
continue
|
| 226 |
-
num = int(num)
|
| 227 |
-
english = en_lookup.get(num, "").strip()
|
| 228 |
-
grades = h.get("grades") or []
|
| 229 |
-
grade = grades[0].get("grade", "") if grades else ""
|
| 230 |
-
|
| 231 |
-
records.append({
|
| 232 |
-
# ── core fields ──
|
| 233 |
-
"id": f"{prefix}_{num}",
|
| 234 |
-
"arabic": arabic,
|
| 235 |
-
"english": english,
|
| 236 |
-
"reference": f"{label} {num}",
|
| 237 |
-
# ── enriched metadata ──
|
| 238 |
-
"hadith_number": num,
|
| 239 |
-
"collection": label,
|
| 240 |
-
"grade": grade,
|
| 241 |
-
})
|
| 242 |
-
if limit and len(records) >= limit:
|
| 243 |
-
break
|
| 244 |
-
|
| 245 |
-
print(f" ✅ {label}: {len(records):,} hadiths")
|
| 246 |
-
return records
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
def fetch_all_hadiths(limit_per_collection: Optional[int] = None) -> list:
|
| 250 |
-
print("\n📚 Fetching Hadith collections …")
|
| 251 |
-
all_hadiths: list = []
|
| 252 |
-
for ar_ed, en_ed, label, prefix in HADITH_EDITIONS:
|
| 253 |
-
print(f"\n → {label}")
|
| 254 |
-
records = fetch_hadith_edition(ar_ed, en_ed, label, prefix, limit_per_collection)
|
| 255 |
-
all_hadiths.extend(records)
|
| 256 |
-
time.sleep(0.5)
|
| 257 |
-
print(f"\n 📊 Total hadiths: {len(all_hadiths):,}")
|
| 258 |
-
return all_hadiths
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
# ── CLI ────────────────────────────────────────────────────────────────────────
|
| 262 |
-
def main() -> None:
|
| 263 |
-
parser = argparse.ArgumentParser(
|
| 264 |
-
description="Fetch complete Quran + Hadith data for QModel"
|
| 265 |
-
)
|
| 266 |
-
parser.add_argument(
|
| 267 |
-
"--out-dir", default="./data",
|
| 268 |
-
help="Output directory (default: ./data)"
|
| 269 |
-
)
|
| 270 |
-
parser.add_argument(
|
| 271 |
-
"--hadith-limit", type=int, default=None,
|
| 272 |
-
help="Max hadiths per collection (omit = all, ~50k total)"
|
| 273 |
-
)
|
| 274 |
-
parser.add_argument("--quran-only", action="store_true", help="Skip hadith")
|
| 275 |
-
parser.add_argument("--hadith-only", action="store_true", help="Skip quran")
|
| 276 |
-
args = parser.parse_args()
|
| 277 |
-
|
| 278 |
-
out = Path(args.out_dir)
|
| 279 |
-
out.mkdir(parents=True, exist_ok=True)
|
| 280 |
-
|
| 281 |
-
if not args.hadith_only:
|
| 282 |
-
quran = fetch_quran()
|
| 283 |
-
save(out / "quran.json", quran)
|
| 284 |
-
|
| 285 |
-
if not args.quran_only:
|
| 286 |
-
hadiths = fetch_all_hadiths(limit_per_collection=args.hadith_limit)
|
| 287 |
-
save(out / "hadith.json", hadiths)
|
| 288 |
-
|
| 289 |
-
print("\n🎉 Done! Output:", out.resolve())
|
| 290 |
-
print(" Next: python build_index.py")
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
if __name__ == "__main__":
|
| 294 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|