aelgendy committed on
Commit
29d804b
·
1 Parent(s): ef44157

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. .docker +0 -17
  2. data/hadith.json +0 -62
  3. data/quran.json +0 -50
  4. enrich_dataset.py +0 -210
  5. fetch_data.py +0 -294
.docker DELETED
@@ -1,17 +0,0 @@
1
# Use an official Python runtime as a parent image
FROM python:3

# Set the working directory in the container to /app
WORKDIR /app

# Install dependencies before copying the application code so this layer is
# reused from the build cache whenever only source files change.
COPY requirements.txt /app/requirements.txt

# Install any needed packages specified in requirements.txt
# (--no-cache-dir keeps pip's download cache out of the image)
RUN pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt

# Copy the rest of the current directory contents into the container at /app
COPY . /app

# Make port 80 available to the world outside this container
EXPOSE 80

# Run main.py when the container launches
CMD ["python", "main.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/hadith.json DELETED
@@ -1,62 +0,0 @@
1
- [
2
- {
3
- "id": "bukhari_1",
4
- "arabic": "إِنَّمَا الْأَعْمَالُ بِالنِّيَّاتِ...",
5
- "english": "Actions are judged by intentions...",
6
- "reference": "Sahih al-Bukhari 1"
7
- },
8
- {
9
- "id": "bukhari_8",
10
- "arabic": "بُنِيَ الإِسْلامُ عَلَى خَمْسٍ...",
11
- "english": "Islam is built upon five [pillars]...",
12
- "reference": "Sahih al-Bukhari 8"
13
- },
14
- {
15
- "id": "muslim_1",
16
- "arabic": "الإِيمَانُ أَنْ تُؤْمِنَ بِاللَّهِ وَمَلائِكَتِهِ...",
17
- "english": "Faith is to believe in Allah, His angels...",
18
- "reference": "Sahih Muslim 1"
19
- },
20
- {
21
- "id": "muslim_1907",
22
- "arabic": "مَنْ صَامَ رَمَضَانَ إِيمَانًا وَاحْتِسَابًا...",
23
- "english": "Whoever fasts Ramadan with faith and seeking reward...",
24
- "reference": "Sahih Muslim 1907"
25
- },
26
- {
27
- "id": "ahmad_3784",
28
- "arabic": "بَدَأَ الإِسْلاَمُ غَرِيبًا وَسَيَعُودُ كَمَا بَدَأَ غَرِيبًا فَطُوبَى لِلْغُرَبَاءِ",
29
- "english": "Islam began as something strange and will revert to being strange as it began, so give glad tidings to the strangers.",
30
- "reference": "Musnad Ahmad 3784"
31
- },
32
- {
33
- "id": "ahmad_2107",
34
- "arabic": "أَحَبُّ الدِّينِ إِلَى اللَّهِ الْحَنِيفِيَّةُ السَّمْحَةُ",
35
- "english": "The most beloved of religions to Allah is the easy monotheism (Hanifiyyah).",
36
- "reference": "Musnad Ahmad 2107"
37
- },
38
- {
39
- "id": "ahmad_8030",
40
- "arabic": "الْمُؤْمِنُ مِرْآةُ أَخِيهِ",
41
- "english": "The believer is a mirror for his brother.",
42
- "reference": "Musnad Ahmad 8030"
43
- },
44
- {
45
- "id": "bukhari_6018",
46
- "arabic": "مَنْ كَانَ يُؤْمِنُ بِاللَّهِ وَالْيَوْمِ الآخِرِ فَلْيُحْسِنْ إِلَى جَارِهِ...",
47
- "english": "Whoever believes in Allah and the Last Day should be kind to his neighbor...",
48
- "reference": "Sahih al-Bukhari 6018"
49
- },
50
- {
51
- "id": "tirmidhi_2003",
52
- "arabic": "أَكْمَلُ الْمُؤْمِنِينَ إِيمَانًا أَحْسَنُهُمْ خُلُقًا...",
53
- "english": "The most complete of believers in faith are those with the best character...",
54
- "reference": "Jami` at-Tirmidhi 2003"
55
- },
56
- {
57
- "id": "ibnmajah_224",
58
- "arabic": "طَلَبُ الْعِلْمِ فَرِيضَةٌ عَلَى كُلِّ مُسْلِمٍ...",
59
- "english": "Seeking knowledge is an obligation upon every muslim...",
60
- "reference": "Sunan Ibn Majah 224"
61
- }
62
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/quran.json DELETED
@@ -1,50 +0,0 @@
1
- [
2
- {
3
- "id": "2:153",
4
- "arabic": "يَا أَيُّهَا الَّذِينَ آمَنُوا اسْتَعِينُوا بِالصَّبْرِ وَالصَّلَاةِ...",
5
- "english": "O you who have believed, seek help through patience and prayer...",
6
- "source": "Surah Al-Baqarah 2:153"
7
- },
8
- {
9
- "id": "1:1-7",
10
- "arabic": "بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ... اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
11
- "english": "In the name of Allah, the Entirely Merciful, the Especially Merciful... Guide us to the straight path",
12
- "source": "Surah Al-Fatihah 1:1-7"
13
- },
14
- {
15
- "id": "2:255",
16
- "arabic": "اللَّهُ لَا إِلَهَ إِلَّا هُوَ الْحَيُّ الْقَيُّومُ...",
17
- "english": "Allah - there is no deity except Him, the Ever-Living, the Sustainer of [all] existence...",
18
- "source": "Surah Al-Baqarah 2:255 (Ayat al-Kursi)"
19
- },
20
- {
21
- "id": "112:1-4",
22
- "arabic": "قُلْ هُوَ اللَّهُ أَحَدٌ... وَلَمْ يَكُن لَّهُ كُفُوًا أَحَدٌ",
23
- "english": "Say, He is Allah, [who is] One... And there is none co-equal to Him.",
24
- "source": "Surah Al-Ikhlas 112:1-4"
25
- },
26
- {
27
- "id": "2:286",
28
- "arabic": "لَا يُكَلِّفُ اللَّهُ نَفْسًا إِلَّا وُسْعَهَا...",
29
- "english": "Allah does not charge a soul except [with that within] its capacity...",
30
- "source": "Surah Al-Baqarah 2:286"
31
- },
32
- {
33
- "id": "3:103",
34
- "arabic": "وَاعْتَصِمُوا بِحَبْلِ اللَّهِ جَمِيعًا وَلَا تَفَرَّقُوا...",
35
- "english": "And hold firmly to the rope of Allah all together and do not become divided...",
36
- "source": "Surah Al-Imran 3:103"
37
- },
38
- {
39
- "id": "5:8",
40
- "arabic": "يَا أَيُّهَا الَّذِينَ آمَنُوا كُونُوا قَوَّامِينَ لِلَّهِ شُهَدَاءَ بِالْقِسْطِ...",
41
- "english": "O you who have believed, be persistently standing firm for Allah, witnesses in justice...",
42
- "source": "Surah Al-Ma'idah 5:8"
43
- },
44
- {
45
- "id": "94:5",
46
- "arabic": "فَإِنَّ مَعَ الْعُسْرِ يُسْرًا",
47
- "english": "For indeed, with hardship [will be] ease.",
48
- "source": "Surah Ash-Sharh 94:5"
49
- }
50
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
enrich_dataset.py DELETED
@@ -1,210 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script to enrich the QModel dataset with hadith collections from GitHub.
4
- Fetches Musnad Ahmad and other major hadith collections from:
5
- https://github.com/AhmedBaset/hadith-json/tree/main/db/by_book/the_9_books
6
- """
7
-
8
- import json
9
- import requests
10
- from typing import Dict, List
11
- from collections import defaultdict
12
-
13
- # The 9 canonical hadith books
14
# One row per source file on GitHub:
# (filename, collection, id_prefix, grade, author)
_HADITH_BOOK_ROWS = (
    ("ahmed.json", "Musnad Ahmad", "ahmad", "Hasan/Sahih", "Imam Ahmad ibn Hanbal"),
    ("bukhari.json", "Sahih al-Bukhari", "bukhari", "Sahih", "Muhammad al-Bukhari"),
    ("muslim.json", "Sahih Muslim", "muslim", "Sahih", "Muslim ibn al-Hajjaj"),
    ("abudawud.json", "Sunan Abu Dawood", "abudawud", "Hasan", "Abu Dawood Sulaiman"),
    ("tirmidhi.json", "Jami' at-Tirmidhi", "tirmidhi", "Hasan", "Al-Tirmidhi"),
    ("ibnmajah.json", "Sunan Ibn Majah", "ibnmajah", "Hasan", "Ibn Majah al-Qazwini"),
    ("nasai.json", "Sunan an-Nasai", "nasai", "Sahih", "Ahmad al-Nasai"),
    ("malik.json", "Muwatta Malik", "malik", "Sahih", "Malik ibn Anas"),
    ("darimi.json", "Sunan al-Darimi", "darimi", "Hasan", "Al-Darimi"),
)

# Maps each book's filename to its settings dict; insertion order follows the
# rows above, which is the order the books are fetched in.
HADITH_BOOKS = {
    filename: {
        "collection": collection,
        "id_prefix": id_prefix,
        "grade": grade,
        "author": author,
    }
    for filename, collection, id_prefix, grade, author in _HADITH_BOOK_ROWS
}

# Directory on raw.githubusercontent.com that holds the per-book JSON files.
BASE_URL = "https://raw.githubusercontent.com/AhmedBaset/hadith-json/main/db/by_book/the_9_books"
72
-
73
-
74
def fetch_hadith_book(filename: str) -> Dict:
    """Download one hadith book from GitHub and return its decoded JSON.

    ``filename`` is appended to ``BASE_URL``. The request uses a 30-second
    timeout, and ``raise_for_status()`` raises for a non-success HTTP status.
    """
    book_url = f"{BASE_URL}/{filename}"
    print(f"Fetching {filename}...")
    reply = requests.get(book_url, timeout=30)
    reply.raise_for_status()
    return reply.json()
81
-
82
-
83
def transform_hadith(hadith: Dict, book_config: Dict, book_data: Dict) -> Dict:
    """Transform hadith from GitHub format to our metadata format.

    Args:
        hadith: One entry of the book's ``"hadiths"`` list. Reads ``idInBook``
            (falling back to ``id``), ``arabic``, ``english`` and ``chapterId``.
        book_config: The matching ``HADITH_BOOKS`` entry; ``collection``,
            ``id_prefix`` and ``author`` are required keys.
        book_data: The whole book document; its ``"chapters"`` list is
            searched for the chapter whose ``id`` equals ``chapterId``.

    Returns:
        A metadata record with ``grade`` left empty and ``type`` set to
        ``"hadith"``.

    Raises:
        KeyError: If ``book_config`` lacks one of the required keys.
    """
    # Find chapter name if available (first chapter with a matching id wins)
    chapter_name = ""
    if "chapterId" in hadith:
        chapter_id = hadith.get("chapterId")
        chapter_name = next(
            (
                chapter.get("arabic", "")
                for chapter in book_data.get("chapters", [])
                if chapter.get("id") == chapter_id
            ),
            "",
        )

    # Build the reference string
    hadith_num = hadith.get("idInBook", hadith.get("id", ""))
    collection = book_config["collection"]
    reference = f"{collection} {hadith_num}"

    # Combine narrator and text for English
    raw_english = hadith.get("english")
    if isinstance(raw_english, dict):
        english = " ".join(
            part
            for part in (raw_english.get("narrator"), raw_english.get("text"))
            if part
        )
    elif raw_english is None:
        # An explicit null must not be stringified into the text "None".
        english = ""
    else:
        english = str(raw_english)

    return {
        "id": f"{book_config['id_prefix']}_{hadith_num}",
        # A null Arabic text is normalised to "" so the field is always a string.
        "arabic": hadith.get("arabic") or "",
        "english": english,
        "reference": reference,
        "hadith_number": hadith_num,
        "collection": collection,
        "chapter": chapter_name,
        "grade": "",  # Will be inferred by main.py's infer_hadith_grade()
        "type": "hadith",
        "author": book_config["author"],
    }
121
-
122
-
123
def load_existing_metadata(filepath: str) -> List[Dict]:
    """Read the UTF-8 JSON file at ``filepath`` and return its parsed content.

    Prints a progress line before reading. Errors from opening the file or
    decoding the JSON propagate to the caller.
    """
    print(f"Loading existing metadata from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
128
-
129
-
130
def save_enriched_metadata(filepath: str, data: List[Dict], stats: Dict) -> None:
    """Save enriched metadata to file and print a per-collection summary.

    The JSON is written to a sibling ``<filepath>.tmp`` file and then moved
    over ``filepath`` with ``os.replace``. If serialization or the write
    fails, any existing file at ``filepath`` is left untouched rather than
    truncated.

    Args:
        filepath: Destination path of the metadata JSON file.
        data: Records to serialize (UTF-8, non-ASCII kept, 2-space indent).
        stats: Mapping of collection name to document count, used only for
            the printed summary.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    import os  # local import so the block does not depend on file-level imports

    print(f"Saving enriched metadata to {filepath}...")
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, filepath)
    except BaseException:
        # Drop the partial temp file; the original error is what matters,
        # so a failure to clean up is deliberately ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    print("\n" + "=" * 60)
    print("Dataset Enrichment Summary")
    print("=" * 60)
    print(f"Total documents: {len(data)}")
    print("\nBreakdown by collection:")
    for collection, count in sorted(stats.items()):
        print(f" {collection}: {count}")
    print("=" * 60)
144
-
145
-
146
def main(metadata_path: str = "/Users/elgendy/Projects/QModel/metadata.json") -> None:
    """Main enrichment process.

    Loads the existing metadata, fetches every book in ``HADITH_BOOKS``,
    appends hadiths whose ids are not present yet, and writes the merged
    list back to ``metadata_path``.

    Args:
        metadata_path: Metadata JSON file to read and overwrite. The default
            keeps the previous hard-coded location.
    """
    # Load existing metadata
    existing_data = load_existing_metadata(metadata_path)

    # Track which existing hadiths we have
    existing_ids = {item["id"] for item in existing_data if item.get("type") == "hadith"}
    print(f"Existing hadith entries: {len(existing_ids)}")

    # New hadiths to add
    new_hadiths = []
    stats = defaultdict(int)

    # Count existing documents per collection
    for item in existing_data:
        if item.get("type") == "quran":
            stats["Quran"] += 1
        elif item.get("type") == "hadith":
            stats[item.get("collection", "Unknown")] += 1

    # Fetch and process each hadith book
    for filename, book_config in HADITH_BOOKS.items():
        # Stage one book's records and commit them only if the whole book
        # processed cleanly, so the saved data and the printed stats agree.
        staged = []
        staged_ids = set()
        skipped = 0
        try:
            book_data = fetch_hadith_book(filename)
            for hadith in book_data.get("hadiths", []):
                # Transform to our format
                transformed = transform_hadith(hadith, book_config, book_data)
                hadith_id = transformed["id"]

                # Check if we already have this hadith
                if hadith_id in existing_ids or hadith_id in staged_ids:
                    skipped += 1
                    continue

                staged.append(transformed)
                staged_ids.add(hadith_id)
        except Exception as e:
            # Best effort: one failing book must not abort the others.
            print(f" ✗ Error fetching {filename}: {e}")
            continue

        new_hadiths.extend(staged)
        existing_ids |= staged_ids
        stats[book_config["collection"]] += len(staged)

        print(f" ✓ {filename}: {len(staged)} new hadiths added, {skipped} already exist")

    # Merge with existing data
    enriched_data = existing_data + new_hadiths

    print(f"\nTotal new hadiths added: {len(new_hadiths)}")
    print(f"Total documents after enrichment: {len(enriched_data)}")

    # Save enriched metadata
    save_enriched_metadata(metadata_path, enriched_data, stats)


if __name__ == "__main__":
    import sys

    # Optional first CLI argument overrides the metadata path.
    main(*sys.argv[1:2])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fetch_data.py DELETED
@@ -1,294 +0,0 @@
1
- """
2
- fetch_data.py — QModel Full Data Fetcher
3
- =========================================
4
- Fetches the COMPLETE Quran (6,236 verses, all 114 surahs) from risan/quran-json
5
- via jsDelivr CDN, using the per-chapter endpoint which contains both Arabic text
6
- AND English translation (Saheeh International) in a single request per surah.
7
-
8
- Also fetches major Hadith collections from fawazahmed0/hadith-api.
9
-
10
- Output files are drop-in replacements for quran.json / hadith.json and are
11
- fully compatible with build_index.py and main.py.
12
-
13
- Schema produced:
14
- quran.json → [{ "id": "2:1", "arabic": "...", "english": "...",
15
- "source": "Surah Al-Baqarah 2:1",
16
- "surah_number": 2, "surah_name_en": "Al-Baqarah",
17
- "surah_name_ar": "البقرة", "verse_number": 1,
18
- "transliteration": "..." }, ...]
19
-
20
- hadith.json → [{ "id": "bukhari_1", "arabic": "...", "english": "...",
21
- "reference": "Sahih al-Bukhari 1",
22
- "hadith_number": 1, "collection": "Sahih al-Bukhari",
23
- "grade": "Sahih" }, ...]
24
-
25
- Usage:
26
- pip install requests
27
- python fetch_data.py # full download
28
- python fetch_data.py --out-dir ./data # custom output dir
29
- python fetch_data.py --hadith-limit 500 # quick test run
30
- python fetch_data.py --quran-only # skip hadith
31
- python fetch_data.py --hadith-only # skip quran
32
- """
33
-
34
- from __future__ import annotations
35
-
36
- import argparse
37
- import json
38
- import sys
39
- import time
40
- from pathlib import Path
41
- from typing import Optional
42
-
43
- try:
44
- import requests
45
- except ImportError:
46
- sys.exit("❌ Install requests first: pip install requests")
47
-
48
# ── CDN roots ─────────────────────────────────────────────────────────────────
# risan/quran-json: per-chapter endpoint has BOTH arabic + english translation
# Format: { "id": 1, "name": "Al-Fatihah", "transliteration": "...",
#           "type": "meccan", "total_verses": 7,
#           "verses": [ { "id": 1, "text": "<arabic>",
#                         "translation": "<saheeh-international>",
#                         "transliteration": "..." }, ... ] }
QURAN_CHAPTER_URL = "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{n}.json"

# fawazahmed0/hadith-api: full-book JSON per edition
# Format: { "metadata": {...},
#           "hadiths": [ { "hadithnumber": 1, "text": "...",
#                          "grades": [{"grade": "Sahih", ...}] }, ... ] }
HADITH_CDN = "https://cdn.jsdelivr.net/gh/fawazahmed0/hadith-api@1/editions"

# ── Surah metadata ─────────────────────────────────────────────────────────────
# Arabic surah names keyed by surah number (1-114).
SURAH_AR = {
    1: "الفاتحة", 2: "البقرة", 3: "آل عمران", 4: "النساء", 5: "المائدة",
    6: "الأنعام", 7: "الأعراف", 8: "الأنفال", 9: "التوبة", 10: "يونس",
    11: "هود", 12: "يوسف", 13: "الرعد", 14: "إبراهيم", 15: "الحجر",
    16: "النحل", 17: "الإسراء", 18: "الكهف", 19: "مريم", 20: "طه",
    21: "الأنبياء", 22: "الحج", 23: "المؤمنون", 24: "النور", 25: "الفرقان",
    26: "الشعراء", 27: "النمل", 28: "القصص", 29: "العنكبوت", 30: "الروم",
    31: "لقمان", 32: "السجدة", 33: "الأحزاب", 34: "سبأ", 35: "فاطر",
    36: "يس", 37: "الصافات", 38: "ص", 39: "الزمر", 40: "غافر",
    41: "فصلت", 42: "الشورى", 43: "الزخرف", 44: "الدخان", 45: "الجاثية",
    46: "الأحقاف", 47: "محمد", 48: "الفتح", 49: "الحجرات", 50: "ق",
    51: "الذاريات", 52: "الطور", 53: "النجم", 54: "القمر", 55: "الرحمن",
    56: "الواقعة", 57: "الحديد", 58: "المجادلة", 59: "الحشر", 60: "الممتحنة",
    61: "الصف", 62: "الجمعة", 63: "المنافقون", 64: "التغابن", 65: "الطلاق",
    66: "التحريم", 67: "الملك", 68: "القلم", 69: "الحاقة", 70: "المعارج",
    71: "نوح", 72: "الجن", 73: "المزمل", 74: "المدثر", 75: "القيامة",
    76: "الإنسان", 77: "المرسلات", 78: "النبأ", 79: "النازعات", 80: "عبس",
    81: "التكوير", 82: "الانفطار", 83: "المطففين", 84: "الانشقاق", 85: "البروج",
    86: "الطارق", 87: "الأعلى", 88: "الغاشية", 89: "الفجر", 90: "البلد",
    91: "الشمس", 92: "الليل", 93: "الضحى", 94: "الشرح", 95: "التين",
    96: "العلق", 97: "القدر", 98: "البينة", 99: "الزلزلة", 100: "العاديات",
    101: "القارعة", 102: "التكاثر", 103: "العصر", 104: "الهمزة", 105: "الفيل",
    106: "قريش", 107: "الماعون", 108: "الكوثر", 109: "الكافرون", 110: "النصر",
    # 113 previously held mis-encoded bytes; restored to Al-Falaq.
    111: "المسد", 112: "الإخلاص", 113: "الفلق", 114: "الناس",
}

# ── Hadith collections ─────────────────────────────────────────────────────────
# (arabic_edition, english_edition, human_label, id_prefix)
HADITH_EDITIONS = [
    ("ara-bukhari", "eng-bukhari", "Sahih al-Bukhari", "bukhari"),
    ("ara-muslim", "eng-muslim", "Sahih Muslim", "muslim"),
    ("ara-abudawud", "eng-abudawud", "Sunan Abu Dawud", "abudawud"),
    ("ara-tirmidhi", "eng-tirmidhi", "Jami' at-Tirmidhi", "tirmidhi"),
    ("ara-ibnmajah", "eng-ibnmajah", "Sunan Ibn Majah", "ibnmajah"),
    ("ara-nasai", "eng-nasai", "Sunan an-Nasa'i", "nasai"),
    ("ara-malik", "eng-malik", "Muwatta Malik", "malik"),
]
101
-
102
-
103
- # ── HTTP helper ────────────────────────────────────────────────────────────────
104
def get_json(url: str, retries: int = 4, backoff: float = 2.0) -> Optional[dict | list]:
    """GET ``url`` and return its decoded JSON, retrying on any failure.

    Up to ``retries`` attempts are made, each with a 60-second timeout. After
    failed attempt ``k`` (except the last) the function sleeps
    ``backoff * k`` seconds. Every failure is printed; ``None`` is returned
    once all attempts are exhausted.
    """
    tries_done = 0
    while tries_done < retries:
        tries_done += 1
        try:
            reply = requests.get(url, timeout=60)
            reply.raise_for_status()
            payload = reply.json()
        except Exception as exc:
            print(f" ⚠️ Attempt {tries_done}/{retries}: {exc}")
            if tries_done < retries:
                time.sleep(backoff * tries_done)
            continue
        return payload
    return None
115
-
116
-
117
def save(path: Path, data: list) -> None:
    """Write ``data`` to ``path`` as indented UTF-8 JSON and print a summary.

    Missing parent directories are created first. The printed line reports
    the path, the record count and the file size in KB.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
    size_kb = path.stat().st_size / 1024
    record_count = len(data)
    print(f" 💾 {path} — {record_count:,} records ({size_kb:,.0f} KB)")
122
-
123
-
124
- # ── Quran ──────────────────────────────────────────────────────────────────────
125
def fetch_quran() -> list:
    """
    Fetch every surah (1-114) and flatten the verses into one record list.

    Uses the risan/quran-json per-chapter English endpoint:
    cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{N}.json

    Fields read from each chapter file: ``name``, ``transliteration`` and
    ``verses``; from each verse: ``id``, ``text`` (Arabic), ``translation``
    (English) and ``transliteration``.

    NOTE(review): the upstream README appears to put the Arabic title in
    ``name`` and the Latin title in ``transliteration`` — confirm against a
    live response. The Latin field is therefore preferred for
    ``surah_name_en``, with ``name`` as fallback.

    Returns:
        One dict per verse with the core fields (``id``, ``arabic``,
        ``english``, ``source``) plus surah/verse metadata. Surahs that fail
        to download are skipped and reported; verses lacking an id or Arabic
        text are dropped.
    """
    print("\n📖 Fetching full Quran (114 surahs, 6,236 verses) …")
    records = []
    failed = []

    for n in range(1, 115):
        data = get_json(QURAN_CHAPTER_URL.format(n=n))

        # get_json may return a list; only a dict has the fields read below.
        if not data or not isinstance(data, dict):
            print(f" ❌ Surah {n} — failed, skipping")
            failed.append(n)
            continue

        surah_name_en = data.get("transliteration") or data.get("name") or f"Surah {n}"
        surah_name_ar = SURAH_AR.get(n, "")

        for v in data.get("verses") or []:
            # "or 0" tolerates an explicit null id instead of crashing int().
            vid = int(v.get("id") or 0)
            arabic = (v.get("text") or "").strip()
            english = (v.get("translation") or "").strip()
            translit = (v.get("transliteration") or "").strip()

            if not vid or not arabic:
                continue

            records.append({
                # ── core fields (required by main.py / build_index.py) ──
                "id": f"{n}:{vid}",
                "arabic": arabic,
                "english": english,
                "source": f"Surah {surah_name_en} {n}:{vid}",
                # ── enriched metadata ──
                "surah_number": n,
                "surah_name_en": surah_name_en,
                "surah_name_ar": surah_name_ar,
                "verse_number": vid,
                "transliteration": translit,
            })

        # Brief progress every 10 surahs
        if n % 10 == 0 or n == 114:
            print(f" ✔ Surahs 1–{n} fetched ({len(records):,} verses so far)")
        time.sleep(0.15)  # be polite to the CDN

    if failed:
        print(f"\n ⚠️ {len(failed)} surahs failed: {failed}")
    print(f"\n ✅ Quran complete — {len(records):,} verses")
    return records
199
-
200
-
201
- # ── Hadith ─────────────────────────────────────────────────────────────────────
202
def fetch_hadith_edition(
    ar_edition: str, en_edition: str,
    label: str, prefix: str,
    limit: Optional[int],
) -> list:
    """Fetch one collection and join its Arabic and English editions.

    The Arabic edition drives the output; English text is looked up by
    ``hadithnumber`` and is empty when missing. The English edition is only
    downloaded once the Arabic one is known to be available, so an
    unavailable collection costs a single failed download.

    Args:
        ar_edition: Arabic edition name under ``HADITH_CDN``.
        en_edition: English edition name under ``HADITH_CDN``.
        label: Human-readable collection name, used in ``reference`` and
            ``collection``.
        prefix: Prefix for record ids (``<prefix>_<number>``).
        limit: Stop after this many records; ``None`` or ``0`` means no limit.

    Returns:
        A list of hadith records, or ``[]`` if the Arabic edition is missing.
    """
    ar_data = get_json(f"{HADITH_CDN}/{ar_edition}.json")
    if not ar_data:
        print(f" ❌ {label} Arabic — unavailable, skipping")
        return []

    en_data = get_json(f"{HADITH_CDN}/{en_edition}.json")

    # hadith number -> English text (later duplicates overwrite earlier ones)
    en_lookup = {
        int(h["hadithnumber"]): (h.get("text") or "")
        for h in (en_data or {}).get("hadiths", [])
        if "hadithnumber" in h
    }

    records = []
    for h in ar_data.get("hadiths", []):
        num = h.get("hadithnumber")
        arabic = (h.get("text") or "").strip()
        if not num or not arabic:
            continue
        num = int(num)
        english = en_lookup.get(num, "").strip()
        grades = h.get("grades") or []
        # Only the first listed grade is kept.
        grade = grades[0].get("grade", "") if grades else ""

        records.append({
            # ── core fields ──
            "id": f"{prefix}_{num}",
            "arabic": arabic,
            "english": english,
            "reference": f"{label} {num}",
            # ── enriched metadata ──
            "hadith_number": num,
            "collection": label,
            "grade": grade,
        })
        if limit and len(records) >= limit:
            break

    print(f" ✅ {label}: {len(records):,} hadiths")
    return records
247
-
248
-
249
def fetch_all_hadiths(limit_per_collection: Optional[int] = None) -> list:
    """Fetch every collection in ``HADITH_EDITIONS`` and concatenate the records.

    ``limit_per_collection`` is passed to ``fetch_hadith_edition`` for each
    collection. A 0.5-second pause follows each collection.
    """
    print("\n📚 Fetching Hadith collections …")
    combined: list = []
    for edition in HADITH_EDITIONS:
        ar_ed, en_ed, label, prefix = edition
        print(f"\n → {label}")
        combined += fetch_hadith_edition(ar_ed, en_ed, label, prefix, limit_per_collection)
        time.sleep(0.5)
    total = len(combined)
    print(f"\n 📊 Total hadiths: {total:,}")
    return combined
259
-
260
-
261
- # ── CLI ────────────────────────────────────────────────────────────────────────
262
def main() -> None:
    """Parse the command line and write ``quran.json`` / ``hadith.json``.

    ``--quran-only`` and ``--hadith-only`` are mutually exclusive. Passing
    both used to skip every download while still reporting success; argparse
    now rejects that combination with a usage error.
    """
    parser = argparse.ArgumentParser(
        description="Fetch complete Quran + Hadith data for QModel"
    )
    parser.add_argument(
        "--out-dir", default="./data",
        help="Output directory (default: ./data)"
    )
    parser.add_argument(
        "--hadith-limit", type=int, default=None,
        help="Max hadiths per collection (omit = all, ~50k total)"
    )
    only = parser.add_mutually_exclusive_group()
    only.add_argument("--quran-only", action="store_true", help="Skip hadith")
    only.add_argument("--hadith-only", action="store_true", help="Skip quran")
    args = parser.parse_args()

    out = Path(args.out_dir)
    out.mkdir(parents=True, exist_ok=True)

    if not args.hadith_only:
        quran = fetch_quran()
        save(out / "quran.json", quran)

    if not args.quran_only:
        hadiths = fetch_all_hadiths(limit_per_collection=args.hadith_limit)
        save(out / "hadith.json", hadiths)

    print("\n🎉 Done! Output:", out.resolve())
    print(" Next: python build_index.py")


if __name__ == "__main__":
    main()