Spaces:
Running
Running
oceansweep
committed on
Upload 3 files
Browse files
App_Function_Libraries/MediaWiki/Media_Wiki.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Media_Wiki.py
|
2 |
+
# Description: This file contains the functions to import MediaWiki dumps into the media_db and Chroma databases.
|
3 |
+
#######################################################################################################################
|
4 |
+
#
|
5 |
+
# Imports
|
6 |
+
import json
|
7 |
+
import logging
|
8 |
+
import os
|
9 |
+
import re
|
10 |
+
from typing import List, Dict, Any, Iterator, Optional
|
11 |
+
# 3rd-Party Imports
|
12 |
+
import mwparserfromhell
|
13 |
+
import mwxml
|
14 |
+
import yaml
|
15 |
+
#
|
16 |
+
# Local Imports
|
17 |
+
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, check_media_exists
|
18 |
+
from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
|
19 |
+
#
|
20 |
+
#######################################################################################################################
|
21 |
+
#
|
22 |
+
# Functions:
|
23 |
+
|
24 |
+
def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
    """Set up and return a logger with the given name and level.

    The call is idempotent with respect to handlers: calling it again for the
    same logger name does not attach a second console handler, nor a second
    file handler for the same log file, so records are never emitted twice.

    Args:
        name: Name passed to ``logging.getLogger``.
        level: Logging level applied to the logger.
        log_file: Optional path of a file that should also receive the records.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    if log_file:
        # FileHandler stores the absolute path in ``baseFilename``.
        target = os.path.abspath(log_file)
        file_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            for handler in logger.handlers
        )
        if not file_attached:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

    # FileHandler subclasses StreamHandler, so compare the exact type to detect
    # an existing plain console handler.
    console_attached = any(type(handler) is logging.StreamHandler for handler in logger.handlers)
    if not console_attached:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger
|
40 |
+
|
41 |
+
# Usage
|
42 |
+
# Module-level logger used by the functions in this file. It is configured at import
# time with a console handler and a file handler writing to 'mediawiki_import.log'.
logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')
|
43 |
+
|
44 |
+
# Load configuration
|
45 |
+
def load_mediawiki_import_config():
    """Load the MediaWiki import settings from the YAML configuration file.

    The file is read from ``Config_Files/mediawiki_import_config.yaml`` relative
    to the current working directory. It is opened explicitly as UTF-8 so that
    parsing does not depend on the platform's locale encoding.

    Returns:
        The parsed YAML document as returned by ``yaml.safe_load``.

    Raises:
        OSError: If the configuration file cannot be opened.
    """
    config_path = os.path.join('Config_Files', 'mediawiki_import_config.yaml')
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
48 |
+
# Loaded once at import time; import_mediawiki_dump falls back to config['chunking']
# when the caller does not pass chunk_options.
config = load_mediawiki_import_config()
|
49 |
+
|
50 |
+
def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> Iterator[
        Dict[str, Any]]:
    """Stream the revisions of a MediaWiki XML dump as plain-text records.

    The dump file is opened inside a ``with`` block so the handle is closed as
    soon as the generator is exhausted or closed, instead of being leaked.

    Args:
        file_path: Path to the MediaWiki XML dump file.
        namespaces: Namespace IDs to include. If empty or None, all namespaces
            are included.
        skip_redirects: Whether to skip pages flagged as redirects.

    Yields:
        One dict per revision with the keys ``title``, ``content`` (wikitext
        with markup stripped), ``namespace``, ``page_id``, ``revision_id`` and
        ``timestamp``.
    """
    with open(file_path, encoding='utf-8') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump.pages:
            if skip_redirects and page.redirect:
                continue
            if namespaces and page.namespace not in namespaces:
                continue

            # Iterating a page yields its revisions.
            for revision in page:
                code = mwparserfromhell.parse(revision.text)
                text = code.strip_code(normalize=True, collapse=True, keep_template_params=False)
                yield {
                    "title": page.title,
                    "content": text,
                    "namespace": page.namespace,
                    "page_id": page.id,
                    "revision_id": revision.id,
                    "timestamp": revision.timestamp
                }
            logger.debug(f"Yielded page: {page.title}")
|
71 |
+
|
72 |
+
|
73 |
+
def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split text into chunks along ``== Heading ==`` section boundaries.

    Consecutive sections are packed into one chunk until adding the next
    section's body would push the accumulated body length past the size limit;
    a section is never split, so a single oversized section forms its own chunk.

    Args:
        text: Text to split. Text before the first heading is treated as a
            section titled "Introduction".
        chunk_options: Chunking settings. ``max_size`` is the body-length limit
            per chunk; if absent, ``default_size`` is used, else 1000.

    Returns:
        A list of ``{"text": ..., "metadata": {"section": ...}}`` dicts, where
        ``section`` is the title of the first section contained in the chunk.
        Each section is rendered as ``"\\n== <title> ==\\n<body>"``. Empty or
        whitespace-only text yields an empty list.
    """
    max_size = chunk_options.get('max_size', chunk_options.get('default_size', 1000))

    # With one capture group, re.split returns
    # [intro, title1, body1, title2, body2, ...]; headings at the very start of
    # the text are recognised too via \A.
    parts = re.split(r'(?:\A|\n)==\s*(.*?)\s*==\n', text)

    sections = []
    if parts[0].strip():
        sections.append(("Introduction", parts[0]))
    sections.extend(zip(parts[1::2], parts[2::2]))

    chunks = []
    current_parts = []
    current_size = 0
    chunk_title = None

    for section_title, section_content in sections:
        # Flush before adding a section that would overflow a non-empty chunk.
        if current_parts and current_size + len(section_content) > max_size:
            chunks.append({"text": "".join(current_parts), "metadata": {"section": chunk_title}})
            current_parts = []
            current_size = 0

        if not current_parts:
            chunk_title = section_title
        current_parts.append(f"\n== {section_title} ==\n" + section_content)
        current_size += len(section_content)

    if current_parts:
        chunks.append({"text": "".join(current_parts), "metadata": {"section": chunk_title}})

    return chunks
|
96 |
+
|
97 |
+
|
98 |
+
def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Dict[str, Any] = None):
    """Store one article (or a combined dump) in the media DB and index its chunks.

    If ``check_media_exists`` reports the title/URL as already present, the item
    is skipped. Otherwise the content is added via ``add_media_with_keywords``,
    split with ``optimized_chunking`` and every chunk is passed to
    ``process_and_store_content``.

    Errors are logged (with traceback) and swallowed so that one failing item
    does not abort a whole import.

    Args:
        content: Plain-text content of the item.
        title: Title of the item.
        wiki_name: Name of the wiki; used in the URL, keywords and collection name.
        chunk_options: Options forwarded to ``optimized_chunking``.
        is_combined: True when ``content`` represents a whole dump rather than
            a single article; changes the URL, media type and keywords.
        item: Optional record for the item. If it has a ``timestamp`` value with
            a ``strftime`` method, that is used as the ingestion date; otherwise
            no ingestion date is passed.
    """
    try:
        url = f"mediawiki:{wiki_name}" if is_combined else f"mediawiki:{wiki_name}:{title}"

        if check_media_exists(title, url):
            logger.info(f"Skipping existing article: {title}")
            return

        # A missing or non-datetime timestamp must not abort the whole item.
        timestamp = item.get('timestamp') if item else None
        ingestion_date = timestamp.strftime('%Y-%m-%d') if hasattr(timestamp, 'strftime') else None

        # NOTE(review): the return value is used directly as the media id below;
        # confirm against add_media_with_keywords' actual return type.
        media_id = add_media_with_keywords(
            url=url,
            title=title,
            media_type="mediawiki_dump" if is_combined else "mediawiki_article",
            content=content,
            keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
            prompt="",
            summary="",
            transcription_model="",
            author="MediaWiki",
            ingestion_date=ingestion_date
        )

        chunks = optimized_chunking(content, chunk_options)
        for chunk in chunks:
            process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
        logger.info(f"Successfully processed item: {title}")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error discarded.
        logger.exception(f"Error processing item {title}: {str(e)}")
|
125 |
+
|
126 |
+
|
127 |
+
def load_checkpoint(file_path: str) -> int:
    """Return the last processed page ID stored in a checkpoint file.

    Args:
        file_path: Path of the JSON checkpoint file.

    Returns:
        The value stored under ``last_processed_id``, or 0 when the checkpoint
        file does not exist.
    """
    if not os.path.exists(file_path):
        return 0

    with open(file_path, 'r') as checkpoint:
        state = json.load(checkpoint)
    return state['last_processed_id']
|
132 |
+
|
133 |
+
|
134 |
+
def save_checkpoint(file_path: str, last_processed_id: int):
    """Persist the last processed page ID to a JSON checkpoint file.

    The data is written to a temporary sibling file first and then moved over
    the target with ``os.replace``, so an interruption during the write cannot
    leave a truncated checkpoint behind.

    Args:
        file_path: Path of the checkpoint file to create or overwrite.
        last_processed_id: Page ID to record under ``last_processed_id``.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump({'last_processed_id': last_processed_id}, f)
    os.replace(tmp_path, file_path)
|
137 |
+
|
138 |
+
|
139 |
+
def import_mediawiki_dump(
        file_path: str,
        wiki_name: str,
        namespaces: List[int] = None,
        skip_redirects: bool = False,
        chunk_options: Dict[str, Any] = None,
        single_item: bool = False,
        progress_callback: Any = None
) -> Iterator[str]:
    """Import a MediaWiki XML dump, yielding human-readable status messages.

    Progress is checkpointed in ``<wiki_name>_import_checkpoint.json`` after
    every processed item; records whose page ID is not greater than the stored
    ID are skipped on the next run. The checkpoint is removed after a complete
    import.

    Args:
        file_path: Path to the MediaWiki XML dump file.
        wiki_name: Name of the wiki; used for URLs, keywords and the checkpoint
            file name.
        namespaces: Namespace IDs to include. If empty or None, include all.
        skip_redirects: Whether to skip redirect pages.
        chunk_options: Chunking options; defaults to ``config['chunking']``.
        single_item: Accepted for interface compatibility; not used here.
        progress_callback: Optional callable invoked as
            ``progress_callback(fraction, message)`` after each item.

    Yields:
        Status strings: the page count, one line per processed item, and a
        final success or error message. Errors are reported as messages rather
        than raised.
    """
    try:
        if chunk_options is None:
            chunk_options = config['chunking']

        checkpoint_file = f"{wiki_name}_import_checkpoint.json"
        last_processed_id = load_checkpoint(checkpoint_file)

        total_pages = count_pages(file_path, namespaces, skip_redirects)
        processed_pages = 0

        yield f"Found {total_pages} pages to process."

        for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
            # NOTE(review): resuming assumes page IDs appear in ascending order
            # in the dump - confirm for the dumps being imported.
            if item['page_id'] <= last_processed_id:
                continue
            process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
            save_checkpoint(checkpoint_file, item['page_id'])
            processed_pages += 1
            if progress_callback is not None:
                # count_pages returns 0 on failure and counts pages, while items
                # are per revision: avoid dividing by zero and cap at 100%.
                fraction = min(processed_pages / total_pages, 1.0) if total_pages else 0.0
                progress_callback(fraction, f"Processed page: {item['title']}")
            yield f"Processed page {processed_pages}/{total_pages}: {item['title']}"

        # No checkpoint exists when nothing was processed; removing it blindly
        # raised FileNotFoundError, which was then misreported as a missing dump.
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)  # Remove checkpoint file after successful import
        yield f"Successfully imported and indexed MediaWiki dump: {wiki_name}"
    except FileNotFoundError:
        logger.error(f"MediaWiki dump file not found: {file_path}")
        yield f"Error: File not found - {file_path}"
    except PermissionError:
        logger.error(f"Permission denied when trying to read: {file_path}")
        yield f"Error: Permission denied - {file_path}"
    except Exception as e:
        logger.exception(f"Error during MediaWiki import: {str(e)}")
        yield f"Error during import: {str(e)}"
|
181 |
+
|
182 |
+
def count_pages(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> int:
    """
    Count the number of pages in a MediaWiki XML dump file.

    The dump file is opened in a ``with`` block so the handle is closed once
    counting finishes, including when an error occurs.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        namespaces (List[int], optional): List of namespace IDs to include. If None, include all namespaces.
        skip_redirects (bool, optional): Whether to skip redirect pages.

    Returns:
        int: The number of matching pages in the dump file, or 0 if the dump
        could not be read (the error is logged, not raised).
    """
    try:
        with open(file_path, encoding='utf-8') as dump_file:
            dump = mwxml.Dump.from_file(dump_file)
            count = 0
            for page in dump.pages:
                if skip_redirects and page.redirect:
                    continue
                if namespaces and page.namespace not in namespaces:
                    continue
                count += 1
            return count
    except Exception as e:
        logger.error(f"Error counting pages in MediaWiki dump: {str(e)}")
        return 0
|
207 |
+
|
208 |
+
#
|
209 |
+
# End of Media_Wiki.py
|
210 |
+
#######################################################################################################################
|
App_Function_Libraries/MediaWiki/Media_Wiki_Tests.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Media_Wiki_Tests.py
|
2 |
+
# Description: Unit tests for the Media_Wiki module.
|
3 |
+
#
|
4 |
+
# Usage:
|
5 |
+
# pip install pytest pytest-asyncio
|
6 |
+
# pytest Media_Wiki_Tests.py
|
7 |
+
#
|
8 |
+
# Imports
|
9 |
+
import pytest
|
10 |
+
import asyncio
|
11 |
+
from unittest.mock import patch, MagicMock
|
12 |
+
# Local Imports
|
13 |
+
from Media_Wiki import parse_mediawiki_dump, optimized_chunking, process_single_item, import_mediawiki_dump, load_mediawiki_import_config
|
14 |
+
#
|
15 |
+
# #######################################################################################################################
|
16 |
+
#
|
17 |
+
# Functions:
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
@pytest.fixture(scope="module")
def event_loop():
    """Provide one asyncio event loop per test module and close it afterwards."""
    policy = asyncio.get_event_loop_policy()
    module_loop = policy.new_event_loop()
    yield module_loop
    module_loop.close()
|
26 |
+
|
27 |
+
@pytest.fixture
def mock_mwxml_dump():
    """Build a mock dump containing one non-redirect page with one revision."""
    mock_revision = MagicMock()
    mock_revision.id = 1
    mock_revision.timestamp = "2021-01-01T00:00:00Z"
    mock_revision.text = "Test content"

    mock_page = MagicMock()
    mock_page.title = "Test Page"
    mock_page.namespace = 0
    mock_page.id = 1
    mock_page.redirect = None
    mock_page.revisions = [mock_revision]
    # parse_mediawiki_dump iterates the page object itself, and a MagicMock
    # iterates as empty by default, so the revisions must be wired to __iter__.
    mock_page.__iter__.return_value = [mock_revision]

    mock_dump = MagicMock()
    mock_dump.pages = [mock_page]
    return mock_dump
|
41 |
+
|
42 |
+
def test_parse_mediawiki_dump(mock_mwxml_dump):
    """parse_mediawiki_dump yields one record per revision of the mocked page."""
    # The parser iterates the page object itself, so expose the revisions
    # through __iter__ (a MagicMock iterates as empty by default).
    page = mock_mwxml_dump.pages[0]
    page.__iter__.return_value = list(page.revisions)

    # builtins.open is patched because the parser opens the path before handing
    # the file object to mwxml; "dummy_path" does not exist on disk.
    with patch('builtins.open', MagicMock()), \
            patch('mwxml.Dump.from_file', return_value=mock_mwxml_dump), \
            patch('mwparserfromhell.parse') as mock_parse:
        mock_parse.return_value.strip_code.return_value = "Stripped content"
        result = list(parse_mediawiki_dump("dummy_path"))

    assert len(result) == 1
    assert result[0]['title'] == "Test Page"
    assert result[0]['content'] == "Stripped content"
    assert result[0]['namespace'] == 0
    assert result[0]['page_id'] == 1
    assert result[0]['revision_id'] == 1
|
53 |
+
|
54 |
+
def test_optimized_chunking():
    """Sections that do not fit together within max_size land in separate chunks."""
    test_text = "== Section 1 ==\nContent 1\n== Section 2 ==\nContent 2"
    # Each section body is 9 characters long, so a limit of 10 forces a split.
    chunk_options = {'max_size': 10}
    result = optimized_chunking(test_text, chunk_options)
    assert len(result) == 2
    assert "Content 1" in result[0]['text']
    assert "Content 2" not in result[0]['text']
    assert "Content 2" in result[1]['text']
    assert all('metadata' in chunk and 'section' in chunk['metadata'] for chunk in result)
|
62 |
+
|
63 |
+
def test_process_single_item():
    """A new item is added to the media DB and its chunks are stored."""
    # process_single_item is a plain synchronous function, so it is called
    # directly rather than awaited.
    with patch('Media_Wiki.check_media_exists', return_value=False), \
            patch('Media_Wiki.add_media_with_keywords', return_value=1) as mock_add, \
            patch('Media_Wiki.process_and_store_content') as mock_process_store:
        process_single_item("Test content", "Test Title", "TestWiki", {'max_size': 100})
        mock_add.assert_called_once()
        mock_process_store.assert_called()
        # Add more detailed assertions here
|
71 |
+
|
72 |
+
def test_import_mediawiki_dump():
    """A successful import processes each item and ends with a success message."""
    # import_mediawiki_dump is a synchronous generator of status strings, so it
    # is consumed with list() rather than awaited.
    with patch('Media_Wiki.parse_mediawiki_dump') as mock_parse, \
            patch('Media_Wiki.process_single_item') as mock_process, \
            patch('Media_Wiki.count_pages', return_value=1), \
            patch('Media_Wiki.load_checkpoint', return_value=0), \
            patch('Media_Wiki.save_checkpoint'), \
            patch('os.remove'):
        mock_parse.return_value = [{'page_id': 1, 'title': 'Test', 'content': 'Content'}]
        messages = list(import_mediawiki_dump("dummy_path", "TestWiki"))
        assert any("Successfully imported" in message for message in messages)
        mock_process.assert_called_once()
|
83 |
+
|
84 |
+
def test_import_mediawiki_dump_file_not_found():
    """A missing dump file is reported as an error message, not raised."""
    # The importer is a synchronous generator; asyncio.run() only accepts
    # coroutines, so the messages are collected with list() instead.
    with patch('Media_Wiki.parse_mediawiki_dump', side_effect=FileNotFoundError), \
            patch('Media_Wiki.count_pages', return_value=0), \
            patch('Media_Wiki.load_checkpoint', return_value=0):
        messages = list(import_mediawiki_dump("non_existent_path", "TestWiki"))
        assert any("Error: File not found" in message for message in messages)
|
88 |
+
|
89 |
+
def test_load_mediawiki_import_config():
    """The loader returns whatever yaml.safe_load produces for the config file."""
    expected = {'test_key': 'test_value'}
    with patch('builtins.open', MagicMock()), patch('yaml.safe_load', return_value=expected):
        loaded = load_mediawiki_import_config()
        assert 'test_key' in loaded
        assert loaded['test_key'] == 'test_value'
|
App_Function_Libraries/MediaWiki/mediawiki_import_config.yaml
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MediaWiki Import Configuration
|
2 |
+
|
3 |
+
# Database settings
|
4 |
+
database:
|
5 |
+
sqlite_path: './Databases/media_summary.db'
|
6 |
+
chroma_db_path: 'chroma_db'
|
7 |
+
|
8 |
+
# Chunking options
|
9 |
+
chunking:
|
10 |
+
default_method: 'sentences'
|
11 |
+
default_size: 1000
|
12 |
+
default_overlap: 100
|
13 |
+
adaptive: true
|
14 |
+
language: 'en'
|
15 |
+
methods:
|
16 |
+
- 'sentences'
|
17 |
+
- 'words'
|
18 |
+
- 'paragraphs'
|
19 |
+
- 'tokens'
|
20 |
+
|
21 |
+
# Import settings
|
22 |
+
import:
|
23 |
+
batch_size: 1000 # Number of pages to process in a single batch
|
24 |
+
default_skip_redirects: true
|
25 |
+
default_namespaces: [0] # Main namespace by default
|
26 |
+
single_item_default: false
|
27 |
+
|
28 |
+
# Processing options
|
29 |
+
processing:
|
30 |
+
max_workers: 4 # Number of worker threads for async processing
|
31 |
+
|
32 |
+
# Embedding settings
|
33 |
+
embeddings:
|
34 |
+
provider: 'openai' # or 'local' or 'huggingface'
|
35 |
+
model: 'text-embedding-ada-002'
|
36 |
+
api_key: 'your_openai_api_key_here' # Remove if using local embeddings
|
37 |
+
local_url: 'http://localhost:8080/embeddings' # Only for local embeddings
|
38 |
+
|
39 |
+
# ChromaDB settings
|
40 |
+
chromadb:
|
41 |
+
collection_prefix: 'mediawiki_'
|
42 |
+
|
43 |
+
# Logging settings
|
44 |
+
logging:
|
45 |
+
level: 'INFO'
|
46 |
+
file: 'mediawiki_import.log'
|
47 |
+
|
48 |
+
# Checkpoint settings
|
49 |
+
checkpoints:
|
50 |
+
enabled: true
|
51 |
+
directory: 'import_checkpoints'
|
52 |
+
|
53 |
+
# Error handling
|
54 |
+
error_handling:
|
55 |
+
max_retries: 3
|
56 |
+
retry_delay: 5 # seconds
|
57 |
+
|
58 |
+
# User interface settings
|
59 |
+
ui:
|
60 |
+
default_chunk_size: 1000
|
61 |
+
min_chunk_size: 100
|
62 |
+
max_chunk_size: 2000
|
63 |
+
default_chunk_overlap: 100
|