oceansweep committed on
Commit
32b7e17
·
verified ·
1 Parent(s): f160905

Upload 3 files

Browse files
App_Function_Libraries/MediaWiki/Media_Wiki.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Media_Wiki.py
2
+ # Description: This file contains the functions to import MediaWiki dumps into the media_db and Chroma databases.
3
+ #######################################################################################################################
4
+ #
5
+ # Imports
6
+ import json
7
+ import logging
8
+ import os
9
+ import re
10
+ from typing import List, Dict, Any, Iterator, Optional
11
+ # 3rd-Party Imports
12
+ import mwparserfromhell
13
+ import mwxml
14
+ import yaml
15
+ #
16
+ # Local Imports
17
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, check_media_exists
18
+ from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
19
+ #
20
+ #######################################################################################################################
21
+ #
22
+ # Functions:
23
+
24
def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
    """Set up and return a logger with the given name and level.

    The function is idempotent: calling it again for the same ``name`` does not
    attach a second console handler, nor a second file handler for the same
    ``log_file``, so each record is emitted only once per destination.

    Args:
        name (str): Name passed to ``logging.getLogger``.
        level (int, optional): Threshold set on the logger itself.
        log_file (str, optional): When given, records are also written to this file.

    Returns:
        logging.Logger: The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    if log_file:
        # FileHandler stores the absolute path in baseFilename, so compare against that.
        target = os.path.abspath(log_file)
        file_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            for handler in logger.handlers
        )
        if not file_attached:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

    # FileHandler subclasses StreamHandler, so it has to be excluded explicitly
    # when looking for an existing console handler.
    console_attached = any(
        isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler)
        for handler in logger.handlers
    )
    if not console_attached:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger
40
+
41
# Module-level logger used by the functions in this file; records go to the
# console and to 'mediawiki_import.log'.
logger = setup_logger(
    name='mediawiki_import',
    log_file='mediawiki_import.log',
)
43
+
44
# Load configuration
def load_mediawiki_import_config():
    """Read 'Config_Files/mediawiki_import_config.yaml' and return its parsed contents.

    The path is relative, so it is resolved against the current working directory.
    """
    config_path = os.path.join('Config_Files', 'mediawiki_import_config.yaml')
    with open(config_path, 'r') as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed


# Parsed once at import time; import_mediawiki_dump() reads config['chunking']
# when the caller supplies no chunk options.
config = load_mediawiki_import_config()
49
+
50
def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> Iterator[
    Dict[str, Any]]:
    """Stream the revisions of a MediaWiki XML dump as plain-text records.

    The dump file is opened inside a ``with`` block so the handle is closed as
    soon as the generator is exhausted or closed, instead of being leaked.

    Args:
        file_path (str): Path to the MediaWiki XML dump file (read as UTF-8).
        namespaces (List[int], optional): Namespace IDs to include. If None or empty, include all.
        skip_redirects (bool, optional): Whether to skip pages flagged as redirects.

    Yields:
        Dict[str, Any]: One record per revision with the keys ``title``,
        ``content`` (wikitext with markup stripped), ``namespace``, ``page_id``,
        ``revision_id`` and ``timestamp``.
    """
    with open(file_path, encoding='utf-8') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump.pages:
            if skip_redirects and page.redirect:
                continue
            if namespaces and page.namespace not in namespaces:
                continue

            # Iterating a page yields its revisions.
            for revision in page:
                code = mwparserfromhell.parse(revision.text)
                text = code.strip_code(normalize=True, collapse=True, keep_template_params=False)
                yield {
                    "title": page.title,
                    "content": text,
                    "namespace": page.namespace,
                    "page_id": page.id,
                    "revision_id": revision.id,
                    "timestamp": revision.timestamp
                }
            logger.debug(f"Yielded page: {page.title}")
71
+
72
+
73
def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split text into chunks along level-2 ``== Heading ==`` boundaries.

    Consecutive sections are packed into one chunk until adding the next section
    would push the chunk past the size limit. A single section larger than the
    limit is kept whole rather than being cut mid-section. Text that precedes the
    first heading (or text with no headings at all) is treated as a section
    called "Introduction" and is emitted without a synthetic heading line.

    Args:
        text (str): The text to split.
        chunk_options (Dict[str, Any]): Must provide ``max_size`` (characters per
            chunk); ``default_size`` is accepted as a fallback key.

    Returns:
        List[Dict[str, Any]]: Chunks shaped as
        ``{"text": ..., "metadata": {"section": <title of the chunk's first section>}}``.
        Empty or whitespace-only text yields an empty list.

    Raises:
        KeyError: If neither ``max_size`` nor ``default_size`` is present.
    """
    max_size = chunk_options.get('max_size', chunk_options.get('default_size'))
    if max_size is None:
        raise KeyError('max_size')

    # With one capturing group, re.split returns
    # [text before first heading, title1, body1, title2, body2, ...].
    # '^' with MULTILINE also matches a heading on the very first line, and
    # '(?!=)' keeps '=== sub-headings ===' inside their parent section.
    parts = re.split(r'^==(?!=)[ \t]*(.*?)[ \t]*==[ \t]*(?:\n|\Z)', text, flags=re.MULTILINE)

    sections = []
    if parts[0].strip():
        sections.append(("Introduction", parts[0]))
    sections.extend(
        (title, f"== {title} ==\n{body}")
        for title, body in zip(parts[1::2], parts[2::2])
    )

    chunks = []
    current_text = ""
    current_section = None
    for title, rendered in sections:
        # Flush before the limit would be exceeded; never flush an empty chunk.
        if current_text and len(current_text) + len(rendered) > max_size:
            chunks.append({"text": current_text, "metadata": {"section": current_section}})
            current_text = ""
        if not current_text:
            current_section = title
        elif not current_text.endswith("\n"):
            current_text += "\n"
        current_text += rendered

    if current_text:
        chunks.append({"text": current_text, "metadata": {"section": current_section}})

    return chunks
96
+
97
+
98
def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Dict[str, Any] = None):
    """Store one article (or one combined dump) in the media DB and index its chunks.

    Items that check_media_exists() already reports for the same title/URL are
    skipped. Any exception is logged and swallowed, so one bad item does not
    abort the caller's loop.

    Args:
        content (str): Plain-text content to store and chunk.
        title (str): Title used for the media entry and in log messages.
        wiki_name (str): Wiki identifier used in the URL, keywords and collection name.
        chunk_options (Dict[str, Any]): Options forwarded to optimized_chunking().
        is_combined (bool, optional): True when the content is a whole dump rather than one article.
        item (Dict[str, Any], optional): Parsed record; its 'timestamp' supplies the ingestion date.
    """
    try:
        if is_combined:
            url = f"mediawiki:{wiki_name}"
            media_type = "mediawiki_dump"
            keyword_suffix = ",full_dump"
        else:
            url = f"mediawiki:{wiki_name}:{title}"
            media_type = "mediawiki_article"
            keyword_suffix = ",article"

        if check_media_exists(title, url):
            logger.info(f"Skipping existing article: {title}")
            return

        ingestion_date = item['timestamp'].strftime('%Y-%m-%d') if item else None
        media_id = add_media_with_keywords(
            url=url,
            title=title,
            media_type=media_type,
            content=content,
            keywords=f"mediawiki,{wiki_name}" + keyword_suffix,
            prompt="",
            summary="",
            transcription_model="",
            author="MediaWiki",
            ingestion_date=ingestion_date
        )

        collection_name = f"mediawiki_{wiki_name}"
        for piece in optimized_chunking(content, chunk_options):
            process_and_store_content(piece['text'], collection_name, media_id, title)
        logger.info(f"Successfully processed item: {title}")
    except Exception as e:
        logger.error(f"Error processing item {title}: {str(e)}")
125
+
126
+
127
def load_checkpoint(file_path: str) -> int:
    """Return the 'last_processed_id' stored in the checkpoint file, or 0 if the file does not exist."""
    if not os.path.exists(file_path):
        return 0
    with open(file_path, 'r') as checkpoint_file:
        state = json.load(checkpoint_file)
    return state['last_processed_id']
132
+
133
+
134
def save_checkpoint(file_path: str, last_processed_id: int):
    """Persist the id of the last processed page to a JSON checkpoint file.

    The checkpoint is rewritten after every processed item, so it is written to
    a sibling temporary file first and then moved into place with os.replace().
    An interruption mid-write therefore leaves the previous checkpoint intact
    instead of a truncated, unparseable file.

    Args:
        file_path (str): Destination of the checkpoint file.
        last_processed_id (int): Page id to record under 'last_processed_id'.
    """
    temp_path = f"{file_path}.tmp"
    with open(temp_path, 'w') as f:
        json.dump({'last_processed_id': last_processed_id}, f)
    os.replace(temp_path, file_path)
137
+
138
+
139
def import_mediawiki_dump(
        file_path: str,
        wiki_name: str,
        namespaces: List[int] = None,
        skip_redirects: bool = False,
        chunk_options: Dict[str, Any] = None,
        single_item: bool = False,
        progress_callback: Any = None
) -> Iterator[str]:
    """Import a MediaWiki XML dump, yielding status messages as it goes.

    Progress is checkpointed in '<wiki_name>_import_checkpoint.json' after every
    processed item. A later run skips items whose page id is not greater than
    the stored one, and the checkpoint is deleted once the import completes.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        wiki_name (str): Wiki identifier used for URLs, keywords and the checkpoint name.
        namespaces (List[int], optional): Namespace IDs to include. If None, include all.
        skip_redirects (bool, optional): Whether to skip redirect pages.
        chunk_options (Dict[str, Any], optional): Chunking options; defaults to config['chunking'].
        single_item (bool, optional): Accepted for interface compatibility; not used by this function.
        progress_callback (callable, optional): Called as ``callback(fraction, message)`` after each item.

    Yields:
        str: Human-readable progress, success or error messages. Errors are
        reported as messages rather than raised.
    """
    try:
        if chunk_options is None:
            chunk_options = config['chunking']

        checkpoint_file = f"{wiki_name}_import_checkpoint.json"
        last_processed_id = load_checkpoint(checkpoint_file)

        total_pages = count_pages(file_path, namespaces, skip_redirects)
        processed_pages = 0

        yield f"Found {total_pages} pages to process."

        for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
            if item['page_id'] <= last_processed_id:
                continue
            process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
            save_checkpoint(checkpoint_file, item['page_id'])
            processed_pages += 1
            if progress_callback is not None:
                # count_pages() returns 0 on failure and counts pages, whereas this
                # loop runs once per revision: guard the division and cap at 1.0.
                fraction = min(processed_pages / total_pages, 1.0) if total_pages else 0.0
                progress_callback(fraction, f"Processed page: {item['title']}")
            yield f"Processed page {processed_pages}/{total_pages}: {item['title']}"

        # The checkpoint exists only if an item was processed in this or an earlier
        # run. Removing it unconditionally would raise FileNotFoundError, which the
        # handler below would misreport as the dump file being missing.
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
        yield f"Successfully imported and indexed MediaWiki dump: {wiki_name}"
    except FileNotFoundError:
        logger.error(f"MediaWiki dump file not found: {file_path}")
        yield f"Error: File not found - {file_path}"
    except PermissionError:
        logger.error(f"Permission denied when trying to read: {file_path}")
        yield f"Error: Permission denied - {file_path}"
    except Exception as e:
        logger.exception(f"Error during MediaWiki import: {str(e)}")
        yield f"Error during import: {str(e)}"
181
+
182
def count_pages(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> int:
    """
    Count the number of pages in a MediaWiki XML dump file.

    The file is opened in a ``with`` block so the handle is closed once counting
    finishes or fails.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        namespaces (List[int], optional): List of namespace IDs to include. If None, include all namespaces.
        skip_redirects (bool, optional): Whether to skip redirect pages.

    Returns:
        int: The number of matching pages in the dump file, or 0 if the file
        could not be read or parsed (the error is logged, not raised).
    """
    try:
        with open(file_path, encoding='utf-8') as dump_file:
            dump = mwxml.Dump.from_file(dump_file)
            count = 0
            for page in dump.pages:
                if skip_redirects and page.redirect:
                    continue
                if namespaces and page.namespace not in namespaces:
                    continue
                count += 1
            return count
    except Exception as e:
        logger.error(f"Error counting pages in MediaWiki dump: {str(e)}")
        return 0
207
+
208
+ #
209
+ # End of Media_Wiki.py
210
+ #######################################################################################################################
App_Function_Libraries/MediaWiki/Media_Wiki_Tests.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Media_Wiki_Tests.py
2
+ # Description: Unit tests for the Media_Wiki module.
3
+ #
4
+ # Usage:
5
+ # pip install pytest pytest-asyncio
6
+ # pytest Media_Wiki_Tests.py
7
+ #
8
+ # Imports
9
+ import pytest
10
+ import asyncio
11
+ from unittest.mock import patch, MagicMock
12
+ # Local Imports
13
+ from Media_Wiki import parse_mediawiki_dump, optimized_chunking, process_single_item, import_mediawiki_dump, load_mediawiki_import_config
14
+ #
15
+ # #######################################################################################################################
16
+ #
17
+ # Functions:
18
+
19
+
20
+
21
@pytest.fixture(scope="module")
def event_loop():
    """Provide a new asyncio event loop for the module's tests and close it afterwards."""
    policy = asyncio.get_event_loop_policy()
    module_loop = policy.new_event_loop()
    yield module_loop
    module_loop.close()
26
+
27
@pytest.fixture
def mock_mwxml_dump():
    """Build a fake dump containing one page ("Test Page") with a single revision."""
    mock_dump = MagicMock()
    mock_page = MagicMock()
    mock_page.title = "Test Page"
    mock_page.namespace = 0
    mock_page.id = 1
    mock_page.redirect = None
    mock_revision = MagicMock()
    mock_revision.id = 1
    mock_revision.timestamp = "2021-01-01T00:00:00Z"
    mock_revision.text = "Test content"
    mock_page.revisions = [mock_revision]
    # parse_mediawiki_dump() iterates the page object itself to get revisions;
    # a bare MagicMock iterates as empty, so configure __iter__ explicitly.
    mock_page.__iter__.return_value = [mock_revision]
    mock_dump.pages = [mock_page]
    return mock_dump
41
+
42
def test_parse_mediawiki_dump(mock_mwxml_dump):
    """parse_mediawiki_dump() should turn each mocked revision into one record."""
    # open() must be patched as well: the function opens the dump path before
    # handing the file object to mwxml, and "dummy_path" does not exist.
    with patch('builtins.open', MagicMock()), \
            patch('mwxml.Dump.from_file', return_value=mock_mwxml_dump), \
            patch('mwparserfromhell.parse') as mock_parse:
        mock_parse.return_value.strip_code.return_value = "Stripped content"
        result = list(parse_mediawiki_dump("dummy_path"))
        assert len(result) == 1
        assert result[0]['title'] == "Test Page"
        assert result[0]['content'] == "Stripped content"
        assert result[0]['namespace'] == 0
        assert result[0]['page_id'] == 1
        assert result[0]['revision_id'] == 1
53
+
54
def test_optimized_chunking():
    """Expect one chunk per section for a two-section text with max_size 50."""
    wikitext = "== Section 1 ==\nContent 1\n== Section 2 ==\nContent 2"
    chunks = optimized_chunking(wikitext, {'max_size': 50})
    assert len(chunks) == 2
    first, second = chunks
    assert first['text'].startswith("== Section 1 ==")
    assert second['text'].startswith("== Section 2 ==")
    assert 'metadata' in first and 'section' in first['metadata']
62
+
63
def test_process_single_item():
    """A new item should be chunked and handed to process_and_store_content()."""
    # process_single_item() is a plain synchronous function returning None, so it
    # is called directly; awaiting its result would raise TypeError.
    with patch('Media_Wiki.check_media_exists', return_value=False), \
            patch('Media_Wiki.add_media_with_keywords', return_value=1), \
            patch('Media_Wiki.process_and_store_content') as mock_process_store:
        process_single_item("Test content", "Test Title", "TestWiki", {'max_size': 100})
        mock_process_store.assert_called()
        # Add more detailed assertions here
70
+ # Add more detailed assertions here
71
+
72
def test_import_mediawiki_dump():
    """A one-page dump should be processed once and end with a success message."""
    # import_mediawiki_dump() is a synchronous generator of status strings, so it
    # is drained with list() rather than awaited. count_pages() is patched too,
    # because it would otherwise try to open "dummy_path".
    with patch('Media_Wiki.count_pages', return_value=1), \
            patch('Media_Wiki.parse_mediawiki_dump') as mock_parse, \
            patch('Media_Wiki.process_single_item') as mock_process, \
            patch('Media_Wiki.load_checkpoint', return_value=0), \
            patch('Media_Wiki.save_checkpoint'), \
            patch('os.remove'):
        mock_parse.return_value = [{'page_id': 1, 'title': 'Test', 'content': 'Content'}]
        messages = list(import_mediawiki_dump("dummy_path", "TestWiki"))
        assert any("Successfully imported" in message for message in messages)
        mock_process.assert_called_once()
83
+
84
def test_import_mediawiki_dump_file_not_found():
    """A missing dump file should be reported as an error message, not raised."""
    # The function is a generator, not a coroutine: asyncio.run() would reject
    # it, so the messages are collected with list(). load_checkpoint() is patched
    # so a leftover checkpoint file cannot influence the test.
    with patch('Media_Wiki.load_checkpoint', return_value=0), \
            patch('Media_Wiki.count_pages', return_value=0), \
            patch('Media_Wiki.parse_mediawiki_dump', side_effect=FileNotFoundError):
        messages = list(import_mediawiki_dump("non_existent_path", "TestWiki"))
        assert any("Error: File not found" in message for message in messages)
88
+
89
def test_load_mediawiki_import_config():
    """The loader should return whatever yaml.safe_load produces for the opened file."""
    fake_config = {'test_key': 'test_value'}
    with patch('builtins.open', MagicMock()), patch('yaml.safe_load', return_value=fake_config):
        config = load_mediawiki_import_config()
    assert 'test_key' in config
    assert config['test_key'] == 'test_value'
App_Function_Libraries/MediaWiki/mediawiki_import_config.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MediaWiki Import Configuration
2
+
3
+ # Database settings
4
+ database:
5
+ sqlite_path: './Databases/media_summary.db'
6
+ chroma_db_path: 'chroma_db'
7
+
8
+ # Chunking options
9
+ chunking:
10
+ default_method: 'sentences'
11
+ default_size: 1000
12
+ default_overlap: 100
13
+ adaptive: true
14
+ language: 'en'
15
+ methods:
16
+ - 'sentences'
17
+ - 'words'
18
+ - 'paragraphs'
19
+ - 'tokens'
20
+
21
+ # Import settings
22
+ import:
23
+ batch_size: 1000 # Number of pages to process in a single batch
24
+ default_skip_redirects: true
25
+ default_namespaces: [0] # Main namespace by default
26
+ single_item_default: false
27
+
28
+ # Processing options
29
+ processing:
30
+ max_workers: 4 # Number of worker threads for async processing
31
+
32
+ # Embedding settings
33
+ embeddings:
34
+ provider: 'openai' # or 'local' or 'huggingface'
35
+ model: 'text-embedding-ada-002'
36
+ api_key: 'your_openai_api_key_here' # Remove if using local embeddings
37
+ local_url: 'http://localhost:8080/embeddings' # Only for local embeddings
38
+
39
+ # ChromaDB settings
40
+ chromadb:
41
+ collection_prefix: 'mediawiki_'
42
+
43
+ # Logging settings
44
+ logging:
45
+ level: 'INFO'
46
+ file: 'mediawiki_import.log'
47
+
48
+ # Checkpoint settings
49
+ checkpoints:
50
+ enabled: true
51
+ directory: 'import_checkpoints'
52
+
53
+ # Error handling
54
+ error_handling:
55
+ max_retries: 3
56
+ retry_delay: 5 # seconds
57
+
58
+ # User interface settings
59
+ ui:
60
+ default_chunk_size: 1000
61
+ min_chunk_size: 100
62
+ max_chunk_size: 2000
63
+ default_chunk_overlap: 100