import bz2
import re
import os
import sqlite3
from xml.sax import make_parser, handler
import time
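
# Streams a Wikipedia pages-articles XML dump (.xml.bz2) with xml.sax, so the
# dump never has to be fully decompressed or held in memory. Main-namespace
# articles and the [[wikilinks]] they contain are written to a SQLite database
# in batches.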

class WikiContentHandler(handler.ContentHandler):
    def __init__(self, db_conn, batch_size=1000, max_articles=None):
        self.db_conn = db_conn
        self.cursor = db_conn.cursor()
        self.batch_size = batch_size
        self.article_count = 0
        self.max_articles = max_articles
        self.article_batch = []
        self.links_batch = []
        
        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []
        
    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []
    
    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                
                # Add to batch
                self.article_batch.append(
                    (self.current_title, self.current_text)
                )
                
                # Add links to batch
                for link in links:
                    self.links_batch.append(
                        (self.current_title, link)
                    )
                
                self.article_count += 1
                
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                
                # Insert batch if reached batch size
                if len(self.article_batch) >= self.batch_size:
                    self._insert_batch()
                
                # Stop once the maximum number of articles has been reached;
                # the StopIteration propagates out of the SAX callbacks and is
                # caught in parse_wiki_dump()
                if self.max_articles and self.article_count >= self.max_articles:
                    self._insert_batch()  # Insert any remaining items
                    raise StopIteration("Reached maximum number of articles")
                    
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)
    
    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)
    
    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
        
        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            
            # Remove any section links (with #)
            link = link.split('#')[0].strip()
            
            # Skip empty links
            if not link:
                continue
            
            processed_links.append(link)
        
        # Remove duplicates and return
        return list(set(processed_links))
    
    def _insert_batch(self):
        """Insert batched data into the database"""
        if self.article_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
                self.article_batch
            )
            
        if self.links_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
                self.links_batch
            )
            
        self.db_conn.commit()
        self.article_batch = []
        self.links_batch = []

def create_db_schema(db_conn):
    """Create the database schema"""
    cursor = db_conn.cursor()
    
    # Create articles table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS articles (
        title TEXT PRIMARY KEY,
        text TEXT
    )
    ''')
    
    # Create links table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS links (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_title TEXT,
        target_title TEXT,
        FOREIGN KEY (source_title) REFERENCES articles (title),
        UNIQUE (source_title, target_title)
    )
    ''')
    
    # Create index on links for faster queries
    cursor.execute('''
    CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
    ''')
    
    cursor.execute('''
    CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
    ''')
    
    db_conn.commit()
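
# Illustrative sketch (not used by the extraction pipeline itself): once a
# database has been built with the schema above, the link graph can be queried
# directly. Table and column names match create_db_schema(); the helper below
# is an assumed consumer of the data, not part of the parser.
def example_most_linked_titles(db_path, limit=10):
    """Return the `limit` most frequently linked-to titles from the links table."""
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(
            "SELECT target_title, COUNT(*) AS n FROM links "
            "GROUP BY target_title ORDER BY n DESC LIMIT ?",
            (limit,)
        ).fetchall()
    finally:
        conn.close()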

def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
    """
    Parse a Wikipedia XML dump and extract articles and their links into a SQLite database.
    
    Args:
        dump_path: Path to the bz2 Wikipedia dump
        db_path: Path to save the SQLite database
        batch_size: Number of articles to process before committing to the database
        max_articles: Maximum number of articles to extract (None for all)
    
    Returns:
        The path to the created SQLite database
    """
    start_time = time.time()
    print(f"Parsing Wikipedia dump: {dump_path}")
    
    # Create or connect to SQLite database
    db_conn = sqlite3.connect(db_path)
    
    # Create schema
    create_db_schema(db_conn)
    
    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
    parser.setContentHandler(content_handler)
    
    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
        # Insert any remaining items in the batch
        content_handler._insert_batch()
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
        raise
    finally:
        db_conn.commit()
        db_conn.close()
    
    duration = time.time() - start_time
    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
    print(f"Data saved to {db_path}")
    return db_path
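
# Programmatic usage (illustrative file names):
#   parse_wiki_dump("enwiki-latest-pages-articles.xml.bz2", "wiki.db",
#                   batch_size=500, max_articles=1000)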

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the SQLite database')
    parser.add_argument('--batch-size', type=int, default=1000, 
                        help='Batch size for database inserts (default: 1000)')
    parser.add_argument('--max-articles', type=int, default=None, 
                        help='Maximum number of articles to extract (default: all)')
    
    args = parser.parse_args()
    
    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
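
# Example invocation (hypothetical script and file names):
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 wiki.db \
#       --batch-size 5000 --max-articles 10000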