derek-thomas HF staff committed on
Commit
6404d3b
1 Parent(s): 676ed72

Added preprocessing code

Files changed (4)
  1. .gitignore +4 -0
  2. README.md +6 -0
  3. preprocess_wiki.py +167 -0
  4. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+ *.bz2
+ *.gz
+ output/
+ .idea/
README.md CHANGED
@@ -11,3 +11,9 @@ license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Installation
+ pip install -r requirements.txt
+ # Pre-processing
+ wget https://dumps.wikimedia.org/arwiki/latest/arwiki-latest-pages-articles-multistream.xml.bz2
+ wikiextractor -o output --json arwiki-latest-pages-articles-multistream.xml.bz2
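
For reference, wikiextractor writes its --json output as JSON-lines shards in nested folders (typically output/AA/wiki_00, output/AA/wiki_01, ...), one article object per line; this is the layout preprocess_wiki.py below walks. A quick sanity check after extraction might look like this (a minimal sketch; the shard naming and the exact field set are assumptions about wikiextractor 3.0.6's default JSON output):

import json
from pathlib import Path

# Peek at the first article shard wikiextractor produced to confirm the
# JSON-lines layout that preprocess_wiki.py expects (one article per line).
first_file = next(p for p in sorted(Path('output').rglob('wiki_*')) if p.is_file())
with open(first_file, encoding='utf-8') as f:
    article = json.loads(f.readline())
print(sorted(article))  # e.g. ['id', 'revid', 'text', 'title', 'url'] (field set assumed)
print(article['title'])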
preprocess_wiki.py ADDED
@@ -0,0 +1,167 @@
+ import os
+ import json
+ from pathlib import Path
+ from tqdm.auto import tqdm
+ from typing import List, Any, Dict
+
+ MAX_WORDS = 250
+
+
+ def folder_to_json(folder_in: Path, json_path: Path) -> List[Any]:
+     """
+     Process JSON lines from files in a given folder and write processed data to a new JSON file.
+
+     Parameters:
+         folder_in (Path): Path to the input folder containing the JSON files to process.
+         json_path (Path): Path to the output JSON file where the processed data will be written.
+
+     Returns:
+         List[Any]: List containing processed JSON data from all files in the input folder.
+
+     Example:
+         folder_to_json(Path("/path/to/input/folder"), Path("/path/to/output.json"))
+     """
+
+     folder_in = Path(folder_in)
+     json_out = []  # Initialize list to hold processed JSON data from all files
+
+     # Calculate total number of files in the input folder to set up the progress bar
+     total_files = sum(len(files) for _, _, files in os.walk(folder_in))
+
+     # Initialize progress bar with total file count, description, and unit of progress
+     with tqdm(total=total_files, desc='Processing', unit='file') as pbar:
+         # Iterate through all files in the input folder
+         for subdir, _, files in os.walk(folder_in):
+             # Set progress bar postfix to display current directory
+             pbar.set_postfix_str(f"Directory: {subdir}", refresh=False)
+
+             for file in files:
+                 # Update progress bar postfix to display current file and directory
+                 pbar.set_postfix_str(f"Dir: {subdir} | File: {file}", refresh=True)
+
+                 # Create full file path for the current file
+                 file_path = Path(subdir) / file
+
+                 # Open and read the current file
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     for line in f:
+                         # Load JSON data from each line and process it
+                         article = json.loads(line)
+                         # preprocess is defined below in this module
+                         processed_article = preprocess(article)
+                         # Add processed data to the output list
+                         json_out.extend(processed_article)
+
+                 # Update progress bar after processing each file
+                 pbar.update(1)
+
+         # Notify that the writing process is starting
+         pbar.write("Writing file!")
+         # Open the output file and write the processed data as JSON
+         with open(json_path, "w", encoding='utf-8') as outfile:
+             json.dump(json_out, outfile)
+         # Notify that the writing process is complete
+         pbar.write("File written!")
+
+     # Return the list of processed data
+     return json_out
+
+
+ def preprocess(article: Dict[str, Any]) -> List[Dict[str, Any]]:
+     """
+     Preprocess a given article dictionary, extracting and processing the 'text' field. Because of the
+     `break` below, only the first chunk of each article is kept.
+
+     Parameters:
+         article (Dict[str, Any]): Input dictionary containing an article. Expected to have a 'text' field.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents a preprocessed chunk of
+                               the original article's text. Each dictionary also contains the original article's
+                               fields (excluding 'text'), with an additional 'chunk_number' field indicating the
+                               order of the chunk.
+
+     Example:
+         article = {"text": "Example text", "title": "Example Title", "author": "John Doe"}
+         processed = preprocess(article)
+         print(processed)
+     """
+
+     # Create a new dictionary excluding the 'text' field from the original article
+     article_out = {k: v for k, v in article.items() if k != 'text'}
+
+     # Prefix each chunk with the article's title ('عنوان' means 'title')
+     prefix = f'عنوان: {article["title"]}. '
+     out = []  # Initialize the list to hold the preprocessed chunks
+
+     # Iterate over chunks obtained by splitting the article's text
+     # using the group_arabic_paragraphs function defined below
+     for i, chunk in enumerate(group_arabic_paragraphs(article['text'], MAX_WORDS)):
+         # Concatenate the prefix with the current chunk
+         chunk = prefix + chunk
+         # Create a new dictionary with the chunk, original article fields (excluding 'text'),
+         # and the chunk number, then append it to the 'out' list
+         out.append({'chunk': chunk, **article_out, 'chunk_number': i})
+         # Only take the first chunk
+         break
+
+     # Return the list of preprocessed chunks
+     return out
+
+
+ def group_arabic_paragraphs(arabic_text: str, max_words: int) -> List[str]:
+     """
+     Group contiguous paragraphs of Arabic text without exceeding the max_words limit per group.
+
+     Parameters:
+         arabic_text (str): The input Arabic text where paragraphs are separated by newlines.
+         max_words (int): The maximum number of words allowed per group of paragraphs.
+
+     Returns:
+         List[str]: A list of strings where each string is a group of contiguous paragraphs.
+
+     Example:
+         arabic_text = "Paragraph1.\nParagraph2.\nParagraph3."
+         max_words = 5
+         result = group_arabic_paragraphs(arabic_text, max_words)
+         print(result)  # Output will depend on word count of each paragraph and max_words.
+     """
+
+     # Split the input text into paragraphs using newline as a delimiter
+     paragraphs = arabic_text.split('\n')
+
+     # Initialize variables to hold the grouped paragraphs and word count
+     grouped_paragraphs = []
+     current_group = []
+     current_word_count = 0
+
+     # Iterate through each paragraph in the input text
+     for paragraph in paragraphs:
+         # Count the number of words in the paragraph
+         word_count = len(paragraph.split())
+
+         # If adding the paragraph won't exceed the word limit, add it to the current group
+         if current_word_count + word_count <= max_words:
+             current_group.append(paragraph)
+             current_word_count += word_count  # Update the word count for the current group
+         else:
+             # Adding this paragraph would exceed the limit, so close out the current group
+             if current_group:
+                 grouped_paragraphs.append('\n'.join(current_group))
+             # Initialize a new group with the current paragraph
+             current_group = [paragraph]
+             current_word_count = word_count  # Reset the word count for the new group
+
+     # Add the last group if not empty
+     if current_group:
+         grouped_paragraphs.append('\n'.join(current_group))
+
+     # Return the grouped paragraphs as a list of strings
+     return grouped_paragraphs
+
+
+ if __name__ == '__main__':
+     folder = Path('output')
+     file_out = Path('arwiki.json')
+     folder_to_json(folder, file_out)
+     print('Done!')
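
To make the chunking behaviour concrete, here is a small worked example against the functions above (the sample article is hypothetical, and the expected output assumes the title prefix shown in preprocess):

from preprocess_wiki import preprocess, group_arabic_paragraphs

# A hypothetical two-paragraph article in the shape wikiextractor emits
sample = {
    'id': '1',
    'url': 'https://ar.wikipedia.org/?curid=1',
    'title': 'مثال',
    'text': 'الفقرة الأولى.\nالفقرة الثانية.',
}

# group_arabic_paragraphs merges newline-separated paragraphs up to max_words per group
print(group_arabic_paragraphs(sample['text'], 250))
# ['الفقرة الأولى.\nالفقرة الثانية.']

# preprocess drops 'text', prefixes the chunk with the title, and keeps only chunk 0
print(preprocess(sample))
# [{'chunk': 'عنوان: مثال. الفقرة الأولى.\nالفقرة الثانية.',
#   'id': '1', 'url': 'https://ar.wikipedia.org/?curid=1',
#   'title': 'مثال', 'chunk_number': 0}]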
requirements.txt ADDED
@@ -0,0 +1 @@
+ wikiextractor==3.0.6
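
Note that preprocess_wiki.py also imports tqdm, which this file does not pin. A fuller requirements sketch would be (leaving tqdm unversioned is an assumption; the commit itself only pins wikiextractor):

wikiextractor==3.0.6
tqdm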