washeed commited on
Commit
b692870
·
verified ·
1 Parent(s): 238ccc2

Upload 18 files

Browse files
Documentation.ipynb ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## ABC_1 DOCUMENTATION\n",
8
+ "\n",
9
+ "Dito ko lalagay lahat ng documentation and uses ng mga functions"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "### pdftoimage.py and docxtoimage.py\n",
17
+ "\n",
18
+ "bale parang naging utils file na lang toh kasi ginawa ko siya na pwede tawagin sa ibang python file nilagay ko na siya sa abc_1.py pareho and pinagsama ko sa isang function na pangalan convert_pages(folder_path,image_output,Max_pages) tas automatically na toh mag iiterate sa isang folder para gumawa ng mga buffer folders na may lamang image na page ng file ganto sample usage nya also si docx to image automatic na png na siya kasi best for ocr naman ang png \n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {
25
+ "vscode": {
26
+ "languageId": "plaintext"
27
+ }
28
+ },
29
+ "outputs": [],
30
+ "source": [
31
+ "import abc_1\n",
32
+ "abc_1.convert_pages('input','png',4) \n",
33
+ "\n",
34
+ " \n",
35
+ "def convert_pages(folder_path, output_format ,max_pages):\n",
36
+ " for root, directories, files in os.walk(folder_path):\n",
37
+ " for filename in files:\n",
38
+ " # Get the file extension (including the dot)\n",
39
+ " extension = os.path.splitext(filename)[1].lower()\n",
40
+ " if extension=='.pdf':\n",
41
+ " pdftoimage.convert_pdfs(folder_path, output_format,max_pages)\n",
42
+ " if extension=='.docx':\n",
43
+ " docxtoimage.process(folder_path,max_pages)\n"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {},
49
+ "source": [
50
+ "### inputPDFToOutputOCR\n",
51
+ "\n",
52
+ "ito yung parang augmentA.py na for text. ginawa kong very similar yung parameters ng pagtakbo nya except dagdag lang ng konti kasi may subfolder toh. ito pinaka challenging na part sa buong project kasi kailangan match pa din yung categories_dict para di na kayo mahirapan mag bago bago ng formats and flows pero syempre ako na toh kaya possible. bale ginamit ko mga functions inside the abc_1 na den to categorize kaya walang issue yan with functions and outputs kasi same sila ng fundamental rules for categorization bale same dapat na babato sa file yung json categories like sa augment a then straightforward naman na from there"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {
59
+ "vscode": {
60
+ "languageId": "plaintext"
61
+ }
62
+ },
63
+ "outputs": [],
64
+ "source": [
65
+ "## run muna yung convert_pages for this then delete folder pag taopos\n",
66
+ "\n",
67
+ "if __name__ == '__main__': \n",
68
+ " categories_keywords_dict = {\n",
69
+ " 'AI': ['Artificial', 'Intelligence'],\n",
70
+ " 'Automata': ['finite', 'state', 'machines'],\n",
71
+ " 'DT': ['game', 'theory']\n",
72
+ " }\n",
73
+ "\n",
74
+ " folder_path = 'input' #output folder ni pdftoimage toh\n",
75
+ " folder_output = 'output' # Fixed typo\n",
76
+ " compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)\n",
77
+ "\n",
78
+ " subfolder_names = get_subfolder_names(folder_path)\n",
79
+ " runOCR(subfolder_names)"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "markdown",
84
+ "metadata": {},
85
+ "source": [
86
+ "### LIMITATIONS\n",
87
+ "Discuss ko lang mga potential issues na makakaharap naten\n",
88
+ "\n",
89
+ "- file name delikado if may kapareho kasi ang way ng pag generate ng subfolder is file name tas pag mmove na yung file mismo lalagyan lang ng .pdf or .docx\n",
90
+ "\n",
91
+ "- next issue yung bilis. tinanggal ko lahat ng concurrency and threading functionalities pag dating sa OCR kasi may potential risk na makasira ng device kasi nga mabigat talaga sobra. \n",
92
+ "\n",
93
+ "- file path management. di ko alam if gagana yung program natin if nasa labas ng work folder yung ipprocess natin\n",
94
+ "\n"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "markdown",
99
+ "metadata": {},
100
+ "source": [
101
+ "### JUSTIFICATIONS\n",
102
+ "- pwede tayo gumawa ng script na pang error handling ng file with same names\n",
103
+ "- justify natin na limitations ng machine yung bagal kasi di kaya\n",
104
+ "- ito idk pano ssolve or baka lang di ko alam pano\n",
105
+ "\n",
106
+ "for the integration part pwede nyo na siguro rin simulan and itry para easy na after magawa ng front. pero functionality wise i think kumpleto na tayo "
107
+ ]
108
+ }
109
+ ],
110
+ "metadata": {
111
+ "language_info": {
112
+ "name": "python"
113
+ }
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 2
117
+ }
abc_1.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ocr
2
+ import os
3
+ import threading
4
+ import concurrent.futures
5
+ from multiprocessing import Pool # Import for multiprocessing
6
+ import re
7
+ from docx import Document # Assuming DOCX support is desired
8
+ from pdfminer.high_level import extract_text # Import for PDF text extraction
9
+ import time
10
+ import pdftoimage
11
+ import docxtoimage
12
+ # Additional libraries for new file types
13
+ #import openpyxl # For basic XLSX handling (consider pandas for structured data)
14
+ #from pptx import Presentation # For PPTX presentations (install with: pip install python-pptx)
15
+
16
+ try:
17
+ from docx import Document
18
+ except ImportError:
19
+ print("To enable DOCX support, install python-docx: pip install python-docx")
20
+
21
+
22
+ class DecodingError(Exception):
23
+ pass
24
+
25
+
26
def compile_keywords(categories_keywords_dict):
    """Pre-compile every keyword into a case-insensitive regex.

    Args:
        categories_keywords_dict: mapping of category name -> list of
            keyword strings.

    Returns:
        dict: same keys, with each keyword replaced by a compiled
        ``re.Pattern`` (IGNORECASE) for faster repeated matching.
    """
    compiled = {}
    for category, keywords in categories_keywords_dict.items():
        compiled[category] = [re.compile(word, re.IGNORECASE) for word in keywords]
    return compiled
31
+
32
+
33
def categorize_text_chunk(text_chunk, compiled_keywords):
    """Return the first category whose keywords ALL occur in the text.

    Args:
        text_chunk: text to classify.
        compiled_keywords: mapping category -> list of compiled regex
            patterns (see ``compile_keywords``).

    Returns:
        str: the matching category name, or 'Uncategorized' when no
        category has every one of its keywords present.
    """
    for category, patterns in compiled_keywords.items():
        matches_all = all(pattern.search(text_chunk) for pattern in patterns)
        if matches_all:
            return category
    return 'Uncategorized'
39
+
40
def use_ocr(folder_path):
    """Run OCR over every .jpg/.png image in ``folder_path`` (a buffer
    folder of rendered pages) and return the concatenated text.

    NOTE(review): relies on the project-local ``ocr`` module; assumes
    ``ocr.extract_text_from_image`` returns a list of strings — confirm.
    """
    collected = []
    for filename in os.listdir(folder_path):
        if not (filename.endswith(".jpg") or filename.endswith(".png")):
            continue
        image_path = os.path.join(folder_path, filename)
        lines = ocr.extract_text_from_image(image_path)
        # Double newline separates the text of consecutive pages.
        collected.append("\n".join(lines) + "\n\n")
    return "".join(collected)
49
+
50
+
51
+
52
+
53
def convert_pages(folder_path, output_format, max_pages):
    """Render documents in ``folder_path`` into per-file image folders.

    Scans the folder tree for .pdf / .docx files and invokes the matching
    converter, which writes up to ``max_pages`` page images per document
    into a buffer sub-folder named after the file.

    Args:
        folder_path: folder containing the documents to convert.
        output_format: image format for PDF pages (e.g. 'png').
        max_pages: maximum number of pages to render per document.

    BUG FIX: the original called the folder-level converters once per
    matching *file*, so a folder holding N PDFs converted the whole
    folder N times over. Each converter now runs at most once.
    """
    has_pdf = False
    has_docx = False
    for root, directories, files in os.walk(folder_path):
        for filename in files:
            extension = os.path.splitext(filename)[1].lower()
            if extension == '.pdf':
                has_pdf = True
            elif extension == '.docx':
                has_docx = True
    if has_pdf:
        pdftoimage.convert_pdfs(folder_path, output_format, max_pages)
    if has_docx:
        # DOCX pages are always rendered as PNG (best for OCR).
        docxtoimage.process(folder_path, max_pages)
62
+
63
+
64
def categorize_file(file_path, compiled_keywords):
    """Extract the text of a single file and categorize it.

    Args:
        file_path: path to a .pdf, .docx or .txt file.
        compiled_keywords: mapping category -> list of compiled regexes.

    Returns:
        tuple: (file_path, category). ``category`` is
        'Uncategorized (Error)' on extraction failure, and the pair
        (None, 'Unsupported File Type') for any other extension.
    """
    try:
        if file_path.endswith('.pdf'):
            # pdfminer text extraction — CPU bound.
            text = extract_text(file_path)
            return file_path, categorize_text_chunk(text, compiled_keywords)
        elif file_path.endswith('.docx') and Document:
            try:
                doc = Document(file_path)
                # Join all paragraphs into one searchable blob.
                text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
                return file_path, categorize_text_chunk(text, compiled_keywords)
            except Exception as e:
                print(f"Error processing DOCX '{file_path}': {e}")
                return file_path, 'Uncategorized (Error)'
        elif file_path.endswith('.txt'):
            # BUG FIX: read with an explicit encoding so categorization does
            # not depend on the platform's default codec; undecodable bytes
            # are replaced rather than raising.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
            return file_path, categorize_text_chunk(text, compiled_keywords)
        else:
            print(f"Unsupported file type: {file_path}")
            return None, 'Unsupported File Type'
    except Exception as e:
        print(f"Error processing '{file_path}': {e}")
        return file_path, 'Uncategorized (Error)'
88
+
89
+
90
def threaded_worker(file_paths_categories, output_dir):
    """Move each categorized file into ``output_dir/<category>/``.

    Args:
        file_paths_categories: iterable of (file_path, category) pairs as
            produced by ``categorize_file``.
        output_dir: root folder that receives one sub-folder per category.

    BUG FIX: unsupported files come back from ``categorize_file`` as
    (None, 'Unsupported File Type'); the original only checked the
    category, so it crashed on ``os.path.basename(None)``. Entries with a
    missing path are now skipped as well.
    """
    for file_path, category in file_paths_categories:
        if file_path is None or category is None:
            continue  # nothing to move
        category_dir = os.path.join(output_dir, category)
        os.makedirs(category_dir, exist_ok=True)
        # NOTE(review): os.rename fails across filesystems; acceptable
        # while input and output live on the same volume.
        os.rename(file_path, os.path.join(category_dir, os.path.basename(file_path)))
96
+
97
+
98
def multi_process_categorizer(input_dir, output_dir, categories_keywords_dict, num_processes):
    """Categorize every file in ``input_dir`` and file it under ``output_dir``.

    Text extraction and matching run in a process pool (CPU bound); the
    resulting file moves run in a thread pool (I/O bound).

    NOTE(review): despite the parameter name, the caller in augmentA.py
    passes the already-compiled keyword mapping here — confirm before
    renaming or re-compiling.
    """
    files = [os.path.join(input_dir, name) for name in os.listdir(input_dir)]

    # CPU-bound extraction + categorization in worker processes.
    jobs = [(path, categories_keywords_dict) for path in files]
    with Pool(processes=num_processes) as pool:
        results = pool.starmap(categorize_file, jobs)

    # I/O-bound moving in a thread; the executor's shutdown waits for it.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.submit(threaded_worker, results, output_dir)
108
+
109
+
110
def chunks(lst, chunk_size):
    """Yield successive ``chunk_size``-sized slices of ``lst``.

    The final chunk may be shorter than ``chunk_size``.
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + chunk_size]
        start += chunk_size
114
+
augmentA.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc_1
2
+ import time
3
+ import sys
4
+ from docx import Document # Assuming DOCX support is desired
5
+ from pdfminer.high_level import extract_text # Import for PDF text extraction
6
+ import json
7
+
8
if __name__ == '__main__':
    start = time.time()

    # Default keyword sets, used only when no JSON payload is supplied.
    categories_keywords_dict = {
        'AI': ['Artificial', 'Intelligence'],
        'Automata': ['finite', 'state', 'machines'],
        'DT': ['game', 'theory'],
    }

    if len(sys.argv) > 1:
        # BUG FIX: the original parsed the CLI JSON but then always used
        # the hardcoded dict (``categories_keywords_dict1``). The payload
        # produced by tk3.py now actually drives categorization.
        categories_keywords_dict = json.loads(sys.argv[1])
    else:
        print("No data provided.")

    input_dir = 'input'    # folder to categorize (was ``input``, shadowing the builtin)
    output_dir = 'output'  # categorized files land here
    compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)
    abc_1.multi_process_categorizer(input_dir, output_dir, compiled_keywords, num_processes=8)  # Adjust processes as needed

    end = time.time()
    print(f"Categorization completed in {end - start:.2f} seconds")
best/BEST.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a5efbfb48b4081100544e75e1e2b57f8de3d84f213004b14b85fd4b3748db17
3
+ size 83152330
docxtoimage.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from spire.doc import *
3
+ from spire.doc.common import *
4
+
5
def process(folder_path, max_page):
    """Convert every .docx directly inside ``folder_path`` to page images
    (delegates each file to ``process_docx``)."""
    docx_files = (name for name in os.listdir(folder_path) if name.endswith(".docx"))
    for filename in docx_files:
        process_docx(folder_path, filename, max_page)
9
+
10
def process_docx(folder_path, filename, max_page=None):
    """Render the first ``max_page`` pages of one DOCX file to PNG images.

    Images are written to ``folder_path/<file stem>/<file stem>_<n>.png``.

    Args:
        folder_path: folder containing the document.
        filename: name of the .docx file inside ``folder_path``.
        max_page: page-count cap; ``None`` (the default) renders all pages.
            BUG FIX: the original compared ``None > int`` and crashed when
            the default was used.
    """
    try:
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        # Load the document via Spire.Doc.
        document = Document()
        document.LoadFromFile(file_path)

        # Clamp the requested page count to what the document actually has.
        page_count = document.GetPageCount()
        if max_page is None or max_page > page_count:
            last_page = page_count
        else:
            last_page = max_page
        image_streams = document.SaveImageToStreams(0, last_page, ImageType.Bitmap)

        # One buffer folder per document, named after the file stem.
        file_name, _ = os.path.splitext(filename)
        image_folder_path = os.path.join(folder_path, file_name)
        os.makedirs(image_folder_path, exist_ok=True)

        # Save each image stream as PNG (best suited for the OCR stage).
        for i, image in enumerate(image_streams):
            image_name = os.path.join(image_folder_path, f"{file_name}_{i+1}.png")
            with open(image_name, 'wb') as image_file:
                image_file.write(image.ToArray())

        document.Close()
    except Exception as e:
        # BUG FIX: the original printed "(unknown)" — include the filename.
        print(f"Error processing file {filename}: {e}")
39
+
40
+
41
+ if __name__ == '__main__':
42
+ # Define the folder path
43
+ folder_path = "input"
44
+ max_page=4
45
+ process(folder_path,max_page)
dump.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
def main(data):
    """Print the received payload and its Python type (debug helper)."""
    for item in ("Received data:", data, type(data)):
        print(item)
7
+
8
+ if __name__ == "__main__":
9
+ # Check if data argument is provided
10
+ if len(sys.argv) > 1:
11
+ data = sys.argv[1]
12
+ main(data)
13
+ else:
14
+ print("No data provided.")
english_g2/english_g2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2272681d9d67a04e2dff396b6e95077bc19001f8f6d3593c307b9852e1c29e8
3
+ size 15143997
inputPDFToOutputOCR.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import abc_1
3
+ import shutil
4
+
5
def get_subfolder_names(folder_path):
    """Return the names (not full paths) of the immediate sub-folders of
    ``folder_path``; an empty list when the folder does not exist."""
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Error: Folder not found: {folder_path}")
        return []
    return [entry for entry in entries
            if os.path.isdir(os.path.join(folder_path, entry))]
12
+
13
+
14
def create_folder(folder_path):
    """Create ``folder_path`` (and any missing parents) if needed.

    Uses ``exist_ok=True`` instead of the original's separate existence
    check, removing the check-then-create race. Errors are reported, not
    raised, matching the original contract.
    """
    try:
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating folder {folder_path}: {e}")
21
+
22
+
23
def move_file(source_path, destination_path):
    """Move a file from ``source_path`` to ``destination_path``.

    BUG FIX: uses ``shutil.move`` instead of ``os.rename`` so moves work
    across filesystems/drives (``os.rename`` raises OSError in that
    case). Errors are still printed rather than raised.
    """
    try:
        shutil.move(source_path, destination_path)
    except OSError as e:
        print(f"Error moving file {source_path} to {destination_path}: {e}")
29
+
30
+
31
def process_file(folder_path, name):
    """OCR one buffer sub-folder, pick a category, move the source file.

    NOTE(review): depends on the module-level globals ``folder_output``
    and ``compiled_keywords`` being set before this runs (they are
    assigned in the ``__main__`` block) — confirm when reusing.
    """
    text = abc_1.use_ocr(os.path.join(folder_path, name))
    category = abc_1.categorize_text_chunk(text, compiled_keywords)

    category_folder = os.path.join(folder_output, category)
    create_folder(category_folder)

    has_pdf, has_docx = check_file_existence(folder_path, name)
    for extension, present in (('.pdf', has_pdf), ('.docx', has_docx)):
        if not present:
            continue
        source_file = os.path.join(folder_path, name + extension)
        destination_file = os.path.join(category_folder, name + extension)
        move_file(source_file, destination_file)
        print(f"File '{name}' categorized as '{category}' and moved to '{category_folder}'.")
49
+
50
+
51
def check_file_existence(folder_path, filename):
    """Report whether ``folder_path`` contains ``filename``.pdf and/or
    ``filename``.docx.

    Returns:
        tuple(bool, bool): (has_pdf, has_docx).
    """
    has_pdf = False
    has_docx = False

    for entry in os.listdir(folder_path):
        stem, ext = os.path.splitext(entry)
        if stem != filename:
            continue
        if ext == '.pdf':
            has_pdf = True
        elif ext == '.docx':
            has_docx = True

    return has_pdf, has_docx
64
+
65
def runOCR(subfolder_names):
    """OCR and categorize each buffer sub-folder, then delete the buffer.

    Args:
        subfolder_names: folder names inside the module-level
            ``folder_path``, one buffer folder per source document.

    Builds paths with ``os.path.join`` instead of manual '/'-string
    concatenation so separators are handled consistently.
    """
    for name in subfolder_names:
        process_file(folder_path, name)
        buffer_dir = os.path.join(folder_path, name)
        if os.path.exists(buffer_dir):  # buffer folder delete
            shutil.rmtree(buffer_dir)
70
+
71
+
72
+ if __name__ == '__main__':
73
+ categories_keywords_dict = {
74
+ 'AI': ['Artificial', 'Intelligence'],
75
+ 'Automata': ['finite', 'state', 'machines'],
76
+ 'DT': ['game', 'theory']
77
+ }
78
+
79
+ folder_path = 'input' #output folder ni pdftoimage toh
80
+ folder_output = 'output' # Fixed typo
81
+ compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)
82
+
83
+ subfolder_names = get_subfolder_names(folder_path)
84
+ runOCR(subfolder_names)
85
+
86
+
ocr.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import easyocr
2
+ import cv2
3
+ import os
4
+
5
+
6
def extract_text_from_image(image_path, language='en'):
    """
    Extracts text from an image using EasyOCR.

    Args:
        image_path (str): Path to the image file.
        language (str, optional): Language(s) to be recognized. Defaults to 'en' (English).

    Returns:
        list: List of recognized text strings.

    PERF FIX: the original built a new ``easyocr.Reader`` (which loads
    model weights) on every call; readers are now cached per language on
    the function object. The custom detector path also used a
    Windows-only backslash literal — built with ``os.path.join`` instead.
    """
    cache = getattr(extract_text_from_image, '_readers', None)
    if cache is None:
        cache = {}
        extract_text_from_image._readers = cache

    reader = cache.get(language)
    if reader is None:
        reader = easyocr.Reader([language])
        reader.detector = reader.initDetector(os.path.join('best', 'BEST.pth'))
        cache[language] = reader

    image = cv2.imread(image_path)
    result = reader.readtext(image, detail=0)  # detail=0 -> plain strings only

    return result
25
+
26
+
27
+ if __name__ == '__main__':
28
+ # Define the folder path containing images
29
+ folder_path = "inference_results\Anil Maheshwari - Data analytics-McGraw-Hill Education (2017)"
30
+
31
+ # Create an empty string to store all concatenated text
32
+ all_extracted_text = ""
33
+
34
+ # Loop through all files in the folder
35
+ for filename in os.listdir(folder_path):
36
+ if filename.endswith(".jpg") or filename.endswith(".png"):
37
+ image_path = os.path.join(folder_path, filename)
38
+
39
+ # Extract text for current image
40
+ extracted_text = extract_text_from_image(image_path)
41
+
42
+ # Concatenate extracted text with a newline character
43
+ all_extracted_text += "\n".join(extracted_text) + "\n\n" # Add double newlines for separation
44
+
output/AI/ai.pdf ADDED
Binary file (74.4 kB). View file
 
output/Automata/fsm.pdf ADDED
Binary file (73.3 kB). View file
 
output/DT/gt.pdf ADDED
Binary file (75.3 kB). View file
 
output/Uncategorized/sjhkdf.docx ADDED
Binary file (13.3 kB). View file
 
pdftoimage.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pdf2image import convert_from_path
3
+
4
+
5
def convert_pdf_to_images(pdf_path, output_format="png", max_pages=None):
    """Render one PDF into page images inside a sibling buffer folder.

    Args:
        pdf_path (str): Path to the PDF file.
        output_format (str, optional): Image format — "png", "jpg" or "ppm"
            (default: "png").
        max_pages (int, optional): Cap on the number of pages rendered
            (default: None, all pages).

    Output files are named ``page_<n>.<format>`` inside a folder named
    after the PDF, created next to it. Conversion errors are printed,
    not raised.
    """
    try:
        pdf_name, _ = os.path.splitext(os.path.basename(pdf_path))
        pages = convert_from_path(pdf_path, fmt=output_format,
                                  first_page=1, last_page=max_pages or None)

        buffer_folder_path = os.path.join(os.path.dirname(pdf_path), pdf_name)
        os.makedirs(buffer_folder_path, exist_ok=True)

        for index, page_image in enumerate(pages, start=1):
            target = os.path.join(buffer_folder_path, f"page_{index}.{output_format}")
            page_image.save(target, output_format.upper())

    except Exception as e:
        print(f"Error converting {pdf_path}: {e}")
27
+
28
+
29
def convert_pdfs(pdf_folder_path, output_format="png", max_pages=None):
    """Convert every .pdf directly inside ``pdf_folder_path`` to images,
    one document at a time (no concurrency).

    Args:
        pdf_folder_path (str): Folder containing the PDF files.
        output_format (str, optional): "png", "jpg" or "ppm" (default "png").
        max_pages (int, optional): Per-document page cap (default: None,
            all pages).
    """
    pdf_names = [name for name in os.listdir(pdf_folder_path)
                 if name.endswith(".pdf")]
    for name in pdf_names:
        convert_pdf_to_images(os.path.join(pdf_folder_path, name),
                              output_format, max_pages)
43
+
44
+
45
+ # Example usage
46
+ #convert_pdfs("input", output_format="png", max_pages=2) # Convert PDFs to JPG, keeping only the first 2 pages
threadedABC.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import time
3
+ import threading
4
+
5
+ # Rastrigin function
6
def rastrigin_function(x):
    """Rastrigin benchmark: A*n + sum(x_i^2 - A*cos(2*pi*x_i)) with A=10.

    Global minimum is 0 at x = 0. ``x`` is a 1-D numpy array.
    """
    A = 10
    per_dim = x**2 - A * np.cos(2 * np.pi * x)
    return A * len(x) + np.sum(per_dim)
9
+
10
+ # Initialize control parameters
11
+ SN = 10000 # Number of food sources
12
+ MCN = 100000 # Maximum number of cycles
13
+ limit = 50 # Maximum number of exploitations for a solution
14
+ dimensionality = 2 # Dimensionality of the search space
15
+
16
+ # Shared variables among threads
17
+ food_sources_lock = threading.Lock()
18
+ trial_lock = threading.Lock()
19
+ cyc_lock = threading.Lock()
20
+ start_time_lock = threading.Lock()
21
+
22
+ food_sources = np.random.uniform(-5.12, 5.12, size=(SN, dimensionality)) # Initial random positions
23
+ trial = np.zeros(SN) # Initialize trial counters
24
+ cyc = 1 # Initial cycle
25
+ start_time = None # Start time
26
+
27
+ # Function for Employed Bees' Phase
28
def employed_bees_phase():
    """Employed bees: give every food source one random neighbour; keep it
    when it improves the objective, otherwise bump the source's trial
    counter.

    Mutates the module-level ``food_sources`` / ``trial`` arrays under
    their respective locks.
    """
    global food_sources, trial
    for i in range(SN):
        # Random perturbation of the current source.
        candidate = food_sources[i] + np.random.uniform(-0.5, 0.5, size=(dimensionality,))

        if rastrigin_function(candidate) < rastrigin_function(food_sources[i]):
            with food_sources_lock:
                food_sources[i] = candidate
                trial[i] = 0
        else:
            with trial_lock:
                trial[i] += 1
42
+
43
+ # Function for Onlooker Bees' Phase
44
def onlooker_bees_phase():
    """Onlooker bees: sample sources with probability proportional to a
    sigmoid of their trial counters, then attempt the same neighbourhood
    improvement as the employed phase.

    NOTE(review): weighting by the *trial* counter is unusual — a higher
    trial means more failed improvements; confirm this is the intended
    fitness proxy.
    """
    global food_sources, trial
    probabilities = 1 / (1 + np.exp(-trial))  # Use trial as a measure of fitness
    onlooker_indices = np.random.choice(SN, size=SN, p=probabilities / probabilities.sum())

    for i in onlooker_indices:
        # Random perturbation of the sampled source.
        candidate = food_sources[i] + np.random.uniform(-0.5, 0.5, size=(dimensionality,))

        if rastrigin_function(candidate) < rastrigin_function(food_sources[i]):
            with food_sources_lock:
                food_sources[i] = candidate
                trial[i] = 0
        else:
            with trial_lock:
                trial[i] += 1
61
+
62
+ # Function for Scout Bee Phase
63
def scout_bee_phase():
    """Scout bee: abandon the single most-exhausted source (trial > limit)
    and re-seed it uniformly in the [-5.12, 5.12] box, resetting its
    counter. Mutates the shared arrays under the food-sources lock."""
    global food_sources, trial
    worst = np.argmax(trial)
    if trial[worst] > limit:
        with food_sources_lock:
            food_sources[worst] = np.random.uniform(-5.12, 5.12, size=(dimensionality,))
            trial[worst] = 0
70
+
71
+ # Record start time
72
+ with start_time_lock:
73
+ start_time = time.time()
74
+
75
+ # Thread for Employed Bees' Phase
76
+ employed_thread = threading.Thread(target=employed_bees_phase)
77
+
78
+ # Thread for Onlooker Bees' Phase
79
+ onlooker_thread = threading.Thread(target=onlooker_bees_phase)
80
+
81
+ # Thread for Scout Bee Phase
82
+ scout_thread = threading.Thread(target=scout_bee_phase)
83
+
84
+ # Start all threads
85
+ employed_thread.start()
86
+ onlooker_thread.start()
87
+ scout_thread.start()
88
+
89
+ # Wait for all threads to finish
90
+ employed_thread.join()
91
+ onlooker_thread.join()
92
+ scout_thread.join()
93
+
94
+ # Record end time
95
+ end_time = time.time()
96
+
97
+ # Find the best solution
98
+ best_solution = food_sources[np.argmin([rastrigin_function(x) for x in food_sources])]
99
+
100
+ print("Best solution:", best_solution)
101
+ print("Objective function value at best solution:", rastrigin_function(best_solution))
102
+ print("Time taken:", end_time - start_time, "seconds")
tk3.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tkinter as tk
2
+ import tkinter.filedialog as filedialog
3
+ from tkinter import ttk
4
+ import os
5
+ import subprocess
6
+ import json
7
+
8
+ keyword_entries = [] # Declare globally
9
+
10
def browse_folder(): # input in abc.py
    """Ask the user for a folder, store it in ``file_path_var``, and echo
    the PDF/DOCX filenames it contains to the terminal."""
    chosen_dir = filedialog.askdirectory(
        initialdir="/",
        title="Select a Folder"
    )
    file_path_var.set(chosen_dir)

    # Only list the document types the pipeline understands.
    supported = ('.pdf', '.docx')
    for entry in os.listdir(chosen_dir):
        if os.path.splitext(entry)[1].lower() in supported:
            print(entry)
21
+
22
def generate_textboxes():
    """Rebuild the keyword Entry widgets from the current category
    dropdown selections, then add the Categorize button below them."""
    global keyword_entries

    # Snapshot (name, keyword-count) for every configured category frame.
    category_data = [
        (category_var[frame].get(), int(keyword_vars[frame].get()))
        for frame in category_frames
    ]

    # Drop widgets left over from a previous generation.
    clear_existing_textboxes()
    keyword_entries.clear()

    for category_name, num_keywords in category_data:
        label = tk.Label(root, text=f"{category_name}:")
        label.pack()
        for _ in range(num_keywords):
            entry = tk.Entry(root)
            entry.pack()
            keyword_entries.append(entry)

    # Save/categorize button goes below the freshly generated textboxes.
    save_button = tk.Button(root, text="Categorize", command=save_to_backup)
    save_button.pack()
49
+
50
+
51
def save_to_backup(): # connect to
    """Collect category -> keyword lists from the UI widgets and hand them
    to augmentA.py as a JSON command-line argument.

    NOTE(review): hardcodes the "python3" executable; consider
    ``sys.executable`` for Windows portability.
    """
    global keyword_entries

    category_data = {}

    # Walk the entries in the same order they were generated, slicing the
    # flat keyword_entries list by each category's declared count.
    cursor = 0
    for frame in category_frames:
        category_name = category_var[frame].get()
        num_keywords = int(keyword_vars[frame].get())

        entries = keyword_entries[cursor:cursor + num_keywords]
        category_data[category_name] = [entry.get() for entry in entries]

        cursor += num_keywords

    subprocess.run(["python3", "augmentA.py", json.dumps(category_data)])
69
+
70
+
71
def clear_existing_textboxes():
    """Destroy every Label/Entry directly under the root window (the
    dynamically generated keyword widgets)."""
    for widget in root.winfo_children():
        if isinstance(widget, (tk.Label, tk.Entry)):
            widget.destroy()
75
+
76
+
77
def update_category_dropdowns():
    """Recreate one (category-name entry, keyword-count combobox) frame
    per category, according to ``num_categories_var``."""
    # Throw away the previous set of frames.
    for frame in category_frames:
        frame.destroy()
    category_frames.clear()

    keyword_options = [1, 2, 3, 4, 5]
    for _ in range(num_categories_var.get()):
        frame = tk.Frame(root)
        frame.pack()
        category_frames.append(frame)

        tk.Label(frame, text="Category Name:").pack()
        category_var[frame] = tk.StringVar(frame)
        tk.Entry(frame, textvariable=category_var[frame]).pack()

        tk.Label(frame, text="Number of Keywords:").pack()
        keyword_vars[frame] = tk.IntVar(frame)
        ttk.Combobox(frame, textvariable=keyword_vars[frame],
                     values=keyword_options).pack()
99
+
100
+
101
+
102
+
103
+
104
+ # --- Main Program ---
105
+ root = tk.Tk()
106
+ root.title("BuzzMatchTester")
107
+
108
+ # UI Elements for File Input
109
+ file_frame = tk.Frame(root) # Frame to hold file path and button
110
+ file_frame.pack()
111
+
112
+ file_path_label = tk.Label(file_frame, text="File Path:")
113
+ file_path_label.pack(side='left')
114
+
115
+ file_path_var = tk.StringVar(root)
116
+ file_path_entry = tk.Entry(file_frame, textvariable=file_path_var)
117
+ file_path_entry.pack(side='left')
118
+
119
+ browse_button = tk.Button(file_frame, text="Browse Folder", command=browse_folder) # Change browse_file to browse_folder
120
+ browse_button.pack(side='left')
121
+
122
+ # Dropdown for Number of Categories
123
+ num_categories_label = tk.Label(root, text="Number of Categories:")
124
+ num_categories_label.pack()
125
+
126
+ num_categories_options = [0,1, 2, 3, 4, 5]
127
+ num_categories_var = tk.IntVar(root)
128
+ num_categories_var.set(num_categories_options[0])
129
+ num_categories_dropdown = ttk.Combobox(root, textvariable=num_categories_var,
130
+ values=num_categories_options)
131
+ num_categories_dropdown.pack()
132
+
133
+ category_frames = []
134
+ category_var = {}
135
+ keyword_vars = {}
136
+
137
+ update_category_dropdowns() # Initial setup
138
+
139
+
140
+ # Generate Button
141
+ generate_button = tk.Button(root, text="Generate Textboxes", command=generate_textboxes)
142
+ generate_button.pack()
143
+
144
+ num_categories_dropdown.bind("<<ComboboxSelected>>", lambda _: update_category_dropdowns()) # After updating dropdowns
145
+
146
+ root.mainloop()
traditionalABC.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import time
3
+
4
+ # Rastrigin function
5
def rastrigin_function(x):
    """Rastrigin benchmark (A=10): multimodal test objective with global
    minimum 0 at the origin; ``x`` is a 1-D numpy array."""
    A = 10
    return A * len(x) + np.sum(np.square(x) - A * np.cos(2 * np.pi * x))
8
+
9
+ # Initialize control parameters
10
+ SN = 10000 # Number of food sources
11
+ MCN = 100000 # Maximum number of cycles
12
+ limit = 50 # Maximum number of exploitations for a solution
13
+ dimensionality = 2 # Dimensionality of the search space
14
+
15
+ # Shared variables
16
+ food_sources = np.random.uniform(-5.12, 5.12, size=(SN, dimensionality)) # Initial random positions
17
+ trial = np.zeros(SN) # Initialize trial counters
18
+
19
+ # Main ABC loop
20
+ start_time = time.time()
21
+
22
+ for cyc in range(1, MCN + 1):
23
+ # Employed Bees' Phase
24
+ for i in range(SN):
25
+ x_hat = food_sources[i] + np.random.uniform(-0.5, 0.5, size=(dimensionality,))
26
+ if rastrigin_function(x_hat) < rastrigin_function(food_sources[i]):
27
+ food_sources[i] = x_hat
28
+ trial[i] = 0
29
+ else:
30
+ trial[i] += 1
31
+
32
+ # Onlooker Bees' Phase
33
+ probabilities = 1 / (1 + np.exp(-trial))
34
+ onlooker_indices = np.random.choice(SN, size=SN, p=probabilities / probabilities.sum())
35
+
36
+ for i in onlooker_indices:
37
+ x_hat = food_sources[i] + np.random.uniform(-0.5, 0.5, size=(dimensionality,))
38
+ if rastrigin_function(x_hat) < rastrigin_function(food_sources[i]):
39
+ food_sources[i] = x_hat
40
+ trial[i] = 0
41
+ else:
42
+ trial[i] += 1
43
+
44
+ # Scout Bee Phase
45
+ max_trial_index = np.argmax(trial)
46
+ if trial[max_trial_index] > limit:
47
+ food_sources[max_trial_index] = np.random.uniform(-5.12, 5.12, size=(dimensionality,))
48
+ trial[max_trial_index] = 0
49
+
50
+ end_time = time.time()
51
+
52
+ # Find the best solution
53
+ best_solution = food_sources[np.argmin([rastrigin_function(x) for x in food_sources])]
54
+
55
+ print("Best solution:", best_solution)
56
+ print("Objective function value at best solution:", rastrigin_function(best_solution))
57
+ print("Time taken:", end_time - start_time, "seconds")
try.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import abc_1
2
+
3
+ abc_1.convert_pages('input','png',4)