ImageDataExtractor2

Running

App Files Files Community

WebashalarForML committed about 14 hours ago

Commit

fad436e

verified ·

1 Parent(s): 5194ace

Upload 42 files

Browse files

Files changed (40) hide show

app.py +69 -137
backup/__pycache__/model.cpython-310.pyc +0 -0
backup/__pycache__/save_load.cpython-310.pyc +0 -0
backup/__pycache__/train.cpython-310.pyc +0 -0
backup/backup.py +58 -58
backup/model.py +412 -412
backup/modules/__pycache__/base.cpython-310.pyc +0 -0
backup/modules/__pycache__/evaluator.cpython-310.pyc +0 -0
backup/modules/__pycache__/layers.cpython-310.pyc +0 -0
backup/modules/__pycache__/run_evaluation.cpython-310.pyc +0 -0
backup/modules/__pycache__/span_rep.cpython-310.pyc +0 -0
backup/modules/__pycache__/token_rep.cpython-310.pyc +0 -0
backup/modules/base.py +150 -150
backup/modules/data_proc.py +73 -73
backup/modules/evaluator.py +152 -152
backup/modules/layers.py +28 -28
backup/modules/run_evaluation.py +188 -188
backup/modules/span_rep.py +369 -369
backup/modules/token_rep.py +54 -54
backup/requirements.txt +5 -5
backup/save_load.py +20 -20
backup/train.py +132 -132
core/__pycache__/base.cpython-310.pyc +0 -0
core/__pycache__/gradio_ocr.cpython-310.pyc +0 -0
core/__pycache__/ner_engine.cpython-310.pyc +0 -0
core/__pycache__/ocr_engine.cpython-310.pyc +0 -0
core/__pycache__/vlm_engine.cpython-310.pyc +0 -0
core/base.py +22 -0
core/gradio_ocr.py +50 -0
core/ner_engine.py +49 -0
core/ocr_engine.py +114 -0
core/vlm_engine.py +91 -0
requirements.txt +18 -16
static/uploads/IN_Standard-Visiting-Cards_Overview.png +0 -0
templates/index.html +236 -284
templates/result.html +326 -248
utility/__pycache__/utils.cpython-310.pyc +0 -0
utility/__pycache__/utils.cpython-312.pyc +0 -0
utility/__pycache__/utils.cpython-313.pyc +0 -0
utility/utils.py +120 -688

app.py CHANGED Viewed

@@ -1,186 +1,118 @@
-# libraries
-from flask import Flask, render_template, request, redirect, url_for, flash, session, send_from_directory
 import os
 import logging
-from utility.utils import extract_text_from_images, Data_Extractor, json_to_llm_str, process_extracted_text, process_resume_data
-from backup.backup import NER_Model
-from paddleocr import PaddleOCR
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    handlers=[
-        logging.StreamHandler()  # Remove FileHandler and log only to the console
-    ]
-)
-# Flask App
-app = Flask(__name__)
-app.secret_key = 'your_secret_key'
-app.config['UPLOAD_FOLDER'] = 'uploads/'
-app.config['RESULT_FOLDER'] = 'results/'
-UPLOAD_FOLDER = 'static/uploads/'
-RESULT_FOLDER = 'static/results/'
-os.makedirs(UPLOAD_FOLDER, exist_ok=True)
-os.makedirs(RESULT_FOLDER, exist_ok=True)
-if not os.path.exists(app.config['UPLOAD_FOLDER']):
-    os.makedirs(app.config['UPLOAD_FOLDER'])
-if not os.path.exists(app.config['RESULT_FOLDER']):
-    os.makedirs(app.config['RESULT_FOLDER'])
-# Set the PaddleOCR home directory to a writable location
-os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
-# Check if PaddleOCR home directory is writable
-if not os.path.exists('/tmp/.paddleocr'):
-    os.makedirs('/tmp/.paddleocr', exist_ok=True)
-    logging.info("Created PaddleOCR home directory.")
-else:
-    logging.info("PaddleOCR home directory exists.")
 @app.route('/')
 def index():
     uploaded_files = session.get('uploaded_files', [])
-    logging.info(f"Accessed index page, uploaded files: {uploaded_files}")
     return render_template('index.html', uploaded_files=uploaded_files)
 @app.route('/upload', methods=['POST'])
 def upload_file():
     if 'files' not in request.files:
         flash('No file part')
-        logging.warning("No file part found in the request")
         return redirect(request.url)
     files = request.files.getlist('files')
     if not files or all(file.filename == '' for file in files):
         flash('No selected files')
-        logging.warning("No files selected for upload")
         return redirect(request.url)
-    uploaded_files = session.get('uploaded_files', [])
     for file in files:
         if file:
             filename = file.filename
             file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
             file.save(file_path)
-            print(f"file path --->{file_path}")
             uploaded_files.append(filename)
-            logging.info(f"Uploaded file: {filename} at {file_path}")
     session['uploaded_files'] = uploaded_files
-    flash('Files successfully uploaded')
-    logging.info(f"Files successfully uploaded: {uploaded_files}")
     return process_file()
-@app.route('/remove_file',methods=['POST'])
-def remove_file():
-    uploaded_files = session.get('uploaded_files', [])
-    if uploaded_file:
-        for filename in uploaded_files:
-            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
-            if os.path.exists(file_path):
-                os.remove(file_path)
-                logging.info(f"Removed file: {filename}")
-            else:
-                logging.warning(f"File not found for removal: {file_path}")  # More specific log
-        session.pop('uploaded_files', None)
-        flash('Files successfully removed')
-        logging.info("All uploaded files removed")
-    else:
-        flash('No file to remove.')
-        logging.warning("File not found for removal")
-    return redirect(url_for('index'))
-@app.route('/reset_upload')
-def reset_upload():
-    """Reset the uploaded file and the processed data."""
-    uploaded_files = session.get('uploaded_files', [])
-    if uploaded_file:
-        for filename in uploaded_files:
-            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
-            if os.path.exists(file_path):
-                os.remove(file_path)
-                logging.info(f"Removed file: {filename}")
-            else:
-                logging.warning(f"File not found for removal: {file_path}")  # More specific log
-        session.pop('uploaded_files', None)
-        flash('Files successfully removed')
-        logging.info("All uploaded files removed")
-    else:
-        flash('No file to remove.')
-        logging.warning("File not found for removal")
-    return redirect(url_for('index'))
-@app.route('/process', methods=['GET','POST'])
 def process_file():
     uploaded_files = session.get('uploaded_files', [])
     if not uploaded_files:
         flash('No files selected for processing')
-        logging.warning("No files selected for processing")
         return redirect(url_for('index'))
-    file_paths = [os.path.join(app.config['UPLOAD_FOLDER'], filename) for filename in uploaded_files]
-    logging.info(f"Processing files: {file_paths}")
-    extracted_text = {}
-    processed_Img = {}
     try:
-        extracted_text, processed_Img = extract_text_from_images(file_paths)
-        logging.info(f"Extracted text: {extracted_text}")
-        logging.info(f"Processed images: {processed_Img}")
-        llmText = json_to_llm_str(extracted_text)
-        logging.info(f"LLM text: {llmText}")
-        LLMdata = Data_Extractor(llmText)
-        print("llm data--------->",llmText)
-        logging.info(f"LLM data: {LLMdata}")
     except Exception as e:
-        logging.error(f"Error during LLM processing: {e}")
-        logging.info("Running backup model...")
-        LLMdata = {}
-        extracted_text, processed_Img = extract_text_from_images(file_paths)
-        logging.info(f"Extracted text(Backup): {extracted_text}")
-        logging.info(f"Processed images(Backup): {processed_Img}")
-        if extracted_text:
-            text = json_to_llm_str(extracted_text)
-            LLMdata = NER_Model(text)
-            logging.info(f"NER model data: {LLMdata}")
-        else:
-            logging.warning("No extracted text available for backup model")
-    cont_data = process_extracted_text(extracted_text)
-    logging.info(f"Contextual data: {cont_data}")
-    processed_data = process_resume_data(LLMdata, cont_data, extracted_text)
-    logging.info(f"Processed data: {processed_data}")
-    session['processed_data'] = processed_data
-    session['processed_Img'] = processed_Img
-    flash('Data processed and analyzed successfully')
-    logging.info("Data processed and analyzed successfully")
-    return redirect(url_for('result'))
 @app.route('/result')
 def result():
-    processed_data = session.get('processed_data', {})
-    processed_Img = session.get('processed_Img', {})
-    logging.info(f"Displaying results: Data - {processed_data}, Images - {processed_Img}")
-    return render_template('result.html', data=processed_data, Img=processed_Img)
-@app.route('/uploads/<filename>')
-def uploaded_file(filename):
-    logging.info(f"Serving file: {filename}")
-    return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
 if __name__ == '__main__':
-    logging.info("Starting Flask app")
-    app.run(debug=True)

 import os
 import logging
+from flask import Flask, render_template, request, redirect, url_for, flash, session, send_from_directory
+from utility.utils import process_image_pipeline
+from dotenv import load_dotenv
+# Load environment variables (e.g. SECRET_KEY, PORT) from a local .env file
+load_dotenv()
+# Configure logging: console only, INFO and above
+logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
+app = Flask(__name__)
+# The session cookie is signed with this key. A hard-coded fallback that is
+# visible in the repository would let anyone forge a session (including the
+# file names that reset_upload deletes), so fall back to a random key instead.
+# NOTE: a random key is per-process; set SECRET_KEY explicitly when running
+# several workers or when sessions must survive a restart.
+_secret_key = os.getenv('SECRET_KEY')
+if not _secret_key:
+    logging.warning("Core: SECRET_KEY is not set; using a random per-process key")
+    _secret_key = os.urandom(32).hex()
+app.secret_key = _secret_key
+app.config['UPLOAD_FOLDER'] = 'static/uploads/'
+app.config['RESULT_FOLDER'] = 'static/results/'
+# Make sure both working directories exist before the first request
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
+@app.template_filter('basename')
+def basename_filter(path):
+    """Jinja filter: return the final component of *path*."""
+    _, tail = os.path.split(path)
+    return tail
 @app.route('/')
 def index():
+    """Render the upload page with the file names remembered in the session."""
+    return render_template('index.html', uploaded_files=session.get('uploaded_files', []))
 @app.route('/upload', methods=['POST'])
 def upload_file():
+    """Save the uploaded files and hand them straight to the processing step.
+
+    Reads the multipart field ``files``, stores every usable file in
+    ``UPLOAD_FOLDER``, records the stored names in the session under
+    ``uploaded_files`` and returns the response of ``process_file()``.
+    When nothing usable was sent, flashes a message and redirects to the
+    index page.
+    """
+    logging.info("Request: /upload received")
     if 'files' not in request.files:
+        logging.warning("Upload: No file part in request")
         flash('No file part')
+        # This route only accepts POST, so redirecting to request.url would
+        # make the browser GET /upload and receive a 405.
+        return redirect(url_for('index'))
     files = request.files.getlist('files')
     if not files or all(file.filename == '' for file in files):
+        logging.warning("Upload: No files selected")
         flash('No selected files')
+        return redirect(url_for('index'))
+    uploaded_files = []
     for file in files:
+        if not file:
+            continue
+        # The client controls the file name: keep only its last path component
+        # (for both '/' and '\\' separators) so a name such as '../../app.py'
+        # cannot be written outside UPLOAD_FOLDER.
+        filename = os.path.basename(file.filename.replace('\\', '/'))
+        if filename in ('', '.', '..'):
+            logging.warning("Upload: Skipping file with unusable name")
+            continue
+        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        file.save(file_path)
+        uploaded_files.append(filename)
+        logging.info(f"Upload: Successfully saved {filename}")
     session['uploaded_files'] = uploaded_files
     return process_file()
+@app.route('/process', methods=['GET', 'POST'])
 def process_file():
+    """Run the image pipeline on the files listed in the session.
+
+    On success the pipeline output and a name -> path mapping of the uploaded
+    images are stored in the session (``processed_data`` / ``processed_Img``)
+    and the client is redirected to the result page. On any failure the error
+    is logged and flashed, and the client is sent back to the index page.
+    """
+    logging.info("Request: /process started")
+    names = session.get('uploaded_files', [])
+    if not names:
+        logging.warning("Process: No files in session")
         flash('No files selected for processing')
         return redirect(url_for('index'))
+    upload_dir = app.config['UPLOAD_FOLDER']
+    file_paths = [os.path.join(upload_dir, name) for name in names]
     try:
+        logging.info(f"Process: Sending {len(file_paths)} files to pipeline")
+        pipeline_output = process_image_pipeline(file_paths)
+        # result.html expects a mapping of file name -> stored path
+        image_map = {}
+        for name in names:
+            image_map[name] = os.path.join(upload_dir, name)
+        session['processed_data'] = pipeline_output
+        session['processed_Img'] = image_map
+        logging.info("Process: Pipeline completed successfully")
+        flash('Data processed successfully')
+        return redirect(url_for('result'))
     except Exception as e:
+        logging.exception(f"Process: Critical failure: {e}")
+        flash(f'Processing error: {str(e)}')
+        return redirect(url_for('index'))
 @app.route('/result')
 def result():
+    """Show the processed data, or go back to the index page when there is none."""
+    extracted = session.get('processed_data', {})
+    images = session.get('processed_Img', {})
+    if extracted:
+        return render_template('result.html', data=extracted, Img=images)
+    return redirect(url_for('index'))
+@app.route('/reset_upload')
+def reset_upload():
+    """Delete the session's uploaded files and clear all session state.
+
+    Removes every file listed under ``uploaded_files`` from ``UPLOAD_FOLDER``,
+    drops ``uploaded_files``, ``processed_data`` and ``processed_Img`` from
+    the session, flashes a confirmation and redirects to the index page.
+    A file that cannot be deleted is logged and skipped so the session is
+    always reset.
+    """
+    uploaded_files = session.get('uploaded_files', [])
+    for f in uploaded_files:
+        path = os.path.join(app.config['UPLOAD_FOLDER'], f)
+        # Attempt the removal directly instead of checking exists() first:
+        # the file may vanish between the check and the delete.
+        try:
+            os.remove(path)
+        except FileNotFoundError:
+            pass
+        except OSError as e:
+            logging.warning(f"Reset: Could not remove {path}: {e}")
+    session.pop('uploaded_files', None)
+    session.pop('processed_data', None)
+    session.pop('processed_Img', None)
+    flash('System reset successful.')
+    return redirect(url_for('index'))
 if __name__ == '__main__':
+    from utility.utils import get_ocr, get_ner
+    logging.info("Core: Pre-initializing engines (this may take a minute)...")
+    # Trigger lazy load at startup to avoid reloader issues and request timeouts
+    try:
+        get_ocr()
+        get_ner()
+        logging.info("Core: Engines pre-initialized successfully.")
+    except Exception as e:
+        # Keep serving even if warm-up fails; log the full traceback.
+        logging.exception(f"Core: Failed to pre-initialize engines: {e}")
+    # The Werkzeug debugger lets a client execute arbitrary code, so debug mode
+    # is opt-in through FLASK_DEBUG instead of being hard-coded on.
+    debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
+    app.run(debug=debug_enabled, use_reloader=False, port=int(os.getenv('PORT', 5000)))

backup/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (9.97 kB). View file

backup/__pycache__/save_load.cpython-310.pyc ADDED Viewed

Binary file (765 Bytes). View file

backup/__pycache__/train.cpython-310.pyc ADDED Viewed

Binary file (2.95 kB). View file

backup/backup.py CHANGED Viewed

@@ -1,59 +1,59 @@
-from .model import GLiNER
-# Initialize GLiNER with the base model
-model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
-# Sample text for entity prediction
-text = """
- lenskart m: (0)9428002330 Lenskart Store,Surat m: (0)9723817060) e:lenskartsurat@gmail.com Store Address UG-4.Ascon City.Opp.Maheshwari Bhavan,Citylight,Surat-395007"""
-def NER_Model(text):
-    labels = ["Person", "Mail", "Number", "Address", "Organization","Designation","Link"]
-    # Perform entity prediction
-    entities = model.predict_entities(text, labels, threshold=0.5)
-    # Initialize the processed data dictionary
-    processed_data = {
-            "Name": [],
-            "Contact": [],
-            "Designation": [],
-            "Address": [],
-            "Link": [],
-            "Company": [],
-            "Email": [],
-            "extracted_text": "",
-            }
-    for entity in entities:
-        print(entity["text"], "=>", entity["label"])
-        #loading the data into json
-        if entity["label"]==labels[0]:
-            processed_data['Name'].extend([entity["text"]])
-        if entity["label"]==labels[1]:
-            processed_data['Email'].extend([entity["text"]])
-        if entity["label"]==labels[2]:
-            processed_data['Contact'].extend([entity["text"]])
-        if entity["label"]==labels[3]:
-            processed_data['Address'].extend([entity["text"]])
-        if entity["label"]==labels[4]:
-            processed_data['Company'].extend([entity["text"]])
-        if entity["label"]==labels[5]:
-            processed_data['Designation'].extend([entity["text"]])
-        if entity["label"]==labels[6]:
-            processed_data['Link'].extend([entity["text"]])
-    processed_data['Address']=[', '.join(processed_data['Address'])]
-    processed_data['extracted_text']=[text]
     return processed_data

+from .model import GLiNER
+# Initialize GLiNER with the base model
+# The model is created once, when this module is imported, and the same
+# instance is reused by every NER_Model() call.
+# NOTE(review): presumably this fetches the checkpoint from the Hugging Face
+# Hub on first use, which would make the import slow -- confirm.
+model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
+# Sample text for entity prediction
+# Nothing in this module reads this value: NER_Model's `text` parameter
+# shadows it inside the function.
+text = """
+ lenskart m: (0)9428002330 Lenskart Store,Surat m: (0)9723817060) e:lenskartsurat@gmail.com Store Address UG-4.Ascon City.Opp.Maheshwari Bhavan,Citylight,Surat-395007"""
+def NER_Model(text):
+    """Extract contact-card entities from *text* with the module-level GLiNER model.
+
+    Returns a dict with the list-valued keys ``Name``, ``Contact``,
+    ``Designation``, ``Address``, ``Link``, ``Company`` and ``Email``.
+    ``Address`` always holds a single string (all address spans joined with
+    ", ") and ``extracted_text`` holds the input text wrapped in a list.
+    """
+    labels = ["Person", "Mail", "Number", "Address", "Organization","Designation","Link"]
+    # Output field that each model label feeds, in the same order as `labels`
+    target_fields = ["Name", "Email", "Contact", "Address", "Company", "Designation", "Link"]
+    field_for_label = dict(zip(labels, target_fields))
+    # Perform entity prediction
+    entities = model.predict_entities(text, labels, threshold=0.3)
+    # Initialize the processed data dictionary
+    processed_data = {
+            "Name": [],
+            "Contact": [],
+            "Designation": [],
+            "Address": [],
+            "Link": [],
+            "Company": [],
+            "Email": [],
+            "extracted_text": "",
+            }
+    for entity in entities:
+        print(entity["text"], "=>", entity["label"])
+        # Route the span to its output field; unknown labels are ignored
+        field = field_for_label.get(entity["label"])
+        if field is not None:
+            processed_data[field].append(entity["text"])
+    # Collapse all address fragments into one comma-separated entry
+    processed_data['Address'] = [', '.join(processed_data['Address'])]
+    processed_data['extracted_text'] = [text]
     return processed_data

backup/model.py CHANGED Viewed

@@ -1,412 +1,412 @@
-import argparse
-import json
-from pathlib import Path
-import re
-from typing import Dict, Optional, Union
-import torch
-import torch.nn.functional as F
-from .modules.layers import LstmSeq2SeqEncoder
-from .modules.base import InstructBase
-from .modules.evaluator import Evaluator, greedy_search
-from .modules.span_rep import SpanRepLayer
-from .modules.token_rep import TokenRepLayer
-from torch import nn
-from torch.nn.utils.rnn import pad_sequence
-from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
-from huggingface_hub.utils import HfHubHTTPError
-class GLiNER(InstructBase, PyTorchModelHubMixin):
-    def __init__(self, config):
-        super().__init__(config)
-        self.config = config
-        # [ENT] token
-        self.entity_token = "<<ENT>>"
-        self.sep_token = "<<SEP>>"
-        # usually a pretrained bidirectional transformer, returns first subtoken representation
-        self.token_rep_layer = TokenRepLayer(model_name=config.model_name, fine_tune=config.fine_tune,
-                                             subtoken_pooling=config.subtoken_pooling, hidden_size=config.hidden_size,
-                                             add_tokens=[self.entity_token, self.sep_token])
-        # hierarchical representation of tokens
-        self.rnn = LstmSeq2SeqEncoder(
-            input_size=config.hidden_size,
-            hidden_size=config.hidden_size // 2,
-            num_layers=1,
-            bidirectional=True,
-        )
-        # span representation
-        self.span_rep_layer = SpanRepLayer(
-            span_mode=config.span_mode,
-            hidden_size=config.hidden_size,
-            max_width=config.max_width,
-            dropout=config.dropout,
-        )
-        # prompt representation (FFN)
-        self.prompt_rep_layer = nn.Sequential(
-            nn.Linear(config.hidden_size, config.hidden_size * 4),
-            nn.Dropout(config.dropout),
-            nn.ReLU(),
-            nn.Linear(config.hidden_size * 4, config.hidden_size)
-        )
-    def compute_score_train(self, x):
-        span_idx = x['span_idx'] * x['span_mask'].unsqueeze(-1)
-        new_length = x['seq_length'].clone()
-        new_tokens = []
-        all_len_prompt = []
-        num_classes_all = []
-        # add prompt to the tokens
-        for i in range(len(x['tokens'])):
-            all_types_i = list(x['classes_to_id'][i].keys())
-            # multiple entity types in all_types. Prompt is appended at the start of tokens
-            entity_prompt = []
-            num_classes_all.append(len(all_types_i))
-            # add enity types to prompt
-            for entity_type in all_types_i:
-                entity_prompt.append(self.entity_token)  # [ENT] token
-                entity_prompt.append(entity_type)  # entity type
-            entity_prompt.append(self.sep_token)  # [SEP] token
-            # prompt format:
-            # [ENT] entity_type [ENT] entity_type ... [ENT] entity_type [SEP]
-            # add prompt to the tokens
-            tokens_p = entity_prompt + x['tokens'][i]
-            # input format:
-            # [ENT] entity_type_1 [ENT] entity_type_2 ... [ENT] entity_type_m [SEP] token_1 token_2 ... token_n
-            # update length of the sequence (add prompt length to the original length)
-            new_length[i] = new_length[i] + len(entity_prompt)
-            # update tokens
-            new_tokens.append(tokens_p)
-            # store prompt length
-            all_len_prompt.append(len(entity_prompt))
-        # create a mask using num_classes_all (0, if it exceeds the number of classes, 1 otherwise)
-        max_num_classes = max(num_classes_all)
-        entity_type_mask = torch.arange(max_num_classes).unsqueeze(0).expand(len(num_classes_all), -1).to(
-            x['span_mask'].device)
-        entity_type_mask = entity_type_mask < torch.tensor(num_classes_all).unsqueeze(-1).to(
-            x['span_mask'].device)  # [batch_size, max_num_classes]
-        # compute all token representations
-        bert_output = self.token_rep_layer(new_tokens, new_length)
-        word_rep_w_prompt = bert_output["embeddings"]  # embeddings for all tokens (with prompt)
-        mask_w_prompt = bert_output["mask"]  # mask for all tokens (with prompt)
-        # get word representation (after [SEP]), mask (after [SEP]) and entity type representation (before [SEP])
-        word_rep = []  # word representation (after [SEP])
-        mask = []  # mask (after [SEP])
-        entity_type_rep = []  # entity type representation (before [SEP])
-        for i in range(len(x['tokens'])):
-            prompt_entity_length = all_len_prompt[i]  # length of prompt for this example
-            # get word representation (after [SEP])
-            word_rep.append(word_rep_w_prompt[i, prompt_entity_length:prompt_entity_length + x['seq_length'][i]])
-            # get mask (after [SEP])
-            mask.append(mask_w_prompt[i, prompt_entity_length:prompt_entity_length + x['seq_length'][i]])
-            # get entity type representation (before [SEP])
-            entity_rep = word_rep_w_prompt[i, :prompt_entity_length - 1]  # remove [SEP]
-            entity_rep = entity_rep[0::2]  # it means that we take every second element starting from the second one
-            entity_type_rep.append(entity_rep)
-        # padding for word_rep, mask and entity_type_rep
-        word_rep = pad_sequence(word_rep, batch_first=True)  # [batch_size, seq_len, hidden_size]
-        mask = pad_sequence(mask, batch_first=True)  # [batch_size, seq_len]
-        entity_type_rep = pad_sequence(entity_type_rep, batch_first=True)  # [batch_size, len_types, hidden_size]
-        # compute span representation
-        word_rep = self.rnn(word_rep, mask)
-        span_rep = self.span_rep_layer(word_rep, span_idx)
-        # compute final entity type representation (FFN)
-        entity_type_rep = self.prompt_rep_layer(entity_type_rep)  # (batch_size, len_types, hidden_size)
-        num_classes = entity_type_rep.shape[1]  # number of entity types
-        # similarity score
-        scores = torch.einsum('BLKD,BCD->BLKC', span_rep, entity_type_rep)
-        return scores, num_classes, entity_type_mask
-    def forward(self, x):
-        # compute span representation
-        scores, num_classes, entity_type_mask = self.compute_score_train(x)
-        batch_size = scores.shape[0]
-        # loss for filtering classifier
-        logits_label = scores.view(-1, num_classes)
-        labels = x["span_label"].view(-1)  # (batch_size * num_spans)
-        mask_label = labels != -1  # (batch_size * num_spans)
-        labels.masked_fill_(~mask_label, 0)  # Set the labels of padding tokens to 0
-        # one-hot encoding
-        labels_one_hot = torch.zeros(labels.size(0), num_classes + 1, dtype=torch.float32).to(scores.device)
-        labels_one_hot.scatter_(1, labels.unsqueeze(1), 1)  # Set the corresponding index to 1
-        labels_one_hot = labels_one_hot[:, 1:]  # Remove the first column
-        # Shape of labels_one_hot: (batch_size * num_spans, num_classes)
-        # compute loss (without reduction)
-        all_losses = F.binary_cross_entropy_with_logits(logits_label, labels_one_hot,
-                                                        reduction='none')
-        # mask loss using entity_type_mask (B, C)
-        masked_loss = all_losses.view(batch_size, -1, num_classes) * entity_type_mask.unsqueeze(1)
-        all_losses = masked_loss.view(-1, num_classes)
-        # expand mask_label to all_losses
-        mask_label = mask_label.unsqueeze(-1).expand_as(all_losses)
-        # put lower loss for in label_one_hot (2 for positive, 1 for negative)
-        weight_c = labels_one_hot + 1
-        # apply mask
-        all_losses = all_losses * mask_label.float() * weight_c
-        return all_losses.sum()
-    def compute_score_eval(self, x, device):
-        # check if classes_to_id is dict
-        assert isinstance(x['classes_to_id'], dict), "classes_to_id must be a dict"
-        span_idx = (x['span_idx'] * x['span_mask'].unsqueeze(-1)).to(device)
-        all_types = list(x['classes_to_id'].keys())
-        # multiple entity types in all_types. Prompt is appended at the start of tokens
-        entity_prompt = []
-        # add enity types to prompt
-        for entity_type in all_types:
-            entity_prompt.append(self.entity_token)
-            entity_prompt.append(entity_type)
-        entity_prompt.append(self.sep_token)
-        prompt_entity_length = len(entity_prompt)
-        # add prompt
-        tokens_p = [entity_prompt + tokens for tokens in x['tokens']]
-        seq_length_p = x['seq_length'] + prompt_entity_length
-        out = self.token_rep_layer(tokens_p, seq_length_p)
-        word_rep_w_prompt = out["embeddings"]
-        mask_w_prompt = out["mask"]
-        # remove prompt
-        word_rep = word_rep_w_prompt[:, prompt_entity_length:, :]
-        mask = mask_w_prompt[:, prompt_entity_length:]
-        # get_entity_type_rep
-        entity_type_rep = word_rep_w_prompt[:, :prompt_entity_length - 1, :]
-        # extract [ENT] tokens (which are at even positions in entity_type_rep)
-        entity_type_rep = entity_type_rep[:, 0::2, :]
-        entity_type_rep = self.prompt_rep_layer(entity_type_rep)  # (batch_size, len_types, hidden_size)
-        word_rep = self.rnn(word_rep, mask)
-        span_rep = self.span_rep_layer(word_rep, span_idx)
-        local_scores = torch.einsum('BLKD,BCD->BLKC', span_rep, entity_type_rep)
-        return local_scores
-    @torch.no_grad()
-    def predict(self, x, flat_ner=False, threshold=0.5):
-        self.eval()
-        local_scores = self.compute_score_eval(x, device=next(self.parameters()).device)
-        spans = []
-        for i, _ in enumerate(x["tokens"]):
-            local_i = local_scores[i]
-            wh_i = [i.tolist() for i in torch.where(torch.sigmoid(local_i) > threshold)]
-            span_i = []
-            for s, k, c in zip(*wh_i):
-                if s + k < len(x["tokens"][i]):
-                    span_i.append((s, s + k, x["id_to_classes"][c + 1], local_i[s, k, c]))
-            span_i = greedy_search(span_i, flat_ner)
-            spans.append(span_i)
-        return spans
-    def predict_entities(self, text, labels, flat_ner=True, threshold=0.5):
-        tokens = []
-        start_token_idx_to_text_idx = []
-        end_token_idx_to_text_idx = []
-        for match in re.finditer(r'\w+(?:[-_]\w+)*|\S', text):
-            tokens.append(match.group())
-            start_token_idx_to_text_idx.append(match.start())
-            end_token_idx_to_text_idx.append(match.end())
-        input_x = {"tokenized_text": tokens, "ner": None}
-        x = self.collate_fn([input_x], labels)
-        output = self.predict(x, flat_ner=flat_ner, threshold=threshold)
-        entities = []
-        for start_token_idx, end_token_idx, ent_type in output[0]:
-            start_text_idx = start_token_idx_to_text_idx[start_token_idx]
-            end_text_idx = end_token_idx_to_text_idx[end_token_idx]
-            entities.append({
-                "start": start_token_idx_to_text_idx[start_token_idx],
-                "end": end_token_idx_to_text_idx[end_token_idx],
-                "text": text[start_text_idx:end_text_idx],
-                "label": ent_type,
-            })
-        return entities
-    def evaluate(self, test_data, flat_ner=False, threshold=0.5, batch_size=12, entity_types=None):
-        self.eval()
-        data_loader = self.create_dataloader(test_data, batch_size=batch_size, entity_types=entity_types, shuffle=False)
-        device = next(self.parameters()).device
-        all_preds = []
-        all_trues = []
-        for x in data_loader:
-            for k, v in x.items():
-                if isinstance(v, torch.Tensor):
-                    x[k] = v.to(device)
-            batch_predictions = self.predict(x, flat_ner, threshold)
-            all_preds.extend(batch_predictions)
-            all_trues.extend(x["entities"])
-        evaluator = Evaluator(all_trues, all_preds)
-        out, f1 = evaluator.evaluate()
-        return out, f1
-    @classmethod
-    def _from_pretrained(
-        cls,
-        *,
-        model_id: str,
-        revision: Optional[str],
-        cache_dir: Optional[Union[str, Path]],
-        force_download: bool,
-        proxies: Optional[Dict],
-        resume_download: bool,
-        local_files_only: bool,
-        token: Union[str, bool, None],
-        map_location: str = "cpu",
-        strict: bool = False,
-        **model_kwargs,
-    ):
-        # 1. Backwards compatibility: Use "gliner_base.pt" and "gliner_multi.pt" with all data
-        filenames = ["gliner_base.pt", "gliner_multi.pt"]
-        for filename in filenames:
-            model_file = Path(model_id) / filename
-            if not model_file.exists():
-                try:
-                    model_file = hf_hub_download(
-                        repo_id=model_id,
-                        filename=filename,
-                        revision=revision,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        proxies=proxies,
-                        resume_download=resume_download,
-                        token=token,
-                        local_files_only=local_files_only,
-                    )
-                except HfHubHTTPError:
-                    continue
-            dict_load = torch.load(model_file, map_location=torch.device(map_location))
-            config = dict_load["config"]
-            state_dict = dict_load["model_weights"]
-            config.model_name = "microsoft/deberta-v3-base" if filename == "gliner_base.pt" else "microsoft/mdeberta-v3-base"
-            model = cls(config)
-            model.load_state_dict(state_dict, strict=strict, assign=True)
-            # Required to update flair's internals as well:
-            model.to(map_location)
-            return model
-        # 2. Newer format: Use "pytorch_model.bin" and "gliner_config.json"
-        from .train import load_config_as_namespace
-        model_file = Path(model_id) / "pytorch_model.bin"
-        if not model_file.exists():
-            model_file = hf_hub_download(
-                repo_id=model_id,
-                filename="pytorch_model.bin",
-                revision=revision,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-                token=token,
-                local_files_only=local_files_only,
-            )
-        config_file = Path(model_id) / "gliner_config.json"
-        if not config_file.exists():
-            config_file = hf_hub_download(
-                repo_id=model_id,
-                filename="gliner_config.json",
-                revision=revision,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                proxies=proxies,
-                resume_download=resume_download,
-                token=token,
-                local_files_only=local_files_only,
-            )
-        config = load_config_as_namespace(config_file)
-        model = cls(config)
-        state_dict = torch.load(model_file, map_location=torch.device(map_location))
-        model.load_state_dict(state_dict, strict=strict, assign=True)
-        model.to(map_location)
-        return model
-    def save_pretrained(
-        self,
-        save_directory: Union[str, Path],
-        *,
-        config: Optional[Union[dict, "DataclassInstance"]] = None,
-        repo_id: Optional[str] = None,
-        push_to_hub: bool = False,
-        **push_to_hub_kwargs,
-    ) -> Optional[str]:
-        """
-        Save weights in local directory.
-        Args:
-            save_directory (`str` or `Path`):
-                Path to directory in which the model weights and configuration will be saved.
-            config (`dict` or `DataclassInstance`, *optional*):
-                Model configuration specified as a key/value dictionary or a dataclass instance.
-            push_to_hub (`bool`, *optional*, defaults to `False`):
-                Whether or not to push your model to the Huggingface Hub after saving it.
-            repo_id (`str`, *optional*):
-                ID of your repository on the Hub. Used only if `push_to_hub=True`. Will default to the folder name if
-                not provided.
-            kwargs:
-                Additional key word arguments passed along to the [`~ModelHubMixin.push_to_hub`] method.
-        """
-        save_directory = Path(save_directory)
-        save_directory.mkdir(parents=True, exist_ok=True)
-        # save model weights/files
-        torch.save(self.state_dict(), save_directory / "pytorch_model.bin")
-        # save config (if provided)
-        if config is None:
-            config = self.config
-        if config is not None:
-            if isinstance(config, argparse.Namespace):
-                config = vars(config)
-            (save_directory / "gliner_config.json").write_text(json.dumps(config, indent=2))
-        # push to the Hub if required
-        if push_to_hub:
-            kwargs = push_to_hub_kwargs.copy()  # soft-copy to avoid mutating input
-            if config is not None:  # kwarg for `push_to_hub`
-                kwargs["config"] = config
-            if repo_id is None:
-                repo_id = save_directory.name  # Defaults to `save_directory` name
-            return self.push_to_hub(repo_id=repo_id, **kwargs)
-        return None
-    def to(self, device):
-        super().to(device)
-        import flair
-        flair.device = device
-        return self

+import argparse
+import json
+from pathlib import Path
+import re
+from typing import Dict, Optional, Union
+import torch
+import torch.nn.functional as F
+from .modules.layers import LstmSeq2SeqEncoder
+from .modules.base import InstructBase
+from .modules.evaluator import Evaluator, greedy_search
+from .modules.span_rep import SpanRepLayer
+from .modules.token_rep import TokenRepLayer
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence
+from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
+class GLiNER(InstructBase, PyTorchModelHubMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        # [ENT] token
+        self.entity_token = "<<ENT>>"
+        self.sep_token = "<<SEP>>"
+        # usually a pretrained bidirectional transformer, returns first subtoken representation
+        self.token_rep_layer = TokenRepLayer(model_name=config.model_name, fine_tune=config.fine_tune,
+                                             subtoken_pooling=config.subtoken_pooling, hidden_size=config.hidden_size,
+                                             add_tokens=[self.entity_token, self.sep_token])
+        # hierarchical representation of tokens
+        self.rnn = LstmSeq2SeqEncoder(
+            input_size=config.hidden_size,
+            hidden_size=config.hidden_size // 2,
+            num_layers=1,
+            bidirectional=True,
+        )
+        # span representation
+        self.span_rep_layer = SpanRepLayer(
+            span_mode=config.span_mode,
+            hidden_size=config.hidden_size,
+            max_width=config.max_width,
+            dropout=config.dropout,
+        )
+        # prompt representation (FFN)
+        self.prompt_rep_layer = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size * 4),
+            nn.Dropout(config.dropout),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size * 4, config.hidden_size)
+        )
+    def compute_score_train(self, x):
+        span_idx = x['span_idx'] * x['span_mask'].unsqueeze(-1)
+        new_length = x['seq_length'].clone()
+        new_tokens = []
+        all_len_prompt = []
+        num_classes_all = []
+        # add prompt to the tokens
+        for i in range(len(x['tokens'])):
+            all_types_i = list(x['classes_to_id'][i].keys())
+            # multiple entity types in all_types. Prompt is appended at the start of tokens
+            entity_prompt = []
+            num_classes_all.append(len(all_types_i))
+            # add enity types to prompt
+            for entity_type in all_types_i:
+                entity_prompt.append(self.entity_token)  # [ENT] token
+                entity_prompt.append(entity_type)  # entity type
+            entity_prompt.append(self.sep_token)  # [SEP] token
+            # prompt format:
+            # [ENT] entity_type [ENT] entity_type ... [ENT] entity_type [SEP]
+            # add prompt to the tokens
+            tokens_p = entity_prompt + x['tokens'][i]
+            # input format:
+            # [ENT] entity_type_1 [ENT] entity_type_2 ... [ENT] entity_type_m [SEP] token_1 token_2 ... token_n
+            # update length of the sequence (add prompt length to the original length)
+            new_length[i] = new_length[i] + len(entity_prompt)
+            # update tokens
+            new_tokens.append(tokens_p)
+            # store prompt length
+            all_len_prompt.append(len(entity_prompt))
+        # create a mask using num_classes_all (0, if it exceeds the number of classes, 1 otherwise)
+        max_num_classes = max(num_classes_all)
+        entity_type_mask = torch.arange(max_num_classes).unsqueeze(0).expand(len(num_classes_all), -1).to(
+            x['span_mask'].device)
+        entity_type_mask = entity_type_mask < torch.tensor(num_classes_all).unsqueeze(-1).to(
+            x['span_mask'].device)  # [batch_size, max_num_classes]
+        # compute all token representations
+        bert_output = self.token_rep_layer(new_tokens, new_length)
+        word_rep_w_prompt = bert_output["embeddings"]  # embeddings for all tokens (with prompt)
+        mask_w_prompt = bert_output["mask"]  # mask for all tokens (with prompt)
+        # get word representation (after [SEP]), mask (after [SEP]) and entity type representation (before [SEP])
+        word_rep = []  # word representation (after [SEP])
+        mask = []  # mask (after [SEP])
+        entity_type_rep = []  # entity type representation (before [SEP])
+        for i in range(len(x['tokens'])):
+            prompt_entity_length = all_len_prompt[i]  # length of prompt for this example
+            # get word representation (after [SEP])
+            word_rep.append(word_rep_w_prompt[i, prompt_entity_length:prompt_entity_length + x['seq_length'][i]])
+            # get mask (after [SEP])
+            mask.append(mask_w_prompt[i, prompt_entity_length:prompt_entity_length + x['seq_length'][i]])
+            # get entity type representation (before [SEP])
+            entity_rep = word_rep_w_prompt[i, :prompt_entity_length - 1]  # remove [SEP]
+            entity_rep = entity_rep[0::2]  # it means that we take every second element starting from the second one
+            entity_type_rep.append(entity_rep)
+        # padding for word_rep, mask and entity_type_rep
+        word_rep = pad_sequence(word_rep, batch_first=True)  # [batch_size, seq_len, hidden_size]
+        mask = pad_sequence(mask, batch_first=True)  # [batch_size, seq_len]
+        entity_type_rep = pad_sequence(entity_type_rep, batch_first=True)  # [batch_size, len_types, hidden_size]
+        # compute span representation
+        word_rep = self.rnn(word_rep, mask)
+        span_rep = self.span_rep_layer(word_rep, span_idx)
+        # compute final entity type representation (FFN)
+        entity_type_rep = self.prompt_rep_layer(entity_type_rep)  # (batch_size, len_types, hidden_size)
+        num_classes = entity_type_rep.shape[1]  # number of entity types
+        # similarity score
+        scores = torch.einsum('BLKD,BCD->BLKC', span_rep, entity_type_rep)
+        return scores, num_classes, entity_type_mask
+    def forward(self, x):
+        # compute span representation
+        scores, num_classes, entity_type_mask = self.compute_score_train(x)
+        batch_size = scores.shape[0]
+        # loss for filtering classifier
+        logits_label = scores.view(-1, num_classes)
+        labels = x["span_label"].view(-1)  # (batch_size * num_spans)
+        mask_label = labels != -1  # (batch_size * num_spans)
+        labels.masked_fill_(~mask_label, 0)  # Set the labels of padding tokens to 0
+        # one-hot encoding
+        labels_one_hot = torch.zeros(labels.size(0), num_classes + 1, dtype=torch.float32).to(scores.device)
+        labels_one_hot.scatter_(1, labels.unsqueeze(1), 1)  # Set the corresponding index to 1
+        labels_one_hot = labels_one_hot[:, 1:]  # Remove the first column
+        # Shape of labels_one_hot: (batch_size * num_spans, num_classes)
+        # compute loss (without reduction)
+        all_losses = F.binary_cross_entropy_with_logits(logits_label, labels_one_hot,
+                                                        reduction='none')
+        # mask loss using entity_type_mask (B, C)
+        masked_loss = all_losses.view(batch_size, -1, num_classes) * entity_type_mask.unsqueeze(1)
+        all_losses = masked_loss.view(-1, num_classes)
+        # expand mask_label to all_losses
+        mask_label = mask_label.unsqueeze(-1).expand_as(all_losses)
+        # put lower loss for in label_one_hot (2 for positive, 1 for negative)
+        weight_c = labels_one_hot + 1
+        # apply mask
+        all_losses = all_losses * mask_label.float() * weight_c
+        return all_losses.sum()
+    def compute_score_eval(self, x, device):
+        # check if classes_to_id is dict
+        assert isinstance(x['classes_to_id'], dict), "classes_to_id must be a dict"
+        span_idx = (x['span_idx'] * x['span_mask'].unsqueeze(-1)).to(device)
+        all_types = list(x['classes_to_id'].keys())
+        # multiple entity types in all_types. Prompt is appended at the start of tokens
+        entity_prompt = []
+        # add enity types to prompt
+        for entity_type in all_types:
+            entity_prompt.append(self.entity_token)
+            entity_prompt.append(entity_type)
+        entity_prompt.append(self.sep_token)
+        prompt_entity_length = len(entity_prompt)
+        # add prompt
+        tokens_p = [entity_prompt + tokens for tokens in x['tokens']]
+        seq_length_p = x['seq_length'] + prompt_entity_length
+        out = self.token_rep_layer(tokens_p, seq_length_p)
+        word_rep_w_prompt = out["embeddings"]
+        mask_w_prompt = out["mask"]
+        # remove prompt
+        word_rep = word_rep_w_prompt[:, prompt_entity_length:, :]
+        mask = mask_w_prompt[:, prompt_entity_length:]
+        # get_entity_type_rep
+        entity_type_rep = word_rep_w_prompt[:, :prompt_entity_length - 1, :]
+        # extract [ENT] tokens (which are at even positions in entity_type_rep)
+        entity_type_rep = entity_type_rep[:, 0::2, :]
+        entity_type_rep = self.prompt_rep_layer(entity_type_rep)  # (batch_size, len_types, hidden_size)
+        word_rep = self.rnn(word_rep, mask)
+        span_rep = self.span_rep_layer(word_rep, span_idx)
+        local_scores = torch.einsum('BLKD,BCD->BLKC', span_rep, entity_type_rep)
+        return local_scores
+    @torch.no_grad()
+    def predict(self, x, flat_ner=False, threshold=0.5):
+        self.eval()
+        local_scores = self.compute_score_eval(x, device=next(self.parameters()).device)
+        spans = []
+        for i, _ in enumerate(x["tokens"]):
+            local_i = local_scores[i]
+            wh_i = [i.tolist() for i in torch.where(torch.sigmoid(local_i) > threshold)]
+            span_i = []
+            for s, k, c in zip(*wh_i):
+                if s + k < len(x["tokens"][i]):
+                    span_i.append((s, s + k, x["id_to_classes"][c + 1], local_i[s, k, c]))
+            span_i = greedy_search(span_i, flat_ner)
+            spans.append(span_i)
+        return spans
+    def predict_entities(self, text, labels, flat_ner=True, threshold=0.5):
+        tokens = []
+        start_token_idx_to_text_idx = []
+        end_token_idx_to_text_idx = []
+        for match in re.finditer(r'\w+(?:[-_]\w+)*|\S', text):
+            tokens.append(match.group())
+            start_token_idx_to_text_idx.append(match.start())
+            end_token_idx_to_text_idx.append(match.end())
+        input_x = {"tokenized_text": tokens, "ner": None}
+        x = self.collate_fn([input_x], labels)
+        output = self.predict(x, flat_ner=flat_ner, threshold=threshold)
+        entities = []
+        for start_token_idx, end_token_idx, ent_type in output[0]:
+            start_text_idx = start_token_idx_to_text_idx[start_token_idx]
+            end_text_idx = end_token_idx_to_text_idx[end_token_idx]
+            entities.append({
+                "start": start_token_idx_to_text_idx[start_token_idx],
+                "end": end_token_idx_to_text_idx[end_token_idx],
+                "text": text[start_text_idx:end_text_idx],
+                "label": ent_type,
+            })
+        return entities
+    def evaluate(self, test_data, flat_ner=False, threshold=0.5, batch_size=12, entity_types=None):
+        self.eval()
+        data_loader = self.create_dataloader(test_data, batch_size=batch_size, entity_types=entity_types, shuffle=False)
+        device = next(self.parameters()).device
+        all_preds = []
+        all_trues = []
+        for x in data_loader:
+            for k, v in x.items():
+                if isinstance(v, torch.Tensor):
+                    x[k] = v.to(device)
+            batch_predictions = self.predict(x, flat_ner, threshold)
+            all_preds.extend(batch_predictions)
+            all_trues.extend(x["entities"])
+        evaluator = Evaluator(all_trues, all_preds)
+        out, f1 = evaluator.evaluate()
+        return out, f1
+    @classmethod
+    def _from_pretrained(
+        cls,
+        *,
+        model_id: str,
+        revision: Optional[str],
+        cache_dir: Optional[Union[str, Path]],
+        force_download: bool,
+        proxies: Optional[Dict],
+        resume_download: bool,
+        local_files_only: bool,
+        token: Union[str, bool, None],
+        map_location: str = "cpu",
+        strict: bool = False,
+        **model_kwargs,
+    ):
+        """
+        Load a model from a local directory or, when the files are not found locally, from the Hub.
+        Two layouts are tried in order: the legacy single-file checkpoints ``gliner_base.pt`` /
+        ``gliner_multi.pt`` (config and weights in one file), then the pair
+        ``pytorch_model.bin`` + ``gliner_config.json``.
+        Args:
+            model_id (`str`):
+                Local directory path, or repo id passed to `hf_hub_download`.
+            revision, cache_dir, force_download, proxies, resume_download, token, local_files_only:
+                Forwarded unchanged to `hf_hub_download`.
+            map_location (`str`, defaults to `"cpu"`):
+                Device the checkpoint is loaded on and the model is moved to.
+            strict (`bool`, defaults to `False`):
+                Forwarded to `load_state_dict`.
+            model_kwargs:
+                Accepted but not used by this method.
+        Returns:
+            The instantiated model with the loaded weights, moved to `map_location`.
+        """
+        # 1. Backwards compatibility: Use "gliner_base.pt" and "gliner_multi.pt" with all data
+        filenames = ["gliner_base.pt", "gliner_multi.pt"]
+        for filename in filenames:
+            model_file = Path(model_id) / filename
+            if not model_file.exists():
+                try:
+                    model_file = hf_hub_download(
+                        repo_id=model_id,
+                        filename=filename,
+                        revision=revision,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        resume_download=resume_download,
+                        token=token,
+                        local_files_only=local_files_only,
+                    )
+                except HfHubHTTPError:
+                    # this legacy file could not be fetched; try the next candidate name
+                    continue
+            # NOTE(review): the legacy checkpoint stores a config object next to the weights, so
+            # loading it relies on unpickling -- only load checkpoints from trusted sources
+            dict_load = torch.load(model_file, map_location=torch.device(map_location))
+            config = dict_load["config"]
+            state_dict = dict_load["model_weights"]
+            # the stored encoder name is overridden according to which legacy file was found
+            config.model_name = "microsoft/deberta-v3-base" if filename == "gliner_base.pt" else "microsoft/mdeberta-v3-base"
+            model = cls(config)
+            model.load_state_dict(state_dict, strict=strict, assign=True)
+            # Required to update flair's internals as well:
+            model.to(map_location)
+            return model
+        # 2. Newer format: Use "pytorch_model.bin" and "gliner_config.json"
+        # (reached only when neither legacy file was available; download errors here propagate)
+        from .train import load_config_as_namespace
+        model_file = Path(model_id) / "pytorch_model.bin"
+        if not model_file.exists():
+            model_file = hf_hub_download(
+                repo_id=model_id,
+                filename="pytorch_model.bin",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+        config_file = Path(model_id) / "gliner_config.json"
+        if not config_file.exists():
+            config_file = hf_hub_download(
+                repo_id=model_id,
+                filename="gliner_config.json",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+        config = load_config_as_namespace(config_file)
+        model = cls(config)
+        # NOTE(review): depending on the torch version's defaults, torch.load may unpickle
+        # arbitrary objects -- only load checkpoints from trusted sources
+        state_dict = torch.load(model_file, map_location=torch.device(map_location))
+        model.load_state_dict(state_dict, strict=strict, assign=True)
+        # also updates flair's device, see ``to``
+        model.to(map_location)
+        return model
+    def save_pretrained(
+        self,
+        save_directory: Union[str, Path],
+        *,
+        config: Optional[Union[dict, "DataclassInstance"]] = None,
+        repo_id: Optional[str] = None,
+        push_to_hub: bool = False,
+        **push_to_hub_kwargs,
+    ) -> Optional[str]:
+        """
+        Save weights in local directory.
+        Args:
+            save_directory (`str` or `Path`):
+                Path to directory in which the model weights and configuration will be saved.
+            config (`dict` or `DataclassInstance`, *optional*):
+                Model configuration specified as a key/value dictionary or a dataclass instance.
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Huggingface Hub after saving it.
+            repo_id (`str`, *optional*):
+                ID of your repository on the Hub. Used only if `push_to_hub=True`. Will default to the folder name if
+                not provided.
+            kwargs:
+                Additional key word arguments passed along to the [`~ModelHubMixin.push_to_hub`] method.
+        """
+        save_directory = Path(save_directory)
+        save_directory.mkdir(parents=True, exist_ok=True)
+        # save model weights/files
+        torch.save(self.state_dict(), save_directory / "pytorch_model.bin")
+        # save config (if provided)
+        if config is None:
+            config = self.config
+        if config is not None:
+            if isinstance(config, argparse.Namespace):
+                config = vars(config)
+            (save_directory / "gliner_config.json").write_text(json.dumps(config, indent=2))
+        # push to the Hub if required
+        if push_to_hub:
+            kwargs = push_to_hub_kwargs.copy()  # soft-copy to avoid mutating input
+            if config is not None:  # kwarg for `push_to_hub`
+                kwargs["config"] = config
+            if repo_id is None:
+                repo_id = save_directory.name  # Defaults to `save_directory` name
+            return self.push_to_hub(repo_id=repo_id, **kwargs)
+        return None
+    def to(self, device):
+        """
+        Move the module to ``device`` and point ``flair.device`` at the same target.
+        Args:
+            device:
+                Forwarded unchanged to the parent class's ``to`` and then stored in
+                ``flair.device``.
+        Returns:
+            ``self``.
+        """
+        super().to(device)
+        # imported here rather than at module level; flair's global device is a process-wide setting
+        import flair
+        flair.device = device
+        return self

backup/modules/__pycache__/base.cpython-310.pyc ADDED Viewed

Binary file (5.06 kB). View file

backup/modules/__pycache__/evaluator.cpython-310.pyc ADDED Viewed

Binary file (4.44 kB). View file

backup/modules/__pycache__/layers.cpython-310.pyc ADDED Viewed

Binary file (1.23 kB). View file

backup/modules/__pycache__/run_evaluation.cpython-310.pyc ADDED Viewed

Binary file (4.31 kB). View file

backup/modules/__pycache__/span_rep.cpython-310.pyc ADDED Viewed

Binary file (9.62 kB). View file

backup/modules/__pycache__/token_rep.cpython-310.pyc ADDED Viewed

Binary file (2.4 kB). View file

backup/modules/base.py CHANGED Viewed

@@ -1,150 +1,150 @@
-from collections import defaultdict
-from typing import List, Tuple, Dict
-import torch
-from torch import nn
-from torch.nn.utils.rnn import pad_sequence
-from torch.utils.data import DataLoader
-import random
-class InstructBase(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.max_width = config.max_width
-        self.base_config = config
-    def get_dict(self, spans, classes_to_id):
-        dict_tag = defaultdict(int)
-        for span in spans:
-            if span[2] in classes_to_id:
-                dict_tag[(span[0], span[1])] = classes_to_id[span[2]]
-        return dict_tag
-    def preprocess_spans(self, tokens, ner, classes_to_id):
-        max_len = self.base_config.max_len
-        if len(tokens) > max_len:
-            length = max_len
-            tokens = tokens[:max_len]
-        else:
-            length = len(tokens)
-        spans_idx = []
-        for i in range(length):
-            spans_idx.extend([(i, i + j) for j in range(self.max_width)])
-        dict_lab = self.get_dict(ner, classes_to_id) if ner else defaultdict(int)
-        # 0 for null labels
-        span_label = torch.LongTensor([dict_lab[i] for i in spans_idx])
-        spans_idx = torch.LongTensor(spans_idx)
-        # mask for valid spans
-        valid_span_mask = spans_idx[:, 1] > length - 1
-        # mask invalid positions
-        span_label = span_label.masked_fill(valid_span_mask, -1)
-        return {
-            'tokens': tokens,
-            'span_idx': spans_idx,
-            'span_label': span_label,
-            'seq_length': length,
-            'entities': ner,
-        }
-    def collate_fn(self, batch_list, entity_types=None):
-        # batch_list: list of dict containing tokens, ner
-        if entity_types is None:
-            negs = self.get_negatives(batch_list, 100)
-            class_to_ids = []
-            id_to_classes = []
-            for b in batch_list:
-                # negs = b["negative"]
-                random.shuffle(negs)
-                # negs = negs[:sampled_neg]
-                max_neg_type_ratio = int(self.base_config.max_neg_type_ratio)
-                if max_neg_type_ratio == 0:
-                    # no negatives
-                    neg_type_ratio = 0
-                else:
-                    neg_type_ratio = random.randint(0, max_neg_type_ratio)
-                if neg_type_ratio == 0:
-                    # no negatives
-                    negs_i = []
-                else:
-                    negs_i = negs[:len(b['ner']) * neg_type_ratio]
-                # this is the list of all possible entity types (positive and negative)
-                types = list(set([el[-1] for el in b['ner']] + negs_i))
-                # shuffle (every epoch)
-                random.shuffle(types)
-                if len(types) != 0:
-                    # prob of higher number shoul
-                    # random drop
-                    if self.base_config.random_drop:
-                        num_ents = random.randint(1, len(types))
-                        types = types[:num_ents]
-                # maximum number of entities types
-                types = types[:int(self.base_config.max_types)]
-                # supervised training
-                if "label" in b:
-                    types = sorted(b["label"])
-                class_to_id = {k: v for v, k in enumerate(types, start=1)}
-                id_to_class = {k: v for v, k in class_to_id.items()}
-                class_to_ids.append(class_to_id)
-                id_to_classes.append(id_to_class)
-            batch = [
-                self.preprocess_spans(b["tokenized_text"], b["ner"], class_to_ids[i]) for i, b in enumerate(batch_list)
-            ]
-        else:
-            class_to_ids = {k: v for v, k in enumerate(entity_types, start=1)}
-            id_to_classes = {k: v for v, k in class_to_ids.items()}
-            batch = [
-                self.preprocess_spans(b["tokenized_text"], b["ner"], class_to_ids) for b in batch_list
-            ]
-        span_idx = pad_sequence(
-            [b['span_idx'] for b in batch], batch_first=True, padding_value=0
-        )
-        span_label = pad_sequence(
-            [el['span_label'] for el in batch], batch_first=True, padding_value=-1
-        )
-        return {
-            'seq_length': torch.LongTensor([el['seq_length'] for el in batch]),
-            'span_idx': span_idx,
-            'tokens': [el['tokens'] for el in batch],
-            'span_mask': span_label != -1,
-            'span_label': span_label,
-            'entities': [el['entities'] for el in batch],
-            'classes_to_id': class_to_ids,
-            'id_to_classes': id_to_classes,
-        }
-    @staticmethod
-    def get_negatives(batch_list, sampled_neg=5):
-        ent_types = []
-        for b in batch_list:
-            types = set([el[-1] for el in b['ner']])
-            ent_types.extend(list(types))
-        ent_types = list(set(ent_types))
-        # sample negatives
-        random.shuffle(ent_types)
-        return ent_types[:sampled_neg]
-    def create_dataloader(self, data, entity_types=None, **kwargs):
-        return DataLoader(data, collate_fn=lambda x: self.collate_fn(x, entity_types), **kwargs)

+from collections import defaultdict
+from typing import List, Tuple, Dict
+import torch
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader
+import random
+class InstructBase(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.max_width = config.max_width
+        self.base_config = config
+    def get_dict(self, spans, classes_to_id):
+        dict_tag = defaultdict(int)
+        for span in spans:
+            if span[2] in classes_to_id:
+                dict_tag[(span[0], span[1])] = classes_to_id[span[2]]
+        return dict_tag
+    def preprocess_spans(self, tokens, ner, classes_to_id):
+        max_len = self.base_config.max_len
+        if len(tokens) > max_len:
+            length = max_len
+            tokens = tokens[:max_len]
+        else:
+            length = len(tokens)
+        spans_idx = []
+        for i in range(length):
+            spans_idx.extend([(i, i + j) for j in range(self.max_width)])
+        dict_lab = self.get_dict(ner, classes_to_id) if ner else defaultdict(int)
+        # 0 for null labels
+        span_label = torch.LongTensor([dict_lab[i] for i in spans_idx])
+        spans_idx = torch.LongTensor(spans_idx)
+        # mask for valid spans
+        valid_span_mask = spans_idx[:, 1] > length - 1
+        # mask invalid positions
+        span_label = span_label.masked_fill(valid_span_mask, -1)
+        return {
+            'tokens': tokens,
+            'span_idx': spans_idx,
+            'span_label': span_label,
+            'seq_length': length,
+            'entities': ner,
+        }
+    def collate_fn(self, batch_list, entity_types=None):
+        # batch_list: list of dict containing tokens, ner
+        if entity_types is None:
+            negs = self.get_negatives(batch_list, 100)
+            class_to_ids = []
+            id_to_classes = []
+            for b in batch_list:
+                # negs = b["negative"]
+                random.shuffle(negs)
+                # negs = negs[:sampled_neg]
+                max_neg_type_ratio = int(self.base_config.max_neg_type_ratio)
+                if max_neg_type_ratio == 0:
+                    # no negatives
+                    neg_type_ratio = 0
+                else:
+                    neg_type_ratio = random.randint(0, max_neg_type_ratio)
+                if neg_type_ratio == 0:
+                    # no negatives
+                    negs_i = []
+                else:
+                    negs_i = negs[:len(b['ner']) * neg_type_ratio]
+                # this is the list of all possible entity types (positive and negative)
+                types = list(set([el[-1] for el in b['ner']] + negs_i))
+                # shuffle (every epoch)
+                random.shuffle(types)
+                if len(types) != 0:
+                    # prob of higher number shoul
+                    # random drop
+                    if self.base_config.random_drop:
+                        num_ents = random.randint(1, len(types))
+                        types = types[:num_ents]
+                # maximum number of entities types
+                types = types[:int(self.base_config.max_types)]
+                # supervised training
+                if "label" in b:
+                    types = sorted(b["label"])
+                class_to_id = {k: v for v, k in enumerate(types, start=1)}
+                id_to_class = {k: v for v, k in class_to_id.items()}
+                class_to_ids.append(class_to_id)
+                id_to_classes.append(id_to_class)
+            batch = [
+                self.preprocess_spans(b["tokenized_text"], b["ner"], class_to_ids[i]) for i, b in enumerate(batch_list)
+            ]
+        else:
+            class_to_ids = {k: v for v, k in enumerate(entity_types, start=1)}
+            id_to_classes = {k: v for v, k in class_to_ids.items()}
+            batch = [
+                self.preprocess_spans(b["tokenized_text"], b["ner"], class_to_ids) for b in batch_list
+            ]
+        span_idx = pad_sequence(
+            [b['span_idx'] for b in batch], batch_first=True, padding_value=0
+        )
+        span_label = pad_sequence(
+            [el['span_label'] for el in batch], batch_first=True, padding_value=-1
+        )
+        return {
+            'seq_length': torch.LongTensor([el['seq_length'] for el in batch]),
+            'span_idx': span_idx,
+            'tokens': [el['tokens'] for el in batch],
+            'span_mask': span_label != -1,
+            'span_label': span_label,
+            'entities': [el['entities'] for el in batch],
+            'classes_to_id': class_to_ids,
+            'id_to_classes': id_to_classes,
+        }
+    @staticmethod
+    def get_negatives(batch_list, sampled_neg=5):
+        ent_types = []
+        for b in batch_list:
+            types = set([el[-1] for el in b['ner']])
+            ent_types.extend(list(types))
+        ent_types = list(set(ent_types))
+        # sample negatives
+        random.shuffle(ent_types)
+        return ent_types[:sampled_neg]
+    def create_dataloader(self, data, entity_types=None, **kwargs):
+        return DataLoader(data, collate_fn=lambda x: self.collate_fn(x, entity_types), **kwargs)

backup/modules/data_proc.py CHANGED Viewed

@@ -1,73 +1,73 @@
-import json
-from tqdm import tqdm
-# ast.literal_eval
-import ast, re
-path = 'train.json'
-with open(path, 'r') as f:
-    data = json.load(f)
-def tokenize_text(text):
-    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
-def extract_entity_spans(entry):
-    text = ""
-    len_start = len("What describes ")
-    len_end = len(" in the text?")
-    entity_types = []
-    entity_texts = []
-    for c in entry['conversations']:
-        if c['from'] == 'human' and c['value'].startswith('Text: '):
-            text = c['value'][len('Text: '):]
-            tokenized_text = tokenize_text(text)
-        if c['from'] == 'human' and c['value'].startswith('What describes '):
-            c_type = c['value'][len_start:-len_end]
-            c_type = c_type.replace(' ', '_')
-            entity_types.append(c_type)
-        elif c['from'] == 'gpt' and c['value'].startswith('['):
-            if c['value'] == '[]':
-                entity_types = entity_types[:-1]
-                continue
-            texts_ents = ast.literal_eval(c['value'])
-            # replace space to _ in texts_ents
-            entity_texts.extend(texts_ents)
-            num_repeat = len(texts_ents) - 1
-            entity_types.extend([entity_types[-1]] * num_repeat)
-    entity_spans = []
-    for j, entity_text in enumerate(entity_texts):
-        entity_tokens = tokenize_text(entity_text)
-        matches = []
-        for i in range(len(tokenized_text) - len(entity_tokens) + 1):
-            if " ".join(tokenized_text[i:i + len(entity_tokens)]).lower() == " ".join(entity_tokens).lower():
-                matches.append((i, i + len(entity_tokens) - 1, entity_types[j]))
-        if matches:
-            entity_spans.extend(matches)
-    return entity_spans, tokenized_text
-# Usage:
-# Replace 'entry' with the specific entry from your JSON data
-entry = data[17818]  # For example, taking the first entry
-entity_spans, tokenized_text = extract_entity_spans(entry)
-print("Entity Spans:", entity_spans)
-#print("Tokenized Text:", tokenized_text)
-# create a dict: {"tokenized_text": tokenized_text, "entity_spans": entity_spans}
-all_data = []
-for entry in tqdm(data):
-    entity_spans, tokenized_text = extract_entity_spans(entry)
-    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})
-with open('train_instruct.json', 'w') as f:
-    json.dump(all_data, f)

+import json
+from tqdm import tqdm
+# ast.literal_eval
+import ast, re
+# Source dataset; each item is later passed to extract_entity_spans, which reads its 'conversations' list.
+path = 'train.json'
+with open(path, 'r') as f:
+    data = json.load(f)
+def tokenize_text(text):
+    """Split ``text`` into word and punctuation tokens.
+
+    Runs of word characters joined by hyphens stay together as one token
+    (e.g. ``state-of-the-art``); every other non-whitespace character becomes
+    a token of its own. Whitespace is dropped.
+    """
+    token_pattern = re.compile(r'\w+(?:[-_]\w+)*|\S')
+    return token_pattern.findall(text)
+def extract_entity_spans(entry):
+    """Convert one instruction-style conversation into token-level entity spans.
+
+    ``entry['conversations']`` is read as a list of ``{'from', 'value'}``
+    messages: a human ``'Text: ...'`` message carries the passage, human
+    ``'What describes <type> in the text?'`` questions name an entity type
+    (spaces become underscores), and ``gpt`` messages starting with ``[`` are
+    parsed as Python list literals of entity strings answering the most
+    recent question.
+
+    Returns:
+        ``(entity_spans, tokenized_text)`` where ``entity_spans`` holds one
+        ``(start, end, type)`` tuple, with inclusive token indices, for every
+        case-insensitive occurrence of each entity text in the passage.
+    """
+    len_start = len("What describes ")
+    len_end = len(" in the text?")
+    # Bound up front so an entry without a 'Text: ' message yields empty
+    # results instead of raising UnboundLocalError.
+    tokenized_text = []
+    current_type = None
+    # (entity_text, entity_type) pairs are stored together so texts and types
+    # cannot drift out of alignment, e.g. after an unanswered question or an
+    # empty answer written as '[ ]'.
+    typed_entities = []
+    for c in entry['conversations']:
+        sender, value = c['from'], c['value']
+        if sender == 'human' and value.startswith('Text: '):
+            tokenized_text = tokenize_text(value[len('Text: '):])
+        if sender == 'human' and value.startswith('What describes '):
+            current_type = value[len_start:-len_end].replace(' ', '_')
+        elif sender == 'gpt' and value.startswith('['):
+            if current_type is None:
+                # an answer with no preceding question has no type to attach
+                continue
+            typed_entities.extend((text, current_type) for text in ast.literal_eval(value))
+    lowered_text = [token.lower() for token in tokenized_text]
+    entity_spans = []
+    for entity_text, entity_type in typed_entities:
+        target = [token.lower() for token in tokenize_text(entity_text)]
+        if not target:
+            # an entity with no tokens would "match" at every position
+            continue
+        width = len(target)
+        for i in range(len(lowered_text) - width + 1):
+            if lowered_text[i:i + width] == target:
+                entity_spans.append((i, i + width - 1, entity_type))
+    return entity_spans, tokenized_text
+# Usage:
+# Spot-check the conversion on a single entry; change the index to inspect another one.
+entry = data[17818]  # arbitrary sample index (requires at least 17819 entries in train.json)
+entity_spans, tokenized_text = extract_entity_spans(entry)
+print("Entity Spans:", entity_spans)
+#print("Tokenized Text:", tokenized_text)
+# Convert every entry into a dict: {"tokenized_text": tokenized_text, "ner": entity_spans}
+all_data = []
+for entry in tqdm(data):
+    entity_spans, tokenized_text = extract_entity_spans(entry)
+    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})
+# Write the converted dataset as a single JSON array.
+with open('train_instruct.json', 'w') as f:
+    json.dump(all_data, f)

backup/modules/evaluator.py CHANGED Viewed

@@ -1,152 +1,152 @@
-from collections import defaultdict
-import numpy as np
-import torch
-from seqeval.metrics.v1 import _prf_divide
-def extract_tp_actual_correct(y_true, y_pred):
-    entities_true = defaultdict(set)
-    entities_pred = defaultdict(set)
-    for type_name, (start, end), idx in y_true:
-        entities_true[type_name].add((start, end, idx))
-    for type_name, (start, end), idx in y_pred:
-        entities_pred[type_name].add((start, end, idx))
-    target_names = sorted(set(entities_true.keys()) | set(entities_pred.keys()))
-    tp_sum = np.array([], dtype=np.int32)
-    pred_sum = np.array([], dtype=np.int32)
-    true_sum = np.array([], dtype=np.int32)
-    for type_name in target_names:
-        entities_true_type = entities_true.get(type_name, set())
-        entities_pred_type = entities_pred.get(type_name, set())
-        tp_sum = np.append(tp_sum, len(entities_true_type & entities_pred_type))
-        pred_sum = np.append(pred_sum, len(entities_pred_type))
-        true_sum = np.append(true_sum, len(entities_true_type))
-    return pred_sum, tp_sum, true_sum, target_names
-def flatten_for_eval(y_true, y_pred):
-    all_true = []
-    all_pred = []
-    for i, (true, pred) in enumerate(zip(y_true, y_pred)):
-        all_true.extend([t + [i] for t in true])
-        all_pred.extend([p + [i] for p in pred])
-    return all_true, all_pred
-def compute_prf(y_true, y_pred, average='micro'):
-    y_true, y_pred = flatten_for_eval(y_true, y_pred)
-    pred_sum, tp_sum, true_sum, target_names = extract_tp_actual_correct(y_true, y_pred)
-    if average == 'micro':
-        tp_sum = np.array([tp_sum.sum()])
-        pred_sum = np.array([pred_sum.sum()])
-        true_sum = np.array([true_sum.sum()])
-    precision = _prf_divide(
-        numerator=tp_sum,
-        denominator=pred_sum,
-        metric='precision',
-        modifier='predicted',
-        average=average,
-        warn_for=('precision', 'recall', 'f-score'),
-        zero_division='warn'
-    )
-    recall = _prf_divide(
-        numerator=tp_sum,
-        denominator=true_sum,
-        metric='recall',
-        modifier='true',
-        average=average,
-        warn_for=('precision', 'recall', 'f-score'),
-        zero_division='warn'
-    )
-    denominator = precision + recall
-    denominator[denominator == 0.] = 1
-    f_score = 2 * (precision * recall) / denominator
-    return {'precision': precision[0], 'recall': recall[0], 'f_score': f_score[0]}
-class Evaluator:
-    def __init__(self, all_true, all_outs):
-        self.all_true = all_true
-        self.all_outs = all_outs
-    def get_entities_fr(self, ents):
-        all_ents = []
-        for s, e, lab in ents:
-            all_ents.append([lab, (s, e)])
-        return all_ents
-    def transform_data(self):
-        all_true_ent = []
-        all_outs_ent = []
-        for i, j in zip(self.all_true, self.all_outs):
-            e = self.get_entities_fr(i)
-            all_true_ent.append(e)
-            e = self.get_entities_fr(j)
-            all_outs_ent.append(e)
-        return all_true_ent, all_outs_ent
-    @torch.no_grad()
-    def evaluate(self):
-        all_true_typed, all_outs_typed = self.transform_data()
-        precision, recall, f1 = compute_prf(all_true_typed, all_outs_typed).values()
-        output_str = f"P: {precision:.2%}\tR: {recall:.2%}\tF1: {f1:.2%}\n"
-        return output_str, f1
-def is_nested(idx1, idx2):
-    # Return True if idx2 is nested inside idx1 or vice versa
-    return (idx1[0] <= idx2[0] and idx1[1] >= idx2[1]) or (idx2[0] <= idx1[0] and idx2[1] >= idx1[1])
-def has_overlapping(idx1, idx2):
-    overlapping = True
-    if idx1[:2] == idx2[:2]:
-        return overlapping
-    if (idx1[0] > idx2[1] or idx2[0] > idx1[1]):
-        overlapping = False
-    return overlapping
-def has_overlapping_nested(idx1, idx2):
-    # Return True if idx1 and idx2 overlap, but neither is nested inside the other
-    if idx1[:2] == idx2[:2]:
-        return True
-    if ((idx1[0] > idx2[1] or idx2[0] > idx1[1]) or is_nested(idx1, idx2)) and idx1 != idx2:
-        return False
-    else:
-        return True
-def greedy_search(spans, flat_ner=True):  # start, end, class, score
-    if flat_ner:
-        has_ov = has_overlapping
-    else:
-        has_ov = has_overlapping_nested
-    new_list = []
-    span_prob = sorted(spans, key=lambda x: -x[-1])
-    for i in range(len(spans)):
-        b = span_prob[i]
-        flag = False
-        for new in new_list:
-            if has_ov(b[:-1], new):
-                flag = True
-                break
-        if not flag:
-            new_list.append(b[:-1])
-    new_list = sorted(new_list, key=lambda x: x[0])
-    return new_list

+from collections import defaultdict
+import numpy as np
+import torch
+from seqeval.metrics.v1 import _prf_divide
+def extract_tp_actual_correct(y_true, y_pred):
+    """Count predicted, correct and gold entities for every entity type.
+
+    Both inputs are flat lists of ``[type, (start, end), sample_idx]`` items.
+    Returns ``(pred_sum, tp_sum, true_sum, target_names)`` where the three
+    arrays are aligned with ``target_names``, the sorted union of all types
+    seen in either input.
+    """
+    gold_by_type = defaultdict(set)
+    pred_by_type = defaultdict(set)
+    for bucket, items in ((gold_by_type, y_true), (pred_by_type, y_pred)):
+        for type_name, (start, end), idx in items:
+            bucket[type_name].add((start, end, idx))
+    target_names = sorted(gold_by_type.keys() | pred_by_type.keys())
+    tp_sum = np.array([], dtype=np.int32)
+    pred_sum = np.array([], dtype=np.int32)
+    true_sum = np.array([], dtype=np.int32)
+    # np.append is kept deliberately so the resulting dtypes stay as before
+    for type_name in target_names:
+        gold = gold_by_type.get(type_name, set())
+        pred = pred_by_type.get(type_name, set())
+        tp_sum = np.append(tp_sum, len(gold & pred))
+        pred_sum = np.append(pred_sum, len(pred))
+        true_sum = np.append(true_sum, len(gold))
+    return pred_sum, tp_sum, true_sum, target_names
+def flatten_for_eval(y_true, y_pred):
+    """Flatten per-sample entity lists, tagging each entity with its sample index.
+
+    Every entity (a list) gets the index of its sample appended, so equal
+    spans from different samples stay distinct. Samples are paired with
+    ``zip``, so the longer input is truncated to the shorter one.
+    """
+    all_true, all_pred = [], []
+    for sample_idx, (gold, pred) in enumerate(zip(y_true, y_pred)):
+        all_true += [entity + [sample_idx] for entity in gold]
+        all_pred += [entity + [sample_idx] for entity in pred]
+    return all_true, all_pred
+def compute_prf(y_true, y_pred, average='micro'):
+    """Compute precision, recall and F-score over per-sample entity lists.
+
+    ``y_true`` and ``y_pred`` are parallel lists (one item per sample) of
+    ``[type, (start, end)]`` entries. With ``average='micro'`` the counts are
+    pooled over all entity types before dividing.
+
+    Returns a dict with keys ``'precision'``, ``'recall'`` and ``'f_score'``.
+    NOTE(review): only element ``[0]`` of each metric array is returned, so
+    for any ``average`` other than ``'micro'`` this appears to report the
+    first entity type only - confirm that is intended.
+    """
+    flat_true, flat_pred = flatten_for_eval(y_true, y_pred)
+    pred_sum, tp_sum, true_sum, _ = extract_tp_actual_correct(flat_true, flat_pred)
+    if average == 'micro':
+        tp_sum, pred_sum, true_sum = (
+            np.array([counts.sum()]) for counts in (tp_sum, pred_sum, true_sum)
+        )
+    # arguments shared by the precision and recall divisions
+    shared = dict(
+        average=average,
+        warn_for=('precision', 'recall', 'f-score'),
+        zero_division='warn',
+    )
+    precision = _prf_divide(
+        numerator=tp_sum, denominator=pred_sum, metric='precision', modifier='predicted', **shared
+    )
+    recall = _prf_divide(
+        numerator=tp_sum, denominator=true_sum, metric='recall', modifier='true', **shared
+    )
+    # avoid 0/0 when both precision and recall are zero: the F-score is then 0
+    denominator = precision + recall
+    denominator[denominator == 0.] = 1
+    f_score = 2 * (precision * recall) / denominator
+    return {'precision': precision[0], 'recall': recall[0], 'f_score': f_score[0]}
+class Evaluator:
+    """Score predicted entity spans against gold spans.
+
+    ``all_true`` and ``all_outs`` are parallel per-sample lists of
+    ``(start, end, label)`` triples.
+    """
+    def __init__(self, all_true, all_outs):
+        self.all_true = all_true
+        self.all_outs = all_outs
+    def get_entities_fr(self, ents):
+        """Reorder ``(start, end, label)`` triples into ``[label, (start, end)]`` lists."""
+        return [[lab, (s, e)] for s, e, lab in ents]
+    def transform_data(self):
+        """Return gold and predicted entities, sample by sample, in ``[label, (start, end)]`` form."""
+        # zip pairs the samples, so both outputs are cut to the shorter input
+        paired = list(zip(self.all_true, self.all_outs))
+        all_true_ent = [self.get_entities_fr(gold) for gold, _ in paired]
+        all_outs_ent = [self.get_entities_fr(pred) for _, pred in paired]
+        return all_true_ent, all_outs_ent
+    @torch.no_grad()
+    def evaluate(self):
+        """Return a formatted ``P/R/F1`` line together with the F1 value."""
+        gold, pred = self.transform_data()
+        scores = compute_prf(gold, pred)
+        precision, recall, f1 = scores['precision'], scores['recall'], scores['f_score']
+        output_str = f"P: {precision:.2%}\tR: {recall:.2%}\tF1: {f1:.2%}\n"
+        return output_str, f1
+def is_nested(idx1, idx2):
+    """Tell whether one span lies entirely inside the other (bounds inclusive)."""
+    first_contains_second = idx1[0] <= idx2[0] and idx1[1] >= idx2[1]
+    second_contains_first = idx2[0] <= idx1[0] and idx2[1] >= idx1[1]
+    return first_contains_second or second_contains_first
+def has_overlapping(idx1, idx2):
+    """Tell whether two spans share at least one position (bounds inclusive).
+
+    Only the first two elements (start, end) of each argument are compared;
+    any trailing elements such as a label are ignored.
+    """
+    if idx1[:2] == idx2[:2]:
+        return True
+    disjoint = idx1[0] > idx2[1] or idx2[0] > idx1[1]
+    return not disjoint
+def has_overlapping_nested(idx1, idx2):
+    """Tell whether two spans conflict when nested entities are allowed.
+
+    Spans with identical (start, end) always conflict. Otherwise two spans
+    conflict only when they partially overlap: disjoint spans and spans where
+    one fully contains the other are both accepted.
+    """
+    if idx1[:2] == idx2[:2]:
+        return True
+    compatible = idx1[0] > idx2[1] or idx2[0] > idx1[1] or is_nested(idx1, idx2)
+    return not (compatible and idx1 != idx2)
+def greedy_search(spans, flat_ner=True):
+    """Greedily keep the highest-scoring spans that do not conflict.
+
+    Each span is ``(start, end, class, score)``. Spans are visited from the
+    highest to the lowest score and kept unless they conflict with a span
+    already kept: any overlap counts when ``flat_ner`` is true, only partial
+    overlap otherwise. The kept spans are returned without their score,
+    ordered by start index.
+    """
+    conflicts = has_overlapping if flat_ner else has_overlapping_nested
+    kept = []
+    for scored in sorted(spans, key=lambda x: -x[-1]):
+        candidate = scored[:-1]
+        if not any(conflicts(candidate, previous) for previous in kept):
+            kept.append(candidate)
+    kept.sort(key=lambda x: x[0])
+    return kept

backup/modules/layers.py CHANGED Viewed

@@ -1,28 +1,28 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-class LstmSeq2SeqEncoder(nn.Module):
-    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0., bidirectional=False):
-        super(LstmSeq2SeqEncoder, self).__init__()
-        self.lstm = nn.LSTM(input_size=input_size,
-                            hidden_size=hidden_size,
-                            num_layers=num_layers,
-                            dropout=dropout,
-                            bidirectional=bidirectional,
-                            batch_first=True)
-    def forward(self, x, mask, hidden=None):
-        # Packing the input sequence
-        lengths = mask.sum(dim=1).cpu()
-        packed_x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
-        # Passing packed sequence through LSTM
-        packed_output, hidden = self.lstm(packed_x, hidden)
-        # Unpacking the output sequence
-        output, _ = pad_packed_sequence(packed_output, batch_first=True)
-        return output

+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+class LstmSeq2SeqEncoder(nn.Module):
+    """Batch-first LSTM encoder that skips padded positions via sequence packing."""
+    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0., bidirectional=False):
+        super().__init__()
+        self.lstm = nn.LSTM(
+            input_size=input_size,
+            hidden_size=hidden_size,
+            num_layers=num_layers,
+            dropout=dropout,
+            bidirectional=bidirectional,
+            batch_first=True,
+        )
+    def forward(self, x, mask, hidden=None):
+        """Encode ``x`` and return the padded LSTM outputs.
+
+        ``mask`` is summed over dim 1 to obtain each sequence's length; the
+        lengths are moved to the CPU before packing. The result is padded
+        back only up to the longest sequence in the batch.
+        """
+        seq_lengths = mask.sum(dim=1).cpu()
+        packed_in = pack_padded_sequence(x, seq_lengths, batch_first=True, enforce_sorted=False)
+        # the final hidden state is not needed by callers and is discarded
+        packed_out, _ = self.lstm(packed_in, hidden)
+        padded_out, _ = pad_packed_sequence(packed_out, batch_first=True)
+        return padded_out

backup/modules/run_evaluation.py CHANGED Viewed

@@ -1,188 +1,188 @@
-import glob
-import json
-import os
-import os
-import torch
-from tqdm import tqdm
-import random
-def open_content(path):
-    paths = glob.glob(os.path.join(path, "*.json"))
-    train, dev, test, labels = None, None, None, None
-    for p in paths:
-        if "train" in p:
-            with open(p, "r") as f:
-                train = json.load(f)
-        elif "dev" in p:
-            with open(p, "r") as f:
-                dev = json.load(f)
-        elif "test" in p:
-            with open(p, "r") as f:
-                test = json.load(f)
-        elif "labels" in p:
-            with open(p, "r") as f:
-                labels = json.load(f)
-    return train, dev, test, labels
-def process(data):
-    words = data['sentence'].split()
-    entities = []  # List of entities (start, end, type)
-    for entity in data['entities']:
-        start_char, end_char = entity['pos']
-        # Initialize variables to keep track of word positions
-        start_word = None
-        end_word = None
-        # Iterate through words and find the word positions
-        char_count = 0
-        for i, word in enumerate(words):
-            word_length = len(word)
-            if char_count == start_char:
-                start_word = i
-            if char_count + word_length == end_char:
-                end_word = i
-                break
-            char_count += word_length + 1  # Add 1 for the space
-        # Append the word positions to the list
-        entities.append((start_word, end_word, entity['type']))
-    # Create a list of word positions for each entity
-    sample = {
-        "tokenized_text": words,
-        "ner": entities
-    }
-    return sample
-# create dataset
-def create_dataset(path):
-    train, dev, test, labels = open_content(path)
-    train_dataset = []
-    dev_dataset = []
-    test_dataset = []
-    for data in train:
-        train_dataset.append(process(data))
-    for data in dev:
-        dev_dataset.append(process(data))
-    for data in test:
-        test_dataset.append(process(data))
-    return train_dataset, dev_dataset, test_dataset, labels
-@torch.no_grad()
-def get_for_one_path(path, model):
-    # load the dataset
-    _, _, test_dataset, entity_types = create_dataset(path)
-    data_name = path.split("/")[-1]  # get the name of the dataset
-    # check if the dataset is flat_ner
-    flat_ner = True
-    if any([i in data_name for i in ["ACE", "GENIA", "Corpus"]]):
-        flat_ner = False
-    # evaluate the model
-    results, f1 = model.evaluate(test_dataset, flat_ner=flat_ner, threshold=0.5, batch_size=12,
-                                 entity_types=entity_types)
-    return data_name, results, f1
-def get_for_all_path(model, steps, log_dir, data_paths):
-    all_paths = glob.glob(f"{data_paths}/*")
-    all_paths = sorted(all_paths)
-    # move the model to the device
-    device = next(model.parameters()).device
-    model.to(device)
-    # set the model to eval mode
-    model.eval()
-    # log the results
-    save_path = os.path.join(log_dir, "results.txt")
-    with open(save_path, "a") as f:
-        f.write("##############################################\n")
-        # write step
-        f.write("step: " + str(steps) + "\n")
-    zero_shot_benc = ["mit-movie", "mit-restaurant", "CrossNER_AI", "CrossNER_literature", "CrossNER_music",
-                      "CrossNER_politics", "CrossNER_science"]
-    zero_shot_benc_results = {}
-    all_results = {}  # without crossNER
-    for p in tqdm(all_paths):
-        if "sample_" not in p:
-            data_name, results, f1 = get_for_one_path(p, model)
-            # write to file
-            with open(save_path, "a") as f:
-                f.write(data_name + "\n")
-                f.write(str(results) + "\n")
-            if data_name in zero_shot_benc:
-                zero_shot_benc_results[data_name] = f1
-            else:
-                all_results[data_name] = f1
-    avg_all = sum(all_results.values()) / len(all_results)
-    avg_zs = sum(zero_shot_benc_results.values()) / len(zero_shot_benc_results)
-    save_path_table = os.path.join(log_dir, "tables.txt")
-    # results for all datasets except crossNER
-    table_bench_all = ""
-    for k, v in all_results.items():
-        table_bench_all += f"{k:20}: {v:.1%}\n"
-    # (20 size aswell for average i.e. :20)
-    table_bench_all += f"{'Average':20}: {avg_all:.1%}"
-    # results for zero-shot benchmark
-    table_bench_zeroshot = ""
-    for k, v in zero_shot_benc_results.items():
-        table_bench_zeroshot += f"{k:20}: {v:.1%}\n"
-    table_bench_zeroshot += f"{'Average':20}: {avg_zs:.1%}"
-    # write to file
-    with open(save_path_table, "a") as f:
-        f.write("##############################################\n")
-        f.write("step: " + str(steps) + "\n")
-        f.write("Table for all datasets except crossNER\n")
-        f.write(table_bench_all + "\n\n")
-        f.write("Table for zero-shot benchmark\n")
-        f.write(table_bench_zeroshot + "\n")
-        f.write("##############################################\n\n")
-def sample_train_data(data_paths, sample_size=10000):
-    all_paths = glob.glob(f"{data_paths}/*")
-    all_paths = sorted(all_paths)
-    # to exclude the zero-shot benchmark datasets
-    zero_shot_benc = ["CrossNER_AI", "CrossNER_literature", "CrossNER_music",
-                      "CrossNER_politics", "CrossNER_science", "ACE 2004"]
-    new_train = []
-    # take 10k samples from each dataset
-    for p in tqdm(all_paths):
-        if any([i in p for i in zero_shot_benc]):
-            continue
-        train, dev, test, labels = create_dataset(p)
-        # add label key to the train data
-        for i in range(len(train)):
-            train[i]["label"] = labels
-        random.shuffle(train)
-        train = train[:sample_size]
-        new_train.extend(train)
-    return new_train

+import glob
+import json
+import os
+import os
+import torch
+from tqdm import tqdm
+import random
+def open_content(path):
+    """Load the train/dev/test/labels JSON files found directly inside ``path``.
+
+    A file is assigned to the first of ``train``, ``dev``, ``test``,
+    ``labels`` that occurs in its file name.
+
+    Returns:
+        ``(train, dev, test, labels)``; a split with no matching file is
+        ``None``.
+    """
+    splits = {"train": None, "dev": None, "test": None, "labels": None}
+    for p in glob.glob(os.path.join(path, "*.json")):
+        # Classify by file name only: matching against the whole path would
+        # let a parent directory such as "train_sets/" send every file to
+        # the train split.
+        file_name = os.path.basename(p)
+        for split in splits:
+            if split in file_name:
+                with open(p, "r") as f:
+                    splits[split] = json.load(f)
+                break
+    return splits["train"], splits["dev"], splits["test"], splits["labels"]
+def process(data):
+    """Convert a character-offset annotated sentence into a word-level sample.
+
+    ``data['sentence']`` is split on whitespace and each entity's
+    ``'pos'`` (start_char, end_char) is mapped to word indices, assuming the
+    words are separated by exactly one character. A boundary that does not
+    coincide with a word boundary is reported as ``None``.
+
+    Returns ``{"tokenized_text": words, "ner": [(start_word, end_word, type), ...]}``.
+    """
+    words = data['sentence'].split()
+    # character offset -> word index, for word starts and (exclusive) word ends
+    word_at_start, word_at_end = {}, {}
+    offset = 0
+    for i, word in enumerate(words):
+        word_at_start[offset] = i
+        offset += len(word)
+        word_at_end[offset] = i
+        offset += 1  # the separating space
+    entities = []
+    for entity in data['entities']:
+        start_char, end_char = entity['pos']
+        start_word = word_at_start.get(start_char)
+        end_word = word_at_end.get(end_char)
+        # a start is only recognised up to the word on which the entity ends
+        if start_word is not None and end_word is not None and start_word > end_word:
+            start_word = None
+        entities.append((start_word, end_word, entity['type']))
+    return {"tokenized_text": words, "ner": entities}
+# create dataset
+def create_dataset(path):
+    """Load the dataset directory ``path`` and convert every split with ``process``.
+
+    Returns ``(train_dataset, dev_dataset, test_dataset, labels)``. A split
+    whose file is missing yields an empty list; ``labels`` is returned as
+    loaded (``None`` when absent).
+    """
+    train, dev, test, labels = open_content(path)
+    # open_content leaves a split as None when its file is absent; treat that
+    # as empty so a directory holding only some of the splits is still usable.
+    train_dataset = [process(data) for data in train or []]
+    dev_dataset = [process(data) for data in dev or []]
+    test_dataset = [process(data) for data in test or []]
+    return train_dataset, dev_dataset, test_dataset, labels
+@torch.no_grad()
+def get_for_one_path(path, model):
+    """Evaluate ``model`` on the test split of the dataset directory ``path``.
+
+    Returns ``(data_name, results, f1)`` where ``data_name`` is the last
+    component of ``path`` and the other two values come from
+    ``model.evaluate``.
+    """
+    # load the dataset
+    _, _, test_dataset, entity_types = create_dataset(path)
+    # normpath + basename also handles a trailing separator and the
+    # platform's own separator, unlike splitting on "/"
+    data_name = os.path.basename(os.path.normpath(path))
+    # datasets whose name contains one of these markers are evaluated as nested NER
+    flat_ner = not any(marker in data_name for marker in ("ACE", "GENIA", "Corpus"))
+    # evaluate the model
+    results, f1 = model.evaluate(test_dataset, flat_ner=flat_ner, threshold=0.5, batch_size=12,
+                                 entity_types=entity_types)
+    return data_name, results, f1
+def _format_score_table(scores):
+    """Render a ``{dataset: f1}`` mapping as aligned percentage lines plus an average line."""
+    rows = [f"{k:20}: {v:.1%}" for k, v in scores.items()]
+    # An empty group has no average; print a placeholder instead of dividing by zero.
+    average = f"{sum(scores.values()) / len(scores):.1%}" if scores else "n/a"
+    rows.append(f"{'Average':20}: {average}")
+    return "\n".join(rows)
+def get_for_all_path(model, steps, log_dir, data_paths):
+    """Evaluate ``model`` on every dataset directory under ``data_paths`` and log the scores.
+
+    Paths containing ``"sample_"`` are skipped. Per-dataset results are
+    appended to ``<log_dir>/results.txt``; two summary tables (zero-shot
+    benchmark datasets and all others), each ending with its average F1, are
+    appended to ``<log_dir>/tables.txt``. ``steps`` is only written to the
+    logs. Returns ``None``.
+    """
+    all_paths = sorted(glob.glob(f"{data_paths}/*"))
+    # move the model to the device
+    device = next(model.parameters()).device
+    model.to(device)
+    # set the model to eval mode
+    model.eval()
+    # log the results
+    save_path = os.path.join(log_dir, "results.txt")
+    with open(save_path, "a") as f:
+        f.write("##############################################\n")
+        # write step
+        f.write("step: " + str(steps) + "\n")
+    zero_shot_benc = ["mit-movie", "mit-restaurant", "CrossNER_AI", "CrossNER_literature", "CrossNER_music",
+                      "CrossNER_politics", "CrossNER_science"]
+    zero_shot_benc_results = {}
+    all_results = {}  # without crossNER
+    for p in tqdm(all_paths):
+        if "sample_" in p:
+            continue
+        data_name, results, f1 = get_for_one_path(p, model)
+        # write to file
+        with open(save_path, "a") as f:
+            f.write(data_name + "\n")
+            f.write(str(results) + "\n")
+        if data_name in zero_shot_benc:
+            zero_shot_benc_results[data_name] = f1
+        else:
+            all_results[data_name] = f1
+    save_path_table = os.path.join(log_dir, "tables.txt")
+    # results for all datasets except crossNER
+    table_bench_all = _format_score_table(all_results)
+    # results for zero-shot benchmark
+    table_bench_zeroshot = _format_score_table(zero_shot_benc_results)
+    # write to file
+    with open(save_path_table, "a") as f:
+        f.write("##############################################\n")
+        f.write("step: " + str(steps) + "\n")
+        f.write("Table for all datasets except crossNER\n")
+        f.write(table_bench_all + "\n\n")
+        f.write("Table for zero-shot benchmark\n")
+        f.write(table_bench_zeroshot + "\n")
+        f.write("##############################################\n\n")
+def sample_train_data(data_paths, sample_size=10000):
+    """Build a training set from up to ``sample_size`` random samples per dataset.
+
+    Every directory under ``data_paths`` is loaded except those whose path
+    contains one of the excluded benchmark names. Each training sample gets a
+    ``"label"`` key referring to its dataset's label list (the same list
+    object is shared by all samples of that dataset).
+    """
+    # to exclude the zero-shot benchmark datasets
+    zero_shot_benc = ["CrossNER_AI", "CrossNER_literature", "CrossNER_music",
+                      "CrossNER_politics", "CrossNER_science", "ACE 2004"]
+    new_train = []
+    for p in tqdm(sorted(glob.glob(f"{data_paths}/*"))):
+        if any(name in p for name in zero_shot_benc):
+            continue
+        train, _, _, labels = create_dataset(p)
+        # add label key to the train data
+        for sample in train:
+            sample["label"] = labels
+        random.shuffle(train)
+        new_train.extend(train[:sample_size])
+    return new_train

backup/modules/span_rep.py CHANGED Viewed

@@ -1,369 +1,369 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-def create_projection_layer(hidden_size: int, dropout: float, out_dim: int = None) -> nn.Sequential:
-    """
-    Creates a projection layer with specified configurations.
-    """
-    if out_dim is None:
-        out_dim = hidden_size
-    return nn.Sequential(
-        nn.Linear(hidden_size, out_dim * 4),
-        nn.ReLU(),
-        nn.Dropout(dropout),
-        nn.Linear(out_dim * 4, out_dim)
-    )
-class SpanQuery(nn.Module):
-    def __init__(self, hidden_size, max_width, trainable=True):
-        super().__init__()
-        self.query_seg = nn.Parameter(torch.randn(hidden_size, max_width))
-        nn.init.uniform_(self.query_seg, a=-1, b=1)
-        if not trainable:
-            self.query_seg.requires_grad = False
-        self.project = nn.Sequential(
-            nn.Linear(hidden_size, hidden_size),
-            nn.ReLU()
-        )
-    def forward(self, h, *args):
-        # h of shape [B, L, D]
-        # query_seg of shape [D, max_width]
-        span_rep = torch.einsum('bld, ds->blsd', h, self.query_seg)
-        return self.project(span_rep)
-class SpanMLP(nn.Module):
-    def __init__(self, hidden_size, max_width):
-        super().__init__()
-        self.mlp = nn.Linear(hidden_size, hidden_size * max_width)
-    def forward(self, h, *args):
-        # h of shape [B, L, D]
-        # query_seg of shape [D, max_width]
-        B, L, D = h.size()
-        span_rep = self.mlp(h)
-        span_rep = span_rep.view(B, L, -1, D)
-        return span_rep.relu()
-class SpanCAT(nn.Module):
-    def __init__(self, hidden_size, max_width):
-        super().__init__()
-        self.max_width = max_width
-        self.query_seg = nn.Parameter(torch.randn(128, max_width))
-        self.project = nn.Sequential(
-            nn.Linear(hidden_size + 128, hidden_size),
-            nn.ReLU()
-        )
-    def forward(self, h, *args):
-        # h of shape [B, L, D]
-        # query_seg of shape [D, max_width]
-        B, L, D = h.size()
-        h = h.view(B, L, 1, D).repeat(1, 1, self.max_width, 1)
-        q = self.query_seg.view(1, 1, self.max_width, -1).repeat(B, L, 1, 1)
-        span_rep = torch.cat([h, q], dim=-1)
-        span_rep = self.project(span_rep)
-        return span_rep
-class SpanConvBlock(nn.Module):
-    def __init__(self, hidden_size, kernel_size, span_mode='conv_normal'):
-        super().__init__()
-        if span_mode == 'conv_conv':
-            self.conv = nn.Conv1d(hidden_size, hidden_size,
-                                  kernel_size=kernel_size)
-            # initialize the weights
-            nn.init.kaiming_uniform_(self.conv.weight, nonlinearity='relu')
-        elif span_mode == 'conv_max':
-            self.conv = nn.MaxPool1d(kernel_size=kernel_size, stride=1)
-        elif span_mode == 'conv_mean' or span_mode == 'conv_sum':
-            self.conv = nn.AvgPool1d(kernel_size=kernel_size, stride=1)
-        self.span_mode = span_mode
-        self.pad = kernel_size - 1
-    def forward(self, x):
-        x = torch.einsum('bld->bdl', x)
-        if self.pad > 0:
-            x = F.pad(x, (0, self.pad), "constant", 0)
-        x = self.conv(x)
-        if self.span_mode == "conv_sum":
-            x = x * (self.pad + 1)
-        return torch.einsum('bdl->bld', x)
-class SpanConv(nn.Module):
-    def __init__(self, hidden_size, max_width, span_mode):
-        super().__init__()
-        kernels = [i + 2 for i in range(max_width - 1)]
-        self.convs = nn.ModuleList()
-        for kernel in kernels:
-            self.convs.append(SpanConvBlock(hidden_size, kernel, span_mode))
-        self.project = nn.Sequential(
-            nn.ReLU(),
-            nn.Linear(hidden_size, hidden_size)
-        )
-    def forward(self, x, *args):
-        span_reps = [x]
-        for conv in self.convs:
-            h = conv(x)
-            span_reps.append(h)
-        span_reps = torch.stack(span_reps, dim=-2)
-        return self.project(span_reps)
-class SpanEndpointsBlock(nn.Module):
-    def __init__(self, kernel_size):
-        super().__init__()
-        self.kernel_size = kernel_size
-    def forward(self, x):
-        B, L, D = x.size()
-        span_idx = torch.LongTensor(
-            [[i, i + self.kernel_size - 1] for i in range(L)]).to(x.device)
-        x = F.pad(x, (0, 0, 0, self.kernel_size - 1), "constant", 0)
-        # endrep
-        start_end_rep = torch.index_select(x, dim=1, index=span_idx.view(-1))
-        start_end_rep = start_end_rep.view(B, L, 2, D)
-        return start_end_rep
-class ConvShare(nn.Module):
-    def __init__(self, hidden_size, max_width):
-        super().__init__()
-        self.max_width = max_width
-        self.conv_weigth = nn.Parameter(
-            torch.randn(hidden_size, hidden_size, max_width))
-        nn.init.kaiming_uniform_(self.conv_weigth, nonlinearity='relu')
-        self.project = nn.Sequential(
-            nn.ReLU(),
-            nn.Linear(hidden_size, hidden_size)
-        )
-    def forward(self, x, *args):
-        span_reps = []
-        x = torch.einsum('bld->bdl', x)
-        for i in range(self.max_width):
-            pad = i
-            x_i = F.pad(x, (0, pad), "constant", 0)
-            conv_w = self.conv_weigth[:, :, :i + 1]
-            out_i = F.conv1d(x_i, conv_w)
-            span_reps.append(out_i.transpose(-1, -2))
-        out = torch.stack(span_reps, dim=-2)
-        return self.project(out)
-def extract_elements(sequence, indices):
-    B, L, D = sequence.shape
-    K = indices.shape[1]
-    # Expand indices to [B, K, D]
-    expanded_indices = indices.unsqueeze(2).expand(-1, -1, D)
-    # Gather the elements
-    extracted_elements = torch.gather(sequence, 1, expanded_indices)
-    return extracted_elements
-class SpanMarker(nn.Module):
-    def __init__(self, hidden_size, max_width, dropout=0.4):
-        super().__init__()
-        self.max_width = max_width
-        self.project_start = nn.Sequential(
-            nn.Linear(hidden_size, hidden_size * 2, bias=True),
-            nn.ReLU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_size * 2, hidden_size, bias=True),
-        )
-        self.project_end = nn.Sequential(
-            nn.Linear(hidden_size, hidden_size * 2, bias=True),
-            nn.ReLU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_size * 2, hidden_size, bias=True),
-        )
-        self.out_project = nn.Linear(hidden_size * 2, hidden_size, bias=True)
-    def forward(self, h, span_idx):
-        # h of shape [B, L, D]
-        # query_seg of shape [D, max_width]
-        B, L, D = h.size()
-        # project start and end
-        start_rep = self.project_start(h)
-        end_rep = self.project_end(h)
-        start_span_rep = extract_elements(start_rep, span_idx[:, :, 0])
-        end_span_rep = extract_elements(end_rep, span_idx[:, :, 1])
-        # concat start and end
-        cat = torch.cat([start_span_rep, end_span_rep], dim=-1).relu()
-        # project
-        cat = self.out_project(cat)
-        # reshape
-        return cat.view(B, L, self.max_width, D)
-class SpanMarkerV0(nn.Module):
-    """
-    Marks and projects span endpoints using an MLP.
-    """
-    def __init__(self, hidden_size: int, max_width: int, dropout: float = 0.4):
-        super().__init__()
-        self.max_width = max_width
-        self.project_start = create_projection_layer(hidden_size, dropout)
-        self.project_end = create_projection_layer(hidden_size, dropout)
-        self.out_project = create_projection_layer(hidden_size * 2, dropout, hidden_size)
-    def forward(self, h: torch.Tensor, span_idx: torch.Tensor) -> torch.Tensor:
-        B, L, D = h.size()
-        start_rep = self.project_start(h)
-        end_rep = self.project_end(h)
-        start_span_rep = extract_elements(start_rep, span_idx[:, :, 0])
-        end_span_rep = extract_elements(end_rep, span_idx[:, :, 1])
-        cat = torch.cat([start_span_rep, end_span_rep], dim=-1).relu()
-        return self.out_project(cat).view(B, L, self.max_width, D)
-class ConvShareV2(nn.Module):
-    def __init__(self, hidden_size, max_width):
-        super().__init__()
-        self.max_width = max_width
-        self.conv_weigth = nn.Parameter(
-            torch.randn(hidden_size, hidden_size, max_width)
-        )
-        nn.init.xavier_normal_(self.conv_weigth)
-    def forward(self, x, *args):
-        span_reps = []
-        x = torch.einsum('bld->bdl', x)
-        for i in range(self.max_width):
-            pad = i
-            x_i = F.pad(x, (0, pad), "constant", 0)
-            conv_w = self.conv_weigth[:, :, :i + 1]
-            out_i = F.conv1d(x_i, conv_w)
-            span_reps.append(out_i.transpose(-1, -2))
-        out = torch.stack(span_reps, dim=-2)
-        return out
-class SpanRepLayer(nn.Module):
-    """
-    Various span representation approaches
-    """
-    def __init__(self, hidden_size, max_width, span_mode, **kwargs):
-        super().__init__()
-        if span_mode == 'marker':
-            self.span_rep_layer = SpanMarker(hidden_size, max_width, **kwargs)
-        elif span_mode == 'markerV0':
-            self.span_rep_layer = SpanMarkerV0(hidden_size, max_width, **kwargs)
-        elif span_mode == 'query':
-            self.span_rep_layer = SpanQuery(
-                hidden_size, max_width, trainable=True)
-        elif span_mode == 'mlp':
-            self.span_rep_layer = SpanMLP(hidden_size, max_width)
-        elif span_mode == 'cat':
-            self.span_rep_layer = SpanCAT(hidden_size, max_width)
-        elif span_mode == 'conv_conv':
-            self.span_rep_layer = SpanConv(
-                hidden_size, max_width, span_mode='conv_conv')
-        elif span_mode == 'conv_max':
-            self.span_rep_layer = SpanConv(
-                hidden_size, max_width, span_mode='conv_max')
-        elif span_mode == 'conv_mean':
-            self.span_rep_layer = SpanConv(
-                hidden_size, max_width, span_mode='conv_mean')
-        elif span_mode == 'conv_sum':
-            self.span_rep_layer = SpanConv(
-                hidden_size, max_width, span_mode='conv_sum')
-        elif span_mode == 'conv_share':
-            self.span_rep_layer = ConvShare(hidden_size, max_width)
-        else:
-            raise ValueError(f'Unknown span mode {span_mode}')
-    def forward(self, x, *args):
-        return self.span_rep_layer(x, *args)

+import torch
+import torch.nn.functional as F
+from torch import nn
+def create_projection_layer(hidden_size: int, dropout: float, out_dim: int = None) -> nn.Sequential:
+    """
+    Creates a projection layer with specified configurations.
+    """
+    if out_dim is None:
+        out_dim = hidden_size
+    return nn.Sequential(
+        nn.Linear(hidden_size, out_dim * 4),
+        nn.ReLU(),
+        nn.Dropout(dropout),
+        nn.Linear(out_dim * 4, out_dim)
+    )
+class SpanQuery(nn.Module):
+    def __init__(self, hidden_size, max_width, trainable=True):
+        super().__init__()
+        self.query_seg = nn.Parameter(torch.randn(hidden_size, max_width))
+        nn.init.uniform_(self.query_seg, a=-1, b=1)
+        if not trainable:
+            self.query_seg.requires_grad = False
+        self.project = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size),
+            nn.ReLU()
+        )
+    def forward(self, h, *args):
+        # h of shape [B, L, D]
+        # query_seg of shape [D, max_width]
+        span_rep = torch.einsum('bld, ds->blsd', h, self.query_seg)
+        return self.project(span_rep)
+class SpanMLP(nn.Module):
+    def __init__(self, hidden_size, max_width):
+        super().__init__()
+        self.mlp = nn.Linear(hidden_size, hidden_size * max_width)
+    def forward(self, h, *args):
+        # h of shape [B, L, D]
+        # query_seg of shape [D, max_width]
+        B, L, D = h.size()
+        span_rep = self.mlp(h)
+        span_rep = span_rep.view(B, L, -1, D)
+        return span_rep.relu()
+class SpanCAT(nn.Module):
+    def __init__(self, hidden_size, max_width):
+        super().__init__()
+        self.max_width = max_width
+        self.query_seg = nn.Parameter(torch.randn(128, max_width))
+        self.project = nn.Sequential(
+            nn.Linear(hidden_size + 128, hidden_size),
+            nn.ReLU()
+        )
+    def forward(self, h, *args):
+        # h of shape [B, L, D]
+        # query_seg of shape [D, max_width]
+        B, L, D = h.size()
+        h = h.view(B, L, 1, D).repeat(1, 1, self.max_width, 1)
+        q = self.query_seg.view(1, 1, self.max_width, -1).repeat(B, L, 1, 1)
+        span_rep = torch.cat([h, q], dim=-1)
+        span_rep = self.project(span_rep)
+        return span_rep
+class SpanConvBlock(nn.Module):
+    def __init__(self, hidden_size, kernel_size, span_mode='conv_normal'):
+        super().__init__()
+        if span_mode == 'conv_conv':
+            self.conv = nn.Conv1d(hidden_size, hidden_size,
+                                  kernel_size=kernel_size)
+            # initialize the weights
+            nn.init.kaiming_uniform_(self.conv.weight, nonlinearity='relu')
+        elif span_mode == 'conv_max':
+            self.conv = nn.MaxPool1d(kernel_size=kernel_size, stride=1)
+        elif span_mode == 'conv_mean' or span_mode == 'conv_sum':
+            self.conv = nn.AvgPool1d(kernel_size=kernel_size, stride=1)
+        self.span_mode = span_mode
+        self.pad = kernel_size - 1
+    def forward(self, x):
+        x = torch.einsum('bld->bdl', x)
+        if self.pad > 0:
+            x = F.pad(x, (0, self.pad), "constant", 0)
+        x = self.conv(x)
+        if self.span_mode == "conv_sum":
+            x = x * (self.pad + 1)
+        return torch.einsum('bdl->bld', x)
+class SpanConv(nn.Module):
+    def __init__(self, hidden_size, max_width, span_mode):
+        super().__init__()
+        kernels = [i + 2 for i in range(max_width - 1)]
+        self.convs = nn.ModuleList()
+        for kernel in kernels:
+            self.convs.append(SpanConvBlock(hidden_size, kernel, span_mode))
+        self.project = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(hidden_size, hidden_size)
+        )
+    def forward(self, x, *args):
+        span_reps = [x]
+        for conv in self.convs:
+            h = conv(x)
+            span_reps.append(h)
+        span_reps = torch.stack(span_reps, dim=-2)
+        return self.project(span_reps)
+class SpanEndpointsBlock(nn.Module):
+    def __init__(self, kernel_size):
+        super().__init__()
+        self.kernel_size = kernel_size
+    def forward(self, x):
+        B, L, D = x.size()
+        span_idx = torch.LongTensor(
+            [[i, i + self.kernel_size - 1] for i in range(L)]).to(x.device)
+        x = F.pad(x, (0, 0, 0, self.kernel_size - 1), "constant", 0)
+        # endrep
+        start_end_rep = torch.index_select(x, dim=1, index=span_idx.view(-1))
+        start_end_rep = start_end_rep.view(B, L, 2, D)
+        return start_end_rep
+class ConvShare(nn.Module):
+    def __init__(self, hidden_size, max_width):
+        super().__init__()
+        self.max_width = max_width
+        self.conv_weigth = nn.Parameter(
+            torch.randn(hidden_size, hidden_size, max_width))
+        nn.init.kaiming_uniform_(self.conv_weigth, nonlinearity='relu')
+        self.project = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(hidden_size, hidden_size)
+        )
+    def forward(self, x, *args):
+        span_reps = []
+        x = torch.einsum('bld->bdl', x)
+        for i in range(self.max_width):
+            pad = i
+            x_i = F.pad(x, (0, pad), "constant", 0)
+            conv_w = self.conv_weigth[:, :, :i + 1]
+            out_i = F.conv1d(x_i, conv_w)
+            span_reps.append(out_i.transpose(-1, -2))
+        out = torch.stack(span_reps, dim=-2)
+        return self.project(out)
+def extract_elements(sequence, indices):
+    B, L, D = sequence.shape
+    K = indices.shape[1]
+    # Expand indices to [B, K, D]
+    expanded_indices = indices.unsqueeze(2).expand(-1, -1, D)
+    # Gather the elements
+    extracted_elements = torch.gather(sequence, 1, expanded_indices)
+    return extracted_elements
+class SpanMarker(nn.Module):
+    def __init__(self, hidden_size, max_width, dropout=0.4):
+        super().__init__()
+        self.max_width = max_width
+        self.project_start = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size * 2, bias=True),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_size * 2, hidden_size, bias=True),
+        )
+        self.project_end = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size * 2, bias=True),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_size * 2, hidden_size, bias=True),
+        )
+        self.out_project = nn.Linear(hidden_size * 2, hidden_size, bias=True)
+    def forward(self, h, span_idx):
+        # h of shape [B, L, D]
+        # query_seg of shape [D, max_width]
+        B, L, D = h.size()
+        # project start and end
+        start_rep = self.project_start(h)
+        end_rep = self.project_end(h)
+        start_span_rep = extract_elements(start_rep, span_idx[:, :, 0])
+        end_span_rep = extract_elements(end_rep, span_idx[:, :, 1])
+        # concat start and end
+        cat = torch.cat([start_span_rep, end_span_rep], dim=-1).relu()
+        # project
+        cat = self.out_project(cat)
+        # reshape
+        return cat.view(B, L, self.max_width, D)
+class SpanMarkerV0(nn.Module):
+    """
+    Marks and projects span endpoints using an MLP.
+    """
+    def __init__(self, hidden_size: int, max_width: int, dropout: float = 0.4):
+        super().__init__()
+        self.max_width = max_width
+        self.project_start = create_projection_layer(hidden_size, dropout)
+        self.project_end = create_projection_layer(hidden_size, dropout)
+        self.out_project = create_projection_layer(hidden_size * 2, dropout, hidden_size)
+    def forward(self, h: torch.Tensor, span_idx: torch.Tensor) -> torch.Tensor:
+        B, L, D = h.size()
+        start_rep = self.project_start(h)
+        end_rep = self.project_end(h)
+        start_span_rep = extract_elements(start_rep, span_idx[:, :, 0])
+        end_span_rep = extract_elements(end_rep, span_idx[:, :, 1])
+        cat = torch.cat([start_span_rep, end_span_rep], dim=-1).relu()
+        return self.out_project(cat).view(B, L, self.max_width, D)
+class ConvShareV2(nn.Module):
+    def __init__(self, hidden_size, max_width):
+        super().__init__()
+        self.max_width = max_width
+        self.conv_weigth = nn.Parameter(
+            torch.randn(hidden_size, hidden_size, max_width)
+        )
+        nn.init.xavier_normal_(self.conv_weigth)
+    def forward(self, x, *args):
+        span_reps = []
+        x = torch.einsum('bld->bdl', x)
+        for i in range(self.max_width):
+            pad = i
+            x_i = F.pad(x, (0, pad), "constant", 0)
+            conv_w = self.conv_weigth[:, :, :i + 1]
+            out_i = F.conv1d(x_i, conv_w)
+            span_reps.append(out_i.transpose(-1, -2))
+        out = torch.stack(span_reps, dim=-2)
+        return out
+class SpanRepLayer(nn.Module):
+    """
+    Various span representation approaches
+    """
+    def __init__(self, hidden_size, max_width, span_mode, **kwargs):
+        super().__init__()
+        if span_mode == 'marker':
+            self.span_rep_layer = SpanMarker(hidden_size, max_width, **kwargs)
+        elif span_mode == 'markerV0':
+            self.span_rep_layer = SpanMarkerV0(hidden_size, max_width, **kwargs)
+        elif span_mode == 'query':
+            self.span_rep_layer = SpanQuery(
+                hidden_size, max_width, trainable=True)
+        elif span_mode == 'mlp':
+            self.span_rep_layer = SpanMLP(hidden_size, max_width)
+        elif span_mode == 'cat':
+            self.span_rep_layer = SpanCAT(hidden_size, max_width)
+        elif span_mode == 'conv_conv':
+            self.span_rep_layer = SpanConv(
+                hidden_size, max_width, span_mode='conv_conv')
+        elif span_mode == 'conv_max':
+            self.span_rep_layer = SpanConv(
+                hidden_size, max_width, span_mode='conv_max')
+        elif span_mode == 'conv_mean':
+            self.span_rep_layer = SpanConv(
+                hidden_size, max_width, span_mode='conv_mean')
+        elif span_mode == 'conv_sum':
+            self.span_rep_layer = SpanConv(
+                hidden_size, max_width, span_mode='conv_sum')
+        elif span_mode == 'conv_share':
+            self.span_rep_layer = ConvShare(hidden_size, max_width)
+        else:
+            raise ValueError(f'Unknown span mode {span_mode}')
+    def forward(self, x, *args):
+        return self.span_rep_layer(x, *args)

backup/modules/token_rep.py CHANGED Viewed

@@ -1,54 +1,54 @@
-from typing import List
-import torch
-from flair.data import Sentence
-from flair.embeddings import TransformerWordEmbeddings
-from torch import nn
-from torch.nn.utils.rnn import pad_sequence
-# flair.cache_root = '/gpfswork/rech/pds/upa43yu/.cache'
-class TokenRepLayer(nn.Module):
-    def __init__(self, model_name: str = "bert-base-cased", fine_tune: bool = True, subtoken_pooling: str = "first",
-                 hidden_size: int = 768,
-                 add_tokens=["[SEP]", "[ENT]"]
-                 ):
-        super().__init__()
-        self.bert_layer = TransformerWordEmbeddings(
-            model_name,
-            fine_tune=fine_tune,
-            subtoken_pooling=subtoken_pooling,
-            allow_long_sentences=True
-        )
-        # add tokens to vocabulary
-        self.bert_layer.tokenizer.add_tokens(add_tokens)
-        # resize token embeddings
-        self.bert_layer.model.resize_token_embeddings(len(self.bert_layer.tokenizer))
-        bert_hidden_size = self.bert_layer.embedding_length
-        if hidden_size != bert_hidden_size:
-            self.projection = nn.Linear(bert_hidden_size, hidden_size)
-    def forward(self, tokens: List[List[str]], lengths: torch.Tensor):
-        token_embeddings = self.compute_word_embedding(tokens)
-        if hasattr(self, "projection"):
-            token_embeddings = self.projection(token_embeddings)
-        B = len(lengths)
-        max_length = lengths.max()
-        mask = (torch.arange(max_length).view(1, -1).repeat(B, 1) < lengths.cpu().unsqueeze(1)).to(
-            token_embeddings.device).long()
-        return {"embeddings": token_embeddings, "mask": mask}
-    def compute_word_embedding(self, tokens):
-        sentences = [Sentence(i) for i in tokens]
-        self.bert_layer.embed(sentences)
-        token_embeddings = pad_sequence([torch.stack([t.embedding for t in k]) for k in sentences], batch_first=True)
-        return token_embeddings

+from typing import List
+import torch
+from flair.data import Sentence
+from flair.embeddings import TransformerWordEmbeddings
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence
+# flair.cache_root = '/gpfswork/rech/pds/upa43yu/.cache'
+class TokenRepLayer(nn.Module):
+    def __init__(self, model_name: str = "bert-base-cased", fine_tune: bool = True, subtoken_pooling: str = "first",
+                 hidden_size: int = 768,
+                 add_tokens=["[SEP]", "[ENT]"]
+                 ):
+        super().__init__()
+        self.bert_layer = TransformerWordEmbeddings(
+            model_name,
+            fine_tune=fine_tune,
+            subtoken_pooling=subtoken_pooling,
+            allow_long_sentences=True
+        )
+        # add tokens to vocabulary
+        self.bert_layer.tokenizer.add_tokens(add_tokens)
+        # resize token embeddings
+        self.bert_layer.model.resize_token_embeddings(len(self.bert_layer.tokenizer))
+        bert_hidden_size = self.bert_layer.embedding_length
+        if hidden_size != bert_hidden_size:
+            self.projection = nn.Linear(bert_hidden_size, hidden_size)
+    def forward(self, tokens: List[List[str]], lengths: torch.Tensor):
+        token_embeddings = self.compute_word_embedding(tokens)
+        if hasattr(self, "projection"):
+            token_embeddings = self.projection(token_embeddings)
+        B = len(lengths)
+        max_length = lengths.max()
+        mask = (torch.arange(max_length).view(1, -1).repeat(B, 1) < lengths.cpu().unsqueeze(1)).to(
+            token_embeddings.device).long()
+        return {"embeddings": token_embeddings, "mask": mask}
+    def compute_word_embedding(self, tokens):
+        sentences = [Sentence(i) for i in tokens]
+        self.bert_layer.embed(sentences)
+        token_embeddings = pad_sequence([torch.stack([t.embedding for t in k]) for k in sentences], batch_first=True)
+        return token_embeddings

backup/requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
-torch
-transformers
-huggingface_hub
-flair
-seqeval
 tqdm

+torch
+transformers
+huggingface_hub
+flair
+seqeval
 tqdm

backup/save_load.py CHANGED Viewed

@@ -1,20 +1,20 @@
-import torch
-from .model import GLiNER
-def save_model(current_model, path):
-    config = current_model.config
-    dict_save = {"model_weights": current_model.state_dict(), "config": config}
-    torch.save(dict_save, path)
-def load_model(path, model_name=None, device=None):
-    dict_load = torch.load(path, map_location=torch.device('cpu'))
-    config = dict_load["config"]
-    if model_name is not None:
-        config.model_name = model_name
-    loaded_model = GLiNER(config)
-    loaded_model.load_state_dict(dict_load["model_weights"])
-    return loaded_model.to(device) if device is not None else loaded_model

+import torch
+from .model import GLiNER
+def save_model(current_model, path):
+    config = current_model.config
+    dict_save = {"model_weights": current_model.state_dict(), "config": config}
+    torch.save(dict_save, path)
+def load_model(path, model_name=None, device=None):
+    dict_load = torch.load(path, map_location=torch.device('cpu'))
+    config = dict_load["config"]
+    if model_name is not None:
+        config.model_name = model_name
+    loaded_model = GLiNER(config)
+    loaded_model.load_state_dict(dict_load["model_weights"])
+    return loaded_model.to(device) if device is not None else loaded_model

backup/train.py CHANGED Viewed

@@ -1,132 +1,132 @@
-import argparse
-import os
-import torch
-import yaml
-from tqdm import tqdm
-from transformers import get_cosine_schedule_with_warmup
-# from model_nested import NerFilteredSemiCRF
-from .model import GLiNER
-from .modules.run_evaluation import get_for_all_path, sample_train_data
-from .save_load import save_model, load_model
-import json
-# train function
-def train(model, optimizer, train_data, num_steps=1000, eval_every=100, log_dir="logs", warmup_ratio=0.1,
-          train_batch_size=8, device='cuda'):
-    model.train()
-    # initialize data loaders
-    train_loader = model.create_dataloader(train_data, batch_size=train_batch_size, shuffle=True)
-    pbar = tqdm(range(num_steps))
-    if warmup_ratio < 1:
-        num_warmup_steps = int(num_steps * warmup_ratio)
-    else:
-        num_warmup_steps = int(warmup_ratio)
-    scheduler = get_cosine_schedule_with_warmup(
-        optimizer,
-        num_warmup_steps=num_warmup_steps,
-        num_training_steps=num_steps
-    )
-    iter_train_loader = iter(train_loader)
-    for step in pbar:
-        try:
-            x = next(iter_train_loader)
-        except StopIteration:
-            iter_train_loader = iter(train_loader)
-            x = next(iter_train_loader)
-        for k, v in x.items():
-            if isinstance(v, torch.Tensor):
-                x[k] = v.to(device)
-        try:
-            loss = model(x)  # Forward pass
-        except:
-            continue
-        # check if loss is nan
-        if torch.isnan(loss):
-            continue
-        loss.backward()  # Compute gradients
-        optimizer.step()  # Update parameters
-        scheduler.step()  # Update learning rate schedule
-        optimizer.zero_grad()  # Reset gradients
-        description = f"step: {step} | epoch: {step // len(train_loader)} | loss: {loss.item():.2f}"
-        if (step + 1) % eval_every == 0:
-            current_path = os.path.join(log_dir, f'model_{step + 1}')
-            save_model(model, current_path)
-            #val_data_dir =  "/gpfswork/rech/ohy/upa43yu/NER_datasets" # can be obtained from "https://drive.google.com/file/d/1T-5IbocGka35I7X3CE6yKe5N_Xg2lVKT/view"
-            #get_for_all_path(model, step, log_dir, val_data_dir)  # you can remove this comment if you want to evaluate the model
-            model.train()
-        pbar.set_description(description)
-def create_parser():
-    parser = argparse.ArgumentParser(description="Span-based NER")
-    parser.add_argument("--config", type=str, default="config.yaml", help="Path to config file")
-    parser.add_argument('--log_dir', type=str, default='logs', help='Path to the log directory')
-    return parser
-def load_config_as_namespace(config_file):
-    with open(config_file, 'r') as f:
-        config_dict = yaml.safe_load(f)
-    return argparse.Namespace(**config_dict)
-if __name__ == "__main__":
-    # parse args
-    parser = create_parser()
-    args = parser.parse_args()
-    # load config
-    config = load_config_as_namespace(args.config)
-    config.log_dir = args.log_dir
-    try:
-        with open(config.train_data, 'r') as f:
-            data = json.load(f)
-    except:
-        data = sample_train_data(config.train_data, 10000)
-    if config.prev_path != "none":
-        model = load_model(config.prev_path)
-        model.config = config
-    else:
-        model = GLiNER(config)
-    if torch.cuda.is_available():
-        model = model.cuda()
-    lr_encoder = float(config.lr_encoder)
-    lr_others = float(config.lr_others)
-    optimizer = torch.optim.AdamW([
-        # encoder
-        {'params': model.token_rep_layer.parameters(), 'lr': lr_encoder},
-        {'params': model.rnn.parameters(), 'lr': lr_others},
-        # projection layers
-        {'params': model.span_rep_layer.parameters(), 'lr': lr_others},
-        {'params': model.prompt_rep_layer.parameters(), 'lr': lr_others},
-    ])
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    train(model, optimizer, data, num_steps=config.num_steps, eval_every=config.eval_every,
-          log_dir=config.log_dir, warmup_ratio=config.warmup_ratio, train_batch_size=config.train_batch_size,
-          device=device)

+import argparse
+import os
+import torch
+import yaml
+from tqdm import tqdm
+from transformers import get_cosine_schedule_with_warmup
+# from model_nested import NerFilteredSemiCRF
+from .model import GLiNER
+from .modules.run_evaluation import get_for_all_path, sample_train_data
+from .save_load import save_model, load_model
+import json
+# train function
+def train(model, optimizer, train_data, num_steps=1000, eval_every=100, log_dir="logs", warmup_ratio=0.1,
+          train_batch_size=8, device='cuda'):
+    """Train ``model`` for ``num_steps`` iterations with a warmup + cosine learning-rate schedule.
+    Args:
+        model: Model exposing ``create_dataloader``; calling it on a batch returns the loss.
+        optimizer: Optimizer stepped once per successfully processed batch.
+        train_data: Data handed to ``model.create_dataloader``.
+        num_steps: Number of loop iterations; skipped batches still consume an iteration.
+        eval_every: A checkpoint is saved after each completed step whose 1-based index
+            is a multiple of this value (never for a skipped batch).
+        log_dir: Directory receiving the ``model_<step>`` checkpoints; created if missing.
+        warmup_ratio: Values below 1 are a fraction of ``num_steps``; values of 1 or more
+            are used as an absolute number of warmup steps.
+        train_batch_size: Batch size of the training dataloader.
+        device: Device every tensor of a batch is moved to.
+    """
+    model.train()
+    # initialize data loaders
+    train_loader = model.create_dataloader(train_data, batch_size=train_batch_size, shuffle=True)
+    pbar = tqdm(range(num_steps))
+    if warmup_ratio < 1:
+        num_warmup_steps = int(num_steps * warmup_ratio)
+    else:
+        num_warmup_steps = int(warmup_ratio)
+    scheduler = get_cosine_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_steps
+    )
+    iter_train_loader = iter(train_loader)
+    for step in pbar:
+        # Restart the loader when an epoch is exhausted so training runs for num_steps.
+        try:
+            x = next(iter_train_loader)
+        except StopIteration:
+            iter_train_loader = iter(train_loader)
+            x = next(iter_train_loader)
+        for k, v in x.items():
+            if isinstance(v, torch.Tensor):
+                x[k] = v.to(device)
+        # A failing batch is still skipped (best effort), but the reason is reported and
+        # KeyboardInterrupt/SystemExit are no longer swallowed as a bare ``except`` did.
+        try:
+            loss = model(x)  # Forward pass
+        except Exception as exc:
+            tqdm.write(f"step {step}: forward pass failed, batch skipped ({exc!r})")
+            continue
+        # check if loss is nan
+        if torch.isnan(loss):
+            continue
+        loss.backward()  # Compute gradients
+        optimizer.step()  # Update parameters
+        scheduler.step()  # Update learning rate schedule
+        optimizer.zero_grad()  # Reset gradients
+        description = f"step: {step} | epoch: {step // len(train_loader)} | loss: {loss.item():.2f}"
+        if (step + 1) % eval_every == 0:
+            # torch.save does not create missing directories, so make sure log_dir exists.
+            if log_dir:
+                os.makedirs(log_dir, exist_ok=True)
+            current_path = os.path.join(log_dir, f'model_{step + 1}')
+            save_model(model, current_path)
+            #val_data_dir =  "/gpfswork/rech/ohy/upa43yu/NER_datasets" # can be obtained from "https://drive.google.com/file/d/1T-5IbocGka35I7X3CE6yKe5N_Xg2lVKT/view"
+            #get_for_all_path(model, step, log_dir, val_data_dir)  # you can remove this comment if you want to evaluate the model
+            model.train()
+        pbar.set_description(description)
+def create_parser():
+    """Build the command-line parser with the ``--config`` and ``--log_dir`` options."""
+    cli = argparse.ArgumentParser(description="Span-based NER")
+    option_specs = (
+        (("--config",), dict(type=str, default="config.yaml", help="Path to config file")),
+        (('--log_dir',), dict(type=str, default='logs', help='Path to the log directory')),
+    )
+    for flags, options in option_specs:
+        cli.add_argument(*flags, **options)
+    return cli
+def load_config_as_namespace(config_file):
+    """Parse the YAML file at ``config_file`` and expose its top-level keys as attributes."""
+    with open(config_file, 'r') as handle:
+        settings = yaml.safe_load(handle)
+    namespace = argparse.Namespace(**settings)
+    return namespace
+if __name__ == "__main__":
+    # parse args
+    parser = create_parser()
+    args = parser.parse_args()
+    # load config; the command-line --log_dir replaces any log_dir from the YAML file
+    config = load_config_as_namespace(args.config)
+    config.log_dir = args.log_dir
+    # Try config.train_data as a JSON file first. If it cannot be opened or is not valid
+    # JSON, pass the same value to sample_train_data instead. Only those two failure
+    # kinds trigger the fallback, so Ctrl-C and unrelated bugs are no longer hidden.
+    try:
+        with open(config.train_data, 'r') as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        data = sample_train_data(config.train_data, 10000)
+    # Resume from a previous checkpoint unless prev_path is the literal string "none".
+    if config.prev_path != "none":
+        model = load_model(config.prev_path)
+        model.config = config
+    else:
+        model = GLiNER(config)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    lr_encoder = float(config.lr_encoder)
+    lr_others = float(config.lr_others)
+    # The token representation layer uses lr_encoder; every other group uses lr_others.
+    optimizer = torch.optim.AdamW([
+        # encoder
+        {'params': model.token_rep_layer.parameters(), 'lr': lr_encoder},
+        {'params': model.rnn.parameters(), 'lr': lr_others},
+        # projection layers
+        {'params': model.span_rep_layer.parameters(), 'lr': lr_others},
+        {'params': model.prompt_rep_layer.parameters(), 'lr': lr_others},
+    ])
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    train(model, optimizer, data, num_steps=config.num_steps, eval_every=config.eval_every,
+          log_dir=config.log_dir, warmup_ratio=config.warmup_ratio, train_batch_size=config.train_batch_size,
+          device=device)

core/__pycache__/base.cpython-310.pyc ADDED Viewed

Binary file (1.59 kB). View file

core/__pycache__/gradio_ocr.cpython-310.pyc ADDED Viewed

Binary file (1.93 kB). View file

core/__pycache__/ner_engine.cpython-310.pyc ADDED Viewed

Binary file (2.57 kB). View file

core/__pycache__/ocr_engine.cpython-310.pyc ADDED Viewed

Binary file (4.04 kB). View file

core/__pycache__/vlm_engine.cpython-310.pyc ADDED Viewed

Binary file (3.21 kB). View file

core/base.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
+class BaseEngine(ABC):
+    """Abstract root of the engine classes; every concrete engine must implement ``process``."""
+    @abstractmethod
+    def process(self, image_path: str) -> Any:
+        """Run the engine on its input and return an engine-specific result."""
+        pass
+class BaseOCR(BaseEngine):
+    """Interface for OCR engines; adds ``extract_text`` to the inherited abstract ``process``."""
+    @abstractmethod
+    def extract_text(self, image_path: str) -> str:
+        """Return the text read from the image at ``image_path``."""
+        pass
+class BaseVLM(BaseEngine):
+    """Interface for vision-language engines that answer a prompt about an image."""
+    @abstractmethod
+    def extract_structured_data(self, image_path: str, prompt: str) -> Dict[str, Any]:
+        """Return a dict of data extracted from the image at ``image_path`` for ``prompt``."""
+        pass
+class BaseNER(BaseEngine):
+    """Interface for engines that extract labelled entities from text."""
+    @abstractmethod
+    def extract_entities(self, text: str, labels: List[str]) -> Dict[str, List[str]]:
+        """Return a mapping from label to the list of strings found for it in ``text``."""
+        pass

core/gradio_ocr.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import logging
+import os
+from gradio_client import Client, handle_file
+from .base import BaseOCR
+class GradioOCREngine(BaseOCR):
+    """OCR engine that delegates text extraction to a remote Gradio Space."""
+    def __init__(self, space_name="WebAshlarWA/glm-ocr-v1"):
+        self.space_name = space_name
+        # Stays None when the client cannot be created; extract_text() then returns "".
+        self.client = None
+        self._initialize_client()
+    def _initialize_client(self):
+        """Create the Gradio client; a failure is logged and leaves ``self.client`` as None."""
+        try:
+            self.client = Client(self.space_name)
+            logging.info(f"Gradio Client initialized for Space: {self.space_name}")
+        except Exception as e:
+            logging.error(f"Failed to initialize Gradio Client for {self.space_name}: {e}")
+    @staticmethod
+    def _result_to_text(result):
+        """Reduce a ``predict`` result to the extracted text.
+        ``None`` and empty sequences yield ``""`` so that callers testing the result for
+        emptiness are not fooled by literal strings such as ``"None"`` or ``"[]"``.
+        """
+        if result is None:
+            return ""
+        # Multi-output endpoints return a sequence; the first element is taken as the text.
+        if isinstance(result, (list, tuple)):
+            return str(result[0]) if result else ""
+        if isinstance(result, str):
+            return result
+        if isinstance(result, dict):
+            # Structured reply: prefer its 'text' entry, otherwise stringify the whole dict.
+            return result.get('text', str(result))
+        return str(result)
+    def extract_text(self, image_path: str) -> str:
+        """Send the image to the Space and return the text it reports.
+        Returns ``""`` when the client is unavailable or the remote call fails; both
+        cases are logged instead of raised.
+        """
+        if not self.client:
+            logging.error("Gradio Client not initialized.")
+            return ""
+        logging.info(f"Gradio OCR: Starting extraction for {os.path.basename(image_path)}")
+        try:
+            # The Space takes the file as 'image' on the /proses_intelijen endpoint.
+            # NOTE(review): the reply type is not pinned down here, so it is normalised below.
+            result = self.client.predict(
+                image=handle_file(image_path),
+                api_name="/proses_intelijen"
+            )
+            text = self._result_to_text(result)
+            # Previously this message was only reached for unexpected result types.
+            logging.info("Gradio OCR: Successfully extracted text.")
+            return text
+        except Exception as e:
+            logging.error(f"Gradio OCR extraction failed: {e}")
+            return ""
+    def process(self, image_path: str) -> str:
+        """Alias for ``extract_text`` required by the engine interface."""
+        return self.extract_text(image_path)

core/ner_engine.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import logging
+from typing import List, Dict
+from .base import BaseNER
+class NEREngine(BaseNER):
+    """Entity extractor backed by a pretrained GLiNER model."""
+    def __init__(self, model_name="urchade/gliner_mediumv2.1"):
+        self.model_name = model_name
+        # Stays None when loading fails; extract_entities() then returns {}.
+        self.model = None
+        self._initialize_model()
+    def _initialize_model(self):
+        """Load the GLiNER weights; a failure is logged and leaves ``self.model`` as None."""
+        logging.info(f"Initializing NER model: {self.model_name}")
+        try:
+            # Imported here so that an import failure is caught and logged like a load failure.
+            from backup.model import GLiNER
+            self.model = GLiNER.from_pretrained(self.model_name)
+            logging.info(f"NER model '{self.model_name}' loaded successfully.")
+        except Exception as load_error:
+            logging.error(f"Failed to load NER model: {load_error}. NER extraction will be unavailable.")
+    def extract_entities(self, text: str, labels: List[str] = None) -> Dict[str, List[str]]:
+        """Group the entities predicted for ``text`` by label.
+        Args:
+            text: Text to analyse; empty text yields ``{}``.
+            labels: Labels to look for; ``None`` selects the built-in visiting-card labels.
+        Returns:
+            A dict with one list per requested label (possibly empty), or ``{}`` when the
+            text is empty, the model is unavailable, or prediction raises.
+        """
+        if not text:
+            logging.warning("NER: Received empty text for extraction.")
+            return {}
+        if not self.model:
+            logging.error("NER: Model not initialized. Skipping extraction.")
+            return {}
+        if labels is None:
+            labels = ["Name", "Designation", "Company", "Contact", "Address", "Email", "Link"]
+        logging.info(f"NER: Extracting entities for {len(text)} characters of text.")
+        try:
+            predictions = self.model.predict_entities(text, labels, threshold=0.3)
+            grouped = {}
+            for wanted in labels:
+                grouped[wanted] = []
+            for prediction in predictions:
+                bucket = grouped.get(prediction["label"])
+                # Predictions carrying a label that was not requested are dropped.
+                if bucket is not None:
+                    bucket.append(prediction["text"])
+            filled = len([found for found in grouped.values() if found])
+            logging.info(f"NER: Extracted entities for {filled} labels.")
+            return grouped
+        except Exception as crash:
+            logging.error(f"NER: Extraction pipeline crashed: {crash}")
+            return {}
+    def process(self, text: str) -> Dict[str, List[str]]:
+        """Run ``extract_entities`` with the default label set."""
+        return self.extract_entities(text)

core/ocr_engine.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import logging
+import cv2
+import os
+import numpy as np
+from PIL import Image, ImageEnhance
+from .base import BaseOCR
+from .gradio_ocr import GradioOCREngine
+class OCREngine(BaseOCR):
+    """Tiered OCR: a local engine (PaddleOCR or EasyOCR) with a remote Gradio fallback."""
+    def __init__(self, engine_type='paddle'):
+        # May be rewritten to 'easyocr' by _initialize_engine() if PaddleOCR is unavailable.
+        self.engine_type = engine_type
+        self.ocr = None
+        self.gradio_fallback = None
+        self._initialize_engine()
+    def _initialize_engine(self):
+        """Set up the Gradio fallback and the local engine; failures are logged, not raised."""
+        logging.info(f"Initializing OCR engine: {self.engine_type}")
+        # Pre-emptive Gradio initialization as it's the most reliable fallback
+        try:
+            self.gradio_fallback = GradioOCREngine()
+        except Exception as e:
+            logging.error(f"Failed to pre-initialize Gradio fallback: {e}")
+        if self.engine_type == 'paddle':
+            try:
+                from paddleocr import PaddleOCR
+                self.ocr = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
+                logging.info("PaddleOCR engine initialized successfully.")
+            except Exception as e:
+                logging.warning(f"Failed to initialize PaddleOCR: {e}. Switching to EasyOCR fallback.")
+                self.engine_type = 'easyocr'
+        # Not an elif: a failed PaddleOCR setup falls through to EasyOCR here.
+        if self.engine_type == 'easyocr':
+            try:
+                import easyocr
+                self.ocr = easyocr.Reader(['en'])
+                logging.info("EasyOCR engine initialized successfully.")
+            except Exception as e:
+                logging.error(f"Failed to initialize EasyOCR: {e}. OCR will be partially unavailable.")
+                self.ocr = None
+    def preprocess_image(self, image_path, scale=2):
+        """Upscale, denoise, sharpen and contrast-boost the image for OCR.
+        Returns:
+            The processed image as a BGR array, or ``None`` when the file cannot be read
+            or any processing step raises.
+        """
+        try:
+            image = cv2.imread(image_path)
+            if image is None:
+                logging.error(f"Image not found or unreadable: {image_path}")
+                return None
+            # Upscale
+            height, width = image.shape[:2]
+            image = cv2.resize(image, (width * scale, height * scale), interpolation=cv2.INTER_CUBIC)
+            # Denoise
+            image = cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
+            # Sharpen
+            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
+            image = cv2.filter2D(image, -1, kernel)
+            # Enhance Contrast (done through PIL, hence the BGR <-> RGB round trip)
+            pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+            enhancer = ImageEnhance.Contrast(pil_img)
+            enhanced_image = enhancer.enhance(1.5)
+            logging.debug(f"Preprocessing completed for {image_path}")
+            return cv2.cvtColor(np.array(enhanced_image), cv2.COLOR_RGB2BGR)
+        except Exception as e:
+            logging.error(f"Error during image preprocessing for {image_path}: {e}")
+            return None
+    def _run_local_ocr(self, image_path: str) -> str:
+        """Run the configured local engine on the preprocessed image; ``""`` on any failure."""
+        if not self.ocr or self.engine_type not in ('paddle', 'easyocr'):
+            return ""
+        try:
+            processed_img = self.preprocess_image(image_path)
+            if processed_img is None:
+                return ""
+            if self.engine_type == 'paddle':
+                results = self.ocr.ocr(processed_img)
+                if results and results[0]:
+                    return " ".join([line[1][0] for line in results[0]])
+                return ""
+            results = self.ocr.readtext(processed_img)
+            return " ".join([res[1] for res in results])
+        except Exception as e:
+            engine_label = "PaddleOCR" if self.engine_type == 'paddle' else "EasyOCR"
+            logging.error(f"{engine_label} crashed: {e}")
+            return ""
+    def extract_text(self, image_path: str) -> str:
+        """Extract text from the image, trying the local engine first and Gradio second.
+        Returns:
+            The extracted text, or ``""`` when every method failed or found nothing.
+        """
+        logging.info(f"Starting text extraction for: {os.path.basename(image_path)}")
+        # Tiered Extraction Strategy:
+        # 1. Primary Engine (Paddle/EasyOCR)
+        # 2. Gradio Remote Fallback (Very reliable)
+        extracted_text = self._run_local_ocr(image_path)
+        # Remember which tier produced the text so the log below names the right one.
+        source = self.engine_type
+        if not extracted_text and self.gradio_fallback:
+            logging.info("Local OCR failed or returned empty. Trying Gradio OCR fallback...")
+            extracted_text = self.gradio_fallback.extract_text(image_path)
+            source = 'Gradio'
+        if extracted_text:
+            logging.info(f"OCR extracted {len(extracted_text)} characters using {source}.")
+        else:
+            logging.error("All OCR methods failed to extract text.")
+        return extracted_text
+    def process(self, image_path: str) -> str:
+        """Alias for ``extract_text`` required by the engine interface."""
+        return self.extract_text(image_path)

core/vlm_engine.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import os
+import base64
+import json
+import logging
+import requests
+import cv2
+from typing import Dict, Any
+from .base import BaseVLM
+class GroqVLMEngine(BaseVLM):
+    """Vision-language extractor that sends the image to Groq's chat-completions endpoint."""
+    def __init__(self, model="meta-llama/llama-4-scout-17b-16e-instruct"):
+        # Without a key every extraction call returns {} immediately.
+        self.api_key = os.getenv("GROQ_API_KEY")
+        self.url = "https://api.groq.com/openai/v1/chat/completions"
+        self.model = model
+        if not self.api_key:
+            logging.warning("GROQ_API_KEY missing from environment. VLM extraction will be skipped.")
+    def image_to_base64(self, image_path: str) -> str:
+        """Re-encode the image as JPEG and return it base64-encoded; ``""`` on any failure."""
+        try:
+            img = cv2.imread(image_path)
+            if img is None:
+                logging.error(f"VLM: Image not found at {image_path}")
+                return ""
+            # imencode reports failure through its first return value rather than raising.
+            encoded_ok, buffer = cv2.imencode(".jpg", img)
+            if not encoded_ok:
+                logging.error(f"VLM: Failed to encode image as JPEG: {image_path}")
+                return ""
+            return base64.b64encode(buffer).decode("utf-8")
+        except Exception as e:
+            logging.error(f"VLM: Error converting image to base64: {e}")
+            return ""
+    def extract_structured_data(self, image_path: str, prompt: str) -> Dict[str, Any]:
+        """Ask the model to answer ``prompt`` about the image and parse the JSON reply.
+        Returns:
+            The parsed JSON object, or ``{}`` when the key is missing, the image cannot be
+            encoded, the API answers with a non-200 status, the request fails, or the
+            reply is not a JSON object.
+        """
+        if not self.api_key:
+            return {}
+        logging.info(f"VLM: Starting extraction for {os.path.basename(image_path)} using {self.model}")
+        base64_image = self.image_to_base64(image_path)
+        if not base64_image:
+            return {}
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You are a strict information extraction engine for business cards. Return only valid JSON. Do not include any other text."
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
+                        }
+                    ]
+                }
+            ],
+            "response_format": {"type": "json_object"},
+            "temperature": 0.1
+        }
+        try:
+            resp = requests.post(self.url, headers=headers, json=payload, timeout=60)
+            if resp.status_code != 200:
+                logging.error(f"VLM API Error: {resp.status_code} - {resp.text}")
+                return {}
+            content = resp.json()["choices"][0]["message"]["content"]
+            data = json.loads(content)
+            # Callers are promised a dict; reject valid JSON that is not an object.
+            if not isinstance(data, dict):
+                logging.error(f"VLM: Expected a JSON object but got {type(data).__name__}.")
+                return {}
+            logging.info(f"VLM: Successfully extracted structured data from {os.path.basename(image_path)}")
+            return data
+        except requests.exceptions.Timeout:
+            logging.error("VLM: Request timed out.")
+            return {}
+        except Exception as e:
+            logging.error(f"VLM: Unexpected error: {e}")
+            return {}
+    def process(self, image_path: str) -> Dict[str, Any]:
+        """Extract the standard visiting-card fields from the image using a fixed prompt."""
+        prompt = """
+        Extract structured text from this business card and return ONLY valid JSON.
+        Fields: Name, Designation, Company, Contact, Address, Email, Link.
+        Every value must be a JSON array. If not found, use [].
+        """
+        return self.extract_structured_data(image_path, prompt)

requirements.txt CHANGED Viewed

@@ -1,16 +1,18 @@
-Flask
-huggingface_hub
-python-dotenv
-easyocr
-Pillow
-opencv-python
-numpy
-paddle-bfloat
-paddlepaddle
-paddleocr
-torch
-transformers
-flair
-seqeval
-tqdm
-gunicorn

+Flask
+python-dotenv
+Pillow
+opencv-python
+numpy
+paddle-bfloat
+paddlepaddle>=2.6.0
+paddleocr>=2.7.0
+gradio_client
+easyocr
+langchain
+langchain-community
+torch
+transformers
+flair
+tqdm
+gunicorn
+requests

static/uploads/IN_Standard-Visiting-Cards_Overview.png ADDED Viewed

templates/index.html CHANGED Viewed

@@ -1,284 +1,236 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>AI Data Extractor</title>
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" />
-    <style>
-        body {
-            background-color: #1c1c1e;
-            font-family: "Poppins", sans-serif;
-            color: #f5f5f7;
-            margin: 0;
-        }
-        h1 {
-            color: #e5e5e7;
-            text-align: center;
-            margin-bottom: 20px;
-        }
-        .container {
-            margin-top: 70px;
-        }
-        .file-upload-section {
-            background-color: #2c2c2e;
-            padding: 30px;
-            border-radius: 15px;
-            box-shadow: 0 8px 16px rgba(0, 0, 0, 0.5);
-            text-align: center;
-        }
-        .file-upload-section input[type="file"] {
-            margin: 20px 0;
-        }
-        .file-upload-section input[type="submit"] {
-            background-color: #ee4410;
-            color: white;
-            border: none;
-            padding: 10px 20px;
-            border-radius: 5px;
-            transition: background-color 0.3s ease;
-        }
-        .file-upload-section input[type="submit"]:hover {
-            background-color: #ee4410;
-        }
-        .file-actions a {
-            margin: 0 10px;
-            text-decoration: none;
-            color: #ee4410;
-        }
-        .file-actions a:hover {
-            color: #ee4410;
-        }
-        .flash-message {
-            margin-bottom: 20px;
-            padding: 15px;
-            border-radius: 5px;
-            color: #333;
-        }
-        .alert {
-            text-align: center;
-            position: sticky;
-            top: 0;
-            right: 15%;
-        }
-        /* Loader styles */
-        .loader {
-            border: 8px solid #f3f3f3;
-            border-top: 8px solid #ee4410;
-            border-radius: 50%;
-            width: 60px;
-            height: 60px;
-            animation: spin 2s linear infinite;
-            margin: 20px auto;
-            display: none;
-        }
-        @keyframes spin {
-            0% {
-                transform: rotate(0deg);
-            }
-            100% {
-                transform: rotate(360deg);
-            }
-        }
-        /* Top bar styles */
-        .top-bar {
-            background-color: #333;
-            position: fixed;
-            top: 0;
-            width: 100%;
-            z-index: 1000;
-            padding: 10px 20px;
-            display: flex;
-            justify-content: space-between;
-            align-items: center;
-        }
-        .top-bar h2 {
-            color: white;
-        }
-        /* Navigation tab styles */
-        .tab {
-            display: flex;
-            gap: 10px;
-        }
-        .tab button {
-            background-color: inherit;
-            border: none;
-            outline: none;
-            cursor: pointer;
-            padding: 10px 16px;
-            transition: 0.3s;
-            font-size: 17px;
-            color: white;
-        }
-        .tab button:hover {
-            background-color: #575757;
-            cursor: pointer;
-        }
-        .tab button.active {
-            background-color: #ee4410;
-        }
-        /* Tab content styles */
-        .tabcontent {
-            display: none;
-            padding: 20px;
-            margin-top: 70px;
-        }
-        .disabled {
-            cursor: not-allowed !important;
-            opacity: 0.6;/* Set cursor to not-allowed */
-        }
-        /* Responsive design */
-        @media (max-width: 768px) {
-            .tab {
-                flex-direction: column;
-            }
-        }
-    </style>
-</head>
-<body>
-    <!-- Locked Top Bar with Tabs -->
-    <div class="top-bar">
-        <h2>AI Data Extractor</h2>
-        <!-- Navigation Tabs -->
-         <div class="tab">
-            <button class="tablinks active" onclick="openLink('https://webashlarwa-imagedataextractor2.hf.space/', this, '#ff4d00')" id="defaultOpen">Visiting Card Data Extractor</button>
-            <button class="tablinks" onclick="openLink('https://webashlarwa-resumeextractor2.hf.space/', this, '#ff4d00')">Resume Data Extractor</button>
-      </div>
-    </div>
-     <div class="container">
-        <h1>Visiting Card Data Extractor</h1>
-        <div class="file-upload-section">
-            <form id="fileUploadForm" action="{{ url_for('upload_file') }}" method="POST" enctype="multipart/form-data">
-                <input type="file" name="files" multiple class="form-control" required />
-                <input type="submit" value="Upload your Images" class="btn btn-outline-primary" />
-            </form>
-            {% if session.get('uploaded_files') %}
-            <p class="mt-4">
-                Uploaded:
-                <span class="text-danger">{{ session.get('uploaded_files') }}</span>
-            </p>
-            <form action="{{ url_for('remove_file') }}" method="post">
-                <button type="submit" class="btn btn-outline-danger">
-                    <i class="bi bi-trash"></i> Remove Uploaded File
-                </button>
-            </form>
-            {% endif %}
-        </div>
-        <div class="container">
-            <!-- Loader -->
-            <div class="loader" id="loader"></div>
-        </div>
-        {% with messages = get_flashed_messages() %} {% if messages %}
-        <div class="alert alert-success mt-4" id="flash-message">
-            {{ messages[0] }}
-        </div>
-        {% endif %} {% endwith %}
-    </div>
-    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js"></script>
-    <script>
-        // Loader functionality
-        document.getElementById('fileUploadForm').onsubmit = function() {
-          document.getElementById('loader').style.display = 'block';
-          // Disable the tab buttons and apply disabled class
-          const buttons = document.querySelectorAll('.tab button');
-          buttons.forEach(button => {
-            button.setAttribute('disabled', 'true');
-            button.classList.add('disabled'); // Add disabled class
-          });
-          // Show processing message
-          const processingMessage = document.createElement('p');
-          processingMessage.id = 'processing-message';
-          processingMessage.textContent = 'Processing, please wait...';
-          processingMessage.style.color = '#e68a10'; // Style as needed
-          document.querySelector('.file-upload-section').appendChild(processingMessage);
-        };
-        // Flash message auto-hide
-        setTimeout(function () {
-          let flashMessage = document.getElementById("flash-message");
-          if (flashMessage) {
-            flashMessage.style.transition = "opacity 1s ease";
-            flashMessage.style.opacity = 0;
-            setTimeout(() => flashMessage.remove(), 1000);
-          }
-          // After processing is complete (You can adjust this based on your logic)
-          const processingMessage = document.getElementById('processing-message');
-          if (processingMessage) {
-            processingMessage.remove(); // Remove the processing message
-          }
-          // Re-enable tab buttons and remove disabled class
-          const buttons = document.querySelectorAll('.tab button');
-          buttons.forEach(button => {
-            button.removeAttribute('disabled');
-            button.classList.remove('disabled'); // Remove disabled class
-          });
-        }, 3000); // Adjust timing based on your upload duration
-        // Function to open links in the same tab
-        function openLink(url, element) {
-          window.location.href = url; // Redirects to the specified URL in the same tab
-          // Remove "active" class from all buttons
-          const buttons = document.querySelectorAll('.tab button');
-          buttons.forEach(button => button.classList.remove('active'));
-          // Add "active" class to the clicked button
-          element.classList.add('active');
-        }
-        //Refreshing the cookie
-        function setCookie(name, value, days) {
-            let expires = "";
-            if (days) {
-                const date = new Date();
-                date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
-                expires = "; expires=" + date.toUTCString();
-            }
-            document.cookie = name + "=" + (value || "") + expires + "; path=/";
-        }
-        function deleteCookie(name) {
-            document.cookie = name + '=; Max-Age=0; path=/;'; // Delete the cookie
-        }
-        // Set the cookie (you can comment this out after testing)
-        setCookie('myCookie', 'myValue', 1); // Sets a cookie for demonstration
-        // Automatically delete the cookie when the page is loaded or refreshed
-        window.onload = function() {
-            deleteCookie('myCookie'); // Replace 'myCookie' with your cookie name
-        }
-    </script>
-</body>
-</html>

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>AI Data Extractor - Visiting Card</title>
+    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600&display=swap" rel="stylesheet">
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" />
+    <style>
+        :root {
+            --primary: #ee4410;
+            --secondary: #ff9f0a;
+            --bg-dark: #0a0a0c;
+            --card-bg: rgba(30, 30, 35, 0.7);
+            --text-glow: 0 0 10px rgba(238, 68, 16, 0.5);
+        }
+        body {
+            background-color: var(--bg-dark);
+            font-family: 'Outfit', sans-serif;
+            color: #f5f5f7;
+            overflow-x: hidden;
+            background: radial-gradient(circle at 50% 50%, #1a1a1f 0%, #0a0a0c 100%);
+            min-height: 100vh;
+        }
+        .glass-card {
+            background: var(--card-bg);
+            backdrop-filter: blur(12px);
+            border: 1px solid rgba(255, 255, 255, 0.1);
+            border-radius: 20px;
+            box-shadow: 0 15px 35px rgba(0, 0, 0, 0.4);
+            padding: 2.5rem;
+            margin-top: 2rem;
+        }
+        .premium-title {
+            background: linear-gradient(135deg, #fff 0%, #aaa 100%);
+            -webkit-background-clip: text;
+            background-clip: text;
+            -webkit-text-fill-color: transparent;
+            font-weight: 600;
+            letter-spacing: -1px;
+            text-shadow: var(--text-glow);
+            margin-bottom: 2rem;
+            text-align: center;
+        }
+        .top-bar {
+            background: rgba(20, 20, 25, 0.8);
+            backdrop-filter: blur(10px);
+            padding: 1rem 2rem;
+            border-bottom: 1px solid rgba(255, 255, 255, 0.05);
+            position: sticky;
+            top: 0;
+            z-index: 1000;
+        }
+        .tab-btn {
+            background: transparent;
+            border: 1px solid rgba(255, 255, 255, 0.1);
+            color: #8e8e93;
+            padding: 0.6rem 1.2rem;
+            border-radius: 30px;
+            margin-right: 10px;
+            transition: all 0.3s ease;
+            text-decoration: none;
+            font-size: 0.9rem;
+        }
+        .tab-btn:hover, .tab-btn.active {
+            border-color: var(--primary);
+            color: #fff;
+            background: rgba(238, 68, 16, 0.1);
+        }
+        .tab-btn.active {
+            box-shadow: 0 0 15px rgba(238, 68, 16, 0.2);
+        }
+        .upload-area {
+            border: 2px dashed rgba(255, 255, 255, 0.1);
+            border-radius: 15px;
+            padding: 3rem;
+            text-align: center;
+            transition: all 0.3s ease;
+            cursor: pointer;
+            margin-bottom: 2rem;
+        }
+        .upload-area:hover {
+            border-color: var(--primary);
+            background: rgba(238, 68, 16, 0.05);
+        }
+        .btn-premium {
+            background: linear-gradient(135deg, var(--primary) 0%, var(--secondary) 100%);
+            border: none;
+            color: white;
+            padding: 0.8rem 2rem;
+            border-radius: 30px;
+            font-weight: 600;
+            box-shadow: 0 5px 15px rgba(238, 68, 16, 0.3);
+            transition: all 0.3s ease;
+            width: 100%;
+        }
+        .btn-premium:hover:not(:disabled) {
+            transform: scale(1.02);
+            box-shadow: 0 8px 25px rgba(238, 68, 16, 0.5);
+        }
+        .btn-premium:disabled {
+            opacity: 0.6;
+            cursor: not-allowed;
+        }
+        .loader {
+            display: none;
+            width: 40px;
+            height: 40px;
+            border: 3px solid rgba(255,255,255,0.1);
+            border-top: 3px solid var(--primary);
+            border-radius: 50%;
+            animation: spin 1s linear infinite;
+            margin: 2rem auto;
+        }
+        @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }
+        .alert-premium {
+            background: rgba(40, 40, 45, 0.9);
+            border: 1px solid var(--primary);
+            color: #fff;
+            border-radius: 12px;
+            padding: 1rem;
+            margin-top: 1.5rem;
+            text-align: center;
+        }
+        .file-list {
+            margin-top: 1.5rem;
+            padding: 1rem;
+            background: rgba(0, 0, 0, 0.2);
+            border-radius: 10px;
+            font-size: 0.9rem;
+        }
+    </style>
+</head>
+<body>
+    <div class="top-bar d-flex justify-content-between align-items-center">
+        <h4 class="mb-0 text-white" style="font-weight: 600; letter-spacing: -0.5px;">AI Extractor</h4>
+        <div class="d-none d-md-flex">
+            <a href="#" class="tab-btn active">Visiting Card</a>
+            <a href="https://webashalarforml-resumeextractor2.hf.space/" class="tab-btn">Resume Detail</a>
+        </div>
+    </div>
+    <div class="container py-5">
+        <div class="row justify-content-center">
+            <div class="col-lg-6">
+                <div class="glass-card">
+                    <h1 class="premium-title">Card Scanner <small style="display: block; font-size: 0.4em; letter-spacing: 2px; color: var(--primary); margin-top: 5px;">POWERED BY GROQ VLM</small></h1>
+                    <form id="uploadForm" action="{{ url_for('upload_file') }}" method="POST" enctype="multipart/form-data">
+                        <div class="upload-area" onclick="document.getElementById('fileInput').click()">
+                            <svg xmlns="http://www.w3.org/2000/svg" width="48" height="48" fill="currentColor" class="bi bi-cloud-upload text-muted mb-3" viewBox="0 0 16 16">
+                                <path fill-rule="evenodd" d="M4.406 3.342A5.53 5.53 0 0 1 8 2c2.69 0 4.923 2 5.166 4.579C14.758 6.804 16 8.137 16 9.773 16 11.569 14.502 13 12.687 13H3.781C1.708 13 0 11.366 0 9.318c0-1.763 1.266-3.223 2.942-3.593.143-.863.698-1.723 1.464-2.383zm.653.757c-.757.653-1.153 1.44-1.153 2.056v.448l-.445.049C2.064 6.805 1 7.952 1 9.318 1 10.785 2.23 12 3.781 12h8.906C13.98 12 15 10.988 15 9.773c0-1.216-1.02-2.228-2.313-2.228h-.5v-.5C12.188 4.825 10.328 3 8 3a4.53 4.53 0 0 0-2.941 1.1z"/>
+                                <path fill-rule="evenodd" d="M7.646 5.146a.5.5 0 0 1 .708 0l2 2a.5.5 0 0 1-.708.708L8.5 6.707V10.5a.5.5 0 0 1-1 0V6.707L6.354 7.854a.5.5 0 1 1-.708-.708l2-2z"/>
+                            </svg>
+                            <p class="mb-0 text-muted">Click or Drag & Drop Business Cards</p>
+                            <input type="file" name="files" id="fileInput" multiple style="display: none;" required onchange="updateFileList(this)" />
+                        </div>
+                        <div id="fileList" class="file-list" style="display: none;"></div>
+                        <button type="submit" id="submitBtn" class="btn-premium mt-3">Start Extraction</button>
+                    </form>
+                    <div class="loader" id="loader"></div>
+                    <p id="loadingMsg" class="text-center text-muted small mt-2" style="display: none;">Analyzing images with AI engine...</p>
+                    {% if session.get('uploaded_files') %}
+                    <div class="mt-4 pt-3 border-top border-secondary">
+                        <div class="d-flex justify-content-between align-items-center">
+                            <span class="small text-muted">Ready to process: {{ session.get('uploaded_files')|length }} files</span>
+                            <a href="{{ url_for('reset_upload') }}" class="text-danger small text-decoration-none">Clear All</a>
+                        </div>
+                    </div>
+                    {% endif %}
+                    {% with messages = get_flashed_messages() %}
+                    {% if messages %}
+                        <div class="alert-premium" id="flashMessage">
+                            {{ messages[0] }}
+                        </div>
+                    {% endif %}
+                    {% endwith %}
+                </div>
+            </div>
+        </div>
+    </div>
+    <script>
+        function updateFileList(input) {
+            const list = document.getElementById('fileList');
+            if (input.files.length > 0) {
+                list.style.display = 'block';
+                list.innerHTML = '<strong>Selected:</strong><br>' +
+                    Array.from(input.files).map(f => f.name).join('<br>');
+            } else {
+                list.style.display = 'none';
+            }
+        }
+        document.getElementById('uploadForm').onsubmit = function() {
+            document.getElementById('loader').style.display = 'block';
+            document.getElementById('loadingMsg').style.display = 'block';
+            document.getElementById('submitBtn').disabled = true;
+            document.getElementById('submitBtn').innerText = 'Processing...';
+        };
+        // Flash message auto-hide
+        setTimeout(() => {
+            const flash = document.getElementById('flashMessage');
+            if (flash) {
+                flash.style.transition = 'opacity 1s ease';
+                flash.style.opacity = '0';
+                setTimeout(() => flash.remove(), 1000);
+            }
+        }, 4000);
+    </script>
+</body>
+</html>

templates/result.html CHANGED Viewed

@@ -1,248 +1,326 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>Processed Results</title>
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" />
-    <style>
-        body {
-            background-color: #1c1c1e;
-            font-family: "Poppins", sans-serif;
-            color: #f5f5f7;
-        }
-        h1 {
-            color: #e5e5e7;
-            text-align: center;
-        }
-        .cont {
-            background-color: #2c2c2e;
-            padding: 30px;
-            border-radius: 15px;
-            box-shadow: 0 8px 16px rgba(0, 0, 0, 0.5);
-            transition: 1s ease;
-        }
-        .section-title {
-            color: #ee4410;
-            font-size: 1.5rem;
-            font-weight: bold;
-            margin-top: 20px;
-            border-bottom: 2px solid #ee4410;
-            padding-bottom: 10px;
-        }
-        .card {
-            background-color: #3a3a3c;
-            color: #f5f5f7;
-            border-radius: 10px;
-            margin-bottom: 15px;
-            padding: 15px;
-            box-shadow: 0 4px 10px rgba(0, 0, 0, 0.3);
-            transition: background-color 0.3s ease;
-        }
-        .card:hover {
-            background-color: #3a3a3c98;
-        }
-        .card-title {
-            color: #ee4410;
-            font-size: 1.2rem;
-            font-weight: bold;
-        }
-        .card-text {
-            color: #d1d1d6;
-            font-size: 1rem;
-        }
-        ul {
-            list-style-type: none;
-            padding-left: 0;
-        }
-        li::before {
-            content: "• ";
-            color: #ee4410;
-        }
-        .btn-reset {
-            background-color: #ff9f0a;
-            color: white;
-            border: none;
-            padding: 10px 20px;
-            border-radius: 5px;
-            transition: background-color 0.3s ease;
-            margin-bottom: 20px;
-        }
-        .btn-reset:hover {
-            background-color: #e03a2f;
-        }
-        .alert {
-            text-align: center;
-            position: absolute;
-            top: 0;
-            right: 15%;
-        }
-        .image-container img {
-            max-width: 100%;
-            border-radius: 10px;
-        }
-    </style>
-</head>
-<body>
-    <div class="container">
-        {% with messages = get_flashed_messages() %} {% if messages %}
-        <div class="alert alert-success mt-4 " id="flash-message">
-            {{ messages[0] }}
-        </div>
-        {% endif %} {% endwith %}
-    </div>
-    <div class="container cont mt-5">
-        <div class="d-flex align-items-center justify-content-between">
-            <h1>Extracted Details From Image</h1>
-            <!-- Reset Button -->
-            <div class="text-center mt-4">
-                <a href="{{ url_for('reset_upload') }}" class="btn btn-reset">Reset & Upload New File</a>
-            </div>
-        </div>
-        {% if data %}
-        <!-- Personal Information Section -->
-        <section>
-            <h3 class="section-title">Extracted Information</h3>
-            <div class="row">
-                <!-- Image Container on the Left -->
-                <div class="col-md-6 image-container">
-                   <div class="card">
-                        <div class="card-body">
-                    {% if data.extracted_text.items() %}
-                    <h5 class="card-title">Extracted Image:</h5>
-                    <ul>
-                        {% for filename, text in data.extracted_text.items() %}
-                        <!--<li>{{ filename }}:</li>-->
-                        <img src="{{ Img[filename] }}" alt="Processed Image" class="img-fluid" />
-                        {% endfor %}
-                    </ul>
-                    {% endif %}
-                          </div>
-                     </div>
-                </div>
-                <!-- Extracted Text on the Right -->
-                <div class="col-md-6">
-                    <div class="card">
-                        <div class="card-body">
-                            {% if data.name and data.name is iterable and data.name is not string %}
-                            <h5 class="card-title">Name:</h5>
-                            <ul>
-                                {% for value in data.name %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                            {% if data.Designation and data.Designation is iterable and data.Designation is not string %}
-                            <h5 class="card-title">Designation:</h5>
-                            <ul>
-                                {% for value in data.Designation %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                            {% if data.contact_number and data.contact_number is iterable and data.contact_number is not string %}
-                            <h5 class="card-title">Contact number:</h5>
-                            <ul>
-                                {% for value in data.contact_number %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                            {% if data.email and data.email is iterable and data.email is not string %}
-                            <h5 class="card-title">Email:</h5>
-                            <ul>
-                                {% for value in data.email %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                            {% if data.Location and data.Location is iterable and data.Location is not string %}
-                            <h5 class="card-title">Location:</h5>
-                            <ul>
-                                {% for value in data.Location %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                            {% if data.Link and data.Link is iterable and data.Link is not string %}
-                            <h5 class="card-title">Link:</h5>
-                            <ul>
-                                {% for value in data.Link %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                            {% if data.Company and data.Company is iterable and data.Company is not string %}
-                            <h5 class="card-title">Organisation:</h5>
-                            <ul>
-                                {% for value in data.Company %}
-                                {% if value|lower != 'not found' %}
-                                <li>{{ value }}</li>
-                                {% endif %}
-                                {% endfor %}
-                            </ul>
-                            {% endif %}
-                        </div>
-                    </div>
-                </div>
-            </div>
-        </section>
-        {% else %}
-        <p>No data available. Please process a file.</p>
-        {% endif %}
-    </div>
-    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js"></script>
-    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
-    <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.5.4/dist/umd/popper.min.js"></script>
-    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>
-    <script>
-        setTimeout(function () {
-            let flashMessage = document.getElementById("flash-message");
-            if (flashMessage) {
-                flashMessage.style.transition = "opacity 1s ease";
-                flashMessage.style.opacity = 0;
-                setTimeout(() => flashMessage.remove(), 1000);
-            }
-        }, 3000);
-    </script>
-</body>
-</html>

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Processed Results</title>
+    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600&display=swap" rel="stylesheet">
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" />
+    <style>
+        :root {
+            --primary: #ee4410;
+            --secondary: #ff9f0a;
+            --bg-dark: #0a0a0c;
+            --card-bg: rgba(30, 30, 35, 0.7);
+            --text-glow: 0 0 10px rgba(238, 68, 16, 0.5);
+        }
+        body {
+            background-color: var(--bg-dark);
+            font-family: 'Outfit', sans-serif;
+            color: #f5f5f7;
+            overflow-x: hidden;
+            background: radial-gradient(circle at 50% 50%, #1a1a1f 0%, #0a0a0c 100%);
+            min-height: 100vh;
+        }
+        .glass-card {
+            background: var(--card-bg);
+            backdrop-filter: blur(12px);
+            border: 1px solid rgba(255, 255, 255, 0.1);
+            border-radius: 20px;
+            box-shadow: 0 15px 35px rgba(0, 0, 0, 0.4);
+            transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 1.275);
+            padding: 2rem;
+            margin-bottom: 2rem;
+        }
+        .glass-card:hover {
+            transform: translateY(-5px);
+            border-color: rgba(238, 68, 16, 0.3);
+            box-shadow: 0 20px 45px rgba(238, 68, 16, 0.1);
+        }
+        .premium-title {
+            background: linear-gradient(135deg, #fff 0%, #aaa 100%);
+            -webkit-background-clip: text;
+            background-clip: text;
+            -webkit-text-fill-color: transparent;
+            font-weight: 600;
+            letter-spacing: -1px;
+            text-shadow: var(--text-glow);
+            margin-bottom: 1.5rem;
+        }
+        .section-title {
+            color: var(--primary);
+            font-size: 1.1rem;
+            text-transform: uppercase;
+            letter-spacing: 2px;
+            margin-bottom: 1.5rem;
+            display: flex;
+            align-items: center;
+        }
+        .section-title::after {
+            content: '';
+            flex: 1;
+            height: 1px;
+            background: linear-gradient(90deg, var(--primary), transparent);
+            margin-left: 1rem;
+        }
+        .data-item {
+            margin-bottom: 1.5rem;
+            border-left: 3px solid var(--primary);
+            padding-left: 1rem;
+            animation: fadeIn 0.6s ease-out forwards;
+            opacity: 0;
+        }
+        @keyframes fadeIn {
+            from { opacity: 0; transform: translateX(-10px); }
+            to { opacity: 1; transform: translateX(0); }
+        }
+        .data-label {
+            color: #8e8e93;
+            font-size: 0.85rem;
+            margin-bottom: 0.2rem;
+        }
+        .data-value {
+            font-size: 1.1rem;
+            color: #fff;
+            list-style: none;
+            padding: 0;
+        }
+        .data-value li {
+            margin-bottom: 0.4rem;
+        }
+        .result-img-container {
+            border-radius: 15px;
+            overflow: hidden;
+            border: 1px solid rgba(255, 255, 255, 0.1);
+            position: relative;
+        }
+        .result-img-container img {
+            width: 100%;
+            height: auto;
+            display: block;
+            transition: transform 0.5s ease;
+        }
+        .result-img-container:hover img {
+            transform: scale(1.05);
+        }
+        .btn-premium {
+            background: linear-gradient(135deg, var(--primary) 0%, var(--secondary) 100%);
+            border: none;
+            color: white;
+            padding: 0.8rem 2rem;
+            border-radius: 30px;
+            font-weight: 600;
+            box-shadow: 0 5px 15px rgba(238, 68, 16, 0.3);
+            text-decoration: none;
+            transition: all 0.3s ease;
+            display: inline-block;
+        }
+        .btn-premium:hover {
+            transform: scale(1.05);
+            box-shadow: 0 8px 25px rgba(238, 68, 16, 0.5);
+            color: #fff;
+        }
+        .alert-premium {
+            background: rgba(40, 40, 45, 0.9);
+            border: 1px solid var(--primary);
+            color: #fff;
+            border-radius: 12px;
+            padding: 1rem;
+            animation: slideDown 0.5s cubic-bezier(0.19, 1, 0.22, 1);
+        }
+        @keyframes slideDown {
+            from { transform: translateY(-50px); opacity: 0; }
+            to { transform: translateY(0); opacity: 1; }
+        }
+        .debug-panel {
+            margin-top: 4rem;
+            padding: 2rem;
+            background: rgba(0, 0, 0, 0.3);
+            border-top: 1px solid rgba(255, 255, 255, 0.05);
+            font-family: monospace;
+            font-size: 0.8rem;
+            color: #666;
+        }
+    </style>
+</head>
+<body>
+    <div class="container py-5">
+        {% with messages = get_flashed_messages() %}
+        {% if messages %}
+        <div class="alert-premium mb-4" id="flash-message">
+            {{ messages[0] }}
+        </div>
+        {% endif %}
+        {% endwith %}
+        <div class="d-flex align-items-center justify-content-between mb-5">
+            <h1 class="premium-title mb-0">Extraction Analysis <small style="font-size: 0.4em; color: var(--primary); letter-spacing: 2px; font-weight: 400; display: block; text-align: left;">v2.1 GOLD</small></h1>
+            <a href="{{ url_for('reset_upload') }}" class="btn-premium" style="width: auto;">Process New Image</a>
+        </div>
+        {% if data %}
+        <div class="row g-4">
+            <!-- Source Image Column -->
+            <div class="col-lg-5">
+                <div class="glass-card">
+                    <h3 class="section-title">Source Image</h3>
+                    {% if Img %}
+                        {% for filename, result_path in Img.items() %}
+                        <div class="result-img-container mb-3">
+                            <img src="{{ result_path }}" alt="Analyzed Document" />
+                        </div>
+                        <p class="text-muted small">File: {{ filename | basename }}</p>
+                        {% endfor %}
+                    {% else %}
+                        <div class="p-4 text-center text-muted">No image path available</div>
+                    {% endif %}
+                </div>
+            </div>
+            <!-- Extracted Details Column -->
+            <div class="col-lg-7">
+                <div class="glass-card">
+                    <h3 class="section-title">Verified Details</h3>
+                    <div class="row">
+                        <!-- Left Data Sub-col -->
+                        <div class="col-md-6">
+                            {% if data.name %}
+                            <div class="data-item" style="animation-delay: 0.1s">
+                                <div class="data-label">Full Name</div>
+                                <ul class="data-value">
+                                    {% for val in data.name %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                            {% if data.Designation %}
+                            <div class="data-item" style="animation-delay: 0.2s">
+                                <div class="data-label">Designation</div>
+                                <ul class="data-value">
+                                    {% for val in data.Designation %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                            {% if data.Company %}
+                            <div class="data-item" style="animation-delay: 0.3s">
+                                <div class="data-label">Organization</div>
+                                <ul class="data-value">
+                                    {% for val in data.Company %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                        </div>
+                        <!-- Right Data Sub-col -->
+                        <div class="col-md-6">
+                            {% if data.contact_number %}
+                            <div class="data-item" style="animation-delay: 0.4s">
+                                <div class="data-label">Phone Numbers</div>
+                                <ul class="data-value">
+                                    {% for val in data.contact_number %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                            {% if data.email %}
+                            <div class="data-item" style="animation-delay: 0.5s">
+                                <div class="data-label">Email Addresses</div>
+                                <ul class="data-value">
+                                    {% for val in data.email %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                            {% if data.Location %}
+                            <div class="data-item" style="animation-delay: 0.6s">
+                                <div class="data-label">Address</div>
+                                <ul class="data-value">
+                                    {% for val in data.Location %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                            {% if data.Link %}
+                            <div class="data-item" style="animation-delay: 0.7s">
+                                <div class="data-label">Social/Web Links</div>
+                                <ul class="data-value">
+                                    {% for val in data.Link %}<li>{{ val }}</li>{% endfor %}
+                                </ul>
+                            </div>
+                            {% endif %}
+                        </div>
+                    </div>
+                    {% if data.status_message %}
+                    <div class="mt-4 pt-3 border-top border-secondary text-end">
+                        <span class="badge rounded-pill bg-dark text-muted" style="font-size: 0.65rem; border: 1px solid rgba(255,255,255,0.05)">{{ data.status_message }}</span>
+                    </div>
+                    {% endif %}
+                </div>
+            </div>
+        </div>
+        <!-- Debug / Raw Output Panel -->
+        <div class="debug-panel rounded">
+            <h5 class="mb-3" style="color: #444; font-size: 0.9rem">SYSTEM_LOG :: RAW_EXTRACTION_BUFFER</h5>
+            {% for filename, raw in data.extracted_text.items() %}
+            <div class="mb-4">
+                <div class="mb-1 text-primary">>> {{ filename | basename }}</div>
+                <div class="p-3 bg-black rounded" style="color: #0f0; opacity: 0.8; font-size: 0.85rem">
+                    {{ raw }}
+                </div>
+            </div>
+            {% endfor %}
+        </div>
+        {% else %}
+        <div class="text-center glass-card py-5">
+            <h2 class="premium-title">Waiting for Data...</h2>
+            <p class="text-muted">No analysis results found in session.</p>
+            <a href="{{ url_for('index') }}" class="btn-premium mt-3">Back to Upload</a>
+        </div>
+        {% endif %}
+    </div>
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js"></script>
+    <script>
+        // Auto-remove flash messages
+        setTimeout(function () {
+            let flashMessage = document.getElementById("flash-message");
+            if (flashMessage) {
+                flashMessage.style.transition = "all 0.8s ease";
+                flashMessage.style.opacity = 0;
+                flashMessage.style.transform = "translateY(-20px)";
+                setTimeout(() => flashMessage.remove(), 800);
+            }
+        }, 4000);
+    </script>
+</body>
+</html>

utility/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (3.77 kB). View file

utility/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (27 kB). View file

utility/__pycache__/utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

utility/utils.py CHANGED Viewed

@@ -1,700 +1,132 @@
-# libraries
 import os
-from huggingface_hub import InferenceClient
-from dotenv import load_dotenv
 import json
 import re
-#import easyocr
-from PIL import Image, ImageEnhance, ImageDraw
-import cv2
-import numpy as np
-from paddleocr import PaddleOCR
 import logging
-from datetime import datetime
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    handlers=[
-        logging.StreamHandler()  # Remove FileHandler and log only to the console
-    ]
-)
-# Set the PaddleOCR home directory to a writable location
-import os
-os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
-RESULT_FOLDER = 'static/results/'
-JSON_FOLDER = 'static/json/'
-if not os.path.exists('/tmp/.paddleocr'):
-    os.makedirs(RESULT_FOLDER, exist_ok=True)
-# Check if PaddleOCR home directory is writable
-if not os.path.exists('/tmp/.paddleocr'):
-    os.makedirs('/tmp/.paddleocr', exist_ok=True)
-    logging.info("Created PaddleOCR home directory.")
-else:
-    logging.info("PaddleOCR home directory exists.")
-# Load environment variables from .env file
-load_dotenv()
-# Authenticate with Hugging Face
-HFT = os.getenv('HF_TOKEN')
-# Initialize the InferenceClient
-client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
-def load_image(image_path):
-    ext = os.path.splitext(image_path)[1].lower()
-    if ext in ['.png', '.jpg', '.jpeg', '.webp', '.tiff']:
-        image = cv2.imread(image_path)
-        if image is None:
-            raise ValueError(f"Failed to load image from {image_path}. The file may be corrupted or unreadable.")
-        return image
-    else:
-        raise ValueError(f"Unsupported image format: {ext}")
-# Function for upscaling image using OpenCV's INTER_CUBIC
-def upscale_image(image, scale=2):
-    height, width = image.shape[:2]
-    upscaled_image = cv2.resize(image, (width * scale, height * scale), interpolation=cv2.INTER_CUBIC)
-    return upscaled_image
-# Function to denoise the image (reduce noise)
-def reduce_noise(image):
-    return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
-# Function to sharpen the image
-def sharpen_image(image):
-    kernel = np.array([[0, -1, 0],
-                       [-1, 5, -1],
-                       [0, -1, 0]])
-    sharpened_image = cv2.filter2D(image, -1, kernel)
-    return sharpened_image
-# Function to increase contrast and enhance details without changing color
-def enhance_image(image):
-    pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
-    enhancer = ImageEnhance.Contrast(pil_img)
-    enhanced_image = enhancer.enhance(1.5)
-    enhanced_image_bgr = cv2.cvtColor(np.array(enhanced_image), cv2.COLOR_RGB2BGR)
-    return enhanced_image_bgr
-# Complete function to process image
-def process_image(image_path, scale=2):
-    # Load the image
-    image = load_image(image_path)
-    # Upscale the image
-    upscaled_image = upscale_image(image, scale)
-    # Reduce noise
-    denoised_image = reduce_noise(upscaled_image)
-    # Sharpen the image
-    sharpened_image = sharpen_image(denoised_image)
-    # Enhance the image contrast and details without changing color
-    final_image = enhance_image(sharpened_image)
-    return final_image
-# Function for OCR with PaddleOCR, returning both text and bounding boxes
-def ocr_with_paddle(img):
-    final_text = ''
-    boxes = []
-    # Initialize PaddleOCR
-    # In /app/utility/utils.py
-    ocr = PaddleOCR(
-        use_angle_cls=True,
-        lang='en',
-        enable_mkldnn=False,  # <--- Add this line to disable the failing optimization
-        use_gpu=False         # Ensure this is False if you are on a CPU-only container
-    )
-    # ocr = PaddleOCR(
-    #     lang='en',
-    #     use_angle_cls=True,
-    #     det_model_dir=os.path.join(os.environ['PADDLEOCR_HOME'], 'whl/det'),
-    #     rec_model_dir=os.path.join(os.environ['PADDLEOCR_HOME'], 'whl/rec/en/en_PP-OCRv4_rec_infer'),
-    #     cls_model_dir=os.path.join(os.environ['PADDLEOCR_HOME'], 'whl/cls/ch_ppocr_mobile_v2.0_cls_infer')
-    # )
-    # ocr = PaddleOCR(
-    #     use_angle_cls=True,
-    #     lang='en',
-    #     det_model_dir='/app/paddleocr_models/whl/det/ch_ppocr_mobile_v2.0_det_infer',
-    #     rec_model_dir='/app/paddleocr_models/whl/rec/ch_ppocr_mobile_v2.0_rec_infer',
-    #     cls_model_dir='/app/paddleocr_models/whl/cls/ch_ppocr_mobile_v2.0_cls_infer'
-    # )
-    # Check if img is a file path or an image array
-    if isinstance(img, str):
-        img = cv2.imread(img)
-    # Perform OCR
-    result = ocr.ocr(img)
-    # Iterate through the OCR result
-    for line in result[0]:
-        # Check how many values are returned (2 or 3) and unpack accordingly
-        if len(line) == 3:
-            box, (text, confidence), _ = line  # When 3 values are returned
-        elif len(line) == 2:
-            box, (text, confidence) = line  # When only 2 values are returned
-        # Store the recognized text and bounding boxes
-        final_text += ' ' + text  # Extract the text from the tuple
-        boxes.append(box)
-        # Draw the bounding box
-        points = [(int(point[0]), int(point[1])) for point in box]
-        cv2.polylines(img, [np.array(points)], isClosed=True, color=(0, 255, 0), thickness=2)
-    # Store the image with bounding boxes in a variable
-    img_with_boxes = img
-    return final_text, img_with_boxes
-def extract_text_from_images(image_paths):
-    all_extracted_texts = {}
-    all_extracted_imgs = {}
-    for image_path in image_paths:
-        try:
-            # Enhance the image before OCR
-            enhanced_image = process_image(image_path, scale=2)
-            # Perform OCR on the enhanced image and get boxes
-            result, img_with_boxes = ocr_with_paddle(enhanced_image)
-            # Draw bounding boxes on the processed image
-            img_result = Image.fromarray(enhanced_image)
-            #img_with_boxes = draw_boxes(img_result, boxes)
-            # genrating unique id to save the images
-            # Get the current date and time
-            current_time = datetime.now()
-            # Format it as a string to create a unique ID
-            unique_id = current_time.strftime("%Y%m%d%H%M%S%f")
-            #print(unique_id)
-            # Save the image with boxes
-            result_image_path = os.path.join(RESULT_FOLDER, f'result_{unique_id}_{os.path.basename(image_path)}')
-            #img_with_boxes.save(result_image_path)
-            cv2.imwrite(result_image_path, img_with_boxes)
-            # Store the text and image result paths
-            all_extracted_texts[image_path] = result
-            all_extracted_imgs[image_path] = result_image_path
-        except ValueError as ve:
-            print(f"Error processing image {image_path}: {ve}")
-            continue  # Continue to the next image if there's an error
-    # Convert to JSON-compatible structure
-    all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
-    return all_extracted_texts, all_extracted_imgs_json
-# Function to call the Gemma model and process the output as Json
-# def Data_Extractor(data, client=client):
-#     text = f'''Act as a  Text extractor for the following text given in text: {data}
-#     extract text in the following output JSON string:
-#     {{
-#     "Name": ["Identify and Extract All the person's name from the text."],
-#     "Designation": ["Extract All the designation or job title mentioned in the text."],
-#     "Company": ["Extract All the company or organization name if mentioned."],
-#     "Contact": ["Extract All phone number, including country codes if present."],
-#     "Address": ["Extract All the full postal address or location mentioned in the text."],
-#     "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
-#     "Link": ["Identify and Extract any website URLs or social media links present in the text."]
-#     }}
-#     Output:
-#     '''
-#     # Call the API for inference
-#     response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
-#     print("parse in text ---:",response)
-#     # Convert the response text to JSON
-#     try:
-#         json_data = json.loads(response)
-#         print("Json_data-------------->",json_data)
-#         return json_data
-#     except json.JSONDecodeError as e:
-#         return {"error": f"Error decoding JSON: {e}"}
-def Data_Extractor(data):
-    url = "https://api.groq.com/openai/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}"
     }
-    prompt = f"""
-You are a strict JSON generator.
-Extract structured data from the following text.
-Return ONLY valid JSON. No explanation. No markdown.
-Schema:
-{{
-    "Name": [],
-    "Designation": [],
-    "Company": [],
-    "Contact": [],
-    "Address": [],
-    "Email": [],
-    "Link": []
-}}
-Rules:
-- Always return all keys
-- If nothing found → return empty list []
-- Do NOT return "Not found"
-- Ensure valid JSON format
-Text:
-{data}
-"""
-    payload = {
-        "model": "llama-3.3-70b-versatile",
-        "messages": [
-            {"role": "user", "content": prompt}
-        ],
-        "temperature": 0.2,   # 🔥 IMPORTANT: lower = more structured
-        "max_tokens": 1024,
-        "top_p": 1,
-        "stream": False
     }
-    response = requests.post(url, headers=headers, json=payload)
-    if response.status_code != 200:
-        return {"error": response.text}
-    result = response.json()
-    # Extract model output
-    content = result["choices"][0]["message"]["content"]
-    print("RAW LLM OUTPUT:\n", content)
-    # 🔧 Clean response (important)
-    content = content.strip()
-    # Remove markdown if model adds ```json
-    if content.startswith("```"):
-        content = content.split("```")[1]
-    try:
-        json_data = json.loads(content)
-        return json_data
-    except json.JSONDecodeError as e:
-        print("JSON ERROR:", e)
-        return {"error": "Invalid JSON from model", "raw": content}
-# For have text compatible to the llm
-def json_to_llm_str(textJson):
-    str=''
-    for file,item in textJson.items():
-      str+=item + ' '
-    return str
-# Define the RE for extracting the contact details like number, mail , portfolio, website etc
-def extract_contact_details(text):
-    # Regex patterns
-    # Phone numbers with at least 5 digits in any segment
-    combined_phone_regex = re.compile(r'''
-    (?:
-        #(?:(?:\+91[-.\s]?)?\d{5}[-.\s]?\d{5})|(?:\+?\d{1,3})?[-.\s()]?\d{5,}[-.\s()]?\d{5,}[-.\s()]?\d{1,9} | /^[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{4})$/ |
-        \+1\s\(\d{3}\)\s\d{3}-\d{4} |               # USA/Canada Intl +1 (XXX) XXX-XXXX
-        \(\d{3}\)\s\d{3}-\d{4} |                    # USA/Canada STD (XXX) XXX-XXXX
-        \(\d{3}\)\s\d{3}\s\d{4} |                   # USA/Canada (XXX) XXX XXXX
-        \(\d{3}\)\s\d{3}\s\d{3} |                   # USA/Canada (XXX) XXX XXX
-        \+1\d{10} |                                 # +1 XXXXXXXXXX
-        \d{10} |                                    # XXXXXXXXXX
-        \+44\s\d{4}\s\d{6} |                        # UK Intl +44 XXXX XXXXXX
-        \+44\s\d{3}\s\d{3}\s\d{4} |                 # UK Intl +44 XXX XXX XXXX
-        0\d{4}\s\d{6} |                             # UK STD 0XXXX XXXXXX
-        0\d{3}\s\d{3}\s\d{4} |                      # UK STD 0XXX XXX XXXX
-        \+44\d{10} |                                # +44 XXXXXXXXXX
-        0\d{10} |                                   # 0XXXXXXXXXX
-        \+61\s\d\s\d{4}\s\d{4} |                    # Australia Intl +61 X XXXX XXXX
-        0\d\s\d{4}\s\d{4} |                         # Australia STD 0X XXXX XXXX
-        \+61\d{9} |                                 # +61 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+91\s\d{5}-\d{5} |                         # India Intl +91 XXXXX-XXXXX
-        \+91\s\d{4}-\d{6} |                         # India Intl +91 XXXX-XXXXXX
-        \+91\s\d{10} |                              # India Intl +91 XXXXXXXXXX
-        \+91\s\d{3}\s\d{3}\s\d{4} |                 # India Intl +91 XXX XXX XXXX
-        \+91\s\d{3}-\d{3}-\d{4} |                   # India Intl +91 XXX-XXX-XXXX
-        \+91\s\d{2}\s\d{4}\s\d{4} |                 # India Intl +91 XX XXXX XXXX
-        \+91\s\d{2}-\d{4}-\d{4} |                   # India Intl +91 XX-XXXX-XXXX
-        \+91\s\d{5}\s\d{5} |                        # India Intl +91 XXXXX XXXXX
-        \d{5}\s\d{5} |                              # India XXXXX XXXXX
-        \d{5}-\d{5} |                               # India XXXXX-XXXXX
-        0\d{2}-\d{7} |                              # India STD 0XX-XXXXXXX
-        \+91\d{10} |                                # +91 XXXXXXXXXX
-        \d{10} |                                    # XXXXXXXXXX   # Here is the regex to handle all possible combination of the contact
-        \d{6}-\d{4} |                               # XXXXXX-XXXX
-        \d{4}-\d{6} |                               # XXXX-XXXXXX
-        \d{3}\s\d{3}\s\d{4} |                       # XXX XXX XXXX
-        \d{3}-\d{3}-\d{4} |                         # XXX-XXX-XXXX
-        \d{4}\s\d{3}\s\d{3} |                       # XXXX XXX XXX
-        \d{4}-\d{3}-\d{3} |                         # XXXX-XXX-XXX #-----
-        \+49\s\d{4}\s\d{8} |                        # Germany Intl +49 XXXX XXXXXXXX
-        \+49\s\d{3}\s\d{7} |                        # Germany Intl +49 XXX XXXXXXX
-        0\d{3}\s\d{8} |                             # Germany STD 0XXX XXXXXXXX
-        \+49\d{12} |                                # +49 XXXXXXXXXXXX
-        \+49\d{10} |                                # +49 XXXXXXXXXX
-        0\d{11} |                                   # 0XXXXXXXXXXX
-        \+86\s\d{3}\s\d{4}\s\d{4} |                 # China Intl +86 XXX XXXX XXXX
-        0\d{3}\s\d{4}\s\d{4} |                      # China STD 0XXX XXXX XXXX
-        \+86\d{11} |                                # +86 XXXXXXXXXXX
-        \+81\s\d\s\d{4}\s\d{4} |                    # Japan Intl +81 X XXXX XXXX
-        \+81\s\d{2}\s\d{4}\s\d{4} |                 # Japan Intl +81 XX XXXX XXXX
-        0\d\s\d{4}\s\d{4} |                         # Japan STD 0X XXXX XXXX
-        \+81\d{10} |                                # +81 XXXXXXXXXX
-        \+81\d{9} |                                 # +81 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+55\s\d{2}\s\d{5}-\d{4} |                  # Brazil Intl +55 XX XXXXX-XXXX
-        \+55\s\d{2}\s\d{4}-\d{4} |                  # Brazil Intl +55 XX XXXX-XXXX
-        0\d{2}\s\d{4}\s\d{4} |                      # Brazil STD 0XX XXXX XXXX
-        \+55\d{11} |                                # +55 XXXXXXXXXXX
-        \+55\d{10} |                                # +55 XXXXXXXXXX
-        0\d{10} |                                   # 0XXXXXXXXXX
-        \+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} |      # France Intl +33 X XX XX XX XX
-        0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} |           # France STD 0X XX XX XX XX
-        \+33\d{9} |                                 # +33 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+7\s\d{3}\s\d{3}-\d{2}-\d{2} |             # Russia Intl +7 XXX XXX-XX-XX
-        8\s\d{3}\s\d{3}-\d{2}-\d{2} |               # Russia STD 8 XXX XXX-XX-XX
-        \+7\d{10} |                                 # +7 XXXXXXXXXX
-        8\d{10} |                                   # 8 XXXXXXXXXX
-        \+27\s\d{2}\s\d{3}\s\d{4} |                 # South Africa Intl +27 XX XXX XXXX
-        0\d{2}\s\d{3}\s\d{4} |                      # South Africa STD 0XX XXX XXXX
-        \+27\d{9} |                                 # +27 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+52\s\d{3}\s\d{3}\s\d{4} |                 # Mexico Intl +52 XXX XXX XXXX
-        \+52\s\d{2}\s\d{4}\s\d{4} |                 # Mexico Intl +52 XX XXXX XXXX
-        01\s\d{3}\s\d{4} |                          # Mexico STD 01 XXX XXXX
-        \+52\d{10} |                                # +52 XXXXXXXXXX
-        01\d{7} |                                   # 01 XXXXXXX
-        \+234\s\d{3}\s\d{3}\s\d{4} |                # Nigeria Intl +234 XXX XXX XXXX
-        0\d{3}\s\d{3}\s\d{4} |                      # Nigeria STD 0XXX XXX XXXX
-        \+234\d{10} |                               # +234 XXXXXXXXXX
-        0\d{10} |                                   # 0XXXXXXXXXX
-        \+971\s\d\s\d{3}\s\d{4} |                   # UAE Intl +971 X XXX XXXX
-        0\d\s\d{3}\s\d{4} |                         # UAE STD 0X XXX XXXX
-        \+971\d{8} |                                # +971 XXXXXXXX
-        0\d{8} |                                    # 0XXXXXXXX
-        \+54\s9\s\d{3}\s\d{3}\s\d{4} |              # Argentina Intl +54 9 XXX XXX XXXX
-        \+54\s\d{1}\s\d{4}\s\d{4} |                 # Argentina Intl +54 X XXXX XXXX
-        0\d{3}\s\d{4} |                             # Argentina STD 0XXX XXXX
-        \+54\d{10} |                                # +54 9 XXXXXXXXXX
-        \+54\d{9} |                                 # +54 XXXXXXXXX
-        0\d{7} |                                    # 0XXXXXXX
-        \+966\s\d\s\d{3}\s\d{4} |                   # Saudi Intl +966 X XXX XXXX
-        0\d\s\d{3}\s\d{4} |                         # Saudi STD 0X XXX XXXX
-        \+966\d{8} |                                # +966 XXXXXXXX
-        0\d{8} |                                    # 0XXXXXXXX
-        \+1\d{10} |                                 # +1 XXXXXXXXXX
-        \+1\s\d{3}\s\d{3}\s\d{4} |                  # +1 XXX XXX XXXX
-        \d{5}\s\d{5} |                              # XXXXX XXXXX
-        \d{10} |                                    # XXXXXXXXXX
-        \+44\d{10} |                                # +44 XXXXXXXXXX
-        0\d{10} |                                   # 0XXXXXXXXXX
-        \+61\d{9} |                                 # +61 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+91\d{10} |                                # +91 XXXXXXXXXX
-        \+49\d{12} |                                # +49 XXXXXXXXXXXX
-        \+49\d{10} |                                # +49 XXXXXXXXXX
-        0\d{11} |                                   # 0XXXXXXXXXXX
-        \+86\d{11} |                                # +86 XXXXXXXXXXX
-        \+81\d{10} |                                # +81 XXXXXXXXXX
-        \+81\d{9} |                                 # +81 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+55\d{11} |                                # +55 XXXXXXXXXXX
-        \+55\d{10} |                                # +55 XXXXXXXXXX
-        0\d{10} |                                   # 0XXXXXXXXXX
-        \+33\d{9} |                                 # +33 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX
-        \+7\d{10} |                                 # +7 XXXXXXXXXX
-        8\d{10} |                                   # 8 XXXXXXXXXX
-        \+27\d{9} |                                 # +27 XXXXXXXXX
-        0\d{9} |                                    # 0XXXXXXXXX (South Africa STD)
-        \+52\d{10} |                                # +52 XXXXXXXXXX
-        01\d{7} |                                   # 01 XXXXXXX
-        \+234\d{10} |                               # +234 XXXXXXXXXX
-        0\d{10} |                                   # 0XXXXXXXXXX
-        \+971\d{8} |                                # +971 XXXXXXXX
-        0\d{8} |                                    # 0XXXXXXXX
-        \+54\s9\s\d{10} |                           # +54 9 XXXXXXXXXX
-        \+54\d{9} |                                 # +54 XXXXXXXXX
-        0\d{7} |                                    # 0XXXXXXX
-        \+966\d{8} |                                # +966 XXXXXXXX
-        0\d{8}                                      # 0XXXXXXXX
-        \+\d{3}-\d{3}-\d{4}
-    )
-    ''',re.VERBOSE)
-    # Email regex
     email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
-    # URL and links regex, updated to avoid conflicts with email domains
-    link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
-    # Find all matches in the text
-    phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
-    emails = email_regex.findall(text)
-    links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
-    # Remove profile links that might conflict with emails
-    links_RE = [link for link in links_RE if not any(email in link for email in emails)]
     return {
-        "phone_numbers": phone_numbers,
-        "emails": emails,
-        "links_RE": links_RE
-    }
-# preprocessing the data
-def process_extracted_text(extracted_text):
-    # Load JSON data
-    data = json.dumps(extracted_text, indent=4)
-    data = json.loads(data)
-    # Create a single dictionary to hold combined results
-    combined_results = {
-        "phone_numbers": [],
-        "emails": [],
-        "links_RE": []
     }
-    # Process each text entry
-    for filename, text in data.items():
-        contact_details = extract_contact_details(text)
-        # Extend combined results with the details from this file
-        combined_results["phone_numbers"].extend(contact_details["phone_numbers"])
-        combined_results["emails"].extend(contact_details["emails"])
-        combined_results["links_RE"].extend(contact_details["links_RE"])
-    # Convert the combined results to JSON
-    #combined_results_json = json.dumps(combined_results, indent=4)
-    combined_results_json = combined_results
-    # Print the final JSON results
-    print("Combined contact details in JSON format:")
-    print(combined_results_json)
-    return combined_results_json
-# Function to remove duplicates (case-insensitive) from each list in the dictionary
-def remove_duplicates_case_insensitive(data_dict):
-    for key, value_list in data_dict.items():
-        seen = set()
-        unique_list = []
-        for item in value_list:
-            if item.lower() not in seen:
-                unique_list.append(item)  # Add original item (preserving its case)
-                seen.add(item.lower())    # Track lowercase version
-        # Update the dictionary with unique values
-        data_dict[key] = unique_list
-    return data_dict
-# # Process the model output for parsed result
-# def process_resume_data(LLMdata,cont_data,extracted_text):
-#     # # Removing duplicate emails
-#     # unique_emails = []
-#     # for email in cont_data['emails']:
-#     #     if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
-#     #         unique_emails.append(email)
-#     # # Removing duplicate links (case insensitive)
-#     # unique_links = []
-#     # for link in cont_data['links_RE']:
-#     #     if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
-#     #         unique_links.append(link)
-#     # # Removing duplicate phone numbers
-#     # normalized_contact = [num[-10:] for num in LLMdata['Contact']]
-#     # unique_numbers = []
-#     # for num in cont_data['phone_numbers']:
-#     #     if num[-10:] not in normalized_contact:
-#     #         unique_numbers.append(num)
-#     # # Add unique emails, links, and phone numbers to the original LLMdata
-#     # LLMdata['Email'] += unique_emails
-#     # LLMdata['Link'] += unique_links
-#     # LLMdata['Contact'] += unique_numbers
-#     # Ensure keys exist (CRITICAL FIX)
-#     LLMdata['Email'] = LLMdata.get('Email', []) or []
-#     LLMdata['Link'] = LLMdata.get('Link', []) or []
-#     LLMdata['Contact'] = LLMdata.get('Contact', []) or []
-#     # Removing duplicate emails
-#     unique_emails = []
-#     for email in cont_data.get('emails', []):
-#         if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
-#             unique_emails.append(email)
-#     # Removing duplicate links
-#     unique_links = []
-#     for link in cont_data.get('links_RE', []):
-#         if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
-#             unique_links.append(link)
-#     # Normalize existing contacts safely
-#     normalized_contact = [
-#         str(num)[-10:] for num in LLMdata['Contact'] if num
-#     ]
-#     # Removing duplicate phone numbers
-#     unique_numbers = []
-#     for num in cont_data.get('phone_numbers', []):
-#         if str(num)[-10:] not in normalized_contact:
-#             unique_numbers.append(num)
-#     # Merge safely
-#     LLMdata['Email'].extend(unique_emails)
-#     LLMdata['Link'].extend(unique_links)
-#     LLMdata['Contact'].extend(unique_numbers)
-#     # Apply the function to the data
-#     LLMdata=remove_duplicates_case_insensitive(LLMdata)
-#     # Initialize the processed data dictionary
-#     processed_data = {
-#             "name": [],
-#             "contact_number": [],
-#             "Designation":[],
-#             "email": [],
-#             "Location": [],
-#             "Link": [],
-#             "Company":[],
-#             "extracted_text": extracted_text
-#             }
-#     #LLM
-#     processed_data['name'].extend(LLMdata.get('Name', None))
-#     #processed_data['contact_number'].extend(LLMdata.get('Contact', []))
-#     processed_data['Designation'].extend(LLMdata.get('Designation', []))
-#     #processed_data['email'].extend(LLMdata.get("Email", []))
-#     processed_data['Location'].extend(LLMdata.get('Address', []))
-#     #processed_data['Link'].extend(LLMdata.get('Link', []))
-#     processed_data['Company'].extend(LLMdata.get('Company', []))
-#     #Contact
-#     #processed_data['email'].extend(cont_data.get("emails", []))
-#     #processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
-#     #processed_data['Link'].extend(cont_data.get("links_RE", []))
-#     #New_merge_data
-#     processed_data['email'].extend(LLMdata['Email'])
-#     processed_data['contact_number'].extend(LLMdata['Contact'])
-#     processed_data['Link'].extend(LLMdata['Link'])
-#     #to remove not found fields
-#     # List of keys to check for 'Not found'
-#     keys_to_check = ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]
-#     # Replace 'Not found' with an empty list for each key
-#     for key in keys_to_check:
-#         if processed_data[key] == ['Not found'] or processed_data[key] == ['not found']:
-#             processed_data[key] = []
-#     return processed_data
-def process_resume_data(LLMdata, cont_data, extracted_text):
-    # -------------------------------
-    # ✅ STEP 1: Normalize LLM Schema
-    # -------------------------------
-    expected_keys = ["Name", "Designation", "Company", "Contact", "Address", "Email", "Link"]
-    for key in expected_keys:
-        if key not in LLMdata or LLMdata[key] is None:
-            LLMdata[key] = []
-        elif not isinstance(LLMdata[key], list):
-            LLMdata[key] = [LLMdata[key]]
-    # -------------------------------
-    # ✅ STEP 2: Normalize cont_data
-    # -------------------------------
-    cont_data = cont_data or {}
-    cont_data.setdefault("emails", [])
-    cont_data.setdefault("phone_numbers", [])
-    cont_data.setdefault("links_RE", [])
-    # -------------------------------
-    # ✅ STEP 3: Normalize existing contacts
-    # -------------------------------
-    normalized_llm_numbers = {
-        str(num)[-10:] for num in LLMdata["Contact"] if num
-    }
-    # -------------------------------
-    # ✅ STEP 4: Merge Emails
-    # -------------------------------
-    for email in cont_data["emails"]:
-        if not any(email.lower() == str(e).lower() for e in LLMdata["Email"]):
-            LLMdata["Email"].append(email)
-    # -------------------------------
-    # ✅ STEP 5: Merge Links
-    # -------------------------------
-    for link in cont_data["links_RE"]:
-        if not any(link.lower() == str(l).lower() for l in LLMdata["Link"]):
-            LLMdata["Link"].append(link)
-    # -------------------------------
-    # ✅ STEP 6: Merge Phone Numbers
-    # -------------------------------
-    for num in cont_data["phone_numbers"]:
-        norm = str(num)[-10:]
-        if norm not in normalized_llm_numbers:
-            LLMdata["Contact"].append(num)
-            normalized_llm_numbers.add(norm)
-    # -------------------------------
-    # ✅ STEP 7: Remove duplicates (case-insensitive)
-    # -------------------------------
-    LLMdata = remove_duplicates_case_insensitive(LLMdata)
-    # -------------------------------
-    # ✅ STEP 8: Build final structure
-    # -------------------------------
-    processed_data = {
-        "name": LLMdata["Name"],
-        "contact_number": LLMdata["Contact"],
-        "Designation": LLMdata["Designation"],
-        "email": LLMdata["Email"],
-        "Location": LLMdata["Address"],
-        "Link": LLMdata["Link"],
-        "Company": LLMdata["Company"],
-        "extracted_text": extracted_text
-    }
-    # -------------------------------
-    # ✅ STEP 9: Clean "Not found"
-    # -------------------------------
-    for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
-        processed_data[key] = [
-            v for v in processed_data[key]
-            if str(v).lower() != "not found"
-        ]
-    return processed_data

 import os
 import json
 import re
 import logging
+from typing import List, Dict, Any
+# Ensure langchain is available for paddlex/paddleocr
+# The two imports below act as an availability probe: the imported names are
+# not referenced by the code shown here. A missing package is logged as a
+# warning instead of being raised, so module import continues past this point.
+try:
+    import langchain
+    import langchain_community
+except ImportError:
+    logging.warning("LangChain modules not found. PaddleOCR might fail.")
+from core.ocr_engine import OCREngine
+from core.vlm_engine import GroqVLMEngine
+from core.ner_engine import NEREngine
+# Global instances (Lazy load)
+# Each slot starts as None and is filled by its accessor (get_ocr / get_vlm /
+# get_ner) the first time that accessor runs.
+_ocr = None
+_vlm = None
+_ner = None
+def get_ocr():
+    """Return the module-level OCREngine, creating it while the cache slot is still falsy."""
+    global _ocr
+    _ocr = _ocr or OCREngine()
+    return _ocr
+def get_vlm():
+    """Return the module-level GroqVLMEngine, creating it while the cache slot is still falsy."""
+    global _vlm
+    _vlm = _vlm or GroqVLMEngine()
+    return _vlm
+def get_ner():
+    """Return the module-level NEREngine, creating it while the cache slot is still falsy."""
+    global _ner
+    _ner = _ner or NEREngine()
+    return _ner
+def process_image_pipeline(image_paths: List[str]) -> Dict[str, Any]:
+    """Extract contact fields from each image, trying the VLM before OCR+NER.
+
+    For every path the VLM engine is tried first. A falsy VLM result triggers
+    the fallback: OCR text extraction followed by NER on that text. Structured
+    fields are accumulated with merge_structured_data, the per-image raw
+    output (JSON-dumped VLM data or OCR text) is stored under
+    "extracted_text" keyed by the path as given, and the combined dict is
+    passed through cleanup_results before being returned.
+
+    "status_message" starts as "Primary: Groq VLM" and is switched to
+    "Fallback: OCR+NER" once any image yields OCR text, whether or not NER
+    then finds entities.
+    """
+    logging.info(f"Pipeline: Starting processing for {len(image_paths)} images.")
+    # Engines are fetched up front, left to right: VLM, then OCR, then NER.
+    vlm_engine, ocr_engine, ner_engine = get_vlm(), get_ocr(), get_ner()
+    results = {
+        "name": [],
+        "contact_number": [],
+        "Designation": [],
+        "email": [],
+        "Location": [],
+        "Link": [],
+        "Company": [],
+        "extracted_text": {},
+        "status_message": "Primary: Groq VLM"
     }
+    raw_by_path = {}
+    for path in image_paths:
+        img_name = os.path.basename(path)
+        # 1. Primary: VLM
+        logging.info(f"Pipeline: Attempting VLM extraction for {img_name}")
+        structured = vlm_engine.process(path)
+        if structured:
+            merge_structured_data(results, structured)
+            raw_by_path[path] = json.dumps(structured)
+            logging.info(f"Pipeline: VLM success for {img_name}")
+            continue
+        # 2. Fallback: OCR + NER
+        logging.warning(f"Pipeline: VLM failed or skipped for {img_name}. Falling back to OCR+NER.")
+        ocr_text = ocr_engine.extract_text(path)
+        # The OCR output is recorded even when it is empty.
+        raw_by_path[path] = ocr_text
+        if not ocr_text:
+            logging.error(f"Pipeline: Both VLM and OCR failed for {img_name}")
+            continue
+        logging.info(f"Pipeline: OCR success for {img_name}, attempting NER.")
+        entities = ner_engine.extract_entities(ocr_text)
+        if entities:
+            merge_structured_data(results, entities)
+            logging.info(f"Pipeline: NER success for {img_name}")
+        else:
+            logging.warning(f"Pipeline: NER failed to extract entities for {img_name}")
+        results["status_message"] = "Fallback: OCR+NER"
+    results["extracted_text"] = raw_by_path
+    cleaned = cleanup_results(results)
+    logging.info(f"Pipeline: Completed. Extracted data for {sum(1 for v in cleaned.values() if isinstance(v, list) and v)} fields.")
+    return cleaned
def merge_structured_data(main_data: Dict, new_data: Dict):
    """Merge one engine's output into the accumulated result dict in place.

    Incoming keys are matched case-insensitively, first against a small
    alias table (``Contact`` -> ``contact_number``, ``Address`` ->
    ``Location``, ...) and then against the keys already present in
    ``main_data``. Keys that match nothing are ignored.

    Args:
        main_data: Accumulator whose list-valued entries receive the values.
            Entries that are not lists are never modified.
        new_data: Mapping of field name to a single value or a list of values.

    Returns:
        None. ``main_data`` is mutated.
    """
    aliases = {
        "name": "name",
        "contact": "contact_number",
        "designation": "Designation",
        "email": "email",
        "address": "Location",
        "link": "Link",
        "company": "Company",
    }
    # Index the accumulator's own keys by lowercase form so a key such as
    # "Location" (capitalised in main_data, absent from the alias table)
    # is still recognised instead of being silently dropped.
    canonical_by_lower = {str(existing).lower(): existing for existing in main_data}

    for key, val in new_data.items():
        lowered = str(key).strip().lower()
        canonical_key = aliases.get(lowered, canonical_by_lower.get(lowered))
        target = main_data.get(canonical_key)
        # Only list fields accumulate values; this keeps bookkeeping entries
        # such as "extracted_text" (dict) and "status_message" (str) from
        # raising AttributeError on .extend/.append.
        if not isinstance(target, list):
            continue
        if isinstance(val, list):
            target.extend(val)
        elif val:
            target.append(val)
def cleanup_results(results: Dict) -> Dict:
    """Normalise every list-valued field of ``results`` in place.

    Each item is converted to ``str`` and stripped. Placeholder values
    (empty string, 'not found', 'none', 'null', '[]') are discarded and
    case-insensitive duplicates are collapsed, keeping the first spelling
    seen. Non-list values are left untouched.

    Returns:
        The same ``results`` object that was passed in.
    """
    placeholders = {"", "not found", "none", "null", "[]"}
    for field in results:
        entries = results[field]
        if not isinstance(entries, list):
            continue
        # Lowercased text -> first stripped spelling; dict order preserves
        # the order in which distinct values first appeared.
        kept = {}
        for entry in entries:
            text = str(entry).strip()
            folded = text.lower()
            if folded in kept or folded in placeholders:
                continue
            kept[folded] = text
        results[field] = list(kept.values())
    return results
# Compiled once at import time; both patterns are reused on every call.
_EMAIL_REGEX = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# The optional country-code prefix is a NON-capturing group: with a capturing
# group, findall() returns only that group's text (e.g. "+1 " or "") instead
# of the whole phone number.
_PHONE_REGEX = re.compile(r'(?:\+?\d{1,3}[-.\s()]?)?\(?\d{3,5}\)?[-.\s()]?\d{3,5}[-.\s()]?\d{3,5}')


def extract_contact_details(text: str) -> Dict[str, List[str]]:
    """Find e-mail addresses and phone numbers in free text via regex.

    Args:
        text: Text to scan. ``None`` or an empty string yields empty lists.

    Returns:
        A dict with ``"emails"`` and ``"phone_numbers"``, each a list of the
        full matched substrings in order of appearance.
    """
    # Regex fallback for extra safety
    if not text:
        return {"emails": [], "phone_numbers": []}
    return {
        "emails": _EMAIL_REGEX.findall(text),
        "phone_numbers": _PHONE_REGEX.findall(text)
    }