Spaces:
Sleeping
Sleeping
File size: 3,528 Bytes
a18ce35 d90e47c a18ce35 bcc8f3b 38670e5 a18ce35 bcc8f3b a18ce35 38670e5 a18ce35 38670e5 a18ce35 d90e47c 38670e5 bcc8f3b 2ceb2ac bcc8f3b 2ceb2ac bcc8f3b 38670e5 bcc8f3b 38670e5 bcc8f3b d90e47c a18ce35 38670e5 a18ce35 d90e47c 722c74f 38670e5 722c74f d90e47c 2ceb2ac bcc8f3b 0369c37 bcc8f3b f57e68d bcc8f3b 38670e5 bcc8f3b f57e68d bcc8f3b 38670e5 0369c37 38670e5 b535d50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# utils/metadata.py
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import dateparser
# Load the tokenizer and token-classification weights for the pretrained
# English NER checkpoint. These are module-level statements, so they run
# once when this module is imported.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Build the shared NER pipeline. aggregation_strategy="simple" asks the
# pipeline to group token-level predictions into entity spans; the results
# are read below through their "entity_group" and "word" fields.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def clean_text(text):
    """
    Clean contract text for better NER and regex performance.

    Every run of whitespace (newlines, tabs, repeated spaces) is collapsed
    to a single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw contract text.

    Returns:
        The text with normalized whitespace.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty pieces, so re-joining with one space normalizes the whole string.
    # The previous chained replace() calls turned newlines into spaces but
    # never collapsed the resulting runs of spaces.
    return " ".join(text.split())
def extract_effective_date(text):
    """
    Extract a natural-language 'Effective Date' (e.g., 'as of August 28, 2025').

    Args:
        text: Contract text to search.

    Returns:
        A one-element list holding the first parseable date that follows
        "as of", formatted as "YYYY-MM-DD", or an empty list when no such
        date is found.
    """
    # Dates such as "August 28, 2025" contain a comma, so a capture that stops
    # at the first comma loses the year. Try complete date shapes first.
    date_shaped = re.compile(
        r"as of\s+("
        r"[A-Za-z]{3,9}\.?\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}"  # August 28, 2025
        r"|\d{1,2}(?:st|nd|rd|th)?\s+(?:of\s+)?[A-Za-z]{3,9}\.?,?\s+\d{4}"  # 28 August 2025
        r"|\d{1,4}[/.-]\d{1,2}[/.-]\d{1,4}"  # 2025-08-28, 08/28/2025
        r")",
        re.IGNORECASE,
    )
    # Original, looser pattern kept as a fallback so phrasings that worked
    # before still work.
    loose = re.compile(r"(?i)as of (.+?)(\.|,|\n)")

    for pattern in (date_shaped, loose):
        # Keep scanning past "as of ..." phrases that are not dates
        # (e.g. "as of the date hereof") instead of giving up on the first.
        for match in pattern.finditer(text):
            parsed = dateparser.parse(match.group(1).strip())
            if parsed:
                return [parsed.strftime("%Y-%m-%d")]
    return []
def extract_parties(text):
    """
    Extract the two contracting parties from a 'by and between X and Y' clause.

    Returns a two-element list [X, Y] with surrounding whitespace removed,
    or an empty list when the clause is not present. Y ends at the first
    opening parenthesis, newline, or end of the text.
    """
    clause = re.compile(
        r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)",
        re.DOTALL,
    )
    found = clause.search(text)
    if found is None:
        return []
    first_party, second_party = found.group(1, 2)
    return [first_party.strip(), second_party.strip()]
def extract_governing_law(text):
    """
    Capture the governing law, including less direct phrasings.

    The phrasings are tried in a fixed order ("governed by the laws of ..."
    before "under the laws of ..."); the first phrasing that occurs in the
    text wins. Returns a one-element list with the captured wording, or an
    empty list when neither phrasing is found.
    """
    law_patterns = (
        r"(?i)governed by the laws of ([\w\s,]+)",
        r"(?i)under the laws of ([\w\s,]+)",
    )
    # Lazy generator: later patterns are only searched if earlier ones miss.
    hits = (re.search(candidate, text) for candidate in law_patterns)
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return []
    return [first_hit.group(1).strip()]
def extract_venue(text):
    """
    Look for the venue in a dispute clause such as 'submitted to ... in XYZ'.

    Returns a one-element list with the wording captured after "in ", or an
    empty list when no such clause is found.
    """
    venue_hit = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
    if not venue_hit:
        return []
    return [venue_hit.group(1).strip()]
def extract_metadata(text):
    """
    Extract full structured metadata using hybrid rule-based + NER.

    The text is cleaned, split into word windows, and run through the NER
    pipeline; the rule-based extractors then override the NER result for any
    field they can fill.

    Args:
        text: Raw contract text.

    Returns:
        {"error": "No input provided."} when ``text`` is None, empty, or
        whitespace only. Otherwise a dict with the keys "EFFECTIVE_DATE",
        "PARTIES", "GOVERNING_LAW", "JURISDICTION" and "VENUE", each mapping
        to a list of strings (possibly empty).
    """
    # Treat None like empty input instead of raising AttributeError.
    if not text or not text.strip():
        return {"error": "No input provided."}
    text = clean_text(text)

    # NER chunking: the pipeline is fed fixed-size windows of words.
    # NOTE(review): the limit counts words, not model tokens; a 512-word
    # window can tokenize to more sub-word tokens than the model accepts.
    # Confirm against real documents and lower the window if needed.
    max_chunk_length = 512
    words = text.split()
    chunks = [
        " ".join(words[i:i + max_chunk_length])
        for i in range(0, len(words), max_chunk_length)
    ]

    ner_metadata = {
        "EFFECTIVE_DATE": [],
        "PARTIES": [],
        "GOVERNING_LAW": [],
        "JURISDICTION": [],
        # Initialized up front so the venue fallback below cannot KeyError.
        "VENUE": [],
    }
    label_mapping = {
        "DATE": "EFFECTIVE_DATE",
        "PERSON": "PARTIES",
        "ORGANIZATION": "PARTIES",
        "LOCATION": "GOVERNING_LAW",
        # Short tags: the model card for the configured checkpoint lists
        # PER / ORG / LOC / MISC, so the long names above alone would never
        # match its output. The long names are kept for other checkpoints.
        "PER": "PARTIES",
        "ORG": "PARTIES",
        "LOC": "GOVERNING_LAW",
    }

    for chunk in chunks:
        for ent in ner_pipeline(chunk):
            custom_label = label_mapping.get(ent["entity_group"])
            # Aggregated words can carry tokenizer padding such as a leading
            # space; strip it so duplicates compare equal.
            word = ent["word"].strip()
            if custom_label and word and word not in ner_metadata[custom_label]:
                ner_metadata[custom_label].append(word)

    # Replace/enhance with rule-based extraction: a non-empty rule-based
    # result wins, otherwise the NER result is kept.
    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]

    # The venue clause feeds both VENUE and JURISDICTION; search once and
    # hand each key its own list so callers can mutate them independently.
    venue = extract_venue(text)
    ner_metadata["VENUE"] = list(venue) or ner_metadata["VENUE"]
    ner_metadata["JURISDICTION"] = list(venue) or ner_metadata["JURISDICTION"]
    return ner_metadata
|