SCBconsulting committed on
Commit
f57e68d
·
verified ·
1 Parent(s): 722c74f

Update utils/metadata.py

Browse files
Files changed (1) hide show
  1. utils/metadata.py +31 -16
utils/metadata.py CHANGED
@@ -28,20 +28,35 @@ def extract_metadata(text):
28
  words = text.split()
29
  chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
30
 
31
- metadata = {
32
- "DATE": [],
33
- "PERSON": [],
34
- "ORGANIZATION": [],
35
- "LOCATION": []
36
- }
37
-
38
- for chunk in chunks:
39
- ner_results = ner_pipeline(chunk)
40
- for ent in ner_results:
41
- label = ent["entity_group"]
42
- word = ent["word"]
43
- if label in metadata and word not in metadata[label]:
44
- metadata[label].append(word)
45
-
46
- return metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
 
28
  words = text.split()
29
  chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
30
 
31
+ metadata = {
32
+ "EFFECTIVE_DATE": [],
33
+ "PARTIES": [],
34
+ "GOVERNING_LAW": [], # derived from LOCATION
35
+ }
36
+
37
+ # Use label mappings
38
+ label_mapping = {
39
+ "DATE": "EFFECTIVE_DATE",
40
+ "PERSON": "PARTIES",
41
+ "ORGANIZATION": "PARTIES",
42
+ "LOCATION": "GOVERNING_LAW"
43
+ }
44
+
45
+ for ent in ner_results:
46
+ label = ent["entity_group"]
47
+ word = ent["word"]
48
+ custom_label = label_mapping.get(label)
49
+
50
+ if custom_label and word not in metadata[custom_label]:
51
+ metadata[custom_label].append(word)
52
+
53
+ import re
54
+
55
+ def extract_governing_law(text):
56
+ match = re.search(r"(?i)governed by the laws of ([\w\s,]+)", text)
57
+ return match.group(1).strip() if match else None
58
+
59
+ def extract_venue(text):
60
+ match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
61
+ return match.group(1).strip() if match else None
62