Zayed024 committed
Commit ccd1b2a · verified · 1 Parent(s): b2986e4

Create app.py

Files changed (1)
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
+ import warnings
+ warnings.filterwarnings('ignore')
+ import os
+ import tempfile
+
+ import gradio as gr
+ import torch
+ import numpy as np
+ import cohere
+ import spacy
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from nltk import pos_tag
+ from nltk.tokenize.texttiling import TextTilingTokenizer
+ from pdfminer.high_level import extract_text
+ from transformers import AutoTokenizer, AutoModel
+
+ # NLTK data required by word_tokenize, pos_tag, and TextTilingTokenizer
+ nltk.download('punkt', quiet=True)
+ nltk.download('averaged_perceptron_tagger', quiet=True)
+ nltk.download('stopwords', quiet=True)
+
+ co = cohere.Client(os.environ.get("CO_API_KEY"))
+
+ nlp = spacy.load("en_core_web_sm")
+
+ # Load the InLegalBERT encoder used to embed document chunks
+ tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
+ model = AutoModel.from_pretrained("law-ai/InLegalBERT")
+
+ # Initialize TextTilingTokenizer with default parameters
+ tiling_tokenizer = TextTilingTokenizer()
+
+ def generate_response(prompt, embeddings):
+     # Collapse the chunk embeddings into one scalar and pass it to the
+     # model as plain text (a very coarse summary of the document vectors)
+     aggregated_embedding = np.mean([np.mean(embed) for embed in embeddings])
+     embedding_str = f"Embedding summary: {aggregated_embedding:.2f}"
+
+     full_prompt = f"{embedding_str}\n\n{prompt}"
+
+     try:
+         response = co.generate(
+             model="command-xlarge-nightly",
+             prompt=full_prompt,
+             max_tokens=750  # allow a longer response
+         )
+         return response.generations[0].text.strip()
+
+     except cohere.error.CohereError as e:  # exception class from the v4 SDK
+         return f"An error occurred: {str(e)}"
+
+ def extract_text_from_pdf(pdf_path):
+     return extract_text(pdf_path)
+
+ def get_bert_embeddings(texts):
+     embeddings_list = []
+
+     for text in texts:
+         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+         with torch.no_grad():
+             outputs = model(**inputs)
+         # Use the [CLS] token's final hidden state as the chunk embedding
+         embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
+         embeddings_list.append(embeddings)
+
+     return embeddings_list
+
+ def analyze_text(text):
+     doc = nlp(text)
+     entities = [(ent.text, ent.label_) for ent in doc.ents]
+     tokens = word_tokenize(text)
+     pos_tags = pos_tag(tokens)
+     dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
+     return entities, pos_tags, dependencies
+
+ def process_pdf_and_generate_response(pdf_file, query):
+     # Copy the uploaded file to a temporary path before extraction
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+         with open(pdf_file, 'rb') as f:
+             temp_file.write(f.read())
+         temp_file_path = temp_file.name
+
+     document_text = extract_text_from_pdf(temp_file_path)
+
+     entities, pos_tags, dependencies = analyze_text(document_text)
+
+     print("Entities:", entities)
+     print("POS Tags:", pos_tags)
+     print("Dependencies:", dependencies)
+
+     # Segment the document text using TextTiling; it raises ValueError on
+     # text without blank-line paragraph breaks, so fall back to one chunk
+     try:
+         text_chunks = tiling_tokenizer.tokenize(document_text)
+     except ValueError:
+         text_chunks = [document_text]
+
+     # Process document text with InLegalBERT
+     document_embeddings = get_bert_embeddings(text_chunks)
+
+     # Construct prompt for LLM
+     prompt = (
+         "You are an AI-driven research engine for commercial courts. "
+         f"Given the legal document: '{document_text[:2000]}', answer the query: '{query}'"
+     )
+
+     # Generate response using LLM
+     response = generate_response(prompt, document_embeddings)
+
+     return response
+
+ def chunk_long_sentence(sentence):
+     # Greedily pack words into chunks of at most 512 characters (a rough
+     # character-level proxy for the encoder's 512-token limit)
+     words = sentence.split()
+     chunks = []
+     current_chunk = []
+
+     for word in words:
+         if len(' '.join(current_chunk + [word])) <= 512:
+             current_chunk.append(word)
+         else:
+             chunks.append(' '.join(current_chunk))
+             current_chunk = [word]
+
+     if current_chunk:
+         chunks.append(' '.join(current_chunk))
+
+     return chunks
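+
+ # gradio is imported above but no interface is ever built or launched; a
+ # minimal sketch of the missing wiring follows. The labels, component
+ # choices, and title here are assumptions, not part of the original commit.
+ demo = gr.Interface(
+     fn=process_pdf_and_generate_response,
+     inputs=[
+         gr.File(label="Legal document (PDF)", type="filepath"),
+         gr.Textbox(label="Query"),
+     ],
+     outputs=gr.Textbox(label="Response"),
+     title="Commercial Courts Research Engine",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()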