rishabh5752 committed
Commit ce70e1e · 1 Parent(s): 268e7b8

Create app.py

Files changed (1)
  app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
+ import gradio as gr
+ import os
+ import PyPDF2  # PyPDF2 for PDF text extraction
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Download the NLTK resources needed for tokenization and stopword removal
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ # Extract the text of every page in a PDF using PyPDF2
+ def extract_text_from_pdf(pdf_path):
+     pdf_text = ""
+     with open(pdf_path, 'rb') as pdf_file:
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+         for page in pdf_reader.pages:
+             pdf_text += page.extract_text() or ""
+     return pdf_text
+
+ # Lowercase, tokenize, and drop stopwords and non-alphanumeric tokens
+ def clean_and_tokenize(text):
+     stop_words = set(stopwords.words('english'))
+     tokens = word_tokenize(text.lower())
+     tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
+     return ' '.join(tokens)
+
+ # Extract and clean every PDF in the specified dataset directory
+ def preprocess_documents(dataset_dir):
+     documents = []
+     for filename in os.listdir(dataset_dir):
+         if filename.endswith('.pdf'):
+             pdf_path = os.path.join(dataset_dir, filename)
+             pdf_text = extract_text_from_pdf(pdf_path)
+             clean_text = clean_and_tokenize(pdf_text)
+             documents.append(clean_text)
+     return documents
+
+ # Perform relevance matching and return the top N dataset documents
+ def perform_relevance_matching(query, uploaded_files, dataset_dir):
+     # Preprocess the documents in the specified dataset directory
+     documents = preprocess_documents(dataset_dir)
+     if not documents:
+         return []
+
+     # Extract the text of each user-uploaded PDF
+     uploaded_documents = []
+     for file in uploaded_files or []:
+         uploaded_documents.append(extract_text_from_pdf(file.name))
+
+     # Combine the uploaded documents and the query
+     combined_documents = uploaded_documents + [query]
+
+     # Vectorize the dataset documents together with the combined documents
+     tfidf_vectorizer = TfidfVectorizer()
+     tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)
+
+     # Cosine similarities between the combined documents and the dataset
+     cosine_similarities = cosine_similarity(tfidf_matrix[-len(combined_documents):], tfidf_matrix[:-len(combined_documents)])
+
+     # Average the scores over the query and uploads, then rank the dataset documents
+     aggregate_scores = cosine_similarities.mean(axis=0)
+     document_scores = list(enumerate(aggregate_scores))
+     sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
+
+     # Keep the top N documents, showing the first 500 characters of each
+     top_n = 5
+     top_documents = []
+     for doc_index, score in sorted_documents[:top_n]:
+         document_text = documents[doc_index][:500]
+         top_documents.append((f"Document {doc_index + 1} (Similarity Score: {score:.4f})", document_text))
+
+     return top_documents
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=perform_relevance_matching,
+     inputs=[
+         gr.Textbox(label="Query"),
+         gr.File(file_count="multiple", label="Upload PDFs"),
+         gr.Textbox(label="Dataset directory"),
+     ],
+     outputs=gr.Dataframe(headers=["Document", "Preview"]),
+     title="Legal Research Assistant",
+     description="Enter your legal query, upload files, and specify the dataset directory.",
+ )
+
+ # Launch the Gradio interface
+ iface.launch()