Spaces:

rishabh5752
/

LegalPaperSorter

Build error

App Files Files Community

rishabh5752 committed on Sep 14, 2023

Commit

ce70e1e

1 Parent(s): 268e7b8

Create app.py

Browse files

Files changed (1) hide show

app.py +91 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import gradio as gr
+import os
+import PyPDF2  # Import PyPDF2 for PDF text extraction
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+# Load NLTK resources
+nltk.download('punkt')
+nltk.download('stopwords')
+# Function to extract text from PDFs using PyPDF2
+def extract_text_from_pdf(pdf_path):
+    pdf_text = ""
+    with open(pdf_path, 'rb') as pdf_file:
+        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
+        for page_num in range(pdf_reader.getNumPages()):
+            page = pdf_reader.getPage(page_num)
+            pdf_text += page.extractText()
+    return pdf_text
+# Function to clean and tokenize text
+def clean_and_tokenize(text):
+    tokens = word_tokenize(text.lower())
+    tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
+    return ' '.join(tokens)
+# Function to preprocess the documents in the specified directory
+def preprocess_documents(dataset_dir):
+    documents = []
+    for filename in os.listdir(dataset_dir):
+        if filename.endswith('.pdf'):
+            pdf_path = os.path.join(dataset_dir, filename)
+            pdf_text = extract_text_from_pdf(pdf_path)
+            clean_text = clean_and_tokenize(pdf_text)
+            documents.append(clean_text)
+    return documents
+# Function to perform relevance matching and return top N documents
+def perform_relevance_matching(query, *uploaded_files, dataset_dir):
+    # Preprocess the documents in the specified dataset directory
+    documents = preprocess_documents(dataset_dir)
+    # Combine the user-uploaded files into a single document
+    uploaded_documents = []
+    for file in uploaded_files:
+        uploaded_text = extract_text_from_pdf(file.name)
+        uploaded_documents.append(uploaded_text)
+    # Combine the uploaded documents and query
+    combined_documents = uploaded_documents + [query]
+    # Vectorize the combined documents
+    tfidf_vectorizer = TfidfVectorizer()
+    tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)
+    # Calculate cosine similarities between the combined documents and the dataset
+    cosine_similarities = cosine_similarity(tfidf_matrix[-len(combined_documents):], tfidf_matrix[:-len(combined_documents)])
+    # Rank documents by similarity score
+    document_scores = list(enumerate(cosine_similarities[0]))
+    sorted_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
+    # Extract the top N relevant documents
+    top_n = 5
+    top_documents = []
+    for i in range(min(top_n, len(sorted_documents))):
+        doc_index, score = sorted_documents[i]
+        document_text = documents[doc_index][:500]  # Extract the first 500 characters of the document
+        top_documents.append((f"Document {doc_index + 1} (Similarity Score: {score:.4f})", document_text))
+    return top_documents
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=perform_relevance_matching,
+    inputs=[
+        "text",  # Query input
+        gr.File(multiple=True),  # Allow multiple file uploads
+        "text"  # Dataset directory input
+    ],
+    outputs=gr.Table(),
+    live=True,
+    title="Legal Research Assistant",
+    description="Enter your legal query, upload files, and specify the dataset directory.",
+)
+# Launch the Gradio interface
+iface.launch()