Update app.py
app.py CHANGED
@@ -18,7 +18,7 @@ arising from the use of this tool. Please use responsibly and cross-check result
 # ---------------------
 # Model Setup
 # ---------------------
-MODEL_NAME = "allenai/scibert_scivocab_cased" # Example model for
+MODEL_NAME = "allenai/scibert_scivocab_cased" # Example tokenizer model (not directly used for summarization)
 SUMMARIZATION_MODEL = "allenai/led-base-16384" # Example summarization model with a large context window
 
 # Load summarization model and tokenizer
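The hunk header below confirms the summarizer is loaded via AutoModelForSeq2SeqLM. A minimal sketch of that setup block, assuming the standard transformers loading pattern (these exact lines sit outside the diff context):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

SUMMARIZATION_MODEL = "allenai/led-base-16384"
# Tokenizer/model pair for the LED summarizer; variable names match the diff context.
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)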
@@ -29,8 +29,6 @@ summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
 # Utility Functions
 # ---------------------
 def extract_text_from_pdf(pdf_file):
-    # This function extracts text from a PDF file. Requires PyPDF2 or similar library.
-    # For Hugging Face Spaces, PyPDF2 often works.
     try:
         import PyPDF2
         reader = PyPDF2.PdfReader(pdf_file)
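The diff truncates extract_text_from_pdf after the PdfReader call. A minimal sketch of the likely full function, assuming the standard PyPDF2 page-iteration API (the loop body is not shown in the diff):

def extract_text_from_pdf(pdf_file):
    try:
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"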
@@ -42,12 +40,10 @@ def extract_text_from_pdf(pdf_file):
         return f"Error reading PDF: {e}"
 
 def clean_text(text):
-    # Basic cleaning function
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
 def summarize_text(text):
-    # Summarize the given text
     inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
     with torch.no_grad():
         summary_ids = summarizer_model.generate(
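The generate(...) call is cut off by the hunk boundary. A hedged sketch of how such a call typically looks for LED; the global-attention handling and decoding parameters below are assumptions, not values taken from this diff:

import torch

def summarize_text(text):
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    # LED expects a global attention mask; global attention on the first token
    # is the usual pattern for summarization (assumed here).
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            num_beams=4,     # assumed decoding parameters
            max_length=256,
        )
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)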
@@ -61,25 +57,51 @@ def summarize_text(text):
     summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
     return summary
 
+def extract_methods(text):
+    # A very naive approach: search for paragraphs containing method-related keywords
+    # and return them as "Key Methods".
+    methods_keywords = ["method", "methods", "technique", "procedure", "protocol", "experimental approach"]
+    paragraphs = re.split(r'\n+|\. ', text)
+    method_sentences = [p.strip() for p in paragraphs if any(kw in p.lower() for kw in methods_keywords)]
+    if method_sentences:
+        return " ".join(method_sentences)
+    else:
+        return "No explicit methods found using simple keyword search."
+
+def extract_references(text):
+    # A naive approach for references:
+    # Look for patterns like "et al., 20XX", "(Author, Year)", or numeric citations [XX].
+    # This is a heuristic and may produce false positives.
+    # Common patterns:
+    # - Something like "Smith et al., 2020"
+    # - (Smith et al., 2020)
+    # - [1], [2], etc., at the end of sentences.
+    references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"
+    refs_found = re.findall(references_pattern, text)
+    # refs_found will be a list of tuples due to multiple groups, flatten them:
+    flat_refs = []
+    for tup in refs_found:
+        for ref in tup:
+            if ref:
+                flat_refs.append(ref.strip())
+    flat_refs = list(set(flat_refs))  # remove duplicates
+    if flat_refs:
+        return "Possible References Found:\n" + "\n".join(flat_refs)
+    else:
+        return "No explicit references found using simple pattern search."
+
 def analyze_text(text):
-    # In a more elaborate system, you might:
-    # 1. Extract main findings using IE or NER.
-    # 2. Identify methods mentioned.
-    # 3. Extract references (regex patterns for citations).
-    # Here we just do a simple summarization.
     text_clean = clean_text(text)
     if len(text_clean) < 50:
-        return "Please provide a longer text snippet or PDF."
+        return "Please provide a longer text snippet or PDF.", "", ""
 
     summary = summarize_text(text_clean)
-
-
-    methods = "Key methods extraction is not yet implemented."
-    references = "Reference extraction is not yet implemented."
+    methods = extract_methods(text_clean)
+    references = extract_references(text_clean)
     return summary, methods, references
 
 def process_input(pdf_file, text_snippet):
-    # If PDF is provided, extract text
+    # If PDF is provided, extract text
     input_text = ""
     if pdf_file is not None:
         input_text = extract_text_from_pdf(pdf_file)
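A quick sanity check of the two heuristics added in this hunk; the sample text is made up for illustration:

sample = ("We describe a novel technique for protein folding. "
          "Smith et al., 2020 reported similar results [3].")
print(extract_methods(sample))     # returns the sentence containing "technique"
print(extract_references(sample))  # finds "Smith et al., 2020" and "[3]" (order not guaranteed)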
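process_input takes a PDF and a text snippet, and analyze_text now returns three strings, which suggests a two-input, three-output interface. A hypothetical Gradio wiring for this Space (no UI code appears in this diff, so every component below is an assumption):

import gradio as gr

# Hypothetical interface; labels and layout are illustrative only.
demo = gr.Interface(
    fn=process_input,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Or paste a text snippet", lines=8)],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Key Methods"),
        gr.Textbox(label="References"),
    ],
)
demo.launch()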