Update app.py
app.py CHANGED
@@ -18,7 +18,7 @@ arising from the use of this tool. Please use responsibly and cross-check result
 # ---------------------
 # Model Setup
 # ---------------------
-MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example model for
+MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example tokenizer model (not directly used for summarization)
 SUMMARIZATION_MODEL = "allenai/led-base-16384"  # Example summarization model with a large context window
 
 # Load summarization model and tokenizer
@@ -29,8 +29,6 @@ summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
 # Utility Functions
 # ---------------------
 def extract_text_from_pdf(pdf_file):
-    # This function extracts text from a PDF file. Requires PyPDF2 or similar library.
-    # For Hugging Face Spaces, PyPDF2 often works.
     try:
         import PyPDF2
         reader = PyPDF2.PdfReader(pdf_file)
@@ -42,12 +40,10 @@ def extract_text_from_pdf(pdf_file):
         return f"Error reading PDF: {e}"
 
 def clean_text(text):
-    # Basic cleaning function
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
 def summarize_text(text):
-    # Summarize the given text
     inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
     with torch.no_grad():
         summary_ids = summarizer_model.generate(
@@ -61,25 +57,51 @@ def summarize_text(text):
     summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
     return summary
 
+def extract_methods(text):
+    # A very naive approach: search for paragraphs containing method-related keywords
+    # and return them as "Key Methods".
+    methods_keywords = ["method", "methods", "technique", "procedure", "protocol", "experimental approach"]
+    paragraphs = re.split(r'\n+|\. ', text)
+    method_sentences = [p.strip() for p in paragraphs if any(kw in p.lower() for kw in methods_keywords)]
+    if method_sentences:
+        return " ".join(method_sentences)
+    else:
+        return "No explicit methods found using simple keyword search."
+
+def extract_references(text):
+    # A naive approach for references:
+    # Look for patterns like "et al., 20XX", "(Author, Year)", or numeric citations [XX].
+    # This is a heuristic and may produce false positives.
+    # Common patterns:
+    # - Something like "Smith et al., 2020"
+    # - (Smith et al., 2020)
+    # - [1], [2], etc., at the end of sentences.
+    references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"
+    refs_found = re.findall(references_pattern, text)
+    # refs_found will be a list of tuples due to multiple groups, flatten them:
+    flat_refs = []
+    for tup in refs_found:
+        for ref in tup:
+            if ref:
+                flat_refs.append(ref.strip())
+    flat_refs = list(set(flat_refs))  # remove duplicates
+    if flat_refs:
+        return "Possible References Found:\n" + "\n".join(flat_refs)
+    else:
+        return "No explicit references found using simple pattern search."
+
 def analyze_text(text):
-    # In a more elaborate system, you might:
-    # 1. Extract main findings using IE or NER.
-    # 2. Identify methods mentioned.
-    # 3. Extract references (regex patterns for citations).
-    # Here we just do a simple summarization.
     text_clean = clean_text(text)
     if len(text_clean) < 50:
-        return "Please provide a longer text snippet or PDF."
+        return "Please provide a longer text snippet or PDF.", "", ""
 
     summary = summarize_text(text_clean)
-
-
-    methods = "Key methods extraction is not yet implemented."
-    references = "Reference extraction is not yet implemented."
+    methods = extract_methods(text_clean)
+    references = extract_references(text_clean)
    return summary, methods, references
 
 def process_input(pdf_file, text_snippet):
-    # If PDF is provided, extract text
+    # If PDF is provided, extract text
     input_text = ""
     if pdf_file is not None:
         input_text = extract_text_from_pdf(pdf_file)
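A minimal usage sketch of the two new heuristics (not part of the commit). It assumes this Space's app.py is importable as "app", which is an assumed module name; importing it loads the summarization model at module level, so the first import is slow. The sample passage is invented for illustration.

from app import clean_text, extract_methods, extract_references

sample = (
    "We propose a new protocol for cell imaging. "
    "Our experimental approach follows Smith et al., 2020 and [3]. "
    "Results are discussed elsewhere."
)

text = clean_text(sample)

# Keyword heuristic: returns the sentences mentioning "protocol" and
# "experimental approach".
print(extract_methods(text))

# Regex heuristic: matches "Smith et al., 2020" (author-year group) and
# "[3]" (numeric-citation group); duplicates are dropped via set(), so
# the order of reported references may vary.
print(extract_references(text))

Both helpers are string-in, string-out, so they could be unit-tested without loading the LED model if they were moved into their own module.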