Rogerjs committed
Commit e46cc76 · verified · 1 Parent(s): 20a4e07

Update app.py

Files changed (1)
  1. app.py +38 -16
app.py CHANGED
@@ -18,7 +18,7 @@ arising from the use of this tool. Please use responsibly and cross-check result
  # ---------------------
  # Model Setup
  # ---------------------
- MODEL_NAME = "allenai/scibert_scivocab_cased" # Example model for tokenization/embedding
+ MODEL_NAME = "allenai/scibert_scivocab_cased" # Example tokenizer model (not directly used for summarization)
  SUMMARIZATION_MODEL = "allenai/led-base-16384" # Example summarization model with a large context window

  # Load summarization model and tokenizer
@@ -29,8 +29,6 @@ summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
  # Utility Functions
  # ---------------------
  def extract_text_from_pdf(pdf_file):
-     # This function extracts text from a PDF file. Requires PyPDF2 or similar library.
-     # For Hugging Face Spaces, PyPDF2 often works.
      try:
          import PyPDF2
          reader = PyPDF2.PdfReader(pdf_file)
@@ -42,12 +40,10 @@ def extract_text_from_pdf(pdf_file):
          return f"Error reading PDF: {e}"

  def clean_text(text):
-     # Basic cleaning function
      text = re.sub(r'\s+', ' ', text).strip()
      return text

  def summarize_text(text):
-     # Summarize the given text
      inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
      with torch.no_grad():
          summary_ids = summarizer_model.generate(
@@ -61,25 +57,51 @@ def summarize_text(text):
      summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
      return summary

+ def extract_methods(text):
+     # A very naive approach: search for paragraphs containing method-related keywords
+     # and return them as "Key Methods".
+     methods_keywords = ["method", "methods", "technique", "procedure", "protocol", "experimental approach"]
+     paragraphs = re.split(r'\n+|\. ', text)
+     method_sentences = [p.strip() for p in paragraphs if any(kw in p.lower() for kw in methods_keywords)]
+     if method_sentences:
+         return " ".join(method_sentences)
+     else:
+         return "No explicit methods found using simple keyword search."
+
+ def extract_references(text):
+     # A naive approach for references:
+     # Look for patterns like "et al., 20XX", "(Author, Year)", or numeric citations [XX].
+     # This is a heuristic and may produce false positives.
+     # Common patterns:
+     # - Something like "Smith et al., 2020"
+     # - (Smith et al., 2020)
+     # - [1], [2], etc., at the end of sentences.
+     references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"
+     refs_found = re.findall(references_pattern, text)
+     # refs_found will be a list of tuples due to multiple groups, flatten them:
+     flat_refs = []
+     for tup in refs_found:
+         for ref in tup:
+             if ref:
+                 flat_refs.append(ref.strip())
+     flat_refs = list(set(flat_refs)) # remove duplicates
+     if flat_refs:
+         return "Possible References Found:\n" + "\n".join(flat_refs)
+     else:
+         return "No explicit references found using simple pattern search."
+
  def analyze_text(text):
-     # In a more elaborate system, you might:
-     # 1. Extract main findings using IE or NER.
-     # 2. Identify methods mentioned.
-     # 3. Extract references (regex patterns for citations).
-     # Here we just do a simple summarization.
      text_clean = clean_text(text)
      if len(text_clean) < 50:
-         return "Please provide a longer text snippet or PDF."
+         return "Please provide a longer text snippet or PDF.", "", ""

      summary = summarize_text(text_clean)
-
-     # Dummy logic for key methods and references (in a real app, use NLP-based extraction)
-     methods = "Key methods extraction is not yet implemented."
-     references = "Reference extraction is not yet implemented."
+     methods = extract_methods(text_clean)
+     references = extract_references(text_clean)
      return summary, methods, references

  def process_input(pdf_file, text_snippet):
-     # If PDF is provided, extract text from PDF
+     # If PDF is provided, extract text
      input_text = ""
      if pdf_file is not None:
          input_text = extract_text_from_pdf(pdf_file)
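
For a quick sense of what the new reference heuristic actually matches, the standalone snippet below applies the commit's references_pattern to a made-up sentence and flattens the grouped matches the same way extract_references does. The sample text and variable names (sample, refs) are illustrative only and are not part of app.py; extract_methods works analogously, scanning the segments produced by re.split(r'\n+|\. ', text) for keywords such as "method" or "protocol".

import re

# Reference pattern from the commit, applied to an invented snippet purely for
# illustration; the sentence below is not from app.py or any real paper.
references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"

sample = ("We follow the protocol of Smith et al., 2020 and the setup described "
          "in (Lee et al., 2019). Earlier work [3] used a different technique.")

# re.findall returns one tuple per match because the pattern has three groups;
# flatten it and drop the empty groups, mirroring extract_references.
refs = sorted({r.strip() for tup in re.findall(references_pattern, sample) for r in tup if r})
print(refs)  # ['(Lee et al., 2019)', 'Smith et al., 2020', '[3]']

Because the first alternative fires on any capitalized word followed by "et al." and a year, unrelated text can also match, which is presumably why the function labels its output "Possible References Found" rather than treating it as a definitive bibliography.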