Spaces:
Sleeping
Sleeping
File size: 2,162 Bytes
3af593c ee1b548 3af593c ee1b548 3af593c ee1b548 3af593c ee1b548 3af593c ee1b548 3af593c ee1b548 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import re
from typing import List
from models.paper import Paper
class TextProcessor:
    """Clean paper text and assemble it into structured context strings."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Replace unsupported characters with spaces and trim both ends.

        Word characters, whitespace and the punctuation ``. , ; : ( ) - ' "``
        are kept; every other character is replaced by a single space.
        Interior whitespace is left as-is (runs of spaces are not collapsed).

        Args:
            text: Raw text. ``None`` or an empty string yields ``""``.

        Returns:
            The cleaned text with leading and trailing whitespace removed.
        """
        if not text:
            # Missing fields should degrade to empty text, not a TypeError
            # from re.sub.
            return ""
        return re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text).strip()

    def format_paper(self, title: str, abstract: str) -> str:
        """Format a cleaned title and abstract as a two-line snippet.

        Args:
            title: Paper title; passed through ``clean_text``.
            abstract: Paper abstract; passed through ``clean_text``.

        Returns:
            ``"Title: <title>\\nAbstract: <abstract>"``.
        """
        title = self.clean_text(title)
        abstract = self.clean_text(abstract)
        return f"Title: {title}\nAbstract: {abstract}"

    def create_context(self, papers: List["Paper"]) -> str:
        """Create a context string from a list of papers.

        Every paper section is preceded by a newline and a rule of 50 ``=``
        characters so consecutive papers are visibly separated.

        Args:
            papers: Objects exposing ``title``, ``authors``,
                ``publication_date``, ``source`` and ``abstract``.
                (The annotation is quoted so it is not evaluated when the
                class body is executed.)

        Returns:
            The concatenated paper sections. For an empty list this is just
            the leading rule.
        """
        rule = "\n" + "=" * 50
        sections = [
            self._paper_section(number, paper)
            for number, paper in enumerate(papers, 1)
        ]
        # A method call binds tighter than "+", so writing
        # `rule + "\n".join(sections)` would emit the rule only once.
        # Joining *with* the rule puts it in front of every paper while
        # keeping the leading rule that the single-paper output always had.
        return rule + rule.join(sections)

    def _paper_section(self, number: int, paper: "Paper") -> str:
        """Render one paper as a labelled, newline-delimited section."""
        lines = [
            "",  # section starts with a newline
            f"Research Paper {number}:",
            f"Title: {self.clean_text(paper.title)}",
            "Key Points:",
            f"- Authors: {paper.authors or 'Not specified'}",
            f"- Publication Date: {paper.publication_date}",
            f"- Source: {paper.source}",
            "Main Findings:",
            self.format_abstract(paper.abstract),
            "",  # section ends with a newline
        ]
        return "\n".join(lines)

    def format_abstract(self, abstract: str) -> str:
        """Format an abstract into bullet points for better readability.

        The abstract is cleaned, split into sentences, and consecutive
        sentences are grouped into one bullet until the group exceeds 100
        characters; the final group is always emitted.

        Args:
            abstract: Raw abstract text. ``None`` or empty yields ``""``.

        Returns:
            Bullets of the form ``"- <sentence>. <sentence>."`` joined by
            newlines, or ``""`` when there are no sentences.
        """
        cleaned = self.clean_text(abstract)

        # Split only on periods that end a sentence (followed by whitespace
        # or the end of the text) so decimals such as "0.95" stay intact.
        sentences = [
            piece.strip()
            for piece in re.split(r'\.(?:\s+|$)', cleaned)
            if piece.strip()
        ]

        bullets = []
        pending = []
        last_index = len(sentences) - 1
        for index, sentence in enumerate(sentences):
            pending.append(sentence)
            # Compare positions, not text: a repeated sentence must not be
            # mistaken for the final one and flush a bullet early.
            if index == last_index or len(' '.join(pending)) > 100:
                bullets.append('- ' + '. '.join(pending) + '.')
                pending = []

        return '\n'.join(bullets)
|